diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,45624 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.24282982791587, + "eval_steps": 500, + "global_step": 6500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00019120458891013384, + "grad_norm": 9.746596336364746, + "learning_rate": 0.0, + "loss": 0.8644, + "step": 1 + }, + { + "epoch": 0.0003824091778202677, + "grad_norm": 7.742455005645752, + "learning_rate": 4.404279284341596e-07, + "loss": 0.6036, + "step": 2 + }, + { + "epoch": 0.0005736137667304016, + "grad_norm": 4.959498405456543, + "learning_rate": 6.980617508384441e-07, + "loss": 0.5138, + "step": 3 + }, + { + "epoch": 0.0007648183556405354, + "grad_norm": 12.7059326171875, + "learning_rate": 8.808558568683192e-07, + "loss": 0.6537, + "step": 4 + }, + { + "epoch": 0.0009560229445506692, + "grad_norm": 9.178160667419434, + "learning_rate": 1.0226419808043158e-06, + "loss": 0.5527, + "step": 5 + }, + { + "epoch": 0.001147227533460803, + "grad_norm": 9.766277313232422, + "learning_rate": 1.1384896792726035e-06, + "loss": 0.4063, + "step": 6 + }, + { + "epoch": 0.0013384321223709368, + "grad_norm": 5.554647922515869, + "learning_rate": 1.236437512701272e-06, + "loss": 0.7499, + "step": 7 + }, + { + "epoch": 0.0015296367112810707, + "grad_norm": 8.689834594726562, + "learning_rate": 1.3212837853024787e-06, + "loss": 1.4177, + "step": 8 + }, + { + "epoch": 0.0017208413001912047, + "grad_norm": 3.352787494659424, + "learning_rate": 1.3961235016768883e-06, + "loss": 0.5945, + "step": 9 + }, + { + "epoch": 0.0019120458891013384, + "grad_norm": 8.300889015197754, + "learning_rate": 1.4630699092384754e-06, + "loss": 0.7862, + "step": 10 + }, + { + "epoch": 0.002103250478011472, + "grad_norm": 4.62298583984375, + "learning_rate": 1.5236303013560567e-06, + "loss": 0.4052, + "step": 11 + }, + { + "epoch": 0.002294455066921606, + "grad_norm": 4.900755405426025, + "learning_rate": 1.578917607706763e-06, + "loss": 0.2796, + "step": 12 + }, + { + "epoch": 0.00248565965583174, + "grad_norm": 4.150928497314453, + "learning_rate": 1.6297769993563666e-06, + "loss": 0.3957, + "step": 13 + }, + { + "epoch": 0.0026768642447418736, + "grad_norm": 3.4511361122131348, + "learning_rate": 1.6768654411354316e-06, + "loss": 0.5152, + "step": 14 + }, + { + "epoch": 0.0028680688336520078, + "grad_norm": 3.906311511993408, + "learning_rate": 1.7207037316427597e-06, + "loss": 0.5973, + "step": 15 + }, + { + "epoch": 0.0030592734225621415, + "grad_norm": 5.145625591278076, + "learning_rate": 1.7617117137366383e-06, + "loss": 0.5577, + "step": 16 + }, + { + "epoch": 0.003250478011472275, + "grad_norm": 4.597131252288818, + "learning_rate": 1.800232791723491e-06, + "loss": 0.4065, + "step": 17 + }, + { + "epoch": 0.0034416826003824093, + "grad_norm": 5.256167411804199, + "learning_rate": 1.8365514301110474e-06, + "loss": 0.4937, + "step": 18 + }, + { + "epoch": 0.003632887189292543, + "grad_norm": 6.020605087280273, + "learning_rate": 1.8709059148844288e-06, + "loss": 0.2741, + "step": 19 + }, + { + "epoch": 0.0038240917782026767, + "grad_norm": 6.1135969161987305, + "learning_rate": 1.9034978376726347e-06, + "loss": 0.6598, + "step": 20 + }, + { + "epoch": 0.00401529636711281, + "grad_norm": 2.16821026802063, + "learning_rate": 1.934499263539716e-06, + "loss": 0.4142, + "step": 21 + }, + { + "epoch": 0.004206500956022944, + "grad_norm": 4.964519023895264, + "learning_rate": 1.964058229790216e-06, + "loss": 0.5294, + "step": 22 + }, + { + "epoch": 0.004397705544933079, + "grad_norm": 2.558614492416382, + "learning_rate": 1.992303021449765e-06, + "loss": 0.451, + "step": 23 + }, + { + "epoch": 0.004588910133843212, + "grad_norm": 5.189733028411865, + "learning_rate": 2.019345536140923e-06, + "loss": 0.5024, + "step": 24 + }, + { + "epoch": 0.004780114722753346, + "grad_norm": 5.467967510223389, + "learning_rate": 2.0452839616086316e-06, + "loss": 0.6787, + "step": 25 + }, + { + "epoch": 0.00497131931166348, + "grad_norm": 4.121450424194336, + "learning_rate": 2.070204927790526e-06, + "loss": 0.6174, + "step": 26 + }, + { + "epoch": 0.0051625239005736135, + "grad_norm": 2.465310573577881, + "learning_rate": 2.094185252515332e-06, + "loss": 0.3246, + "step": 27 + }, + { + "epoch": 0.005353728489483747, + "grad_norm": 3.9040215015411377, + "learning_rate": 2.117293369569591e-06, + "loss": 0.3824, + "step": 28 + }, + { + "epoch": 0.005544933078393882, + "grad_norm": 4.060995101928711, + "learning_rate": 2.1395905060565537e-06, + "loss": 0.6113, + "step": 29 + }, + { + "epoch": 0.0057361376673040155, + "grad_norm": 3.5145015716552734, + "learning_rate": 2.1611316600769195e-06, + "loss": 0.25, + "step": 30 + }, + { + "epoch": 0.005927342256214149, + "grad_norm": 4.692299842834473, + "learning_rate": 2.1819664180398482e-06, + "loss": 0.3016, + "step": 31 + }, + { + "epoch": 0.006118546845124283, + "grad_norm": 4.2027788162231445, + "learning_rate": 2.202139642170798e-06, + "loss": 0.873, + "step": 32 + }, + { + "epoch": 0.006309751434034417, + "grad_norm": 2.9636874198913574, + "learning_rate": 2.2216920521945006e-06, + "loss": 0.8008, + "step": 33 + }, + { + "epoch": 0.00650095602294455, + "grad_norm": 1.739428162574768, + "learning_rate": 2.240660720157651e-06, + "loss": 0.1883, + "step": 34 + }, + { + "epoch": 0.006692160611854685, + "grad_norm": 4.254518032073975, + "learning_rate": 2.2590794935055878e-06, + "loss": 0.5216, + "step": 35 + }, + { + "epoch": 0.006883365200764819, + "grad_norm": 3.1227238178253174, + "learning_rate": 2.276979358545207e-06, + "loss": 0.4041, + "step": 36 + }, + { + "epoch": 0.007074569789674952, + "grad_norm": 3.3138492107391357, + "learning_rate": 2.2943887540983186e-06, + "loss": 0.2946, + "step": 37 + }, + { + "epoch": 0.007265774378585086, + "grad_norm": 4.404989719390869, + "learning_rate": 2.3113338433185884e-06, + "loss": 0.476, + "step": 38 + }, + { + "epoch": 0.00745697896749522, + "grad_norm": 3.9010329246520996, + "learning_rate": 2.3278387501948105e-06, + "loss": 0.7358, + "step": 39 + }, + { + "epoch": 0.0076481835564053535, + "grad_norm": 4.1798624992370605, + "learning_rate": 2.3439257661067945e-06, + "loss": 0.51, + "step": 40 + }, + { + "epoch": 0.007839388145315488, + "grad_norm": 3.3263652324676514, + "learning_rate": 2.3596155308722216e-06, + "loss": 0.4974, + "step": 41 + }, + { + "epoch": 0.00803059273422562, + "grad_norm": 5.814546585083008, + "learning_rate": 2.3749271919738757e-06, + "loss": 0.4005, + "step": 42 + }, + { + "epoch": 0.008221797323135755, + "grad_norm": 2.7462821006774902, + "learning_rate": 2.389878545048738e-06, + "loss": 0.391, + "step": 43 + }, + { + "epoch": 0.008413001912045888, + "grad_norm": 4.502755165100098, + "learning_rate": 2.4044861582243756e-06, + "loss": 0.2376, + "step": 44 + }, + { + "epoch": 0.008604206500956023, + "grad_norm": 4.496071815490723, + "learning_rate": 2.4187654824812036e-06, + "loss": 0.8074, + "step": 45 + }, + { + "epoch": 0.008795411089866157, + "grad_norm": 4.023655414581299, + "learning_rate": 2.4327309498839246e-06, + "loss": 0.5484, + "step": 46 + }, + { + "epoch": 0.00898661567877629, + "grad_norm": 2.9175775051116943, + "learning_rate": 2.446396061247859e-06, + "loss": 0.3416, + "step": 47 + }, + { + "epoch": 0.009177820267686425, + "grad_norm": 2.873981237411499, + "learning_rate": 2.4597734645750825e-06, + "loss": 0.5039, + "step": 48 + }, + { + "epoch": 0.009369024856596558, + "grad_norm": 2.632161855697632, + "learning_rate": 2.472875025402544e-06, + "loss": 0.4038, + "step": 49 + }, + { + "epoch": 0.009560229445506692, + "grad_norm": 3.235424041748047, + "learning_rate": 2.4857118900427907e-06, + "loss": 0.3458, + "step": 50 + }, + { + "epoch": 0.009751434034416827, + "grad_norm": 2.7174906730651855, + "learning_rate": 2.498294542561935e-06, + "loss": 0.4972, + "step": 51 + }, + { + "epoch": 0.00994263862332696, + "grad_norm": 3.14343523979187, + "learning_rate": 2.5106328562246856e-06, + "loss": 0.4773, + "step": 52 + }, + { + "epoch": 0.010133843212237094, + "grad_norm": 3.828115224838257, + "learning_rate": 2.5227361400389193e-06, + "loss": 0.4097, + "step": 53 + }, + { + "epoch": 0.010325047801147227, + "grad_norm": 3.4634366035461426, + "learning_rate": 2.5346131809494915e-06, + "loss": 0.582, + "step": 54 + }, + { + "epoch": 0.010516252390057362, + "grad_norm": 3.1296660900115967, + "learning_rate": 2.5462722821603727e-06, + "loss": 0.3681, + "step": 55 + }, + { + "epoch": 0.010707456978967494, + "grad_norm": 2.358661651611328, + "learning_rate": 2.557721298003751e-06, + "loss": 0.1711, + "step": 56 + }, + { + "epoch": 0.010898661567877629, + "grad_norm": 4.519606113433838, + "learning_rate": 2.568967665722873e-06, + "loss": 0.7541, + "step": 57 + }, + { + "epoch": 0.011089866156787764, + "grad_norm": 2.1992197036743164, + "learning_rate": 2.580018434490713e-06, + "loss": 0.2106, + "step": 58 + }, + { + "epoch": 0.011281070745697896, + "grad_norm": 2.597233295440674, + "learning_rate": 2.5908802919480436e-06, + "loss": 0.4043, + "step": 59 + }, + { + "epoch": 0.011472275334608031, + "grad_norm": 3.097158670425415, + "learning_rate": 2.601559588511079e-06, + "loss": 0.3287, + "step": 60 + }, + { + "epoch": 0.011663479923518164, + "grad_norm": 2.8475654125213623, + "learning_rate": 2.6120623596699453e-06, + "loss": 0.3242, + "step": 61 + }, + { + "epoch": 0.011854684512428298, + "grad_norm": 2.1070339679718018, + "learning_rate": 2.622394346474008e-06, + "loss": 0.1107, + "step": 62 + }, + { + "epoch": 0.012045889101338431, + "grad_norm": 3.0335943698883057, + "learning_rate": 2.63256101437816e-06, + "loss": 0.5753, + "step": 63 + }, + { + "epoch": 0.012237093690248566, + "grad_norm": 5.001894950866699, + "learning_rate": 2.6425675706049575e-06, + "loss": 0.6856, + "step": 64 + }, + { + "epoch": 0.0124282982791587, + "grad_norm": 2.677110433578491, + "learning_rate": 2.652418980160682e-06, + "loss": 0.1709, + "step": 65 + }, + { + "epoch": 0.012619502868068833, + "grad_norm": 2.887890100479126, + "learning_rate": 2.6621199806286598e-06, + "loss": 0.5874, + "step": 66 + }, + { + "epoch": 0.012810707456978968, + "grad_norm": 2.3578383922576904, + "learning_rate": 2.6716750958501646e-06, + "loss": 0.1294, + "step": 67 + }, + { + "epoch": 0.0130019120458891, + "grad_norm": 3.1397762298583984, + "learning_rate": 2.6810886485918104e-06, + "loss": 0.3335, + "step": 68 + }, + { + "epoch": 0.013193116634799235, + "grad_norm": 2.663269281387329, + "learning_rate": 2.690364772288209e-06, + "loss": 0.2245, + "step": 69 + }, + { + "epoch": 0.01338432122370937, + "grad_norm": 5.426806449890137, + "learning_rate": 2.6995074219397478e-06, + "loss": 0.5171, + "step": 70 + }, + { + "epoch": 0.013575525812619503, + "grad_norm": 5.457904815673828, + "learning_rate": 2.708520384237387e-06, + "loss": 0.3069, + "step": 71 + }, + { + "epoch": 0.013766730401529637, + "grad_norm": 2.8190979957580566, + "learning_rate": 2.7174072869793666e-06, + "loss": 0.4272, + "step": 72 + }, + { + "epoch": 0.01395793499043977, + "grad_norm": 2.0407586097717285, + "learning_rate": 2.7261716078384117e-06, + "loss": 0.2898, + "step": 73 + }, + { + "epoch": 0.014149139579349905, + "grad_norm": 2.2423548698425293, + "learning_rate": 2.734816682532479e-06, + "loss": 0.2225, + "step": 74 + }, + { + "epoch": 0.014340344168260038, + "grad_norm": 2.801767587661743, + "learning_rate": 2.7433457124470753e-06, + "loss": 0.1297, + "step": 75 + }, + { + "epoch": 0.014531548757170172, + "grad_norm": 3.7313790321350098, + "learning_rate": 2.751761771752748e-06, + "loss": 0.6037, + "step": 76 + }, + { + "epoch": 0.014722753346080307, + "grad_norm": 2.962400436401367, + "learning_rate": 2.760067814057329e-06, + "loss": 0.4091, + "step": 77 + }, + { + "epoch": 0.01491395793499044, + "grad_norm": 1.2434055805206299, + "learning_rate": 2.76826667862897e-06, + "loss": 0.3433, + "step": 78 + }, + { + "epoch": 0.015105162523900574, + "grad_norm": 4.778526782989502, + "learning_rate": 2.7763610962227783e-06, + "loss": 0.4217, + "step": 79 + }, + { + "epoch": 0.015296367112810707, + "grad_norm": 1.6460481882095337, + "learning_rate": 2.7843536945409537e-06, + "loss": 0.0981, + "step": 80 + }, + { + "epoch": 0.015487571701720841, + "grad_norm": 3.009162187576294, + "learning_rate": 2.7922470033537765e-06, + "loss": 0.1744, + "step": 81 + }, + { + "epoch": 0.015678776290630976, + "grad_norm": 4.354190826416016, + "learning_rate": 2.8000434593063812e-06, + "loss": 0.4956, + "step": 82 + }, + { + "epoch": 0.01586998087954111, + "grad_norm": 3.888239860534668, + "learning_rate": 2.807745410434209e-06, + "loss": 0.4807, + "step": 83 + }, + { + "epoch": 0.01606118546845124, + "grad_norm": 2.944679021835327, + "learning_rate": 2.8153551204080353e-06, + "loss": 0.4177, + "step": 84 + }, + { + "epoch": 0.016252390057361378, + "grad_norm": 3.1436245441436768, + "learning_rate": 2.822874772527807e-06, + "loss": 0.2304, + "step": 85 + }, + { + "epoch": 0.01644359464627151, + "grad_norm": 4.823106288909912, + "learning_rate": 2.830306473482897e-06, + "loss": 0.5006, + "step": 86 + }, + { + "epoch": 0.016634799235181644, + "grad_norm": 2.8725435733795166, + "learning_rate": 2.837652256894998e-06, + "loss": 0.128, + "step": 87 + }, + { + "epoch": 0.016826003824091777, + "grad_norm": 4.338479995727539, + "learning_rate": 2.8449140866585356e-06, + "loss": 0.2579, + "step": 88 + }, + { + "epoch": 0.017017208413001913, + "grad_norm": 4.045529365539551, + "learning_rate": 2.852093860092363e-06, + "loss": 0.9362, + "step": 89 + }, + { + "epoch": 0.017208413001912046, + "grad_norm": 3.180530309677124, + "learning_rate": 2.8591934109153636e-06, + "loss": 0.3104, + "step": 90 + }, + { + "epoch": 0.01739961759082218, + "grad_norm": 3.4100022315979004, + "learning_rate": 2.8662145120576384e-06, + "loss": 0.3072, + "step": 91 + }, + { + "epoch": 0.017590822179732315, + "grad_norm": 2.9597487449645996, + "learning_rate": 2.873158878318084e-06, + "loss": 0.2439, + "step": 92 + }, + { + "epoch": 0.017782026768642448, + "grad_norm": 3.1329643726348877, + "learning_rate": 2.8800281688782923e-06, + "loss": 0.1225, + "step": 93 + }, + { + "epoch": 0.01797323135755258, + "grad_norm": 3.316901445388794, + "learning_rate": 2.8868239896820188e-06, + "loss": 0.2914, + "step": 94 + }, + { + "epoch": 0.018164435946462717, + "grad_norm": 2.207570791244507, + "learning_rate": 2.8935478956887446e-06, + "loss": 0.294, + "step": 95 + }, + { + "epoch": 0.01835564053537285, + "grad_norm": 2.481140375137329, + "learning_rate": 2.900201393009242e-06, + "loss": 0.5148, + "step": 96 + }, + { + "epoch": 0.018546845124282983, + "grad_norm": 2.634587526321411, + "learning_rate": 2.906785940930483e-06, + "loss": 0.5636, + "step": 97 + }, + { + "epoch": 0.018738049713193115, + "grad_norm": 2.602517604827881, + "learning_rate": 2.913302953836704e-06, + "loss": 0.0733, + "step": 98 + }, + { + "epoch": 0.01892925430210325, + "grad_norm": 2.664775848388672, + "learning_rate": 2.9197538030329443e-06, + "loss": 0.2034, + "step": 99 + }, + { + "epoch": 0.019120458891013385, + "grad_norm": 4.784229278564453, + "learning_rate": 2.9261398184769508e-06, + "loss": 0.4161, + "step": 100 + }, + { + "epoch": 0.019311663479923517, + "grad_norm": 20.801212310791016, + "learning_rate": 2.9324622904249074e-06, + "loss": 0.312, + "step": 101 + }, + { + "epoch": 0.019502868068833654, + "grad_norm": 2.7974696159362793, + "learning_rate": 2.938722470996094e-06, + "loss": 0.436, + "step": 102 + }, + { + "epoch": 0.019694072657743786, + "grad_norm": 2.719644784927368, + "learning_rate": 2.944921575661221e-06, + "loss": 0.2729, + "step": 103 + }, + { + "epoch": 0.01988527724665392, + "grad_norm": 2.62326717376709, + "learning_rate": 2.951060784658845e-06, + "loss": 0.1857, + "step": 104 + }, + { + "epoch": 0.020076481835564052, + "grad_norm": 3.2945799827575684, + "learning_rate": 2.957141244344032e-06, + "loss": 0.2618, + "step": 105 + }, + { + "epoch": 0.02026768642447419, + "grad_norm": 3.1361422538757324, + "learning_rate": 2.963164068473079e-06, + "loss": 0.2186, + "step": 106 + }, + { + "epoch": 0.02045889101338432, + "grad_norm": 3.901604175567627, + "learning_rate": 2.9691303394279335e-06, + "loss": 0.5334, + "step": 107 + }, + { + "epoch": 0.020650095602294454, + "grad_norm": 4.767889499664307, + "learning_rate": 2.975041109383651e-06, + "loss": 0.4494, + "step": 108 + }, + { + "epoch": 0.02084130019120459, + "grad_norm": 2.728724241256714, + "learning_rate": 2.9808974014220527e-06, + "loss": 0.3868, + "step": 109 + }, + { + "epoch": 0.021032504780114723, + "grad_norm": 3.014492988586426, + "learning_rate": 2.986700210594532e-06, + "loss": 0.3775, + "step": 110 + }, + { + "epoch": 0.021223709369024856, + "grad_norm": 2.4681789875030518, + "learning_rate": 2.9924505049367623e-06, + "loss": 0.3131, + "step": 111 + }, + { + "epoch": 0.02141491395793499, + "grad_norm": 3.5321526527404785, + "learning_rate": 2.9981492264379103e-06, + "loss": 0.3168, + "step": 112 + }, + { + "epoch": 0.021606118546845125, + "grad_norm": 2.0447781085968018, + "learning_rate": 3.003797291966757e-06, + "loss": 0.3725, + "step": 113 + }, + { + "epoch": 0.021797323135755258, + "grad_norm": 3.804521322250366, + "learning_rate": 3.0093955941570325e-06, + "loss": 0.571, + "step": 114 + }, + { + "epoch": 0.02198852772466539, + "grad_norm": 2.5446314811706543, + "learning_rate": 3.014945002254081e-06, + "loss": 0.3944, + "step": 115 + }, + { + "epoch": 0.022179732313575527, + "grad_norm": 2.264204502105713, + "learning_rate": 3.0204463629248725e-06, + "loss": 0.2348, + "step": 116 + }, + { + "epoch": 0.02237093690248566, + "grad_norm": 2.108393907546997, + "learning_rate": 3.0259005010332546e-06, + "loss": 0.259, + "step": 117 + }, + { + "epoch": 0.022562141491395793, + "grad_norm": 4.435889720916748, + "learning_rate": 3.031308220382203e-06, + "loss": 0.2287, + "step": 118 + }, + { + "epoch": 0.022753346080305926, + "grad_norm": 2.867352247238159, + "learning_rate": 3.0366703044247632e-06, + "loss": 0.1899, + "step": 119 + }, + { + "epoch": 0.022944550669216062, + "grad_norm": 4.31091833114624, + "learning_rate": 3.0419875169452382e-06, + "loss": 0.771, + "step": 120 + }, + { + "epoch": 0.023135755258126195, + "grad_norm": 2.2673258781433105, + "learning_rate": 3.0472606027121134e-06, + "loss": 0.3879, + "step": 121 + }, + { + "epoch": 0.023326959847036328, + "grad_norm": 2.5156142711639404, + "learning_rate": 3.0524902881041045e-06, + "loss": 0.2798, + "step": 122 + }, + { + "epoch": 0.023518164435946464, + "grad_norm": 1.8811085224151611, + "learning_rate": 3.0576772817106653e-06, + "loss": 0.1592, + "step": 123 + }, + { + "epoch": 0.023709369024856597, + "grad_norm": 2.619985818862915, + "learning_rate": 3.062822274908168e-06, + "loss": 0.2492, + "step": 124 + }, + { + "epoch": 0.02390057361376673, + "grad_norm": 2.3442344665527344, + "learning_rate": 3.067925942412948e-06, + "loss": 0.1004, + "step": 125 + }, + { + "epoch": 0.024091778202676863, + "grad_norm": 3.19574236869812, + "learning_rate": 3.0729889428123194e-06, + "loss": 0.6887, + "step": 126 + }, + { + "epoch": 0.024282982791587, + "grad_norm": 1.725019097328186, + "learning_rate": 3.0780119190745983e-06, + "loss": 0.2958, + "step": 127 + }, + { + "epoch": 0.024474187380497132, + "grad_norm": 2.455963373184204, + "learning_rate": 3.082995499039117e-06, + "loss": 0.2027, + "step": 128 + }, + { + "epoch": 0.024665391969407265, + "grad_norm": 6.7689619064331055, + "learning_rate": 3.0879402958871817e-06, + "loss": 0.0587, + "step": 129 + }, + { + "epoch": 0.0248565965583174, + "grad_norm": 1.6544665098190308, + "learning_rate": 3.0928469085948413e-06, + "loss": 0.0546, + "step": 130 + }, + { + "epoch": 0.025047801147227534, + "grad_norm": 3.8221983909606934, + "learning_rate": 3.0977159223683077e-06, + "loss": 0.1562, + "step": 131 + }, + { + "epoch": 0.025239005736137667, + "grad_norm": 2.911043167114258, + "learning_rate": 3.102547909062819e-06, + "loss": 0.3307, + "step": 132 + }, + { + "epoch": 0.025430210325047803, + "grad_norm": 3.8134329319000244, + "learning_rate": 3.1073434275857012e-06, + "loss": 0.6647, + "step": 133 + }, + { + "epoch": 0.025621414913957936, + "grad_norm": 3.2995803356170654, + "learning_rate": 3.112103024284324e-06, + "loss": 0.1994, + "step": 134 + }, + { + "epoch": 0.02581261950286807, + "grad_norm": 1.8985707759857178, + "learning_rate": 3.1168272333196477e-06, + "loss": 0.2707, + "step": 135 + }, + { + "epoch": 0.0260038240917782, + "grad_norm": 2.5111820697784424, + "learning_rate": 3.1215165770259696e-06, + "loss": 0.2573, + "step": 136 + }, + { + "epoch": 0.026195028680688338, + "grad_norm": 1.4703351259231567, + "learning_rate": 3.1261715662575076e-06, + "loss": 0.032, + "step": 137 + }, + { + "epoch": 0.02638623326959847, + "grad_norm": 3.487067937850952, + "learning_rate": 3.1307927007223687e-06, + "loss": 0.3179, + "step": 138 + }, + { + "epoch": 0.026577437858508603, + "grad_norm": 3.4684128761291504, + "learning_rate": 3.135380469304468e-06, + "loss": 0.4447, + "step": 139 + }, + { + "epoch": 0.02676864244741874, + "grad_norm": 3.0979361534118652, + "learning_rate": 3.1399353503739065e-06, + "loss": 0.2441, + "step": 140 + }, + { + "epoch": 0.026959847036328873, + "grad_norm": 3.7381913661956787, + "learning_rate": 3.1444578120863033e-06, + "loss": 0.3272, + "step": 141 + }, + { + "epoch": 0.027151051625239005, + "grad_norm": 2.558560609817505, + "learning_rate": 3.148948312671547e-06, + "loss": 0.3229, + "step": 142 + }, + { + "epoch": 0.027342256214149138, + "grad_norm": 2.115128755569458, + "learning_rate": 3.153407300712423e-06, + "loss": 0.1838, + "step": 143 + }, + { + "epoch": 0.027533460803059275, + "grad_norm": 3.6304619312286377, + "learning_rate": 3.157835215413526e-06, + "loss": 0.3281, + "step": 144 + }, + { + "epoch": 0.027724665391969407, + "grad_norm": 7.865360260009766, + "learning_rate": 3.1622324868608695e-06, + "loss": 0.8346, + "step": 145 + }, + { + "epoch": 0.02791586998087954, + "grad_norm": 3.5878102779388428, + "learning_rate": 3.166599536272571e-06, + "loss": 0.5395, + "step": 146 + }, + { + "epoch": 0.028107074569789676, + "grad_norm": 3.4643099308013916, + "learning_rate": 3.170936776240988e-06, + "loss": 0.3664, + "step": 147 + }, + { + "epoch": 0.02829827915869981, + "grad_norm": 3.340297222137451, + "learning_rate": 3.1752446109666377e-06, + "loss": 0.3959, + "step": 148 + }, + { + "epoch": 0.028489483747609942, + "grad_norm": 3.389150619506836, + "learning_rate": 3.1795234364842463e-06, + "loss": 0.0883, + "step": 149 + }, + { + "epoch": 0.028680688336520075, + "grad_norm": 8.659844398498535, + "learning_rate": 3.183773640881235e-06, + "loss": 0.2891, + "step": 150 + }, + { + "epoch": 0.02887189292543021, + "grad_norm": 2.904853582382202, + "learning_rate": 3.1879956045089473e-06, + "loss": 0.4851, + "step": 151 + }, + { + "epoch": 0.029063097514340344, + "grad_norm": 2.432720184326172, + "learning_rate": 3.192189700186908e-06, + "loss": 0.4643, + "step": 152 + }, + { + "epoch": 0.029254302103250477, + "grad_norm": 2.321363925933838, + "learning_rate": 3.196356293400379e-06, + "loss": 0.1767, + "step": 153 + }, + { + "epoch": 0.029445506692160613, + "grad_norm": 1.9591343402862549, + "learning_rate": 3.2004957424914884e-06, + "loss": 0.1611, + "step": 154 + }, + { + "epoch": 0.029636711281070746, + "grad_norm": 2.822035551071167, + "learning_rate": 3.204608398844164e-06, + "loss": 0.2053, + "step": 155 + }, + { + "epoch": 0.02982791586998088, + "grad_norm": 2.73078989982605, + "learning_rate": 3.2086946070631297e-06, + "loss": 0.095, + "step": 156 + }, + { + "epoch": 0.030019120458891012, + "grad_norm": 4.012321949005127, + "learning_rate": 3.212754705147177e-06, + "loss": 0.4317, + "step": 157 + }, + { + "epoch": 0.030210325047801148, + "grad_norm": 2.332084894180298, + "learning_rate": 3.216789024656938e-06, + "loss": 0.4723, + "step": 158 + }, + { + "epoch": 0.03040152963671128, + "grad_norm": 3.0491888523101807, + "learning_rate": 3.2207978908773634e-06, + "loss": 0.4482, + "step": 159 + }, + { + "epoch": 0.030592734225621414, + "grad_norm": 1.8330838680267334, + "learning_rate": 3.2247816229751133e-06, + "loss": 0.1647, + "step": 160 + }, + { + "epoch": 0.03078393881453155, + "grad_norm": 2.9033660888671875, + "learning_rate": 3.228740534151037e-06, + "loss": 0.3954, + "step": 161 + }, + { + "epoch": 0.030975143403441683, + "grad_norm": 2.846273899078369, + "learning_rate": 3.2326749317879352e-06, + "loss": 0.1128, + "step": 162 + }, + { + "epoch": 0.031166347992351816, + "grad_norm": 3.386489152908325, + "learning_rate": 3.2365851175937783e-06, + "loss": 0.4637, + "step": 163 + }, + { + "epoch": 0.03135755258126195, + "grad_norm": 2.893277883529663, + "learning_rate": 3.240471387740541e-06, + "loss": 0.5229, + "step": 164 + }, + { + "epoch": 0.03154875717017208, + "grad_norm": 2.0880420207977295, + "learning_rate": 3.244334032998816e-06, + "loss": 0.4664, + "step": 165 + }, + { + "epoch": 0.03173996175908222, + "grad_norm": 4.319767475128174, + "learning_rate": 3.2481733388683686e-06, + "loss": 0.5711, + "step": 166 + }, + { + "epoch": 0.031931166347992354, + "grad_norm": 3.115651845932007, + "learning_rate": 3.251989585704759e-06, + "loss": 0.1742, + "step": 167 + }, + { + "epoch": 0.03212237093690248, + "grad_norm": 3.5364396572113037, + "learning_rate": 3.255783048842195e-06, + "loss": 0.1186, + "step": 168 + }, + { + "epoch": 0.03231357552581262, + "grad_norm": 2.261878252029419, + "learning_rate": 3.2595539987127332e-06, + "loss": 0.1794, + "step": 169 + }, + { + "epoch": 0.032504780114722756, + "grad_norm": 3.346421003341675, + "learning_rate": 3.2633027009619666e-06, + "loss": 0.5637, + "step": 170 + }, + { + "epoch": 0.032695984703632885, + "grad_norm": 3.0582921504974365, + "learning_rate": 3.267029416561317e-06, + "loss": 0.3209, + "step": 171 + }, + { + "epoch": 0.03288718929254302, + "grad_norm": 2.956470251083374, + "learning_rate": 3.270734401917057e-06, + "loss": 0.3218, + "step": 172 + }, + { + "epoch": 0.03307839388145316, + "grad_norm": 1.4681670665740967, + "learning_rate": 3.27441790897617e-06, + "loss": 0.2129, + "step": 173 + }, + { + "epoch": 0.03326959847036329, + "grad_norm": 4.20260763168335, + "learning_rate": 3.278080185329157e-06, + "loss": 0.3151, + "step": 174 + }, + { + "epoch": 0.033460803059273424, + "grad_norm": 4.02779483795166, + "learning_rate": 3.281721474309904e-06, + "loss": 0.3233, + "step": 175 + }, + { + "epoch": 0.03365200764818355, + "grad_norm": 3.5959632396698, + "learning_rate": 3.2853420150926944e-06, + "loss": 0.7, + "step": 176 + }, + { + "epoch": 0.03384321223709369, + "grad_norm": 2.106358528137207, + "learning_rate": 3.2889420427864873e-06, + "loss": 0.6374, + "step": 177 + }, + { + "epoch": 0.034034416826003826, + "grad_norm": 2.9504098892211914, + "learning_rate": 3.2925217885265225e-06, + "loss": 0.408, + "step": 178 + }, + { + "epoch": 0.034225621414913955, + "grad_norm": 2.7030961513519287, + "learning_rate": 3.296081479563376e-06, + "loss": 0.3322, + "step": 179 + }, + { + "epoch": 0.03441682600382409, + "grad_norm": 2.271064281463623, + "learning_rate": 3.299621339349523e-06, + "loss": 0.1597, + "step": 180 + }, + { + "epoch": 0.03460803059273423, + "grad_norm": 3.3034112453460693, + "learning_rate": 3.3031415876235085e-06, + "loss": 0.1215, + "step": 181 + }, + { + "epoch": 0.03479923518164436, + "grad_norm": 3.9516632556915283, + "learning_rate": 3.306642440491798e-06, + "loss": 0.6046, + "step": 182 + }, + { + "epoch": 0.03499043977055449, + "grad_norm": 5.616006374359131, + "learning_rate": 3.310124110508389e-06, + "loss": 0.8098, + "step": 183 + }, + { + "epoch": 0.03518164435946463, + "grad_norm": 1.1683584451675415, + "learning_rate": 3.3135868067522437e-06, + "loss": 0.2074, + "step": 184 + }, + { + "epoch": 0.03537284894837476, + "grad_norm": 4.46233606338501, + "learning_rate": 3.3170307349026344e-06, + "loss": 0.2912, + "step": 185 + }, + { + "epoch": 0.035564053537284895, + "grad_norm": 2.703484535217285, + "learning_rate": 3.3204560973124523e-06, + "loss": 0.1338, + "step": 186 + }, + { + "epoch": 0.03575525812619503, + "grad_norm": 2.688272476196289, + "learning_rate": 3.3238630930795473e-06, + "loss": 0.1497, + "step": 187 + }, + { + "epoch": 0.03594646271510516, + "grad_norm": 3.20407772064209, + "learning_rate": 3.3272519181161784e-06, + "loss": 0.3101, + "step": 188 + }, + { + "epoch": 0.0361376673040153, + "grad_norm": 3.1082072257995605, + "learning_rate": 3.330622765216604e-06, + "loss": 0.2251, + "step": 189 + }, + { + "epoch": 0.036328871892925434, + "grad_norm": 2.8750967979431152, + "learning_rate": 3.333975824122904e-06, + "loss": 0.475, + "step": 190 + }, + { + "epoch": 0.03652007648183556, + "grad_norm": 3.8573219776153564, + "learning_rate": 3.337311281589066e-06, + "loss": 0.093, + "step": 191 + }, + { + "epoch": 0.0367112810707457, + "grad_norm": 1.341035008430481, + "learning_rate": 3.3406293214434016e-06, + "loss": 0.053, + "step": 192 + }, + { + "epoch": 0.03690248565965583, + "grad_norm": 2.519542694091797, + "learning_rate": 3.343930124649337e-06, + "loss": 0.1018, + "step": 193 + }, + { + "epoch": 0.037093690248565965, + "grad_norm": 3.034273386001587, + "learning_rate": 3.3472138693646427e-06, + "loss": 0.3481, + "step": 194 + }, + { + "epoch": 0.0372848948374761, + "grad_norm": 1.9574315547943115, + "learning_rate": 3.350480730999126e-06, + "loss": 0.2867, + "step": 195 + }, + { + "epoch": 0.03747609942638623, + "grad_norm": 2.153280735015869, + "learning_rate": 3.353730882270863e-06, + "loss": 0.3375, + "step": 196 + }, + { + "epoch": 0.03766730401529637, + "grad_norm": 3.426328420639038, + "learning_rate": 3.3569644932609887e-06, + "loss": 0.4611, + "step": 197 + }, + { + "epoch": 0.0378585086042065, + "grad_norm": 2.072275161743164, + "learning_rate": 3.360181731467104e-06, + "loss": 0.2144, + "step": 198 + }, + { + "epoch": 0.03804971319311663, + "grad_norm": 2.887531280517578, + "learning_rate": 3.3633827618553393e-06, + "loss": 0.2432, + "step": 199 + }, + { + "epoch": 0.03824091778202677, + "grad_norm": 5.023746013641357, + "learning_rate": 3.3665677469111103e-06, + "loss": 0.2016, + "step": 200 + }, + { + "epoch": 0.038432122370936905, + "grad_norm": 3.695518970489502, + "learning_rate": 3.3697368466886087e-06, + "loss": 0.5206, + "step": 201 + }, + { + "epoch": 0.038623326959847035, + "grad_norm": 3.6168696880340576, + "learning_rate": 3.3728902188590666e-06, + "loss": 0.5451, + "step": 202 + }, + { + "epoch": 0.03881453154875717, + "grad_norm": 4.44756555557251, + "learning_rate": 3.376028018757826e-06, + "loss": 0.4795, + "step": 203 + }, + { + "epoch": 0.03900573613766731, + "grad_norm": 3.0557193756103516, + "learning_rate": 3.3791503994302537e-06, + "loss": 0.2987, + "step": 204 + }, + { + "epoch": 0.03919694072657744, + "grad_norm": 3.2316536903381348, + "learning_rate": 3.382257511676538e-06, + "loss": 0.3986, + "step": 205 + }, + { + "epoch": 0.03938814531548757, + "grad_norm": 3.7219014167785645, + "learning_rate": 3.3853495040953806e-06, + "loss": 0.3432, + "step": 206 + }, + { + "epoch": 0.0395793499043977, + "grad_norm": 3.6749324798583984, + "learning_rate": 3.388426523126653e-06, + "loss": 0.5342, + "step": 207 + }, + { + "epoch": 0.03977055449330784, + "grad_norm": 3.951026439666748, + "learning_rate": 3.3914887130930047e-06, + "loss": 0.6743, + "step": 208 + }, + { + "epoch": 0.039961759082217975, + "grad_norm": 2.203550338745117, + "learning_rate": 3.3945362162404853e-06, + "loss": 0.272, + "step": 209 + }, + { + "epoch": 0.040152963671128104, + "grad_norm": 3.5759997367858887, + "learning_rate": 3.397569172778191e-06, + "loss": 0.4403, + "step": 210 + }, + { + "epoch": 0.04034416826003824, + "grad_norm": 2.468674421310425, + "learning_rate": 3.400587720916976e-06, + "loss": 0.2374, + "step": 211 + }, + { + "epoch": 0.04053537284894838, + "grad_norm": 1.7186170816421509, + "learning_rate": 3.4035919969072384e-06, + "loss": 0.1339, + "step": 212 + }, + { + "epoch": 0.040726577437858506, + "grad_norm": 3.243154525756836, + "learning_rate": 3.4065821350758317e-06, + "loss": 0.2984, + "step": 213 + }, + { + "epoch": 0.04091778202676864, + "grad_norm": 3.2775168418884277, + "learning_rate": 3.409558267862093e-06, + "loss": 0.5302, + "step": 214 + }, + { + "epoch": 0.04110898661567878, + "grad_norm": 1.8264061212539673, + "learning_rate": 3.4125205258530534e-06, + "loss": 0.5952, + "step": 215 + }, + { + "epoch": 0.04130019120458891, + "grad_norm": 5.033473491668701, + "learning_rate": 3.4154690378178107e-06, + "loss": 0.2228, + "step": 216 + }, + { + "epoch": 0.041491395793499045, + "grad_norm": 1.7373385429382324, + "learning_rate": 3.4184039307411206e-06, + "loss": 0.1134, + "step": 217 + }, + { + "epoch": 0.04168260038240918, + "grad_norm": 2.2630789279937744, + "learning_rate": 3.4213253298562123e-06, + "loss": 0.1515, + "step": 218 + }, + { + "epoch": 0.04187380497131931, + "grad_norm": 3.2471377849578857, + "learning_rate": 3.4242333586768554e-06, + "loss": 0.1851, + "step": 219 + }, + { + "epoch": 0.04206500956022945, + "grad_norm": 4.196602821350098, + "learning_rate": 3.4271281390286914e-06, + "loss": 0.9757, + "step": 220 + }, + { + "epoch": 0.042256214149139576, + "grad_norm": 3.0383758544921875, + "learning_rate": 3.4300097910798572e-06, + "loss": 0.426, + "step": 221 + }, + { + "epoch": 0.04244741873804971, + "grad_norm": 2.5604920387268066, + "learning_rate": 3.4328784333709227e-06, + "loss": 0.2252, + "step": 222 + }, + { + "epoch": 0.04263862332695985, + "grad_norm": 3.8304648399353027, + "learning_rate": 3.4357341828441426e-06, + "loss": 0.3957, + "step": 223 + }, + { + "epoch": 0.04282982791586998, + "grad_norm": 2.3922934532165527, + "learning_rate": 3.43857715487207e-06, + "loss": 0.1992, + "step": 224 + }, + { + "epoch": 0.043021032504780114, + "grad_norm": 3.5151939392089844, + "learning_rate": 3.4414074632855194e-06, + "loss": 0.2571, + "step": 225 + }, + { + "epoch": 0.04321223709369025, + "grad_norm": 3.84830904006958, + "learning_rate": 3.4442252204009168e-06, + "loss": 0.7981, + "step": 226 + }, + { + "epoch": 0.04340344168260038, + "grad_norm": 3.0907559394836426, + "learning_rate": 3.447030537047043e-06, + "loss": 0.4319, + "step": 227 + }, + { + "epoch": 0.043594646271510516, + "grad_norm": 3.6448781490325928, + "learning_rate": 3.4498235225911925e-06, + "loss": 0.4267, + "step": 228 + }, + { + "epoch": 0.04378585086042065, + "grad_norm": 1.7876298427581787, + "learning_rate": 3.452604284964754e-06, + "loss": 0.2185, + "step": 229 + }, + { + "epoch": 0.04397705544933078, + "grad_norm": 1.8112496137619019, + "learning_rate": 3.4553729306882404e-06, + "loss": 0.1537, + "step": 230 + }, + { + "epoch": 0.04416826003824092, + "grad_norm": 4.762514591217041, + "learning_rate": 3.4581295648957726e-06, + "loss": 0.4439, + "step": 231 + }, + { + "epoch": 0.044359464627151055, + "grad_norm": 2.5081305503845215, + "learning_rate": 3.4608742913590325e-06, + "loss": 0.2384, + "step": 232 + }, + { + "epoch": 0.044550669216061184, + "grad_norm": 4.172029495239258, + "learning_rate": 3.4636072125107046e-06, + "loss": 0.6397, + "step": 233 + }, + { + "epoch": 0.04474187380497132, + "grad_norm": 2.4700138568878174, + "learning_rate": 3.4663284294674142e-06, + "loss": 0.5316, + "step": 234 + }, + { + "epoch": 0.044933078393881457, + "grad_norm": 2.9252090454101562, + "learning_rate": 3.4690380420521746e-06, + "loss": 0.2955, + "step": 235 + }, + { + "epoch": 0.045124282982791586, + "grad_norm": 2.3378536701202393, + "learning_rate": 3.4717361488163627e-06, + "loss": 0.3204, + "step": 236 + }, + { + "epoch": 0.04531548757170172, + "grad_norm": 5.816604137420654, + "learning_rate": 3.4744228470612224e-06, + "loss": 0.1649, + "step": 237 + }, + { + "epoch": 0.04550669216061185, + "grad_norm": 2.326578140258789, + "learning_rate": 3.4770982328589232e-06, + "loss": 0.201, + "step": 238 + }, + { + "epoch": 0.04569789674952199, + "grad_norm": 4.211781024932861, + "learning_rate": 3.4797624010731722e-06, + "loss": 0.7306, + "step": 239 + }, + { + "epoch": 0.045889101338432124, + "grad_norm": 1.4428257942199707, + "learning_rate": 3.482415445379398e-06, + "loss": 0.1414, + "step": 240 + }, + { + "epoch": 0.046080305927342254, + "grad_norm": 3.0482757091522217, + "learning_rate": 3.4850574582845136e-06, + "loss": 0.5237, + "step": 241 + }, + { + "epoch": 0.04627151051625239, + "grad_norm": 2.439641237258911, + "learning_rate": 3.4876885311462725e-06, + "loss": 0.2411, + "step": 242 + }, + { + "epoch": 0.046462715105162526, + "grad_norm": 3.4746406078338623, + "learning_rate": 3.4903087541922198e-06, + "loss": 0.2901, + "step": 243 + }, + { + "epoch": 0.046653919694072656, + "grad_norm": 2.937816619873047, + "learning_rate": 3.492918216538264e-06, + "loss": 0.1849, + "step": 244 + }, + { + "epoch": 0.04684512428298279, + "grad_norm": 4.481417655944824, + "learning_rate": 3.49551700620686e-06, + "loss": 0.6734, + "step": 245 + }, + { + "epoch": 0.04703632887189293, + "grad_norm": 3.3815813064575195, + "learning_rate": 3.498105210144825e-06, + "loss": 0.7956, + "step": 246 + }, + { + "epoch": 0.04722753346080306, + "grad_norm": 1.8896292448043823, + "learning_rate": 3.5006829142407957e-06, + "loss": 0.1625, + "step": 247 + }, + { + "epoch": 0.047418738049713194, + "grad_norm": 2.2965781688690186, + "learning_rate": 3.503250203342327e-06, + "loss": 0.1043, + "step": 248 + }, + { + "epoch": 0.04760994263862333, + "grad_norm": 3.055568218231201, + "learning_rate": 3.5058071612726523e-06, + "loss": 0.2573, + "step": 249 + }, + { + "epoch": 0.04780114722753346, + "grad_norm": 2.876594305038452, + "learning_rate": 3.5083538708471065e-06, + "loss": 0.1837, + "step": 250 + }, + { + "epoch": 0.047992351816443596, + "grad_norm": 2.6785998344421387, + "learning_rate": 3.5108904138892164e-06, + "loss": 0.3405, + "step": 251 + }, + { + "epoch": 0.048183556405353725, + "grad_norm": 3.1598598957061768, + "learning_rate": 3.5134168712464794e-06, + "loss": 0.3016, + "step": 252 + }, + { + "epoch": 0.04837476099426386, + "grad_norm": 2.128307342529297, + "learning_rate": 3.515933322805821e-06, + "loss": 0.3255, + "step": 253 + }, + { + "epoch": 0.048565965583174, + "grad_norm": 2.671571969985962, + "learning_rate": 3.518439847508758e-06, + "loss": 0.2336, + "step": 254 + }, + { + "epoch": 0.04875717017208413, + "grad_norm": 5.808933734893799, + "learning_rate": 3.5209365233662508e-06, + "loss": 0.2801, + "step": 255 + }, + { + "epoch": 0.048948374760994263, + "grad_norm": 2.248617649078369, + "learning_rate": 3.5234234274732767e-06, + "loss": 0.1844, + "step": 256 + }, + { + "epoch": 0.0491395793499044, + "grad_norm": 2.858762264251709, + "learning_rate": 3.5259006360231123e-06, + "loss": 0.4732, + "step": 257 + }, + { + "epoch": 0.04933078393881453, + "grad_norm": 2.7468910217285156, + "learning_rate": 3.5283682243213417e-06, + "loss": 0.2897, + "step": 258 + }, + { + "epoch": 0.049521988527724665, + "grad_norm": 1.2300924062728882, + "learning_rate": 3.5308262667995906e-06, + "loss": 0.2216, + "step": 259 + }, + { + "epoch": 0.0497131931166348, + "grad_norm": 3.6707677841186523, + "learning_rate": 3.5332748370290014e-06, + "loss": 0.5293, + "step": 260 + }, + { + "epoch": 0.04990439770554493, + "grad_norm": 2.7596583366394043, + "learning_rate": 3.5357140077334416e-06, + "loss": 0.2129, + "step": 261 + }, + { + "epoch": 0.05009560229445507, + "grad_norm": 5.763503074645996, + "learning_rate": 3.5381438508024672e-06, + "loss": 0.1661, + "step": 262 + }, + { + "epoch": 0.050286806883365204, + "grad_norm": 3.31182599067688, + "learning_rate": 3.5405644373040366e-06, + "loss": 0.3727, + "step": 263 + }, + { + "epoch": 0.05047801147227533, + "grad_norm": 2.9972293376922607, + "learning_rate": 3.5429758374969793e-06, + "loss": 0.3564, + "step": 264 + }, + { + "epoch": 0.05066921606118547, + "grad_norm": 2.814368963241577, + "learning_rate": 3.5453781208432355e-06, + "loss": 0.4932, + "step": 265 + }, + { + "epoch": 0.050860420650095606, + "grad_norm": 4.843621730804443, + "learning_rate": 3.547771356019861e-06, + "loss": 0.3832, + "step": 266 + }, + { + "epoch": 0.051051625239005735, + "grad_norm": 3.9224517345428467, + "learning_rate": 3.550155610930807e-06, + "loss": 0.1663, + "step": 267 + }, + { + "epoch": 0.05124282982791587, + "grad_norm": 2.4663190841674805, + "learning_rate": 3.5525309527184838e-06, + "loss": 0.0759, + "step": 268 + }, + { + "epoch": 0.051434034416826, + "grad_norm": 2.4381537437438965, + "learning_rate": 3.5548974477751014e-06, + "loss": 0.1891, + "step": 269 + }, + { + "epoch": 0.05162523900573614, + "grad_norm": 3.2417197227478027, + "learning_rate": 3.5572551617538078e-06, + "loss": 0.5146, + "step": 270 + }, + { + "epoch": 0.05181644359464627, + "grad_norm": 2.6413793563842773, + "learning_rate": 3.559604159579614e-06, + "loss": 0.2155, + "step": 271 + }, + { + "epoch": 0.0520076481835564, + "grad_norm": 2.974642753601074, + "learning_rate": 3.56194450546013e-06, + "loss": 0.3031, + "step": 272 + }, + { + "epoch": 0.05219885277246654, + "grad_norm": 2.5996057987213135, + "learning_rate": 3.5642762628960825e-06, + "loss": 0.236, + "step": 273 + }, + { + "epoch": 0.052390057361376675, + "grad_norm": 2.6430821418762207, + "learning_rate": 3.566599494691667e-06, + "loss": 0.1634, + "step": 274 + }, + { + "epoch": 0.052581261950286805, + "grad_norm": 4.012979984283447, + "learning_rate": 3.5689142629646885e-06, + "loss": 0.1486, + "step": 275 + }, + { + "epoch": 0.05277246653919694, + "grad_norm": 2.250370740890503, + "learning_rate": 3.571220629156528e-06, + "loss": 0.3647, + "step": 276 + }, + { + "epoch": 0.05296367112810708, + "grad_norm": 2.271556854248047, + "learning_rate": 3.5735186540419348e-06, + "loss": 0.21, + "step": 277 + }, + { + "epoch": 0.05315487571701721, + "grad_norm": 1.3703874349594116, + "learning_rate": 3.5758083977386276e-06, + "loss": 0.2091, + "step": 278 + }, + { + "epoch": 0.05334608030592734, + "grad_norm": 2.649837017059326, + "learning_rate": 3.5780899197167356e-06, + "loss": 0.1355, + "step": 279 + }, + { + "epoch": 0.05353728489483748, + "grad_norm": 3.166032552719116, + "learning_rate": 3.580363278808066e-06, + "loss": 0.4169, + "step": 280 + }, + { + "epoch": 0.05372848948374761, + "grad_norm": 4.7268290519714355, + "learning_rate": 3.582628533215206e-06, + "loss": 0.2532, + "step": 281 + }, + { + "epoch": 0.053919694072657745, + "grad_norm": 2.934070587158203, + "learning_rate": 3.584885740520463e-06, + "loss": 0.4672, + "step": 282 + }, + { + "epoch": 0.054110898661567874, + "grad_norm": 2.0285329818725586, + "learning_rate": 3.5871349576946483e-06, + "loss": 0.3499, + "step": 283 + }, + { + "epoch": 0.05430210325047801, + "grad_norm": 2.122689962387085, + "learning_rate": 3.5893762411057063e-06, + "loss": 0.2839, + "step": 284 + }, + { + "epoch": 0.05449330783938815, + "grad_norm": 3.105699300765991, + "learning_rate": 3.5916096465271888e-06, + "loss": 0.4167, + "step": 285 + }, + { + "epoch": 0.054684512428298276, + "grad_norm": 2.428645372390747, + "learning_rate": 3.593835229146582e-06, + "loss": 0.2457, + "step": 286 + }, + { + "epoch": 0.05487571701720841, + "grad_norm": 2.716440439224243, + "learning_rate": 3.5960530435734936e-06, + "loss": 0.252, + "step": 287 + }, + { + "epoch": 0.05506692160611855, + "grad_norm": 2.986215829849243, + "learning_rate": 3.598263143847686e-06, + "loss": 0.1737, + "step": 288 + }, + { + "epoch": 0.05525812619502868, + "grad_norm": 1.8082088232040405, + "learning_rate": 3.600465583446982e-06, + "loss": 0.1929, + "step": 289 + }, + { + "epoch": 0.055449330783938815, + "grad_norm": 2.961946964263916, + "learning_rate": 3.602660415295029e-06, + "loss": 0.2653, + "step": 290 + }, + { + "epoch": 0.05564053537284895, + "grad_norm": 3.455840587615967, + "learning_rate": 3.6048476917689273e-06, + "loss": 0.3367, + "step": 291 + }, + { + "epoch": 0.05583173996175908, + "grad_norm": 1.9973338842391968, + "learning_rate": 3.607027464706731e-06, + "loss": 0.1757, + "step": 292 + }, + { + "epoch": 0.05602294455066922, + "grad_norm": 3.1876704692840576, + "learning_rate": 3.609199785414821e-06, + "loss": 0.3237, + "step": 293 + }, + { + "epoch": 0.05621414913957935, + "grad_norm": 2.5493903160095215, + "learning_rate": 3.6113647046751477e-06, + "loss": 0.1988, + "step": 294 + }, + { + "epoch": 0.05640535372848948, + "grad_norm": 3.4992916584014893, + "learning_rate": 3.6135222727523593e-06, + "loss": 0.3859, + "step": 295 + }, + { + "epoch": 0.05659655831739962, + "grad_norm": 1.860656499862671, + "learning_rate": 3.6156725394007973e-06, + "loss": 0.2747, + "step": 296 + }, + { + "epoch": 0.05678776290630975, + "grad_norm": 2.110995054244995, + "learning_rate": 3.6178155538713884e-06, + "loss": 0.2186, + "step": 297 + }, + { + "epoch": 0.056978967495219884, + "grad_norm": 2.3457248210906982, + "learning_rate": 3.6199513649184063e-06, + "loss": 0.2764, + "step": 298 + }, + { + "epoch": 0.05717017208413002, + "grad_norm": 2.250549554824829, + "learning_rate": 3.622080020806132e-06, + "loss": 0.1175, + "step": 299 + }, + { + "epoch": 0.05736137667304015, + "grad_norm": 3.1696152687072754, + "learning_rate": 3.6242015693153945e-06, + "loss": 0.1506, + "step": 300 + }, + { + "epoch": 0.057552581261950286, + "grad_norm": 2.9207370281219482, + "learning_rate": 3.62631605775001e-06, + "loss": 0.3109, + "step": 301 + }, + { + "epoch": 0.05774378585086042, + "grad_norm": 2.2986581325531006, + "learning_rate": 3.6284235329431073e-06, + "loss": 0.2265, + "step": 302 + }, + { + "epoch": 0.05793499043977055, + "grad_norm": 2.269641876220703, + "learning_rate": 3.6305240412633507e-06, + "loss": 0.1851, + "step": 303 + }, + { + "epoch": 0.05812619502868069, + "grad_norm": 2.343125104904175, + "learning_rate": 3.632617628621067e-06, + "loss": 0.1895, + "step": 304 + }, + { + "epoch": 0.058317399617590825, + "grad_norm": 3.1341187953948975, + "learning_rate": 3.634704340474261e-06, + "loss": 0.3916, + "step": 305 + }, + { + "epoch": 0.058508604206500954, + "grad_norm": 1.92935311794281, + "learning_rate": 3.6367842218345383e-06, + "loss": 0.1707, + "step": 306 + }, + { + "epoch": 0.05869980879541109, + "grad_norm": 2.106807231903076, + "learning_rate": 3.6388573172729357e-06, + "loss": 0.3863, + "step": 307 + }, + { + "epoch": 0.05889101338432123, + "grad_norm": 0.9721649885177612, + "learning_rate": 3.640923670925647e-06, + "loss": 0.3013, + "step": 308 + }, + { + "epoch": 0.059082217973231356, + "grad_norm": 2.205176830291748, + "learning_rate": 3.642983326499665e-06, + "loss": 0.2873, + "step": 309 + }, + { + "epoch": 0.05927342256214149, + "grad_norm": 2.2065727710723877, + "learning_rate": 3.6450363272783236e-06, + "loss": 0.2406, + "step": 310 + }, + { + "epoch": 0.05946462715105163, + "grad_norm": 5.1225361824035645, + "learning_rate": 3.647082716126761e-06, + "loss": 0.143, + "step": 311 + }, + { + "epoch": 0.05965583173996176, + "grad_norm": 3.16314435005188, + "learning_rate": 3.6491225354972893e-06, + "loss": 0.1193, + "step": 312 + }, + { + "epoch": 0.059847036328871894, + "grad_norm": 3.749077558517456, + "learning_rate": 3.651155827434673e-06, + "loss": 0.6098, + "step": 313 + }, + { + "epoch": 0.060038240917782024, + "grad_norm": 3.1285479068756104, + "learning_rate": 3.6531826335813365e-06, + "loss": 0.6629, + "step": 314 + }, + { + "epoch": 0.06022944550669216, + "grad_norm": 3.448866128921509, + "learning_rate": 3.6552029951824756e-06, + "loss": 0.3374, + "step": 315 + }, + { + "epoch": 0.060420650095602296, + "grad_norm": 1.7784894704818726, + "learning_rate": 3.6572169530910974e-06, + "loss": 0.1474, + "step": 316 + }, + { + "epoch": 0.060611854684512426, + "grad_norm": 2.4160869121551514, + "learning_rate": 3.6592245477729737e-06, + "loss": 0.3054, + "step": 317 + }, + { + "epoch": 0.06080305927342256, + "grad_norm": 3.9507014751434326, + "learning_rate": 3.661225819311523e-06, + "loss": 0.2749, + "step": 318 + }, + { + "epoch": 0.0609942638623327, + "grad_norm": 1.2071470022201538, + "learning_rate": 3.66322080741261e-06, + "loss": 0.1813, + "step": 319 + }, + { + "epoch": 0.06118546845124283, + "grad_norm": 4.6793646812438965, + "learning_rate": 3.665209551409273e-06, + "loss": 0.3954, + "step": 320 + }, + { + "epoch": 0.061376673040152964, + "grad_norm": 3.9796018600463867, + "learning_rate": 3.6671920902663776e-06, + "loss": 0.5827, + "step": 321 + }, + { + "epoch": 0.0615678776290631, + "grad_norm": 3.5386040210723877, + "learning_rate": 3.669168462585197e-06, + "loss": 0.5768, + "step": 322 + }, + { + "epoch": 0.06175908221797323, + "grad_norm": 1.5878996849060059, + "learning_rate": 3.67113870660792e-06, + "loss": 0.203, + "step": 323 + }, + { + "epoch": 0.061950286806883366, + "grad_norm": 1.8842241764068604, + "learning_rate": 3.673102860222095e-06, + "loss": 0.1778, + "step": 324 + }, + { + "epoch": 0.0621414913957935, + "grad_norm": 2.419157028198242, + "learning_rate": 3.675060960964998e-06, + "loss": 0.1822, + "step": 325 + }, + { + "epoch": 0.06233269598470363, + "grad_norm": 3.7156035900115967, + "learning_rate": 3.6770130460279383e-06, + "loss": 0.6939, + "step": 326 + }, + { + "epoch": 0.06252390057361376, + "grad_norm": 2.3214938640594482, + "learning_rate": 3.678959152260497e-06, + "loss": 0.3233, + "step": 327 + }, + { + "epoch": 0.0627151051625239, + "grad_norm": 6.248021602630615, + "learning_rate": 3.6808993161747004e-06, + "loss": 0.291, + "step": 328 + }, + { + "epoch": 0.06290630975143403, + "grad_norm": 3.342606782913208, + "learning_rate": 3.682833573949131e-06, + "loss": 0.1396, + "step": 329 + }, + { + "epoch": 0.06309751434034416, + "grad_norm": 2.0580525398254395, + "learning_rate": 3.6847619614329755e-06, + "loss": 0.0865, + "step": 330 + }, + { + "epoch": 0.0632887189292543, + "grad_norm": 2.6703569889068604, + "learning_rate": 3.68668451415001e-06, + "loss": 0.1062, + "step": 331 + }, + { + "epoch": 0.06347992351816444, + "grad_norm": 3.680206775665283, + "learning_rate": 3.6886012673025277e-06, + "loss": 0.7988, + "step": 332 + }, + { + "epoch": 0.06367112810707456, + "grad_norm": 2.1449387073516846, + "learning_rate": 3.6905122557752073e-06, + "loss": 0.2702, + "step": 333 + }, + { + "epoch": 0.06386233269598471, + "grad_norm": 3.1063783168792725, + "learning_rate": 3.6924175141389183e-06, + "loss": 0.2693, + "step": 334 + }, + { + "epoch": 0.06405353728489484, + "grad_norm": 2.218688488006592, + "learning_rate": 3.6943170766544804e-06, + "loss": 0.3532, + "step": 335 + }, + { + "epoch": 0.06424474187380497, + "grad_norm": 2.191000461578369, + "learning_rate": 3.6962109772763544e-06, + "loss": 0.2497, + "step": 336 + }, + { + "epoch": 0.06443594646271511, + "grad_norm": 2.405402898788452, + "learning_rate": 3.6980992496562857e-06, + "loss": 0.1078, + "step": 337 + }, + { + "epoch": 0.06462715105162524, + "grad_norm": 2.5214316844940186, + "learning_rate": 3.699981927146893e-06, + "loss": 0.1989, + "step": 338 + }, + { + "epoch": 0.06481835564053537, + "grad_norm": 3.761211633682251, + "learning_rate": 3.701859042805201e-06, + "loss": 0.4005, + "step": 339 + }, + { + "epoch": 0.06500956022944551, + "grad_norm": 1.934031367301941, + "learning_rate": 3.7037306293961262e-06, + "loss": 0.4611, + "step": 340 + }, + { + "epoch": 0.06520076481835564, + "grad_norm": 2.1387293338775635, + "learning_rate": 3.7055967193959047e-06, + "loss": 0.2271, + "step": 341 + }, + { + "epoch": 0.06539196940726577, + "grad_norm": 2.621551990509033, + "learning_rate": 3.7074573449954767e-06, + "loss": 0.1125, + "step": 342 + }, + { + "epoch": 0.06558317399617591, + "grad_norm": 1.7968058586120605, + "learning_rate": 3.709312538103816e-06, + "loss": 0.113, + "step": 343 + }, + { + "epoch": 0.06577437858508604, + "grad_norm": 2.509631395339966, + "learning_rate": 3.7111623303512168e-06, + "loss": 0.2285, + "step": 344 + }, + { + "epoch": 0.06596558317399617, + "grad_norm": 2.8840248584747314, + "learning_rate": 3.7130067530925253e-06, + "loss": 0.2937, + "step": 345 + }, + { + "epoch": 0.06615678776290632, + "grad_norm": 3.3847973346710205, + "learning_rate": 3.7148458374103296e-06, + "loss": 0.1772, + "step": 346 + }, + { + "epoch": 0.06634799235181645, + "grad_norm": 2.8074636459350586, + "learning_rate": 3.716679614118107e-06, + "loss": 0.3393, + "step": 347 + }, + { + "epoch": 0.06653919694072657, + "grad_norm": 9.130067825317383, + "learning_rate": 3.7185081137633166e-06, + "loss": 1.0307, + "step": 348 + }, + { + "epoch": 0.06673040152963672, + "grad_norm": 1.991129994392395, + "learning_rate": 3.7203313666304545e-06, + "loss": 0.1372, + "step": 349 + }, + { + "epoch": 0.06692160611854685, + "grad_norm": 3.1855194568634033, + "learning_rate": 3.722149402744063e-06, + "loss": 0.1485, + "step": 350 + }, + { + "epoch": 0.06711281070745698, + "grad_norm": 3.3848493099212646, + "learning_rate": 3.7239622518716983e-06, + "loss": 0.5177, + "step": 351 + }, + { + "epoch": 0.0673040152963671, + "grad_norm": 3.134887933731079, + "learning_rate": 3.725769943526854e-06, + "loss": 0.6924, + "step": 352 + }, + { + "epoch": 0.06749521988527725, + "grad_norm": 3.605255126953125, + "learning_rate": 3.727572506971844e-06, + "loss": 0.4911, + "step": 353 + }, + { + "epoch": 0.06768642447418738, + "grad_norm": 2.0924346446990967, + "learning_rate": 3.7293699712206464e-06, + "loss": 0.2743, + "step": 354 + }, + { + "epoch": 0.06787762906309751, + "grad_norm": 3.687945604324341, + "learning_rate": 3.731162365041703e-06, + "loss": 0.1613, + "step": 355 + }, + { + "epoch": 0.06806883365200765, + "grad_norm": 5.654581546783447, + "learning_rate": 3.7329497169606825e-06, + "loss": 0.2055, + "step": 356 + }, + { + "epoch": 0.06826003824091778, + "grad_norm": 1.5821623802185059, + "learning_rate": 3.734732055263207e-06, + "loss": 0.1343, + "step": 357 + }, + { + "epoch": 0.06845124282982791, + "grad_norm": 3.3485209941864014, + "learning_rate": 3.736509407997536e-06, + "loss": 0.6085, + "step": 358 + }, + { + "epoch": 0.06864244741873805, + "grad_norm": 3.3173418045043945, + "learning_rate": 3.738281802977213e-06, + "loss": 0.3427, + "step": 359 + }, + { + "epoch": 0.06883365200764818, + "grad_norm": 2.4630320072174072, + "learning_rate": 3.7400492677836824e-06, + "loss": 0.1692, + "step": 360 + }, + { + "epoch": 0.06902485659655831, + "grad_norm": 2.6465280055999756, + "learning_rate": 3.7418118297688577e-06, + "loss": 0.1822, + "step": 361 + }, + { + "epoch": 0.06921606118546846, + "grad_norm": 2.0654947757720947, + "learning_rate": 3.743569516057668e-06, + "loss": 0.091, + "step": 362 + }, + { + "epoch": 0.06940726577437858, + "grad_norm": 3.139086961746216, + "learning_rate": 3.7453223535505566e-06, + "loss": 0.2461, + "step": 363 + }, + { + "epoch": 0.06959847036328871, + "grad_norm": 3.0427610874176025, + "learning_rate": 3.747070368925958e-06, + "loss": 0.3526, + "step": 364 + }, + { + "epoch": 0.06978967495219886, + "grad_norm": 3.79207181930542, + "learning_rate": 3.7488135886427275e-06, + "loss": 0.4793, + "step": 365 + }, + { + "epoch": 0.06998087954110899, + "grad_norm": 1.8372082710266113, + "learning_rate": 3.750552038942548e-06, + "loss": 0.1645, + "step": 366 + }, + { + "epoch": 0.07017208413001912, + "grad_norm": 2.4285526275634766, + "learning_rate": 3.7522857458523022e-06, + "loss": 0.1195, + "step": 367 + }, + { + "epoch": 0.07036328871892926, + "grad_norm": 2.658700942993164, + "learning_rate": 3.7540147351864037e-06, + "loss": 0.2826, + "step": 368 + }, + { + "epoch": 0.07055449330783939, + "grad_norm": 3.1620702743530273, + "learning_rate": 3.7557390325491095e-06, + "loss": 0.2795, + "step": 369 + }, + { + "epoch": 0.07074569789674952, + "grad_norm": 2.690842628479004, + "learning_rate": 3.757458663336794e-06, + "loss": 0.4693, + "step": 370 + }, + { + "epoch": 0.07093690248565966, + "grad_norm": 2.008033275604248, + "learning_rate": 3.7591736527401917e-06, + "loss": 0.2736, + "step": 371 + }, + { + "epoch": 0.07112810707456979, + "grad_norm": 3.240271806716919, + "learning_rate": 3.760884025746611e-06, + "loss": 0.3499, + "step": 372 + }, + { + "epoch": 0.07131931166347992, + "grad_norm": 1.3944287300109863, + "learning_rate": 3.7625898071421226e-06, + "loss": 0.0905, + "step": 373 + }, + { + "epoch": 0.07151051625239006, + "grad_norm": 2.2824788093566895, + "learning_rate": 3.7642910215137073e-06, + "loss": 0.2282, + "step": 374 + }, + { + "epoch": 0.07170172084130019, + "grad_norm": 4.351092338562012, + "learning_rate": 3.765987693251391e-06, + "loss": 0.3069, + "step": 375 + }, + { + "epoch": 0.07189292543021032, + "grad_norm": 4.460353851318359, + "learning_rate": 3.767679846550338e-06, + "loss": 0.6941, + "step": 376 + }, + { + "epoch": 0.07208413001912047, + "grad_norm": 2.765394926071167, + "learning_rate": 3.76936750541292e-06, + "loss": 0.4267, + "step": 377 + }, + { + "epoch": 0.0722753346080306, + "grad_norm": 2.731600284576416, + "learning_rate": 3.771050693650764e-06, + "loss": 0.3801, + "step": 378 + }, + { + "epoch": 0.07246653919694072, + "grad_norm": 1.8360090255737305, + "learning_rate": 3.772729434886761e-06, + "loss": 0.2339, + "step": 379 + }, + { + "epoch": 0.07265774378585087, + "grad_norm": 1.9759913682937622, + "learning_rate": 3.774403752557064e-06, + "loss": 0.3144, + "step": 380 + }, + { + "epoch": 0.072848948374761, + "grad_norm": 1.7601401805877686, + "learning_rate": 3.7760736699130425e-06, + "loss": 0.1128, + "step": 381 + }, + { + "epoch": 0.07304015296367113, + "grad_norm": 4.08651065826416, + "learning_rate": 3.7777392100232265e-06, + "loss": 0.4442, + "step": 382 + }, + { + "epoch": 0.07323135755258126, + "grad_norm": 3.4187510013580322, + "learning_rate": 3.7794003957752135e-06, + "loss": 0.4448, + "step": 383 + }, + { + "epoch": 0.0734225621414914, + "grad_norm": 1.9716267585754395, + "learning_rate": 3.781057249877561e-06, + "loss": 0.3026, + "step": 384 + }, + { + "epoch": 0.07361376673040153, + "grad_norm": 1.3850023746490479, + "learning_rate": 3.7827097948616442e-06, + "loss": 0.2962, + "step": 385 + }, + { + "epoch": 0.07380497131931166, + "grad_norm": 3.1978061199188232, + "learning_rate": 3.784358053083497e-06, + "loss": 0.4785, + "step": 386 + }, + { + "epoch": 0.0739961759082218, + "grad_norm": 2.065589666366577, + "learning_rate": 3.786002046725626e-06, + "loss": 0.1068, + "step": 387 + }, + { + "epoch": 0.07418738049713193, + "grad_norm": 2.613898277282715, + "learning_rate": 3.7876417977988023e-06, + "loss": 0.2658, + "step": 388 + }, + { + "epoch": 0.07437858508604206, + "grad_norm": 2.723231077194214, + "learning_rate": 3.7892773281438285e-06, + "loss": 0.451, + "step": 389 + }, + { + "epoch": 0.0745697896749522, + "grad_norm": 11.916027069091797, + "learning_rate": 3.7909086594332863e-06, + "loss": 0.4356, + "step": 390 + }, + { + "epoch": 0.07476099426386233, + "grad_norm": 1.5050503015518188, + "learning_rate": 3.792535813173256e-06, + "loss": 0.1137, + "step": 391 + }, + { + "epoch": 0.07495219885277246, + "grad_norm": 2.0423741340637207, + "learning_rate": 3.7941588107050227e-06, + "loss": 0.228, + "step": 392 + }, + { + "epoch": 0.0751434034416826, + "grad_norm": 5.4225969314575195, + "learning_rate": 3.795777673206752e-06, + "loss": 0.3271, + "step": 393 + }, + { + "epoch": 0.07533460803059273, + "grad_norm": 2.6373708248138428, + "learning_rate": 3.7973924216951487e-06, + "loss": 0.2295, + "step": 394 + }, + { + "epoch": 0.07552581261950286, + "grad_norm": 7.188790798187256, + "learning_rate": 3.7990030770270936e-06, + "loss": 0.2779, + "step": 395 + }, + { + "epoch": 0.075717017208413, + "grad_norm": 2.465902805328369, + "learning_rate": 3.800609659901264e-06, + "loss": 0.2997, + "step": 396 + }, + { + "epoch": 0.07590822179732314, + "grad_norm": 2.4251866340637207, + "learning_rate": 3.802212190859722e-06, + "loss": 0.3933, + "step": 397 + }, + { + "epoch": 0.07609942638623327, + "grad_norm": 2.4671387672424316, + "learning_rate": 3.803810690289499e-06, + "loss": 0.2564, + "step": 398 + }, + { + "epoch": 0.07629063097514341, + "grad_norm": 2.4797089099884033, + "learning_rate": 3.8054051784241454e-06, + "loss": 0.1865, + "step": 399 + }, + { + "epoch": 0.07648183556405354, + "grad_norm": 2.997403860092163, + "learning_rate": 3.8069956753452695e-06, + "loss": 0.3207, + "step": 400 + }, + { + "epoch": 0.07667304015296367, + "grad_norm": 1.8929557800292969, + "learning_rate": 3.808582200984058e-06, + "loss": 0.1249, + "step": 401 + }, + { + "epoch": 0.07686424474187381, + "grad_norm": 3.276320695877075, + "learning_rate": 3.8101647751227683e-06, + "loss": 0.4781, + "step": 402 + }, + { + "epoch": 0.07705544933078394, + "grad_norm": 1.9976557493209839, + "learning_rate": 3.811743417396214e-06, + "loss": 0.184, + "step": 403 + }, + { + "epoch": 0.07724665391969407, + "grad_norm": 3.980438232421875, + "learning_rate": 3.813318147293226e-06, + "loss": 0.4318, + "step": 404 + }, + { + "epoch": 0.07743785850860421, + "grad_norm": 2.1951019763946533, + "learning_rate": 3.8148889841580915e-06, + "loss": 0.2133, + "step": 405 + }, + { + "epoch": 0.07762906309751434, + "grad_norm": 4.515348434448242, + "learning_rate": 3.816455947191985e-06, + "loss": 0.5805, + "step": 406 + }, + { + "epoch": 0.07782026768642447, + "grad_norm": 2.6555490493774414, + "learning_rate": 3.818019055454375e-06, + "loss": 0.3141, + "step": 407 + }, + { + "epoch": 0.07801147227533461, + "grad_norm": 1.6512001752853394, + "learning_rate": 3.819578327864414e-06, + "loss": 0.2455, + "step": 408 + }, + { + "epoch": 0.07820267686424474, + "grad_norm": 5.637848854064941, + "learning_rate": 3.821133783202312e-06, + "loss": 0.335, + "step": 409 + }, + { + "epoch": 0.07839388145315487, + "grad_norm": 1.5848608016967773, + "learning_rate": 3.8226854401106974e-06, + "loss": 0.2007, + "step": 410 + }, + { + "epoch": 0.078585086042065, + "grad_norm": 2.4538025856018066, + "learning_rate": 3.824233317095951e-06, + "loss": 0.309, + "step": 411 + }, + { + "epoch": 0.07877629063097515, + "grad_norm": 3.901400089263916, + "learning_rate": 3.82577743252954e-06, + "loss": 0.1759, + "step": 412 + }, + { + "epoch": 0.07896749521988528, + "grad_norm": 3.340503454208374, + "learning_rate": 3.8273178046493155e-06, + "loss": 0.4415, + "step": 413 + }, + { + "epoch": 0.0791586998087954, + "grad_norm": 3.1490180492401123, + "learning_rate": 3.828854451560812e-06, + "loss": 0.3878, + "step": 414 + }, + { + "epoch": 0.07934990439770555, + "grad_norm": 1.568618893623352, + "learning_rate": 3.830387391238524e-06, + "loss": 0.2918, + "step": 415 + }, + { + "epoch": 0.07954110898661568, + "grad_norm": 2.177776336669922, + "learning_rate": 3.831916641527165e-06, + "loss": 0.1442, + "step": 416 + }, + { + "epoch": 0.0797323135755258, + "grad_norm": 2.933709144592285, + "learning_rate": 3.833442220142912e-06, + "loss": 0.4187, + "step": 417 + }, + { + "epoch": 0.07992351816443595, + "grad_norm": 1.9458264112472534, + "learning_rate": 3.834964144674645e-06, + "loss": 0.0881, + "step": 418 + }, + { + "epoch": 0.08011472275334608, + "grad_norm": 2.3854639530181885, + "learning_rate": 3.836482432585154e-06, + "loss": 0.1411, + "step": 419 + }, + { + "epoch": 0.08030592734225621, + "grad_norm": 2.503591299057007, + "learning_rate": 3.837997101212351e-06, + "loss": 0.2308, + "step": 420 + }, + { + "epoch": 0.08049713193116635, + "grad_norm": 2.7284929752349854, + "learning_rate": 3.839508167770447e-06, + "loss": 0.5046, + "step": 421 + }, + { + "epoch": 0.08068833652007648, + "grad_norm": 2.8158528804779053, + "learning_rate": 3.8410156493511355e-06, + "loss": 0.4343, + "step": 422 + }, + { + "epoch": 0.08087954110898661, + "grad_norm": 2.605060577392578, + "learning_rate": 3.842519562924747e-06, + "loss": 0.242, + "step": 423 + }, + { + "epoch": 0.08107074569789675, + "grad_norm": 2.482698440551758, + "learning_rate": 3.8440199253413985e-06, + "loss": 0.2665, + "step": 424 + }, + { + "epoch": 0.08126195028680688, + "grad_norm": 2.105672836303711, + "learning_rate": 3.8455167533321224e-06, + "loss": 0.1729, + "step": 425 + }, + { + "epoch": 0.08145315487571701, + "grad_norm": 4.377143383026123, + "learning_rate": 3.847010063509991e-06, + "loss": 0.5099, + "step": 426 + }, + { + "epoch": 0.08164435946462716, + "grad_norm": 1.9579622745513916, + "learning_rate": 3.848499872371217e-06, + "loss": 0.1786, + "step": 427 + }, + { + "epoch": 0.08183556405353729, + "grad_norm": 2.5516083240509033, + "learning_rate": 3.849986196296252e-06, + "loss": 0.1474, + "step": 428 + }, + { + "epoch": 0.08202676864244741, + "grad_norm": 2.7220704555511475, + "learning_rate": 3.851469051550867e-06, + "loss": 0.3341, + "step": 429 + }, + { + "epoch": 0.08221797323135756, + "grad_norm": 1.9412697553634644, + "learning_rate": 3.852948454287213e-06, + "loss": 0.163, + "step": 430 + }, + { + "epoch": 0.08240917782026769, + "grad_norm": 2.0283761024475098, + "learning_rate": 3.854424420544888e-06, + "loss": 0.0943, + "step": 431 + }, + { + "epoch": 0.08260038240917782, + "grad_norm": 2.609199285507202, + "learning_rate": 3.855896966251971e-06, + "loss": 0.246, + "step": 432 + }, + { + "epoch": 0.08279158699808796, + "grad_norm": 1.5453547239303589, + "learning_rate": 3.857366107226057e-06, + "loss": 0.224, + "step": 433 + }, + { + "epoch": 0.08298279158699809, + "grad_norm": 1.983446717262268, + "learning_rate": 3.85883185917528e-06, + "loss": 0.2313, + "step": 434 + }, + { + "epoch": 0.08317399617590822, + "grad_norm": 2.101289749145508, + "learning_rate": 3.860294237699313e-06, + "loss": 0.1372, + "step": 435 + }, + { + "epoch": 0.08336520076481836, + "grad_norm": 1.78105628490448, + "learning_rate": 3.861753258290372e-06, + "loss": 0.1718, + "step": 436 + }, + { + "epoch": 0.08355640535372849, + "grad_norm": 2.88455867767334, + "learning_rate": 3.863208936334194e-06, + "loss": 0.1571, + "step": 437 + }, + { + "epoch": 0.08374760994263862, + "grad_norm": 2.6141340732574463, + "learning_rate": 3.864661287111015e-06, + "loss": 0.2811, + "step": 438 + }, + { + "epoch": 0.08393881453154876, + "grad_norm": 2.1665003299713135, + "learning_rate": 3.866110325796531e-06, + "loss": 0.1615, + "step": 439 + }, + { + "epoch": 0.0841300191204589, + "grad_norm": 2.2355403900146484, + "learning_rate": 3.86755606746285e-06, + "loss": 0.2134, + "step": 440 + }, + { + "epoch": 0.08432122370936902, + "grad_norm": 25.508962631225586, + "learning_rate": 3.868998527079432e-06, + "loss": 0.5128, + "step": 441 + }, + { + "epoch": 0.08451242829827915, + "grad_norm": 3.2912681102752686, + "learning_rate": 3.870437719514017e-06, + "loss": 0.2624, + "step": 442 + }, + { + "epoch": 0.0847036328871893, + "grad_norm": 2.363339900970459, + "learning_rate": 3.871873659533549e-06, + "loss": 0.1605, + "step": 443 + }, + { + "epoch": 0.08489483747609942, + "grad_norm": 4.588367938995361, + "learning_rate": 3.873306361805082e-06, + "loss": 0.3243, + "step": 444 + }, + { + "epoch": 0.08508604206500955, + "grad_norm": 4.233880519866943, + "learning_rate": 3.874735840896679e-06, + "loss": 0.6475, + "step": 445 + }, + { + "epoch": 0.0852772466539197, + "grad_norm": 2.803955316543579, + "learning_rate": 3.876162111278302e-06, + "loss": 0.2044, + "step": 446 + }, + { + "epoch": 0.08546845124282983, + "grad_norm": 2.126622438430786, + "learning_rate": 3.877585187322691e-06, + "loss": 0.1412, + "step": 447 + }, + { + "epoch": 0.08565965583173996, + "grad_norm": 2.711604356765747, + "learning_rate": 3.879005083306229e-06, + "loss": 0.4514, + "step": 448 + }, + { + "epoch": 0.0858508604206501, + "grad_norm": 1.7695578336715698, + "learning_rate": 3.88042181340981e-06, + "loss": 0.0854, + "step": 449 + }, + { + "epoch": 0.08604206500956023, + "grad_norm": 3.0042665004730225, + "learning_rate": 3.881835391719679e-06, + "loss": 0.1316, + "step": 450 + }, + { + "epoch": 0.08623326959847036, + "grad_norm": 4.021963119506836, + "learning_rate": 3.883245832228278e-06, + "loss": 0.5587, + "step": 451 + }, + { + "epoch": 0.0864244741873805, + "grad_norm": 2.566257953643799, + "learning_rate": 3.884653148835076e-06, + "loss": 0.5083, + "step": 452 + }, + { + "epoch": 0.08661567877629063, + "grad_norm": 2.0462639331817627, + "learning_rate": 3.8860573553473914e-06, + "loss": 0.161, + "step": 453 + }, + { + "epoch": 0.08680688336520076, + "grad_norm": 2.4460551738739014, + "learning_rate": 3.887458465481203e-06, + "loss": 0.1692, + "step": 454 + }, + { + "epoch": 0.0869980879541109, + "grad_norm": 2.9268269538879395, + "learning_rate": 3.888856492861955e-06, + "loss": 0.3161, + "step": 455 + }, + { + "epoch": 0.08718929254302103, + "grad_norm": 2.3640105724334717, + "learning_rate": 3.890251451025352e-06, + "loss": 0.1184, + "step": 456 + }, + { + "epoch": 0.08738049713193116, + "grad_norm": 3.2469921112060547, + "learning_rate": 3.8916433534181465e-06, + "loss": 0.3819, + "step": 457 + }, + { + "epoch": 0.0875717017208413, + "grad_norm": 3.044752359390259, + "learning_rate": 3.893032213398913e-06, + "loss": 0.5221, + "step": 458 + }, + { + "epoch": 0.08776290630975143, + "grad_norm": 1.492080807685852, + "learning_rate": 3.894418044238823e-06, + "loss": 0.1747, + "step": 459 + }, + { + "epoch": 0.08795411089866156, + "grad_norm": 1.967621922492981, + "learning_rate": 3.8958008591224e-06, + "loss": 0.4279, + "step": 460 + }, + { + "epoch": 0.08814531548757171, + "grad_norm": 4.419346809387207, + "learning_rate": 3.897180671148275e-06, + "loss": 0.222, + "step": 461 + }, + { + "epoch": 0.08833652007648184, + "grad_norm": 3.055210590362549, + "learning_rate": 3.898557493329932e-06, + "loss": 0.203, + "step": 462 + }, + { + "epoch": 0.08852772466539197, + "grad_norm": 2.82248854637146, + "learning_rate": 3.899931338596441e-06, + "loss": 0.2677, + "step": 463 + }, + { + "epoch": 0.08871892925430211, + "grad_norm": 4.254226207733154, + "learning_rate": 3.901302219793192e-06, + "loss": 0.6093, + "step": 464 + }, + { + "epoch": 0.08891013384321224, + "grad_norm": 3.696042537689209, + "learning_rate": 3.902670149682608e-06, + "loss": 0.6688, + "step": 465 + }, + { + "epoch": 0.08910133843212237, + "grad_norm": 2.048828601837158, + "learning_rate": 3.904035140944864e-06, + "loss": 0.0932, + "step": 466 + }, + { + "epoch": 0.08929254302103251, + "grad_norm": 1.4884488582611084, + "learning_rate": 3.905397206178592e-06, + "loss": 0.1926, + "step": 467 + }, + { + "epoch": 0.08948374760994264, + "grad_norm": 4.860929012298584, + "learning_rate": 3.906756357901574e-06, + "loss": 0.2957, + "step": 468 + }, + { + "epoch": 0.08967495219885277, + "grad_norm": 3.0578386783599854, + "learning_rate": 3.908112608551437e-06, + "loss": 0.2792, + "step": 469 + }, + { + "epoch": 0.08986615678776291, + "grad_norm": 2.4391660690307617, + "learning_rate": 3.9094659704863346e-06, + "loss": 0.3934, + "step": 470 + }, + { + "epoch": 0.09005736137667304, + "grad_norm": 3.1707816123962402, + "learning_rate": 3.910816455985621e-06, + "loss": 0.4196, + "step": 471 + }, + { + "epoch": 0.09024856596558317, + "grad_norm": 3.7489700317382812, + "learning_rate": 3.912164077250522e-06, + "loss": 0.5584, + "step": 472 + }, + { + "epoch": 0.0904397705544933, + "grad_norm": 3.4774014949798584, + "learning_rate": 3.9135088464047945e-06, + "loss": 0.3015, + "step": 473 + }, + { + "epoch": 0.09063097514340344, + "grad_norm": 3.203939914703369, + "learning_rate": 3.9148507754953815e-06, + "loss": 0.1764, + "step": 474 + }, + { + "epoch": 0.09082217973231357, + "grad_norm": 5.508978366851807, + "learning_rate": 3.91618987649306e-06, + "loss": 0.1797, + "step": 475 + }, + { + "epoch": 0.0910133843212237, + "grad_norm": 3.0496325492858887, + "learning_rate": 3.917526161293082e-06, + "loss": 0.2904, + "step": 476 + }, + { + "epoch": 0.09120458891013385, + "grad_norm": 2.569993257522583, + "learning_rate": 3.9188596417158075e-06, + "loss": 0.2979, + "step": 477 + }, + { + "epoch": 0.09139579349904398, + "grad_norm": 2.830799102783203, + "learning_rate": 3.920190329507332e-06, + "loss": 0.1373, + "step": 478 + }, + { + "epoch": 0.0915869980879541, + "grad_norm": 1.5657826662063599, + "learning_rate": 3.921518236340108e-06, + "loss": 0.1339, + "step": 479 + }, + { + "epoch": 0.09177820267686425, + "grad_norm": 1.5757731199264526, + "learning_rate": 3.922843373813557e-06, + "loss": 0.1992, + "step": 480 + }, + { + "epoch": 0.09196940726577438, + "grad_norm": 3.687089443206787, + "learning_rate": 3.924165753454686e-06, + "loss": 0.1859, + "step": 481 + }, + { + "epoch": 0.09216061185468451, + "grad_norm": 4.091752052307129, + "learning_rate": 3.925485386718674e-06, + "loss": 0.2528, + "step": 482 + }, + { + "epoch": 0.09235181644359465, + "grad_norm": 1.9640034437179565, + "learning_rate": 3.926802284989481e-06, + "loss": 0.3403, + "step": 483 + }, + { + "epoch": 0.09254302103250478, + "grad_norm": 2.387704849243164, + "learning_rate": 3.928116459580432e-06, + "loss": 0.4675, + "step": 484 + }, + { + "epoch": 0.09273422562141491, + "grad_norm": 1.9029995203018188, + "learning_rate": 3.9294279217347985e-06, + "loss": 0.2266, + "step": 485 + }, + { + "epoch": 0.09292543021032505, + "grad_norm": 3.2065725326538086, + "learning_rate": 3.930736682626379e-06, + "loss": 0.25, + "step": 486 + }, + { + "epoch": 0.09311663479923518, + "grad_norm": 2.709934711456299, + "learning_rate": 3.932042753360069e-06, + "loss": 0.2074, + "step": 487 + }, + { + "epoch": 0.09330783938814531, + "grad_norm": 3.0203776359558105, + "learning_rate": 3.933346144972424e-06, + "loss": 0.3794, + "step": 488 + }, + { + "epoch": 0.09349904397705545, + "grad_norm": 3.989088296890259, + "learning_rate": 3.934646868432223e-06, + "loss": 0.3628, + "step": 489 + }, + { + "epoch": 0.09369024856596558, + "grad_norm": 1.333524465560913, + "learning_rate": 3.93594493464102e-06, + "loss": 0.2219, + "step": 490 + }, + { + "epoch": 0.09388145315487571, + "grad_norm": 2.184206247329712, + "learning_rate": 3.937240354433692e-06, + "loss": 0.4398, + "step": 491 + }, + { + "epoch": 0.09407265774378586, + "grad_norm": 2.465756893157959, + "learning_rate": 3.9385331385789845e-06, + "loss": 0.1526, + "step": 492 + }, + { + "epoch": 0.09426386233269599, + "grad_norm": 2.6147758960723877, + "learning_rate": 3.939823297780045e-06, + "loss": 0.1571, + "step": 493 + }, + { + "epoch": 0.09445506692160612, + "grad_norm": 2.757384777069092, + "learning_rate": 3.941110842674955e-06, + "loss": 0.178, + "step": 494 + }, + { + "epoch": 0.09464627151051626, + "grad_norm": 3.065559148788452, + "learning_rate": 3.94239578383726e-06, + "loss": 0.5259, + "step": 495 + }, + { + "epoch": 0.09483747609942639, + "grad_norm": 3.7982611656188965, + "learning_rate": 3.9436781317764865e-06, + "loss": 0.6654, + "step": 496 + }, + { + "epoch": 0.09502868068833652, + "grad_norm": 1.8163230419158936, + "learning_rate": 3.944957896938659e-06, + "loss": 0.2204, + "step": 497 + }, + { + "epoch": 0.09521988527724666, + "grad_norm": 2.6650335788726807, + "learning_rate": 3.946235089706813e-06, + "loss": 0.2265, + "step": 498 + }, + { + "epoch": 0.09541108986615679, + "grad_norm": 1.7298262119293213, + "learning_rate": 3.947509720401494e-06, + "loss": 0.0825, + "step": 499 + }, + { + "epoch": 0.09560229445506692, + "grad_norm": 2.199256181716919, + "learning_rate": 3.948781799281266e-06, + "loss": 0.0587, + "step": 500 + }, + { + "epoch": 0.09560229445506692, + "eval_runtime": 758.2728, + "eval_samples_per_second": 2.023, + "eval_steps_per_second": 0.253, + "step": 500 + }, + { + "epoch": 0.09579349904397706, + "grad_norm": 2.560211658477783, + "learning_rate": 3.950051336543202e-06, + "loss": 0.3781, + "step": 501 + }, + { + "epoch": 0.09598470363288719, + "grad_norm": 2.047006845474243, + "learning_rate": 3.951318342323376e-06, + "loss": 0.3203, + "step": 502 + }, + { + "epoch": 0.09617590822179732, + "grad_norm": 2.891329050064087, + "learning_rate": 3.952582826697346e-06, + "loss": 0.3811, + "step": 503 + }, + { + "epoch": 0.09636711281070745, + "grad_norm": 2.8886239528656006, + "learning_rate": 3.953844799680639e-06, + "loss": 0.269, + "step": 504 + }, + { + "epoch": 0.0965583173996176, + "grad_norm": 1.302022099494934, + "learning_rate": 3.955104271229223e-06, + "loss": 0.0563, + "step": 505 + }, + { + "epoch": 0.09674952198852772, + "grad_norm": 2.516767740249634, + "learning_rate": 3.9563612512399815e-06, + "loss": 0.1752, + "step": 506 + }, + { + "epoch": 0.09694072657743785, + "grad_norm": 5.090005874633789, + "learning_rate": 3.957615749551177e-06, + "loss": 0.3137, + "step": 507 + }, + { + "epoch": 0.097131931166348, + "grad_norm": 2.3271141052246094, + "learning_rate": 3.958867775942918e-06, + "loss": 0.2331, + "step": 508 + }, + { + "epoch": 0.09732313575525812, + "grad_norm": 5.051674842834473, + "learning_rate": 3.960117340137608e-06, + "loss": 0.2622, + "step": 509 + }, + { + "epoch": 0.09751434034416825, + "grad_norm": 2.13322114944458, + "learning_rate": 3.96136445180041e-06, + "loss": 0.1548, + "step": 510 + }, + { + "epoch": 0.0977055449330784, + "grad_norm": 2.8355138301849365, + "learning_rate": 3.962609120539684e-06, + "loss": 0.1487, + "step": 511 + }, + { + "epoch": 0.09789674952198853, + "grad_norm": 1.6841158866882324, + "learning_rate": 3.963851355907436e-06, + "loss": 0.1633, + "step": 512 + }, + { + "epoch": 0.09808795411089866, + "grad_norm": 3.454113483428955, + "learning_rate": 3.965091167399761e-06, + "loss": 0.2414, + "step": 513 + }, + { + "epoch": 0.0982791586998088, + "grad_norm": 3.103166103363037, + "learning_rate": 3.966328564457273e-06, + "loss": 0.2673, + "step": 514 + }, + { + "epoch": 0.09847036328871893, + "grad_norm": 3.4055354595184326, + "learning_rate": 3.967563556465537e-06, + "loss": 0.434, + "step": 515 + }, + { + "epoch": 0.09866156787762906, + "grad_norm": 2.7894411087036133, + "learning_rate": 3.968796152755501e-06, + "loss": 0.2902, + "step": 516 + }, + { + "epoch": 0.0988527724665392, + "grad_norm": 2.1280155181884766, + "learning_rate": 3.970026362603916e-06, + "loss": 0.2733, + "step": 517 + }, + { + "epoch": 0.09904397705544933, + "grad_norm": 3.4343643188476562, + "learning_rate": 3.97125419523375e-06, + "loss": 0.5206, + "step": 518 + }, + { + "epoch": 0.09923518164435946, + "grad_norm": 2.7457900047302246, + "learning_rate": 3.972479659814613e-06, + "loss": 0.2337, + "step": 519 + }, + { + "epoch": 0.0994263862332696, + "grad_norm": 3.7752184867858887, + "learning_rate": 3.973702765463161e-06, + "loss": 0.5257, + "step": 520 + }, + { + "epoch": 0.09961759082217973, + "grad_norm": 2.813133478164673, + "learning_rate": 3.974923521243501e-06, + "loss": 0.2109, + "step": 521 + }, + { + "epoch": 0.09980879541108986, + "grad_norm": 2.6638405323028564, + "learning_rate": 3.976141936167601e-06, + "loss": 0.2409, + "step": 522 + }, + { + "epoch": 0.1, + "grad_norm": 2.6184728145599365, + "learning_rate": 3.9773580191956855e-06, + "loss": 0.4386, + "step": 523 + }, + { + "epoch": 0.10019120458891013, + "grad_norm": 2.8022634983062744, + "learning_rate": 3.978571779236627e-06, + "loss": 0.2676, + "step": 524 + }, + { + "epoch": 0.10038240917782026, + "grad_norm": 3.0874242782592773, + "learning_rate": 3.979783225148348e-06, + "loss": 0.1031, + "step": 525 + }, + { + "epoch": 0.10057361376673041, + "grad_norm": 2.6263234615325928, + "learning_rate": 3.980992365738197e-06, + "loss": 0.3226, + "step": 526 + }, + { + "epoch": 0.10076481835564054, + "grad_norm": 3.469144821166992, + "learning_rate": 3.982199209763339e-06, + "loss": 0.1735, + "step": 527 + }, + { + "epoch": 0.10095602294455067, + "grad_norm": 3.713228940963745, + "learning_rate": 3.983403765931139e-06, + "loss": 0.2317, + "step": 528 + }, + { + "epoch": 0.10114722753346081, + "grad_norm": 4.205535411834717, + "learning_rate": 3.98460604289953e-06, + "loss": 0.1452, + "step": 529 + }, + { + "epoch": 0.10133843212237094, + "grad_norm": 2.1234936714172363, + "learning_rate": 3.985806049277395e-06, + "loss": 0.1145, + "step": 530 + }, + { + "epoch": 0.10152963671128107, + "grad_norm": 2.30922794342041, + "learning_rate": 3.987003793624931e-06, + "loss": 0.144, + "step": 531 + }, + { + "epoch": 0.10172084130019121, + "grad_norm": 1.6173641681671143, + "learning_rate": 3.98819928445402e-06, + "loss": 0.3568, + "step": 532 + }, + { + "epoch": 0.10191204588910134, + "grad_norm": 3.590470552444458, + "learning_rate": 3.989392530228588e-06, + "loss": 0.4316, + "step": 533 + }, + { + "epoch": 0.10210325047801147, + "grad_norm": 3.209644079208374, + "learning_rate": 3.990583539364967e-06, + "loss": 0.4416, + "step": 534 + }, + { + "epoch": 0.1022944550669216, + "grad_norm": 3.220745801925659, + "learning_rate": 3.99177232023225e-06, + "loss": 0.1935, + "step": 535 + }, + { + "epoch": 0.10248565965583174, + "grad_norm": 1.88006591796875, + "learning_rate": 3.992958881152644e-06, + "loss": 0.2317, + "step": 536 + }, + { + "epoch": 0.10267686424474187, + "grad_norm": 1.5669440031051636, + "learning_rate": 3.9941432304018205e-06, + "loss": 0.0725, + "step": 537 + }, + { + "epoch": 0.102868068833652, + "grad_norm": 2.0758368968963623, + "learning_rate": 3.995325376209261e-06, + "loss": 0.2837, + "step": 538 + }, + { + "epoch": 0.10305927342256214, + "grad_norm": 3.3589015007019043, + "learning_rate": 3.996505326758601e-06, + "loss": 0.5328, + "step": 539 + }, + { + "epoch": 0.10325047801147227, + "grad_norm": 2.2015810012817383, + "learning_rate": 3.997683090187967e-06, + "loss": 0.3724, + "step": 540 + }, + { + "epoch": 0.1034416826003824, + "grad_norm": 3.714031934738159, + "learning_rate": 3.998858674590317e-06, + "loss": 0.2856, + "step": 541 + }, + { + "epoch": 0.10363288718929255, + "grad_norm": 2.511425018310547, + "learning_rate": 4.000032088013775e-06, + "loss": 0.2015, + "step": 542 + }, + { + "epoch": 0.10382409177820268, + "grad_norm": 2.7374191284179688, + "learning_rate": 4.001203338461952e-06, + "loss": 0.1779, + "step": 543 + }, + { + "epoch": 0.1040152963671128, + "grad_norm": 3.1923158168792725, + "learning_rate": 4.002372433894288e-06, + "loss": 0.2595, + "step": 544 + }, + { + "epoch": 0.10420650095602295, + "grad_norm": 2.5342373847961426, + "learning_rate": 4.0035393822263685e-06, + "loss": 0.3994, + "step": 545 + }, + { + "epoch": 0.10439770554493308, + "grad_norm": 2.794058084487915, + "learning_rate": 4.0047041913302425e-06, + "loss": 0.5341, + "step": 546 + }, + { + "epoch": 0.10458891013384321, + "grad_norm": 2.975745439529419, + "learning_rate": 4.005866869034748e-06, + "loss": 0.3757, + "step": 547 + }, + { + "epoch": 0.10478011472275335, + "grad_norm": 1.5141457319259644, + "learning_rate": 4.0070274231258276e-06, + "loss": 0.2261, + "step": 548 + }, + { + "epoch": 0.10497131931166348, + "grad_norm": 1.4179555177688599, + "learning_rate": 4.008185861346832e-06, + "loss": 0.1121, + "step": 549 + }, + { + "epoch": 0.10516252390057361, + "grad_norm": 1.3781744241714478, + "learning_rate": 4.009342191398848e-06, + "loss": 0.0929, + "step": 550 + }, + { + "epoch": 0.10535372848948375, + "grad_norm": 3.2945396900177, + "learning_rate": 4.010496420940983e-06, + "loss": 0.5864, + "step": 551 + }, + { + "epoch": 0.10554493307839388, + "grad_norm": 2.5260989665985107, + "learning_rate": 4.0116485575906874e-06, + "loss": 0.3214, + "step": 552 + }, + { + "epoch": 0.10573613766730401, + "grad_norm": 2.2214319705963135, + "learning_rate": 4.01279860892405e-06, + "loss": 0.3037, + "step": 553 + }, + { + "epoch": 0.10592734225621415, + "grad_norm": 2.315720796585083, + "learning_rate": 4.013946582476095e-06, + "loss": 0.3353, + "step": 554 + }, + { + "epoch": 0.10611854684512428, + "grad_norm": 2.7136638164520264, + "learning_rate": 4.015092485741078e-06, + "loss": 0.2516, + "step": 555 + }, + { + "epoch": 0.10630975143403441, + "grad_norm": 2.8678059577941895, + "learning_rate": 4.0162363261727876e-06, + "loss": 0.1223, + "step": 556 + }, + { + "epoch": 0.10650095602294456, + "grad_norm": 3.997982978820801, + "learning_rate": 4.017378111184824e-06, + "loss": 0.2095, + "step": 557 + }, + { + "epoch": 0.10669216061185469, + "grad_norm": 2.3306539058685303, + "learning_rate": 4.018517848150896e-06, + "loss": 0.2294, + "step": 558 + }, + { + "epoch": 0.10688336520076482, + "grad_norm": 1.9543094635009766, + "learning_rate": 4.019655544405104e-06, + "loss": 0.1078, + "step": 559 + }, + { + "epoch": 0.10707456978967496, + "grad_norm": 2.487516403198242, + "learning_rate": 4.0207912072422265e-06, + "loss": 0.2011, + "step": 560 + }, + { + "epoch": 0.10726577437858509, + "grad_norm": 1.3109228610992432, + "learning_rate": 4.0219248439179914e-06, + "loss": 0.1198, + "step": 561 + }, + { + "epoch": 0.10745697896749522, + "grad_norm": 1.7776157855987549, + "learning_rate": 4.023056461649366e-06, + "loss": 0.0965, + "step": 562 + }, + { + "epoch": 0.10764818355640535, + "grad_norm": 2.4455111026763916, + "learning_rate": 4.024186067614824e-06, + "loss": 0.1997, + "step": 563 + }, + { + "epoch": 0.10783938814531549, + "grad_norm": 2.7624261379241943, + "learning_rate": 4.0253136689546225e-06, + "loss": 0.6754, + "step": 564 + }, + { + "epoch": 0.10803059273422562, + "grad_norm": 1.7886794805526733, + "learning_rate": 4.026439272771073e-06, + "loss": 0.292, + "step": 565 + }, + { + "epoch": 0.10822179732313575, + "grad_norm": 1.396551489830017, + "learning_rate": 4.027562886128808e-06, + "loss": 0.095, + "step": 566 + }, + { + "epoch": 0.10841300191204589, + "grad_norm": 2.8856143951416016, + "learning_rate": 4.028684516055048e-06, + "loss": 0.2296, + "step": 567 + }, + { + "epoch": 0.10860420650095602, + "grad_norm": 2.7726597785949707, + "learning_rate": 4.029804169539866e-06, + "loss": 0.1276, + "step": 568 + }, + { + "epoch": 0.10879541108986615, + "grad_norm": 2.4296844005584717, + "learning_rate": 4.030921853536447e-06, + "loss": 0.2504, + "step": 569 + }, + { + "epoch": 0.1089866156787763, + "grad_norm": 2.3982651233673096, + "learning_rate": 4.032037574961348e-06, + "loss": 0.487, + "step": 570 + }, + { + "epoch": 0.10917782026768642, + "grad_norm": 2.2958669662475586, + "learning_rate": 4.033151340694757e-06, + "loss": 0.352, + "step": 571 + }, + { + "epoch": 0.10936902485659655, + "grad_norm": 2.792046546936035, + "learning_rate": 4.0342631575807424e-06, + "loss": 0.3026, + "step": 572 + }, + { + "epoch": 0.1095602294455067, + "grad_norm": 2.0497055053710938, + "learning_rate": 4.035373032427511e-06, + "loss": 0.207, + "step": 573 + }, + { + "epoch": 0.10975143403441683, + "grad_norm": 1.5417126417160034, + "learning_rate": 4.036480972007654e-06, + "loss": 0.0919, + "step": 574 + }, + { + "epoch": 0.10994263862332695, + "grad_norm": 2.187443494796753, + "learning_rate": 4.037586983058396e-06, + "loss": 0.134, + "step": 575 + }, + { + "epoch": 0.1101338432122371, + "grad_norm": 3.1193389892578125, + "learning_rate": 4.038691072281846e-06, + "loss": 0.5783, + "step": 576 + }, + { + "epoch": 0.11032504780114723, + "grad_norm": 2.0621426105499268, + "learning_rate": 4.0397932463452296e-06, + "loss": 0.3393, + "step": 577 + }, + { + "epoch": 0.11051625239005736, + "grad_norm": 3.7791013717651367, + "learning_rate": 4.040893511881142e-06, + "loss": 0.469, + "step": 578 + }, + { + "epoch": 0.1107074569789675, + "grad_norm": 1.6285041570663452, + "learning_rate": 4.0419918754877816e-06, + "loss": 0.0855, + "step": 579 + }, + { + "epoch": 0.11089866156787763, + "grad_norm": 2.735530138015747, + "learning_rate": 4.043088343729189e-06, + "loss": 0.2253, + "step": 580 + }, + { + "epoch": 0.11108986615678776, + "grad_norm": 3.3225202560424805, + "learning_rate": 4.0441829231354805e-06, + "loss": 0.2688, + "step": 581 + }, + { + "epoch": 0.1112810707456979, + "grad_norm": 2.049799680709839, + "learning_rate": 4.0452756202030864e-06, + "loss": 0.2561, + "step": 582 + }, + { + "epoch": 0.11147227533460803, + "grad_norm": 2.2837252616882324, + "learning_rate": 4.046366441394976e-06, + "loss": 0.4603, + "step": 583 + }, + { + "epoch": 0.11166347992351816, + "grad_norm": 1.8498018980026245, + "learning_rate": 4.0474553931408905e-06, + "loss": 0.3059, + "step": 584 + }, + { + "epoch": 0.1118546845124283, + "grad_norm": 2.7567930221557617, + "learning_rate": 4.0485424818375704e-06, + "loss": 0.3465, + "step": 585 + }, + { + "epoch": 0.11204588910133843, + "grad_norm": 2.339862585067749, + "learning_rate": 4.04962771384898e-06, + "loss": 0.1157, + "step": 586 + }, + { + "epoch": 0.11223709369024856, + "grad_norm": 2.2230653762817383, + "learning_rate": 4.050711095506532e-06, + "loss": 0.1013, + "step": 587 + }, + { + "epoch": 0.1124282982791587, + "grad_norm": 4.637607574462891, + "learning_rate": 4.051792633109307e-06, + "loss": 0.3786, + "step": 588 + }, + { + "epoch": 0.11261950286806884, + "grad_norm": 2.5454115867614746, + "learning_rate": 4.052872332924277e-06, + "loss": 0.223, + "step": 589 + }, + { + "epoch": 0.11281070745697896, + "grad_norm": 2.093069076538086, + "learning_rate": 4.053950201186518e-06, + "loss": 0.1641, + "step": 590 + }, + { + "epoch": 0.11300191204588911, + "grad_norm": 2.5368142127990723, + "learning_rate": 4.055026244099433e-06, + "loss": 0.3767, + "step": 591 + }, + { + "epoch": 0.11319311663479924, + "grad_norm": 1.601399302482605, + "learning_rate": 4.056100467834957e-06, + "loss": 0.1006, + "step": 592 + }, + { + "epoch": 0.11338432122370937, + "grad_norm": 2.5372474193573, + "learning_rate": 4.05717287853378e-06, + "loss": 0.1315, + "step": 593 + }, + { + "epoch": 0.1135755258126195, + "grad_norm": 2.6447410583496094, + "learning_rate": 4.058243482305548e-06, + "loss": 0.1529, + "step": 594 + }, + { + "epoch": 0.11376673040152964, + "grad_norm": 3.427429676055908, + "learning_rate": 4.059312285229079e-06, + "loss": 0.6611, + "step": 595 + }, + { + "epoch": 0.11395793499043977, + "grad_norm": 2.3087265491485596, + "learning_rate": 4.060379293352566e-06, + "loss": 0.226, + "step": 596 + }, + { + "epoch": 0.1141491395793499, + "grad_norm": 2.3593530654907227, + "learning_rate": 4.061444512693784e-06, + "loss": 0.3704, + "step": 597 + }, + { + "epoch": 0.11434034416826004, + "grad_norm": 2.03262996673584, + "learning_rate": 4.062507949240291e-06, + "loss": 0.1638, + "step": 598 + }, + { + "epoch": 0.11453154875717017, + "grad_norm": 2.4928765296936035, + "learning_rate": 4.063569608949637e-06, + "loss": 0.1744, + "step": 599 + }, + { + "epoch": 0.1147227533460803, + "grad_norm": 6.897516250610352, + "learning_rate": 4.0646294977495545e-06, + "loss": 0.2404, + "step": 600 + }, + { + "epoch": 0.11491395793499044, + "grad_norm": 1.9926093816757202, + "learning_rate": 4.065687621538164e-06, + "loss": 0.2995, + "step": 601 + }, + { + "epoch": 0.11510516252390057, + "grad_norm": 3.1814188957214355, + "learning_rate": 4.066743986184169e-06, + "loss": 0.4707, + "step": 602 + }, + { + "epoch": 0.1152963671128107, + "grad_norm": 1.9630004167556763, + "learning_rate": 4.067798597527053e-06, + "loss": 0.2011, + "step": 603 + }, + { + "epoch": 0.11548757170172085, + "grad_norm": 3.1169941425323486, + "learning_rate": 4.068851461377267e-06, + "loss": 0.1188, + "step": 604 + }, + { + "epoch": 0.11567877629063097, + "grad_norm": 1.6685168743133545, + "learning_rate": 4.069902583516429e-06, + "loss": 0.0651, + "step": 605 + }, + { + "epoch": 0.1158699808795411, + "grad_norm": 3.466280937194824, + "learning_rate": 4.070951969697511e-06, + "loss": 0.1449, + "step": 606 + }, + { + "epoch": 0.11606118546845125, + "grad_norm": 2.8022851943969727, + "learning_rate": 4.071999625645027e-06, + "loss": 0.2323, + "step": 607 + }, + { + "epoch": 0.11625239005736138, + "grad_norm": 2.7856459617614746, + "learning_rate": 4.073045557055227e-06, + "loss": 0.4914, + "step": 608 + }, + { + "epoch": 0.1164435946462715, + "grad_norm": 2.7790656089782715, + "learning_rate": 4.07408976959627e-06, + "loss": 0.5342, + "step": 609 + }, + { + "epoch": 0.11663479923518165, + "grad_norm": 2.095447063446045, + "learning_rate": 4.075132268908421e-06, + "loss": 0.2162, + "step": 610 + }, + { + "epoch": 0.11682600382409178, + "grad_norm": 1.7271842956542969, + "learning_rate": 4.076173060604226e-06, + "loss": 0.1058, + "step": 611 + }, + { + "epoch": 0.11701720841300191, + "grad_norm": 3.3794445991516113, + "learning_rate": 4.077212150268698e-06, + "loss": 0.1215, + "step": 612 + }, + { + "epoch": 0.11720841300191205, + "grad_norm": 5.20923376083374, + "learning_rate": 4.078249543459495e-06, + "loss": 0.2695, + "step": 613 + }, + { + "epoch": 0.11739961759082218, + "grad_norm": 3.670212984085083, + "learning_rate": 4.079285245707096e-06, + "loss": 0.9118, + "step": 614 + }, + { + "epoch": 0.11759082217973231, + "grad_norm": 4.551514148712158, + "learning_rate": 4.080319262514981e-06, + "loss": 0.407, + "step": 615 + }, + { + "epoch": 0.11778202676864245, + "grad_norm": 2.7722604274749756, + "learning_rate": 4.081351599359807e-06, + "loss": 0.3355, + "step": 616 + }, + { + "epoch": 0.11797323135755258, + "grad_norm": 2.6713385581970215, + "learning_rate": 4.0823822616915795e-06, + "loss": 0.1726, + "step": 617 + }, + { + "epoch": 0.11816443594646271, + "grad_norm": 2.3525755405426025, + "learning_rate": 4.083411254933824e-06, + "loss": 0.1748, + "step": 618 + }, + { + "epoch": 0.11835564053537286, + "grad_norm": 2.2042713165283203, + "learning_rate": 4.084438584483764e-06, + "loss": 0.2662, + "step": 619 + }, + { + "epoch": 0.11854684512428298, + "grad_norm": 2.0608386993408203, + "learning_rate": 4.085464255712483e-06, + "loss": 0.2041, + "step": 620 + }, + { + "epoch": 0.11873804971319311, + "grad_norm": 2.0503997802734375, + "learning_rate": 4.0864882739650965e-06, + "loss": 0.2242, + "step": 621 + }, + { + "epoch": 0.11892925430210326, + "grad_norm": 1.8372803926467896, + "learning_rate": 4.087510644560921e-06, + "loss": 0.1795, + "step": 622 + }, + { + "epoch": 0.11912045889101339, + "grad_norm": 1.664029598236084, + "learning_rate": 4.088531372793636e-06, + "loss": 0.1379, + "step": 623 + }, + { + "epoch": 0.11931166347992352, + "grad_norm": 1.8293601274490356, + "learning_rate": 4.089550463931449e-06, + "loss": 0.1597, + "step": 624 + }, + { + "epoch": 0.11950286806883365, + "grad_norm": 3.36576771736145, + "learning_rate": 4.090567923217263e-06, + "loss": 0.1534, + "step": 625 + }, + { + "epoch": 0.11969407265774379, + "grad_norm": 2.3628504276275635, + "learning_rate": 4.091583755868833e-06, + "loss": 0.348, + "step": 626 + }, + { + "epoch": 0.11988527724665392, + "grad_norm": 2.525311231613159, + "learning_rate": 4.0925979670789294e-06, + "loss": 0.4548, + "step": 627 + }, + { + "epoch": 0.12007648183556405, + "grad_norm": 2.786006450653076, + "learning_rate": 4.093610562015496e-06, + "loss": 0.3034, + "step": 628 + }, + { + "epoch": 0.12026768642447419, + "grad_norm": 2.4381093978881836, + "learning_rate": 4.09462154582181e-06, + "loss": 0.3277, + "step": 629 + }, + { + "epoch": 0.12045889101338432, + "grad_norm": 1.9374213218688965, + "learning_rate": 4.095630923616636e-06, + "loss": 0.1188, + "step": 630 + }, + { + "epoch": 0.12065009560229445, + "grad_norm": 3.029554843902588, + "learning_rate": 4.096638700494381e-06, + "loss": 0.1719, + "step": 631 + }, + { + "epoch": 0.12084130019120459, + "grad_norm": 2.0337328910827637, + "learning_rate": 4.097644881525257e-06, + "loss": 0.3093, + "step": 632 + }, + { + "epoch": 0.12103250478011472, + "grad_norm": 2.4461922645568848, + "learning_rate": 4.098649471755419e-06, + "loss": 0.3777, + "step": 633 + }, + { + "epoch": 0.12122370936902485, + "grad_norm": 2.149134635925293, + "learning_rate": 4.099652476207133e-06, + "loss": 0.1888, + "step": 634 + }, + { + "epoch": 0.121414913957935, + "grad_norm": 3.174314022064209, + "learning_rate": 4.1006538998789146e-06, + "loss": 0.4976, + "step": 635 + }, + { + "epoch": 0.12160611854684512, + "grad_norm": 2.157924175262451, + "learning_rate": 4.101653747745683e-06, + "loss": 0.2177, + "step": 636 + }, + { + "epoch": 0.12179732313575525, + "grad_norm": 9.099021911621094, + "learning_rate": 4.102652024758911e-06, + "loss": 0.3181, + "step": 637 + }, + { + "epoch": 0.1219885277246654, + "grad_norm": 2.279358386993408, + "learning_rate": 4.10364873584677e-06, + "loss": 0.1611, + "step": 638 + }, + { + "epoch": 0.12217973231357553, + "grad_norm": 6.471635818481445, + "learning_rate": 4.104643885914275e-06, + "loss": 0.2495, + "step": 639 + }, + { + "epoch": 0.12237093690248566, + "grad_norm": 1.9951330423355103, + "learning_rate": 4.1056374798434325e-06, + "loss": 0.1462, + "step": 640 + }, + { + "epoch": 0.1225621414913958, + "grad_norm": 2.0457870960235596, + "learning_rate": 4.106629522493381e-06, + "loss": 0.2578, + "step": 641 + }, + { + "epoch": 0.12275334608030593, + "grad_norm": 2.4925954341888428, + "learning_rate": 4.107620018700538e-06, + "loss": 0.2238, + "step": 642 + }, + { + "epoch": 0.12294455066921606, + "grad_norm": 3.2228965759277344, + "learning_rate": 4.1086089732787325e-06, + "loss": 0.1979, + "step": 643 + }, + { + "epoch": 0.1231357552581262, + "grad_norm": 1.5814366340637207, + "learning_rate": 4.109596391019356e-06, + "loss": 0.097, + "step": 644 + }, + { + "epoch": 0.12332695984703633, + "grad_norm": 3.6417267322540283, + "learning_rate": 4.110582276691498e-06, + "loss": 0.354, + "step": 645 + }, + { + "epoch": 0.12351816443594646, + "grad_norm": 2.278165102005005, + "learning_rate": 4.11156663504208e-06, + "loss": 0.1463, + "step": 646 + }, + { + "epoch": 0.1237093690248566, + "grad_norm": 1.5671286582946777, + "learning_rate": 4.112549470795998e-06, + "loss": 0.1445, + "step": 647 + }, + { + "epoch": 0.12390057361376673, + "grad_norm": 3.1425745487213135, + "learning_rate": 4.113530788656255e-06, + "loss": 0.2828, + "step": 648 + }, + { + "epoch": 0.12409177820267686, + "grad_norm": 3.6428639888763428, + "learning_rate": 4.1145105933041e-06, + "loss": 0.1176, + "step": 649 + }, + { + "epoch": 0.124282982791587, + "grad_norm": 1.9820659160614014, + "learning_rate": 4.115488889399157e-06, + "loss": 0.1025, + "step": 650 + }, + { + "epoch": 0.12447418738049713, + "grad_norm": 2.4328808784484863, + "learning_rate": 4.116465681579564e-06, + "loss": 0.3629, + "step": 651 + }, + { + "epoch": 0.12466539196940726, + "grad_norm": 2.9326987266540527, + "learning_rate": 4.117440974462098e-06, + "loss": 0.1771, + "step": 652 + }, + { + "epoch": 0.1248565965583174, + "grad_norm": 2.000542640686035, + "learning_rate": 4.11841477264231e-06, + "loss": 0.1175, + "step": 653 + }, + { + "epoch": 0.12504780114722752, + "grad_norm": 2.4583964347839355, + "learning_rate": 4.119387080694656e-06, + "loss": 0.2045, + "step": 654 + }, + { + "epoch": 0.12523900573613767, + "grad_norm": 2.519341230392456, + "learning_rate": 4.1203579031726235e-06, + "loss": 0.2256, + "step": 655 + }, + { + "epoch": 0.1254302103250478, + "grad_norm": 1.732568621635437, + "learning_rate": 4.12132724460886e-06, + "loss": 0.0891, + "step": 656 + }, + { + "epoch": 0.12562141491395792, + "grad_norm": 3.676959276199341, + "learning_rate": 4.122295109515299e-06, + "loss": 0.5975, + "step": 657 + }, + { + "epoch": 0.12581261950286807, + "grad_norm": 3.2852377891540527, + "learning_rate": 4.123261502383291e-06, + "loss": 0.6327, + "step": 658 + }, + { + "epoch": 0.1260038240917782, + "grad_norm": 2.5524721145629883, + "learning_rate": 4.12422642768372e-06, + "loss": 0.2364, + "step": 659 + }, + { + "epoch": 0.12619502868068833, + "grad_norm": 2.0922858715057373, + "learning_rate": 4.125189889867135e-06, + "loss": 0.147, + "step": 660 + }, + { + "epoch": 0.12638623326959847, + "grad_norm": 2.2585864067077637, + "learning_rate": 4.126151893363871e-06, + "loss": 0.1199, + "step": 661 + }, + { + "epoch": 0.1265774378585086, + "grad_norm": 1.344876766204834, + "learning_rate": 4.12711244258417e-06, + "loss": 0.0836, + "step": 662 + }, + { + "epoch": 0.12676864244741873, + "grad_norm": 4.379039764404297, + "learning_rate": 4.128071541918302e-06, + "loss": 0.2658, + "step": 663 + }, + { + "epoch": 0.12695984703632887, + "grad_norm": 2.2609031200408936, + "learning_rate": 4.129029195736687e-06, + "loss": 0.3937, + "step": 664 + }, + { + "epoch": 0.12715105162523901, + "grad_norm": 2.849423408508301, + "learning_rate": 4.129985408390017e-06, + "loss": 0.1937, + "step": 665 + }, + { + "epoch": 0.12734225621414913, + "grad_norm": 3.229069471359253, + "learning_rate": 4.130940184209367e-06, + "loss": 0.4889, + "step": 666 + }, + { + "epoch": 0.12753346080305927, + "grad_norm": 3.168386459350586, + "learning_rate": 4.131893527506318e-06, + "loss": 0.3612, + "step": 667 + }, + { + "epoch": 0.12772466539196942, + "grad_norm": 1.0114667415618896, + "learning_rate": 4.132845442573078e-06, + "loss": 0.0365, + "step": 668 + }, + { + "epoch": 0.12791586998087953, + "grad_norm": 1.751471757888794, + "learning_rate": 4.133795933682587e-06, + "loss": 0.097, + "step": 669 + }, + { + "epoch": 0.12810707456978968, + "grad_norm": 7.675336837768555, + "learning_rate": 4.13474500508864e-06, + "loss": 0.6252, + "step": 670 + }, + { + "epoch": 0.12829827915869982, + "grad_norm": 2.195767402648926, + "learning_rate": 4.135692661026002e-06, + "loss": 0.2651, + "step": 671 + }, + { + "epoch": 0.12848948374760993, + "grad_norm": 2.2194595336914062, + "learning_rate": 4.136638905710514e-06, + "loss": 0.1452, + "step": 672 + }, + { + "epoch": 0.12868068833652008, + "grad_norm": 2.8059849739074707, + "learning_rate": 4.137583743339215e-06, + "loss": 0.1959, + "step": 673 + }, + { + "epoch": 0.12887189292543022, + "grad_norm": 2.4581644535064697, + "learning_rate": 4.138527178090445e-06, + "loss": 0.1656, + "step": 674 + }, + { + "epoch": 0.12906309751434034, + "grad_norm": 2.713122606277466, + "learning_rate": 4.1394692141239635e-06, + "loss": 0.172, + "step": 675 + }, + { + "epoch": 0.12925430210325048, + "grad_norm": 3.6402595043182373, + "learning_rate": 4.140409855581052e-06, + "loss": 0.5809, + "step": 676 + }, + { + "epoch": 0.12944550669216062, + "grad_norm": 3.1227595806121826, + "learning_rate": 4.14134910658463e-06, + "loss": 0.3983, + "step": 677 + }, + { + "epoch": 0.12963671128107074, + "grad_norm": 2.864481210708618, + "learning_rate": 4.14228697123936e-06, + "loss": 0.3984, + "step": 678 + }, + { + "epoch": 0.12982791586998088, + "grad_norm": 1.6978756189346313, + "learning_rate": 4.143223453631755e-06, + "loss": 0.074, + "step": 679 + }, + { + "epoch": 0.13001912045889102, + "grad_norm": 1.8725084066390991, + "learning_rate": 4.144158557830285e-06, + "loss": 0.1087, + "step": 680 + }, + { + "epoch": 0.13021032504780114, + "grad_norm": 3.0994725227355957, + "learning_rate": 4.1450922878854865e-06, + "loss": 0.3181, + "step": 681 + }, + { + "epoch": 0.13040152963671128, + "grad_norm": 2.7692768573760986, + "learning_rate": 4.146024647830064e-06, + "loss": 0.3154, + "step": 682 + }, + { + "epoch": 0.13059273422562143, + "grad_norm": 2.107736825942993, + "learning_rate": 4.146955641678995e-06, + "loss": 0.3758, + "step": 683 + }, + { + "epoch": 0.13078393881453154, + "grad_norm": 1.6708070039749146, + "learning_rate": 4.147885273429636e-06, + "loss": 0.1446, + "step": 684 + }, + { + "epoch": 0.13097514340344169, + "grad_norm": 2.343808174133301, + "learning_rate": 4.148813547061823e-06, + "loss": 0.2898, + "step": 685 + }, + { + "epoch": 0.13116634799235183, + "grad_norm": 3.282604932785034, + "learning_rate": 4.1497404665379755e-06, + "loss": 0.3803, + "step": 686 + }, + { + "epoch": 0.13135755258126194, + "grad_norm": 2.8634464740753174, + "learning_rate": 4.150666035803198e-06, + "loss": 0.1605, + "step": 687 + }, + { + "epoch": 0.1315487571701721, + "grad_norm": 2.119781255722046, + "learning_rate": 4.151590258785376e-06, + "loss": 0.1974, + "step": 688 + }, + { + "epoch": 0.13173996175908223, + "grad_norm": 2.3915913105010986, + "learning_rate": 4.1525131393952865e-06, + "loss": 0.2118, + "step": 689 + }, + { + "epoch": 0.13193116634799235, + "grad_norm": 2.3511242866516113, + "learning_rate": 4.153434681526684e-06, + "loss": 0.3729, + "step": 690 + }, + { + "epoch": 0.1321223709369025, + "grad_norm": 2.7544589042663574, + "learning_rate": 4.154354889056412e-06, + "loss": 0.3998, + "step": 691 + }, + { + "epoch": 0.13231357552581263, + "grad_norm": 2.3922064304351807, + "learning_rate": 4.155273765844489e-06, + "loss": 0.1746, + "step": 692 + }, + { + "epoch": 0.13250478011472275, + "grad_norm": 2.2704615592956543, + "learning_rate": 4.156191315734217e-06, + "loss": 0.1901, + "step": 693 + }, + { + "epoch": 0.1326959847036329, + "grad_norm": 1.6022690534591675, + "learning_rate": 4.157107542552267e-06, + "loss": 0.1309, + "step": 694 + }, + { + "epoch": 0.13288718929254303, + "grad_norm": 2.8731253147125244, + "learning_rate": 4.158022450108784e-06, + "loss": 0.3855, + "step": 695 + }, + { + "epoch": 0.13307839388145315, + "grad_norm": 2.325465679168701, + "learning_rate": 4.158936042197477e-06, + "loss": 0.1592, + "step": 696 + }, + { + "epoch": 0.1332695984703633, + "grad_norm": 3.571281909942627, + "learning_rate": 4.1598483225957125e-06, + "loss": 0.5375, + "step": 697 + }, + { + "epoch": 0.13346080305927344, + "grad_norm": 2.267007350921631, + "learning_rate": 4.1607592950646145e-06, + "loss": 0.1196, + "step": 698 + }, + { + "epoch": 0.13365200764818355, + "grad_norm": 2.3478941917419434, + "learning_rate": 4.161668963349148e-06, + "loss": 0.1641, + "step": 699 + }, + { + "epoch": 0.1338432122370937, + "grad_norm": 2.05322265625, + "learning_rate": 4.162577331178222e-06, + "loss": 0.131, + "step": 700 + }, + { + "epoch": 0.1340344168260038, + "grad_norm": 4.005086898803711, + "learning_rate": 4.163484402264773e-06, + "loss": 0.4016, + "step": 701 + }, + { + "epoch": 0.13422562141491395, + "grad_norm": 3.990110397338867, + "learning_rate": 4.1643901803058575e-06, + "loss": 0.4845, + "step": 702 + }, + { + "epoch": 0.1344168260038241, + "grad_norm": 2.7093546390533447, + "learning_rate": 4.165294668982747e-06, + "loss": 0.3717, + "step": 703 + }, + { + "epoch": 0.1346080305927342, + "grad_norm": 2.439100980758667, + "learning_rate": 4.1661978719610135e-06, + "loss": 0.3189, + "step": 704 + }, + { + "epoch": 0.13479923518164436, + "grad_norm": 2.0016961097717285, + "learning_rate": 4.167099792890619e-06, + "loss": 0.1995, + "step": 705 + }, + { + "epoch": 0.1349904397705545, + "grad_norm": 2.2571163177490234, + "learning_rate": 4.168000435406005e-06, + "loss": 0.1032, + "step": 706 + }, + { + "epoch": 0.13518164435946461, + "grad_norm": 2.6907873153686523, + "learning_rate": 4.168899803126179e-06, + "loss": 0.3268, + "step": 707 + }, + { + "epoch": 0.13537284894837476, + "grad_norm": 2.1795923709869385, + "learning_rate": 4.169797899654807e-06, + "loss": 0.2979, + "step": 708 + }, + { + "epoch": 0.1355640535372849, + "grad_norm": 1.801849126815796, + "learning_rate": 4.1706947285802905e-06, + "loss": 0.2202, + "step": 709 + }, + { + "epoch": 0.13575525812619502, + "grad_norm": 3.593956232070923, + "learning_rate": 4.1715902934758625e-06, + "loss": 0.4659, + "step": 710 + }, + { + "epoch": 0.13594646271510516, + "grad_norm": 2.4480674266815186, + "learning_rate": 4.172484597899666e-06, + "loss": 0.1247, + "step": 711 + }, + { + "epoch": 0.1361376673040153, + "grad_norm": 2.3875763416290283, + "learning_rate": 4.1733776453948425e-06, + "loss": 0.064, + "step": 712 + }, + { + "epoch": 0.13632887189292542, + "grad_norm": 2.754842758178711, + "learning_rate": 4.174269439489614e-06, + "loss": 0.1734, + "step": 713 + }, + { + "epoch": 0.13652007648183556, + "grad_norm": 3.3177061080932617, + "learning_rate": 4.175159983697367e-06, + "loss": 0.452, + "step": 714 + }, + { + "epoch": 0.1367112810707457, + "grad_norm": 1.6432844400405884, + "learning_rate": 4.176049281516738e-06, + "loss": 0.1835, + "step": 715 + }, + { + "epoch": 0.13690248565965582, + "grad_norm": 3.1820502281188965, + "learning_rate": 4.176937336431696e-06, + "loss": 0.3835, + "step": 716 + }, + { + "epoch": 0.13709369024856596, + "grad_norm": 2.664802312850952, + "learning_rate": 4.177824151911616e-06, + "loss": 0.1186, + "step": 717 + }, + { + "epoch": 0.1372848948374761, + "grad_norm": 2.4787399768829346, + "learning_rate": 4.178709731411373e-06, + "loss": 0.3117, + "step": 718 + }, + { + "epoch": 0.13747609942638622, + "grad_norm": 2.4012043476104736, + "learning_rate": 4.179594078371414e-06, + "loss": 0.0682, + "step": 719 + }, + { + "epoch": 0.13766730401529637, + "grad_norm": 2.465858221054077, + "learning_rate": 4.180477196217842e-06, + "loss": 0.335, + "step": 720 + }, + { + "epoch": 0.1378585086042065, + "grad_norm": 2.301172971725464, + "learning_rate": 4.181359088362493e-06, + "loss": 0.3335, + "step": 721 + }, + { + "epoch": 0.13804971319311662, + "grad_norm": 1.929276466369629, + "learning_rate": 4.182239758203017e-06, + "loss": 0.0915, + "step": 722 + }, + { + "epoch": 0.13824091778202677, + "grad_norm": 2.7580771446228027, + "learning_rate": 4.183119209122958e-06, + "loss": 0.3561, + "step": 723 + }, + { + "epoch": 0.1384321223709369, + "grad_norm": 1.6526697874069214, + "learning_rate": 4.183997444491827e-06, + "loss": 0.2137, + "step": 724 + }, + { + "epoch": 0.13862332695984703, + "grad_norm": 2.3815810680389404, + "learning_rate": 4.184874467665185e-06, + "loss": 0.1152, + "step": 725 + }, + { + "epoch": 0.13881453154875717, + "grad_norm": 2.2507095336914062, + "learning_rate": 4.185750281984717e-06, + "loss": 0.3442, + "step": 726 + }, + { + "epoch": 0.1390057361376673, + "grad_norm": 3.3361330032348633, + "learning_rate": 4.1866248907783065e-06, + "loss": 0.2196, + "step": 727 + }, + { + "epoch": 0.13919694072657743, + "grad_norm": 2.5841755867004395, + "learning_rate": 4.187498297360117e-06, + "loss": 0.2489, + "step": 728 + }, + { + "epoch": 0.13938814531548757, + "grad_norm": 2.5357816219329834, + "learning_rate": 4.188370505030664e-06, + "loss": 0.1614, + "step": 729 + }, + { + "epoch": 0.13957934990439771, + "grad_norm": 2.3405823707580566, + "learning_rate": 4.189241517076887e-06, + "loss": 0.1311, + "step": 730 + }, + { + "epoch": 0.13977055449330783, + "grad_norm": 2.4516336917877197, + "learning_rate": 4.190111336772229e-06, + "loss": 0.1509, + "step": 731 + }, + { + "epoch": 0.13996175908221797, + "grad_norm": 3.084892511367798, + "learning_rate": 4.190979967376708e-06, + "loss": 0.2953, + "step": 732 + }, + { + "epoch": 0.14015296367112812, + "grad_norm": 2.2274246215820312, + "learning_rate": 4.19184741213699e-06, + "loss": 0.4075, + "step": 733 + }, + { + "epoch": 0.14034416826003823, + "grad_norm": 3.0207102298736572, + "learning_rate": 4.192713674286461e-06, + "loss": 0.2865, + "step": 734 + }, + { + "epoch": 0.14053537284894838, + "grad_norm": 1.836266040802002, + "learning_rate": 4.193578757045304e-06, + "loss": 0.1552, + "step": 735 + }, + { + "epoch": 0.14072657743785852, + "grad_norm": 3.116115093231201, + "learning_rate": 4.194442663620563e-06, + "loss": 0.2001, + "step": 736 + }, + { + "epoch": 0.14091778202676863, + "grad_norm": 3.3889942169189453, + "learning_rate": 4.1953053972062215e-06, + "loss": 0.1891, + "step": 737 + }, + { + "epoch": 0.14110898661567878, + "grad_norm": 2.072631359100342, + "learning_rate": 4.196166960983269e-06, + "loss": 0.1645, + "step": 738 + }, + { + "epoch": 0.14130019120458892, + "grad_norm": 3.678781032562256, + "learning_rate": 4.197027358119775e-06, + "loss": 0.5332, + "step": 739 + }, + { + "epoch": 0.14149139579349904, + "grad_norm": 2.225128650665283, + "learning_rate": 4.1978865917709535e-06, + "loss": 0.1687, + "step": 740 + }, + { + "epoch": 0.14168260038240918, + "grad_norm": 5.0827131271362305, + "learning_rate": 4.198744665079239e-06, + "loss": 0.4454, + "step": 741 + }, + { + "epoch": 0.14187380497131932, + "grad_norm": 3.1659598350524902, + "learning_rate": 4.199601581174351e-06, + "loss": 0.4498, + "step": 742 + }, + { + "epoch": 0.14206500956022944, + "grad_norm": 2.8837954998016357, + "learning_rate": 4.200457343173363e-06, + "loss": 0.1854, + "step": 743 + }, + { + "epoch": 0.14225621414913958, + "grad_norm": 2.6731128692626953, + "learning_rate": 4.20131195418077e-06, + "loss": 0.2493, + "step": 744 + }, + { + "epoch": 0.14244741873804972, + "grad_norm": 2.6178650856018066, + "learning_rate": 4.2021654172885625e-06, + "loss": 0.4237, + "step": 745 + }, + { + "epoch": 0.14263862332695984, + "grad_norm": 2.731067657470703, + "learning_rate": 4.203017735576281e-06, + "loss": 0.3222, + "step": 746 + }, + { + "epoch": 0.14282982791586998, + "grad_norm": 2.4263575077056885, + "learning_rate": 4.203868912111097e-06, + "loss": 0.3628, + "step": 747 + }, + { + "epoch": 0.14302103250478013, + "grad_norm": 1.9994317293167114, + "learning_rate": 4.204718949947867e-06, + "loss": 0.1756, + "step": 748 + }, + { + "epoch": 0.14321223709369024, + "grad_norm": 3.0634548664093018, + "learning_rate": 4.205567852129206e-06, + "loss": 0.2029, + "step": 749 + }, + { + "epoch": 0.14340344168260039, + "grad_norm": 2.2715470790863037, + "learning_rate": 4.20641562168555e-06, + "loss": 0.2621, + "step": 750 + }, + { + "epoch": 0.14359464627151053, + "grad_norm": 4.5768842697143555, + "learning_rate": 4.207262261635224e-06, + "loss": 0.3184, + "step": 751 + }, + { + "epoch": 0.14378585086042064, + "grad_norm": 2.8422701358795166, + "learning_rate": 4.208107774984498e-06, + "loss": 0.3711, + "step": 752 + }, + { + "epoch": 0.1439770554493308, + "grad_norm": 1.645248293876648, + "learning_rate": 4.20895216472766e-06, + "loss": 0.1714, + "step": 753 + }, + { + "epoch": 0.14416826003824093, + "grad_norm": 2.0097696781158447, + "learning_rate": 4.20979543384708e-06, + "loss": 0.2713, + "step": 754 + }, + { + "epoch": 0.14435946462715105, + "grad_norm": 1.5780304670333862, + "learning_rate": 4.210637585313263e-06, + "loss": 0.1144, + "step": 755 + }, + { + "epoch": 0.1445506692160612, + "grad_norm": 2.3878743648529053, + "learning_rate": 4.2114786220849235e-06, + "loss": 0.1314, + "step": 756 + }, + { + "epoch": 0.14474187380497133, + "grad_norm": 2.5952939987182617, + "learning_rate": 4.212318547109039e-06, + "loss": 0.3168, + "step": 757 + }, + { + "epoch": 0.14493307839388145, + "grad_norm": 1.6484967470169067, + "learning_rate": 4.213157363320921e-06, + "loss": 0.195, + "step": 758 + }, + { + "epoch": 0.1451242829827916, + "grad_norm": 1.6973950862884521, + "learning_rate": 4.213995073644266e-06, + "loss": 0.0901, + "step": 759 + }, + { + "epoch": 0.14531548757170173, + "grad_norm": 2.196223258972168, + "learning_rate": 4.214831680991224e-06, + "loss": 0.1255, + "step": 760 + }, + { + "epoch": 0.14550669216061185, + "grad_norm": 1.6075400114059448, + "learning_rate": 4.2156671882624575e-06, + "loss": 0.2015, + "step": 761 + }, + { + "epoch": 0.145697896749522, + "grad_norm": 1.5651001930236816, + "learning_rate": 4.2165015983472025e-06, + "loss": 0.0658, + "step": 762 + }, + { + "epoch": 0.1458891013384321, + "grad_norm": 2.242047071456909, + "learning_rate": 4.217334914123325e-06, + "loss": 0.2173, + "step": 763 + }, + { + "epoch": 0.14608030592734225, + "grad_norm": 4.271725654602051, + "learning_rate": 4.218167138457386e-06, + "loss": 0.3126, + "step": 764 + }, + { + "epoch": 0.1462715105162524, + "grad_norm": 1.832981824874878, + "learning_rate": 4.218998274204695e-06, + "loss": 0.1925, + "step": 765 + }, + { + "epoch": 0.1464627151051625, + "grad_norm": 2.5712363719940186, + "learning_rate": 4.219828324209373e-06, + "loss": 0.2876, + "step": 766 + }, + { + "epoch": 0.14665391969407265, + "grad_norm": 1.7425990104675293, + "learning_rate": 4.2206572913044095e-06, + "loss": 0.1229, + "step": 767 + }, + { + "epoch": 0.1468451242829828, + "grad_norm": 2.2643516063690186, + "learning_rate": 4.22148517831172e-06, + "loss": 0.1774, + "step": 768 + }, + { + "epoch": 0.1470363288718929, + "grad_norm": 2.998650312423706, + "learning_rate": 4.222311988042205e-06, + "loss": 0.1941, + "step": 769 + }, + { + "epoch": 0.14722753346080306, + "grad_norm": 3.894683361053467, + "learning_rate": 4.223137723295803e-06, + "loss": 0.6313, + "step": 770 + }, + { + "epoch": 0.1474187380497132, + "grad_norm": 1.2565118074417114, + "learning_rate": 4.2239623868615564e-06, + "loss": 0.2454, + "step": 771 + }, + { + "epoch": 0.14760994263862331, + "grad_norm": 3.5236380100250244, + "learning_rate": 4.224785981517657e-06, + "loss": 0.3753, + "step": 772 + }, + { + "epoch": 0.14780114722753346, + "grad_norm": 2.124140501022339, + "learning_rate": 4.225608510031509e-06, + "loss": 0.241, + "step": 773 + }, + { + "epoch": 0.1479923518164436, + "grad_norm": 1.7390496730804443, + "learning_rate": 4.226429975159786e-06, + "loss": 0.0968, + "step": 774 + }, + { + "epoch": 0.14818355640535372, + "grad_norm": 3.544078826904297, + "learning_rate": 4.22725037964848e-06, + "loss": 0.1911, + "step": 775 + }, + { + "epoch": 0.14837476099426386, + "grad_norm": 4.519067287445068, + "learning_rate": 4.228069726232962e-06, + "loss": 0.8982, + "step": 776 + }, + { + "epoch": 0.148565965583174, + "grad_norm": 2.7855448722839355, + "learning_rate": 4.228888017638035e-06, + "loss": 0.3423, + "step": 777 + }, + { + "epoch": 0.14875717017208412, + "grad_norm": 3.33547043800354, + "learning_rate": 4.229705256577988e-06, + "loss": 0.2157, + "step": 778 + }, + { + "epoch": 0.14894837476099426, + "grad_norm": 2.3527631759643555, + "learning_rate": 4.2305214457566505e-06, + "loss": 0.1733, + "step": 779 + }, + { + "epoch": 0.1491395793499044, + "grad_norm": 3.6699483394622803, + "learning_rate": 4.231336587867446e-06, + "loss": 0.2432, + "step": 780 + }, + { + "epoch": 0.14933078393881452, + "grad_norm": 1.638396143913269, + "learning_rate": 4.232150685593444e-06, + "loss": 0.0568, + "step": 781 + }, + { + "epoch": 0.14952198852772466, + "grad_norm": 3.646986961364746, + "learning_rate": 4.232963741607416e-06, + "loss": 0.4378, + "step": 782 + }, + { + "epoch": 0.1497131931166348, + "grad_norm": 2.220862865447998, + "learning_rate": 4.233775758571886e-06, + "loss": 0.2993, + "step": 783 + }, + { + "epoch": 0.14990439770554492, + "grad_norm": 3.7839043140411377, + "learning_rate": 4.234586739139182e-06, + "loss": 0.47, + "step": 784 + }, + { + "epoch": 0.15009560229445507, + "grad_norm": 1.9454379081726074, + "learning_rate": 4.235396685951493e-06, + "loss": 0.1672, + "step": 785 + }, + { + "epoch": 0.1502868068833652, + "grad_norm": 2.693408727645874, + "learning_rate": 4.236205601640911e-06, + "loss": 0.3557, + "step": 786 + }, + { + "epoch": 0.15047801147227532, + "grad_norm": 2.1505634784698486, + "learning_rate": 4.237013488829494e-06, + "loss": 0.0909, + "step": 787 + }, + { + "epoch": 0.15066921606118547, + "grad_norm": 1.8870456218719482, + "learning_rate": 4.237820350129308e-06, + "loss": 0.1915, + "step": 788 + }, + { + "epoch": 0.1508604206500956, + "grad_norm": 3.362215518951416, + "learning_rate": 4.23862618814248e-06, + "loss": 0.549, + "step": 789 + }, + { + "epoch": 0.15105162523900573, + "grad_norm": 3.406558036804199, + "learning_rate": 4.239431005461254e-06, + "loss": 0.3686, + "step": 790 + }, + { + "epoch": 0.15124282982791587, + "grad_norm": 2.572495460510254, + "learning_rate": 4.240234804668029e-06, + "loss": 0.2132, + "step": 791 + }, + { + "epoch": 0.151434034416826, + "grad_norm": 3.3531334400177, + "learning_rate": 4.2410375883354235e-06, + "loss": 0.2027, + "step": 792 + }, + { + "epoch": 0.15162523900573613, + "grad_norm": 1.9929064512252808, + "learning_rate": 4.241839359026311e-06, + "loss": 0.0925, + "step": 793 + }, + { + "epoch": 0.15181644359464627, + "grad_norm": 4.2206010818481445, + "learning_rate": 4.242640119293882e-06, + "loss": 0.3048, + "step": 794 + }, + { + "epoch": 0.15200764818355642, + "grad_norm": 3.0227558612823486, + "learning_rate": 4.24343987168168e-06, + "loss": 0.372, + "step": 795 + }, + { + "epoch": 0.15219885277246653, + "grad_norm": 2.1193037033081055, + "learning_rate": 4.2442386187236584e-06, + "loss": 0.2249, + "step": 796 + }, + { + "epoch": 0.15239005736137667, + "grad_norm": 2.863941192626953, + "learning_rate": 4.2450363629442295e-06, + "loss": 0.3385, + "step": 797 + }, + { + "epoch": 0.15258126195028682, + "grad_norm": 1.4282262325286865, + "learning_rate": 4.245833106858305e-06, + "loss": 0.1862, + "step": 798 + }, + { + "epoch": 0.15277246653919693, + "grad_norm": 3.935479164123535, + "learning_rate": 4.2466288529713505e-06, + "loss": 0.2419, + "step": 799 + }, + { + "epoch": 0.15296367112810708, + "grad_norm": 1.3507992029190063, + "learning_rate": 4.247423603779429e-06, + "loss": 0.1504, + "step": 800 + }, + { + "epoch": 0.15315487571701722, + "grad_norm": 3.4311065673828125, + "learning_rate": 4.248217361769252e-06, + "loss": 0.4921, + "step": 801 + }, + { + "epoch": 0.15334608030592733, + "grad_norm": 1.9401636123657227, + "learning_rate": 4.2490101294182175e-06, + "loss": 0.3157, + "step": 802 + }, + { + "epoch": 0.15353728489483748, + "grad_norm": 2.6393685340881348, + "learning_rate": 4.249801909194468e-06, + "loss": 0.3513, + "step": 803 + }, + { + "epoch": 0.15372848948374762, + "grad_norm": 2.4486782550811768, + "learning_rate": 4.250592703556928e-06, + "loss": 0.1468, + "step": 804 + }, + { + "epoch": 0.15391969407265774, + "grad_norm": 1.5397764444351196, + "learning_rate": 4.251382514955353e-06, + "loss": 0.1181, + "step": 805 + }, + { + "epoch": 0.15411089866156788, + "grad_norm": 2.7118899822235107, + "learning_rate": 4.252171345830375e-06, + "loss": 0.1423, + "step": 806 + }, + { + "epoch": 0.15430210325047802, + "grad_norm": 2.3002309799194336, + "learning_rate": 4.252959198613545e-06, + "loss": 0.2643, + "step": 807 + }, + { + "epoch": 0.15449330783938814, + "grad_norm": 4.120339393615723, + "learning_rate": 4.253746075727386e-06, + "loss": 0.3035, + "step": 808 + }, + { + "epoch": 0.15468451242829828, + "grad_norm": 1.6706206798553467, + "learning_rate": 4.254531979585426e-06, + "loss": 0.1403, + "step": 809 + }, + { + "epoch": 0.15487571701720843, + "grad_norm": 1.846777319908142, + "learning_rate": 4.2553169125922515e-06, + "loss": 0.1069, + "step": 810 + }, + { + "epoch": 0.15506692160611854, + "grad_norm": 2.311530590057373, + "learning_rate": 4.256100877143548e-06, + "loss": 0.1826, + "step": 811 + }, + { + "epoch": 0.15525812619502868, + "grad_norm": 1.510845422744751, + "learning_rate": 4.256883875626145e-06, + "loss": 0.0622, + "step": 812 + }, + { + "epoch": 0.15544933078393883, + "grad_norm": 1.923916220664978, + "learning_rate": 4.2576659104180575e-06, + "loss": 0.2099, + "step": 813 + }, + { + "epoch": 0.15564053537284894, + "grad_norm": 3.773207902908325, + "learning_rate": 4.258446983888535e-06, + "loss": 0.5412, + "step": 814 + }, + { + "epoch": 0.15583173996175909, + "grad_norm": 2.4419755935668945, + "learning_rate": 4.259227098398094e-06, + "loss": 0.4255, + "step": 815 + }, + { + "epoch": 0.15602294455066923, + "grad_norm": 4.022161960601807, + "learning_rate": 4.260006256298574e-06, + "loss": 0.4799, + "step": 816 + }, + { + "epoch": 0.15621414913957934, + "grad_norm": 1.609773874282837, + "learning_rate": 4.260784459933167e-06, + "loss": 0.1932, + "step": 817 + }, + { + "epoch": 0.1564053537284895, + "grad_norm": 1.0987634658813477, + "learning_rate": 4.261561711636471e-06, + "loss": 0.051, + "step": 818 + }, + { + "epoch": 0.15659655831739963, + "grad_norm": 2.9759936332702637, + "learning_rate": 4.262338013734527e-06, + "loss": 0.178, + "step": 819 + }, + { + "epoch": 0.15678776290630975, + "grad_norm": 1.6191765069961548, + "learning_rate": 4.263113368544856e-06, + "loss": 0.1116, + "step": 820 + }, + { + "epoch": 0.1569789674952199, + "grad_norm": 2.261420965194702, + "learning_rate": 4.2638877783765115e-06, + "loss": 0.2843, + "step": 821 + }, + { + "epoch": 0.15717017208413, + "grad_norm": 1.752107858657837, + "learning_rate": 4.26466124553011e-06, + "loss": 0.2237, + "step": 822 + }, + { + "epoch": 0.15736137667304015, + "grad_norm": 2.615859270095825, + "learning_rate": 4.265433772297882e-06, + "loss": 0.1974, + "step": 823 + }, + { + "epoch": 0.1575525812619503, + "grad_norm": 1.7172234058380127, + "learning_rate": 4.2662053609637e-06, + "loss": 0.1186, + "step": 824 + }, + { + "epoch": 0.1577437858508604, + "grad_norm": 1.4844310283660889, + "learning_rate": 4.266976013803132e-06, + "loss": 0.0642, + "step": 825 + }, + { + "epoch": 0.15793499043977055, + "grad_norm": 2.956435441970825, + "learning_rate": 4.267745733083475e-06, + "loss": 0.2152, + "step": 826 + }, + { + "epoch": 0.1581261950286807, + "grad_norm": 3.139362335205078, + "learning_rate": 4.268514521063796e-06, + "loss": 0.3454, + "step": 827 + }, + { + "epoch": 0.1583173996175908, + "grad_norm": 3.0984015464782715, + "learning_rate": 4.269282379994972e-06, + "loss": 0.3052, + "step": 828 + }, + { + "epoch": 0.15850860420650095, + "grad_norm": 2.265077829360962, + "learning_rate": 4.27004931211973e-06, + "loss": 0.1137, + "step": 829 + }, + { + "epoch": 0.1586998087954111, + "grad_norm": 3.9762401580810547, + "learning_rate": 4.270815319672684e-06, + "loss": 0.3162, + "step": 830 + }, + { + "epoch": 0.1588910133843212, + "grad_norm": 2.093393325805664, + "learning_rate": 4.2715804048803785e-06, + "loss": 0.0985, + "step": 831 + }, + { + "epoch": 0.15908221797323135, + "grad_norm": 1.6958585977554321, + "learning_rate": 4.272344569961324e-06, + "loss": 0.1414, + "step": 832 + }, + { + "epoch": 0.1592734225621415, + "grad_norm": 1.754672884941101, + "learning_rate": 4.273107817126036e-06, + "loss": 0.1115, + "step": 833 + }, + { + "epoch": 0.1594646271510516, + "grad_norm": 2.3991124629974365, + "learning_rate": 4.273870148577072e-06, + "loss": 0.3149, + "step": 834 + }, + { + "epoch": 0.15965583173996176, + "grad_norm": 1.6448285579681396, + "learning_rate": 4.2746315665090745e-06, + "loss": 0.0904, + "step": 835 + }, + { + "epoch": 0.1598470363288719, + "grad_norm": 2.705641984939575, + "learning_rate": 4.275392073108804e-06, + "loss": 0.2193, + "step": 836 + }, + { + "epoch": 0.16003824091778202, + "grad_norm": 3.0307109355926514, + "learning_rate": 4.27615167055518e-06, + "loss": 0.1033, + "step": 837 + }, + { + "epoch": 0.16022944550669216, + "grad_norm": 3.0683274269104004, + "learning_rate": 4.276910361019314e-06, + "loss": 0.3433, + "step": 838 + }, + { + "epoch": 0.1604206500956023, + "grad_norm": 2.956444501876831, + "learning_rate": 4.277668146664553e-06, + "loss": 0.3323, + "step": 839 + }, + { + "epoch": 0.16061185468451242, + "grad_norm": 1.917574167251587, + "learning_rate": 4.278425029646511e-06, + "loss": 0.1704, + "step": 840 + }, + { + "epoch": 0.16080305927342256, + "grad_norm": 2.8657712936401367, + "learning_rate": 4.2791810121131075e-06, + "loss": 0.217, + "step": 841 + }, + { + "epoch": 0.1609942638623327, + "grad_norm": 2.1672983169555664, + "learning_rate": 4.279936096204607e-06, + "loss": 0.1102, + "step": 842 + }, + { + "epoch": 0.16118546845124282, + "grad_norm": 2.7191829681396484, + "learning_rate": 4.2806902840536505e-06, + "loss": 0.2781, + "step": 843 + }, + { + "epoch": 0.16137667304015296, + "grad_norm": 1.849647045135498, + "learning_rate": 4.2814435777852955e-06, + "loss": 0.2105, + "step": 844 + }, + { + "epoch": 0.1615678776290631, + "grad_norm": 2.904238700866699, + "learning_rate": 4.2821959795170494e-06, + "loss": 0.5104, + "step": 845 + }, + { + "epoch": 0.16175908221797322, + "grad_norm": 1.7526648044586182, + "learning_rate": 4.282947491358906e-06, + "loss": 0.1464, + "step": 846 + }, + { + "epoch": 0.16195028680688336, + "grad_norm": 3.317267417907715, + "learning_rate": 4.283698115413385e-06, + "loss": 0.4144, + "step": 847 + }, + { + "epoch": 0.1621414913957935, + "grad_norm": 1.7838001251220703, + "learning_rate": 4.284447853775558e-06, + "loss": 0.0672, + "step": 848 + }, + { + "epoch": 0.16233269598470362, + "grad_norm": 2.0435261726379395, + "learning_rate": 4.2851967085330925e-06, + "loss": 0.1293, + "step": 849 + }, + { + "epoch": 0.16252390057361377, + "grad_norm": 2.0593795776367188, + "learning_rate": 4.2859446817662824e-06, + "loss": 0.0951, + "step": 850 + }, + { + "epoch": 0.1627151051625239, + "grad_norm": 2.386403799057007, + "learning_rate": 4.286691775548084e-06, + "loss": 0.3164, + "step": 851 + }, + { + "epoch": 0.16290630975143403, + "grad_norm": 1.7959471940994263, + "learning_rate": 4.28743799194415e-06, + "loss": 0.1443, + "step": 852 + }, + { + "epoch": 0.16309751434034417, + "grad_norm": 4.4852681159973145, + "learning_rate": 4.288183333012865e-06, + "loss": 0.6776, + "step": 853 + }, + { + "epoch": 0.1632887189292543, + "grad_norm": 1.6934727430343628, + "learning_rate": 4.288927800805377e-06, + "loss": 0.1368, + "step": 854 + }, + { + "epoch": 0.16347992351816443, + "grad_norm": 3.038947582244873, + "learning_rate": 4.289671397365632e-06, + "loss": 0.3026, + "step": 855 + }, + { + "epoch": 0.16367112810707457, + "grad_norm": 2.0727596282958984, + "learning_rate": 4.290414124730413e-06, + "loss": 0.1034, + "step": 856 + }, + { + "epoch": 0.1638623326959847, + "grad_norm": 3.337085008621216, + "learning_rate": 4.291155984929362e-06, + "loss": 0.4055, + "step": 857 + }, + { + "epoch": 0.16405353728489483, + "grad_norm": 2.652127504348755, + "learning_rate": 4.291896979985027e-06, + "loss": 0.3585, + "step": 858 + }, + { + "epoch": 0.16424474187380497, + "grad_norm": 3.1562952995300293, + "learning_rate": 4.292637111912882e-06, + "loss": 0.446, + "step": 859 + }, + { + "epoch": 0.16443594646271512, + "grad_norm": 2.3302462100982666, + "learning_rate": 4.293376382721373e-06, + "loss": 0.2846, + "step": 860 + }, + { + "epoch": 0.16462715105162523, + "grad_norm": 1.904043197631836, + "learning_rate": 4.294114794411938e-06, + "loss": 0.1925, + "step": 861 + }, + { + "epoch": 0.16481835564053537, + "grad_norm": 2.2849550247192383, + "learning_rate": 4.294852348979047e-06, + "loss": 0.1391, + "step": 862 + }, + { + "epoch": 0.16500956022944552, + "grad_norm": 2.6952767372131348, + "learning_rate": 4.295589048410235e-06, + "loss": 0.302, + "step": 863 + }, + { + "epoch": 0.16520076481835563, + "grad_norm": 3.0628738403320312, + "learning_rate": 4.2963248946861294e-06, + "loss": 0.4776, + "step": 864 + }, + { + "epoch": 0.16539196940726578, + "grad_norm": 2.940101385116577, + "learning_rate": 4.297059889780485e-06, + "loss": 0.2961, + "step": 865 + }, + { + "epoch": 0.16558317399617592, + "grad_norm": 2.9635932445526123, + "learning_rate": 4.297794035660217e-06, + "loss": 0.4084, + "step": 866 + }, + { + "epoch": 0.16577437858508604, + "grad_norm": 1.4240761995315552, + "learning_rate": 4.298527334285426e-06, + "loss": 0.0877, + "step": 867 + }, + { + "epoch": 0.16596558317399618, + "grad_norm": 1.576123595237732, + "learning_rate": 4.29925978760944e-06, + "loss": 0.1101, + "step": 868 + }, + { + "epoch": 0.16615678776290632, + "grad_norm": 2.8661000728607178, + "learning_rate": 4.299991397578835e-06, + "loss": 0.1854, + "step": 869 + }, + { + "epoch": 0.16634799235181644, + "grad_norm": 2.202101469039917, + "learning_rate": 4.300722166133473e-06, + "loss": 0.1717, + "step": 870 + }, + { + "epoch": 0.16653919694072658, + "grad_norm": 1.9804801940917969, + "learning_rate": 4.301452095206531e-06, + "loss": 0.419, + "step": 871 + }, + { + "epoch": 0.16673040152963672, + "grad_norm": 1.402471899986267, + "learning_rate": 4.302181186724532e-06, + "loss": 0.1489, + "step": 872 + }, + { + "epoch": 0.16692160611854684, + "grad_norm": 2.836812973022461, + "learning_rate": 4.302909442607371e-06, + "loss": 0.2623, + "step": 873 + }, + { + "epoch": 0.16711281070745698, + "grad_norm": 1.8831483125686646, + "learning_rate": 4.303636864768353e-06, + "loss": 0.1072, + "step": 874 + }, + { + "epoch": 0.16730401529636713, + "grad_norm": 1.8472790718078613, + "learning_rate": 4.304363455114219e-06, + "loss": 0.1246, + "step": 875 + }, + { + "epoch": 0.16749521988527724, + "grad_norm": 2.792137622833252, + "learning_rate": 4.305089215545175e-06, + "loss": 0.3111, + "step": 876 + }, + { + "epoch": 0.16768642447418738, + "grad_norm": 2.943768262863159, + "learning_rate": 4.305814147954922e-06, + "loss": 0.6068, + "step": 877 + }, + { + "epoch": 0.16787762906309753, + "grad_norm": 2.1078765392303467, + "learning_rate": 4.306538254230691e-06, + "loss": 0.1484, + "step": 878 + }, + { + "epoch": 0.16806883365200764, + "grad_norm": 3.882357597351074, + "learning_rate": 4.307261536253264e-06, + "loss": 0.2091, + "step": 879 + }, + { + "epoch": 0.1682600382409178, + "grad_norm": 1.381016492843628, + "learning_rate": 4.30798399589701e-06, + "loss": 0.0775, + "step": 880 + }, + { + "epoch": 0.16845124282982793, + "grad_norm": 2.2446751594543457, + "learning_rate": 4.308705635029911e-06, + "loss": 0.0931, + "step": 881 + }, + { + "epoch": 0.16864244741873805, + "grad_norm": 3.000154733657837, + "learning_rate": 4.309426455513591e-06, + "loss": 0.3437, + "step": 882 + }, + { + "epoch": 0.1688336520076482, + "grad_norm": 2.4180312156677246, + "learning_rate": 4.310146459203349e-06, + "loss": 0.1738, + "step": 883 + }, + { + "epoch": 0.1690248565965583, + "grad_norm": 1.9331579208374023, + "learning_rate": 4.310865647948177e-06, + "loss": 0.2825, + "step": 884 + }, + { + "epoch": 0.16921606118546845, + "grad_norm": 3.176043748855591, + "learning_rate": 4.311584023590803e-06, + "loss": 0.435, + "step": 885 + }, + { + "epoch": 0.1694072657743786, + "grad_norm": 2.9748902320861816, + "learning_rate": 4.312301587967709e-06, + "loss": 0.442, + "step": 886 + }, + { + "epoch": 0.1695984703632887, + "grad_norm": 1.8928931951522827, + "learning_rate": 4.313018342909162e-06, + "loss": 0.0825, + "step": 887 + }, + { + "epoch": 0.16978967495219885, + "grad_norm": 3.570660352706909, + "learning_rate": 4.3137342902392415e-06, + "loss": 0.4021, + "step": 888 + }, + { + "epoch": 0.169980879541109, + "grad_norm": 1.958633542060852, + "learning_rate": 4.31444943177587e-06, + "loss": 0.2978, + "step": 889 + }, + { + "epoch": 0.1701720841300191, + "grad_norm": 2.055731773376465, + "learning_rate": 4.315163769330838e-06, + "loss": 0.2606, + "step": 890 + }, + { + "epoch": 0.17036328871892925, + "grad_norm": 2.733567714691162, + "learning_rate": 4.3158773047098325e-06, + "loss": 0.3208, + "step": 891 + }, + { + "epoch": 0.1705544933078394, + "grad_norm": 2.0781280994415283, + "learning_rate": 4.3165900397124626e-06, + "loss": 0.2838, + "step": 892 + }, + { + "epoch": 0.1707456978967495, + "grad_norm": 4.172160625457764, + "learning_rate": 4.317301976132288e-06, + "loss": 0.1485, + "step": 893 + }, + { + "epoch": 0.17093690248565965, + "grad_norm": 2.199040174484253, + "learning_rate": 4.31801311575685e-06, + "loss": 0.1481, + "step": 894 + }, + { + "epoch": 0.1711281070745698, + "grad_norm": 3.143498420715332, + "learning_rate": 4.318723460367692e-06, + "loss": 0.5856, + "step": 895 + }, + { + "epoch": 0.1713193116634799, + "grad_norm": 2.4403462409973145, + "learning_rate": 4.319433011740389e-06, + "loss": 0.4743, + "step": 896 + }, + { + "epoch": 0.17151051625239006, + "grad_norm": 1.9097824096679688, + "learning_rate": 4.3201417716445755e-06, + "loss": 0.1086, + "step": 897 + }, + { + "epoch": 0.1717017208413002, + "grad_norm": 2.7453420162200928, + "learning_rate": 4.32084974184397e-06, + "loss": 0.1981, + "step": 898 + }, + { + "epoch": 0.1718929254302103, + "grad_norm": 1.3835936784744263, + "learning_rate": 4.321556924096402e-06, + "loss": 0.1109, + "step": 899 + }, + { + "epoch": 0.17208413001912046, + "grad_norm": 1.855629563331604, + "learning_rate": 4.322263320153839e-06, + "loss": 0.0757, + "step": 900 + }, + { + "epoch": 0.1722753346080306, + "grad_norm": 2.9253125190734863, + "learning_rate": 4.322968931762411e-06, + "loss": 0.2818, + "step": 901 + }, + { + "epoch": 0.17246653919694072, + "grad_norm": 2.60461163520813, + "learning_rate": 4.323673760662438e-06, + "loss": 0.2558, + "step": 902 + }, + { + "epoch": 0.17265774378585086, + "grad_norm": 1.764817476272583, + "learning_rate": 4.324377808588454e-06, + "loss": 0.1228, + "step": 903 + }, + { + "epoch": 0.172848948374761, + "grad_norm": 2.949141502380371, + "learning_rate": 4.3250810772692355e-06, + "loss": 0.3251, + "step": 904 + }, + { + "epoch": 0.17304015296367112, + "grad_norm": 2.435056686401367, + "learning_rate": 4.3257835684278235e-06, + "loss": 0.3511, + "step": 905 + }, + { + "epoch": 0.17323135755258126, + "grad_norm": 2.2405662536621094, + "learning_rate": 4.3264852837815515e-06, + "loss": 0.0803, + "step": 906 + }, + { + "epoch": 0.1734225621414914, + "grad_norm": 2.5514614582061768, + "learning_rate": 4.327186225042066e-06, + "loss": 0.3172, + "step": 907 + }, + { + "epoch": 0.17361376673040152, + "grad_norm": 2.14652419090271, + "learning_rate": 4.327886393915363e-06, + "loss": 0.1575, + "step": 908 + }, + { + "epoch": 0.17380497131931166, + "grad_norm": 1.3867746591567993, + "learning_rate": 4.328585792101795e-06, + "loss": 0.1291, + "step": 909 + }, + { + "epoch": 0.1739961759082218, + "grad_norm": 3.6421027183532715, + "learning_rate": 4.329284421296114e-06, + "loss": 0.5589, + "step": 910 + }, + { + "epoch": 0.17418738049713192, + "grad_norm": 2.516690731048584, + "learning_rate": 4.329982283187484e-06, + "loss": 0.1707, + "step": 911 + }, + { + "epoch": 0.17437858508604206, + "grad_norm": 3.2310264110565186, + "learning_rate": 4.330679379459511e-06, + "loss": 0.2891, + "step": 912 + }, + { + "epoch": 0.1745697896749522, + "grad_norm": 1.8623366355895996, + "learning_rate": 4.331375711790265e-06, + "loss": 0.1513, + "step": 913 + }, + { + "epoch": 0.17476099426386232, + "grad_norm": 3.0351459980010986, + "learning_rate": 4.332071281852306e-06, + "loss": 0.3003, + "step": 914 + }, + { + "epoch": 0.17495219885277247, + "grad_norm": 3.0663366317749023, + "learning_rate": 4.332766091312704e-06, + "loss": 0.3235, + "step": 915 + }, + { + "epoch": 0.1751434034416826, + "grad_norm": 1.8554519414901733, + "learning_rate": 4.333460141833072e-06, + "loss": 0.1387, + "step": 916 + }, + { + "epoch": 0.17533460803059273, + "grad_norm": 3.4480252265930176, + "learning_rate": 4.33415343506958e-06, + "loss": 0.2172, + "step": 917 + }, + { + "epoch": 0.17552581261950287, + "grad_norm": 2.396263837814331, + "learning_rate": 4.334845972672983e-06, + "loss": 0.2369, + "step": 918 + }, + { + "epoch": 0.175717017208413, + "grad_norm": 2.6219048500061035, + "learning_rate": 4.335537756288644e-06, + "loss": 0.2771, + "step": 919 + }, + { + "epoch": 0.17590822179732313, + "grad_norm": 2.162111759185791, + "learning_rate": 4.3362287875565595e-06, + "loss": 0.3263, + "step": 920 + }, + { + "epoch": 0.17609942638623327, + "grad_norm": 2.3528635501861572, + "learning_rate": 4.33691906811138e-06, + "loss": 0.1726, + "step": 921 + }, + { + "epoch": 0.17629063097514341, + "grad_norm": 3.782550096511841, + "learning_rate": 4.337608599582434e-06, + "loss": 0.3164, + "step": 922 + }, + { + "epoch": 0.17648183556405353, + "grad_norm": 2.536262035369873, + "learning_rate": 4.338297383593754e-06, + "loss": 0.3199, + "step": 923 + }, + { + "epoch": 0.17667304015296367, + "grad_norm": 4.701323986053467, + "learning_rate": 4.338985421764091e-06, + "loss": 0.258, + "step": 924 + }, + { + "epoch": 0.17686424474187382, + "grad_norm": 2.707467555999756, + "learning_rate": 4.33967271570695e-06, + "loss": 0.105, + "step": 925 + }, + { + "epoch": 0.17705544933078393, + "grad_norm": 3.847241163253784, + "learning_rate": 4.340359267030601e-06, + "loss": 0.6552, + "step": 926 + }, + { + "epoch": 0.17724665391969407, + "grad_norm": 1.8708347082138062, + "learning_rate": 4.341045077338109e-06, + "loss": 0.2187, + "step": 927 + }, + { + "epoch": 0.17743785850860422, + "grad_norm": 2.500690460205078, + "learning_rate": 4.341730148227351e-06, + "loss": 0.1601, + "step": 928 + }, + { + "epoch": 0.17762906309751433, + "grad_norm": 1.6813768148422241, + "learning_rate": 4.342414481291046e-06, + "loss": 0.0899, + "step": 929 + }, + { + "epoch": 0.17782026768642448, + "grad_norm": 1.971083641052246, + "learning_rate": 4.343098078116767e-06, + "loss": 0.1689, + "step": 930 + }, + { + "epoch": 0.17801147227533462, + "grad_norm": 1.656010389328003, + "learning_rate": 4.343780940286974e-06, + "loss": 0.0773, + "step": 931 + }, + { + "epoch": 0.17820267686424474, + "grad_norm": 3.547978639602661, + "learning_rate": 4.344463069379024e-06, + "loss": 0.4317, + "step": 932 + }, + { + "epoch": 0.17839388145315488, + "grad_norm": 1.7893317937850952, + "learning_rate": 4.345144466965205e-06, + "loss": 0.1723, + "step": 933 + }, + { + "epoch": 0.17858508604206502, + "grad_norm": 1.7867648601531982, + "learning_rate": 4.345825134612752e-06, + "loss": 0.1345, + "step": 934 + }, + { + "epoch": 0.17877629063097514, + "grad_norm": 3.299664258956909, + "learning_rate": 4.3465050738838635e-06, + "loss": 0.4693, + "step": 935 + }, + { + "epoch": 0.17896749521988528, + "grad_norm": 2.6684377193450928, + "learning_rate": 4.347184286335733e-06, + "loss": 0.3825, + "step": 936 + }, + { + "epoch": 0.17915869980879542, + "grad_norm": 1.9024335145950317, + "learning_rate": 4.347862773520565e-06, + "loss": 0.1348, + "step": 937 + }, + { + "epoch": 0.17934990439770554, + "grad_norm": 2.407212972640991, + "learning_rate": 4.348540536985596e-06, + "loss": 0.3481, + "step": 938 + }, + { + "epoch": 0.17954110898661568, + "grad_norm": 2.431903600692749, + "learning_rate": 4.349217578273117e-06, + "loss": 0.326, + "step": 939 + }, + { + "epoch": 0.17973231357552583, + "grad_norm": 2.8086395263671875, + "learning_rate": 4.349893898920495e-06, + "loss": 0.3743, + "step": 940 + }, + { + "epoch": 0.17992351816443594, + "grad_norm": 1.9264482259750366, + "learning_rate": 4.350569500460188e-06, + "loss": 0.2072, + "step": 941 + }, + { + "epoch": 0.18011472275334608, + "grad_norm": 1.8798692226409912, + "learning_rate": 4.35124438441978e-06, + "loss": 0.0674, + "step": 942 + }, + { + "epoch": 0.1803059273422562, + "grad_norm": 1.4653009176254272, + "learning_rate": 4.351918552321986e-06, + "loss": 0.0763, + "step": 943 + }, + { + "epoch": 0.18049713193116634, + "grad_norm": 2.255692958831787, + "learning_rate": 4.3525920056846815e-06, + "loss": 0.124, + "step": 944 + }, + { + "epoch": 0.1806883365200765, + "grad_norm": 2.478980779647827, + "learning_rate": 4.35326474602092e-06, + "loss": 0.3314, + "step": 945 + }, + { + "epoch": 0.1808795411089866, + "grad_norm": 1.5225646495819092, + "learning_rate": 4.353936774838954e-06, + "loss": 0.353, + "step": 946 + }, + { + "epoch": 0.18107074569789675, + "grad_norm": 1.214327096939087, + "learning_rate": 4.354608093642257e-06, + "loss": 0.0809, + "step": 947 + }, + { + "epoch": 0.1812619502868069, + "grad_norm": 2.2034029960632324, + "learning_rate": 4.355278703929541e-06, + "loss": 0.2837, + "step": 948 + }, + { + "epoch": 0.181453154875717, + "grad_norm": 3.38645076751709, + "learning_rate": 4.3559486071947785e-06, + "loss": 0.378, + "step": 949 + }, + { + "epoch": 0.18164435946462715, + "grad_norm": 1.7553558349609375, + "learning_rate": 4.3566178049272204e-06, + "loss": 0.1021, + "step": 950 + }, + { + "epoch": 0.1818355640535373, + "grad_norm": 2.1390013694763184, + "learning_rate": 4.357286298611418e-06, + "loss": 0.2978, + "step": 951 + }, + { + "epoch": 0.1820267686424474, + "grad_norm": 2.9334323406219482, + "learning_rate": 4.3579540897272416e-06, + "loss": 0.5175, + "step": 952 + }, + { + "epoch": 0.18221797323135755, + "grad_norm": 1.2445647716522217, + "learning_rate": 4.358621179749902e-06, + "loss": 0.108, + "step": 953 + }, + { + "epoch": 0.1824091778202677, + "grad_norm": 1.3846619129180908, + "learning_rate": 4.359287570149967e-06, + "loss": 0.1046, + "step": 954 + }, + { + "epoch": 0.1826003824091778, + "grad_norm": 2.8698277473449707, + "learning_rate": 4.359953262393382e-06, + "loss": 0.1461, + "step": 955 + }, + { + "epoch": 0.18279158699808795, + "grad_norm": 1.6638492345809937, + "learning_rate": 4.3606182579414905e-06, + "loss": 0.1026, + "step": 956 + }, + { + "epoch": 0.1829827915869981, + "grad_norm": 2.5066792964935303, + "learning_rate": 4.361282558251054e-06, + "loss": 0.2583, + "step": 957 + }, + { + "epoch": 0.1831739961759082, + "grad_norm": 3.5391042232513428, + "learning_rate": 4.361946164774267e-06, + "loss": 0.5604, + "step": 958 + }, + { + "epoch": 0.18336520076481835, + "grad_norm": 2.347865104675293, + "learning_rate": 4.36260907895878e-06, + "loss": 0.2277, + "step": 959 + }, + { + "epoch": 0.1835564053537285, + "grad_norm": 2.174129009246826, + "learning_rate": 4.363271302247718e-06, + "loss": 0.1745, + "step": 960 + }, + { + "epoch": 0.1837476099426386, + "grad_norm": 1.4797195196151733, + "learning_rate": 4.3639328360796964e-06, + "loss": 0.1074, + "step": 961 + }, + { + "epoch": 0.18393881453154876, + "grad_norm": 2.326012134552002, + "learning_rate": 4.364593681888845e-06, + "loss": 0.1232, + "step": 962 + }, + { + "epoch": 0.1841300191204589, + "grad_norm": 2.7982571125030518, + "learning_rate": 4.365253841104822e-06, + "loss": 0.5077, + "step": 963 + }, + { + "epoch": 0.18432122370936901, + "grad_norm": 3.0020434856414795, + "learning_rate": 4.365913315152833e-06, + "loss": 0.5043, + "step": 964 + }, + { + "epoch": 0.18451242829827916, + "grad_norm": 4.000023365020752, + "learning_rate": 4.366572105453653e-06, + "loss": 0.3364, + "step": 965 + }, + { + "epoch": 0.1847036328871893, + "grad_norm": 1.9444904327392578, + "learning_rate": 4.367230213423641e-06, + "loss": 0.1445, + "step": 966 + }, + { + "epoch": 0.18489483747609942, + "grad_norm": 1.875847578048706, + "learning_rate": 4.367887640474758e-06, + "loss": 0.2279, + "step": 967 + }, + { + "epoch": 0.18508604206500956, + "grad_norm": 1.9915648698806763, + "learning_rate": 4.368544388014591e-06, + "loss": 0.1222, + "step": 968 + }, + { + "epoch": 0.1852772466539197, + "grad_norm": 2.044407844543457, + "learning_rate": 4.369200457446364e-06, + "loss": 0.1119, + "step": 969 + }, + { + "epoch": 0.18546845124282982, + "grad_norm": 1.7450714111328125, + "learning_rate": 4.3698558501689585e-06, + "loss": 0.1761, + "step": 970 + }, + { + "epoch": 0.18565965583173996, + "grad_norm": 1.9846551418304443, + "learning_rate": 4.370510567576933e-06, + "loss": 0.2268, + "step": 971 + }, + { + "epoch": 0.1858508604206501, + "grad_norm": 2.735543727874756, + "learning_rate": 4.3711646110605385e-06, + "loss": 0.2611, + "step": 972 + }, + { + "epoch": 0.18604206500956022, + "grad_norm": 2.1292803287506104, + "learning_rate": 4.371817982005741e-06, + "loss": 0.1536, + "step": 973 + }, + { + "epoch": 0.18623326959847036, + "grad_norm": 2.738206386566162, + "learning_rate": 4.372470681794229e-06, + "loss": 0.1698, + "step": 974 + }, + { + "epoch": 0.1864244741873805, + "grad_norm": 3.367114543914795, + "learning_rate": 4.373122711803442e-06, + "loss": 0.1996, + "step": 975 + }, + { + "epoch": 0.18661567877629062, + "grad_norm": 3.1926040649414062, + "learning_rate": 4.373774073406583e-06, + "loss": 0.3569, + "step": 976 + }, + { + "epoch": 0.18680688336520077, + "grad_norm": 3.1637473106384277, + "learning_rate": 4.374424767972636e-06, + "loss": 0.228, + "step": 977 + }, + { + "epoch": 0.1869980879541109, + "grad_norm": 3.3024322986602783, + "learning_rate": 4.375074796866383e-06, + "loss": 0.5118, + "step": 978 + }, + { + "epoch": 0.18718929254302102, + "grad_norm": 3.079582929611206, + "learning_rate": 4.375724161448419e-06, + "loss": 0.1612, + "step": 979 + }, + { + "epoch": 0.18738049713193117, + "grad_norm": 1.2228554487228394, + "learning_rate": 4.376372863075179e-06, + "loss": 0.0437, + "step": 980 + }, + { + "epoch": 0.1875717017208413, + "grad_norm": 3.335177183151245, + "learning_rate": 4.377020903098941e-06, + "loss": 0.169, + "step": 981 + }, + { + "epoch": 0.18776290630975143, + "grad_norm": 2.6680805683135986, + "learning_rate": 4.377668282867852e-06, + "loss": 0.3572, + "step": 982 + }, + { + "epoch": 0.18795411089866157, + "grad_norm": 2.7457680702209473, + "learning_rate": 4.378315003725942e-06, + "loss": 0.6047, + "step": 983 + }, + { + "epoch": 0.1881453154875717, + "grad_norm": 2.021188974380493, + "learning_rate": 4.3789610670131445e-06, + "loss": 0.2667, + "step": 984 + }, + { + "epoch": 0.18833652007648183, + "grad_norm": 2.8314571380615234, + "learning_rate": 4.3796064740653045e-06, + "loss": 0.4146, + "step": 985 + }, + { + "epoch": 0.18852772466539197, + "grad_norm": 1.4372998476028442, + "learning_rate": 4.380251226214205e-06, + "loss": 0.1061, + "step": 986 + }, + { + "epoch": 0.18871892925430211, + "grad_norm": 3.8322417736053467, + "learning_rate": 4.380895324787575e-06, + "loss": 0.2334, + "step": 987 + }, + { + "epoch": 0.18891013384321223, + "grad_norm": 1.867491602897644, + "learning_rate": 4.381538771109115e-06, + "loss": 0.1069, + "step": 988 + }, + { + "epoch": 0.18910133843212237, + "grad_norm": 2.0068299770355225, + "learning_rate": 4.382181566498504e-06, + "loss": 0.1687, + "step": 989 + }, + { + "epoch": 0.18929254302103252, + "grad_norm": 2.4695441722869873, + "learning_rate": 4.382823712271419e-06, + "loss": 0.2867, + "step": 990 + }, + { + "epoch": 0.18948374760994263, + "grad_norm": 3.3172388076782227, + "learning_rate": 4.383465209739559e-06, + "loss": 0.2976, + "step": 991 + }, + { + "epoch": 0.18967495219885278, + "grad_norm": 1.749539852142334, + "learning_rate": 4.384106060210646e-06, + "loss": 0.1326, + "step": 992 + }, + { + "epoch": 0.18986615678776292, + "grad_norm": 2.1479973793029785, + "learning_rate": 4.384746264988454e-06, + "loss": 0.1488, + "step": 993 + }, + { + "epoch": 0.19005736137667303, + "grad_norm": 2.0028390884399414, + "learning_rate": 4.385385825372819e-06, + "loss": 0.1353, + "step": 994 + }, + { + "epoch": 0.19024856596558318, + "grad_norm": 2.371243953704834, + "learning_rate": 4.386024742659655e-06, + "loss": 0.2165, + "step": 995 + }, + { + "epoch": 0.19043977055449332, + "grad_norm": 1.8113404512405396, + "learning_rate": 4.386663018140972e-06, + "loss": 0.2138, + "step": 996 + }, + { + "epoch": 0.19063097514340344, + "grad_norm": 2.1039295196533203, + "learning_rate": 4.38730065310489e-06, + "loss": 0.1077, + "step": 997 + }, + { + "epoch": 0.19082217973231358, + "grad_norm": 1.7272989749908447, + "learning_rate": 4.387937648835653e-06, + "loss": 0.1083, + "step": 998 + }, + { + "epoch": 0.19101338432122372, + "grad_norm": 3.76637601852417, + "learning_rate": 4.388574006613651e-06, + "loss": 0.2184, + "step": 999 + }, + { + "epoch": 0.19120458891013384, + "grad_norm": 2.589611053466797, + "learning_rate": 4.389209727715426e-06, + "loss": 0.111, + "step": 1000 + }, + { + "epoch": 0.19120458891013384, + "eval_runtime": 801.9588, + "eval_samples_per_second": 1.913, + "eval_steps_per_second": 0.239, + "step": 1000 + }, + { + "epoch": 0.19139579349904398, + "grad_norm": 2.5449726581573486, + "learning_rate": 4.389844813413695e-06, + "loss": 0.2094, + "step": 1001 + }, + { + "epoch": 0.19158699808795412, + "grad_norm": 2.380741834640503, + "learning_rate": 4.390479264977362e-06, + "loss": 0.2957, + "step": 1002 + }, + { + "epoch": 0.19177820267686424, + "grad_norm": 2.056061029434204, + "learning_rate": 4.391113083671535e-06, + "loss": 0.3615, + "step": 1003 + }, + { + "epoch": 0.19196940726577438, + "grad_norm": 2.0398001670837402, + "learning_rate": 4.391746270757536e-06, + "loss": 0.1292, + "step": 1004 + }, + { + "epoch": 0.1921606118546845, + "grad_norm": 4.531376838684082, + "learning_rate": 4.392378827492925e-06, + "loss": 0.1914, + "step": 1005 + }, + { + "epoch": 0.19235181644359464, + "grad_norm": 2.163278102874756, + "learning_rate": 4.393010755131506e-06, + "loss": 0.0862, + "step": 1006 + }, + { + "epoch": 0.19254302103250479, + "grad_norm": 2.507943868637085, + "learning_rate": 4.393642054923349e-06, + "loss": 0.4044, + "step": 1007 + }, + { + "epoch": 0.1927342256214149, + "grad_norm": 3.3545026779174805, + "learning_rate": 4.394272728114798e-06, + "loss": 0.4105, + "step": 1008 + }, + { + "epoch": 0.19292543021032504, + "grad_norm": 3.325700044631958, + "learning_rate": 4.3949027759484944e-06, + "loss": 0.3183, + "step": 1009 + }, + { + "epoch": 0.1931166347992352, + "grad_norm": 1.7217702865600586, + "learning_rate": 4.395532199663382e-06, + "loss": 0.1855, + "step": 1010 + }, + { + "epoch": 0.1933078393881453, + "grad_norm": 1.5247466564178467, + "learning_rate": 4.396161000494729e-06, + "loss": 0.0998, + "step": 1011 + }, + { + "epoch": 0.19349904397705545, + "grad_norm": 2.8211965560913086, + "learning_rate": 4.396789179674141e-06, + "loss": 0.0913, + "step": 1012 + }, + { + "epoch": 0.1936902485659656, + "grad_norm": 3.427171468734741, + "learning_rate": 4.397416738429569e-06, + "loss": 0.1638, + "step": 1013 + }, + { + "epoch": 0.1938814531548757, + "grad_norm": 2.2865030765533447, + "learning_rate": 4.398043677985337e-06, + "loss": 0.1951, + "step": 1014 + }, + { + "epoch": 0.19407265774378585, + "grad_norm": 1.854646921157837, + "learning_rate": 4.398669999562141e-06, + "loss": 0.1315, + "step": 1015 + }, + { + "epoch": 0.194263862332696, + "grad_norm": 2.7197680473327637, + "learning_rate": 4.399295704377077e-06, + "loss": 0.3981, + "step": 1016 + }, + { + "epoch": 0.1944550669216061, + "grad_norm": 1.8270578384399414, + "learning_rate": 4.399920793643645e-06, + "loss": 0.1379, + "step": 1017 + }, + { + "epoch": 0.19464627151051625, + "grad_norm": 3.02530574798584, + "learning_rate": 4.400545268571768e-06, + "loss": 0.2511, + "step": 1018 + }, + { + "epoch": 0.1948374760994264, + "grad_norm": 2.5796334743499756, + "learning_rate": 4.401169130367807e-06, + "loss": 0.1854, + "step": 1019 + }, + { + "epoch": 0.1950286806883365, + "grad_norm": 3.3476648330688477, + "learning_rate": 4.4017923802345695e-06, + "loss": 0.3143, + "step": 1020 + }, + { + "epoch": 0.19521988527724665, + "grad_norm": 2.5378341674804688, + "learning_rate": 4.4024150193713316e-06, + "loss": 0.4362, + "step": 1021 + }, + { + "epoch": 0.1954110898661568, + "grad_norm": 1.8505767583847046, + "learning_rate": 4.403037048973844e-06, + "loss": 0.2285, + "step": 1022 + }, + { + "epoch": 0.1956022944550669, + "grad_norm": 2.6891872882843018, + "learning_rate": 4.403658470234348e-06, + "loss": 0.2091, + "step": 1023 + }, + { + "epoch": 0.19579349904397705, + "grad_norm": 2.9312164783477783, + "learning_rate": 4.404279284341596e-06, + "loss": 0.2019, + "step": 1024 + }, + { + "epoch": 0.1959847036328872, + "grad_norm": 2.8806815147399902, + "learning_rate": 4.404899492480853e-06, + "loss": 0.1688, + "step": 1025 + }, + { + "epoch": 0.1961759082217973, + "grad_norm": 3.487785577774048, + "learning_rate": 4.405519095833921e-06, + "loss": 0.502, + "step": 1026 + }, + { + "epoch": 0.19636711281070746, + "grad_norm": 2.5800065994262695, + "learning_rate": 4.406138095579144e-06, + "loss": 0.3615, + "step": 1027 + }, + { + "epoch": 0.1965583173996176, + "grad_norm": 2.3235068321228027, + "learning_rate": 4.406756492891431e-06, + "loss": 0.2984, + "step": 1028 + }, + { + "epoch": 0.19674952198852771, + "grad_norm": 4.1836628913879395, + "learning_rate": 4.407374288942261e-06, + "loss": 0.3625, + "step": 1029 + }, + { + "epoch": 0.19694072657743786, + "grad_norm": 3.1203243732452393, + "learning_rate": 4.407991484899696e-06, + "loss": 0.1182, + "step": 1030 + }, + { + "epoch": 0.197131931166348, + "grad_norm": 1.4133257865905762, + "learning_rate": 4.408608081928404e-06, + "loss": 0.0475, + "step": 1031 + }, + { + "epoch": 0.19732313575525812, + "grad_norm": 2.494713068008423, + "learning_rate": 4.4092240811896605e-06, + "loss": 0.2211, + "step": 1032 + }, + { + "epoch": 0.19751434034416826, + "grad_norm": 1.6299430131912231, + "learning_rate": 4.40983948384137e-06, + "loss": 0.1825, + "step": 1033 + }, + { + "epoch": 0.1977055449330784, + "grad_norm": 3.060887575149536, + "learning_rate": 4.410454291038076e-06, + "loss": 0.307, + "step": 1034 + }, + { + "epoch": 0.19789674952198852, + "grad_norm": 2.6344666481018066, + "learning_rate": 4.411068503930969e-06, + "loss": 0.2975, + "step": 1035 + }, + { + "epoch": 0.19808795411089866, + "grad_norm": 1.8494610786437988, + "learning_rate": 4.41168212366791e-06, + "loss": 0.0681, + "step": 1036 + }, + { + "epoch": 0.1982791586998088, + "grad_norm": 2.327357530593872, + "learning_rate": 4.412295151393437e-06, + "loss": 0.1178, + "step": 1037 + }, + { + "epoch": 0.19847036328871892, + "grad_norm": 1.8669428825378418, + "learning_rate": 4.412907588248774e-06, + "loss": 0.2461, + "step": 1038 + }, + { + "epoch": 0.19866156787762906, + "grad_norm": 3.062044143676758, + "learning_rate": 4.413519435371853e-06, + "loss": 0.468, + "step": 1039 + }, + { + "epoch": 0.1988527724665392, + "grad_norm": 3.0303285121917725, + "learning_rate": 4.4141306938973205e-06, + "loss": 0.3054, + "step": 1040 + }, + { + "epoch": 0.19904397705544932, + "grad_norm": 2.5444071292877197, + "learning_rate": 4.414741364956551e-06, + "loss": 0.1862, + "step": 1041 + }, + { + "epoch": 0.19923518164435947, + "grad_norm": 2.047647476196289, + "learning_rate": 4.415351449677661e-06, + "loss": 0.2525, + "step": 1042 + }, + { + "epoch": 0.1994263862332696, + "grad_norm": 3.2048020362854004, + "learning_rate": 4.415960949185519e-06, + "loss": 0.1087, + "step": 1043 + }, + { + "epoch": 0.19961759082217972, + "grad_norm": 1.5147563219070435, + "learning_rate": 4.416569864601761e-06, + "loss": 0.0919, + "step": 1044 + }, + { + "epoch": 0.19980879541108987, + "grad_norm": 2.4613304138183594, + "learning_rate": 4.4171781970448015e-06, + "loss": 0.4064, + "step": 1045 + }, + { + "epoch": 0.2, + "grad_norm": 2.334059953689575, + "learning_rate": 4.417785947629845e-06, + "loss": 0.2635, + "step": 1046 + }, + { + "epoch": 0.20019120458891013, + "grad_norm": 1.446562647819519, + "learning_rate": 4.418393117468899e-06, + "loss": 0.1417, + "step": 1047 + }, + { + "epoch": 0.20038240917782027, + "grad_norm": 3.4641342163085938, + "learning_rate": 4.418999707670787e-06, + "loss": 0.3766, + "step": 1048 + }, + { + "epoch": 0.2005736137667304, + "grad_norm": 1.2733123302459717, + "learning_rate": 4.4196057193411596e-06, + "loss": 0.0523, + "step": 1049 + }, + { + "epoch": 0.20076481835564053, + "grad_norm": 1.9554495811462402, + "learning_rate": 4.420211153582507e-06, + "loss": 0.1459, + "step": 1050 + }, + { + "epoch": 0.20095602294455067, + "grad_norm": 3.5012214183807373, + "learning_rate": 4.4208160114941716e-06, + "loss": 0.4968, + "step": 1051 + }, + { + "epoch": 0.20114722753346082, + "grad_norm": 2.0732827186584473, + "learning_rate": 4.421420294172356e-06, + "loss": 0.318, + "step": 1052 + }, + { + "epoch": 0.20133843212237093, + "grad_norm": 2.6593754291534424, + "learning_rate": 4.422024002710142e-06, + "loss": 0.2944, + "step": 1053 + }, + { + "epoch": 0.20152963671128107, + "grad_norm": 2.268272876739502, + "learning_rate": 4.422627138197499e-06, + "loss": 0.2813, + "step": 1054 + }, + { + "epoch": 0.20172084130019122, + "grad_norm": 2.346163034439087, + "learning_rate": 4.423229701721291e-06, + "loss": 0.1823, + "step": 1055 + }, + { + "epoch": 0.20191204588910133, + "grad_norm": 1.8060640096664429, + "learning_rate": 4.423831694365298e-06, + "loss": 0.1057, + "step": 1056 + }, + { + "epoch": 0.20210325047801148, + "grad_norm": 2.9678351879119873, + "learning_rate": 4.424433117210219e-06, + "loss": 0.4058, + "step": 1057 + }, + { + "epoch": 0.20229445506692162, + "grad_norm": 2.440826416015625, + "learning_rate": 4.42503397133369e-06, + "loss": 0.314, + "step": 1058 + }, + { + "epoch": 0.20248565965583173, + "grad_norm": 2.8080196380615234, + "learning_rate": 4.425634257810289e-06, + "loss": 0.6267, + "step": 1059 + }, + { + "epoch": 0.20267686424474188, + "grad_norm": 3.3547277450561523, + "learning_rate": 4.426233977711554e-06, + "loss": 0.2995, + "step": 1060 + }, + { + "epoch": 0.20286806883365202, + "grad_norm": 1.7534793615341187, + "learning_rate": 4.426833132105993e-06, + "loss": 0.2257, + "step": 1061 + }, + { + "epoch": 0.20305927342256214, + "grad_norm": 1.749898910522461, + "learning_rate": 4.4274317220590905e-06, + "loss": 0.1218, + "step": 1062 + }, + { + "epoch": 0.20325047801147228, + "grad_norm": 1.7936410903930664, + "learning_rate": 4.428029748633326e-06, + "loss": 0.1347, + "step": 1063 + }, + { + "epoch": 0.20344168260038242, + "grad_norm": 3.712153196334839, + "learning_rate": 4.42862721288818e-06, + "loss": 0.4845, + "step": 1064 + }, + { + "epoch": 0.20363288718929254, + "grad_norm": 3.04533052444458, + "learning_rate": 4.429224115880146e-06, + "loss": 0.3209, + "step": 1065 + }, + { + "epoch": 0.20382409177820268, + "grad_norm": 1.8066027164459229, + "learning_rate": 4.429820458662747e-06, + "loss": 0.1314, + "step": 1066 + }, + { + "epoch": 0.2040152963671128, + "grad_norm": 1.466841697692871, + "learning_rate": 4.43041624228654e-06, + "loss": 0.0976, + "step": 1067 + }, + { + "epoch": 0.20420650095602294, + "grad_norm": 2.905015468597412, + "learning_rate": 4.431011467799127e-06, + "loss": 0.273, + "step": 1068 + }, + { + "epoch": 0.20439770554493308, + "grad_norm": 2.144706964492798, + "learning_rate": 4.4316061362451714e-06, + "loss": 0.1751, + "step": 1069 + }, + { + "epoch": 0.2045889101338432, + "grad_norm": 2.6781883239746094, + "learning_rate": 4.432200248666409e-06, + "loss": 0.535, + "step": 1070 + }, + { + "epoch": 0.20478011472275334, + "grad_norm": 2.1494102478027344, + "learning_rate": 4.4327938061016515e-06, + "loss": 0.1759, + "step": 1071 + }, + { + "epoch": 0.20497131931166349, + "grad_norm": 2.51594877243042, + "learning_rate": 4.433386809586803e-06, + "loss": 0.1144, + "step": 1072 + }, + { + "epoch": 0.2051625239005736, + "grad_norm": 1.9181592464447021, + "learning_rate": 4.433979260154872e-06, + "loss": 0.1752, + "step": 1073 + }, + { + "epoch": 0.20535372848948374, + "grad_norm": 2.254838466644287, + "learning_rate": 4.4345711588359805e-06, + "loss": 0.0968, + "step": 1074 + }, + { + "epoch": 0.2055449330783939, + "grad_norm": 2.161107063293457, + "learning_rate": 4.435162506657369e-06, + "loss": 0.0813, + "step": 1075 + }, + { + "epoch": 0.205736137667304, + "grad_norm": 2.8448431491851807, + "learning_rate": 4.435753304643421e-06, + "loss": 0.6405, + "step": 1076 + }, + { + "epoch": 0.20592734225621415, + "grad_norm": 1.856491208076477, + "learning_rate": 4.436343553815657e-06, + "loss": 0.1512, + "step": 1077 + }, + { + "epoch": 0.2061185468451243, + "grad_norm": 2.017235517501831, + "learning_rate": 4.43693325519276e-06, + "loss": 0.1791, + "step": 1078 + }, + { + "epoch": 0.2063097514340344, + "grad_norm": 2.606391429901123, + "learning_rate": 4.437522409790575e-06, + "loss": 0.2601, + "step": 1079 + }, + { + "epoch": 0.20650095602294455, + "grad_norm": 2.5404584407806396, + "learning_rate": 4.438111018622127e-06, + "loss": 0.2392, + "step": 1080 + }, + { + "epoch": 0.2066921606118547, + "grad_norm": 2.630312919616699, + "learning_rate": 4.438699082697624e-06, + "loss": 0.1322, + "step": 1081 + }, + { + "epoch": 0.2068833652007648, + "grad_norm": 2.1289303302764893, + "learning_rate": 4.4392866030244765e-06, + "loss": 0.2648, + "step": 1082 + }, + { + "epoch": 0.20707456978967495, + "grad_norm": 3.3554677963256836, + "learning_rate": 4.439873580607301e-06, + "loss": 0.5565, + "step": 1083 + }, + { + "epoch": 0.2072657743785851, + "grad_norm": 2.1717491149902344, + "learning_rate": 4.440460016447934e-06, + "loss": 0.1442, + "step": 1084 + }, + { + "epoch": 0.2074569789674952, + "grad_norm": 2.714172601699829, + "learning_rate": 4.441045911545436e-06, + "loss": 0.3317, + "step": 1085 + }, + { + "epoch": 0.20764818355640535, + "grad_norm": 2.02815318107605, + "learning_rate": 4.441631266896111e-06, + "loss": 0.1441, + "step": 1086 + }, + { + "epoch": 0.2078393881453155, + "grad_norm": 1.8019261360168457, + "learning_rate": 4.442216083493512e-06, + "loss": 0.0926, + "step": 1087 + }, + { + "epoch": 0.2080305927342256, + "grad_norm": 1.846985101699829, + "learning_rate": 4.442800362328448e-06, + "loss": 0.1832, + "step": 1088 + }, + { + "epoch": 0.20822179732313575, + "grad_norm": 2.089106321334839, + "learning_rate": 4.443384104389001e-06, + "loss": 0.1436, + "step": 1089 + }, + { + "epoch": 0.2084130019120459, + "grad_norm": 2.037452459335327, + "learning_rate": 4.443967310660528e-06, + "loss": 0.1578, + "step": 1090 + }, + { + "epoch": 0.208604206500956, + "grad_norm": 2.5428967475891113, + "learning_rate": 4.444549982125679e-06, + "loss": 0.5048, + "step": 1091 + }, + { + "epoch": 0.20879541108986616, + "grad_norm": 1.7422195672988892, + "learning_rate": 4.445132119764402e-06, + "loss": 0.1375, + "step": 1092 + }, + { + "epoch": 0.2089866156787763, + "grad_norm": 1.4205963611602783, + "learning_rate": 4.445713724553953e-06, + "loss": 0.0716, + "step": 1093 + }, + { + "epoch": 0.20917782026768642, + "grad_norm": 2.83103346824646, + "learning_rate": 4.446294797468908e-06, + "loss": 0.1791, + "step": 1094 + }, + { + "epoch": 0.20936902485659656, + "grad_norm": 2.2832252979278564, + "learning_rate": 4.446875339481172e-06, + "loss": 0.2271, + "step": 1095 + }, + { + "epoch": 0.2095602294455067, + "grad_norm": 2.6716253757476807, + "learning_rate": 4.447455351559987e-06, + "loss": 0.4096, + "step": 1096 + }, + { + "epoch": 0.20975143403441682, + "grad_norm": 1.877040147781372, + "learning_rate": 4.448034834671944e-06, + "loss": 0.27, + "step": 1097 + }, + { + "epoch": 0.20994263862332696, + "grad_norm": 1.4391602277755737, + "learning_rate": 4.448613789780993e-06, + "loss": 0.076, + "step": 1098 + }, + { + "epoch": 0.2101338432122371, + "grad_norm": 1.844596266746521, + "learning_rate": 4.449192217848449e-06, + "loss": 0.1119, + "step": 1099 + }, + { + "epoch": 0.21032504780114722, + "grad_norm": 4.228497505187988, + "learning_rate": 4.449770119833007e-06, + "loss": 0.1945, + "step": 1100 + }, + { + "epoch": 0.21051625239005736, + "grad_norm": 2.4177355766296387, + "learning_rate": 4.450347496690746e-06, + "loss": 0.3176, + "step": 1101 + }, + { + "epoch": 0.2107074569789675, + "grad_norm": 3.1131842136383057, + "learning_rate": 4.450924349375142e-06, + "loss": 0.3662, + "step": 1102 + }, + { + "epoch": 0.21089866156787762, + "grad_norm": 1.61544668674469, + "learning_rate": 4.4515006788370775e-06, + "loss": 0.2676, + "step": 1103 + }, + { + "epoch": 0.21108986615678776, + "grad_norm": 4.033684253692627, + "learning_rate": 4.4520764860248474e-06, + "loss": 0.4563, + "step": 1104 + }, + { + "epoch": 0.2112810707456979, + "grad_norm": 2.592637062072754, + "learning_rate": 4.452651771884173e-06, + "loss": 0.3416, + "step": 1105 + }, + { + "epoch": 0.21147227533460802, + "grad_norm": 1.7672158479690552, + "learning_rate": 4.45322653735821e-06, + "loss": 0.1088, + "step": 1106 + }, + { + "epoch": 0.21166347992351817, + "grad_norm": 1.9779208898544312, + "learning_rate": 4.453800783387554e-06, + "loss": 0.2377, + "step": 1107 + }, + { + "epoch": 0.2118546845124283, + "grad_norm": 2.543431043624878, + "learning_rate": 4.454374510910254e-06, + "loss": 0.2436, + "step": 1108 + }, + { + "epoch": 0.21204588910133843, + "grad_norm": 4.803256988525391, + "learning_rate": 4.454947720861822e-06, + "loss": 0.3721, + "step": 1109 + }, + { + "epoch": 0.21223709369024857, + "grad_norm": 2.2504935264587402, + "learning_rate": 4.455520414175238e-06, + "loss": 0.2385, + "step": 1110 + }, + { + "epoch": 0.2124282982791587, + "grad_norm": 2.656966209411621, + "learning_rate": 4.456092591780964e-06, + "loss": 0.1749, + "step": 1111 + }, + { + "epoch": 0.21261950286806883, + "grad_norm": 2.746769666671753, + "learning_rate": 4.4566642546069476e-06, + "loss": 0.109, + "step": 1112 + }, + { + "epoch": 0.21281070745697897, + "grad_norm": 3.2161478996276855, + "learning_rate": 4.457235403578636e-06, + "loss": 0.2271, + "step": 1113 + }, + { + "epoch": 0.2130019120458891, + "grad_norm": 4.615819931030273, + "learning_rate": 4.457806039618983e-06, + "loss": 0.5054, + "step": 1114 + }, + { + "epoch": 0.21319311663479923, + "grad_norm": 2.4721786975860596, + "learning_rate": 4.458376163648458e-06, + "loss": 0.2637, + "step": 1115 + }, + { + "epoch": 0.21338432122370937, + "grad_norm": 1.3068219423294067, + "learning_rate": 4.458945776585056e-06, + "loss": 0.1306, + "step": 1116 + }, + { + "epoch": 0.21357552581261952, + "grad_norm": 1.7614368200302124, + "learning_rate": 4.459514879344301e-06, + "loss": 0.0854, + "step": 1117 + }, + { + "epoch": 0.21376673040152963, + "grad_norm": 2.025942325592041, + "learning_rate": 4.460083472839265e-06, + "loss": 0.1001, + "step": 1118 + }, + { + "epoch": 0.21395793499043977, + "grad_norm": 1.3237824440002441, + "learning_rate": 4.460651557980566e-06, + "loss": 0.091, + "step": 1119 + }, + { + "epoch": 0.21414913957934992, + "grad_norm": 3.109684705734253, + "learning_rate": 4.461219135676386e-06, + "loss": 0.6804, + "step": 1120 + }, + { + "epoch": 0.21434034416826003, + "grad_norm": 3.659759044647217, + "learning_rate": 4.461786206832473e-06, + "loss": 0.4264, + "step": 1121 + }, + { + "epoch": 0.21453154875717018, + "grad_norm": 3.0191221237182617, + "learning_rate": 4.462352772352151e-06, + "loss": 0.2061, + "step": 1122 + }, + { + "epoch": 0.21472275334608032, + "grad_norm": 0.91303551197052, + "learning_rate": 4.4629188331363334e-06, + "loss": 0.1592, + "step": 1123 + }, + { + "epoch": 0.21491395793499043, + "grad_norm": 2.4983134269714355, + "learning_rate": 4.463484390083525e-06, + "loss": 0.1401, + "step": 1124 + }, + { + "epoch": 0.21510516252390058, + "grad_norm": 2.4315645694732666, + "learning_rate": 4.464049444089835e-06, + "loss": 0.1035, + "step": 1125 + }, + { + "epoch": 0.2152963671128107, + "grad_norm": 2.098078489303589, + "learning_rate": 4.464613996048983e-06, + "loss": 0.2432, + "step": 1126 + }, + { + "epoch": 0.21548757170172084, + "grad_norm": 2.7613978385925293, + "learning_rate": 4.465178046852309e-06, + "loss": 0.401, + "step": 1127 + }, + { + "epoch": 0.21567877629063098, + "grad_norm": 2.0903496742248535, + "learning_rate": 4.465741597388782e-06, + "loss": 0.1743, + "step": 1128 + }, + { + "epoch": 0.2158699808795411, + "grad_norm": 2.099884271621704, + "learning_rate": 4.466304648545006e-06, + "loss": 0.1594, + "step": 1129 + }, + { + "epoch": 0.21606118546845124, + "grad_norm": 1.7775846719741821, + "learning_rate": 4.466867201205232e-06, + "loss": 0.0927, + "step": 1130 + }, + { + "epoch": 0.21625239005736138, + "grad_norm": 1.9474855661392212, + "learning_rate": 4.467429256251365e-06, + "loss": 0.1128, + "step": 1131 + }, + { + "epoch": 0.2164435946462715, + "grad_norm": 2.1535003185272217, + "learning_rate": 4.467990814562967e-06, + "loss": 0.2644, + "step": 1132 + }, + { + "epoch": 0.21663479923518164, + "grad_norm": 3.28275203704834, + "learning_rate": 4.4685518770172775e-06, + "loss": 0.4298, + "step": 1133 + }, + { + "epoch": 0.21682600382409178, + "grad_norm": 2.098271608352661, + "learning_rate": 4.469112444489207e-06, + "loss": 0.1012, + "step": 1134 + }, + { + "epoch": 0.2170172084130019, + "grad_norm": 2.1434457302093506, + "learning_rate": 4.469672517851359e-06, + "loss": 0.2505, + "step": 1135 + }, + { + "epoch": 0.21720841300191204, + "grad_norm": 2.248441219329834, + "learning_rate": 4.470232097974025e-06, + "loss": 0.2486, + "step": 1136 + }, + { + "epoch": 0.2173996175908222, + "grad_norm": 1.4229638576507568, + "learning_rate": 4.470791185725206e-06, + "loss": 0.0635, + "step": 1137 + }, + { + "epoch": 0.2175908221797323, + "grad_norm": 2.0013930797576904, + "learning_rate": 4.4713497819706065e-06, + "loss": 0.1122, + "step": 1138 + }, + { + "epoch": 0.21778202676864244, + "grad_norm": 2.527571678161621, + "learning_rate": 4.471907887573656e-06, + "loss": 0.3839, + "step": 1139 + }, + { + "epoch": 0.2179732313575526, + "grad_norm": 3.140901565551758, + "learning_rate": 4.472465503395508e-06, + "loss": 0.5159, + "step": 1140 + }, + { + "epoch": 0.2181644359464627, + "grad_norm": 3.4777331352233887, + "learning_rate": 4.473022630295051e-06, + "loss": 0.2159, + "step": 1141 + }, + { + "epoch": 0.21835564053537285, + "grad_norm": 1.6637810468673706, + "learning_rate": 4.473579269128917e-06, + "loss": 0.2454, + "step": 1142 + }, + { + "epoch": 0.218546845124283, + "grad_norm": 3.4433083534240723, + "learning_rate": 4.474135420751486e-06, + "loss": 0.2348, + "step": 1143 + }, + { + "epoch": 0.2187380497131931, + "grad_norm": 2.1617848873138428, + "learning_rate": 4.474691086014902e-06, + "loss": 0.2981, + "step": 1144 + }, + { + "epoch": 0.21892925430210325, + "grad_norm": 2.460859775543213, + "learning_rate": 4.475246265769069e-06, + "loss": 0.4682, + "step": 1145 + }, + { + "epoch": 0.2191204588910134, + "grad_norm": 1.9391944408416748, + "learning_rate": 4.47580096086167e-06, + "loss": 0.2622, + "step": 1146 + }, + { + "epoch": 0.2193116634799235, + "grad_norm": 2.875788927078247, + "learning_rate": 4.476355172138167e-06, + "loss": 0.4374, + "step": 1147 + }, + { + "epoch": 0.21950286806883365, + "grad_norm": 2.8496835231781006, + "learning_rate": 4.476908900441812e-06, + "loss": 0.297, + "step": 1148 + }, + { + "epoch": 0.2196940726577438, + "grad_norm": 2.6283702850341797, + "learning_rate": 4.477462146613657e-06, + "loss": 0.1253, + "step": 1149 + }, + { + "epoch": 0.2198852772466539, + "grad_norm": 1.8835588693618774, + "learning_rate": 4.478014911492556e-06, + "loss": 0.1859, + "step": 1150 + }, + { + "epoch": 0.22007648183556405, + "grad_norm": 13.009642601013184, + "learning_rate": 4.478567195915176e-06, + "loss": 0.3384, + "step": 1151 + }, + { + "epoch": 0.2202676864244742, + "grad_norm": 0.9838836193084717, + "learning_rate": 4.479119000716005e-06, + "loss": 0.0698, + "step": 1152 + }, + { + "epoch": 0.2204588910133843, + "grad_norm": 2.835535764694214, + "learning_rate": 4.479670326727359e-06, + "loss": 0.394, + "step": 1153 + }, + { + "epoch": 0.22065009560229445, + "grad_norm": 2.8879711627960205, + "learning_rate": 4.480221174779389e-06, + "loss": 0.2163, + "step": 1154 + }, + { + "epoch": 0.2208413001912046, + "grad_norm": 2.8582956790924072, + "learning_rate": 4.480771545700088e-06, + "loss": 0.0787, + "step": 1155 + }, + { + "epoch": 0.2210325047801147, + "grad_norm": 2.854708194732666, + "learning_rate": 4.481321440315302e-06, + "loss": 0.2055, + "step": 1156 + }, + { + "epoch": 0.22122370936902486, + "grad_norm": 1.8422484397888184, + "learning_rate": 4.481870859448731e-06, + "loss": 0.1446, + "step": 1157 + }, + { + "epoch": 0.221414913957935, + "grad_norm": 2.160968542098999, + "learning_rate": 4.482419803921941e-06, + "loss": 0.2785, + "step": 1158 + }, + { + "epoch": 0.22160611854684512, + "grad_norm": 2.0334866046905518, + "learning_rate": 4.482968274554374e-06, + "loss": 0.1943, + "step": 1159 + }, + { + "epoch": 0.22179732313575526, + "grad_norm": 1.5216292142868042, + "learning_rate": 4.483516272163348e-06, + "loss": 0.0838, + "step": 1160 + }, + { + "epoch": 0.2219885277246654, + "grad_norm": 3.173959970474243, + "learning_rate": 4.4840637975640696e-06, + "loss": 0.272, + "step": 1161 + }, + { + "epoch": 0.22217973231357552, + "grad_norm": 1.9395912885665894, + "learning_rate": 4.4846108515696406e-06, + "loss": 0.2258, + "step": 1162 + }, + { + "epoch": 0.22237093690248566, + "grad_norm": 1.6804591417312622, + "learning_rate": 4.485157434991062e-06, + "loss": 0.0927, + "step": 1163 + }, + { + "epoch": 0.2225621414913958, + "grad_norm": 2.5178074836730957, + "learning_rate": 4.485703548637246e-06, + "loss": 0.2079, + "step": 1164 + }, + { + "epoch": 0.22275334608030592, + "grad_norm": 2.004671096801758, + "learning_rate": 4.48624919331502e-06, + "loss": 0.232, + "step": 1165 + }, + { + "epoch": 0.22294455066921606, + "grad_norm": 1.9772891998291016, + "learning_rate": 4.486794369829135e-06, + "loss": 0.1463, + "step": 1166 + }, + { + "epoch": 0.2231357552581262, + "grad_norm": 2.044581174850464, + "learning_rate": 4.487339078982273e-06, + "loss": 0.1012, + "step": 1167 + }, + { + "epoch": 0.22332695984703632, + "grad_norm": 2.186039924621582, + "learning_rate": 4.48788332157505e-06, + "loss": 0.1192, + "step": 1168 + }, + { + "epoch": 0.22351816443594646, + "grad_norm": 1.783949851989746, + "learning_rate": 4.488427098406031e-06, + "loss": 0.0964, + "step": 1169 + }, + { + "epoch": 0.2237093690248566, + "grad_norm": 2.8822665214538574, + "learning_rate": 4.4889704102717304e-06, + "loss": 0.402, + "step": 1170 + }, + { + "epoch": 0.22390057361376672, + "grad_norm": 2.142519235610962, + "learning_rate": 4.48951325796662e-06, + "loss": 0.302, + "step": 1171 + }, + { + "epoch": 0.22409177820267687, + "grad_norm": 1.857020616531372, + "learning_rate": 4.49005564228314e-06, + "loss": 0.3136, + "step": 1172 + }, + { + "epoch": 0.224282982791587, + "grad_norm": 2.0371313095092773, + "learning_rate": 4.490597564011701e-06, + "loss": 0.2399, + "step": 1173 + }, + { + "epoch": 0.22447418738049713, + "grad_norm": 2.0099854469299316, + "learning_rate": 4.491139023940692e-06, + "loss": 0.0889, + "step": 1174 + }, + { + "epoch": 0.22466539196940727, + "grad_norm": 2.0622718334198, + "learning_rate": 4.49168002285649e-06, + "loss": 0.1362, + "step": 1175 + }, + { + "epoch": 0.2248565965583174, + "grad_norm": 3.3179798126220703, + "learning_rate": 4.492220561543467e-06, + "loss": 0.4877, + "step": 1176 + }, + { + "epoch": 0.22504780114722753, + "grad_norm": 2.5217320919036865, + "learning_rate": 4.49276064078399e-06, + "loss": 0.5047, + "step": 1177 + }, + { + "epoch": 0.22523900573613767, + "grad_norm": 3.4361255168914795, + "learning_rate": 4.493300261358436e-06, + "loss": 0.5802, + "step": 1178 + }, + { + "epoch": 0.22543021032504781, + "grad_norm": 2.50624942779541, + "learning_rate": 4.493839424045196e-06, + "loss": 0.2938, + "step": 1179 + }, + { + "epoch": 0.22562141491395793, + "grad_norm": 2.448352575302124, + "learning_rate": 4.494378129620678e-06, + "loss": 0.1441, + "step": 1180 + }, + { + "epoch": 0.22581261950286807, + "grad_norm": 1.622539758682251, + "learning_rate": 4.494916378859321e-06, + "loss": 0.092, + "step": 1181 + }, + { + "epoch": 0.22600382409177822, + "grad_norm": 2.642896890640259, + "learning_rate": 4.495454172533592e-06, + "loss": 0.3514, + "step": 1182 + }, + { + "epoch": 0.22619502868068833, + "grad_norm": 2.202483892440796, + "learning_rate": 4.495991511414005e-06, + "loss": 0.2871, + "step": 1183 + }, + { + "epoch": 0.22638623326959847, + "grad_norm": 1.2011011838912964, + "learning_rate": 4.496528396269117e-06, + "loss": 0.0971, + "step": 1184 + }, + { + "epoch": 0.22657743785850862, + "grad_norm": 2.026979923248291, + "learning_rate": 4.497064827865537e-06, + "loss": 0.1101, + "step": 1185 + }, + { + "epoch": 0.22676864244741873, + "grad_norm": 1.4967538118362427, + "learning_rate": 4.49760080696794e-06, + "loss": 0.2007, + "step": 1186 + }, + { + "epoch": 0.22695984703632888, + "grad_norm": 2.381471872329712, + "learning_rate": 4.498136334339059e-06, + "loss": 0.0943, + "step": 1187 + }, + { + "epoch": 0.227151051625239, + "grad_norm": 2.089556932449341, + "learning_rate": 4.498671410739708e-06, + "loss": 0.1357, + "step": 1188 + }, + { + "epoch": 0.22734225621414914, + "grad_norm": 3.158884048461914, + "learning_rate": 4.4992060369287745e-06, + "loss": 0.4473, + "step": 1189 + }, + { + "epoch": 0.22753346080305928, + "grad_norm": 1.9269723892211914, + "learning_rate": 4.499740213663238e-06, + "loss": 0.2718, + "step": 1190 + }, + { + "epoch": 0.2277246653919694, + "grad_norm": 3.3276264667510986, + "learning_rate": 4.500273941698166e-06, + "loss": 0.4174, + "step": 1191 + }, + { + "epoch": 0.22791586998087954, + "grad_norm": 1.854248285293579, + "learning_rate": 4.500807221786725e-06, + "loss": 0.1561, + "step": 1192 + }, + { + "epoch": 0.22810707456978968, + "grad_norm": 5.148835182189941, + "learning_rate": 4.50134005468019e-06, + "loss": 0.3131, + "step": 1193 + }, + { + "epoch": 0.2282982791586998, + "grad_norm": 2.1514906883239746, + "learning_rate": 4.501872441127943e-06, + "loss": 0.1113, + "step": 1194 + }, + { + "epoch": 0.22848948374760994, + "grad_norm": 2.2153143882751465, + "learning_rate": 4.502404381877488e-06, + "loss": 0.2921, + "step": 1195 + }, + { + "epoch": 0.22868068833652008, + "grad_norm": 1.5667790174484253, + "learning_rate": 4.502935877674451e-06, + "loss": 0.1318, + "step": 1196 + }, + { + "epoch": 0.2288718929254302, + "grad_norm": 3.2153244018554688, + "learning_rate": 4.503466929262589e-06, + "loss": 0.5028, + "step": 1197 + }, + { + "epoch": 0.22906309751434034, + "grad_norm": 1.3133012056350708, + "learning_rate": 4.503997537383796e-06, + "loss": 0.1566, + "step": 1198 + }, + { + "epoch": 0.22925430210325048, + "grad_norm": 1.9280524253845215, + "learning_rate": 4.50452770277811e-06, + "loss": 0.2353, + "step": 1199 + }, + { + "epoch": 0.2294455066921606, + "grad_norm": 2.680346965789795, + "learning_rate": 4.505057426183714e-06, + "loss": 0.1869, + "step": 1200 + }, + { + "epoch": 0.22963671128107074, + "grad_norm": 3.4344375133514404, + "learning_rate": 4.505586708336951e-06, + "loss": 0.4891, + "step": 1201 + }, + { + "epoch": 0.2298279158699809, + "grad_norm": 1.4899710416793823, + "learning_rate": 4.506115549972324e-06, + "loss": 0.1304, + "step": 1202 + }, + { + "epoch": 0.230019120458891, + "grad_norm": 2.121846914291382, + "learning_rate": 4.506643951822501e-06, + "loss": 0.1658, + "step": 1203 + }, + { + "epoch": 0.23021032504780115, + "grad_norm": 2.3687522411346436, + "learning_rate": 4.507171914618329e-06, + "loss": 0.3938, + "step": 1204 + }, + { + "epoch": 0.2304015296367113, + "grad_norm": 1.6581428050994873, + "learning_rate": 4.5076994390888294e-06, + "loss": 0.1058, + "step": 1205 + }, + { + "epoch": 0.2305927342256214, + "grad_norm": 1.87697434425354, + "learning_rate": 4.508226525961212e-06, + "loss": 0.1238, + "step": 1206 + }, + { + "epoch": 0.23078393881453155, + "grad_norm": 2.034977912902832, + "learning_rate": 4.508753175960878e-06, + "loss": 0.183, + "step": 1207 + }, + { + "epoch": 0.2309751434034417, + "grad_norm": 2.1185624599456787, + "learning_rate": 4.509279389811426e-06, + "loss": 0.2888, + "step": 1208 + }, + { + "epoch": 0.2311663479923518, + "grad_norm": 2.6012215614318848, + "learning_rate": 4.509805168234659e-06, + "loss": 0.3785, + "step": 1209 + }, + { + "epoch": 0.23135755258126195, + "grad_norm": 1.3720933198928833, + "learning_rate": 4.510330511950588e-06, + "loss": 0.1433, + "step": 1210 + }, + { + "epoch": 0.2315487571701721, + "grad_norm": 1.6616960763931274, + "learning_rate": 4.510855421677442e-06, + "loss": 0.1277, + "step": 1211 + }, + { + "epoch": 0.2317399617590822, + "grad_norm": 1.7231650352478027, + "learning_rate": 4.511379898131671e-06, + "loss": 0.1076, + "step": 1212 + }, + { + "epoch": 0.23193116634799235, + "grad_norm": 2.2084357738494873, + "learning_rate": 4.511903942027949e-06, + "loss": 0.2608, + "step": 1213 + }, + { + "epoch": 0.2321223709369025, + "grad_norm": 2.0652458667755127, + "learning_rate": 4.512427554079188e-06, + "loss": 0.2228, + "step": 1214 + }, + { + "epoch": 0.2323135755258126, + "grad_norm": 1.695842981338501, + "learning_rate": 4.512950734996536e-06, + "loss": 0.1546, + "step": 1215 + }, + { + "epoch": 0.23250478011472275, + "grad_norm": 3.044466972351074, + "learning_rate": 4.513473485489387e-06, + "loss": 0.3004, + "step": 1216 + }, + { + "epoch": 0.2326959847036329, + "grad_norm": 2.2261133193969727, + "learning_rate": 4.513995806265384e-06, + "loss": 0.2103, + "step": 1217 + }, + { + "epoch": 0.232887189292543, + "grad_norm": 2.5913712978363037, + "learning_rate": 4.51451769803043e-06, + "loss": 0.3076, + "step": 1218 + }, + { + "epoch": 0.23307839388145316, + "grad_norm": 2.4403324127197266, + "learning_rate": 4.515039161488684e-06, + "loss": 0.1843, + "step": 1219 + }, + { + "epoch": 0.2332695984703633, + "grad_norm": 3.694639205932617, + "learning_rate": 4.51556019734258e-06, + "loss": 0.3922, + "step": 1220 + }, + { + "epoch": 0.23346080305927341, + "grad_norm": 3.163925886154175, + "learning_rate": 4.516080806292819e-06, + "loss": 0.3171, + "step": 1221 + }, + { + "epoch": 0.23365200764818356, + "grad_norm": 2.774327039718628, + "learning_rate": 4.516600989038385e-06, + "loss": 0.3307, + "step": 1222 + }, + { + "epoch": 0.2338432122370937, + "grad_norm": 1.897996425628662, + "learning_rate": 4.517120746276545e-06, + "loss": 0.1318, + "step": 1223 + }, + { + "epoch": 0.23403441682600382, + "grad_norm": 1.8080636262893677, + "learning_rate": 4.517640078702858e-06, + "loss": 0.089, + "step": 1224 + }, + { + "epoch": 0.23422562141491396, + "grad_norm": 1.8752037286758423, + "learning_rate": 4.5181589870111755e-06, + "loss": 0.1162, + "step": 1225 + }, + { + "epoch": 0.2344168260038241, + "grad_norm": 1.7227205038070679, + "learning_rate": 4.518677471893654e-06, + "loss": 0.316, + "step": 1226 + }, + { + "epoch": 0.23460803059273422, + "grad_norm": 2.1087758541107178, + "learning_rate": 4.519195534040756e-06, + "loss": 0.1585, + "step": 1227 + }, + { + "epoch": 0.23479923518164436, + "grad_norm": 2.5245354175567627, + "learning_rate": 4.519713174141255e-06, + "loss": 0.2401, + "step": 1228 + }, + { + "epoch": 0.2349904397705545, + "grad_norm": 2.966695547103882, + "learning_rate": 4.520230392882245e-06, + "loss": 0.0818, + "step": 1229 + }, + { + "epoch": 0.23518164435946462, + "grad_norm": 1.3945024013519287, + "learning_rate": 4.520747190949141e-06, + "loss": 0.1178, + "step": 1230 + }, + { + "epoch": 0.23537284894837476, + "grad_norm": 2.4835667610168457, + "learning_rate": 4.5212635690256885e-06, + "loss": 0.0745, + "step": 1231 + }, + { + "epoch": 0.2355640535372849, + "grad_norm": 3.491046667098999, + "learning_rate": 4.521779527793967e-06, + "loss": 0.2478, + "step": 1232 + }, + { + "epoch": 0.23575525812619502, + "grad_norm": 1.6101934909820557, + "learning_rate": 4.522295067934395e-06, + "loss": 0.1821, + "step": 1233 + }, + { + "epoch": 0.23594646271510517, + "grad_norm": 1.2053178548812866, + "learning_rate": 4.522810190125739e-06, + "loss": 0.1249, + "step": 1234 + }, + { + "epoch": 0.2361376673040153, + "grad_norm": 2.9630486965179443, + "learning_rate": 4.523324895045111e-06, + "loss": 0.2589, + "step": 1235 + }, + { + "epoch": 0.23632887189292542, + "grad_norm": 1.9277501106262207, + "learning_rate": 4.523839183367984e-06, + "loss": 0.2001, + "step": 1236 + }, + { + "epoch": 0.23652007648183557, + "grad_norm": 3.066760540008545, + "learning_rate": 4.5243530557681885e-06, + "loss": 0.4899, + "step": 1237 + }, + { + "epoch": 0.2367112810707457, + "grad_norm": 1.5749143362045288, + "learning_rate": 4.524866512917924e-06, + "loss": 0.0899, + "step": 1238 + }, + { + "epoch": 0.23690248565965583, + "grad_norm": 2.5981080532073975, + "learning_rate": 4.525379555487759e-06, + "loss": 0.4254, + "step": 1239 + }, + { + "epoch": 0.23709369024856597, + "grad_norm": 2.3466320037841797, + "learning_rate": 4.525892184146643e-06, + "loss": 0.3603, + "step": 1240 + }, + { + "epoch": 0.2372848948374761, + "grad_norm": 1.4082063436508179, + "learning_rate": 4.526404399561903e-06, + "loss": 0.1948, + "step": 1241 + }, + { + "epoch": 0.23747609942638623, + "grad_norm": 2.5637047290802, + "learning_rate": 4.5269162023992565e-06, + "loss": 0.2129, + "step": 1242 + }, + { + "epoch": 0.23766730401529637, + "grad_norm": 6.297000885009766, + "learning_rate": 4.527427593322813e-06, + "loss": 0.1063, + "step": 1243 + }, + { + "epoch": 0.23785850860420651, + "grad_norm": 1.5783936977386475, + "learning_rate": 4.527938572995081e-06, + "loss": 0.0618, + "step": 1244 + }, + { + "epoch": 0.23804971319311663, + "grad_norm": 2.314439535140991, + "learning_rate": 4.5284491420769685e-06, + "loss": 0.2352, + "step": 1245 + }, + { + "epoch": 0.23824091778202677, + "grad_norm": 2.707080841064453, + "learning_rate": 4.528959301227795e-06, + "loss": 0.5317, + "step": 1246 + }, + { + "epoch": 0.2384321223709369, + "grad_norm": 3.3706955909729004, + "learning_rate": 4.529469051105292e-06, + "loss": 0.2381, + "step": 1247 + }, + { + "epoch": 0.23862332695984703, + "grad_norm": 2.027712345123291, + "learning_rate": 4.5299783923656084e-06, + "loss": 0.2689, + "step": 1248 + }, + { + "epoch": 0.23881453154875718, + "grad_norm": 2.501119375228882, + "learning_rate": 4.530487325663318e-06, + "loss": 0.1832, + "step": 1249 + }, + { + "epoch": 0.2390057361376673, + "grad_norm": 1.212647795677185, + "learning_rate": 4.530995851651422e-06, + "loss": 0.0478, + "step": 1250 + }, + { + "epoch": 0.23919694072657743, + "grad_norm": 3.2780611515045166, + "learning_rate": 4.531503970981357e-06, + "loss": 0.5961, + "step": 1251 + }, + { + "epoch": 0.23938814531548758, + "grad_norm": 2.3319296836853027, + "learning_rate": 4.532011684302993e-06, + "loss": 0.1756, + "step": 1252 + }, + { + "epoch": 0.2395793499043977, + "grad_norm": 1.8689370155334473, + "learning_rate": 4.532518992264648e-06, + "loss": 0.0948, + "step": 1253 + }, + { + "epoch": 0.23977055449330784, + "grad_norm": 2.4608871936798096, + "learning_rate": 4.5330258955130894e-06, + "loss": 0.1658, + "step": 1254 + }, + { + "epoch": 0.23996175908221798, + "grad_norm": 3.9478836059570312, + "learning_rate": 4.533532394693532e-06, + "loss": 0.3201, + "step": 1255 + }, + { + "epoch": 0.2401529636711281, + "grad_norm": 1.9749811887741089, + "learning_rate": 4.534038490449656e-06, + "loss": 0.0873, + "step": 1256 + }, + { + "epoch": 0.24034416826003824, + "grad_norm": 1.7174595594406128, + "learning_rate": 4.534544183423599e-06, + "loss": 0.1389, + "step": 1257 + }, + { + "epoch": 0.24053537284894838, + "grad_norm": 1.479738712310791, + "learning_rate": 4.5350494742559694e-06, + "loss": 0.2478, + "step": 1258 + }, + { + "epoch": 0.2407265774378585, + "grad_norm": 1.9513295888900757, + "learning_rate": 4.535554363585849e-06, + "loss": 0.2366, + "step": 1259 + }, + { + "epoch": 0.24091778202676864, + "grad_norm": 1.9584410190582275, + "learning_rate": 4.536058852050796e-06, + "loss": 0.1392, + "step": 1260 + }, + { + "epoch": 0.24110898661567878, + "grad_norm": 1.891797423362732, + "learning_rate": 4.53656294028685e-06, + "loss": 0.0977, + "step": 1261 + }, + { + "epoch": 0.2413001912045889, + "grad_norm": 1.9594978094100952, + "learning_rate": 4.537066628928541e-06, + "loss": 0.1411, + "step": 1262 + }, + { + "epoch": 0.24149139579349904, + "grad_norm": 2.9643545150756836, + "learning_rate": 4.537569918608891e-06, + "loss": 0.2389, + "step": 1263 + }, + { + "epoch": 0.24168260038240919, + "grad_norm": 1.6669223308563232, + "learning_rate": 4.538072809959417e-06, + "loss": 0.1562, + "step": 1264 + }, + { + "epoch": 0.2418738049713193, + "grad_norm": 2.446153402328491, + "learning_rate": 4.538575303610137e-06, + "loss": 0.1492, + "step": 1265 + }, + { + "epoch": 0.24206500956022944, + "grad_norm": 1.535170555114746, + "learning_rate": 4.539077400189579e-06, + "loss": 0.0918, + "step": 1266 + }, + { + "epoch": 0.2422562141491396, + "grad_norm": 1.8623381853103638, + "learning_rate": 4.53957910032478e-06, + "loss": 0.107, + "step": 1267 + }, + { + "epoch": 0.2424474187380497, + "grad_norm": 2.7198643684387207, + "learning_rate": 4.5400804046412925e-06, + "loss": 0.256, + "step": 1268 + }, + { + "epoch": 0.24263862332695985, + "grad_norm": 2.15179443359375, + "learning_rate": 4.540581313763191e-06, + "loss": 0.1535, + "step": 1269 + }, + { + "epoch": 0.24282982791587, + "grad_norm": 2.4457833766937256, + "learning_rate": 4.541081828313074e-06, + "loss": 0.4226, + "step": 1270 + }, + { + "epoch": 0.2430210325047801, + "grad_norm": 2.3401358127593994, + "learning_rate": 4.54158194891207e-06, + "loss": 0.1621, + "step": 1271 + }, + { + "epoch": 0.24321223709369025, + "grad_norm": 2.9228503704071045, + "learning_rate": 4.542081676179842e-06, + "loss": 0.1591, + "step": 1272 + }, + { + "epoch": 0.2434034416826004, + "grad_norm": 1.8721615076065063, + "learning_rate": 4.542581010734594e-06, + "loss": 0.2154, + "step": 1273 + }, + { + "epoch": 0.2435946462715105, + "grad_norm": 1.5572518110275269, + "learning_rate": 4.54307995319307e-06, + "loss": 0.084, + "step": 1274 + }, + { + "epoch": 0.24378585086042065, + "grad_norm": 2.8869805335998535, + "learning_rate": 4.543578504170567e-06, + "loss": 0.1584, + "step": 1275 + }, + { + "epoch": 0.2439770554493308, + "grad_norm": 2.540313243865967, + "learning_rate": 4.544076664280929e-06, + "loss": 0.2337, + "step": 1276 + }, + { + "epoch": 0.2441682600382409, + "grad_norm": 2.9700610637664795, + "learning_rate": 4.544574434136564e-06, + "loss": 0.2049, + "step": 1277 + }, + { + "epoch": 0.24435946462715105, + "grad_norm": 2.179842710494995, + "learning_rate": 4.545071814348435e-06, + "loss": 0.3845, + "step": 1278 + }, + { + "epoch": 0.2445506692160612, + "grad_norm": 2.47133731842041, + "learning_rate": 4.5455688055260765e-06, + "loss": 0.2824, + "step": 1279 + }, + { + "epoch": 0.2447418738049713, + "grad_norm": 1.5787922143936157, + "learning_rate": 4.5460654082775925e-06, + "loss": 0.079, + "step": 1280 + }, + { + "epoch": 0.24493307839388145, + "grad_norm": 2.6020150184631348, + "learning_rate": 4.546561623209661e-06, + "loss": 0.1166, + "step": 1281 + }, + { + "epoch": 0.2451242829827916, + "grad_norm": 5.912609100341797, + "learning_rate": 4.547057450927541e-06, + "loss": 0.6601, + "step": 1282 + }, + { + "epoch": 0.2453154875717017, + "grad_norm": 3.048389434814453, + "learning_rate": 4.547552892035077e-06, + "loss": 0.5557, + "step": 1283 + }, + { + "epoch": 0.24550669216061186, + "grad_norm": 2.1050589084625244, + "learning_rate": 4.548047947134698e-06, + "loss": 0.1083, + "step": 1284 + }, + { + "epoch": 0.245697896749522, + "grad_norm": 2.0243983268737793, + "learning_rate": 4.5485426168274285e-06, + "loss": 0.1569, + "step": 1285 + }, + { + "epoch": 0.24588910133843211, + "grad_norm": 1.9739599227905273, + "learning_rate": 4.549036901712892e-06, + "loss": 0.1624, + "step": 1286 + }, + { + "epoch": 0.24608030592734226, + "grad_norm": 1.8565188646316528, + "learning_rate": 4.549530802389311e-06, + "loss": 0.0998, + "step": 1287 + }, + { + "epoch": 0.2462715105162524, + "grad_norm": 2.806781053543091, + "learning_rate": 4.550024319453516e-06, + "loss": 0.3977, + "step": 1288 + }, + { + "epoch": 0.24646271510516252, + "grad_norm": 2.98370623588562, + "learning_rate": 4.550517453500946e-06, + "loss": 0.4309, + "step": 1289 + }, + { + "epoch": 0.24665391969407266, + "grad_norm": 2.2158684730529785, + "learning_rate": 4.551010205125657e-06, + "loss": 0.3051, + "step": 1290 + }, + { + "epoch": 0.2468451242829828, + "grad_norm": 2.2170090675354004, + "learning_rate": 4.551502574920322e-06, + "loss": 0.2447, + "step": 1291 + }, + { + "epoch": 0.24703632887189292, + "grad_norm": 2.093726634979248, + "learning_rate": 4.551994563476239e-06, + "loss": 0.1615, + "step": 1292 + }, + { + "epoch": 0.24722753346080306, + "grad_norm": 0.953401505947113, + "learning_rate": 4.5524861713833315e-06, + "loss": 0.0517, + "step": 1293 + }, + { + "epoch": 0.2474187380497132, + "grad_norm": 2.464068651199341, + "learning_rate": 4.552977399230156e-06, + "loss": 0.1319, + "step": 1294 + }, + { + "epoch": 0.24760994263862332, + "grad_norm": 1.8242707252502441, + "learning_rate": 4.553468247603907e-06, + "loss": 0.1472, + "step": 1295 + }, + { + "epoch": 0.24780114722753346, + "grad_norm": 1.8389755487442017, + "learning_rate": 4.553958717090414e-06, + "loss": 0.2597, + "step": 1296 + }, + { + "epoch": 0.2479923518164436, + "grad_norm": 2.3550262451171875, + "learning_rate": 4.554448808274157e-06, + "loss": 0.1745, + "step": 1297 + }, + { + "epoch": 0.24818355640535372, + "grad_norm": 1.627742886543274, + "learning_rate": 4.554938521738259e-06, + "loss": 0.0905, + "step": 1298 + }, + { + "epoch": 0.24837476099426387, + "grad_norm": 3.0560617446899414, + "learning_rate": 4.555427858064501e-06, + "loss": 0.5314, + "step": 1299 + }, + { + "epoch": 0.248565965583174, + "grad_norm": 2.0782976150512695, + "learning_rate": 4.555916817833317e-06, + "loss": 0.0902, + "step": 1300 + }, + { + "epoch": 0.24875717017208412, + "grad_norm": 2.113381862640381, + "learning_rate": 4.556405401623804e-06, + "loss": 0.4115, + "step": 1301 + }, + { + "epoch": 0.24894837476099427, + "grad_norm": 1.6574503183364868, + "learning_rate": 4.5568936100137235e-06, + "loss": 0.1394, + "step": 1302 + }, + { + "epoch": 0.2491395793499044, + "grad_norm": 2.0017340183258057, + "learning_rate": 4.557381443579506e-06, + "loss": 0.1937, + "step": 1303 + }, + { + "epoch": 0.24933078393881453, + "grad_norm": 2.2529678344726562, + "learning_rate": 4.5578689028962575e-06, + "loss": 0.2572, + "step": 1304 + }, + { + "epoch": 0.24952198852772467, + "grad_norm": 1.7279530763626099, + "learning_rate": 4.558355988537758e-06, + "loss": 0.0845, + "step": 1305 + }, + { + "epoch": 0.2497131931166348, + "grad_norm": 3.168016195297241, + "learning_rate": 4.558842701076469e-06, + "loss": 0.174, + "step": 1306 + }, + { + "epoch": 0.24990439770554493, + "grad_norm": 2.5194199085235596, + "learning_rate": 4.559329041083543e-06, + "loss": 0.3312, + "step": 1307 + }, + { + "epoch": 0.25009560229445504, + "grad_norm": 3.844399929046631, + "learning_rate": 4.5598150091288164e-06, + "loss": 0.3948, + "step": 1308 + }, + { + "epoch": 0.2502868068833652, + "grad_norm": 1.3260141611099243, + "learning_rate": 4.560300605780819e-06, + "loss": 0.0967, + "step": 1309 + }, + { + "epoch": 0.25047801147227533, + "grad_norm": 1.3077999353408813, + "learning_rate": 4.5607858316067835e-06, + "loss": 0.1281, + "step": 1310 + }, + { + "epoch": 0.2506692160611855, + "grad_norm": 2.026874542236328, + "learning_rate": 4.561270687172638e-06, + "loss": 0.2637, + "step": 1311 + }, + { + "epoch": 0.2508604206500956, + "grad_norm": 1.997843861579895, + "learning_rate": 4.561755173043019e-06, + "loss": 0.1711, + "step": 1312 + }, + { + "epoch": 0.25105162523900576, + "grad_norm": 1.434065341949463, + "learning_rate": 4.562239289781273e-06, + "loss": 0.1526, + "step": 1313 + }, + { + "epoch": 0.25124282982791585, + "grad_norm": 2.5661134719848633, + "learning_rate": 4.5627230379494595e-06, + "loss": 0.4677, + "step": 1314 + }, + { + "epoch": 0.251434034416826, + "grad_norm": 2.9416589736938477, + "learning_rate": 4.5632064181083524e-06, + "loss": 0.337, + "step": 1315 + }, + { + "epoch": 0.25162523900573613, + "grad_norm": 2.0680315494537354, + "learning_rate": 4.56368943081745e-06, + "loss": 0.2415, + "step": 1316 + }, + { + "epoch": 0.2518164435946463, + "grad_norm": 1.4894253015518188, + "learning_rate": 4.564172076634976e-06, + "loss": 0.2191, + "step": 1317 + }, + { + "epoch": 0.2520076481835564, + "grad_norm": 2.523341655731201, + "learning_rate": 4.56465435611788e-06, + "loss": 0.1231, + "step": 1318 + }, + { + "epoch": 0.25219885277246656, + "grad_norm": 2.880688190460205, + "learning_rate": 4.5651362698218455e-06, + "loss": 0.2017, + "step": 1319 + }, + { + "epoch": 0.25239005736137665, + "grad_norm": 2.6411757469177246, + "learning_rate": 4.565617818301295e-06, + "loss": 0.5086, + "step": 1320 + }, + { + "epoch": 0.2525812619502868, + "grad_norm": 2.0425150394439697, + "learning_rate": 4.566099002109388e-06, + "loss": 0.2197, + "step": 1321 + }, + { + "epoch": 0.25277246653919694, + "grad_norm": 2.203280448913574, + "learning_rate": 4.56657982179803e-06, + "loss": 0.2555, + "step": 1322 + }, + { + "epoch": 0.2529636711281071, + "grad_norm": 1.5817688703536987, + "learning_rate": 4.567060277917876e-06, + "loss": 0.107, + "step": 1323 + }, + { + "epoch": 0.2531548757170172, + "grad_norm": 2.743864059448242, + "learning_rate": 4.567540371018329e-06, + "loss": 0.1049, + "step": 1324 + }, + { + "epoch": 0.25334608030592737, + "grad_norm": 1.1663135290145874, + "learning_rate": 4.568020101647551e-06, + "loss": 0.0449, + "step": 1325 + }, + { + "epoch": 0.25353728489483746, + "grad_norm": 2.348593235015869, + "learning_rate": 4.568499470352461e-06, + "loss": 0.4336, + "step": 1326 + }, + { + "epoch": 0.2537284894837476, + "grad_norm": 2.174994468688965, + "learning_rate": 4.568978477678743e-06, + "loss": 0.4255, + "step": 1327 + }, + { + "epoch": 0.25391969407265774, + "grad_norm": 2.499854564666748, + "learning_rate": 4.5694571241708465e-06, + "loss": 0.1284, + "step": 1328 + }, + { + "epoch": 0.2541108986615679, + "grad_norm": 1.8215583562850952, + "learning_rate": 4.5699354103719936e-06, + "loss": 0.1327, + "step": 1329 + }, + { + "epoch": 0.25430210325047803, + "grad_norm": 2.302138328552246, + "learning_rate": 4.570413336824176e-06, + "loss": 0.0985, + "step": 1330 + }, + { + "epoch": 0.25449330783938817, + "grad_norm": 2.055473804473877, + "learning_rate": 4.570890904068169e-06, + "loss": 0.0946, + "step": 1331 + }, + { + "epoch": 0.25468451242829826, + "grad_norm": 1.9759926795959473, + "learning_rate": 4.571368112643526e-06, + "loss": 0.1447, + "step": 1332 + }, + { + "epoch": 0.2548757170172084, + "grad_norm": 2.6056742668151855, + "learning_rate": 4.571844963088587e-06, + "loss": 0.4277, + "step": 1333 + }, + { + "epoch": 0.25506692160611855, + "grad_norm": 1.6580945253372192, + "learning_rate": 4.572321455940478e-06, + "loss": 0.3823, + "step": 1334 + }, + { + "epoch": 0.2552581261950287, + "grad_norm": 3.8294076919555664, + "learning_rate": 4.572797591735123e-06, + "loss": 0.0972, + "step": 1335 + }, + { + "epoch": 0.25544933078393883, + "grad_norm": 1.2979289293289185, + "learning_rate": 4.573273371007238e-06, + "loss": 0.0785, + "step": 1336 + }, + { + "epoch": 0.255640535372849, + "grad_norm": 1.569770097732544, + "learning_rate": 4.573748794290339e-06, + "loss": 0.1083, + "step": 1337 + }, + { + "epoch": 0.25583173996175906, + "grad_norm": 1.5974154472351074, + "learning_rate": 4.574223862116746e-06, + "loss": 0.1379, + "step": 1338 + }, + { + "epoch": 0.2560229445506692, + "grad_norm": 2.4823997020721436, + "learning_rate": 4.574698575017587e-06, + "loss": 0.3444, + "step": 1339 + }, + { + "epoch": 0.25621414913957935, + "grad_norm": 1.7080856561660767, + "learning_rate": 4.5751729335227995e-06, + "loss": 0.1214, + "step": 1340 + }, + { + "epoch": 0.2564053537284895, + "grad_norm": 2.0777196884155273, + "learning_rate": 4.575646938161135e-06, + "loss": 0.1628, + "step": 1341 + }, + { + "epoch": 0.25659655831739964, + "grad_norm": 1.514445424079895, + "learning_rate": 4.576120589460161e-06, + "loss": 0.1017, + "step": 1342 + }, + { + "epoch": 0.2567877629063097, + "grad_norm": 3.241065502166748, + "learning_rate": 4.576593887946269e-06, + "loss": 0.1865, + "step": 1343 + }, + { + "epoch": 0.25697896749521987, + "grad_norm": 1.9930219650268555, + "learning_rate": 4.577066834144674e-06, + "loss": 0.2197, + "step": 1344 + }, + { + "epoch": 0.25717017208413, + "grad_norm": 2.6061015129089355, + "learning_rate": 4.577539428579417e-06, + "loss": 0.4761, + "step": 1345 + }, + { + "epoch": 0.25736137667304015, + "grad_norm": 1.4190788269042969, + "learning_rate": 4.578011671773374e-06, + "loss": 0.1455, + "step": 1346 + }, + { + "epoch": 0.2575525812619503, + "grad_norm": 1.4043774604797363, + "learning_rate": 4.578483564248254e-06, + "loss": 0.0912, + "step": 1347 + }, + { + "epoch": 0.25774378585086044, + "grad_norm": 2.920255184173584, + "learning_rate": 4.578955106524605e-06, + "loss": 0.1808, + "step": 1348 + }, + { + "epoch": 0.25793499043977053, + "grad_norm": 2.587714672088623, + "learning_rate": 4.5794262991218156e-06, + "loss": 0.1429, + "step": 1349 + }, + { + "epoch": 0.25812619502868067, + "grad_norm": 2.5600852966308594, + "learning_rate": 4.5798971425581235e-06, + "loss": 0.2295, + "step": 1350 + }, + { + "epoch": 0.2583173996175908, + "grad_norm": 1.5082076787948608, + "learning_rate": 4.580367637350609e-06, + "loss": 0.174, + "step": 1351 + }, + { + "epoch": 0.25850860420650096, + "grad_norm": 1.4872022867202759, + "learning_rate": 4.580837784015212e-06, + "loss": 0.1193, + "step": 1352 + }, + { + "epoch": 0.2586998087954111, + "grad_norm": 3.185533285140991, + "learning_rate": 4.581307583066722e-06, + "loss": 0.1592, + "step": 1353 + }, + { + "epoch": 0.25889101338432124, + "grad_norm": 2.905579090118408, + "learning_rate": 4.58177703501879e-06, + "loss": 0.1793, + "step": 1354 + }, + { + "epoch": 0.25908221797323133, + "grad_norm": 1.6768579483032227, + "learning_rate": 4.58224614038393e-06, + "loss": 0.0814, + "step": 1355 + }, + { + "epoch": 0.2592734225621415, + "grad_norm": 4.425497055053711, + "learning_rate": 4.58271489967352e-06, + "loss": 0.2171, + "step": 1356 + }, + { + "epoch": 0.2594646271510516, + "grad_norm": 2.09071946144104, + "learning_rate": 4.5831833133978085e-06, + "loss": 0.1503, + "step": 1357 + }, + { + "epoch": 0.25965583173996176, + "grad_norm": 2.006558895111084, + "learning_rate": 4.583651382065915e-06, + "loss": 0.4281, + "step": 1358 + }, + { + "epoch": 0.2598470363288719, + "grad_norm": 1.6992747783660889, + "learning_rate": 4.584119106185835e-06, + "loss": 0.1838, + "step": 1359 + }, + { + "epoch": 0.26003824091778205, + "grad_norm": 2.895442485809326, + "learning_rate": 4.584586486264445e-06, + "loss": 0.5134, + "step": 1360 + }, + { + "epoch": 0.26022944550669214, + "grad_norm": 3.2242867946624756, + "learning_rate": 4.585053522807501e-06, + "loss": 0.1576, + "step": 1361 + }, + { + "epoch": 0.2604206500956023, + "grad_norm": 3.0295143127441406, + "learning_rate": 4.5855202163196466e-06, + "loss": 0.3062, + "step": 1362 + }, + { + "epoch": 0.2606118546845124, + "grad_norm": 1.9487686157226562, + "learning_rate": 4.585986567304413e-06, + "loss": 0.2205, + "step": 1363 + }, + { + "epoch": 0.26080305927342257, + "grad_norm": 2.4985392093658447, + "learning_rate": 4.586452576264223e-06, + "loss": 0.3626, + "step": 1364 + }, + { + "epoch": 0.2609942638623327, + "grad_norm": 3.514467716217041, + "learning_rate": 4.586918243700398e-06, + "loss": 0.365, + "step": 1365 + }, + { + "epoch": 0.26118546845124285, + "grad_norm": 2.703725576400757, + "learning_rate": 4.587383570113155e-06, + "loss": 0.4229, + "step": 1366 + }, + { + "epoch": 0.26137667304015294, + "grad_norm": 2.9062862396240234, + "learning_rate": 4.587848556001613e-06, + "loss": 0.2676, + "step": 1367 + }, + { + "epoch": 0.2615678776290631, + "grad_norm": 1.557067632675171, + "learning_rate": 4.588313201863795e-06, + "loss": 0.092, + "step": 1368 + }, + { + "epoch": 0.2617590822179732, + "grad_norm": 2.868450880050659, + "learning_rate": 4.588777508196637e-06, + "loss": 0.2177, + "step": 1369 + }, + { + "epoch": 0.26195028680688337, + "grad_norm": 2.503908157348633, + "learning_rate": 4.589241475495983e-06, + "loss": 0.3252, + "step": 1370 + }, + { + "epoch": 0.2621414913957935, + "grad_norm": 3.2766878604888916, + "learning_rate": 4.58970510425659e-06, + "loss": 0.5061, + "step": 1371 + }, + { + "epoch": 0.26233269598470366, + "grad_norm": 1.436415672302246, + "learning_rate": 4.5901683949721355e-06, + "loss": 0.113, + "step": 1372 + }, + { + "epoch": 0.26252390057361374, + "grad_norm": 1.4718507528305054, + "learning_rate": 4.590631348135217e-06, + "loss": 0.2098, + "step": 1373 + }, + { + "epoch": 0.2627151051625239, + "grad_norm": 1.3220497369766235, + "learning_rate": 4.591093964237357e-06, + "loss": 0.0534, + "step": 1374 + }, + { + "epoch": 0.26290630975143403, + "grad_norm": 2.3449716567993164, + "learning_rate": 4.591556243769003e-06, + "loss": 0.1064, + "step": 1375 + }, + { + "epoch": 0.2630975143403442, + "grad_norm": 2.357490301132202, + "learning_rate": 4.592018187219536e-06, + "loss": 0.5121, + "step": 1376 + }, + { + "epoch": 0.2632887189292543, + "grad_norm": 2.1978540420532227, + "learning_rate": 4.5924797950772665e-06, + "loss": 0.2471, + "step": 1377 + }, + { + "epoch": 0.26347992351816446, + "grad_norm": 2.617004871368408, + "learning_rate": 4.592941067829446e-06, + "loss": 0.3932, + "step": 1378 + }, + { + "epoch": 0.26367112810707455, + "grad_norm": 2.3759875297546387, + "learning_rate": 4.593402005962261e-06, + "loss": 0.3458, + "step": 1379 + }, + { + "epoch": 0.2638623326959847, + "grad_norm": 1.166843056678772, + "learning_rate": 4.593862609960843e-06, + "loss": 0.0701, + "step": 1380 + }, + { + "epoch": 0.26405353728489483, + "grad_norm": 2.4391300678253174, + "learning_rate": 4.594322880309272e-06, + "loss": 0.1061, + "step": 1381 + }, + { + "epoch": 0.264244741873805, + "grad_norm": 1.7300337553024292, + "learning_rate": 4.594782817490571e-06, + "loss": 0.1513, + "step": 1382 + }, + { + "epoch": 0.2644359464627151, + "grad_norm": 2.3044793605804443, + "learning_rate": 4.595242421986719e-06, + "loss": 0.2708, + "step": 1383 + }, + { + "epoch": 0.26462715105162526, + "grad_norm": 1.8326455354690552, + "learning_rate": 4.595701694278649e-06, + "loss": 0.253, + "step": 1384 + }, + { + "epoch": 0.26481835564053535, + "grad_norm": 2.248283863067627, + "learning_rate": 4.5961606348462506e-06, + "loss": 0.2861, + "step": 1385 + }, + { + "epoch": 0.2650095602294455, + "grad_norm": 3.2349724769592285, + "learning_rate": 4.596619244168376e-06, + "loss": 0.2644, + "step": 1386 + }, + { + "epoch": 0.26520076481835564, + "grad_norm": 3.4575836658477783, + "learning_rate": 4.59707752272284e-06, + "loss": 0.1486, + "step": 1387 + }, + { + "epoch": 0.2653919694072658, + "grad_norm": 1.597733497619629, + "learning_rate": 4.597535470986426e-06, + "loss": 0.1201, + "step": 1388 + }, + { + "epoch": 0.2655831739961759, + "grad_norm": 2.6269469261169434, + "learning_rate": 4.597993089434886e-06, + "loss": 0.18, + "step": 1389 + }, + { + "epoch": 0.26577437858508607, + "grad_norm": 2.98217511177063, + "learning_rate": 4.598450378542943e-06, + "loss": 0.3596, + "step": 1390 + }, + { + "epoch": 0.26596558317399616, + "grad_norm": 1.9663608074188232, + "learning_rate": 4.5989073387843e-06, + "loss": 0.2026, + "step": 1391 + }, + { + "epoch": 0.2661567877629063, + "grad_norm": 1.8245049715042114, + "learning_rate": 4.599363970631637e-06, + "loss": 0.1057, + "step": 1392 + }, + { + "epoch": 0.26634799235181644, + "grad_norm": 2.081806182861328, + "learning_rate": 4.599820274556611e-06, + "loss": 0.1292, + "step": 1393 + }, + { + "epoch": 0.2665391969407266, + "grad_norm": 2.341907262802124, + "learning_rate": 4.6002762510298725e-06, + "loss": 0.159, + "step": 1394 + }, + { + "epoch": 0.26673040152963673, + "grad_norm": 4.16287899017334, + "learning_rate": 4.600731900521051e-06, + "loss": 0.5912, + "step": 1395 + }, + { + "epoch": 0.2669216061185469, + "grad_norm": 2.3653371334075928, + "learning_rate": 4.601187223498774e-06, + "loss": 0.1569, + "step": 1396 + }, + { + "epoch": 0.26711281070745696, + "grad_norm": 1.7293826341629028, + "learning_rate": 4.601642220430655e-06, + "loss": 0.1551, + "step": 1397 + }, + { + "epoch": 0.2673040152963671, + "grad_norm": 2.207956314086914, + "learning_rate": 4.602096891783308e-06, + "loss": 0.1525, + "step": 1398 + }, + { + "epoch": 0.26749521988527725, + "grad_norm": 2.079894542694092, + "learning_rate": 4.6025512380223466e-06, + "loss": 0.2216, + "step": 1399 + }, + { + "epoch": 0.2676864244741874, + "grad_norm": 1.731223702430725, + "learning_rate": 4.603005259612382e-06, + "loss": 0.1097, + "step": 1400 + }, + { + "epoch": 0.26787762906309753, + "grad_norm": 2.4742074012756348, + "learning_rate": 4.603458957017036e-06, + "loss": 0.3192, + "step": 1401 + }, + { + "epoch": 0.2680688336520076, + "grad_norm": 2.1221306324005127, + "learning_rate": 4.603912330698932e-06, + "loss": 0.2922, + "step": 1402 + }, + { + "epoch": 0.26826003824091776, + "grad_norm": 3.109562873840332, + "learning_rate": 4.60436538111971e-06, + "loss": 0.3177, + "step": 1403 + }, + { + "epoch": 0.2684512428298279, + "grad_norm": 2.970853567123413, + "learning_rate": 4.6048181087400175e-06, + "loss": 0.3191, + "step": 1404 + }, + { + "epoch": 0.26864244741873805, + "grad_norm": 1.673386812210083, + "learning_rate": 4.605270514019522e-06, + "loss": 0.0833, + "step": 1405 + }, + { + "epoch": 0.2688336520076482, + "grad_norm": 2.224393606185913, + "learning_rate": 4.605722597416907e-06, + "loss": 0.2605, + "step": 1406 + }, + { + "epoch": 0.26902485659655834, + "grad_norm": 1.8234128952026367, + "learning_rate": 4.6061743593898815e-06, + "loss": 0.1346, + "step": 1407 + }, + { + "epoch": 0.2692160611854684, + "grad_norm": 1.949435830116272, + "learning_rate": 4.6066258003951735e-06, + "loss": 0.157, + "step": 1408 + }, + { + "epoch": 0.26940726577437857, + "grad_norm": 1.833545446395874, + "learning_rate": 4.607076920888543e-06, + "loss": 0.1708, + "step": 1409 + }, + { + "epoch": 0.2695984703632887, + "grad_norm": 2.0920236110687256, + "learning_rate": 4.607527721324779e-06, + "loss": 0.32, + "step": 1410 + }, + { + "epoch": 0.26978967495219885, + "grad_norm": 1.772287368774414, + "learning_rate": 4.6079782021577e-06, + "loss": 0.1181, + "step": 1411 + }, + { + "epoch": 0.269980879541109, + "grad_norm": 3.1658973693847656, + "learning_rate": 4.608428363840164e-06, + "loss": 0.1821, + "step": 1412 + }, + { + "epoch": 0.27017208413001914, + "grad_norm": 1.7124018669128418, + "learning_rate": 4.608878206824065e-06, + "loss": 0.0898, + "step": 1413 + }, + { + "epoch": 0.27036328871892923, + "grad_norm": 4.624269008636475, + "learning_rate": 4.6093277315603385e-06, + "loss": 0.461, + "step": 1414 + }, + { + "epoch": 0.27055449330783937, + "grad_norm": 3.7334680557250977, + "learning_rate": 4.609776938498964e-06, + "loss": 0.2932, + "step": 1415 + }, + { + "epoch": 0.2707456978967495, + "grad_norm": 2.483091354370117, + "learning_rate": 4.610225828088966e-06, + "loss": 0.2618, + "step": 1416 + }, + { + "epoch": 0.27093690248565966, + "grad_norm": 2.0719807147979736, + "learning_rate": 4.610674400778419e-06, + "loss": 0.256, + "step": 1417 + }, + { + "epoch": 0.2711281070745698, + "grad_norm": 2.7514090538024902, + "learning_rate": 4.6111226570144505e-06, + "loss": 0.2337, + "step": 1418 + }, + { + "epoch": 0.27131931166347995, + "grad_norm": 4.372493267059326, + "learning_rate": 4.611570597243238e-06, + "loss": 0.348, + "step": 1419 + }, + { + "epoch": 0.27151051625239003, + "grad_norm": 3.5149691104888916, + "learning_rate": 4.6120182219100225e-06, + "loss": 0.6993, + "step": 1420 + }, + { + "epoch": 0.2717017208413002, + "grad_norm": 2.984778881072998, + "learning_rate": 4.612465531459098e-06, + "loss": 0.2772, + "step": 1421 + }, + { + "epoch": 0.2718929254302103, + "grad_norm": 2.6740102767944336, + "learning_rate": 4.612912526333825e-06, + "loss": 0.3069, + "step": 1422 + }, + { + "epoch": 0.27208413001912046, + "grad_norm": 1.7484631538391113, + "learning_rate": 4.613359206976629e-06, + "loss": 0.0702, + "step": 1423 + }, + { + "epoch": 0.2722753346080306, + "grad_norm": 2.2864737510681152, + "learning_rate": 4.613805573829002e-06, + "loss": 0.1678, + "step": 1424 + }, + { + "epoch": 0.27246653919694075, + "grad_norm": 1.9719682931900024, + "learning_rate": 4.614251627331505e-06, + "loss": 0.1079, + "step": 1425 + }, + { + "epoch": 0.27265774378585084, + "grad_norm": 3.1089677810668945, + "learning_rate": 4.614697367923773e-06, + "loss": 0.4868, + "step": 1426 + }, + { + "epoch": 0.272848948374761, + "grad_norm": 3.177788734436035, + "learning_rate": 4.615142796044517e-06, + "loss": 0.1517, + "step": 1427 + }, + { + "epoch": 0.2730401529636711, + "grad_norm": 1.7205730676651, + "learning_rate": 4.615587912131526e-06, + "loss": 0.1392, + "step": 1428 + }, + { + "epoch": 0.27323135755258127, + "grad_norm": 2.4936370849609375, + "learning_rate": 4.61603271662167e-06, + "loss": 0.2818, + "step": 1429 + }, + { + "epoch": 0.2734225621414914, + "grad_norm": 2.6876871585845947, + "learning_rate": 4.616477209950898e-06, + "loss": 0.2261, + "step": 1430 + }, + { + "epoch": 0.27361376673040155, + "grad_norm": 2.5818753242492676, + "learning_rate": 4.616921392554251e-06, + "loss": 0.1231, + "step": 1431 + }, + { + "epoch": 0.27380497131931164, + "grad_norm": 2.133584499359131, + "learning_rate": 4.617365264865855e-06, + "loss": 0.2894, + "step": 1432 + }, + { + "epoch": 0.2739961759082218, + "grad_norm": 3.4310872554779053, + "learning_rate": 4.6178088273189265e-06, + "loss": 0.7312, + "step": 1433 + }, + { + "epoch": 0.2741873804971319, + "grad_norm": 1.8406034708023071, + "learning_rate": 4.618252080345775e-06, + "loss": 0.1771, + "step": 1434 + }, + { + "epoch": 0.27437858508604207, + "grad_norm": 1.9556293487548828, + "learning_rate": 4.61869502437781e-06, + "loss": 0.1997, + "step": 1435 + }, + { + "epoch": 0.2745697896749522, + "grad_norm": 2.400635242462158, + "learning_rate": 4.619137659845533e-06, + "loss": 0.1104, + "step": 1436 + }, + { + "epoch": 0.27476099426386236, + "grad_norm": 3.1298184394836426, + "learning_rate": 4.619579987178551e-06, + "loss": 0.2818, + "step": 1437 + }, + { + "epoch": 0.27495219885277244, + "grad_norm": 3.0977323055267334, + "learning_rate": 4.620022006805574e-06, + "loss": 0.5284, + "step": 1438 + }, + { + "epoch": 0.2751434034416826, + "grad_norm": 1.5940463542938232, + "learning_rate": 4.620463719154416e-06, + "loss": 0.2201, + "step": 1439 + }, + { + "epoch": 0.27533460803059273, + "grad_norm": 2.438342809677124, + "learning_rate": 4.620905124652002e-06, + "loss": 0.2274, + "step": 1440 + }, + { + "epoch": 0.2755258126195029, + "grad_norm": 1.9232128858566284, + "learning_rate": 4.6213462237243646e-06, + "loss": 0.1143, + "step": 1441 + }, + { + "epoch": 0.275717017208413, + "grad_norm": 2.3613779544830322, + "learning_rate": 4.621787016796653e-06, + "loss": 0.1331, + "step": 1442 + }, + { + "epoch": 0.27590822179732316, + "grad_norm": 1.824241280555725, + "learning_rate": 4.6222275042931295e-06, + "loss": 0.086, + "step": 1443 + }, + { + "epoch": 0.27609942638623325, + "grad_norm": 2.2183074951171875, + "learning_rate": 4.622667686637177e-06, + "loss": 0.1208, + "step": 1444 + }, + { + "epoch": 0.2762906309751434, + "grad_norm": 2.0111348628997803, + "learning_rate": 4.623107564251298e-06, + "loss": 0.1653, + "step": 1445 + }, + { + "epoch": 0.27648183556405354, + "grad_norm": 2.0708916187286377, + "learning_rate": 4.623547137557118e-06, + "loss": 0.1958, + "step": 1446 + }, + { + "epoch": 0.2766730401529637, + "grad_norm": 3.2641139030456543, + "learning_rate": 4.623986406975387e-06, + "loss": 0.2679, + "step": 1447 + }, + { + "epoch": 0.2768642447418738, + "grad_norm": 1.6562097072601318, + "learning_rate": 4.624425372925986e-06, + "loss": 0.1593, + "step": 1448 + }, + { + "epoch": 0.27705544933078396, + "grad_norm": 3.0824685096740723, + "learning_rate": 4.624864035827925e-06, + "loss": 0.1947, + "step": 1449 + }, + { + "epoch": 0.27724665391969405, + "grad_norm": 2.1108744144439697, + "learning_rate": 4.6253023960993445e-06, + "loss": 0.0826, + "step": 1450 + }, + { + "epoch": 0.2774378585086042, + "grad_norm": 1.9569447040557861, + "learning_rate": 4.625740454157524e-06, + "loss": 0.2948, + "step": 1451 + }, + { + "epoch": 0.27762906309751434, + "grad_norm": 2.475095272064209, + "learning_rate": 4.626178210418876e-06, + "loss": 0.2801, + "step": 1452 + }, + { + "epoch": 0.2778202676864245, + "grad_norm": 1.949391484260559, + "learning_rate": 4.626615665298957e-06, + "loss": 0.2627, + "step": 1453 + }, + { + "epoch": 0.2780114722753346, + "grad_norm": 3.6278679370880127, + "learning_rate": 4.627052819212466e-06, + "loss": 0.3923, + "step": 1454 + }, + { + "epoch": 0.27820267686424477, + "grad_norm": 1.5531980991363525, + "learning_rate": 4.627489672573243e-06, + "loss": 0.0759, + "step": 1455 + }, + { + "epoch": 0.27839388145315486, + "grad_norm": 2.221863269805908, + "learning_rate": 4.627926225794277e-06, + "loss": 0.1375, + "step": 1456 + }, + { + "epoch": 0.278585086042065, + "grad_norm": 3.4804158210754395, + "learning_rate": 4.628362479287708e-06, + "loss": 0.31, + "step": 1457 + }, + { + "epoch": 0.27877629063097514, + "grad_norm": 2.319563627243042, + "learning_rate": 4.628798433464823e-06, + "loss": 0.3307, + "step": 1458 + }, + { + "epoch": 0.2789674952198853, + "grad_norm": 2.1878955364227295, + "learning_rate": 4.62923408873607e-06, + "loss": 0.1792, + "step": 1459 + }, + { + "epoch": 0.27915869980879543, + "grad_norm": 1.8384076356887817, + "learning_rate": 4.629669445511046e-06, + "loss": 0.1425, + "step": 1460 + }, + { + "epoch": 0.2793499043977055, + "grad_norm": 3.533768892288208, + "learning_rate": 4.630104504198513e-06, + "loss": 0.2667, + "step": 1461 + }, + { + "epoch": 0.27954110898661566, + "grad_norm": 2.315150499343872, + "learning_rate": 4.6305392652063885e-06, + "loss": 0.1398, + "step": 1462 + }, + { + "epoch": 0.2797323135755258, + "grad_norm": 3.2896182537078857, + "learning_rate": 4.630973728941758e-06, + "loss": 0.3448, + "step": 1463 + }, + { + "epoch": 0.27992351816443595, + "grad_norm": 2.074253797531128, + "learning_rate": 4.631407895810868e-06, + "loss": 0.2655, + "step": 1464 + }, + { + "epoch": 0.2801147227533461, + "grad_norm": 2.220587730407715, + "learning_rate": 4.631841766219136e-06, + "loss": 0.4133, + "step": 1465 + }, + { + "epoch": 0.28030592734225623, + "grad_norm": 2.7555809020996094, + "learning_rate": 4.63227534057115e-06, + "loss": 0.3825, + "step": 1466 + }, + { + "epoch": 0.2804971319311663, + "grad_norm": 1.95787513256073, + "learning_rate": 4.6327086192706666e-06, + "loss": 0.1425, + "step": 1467 + }, + { + "epoch": 0.28068833652007646, + "grad_norm": 2.1310410499572754, + "learning_rate": 4.633141602720621e-06, + "loss": 0.1743, + "step": 1468 + }, + { + "epoch": 0.2808795411089866, + "grad_norm": 1.3947631120681763, + "learning_rate": 4.633574291323124e-06, + "loss": 0.0875, + "step": 1469 + }, + { + "epoch": 0.28107074569789675, + "grad_norm": 3.2543869018554688, + "learning_rate": 4.6340066854794634e-06, + "loss": 0.5823, + "step": 1470 + }, + { + "epoch": 0.2812619502868069, + "grad_norm": 2.7726821899414062, + "learning_rate": 4.634438785590112e-06, + "loss": 0.4261, + "step": 1471 + }, + { + "epoch": 0.28145315487571704, + "grad_norm": 1.8597933053970337, + "learning_rate": 4.634870592054722e-06, + "loss": 0.3469, + "step": 1472 + }, + { + "epoch": 0.2816443594646271, + "grad_norm": 1.49411141872406, + "learning_rate": 4.635302105272136e-06, + "loss": 0.0695, + "step": 1473 + }, + { + "epoch": 0.28183556405353727, + "grad_norm": 2.444284677505493, + "learning_rate": 4.635733325640381e-06, + "loss": 0.2761, + "step": 1474 + }, + { + "epoch": 0.2820267686424474, + "grad_norm": 2.646402359008789, + "learning_rate": 4.636164253556675e-06, + "loss": 0.1316, + "step": 1475 + }, + { + "epoch": 0.28221797323135756, + "grad_norm": 2.195561408996582, + "learning_rate": 4.636594889417429e-06, + "loss": 0.4064, + "step": 1476 + }, + { + "epoch": 0.2824091778202677, + "grad_norm": 1.763237476348877, + "learning_rate": 4.637025233618248e-06, + "loss": 0.1922, + "step": 1477 + }, + { + "epoch": 0.28260038240917784, + "grad_norm": 2.47198748588562, + "learning_rate": 4.637455286553934e-06, + "loss": 0.4326, + "step": 1478 + }, + { + "epoch": 0.28279158699808793, + "grad_norm": 1.6601879596710205, + "learning_rate": 4.637885048618489e-06, + "loss": 0.124, + "step": 1479 + }, + { + "epoch": 0.2829827915869981, + "grad_norm": 1.9923487901687622, + "learning_rate": 4.6383145202051135e-06, + "loss": 0.1602, + "step": 1480 + }, + { + "epoch": 0.2831739961759082, + "grad_norm": 1.6631501913070679, + "learning_rate": 4.638743701706214e-06, + "loss": 0.1, + "step": 1481 + }, + { + "epoch": 0.28336520076481836, + "grad_norm": 2.8882148265838623, + "learning_rate": 4.639172593513399e-06, + "loss": 0.3787, + "step": 1482 + }, + { + "epoch": 0.2835564053537285, + "grad_norm": 2.454380750656128, + "learning_rate": 4.639601196017489e-06, + "loss": 0.3458, + "step": 1483 + }, + { + "epoch": 0.28374760994263865, + "grad_norm": 2.356696367263794, + "learning_rate": 4.640029509608511e-06, + "loss": 0.2473, + "step": 1484 + }, + { + "epoch": 0.28393881453154873, + "grad_norm": 2.13679838180542, + "learning_rate": 4.640457534675704e-06, + "loss": 0.318, + "step": 1485 + }, + { + "epoch": 0.2841300191204589, + "grad_norm": 1.2381728887557983, + "learning_rate": 4.640885271607523e-06, + "loss": 0.0621, + "step": 1486 + }, + { + "epoch": 0.284321223709369, + "grad_norm": 2.054654121398926, + "learning_rate": 4.641312720791636e-06, + "loss": 0.33, + "step": 1487 + }, + { + "epoch": 0.28451242829827916, + "grad_norm": 3.917362928390503, + "learning_rate": 4.64173988261493e-06, + "loss": 0.1747, + "step": 1488 + }, + { + "epoch": 0.2847036328871893, + "grad_norm": 2.648787260055542, + "learning_rate": 4.642166757463516e-06, + "loss": 0.5647, + "step": 1489 + }, + { + "epoch": 0.28489483747609945, + "grad_norm": 2.680501937866211, + "learning_rate": 4.6425933457227225e-06, + "loss": 0.3256, + "step": 1490 + }, + { + "epoch": 0.28508604206500954, + "grad_norm": 2.612128734588623, + "learning_rate": 4.643019647777103e-06, + "loss": 0.2337, + "step": 1491 + }, + { + "epoch": 0.2852772466539197, + "grad_norm": 2.540062189102173, + "learning_rate": 4.6434456640104405e-06, + "loss": 0.1389, + "step": 1492 + }, + { + "epoch": 0.2854684512428298, + "grad_norm": 3.286160469055176, + "learning_rate": 4.643871394805745e-06, + "loss": 0.1752, + "step": 1493 + }, + { + "epoch": 0.28565965583173997, + "grad_norm": 1.7292871475219727, + "learning_rate": 4.644296840545256e-06, + "loss": 0.0987, + "step": 1494 + }, + { + "epoch": 0.2858508604206501, + "grad_norm": 2.197441816329956, + "learning_rate": 4.644722001610448e-06, + "loss": 0.1405, + "step": 1495 + }, + { + "epoch": 0.28604206500956025, + "grad_norm": 1.9219545125961304, + "learning_rate": 4.645146878382026e-06, + "loss": 0.2266, + "step": 1496 + }, + { + "epoch": 0.28623326959847034, + "grad_norm": 2.036558151245117, + "learning_rate": 4.645571471239938e-06, + "loss": 0.1706, + "step": 1497 + }, + { + "epoch": 0.2864244741873805, + "grad_norm": 1.5000076293945312, + "learning_rate": 4.6459957805633654e-06, + "loss": 0.1039, + "step": 1498 + }, + { + "epoch": 0.2866156787762906, + "grad_norm": 2.3707828521728516, + "learning_rate": 4.646419806730734e-06, + "loss": 0.1623, + "step": 1499 + }, + { + "epoch": 0.28680688336520077, + "grad_norm": 1.9618526697158813, + "learning_rate": 4.64684355011971e-06, + "loss": 0.1254, + "step": 1500 + }, + { + "epoch": 0.28680688336520077, + "eval_runtime": 761.9792, + "eval_samples_per_second": 2.013, + "eval_steps_per_second": 0.252, + "step": 1500 + }, + { + "epoch": 0.2869980879541109, + "grad_norm": 2.2166504859924316, + "learning_rate": 4.6472670111072075e-06, + "loss": 0.37, + "step": 1501 + }, + { + "epoch": 0.28718929254302106, + "grad_norm": 2.407583236694336, + "learning_rate": 4.647690190069383e-06, + "loss": 0.3667, + "step": 1502 + }, + { + "epoch": 0.28738049713193115, + "grad_norm": 2.0655288696289062, + "learning_rate": 4.648113087381647e-06, + "loss": 0.1062, + "step": 1503 + }, + { + "epoch": 0.2875717017208413, + "grad_norm": 1.6839008331298828, + "learning_rate": 4.648535703418657e-06, + "loss": 0.122, + "step": 1504 + }, + { + "epoch": 0.28776290630975143, + "grad_norm": 2.2787880897521973, + "learning_rate": 4.648958038554326e-06, + "loss": 0.3146, + "step": 1505 + }, + { + "epoch": 0.2879541108986616, + "grad_norm": 1.7219648361206055, + "learning_rate": 4.64938009316182e-06, + "loss": 0.0914, + "step": 1506 + }, + { + "epoch": 0.2881453154875717, + "grad_norm": 2.2042160034179688, + "learning_rate": 4.6498018676135644e-06, + "loss": 0.291, + "step": 1507 + }, + { + "epoch": 0.28833652007648186, + "grad_norm": 2.321742057800293, + "learning_rate": 4.65022336228124e-06, + "loss": 0.2794, + "step": 1508 + }, + { + "epoch": 0.28852772466539195, + "grad_norm": 2.547917127609253, + "learning_rate": 4.650644577535791e-06, + "loss": 0.5054, + "step": 1509 + }, + { + "epoch": 0.2887189292543021, + "grad_norm": 1.275026798248291, + "learning_rate": 4.651065513747423e-06, + "loss": 0.0787, + "step": 1510 + }, + { + "epoch": 0.28891013384321224, + "grad_norm": 3.1266400814056396, + "learning_rate": 4.651486171285608e-06, + "loss": 0.3595, + "step": 1511 + }, + { + "epoch": 0.2891013384321224, + "grad_norm": 1.9492202997207642, + "learning_rate": 4.651906550519083e-06, + "loss": 0.123, + "step": 1512 + }, + { + "epoch": 0.2892925430210325, + "grad_norm": 1.362431526184082, + "learning_rate": 4.652326651815854e-06, + "loss": 0.1088, + "step": 1513 + }, + { + "epoch": 0.28948374760994267, + "grad_norm": 1.5959398746490479, + "learning_rate": 4.652746475543199e-06, + "loss": 0.1612, + "step": 1514 + }, + { + "epoch": 0.28967495219885275, + "grad_norm": 1.9055774211883545, + "learning_rate": 4.6531660220676665e-06, + "loss": 0.2062, + "step": 1515 + }, + { + "epoch": 0.2898661567877629, + "grad_norm": 1.3554742336273193, + "learning_rate": 4.653585291755081e-06, + "loss": 0.1628, + "step": 1516 + }, + { + "epoch": 0.29005736137667304, + "grad_norm": 1.883872628211975, + "learning_rate": 4.65400428497054e-06, + "loss": 0.1861, + "step": 1517 + }, + { + "epoch": 0.2902485659655832, + "grad_norm": 1.0759433507919312, + "learning_rate": 4.654423002078425e-06, + "loss": 0.0531, + "step": 1518 + }, + { + "epoch": 0.2904397705544933, + "grad_norm": 2.0732216835021973, + "learning_rate": 4.654841443442393e-06, + "loss": 0.1208, + "step": 1519 + }, + { + "epoch": 0.29063097514340347, + "grad_norm": 2.8344123363494873, + "learning_rate": 4.655259609425383e-06, + "loss": 0.7317, + "step": 1520 + }, + { + "epoch": 0.29082217973231356, + "grad_norm": 1.3993836641311646, + "learning_rate": 4.655677500389621e-06, + "loss": 0.1063, + "step": 1521 + }, + { + "epoch": 0.2910133843212237, + "grad_norm": 1.489634394645691, + "learning_rate": 4.6560951166966175e-06, + "loss": 0.1145, + "step": 1522 + }, + { + "epoch": 0.29120458891013384, + "grad_norm": 1.8000231981277466, + "learning_rate": 4.656512458707168e-06, + "loss": 0.1213, + "step": 1523 + }, + { + "epoch": 0.291395793499044, + "grad_norm": 2.340092658996582, + "learning_rate": 4.656929526781362e-06, + "loss": 0.2737, + "step": 1524 + }, + { + "epoch": 0.29158699808795413, + "grad_norm": 2.1177775859832764, + "learning_rate": 4.6573463212785765e-06, + "loss": 0.0867, + "step": 1525 + }, + { + "epoch": 0.2917782026768642, + "grad_norm": 2.30710768699646, + "learning_rate": 4.657762842557484e-06, + "loss": 0.3876, + "step": 1526 + }, + { + "epoch": 0.29196940726577436, + "grad_norm": 2.207430362701416, + "learning_rate": 4.658179090976053e-06, + "loss": 0.2844, + "step": 1527 + }, + { + "epoch": 0.2921606118546845, + "grad_norm": 2.2533726692199707, + "learning_rate": 4.658595066891546e-06, + "loss": 0.1704, + "step": 1528 + }, + { + "epoch": 0.29235181644359465, + "grad_norm": 1.5354182720184326, + "learning_rate": 4.6590107706605244e-06, + "loss": 0.1379, + "step": 1529 + }, + { + "epoch": 0.2925430210325048, + "grad_norm": 1.293443202972412, + "learning_rate": 4.659426202638854e-06, + "loss": 0.0735, + "step": 1530 + }, + { + "epoch": 0.29273422562141493, + "grad_norm": 2.447760820388794, + "learning_rate": 4.6598413631817005e-06, + "loss": 0.0711, + "step": 1531 + }, + { + "epoch": 0.292925430210325, + "grad_norm": 3.0195038318634033, + "learning_rate": 4.660256252643533e-06, + "loss": 0.3391, + "step": 1532 + }, + { + "epoch": 0.29311663479923517, + "grad_norm": 2.5726795196533203, + "learning_rate": 4.660670871378128e-06, + "loss": 0.456, + "step": 1533 + }, + { + "epoch": 0.2933078393881453, + "grad_norm": 2.395726442337036, + "learning_rate": 4.6610852197385695e-06, + "loss": 0.4479, + "step": 1534 + }, + { + "epoch": 0.29349904397705545, + "grad_norm": 4.810997486114502, + "learning_rate": 4.661499298077252e-06, + "loss": 0.2757, + "step": 1535 + }, + { + "epoch": 0.2936902485659656, + "grad_norm": 1.4835280179977417, + "learning_rate": 4.66191310674588e-06, + "loss": 0.1364, + "step": 1536 + }, + { + "epoch": 0.29388145315487574, + "grad_norm": 1.9309308528900146, + "learning_rate": 4.662326646095474e-06, + "loss": 0.081, + "step": 1537 + }, + { + "epoch": 0.2940726577437858, + "grad_norm": 3.1075916290283203, + "learning_rate": 4.662739916476364e-06, + "loss": 0.3218, + "step": 1538 + }, + { + "epoch": 0.29426386233269597, + "grad_norm": 3.229728937149048, + "learning_rate": 4.663152918238205e-06, + "loss": 0.3299, + "step": 1539 + }, + { + "epoch": 0.2944550669216061, + "grad_norm": 2.6888089179992676, + "learning_rate": 4.663565651729963e-06, + "loss": 0.2274, + "step": 1540 + }, + { + "epoch": 0.29464627151051626, + "grad_norm": 1.4698622226715088, + "learning_rate": 4.66397811729993e-06, + "loss": 0.0906, + "step": 1541 + }, + { + "epoch": 0.2948374760994264, + "grad_norm": 2.7869653701782227, + "learning_rate": 4.664390315295716e-06, + "loss": 0.17, + "step": 1542 + }, + { + "epoch": 0.29502868068833654, + "grad_norm": 3.202296257019043, + "learning_rate": 4.664802246064258e-06, + "loss": 0.1493, + "step": 1543 + }, + { + "epoch": 0.29521988527724663, + "grad_norm": 1.7716546058654785, + "learning_rate": 4.665213909951816e-06, + "loss": 0.0703, + "step": 1544 + }, + { + "epoch": 0.2954110898661568, + "grad_norm": 4.290736675262451, + "learning_rate": 4.66562530730398e-06, + "loss": 0.3951, + "step": 1545 + }, + { + "epoch": 0.2956022944550669, + "grad_norm": 1.5324119329452515, + "learning_rate": 4.666036438465668e-06, + "loss": 0.167, + "step": 1546 + }, + { + "epoch": 0.29579349904397706, + "grad_norm": 1.529327154159546, + "learning_rate": 4.66644730378113e-06, + "loss": 0.158, + "step": 1547 + }, + { + "epoch": 0.2959847036328872, + "grad_norm": 1.6390931606292725, + "learning_rate": 4.666857903593945e-06, + "loss": 0.136, + "step": 1548 + }, + { + "epoch": 0.29617590822179735, + "grad_norm": 1.1221572160720825, + "learning_rate": 4.667268238247031e-06, + "loss": 0.0375, + "step": 1549 + }, + { + "epoch": 0.29636711281070743, + "grad_norm": 3.3055922985076904, + "learning_rate": 4.667678308082639e-06, + "loss": 0.1424, + "step": 1550 + }, + { + "epoch": 0.2965583173996176, + "grad_norm": 1.8237640857696533, + "learning_rate": 4.668088113442359e-06, + "loss": 0.2064, + "step": 1551 + }, + { + "epoch": 0.2967495219885277, + "grad_norm": 2.7643344402313232, + "learning_rate": 4.668497654667122e-06, + "loss": 0.415, + "step": 1552 + }, + { + "epoch": 0.29694072657743786, + "grad_norm": 3.103980302810669, + "learning_rate": 4.668906932097195e-06, + "loss": 0.5029, + "step": 1553 + }, + { + "epoch": 0.297131931166348, + "grad_norm": 2.982830286026001, + "learning_rate": 4.669315946072195e-06, + "loss": 0.4939, + "step": 1554 + }, + { + "epoch": 0.29732313575525815, + "grad_norm": 2.102238416671753, + "learning_rate": 4.669724696931077e-06, + "loss": 0.2851, + "step": 1555 + }, + { + "epoch": 0.29751434034416824, + "grad_norm": 2.107391357421875, + "learning_rate": 4.670133185012147e-06, + "loss": 0.1171, + "step": 1556 + }, + { + "epoch": 0.2977055449330784, + "grad_norm": 7.794981002807617, + "learning_rate": 4.670541410653059e-06, + "loss": 0.3182, + "step": 1557 + }, + { + "epoch": 0.2978967495219885, + "grad_norm": 3.63592529296875, + "learning_rate": 4.6709493741908105e-06, + "loss": 0.5255, + "step": 1558 + }, + { + "epoch": 0.29808795411089867, + "grad_norm": 2.3166344165802, + "learning_rate": 4.671357075961757e-06, + "loss": 0.2296, + "step": 1559 + }, + { + "epoch": 0.2982791586998088, + "grad_norm": 1.7584208250045776, + "learning_rate": 4.671764516301605e-06, + "loss": 0.2356, + "step": 1560 + }, + { + "epoch": 0.29847036328871895, + "grad_norm": 1.8798422813415527, + "learning_rate": 4.672171695545415e-06, + "loss": 0.1173, + "step": 1561 + }, + { + "epoch": 0.29866156787762904, + "grad_norm": 2.359449863433838, + "learning_rate": 4.672578614027603e-06, + "loss": 0.1093, + "step": 1562 + }, + { + "epoch": 0.2988527724665392, + "grad_norm": 2.927279233932495, + "learning_rate": 4.672985272081945e-06, + "loss": 0.4062, + "step": 1563 + }, + { + "epoch": 0.29904397705544933, + "grad_norm": 3.0216593742370605, + "learning_rate": 4.673391670041575e-06, + "loss": 0.5828, + "step": 1564 + }, + { + "epoch": 0.29923518164435947, + "grad_norm": 2.489259719848633, + "learning_rate": 4.673797808238989e-06, + "loss": 0.3301, + "step": 1565 + }, + { + "epoch": 0.2994263862332696, + "grad_norm": 2.547424793243408, + "learning_rate": 4.674203687006046e-06, + "loss": 0.2759, + "step": 1566 + }, + { + "epoch": 0.29961759082217976, + "grad_norm": 1.899907112121582, + "learning_rate": 4.674609306673967e-06, + "loss": 0.1262, + "step": 1567 + }, + { + "epoch": 0.29980879541108985, + "grad_norm": 1.9912757873535156, + "learning_rate": 4.675014667573342e-06, + "loss": 0.1042, + "step": 1568 + }, + { + "epoch": 0.3, + "grad_norm": 1.6315016746520996, + "learning_rate": 4.675419770034128e-06, + "loss": 0.0955, + "step": 1569 + }, + { + "epoch": 0.30019120458891013, + "grad_norm": 2.126676321029663, + "learning_rate": 4.675824614385652e-06, + "loss": 0.4287, + "step": 1570 + }, + { + "epoch": 0.3003824091778203, + "grad_norm": 1.8122913837432861, + "learning_rate": 4.6762292009566105e-06, + "loss": 0.1287, + "step": 1571 + }, + { + "epoch": 0.3005736137667304, + "grad_norm": 16.377099990844727, + "learning_rate": 4.676633530075071e-06, + "loss": 0.2717, + "step": 1572 + }, + { + "epoch": 0.30076481835564056, + "grad_norm": 2.3182194232940674, + "learning_rate": 4.677037602068479e-06, + "loss": 0.2243, + "step": 1573 + }, + { + "epoch": 0.30095602294455065, + "grad_norm": 1.5239766836166382, + "learning_rate": 4.677441417263654e-06, + "loss": 0.124, + "step": 1574 + }, + { + "epoch": 0.3011472275334608, + "grad_norm": 2.429558753967285, + "learning_rate": 4.677844975986791e-06, + "loss": 0.156, + "step": 1575 + }, + { + "epoch": 0.30133843212237094, + "grad_norm": 2.8561208248138428, + "learning_rate": 4.678248278563468e-06, + "loss": 0.316, + "step": 1576 + }, + { + "epoch": 0.3015296367112811, + "grad_norm": 2.1259777545928955, + "learning_rate": 4.678651325318638e-06, + "loss": 0.1972, + "step": 1577 + }, + { + "epoch": 0.3017208413001912, + "grad_norm": 1.0598630905151367, + "learning_rate": 4.67905411657664e-06, + "loss": 0.0997, + "step": 1578 + }, + { + "epoch": 0.30191204588910137, + "grad_norm": 2.3135368824005127, + "learning_rate": 4.679456652661196e-06, + "loss": 0.1955, + "step": 1579 + }, + { + "epoch": 0.30210325047801145, + "grad_norm": 1.8581372499465942, + "learning_rate": 4.679858933895413e-06, + "loss": 0.1274, + "step": 1580 + }, + { + "epoch": 0.3022944550669216, + "grad_norm": 1.0900535583496094, + "learning_rate": 4.680260960601784e-06, + "loss": 0.0515, + "step": 1581 + }, + { + "epoch": 0.30248565965583174, + "grad_norm": 2.1805429458618164, + "learning_rate": 4.680662733102189e-06, + "loss": 0.4408, + "step": 1582 + }, + { + "epoch": 0.3026768642447419, + "grad_norm": 3.509096145629883, + "learning_rate": 4.681064251717901e-06, + "loss": 0.4217, + "step": 1583 + }, + { + "epoch": 0.302868068833652, + "grad_norm": 2.643540620803833, + "learning_rate": 4.681465516769583e-06, + "loss": 0.1418, + "step": 1584 + }, + { + "epoch": 0.3030592734225621, + "grad_norm": 1.8429009914398193, + "learning_rate": 4.681866528577289e-06, + "loss": 0.1108, + "step": 1585 + }, + { + "epoch": 0.30325047801147226, + "grad_norm": 1.879594087600708, + "learning_rate": 4.6822672874604705e-06, + "loss": 0.2356, + "step": 1586 + }, + { + "epoch": 0.3034416826003824, + "grad_norm": 2.853865623474121, + "learning_rate": 4.6826677937379745e-06, + "loss": 0.1654, + "step": 1587 + }, + { + "epoch": 0.30363288718929254, + "grad_norm": 1.8738490343093872, + "learning_rate": 4.683068047728041e-06, + "loss": 0.2092, + "step": 1588 + }, + { + "epoch": 0.3038240917782027, + "grad_norm": 3.2390451431274414, + "learning_rate": 4.683468049748315e-06, + "loss": 0.528, + "step": 1589 + }, + { + "epoch": 0.30401529636711283, + "grad_norm": 2.1751222610473633, + "learning_rate": 4.683867800115839e-06, + "loss": 0.3197, + "step": 1590 + }, + { + "epoch": 0.3042065009560229, + "grad_norm": 3.2450201511383057, + "learning_rate": 4.684267299147057e-06, + "loss": 0.4242, + "step": 1591 + }, + { + "epoch": 0.30439770554493306, + "grad_norm": 1.056480050086975, + "learning_rate": 4.684666547157818e-06, + "loss": 0.1298, + "step": 1592 + }, + { + "epoch": 0.3045889101338432, + "grad_norm": 1.7769147157669067, + "learning_rate": 4.685065544463375e-06, + "loss": 0.1012, + "step": 1593 + }, + { + "epoch": 0.30478011472275335, + "grad_norm": 1.4946969747543335, + "learning_rate": 4.685464291378389e-06, + "loss": 0.0821, + "step": 1594 + }, + { + "epoch": 0.3049713193116635, + "grad_norm": 2.63144850730896, + "learning_rate": 4.6858627882169256e-06, + "loss": 0.3787, + "step": 1595 + }, + { + "epoch": 0.30516252390057363, + "grad_norm": 2.6494715213775635, + "learning_rate": 4.686261035292464e-06, + "loss": 0.3208, + "step": 1596 + }, + { + "epoch": 0.3053537284894837, + "grad_norm": 4.6903791427612305, + "learning_rate": 4.6866590329178915e-06, + "loss": 0.363, + "step": 1597 + }, + { + "epoch": 0.30554493307839387, + "grad_norm": 2.102142572402954, + "learning_rate": 4.68705678140551e-06, + "loss": 0.1513, + "step": 1598 + }, + { + "epoch": 0.305736137667304, + "grad_norm": 1.5582478046417236, + "learning_rate": 4.687454281067032e-06, + "loss": 0.0899, + "step": 1599 + }, + { + "epoch": 0.30592734225621415, + "grad_norm": 1.9125760793685913, + "learning_rate": 4.687851532213589e-06, + "loss": 0.0984, + "step": 1600 + }, + { + "epoch": 0.3061185468451243, + "grad_norm": 2.396864414215088, + "learning_rate": 4.688248535155727e-06, + "loss": 0.3593, + "step": 1601 + }, + { + "epoch": 0.30630975143403444, + "grad_norm": 1.5265965461730957, + "learning_rate": 4.688645290203411e-06, + "loss": 0.0922, + "step": 1602 + }, + { + "epoch": 0.3065009560229445, + "grad_norm": 2.4949896335601807, + "learning_rate": 4.689041797666025e-06, + "loss": 0.1977, + "step": 1603 + }, + { + "epoch": 0.30669216061185467, + "grad_norm": 2.181450605392456, + "learning_rate": 4.6894380578523775e-06, + "loss": 0.1048, + "step": 1604 + }, + { + "epoch": 0.3068833652007648, + "grad_norm": 1.2355436086654663, + "learning_rate": 4.689834071070693e-06, + "loss": 0.0392, + "step": 1605 + }, + { + "epoch": 0.30707456978967496, + "grad_norm": 3.254880666732788, + "learning_rate": 4.690229837628627e-06, + "loss": 0.1794, + "step": 1606 + }, + { + "epoch": 0.3072657743785851, + "grad_norm": 2.0266425609588623, + "learning_rate": 4.690625357833257e-06, + "loss": 0.1934, + "step": 1607 + }, + { + "epoch": 0.30745697896749524, + "grad_norm": 1.5074000358581543, + "learning_rate": 4.6910206319910875e-06, + "loss": 0.206, + "step": 1608 + }, + { + "epoch": 0.30764818355640533, + "grad_norm": 1.8750319480895996, + "learning_rate": 4.6914156604080515e-06, + "loss": 0.1769, + "step": 1609 + }, + { + "epoch": 0.3078393881453155, + "grad_norm": 1.9686181545257568, + "learning_rate": 4.691810443389513e-06, + "loss": 0.1552, + "step": 1610 + }, + { + "epoch": 0.3080305927342256, + "grad_norm": 2.02163028717041, + "learning_rate": 4.692204981240264e-06, + "loss": 0.1021, + "step": 1611 + }, + { + "epoch": 0.30822179732313576, + "grad_norm": 2.6845357418060303, + "learning_rate": 4.692599274264534e-06, + "loss": 0.1429, + "step": 1612 + }, + { + "epoch": 0.3084130019120459, + "grad_norm": 2.7308313846588135, + "learning_rate": 4.692993322765983e-06, + "loss": 0.3626, + "step": 1613 + }, + { + "epoch": 0.30860420650095605, + "grad_norm": 1.9989246129989624, + "learning_rate": 4.693387127047705e-06, + "loss": 0.3389, + "step": 1614 + }, + { + "epoch": 0.30879541108986613, + "grad_norm": 2.3005568981170654, + "learning_rate": 4.693780687412236e-06, + "loss": 0.3273, + "step": 1615 + }, + { + "epoch": 0.3089866156787763, + "grad_norm": 2.354796886444092, + "learning_rate": 4.694174004161545e-06, + "loss": 0.1299, + "step": 1616 + }, + { + "epoch": 0.3091778202676864, + "grad_norm": 1.5658437013626099, + "learning_rate": 4.6945670775970445e-06, + "loss": 0.0881, + "step": 1617 + }, + { + "epoch": 0.30936902485659656, + "grad_norm": 2.1076741218566895, + "learning_rate": 4.694959908019585e-06, + "loss": 0.2679, + "step": 1618 + }, + { + "epoch": 0.3095602294455067, + "grad_norm": 2.51861572265625, + "learning_rate": 4.6953524957294615e-06, + "loss": 0.1314, + "step": 1619 + }, + { + "epoch": 0.30975143403441685, + "grad_norm": 2.3451156616210938, + "learning_rate": 4.695744841026411e-06, + "loss": 0.2426, + "step": 1620 + }, + { + "epoch": 0.30994263862332694, + "grad_norm": 2.2577333450317383, + "learning_rate": 4.696136944209617e-06, + "loss": 0.3822, + "step": 1621 + }, + { + "epoch": 0.3101338432122371, + "grad_norm": 2.172588348388672, + "learning_rate": 4.696528805577708e-06, + "loss": 0.1405, + "step": 1622 + }, + { + "epoch": 0.3103250478011472, + "grad_norm": 3.17950177192688, + "learning_rate": 4.696920425428762e-06, + "loss": 0.3541, + "step": 1623 + }, + { + "epoch": 0.31051625239005737, + "grad_norm": 4.15213680267334, + "learning_rate": 4.6973118040603045e-06, + "loss": 0.1327, + "step": 1624 + }, + { + "epoch": 0.3107074569789675, + "grad_norm": 1.5607051849365234, + "learning_rate": 4.697702941769314e-06, + "loss": 0.0799, + "step": 1625 + }, + { + "epoch": 0.31089866156787765, + "grad_norm": 1.972217082977295, + "learning_rate": 4.698093838852218e-06, + "loss": 0.3492, + "step": 1626 + }, + { + "epoch": 0.31108986615678774, + "grad_norm": 2.502187728881836, + "learning_rate": 4.6984844956048994e-06, + "loss": 0.3884, + "step": 1627 + }, + { + "epoch": 0.3112810707456979, + "grad_norm": 3.3736958503723145, + "learning_rate": 4.698874912322695e-06, + "loss": 0.2595, + "step": 1628 + }, + { + "epoch": 0.31147227533460803, + "grad_norm": 1.729385495185852, + "learning_rate": 4.699265089300396e-06, + "loss": 0.0982, + "step": 1629 + }, + { + "epoch": 0.31166347992351817, + "grad_norm": 9.559845924377441, + "learning_rate": 4.699655026832254e-06, + "loss": 0.5083, + "step": 1630 + }, + { + "epoch": 0.3118546845124283, + "grad_norm": 2.132229804992676, + "learning_rate": 4.700044725211977e-06, + "loss": 0.1439, + "step": 1631 + }, + { + "epoch": 0.31204588910133846, + "grad_norm": 2.046917676925659, + "learning_rate": 4.700434184732733e-06, + "loss": 0.345, + "step": 1632 + }, + { + "epoch": 0.31223709369024855, + "grad_norm": 2.525454044342041, + "learning_rate": 4.700823405687152e-06, + "loss": 0.4536, + "step": 1633 + }, + { + "epoch": 0.3124282982791587, + "grad_norm": 2.227076768875122, + "learning_rate": 4.7012123883673265e-06, + "loss": 0.2261, + "step": 1634 + }, + { + "epoch": 0.31261950286806883, + "grad_norm": 1.19818115234375, + "learning_rate": 4.701601133064812e-06, + "loss": 0.0679, + "step": 1635 + }, + { + "epoch": 0.312810707456979, + "grad_norm": 2.9948644638061523, + "learning_rate": 4.701989640070631e-06, + "loss": 0.1852, + "step": 1636 + }, + { + "epoch": 0.3130019120458891, + "grad_norm": 2.5695416927337646, + "learning_rate": 4.70237790967527e-06, + "loss": 0.3184, + "step": 1637 + }, + { + "epoch": 0.31319311663479926, + "grad_norm": 2.033766508102417, + "learning_rate": 4.702765942168687e-06, + "loss": 0.1388, + "step": 1638 + }, + { + "epoch": 0.31338432122370935, + "grad_norm": 2.1387429237365723, + "learning_rate": 4.703153737840303e-06, + "loss": 0.2016, + "step": 1639 + }, + { + "epoch": 0.3135755258126195, + "grad_norm": 1.2150760889053345, + "learning_rate": 4.703541296979016e-06, + "loss": 0.1351, + "step": 1640 + }, + { + "epoch": 0.31376673040152964, + "grad_norm": 1.4617748260498047, + "learning_rate": 4.703928619873192e-06, + "loss": 0.0998, + "step": 1641 + }, + { + "epoch": 0.3139579349904398, + "grad_norm": 2.583080291748047, + "learning_rate": 4.704315706810671e-06, + "loss": 0.1376, + "step": 1642 + }, + { + "epoch": 0.3141491395793499, + "grad_norm": 1.4851268529891968, + "learning_rate": 4.7047025580787675e-06, + "loss": 0.0733, + "step": 1643 + }, + { + "epoch": 0.31434034416826, + "grad_norm": 2.7585723400115967, + "learning_rate": 4.7050891739642704e-06, + "loss": 0.1577, + "step": 1644 + }, + { + "epoch": 0.31453154875717015, + "grad_norm": 2.7079718112945557, + "learning_rate": 4.705475554753447e-06, + "loss": 0.2468, + "step": 1645 + }, + { + "epoch": 0.3147227533460803, + "grad_norm": 2.084747791290283, + "learning_rate": 4.705861700732041e-06, + "loss": 0.1464, + "step": 1646 + }, + { + "epoch": 0.31491395793499044, + "grad_norm": 3.7070984840393066, + "learning_rate": 4.706247612185277e-06, + "loss": 0.1427, + "step": 1647 + }, + { + "epoch": 0.3151051625239006, + "grad_norm": 1.74429190158844, + "learning_rate": 4.70663328939786e-06, + "loss": 0.1169, + "step": 1648 + }, + { + "epoch": 0.3152963671128107, + "grad_norm": 2.154435157775879, + "learning_rate": 4.707018732653974e-06, + "loss": 0.2084, + "step": 1649 + }, + { + "epoch": 0.3154875717017208, + "grad_norm": 2.1565818786621094, + "learning_rate": 4.707403942237291e-06, + "loss": 0.1462, + "step": 1650 + }, + { + "epoch": 0.31567877629063096, + "grad_norm": 3.50270676612854, + "learning_rate": 4.707788918430965e-06, + "loss": 0.6104, + "step": 1651 + }, + { + "epoch": 0.3158699808795411, + "grad_norm": 1.7462185621261597, + "learning_rate": 4.708173661517635e-06, + "loss": 0.1656, + "step": 1652 + }, + { + "epoch": 0.31606118546845124, + "grad_norm": 1.6207178831100464, + "learning_rate": 4.708558171779426e-06, + "loss": 0.1279, + "step": 1653 + }, + { + "epoch": 0.3162523900573614, + "grad_norm": 2.2919764518737793, + "learning_rate": 4.7089424494979555e-06, + "loss": 0.2565, + "step": 1654 + }, + { + "epoch": 0.31644359464627153, + "grad_norm": 2.5331718921661377, + "learning_rate": 4.709326494954326e-06, + "loss": 0.1646, + "step": 1655 + }, + { + "epoch": 0.3166347992351816, + "grad_norm": 2.5295658111572266, + "learning_rate": 4.709710308429132e-06, + "loss": 0.1328, + "step": 1656 + }, + { + "epoch": 0.31682600382409176, + "grad_norm": 2.147109031677246, + "learning_rate": 4.710093890202459e-06, + "loss": 0.2947, + "step": 1657 + }, + { + "epoch": 0.3170172084130019, + "grad_norm": 2.6603047847747803, + "learning_rate": 4.71047724055389e-06, + "loss": 0.4228, + "step": 1658 + }, + { + "epoch": 0.31720841300191205, + "grad_norm": 3.075491189956665, + "learning_rate": 4.710860359762494e-06, + "loss": 0.4414, + "step": 1659 + }, + { + "epoch": 0.3173996175908222, + "grad_norm": 2.007510185241699, + "learning_rate": 4.711243248106844e-06, + "loss": 0.1255, + "step": 1660 + }, + { + "epoch": 0.31759082217973233, + "grad_norm": 1.8735504150390625, + "learning_rate": 4.711625905865004e-06, + "loss": 0.1407, + "step": 1661 + }, + { + "epoch": 0.3177820267686424, + "grad_norm": 1.5158590078353882, + "learning_rate": 4.7120083333145385e-06, + "loss": 0.0653, + "step": 1662 + }, + { + "epoch": 0.31797323135755257, + "grad_norm": 2.0995800495147705, + "learning_rate": 4.712390530732511e-06, + "loss": 0.3412, + "step": 1663 + }, + { + "epoch": 0.3181644359464627, + "grad_norm": 2.8367748260498047, + "learning_rate": 4.712772498395484e-06, + "loss": 0.3504, + "step": 1664 + }, + { + "epoch": 0.31835564053537285, + "grad_norm": 2.228076696395874, + "learning_rate": 4.713154236579523e-06, + "loss": 0.2741, + "step": 1665 + }, + { + "epoch": 0.318546845124283, + "grad_norm": 2.8032071590423584, + "learning_rate": 4.713535745560195e-06, + "loss": 0.4121, + "step": 1666 + }, + { + "epoch": 0.31873804971319314, + "grad_norm": 2.5112833976745605, + "learning_rate": 4.713917025612572e-06, + "loss": 0.4035, + "step": 1667 + }, + { + "epoch": 0.3189292543021032, + "grad_norm": 4.7532758712768555, + "learning_rate": 4.714298077011231e-06, + "loss": 0.1276, + "step": 1668 + }, + { + "epoch": 0.31912045889101337, + "grad_norm": 2.9184582233428955, + "learning_rate": 4.714678900030255e-06, + "loss": 0.1993, + "step": 1669 + }, + { + "epoch": 0.3193116634799235, + "grad_norm": 3.2624402046203613, + "learning_rate": 4.715059494943234e-06, + "loss": 0.5957, + "step": 1670 + }, + { + "epoch": 0.31950286806883366, + "grad_norm": 2.0953452587127686, + "learning_rate": 4.715439862023267e-06, + "loss": 0.2429, + "step": 1671 + }, + { + "epoch": 0.3196940726577438, + "grad_norm": 2.3828635215759277, + "learning_rate": 4.715820001542965e-06, + "loss": 0.2674, + "step": 1672 + }, + { + "epoch": 0.31988527724665394, + "grad_norm": 1.713576078414917, + "learning_rate": 4.716199913774444e-06, + "loss": 0.1027, + "step": 1673 + }, + { + "epoch": 0.32007648183556403, + "grad_norm": 2.204185724258423, + "learning_rate": 4.71657959898934e-06, + "loss": 0.0693, + "step": 1674 + }, + { + "epoch": 0.3202676864244742, + "grad_norm": 2.4515421390533447, + "learning_rate": 4.716959057458796e-06, + "loss": 0.2329, + "step": 1675 + }, + { + "epoch": 0.3204588910133843, + "grad_norm": 2.277780055999756, + "learning_rate": 4.717338289453474e-06, + "loss": 0.2594, + "step": 1676 + }, + { + "epoch": 0.32065009560229446, + "grad_norm": 1.873704433441162, + "learning_rate": 4.717717295243549e-06, + "loss": 0.3301, + "step": 1677 + }, + { + "epoch": 0.3208413001912046, + "grad_norm": 1.6470894813537598, + "learning_rate": 4.718096075098712e-06, + "loss": 0.1803, + "step": 1678 + }, + { + "epoch": 0.32103250478011475, + "grad_norm": 2.8109920024871826, + "learning_rate": 4.718474629288177e-06, + "loss": 0.1192, + "step": 1679 + }, + { + "epoch": 0.32122370936902483, + "grad_norm": 2.7735977172851562, + "learning_rate": 4.71885295808067e-06, + "loss": 0.3198, + "step": 1680 + }, + { + "epoch": 0.321414913957935, + "grad_norm": 3.0144803524017334, + "learning_rate": 4.719231061744443e-06, + "loss": 0.1269, + "step": 1681 + }, + { + "epoch": 0.3216061185468451, + "grad_norm": 2.3814454078674316, + "learning_rate": 4.7196089405472675e-06, + "loss": 0.1253, + "step": 1682 + }, + { + "epoch": 0.32179732313575526, + "grad_norm": 2.3879141807556152, + "learning_rate": 4.719986594756435e-06, + "loss": 0.4084, + "step": 1683 + }, + { + "epoch": 0.3219885277246654, + "grad_norm": 1.6740145683288574, + "learning_rate": 4.720364024638766e-06, + "loss": 0.0784, + "step": 1684 + }, + { + "epoch": 0.32217973231357555, + "grad_norm": 2.241508722305298, + "learning_rate": 4.7207412304606015e-06, + "loss": 0.298, + "step": 1685 + }, + { + "epoch": 0.32237093690248564, + "grad_norm": 1.0748635530471802, + "learning_rate": 4.72111821248781e-06, + "loss": 0.048, + "step": 1686 + }, + { + "epoch": 0.3225621414913958, + "grad_norm": 2.42177677154541, + "learning_rate": 4.721494970985786e-06, + "loss": 0.1027, + "step": 1687 + }, + { + "epoch": 0.3227533460803059, + "grad_norm": 2.5916969776153564, + "learning_rate": 4.721871506219455e-06, + "loss": 0.1818, + "step": 1688 + }, + { + "epoch": 0.32294455066921607, + "grad_norm": 1.406157374382019, + "learning_rate": 4.7222478184532676e-06, + "loss": 0.1544, + "step": 1689 + }, + { + "epoch": 0.3231357552581262, + "grad_norm": 1.2543416023254395, + "learning_rate": 4.722623907951209e-06, + "loss": 0.1498, + "step": 1690 + }, + { + "epoch": 0.32332695984703635, + "grad_norm": 1.768796443939209, + "learning_rate": 4.722999774976792e-06, + "loss": 0.2372, + "step": 1691 + }, + { + "epoch": 0.32351816443594644, + "grad_norm": 2.0685126781463623, + "learning_rate": 4.723375419793066e-06, + "loss": 0.1395, + "step": 1692 + }, + { + "epoch": 0.3237093690248566, + "grad_norm": 3.7179152965545654, + "learning_rate": 4.723750842662612e-06, + "loss": 0.1303, + "step": 1693 + }, + { + "epoch": 0.32390057361376673, + "grad_norm": 2.238060712814331, + "learning_rate": 4.7241260438475445e-06, + "loss": 0.2068, + "step": 1694 + }, + { + "epoch": 0.3240917782026769, + "grad_norm": 2.760972023010254, + "learning_rate": 4.724501023609517e-06, + "loss": 0.4147, + "step": 1695 + }, + { + "epoch": 0.324282982791587, + "grad_norm": 3.159468412399292, + "learning_rate": 4.724875782209718e-06, + "loss": 0.6906, + "step": 1696 + }, + { + "epoch": 0.32447418738049716, + "grad_norm": 2.3904199600219727, + "learning_rate": 4.725250319908874e-06, + "loss": 0.4181, + "step": 1697 + }, + { + "epoch": 0.32466539196940725, + "grad_norm": 2.083958625793457, + "learning_rate": 4.725624636967252e-06, + "loss": 0.1321, + "step": 1698 + }, + { + "epoch": 0.3248565965583174, + "grad_norm": 2.2649729251861572, + "learning_rate": 4.725998733644659e-06, + "loss": 0.1378, + "step": 1699 + }, + { + "epoch": 0.32504780114722753, + "grad_norm": 2.6033074855804443, + "learning_rate": 4.726372610200442e-06, + "loss": 0.1434, + "step": 1700 + }, + { + "epoch": 0.3252390057361377, + "grad_norm": 2.7515151500701904, + "learning_rate": 4.726746266893492e-06, + "loss": 0.4188, + "step": 1701 + }, + { + "epoch": 0.3254302103250478, + "grad_norm": 2.8080029487609863, + "learning_rate": 4.727119703982244e-06, + "loss": 0.3093, + "step": 1702 + }, + { + "epoch": 0.3256214149139579, + "grad_norm": 3.603713274002075, + "learning_rate": 4.727492921724675e-06, + "loss": 0.4832, + "step": 1703 + }, + { + "epoch": 0.32581261950286805, + "grad_norm": 2.3233444690704346, + "learning_rate": 4.72786592037831e-06, + "loss": 0.3555, + "step": 1704 + }, + { + "epoch": 0.3260038240917782, + "grad_norm": 3.610562801361084, + "learning_rate": 4.728238700200221e-06, + "loss": 0.2113, + "step": 1705 + }, + { + "epoch": 0.32619502868068834, + "grad_norm": 1.6778076887130737, + "learning_rate": 4.728611261447025e-06, + "loss": 0.0959, + "step": 1706 + }, + { + "epoch": 0.3263862332695985, + "grad_norm": 2.616176128387451, + "learning_rate": 4.728983604374891e-06, + "loss": 0.3212, + "step": 1707 + }, + { + "epoch": 0.3265774378585086, + "grad_norm": 2.2144649028778076, + "learning_rate": 4.7293557292395365e-06, + "loss": 0.3198, + "step": 1708 + }, + { + "epoch": 0.3267686424474187, + "grad_norm": 1.6588743925094604, + "learning_rate": 4.72972763629623e-06, + "loss": 0.1418, + "step": 1709 + }, + { + "epoch": 0.32695984703632885, + "grad_norm": 3.0176148414611816, + "learning_rate": 4.730099325799792e-06, + "loss": 0.2423, + "step": 1710 + }, + { + "epoch": 0.327151051625239, + "grad_norm": 1.894723653793335, + "learning_rate": 4.730470798004597e-06, + "loss": 0.1414, + "step": 1711 + }, + { + "epoch": 0.32734225621414914, + "grad_norm": 2.1323254108428955, + "learning_rate": 4.730842053164572e-06, + "loss": 0.0995, + "step": 1712 + }, + { + "epoch": 0.3275334608030593, + "grad_norm": 3.2055537700653076, + "learning_rate": 4.7312130915332e-06, + "loss": 0.3597, + "step": 1713 + }, + { + "epoch": 0.3277246653919694, + "grad_norm": 2.552307605743408, + "learning_rate": 4.731583913363522e-06, + "loss": 0.1933, + "step": 1714 + }, + { + "epoch": 0.3279158699808795, + "grad_norm": 1.2854914665222168, + "learning_rate": 4.731954518908132e-06, + "loss": 0.2917, + "step": 1715 + }, + { + "epoch": 0.32810707456978966, + "grad_norm": 2.1804897785186768, + "learning_rate": 4.732324908419186e-06, + "loss": 0.2287, + "step": 1716 + }, + { + "epoch": 0.3282982791586998, + "grad_norm": 1.7139678001403809, + "learning_rate": 4.732695082148399e-06, + "loss": 0.1392, + "step": 1717 + }, + { + "epoch": 0.32848948374760994, + "grad_norm": 2.053713321685791, + "learning_rate": 4.733065040347042e-06, + "loss": 0.1129, + "step": 1718 + }, + { + "epoch": 0.3286806883365201, + "grad_norm": 1.8551483154296875, + "learning_rate": 4.733434783265955e-06, + "loss": 0.096, + "step": 1719 + }, + { + "epoch": 0.32887189292543023, + "grad_norm": 2.319152593612671, + "learning_rate": 4.733804311155533e-06, + "loss": 0.5004, + "step": 1720 + }, + { + "epoch": 0.3290630975143403, + "grad_norm": 2.038278102874756, + "learning_rate": 4.734173624265738e-06, + "loss": 0.1583, + "step": 1721 + }, + { + "epoch": 0.32925430210325046, + "grad_norm": 1.7227160930633545, + "learning_rate": 4.734542722846097e-06, + "loss": 0.2504, + "step": 1722 + }, + { + "epoch": 0.3294455066921606, + "grad_norm": 2.386646270751953, + "learning_rate": 4.734911607145701e-06, + "loss": 0.3351, + "step": 1723 + }, + { + "epoch": 0.32963671128107075, + "grad_norm": 2.143439769744873, + "learning_rate": 4.735280277413207e-06, + "loss": 0.1811, + "step": 1724 + }, + { + "epoch": 0.3298279158699809, + "grad_norm": 2.0049850940704346, + "learning_rate": 4.735648733896841e-06, + "loss": 0.1041, + "step": 1725 + }, + { + "epoch": 0.33001912045889104, + "grad_norm": 2.2885913848876953, + "learning_rate": 4.736016976844395e-06, + "loss": 0.2313, + "step": 1726 + }, + { + "epoch": 0.3302103250478011, + "grad_norm": 3.8230414390563965, + "learning_rate": 4.736385006503233e-06, + "loss": 0.6751, + "step": 1727 + }, + { + "epoch": 0.33040152963671127, + "grad_norm": 2.540400266647339, + "learning_rate": 4.7367528231202895e-06, + "loss": 0.2008, + "step": 1728 + }, + { + "epoch": 0.3305927342256214, + "grad_norm": 2.929203987121582, + "learning_rate": 4.737120426942068e-06, + "loss": 0.3909, + "step": 1729 + }, + { + "epoch": 0.33078393881453155, + "grad_norm": 1.263774037361145, + "learning_rate": 4.737487818214645e-06, + "loss": 0.1669, + "step": 1730 + }, + { + "epoch": 0.3309751434034417, + "grad_norm": 1.2126981019973755, + "learning_rate": 4.737854997183673e-06, + "loss": 0.1578, + "step": 1731 + }, + { + "epoch": 0.33116634799235184, + "grad_norm": 3.105952739715576, + "learning_rate": 4.738221964094376e-06, + "loss": 0.4275, + "step": 1732 + }, + { + "epoch": 0.3313575525812619, + "grad_norm": 1.326631784439087, + "learning_rate": 4.738588719191555e-06, + "loss": 0.1018, + "step": 1733 + }, + { + "epoch": 0.33154875717017207, + "grad_norm": 1.790029764175415, + "learning_rate": 4.738955262719585e-06, + "loss": 0.1735, + "step": 1734 + }, + { + "epoch": 0.3317399617590822, + "grad_norm": 2.4458529949188232, + "learning_rate": 4.739321594922423e-06, + "loss": 0.1493, + "step": 1735 + }, + { + "epoch": 0.33193116634799236, + "grad_norm": 1.3313581943511963, + "learning_rate": 4.7396877160436e-06, + "loss": 0.048, + "step": 1736 + }, + { + "epoch": 0.3321223709369025, + "grad_norm": 1.1828702688217163, + "learning_rate": 4.740053626326225e-06, + "loss": 0.0575, + "step": 1737 + }, + { + "epoch": 0.33231357552581264, + "grad_norm": 3.091031551361084, + "learning_rate": 4.740419326012995e-06, + "loss": 0.4912, + "step": 1738 + }, + { + "epoch": 0.33250478011472273, + "grad_norm": 2.2998857498168945, + "learning_rate": 4.740784815346178e-06, + "loss": 0.2701, + "step": 1739 + }, + { + "epoch": 0.3326959847036329, + "grad_norm": 1.3561292886734009, + "learning_rate": 4.741150094567632e-06, + "loss": 0.2125, + "step": 1740 + }, + { + "epoch": 0.332887189292543, + "grad_norm": 0.8512531518936157, + "learning_rate": 4.7415151639187964e-06, + "loss": 0.077, + "step": 1741 + }, + { + "epoch": 0.33307839388145316, + "grad_norm": 1.6298433542251587, + "learning_rate": 4.741880023640691e-06, + "loss": 0.1782, + "step": 1742 + }, + { + "epoch": 0.3332695984703633, + "grad_norm": 3.3909101486206055, + "learning_rate": 4.742244673973925e-06, + "loss": 0.2529, + "step": 1743 + }, + { + "epoch": 0.33346080305927345, + "grad_norm": 2.4966087341308594, + "learning_rate": 4.742609115158691e-06, + "loss": 0.2019, + "step": 1744 + }, + { + "epoch": 0.33365200764818354, + "grad_norm": 4.023033142089844, + "learning_rate": 4.74297334743477e-06, + "loss": 0.4731, + "step": 1745 + }, + { + "epoch": 0.3338432122370937, + "grad_norm": 3.0190768241882324, + "learning_rate": 4.74333737104153e-06, + "loss": 0.4793, + "step": 1746 + }, + { + "epoch": 0.3340344168260038, + "grad_norm": 3.289560556411743, + "learning_rate": 4.743701186217929e-06, + "loss": 0.1652, + "step": 1747 + }, + { + "epoch": 0.33422562141491396, + "grad_norm": 2.139427423477173, + "learning_rate": 4.744064793202513e-06, + "loss": 0.137, + "step": 1748 + }, + { + "epoch": 0.3344168260038241, + "grad_norm": 3.575605869293213, + "learning_rate": 4.74442819223342e-06, + "loss": 0.3851, + "step": 1749 + }, + { + "epoch": 0.33460803059273425, + "grad_norm": 3.1366186141967773, + "learning_rate": 4.744791383548379e-06, + "loss": 0.1949, + "step": 1750 + }, + { + "epoch": 0.33479923518164434, + "grad_norm": 1.9750595092773438, + "learning_rate": 4.745154367384712e-06, + "loss": 0.1648, + "step": 1751 + }, + { + "epoch": 0.3349904397705545, + "grad_norm": 1.4047598838806152, + "learning_rate": 4.745517143979335e-06, + "loss": 0.1517, + "step": 1752 + }, + { + "epoch": 0.3351816443594646, + "grad_norm": 1.289894700050354, + "learning_rate": 4.7458797135687565e-06, + "loss": 0.1399, + "step": 1753 + }, + { + "epoch": 0.33537284894837477, + "grad_norm": 0.9425712823867798, + "learning_rate": 4.746242076389082e-06, + "loss": 0.1055, + "step": 1754 + }, + { + "epoch": 0.3355640535372849, + "grad_norm": 2.4297313690185547, + "learning_rate": 4.746604232676014e-06, + "loss": 0.1552, + "step": 1755 + }, + { + "epoch": 0.33575525812619506, + "grad_norm": 2.6048977375030518, + "learning_rate": 4.746966182664851e-06, + "loss": 0.1051, + "step": 1756 + }, + { + "epoch": 0.33594646271510514, + "grad_norm": 3.050530433654785, + "learning_rate": 4.747327926590489e-06, + "loss": 0.3244, + "step": 1757 + }, + { + "epoch": 0.3361376673040153, + "grad_norm": 2.425839900970459, + "learning_rate": 4.747689464687424e-06, + "loss": 0.4203, + "step": 1758 + }, + { + "epoch": 0.33632887189292543, + "grad_norm": 1.1890416145324707, + "learning_rate": 4.748050797189752e-06, + "loss": 0.1145, + "step": 1759 + }, + { + "epoch": 0.3365200764818356, + "grad_norm": 2.5795624256134033, + "learning_rate": 4.74841192433117e-06, + "loss": 0.2012, + "step": 1760 + }, + { + "epoch": 0.3367112810707457, + "grad_norm": 1.8465324640274048, + "learning_rate": 4.7487728463449755e-06, + "loss": 0.1086, + "step": 1761 + }, + { + "epoch": 0.33690248565965586, + "grad_norm": 1.6357309818267822, + "learning_rate": 4.749133563464071e-06, + "loss": 0.1407, + "step": 1762 + }, + { + "epoch": 0.33709369024856595, + "grad_norm": 1.5176138877868652, + "learning_rate": 4.749494075920959e-06, + "loss": 0.0951, + "step": 1763 + }, + { + "epoch": 0.3372848948374761, + "grad_norm": 3.06494140625, + "learning_rate": 4.749854383947751e-06, + "loss": 0.453, + "step": 1764 + }, + { + "epoch": 0.33747609942638623, + "grad_norm": 3.943192481994629, + "learning_rate": 4.7502144877761604e-06, + "loss": 0.5386, + "step": 1765 + }, + { + "epoch": 0.3376673040152964, + "grad_norm": 1.6208007335662842, + "learning_rate": 4.750574387637508e-06, + "loss": 0.1078, + "step": 1766 + }, + { + "epoch": 0.3378585086042065, + "grad_norm": 1.6230381727218628, + "learning_rate": 4.750934083762721e-06, + "loss": 0.0768, + "step": 1767 + }, + { + "epoch": 0.3380497131931166, + "grad_norm": 1.910230040550232, + "learning_rate": 4.751293576382336e-06, + "loss": 0.1267, + "step": 1768 + }, + { + "epoch": 0.33824091778202675, + "grad_norm": 2.327993869781494, + "learning_rate": 4.751652865726499e-06, + "loss": 0.1338, + "step": 1769 + }, + { + "epoch": 0.3384321223709369, + "grad_norm": 2.1939034461975098, + "learning_rate": 4.752011952024963e-06, + "loss": 0.2042, + "step": 1770 + }, + { + "epoch": 0.33862332695984704, + "grad_norm": 1.9725449085235596, + "learning_rate": 4.752370835507094e-06, + "loss": 0.2497, + "step": 1771 + }, + { + "epoch": 0.3388145315487572, + "grad_norm": 3.0306098461151123, + "learning_rate": 4.752729516401868e-06, + "loss": 0.3951, + "step": 1772 + }, + { + "epoch": 0.3390057361376673, + "grad_norm": 2.3561394214630127, + "learning_rate": 4.753087994937877e-06, + "loss": 0.2254, + "step": 1773 + }, + { + "epoch": 0.3391969407265774, + "grad_norm": 2.5845770835876465, + "learning_rate": 4.753446271343321e-06, + "loss": 0.2025, + "step": 1774 + }, + { + "epoch": 0.33938814531548755, + "grad_norm": 2.2755463123321533, + "learning_rate": 4.753804345846018e-06, + "loss": 0.1169, + "step": 1775 + }, + { + "epoch": 0.3395793499043977, + "grad_norm": 2.320425271987915, + "learning_rate": 4.754162218673401e-06, + "loss": 0.3507, + "step": 1776 + }, + { + "epoch": 0.33977055449330784, + "grad_norm": 1.7574282884597778, + "learning_rate": 4.754519890052516e-06, + "loss": 0.3012, + "step": 1777 + }, + { + "epoch": 0.339961759082218, + "grad_norm": 1.9989606142044067, + "learning_rate": 4.75487736021003e-06, + "loss": 0.2198, + "step": 1778 + }, + { + "epoch": 0.34015296367112813, + "grad_norm": 1.8190861940383911, + "learning_rate": 4.7552346293722235e-06, + "loss": 0.1237, + "step": 1779 + }, + { + "epoch": 0.3403441682600382, + "grad_norm": 1.9230531454086304, + "learning_rate": 4.755591697764998e-06, + "loss": 0.1621, + "step": 1780 + }, + { + "epoch": 0.34053537284894836, + "grad_norm": 2.197153091430664, + "learning_rate": 4.755948565613874e-06, + "loss": 0.1253, + "step": 1781 + }, + { + "epoch": 0.3407265774378585, + "grad_norm": 2.13446044921875, + "learning_rate": 4.756305233143992e-06, + "loss": 0.2251, + "step": 1782 + }, + { + "epoch": 0.34091778202676865, + "grad_norm": 1.634322166442871, + "learning_rate": 4.756661700580113e-06, + "loss": 0.1433, + "step": 1783 + }, + { + "epoch": 0.3411089866156788, + "grad_norm": 1.0584380626678467, + "learning_rate": 4.757017968146622e-06, + "loss": 0.0601, + "step": 1784 + }, + { + "epoch": 0.34130019120458893, + "grad_norm": 2.613311529159546, + "learning_rate": 4.757374036067523e-06, + "loss": 0.3052, + "step": 1785 + }, + { + "epoch": 0.341491395793499, + "grad_norm": 2.105159044265747, + "learning_rate": 4.757729904566448e-06, + "loss": 0.0758, + "step": 1786 + }, + { + "epoch": 0.34168260038240916, + "grad_norm": 2.2200515270233154, + "learning_rate": 4.75808557386665e-06, + "loss": 0.2331, + "step": 1787 + }, + { + "epoch": 0.3418738049713193, + "grad_norm": 1.8687654733657837, + "learning_rate": 4.75844104419101e-06, + "loss": 0.1472, + "step": 1788 + }, + { + "epoch": 0.34206500956022945, + "grad_norm": 3.0916874408721924, + "learning_rate": 4.758796315762033e-06, + "loss": 0.3935, + "step": 1789 + }, + { + "epoch": 0.3422562141491396, + "grad_norm": 2.0571913719177246, + "learning_rate": 4.759151388801852e-06, + "loss": 0.1408, + "step": 1790 + }, + { + "epoch": 0.34244741873804974, + "grad_norm": 2.078958511352539, + "learning_rate": 4.759506263532227e-06, + "loss": 0.2162, + "step": 1791 + }, + { + "epoch": 0.3426386233269598, + "grad_norm": 1.2904046773910522, + "learning_rate": 4.759860940174549e-06, + "loss": 0.1126, + "step": 1792 + }, + { + "epoch": 0.34282982791586997, + "grad_norm": 1.762441635131836, + "learning_rate": 4.760215418949835e-06, + "loss": 0.1004, + "step": 1793 + }, + { + "epoch": 0.3430210325047801, + "grad_norm": 1.8758217096328735, + "learning_rate": 4.760569700078735e-06, + "loss": 0.0913, + "step": 1794 + }, + { + "epoch": 0.34321223709369025, + "grad_norm": 2.6033754348754883, + "learning_rate": 4.760923783781529e-06, + "loss": 0.2553, + "step": 1795 + }, + { + "epoch": 0.3434034416826004, + "grad_norm": 3.3962483406066895, + "learning_rate": 4.76127767027813e-06, + "loss": 0.5496, + "step": 1796 + }, + { + "epoch": 0.34359464627151054, + "grad_norm": 1.7760767936706543, + "learning_rate": 4.7616313597880805e-06, + "loss": 0.223, + "step": 1797 + }, + { + "epoch": 0.3437858508604206, + "grad_norm": 2.5091898441314697, + "learning_rate": 4.761984852530561e-06, + "loss": 0.0785, + "step": 1798 + }, + { + "epoch": 0.34397705544933077, + "grad_norm": 2.1788854598999023, + "learning_rate": 4.762338148724385e-06, + "loss": 0.2495, + "step": 1799 + }, + { + "epoch": 0.3441682600382409, + "grad_norm": 2.8695642948150635, + "learning_rate": 4.762691248587998e-06, + "loss": 0.1116, + "step": 1800 + }, + { + "epoch": 0.34435946462715106, + "grad_norm": 2.591621160507202, + "learning_rate": 4.763044152339487e-06, + "loss": 0.5715, + "step": 1801 + }, + { + "epoch": 0.3445506692160612, + "grad_norm": 2.8537497520446777, + "learning_rate": 4.76339686019657e-06, + "loss": 0.4617, + "step": 1802 + }, + { + "epoch": 0.34474187380497134, + "grad_norm": 2.406893014907837, + "learning_rate": 4.763749372376608e-06, + "loss": 0.0832, + "step": 1803 + }, + { + "epoch": 0.34493307839388143, + "grad_norm": 1.8675955533981323, + "learning_rate": 4.764101689096597e-06, + "loss": 0.1626, + "step": 1804 + }, + { + "epoch": 0.3451242829827916, + "grad_norm": 3.17629337310791, + "learning_rate": 4.7644538105731735e-06, + "loss": 0.3307, + "step": 1805 + }, + { + "epoch": 0.3453154875717017, + "grad_norm": 2.8467295169830322, + "learning_rate": 4.764805737022614e-06, + "loss": 0.2995, + "step": 1806 + }, + { + "epoch": 0.34550669216061186, + "grad_norm": 2.333732843399048, + "learning_rate": 4.765157468660835e-06, + "loss": 0.3135, + "step": 1807 + }, + { + "epoch": 0.345697896749522, + "grad_norm": 3.3674793243408203, + "learning_rate": 4.7655090057033955e-06, + "loss": 0.651, + "step": 1808 + }, + { + "epoch": 0.34588910133843215, + "grad_norm": 2.2502448558807373, + "learning_rate": 4.7658603483654965e-06, + "loss": 0.2983, + "step": 1809 + }, + { + "epoch": 0.34608030592734224, + "grad_norm": 2.531153917312622, + "learning_rate": 4.7662114968619835e-06, + "loss": 0.3743, + "step": 1810 + }, + { + "epoch": 0.3462715105162524, + "grad_norm": 2.929173231124878, + "learning_rate": 4.766562451407343e-06, + "loss": 0.2767, + "step": 1811 + }, + { + "epoch": 0.3464627151051625, + "grad_norm": 2.2841217517852783, + "learning_rate": 4.766913212215711e-06, + "loss": 0.1143, + "step": 1812 + }, + { + "epoch": 0.34665391969407267, + "grad_norm": 1.7222204208374023, + "learning_rate": 4.767263779500863e-06, + "loss": 0.118, + "step": 1813 + }, + { + "epoch": 0.3468451242829828, + "grad_norm": 2.1262247562408447, + "learning_rate": 4.767614153476226e-06, + "loss": 0.1722, + "step": 1814 + }, + { + "epoch": 0.34703632887189295, + "grad_norm": 1.8248090744018555, + "learning_rate": 4.7679643343548724e-06, + "loss": 0.1271, + "step": 1815 + }, + { + "epoch": 0.34722753346080304, + "grad_norm": 1.4642949104309082, + "learning_rate": 4.768314322349521e-06, + "loss": 0.0987, + "step": 1816 + }, + { + "epoch": 0.3474187380497132, + "grad_norm": 2.0707128047943115, + "learning_rate": 4.768664117672543e-06, + "loss": 0.0991, + "step": 1817 + }, + { + "epoch": 0.3476099426386233, + "grad_norm": 1.105255126953125, + "learning_rate": 4.769013720535954e-06, + "loss": 0.0739, + "step": 1818 + }, + { + "epoch": 0.34780114722753347, + "grad_norm": 1.9014976024627686, + "learning_rate": 4.769363131151425e-06, + "loss": 0.1378, + "step": 1819 + }, + { + "epoch": 0.3479923518164436, + "grad_norm": 2.5240933895111084, + "learning_rate": 4.769712349730274e-06, + "loss": 0.4471, + "step": 1820 + }, + { + "epoch": 0.34818355640535376, + "grad_norm": 1.6601219177246094, + "learning_rate": 4.770061376483473e-06, + "loss": 0.1254, + "step": 1821 + }, + { + "epoch": 0.34837476099426384, + "grad_norm": 2.0070979595184326, + "learning_rate": 4.770410211621644e-06, + "loss": 0.1297, + "step": 1822 + }, + { + "epoch": 0.348565965583174, + "grad_norm": 2.647048234939575, + "learning_rate": 4.7707588553550665e-06, + "loss": 0.2616, + "step": 1823 + }, + { + "epoch": 0.34875717017208413, + "grad_norm": 1.199033260345459, + "learning_rate": 4.77110730789367e-06, + "loss": 0.0857, + "step": 1824 + }, + { + "epoch": 0.3489483747609943, + "grad_norm": 2.3734302520751953, + "learning_rate": 4.771455569447043e-06, + "loss": 0.0676, + "step": 1825 + }, + { + "epoch": 0.3491395793499044, + "grad_norm": 2.570103883743286, + "learning_rate": 4.771803640224424e-06, + "loss": 0.3698, + "step": 1826 + }, + { + "epoch": 0.3493307839388145, + "grad_norm": 1.8896373510360718, + "learning_rate": 4.7721515204347135e-06, + "loss": 0.1165, + "step": 1827 + }, + { + "epoch": 0.34952198852772465, + "grad_norm": 2.3569037914276123, + "learning_rate": 4.772499210286465e-06, + "loss": 0.2643, + "step": 1828 + }, + { + "epoch": 0.3497131931166348, + "grad_norm": 3.0266520977020264, + "learning_rate": 4.772846709987891e-06, + "loss": 0.3738, + "step": 1829 + }, + { + "epoch": 0.34990439770554493, + "grad_norm": 3.1515417098999023, + "learning_rate": 4.773194019746864e-06, + "loss": 0.1745, + "step": 1830 + }, + { + "epoch": 0.3500956022944551, + "grad_norm": 1.1789801120758057, + "learning_rate": 4.773541139770914e-06, + "loss": 0.0523, + "step": 1831 + }, + { + "epoch": 0.3502868068833652, + "grad_norm": 2.357591152191162, + "learning_rate": 4.7738880702672316e-06, + "loss": 0.3621, + "step": 1832 + }, + { + "epoch": 0.3504780114722753, + "grad_norm": 1.8474746942520142, + "learning_rate": 4.77423481144267e-06, + "loss": 0.3042, + "step": 1833 + }, + { + "epoch": 0.35066921606118545, + "grad_norm": 1.6401458978652954, + "learning_rate": 4.77458136350374e-06, + "loss": 0.1098, + "step": 1834 + }, + { + "epoch": 0.3508604206500956, + "grad_norm": 2.561168670654297, + "learning_rate": 4.774927726656617e-06, + "loss": 0.3783, + "step": 1835 + }, + { + "epoch": 0.35105162523900574, + "grad_norm": 2.586186170578003, + "learning_rate": 4.775273901107143e-06, + "loss": 0.1671, + "step": 1836 + }, + { + "epoch": 0.3512428298279159, + "grad_norm": 0.993121325969696, + "learning_rate": 4.775619887060815e-06, + "loss": 0.0358, + "step": 1837 + }, + { + "epoch": 0.351434034416826, + "grad_norm": 3.0413339138031006, + "learning_rate": 4.775965684722804e-06, + "loss": 0.1799, + "step": 1838 + }, + { + "epoch": 0.3516252390057361, + "grad_norm": 2.575594902038574, + "learning_rate": 4.776311294297939e-06, + "loss": 0.3987, + "step": 1839 + }, + { + "epoch": 0.35181644359464626, + "grad_norm": 2.484976053237915, + "learning_rate": 4.776656715990719e-06, + "loss": 0.201, + "step": 1840 + }, + { + "epoch": 0.3520076481835564, + "grad_norm": 2.7293996810913086, + "learning_rate": 4.777001950005309e-06, + "loss": 0.5425, + "step": 1841 + }, + { + "epoch": 0.35219885277246654, + "grad_norm": 1.7221540212631226, + "learning_rate": 4.77734699654554e-06, + "loss": 0.1215, + "step": 1842 + }, + { + "epoch": 0.3523900573613767, + "grad_norm": 2.226285696029663, + "learning_rate": 4.777691855814912e-06, + "loss": 0.2002, + "step": 1843 + }, + { + "epoch": 0.35258126195028683, + "grad_norm": 3.086306571960449, + "learning_rate": 4.778036528016594e-06, + "loss": 0.2252, + "step": 1844 + }, + { + "epoch": 0.3527724665391969, + "grad_norm": 2.252471685409546, + "learning_rate": 4.778381013353426e-06, + "loss": 0.2922, + "step": 1845 + }, + { + "epoch": 0.35296367112810706, + "grad_norm": 3.0337913036346436, + "learning_rate": 4.778725312027913e-06, + "loss": 0.3342, + "step": 1846 + }, + { + "epoch": 0.3531548757170172, + "grad_norm": 2.6601107120513916, + "learning_rate": 4.7790694242422385e-06, + "loss": 0.2105, + "step": 1847 + }, + { + "epoch": 0.35334608030592735, + "grad_norm": 1.9365030527114868, + "learning_rate": 4.779413350198251e-06, + "loss": 0.192, + "step": 1848 + }, + { + "epoch": 0.3535372848948375, + "grad_norm": 2.894613027572632, + "learning_rate": 4.779757090097476e-06, + "loss": 0.175, + "step": 1849 + }, + { + "epoch": 0.35372848948374763, + "grad_norm": 2.1319048404693604, + "learning_rate": 4.780100644141109e-06, + "loss": 0.1175, + "step": 1850 + }, + { + "epoch": 0.3539196940726577, + "grad_norm": 2.9696707725524902, + "learning_rate": 4.780444012530022e-06, + "loss": 0.2393, + "step": 1851 + }, + { + "epoch": 0.35411089866156786, + "grad_norm": 2.417073965072632, + "learning_rate": 4.780787195464761e-06, + "loss": 0.335, + "step": 1852 + }, + { + "epoch": 0.354302103250478, + "grad_norm": 2.466055154800415, + "learning_rate": 4.7811301931455436e-06, + "loss": 0.3433, + "step": 1853 + }, + { + "epoch": 0.35449330783938815, + "grad_norm": 2.6776459217071533, + "learning_rate": 4.781473005772269e-06, + "loss": 0.3252, + "step": 1854 + }, + { + "epoch": 0.3546845124282983, + "grad_norm": 1.4896794557571411, + "learning_rate": 4.7818156335445075e-06, + "loss": 0.1096, + "step": 1855 + }, + { + "epoch": 0.35487571701720844, + "grad_norm": 2.7048983573913574, + "learning_rate": 4.782158076661511e-06, + "loss": 0.2342, + "step": 1856 + }, + { + "epoch": 0.3550669216061185, + "grad_norm": 2.1952528953552246, + "learning_rate": 4.782500335322208e-06, + "loss": 0.2551, + "step": 1857 + }, + { + "epoch": 0.35525812619502867, + "grad_norm": 2.7355797290802, + "learning_rate": 4.782842409725205e-06, + "loss": 0.4026, + "step": 1858 + }, + { + "epoch": 0.3554493307839388, + "grad_norm": 1.9009243249893188, + "learning_rate": 4.78318430006879e-06, + "loss": 0.169, + "step": 1859 + }, + { + "epoch": 0.35564053537284895, + "grad_norm": 1.3028790950775146, + "learning_rate": 4.783526006550927e-06, + "loss": 0.1357, + "step": 1860 + }, + { + "epoch": 0.3558317399617591, + "grad_norm": 2.2286794185638428, + "learning_rate": 4.783867529369265e-06, + "loss": 0.1787, + "step": 1861 + }, + { + "epoch": 0.35602294455066924, + "grad_norm": 1.97344970703125, + "learning_rate": 4.784208868721133e-06, + "loss": 0.0833, + "step": 1862 + }, + { + "epoch": 0.35621414913957933, + "grad_norm": 2.151817560195923, + "learning_rate": 4.784550024803541e-06, + "loss": 0.154, + "step": 1863 + }, + { + "epoch": 0.35640535372848947, + "grad_norm": 1.631117582321167, + "learning_rate": 4.784890997813184e-06, + "loss": 0.1474, + "step": 1864 + }, + { + "epoch": 0.3565965583173996, + "grad_norm": 2.0637216567993164, + "learning_rate": 4.785231787946437e-06, + "loss": 0.1154, + "step": 1865 + }, + { + "epoch": 0.35678776290630976, + "grad_norm": 1.8624078035354614, + "learning_rate": 4.785572395399365e-06, + "loss": 0.1261, + "step": 1866 + }, + { + "epoch": 0.3569789674952199, + "grad_norm": 2.4838550090789795, + "learning_rate": 4.785912820367712e-06, + "loss": 0.2282, + "step": 1867 + }, + { + "epoch": 0.35717017208413004, + "grad_norm": 2.778296709060669, + "learning_rate": 4.786253063046911e-06, + "loss": 0.3457, + "step": 1868 + }, + { + "epoch": 0.35736137667304013, + "grad_norm": 3.7155518531799316, + "learning_rate": 4.7865931236320795e-06, + "loss": 0.4199, + "step": 1869 + }, + { + "epoch": 0.3575525812619503, + "grad_norm": 2.4353044033050537, + "learning_rate": 4.7869330023180235e-06, + "loss": 0.1885, + "step": 1870 + }, + { + "epoch": 0.3577437858508604, + "grad_norm": 2.149007797241211, + "learning_rate": 4.787272699299234e-06, + "loss": 0.1746, + "step": 1871 + }, + { + "epoch": 0.35793499043977056, + "grad_norm": 3.386817216873169, + "learning_rate": 4.787612214769893e-06, + "loss": 0.2888, + "step": 1872 + }, + { + "epoch": 0.3581261950286807, + "grad_norm": 1.4428246021270752, + "learning_rate": 4.78795154892387e-06, + "loss": 0.1703, + "step": 1873 + }, + { + "epoch": 0.35831739961759085, + "grad_norm": 1.719489574432373, + "learning_rate": 4.788290701954725e-06, + "loss": 0.113, + "step": 1874 + }, + { + "epoch": 0.35850860420650094, + "grad_norm": 2.9463346004486084, + "learning_rate": 4.788629674055707e-06, + "loss": 0.1265, + "step": 1875 + }, + { + "epoch": 0.3586998087954111, + "grad_norm": 1.8955507278442383, + "learning_rate": 4.788968465419756e-06, + "loss": 0.1493, + "step": 1876 + }, + { + "epoch": 0.3588910133843212, + "grad_norm": 1.885257601737976, + "learning_rate": 4.789307076239504e-06, + "loss": 0.3264, + "step": 1877 + }, + { + "epoch": 0.35908221797323137, + "grad_norm": 2.1236987113952637, + "learning_rate": 4.789645506707277e-06, + "loss": 0.1504, + "step": 1878 + }, + { + "epoch": 0.3592734225621415, + "grad_norm": 1.592720627784729, + "learning_rate": 4.789983757015089e-06, + "loss": 0.2391, + "step": 1879 + }, + { + "epoch": 0.35946462715105165, + "grad_norm": 1.6620198488235474, + "learning_rate": 4.790321827354654e-06, + "loss": 0.2443, + "step": 1880 + }, + { + "epoch": 0.35965583173996174, + "grad_norm": 1.3143545389175415, + "learning_rate": 4.790659717917373e-06, + "loss": 0.1046, + "step": 1881 + }, + { + "epoch": 0.3598470363288719, + "grad_norm": 1.7060904502868652, + "learning_rate": 4.790997428894348e-06, + "loss": 0.2334, + "step": 1882 + }, + { + "epoch": 0.360038240917782, + "grad_norm": 2.8742563724517822, + "learning_rate": 4.791334960476374e-06, + "loss": 0.4188, + "step": 1883 + }, + { + "epoch": 0.36022944550669217, + "grad_norm": 3.0253193378448486, + "learning_rate": 4.79167231285394e-06, + "loss": 0.267, + "step": 1884 + }, + { + "epoch": 0.3604206500956023, + "grad_norm": 1.5449588298797607, + "learning_rate": 4.792009486217236e-06, + "loss": 0.1025, + "step": 1885 + }, + { + "epoch": 0.3606118546845124, + "grad_norm": 2.831562042236328, + "learning_rate": 4.792346480756146e-06, + "loss": 0.2385, + "step": 1886 + }, + { + "epoch": 0.36080305927342254, + "grad_norm": 2.17584490776062, + "learning_rate": 4.792683296660254e-06, + "loss": 0.1348, + "step": 1887 + }, + { + "epoch": 0.3609942638623327, + "grad_norm": 2.886467456817627, + "learning_rate": 4.793019934118841e-06, + "loss": 0.3341, + "step": 1888 + }, + { + "epoch": 0.36118546845124283, + "grad_norm": 2.703188896179199, + "learning_rate": 4.793356393320889e-06, + "loss": 0.5411, + "step": 1889 + }, + { + "epoch": 0.361376673040153, + "grad_norm": 2.883558988571167, + "learning_rate": 4.79369267445508e-06, + "loss": 0.4724, + "step": 1890 + }, + { + "epoch": 0.3615678776290631, + "grad_norm": 2.0322422981262207, + "learning_rate": 4.794028777709793e-06, + "loss": 0.1397, + "step": 1891 + }, + { + "epoch": 0.3617590822179732, + "grad_norm": 2.538921594619751, + "learning_rate": 4.794364703273114e-06, + "loss": 0.1836, + "step": 1892 + }, + { + "epoch": 0.36195028680688335, + "grad_norm": 1.7409203052520752, + "learning_rate": 4.7947004513328256e-06, + "loss": 0.1028, + "step": 1893 + }, + { + "epoch": 0.3621414913957935, + "grad_norm": 2.6174840927124023, + "learning_rate": 4.795036022076417e-06, + "loss": 0.228, + "step": 1894 + }, + { + "epoch": 0.36233269598470363, + "grad_norm": 1.768669605255127, + "learning_rate": 4.795371415691077e-06, + "loss": 0.2068, + "step": 1895 + }, + { + "epoch": 0.3625239005736138, + "grad_norm": 1.426334261894226, + "learning_rate": 4.795706632363701e-06, + "loss": 0.1385, + "step": 1896 + }, + { + "epoch": 0.3627151051625239, + "grad_norm": 1.599705457687378, + "learning_rate": 4.796041672280887e-06, + "loss": 0.1014, + "step": 1897 + }, + { + "epoch": 0.362906309751434, + "grad_norm": 3.4072515964508057, + "learning_rate": 4.7963765356289385e-06, + "loss": 0.1648, + "step": 1898 + }, + { + "epoch": 0.36309751434034415, + "grad_norm": 2.493661642074585, + "learning_rate": 4.796711222593864e-06, + "loss": 0.106, + "step": 1899 + }, + { + "epoch": 0.3632887189292543, + "grad_norm": 1.6522670984268188, + "learning_rate": 4.79704573336138e-06, + "loss": 0.0677, + "step": 1900 + }, + { + "epoch": 0.36347992351816444, + "grad_norm": 2.6305341720581055, + "learning_rate": 4.797380068116908e-06, + "loss": 0.3414, + "step": 1901 + }, + { + "epoch": 0.3636711281070746, + "grad_norm": 2.2357025146484375, + "learning_rate": 4.797714227045577e-06, + "loss": 0.3488, + "step": 1902 + }, + { + "epoch": 0.3638623326959847, + "grad_norm": 2.4729673862457275, + "learning_rate": 4.7980482103322265e-06, + "loss": 0.3622, + "step": 1903 + }, + { + "epoch": 0.3640535372848948, + "grad_norm": 2.1345574855804443, + "learning_rate": 4.7983820181614016e-06, + "loss": 0.2182, + "step": 1904 + }, + { + "epoch": 0.36424474187380496, + "grad_norm": 3.113025188446045, + "learning_rate": 4.798715650717358e-06, + "loss": 0.1702, + "step": 1905 + }, + { + "epoch": 0.3644359464627151, + "grad_norm": 1.9702924489974976, + "learning_rate": 4.799049108184062e-06, + "loss": 0.1392, + "step": 1906 + }, + { + "epoch": 0.36462715105162524, + "grad_norm": 3.3608129024505615, + "learning_rate": 4.799382390745188e-06, + "loss": 0.6729, + "step": 1907 + }, + { + "epoch": 0.3648183556405354, + "grad_norm": 1.8882901668548584, + "learning_rate": 4.799715498584127e-06, + "loss": 0.1397, + "step": 1908 + }, + { + "epoch": 0.36500956022944553, + "grad_norm": 2.139448642730713, + "learning_rate": 4.800048431883974e-06, + "loss": 0.2139, + "step": 1909 + }, + { + "epoch": 0.3652007648183556, + "grad_norm": 1.5788694620132446, + "learning_rate": 4.800381190827542e-06, + "loss": 0.1232, + "step": 1910 + }, + { + "epoch": 0.36539196940726576, + "grad_norm": 1.1630561351776123, + "learning_rate": 4.800713775597355e-06, + "loss": 0.0644, + "step": 1911 + }, + { + "epoch": 0.3655831739961759, + "grad_norm": 1.5363394021987915, + "learning_rate": 4.8010461863756505e-06, + "loss": 0.0722, + "step": 1912 + }, + { + "epoch": 0.36577437858508605, + "grad_norm": 2.2994883060455322, + "learning_rate": 4.801378423344381e-06, + "loss": 0.234, + "step": 1913 + }, + { + "epoch": 0.3659655831739962, + "grad_norm": 2.314574956893921, + "learning_rate": 4.8017104866852135e-06, + "loss": 0.3868, + "step": 1914 + }, + { + "epoch": 0.36615678776290633, + "grad_norm": 2.861659288406372, + "learning_rate": 4.802042376579529e-06, + "loss": 0.3618, + "step": 1915 + }, + { + "epoch": 0.3663479923518164, + "grad_norm": 2.1803858280181885, + "learning_rate": 4.802374093208426e-06, + "loss": 0.2072, + "step": 1916 + }, + { + "epoch": 0.36653919694072656, + "grad_norm": 2.282540798187256, + "learning_rate": 4.8027056367527195e-06, + "loss": 0.1491, + "step": 1917 + }, + { + "epoch": 0.3667304015296367, + "grad_norm": 2.8533167839050293, + "learning_rate": 4.803037007392939e-06, + "loss": 0.2092, + "step": 1918 + }, + { + "epoch": 0.36692160611854685, + "grad_norm": 1.306018590927124, + "learning_rate": 4.803368205309336e-06, + "loss": 0.0812, + "step": 1919 + }, + { + "epoch": 0.367112810707457, + "grad_norm": 2.2597286701202393, + "learning_rate": 4.803699230681877e-06, + "loss": 0.3965, + "step": 1920 + }, + { + "epoch": 0.36730401529636714, + "grad_norm": 1.759872555732727, + "learning_rate": 4.804030083690248e-06, + "loss": 0.1801, + "step": 1921 + }, + { + "epoch": 0.3674952198852772, + "grad_norm": 2.418565273284912, + "learning_rate": 4.804360764513856e-06, + "loss": 0.3234, + "step": 1922 + }, + { + "epoch": 0.36768642447418737, + "grad_norm": 2.85204815864563, + "learning_rate": 4.804691273331826e-06, + "loss": 0.1746, + "step": 1923 + }, + { + "epoch": 0.3678776290630975, + "grad_norm": 3.226734161376953, + "learning_rate": 4.805021610323005e-06, + "loss": 0.115, + "step": 1924 + }, + { + "epoch": 0.36806883365200765, + "grad_norm": 2.01751446723938, + "learning_rate": 4.805351775665959e-06, + "loss": 0.1831, + "step": 1925 + }, + { + "epoch": 0.3682600382409178, + "grad_norm": 6.024718761444092, + "learning_rate": 4.805681769538982e-06, + "loss": 0.3726, + "step": 1926 + }, + { + "epoch": 0.36845124282982794, + "grad_norm": 1.2591511011123657, + "learning_rate": 4.80601159212008e-06, + "loss": 0.1463, + "step": 1927 + }, + { + "epoch": 0.36864244741873803, + "grad_norm": 4.118191719055176, + "learning_rate": 4.806341243586993e-06, + "loss": 0.4118, + "step": 1928 + }, + { + "epoch": 0.36883365200764817, + "grad_norm": 3.087899923324585, + "learning_rate": 4.806670724117176e-06, + "loss": 0.4401, + "step": 1929 + }, + { + "epoch": 0.3690248565965583, + "grad_norm": 1.9501676559448242, + "learning_rate": 4.807000033887813e-06, + "loss": 0.1647, + "step": 1930 + }, + { + "epoch": 0.36921606118546846, + "grad_norm": 1.5643742084503174, + "learning_rate": 4.807329173075811e-06, + "loss": 0.1077, + "step": 1931 + }, + { + "epoch": 0.3694072657743786, + "grad_norm": 1.5807640552520752, + "learning_rate": 4.8076581418578e-06, + "loss": 0.2282, + "step": 1932 + }, + { + "epoch": 0.36959847036328874, + "grad_norm": 2.416539430618286, + "learning_rate": 4.807986940410142e-06, + "loss": 0.2228, + "step": 1933 + }, + { + "epoch": 0.36978967495219883, + "grad_norm": 1.8993134498596191, + "learning_rate": 4.808315568908919e-06, + "loss": 0.1577, + "step": 1934 + }, + { + "epoch": 0.369980879541109, + "grad_norm": 2.963465690612793, + "learning_rate": 4.808644027529942e-06, + "loss": 0.3729, + "step": 1935 + }, + { + "epoch": 0.3701720841300191, + "grad_norm": 2.0274300575256348, + "learning_rate": 4.808972316448751e-06, + "loss": 0.1123, + "step": 1936 + }, + { + "epoch": 0.37036328871892926, + "grad_norm": 1.7501300573349, + "learning_rate": 4.809300435840613e-06, + "loss": 0.0785, + "step": 1937 + }, + { + "epoch": 0.3705544933078394, + "grad_norm": 1.8195841312408447, + "learning_rate": 4.809628385880523e-06, + "loss": 0.138, + "step": 1938 + }, + { + "epoch": 0.37074569789674955, + "grad_norm": 1.7135430574417114, + "learning_rate": 4.809956166743207e-06, + "loss": 0.1738, + "step": 1939 + }, + { + "epoch": 0.37093690248565964, + "grad_norm": 2.0836780071258545, + "learning_rate": 4.8102837786031185e-06, + "loss": 0.1386, + "step": 1940 + }, + { + "epoch": 0.3711281070745698, + "grad_norm": 1.6961652040481567, + "learning_rate": 4.8106112216344405e-06, + "loss": 0.1152, + "step": 1941 + }, + { + "epoch": 0.3713193116634799, + "grad_norm": 1.7368990182876587, + "learning_rate": 4.810938496011093e-06, + "loss": 0.1961, + "step": 1942 + }, + { + "epoch": 0.37151051625239007, + "grad_norm": 1.4816842079162598, + "learning_rate": 4.811265601906719e-06, + "loss": 0.0777, + "step": 1943 + }, + { + "epoch": 0.3717017208413002, + "grad_norm": 2.1479954719543457, + "learning_rate": 4.8115925394946985e-06, + "loss": 0.2723, + "step": 1944 + }, + { + "epoch": 0.37189292543021035, + "grad_norm": 2.2252044677734375, + "learning_rate": 4.811919308948144e-06, + "loss": 0.2639, + "step": 1945 + }, + { + "epoch": 0.37208413001912044, + "grad_norm": 1.9705275297164917, + "learning_rate": 4.8122459104399e-06, + "loss": 0.1681, + "step": 1946 + }, + { + "epoch": 0.3722753346080306, + "grad_norm": 3.6255764961242676, + "learning_rate": 4.812572344142544e-06, + "loss": 0.1158, + "step": 1947 + }, + { + "epoch": 0.3724665391969407, + "grad_norm": 1.2052736282348633, + "learning_rate": 4.812898610228388e-06, + "loss": 0.1344, + "step": 1948 + }, + { + "epoch": 0.37265774378585087, + "grad_norm": 2.316469669342041, + "learning_rate": 4.81322470886948e-06, + "loss": 0.2374, + "step": 1949 + }, + { + "epoch": 0.372848948374761, + "grad_norm": 2.0329482555389404, + "learning_rate": 4.813550640237602e-06, + "loss": 0.0955, + "step": 1950 + }, + { + "epoch": 0.3730401529636711, + "grad_norm": 2.659797191619873, + "learning_rate": 4.813876404504271e-06, + "loss": 0.4339, + "step": 1951 + }, + { + "epoch": 0.37323135755258124, + "grad_norm": 1.6562273502349854, + "learning_rate": 4.814202001840742e-06, + "loss": 0.1375, + "step": 1952 + }, + { + "epoch": 0.3734225621414914, + "grad_norm": 3.6155762672424316, + "learning_rate": 4.814527432418008e-06, + "loss": 0.1702, + "step": 1953 + }, + { + "epoch": 0.37361376673040153, + "grad_norm": 1.579047679901123, + "learning_rate": 4.814852696406796e-06, + "loss": 0.1116, + "step": 1954 + }, + { + "epoch": 0.3738049713193117, + "grad_norm": 2.239649772644043, + "learning_rate": 4.815177793977572e-06, + "loss": 0.1462, + "step": 1955 + }, + { + "epoch": 0.3739961759082218, + "grad_norm": 0.9502508640289307, + "learning_rate": 4.815502725300541e-06, + "loss": 0.0592, + "step": 1956 + }, + { + "epoch": 0.3741873804971319, + "grad_norm": 3.136199712753296, + "learning_rate": 4.815827490545649e-06, + "loss": 0.5801, + "step": 1957 + }, + { + "epoch": 0.37437858508604205, + "grad_norm": 4.284992694854736, + "learning_rate": 4.8161520898825794e-06, + "loss": 0.1291, + "step": 1958 + }, + { + "epoch": 0.3745697896749522, + "grad_norm": 2.919841766357422, + "learning_rate": 4.816476523480754e-06, + "loss": 0.4008, + "step": 1959 + }, + { + "epoch": 0.37476099426386233, + "grad_norm": 2.5471062660217285, + "learning_rate": 4.816800791509338e-06, + "loss": 0.0981, + "step": 1960 + }, + { + "epoch": 0.3749521988527725, + "grad_norm": 2.21247935295105, + "learning_rate": 4.817124894137239e-06, + "loss": 0.153, + "step": 1961 + }, + { + "epoch": 0.3751434034416826, + "grad_norm": 2.344128131866455, + "learning_rate": 4.8174488315331e-06, + "loss": 0.1729, + "step": 1962 + }, + { + "epoch": 0.3753346080305927, + "grad_norm": 1.6539994478225708, + "learning_rate": 4.817772603865314e-06, + "loss": 0.1111, + "step": 1963 + }, + { + "epoch": 0.37552581261950285, + "grad_norm": 2.1132073402404785, + "learning_rate": 4.818096211302012e-06, + "loss": 0.3434, + "step": 1964 + }, + { + "epoch": 0.375717017208413, + "grad_norm": 2.918445110321045, + "learning_rate": 4.818419654011068e-06, + "loss": 0.4585, + "step": 1965 + }, + { + "epoch": 0.37590822179732314, + "grad_norm": 2.2636373043060303, + "learning_rate": 4.818742932160102e-06, + "loss": 0.3373, + "step": 1966 + }, + { + "epoch": 0.3760994263862333, + "grad_norm": 6.420876502990723, + "learning_rate": 4.8190660459164775e-06, + "loss": 0.1555, + "step": 1967 + }, + { + "epoch": 0.3762906309751434, + "grad_norm": 3.735738754272461, + "learning_rate": 4.819388995447304e-06, + "loss": 0.2304, + "step": 1968 + }, + { + "epoch": 0.3764818355640535, + "grad_norm": 0.7488231062889099, + "learning_rate": 4.819711780919433e-06, + "loss": 0.0496, + "step": 1969 + }, + { + "epoch": 0.37667304015296366, + "grad_norm": 3.412890672683716, + "learning_rate": 4.8200344024994645e-06, + "loss": 0.6286, + "step": 1970 + }, + { + "epoch": 0.3768642447418738, + "grad_norm": 1.569946050643921, + "learning_rate": 4.820356860353744e-06, + "loss": 0.1469, + "step": 1971 + }, + { + "epoch": 0.37705544933078394, + "grad_norm": 1.486785650253296, + "learning_rate": 4.820679154648364e-06, + "loss": 0.1043, + "step": 1972 + }, + { + "epoch": 0.3772466539196941, + "grad_norm": 1.4714351892471313, + "learning_rate": 4.821001285549165e-06, + "loss": 0.0955, + "step": 1973 + }, + { + "epoch": 0.37743785850860423, + "grad_norm": 3.8283934593200684, + "learning_rate": 4.821323253221735e-06, + "loss": 0.24, + "step": 1974 + }, + { + "epoch": 0.3776290630975143, + "grad_norm": 2.3622210025787354, + "learning_rate": 4.821645057831409e-06, + "loss": 0.2372, + "step": 1975 + }, + { + "epoch": 0.37782026768642446, + "grad_norm": 2.4711544513702393, + "learning_rate": 4.821966699543274e-06, + "loss": 0.2733, + "step": 1976 + }, + { + "epoch": 0.3780114722753346, + "grad_norm": 3.0288548469543457, + "learning_rate": 4.822288178522164e-06, + "loss": 0.3468, + "step": 1977 + }, + { + "epoch": 0.37820267686424475, + "grad_norm": 2.11275577545166, + "learning_rate": 4.822609494932662e-06, + "loss": 0.4129, + "step": 1978 + }, + { + "epoch": 0.3783938814531549, + "grad_norm": 2.2081563472747803, + "learning_rate": 4.822930648939106e-06, + "loss": 0.2547, + "step": 1979 + }, + { + "epoch": 0.37858508604206503, + "grad_norm": 1.3358185291290283, + "learning_rate": 4.823251640705579e-06, + "loss": 0.0734, + "step": 1980 + }, + { + "epoch": 0.3787762906309751, + "grad_norm": 1.5681957006454468, + "learning_rate": 4.82357247039592e-06, + "loss": 0.0682, + "step": 1981 + }, + { + "epoch": 0.37896749521988526, + "grad_norm": 3.349141836166382, + "learning_rate": 4.823893138173719e-06, + "loss": 0.2218, + "step": 1982 + }, + { + "epoch": 0.3791586998087954, + "grad_norm": 1.5513498783111572, + "learning_rate": 4.824213644202315e-06, + "loss": 0.1234, + "step": 1983 + }, + { + "epoch": 0.37934990439770555, + "grad_norm": 1.504607081413269, + "learning_rate": 4.824533988644806e-06, + "loss": 0.114, + "step": 1984 + }, + { + "epoch": 0.3795411089866157, + "grad_norm": 1.5265016555786133, + "learning_rate": 4.8248541716640375e-06, + "loss": 0.1458, + "step": 1985 + }, + { + "epoch": 0.37973231357552584, + "grad_norm": 1.8571374416351318, + "learning_rate": 4.825174193422613e-06, + "loss": 0.0707, + "step": 1986 + }, + { + "epoch": 0.3799235181644359, + "grad_norm": 1.7218410968780518, + "learning_rate": 4.82549405408289e-06, + "loss": 0.0912, + "step": 1987 + }, + { + "epoch": 0.38011472275334607, + "grad_norm": 2.492093324661255, + "learning_rate": 4.825813753806979e-06, + "loss": 0.2172, + "step": 1988 + }, + { + "epoch": 0.3803059273422562, + "grad_norm": 1.584241271018982, + "learning_rate": 4.8261332927567455e-06, + "loss": 0.1455, + "step": 1989 + }, + { + "epoch": 0.38049713193116635, + "grad_norm": 1.470940351486206, + "learning_rate": 4.826452671093815e-06, + "loss": 0.1538, + "step": 1990 + }, + { + "epoch": 0.3806883365200765, + "grad_norm": 0.8330782055854797, + "learning_rate": 4.826771888979564e-06, + "loss": 0.0504, + "step": 1991 + }, + { + "epoch": 0.38087954110898664, + "grad_norm": 2.128312110900879, + "learning_rate": 4.827090946575131e-06, + "loss": 0.1854, + "step": 1992 + }, + { + "epoch": 0.38107074569789673, + "grad_norm": 2.1762027740478516, + "learning_rate": 4.827409844041409e-06, + "loss": 0.1338, + "step": 1993 + }, + { + "epoch": 0.38126195028680687, + "grad_norm": 6.44410514831543, + "learning_rate": 4.827728581539049e-06, + "loss": 0.4493, + "step": 1994 + }, + { + "epoch": 0.381453154875717, + "grad_norm": 2.3031179904937744, + "learning_rate": 4.828047159228461e-06, + "loss": 0.1516, + "step": 1995 + }, + { + "epoch": 0.38164435946462716, + "grad_norm": 1.6921497583389282, + "learning_rate": 4.828365577269813e-06, + "loss": 0.2124, + "step": 1996 + }, + { + "epoch": 0.3818355640535373, + "grad_norm": 2.609638214111328, + "learning_rate": 4.828683835823034e-06, + "loss": 0.3682, + "step": 1997 + }, + { + "epoch": 0.38202676864244745, + "grad_norm": 1.6939135789871216, + "learning_rate": 4.82900193504781e-06, + "loss": 0.1457, + "step": 1998 + }, + { + "epoch": 0.38221797323135753, + "grad_norm": 1.4097681045532227, + "learning_rate": 4.829319875103591e-06, + "loss": 0.0805, + "step": 1999 + }, + { + "epoch": 0.3824091778202677, + "grad_norm": 1.39816153049469, + "learning_rate": 4.829637656149586e-06, + "loss": 0.1119, + "step": 2000 + }, + { + "epoch": 0.3824091778202677, + "eval_runtime": 849.7838, + "eval_samples_per_second": 1.805, + "eval_steps_per_second": 0.226, + "step": 2000 + }, + { + "epoch": 0.3826003824091778, + "grad_norm": 2.89485502243042, + "learning_rate": 4.829955278344763e-06, + "loss": 0.2111, + "step": 2001 + }, + { + "epoch": 0.38279158699808796, + "grad_norm": 2.1980834007263184, + "learning_rate": 4.830272741847855e-06, + "loss": 0.2678, + "step": 2002 + }, + { + "epoch": 0.3829827915869981, + "grad_norm": 6.06950569152832, + "learning_rate": 4.830590046817356e-06, + "loss": 0.5378, + "step": 2003 + }, + { + "epoch": 0.38317399617590825, + "grad_norm": 1.7232050895690918, + "learning_rate": 4.830907193411522e-06, + "loss": 0.1375, + "step": 2004 + }, + { + "epoch": 0.38336520076481834, + "grad_norm": 2.8828940391540527, + "learning_rate": 4.831224181788373e-06, + "loss": 0.2377, + "step": 2005 + }, + { + "epoch": 0.3835564053537285, + "grad_norm": 2.6020548343658447, + "learning_rate": 4.831541012105694e-06, + "loss": 0.1445, + "step": 2006 + }, + { + "epoch": 0.3837476099426386, + "grad_norm": 4.2495341300964355, + "learning_rate": 4.83185768452103e-06, + "loss": 0.3976, + "step": 2007 + }, + { + "epoch": 0.38393881453154877, + "grad_norm": 1.559543251991272, + "learning_rate": 4.832174199191696e-06, + "loss": 0.2264, + "step": 2008 + }, + { + "epoch": 0.3841300191204589, + "grad_norm": 1.8970059156417847, + "learning_rate": 4.8324905562747656e-06, + "loss": 0.1736, + "step": 2009 + }, + { + "epoch": 0.384321223709369, + "grad_norm": 1.7815033197402954, + "learning_rate": 4.832806755927084e-06, + "loss": 0.1395, + "step": 2010 + }, + { + "epoch": 0.38451242829827914, + "grad_norm": 1.9595319032669067, + "learning_rate": 4.833122798305259e-06, + "loss": 0.0992, + "step": 2011 + }, + { + "epoch": 0.3847036328871893, + "grad_norm": 1.122615098953247, + "learning_rate": 4.8334386835656655e-06, + "loss": 0.0475, + "step": 2012 + }, + { + "epoch": 0.3848948374760994, + "grad_norm": 1.7938095331192017, + "learning_rate": 4.8337544118644455e-06, + "loss": 0.0843, + "step": 2013 + }, + { + "epoch": 0.38508604206500957, + "grad_norm": 2.116589307785034, + "learning_rate": 4.834069983357508e-06, + "loss": 0.2812, + "step": 2014 + }, + { + "epoch": 0.3852772466539197, + "grad_norm": 1.9789377450942993, + "learning_rate": 4.83438539820053e-06, + "loss": 0.1502, + "step": 2015 + }, + { + "epoch": 0.3854684512428298, + "grad_norm": 1.4772371053695679, + "learning_rate": 4.834700656548958e-06, + "loss": 0.119, + "step": 2016 + }, + { + "epoch": 0.38565965583173994, + "grad_norm": 1.6071560382843018, + "learning_rate": 4.835015758558004e-06, + "loss": 0.1266, + "step": 2017 + }, + { + "epoch": 0.3858508604206501, + "grad_norm": 1.8250123262405396, + "learning_rate": 4.835330704382654e-06, + "loss": 0.1408, + "step": 2018 + }, + { + "epoch": 0.38604206500956023, + "grad_norm": 1.7217341661453247, + "learning_rate": 4.835645494177658e-06, + "loss": 0.0784, + "step": 2019 + }, + { + "epoch": 0.3862332695984704, + "grad_norm": 2.3684465885162354, + "learning_rate": 4.835960128097542e-06, + "loss": 0.336, + "step": 2020 + }, + { + "epoch": 0.3864244741873805, + "grad_norm": 2.564565420150757, + "learning_rate": 4.836274606296597e-06, + "loss": 0.4045, + "step": 2021 + }, + { + "epoch": 0.3866156787762906, + "grad_norm": 1.4181162118911743, + "learning_rate": 4.8365889289288894e-06, + "loss": 0.0994, + "step": 2022 + }, + { + "epoch": 0.38680688336520075, + "grad_norm": 2.774085760116577, + "learning_rate": 4.836903096148254e-06, + "loss": 0.2666, + "step": 2023 + }, + { + "epoch": 0.3869980879541109, + "grad_norm": 2.653083086013794, + "learning_rate": 4.837217108108301e-06, + "loss": 0.3805, + "step": 2024 + }, + { + "epoch": 0.38718929254302104, + "grad_norm": 2.3239336013793945, + "learning_rate": 4.837530964962407e-06, + "loss": 0.086, + "step": 2025 + }, + { + "epoch": 0.3873804971319312, + "grad_norm": 1.9220002889633179, + "learning_rate": 4.837844666863729e-06, + "loss": 0.2595, + "step": 2026 + }, + { + "epoch": 0.3875717017208413, + "grad_norm": 2.506913900375366, + "learning_rate": 4.838158213965192e-06, + "loss": 0.3344, + "step": 2027 + }, + { + "epoch": 0.3877629063097514, + "grad_norm": 2.293449640274048, + "learning_rate": 4.838471606419496e-06, + "loss": 0.1295, + "step": 2028 + }, + { + "epoch": 0.38795411089866155, + "grad_norm": 2.00219464302063, + "learning_rate": 4.8387848443791165e-06, + "loss": 0.1283, + "step": 2029 + }, + { + "epoch": 0.3881453154875717, + "grad_norm": 2.1243393421173096, + "learning_rate": 4.839097927996301e-06, + "loss": 0.1196, + "step": 2030 + }, + { + "epoch": 0.38833652007648184, + "grad_norm": 1.4236724376678467, + "learning_rate": 4.839410857423075e-06, + "loss": 0.1003, + "step": 2031 + }, + { + "epoch": 0.388527724665392, + "grad_norm": 1.9056960344314575, + "learning_rate": 4.839723632811237e-06, + "loss": 0.1268, + "step": 2032 + }, + { + "epoch": 0.3887189292543021, + "grad_norm": 2.2061145305633545, + "learning_rate": 4.840036254312363e-06, + "loss": 0.3048, + "step": 2033 + }, + { + "epoch": 0.3889101338432122, + "grad_norm": 1.490218997001648, + "learning_rate": 4.840348722077805e-06, + "loss": 0.1018, + "step": 2034 + }, + { + "epoch": 0.38910133843212236, + "grad_norm": 1.3906128406524658, + "learning_rate": 4.840661036258691e-06, + "loss": 0.079, + "step": 2035 + }, + { + "epoch": 0.3892925430210325, + "grad_norm": 2.2867212295532227, + "learning_rate": 4.840973197005928e-06, + "loss": 0.2561, + "step": 2036 + }, + { + "epoch": 0.38948374760994264, + "grad_norm": 2.6874730587005615, + "learning_rate": 4.841285204470199e-06, + "loss": 0.1251, + "step": 2037 + }, + { + "epoch": 0.3896749521988528, + "grad_norm": 2.0944433212280273, + "learning_rate": 4.841597058801967e-06, + "loss": 0.2646, + "step": 2038 + }, + { + "epoch": 0.38986615678776293, + "grad_norm": 1.4850279092788696, + "learning_rate": 4.84190876015147e-06, + "loss": 0.1377, + "step": 2039 + }, + { + "epoch": 0.390057361376673, + "grad_norm": 1.729572057723999, + "learning_rate": 4.8422203086687295e-06, + "loss": 0.1603, + "step": 2040 + }, + { + "epoch": 0.39024856596558316, + "grad_norm": 2.221207618713379, + "learning_rate": 4.842531704503544e-06, + "loss": 0.1859, + "step": 2041 + }, + { + "epoch": 0.3904397705544933, + "grad_norm": 1.757230281829834, + "learning_rate": 4.842842947805491e-06, + "loss": 0.1199, + "step": 2042 + }, + { + "epoch": 0.39063097514340345, + "grad_norm": 1.1581106185913086, + "learning_rate": 4.843154038723931e-06, + "loss": 0.0563, + "step": 2043 + }, + { + "epoch": 0.3908221797323136, + "grad_norm": 1.65236234664917, + "learning_rate": 4.843464977408003e-06, + "loss": 0.1155, + "step": 2044 + }, + { + "epoch": 0.39101338432122373, + "grad_norm": 3.325693130493164, + "learning_rate": 4.843775764006627e-06, + "loss": 0.4827, + "step": 2045 + }, + { + "epoch": 0.3912045889101338, + "grad_norm": 2.740530014038086, + "learning_rate": 4.844086398668508e-06, + "loss": 0.3053, + "step": 2046 + }, + { + "epoch": 0.39139579349904396, + "grad_norm": 4.023008346557617, + "learning_rate": 4.844396881542128e-06, + "loss": 0.3783, + "step": 2047 + }, + { + "epoch": 0.3915869980879541, + "grad_norm": 1.8242052793502808, + "learning_rate": 4.844707212775756e-06, + "loss": 0.1804, + "step": 2048 + }, + { + "epoch": 0.39177820267686425, + "grad_norm": 1.4388545751571655, + "learning_rate": 4.8450173925174395e-06, + "loss": 0.0858, + "step": 2049 + }, + { + "epoch": 0.3919694072657744, + "grad_norm": 2.0084853172302246, + "learning_rate": 4.845327420915012e-06, + "loss": 0.1291, + "step": 2050 + }, + { + "epoch": 0.39216061185468454, + "grad_norm": 2.2301721572875977, + "learning_rate": 4.845637298116093e-06, + "loss": 0.3989, + "step": 2051 + }, + { + "epoch": 0.3923518164435946, + "grad_norm": 1.3058888912200928, + "learning_rate": 4.84594702426808e-06, + "loss": 0.1435, + "step": 2052 + }, + { + "epoch": 0.39254302103250477, + "grad_norm": 2.7791600227355957, + "learning_rate": 4.84625659951816e-06, + "loss": 0.4347, + "step": 2053 + }, + { + "epoch": 0.3927342256214149, + "grad_norm": 2.30441951751709, + "learning_rate": 4.846566024013304e-06, + "loss": 0.1264, + "step": 2054 + }, + { + "epoch": 0.39292543021032506, + "grad_norm": 1.0920432806015015, + "learning_rate": 4.846875297900267e-06, + "loss": 0.0552, + "step": 2055 + }, + { + "epoch": 0.3931166347992352, + "grad_norm": 1.924726128578186, + "learning_rate": 4.847184421325591e-06, + "loss": 0.1238, + "step": 2056 + }, + { + "epoch": 0.39330783938814534, + "grad_norm": 2.9873266220092773, + "learning_rate": 4.847493394435604e-06, + "loss": 0.3692, + "step": 2057 + }, + { + "epoch": 0.39349904397705543, + "grad_norm": 3.197446584701538, + "learning_rate": 4.84780221737642e-06, + "loss": 0.3972, + "step": 2058 + }, + { + "epoch": 0.3936902485659656, + "grad_norm": 3.1228456497192383, + "learning_rate": 4.8481108902939405e-06, + "loss": 0.4105, + "step": 2059 + }, + { + "epoch": 0.3938814531548757, + "grad_norm": 5.650298595428467, + "learning_rate": 4.8484194133338555e-06, + "loss": 0.3185, + "step": 2060 + }, + { + "epoch": 0.39407265774378586, + "grad_norm": 1.6081198453903198, + "learning_rate": 4.8487277866416415e-06, + "loss": 0.1289, + "step": 2061 + }, + { + "epoch": 0.394263862332696, + "grad_norm": 2.226999521255493, + "learning_rate": 4.849036010362564e-06, + "loss": 0.0994, + "step": 2062 + }, + { + "epoch": 0.39445506692160615, + "grad_norm": 2.3722176551818848, + "learning_rate": 4.8493440846416755e-06, + "loss": 0.1002, + "step": 2063 + }, + { + "epoch": 0.39464627151051623, + "grad_norm": 3.996410369873047, + "learning_rate": 4.8496520096238205e-06, + "loss": 0.5538, + "step": 2064 + }, + { + "epoch": 0.3948374760994264, + "grad_norm": 2.3079299926757812, + "learning_rate": 4.8499597854536305e-06, + "loss": 0.2703, + "step": 2065 + }, + { + "epoch": 0.3950286806883365, + "grad_norm": 2.9113502502441406, + "learning_rate": 4.85026741227553e-06, + "loss": 0.3456, + "step": 2066 + }, + { + "epoch": 0.39521988527724666, + "grad_norm": 2.183955192565918, + "learning_rate": 4.850574890233729e-06, + "loss": 0.11, + "step": 2067 + }, + { + "epoch": 0.3954110898661568, + "grad_norm": 2.493313789367676, + "learning_rate": 4.850882219472235e-06, + "loss": 0.2962, + "step": 2068 + }, + { + "epoch": 0.3956022944550669, + "grad_norm": 1.643670678138733, + "learning_rate": 4.851189400134838e-06, + "loss": 0.1328, + "step": 2069 + }, + { + "epoch": 0.39579349904397704, + "grad_norm": 2.830077886581421, + "learning_rate": 4.851496432365128e-06, + "loss": 0.3916, + "step": 2070 + }, + { + "epoch": 0.3959847036328872, + "grad_norm": 1.6414769887924194, + "learning_rate": 4.851803316306482e-06, + "loss": 0.1513, + "step": 2071 + }, + { + "epoch": 0.3961759082217973, + "grad_norm": 1.641139268875122, + "learning_rate": 4.85211005210207e-06, + "loss": 0.1359, + "step": 2072 + }, + { + "epoch": 0.39636711281070747, + "grad_norm": 2.166752576828003, + "learning_rate": 4.852416639894855e-06, + "loss": 0.1225, + "step": 2073 + }, + { + "epoch": 0.3965583173996176, + "grad_norm": 2.580700397491455, + "learning_rate": 4.852723079827596e-06, + "loss": 0.2727, + "step": 2074 + }, + { + "epoch": 0.3967495219885277, + "grad_norm": 1.7222399711608887, + "learning_rate": 4.85302937204284e-06, + "loss": 0.1269, + "step": 2075 + }, + { + "epoch": 0.39694072657743784, + "grad_norm": 1.9561702013015747, + "learning_rate": 4.853335516682933e-06, + "loss": 0.2164, + "step": 2076 + }, + { + "epoch": 0.397131931166348, + "grad_norm": 2.9243381023406982, + "learning_rate": 4.853641513890013e-06, + "loss": 0.3344, + "step": 2077 + }, + { + "epoch": 0.39732313575525813, + "grad_norm": 2.7178955078125, + "learning_rate": 4.853947363806012e-06, + "loss": 0.2405, + "step": 2078 + }, + { + "epoch": 0.39751434034416827, + "grad_norm": 1.9798709154129028, + "learning_rate": 4.85425306657266e-06, + "loss": 0.2382, + "step": 2079 + }, + { + "epoch": 0.3977055449330784, + "grad_norm": 3.2675580978393555, + "learning_rate": 4.8545586223314805e-06, + "loss": 0.1743, + "step": 2080 + }, + { + "epoch": 0.3978967495219885, + "grad_norm": 2.0814285278320312, + "learning_rate": 4.8548640312237916e-06, + "loss": 0.0812, + "step": 2081 + }, + { + "epoch": 0.39808795411089865, + "grad_norm": 3.074506998062134, + "learning_rate": 4.855169293390711e-06, + "loss": 0.4034, + "step": 2082 + }, + { + "epoch": 0.3982791586998088, + "grad_norm": 2.637707233428955, + "learning_rate": 4.85547440897315e-06, + "loss": 0.4595, + "step": 2083 + }, + { + "epoch": 0.39847036328871893, + "grad_norm": 1.7692375183105469, + "learning_rate": 4.855779378111821e-06, + "loss": 0.1297, + "step": 2084 + }, + { + "epoch": 0.3986615678776291, + "grad_norm": 1.5920054912567139, + "learning_rate": 4.856084200947228e-06, + "loss": 0.1393, + "step": 2085 + }, + { + "epoch": 0.3988527724665392, + "grad_norm": 2.117548942565918, + "learning_rate": 4.856388877619678e-06, + "loss": 0.1037, + "step": 2086 + }, + { + "epoch": 0.3990439770554493, + "grad_norm": 1.231958270072937, + "learning_rate": 4.856693408269275e-06, + "loss": 0.0662, + "step": 2087 + }, + { + "epoch": 0.39923518164435945, + "grad_norm": 1.5708861351013184, + "learning_rate": 4.85699779303592e-06, + "loss": 0.1032, + "step": 2088 + }, + { + "epoch": 0.3994263862332696, + "grad_norm": 2.8182146549224854, + "learning_rate": 4.857302032059316e-06, + "loss": 0.5743, + "step": 2089 + }, + { + "epoch": 0.39961759082217974, + "grad_norm": 2.6597273349761963, + "learning_rate": 4.857606125478961e-06, + "loss": 0.3972, + "step": 2090 + }, + { + "epoch": 0.3998087954110899, + "grad_norm": 2.8121769428253174, + "learning_rate": 4.857910073434157e-06, + "loss": 0.3327, + "step": 2091 + }, + { + "epoch": 0.4, + "grad_norm": 2.188518762588501, + "learning_rate": 4.858213876064004e-06, + "loss": 0.2586, + "step": 2092 + }, + { + "epoch": 0.4001912045889101, + "grad_norm": 1.4912960529327393, + "learning_rate": 4.858517533507403e-06, + "loss": 0.0882, + "step": 2093 + }, + { + "epoch": 0.40038240917782025, + "grad_norm": 3.883857011795044, + "learning_rate": 4.858821045903058e-06, + "loss": 0.1254, + "step": 2094 + }, + { + "epoch": 0.4005736137667304, + "grad_norm": 3.3286659717559814, + "learning_rate": 4.85912441338947e-06, + "loss": 0.741, + "step": 2095 + }, + { + "epoch": 0.40076481835564054, + "grad_norm": 2.54927134513855, + "learning_rate": 4.859427636104946e-06, + "loss": 0.1917, + "step": 2096 + }, + { + "epoch": 0.4009560229445507, + "grad_norm": 3.058338165283203, + "learning_rate": 4.859730714187593e-06, + "loss": 0.2758, + "step": 2097 + }, + { + "epoch": 0.4011472275334608, + "grad_norm": 2.1191134452819824, + "learning_rate": 4.860033647775319e-06, + "loss": 0.0992, + "step": 2098 + }, + { + "epoch": 0.4013384321223709, + "grad_norm": 1.2811925411224365, + "learning_rate": 4.860336437005838e-06, + "loss": 0.066, + "step": 2099 + }, + { + "epoch": 0.40152963671128106, + "grad_norm": 1.372006893157959, + "learning_rate": 4.860639082016667e-06, + "loss": 0.0831, + "step": 2100 + }, + { + "epoch": 0.4017208413001912, + "grad_norm": 2.105086088180542, + "learning_rate": 4.8609415829451225e-06, + "loss": 0.1654, + "step": 2101 + }, + { + "epoch": 0.40191204588910134, + "grad_norm": 2.2075912952423096, + "learning_rate": 4.861243939928331e-06, + "loss": 0.2912, + "step": 2102 + }, + { + "epoch": 0.4021032504780115, + "grad_norm": 1.9700030088424683, + "learning_rate": 4.861546153103217e-06, + "loss": 0.2135, + "step": 2103 + }, + { + "epoch": 0.40229445506692163, + "grad_norm": 1.7116718292236328, + "learning_rate": 4.861848222606516e-06, + "loss": 0.1778, + "step": 2104 + }, + { + "epoch": 0.4024856596558317, + "grad_norm": 3.5548574924468994, + "learning_rate": 4.8621501485747625e-06, + "loss": 0.078, + "step": 2105 + }, + { + "epoch": 0.40267686424474186, + "grad_norm": 1.4873651266098022, + "learning_rate": 4.862451931144302e-06, + "loss": 0.0588, + "step": 2106 + }, + { + "epoch": 0.402868068833652, + "grad_norm": 2.477473497390747, + "learning_rate": 4.862753570451282e-06, + "loss": 0.2443, + "step": 2107 + }, + { + "epoch": 0.40305927342256215, + "grad_norm": 2.1252503395080566, + "learning_rate": 4.863055066631658e-06, + "loss": 0.4126, + "step": 2108 + }, + { + "epoch": 0.4032504780114723, + "grad_norm": 1.0193214416503906, + "learning_rate": 4.863356419821191e-06, + "loss": 0.0968, + "step": 2109 + }, + { + "epoch": 0.40344168260038243, + "grad_norm": 3.261441946029663, + "learning_rate": 4.863657630155451e-06, + "loss": 0.5482, + "step": 2110 + }, + { + "epoch": 0.4036328871892925, + "grad_norm": 1.2448210716247559, + "learning_rate": 4.863958697769811e-06, + "loss": 0.1252, + "step": 2111 + }, + { + "epoch": 0.40382409177820267, + "grad_norm": 2.254920482635498, + "learning_rate": 4.864259622799458e-06, + "loss": 0.0864, + "step": 2112 + }, + { + "epoch": 0.4040152963671128, + "grad_norm": 2.846625804901123, + "learning_rate": 4.8645604053793795e-06, + "loss": 0.2891, + "step": 2113 + }, + { + "epoch": 0.40420650095602295, + "grad_norm": 2.426459789276123, + "learning_rate": 4.864861045644379e-06, + "loss": 0.2404, + "step": 2114 + }, + { + "epoch": 0.4043977055449331, + "grad_norm": 2.1458327770233154, + "learning_rate": 4.865161543729063e-06, + "loss": 0.1713, + "step": 2115 + }, + { + "epoch": 0.40458891013384324, + "grad_norm": 3.3918213844299316, + "learning_rate": 4.865461899767849e-06, + "loss": 0.2598, + "step": 2116 + }, + { + "epoch": 0.4047801147227533, + "grad_norm": 2.337883710861206, + "learning_rate": 4.865762113894966e-06, + "loss": 0.2028, + "step": 2117 + }, + { + "epoch": 0.40497131931166347, + "grad_norm": 1.5774587392807007, + "learning_rate": 4.866062186244448e-06, + "loss": 0.1215, + "step": 2118 + }, + { + "epoch": 0.4051625239005736, + "grad_norm": 1.7613880634307861, + "learning_rate": 4.8663621169501456e-06, + "loss": 0.1357, + "step": 2119 + }, + { + "epoch": 0.40535372848948376, + "grad_norm": 3.2717082500457764, + "learning_rate": 4.866661906145713e-06, + "loss": 0.433, + "step": 2120 + }, + { + "epoch": 0.4055449330783939, + "grad_norm": 2.3337671756744385, + "learning_rate": 4.866961553964623e-06, + "loss": 0.2526, + "step": 2121 + }, + { + "epoch": 0.40573613766730404, + "grad_norm": 1.9872267246246338, + "learning_rate": 4.867261060540153e-06, + "loss": 0.1941, + "step": 2122 + }, + { + "epoch": 0.40592734225621413, + "grad_norm": 1.2843959331512451, + "learning_rate": 4.867560426005394e-06, + "loss": 0.0682, + "step": 2123 + }, + { + "epoch": 0.4061185468451243, + "grad_norm": 1.543263554573059, + "learning_rate": 4.8678596504932505e-06, + "loss": 0.1491, + "step": 2124 + }, + { + "epoch": 0.4063097514340344, + "grad_norm": 2.1265337467193604, + "learning_rate": 4.868158734136438e-06, + "loss": 0.0946, + "step": 2125 + }, + { + "epoch": 0.40650095602294456, + "grad_norm": 1.6050384044647217, + "learning_rate": 4.868457677067485e-06, + "loss": 0.2419, + "step": 2126 + }, + { + "epoch": 0.4066921606118547, + "grad_norm": 2.0895438194274902, + "learning_rate": 4.868756479418735e-06, + "loss": 0.1516, + "step": 2127 + }, + { + "epoch": 0.40688336520076485, + "grad_norm": 1.8586974143981934, + "learning_rate": 4.8690551413223396e-06, + "loss": 0.2194, + "step": 2128 + }, + { + "epoch": 0.40707456978967493, + "grad_norm": 2.5971615314483643, + "learning_rate": 4.869353662910269e-06, + "loss": 0.3191, + "step": 2129 + }, + { + "epoch": 0.4072657743785851, + "grad_norm": 1.6561384201049805, + "learning_rate": 4.869652044314306e-06, + "loss": 0.1768, + "step": 2130 + }, + { + "epoch": 0.4074569789674952, + "grad_norm": 2.3525571823120117, + "learning_rate": 4.869950285666048e-06, + "loss": 0.154, + "step": 2131 + }, + { + "epoch": 0.40764818355640536, + "grad_norm": 2.6129164695739746, + "learning_rate": 4.870248387096907e-06, + "loss": 0.1612, + "step": 2132 + }, + { + "epoch": 0.4078393881453155, + "grad_norm": 1.5268852710723877, + "learning_rate": 4.87054634873811e-06, + "loss": 0.1151, + "step": 2133 + }, + { + "epoch": 0.4080305927342256, + "grad_norm": 2.9393773078918457, + "learning_rate": 4.8708441707207e-06, + "loss": 0.1944, + "step": 2134 + }, + { + "epoch": 0.40822179732313574, + "grad_norm": 0.9411885738372803, + "learning_rate": 4.871141853175533e-06, + "loss": 0.0954, + "step": 2135 + }, + { + "epoch": 0.4084130019120459, + "grad_norm": 1.751671552658081, + "learning_rate": 4.871439396233286e-06, + "loss": 0.2188, + "step": 2136 + }, + { + "epoch": 0.408604206500956, + "grad_norm": 2.3655753135681152, + "learning_rate": 4.8717368000244496e-06, + "loss": 0.3265, + "step": 2137 + }, + { + "epoch": 0.40879541108986617, + "grad_norm": 1.446745753288269, + "learning_rate": 4.8720340646793314e-06, + "loss": 0.1341, + "step": 2138 + }, + { + "epoch": 0.4089866156787763, + "grad_norm": 1.964414358139038, + "learning_rate": 4.872331190328057e-06, + "loss": 0.3604, + "step": 2139 + }, + { + "epoch": 0.4091778202676864, + "grad_norm": 1.6994702816009521, + "learning_rate": 4.872628177100569e-06, + "loss": 0.1487, + "step": 2140 + }, + { + "epoch": 0.40936902485659654, + "grad_norm": 2.1562836170196533, + "learning_rate": 4.872925025126627e-06, + "loss": 0.1985, + "step": 2141 + }, + { + "epoch": 0.4095602294455067, + "grad_norm": 2.705679416656494, + "learning_rate": 4.873221734535811e-06, + "loss": 0.3949, + "step": 2142 + }, + { + "epoch": 0.40975143403441683, + "grad_norm": 2.84946870803833, + "learning_rate": 4.873518305457518e-06, + "loss": 0.1461, + "step": 2143 + }, + { + "epoch": 0.40994263862332697, + "grad_norm": 5.145754814147949, + "learning_rate": 4.873814738020963e-06, + "loss": 0.1118, + "step": 2144 + }, + { + "epoch": 0.4101338432122371, + "grad_norm": 2.6604089736938477, + "learning_rate": 4.874111032355183e-06, + "loss": 0.3319, + "step": 2145 + }, + { + "epoch": 0.4103250478011472, + "grad_norm": 2.2231945991516113, + "learning_rate": 4.874407188589032e-06, + "loss": 0.2436, + "step": 2146 + }, + { + "epoch": 0.41051625239005735, + "grad_norm": 1.9829926490783691, + "learning_rate": 4.874703206851186e-06, + "loss": 0.1137, + "step": 2147 + }, + { + "epoch": 0.4107074569789675, + "grad_norm": 1.793606162071228, + "learning_rate": 4.87499908727014e-06, + "loss": 0.1267, + "step": 2148 + }, + { + "epoch": 0.41089866156787763, + "grad_norm": 2.4400620460510254, + "learning_rate": 4.8752948299742085e-06, + "loss": 0.087, + "step": 2149 + }, + { + "epoch": 0.4110898661567878, + "grad_norm": 1.6873563528060913, + "learning_rate": 4.875590435091529e-06, + "loss": 0.0621, + "step": 2150 + }, + { + "epoch": 0.4112810707456979, + "grad_norm": 1.6609550714492798, + "learning_rate": 4.87588590275006e-06, + "loss": 0.1239, + "step": 2151 + }, + { + "epoch": 0.411472275334608, + "grad_norm": 1.9967857599258423, + "learning_rate": 4.876181233077581e-06, + "loss": 0.3056, + "step": 2152 + }, + { + "epoch": 0.41166347992351815, + "grad_norm": 2.5311481952667236, + "learning_rate": 4.876476426201691e-06, + "loss": 0.3258, + "step": 2153 + }, + { + "epoch": 0.4118546845124283, + "grad_norm": 2.0180413722991943, + "learning_rate": 4.876771482249817e-06, + "loss": 0.2722, + "step": 2154 + }, + { + "epoch": 0.41204588910133844, + "grad_norm": 1.2949248552322388, + "learning_rate": 4.877066401349204e-06, + "loss": 0.091, + "step": 2155 + }, + { + "epoch": 0.4122370936902486, + "grad_norm": 1.2472654581069946, + "learning_rate": 4.87736118362692e-06, + "loss": 0.0818, + "step": 2156 + }, + { + "epoch": 0.4124282982791587, + "grad_norm": 1.8975611925125122, + "learning_rate": 4.877655829209858e-06, + "loss": 0.2136, + "step": 2157 + }, + { + "epoch": 0.4126195028680688, + "grad_norm": 2.191755771636963, + "learning_rate": 4.877950338224735e-06, + "loss": 0.3722, + "step": 2158 + }, + { + "epoch": 0.41281070745697895, + "grad_norm": 2.048837661743164, + "learning_rate": 4.87824471079809e-06, + "loss": 0.2638, + "step": 2159 + }, + { + "epoch": 0.4130019120458891, + "grad_norm": 2.1126198768615723, + "learning_rate": 4.878538947056285e-06, + "loss": 0.2743, + "step": 2160 + }, + { + "epoch": 0.41319311663479924, + "grad_norm": 2.0426793098449707, + "learning_rate": 4.878833047125512e-06, + "loss": 0.3363, + "step": 2161 + }, + { + "epoch": 0.4133843212237094, + "grad_norm": 1.8592612743377686, + "learning_rate": 4.879127011131783e-06, + "loss": 0.1042, + "step": 2162 + }, + { + "epoch": 0.4135755258126195, + "grad_norm": 7.511478424072266, + "learning_rate": 4.879420839200937e-06, + "loss": 0.2228, + "step": 2163 + }, + { + "epoch": 0.4137667304015296, + "grad_norm": 1.5293527841567993, + "learning_rate": 4.8797145314586365e-06, + "loss": 0.1024, + "step": 2164 + }, + { + "epoch": 0.41395793499043976, + "grad_norm": 4.5838141441345215, + "learning_rate": 4.880008088030373e-06, + "loss": 0.5163, + "step": 2165 + }, + { + "epoch": 0.4141491395793499, + "grad_norm": 2.471611738204956, + "learning_rate": 4.880301509041461e-06, + "loss": 0.2742, + "step": 2166 + }, + { + "epoch": 0.41434034416826004, + "grad_norm": 2.4510347843170166, + "learning_rate": 4.880594794617045e-06, + "loss": 0.1341, + "step": 2167 + }, + { + "epoch": 0.4145315487571702, + "grad_norm": 2.3072543144226074, + "learning_rate": 4.880887944882094e-06, + "loss": 0.1352, + "step": 2168 + }, + { + "epoch": 0.41472275334608033, + "grad_norm": 0.9170379638671875, + "learning_rate": 4.881180959961403e-06, + "loss": 0.0542, + "step": 2169 + }, + { + "epoch": 0.4149139579349904, + "grad_norm": 2.4316747188568115, + "learning_rate": 4.8814738399795956e-06, + "loss": 0.4565, + "step": 2170 + }, + { + "epoch": 0.41510516252390056, + "grad_norm": 2.150325298309326, + "learning_rate": 4.881766585061125e-06, + "loss": 0.2609, + "step": 2171 + }, + { + "epoch": 0.4152963671128107, + "grad_norm": 2.830181360244751, + "learning_rate": 4.8820591953302706e-06, + "loss": 0.3472, + "step": 2172 + }, + { + "epoch": 0.41548757170172085, + "grad_norm": 1.9228092432022095, + "learning_rate": 4.882351670911141e-06, + "loss": 0.1312, + "step": 2173 + }, + { + "epoch": 0.415678776290631, + "grad_norm": 2.210085868835449, + "learning_rate": 4.8826440119276715e-06, + "loss": 0.1784, + "step": 2174 + }, + { + "epoch": 0.41586998087954113, + "grad_norm": 5.4337849617004395, + "learning_rate": 4.882936218503629e-06, + "loss": 0.1746, + "step": 2175 + }, + { + "epoch": 0.4160611854684512, + "grad_norm": 1.8646759986877441, + "learning_rate": 4.883228290762608e-06, + "loss": 0.2276, + "step": 2176 + }, + { + "epoch": 0.41625239005736137, + "grad_norm": 2.343479871749878, + "learning_rate": 4.883520228828034e-06, + "loss": 0.2489, + "step": 2177 + }, + { + "epoch": 0.4164435946462715, + "grad_norm": 2.6077628135681152, + "learning_rate": 4.88381203282316e-06, + "loss": 0.4214, + "step": 2178 + }, + { + "epoch": 0.41663479923518165, + "grad_norm": 2.1604247093200684, + "learning_rate": 4.884103702871074e-06, + "loss": 0.1684, + "step": 2179 + }, + { + "epoch": 0.4168260038240918, + "grad_norm": 2.03054141998291, + "learning_rate": 4.884395239094688e-06, + "loss": 0.2568, + "step": 2180 + }, + { + "epoch": 0.41701720841300194, + "grad_norm": 1.8029322624206543, + "learning_rate": 4.88468664161675e-06, + "loss": 0.3231, + "step": 2181 + }, + { + "epoch": 0.417208413001912, + "grad_norm": 2.567142963409424, + "learning_rate": 4.884977910559839e-06, + "loss": 0.4366, + "step": 2182 + }, + { + "epoch": 0.41739961759082217, + "grad_norm": 1.7099403142929077, + "learning_rate": 4.885269046046362e-06, + "loss": 0.1528, + "step": 2183 + }, + { + "epoch": 0.4175908221797323, + "grad_norm": 2.3315675258636475, + "learning_rate": 4.885560048198562e-06, + "loss": 0.5276, + "step": 2184 + }, + { + "epoch": 0.41778202676864246, + "grad_norm": 1.4180094003677368, + "learning_rate": 4.885850917138509e-06, + "loss": 0.0885, + "step": 2185 + }, + { + "epoch": 0.4179732313575526, + "grad_norm": 1.8287487030029297, + "learning_rate": 4.886141652988113e-06, + "loss": 0.1236, + "step": 2186 + }, + { + "epoch": 0.41816443594646274, + "grad_norm": 2.6701676845550537, + "learning_rate": 4.886432255869108e-06, + "loss": 0.169, + "step": 2187 + }, + { + "epoch": 0.41835564053537283, + "grad_norm": 2.2629594802856445, + "learning_rate": 4.886722725903068e-06, + "loss": 0.2824, + "step": 2188 + }, + { + "epoch": 0.418546845124283, + "grad_norm": 2.599104881286621, + "learning_rate": 4.887013063211395e-06, + "loss": 0.2329, + "step": 2189 + }, + { + "epoch": 0.4187380497131931, + "grad_norm": 2.1738619804382324, + "learning_rate": 4.887303267915331e-06, + "loss": 0.307, + "step": 2190 + }, + { + "epoch": 0.41892925430210326, + "grad_norm": 2.778266668319702, + "learning_rate": 4.887593340135946e-06, + "loss": 0.1392, + "step": 2191 + }, + { + "epoch": 0.4191204588910134, + "grad_norm": 2.5189621448516846, + "learning_rate": 4.887883279994146e-06, + "loss": 0.3267, + "step": 2192 + }, + { + "epoch": 0.4193116634799235, + "grad_norm": 2.6800177097320557, + "learning_rate": 4.888173087610673e-06, + "loss": 0.4112, + "step": 2193 + }, + { + "epoch": 0.41950286806883363, + "grad_norm": 4.370938777923584, + "learning_rate": 4.888462763106103e-06, + "loss": 0.2855, + "step": 2194 + }, + { + "epoch": 0.4196940726577438, + "grad_norm": 2.941232442855835, + "learning_rate": 4.888752306600847e-06, + "loss": 0.4048, + "step": 2195 + }, + { + "epoch": 0.4198852772466539, + "grad_norm": 2.167144775390625, + "learning_rate": 4.889041718215152e-06, + "loss": 0.3469, + "step": 2196 + }, + { + "epoch": 0.42007648183556406, + "grad_norm": 3.1527018547058105, + "learning_rate": 4.889330998069099e-06, + "loss": 0.4365, + "step": 2197 + }, + { + "epoch": 0.4202676864244742, + "grad_norm": 2.6182942390441895, + "learning_rate": 4.889620146282609e-06, + "loss": 0.2454, + "step": 2198 + }, + { + "epoch": 0.4204588910133843, + "grad_norm": 1.247412919998169, + "learning_rate": 4.889909162975434e-06, + "loss": 0.0454, + "step": 2199 + }, + { + "epoch": 0.42065009560229444, + "grad_norm": 2.049236536026001, + "learning_rate": 4.890198048267166e-06, + "loss": 0.0835, + "step": 2200 + }, + { + "epoch": 0.4208413001912046, + "grad_norm": 2.416959285736084, + "learning_rate": 4.890486802277235e-06, + "loss": 0.2328, + "step": 2201 + }, + { + "epoch": 0.4210325047801147, + "grad_norm": 2.8074474334716797, + "learning_rate": 4.890775425124906e-06, + "loss": 0.3223, + "step": 2202 + }, + { + "epoch": 0.42122370936902487, + "grad_norm": 2.717810869216919, + "learning_rate": 4.89106391692928e-06, + "loss": 0.3238, + "step": 2203 + }, + { + "epoch": 0.421414913957935, + "grad_norm": 1.5587702989578247, + "learning_rate": 4.891352277809302e-06, + "loss": 0.1786, + "step": 2204 + }, + { + "epoch": 0.4216061185468451, + "grad_norm": 1.952847957611084, + "learning_rate": 4.891640507883748e-06, + "loss": 0.174, + "step": 2205 + }, + { + "epoch": 0.42179732313575524, + "grad_norm": 3.144420623779297, + "learning_rate": 4.891928607271237e-06, + "loss": 0.1949, + "step": 2206 + }, + { + "epoch": 0.4219885277246654, + "grad_norm": 2.8562870025634766, + "learning_rate": 4.892216576090225e-06, + "loss": 0.3854, + "step": 2207 + }, + { + "epoch": 0.42217973231357553, + "grad_norm": 2.9668025970458984, + "learning_rate": 4.892504414459007e-06, + "loss": 0.4332, + "step": 2208 + }, + { + "epoch": 0.42237093690248567, + "grad_norm": 2.1173808574676514, + "learning_rate": 4.892792122495718e-06, + "loss": 0.1733, + "step": 2209 + }, + { + "epoch": 0.4225621414913958, + "grad_norm": 2.9007081985473633, + "learning_rate": 4.893079700318333e-06, + "loss": 0.2412, + "step": 2210 + }, + { + "epoch": 0.4227533460803059, + "grad_norm": 2.2025012969970703, + "learning_rate": 4.893367148044665e-06, + "loss": 0.2325, + "step": 2211 + }, + { + "epoch": 0.42294455066921605, + "grad_norm": 2.527050733566284, + "learning_rate": 4.893654465792369e-06, + "loss": 0.128, + "step": 2212 + }, + { + "epoch": 0.4231357552581262, + "grad_norm": 1.7862459421157837, + "learning_rate": 4.89394165367894e-06, + "loss": 0.2455, + "step": 2213 + }, + { + "epoch": 0.42332695984703633, + "grad_norm": 1.6860567331314087, + "learning_rate": 4.894228711821714e-06, + "loss": 0.1739, + "step": 2214 + }, + { + "epoch": 0.4235181644359465, + "grad_norm": 1.391953706741333, + "learning_rate": 4.894515640337865e-06, + "loss": 0.1023, + "step": 2215 + }, + { + "epoch": 0.4237093690248566, + "grad_norm": 2.542024850845337, + "learning_rate": 4.894802439344414e-06, + "loss": 0.352, + "step": 2216 + }, + { + "epoch": 0.4239005736137667, + "grad_norm": 1.902836799621582, + "learning_rate": 4.8950891089582185e-06, + "loss": 0.1247, + "step": 2217 + }, + { + "epoch": 0.42409177820267685, + "grad_norm": 1.8815006017684937, + "learning_rate": 4.895375649295982e-06, + "loss": 0.1437, + "step": 2218 + }, + { + "epoch": 0.424282982791587, + "grad_norm": 2.594043731689453, + "learning_rate": 4.895662060474246e-06, + "loss": 0.1224, + "step": 2219 + }, + { + "epoch": 0.42447418738049714, + "grad_norm": 2.592609405517578, + "learning_rate": 4.895948342609398e-06, + "loss": 0.5407, + "step": 2220 + }, + { + "epoch": 0.4246653919694073, + "grad_norm": 1.1108200550079346, + "learning_rate": 4.8962344958176664e-06, + "loss": 0.1188, + "step": 2221 + }, + { + "epoch": 0.4248565965583174, + "grad_norm": 1.9550951719284058, + "learning_rate": 4.896520520215123e-06, + "loss": 0.1843, + "step": 2222 + }, + { + "epoch": 0.4250478011472275, + "grad_norm": 2.217928886413574, + "learning_rate": 4.8968064159176835e-06, + "loss": 0.1519, + "step": 2223 + }, + { + "epoch": 0.42523900573613765, + "grad_norm": 2.6040468215942383, + "learning_rate": 4.897092183041107e-06, + "loss": 0.2007, + "step": 2224 + }, + { + "epoch": 0.4254302103250478, + "grad_norm": 1.7855995893478394, + "learning_rate": 4.897377821700995e-06, + "loss": 0.1069, + "step": 2225 + }, + { + "epoch": 0.42562141491395794, + "grad_norm": 2.076179027557373, + "learning_rate": 4.897663332012795e-06, + "loss": 0.4742, + "step": 2226 + }, + { + "epoch": 0.4258126195028681, + "grad_norm": 2.4149672985076904, + "learning_rate": 4.897948714091799e-06, + "loss": 0.2418, + "step": 2227 + }, + { + "epoch": 0.4260038240917782, + "grad_norm": 2.047053575515747, + "learning_rate": 4.898233968053142e-06, + "loss": 0.2112, + "step": 2228 + }, + { + "epoch": 0.4261950286806883, + "grad_norm": 1.8152729272842407, + "learning_rate": 4.8985190940118074e-06, + "loss": 0.0942, + "step": 2229 + }, + { + "epoch": 0.42638623326959846, + "grad_norm": 1.7877713441848755, + "learning_rate": 4.898804092082618e-06, + "loss": 0.1056, + "step": 2230 + }, + { + "epoch": 0.4265774378585086, + "grad_norm": 1.7800778150558472, + "learning_rate": 4.899088962380248e-06, + "loss": 0.0929, + "step": 2231 + }, + { + "epoch": 0.42676864244741874, + "grad_norm": 3.8843071460723877, + "learning_rate": 4.899373705019215e-06, + "loss": 0.628, + "step": 2232 + }, + { + "epoch": 0.4269598470363289, + "grad_norm": 3.350219964981079, + "learning_rate": 4.899658320113882e-06, + "loss": 0.591, + "step": 2233 + }, + { + "epoch": 0.42715105162523903, + "grad_norm": 1.9053051471710205, + "learning_rate": 4.899942807778461e-06, + "loss": 0.1806, + "step": 2234 + }, + { + "epoch": 0.4273422562141491, + "grad_norm": 1.257057547569275, + "learning_rate": 4.900227168127006e-06, + "loss": 0.0753, + "step": 2235 + }, + { + "epoch": 0.42753346080305926, + "grad_norm": 1.812961220741272, + "learning_rate": 4.900511401273424e-06, + "loss": 0.1633, + "step": 2236 + }, + { + "epoch": 0.4277246653919694, + "grad_norm": 2.06535267829895, + "learning_rate": 4.900795507331465e-06, + "loss": 0.1393, + "step": 2237 + }, + { + "epoch": 0.42791586998087955, + "grad_norm": 1.0033485889434814, + "learning_rate": 4.901079486414725e-06, + "loss": 0.0774, + "step": 2238 + }, + { + "epoch": 0.4281070745697897, + "grad_norm": 2.0232021808624268, + "learning_rate": 4.9013633386366545e-06, + "loss": 0.2666, + "step": 2239 + }, + { + "epoch": 0.42829827915869984, + "grad_norm": 2.4945590496063232, + "learning_rate": 4.901647064110545e-06, + "loss": 0.4225, + "step": 2240 + }, + { + "epoch": 0.4284894837476099, + "grad_norm": 2.9069554805755615, + "learning_rate": 4.901930662949541e-06, + "loss": 0.4286, + "step": 2241 + }, + { + "epoch": 0.42868068833652007, + "grad_norm": 1.4231261014938354, + "learning_rate": 4.902214135266632e-06, + "loss": 0.072, + "step": 2242 + }, + { + "epoch": 0.4288718929254302, + "grad_norm": 1.2749229669570923, + "learning_rate": 4.902497481174659e-06, + "loss": 0.1584, + "step": 2243 + }, + { + "epoch": 0.42906309751434035, + "grad_norm": 3.1102404594421387, + "learning_rate": 4.902780700786311e-06, + "loss": 0.163, + "step": 2244 + }, + { + "epoch": 0.4292543021032505, + "grad_norm": 2.0278851985931396, + "learning_rate": 4.903063794214126e-06, + "loss": 0.4301, + "step": 2245 + }, + { + "epoch": 0.42944550669216064, + "grad_norm": 2.2186572551727295, + "learning_rate": 4.903346761570493e-06, + "loss": 0.0785, + "step": 2246 + }, + { + "epoch": 0.4296367112810707, + "grad_norm": 2.2448575496673584, + "learning_rate": 4.90362960296765e-06, + "loss": 0.2338, + "step": 2247 + }, + { + "epoch": 0.42982791586998087, + "grad_norm": 2.3453407287597656, + "learning_rate": 4.903912318517684e-06, + "loss": 0.269, + "step": 2248 + }, + { + "epoch": 0.430019120458891, + "grad_norm": 1.2500512599945068, + "learning_rate": 4.904194908332537e-06, + "loss": 0.0725, + "step": 2249 + }, + { + "epoch": 0.43021032504780116, + "grad_norm": 2.4721336364746094, + "learning_rate": 4.904477372523995e-06, + "loss": 0.07, + "step": 2250 + }, + { + "epoch": 0.4304015296367113, + "grad_norm": 2.766767740249634, + "learning_rate": 4.9047597112037e-06, + "loss": 0.5043, + "step": 2251 + }, + { + "epoch": 0.4305927342256214, + "grad_norm": 1.717307686805725, + "learning_rate": 4.905041924483143e-06, + "loss": 0.1655, + "step": 2252 + }, + { + "epoch": 0.43078393881453153, + "grad_norm": 3.6042754650115967, + "learning_rate": 4.905324012473667e-06, + "loss": 0.4827, + "step": 2253 + }, + { + "epoch": 0.4309751434034417, + "grad_norm": 1.890688419342041, + "learning_rate": 4.905605975286469e-06, + "loss": 0.1853, + "step": 2254 + }, + { + "epoch": 0.4311663479923518, + "grad_norm": 1.5629879236221313, + "learning_rate": 4.9058878130325935e-06, + "loss": 0.1271, + "step": 2255 + }, + { + "epoch": 0.43135755258126196, + "grad_norm": 2.144808053970337, + "learning_rate": 4.906169525822942e-06, + "loss": 0.1081, + "step": 2256 + }, + { + "epoch": 0.4315487571701721, + "grad_norm": 1.9382816553115845, + "learning_rate": 4.9064511137682635e-06, + "loss": 0.2521, + "step": 2257 + }, + { + "epoch": 0.4317399617590822, + "grad_norm": 1.2302849292755127, + "learning_rate": 4.906732576979165e-06, + "loss": 0.1157, + "step": 2258 + }, + { + "epoch": 0.43193116634799233, + "grad_norm": 1.9480595588684082, + "learning_rate": 4.907013915566105e-06, + "loss": 0.22, + "step": 2259 + }, + { + "epoch": 0.4321223709369025, + "grad_norm": 1.205723762512207, + "learning_rate": 4.907295129639391e-06, + "loss": 0.0779, + "step": 2260 + }, + { + "epoch": 0.4323135755258126, + "grad_norm": 2.0031495094299316, + "learning_rate": 4.907576219309192e-06, + "loss": 0.2669, + "step": 2261 + }, + { + "epoch": 0.43250478011472276, + "grad_norm": 1.9002723693847656, + "learning_rate": 4.907857184685524e-06, + "loss": 0.0908, + "step": 2262 + }, + { + "epoch": 0.4326959847036329, + "grad_norm": 2.721759796142578, + "learning_rate": 4.9081380258782595e-06, + "loss": 0.2764, + "step": 2263 + }, + { + "epoch": 0.432887189292543, + "grad_norm": 3.628314733505249, + "learning_rate": 4.908418742997127e-06, + "loss": 0.768, + "step": 2264 + }, + { + "epoch": 0.43307839388145314, + "grad_norm": 1.3902168273925781, + "learning_rate": 4.908699336151707e-06, + "loss": 0.1548, + "step": 2265 + }, + { + "epoch": 0.4332695984703633, + "grad_norm": 2.48300838470459, + "learning_rate": 4.908979805451437e-06, + "loss": 0.4893, + "step": 2266 + }, + { + "epoch": 0.4334608030592734, + "grad_norm": 2.2376482486724854, + "learning_rate": 4.909260151005608e-06, + "loss": 0.3067, + "step": 2267 + }, + { + "epoch": 0.43365200764818357, + "grad_norm": 1.2431788444519043, + "learning_rate": 4.909540372923367e-06, + "loss": 0.0815, + "step": 2268 + }, + { + "epoch": 0.4338432122370937, + "grad_norm": 2.4078571796417236, + "learning_rate": 4.9098204713137175e-06, + "loss": 0.2497, + "step": 2269 + }, + { + "epoch": 0.4340344168260038, + "grad_norm": 2.028752088546753, + "learning_rate": 4.910100446285518e-06, + "loss": 0.3107, + "step": 2270 + }, + { + "epoch": 0.43422562141491394, + "grad_norm": 2.2157468795776367, + "learning_rate": 4.910380297947484e-06, + "loss": 0.2083, + "step": 2271 + }, + { + "epoch": 0.4344168260038241, + "grad_norm": 2.5187478065490723, + "learning_rate": 4.910660026408185e-06, + "loss": 0.2862, + "step": 2272 + }, + { + "epoch": 0.43460803059273423, + "grad_norm": 2.122492790222168, + "learning_rate": 4.910939631776051e-06, + "loss": 0.1934, + "step": 2273 + }, + { + "epoch": 0.4347992351816444, + "grad_norm": 1.5224970579147339, + "learning_rate": 4.911219114159365e-06, + "loss": 0.0709, + "step": 2274 + }, + { + "epoch": 0.4349904397705545, + "grad_norm": 1.999548077583313, + "learning_rate": 4.91149847366627e-06, + "loss": 0.1255, + "step": 2275 + }, + { + "epoch": 0.4351816443594646, + "grad_norm": 1.97611665725708, + "learning_rate": 4.911777710404766e-06, + "loss": 0.1653, + "step": 2276 + }, + { + "epoch": 0.43537284894837475, + "grad_norm": 1.5954374074935913, + "learning_rate": 4.91205682448271e-06, + "loss": 0.365, + "step": 2277 + }, + { + "epoch": 0.4355640535372849, + "grad_norm": 1.9923690557479858, + "learning_rate": 4.9123358160078154e-06, + "loss": 0.4458, + "step": 2278 + }, + { + "epoch": 0.43575525812619503, + "grad_norm": 2.6444168090820312, + "learning_rate": 4.912614685087658e-06, + "loss": 0.3102, + "step": 2279 + }, + { + "epoch": 0.4359464627151052, + "grad_norm": 3.0847392082214355, + "learning_rate": 4.9128934318296675e-06, + "loss": 0.0966, + "step": 2280 + }, + { + "epoch": 0.4361376673040153, + "grad_norm": 1.3775651454925537, + "learning_rate": 4.913172056341135e-06, + "loss": 0.0513, + "step": 2281 + }, + { + "epoch": 0.4363288718929254, + "grad_norm": 2.156324625015259, + "learning_rate": 4.91345055872921e-06, + "loss": 0.3312, + "step": 2282 + }, + { + "epoch": 0.43652007648183555, + "grad_norm": 2.418484926223755, + "learning_rate": 4.913728939100901e-06, + "loss": 0.1921, + "step": 2283 + }, + { + "epoch": 0.4367112810707457, + "grad_norm": 0.8661771416664124, + "learning_rate": 4.914007197563076e-06, + "loss": 0.0958, + "step": 2284 + }, + { + "epoch": 0.43690248565965584, + "grad_norm": 2.313511371612549, + "learning_rate": 4.914285334222461e-06, + "loss": 0.3075, + "step": 2285 + }, + { + "epoch": 0.437093690248566, + "grad_norm": 0.9290204048156738, + "learning_rate": 4.914563349185646e-06, + "loss": 0.091, + "step": 2286 + }, + { + "epoch": 0.4372848948374761, + "grad_norm": 1.7797731161117554, + "learning_rate": 4.914841242559077e-06, + "loss": 0.117, + "step": 2287 + }, + { + "epoch": 0.4374760994263862, + "grad_norm": 3.0950605869293213, + "learning_rate": 4.915119014449062e-06, + "loss": 0.4731, + "step": 2288 + }, + { + "epoch": 0.43766730401529635, + "grad_norm": 2.342931032180786, + "learning_rate": 4.915396664961769e-06, + "loss": 0.1476, + "step": 2289 + }, + { + "epoch": 0.4378585086042065, + "grad_norm": 2.6388490200042725, + "learning_rate": 4.915674194203229e-06, + "loss": 0.5004, + "step": 2290 + }, + { + "epoch": 0.43804971319311664, + "grad_norm": 1.8596402406692505, + "learning_rate": 4.9159516022793316e-06, + "loss": 0.1407, + "step": 2291 + }, + { + "epoch": 0.4382409177820268, + "grad_norm": 2.9163615703582764, + "learning_rate": 4.916228889295829e-06, + "loss": 0.272, + "step": 2292 + }, + { + "epoch": 0.4384321223709369, + "grad_norm": 4.297892093658447, + "learning_rate": 4.916506055358336e-06, + "loss": 0.1782, + "step": 2293 + }, + { + "epoch": 0.438623326959847, + "grad_norm": 1.6136482954025269, + "learning_rate": 4.916783100572327e-06, + "loss": 0.1081, + "step": 2294 + }, + { + "epoch": 0.43881453154875716, + "grad_norm": 2.648573398590088, + "learning_rate": 4.917060025043139e-06, + "loss": 0.369, + "step": 2295 + }, + { + "epoch": 0.4390057361376673, + "grad_norm": 2.309370756149292, + "learning_rate": 4.917336828875972e-06, + "loss": 0.3044, + "step": 2296 + }, + { + "epoch": 0.43919694072657744, + "grad_norm": 2.0419301986694336, + "learning_rate": 4.91761351217589e-06, + "loss": 0.1293, + "step": 2297 + }, + { + "epoch": 0.4393881453154876, + "grad_norm": 1.3703843355178833, + "learning_rate": 4.917890075047817e-06, + "loss": 0.0763, + "step": 2298 + }, + { + "epoch": 0.43957934990439773, + "grad_norm": 2.651688814163208, + "learning_rate": 4.918166517596542e-06, + "loss": 0.0794, + "step": 2299 + }, + { + "epoch": 0.4397705544933078, + "grad_norm": 2.5015828609466553, + "learning_rate": 4.918442839926716e-06, + "loss": 0.1203, + "step": 2300 + }, + { + "epoch": 0.43996175908221796, + "grad_norm": 3.4868128299713135, + "learning_rate": 4.918719042142854e-06, + "loss": 0.3761, + "step": 2301 + }, + { + "epoch": 0.4401529636711281, + "grad_norm": 3.072807550430298, + "learning_rate": 4.918995124349335e-06, + "loss": 0.4009, + "step": 2302 + }, + { + "epoch": 0.44034416826003825, + "grad_norm": 1.4773715734481812, + "learning_rate": 4.9192710866504036e-06, + "loss": 0.1333, + "step": 2303 + }, + { + "epoch": 0.4405353728489484, + "grad_norm": 1.7132066488265991, + "learning_rate": 4.919546929150165e-06, + "loss": 0.2852, + "step": 2304 + }, + { + "epoch": 0.44072657743785854, + "grad_norm": 1.9559893608093262, + "learning_rate": 4.919822651952591e-06, + "loss": 0.0896, + "step": 2305 + }, + { + "epoch": 0.4409177820267686, + "grad_norm": 3.859649658203125, + "learning_rate": 4.920098255161518e-06, + "loss": 0.1275, + "step": 2306 + }, + { + "epoch": 0.44110898661567877, + "grad_norm": 2.1365041732788086, + "learning_rate": 4.920373738880649e-06, + "loss": 0.2009, + "step": 2307 + }, + { + "epoch": 0.4413001912045889, + "grad_norm": 2.118523120880127, + "learning_rate": 4.920649103213549e-06, + "loss": 0.2082, + "step": 2308 + }, + { + "epoch": 0.44149139579349905, + "grad_norm": 3.3567910194396973, + "learning_rate": 4.92092434826365e-06, + "loss": 0.2772, + "step": 2309 + }, + { + "epoch": 0.4416826003824092, + "grad_norm": 2.266333818435669, + "learning_rate": 4.921199474134248e-06, + "loss": 0.2863, + "step": 2310 + }, + { + "epoch": 0.4418738049713193, + "grad_norm": 2.4705424308776855, + "learning_rate": 4.921474480928509e-06, + "loss": 0.1557, + "step": 2311 + }, + { + "epoch": 0.4420650095602294, + "grad_norm": 0.8634752631187439, + "learning_rate": 4.921749368749461e-06, + "loss": 0.0341, + "step": 2312 + }, + { + "epoch": 0.44225621414913957, + "grad_norm": 3.7431929111480713, + "learning_rate": 4.9220241377e-06, + "loss": 0.337, + "step": 2313 + }, + { + "epoch": 0.4424474187380497, + "grad_norm": 3.0681843757629395, + "learning_rate": 4.92229878788289e-06, + "loss": 0.3988, + "step": 2314 + }, + { + "epoch": 0.44263862332695986, + "grad_norm": 1.9662657976150513, + "learning_rate": 4.922573319400757e-06, + "loss": 0.2526, + "step": 2315 + }, + { + "epoch": 0.44282982791587, + "grad_norm": 2.1128218173980713, + "learning_rate": 4.9228477323561e-06, + "loss": 0.1578, + "step": 2316 + }, + { + "epoch": 0.4430210325047801, + "grad_norm": 2.3029232025146484, + "learning_rate": 4.923122026851282e-06, + "loss": 0.188, + "step": 2317 + }, + { + "epoch": 0.44321223709369023, + "grad_norm": 1.4844509363174438, + "learning_rate": 4.923396202988534e-06, + "loss": 0.0395, + "step": 2318 + }, + { + "epoch": 0.4434034416826004, + "grad_norm": 1.5008915662765503, + "learning_rate": 4.923670260869953e-06, + "loss": 0.0757, + "step": 2319 + }, + { + "epoch": 0.4435946462715105, + "grad_norm": 3.172825813293457, + "learning_rate": 4.923944200597508e-06, + "loss": 0.5372, + "step": 2320 + }, + { + "epoch": 0.44378585086042066, + "grad_norm": 3.6113059520721436, + "learning_rate": 4.924218022273032e-06, + "loss": 0.4811, + "step": 2321 + }, + { + "epoch": 0.4439770554493308, + "grad_norm": 1.0926949977874756, + "learning_rate": 4.9244917259982296e-06, + "loss": 0.1378, + "step": 2322 + }, + { + "epoch": 0.4441682600382409, + "grad_norm": 2.9779396057128906, + "learning_rate": 4.9247653118746715e-06, + "loss": 0.1651, + "step": 2323 + }, + { + "epoch": 0.44435946462715104, + "grad_norm": 1.3684827089309692, + "learning_rate": 4.9250387800038e-06, + "loss": 0.1091, + "step": 2324 + }, + { + "epoch": 0.4445506692160612, + "grad_norm": 1.5437180995941162, + "learning_rate": 4.9253121304869235e-06, + "loss": 0.0794, + "step": 2325 + }, + { + "epoch": 0.4447418738049713, + "grad_norm": 1.8073052167892456, + "learning_rate": 4.925585363425222e-06, + "loss": 0.1806, + "step": 2326 + }, + { + "epoch": 0.44493307839388146, + "grad_norm": 2.79695463180542, + "learning_rate": 4.925858478919743e-06, + "loss": 0.4065, + "step": 2327 + }, + { + "epoch": 0.4451242829827916, + "grad_norm": 2.834144115447998, + "learning_rate": 4.926131477071406e-06, + "loss": 0.4724, + "step": 2328 + }, + { + "epoch": 0.4453154875717017, + "grad_norm": 1.8955655097961426, + "learning_rate": 4.926404357980999e-06, + "loss": 0.1503, + "step": 2329 + }, + { + "epoch": 0.44550669216061184, + "grad_norm": 2.6821250915527344, + "learning_rate": 4.9266771217491796e-06, + "loss": 0.172, + "step": 2330 + }, + { + "epoch": 0.445697896749522, + "grad_norm": 1.803351640701294, + "learning_rate": 4.926949768476479e-06, + "loss": 0.1085, + "step": 2331 + }, + { + "epoch": 0.4458891013384321, + "grad_norm": 1.929173469543457, + "learning_rate": 4.927222298263295e-06, + "loss": 0.1473, + "step": 2332 + }, + { + "epoch": 0.44608030592734227, + "grad_norm": 1.8458374738693237, + "learning_rate": 4.927494711209899e-06, + "loss": 0.2217, + "step": 2333 + }, + { + "epoch": 0.4462715105162524, + "grad_norm": 2.2545127868652344, + "learning_rate": 4.927767007416432e-06, + "loss": 0.2251, + "step": 2334 + }, + { + "epoch": 0.4464627151051625, + "grad_norm": 2.347503185272217, + "learning_rate": 4.928039186982908e-06, + "loss": 0.2747, + "step": 2335 + }, + { + "epoch": 0.44665391969407264, + "grad_norm": 2.8446438312530518, + "learning_rate": 4.92831125000921e-06, + "loss": 0.2503, + "step": 2336 + }, + { + "epoch": 0.4468451242829828, + "grad_norm": 1.7588080167770386, + "learning_rate": 4.928583196595095e-06, + "loss": 0.0731, + "step": 2337 + }, + { + "epoch": 0.44703632887189293, + "grad_norm": 2.6320197582244873, + "learning_rate": 4.92885502684019e-06, + "loss": 0.1976, + "step": 2338 + }, + { + "epoch": 0.4472275334608031, + "grad_norm": 2.0502002239227295, + "learning_rate": 4.929126740843998e-06, + "loss": 0.2047, + "step": 2339 + }, + { + "epoch": 0.4474187380497132, + "grad_norm": 1.7559682130813599, + "learning_rate": 4.929398338705889e-06, + "loss": 0.1507, + "step": 2340 + }, + { + "epoch": 0.4476099426386233, + "grad_norm": 1.752091407775879, + "learning_rate": 4.92966982052511e-06, + "loss": 0.1281, + "step": 2341 + }, + { + "epoch": 0.44780114722753345, + "grad_norm": 3.454530954360962, + "learning_rate": 4.92994118640078e-06, + "loss": 0.2777, + "step": 2342 + }, + { + "epoch": 0.4479923518164436, + "grad_norm": 3.0871379375457764, + "learning_rate": 4.930212436431887e-06, + "loss": 0.2577, + "step": 2343 + }, + { + "epoch": 0.44818355640535373, + "grad_norm": 2.506565809249878, + "learning_rate": 4.9304835707173e-06, + "loss": 0.1185, + "step": 2344 + }, + { + "epoch": 0.4483747609942639, + "grad_norm": 2.834341287612915, + "learning_rate": 4.930754589355753e-06, + "loss": 0.4537, + "step": 2345 + }, + { + "epoch": 0.448565965583174, + "grad_norm": 3.0927183628082275, + "learning_rate": 4.931025492445859e-06, + "loss": 0.6336, + "step": 2346 + }, + { + "epoch": 0.4487571701720841, + "grad_norm": 2.212230920791626, + "learning_rate": 4.931296280086106e-06, + "loss": 0.2144, + "step": 2347 + }, + { + "epoch": 0.44894837476099425, + "grad_norm": 1.8910106420516968, + "learning_rate": 4.931566952374851e-06, + "loss": 0.1857, + "step": 2348 + }, + { + "epoch": 0.4491395793499044, + "grad_norm": 1.7347790002822876, + "learning_rate": 4.93183750941033e-06, + "loss": 0.094, + "step": 2349 + }, + { + "epoch": 0.44933078393881454, + "grad_norm": 2.411543846130371, + "learning_rate": 4.93210795129065e-06, + "loss": 0.1578, + "step": 2350 + }, + { + "epoch": 0.4495219885277247, + "grad_norm": 1.223103642463684, + "learning_rate": 4.932378278113796e-06, + "loss": 0.1855, + "step": 2351 + }, + { + "epoch": 0.4497131931166348, + "grad_norm": 2.896897554397583, + "learning_rate": 4.932648489977627e-06, + "loss": 0.562, + "step": 2352 + }, + { + "epoch": 0.4499043977055449, + "grad_norm": 2.2476577758789062, + "learning_rate": 4.932918586979875e-06, + "loss": 0.2595, + "step": 2353 + }, + { + "epoch": 0.45009560229445505, + "grad_norm": 1.3098033666610718, + "learning_rate": 4.93318856921815e-06, + "loss": 0.0972, + "step": 2354 + }, + { + "epoch": 0.4502868068833652, + "grad_norm": 1.3734967708587646, + "learning_rate": 4.933458436789937e-06, + "loss": 0.1431, + "step": 2355 + }, + { + "epoch": 0.45047801147227534, + "grad_norm": 1.1524994373321533, + "learning_rate": 4.933728189792596e-06, + "loss": 0.0973, + "step": 2356 + }, + { + "epoch": 0.4506692160611855, + "grad_norm": 3.060199022293091, + "learning_rate": 4.933997828323365e-06, + "loss": 0.4424, + "step": 2357 + }, + { + "epoch": 0.45086042065009563, + "grad_norm": 3.3533732891082764, + "learning_rate": 4.934267352479356e-06, + "loss": 0.3997, + "step": 2358 + }, + { + "epoch": 0.4510516252390057, + "grad_norm": 2.261230945587158, + "learning_rate": 4.934536762357558e-06, + "loss": 0.2667, + "step": 2359 + }, + { + "epoch": 0.45124282982791586, + "grad_norm": 1.9926918745040894, + "learning_rate": 4.934806058054837e-06, + "loss": 0.1408, + "step": 2360 + }, + { + "epoch": 0.451434034416826, + "grad_norm": 1.3215872049331665, + "learning_rate": 4.935075239667939e-06, + "loss": 0.1143, + "step": 2361 + }, + { + "epoch": 0.45162523900573615, + "grad_norm": 2.423924684524536, + "learning_rate": 4.93534430729348e-06, + "loss": 0.1111, + "step": 2362 + }, + { + "epoch": 0.4518164435946463, + "grad_norm": 2.364931583404541, + "learning_rate": 4.935613261027959e-06, + "loss": 0.3041, + "step": 2363 + }, + { + "epoch": 0.45200764818355643, + "grad_norm": 3.6321725845336914, + "learning_rate": 4.9358821009677516e-06, + "loss": 0.7906, + "step": 2364 + }, + { + "epoch": 0.4521988527724665, + "grad_norm": 1.2970473766326904, + "learning_rate": 4.93615082720911e-06, + "loss": 0.1492, + "step": 2365 + }, + { + "epoch": 0.45239005736137666, + "grad_norm": 1.3972055912017822, + "learning_rate": 4.936419439848165e-06, + "loss": 0.2284, + "step": 2366 + }, + { + "epoch": 0.4525812619502868, + "grad_norm": 2.145207166671753, + "learning_rate": 4.936687938980925e-06, + "loss": 0.1145, + "step": 2367 + }, + { + "epoch": 0.45277246653919695, + "grad_norm": 1.5136973857879639, + "learning_rate": 4.936956324703276e-06, + "loss": 0.1319, + "step": 2368 + }, + { + "epoch": 0.4529636711281071, + "grad_norm": 2.3273026943206787, + "learning_rate": 4.937224597110986e-06, + "loss": 0.2864, + "step": 2369 + }, + { + "epoch": 0.45315487571701724, + "grad_norm": 2.093433141708374, + "learning_rate": 4.937492756299697e-06, + "loss": 0.1958, + "step": 2370 + }, + { + "epoch": 0.4533460803059273, + "grad_norm": 1.7759649753570557, + "learning_rate": 4.937760802364934e-06, + "loss": 0.3099, + "step": 2371 + }, + { + "epoch": 0.45353728489483747, + "grad_norm": 2.3750076293945312, + "learning_rate": 4.938028735402098e-06, + "loss": 0.2607, + "step": 2372 + }, + { + "epoch": 0.4537284894837476, + "grad_norm": 1.7167375087738037, + "learning_rate": 4.938296555506473e-06, + "loss": 0.1179, + "step": 2373 + }, + { + "epoch": 0.45391969407265775, + "grad_norm": 2.261927604675293, + "learning_rate": 4.938564262773219e-06, + "loss": 0.1355, + "step": 2374 + }, + { + "epoch": 0.4541108986615679, + "grad_norm": 1.3687342405319214, + "learning_rate": 4.938831857297376e-06, + "loss": 0.0843, + "step": 2375 + }, + { + "epoch": 0.454302103250478, + "grad_norm": 2.428593158721924, + "learning_rate": 4.939099339173867e-06, + "loss": 0.1995, + "step": 2376 + }, + { + "epoch": 0.4544933078393881, + "grad_norm": 1.6332013607025146, + "learning_rate": 4.9393667084974925e-06, + "loss": 0.1263, + "step": 2377 + }, + { + "epoch": 0.45468451242829827, + "grad_norm": 1.8396623134613037, + "learning_rate": 4.9396339653629345e-06, + "loss": 0.153, + "step": 2378 + }, + { + "epoch": 0.4548757170172084, + "grad_norm": 1.5843632221221924, + "learning_rate": 4.939901109864755e-06, + "loss": 0.1409, + "step": 2379 + }, + { + "epoch": 0.45506692160611856, + "grad_norm": 2.6990528106689453, + "learning_rate": 4.940168142097398e-06, + "loss": 0.2067, + "step": 2380 + }, + { + "epoch": 0.4552581261950287, + "grad_norm": 1.8671232461929321, + "learning_rate": 4.940435062155186e-06, + "loss": 0.105, + "step": 2381 + }, + { + "epoch": 0.4554493307839388, + "grad_norm": 2.4851431846618652, + "learning_rate": 4.940701870132325e-06, + "loss": 0.277, + "step": 2382 + }, + { + "epoch": 0.45564053537284893, + "grad_norm": 3.0492827892303467, + "learning_rate": 4.940968566122902e-06, + "loss": 0.4704, + "step": 2383 + }, + { + "epoch": 0.4558317399617591, + "grad_norm": 3.329960823059082, + "learning_rate": 4.941235150220885e-06, + "loss": 0.1627, + "step": 2384 + }, + { + "epoch": 0.4560229445506692, + "grad_norm": 2.0267832279205322, + "learning_rate": 4.941501622520123e-06, + "loss": 0.1374, + "step": 2385 + }, + { + "epoch": 0.45621414913957936, + "grad_norm": 1.8187408447265625, + "learning_rate": 4.941767983114349e-06, + "loss": 0.2506, + "step": 2386 + }, + { + "epoch": 0.4564053537284895, + "grad_norm": 1.1165252923965454, + "learning_rate": 4.942034232097177e-06, + "loss": 0.1032, + "step": 2387 + }, + { + "epoch": 0.4565965583173996, + "grad_norm": 2.010648250579834, + "learning_rate": 4.942300369562102e-06, + "loss": 0.1881, + "step": 2388 + }, + { + "epoch": 0.45678776290630974, + "grad_norm": 4.388535499572754, + "learning_rate": 4.942566395602506e-06, + "loss": 0.338, + "step": 2389 + }, + { + "epoch": 0.4569789674952199, + "grad_norm": 2.1628904342651367, + "learning_rate": 4.942832310311647e-06, + "loss": 0.2078, + "step": 2390 + }, + { + "epoch": 0.45717017208413, + "grad_norm": 1.473975658416748, + "learning_rate": 4.943098113782672e-06, + "loss": 0.1039, + "step": 2391 + }, + { + "epoch": 0.45736137667304017, + "grad_norm": 2.243346691131592, + "learning_rate": 4.94336380610861e-06, + "loss": 0.0887, + "step": 2392 + }, + { + "epoch": 0.4575525812619503, + "grad_norm": 1.2243520021438599, + "learning_rate": 4.9436293873823705e-06, + "loss": 0.1352, + "step": 2393 + }, + { + "epoch": 0.4577437858508604, + "grad_norm": 1.9870951175689697, + "learning_rate": 4.943894857696749e-06, + "loss": 0.1092, + "step": 2394 + }, + { + "epoch": 0.45793499043977054, + "grad_norm": 1.85684335231781, + "learning_rate": 4.9441602171444225e-06, + "loss": 0.1827, + "step": 2395 + }, + { + "epoch": 0.4581261950286807, + "grad_norm": 2.156700849533081, + "learning_rate": 4.944425465817956e-06, + "loss": 0.1789, + "step": 2396 + }, + { + "epoch": 0.4583173996175908, + "grad_norm": 1.204390287399292, + "learning_rate": 4.944690603809794e-06, + "loss": 0.0855, + "step": 2397 + }, + { + "epoch": 0.45850860420650097, + "grad_norm": 1.6291753053665161, + "learning_rate": 4.944955631212269e-06, + "loss": 0.1035, + "step": 2398 + }, + { + "epoch": 0.4586998087954111, + "grad_norm": 2.399693012237549, + "learning_rate": 4.945220548117595e-06, + "loss": 0.2533, + "step": 2399 + }, + { + "epoch": 0.4588910133843212, + "grad_norm": 1.6935648918151855, + "learning_rate": 4.945485354617874e-06, + "loss": 0.089, + "step": 2400 + }, + { + "epoch": 0.45908221797323134, + "grad_norm": 1.8837876319885254, + "learning_rate": 4.945750050805088e-06, + "loss": 0.2955, + "step": 2401 + }, + { + "epoch": 0.4592734225621415, + "grad_norm": 1.520376205444336, + "learning_rate": 4.946014636771111e-06, + "loss": 0.0884, + "step": 2402 + }, + { + "epoch": 0.45946462715105163, + "grad_norm": 6.492906093597412, + "learning_rate": 4.946279112607695e-06, + "loss": 0.8193, + "step": 2403 + }, + { + "epoch": 0.4596558317399618, + "grad_norm": 2.74912691116333, + "learning_rate": 4.946543478406484e-06, + "loss": 0.2926, + "step": 2404 + }, + { + "epoch": 0.4598470363288719, + "grad_norm": 1.9996367692947388, + "learning_rate": 4.946807734259001e-06, + "loss": 0.1018, + "step": 2405 + }, + { + "epoch": 0.460038240917782, + "grad_norm": 1.903185486793518, + "learning_rate": 4.947071880256661e-06, + "loss": 0.243, + "step": 2406 + }, + { + "epoch": 0.46022944550669215, + "grad_norm": 2.6955156326293945, + "learning_rate": 4.947335916490763e-06, + "loss": 0.3418, + "step": 2407 + }, + { + "epoch": 0.4604206500956023, + "grad_norm": 4.031216144561768, + "learning_rate": 4.947599843052489e-06, + "loss": 0.4078, + "step": 2408 + }, + { + "epoch": 0.46061185468451243, + "grad_norm": 2.110466241836548, + "learning_rate": 4.947863660032912e-06, + "loss": 0.2372, + "step": 2409 + }, + { + "epoch": 0.4608030592734226, + "grad_norm": 1.633364200592041, + "learning_rate": 4.9481273675229894e-06, + "loss": 0.1019, + "step": 2410 + }, + { + "epoch": 0.4609942638623327, + "grad_norm": 1.452785611152649, + "learning_rate": 4.948390965613565e-06, + "loss": 0.0943, + "step": 2411 + }, + { + "epoch": 0.4611854684512428, + "grad_norm": 2.504720449447632, + "learning_rate": 4.948654454395372e-06, + "loss": 0.1355, + "step": 2412 + }, + { + "epoch": 0.46137667304015295, + "grad_norm": 2.703850269317627, + "learning_rate": 4.948917833959027e-06, + "loss": 0.3414, + "step": 2413 + }, + { + "epoch": 0.4615678776290631, + "grad_norm": 2.818969964981079, + "learning_rate": 4.949181104395038e-06, + "loss": 0.528, + "step": 2414 + }, + { + "epoch": 0.46175908221797324, + "grad_norm": 2.11025071144104, + "learning_rate": 4.949444265793797e-06, + "loss": 0.1207, + "step": 2415 + }, + { + "epoch": 0.4619502868068834, + "grad_norm": 2.019381523132324, + "learning_rate": 4.949707318245586e-06, + "loss": 0.1208, + "step": 2416 + }, + { + "epoch": 0.4621414913957935, + "grad_norm": 2.8823390007019043, + "learning_rate": 4.949970261840574e-06, + "loss": 0.3685, + "step": 2417 + }, + { + "epoch": 0.4623326959847036, + "grad_norm": 1.2599728107452393, + "learning_rate": 4.950233096668818e-06, + "loss": 0.0653, + "step": 2418 + }, + { + "epoch": 0.46252390057361376, + "grad_norm": 2.618422508239746, + "learning_rate": 4.950495822820266e-06, + "loss": 0.1693, + "step": 2419 + }, + { + "epoch": 0.4627151051625239, + "grad_norm": 2.234100341796875, + "learning_rate": 4.950758440384748e-06, + "loss": 0.1476, + "step": 2420 + }, + { + "epoch": 0.46290630975143404, + "grad_norm": 1.7235227823257446, + "learning_rate": 4.95102094945199e-06, + "loss": 0.1403, + "step": 2421 + }, + { + "epoch": 0.4630975143403442, + "grad_norm": 3.084684133529663, + "learning_rate": 4.951283350111601e-06, + "loss": 0.4766, + "step": 2422 + }, + { + "epoch": 0.46328871892925433, + "grad_norm": 1.7967596054077148, + "learning_rate": 4.951545642453086e-06, + "loss": 0.1491, + "step": 2423 + }, + { + "epoch": 0.4634799235181644, + "grad_norm": 2.608936071395874, + "learning_rate": 4.95180782656583e-06, + "loss": 0.2224, + "step": 2424 + }, + { + "epoch": 0.46367112810707456, + "grad_norm": 1.7240525484085083, + "learning_rate": 4.952069902539114e-06, + "loss": 0.1001, + "step": 2425 + }, + { + "epoch": 0.4638623326959847, + "grad_norm": 2.553189754486084, + "learning_rate": 4.952331870462108e-06, + "loss": 0.3537, + "step": 2426 + }, + { + "epoch": 0.46405353728489485, + "grad_norm": 1.4191901683807373, + "learning_rate": 4.952593730423869e-06, + "loss": 0.0765, + "step": 2427 + }, + { + "epoch": 0.464244741873805, + "grad_norm": 2.872032880783081, + "learning_rate": 4.952855482513347e-06, + "loss": 0.5042, + "step": 2428 + }, + { + "epoch": 0.46443594646271513, + "grad_norm": 1.699519157409668, + "learning_rate": 4.95311712681938e-06, + "loss": 0.2303, + "step": 2429 + }, + { + "epoch": 0.4646271510516252, + "grad_norm": 3.224975109100342, + "learning_rate": 4.953378663430695e-06, + "loss": 0.1607, + "step": 2430 + }, + { + "epoch": 0.46481835564053536, + "grad_norm": 2.4731459617614746, + "learning_rate": 4.953640092435914e-06, + "loss": 0.1178, + "step": 2431 + }, + { + "epoch": 0.4650095602294455, + "grad_norm": 2.1776509284973145, + "learning_rate": 4.953901413923546e-06, + "loss": 0.2036, + "step": 2432 + }, + { + "epoch": 0.46520076481835565, + "grad_norm": 2.7886409759521484, + "learning_rate": 4.9541626279819915e-06, + "loss": 0.334, + "step": 2433 + }, + { + "epoch": 0.4653919694072658, + "grad_norm": 1.9042888879776, + "learning_rate": 4.954423734699544e-06, + "loss": 0.2839, + "step": 2434 + }, + { + "epoch": 0.4655831739961759, + "grad_norm": 3.304387092590332, + "learning_rate": 4.954684734164385e-06, + "loss": 0.5778, + "step": 2435 + }, + { + "epoch": 0.465774378585086, + "grad_norm": 2.7068235874176025, + "learning_rate": 4.954945626464589e-06, + "loss": 0.2185, + "step": 2436 + }, + { + "epoch": 0.46596558317399617, + "grad_norm": 2.0537827014923096, + "learning_rate": 4.955206411688123e-06, + "loss": 0.1307, + "step": 2437 + }, + { + "epoch": 0.4661567877629063, + "grad_norm": 1.799789309501648, + "learning_rate": 4.955467089922844e-06, + "loss": 0.1601, + "step": 2438 + }, + { + "epoch": 0.46634799235181645, + "grad_norm": 2.1717841625213623, + "learning_rate": 4.955727661256503e-06, + "loss": 0.2017, + "step": 2439 + }, + { + "epoch": 0.4665391969407266, + "grad_norm": 2.0859739780426025, + "learning_rate": 4.95598812577674e-06, + "loss": 0.1489, + "step": 2440 + }, + { + "epoch": 0.4667304015296367, + "grad_norm": 1.309865117073059, + "learning_rate": 4.95624848357109e-06, + "loss": 0.0694, + "step": 2441 + }, + { + "epoch": 0.46692160611854683, + "grad_norm": 1.9916973114013672, + "learning_rate": 4.956508734726978e-06, + "loss": 0.1463, + "step": 2442 + }, + { + "epoch": 0.46711281070745697, + "grad_norm": 2.035287618637085, + "learning_rate": 4.956768879331726e-06, + "loss": 0.1042, + "step": 2443 + }, + { + "epoch": 0.4673040152963671, + "grad_norm": 1.9258490800857544, + "learning_rate": 4.957028917472544e-06, + "loss": 0.1315, + "step": 2444 + }, + { + "epoch": 0.46749521988527726, + "grad_norm": 2.847909688949585, + "learning_rate": 4.957288849236539e-06, + "loss": 0.4624, + "step": 2445 + }, + { + "epoch": 0.4676864244741874, + "grad_norm": 3.1480817794799805, + "learning_rate": 4.957548674710705e-06, + "loss": 0.3211, + "step": 2446 + }, + { + "epoch": 0.4678776290630975, + "grad_norm": 2.3119571208953857, + "learning_rate": 4.957808393981937e-06, + "loss": 0.2032, + "step": 2447 + }, + { + "epoch": 0.46806883365200763, + "grad_norm": 2.356323480606079, + "learning_rate": 4.9580680071370174e-06, + "loss": 0.2908, + "step": 2448 + }, + { + "epoch": 0.4682600382409178, + "grad_norm": 0.683710515499115, + "learning_rate": 4.958327514262626e-06, + "loss": 0.0256, + "step": 2449 + }, + { + "epoch": 0.4684512428298279, + "grad_norm": 2.147235155105591, + "learning_rate": 4.9585869154453355e-06, + "loss": 0.186, + "step": 2450 + }, + { + "epoch": 0.46864244741873806, + "grad_norm": 2.6111466884613037, + "learning_rate": 4.958846210771611e-06, + "loss": 0.3993, + "step": 2451 + }, + { + "epoch": 0.4688336520076482, + "grad_norm": 1.6357756853103638, + "learning_rate": 4.959105400327814e-06, + "loss": 0.1385, + "step": 2452 + }, + { + "epoch": 0.4690248565965583, + "grad_norm": 2.0699405670166016, + "learning_rate": 4.9593644842001994e-06, + "loss": 0.1961, + "step": 2453 + }, + { + "epoch": 0.46921606118546844, + "grad_norm": 2.4611475467681885, + "learning_rate": 4.959623462474916e-06, + "loss": 0.194, + "step": 2454 + }, + { + "epoch": 0.4694072657743786, + "grad_norm": 2.70579195022583, + "learning_rate": 4.9598823352380075e-06, + "loss": 0.3494, + "step": 2455 + }, + { + "epoch": 0.4695984703632887, + "grad_norm": 1.362758994102478, + "learning_rate": 4.9601411025754144e-06, + "loss": 0.0551, + "step": 2456 + }, + { + "epoch": 0.46978967495219887, + "grad_norm": 2.263684034347534, + "learning_rate": 4.960399764572971e-06, + "loss": 0.4414, + "step": 2457 + }, + { + "epoch": 0.469980879541109, + "grad_norm": 2.3129522800445557, + "learning_rate": 4.960658321316405e-06, + "loss": 0.337, + "step": 2458 + }, + { + "epoch": 0.4701720841300191, + "grad_norm": 3.598054885864258, + "learning_rate": 4.960916772891341e-06, + "loss": 0.6028, + "step": 2459 + }, + { + "epoch": 0.47036328871892924, + "grad_norm": 2.352046489715576, + "learning_rate": 4.9611751193833e-06, + "loss": 0.1063, + "step": 2460 + }, + { + "epoch": 0.4705544933078394, + "grad_norm": 2.858907461166382, + "learning_rate": 4.9614333608776984e-06, + "loss": 0.3046, + "step": 2461 + }, + { + "epoch": 0.4707456978967495, + "grad_norm": 1.6364027261734009, + "learning_rate": 4.9616914974598485e-06, + "loss": 0.0841, + "step": 2462 + }, + { + "epoch": 0.47093690248565967, + "grad_norm": 3.1622374057769775, + "learning_rate": 4.961949529214955e-06, + "loss": 0.3993, + "step": 2463 + }, + { + "epoch": 0.4711281070745698, + "grad_norm": 2.4431493282318115, + "learning_rate": 4.962207456228127e-06, + "loss": 0.3394, + "step": 2464 + }, + { + "epoch": 0.4713193116634799, + "grad_norm": 1.4499131441116333, + "learning_rate": 4.96246527858436e-06, + "loss": 0.1154, + "step": 2465 + }, + { + "epoch": 0.47151051625239004, + "grad_norm": 1.915971279144287, + "learning_rate": 4.962722996368555e-06, + "loss": 0.3057, + "step": 2466 + }, + { + "epoch": 0.4717017208413002, + "grad_norm": 2.2309677600860596, + "learning_rate": 4.9629806096655045e-06, + "loss": 0.3469, + "step": 2467 + }, + { + "epoch": 0.47189292543021033, + "grad_norm": 1.4278203248977661, + "learning_rate": 4.963238118559899e-06, + "loss": 0.0782, + "step": 2468 + }, + { + "epoch": 0.4720841300191205, + "grad_norm": 1.626359462738037, + "learning_rate": 4.963495523136326e-06, + "loss": 0.0824, + "step": 2469 + }, + { + "epoch": 0.4722753346080306, + "grad_norm": 6.234444618225098, + "learning_rate": 4.96375282347927e-06, + "loss": 0.3704, + "step": 2470 + }, + { + "epoch": 0.4724665391969407, + "grad_norm": 1.2472459077835083, + "learning_rate": 4.964010019673117e-06, + "loss": 0.1072, + "step": 2471 + }, + { + "epoch": 0.47265774378585085, + "grad_norm": 0.9773194193840027, + "learning_rate": 4.9642671118021435e-06, + "loss": 0.1166, + "step": 2472 + }, + { + "epoch": 0.472848948374761, + "grad_norm": 1.2697404623031616, + "learning_rate": 4.9645240999505284e-06, + "loss": 0.0847, + "step": 2473 + }, + { + "epoch": 0.47304015296367113, + "grad_norm": 1.7373942136764526, + "learning_rate": 4.9647809842023485e-06, + "loss": 0.1414, + "step": 2474 + }, + { + "epoch": 0.4732313575525813, + "grad_norm": 1.2510534524917603, + "learning_rate": 4.965037764641576e-06, + "loss": 0.0646, + "step": 2475 + }, + { + "epoch": 0.4734225621414914, + "grad_norm": 3.523763418197632, + "learning_rate": 4.965294441352084e-06, + "loss": 0.6183, + "step": 2476 + }, + { + "epoch": 0.4736137667304015, + "grad_norm": 2.536613941192627, + "learning_rate": 4.965551014417641e-06, + "loss": 0.307, + "step": 2477 + }, + { + "epoch": 0.47380497131931165, + "grad_norm": 1.2308690547943115, + "learning_rate": 4.965807483921919e-06, + "loss": 0.121, + "step": 2478 + }, + { + "epoch": 0.4739961759082218, + "grad_norm": 1.2739108800888062, + "learning_rate": 4.966063849948484e-06, + "loss": 0.0985, + "step": 2479 + }, + { + "epoch": 0.47418738049713194, + "grad_norm": 4.273107528686523, + "learning_rate": 4.966320112580802e-06, + "loss": 0.1709, + "step": 2480 + }, + { + "epoch": 0.4743785850860421, + "grad_norm": 2.6034741401672363, + "learning_rate": 4.96657627190224e-06, + "loss": 0.1497, + "step": 2481 + }, + { + "epoch": 0.4745697896749522, + "grad_norm": 2.563988208770752, + "learning_rate": 4.966832327996062e-06, + "loss": 0.3513, + "step": 2482 + }, + { + "epoch": 0.4747609942638623, + "grad_norm": 1.5361602306365967, + "learning_rate": 4.967088280945433e-06, + "loss": 0.1868, + "step": 2483 + }, + { + "epoch": 0.47495219885277246, + "grad_norm": 1.8172998428344727, + "learning_rate": 4.9673441308334165e-06, + "loss": 0.2232, + "step": 2484 + }, + { + "epoch": 0.4751434034416826, + "grad_norm": 1.6137768030166626, + "learning_rate": 4.967599877742975e-06, + "loss": 0.1634, + "step": 2485 + }, + { + "epoch": 0.47533460803059274, + "grad_norm": 1.552061915397644, + "learning_rate": 4.967855521756973e-06, + "loss": 0.094, + "step": 2486 + }, + { + "epoch": 0.4755258126195029, + "grad_norm": 2.0277292728424072, + "learning_rate": 4.9681110629581734e-06, + "loss": 0.1203, + "step": 2487 + }, + { + "epoch": 0.47571701720841303, + "grad_norm": 2.038464307785034, + "learning_rate": 4.96836650142924e-06, + "loss": 0.164, + "step": 2488 + }, + { + "epoch": 0.4759082217973231, + "grad_norm": 2.4230268001556396, + "learning_rate": 4.968621837252737e-06, + "loss": 0.3603, + "step": 2489 + }, + { + "epoch": 0.47609942638623326, + "grad_norm": 1.4726464748382568, + "learning_rate": 4.9688770705111285e-06, + "loss": 0.1528, + "step": 2490 + }, + { + "epoch": 0.4762906309751434, + "grad_norm": 1.6719284057617188, + "learning_rate": 4.969132201286779e-06, + "loss": 0.2506, + "step": 2491 + }, + { + "epoch": 0.47648183556405355, + "grad_norm": 1.5102533102035522, + "learning_rate": 4.969387229661954e-06, + "loss": 0.0726, + "step": 2492 + }, + { + "epoch": 0.4766730401529637, + "grad_norm": 2.305950880050659, + "learning_rate": 4.969642155718823e-06, + "loss": 0.2434, + "step": 2493 + }, + { + "epoch": 0.4768642447418738, + "grad_norm": 2.0720887184143066, + "learning_rate": 4.969896979539451e-06, + "loss": 0.249, + "step": 2494 + }, + { + "epoch": 0.4770554493307839, + "grad_norm": 2.4147567749023438, + "learning_rate": 4.97015170120581e-06, + "loss": 0.3391, + "step": 2495 + }, + { + "epoch": 0.47724665391969406, + "grad_norm": 2.9447648525238037, + "learning_rate": 4.9704063207997684e-06, + "loss": 0.4681, + "step": 2496 + }, + { + "epoch": 0.4774378585086042, + "grad_norm": 2.2584710121154785, + "learning_rate": 4.9706608384031e-06, + "loss": 0.1599, + "step": 2497 + }, + { + "epoch": 0.47762906309751435, + "grad_norm": 1.4370557069778442, + "learning_rate": 4.970915254097478e-06, + "loss": 0.1377, + "step": 2498 + }, + { + "epoch": 0.4778202676864245, + "grad_norm": 1.5512670278549194, + "learning_rate": 4.971169567964479e-06, + "loss": 0.111, + "step": 2499 + }, + { + "epoch": 0.4780114722753346, + "grad_norm": 1.5349520444869995, + "learning_rate": 4.9714237800855815e-06, + "loss": 0.0762, + "step": 2500 + }, + { + "epoch": 0.4780114722753346, + "eval_runtime": 838.1854, + "eval_samples_per_second": 1.83, + "eval_steps_per_second": 0.229, + "step": 2500 + }, + { + "epoch": 0.4782026768642447, + "grad_norm": 1.6777958869934082, + "learning_rate": 4.971677890542167e-06, + "loss": 0.1465, + "step": 2501 + }, + { + "epoch": 0.47839388145315487, + "grad_norm": 1.7863197326660156, + "learning_rate": 4.971931899415515e-06, + "loss": 0.1716, + "step": 2502 + }, + { + "epoch": 0.478585086042065, + "grad_norm": 1.1726549863815308, + "learning_rate": 4.972185806786815e-06, + "loss": 0.1211, + "step": 2503 + }, + { + "epoch": 0.47877629063097515, + "grad_norm": 3.095215320587158, + "learning_rate": 4.972439612737152e-06, + "loss": 0.3554, + "step": 2504 + }, + { + "epoch": 0.4789674952198853, + "grad_norm": 1.327844262123108, + "learning_rate": 4.972693317347518e-06, + "loss": 0.0942, + "step": 2505 + }, + { + "epoch": 0.4791586998087954, + "grad_norm": 1.6837940216064453, + "learning_rate": 4.9729469206988075e-06, + "loss": 0.0878, + "step": 2506 + }, + { + "epoch": 0.47934990439770553, + "grad_norm": 2.4635088443756104, + "learning_rate": 4.973200422871818e-06, + "loss": 0.1642, + "step": 2507 + }, + { + "epoch": 0.47954110898661567, + "grad_norm": 2.1021690368652344, + "learning_rate": 4.973453823947249e-06, + "loss": 0.3213, + "step": 2508 + }, + { + "epoch": 0.4797323135755258, + "grad_norm": 1.8503338098526, + "learning_rate": 4.973707124005704e-06, + "loss": 0.1949, + "step": 2509 + }, + { + "epoch": 0.47992351816443596, + "grad_norm": 1.9696707725524902, + "learning_rate": 4.973960323127691e-06, + "loss": 0.3777, + "step": 2510 + }, + { + "epoch": 0.4801147227533461, + "grad_norm": 1.7098878622055054, + "learning_rate": 4.974213421393625e-06, + "loss": 0.1056, + "step": 2511 + }, + { + "epoch": 0.4803059273422562, + "grad_norm": 2.5690712928771973, + "learning_rate": 4.974466418883816e-06, + "loss": 0.1538, + "step": 2512 + }, + { + "epoch": 0.48049713193116633, + "grad_norm": 2.4078571796417236, + "learning_rate": 4.974719315678486e-06, + "loss": 0.4023, + "step": 2513 + }, + { + "epoch": 0.4806883365200765, + "grad_norm": 1.6684014797210693, + "learning_rate": 4.974972111857759e-06, + "loss": 0.1205, + "step": 2514 + }, + { + "epoch": 0.4808795411089866, + "grad_norm": 2.405083417892456, + "learning_rate": 4.975224807501662e-06, + "loss": 0.3595, + "step": 2515 + }, + { + "epoch": 0.48107074569789676, + "grad_norm": 1.4291092157363892, + "learning_rate": 4.975477402690129e-06, + "loss": 0.0913, + "step": 2516 + }, + { + "epoch": 0.4812619502868069, + "grad_norm": 6.145540237426758, + "learning_rate": 4.975729897502997e-06, + "loss": 0.1871, + "step": 2517 + }, + { + "epoch": 0.481453154875717, + "grad_norm": 1.5036752223968506, + "learning_rate": 4.975982292020009e-06, + "loss": 0.1577, + "step": 2518 + }, + { + "epoch": 0.48164435946462714, + "grad_norm": 2.249161720275879, + "learning_rate": 4.97623458632081e-06, + "loss": 0.1663, + "step": 2519 + }, + { + "epoch": 0.4818355640535373, + "grad_norm": 1.631930947303772, + "learning_rate": 4.976486780484955e-06, + "loss": 0.1467, + "step": 2520 + }, + { + "epoch": 0.4820267686424474, + "grad_norm": 1.7321642637252808, + "learning_rate": 4.9767388745919e-06, + "loss": 0.2776, + "step": 2521 + }, + { + "epoch": 0.48221797323135757, + "grad_norm": 3.218299388885498, + "learning_rate": 4.976990868721009e-06, + "loss": 0.5657, + "step": 2522 + }, + { + "epoch": 0.4824091778202677, + "grad_norm": 2.6193222999572754, + "learning_rate": 4.977242762951551e-06, + "loss": 0.2718, + "step": 2523 + }, + { + "epoch": 0.4826003824091778, + "grad_norm": 2.339751720428467, + "learning_rate": 4.977494557362701e-06, + "loss": 0.1729, + "step": 2524 + }, + { + "epoch": 0.48279158699808794, + "grad_norm": 1.910982370376587, + "learning_rate": 4.9777462520335386e-06, + "loss": 0.0808, + "step": 2525 + }, + { + "epoch": 0.4829827915869981, + "grad_norm": 1.3816866874694824, + "learning_rate": 4.97799784704305e-06, + "loss": 0.0891, + "step": 2526 + }, + { + "epoch": 0.4831739961759082, + "grad_norm": 1.6029402017593384, + "learning_rate": 4.97824934247013e-06, + "loss": 0.1892, + "step": 2527 + }, + { + "epoch": 0.48336520076481837, + "grad_norm": 1.5500452518463135, + "learning_rate": 4.978500738393576e-06, + "loss": 0.143, + "step": 2528 + }, + { + "epoch": 0.4835564053537285, + "grad_norm": 2.6044082641601562, + "learning_rate": 4.978752034892094e-06, + "loss": 0.132, + "step": 2529 + }, + { + "epoch": 0.4837476099426386, + "grad_norm": 1.7775001525878906, + "learning_rate": 4.979003232044297e-06, + "loss": 0.1912, + "step": 2530 + }, + { + "epoch": 0.48393881453154874, + "grad_norm": 1.5503368377685547, + "learning_rate": 4.979254329928704e-06, + "loss": 0.0685, + "step": 2531 + }, + { + "epoch": 0.4841300191204589, + "grad_norm": 1.6482033729553223, + "learning_rate": 4.979505328623739e-06, + "loss": 0.2095, + "step": 2532 + }, + { + "epoch": 0.48432122370936903, + "grad_norm": 3.1637353897094727, + "learning_rate": 4.9797562282077376e-06, + "loss": 0.4826, + "step": 2533 + }, + { + "epoch": 0.4845124282982792, + "grad_norm": 1.1990470886230469, + "learning_rate": 4.98000702875894e-06, + "loss": 0.0619, + "step": 2534 + }, + { + "epoch": 0.4847036328871893, + "grad_norm": 1.7547122240066528, + "learning_rate": 4.980257730355493e-06, + "loss": 0.1307, + "step": 2535 + }, + { + "epoch": 0.4848948374760994, + "grad_norm": 1.7956041097640991, + "learning_rate": 4.9805083330754525e-06, + "loss": 0.1256, + "step": 2536 + }, + { + "epoch": 0.48508604206500955, + "grad_norm": 1.7650172710418701, + "learning_rate": 4.980758836996782e-06, + "loss": 0.1144, + "step": 2537 + }, + { + "epoch": 0.4852772466539197, + "grad_norm": 3.838001251220703, + "learning_rate": 4.981009242197351e-06, + "loss": 0.252, + "step": 2538 + }, + { + "epoch": 0.48546845124282983, + "grad_norm": 2.8733246326446533, + "learning_rate": 4.981259548754939e-06, + "loss": 0.3944, + "step": 2539 + }, + { + "epoch": 0.48565965583174, + "grad_norm": 1.7851463556289673, + "learning_rate": 4.981509756747234e-06, + "loss": 0.1834, + "step": 2540 + }, + { + "epoch": 0.4858508604206501, + "grad_norm": 1.990270972251892, + "learning_rate": 4.981759866251829e-06, + "loss": 0.2053, + "step": 2541 + }, + { + "epoch": 0.4860420650095602, + "grad_norm": 1.489159107208252, + "learning_rate": 4.98200987734623e-06, + "loss": 0.1285, + "step": 2542 + }, + { + "epoch": 0.48623326959847035, + "grad_norm": 1.8262524604797363, + "learning_rate": 4.9822597901078465e-06, + "loss": 0.1314, + "step": 2543 + }, + { + "epoch": 0.4864244741873805, + "grad_norm": 1.9886893033981323, + "learning_rate": 4.982509604614002e-06, + "loss": 0.1196, + "step": 2544 + }, + { + "epoch": 0.48661567877629064, + "grad_norm": 1.8472392559051514, + "learning_rate": 4.982759320941924e-06, + "loss": 0.1214, + "step": 2545 + }, + { + "epoch": 0.4868068833652008, + "grad_norm": 1.5206496715545654, + "learning_rate": 4.9830089391687534e-06, + "loss": 0.1371, + "step": 2546 + }, + { + "epoch": 0.4869980879541109, + "grad_norm": 2.1726207733154297, + "learning_rate": 4.983258459371536e-06, + "loss": 0.1734, + "step": 2547 + }, + { + "epoch": 0.487189292543021, + "grad_norm": 1.3305667638778687, + "learning_rate": 4.9835078816272295e-06, + "loss": 0.0642, + "step": 2548 + }, + { + "epoch": 0.48738049713193116, + "grad_norm": 1.633368968963623, + "learning_rate": 4.983757206012702e-06, + "loss": 0.0843, + "step": 2549 + }, + { + "epoch": 0.4875717017208413, + "grad_norm": 2.334226608276367, + "learning_rate": 4.984006432604726e-06, + "loss": 0.1321, + "step": 2550 + }, + { + "epoch": 0.48776290630975144, + "grad_norm": 1.615264654159546, + "learning_rate": 4.98425556147999e-06, + "loss": 0.1884, + "step": 2551 + }, + { + "epoch": 0.4879541108986616, + "grad_norm": 2.8392958641052246, + "learning_rate": 4.984504592715089e-06, + "loss": 0.5023, + "step": 2552 + }, + { + "epoch": 0.48814531548757173, + "grad_norm": 2.3064663410186768, + "learning_rate": 4.984753526386528e-06, + "loss": 0.2044, + "step": 2553 + }, + { + "epoch": 0.4883365200764818, + "grad_norm": 2.2431957721710205, + "learning_rate": 4.985002362570723e-06, + "loss": 0.2858, + "step": 2554 + }, + { + "epoch": 0.48852772466539196, + "grad_norm": 2.3659920692443848, + "learning_rate": 4.9852511013439995e-06, + "loss": 0.1595, + "step": 2555 + }, + { + "epoch": 0.4887189292543021, + "grad_norm": 2.477073907852173, + "learning_rate": 4.985499742782594e-06, + "loss": 0.0791, + "step": 2556 + }, + { + "epoch": 0.48891013384321225, + "grad_norm": 3.7777740955352783, + "learning_rate": 4.985748286962654e-06, + "loss": 0.6394, + "step": 2557 + }, + { + "epoch": 0.4891013384321224, + "grad_norm": 2.1839075088500977, + "learning_rate": 4.9859967339602365e-06, + "loss": 0.2495, + "step": 2558 + }, + { + "epoch": 0.4892925430210325, + "grad_norm": 2.8904454708099365, + "learning_rate": 4.986245083851309e-06, + "loss": 0.419, + "step": 2559 + }, + { + "epoch": 0.4894837476099426, + "grad_norm": 1.4429762363433838, + "learning_rate": 4.986493336711752e-06, + "loss": 0.2267, + "step": 2560 + }, + { + "epoch": 0.48967495219885276, + "grad_norm": 5.334555149078369, + "learning_rate": 4.986741492617356e-06, + "loss": 0.2136, + "step": 2561 + }, + { + "epoch": 0.4898661567877629, + "grad_norm": 1.6938073635101318, + "learning_rate": 4.98698955164382e-06, + "loss": 0.0614, + "step": 2562 + }, + { + "epoch": 0.49005736137667305, + "grad_norm": 4.193806171417236, + "learning_rate": 4.9872375138667615e-06, + "loss": 0.2824, + "step": 2563 + }, + { + "epoch": 0.4902485659655832, + "grad_norm": 2.4448256492614746, + "learning_rate": 4.987485379361701e-06, + "loss": 0.362, + "step": 2564 + }, + { + "epoch": 0.4904397705544933, + "grad_norm": 2.175049066543579, + "learning_rate": 4.987733148204077e-06, + "loss": 0.159, + "step": 2565 + }, + { + "epoch": 0.4906309751434034, + "grad_norm": 1.4821659326553345, + "learning_rate": 4.987980820469236e-06, + "loss": 0.1167, + "step": 2566 + }, + { + "epoch": 0.49082217973231357, + "grad_norm": 1.6503522396087646, + "learning_rate": 4.988228396232439e-06, + "loss": 0.0725, + "step": 2567 + }, + { + "epoch": 0.4910133843212237, + "grad_norm": 3.942607879638672, + "learning_rate": 4.988475875568857e-06, + "loss": 0.136, + "step": 2568 + }, + { + "epoch": 0.49120458891013385, + "grad_norm": 1.6592808961868286, + "learning_rate": 4.988723258553574e-06, + "loss": 0.1298, + "step": 2569 + }, + { + "epoch": 0.491395793499044, + "grad_norm": 1.752287745475769, + "learning_rate": 4.988970545261588e-06, + "loss": 0.311, + "step": 2570 + }, + { + "epoch": 0.4915869980879541, + "grad_norm": 2.297788143157959, + "learning_rate": 4.989217735767806e-06, + "loss": 0.2016, + "step": 2571 + }, + { + "epoch": 0.49177820267686423, + "grad_norm": 1.5098553895950317, + "learning_rate": 4.989464830147051e-06, + "loss": 0.1153, + "step": 2572 + }, + { + "epoch": 0.49196940726577437, + "grad_norm": 1.6489616632461548, + "learning_rate": 4.989711828474057e-06, + "loss": 0.0871, + "step": 2573 + }, + { + "epoch": 0.4921606118546845, + "grad_norm": 1.4597628116607666, + "learning_rate": 4.989958730823471e-06, + "loss": 0.1124, + "step": 2574 + }, + { + "epoch": 0.49235181644359466, + "grad_norm": 1.9017386436462402, + "learning_rate": 4.990205537269853e-06, + "loss": 0.1342, + "step": 2575 + }, + { + "epoch": 0.4925430210325048, + "grad_norm": 2.946324586868286, + "learning_rate": 4.990452247887675e-06, + "loss": 0.3603, + "step": 2576 + }, + { + "epoch": 0.4927342256214149, + "grad_norm": 2.808943510055542, + "learning_rate": 4.9906988627513265e-06, + "loss": 0.3344, + "step": 2577 + }, + { + "epoch": 0.49292543021032503, + "grad_norm": 2.313256025314331, + "learning_rate": 4.990945381935106e-06, + "loss": 0.336, + "step": 2578 + }, + { + "epoch": 0.4931166347992352, + "grad_norm": 1.7406623363494873, + "learning_rate": 4.991191805513227e-06, + "loss": 0.1274, + "step": 2579 + }, + { + "epoch": 0.4933078393881453, + "grad_norm": 2.546191692352295, + "learning_rate": 4.991438133559817e-06, + "loss": 0.3057, + "step": 2580 + }, + { + "epoch": 0.49349904397705546, + "grad_norm": 3.7837564945220947, + "learning_rate": 4.991684366148917e-06, + "loss": 0.1454, + "step": 2581 + }, + { + "epoch": 0.4936902485659656, + "grad_norm": 3.7757515907287598, + "learning_rate": 4.991930503354482e-06, + "loss": 0.3576, + "step": 2582 + }, + { + "epoch": 0.4938814531548757, + "grad_norm": 2.6701161861419678, + "learning_rate": 4.9921765452503814e-06, + "loss": 0.4054, + "step": 2583 + }, + { + "epoch": 0.49407265774378584, + "grad_norm": 3.300074577331543, + "learning_rate": 4.992422491910399e-06, + "loss": 0.3313, + "step": 2584 + }, + { + "epoch": 0.494263862332696, + "grad_norm": 1.8099370002746582, + "learning_rate": 4.9926683434082315e-06, + "loss": 0.1173, + "step": 2585 + }, + { + "epoch": 0.4944550669216061, + "grad_norm": 1.8686485290527344, + "learning_rate": 4.9929140998174915e-06, + "loss": 0.1983, + "step": 2586 + }, + { + "epoch": 0.49464627151051627, + "grad_norm": 1.4938725233078003, + "learning_rate": 4.9931597612117065e-06, + "loss": 0.0842, + "step": 2587 + }, + { + "epoch": 0.4948374760994264, + "grad_norm": 1.6325137615203857, + "learning_rate": 4.993405327664316e-06, + "loss": 0.093, + "step": 2588 + }, + { + "epoch": 0.4950286806883365, + "grad_norm": 1.7486827373504639, + "learning_rate": 4.99365079924868e-06, + "loss": 0.2226, + "step": 2589 + }, + { + "epoch": 0.49521988527724664, + "grad_norm": 1.7181081771850586, + "learning_rate": 4.993896176038066e-06, + "loss": 0.1983, + "step": 2590 + }, + { + "epoch": 0.4954110898661568, + "grad_norm": 1.6922866106033325, + "learning_rate": 4.9941414581056636e-06, + "loss": 0.3837, + "step": 2591 + }, + { + "epoch": 0.4956022944550669, + "grad_norm": 1.5740710496902466, + "learning_rate": 4.994386645524574e-06, + "loss": 0.1492, + "step": 2592 + }, + { + "epoch": 0.49579349904397707, + "grad_norm": 1.319801926612854, + "learning_rate": 4.994631738367814e-06, + "loss": 0.1124, + "step": 2593 + }, + { + "epoch": 0.4959847036328872, + "grad_norm": 1.6975462436676025, + "learning_rate": 4.994876736708317e-06, + "loss": 0.1612, + "step": 2594 + }, + { + "epoch": 0.4961759082217973, + "grad_norm": 2.0203516483306885, + "learning_rate": 4.99512164061893e-06, + "loss": 0.2532, + "step": 2595 + }, + { + "epoch": 0.49636711281070744, + "grad_norm": 1.9539330005645752, + "learning_rate": 4.995366450172418e-06, + "loss": 0.2315, + "step": 2596 + }, + { + "epoch": 0.4965583173996176, + "grad_norm": 1.986435890197754, + "learning_rate": 4.995611165441463e-06, + "loss": 0.1917, + "step": 2597 + }, + { + "epoch": 0.49674952198852773, + "grad_norm": 1.494219422340393, + "learning_rate": 4.99585578649866e-06, + "loss": 0.096, + "step": 2598 + }, + { + "epoch": 0.4969407265774379, + "grad_norm": 1.1928995847702026, + "learning_rate": 4.996100313416522e-06, + "loss": 0.0925, + "step": 2599 + }, + { + "epoch": 0.497131931166348, + "grad_norm": 2.613600492477417, + "learning_rate": 4.996344746267477e-06, + "loss": 0.1606, + "step": 2600 + }, + { + "epoch": 0.4973231357552581, + "grad_norm": 1.8623335361480713, + "learning_rate": 4.99658908512387e-06, + "loss": 0.1377, + "step": 2601 + }, + { + "epoch": 0.49751434034416825, + "grad_norm": 2.6297202110290527, + "learning_rate": 4.996833330057964e-06, + "loss": 0.3241, + "step": 2602 + }, + { + "epoch": 0.4977055449330784, + "grad_norm": 1.7815700769424438, + "learning_rate": 4.997077481141936e-06, + "loss": 0.2196, + "step": 2603 + }, + { + "epoch": 0.49789674952198854, + "grad_norm": 3.413189172744751, + "learning_rate": 4.9973215384478835e-06, + "loss": 0.1373, + "step": 2604 + }, + { + "epoch": 0.4980879541108987, + "grad_norm": 4.574033737182617, + "learning_rate": 4.997565502047817e-06, + "loss": 0.2807, + "step": 2605 + }, + { + "epoch": 0.4982791586998088, + "grad_norm": 3.2377426624298096, + "learning_rate": 4.997809372013666e-06, + "loss": 0.3188, + "step": 2606 + }, + { + "epoch": 0.4984703632887189, + "grad_norm": 1.8162243366241455, + "learning_rate": 4.998053148417279e-06, + "loss": 0.2634, + "step": 2607 + }, + { + "epoch": 0.49866156787762905, + "grad_norm": 3.267460823059082, + "learning_rate": 4.998296831330417e-06, + "loss": 0.2784, + "step": 2608 + }, + { + "epoch": 0.4988527724665392, + "grad_norm": 1.5525761842727661, + "learning_rate": 4.998540420824764e-06, + "loss": 0.1681, + "step": 2609 + }, + { + "epoch": 0.49904397705544934, + "grad_norm": 2.893181324005127, + "learning_rate": 4.998783916971917e-06, + "loss": 0.3174, + "step": 2610 + }, + { + "epoch": 0.4992351816443595, + "grad_norm": 1.829892873764038, + "learning_rate": 4.999027319843394e-06, + "loss": 0.1275, + "step": 2611 + }, + { + "epoch": 0.4994263862332696, + "grad_norm": 2.012143611907959, + "learning_rate": 4.999270629510629e-06, + "loss": 0.1867, + "step": 2612 + }, + { + "epoch": 0.4996175908221797, + "grad_norm": 2.164092779159546, + "learning_rate": 4.9995138460449755e-06, + "loss": 0.2112, + "step": 2613 + }, + { + "epoch": 0.49980879541108986, + "grad_norm": 1.7680532932281494, + "learning_rate": 4.999756969517703e-06, + "loss": 0.1469, + "step": 2614 + }, + { + "epoch": 0.5, + "grad_norm": 4.590065002441406, + "learning_rate": 5e-06, + "loss": 0.2257, + "step": 2615 + }, + { + "epoch": 0.5001912045889101, + "grad_norm": 1.274849772453308, + "learning_rate": 5e-06, + "loss": 0.0882, + "step": 2616 + }, + { + "epoch": 0.5003824091778203, + "grad_norm": 2.5290040969848633, + "learning_rate": 5e-06, + "loss": 0.0899, + "step": 2617 + }, + { + "epoch": 0.5005736137667304, + "grad_norm": 2.3508877754211426, + "learning_rate": 5e-06, + "loss": 0.165, + "step": 2618 + }, + { + "epoch": 0.5007648183556406, + "grad_norm": 2.5142104625701904, + "learning_rate": 5e-06, + "loss": 0.1671, + "step": 2619 + }, + { + "epoch": 0.5009560229445507, + "grad_norm": 2.028837203979492, + "learning_rate": 5e-06, + "loss": 0.265, + "step": 2620 + }, + { + "epoch": 0.5011472275334607, + "grad_norm": 1.2833895683288574, + "learning_rate": 5e-06, + "loss": 0.1018, + "step": 2621 + }, + { + "epoch": 0.501338432122371, + "grad_norm": 1.112088918685913, + "learning_rate": 5e-06, + "loss": 0.09, + "step": 2622 + }, + { + "epoch": 0.501529636711281, + "grad_norm": 1.1832149028778076, + "learning_rate": 5e-06, + "loss": 0.096, + "step": 2623 + }, + { + "epoch": 0.5017208413001912, + "grad_norm": 1.8202910423278809, + "learning_rate": 5e-06, + "loss": 0.1305, + "step": 2624 + }, + { + "epoch": 0.5019120458891013, + "grad_norm": 2.20587420463562, + "learning_rate": 5e-06, + "loss": 0.0921, + "step": 2625 + }, + { + "epoch": 0.5021032504780115, + "grad_norm": 2.4090912342071533, + "learning_rate": 5e-06, + "loss": 0.401, + "step": 2626 + }, + { + "epoch": 0.5022944550669216, + "grad_norm": 2.0977654457092285, + "learning_rate": 5e-06, + "loss": 0.3296, + "step": 2627 + }, + { + "epoch": 0.5024856596558317, + "grad_norm": 1.8550901412963867, + "learning_rate": 5e-06, + "loss": 0.1836, + "step": 2628 + }, + { + "epoch": 0.5026768642447419, + "grad_norm": 0.8883583545684814, + "learning_rate": 5e-06, + "loss": 0.0655, + "step": 2629 + }, + { + "epoch": 0.502868068833652, + "grad_norm": 1.4066579341888428, + "learning_rate": 5e-06, + "loss": 0.0553, + "step": 2630 + }, + { + "epoch": 0.5030592734225622, + "grad_norm": 1.9968935251235962, + "learning_rate": 5e-06, + "loss": 0.0832, + "step": 2631 + }, + { + "epoch": 0.5032504780114723, + "grad_norm": 1.5393012762069702, + "learning_rate": 5e-06, + "loss": 0.1059, + "step": 2632 + }, + { + "epoch": 0.5034416826003824, + "grad_norm": 2.020643949508667, + "learning_rate": 5e-06, + "loss": 0.2575, + "step": 2633 + }, + { + "epoch": 0.5036328871892926, + "grad_norm": 1.2440476417541504, + "learning_rate": 5e-06, + "loss": 0.0954, + "step": 2634 + }, + { + "epoch": 0.5038240917782026, + "grad_norm": 1.243212103843689, + "learning_rate": 5e-06, + "loss": 0.1172, + "step": 2635 + }, + { + "epoch": 0.5040152963671128, + "grad_norm": 2.4254207611083984, + "learning_rate": 5e-06, + "loss": 0.2976, + "step": 2636 + }, + { + "epoch": 0.5042065009560229, + "grad_norm": 1.2954270839691162, + "learning_rate": 5e-06, + "loss": 0.0682, + "step": 2637 + }, + { + "epoch": 0.5043977055449331, + "grad_norm": 1.661173701286316, + "learning_rate": 5e-06, + "loss": 0.2638, + "step": 2638 + }, + { + "epoch": 0.5045889101338432, + "grad_norm": 2.2583632469177246, + "learning_rate": 5e-06, + "loss": 0.2787, + "step": 2639 + }, + { + "epoch": 0.5047801147227533, + "grad_norm": 3.421473979949951, + "learning_rate": 5e-06, + "loss": 0.4827, + "step": 2640 + }, + { + "epoch": 0.5049713193116635, + "grad_norm": 1.8076733350753784, + "learning_rate": 5e-06, + "loss": 0.154, + "step": 2641 + }, + { + "epoch": 0.5051625239005736, + "grad_norm": 2.341878652572632, + "learning_rate": 5e-06, + "loss": 0.2703, + "step": 2642 + }, + { + "epoch": 0.5053537284894838, + "grad_norm": 1.1928160190582275, + "learning_rate": 5e-06, + "loss": 0.1011, + "step": 2643 + }, + { + "epoch": 0.5055449330783939, + "grad_norm": 1.0467259883880615, + "learning_rate": 5e-06, + "loss": 0.0831, + "step": 2644 + }, + { + "epoch": 0.505736137667304, + "grad_norm": 1.8891057968139648, + "learning_rate": 5e-06, + "loss": 0.3587, + "step": 2645 + }, + { + "epoch": 0.5059273422562142, + "grad_norm": 2.7861201763153076, + "learning_rate": 5e-06, + "loss": 0.4989, + "step": 2646 + }, + { + "epoch": 0.5061185468451243, + "grad_norm": 1.5840612649917603, + "learning_rate": 5e-06, + "loss": 0.1864, + "step": 2647 + }, + { + "epoch": 0.5063097514340344, + "grad_norm": 3.3611576557159424, + "learning_rate": 5e-06, + "loss": 0.3172, + "step": 2648 + }, + { + "epoch": 0.5065009560229445, + "grad_norm": 1.371387004852295, + "learning_rate": 5e-06, + "loss": 0.0707, + "step": 2649 + }, + { + "epoch": 0.5066921606118547, + "grad_norm": 1.2657071352005005, + "learning_rate": 5e-06, + "loss": 0.0526, + "step": 2650 + }, + { + "epoch": 0.5068833652007648, + "grad_norm": 3.1202003955841064, + "learning_rate": 5e-06, + "loss": 0.3824, + "step": 2651 + }, + { + "epoch": 0.5070745697896749, + "grad_norm": 1.1883548498153687, + "learning_rate": 5e-06, + "loss": 0.1146, + "step": 2652 + }, + { + "epoch": 0.5072657743785851, + "grad_norm": 2.3430683612823486, + "learning_rate": 5e-06, + "loss": 0.3705, + "step": 2653 + }, + { + "epoch": 0.5074569789674952, + "grad_norm": 1.8830441236495972, + "learning_rate": 5e-06, + "loss": 0.0895, + "step": 2654 + }, + { + "epoch": 0.5076481835564054, + "grad_norm": 2.3886196613311768, + "learning_rate": 5e-06, + "loss": 0.257, + "step": 2655 + }, + { + "epoch": 0.5078393881453155, + "grad_norm": 1.6566410064697266, + "learning_rate": 5e-06, + "loss": 0.1023, + "step": 2656 + }, + { + "epoch": 0.5080305927342256, + "grad_norm": 1.7959635257720947, + "learning_rate": 5e-06, + "loss": 0.1833, + "step": 2657 + }, + { + "epoch": 0.5082217973231358, + "grad_norm": 1.3540290594100952, + "learning_rate": 5e-06, + "loss": 0.1589, + "step": 2658 + }, + { + "epoch": 0.5084130019120459, + "grad_norm": 1.84811532497406, + "learning_rate": 5e-06, + "loss": 0.1614, + "step": 2659 + }, + { + "epoch": 0.5086042065009561, + "grad_norm": 1.4192479848861694, + "learning_rate": 5e-06, + "loss": 0.1005, + "step": 2660 + }, + { + "epoch": 0.5087954110898661, + "grad_norm": 2.123854160308838, + "learning_rate": 5e-06, + "loss": 0.1387, + "step": 2661 + }, + { + "epoch": 0.5089866156787763, + "grad_norm": 3.7756459712982178, + "learning_rate": 5e-06, + "loss": 0.215, + "step": 2662 + }, + { + "epoch": 0.5091778202676864, + "grad_norm": 1.6984196901321411, + "learning_rate": 5e-06, + "loss": 0.0966, + "step": 2663 + }, + { + "epoch": 0.5093690248565965, + "grad_norm": 2.247396469116211, + "learning_rate": 5e-06, + "loss": 0.1488, + "step": 2664 + }, + { + "epoch": 0.5095602294455067, + "grad_norm": 2.6015279293060303, + "learning_rate": 5e-06, + "loss": 0.3206, + "step": 2665 + }, + { + "epoch": 0.5097514340344168, + "grad_norm": 1.4520198106765747, + "learning_rate": 5e-06, + "loss": 0.1184, + "step": 2666 + }, + { + "epoch": 0.509942638623327, + "grad_norm": 2.357475757598877, + "learning_rate": 5e-06, + "loss": 0.1052, + "step": 2667 + }, + { + "epoch": 0.5101338432122371, + "grad_norm": 1.6518830060958862, + "learning_rate": 5e-06, + "loss": 0.0741, + "step": 2668 + }, + { + "epoch": 0.5103250478011472, + "grad_norm": 3.9007205963134766, + "learning_rate": 5e-06, + "loss": 0.3746, + "step": 2669 + }, + { + "epoch": 0.5105162523900574, + "grad_norm": 2.126702308654785, + "learning_rate": 5e-06, + "loss": 0.332, + "step": 2670 + }, + { + "epoch": 0.5107074569789675, + "grad_norm": 1.7568902969360352, + "learning_rate": 5e-06, + "loss": 0.1567, + "step": 2671 + }, + { + "epoch": 0.5108986615678777, + "grad_norm": 1.939828634262085, + "learning_rate": 5e-06, + "loss": 0.2383, + "step": 2672 + }, + { + "epoch": 0.5110898661567878, + "grad_norm": 1.8241344690322876, + "learning_rate": 5e-06, + "loss": 0.1064, + "step": 2673 + }, + { + "epoch": 0.511281070745698, + "grad_norm": 2.5553126335144043, + "learning_rate": 5e-06, + "loss": 0.2161, + "step": 2674 + }, + { + "epoch": 0.511472275334608, + "grad_norm": 1.7424609661102295, + "learning_rate": 5e-06, + "loss": 0.091, + "step": 2675 + }, + { + "epoch": 0.5116634799235181, + "grad_norm": 1.8869847059249878, + "learning_rate": 5e-06, + "loss": 0.2992, + "step": 2676 + }, + { + "epoch": 0.5118546845124283, + "grad_norm": 1.1718688011169434, + "learning_rate": 5e-06, + "loss": 0.1112, + "step": 2677 + }, + { + "epoch": 0.5120458891013384, + "grad_norm": 1.0170719623565674, + "learning_rate": 5e-06, + "loss": 0.1267, + "step": 2678 + }, + { + "epoch": 0.5122370936902486, + "grad_norm": 1.5140657424926758, + "learning_rate": 5e-06, + "loss": 0.0969, + "step": 2679 + }, + { + "epoch": 0.5124282982791587, + "grad_norm": 1.8344894647598267, + "learning_rate": 5e-06, + "loss": 0.14, + "step": 2680 + }, + { + "epoch": 0.5126195028680688, + "grad_norm": 2.2053725719451904, + "learning_rate": 5e-06, + "loss": 0.2002, + "step": 2681 + }, + { + "epoch": 0.512810707456979, + "grad_norm": 1.968572974205017, + "learning_rate": 5e-06, + "loss": 0.1644, + "step": 2682 + }, + { + "epoch": 0.5130019120458891, + "grad_norm": 2.810896158218384, + "learning_rate": 5e-06, + "loss": 0.3554, + "step": 2683 + }, + { + "epoch": 0.5131931166347993, + "grad_norm": 1.951485276222229, + "learning_rate": 5e-06, + "loss": 0.2156, + "step": 2684 + }, + { + "epoch": 0.5133843212237094, + "grad_norm": 1.31484055519104, + "learning_rate": 5e-06, + "loss": 0.1424, + "step": 2685 + }, + { + "epoch": 0.5135755258126194, + "grad_norm": 2.028074026107788, + "learning_rate": 5e-06, + "loss": 0.165, + "step": 2686 + }, + { + "epoch": 0.5137667304015296, + "grad_norm": 1.3632593154907227, + "learning_rate": 5e-06, + "loss": 0.067, + "step": 2687 + }, + { + "epoch": 0.5139579349904397, + "grad_norm": 1.8360464572906494, + "learning_rate": 5e-06, + "loss": 0.1073, + "step": 2688 + }, + { + "epoch": 0.5141491395793499, + "grad_norm": 2.7221839427948, + "learning_rate": 5e-06, + "loss": 0.536, + "step": 2689 + }, + { + "epoch": 0.51434034416826, + "grad_norm": 1.6568959951400757, + "learning_rate": 5e-06, + "loss": 0.133, + "step": 2690 + }, + { + "epoch": 0.5145315487571702, + "grad_norm": 2.4238178730010986, + "learning_rate": 5e-06, + "loss": 0.2072, + "step": 2691 + }, + { + "epoch": 0.5147227533460803, + "grad_norm": 1.4915567636489868, + "learning_rate": 5e-06, + "loss": 0.1145, + "step": 2692 + }, + { + "epoch": 0.5149139579349904, + "grad_norm": 1.5793148279190063, + "learning_rate": 5e-06, + "loss": 0.1, + "step": 2693 + }, + { + "epoch": 0.5151051625239006, + "grad_norm": 1.486043930053711, + "learning_rate": 5e-06, + "loss": 0.1559, + "step": 2694 + }, + { + "epoch": 0.5152963671128107, + "grad_norm": 2.671621561050415, + "learning_rate": 5e-06, + "loss": 0.6114, + "step": 2695 + }, + { + "epoch": 0.5154875717017209, + "grad_norm": 1.9827364683151245, + "learning_rate": 5e-06, + "loss": 0.148, + "step": 2696 + }, + { + "epoch": 0.515678776290631, + "grad_norm": 1.608793020248413, + "learning_rate": 5e-06, + "loss": 0.1065, + "step": 2697 + }, + { + "epoch": 0.5158699808795411, + "grad_norm": 1.5601345300674438, + "learning_rate": 5e-06, + "loss": 0.1278, + "step": 2698 + }, + { + "epoch": 0.5160611854684513, + "grad_norm": 2.0068085193634033, + "learning_rate": 5e-06, + "loss": 0.1, + "step": 2699 + }, + { + "epoch": 0.5162523900573613, + "grad_norm": 1.006656289100647, + "learning_rate": 5e-06, + "loss": 0.0425, + "step": 2700 + }, + { + "epoch": 0.5164435946462715, + "grad_norm": 4.04417085647583, + "learning_rate": 5e-06, + "loss": 0.7525, + "step": 2701 + }, + { + "epoch": 0.5166347992351816, + "grad_norm": 2.4919166564941406, + "learning_rate": 5e-06, + "loss": 0.3474, + "step": 2702 + }, + { + "epoch": 0.5168260038240918, + "grad_norm": 1.7118905782699585, + "learning_rate": 5e-06, + "loss": 0.1593, + "step": 2703 + }, + { + "epoch": 0.5170172084130019, + "grad_norm": 2.105720043182373, + "learning_rate": 5e-06, + "loss": 0.2862, + "step": 2704 + }, + { + "epoch": 0.517208413001912, + "grad_norm": 2.6111350059509277, + "learning_rate": 5e-06, + "loss": 0.1986, + "step": 2705 + }, + { + "epoch": 0.5173996175908222, + "grad_norm": 1.308851718902588, + "learning_rate": 5e-06, + "loss": 0.0404, + "step": 2706 + }, + { + "epoch": 0.5175908221797323, + "grad_norm": 2.451007127761841, + "learning_rate": 5e-06, + "loss": 0.241, + "step": 2707 + }, + { + "epoch": 0.5177820267686425, + "grad_norm": 1.6604572534561157, + "learning_rate": 5e-06, + "loss": 0.1542, + "step": 2708 + }, + { + "epoch": 0.5179732313575526, + "grad_norm": 1.995052456855774, + "learning_rate": 5e-06, + "loss": 0.2343, + "step": 2709 + }, + { + "epoch": 0.5181644359464627, + "grad_norm": 1.6679072380065918, + "learning_rate": 5e-06, + "loss": 0.1014, + "step": 2710 + }, + { + "epoch": 0.5183556405353729, + "grad_norm": 1.284263253211975, + "learning_rate": 5e-06, + "loss": 0.0664, + "step": 2711 + }, + { + "epoch": 0.518546845124283, + "grad_norm": 1.0988402366638184, + "learning_rate": 5e-06, + "loss": 0.0266, + "step": 2712 + }, + { + "epoch": 0.5187380497131932, + "grad_norm": 1.9333552122116089, + "learning_rate": 5e-06, + "loss": 0.1244, + "step": 2713 + }, + { + "epoch": 0.5189292543021032, + "grad_norm": 2.2870166301727295, + "learning_rate": 5e-06, + "loss": 0.4859, + "step": 2714 + }, + { + "epoch": 0.5191204588910134, + "grad_norm": 2.012103319168091, + "learning_rate": 5e-06, + "loss": 0.1198, + "step": 2715 + }, + { + "epoch": 0.5193116634799235, + "grad_norm": 1.6784409284591675, + "learning_rate": 5e-06, + "loss": 0.1635, + "step": 2716 + }, + { + "epoch": 0.5195028680688336, + "grad_norm": 2.0358691215515137, + "learning_rate": 5e-06, + "loss": 0.15, + "step": 2717 + }, + { + "epoch": 0.5196940726577438, + "grad_norm": 1.3582711219787598, + "learning_rate": 5e-06, + "loss": 0.0842, + "step": 2718 + }, + { + "epoch": 0.5198852772466539, + "grad_norm": 2.2164931297302246, + "learning_rate": 5e-06, + "loss": 0.1049, + "step": 2719 + }, + { + "epoch": 0.5200764818355641, + "grad_norm": 3.631624698638916, + "learning_rate": 5e-06, + "loss": 0.2526, + "step": 2720 + }, + { + "epoch": 0.5202676864244742, + "grad_norm": 2.41479229927063, + "learning_rate": 5e-06, + "loss": 0.3641, + "step": 2721 + }, + { + "epoch": 0.5204588910133843, + "grad_norm": 1.9997187852859497, + "learning_rate": 5e-06, + "loss": 0.1708, + "step": 2722 + }, + { + "epoch": 0.5206500956022945, + "grad_norm": 3.5295379161834717, + "learning_rate": 5e-06, + "loss": 0.2561, + "step": 2723 + }, + { + "epoch": 0.5208413001912046, + "grad_norm": 4.27695369720459, + "learning_rate": 5e-06, + "loss": 0.2081, + "step": 2724 + }, + { + "epoch": 0.5210325047801148, + "grad_norm": 1.7342065572738647, + "learning_rate": 5e-06, + "loss": 0.0791, + "step": 2725 + }, + { + "epoch": 0.5212237093690248, + "grad_norm": 1.875627040863037, + "learning_rate": 5e-06, + "loss": 0.1565, + "step": 2726 + }, + { + "epoch": 0.521414913957935, + "grad_norm": 1.8105803728103638, + "learning_rate": 5e-06, + "loss": 0.2063, + "step": 2727 + }, + { + "epoch": 0.5216061185468451, + "grad_norm": 1.5556526184082031, + "learning_rate": 5e-06, + "loss": 0.205, + "step": 2728 + }, + { + "epoch": 0.5217973231357552, + "grad_norm": 1.5404901504516602, + "learning_rate": 5e-06, + "loss": 0.1059, + "step": 2729 + }, + { + "epoch": 0.5219885277246654, + "grad_norm": 1.3888638019561768, + "learning_rate": 5e-06, + "loss": 0.1064, + "step": 2730 + }, + { + "epoch": 0.5221797323135755, + "grad_norm": 1.3965755701065063, + "learning_rate": 5e-06, + "loss": 0.0549, + "step": 2731 + }, + { + "epoch": 0.5223709369024857, + "grad_norm": 2.155271530151367, + "learning_rate": 5e-06, + "loss": 0.2185, + "step": 2732 + }, + { + "epoch": 0.5225621414913958, + "grad_norm": 3.022123336791992, + "learning_rate": 5e-06, + "loss": 0.3292, + "step": 2733 + }, + { + "epoch": 0.5227533460803059, + "grad_norm": 1.5752053260803223, + "learning_rate": 5e-06, + "loss": 0.1342, + "step": 2734 + }, + { + "epoch": 0.5229445506692161, + "grad_norm": 1.3086416721343994, + "learning_rate": 5e-06, + "loss": 0.0672, + "step": 2735 + }, + { + "epoch": 0.5231357552581262, + "grad_norm": 1.8340917825698853, + "learning_rate": 5e-06, + "loss": 0.0859, + "step": 2736 + }, + { + "epoch": 0.5233269598470364, + "grad_norm": 4.185413360595703, + "learning_rate": 5e-06, + "loss": 0.1021, + "step": 2737 + }, + { + "epoch": 0.5235181644359465, + "grad_norm": 3.0778019428253174, + "learning_rate": 5e-06, + "loss": 0.3103, + "step": 2738 + }, + { + "epoch": 0.5237093690248565, + "grad_norm": 2.644026041030884, + "learning_rate": 5e-06, + "loss": 0.5049, + "step": 2739 + }, + { + "epoch": 0.5239005736137667, + "grad_norm": 1.7256097793579102, + "learning_rate": 5e-06, + "loss": 0.2243, + "step": 2740 + }, + { + "epoch": 0.5240917782026768, + "grad_norm": 2.3407437801361084, + "learning_rate": 5e-06, + "loss": 0.1834, + "step": 2741 + }, + { + "epoch": 0.524282982791587, + "grad_norm": 1.885779619216919, + "learning_rate": 5e-06, + "loss": 0.1551, + "step": 2742 + }, + { + "epoch": 0.5244741873804971, + "grad_norm": 2.010530710220337, + "learning_rate": 5e-06, + "loss": 0.1249, + "step": 2743 + }, + { + "epoch": 0.5246653919694073, + "grad_norm": 2.1776020526885986, + "learning_rate": 5e-06, + "loss": 0.131, + "step": 2744 + }, + { + "epoch": 0.5248565965583174, + "grad_norm": 2.62050199508667, + "learning_rate": 5e-06, + "loss": 0.3972, + "step": 2745 + }, + { + "epoch": 0.5250478011472275, + "grad_norm": 2.356659173965454, + "learning_rate": 5e-06, + "loss": 0.1496, + "step": 2746 + }, + { + "epoch": 0.5252390057361377, + "grad_norm": 1.9196159839630127, + "learning_rate": 5e-06, + "loss": 0.1202, + "step": 2747 + }, + { + "epoch": 0.5254302103250478, + "grad_norm": 2.2066104412078857, + "learning_rate": 5e-06, + "loss": 0.1625, + "step": 2748 + }, + { + "epoch": 0.525621414913958, + "grad_norm": 1.1974610090255737, + "learning_rate": 5e-06, + "loss": 0.0551, + "step": 2749 + }, + { + "epoch": 0.5258126195028681, + "grad_norm": 2.4025866985321045, + "learning_rate": 5e-06, + "loss": 0.1537, + "step": 2750 + }, + { + "epoch": 0.5260038240917781, + "grad_norm": 2.276185989379883, + "learning_rate": 5e-06, + "loss": 0.2141, + "step": 2751 + }, + { + "epoch": 0.5261950286806883, + "grad_norm": 1.9449247121810913, + "learning_rate": 5e-06, + "loss": 0.2303, + "step": 2752 + }, + { + "epoch": 0.5263862332695984, + "grad_norm": 1.2716035842895508, + "learning_rate": 5e-06, + "loss": 0.1516, + "step": 2753 + }, + { + "epoch": 0.5265774378585086, + "grad_norm": 1.2771272659301758, + "learning_rate": 5e-06, + "loss": 0.088, + "step": 2754 + }, + { + "epoch": 0.5267686424474187, + "grad_norm": 2.0387823581695557, + "learning_rate": 5e-06, + "loss": 0.1187, + "step": 2755 + }, + { + "epoch": 0.5269598470363289, + "grad_norm": 2.576063394546509, + "learning_rate": 5e-06, + "loss": 0.1408, + "step": 2756 + }, + { + "epoch": 0.527151051625239, + "grad_norm": 2.7884414196014404, + "learning_rate": 5e-06, + "loss": 0.3298, + "step": 2757 + }, + { + "epoch": 0.5273422562141491, + "grad_norm": 3.2937912940979004, + "learning_rate": 5e-06, + "loss": 0.5074, + "step": 2758 + }, + { + "epoch": 0.5275334608030593, + "grad_norm": 1.2143149375915527, + "learning_rate": 5e-06, + "loss": 0.0907, + "step": 2759 + }, + { + "epoch": 0.5277246653919694, + "grad_norm": 1.9609935283660889, + "learning_rate": 5e-06, + "loss": 0.1422, + "step": 2760 + }, + { + "epoch": 0.5279158699808796, + "grad_norm": 2.3065104484558105, + "learning_rate": 5e-06, + "loss": 0.2394, + "step": 2761 + }, + { + "epoch": 0.5281070745697897, + "grad_norm": 1.3439974784851074, + "learning_rate": 5e-06, + "loss": 0.0567, + "step": 2762 + }, + { + "epoch": 0.5282982791586998, + "grad_norm": 2.701871395111084, + "learning_rate": 5e-06, + "loss": 0.2844, + "step": 2763 + }, + { + "epoch": 0.52848948374761, + "grad_norm": 2.580606460571289, + "learning_rate": 5e-06, + "loss": 0.3827, + "step": 2764 + }, + { + "epoch": 0.52868068833652, + "grad_norm": 2.0490102767944336, + "learning_rate": 5e-06, + "loss": 0.164, + "step": 2765 + }, + { + "epoch": 0.5288718929254302, + "grad_norm": 2.0676021575927734, + "learning_rate": 5e-06, + "loss": 0.3342, + "step": 2766 + }, + { + "epoch": 0.5290630975143403, + "grad_norm": 1.146315097808838, + "learning_rate": 5e-06, + "loss": 0.0772, + "step": 2767 + }, + { + "epoch": 0.5292543021032505, + "grad_norm": 1.5692076683044434, + "learning_rate": 5e-06, + "loss": 0.1324, + "step": 2768 + }, + { + "epoch": 0.5294455066921606, + "grad_norm": 2.6447064876556396, + "learning_rate": 5e-06, + "loss": 0.2929, + "step": 2769 + }, + { + "epoch": 0.5296367112810707, + "grad_norm": 3.0837621688842773, + "learning_rate": 5e-06, + "loss": 0.4175, + "step": 2770 + }, + { + "epoch": 0.5298279158699809, + "grad_norm": 2.180725574493408, + "learning_rate": 5e-06, + "loss": 0.2276, + "step": 2771 + }, + { + "epoch": 0.530019120458891, + "grad_norm": 1.3501646518707275, + "learning_rate": 5e-06, + "loss": 0.1305, + "step": 2772 + }, + { + "epoch": 0.5302103250478012, + "grad_norm": 2.3673691749572754, + "learning_rate": 5e-06, + "loss": 0.2671, + "step": 2773 + }, + { + "epoch": 0.5304015296367113, + "grad_norm": 2.891281843185425, + "learning_rate": 5e-06, + "loss": 0.2122, + "step": 2774 + }, + { + "epoch": 0.5305927342256214, + "grad_norm": 2.67206072807312, + "learning_rate": 5e-06, + "loss": 0.112, + "step": 2775 + }, + { + "epoch": 0.5307839388145316, + "grad_norm": 2.2440712451934814, + "learning_rate": 5e-06, + "loss": 0.3392, + "step": 2776 + }, + { + "epoch": 0.5309751434034417, + "grad_norm": 1.790481686592102, + "learning_rate": 5e-06, + "loss": 0.2133, + "step": 2777 + }, + { + "epoch": 0.5311663479923519, + "grad_norm": 1.2576000690460205, + "learning_rate": 5e-06, + "loss": 0.0937, + "step": 2778 + }, + { + "epoch": 0.5313575525812619, + "grad_norm": 1.711289405822754, + "learning_rate": 5e-06, + "loss": 0.1502, + "step": 2779 + }, + { + "epoch": 0.5315487571701721, + "grad_norm": 1.4342141151428223, + "learning_rate": 5e-06, + "loss": 0.1056, + "step": 2780 + }, + { + "epoch": 0.5317399617590822, + "grad_norm": 2.1560006141662598, + "learning_rate": 5e-06, + "loss": 0.1531, + "step": 2781 + }, + { + "epoch": 0.5319311663479923, + "grad_norm": 2.112128734588623, + "learning_rate": 5e-06, + "loss": 0.1361, + "step": 2782 + }, + { + "epoch": 0.5321223709369025, + "grad_norm": 2.3116440773010254, + "learning_rate": 5e-06, + "loss": 0.2867, + "step": 2783 + }, + { + "epoch": 0.5323135755258126, + "grad_norm": 2.4027740955352783, + "learning_rate": 5e-06, + "loss": 0.2486, + "step": 2784 + }, + { + "epoch": 0.5325047801147228, + "grad_norm": 1.1170294284820557, + "learning_rate": 5e-06, + "loss": 0.0558, + "step": 2785 + }, + { + "epoch": 0.5326959847036329, + "grad_norm": 2.2636983394622803, + "learning_rate": 5e-06, + "loss": 0.0858, + "step": 2786 + }, + { + "epoch": 0.532887189292543, + "grad_norm": 1.4869250059127808, + "learning_rate": 5e-06, + "loss": 0.0778, + "step": 2787 + }, + { + "epoch": 0.5330783938814532, + "grad_norm": 2.6340017318725586, + "learning_rate": 5e-06, + "loss": 0.1319, + "step": 2788 + }, + { + "epoch": 0.5332695984703633, + "grad_norm": 2.0733349323272705, + "learning_rate": 5e-06, + "loss": 0.2641, + "step": 2789 + }, + { + "epoch": 0.5334608030592735, + "grad_norm": 1.8458443880081177, + "learning_rate": 5e-06, + "loss": 0.1147, + "step": 2790 + }, + { + "epoch": 0.5336520076481835, + "grad_norm": 0.9659456610679626, + "learning_rate": 5e-06, + "loss": 0.0781, + "step": 2791 + }, + { + "epoch": 0.5338432122370937, + "grad_norm": 2.7698538303375244, + "learning_rate": 5e-06, + "loss": 0.2659, + "step": 2792 + }, + { + "epoch": 0.5340344168260038, + "grad_norm": 0.9814829230308533, + "learning_rate": 5e-06, + "loss": 0.0433, + "step": 2793 + }, + { + "epoch": 0.5342256214149139, + "grad_norm": 2.5287699699401855, + "learning_rate": 5e-06, + "loss": 0.1128, + "step": 2794 + }, + { + "epoch": 0.5344168260038241, + "grad_norm": 2.2758593559265137, + "learning_rate": 5e-06, + "loss": 0.2237, + "step": 2795 + }, + { + "epoch": 0.5346080305927342, + "grad_norm": 3.0711472034454346, + "learning_rate": 5e-06, + "loss": 0.1646, + "step": 2796 + }, + { + "epoch": 0.5347992351816444, + "grad_norm": 2.538210153579712, + "learning_rate": 5e-06, + "loss": 0.3324, + "step": 2797 + }, + { + "epoch": 0.5349904397705545, + "grad_norm": 1.7886542081832886, + "learning_rate": 5e-06, + "loss": 0.1307, + "step": 2798 + }, + { + "epoch": 0.5351816443594646, + "grad_norm": 1.5378341674804688, + "learning_rate": 5e-06, + "loss": 0.1657, + "step": 2799 + }, + { + "epoch": 0.5353728489483748, + "grad_norm": 1.748417615890503, + "learning_rate": 5e-06, + "loss": 0.0942, + "step": 2800 + }, + { + "epoch": 0.5355640535372849, + "grad_norm": 1.9090436697006226, + "learning_rate": 5e-06, + "loss": 0.3028, + "step": 2801 + }, + { + "epoch": 0.5357552581261951, + "grad_norm": 2.4026999473571777, + "learning_rate": 5e-06, + "loss": 0.2898, + "step": 2802 + }, + { + "epoch": 0.5359464627151052, + "grad_norm": 1.4179044961929321, + "learning_rate": 5e-06, + "loss": 0.1178, + "step": 2803 + }, + { + "epoch": 0.5361376673040152, + "grad_norm": 1.6482465267181396, + "learning_rate": 5e-06, + "loss": 0.0878, + "step": 2804 + }, + { + "epoch": 0.5363288718929254, + "grad_norm": 2.1801981925964355, + "learning_rate": 5e-06, + "loss": 0.1139, + "step": 2805 + }, + { + "epoch": 0.5365200764818355, + "grad_norm": 1.33127760887146, + "learning_rate": 5e-06, + "loss": 0.0881, + "step": 2806 + }, + { + "epoch": 0.5367112810707457, + "grad_norm": 3.296818971633911, + "learning_rate": 5e-06, + "loss": 0.3499, + "step": 2807 + }, + { + "epoch": 0.5369024856596558, + "grad_norm": 2.183624267578125, + "learning_rate": 5e-06, + "loss": 0.303, + "step": 2808 + }, + { + "epoch": 0.537093690248566, + "grad_norm": 2.4392447471618652, + "learning_rate": 5e-06, + "loss": 0.3532, + "step": 2809 + }, + { + "epoch": 0.5372848948374761, + "grad_norm": 1.9336990118026733, + "learning_rate": 5e-06, + "loss": 0.1537, + "step": 2810 + }, + { + "epoch": 0.5374760994263862, + "grad_norm": 1.5660754442214966, + "learning_rate": 5e-06, + "loss": 0.1199, + "step": 2811 + }, + { + "epoch": 0.5376673040152964, + "grad_norm": 2.9620461463928223, + "learning_rate": 5e-06, + "loss": 0.2485, + "step": 2812 + }, + { + "epoch": 0.5378585086042065, + "grad_norm": 1.6001290082931519, + "learning_rate": 5e-06, + "loss": 0.1115, + "step": 2813 + }, + { + "epoch": 0.5380497131931167, + "grad_norm": 1.5141234397888184, + "learning_rate": 5e-06, + "loss": 0.1407, + "step": 2814 + }, + { + "epoch": 0.5382409177820268, + "grad_norm": 1.9983782768249512, + "learning_rate": 5e-06, + "loss": 0.24, + "step": 2815 + }, + { + "epoch": 0.5384321223709368, + "grad_norm": 1.7045232057571411, + "learning_rate": 5e-06, + "loss": 0.1267, + "step": 2816 + }, + { + "epoch": 0.538623326959847, + "grad_norm": 1.1079213619232178, + "learning_rate": 5e-06, + "loss": 0.1043, + "step": 2817 + }, + { + "epoch": 0.5388145315487571, + "grad_norm": 2.0188820362091064, + "learning_rate": 5e-06, + "loss": 0.314, + "step": 2818 + }, + { + "epoch": 0.5390057361376673, + "grad_norm": 1.7051680088043213, + "learning_rate": 5e-06, + "loss": 0.1723, + "step": 2819 + }, + { + "epoch": 0.5391969407265774, + "grad_norm": 2.6241707801818848, + "learning_rate": 5e-06, + "loss": 0.3201, + "step": 2820 + }, + { + "epoch": 0.5393881453154876, + "grad_norm": 1.4381252527236938, + "learning_rate": 5e-06, + "loss": 0.1995, + "step": 2821 + }, + { + "epoch": 0.5395793499043977, + "grad_norm": 2.13104248046875, + "learning_rate": 5e-06, + "loss": 0.3106, + "step": 2822 + }, + { + "epoch": 0.5397705544933078, + "grad_norm": 2.429593086242676, + "learning_rate": 5e-06, + "loss": 0.1502, + "step": 2823 + }, + { + "epoch": 0.539961759082218, + "grad_norm": 1.9471462965011597, + "learning_rate": 5e-06, + "loss": 0.1125, + "step": 2824 + }, + { + "epoch": 0.5401529636711281, + "grad_norm": 2.1036086082458496, + "learning_rate": 5e-06, + "loss": 0.1223, + "step": 2825 + }, + { + "epoch": 0.5403441682600383, + "grad_norm": 3.6985058784484863, + "learning_rate": 5e-06, + "loss": 0.6219, + "step": 2826 + }, + { + "epoch": 0.5405353728489484, + "grad_norm": 2.2251152992248535, + "learning_rate": 5e-06, + "loss": 0.2841, + "step": 2827 + }, + { + "epoch": 0.5407265774378585, + "grad_norm": 1.7809268236160278, + "learning_rate": 5e-06, + "loss": 0.3284, + "step": 2828 + }, + { + "epoch": 0.5409177820267687, + "grad_norm": 2.0658373832702637, + "learning_rate": 5e-06, + "loss": 0.1905, + "step": 2829 + }, + { + "epoch": 0.5411089866156787, + "grad_norm": 2.7861831188201904, + "learning_rate": 5e-06, + "loss": 0.1682, + "step": 2830 + }, + { + "epoch": 0.5413001912045889, + "grad_norm": 2.1867973804473877, + "learning_rate": 5e-06, + "loss": 0.1547, + "step": 2831 + }, + { + "epoch": 0.541491395793499, + "grad_norm": 2.6366775035858154, + "learning_rate": 5e-06, + "loss": 0.4594, + "step": 2832 + }, + { + "epoch": 0.5416826003824092, + "grad_norm": 2.0173962116241455, + "learning_rate": 5e-06, + "loss": 0.1957, + "step": 2833 + }, + { + "epoch": 0.5418738049713193, + "grad_norm": 2.2647476196289062, + "learning_rate": 5e-06, + "loss": 0.1583, + "step": 2834 + }, + { + "epoch": 0.5420650095602294, + "grad_norm": 1.8371925354003906, + "learning_rate": 5e-06, + "loss": 0.1247, + "step": 2835 + }, + { + "epoch": 0.5422562141491396, + "grad_norm": 2.1972312927246094, + "learning_rate": 5e-06, + "loss": 0.22, + "step": 2836 + }, + { + "epoch": 0.5424474187380497, + "grad_norm": 1.5029915571212769, + "learning_rate": 5e-06, + "loss": 0.0747, + "step": 2837 + }, + { + "epoch": 0.5426386233269599, + "grad_norm": 1.6815667152404785, + "learning_rate": 5e-06, + "loss": 0.1455, + "step": 2838 + }, + { + "epoch": 0.54282982791587, + "grad_norm": 2.7903823852539062, + "learning_rate": 5e-06, + "loss": 0.5205, + "step": 2839 + }, + { + "epoch": 0.5430210325047801, + "grad_norm": 2.121938943862915, + "learning_rate": 5e-06, + "loss": 0.374, + "step": 2840 + }, + { + "epoch": 0.5432122370936903, + "grad_norm": 2.8478827476501465, + "learning_rate": 5e-06, + "loss": 0.3739, + "step": 2841 + }, + { + "epoch": 0.5434034416826004, + "grad_norm": 2.33632755279541, + "learning_rate": 5e-06, + "loss": 0.1146, + "step": 2842 + }, + { + "epoch": 0.5435946462715106, + "grad_norm": 1.6369229555130005, + "learning_rate": 5e-06, + "loss": 0.0817, + "step": 2843 + }, + { + "epoch": 0.5437858508604206, + "grad_norm": 2.857534885406494, + "learning_rate": 5e-06, + "loss": 0.4311, + "step": 2844 + }, + { + "epoch": 0.5439770554493308, + "grad_norm": 2.2772583961486816, + "learning_rate": 5e-06, + "loss": 0.3075, + "step": 2845 + }, + { + "epoch": 0.5441682600382409, + "grad_norm": 1.84805166721344, + "learning_rate": 5e-06, + "loss": 0.16, + "step": 2846 + }, + { + "epoch": 0.544359464627151, + "grad_norm": 3.0901503562927246, + "learning_rate": 5e-06, + "loss": 0.2931, + "step": 2847 + }, + { + "epoch": 0.5445506692160612, + "grad_norm": 1.3856008052825928, + "learning_rate": 5e-06, + "loss": 0.0772, + "step": 2848 + }, + { + "epoch": 0.5447418738049713, + "grad_norm": 1.3322361707687378, + "learning_rate": 5e-06, + "loss": 0.0977, + "step": 2849 + }, + { + "epoch": 0.5449330783938815, + "grad_norm": 2.8700478076934814, + "learning_rate": 5e-06, + "loss": 0.1916, + "step": 2850 + }, + { + "epoch": 0.5451242829827916, + "grad_norm": 2.482931137084961, + "learning_rate": 5e-06, + "loss": 0.3922, + "step": 2851 + }, + { + "epoch": 0.5453154875717017, + "grad_norm": 2.8786845207214355, + "learning_rate": 5e-06, + "loss": 0.3643, + "step": 2852 + }, + { + "epoch": 0.5455066921606119, + "grad_norm": 1.5687427520751953, + "learning_rate": 5e-06, + "loss": 0.1403, + "step": 2853 + }, + { + "epoch": 0.545697896749522, + "grad_norm": 1.8267560005187988, + "learning_rate": 5e-06, + "loss": 0.1301, + "step": 2854 + }, + { + "epoch": 0.5458891013384322, + "grad_norm": 2.0420382022857666, + "learning_rate": 5e-06, + "loss": 0.1072, + "step": 2855 + }, + { + "epoch": 0.5460803059273422, + "grad_norm": 2.005113124847412, + "learning_rate": 5e-06, + "loss": 0.106, + "step": 2856 + }, + { + "epoch": 0.5462715105162524, + "grad_norm": 2.4532604217529297, + "learning_rate": 5e-06, + "loss": 0.4297, + "step": 2857 + }, + { + "epoch": 0.5464627151051625, + "grad_norm": 1.8568321466445923, + "learning_rate": 5e-06, + "loss": 0.1724, + "step": 2858 + }, + { + "epoch": 0.5466539196940726, + "grad_norm": 2.2571911811828613, + "learning_rate": 5e-06, + "loss": 0.2347, + "step": 2859 + }, + { + "epoch": 0.5468451242829828, + "grad_norm": 1.288896918296814, + "learning_rate": 5e-06, + "loss": 0.0996, + "step": 2860 + }, + { + "epoch": 0.5470363288718929, + "grad_norm": 1.4428579807281494, + "learning_rate": 5e-06, + "loss": 0.1142, + "step": 2861 + }, + { + "epoch": 0.5472275334608031, + "grad_norm": 1.6780493259429932, + "learning_rate": 5e-06, + "loss": 0.1161, + "step": 2862 + }, + { + "epoch": 0.5474187380497132, + "grad_norm": 2.8386378288269043, + "learning_rate": 5e-06, + "loss": 0.3605, + "step": 2863 + }, + { + "epoch": 0.5476099426386233, + "grad_norm": 2.42722487449646, + "learning_rate": 5e-06, + "loss": 0.2646, + "step": 2864 + }, + { + "epoch": 0.5478011472275335, + "grad_norm": 1.9471642971038818, + "learning_rate": 5e-06, + "loss": 0.4178, + "step": 2865 + }, + { + "epoch": 0.5479923518164436, + "grad_norm": 1.1738109588623047, + "learning_rate": 5e-06, + "loss": 0.101, + "step": 2866 + }, + { + "epoch": 0.5481835564053538, + "grad_norm": 1.1273823976516724, + "learning_rate": 5e-06, + "loss": 0.052, + "step": 2867 + }, + { + "epoch": 0.5483747609942639, + "grad_norm": 0.9076278805732727, + "learning_rate": 5e-06, + "loss": 0.0353, + "step": 2868 + }, + { + "epoch": 0.5485659655831739, + "grad_norm": 1.6331543922424316, + "learning_rate": 5e-06, + "loss": 0.1101, + "step": 2869 + }, + { + "epoch": 0.5487571701720841, + "grad_norm": 4.053876876831055, + "learning_rate": 5e-06, + "loss": 0.6708, + "step": 2870 + }, + { + "epoch": 0.5489483747609942, + "grad_norm": 2.119873523712158, + "learning_rate": 5e-06, + "loss": 0.1168, + "step": 2871 + }, + { + "epoch": 0.5491395793499044, + "grad_norm": 1.7409799098968506, + "learning_rate": 5e-06, + "loss": 0.1408, + "step": 2872 + }, + { + "epoch": 0.5493307839388145, + "grad_norm": 0.972379207611084, + "learning_rate": 5e-06, + "loss": 0.0649, + "step": 2873 + }, + { + "epoch": 0.5495219885277247, + "grad_norm": 3.567614793777466, + "learning_rate": 5e-06, + "loss": 0.089, + "step": 2874 + }, + { + "epoch": 0.5497131931166348, + "grad_norm": 1.5383042097091675, + "learning_rate": 5e-06, + "loss": 0.0641, + "step": 2875 + }, + { + "epoch": 0.5499043977055449, + "grad_norm": 2.714123010635376, + "learning_rate": 5e-06, + "loss": 0.3569, + "step": 2876 + }, + { + "epoch": 0.5500956022944551, + "grad_norm": 2.086832046508789, + "learning_rate": 5e-06, + "loss": 0.296, + "step": 2877 + }, + { + "epoch": 0.5502868068833652, + "grad_norm": 2.510014295578003, + "learning_rate": 5e-06, + "loss": 0.2874, + "step": 2878 + }, + { + "epoch": 0.5504780114722754, + "grad_norm": 1.494666576385498, + "learning_rate": 5e-06, + "loss": 0.1435, + "step": 2879 + }, + { + "epoch": 0.5506692160611855, + "grad_norm": 2.3554635047912598, + "learning_rate": 5e-06, + "loss": 0.101, + "step": 2880 + }, + { + "epoch": 0.5508604206500956, + "grad_norm": 1.758353352546692, + "learning_rate": 5e-06, + "loss": 0.1163, + "step": 2881 + }, + { + "epoch": 0.5510516252390057, + "grad_norm": 1.8508045673370361, + "learning_rate": 5e-06, + "loss": 0.2355, + "step": 2882 + }, + { + "epoch": 0.5512428298279158, + "grad_norm": 1.8111543655395508, + "learning_rate": 5e-06, + "loss": 0.2789, + "step": 2883 + }, + { + "epoch": 0.551434034416826, + "grad_norm": 2.13771915435791, + "learning_rate": 5e-06, + "loss": 0.1331, + "step": 2884 + }, + { + "epoch": 0.5516252390057361, + "grad_norm": 1.0804390907287598, + "learning_rate": 5e-06, + "loss": 0.1287, + "step": 2885 + }, + { + "epoch": 0.5518164435946463, + "grad_norm": 1.9480468034744263, + "learning_rate": 5e-06, + "loss": 0.1915, + "step": 2886 + }, + { + "epoch": 0.5520076481835564, + "grad_norm": 1.979048252105713, + "learning_rate": 5e-06, + "loss": 0.1247, + "step": 2887 + }, + { + "epoch": 0.5521988527724665, + "grad_norm": 2.188206434249878, + "learning_rate": 5e-06, + "loss": 0.2204, + "step": 2888 + }, + { + "epoch": 0.5523900573613767, + "grad_norm": 4.077398777008057, + "learning_rate": 5e-06, + "loss": 0.4834, + "step": 2889 + }, + { + "epoch": 0.5525812619502868, + "grad_norm": 1.1730834245681763, + "learning_rate": 5e-06, + "loss": 0.0996, + "step": 2890 + }, + { + "epoch": 0.552772466539197, + "grad_norm": 2.131211280822754, + "learning_rate": 5e-06, + "loss": 0.2714, + "step": 2891 + }, + { + "epoch": 0.5529636711281071, + "grad_norm": 0.8780372738838196, + "learning_rate": 5e-06, + "loss": 0.0451, + "step": 2892 + }, + { + "epoch": 0.5531548757170172, + "grad_norm": 1.3180248737335205, + "learning_rate": 5e-06, + "loss": 0.0978, + "step": 2893 + }, + { + "epoch": 0.5533460803059274, + "grad_norm": 2.865992784500122, + "learning_rate": 5e-06, + "loss": 0.0841, + "step": 2894 + }, + { + "epoch": 0.5535372848948374, + "grad_norm": 2.895068407058716, + "learning_rate": 5e-06, + "loss": 0.6579, + "step": 2895 + }, + { + "epoch": 0.5537284894837476, + "grad_norm": 1.6795144081115723, + "learning_rate": 5e-06, + "loss": 0.3405, + "step": 2896 + }, + { + "epoch": 0.5539196940726577, + "grad_norm": 2.2790634632110596, + "learning_rate": 5e-06, + "loss": 0.189, + "step": 2897 + }, + { + "epoch": 0.5541108986615679, + "grad_norm": 2.1477177143096924, + "learning_rate": 5e-06, + "loss": 0.3104, + "step": 2898 + }, + { + "epoch": 0.554302103250478, + "grad_norm": 1.7578496932983398, + "learning_rate": 5e-06, + "loss": 0.1977, + "step": 2899 + }, + { + "epoch": 0.5544933078393881, + "grad_norm": 1.507967233657837, + "learning_rate": 5e-06, + "loss": 0.0621, + "step": 2900 + }, + { + "epoch": 0.5546845124282983, + "grad_norm": 2.4397711753845215, + "learning_rate": 5e-06, + "loss": 0.3803, + "step": 2901 + }, + { + "epoch": 0.5548757170172084, + "grad_norm": 1.9248405694961548, + "learning_rate": 5e-06, + "loss": 0.1198, + "step": 2902 + }, + { + "epoch": 0.5550669216061186, + "grad_norm": 2.194777011871338, + "learning_rate": 5e-06, + "loss": 0.2595, + "step": 2903 + }, + { + "epoch": 0.5552581261950287, + "grad_norm": 1.9272375106811523, + "learning_rate": 5e-06, + "loss": 0.1686, + "step": 2904 + }, + { + "epoch": 0.5554493307839388, + "grad_norm": 1.9887967109680176, + "learning_rate": 5e-06, + "loss": 0.0554, + "step": 2905 + }, + { + "epoch": 0.555640535372849, + "grad_norm": 2.538315773010254, + "learning_rate": 5e-06, + "loss": 0.1184, + "step": 2906 + }, + { + "epoch": 0.555831739961759, + "grad_norm": 2.3389220237731934, + "learning_rate": 5e-06, + "loss": 0.2485, + "step": 2907 + }, + { + "epoch": 0.5560229445506693, + "grad_norm": 4.87516450881958, + "learning_rate": 5e-06, + "loss": 0.3167, + "step": 2908 + }, + { + "epoch": 0.5562141491395793, + "grad_norm": 2.524181365966797, + "learning_rate": 5e-06, + "loss": 0.3042, + "step": 2909 + }, + { + "epoch": 0.5564053537284895, + "grad_norm": 1.860914707183838, + "learning_rate": 5e-06, + "loss": 0.1627, + "step": 2910 + }, + { + "epoch": 0.5565965583173996, + "grad_norm": 1.0398834943771362, + "learning_rate": 5e-06, + "loss": 0.1021, + "step": 2911 + }, + { + "epoch": 0.5567877629063097, + "grad_norm": 2.4790728092193604, + "learning_rate": 5e-06, + "loss": 0.1407, + "step": 2912 + }, + { + "epoch": 0.5569789674952199, + "grad_norm": 1.9124921560287476, + "learning_rate": 5e-06, + "loss": 0.1495, + "step": 2913 + }, + { + "epoch": 0.55717017208413, + "grad_norm": 3.33526349067688, + "learning_rate": 5e-06, + "loss": 0.7095, + "step": 2914 + }, + { + "epoch": 0.5573613766730402, + "grad_norm": 1.0180929899215698, + "learning_rate": 5e-06, + "loss": 0.1051, + "step": 2915 + }, + { + "epoch": 0.5575525812619503, + "grad_norm": 3.9493212699890137, + "learning_rate": 5e-06, + "loss": 0.335, + "step": 2916 + }, + { + "epoch": 0.5577437858508604, + "grad_norm": 3.441434621810913, + "learning_rate": 5e-06, + "loss": 0.1332, + "step": 2917 + }, + { + "epoch": 0.5579349904397706, + "grad_norm": 1.9761435985565186, + "learning_rate": 5e-06, + "loss": 0.1083, + "step": 2918 + }, + { + "epoch": 0.5581261950286807, + "grad_norm": 1.1638832092285156, + "learning_rate": 5e-06, + "loss": 0.0597, + "step": 2919 + }, + { + "epoch": 0.5583173996175909, + "grad_norm": 3.2977254390716553, + "learning_rate": 5e-06, + "loss": 0.7461, + "step": 2920 + }, + { + "epoch": 0.558508604206501, + "grad_norm": 1.3047654628753662, + "learning_rate": 5e-06, + "loss": 0.0716, + "step": 2921 + }, + { + "epoch": 0.558699808795411, + "grad_norm": 2.069331169128418, + "learning_rate": 5e-06, + "loss": 0.2381, + "step": 2922 + }, + { + "epoch": 0.5588910133843212, + "grad_norm": 1.0748863220214844, + "learning_rate": 5e-06, + "loss": 0.0702, + "step": 2923 + }, + { + "epoch": 0.5590822179732313, + "grad_norm": 1.0665748119354248, + "learning_rate": 5e-06, + "loss": 0.0602, + "step": 2924 + }, + { + "epoch": 0.5592734225621415, + "grad_norm": 0.8832602500915527, + "learning_rate": 5e-06, + "loss": 0.0312, + "step": 2925 + }, + { + "epoch": 0.5594646271510516, + "grad_norm": 2.666064739227295, + "learning_rate": 5e-06, + "loss": 0.4669, + "step": 2926 + }, + { + "epoch": 0.5596558317399618, + "grad_norm": 1.9858882427215576, + "learning_rate": 5e-06, + "loss": 0.1749, + "step": 2927 + }, + { + "epoch": 0.5598470363288719, + "grad_norm": 2.102036237716675, + "learning_rate": 5e-06, + "loss": 0.1507, + "step": 2928 + }, + { + "epoch": 0.560038240917782, + "grad_norm": 1.3998578786849976, + "learning_rate": 5e-06, + "loss": 0.1824, + "step": 2929 + }, + { + "epoch": 0.5602294455066922, + "grad_norm": 1.6715340614318848, + "learning_rate": 5e-06, + "loss": 0.0916, + "step": 2930 + }, + { + "epoch": 0.5604206500956023, + "grad_norm": 1.6286790370941162, + "learning_rate": 5e-06, + "loss": 0.1024, + "step": 2931 + }, + { + "epoch": 0.5606118546845125, + "grad_norm": 2.3067750930786133, + "learning_rate": 5e-06, + "loss": 0.3355, + "step": 2932 + }, + { + "epoch": 0.5608030592734226, + "grad_norm": 2.1875245571136475, + "learning_rate": 5e-06, + "loss": 0.3912, + "step": 2933 + }, + { + "epoch": 0.5609942638623326, + "grad_norm": 3.647351026535034, + "learning_rate": 5e-06, + "loss": 0.3519, + "step": 2934 + }, + { + "epoch": 0.5611854684512428, + "grad_norm": 1.6276153326034546, + "learning_rate": 5e-06, + "loss": 0.1034, + "step": 2935 + }, + { + "epoch": 0.5613766730401529, + "grad_norm": 1.9249351024627686, + "learning_rate": 5e-06, + "loss": 0.3196, + "step": 2936 + }, + { + "epoch": 0.5615678776290631, + "grad_norm": 2.1490015983581543, + "learning_rate": 5e-06, + "loss": 0.1547, + "step": 2937 + }, + { + "epoch": 0.5617590822179732, + "grad_norm": 1.8144192695617676, + "learning_rate": 5e-06, + "loss": 0.1045, + "step": 2938 + }, + { + "epoch": 0.5619502868068834, + "grad_norm": 1.4243121147155762, + "learning_rate": 5e-06, + "loss": 0.1824, + "step": 2939 + }, + { + "epoch": 0.5621414913957935, + "grad_norm": 2.2722017765045166, + "learning_rate": 5e-06, + "loss": 0.3895, + "step": 2940 + }, + { + "epoch": 0.5623326959847036, + "grad_norm": 1.8516919612884521, + "learning_rate": 5e-06, + "loss": 0.0461, + "step": 2941 + }, + { + "epoch": 0.5625239005736138, + "grad_norm": 2.4373631477355957, + "learning_rate": 5e-06, + "loss": 0.2125, + "step": 2942 + }, + { + "epoch": 0.5627151051625239, + "grad_norm": 2.4550974369049072, + "learning_rate": 5e-06, + "loss": 0.13, + "step": 2943 + }, + { + "epoch": 0.5629063097514341, + "grad_norm": 1.8102632761001587, + "learning_rate": 5e-06, + "loss": 0.0701, + "step": 2944 + }, + { + "epoch": 0.5630975143403442, + "grad_norm": 2.8042306900024414, + "learning_rate": 5e-06, + "loss": 0.5347, + "step": 2945 + }, + { + "epoch": 0.5632887189292543, + "grad_norm": 1.6445013284683228, + "learning_rate": 5e-06, + "loss": 0.1356, + "step": 2946 + }, + { + "epoch": 0.5634799235181644, + "grad_norm": 1.7900830507278442, + "learning_rate": 5e-06, + "loss": 0.2208, + "step": 2947 + }, + { + "epoch": 0.5636711281070745, + "grad_norm": 2.265395164489746, + "learning_rate": 5e-06, + "loss": 0.2767, + "step": 2948 + }, + { + "epoch": 0.5638623326959847, + "grad_norm": 0.9388930201530457, + "learning_rate": 5e-06, + "loss": 0.0764, + "step": 2949 + }, + { + "epoch": 0.5640535372848948, + "grad_norm": 2.3902626037597656, + "learning_rate": 5e-06, + "loss": 0.1168, + "step": 2950 + }, + { + "epoch": 0.564244741873805, + "grad_norm": 2.793389320373535, + "learning_rate": 5e-06, + "loss": 0.5114, + "step": 2951 + }, + { + "epoch": 0.5644359464627151, + "grad_norm": 1.428493618965149, + "learning_rate": 5e-06, + "loss": 0.1409, + "step": 2952 + }, + { + "epoch": 0.5646271510516252, + "grad_norm": 2.532050848007202, + "learning_rate": 5e-06, + "loss": 0.2788, + "step": 2953 + }, + { + "epoch": 0.5648183556405354, + "grad_norm": 2.010270357131958, + "learning_rate": 5e-06, + "loss": 0.0942, + "step": 2954 + }, + { + "epoch": 0.5650095602294455, + "grad_norm": 2.6057164669036865, + "learning_rate": 5e-06, + "loss": 0.2012, + "step": 2955 + }, + { + "epoch": 0.5652007648183557, + "grad_norm": 1.232944130897522, + "learning_rate": 5e-06, + "loss": 0.0479, + "step": 2956 + }, + { + "epoch": 0.5653919694072658, + "grad_norm": 2.149996519088745, + "learning_rate": 5e-06, + "loss": 0.2825, + "step": 2957 + }, + { + "epoch": 0.5655831739961759, + "grad_norm": 1.599258303642273, + "learning_rate": 5e-06, + "loss": 0.2824, + "step": 2958 + }, + { + "epoch": 0.5657743785850861, + "grad_norm": 2.3001720905303955, + "learning_rate": 5e-06, + "loss": 0.4862, + "step": 2959 + }, + { + "epoch": 0.5659655831739961, + "grad_norm": 3.394070863723755, + "learning_rate": 5e-06, + "loss": 0.3695, + "step": 2960 + }, + { + "epoch": 0.5661567877629063, + "grad_norm": 2.2293701171875, + "learning_rate": 5e-06, + "loss": 0.0652, + "step": 2961 + }, + { + "epoch": 0.5663479923518164, + "grad_norm": 2.403902769088745, + "learning_rate": 5e-06, + "loss": 0.17, + "step": 2962 + }, + { + "epoch": 0.5665391969407266, + "grad_norm": 1.5070533752441406, + "learning_rate": 5e-06, + "loss": 0.0996, + "step": 2963 + }, + { + "epoch": 0.5667304015296367, + "grad_norm": 3.1238303184509277, + "learning_rate": 5e-06, + "loss": 0.3401, + "step": 2964 + }, + { + "epoch": 0.5669216061185468, + "grad_norm": 2.93849778175354, + "learning_rate": 5e-06, + "loss": 0.3098, + "step": 2965 + }, + { + "epoch": 0.567112810707457, + "grad_norm": 1.2622096538543701, + "learning_rate": 5e-06, + "loss": 0.0951, + "step": 2966 + }, + { + "epoch": 0.5673040152963671, + "grad_norm": 1.6056170463562012, + "learning_rate": 5e-06, + "loss": 0.1208, + "step": 2967 + }, + { + "epoch": 0.5674952198852773, + "grad_norm": 1.6788007020950317, + "learning_rate": 5e-06, + "loss": 0.128, + "step": 2968 + }, + { + "epoch": 0.5676864244741874, + "grad_norm": 2.0569911003112793, + "learning_rate": 5e-06, + "loss": 0.1404, + "step": 2969 + }, + { + "epoch": 0.5678776290630975, + "grad_norm": 3.8002560138702393, + "learning_rate": 5e-06, + "loss": 1.0882, + "step": 2970 + }, + { + "epoch": 0.5680688336520077, + "grad_norm": 1.7185357809066772, + "learning_rate": 5e-06, + "loss": 0.1324, + "step": 2971 + }, + { + "epoch": 0.5682600382409178, + "grad_norm": 1.3213095664978027, + "learning_rate": 5e-06, + "loss": 0.104, + "step": 2972 + }, + { + "epoch": 0.568451242829828, + "grad_norm": 1.4530354738235474, + "learning_rate": 5e-06, + "loss": 0.1042, + "step": 2973 + }, + { + "epoch": 0.568642447418738, + "grad_norm": 3.0464630126953125, + "learning_rate": 5e-06, + "loss": 0.2976, + "step": 2974 + }, + { + "epoch": 0.5688336520076482, + "grad_norm": 2.337543249130249, + "learning_rate": 5e-06, + "loss": 0.135, + "step": 2975 + }, + { + "epoch": 0.5690248565965583, + "grad_norm": 2.6473655700683594, + "learning_rate": 5e-06, + "loss": 0.4181, + "step": 2976 + }, + { + "epoch": 0.5692160611854684, + "grad_norm": 2.9804956912994385, + "learning_rate": 5e-06, + "loss": 0.5121, + "step": 2977 + }, + { + "epoch": 0.5694072657743786, + "grad_norm": 1.9707882404327393, + "learning_rate": 5e-06, + "loss": 0.2911, + "step": 2978 + }, + { + "epoch": 0.5695984703632887, + "grad_norm": 1.867041826248169, + "learning_rate": 5e-06, + "loss": 0.2345, + "step": 2979 + }, + { + "epoch": 0.5697896749521989, + "grad_norm": 1.9662240743637085, + "learning_rate": 5e-06, + "loss": 0.1211, + "step": 2980 + }, + { + "epoch": 0.569980879541109, + "grad_norm": 1.9298244714736938, + "learning_rate": 5e-06, + "loss": 0.0948, + "step": 2981 + }, + { + "epoch": 0.5701720841300191, + "grad_norm": 2.791269063949585, + "learning_rate": 5e-06, + "loss": 0.3562, + "step": 2982 + }, + { + "epoch": 0.5703632887189293, + "grad_norm": 1.7784298658370972, + "learning_rate": 5e-06, + "loss": 0.153, + "step": 2983 + }, + { + "epoch": 0.5705544933078394, + "grad_norm": 2.0003790855407715, + "learning_rate": 5e-06, + "loss": 0.3397, + "step": 2984 + }, + { + "epoch": 0.5707456978967496, + "grad_norm": 1.470927119255066, + "learning_rate": 5e-06, + "loss": 0.1265, + "step": 2985 + }, + { + "epoch": 0.5709369024856596, + "grad_norm": 2.981893539428711, + "learning_rate": 5e-06, + "loss": 0.3084, + "step": 2986 + }, + { + "epoch": 0.5711281070745697, + "grad_norm": 1.706616997718811, + "learning_rate": 5e-06, + "loss": 0.1126, + "step": 2987 + }, + { + "epoch": 0.5713193116634799, + "grad_norm": 1.6704820394515991, + "learning_rate": 5e-06, + "loss": 0.1063, + "step": 2988 + }, + { + "epoch": 0.57151051625239, + "grad_norm": 1.1496891975402832, + "learning_rate": 5e-06, + "loss": 0.1615, + "step": 2989 + }, + { + "epoch": 0.5717017208413002, + "grad_norm": 2.2269532680511475, + "learning_rate": 5e-06, + "loss": 0.1694, + "step": 2990 + }, + { + "epoch": 0.5718929254302103, + "grad_norm": 0.9735349416732788, + "learning_rate": 5e-06, + "loss": 0.0651, + "step": 2991 + }, + { + "epoch": 0.5720841300191205, + "grad_norm": 1.315924048423767, + "learning_rate": 5e-06, + "loss": 0.0818, + "step": 2992 + }, + { + "epoch": 0.5722753346080306, + "grad_norm": 2.6886305809020996, + "learning_rate": 5e-06, + "loss": 0.4085, + "step": 2993 + }, + { + "epoch": 0.5724665391969407, + "grad_norm": 1.624668836593628, + "learning_rate": 5e-06, + "loss": 0.1512, + "step": 2994 + }, + { + "epoch": 0.5726577437858509, + "grad_norm": 2.3353240489959717, + "learning_rate": 5e-06, + "loss": 0.2948, + "step": 2995 + }, + { + "epoch": 0.572848948374761, + "grad_norm": 2.4698197841644287, + "learning_rate": 5e-06, + "loss": 0.2742, + "step": 2996 + }, + { + "epoch": 0.5730401529636712, + "grad_norm": 2.138054370880127, + "learning_rate": 5e-06, + "loss": 0.3292, + "step": 2997 + }, + { + "epoch": 0.5732313575525813, + "grad_norm": 2.4792983531951904, + "learning_rate": 5e-06, + "loss": 0.2956, + "step": 2998 + }, + { + "epoch": 0.5734225621414913, + "grad_norm": 1.899868130683899, + "learning_rate": 5e-06, + "loss": 0.1315, + "step": 2999 + }, + { + "epoch": 0.5736137667304015, + "grad_norm": 3.1172773838043213, + "learning_rate": 5e-06, + "loss": 0.2009, + "step": 3000 + }, + { + "epoch": 0.5736137667304015, + "eval_runtime": 817.4268, + "eval_samples_per_second": 1.877, + "eval_steps_per_second": 0.235, + "step": 3000 + }, + { + "epoch": 0.5738049713193116, + "grad_norm": 2.6903367042541504, + "learning_rate": 5e-06, + "loss": 0.1569, + "step": 3001 + }, + { + "epoch": 0.5739961759082218, + "grad_norm": 1.9376270771026611, + "learning_rate": 5e-06, + "loss": 0.2842, + "step": 3002 + }, + { + "epoch": 0.5741873804971319, + "grad_norm": 1.1127880811691284, + "learning_rate": 5e-06, + "loss": 0.0979, + "step": 3003 + }, + { + "epoch": 0.5743785850860421, + "grad_norm": 1.381749153137207, + "learning_rate": 5e-06, + "loss": 0.1271, + "step": 3004 + }, + { + "epoch": 0.5745697896749522, + "grad_norm": 3.1074182987213135, + "learning_rate": 5e-06, + "loss": 0.1865, + "step": 3005 + }, + { + "epoch": 0.5747609942638623, + "grad_norm": 1.8517677783966064, + "learning_rate": 5e-06, + "loss": 0.0811, + "step": 3006 + }, + { + "epoch": 0.5749521988527725, + "grad_norm": 2.8040478229522705, + "learning_rate": 5e-06, + "loss": 0.4882, + "step": 3007 + }, + { + "epoch": 0.5751434034416826, + "grad_norm": 1.2875118255615234, + "learning_rate": 5e-06, + "loss": 0.1718, + "step": 3008 + }, + { + "epoch": 0.5753346080305928, + "grad_norm": 1.20366370677948, + "learning_rate": 5e-06, + "loss": 0.0758, + "step": 3009 + }, + { + "epoch": 0.5755258126195029, + "grad_norm": 1.6420012712478638, + "learning_rate": 5e-06, + "loss": 0.1576, + "step": 3010 + }, + { + "epoch": 0.575717017208413, + "grad_norm": 1.7481510639190674, + "learning_rate": 5e-06, + "loss": 0.0927, + "step": 3011 + }, + { + "epoch": 0.5759082217973231, + "grad_norm": 2.993232011795044, + "learning_rate": 5e-06, + "loss": 0.2496, + "step": 3012 + }, + { + "epoch": 0.5760994263862332, + "grad_norm": 1.9660049676895142, + "learning_rate": 5e-06, + "loss": 0.2413, + "step": 3013 + }, + { + "epoch": 0.5762906309751434, + "grad_norm": 1.9961297512054443, + "learning_rate": 5e-06, + "loss": 0.3713, + "step": 3014 + }, + { + "epoch": 0.5764818355640535, + "grad_norm": 1.5734777450561523, + "learning_rate": 5e-06, + "loss": 0.1268, + "step": 3015 + }, + { + "epoch": 0.5766730401529637, + "grad_norm": 1.7662030458450317, + "learning_rate": 5e-06, + "loss": 0.2618, + "step": 3016 + }, + { + "epoch": 0.5768642447418738, + "grad_norm": 1.6102837324142456, + "learning_rate": 5e-06, + "loss": 0.1833, + "step": 3017 + }, + { + "epoch": 0.5770554493307839, + "grad_norm": 2.5440475940704346, + "learning_rate": 5e-06, + "loss": 0.1946, + "step": 3018 + }, + { + "epoch": 0.5772466539196941, + "grad_norm": 2.081432580947876, + "learning_rate": 5e-06, + "loss": 0.1695, + "step": 3019 + }, + { + "epoch": 0.5774378585086042, + "grad_norm": 5.738420009613037, + "learning_rate": 5e-06, + "loss": 0.2006, + "step": 3020 + }, + { + "epoch": 0.5776290630975144, + "grad_norm": 1.8196947574615479, + "learning_rate": 5e-06, + "loss": 0.117, + "step": 3021 + }, + { + "epoch": 0.5778202676864245, + "grad_norm": 1.6107805967330933, + "learning_rate": 5e-06, + "loss": 0.2009, + "step": 3022 + }, + { + "epoch": 0.5780114722753346, + "grad_norm": 1.1476129293441772, + "learning_rate": 5e-06, + "loss": 0.0376, + "step": 3023 + }, + { + "epoch": 0.5782026768642448, + "grad_norm": 2.0225865840911865, + "learning_rate": 5e-06, + "loss": 0.1687, + "step": 3024 + }, + { + "epoch": 0.5783938814531548, + "grad_norm": 2.241004228591919, + "learning_rate": 5e-06, + "loss": 0.2727, + "step": 3025 + }, + { + "epoch": 0.578585086042065, + "grad_norm": 2.0061728954315186, + "learning_rate": 5e-06, + "loss": 0.1177, + "step": 3026 + }, + { + "epoch": 0.5787762906309751, + "grad_norm": 2.0233190059661865, + "learning_rate": 5e-06, + "loss": 0.1115, + "step": 3027 + }, + { + "epoch": 0.5789674952198853, + "grad_norm": 1.3441364765167236, + "learning_rate": 5e-06, + "loss": 0.0912, + "step": 3028 + }, + { + "epoch": 0.5791586998087954, + "grad_norm": 1.7095820903778076, + "learning_rate": 5e-06, + "loss": 0.1289, + "step": 3029 + }, + { + "epoch": 0.5793499043977055, + "grad_norm": 1.2957032918930054, + "learning_rate": 5e-06, + "loss": 0.0431, + "step": 3030 + }, + { + "epoch": 0.5795411089866157, + "grad_norm": 2.276420831680298, + "learning_rate": 5e-06, + "loss": 0.2461, + "step": 3031 + }, + { + "epoch": 0.5797323135755258, + "grad_norm": 2.8638715744018555, + "learning_rate": 5e-06, + "loss": 0.4221, + "step": 3032 + }, + { + "epoch": 0.579923518164436, + "grad_norm": 1.7681334018707275, + "learning_rate": 5e-06, + "loss": 0.218, + "step": 3033 + }, + { + "epoch": 0.5801147227533461, + "grad_norm": 2.0881717205047607, + "learning_rate": 5e-06, + "loss": 0.3088, + "step": 3034 + }, + { + "epoch": 0.5803059273422562, + "grad_norm": 1.6630779504776, + "learning_rate": 5e-06, + "loss": 0.1146, + "step": 3035 + }, + { + "epoch": 0.5804971319311664, + "grad_norm": 1.7102560997009277, + "learning_rate": 5e-06, + "loss": 0.123, + "step": 3036 + }, + { + "epoch": 0.5806883365200765, + "grad_norm": 1.850621223449707, + "learning_rate": 5e-06, + "loss": 0.0667, + "step": 3037 + }, + { + "epoch": 0.5808795411089867, + "grad_norm": 2.2591307163238525, + "learning_rate": 5e-06, + "loss": 0.1723, + "step": 3038 + }, + { + "epoch": 0.5810707456978967, + "grad_norm": 3.2868549823760986, + "learning_rate": 5e-06, + "loss": 0.7069, + "step": 3039 + }, + { + "epoch": 0.5812619502868069, + "grad_norm": 2.1534550189971924, + "learning_rate": 5e-06, + "loss": 0.1613, + "step": 3040 + }, + { + "epoch": 0.581453154875717, + "grad_norm": 2.671644687652588, + "learning_rate": 5e-06, + "loss": 0.6401, + "step": 3041 + }, + { + "epoch": 0.5816443594646271, + "grad_norm": 1.346547245979309, + "learning_rate": 5e-06, + "loss": 0.0957, + "step": 3042 + }, + { + "epoch": 0.5818355640535373, + "grad_norm": 1.1658374071121216, + "learning_rate": 5e-06, + "loss": 0.1317, + "step": 3043 + }, + { + "epoch": 0.5820267686424474, + "grad_norm": 1.5041284561157227, + "learning_rate": 5e-06, + "loss": 0.1007, + "step": 3044 + }, + { + "epoch": 0.5822179732313576, + "grad_norm": 2.461108922958374, + "learning_rate": 5e-06, + "loss": 0.4222, + "step": 3045 + }, + { + "epoch": 0.5824091778202677, + "grad_norm": 1.9935189485549927, + "learning_rate": 5e-06, + "loss": 0.2012, + "step": 3046 + }, + { + "epoch": 0.5826003824091778, + "grad_norm": 3.317734718322754, + "learning_rate": 5e-06, + "loss": 0.2825, + "step": 3047 + }, + { + "epoch": 0.582791586998088, + "grad_norm": 2.7666308879852295, + "learning_rate": 5e-06, + "loss": 0.1017, + "step": 3048 + }, + { + "epoch": 0.5829827915869981, + "grad_norm": 1.4334791898727417, + "learning_rate": 5e-06, + "loss": 0.0966, + "step": 3049 + }, + { + "epoch": 0.5831739961759083, + "grad_norm": 2.323887825012207, + "learning_rate": 5e-06, + "loss": 0.1215, + "step": 3050 + }, + { + "epoch": 0.5833652007648183, + "grad_norm": 2.63179874420166, + "learning_rate": 5e-06, + "loss": 0.5225, + "step": 3051 + }, + { + "epoch": 0.5835564053537284, + "grad_norm": 2.2976789474487305, + "learning_rate": 5e-06, + "loss": 0.3479, + "step": 3052 + }, + { + "epoch": 0.5837476099426386, + "grad_norm": 1.7970479726791382, + "learning_rate": 5e-06, + "loss": 0.0878, + "step": 3053 + }, + { + "epoch": 0.5839388145315487, + "grad_norm": 2.2620391845703125, + "learning_rate": 5e-06, + "loss": 0.2336, + "step": 3054 + }, + { + "epoch": 0.5841300191204589, + "grad_norm": 1.0997439622879028, + "learning_rate": 5e-06, + "loss": 0.0781, + "step": 3055 + }, + { + "epoch": 0.584321223709369, + "grad_norm": 1.8581743240356445, + "learning_rate": 5e-06, + "loss": 0.0843, + "step": 3056 + }, + { + "epoch": 0.5845124282982792, + "grad_norm": 2.1834423542022705, + "learning_rate": 5e-06, + "loss": 0.1402, + "step": 3057 + }, + { + "epoch": 0.5847036328871893, + "grad_norm": 1.7100716829299927, + "learning_rate": 5e-06, + "loss": 0.194, + "step": 3058 + }, + { + "epoch": 0.5848948374760994, + "grad_norm": 3.5780153274536133, + "learning_rate": 5e-06, + "loss": 0.1706, + "step": 3059 + }, + { + "epoch": 0.5850860420650096, + "grad_norm": 1.2528570890426636, + "learning_rate": 5e-06, + "loss": 0.0991, + "step": 3060 + }, + { + "epoch": 0.5852772466539197, + "grad_norm": 1.7608826160430908, + "learning_rate": 5e-06, + "loss": 0.1739, + "step": 3061 + }, + { + "epoch": 0.5854684512428299, + "grad_norm": 2.2554149627685547, + "learning_rate": 5e-06, + "loss": 0.1713, + "step": 3062 + }, + { + "epoch": 0.58565965583174, + "grad_norm": 1.9988476037979126, + "learning_rate": 5e-06, + "loss": 0.3087, + "step": 3063 + }, + { + "epoch": 0.58585086042065, + "grad_norm": 2.21211314201355, + "learning_rate": 5e-06, + "loss": 0.273, + "step": 3064 + }, + { + "epoch": 0.5860420650095602, + "grad_norm": 1.947128415107727, + "learning_rate": 5e-06, + "loss": 0.2207, + "step": 3065 + }, + { + "epoch": 0.5862332695984703, + "grad_norm": 3.2861714363098145, + "learning_rate": 5e-06, + "loss": 0.4373, + "step": 3066 + }, + { + "epoch": 0.5864244741873805, + "grad_norm": 1.3944896459579468, + "learning_rate": 5e-06, + "loss": 0.0627, + "step": 3067 + }, + { + "epoch": 0.5866156787762906, + "grad_norm": 2.653266429901123, + "learning_rate": 5e-06, + "loss": 0.1445, + "step": 3068 + }, + { + "epoch": 0.5868068833652008, + "grad_norm": 9.50674819946289, + "learning_rate": 5e-06, + "loss": 0.2661, + "step": 3069 + }, + { + "epoch": 0.5869980879541109, + "grad_norm": 2.211970329284668, + "learning_rate": 5e-06, + "loss": 0.2697, + "step": 3070 + }, + { + "epoch": 0.587189292543021, + "grad_norm": 1.0039657354354858, + "learning_rate": 5e-06, + "loss": 0.0918, + "step": 3071 + }, + { + "epoch": 0.5873804971319312, + "grad_norm": 2.002715587615967, + "learning_rate": 5e-06, + "loss": 0.2972, + "step": 3072 + }, + { + "epoch": 0.5875717017208413, + "grad_norm": 1.1226190328598022, + "learning_rate": 5e-06, + "loss": 0.0923, + "step": 3073 + }, + { + "epoch": 0.5877629063097515, + "grad_norm": 1.6911029815673828, + "learning_rate": 5e-06, + "loss": 0.0904, + "step": 3074 + }, + { + "epoch": 0.5879541108986616, + "grad_norm": 2.079878568649292, + "learning_rate": 5e-06, + "loss": 0.0876, + "step": 3075 + }, + { + "epoch": 0.5881453154875717, + "grad_norm": 2.1699066162109375, + "learning_rate": 5e-06, + "loss": 0.4323, + "step": 3076 + }, + { + "epoch": 0.5883365200764819, + "grad_norm": 1.8481723070144653, + "learning_rate": 5e-06, + "loss": 0.208, + "step": 3077 + }, + { + "epoch": 0.5885277246653919, + "grad_norm": 1.2353150844573975, + "learning_rate": 5e-06, + "loss": 0.0871, + "step": 3078 + }, + { + "epoch": 0.5887189292543021, + "grad_norm": 3.055333375930786, + "learning_rate": 5e-06, + "loss": 0.1118, + "step": 3079 + }, + { + "epoch": 0.5889101338432122, + "grad_norm": 1.535212755203247, + "learning_rate": 5e-06, + "loss": 0.2145, + "step": 3080 + }, + { + "epoch": 0.5891013384321224, + "grad_norm": 3.490246295928955, + "learning_rate": 5e-06, + "loss": 0.0861, + "step": 3081 + }, + { + "epoch": 0.5892925430210325, + "grad_norm": 1.9713093042373657, + "learning_rate": 5e-06, + "loss": 0.2104, + "step": 3082 + }, + { + "epoch": 0.5894837476099426, + "grad_norm": 2.162919282913208, + "learning_rate": 5e-06, + "loss": 0.3863, + "step": 3083 + }, + { + "epoch": 0.5896749521988528, + "grad_norm": 1.0454967021942139, + "learning_rate": 5e-06, + "loss": 0.0944, + "step": 3084 + }, + { + "epoch": 0.5898661567877629, + "grad_norm": 1.190918207168579, + "learning_rate": 5e-06, + "loss": 0.1009, + "step": 3085 + }, + { + "epoch": 0.5900573613766731, + "grad_norm": 1.4480558633804321, + "learning_rate": 5e-06, + "loss": 0.0612, + "step": 3086 + }, + { + "epoch": 0.5902485659655832, + "grad_norm": 2.896592140197754, + "learning_rate": 5e-06, + "loss": 0.121, + "step": 3087 + }, + { + "epoch": 0.5904397705544933, + "grad_norm": 1.869307279586792, + "learning_rate": 5e-06, + "loss": 0.1258, + "step": 3088 + }, + { + "epoch": 0.5906309751434035, + "grad_norm": 1.9074268341064453, + "learning_rate": 5e-06, + "loss": 0.1778, + "step": 3089 + }, + { + "epoch": 0.5908221797323135, + "grad_norm": 2.827545166015625, + "learning_rate": 5e-06, + "loss": 0.5372, + "step": 3090 + }, + { + "epoch": 0.5910133843212237, + "grad_norm": 3.0285253524780273, + "learning_rate": 5e-06, + "loss": 0.1217, + "step": 3091 + }, + { + "epoch": 0.5912045889101338, + "grad_norm": 1.5774853229522705, + "learning_rate": 5e-06, + "loss": 0.2967, + "step": 3092 + }, + { + "epoch": 0.591395793499044, + "grad_norm": 1.55910062789917, + "learning_rate": 5e-06, + "loss": 0.0962, + "step": 3093 + }, + { + "epoch": 0.5915869980879541, + "grad_norm": 2.466980457305908, + "learning_rate": 5e-06, + "loss": 0.1794, + "step": 3094 + }, + { + "epoch": 0.5917782026768642, + "grad_norm": 1.3452435731887817, + "learning_rate": 5e-06, + "loss": 0.1445, + "step": 3095 + }, + { + "epoch": 0.5919694072657744, + "grad_norm": 1.193184494972229, + "learning_rate": 5e-06, + "loss": 0.1735, + "step": 3096 + }, + { + "epoch": 0.5921606118546845, + "grad_norm": 4.413208961486816, + "learning_rate": 5e-06, + "loss": 0.1679, + "step": 3097 + }, + { + "epoch": 0.5923518164435947, + "grad_norm": 1.8981341123580933, + "learning_rate": 5e-06, + "loss": 0.1866, + "step": 3098 + }, + { + "epoch": 0.5925430210325048, + "grad_norm": 1.924362301826477, + "learning_rate": 5e-06, + "loss": 0.1081, + "step": 3099 + }, + { + "epoch": 0.5927342256214149, + "grad_norm": 3.043461561203003, + "learning_rate": 5e-06, + "loss": 0.1592, + "step": 3100 + }, + { + "epoch": 0.5929254302103251, + "grad_norm": 2.672950506210327, + "learning_rate": 5e-06, + "loss": 0.5285, + "step": 3101 + }, + { + "epoch": 0.5931166347992352, + "grad_norm": 2.3183720111846924, + "learning_rate": 5e-06, + "loss": 0.5674, + "step": 3102 + }, + { + "epoch": 0.5933078393881454, + "grad_norm": 2.15521502494812, + "learning_rate": 5e-06, + "loss": 0.23, + "step": 3103 + }, + { + "epoch": 0.5934990439770554, + "grad_norm": 2.041330575942993, + "learning_rate": 5e-06, + "loss": 0.1286, + "step": 3104 + }, + { + "epoch": 0.5936902485659655, + "grad_norm": 2.0473520755767822, + "learning_rate": 5e-06, + "loss": 0.3554, + "step": 3105 + }, + { + "epoch": 0.5938814531548757, + "grad_norm": 1.3054406642913818, + "learning_rate": 5e-06, + "loss": 0.1113, + "step": 3106 + }, + { + "epoch": 0.5940726577437858, + "grad_norm": 2.71018385887146, + "learning_rate": 5e-06, + "loss": 0.3178, + "step": 3107 + }, + { + "epoch": 0.594263862332696, + "grad_norm": 1.7151776552200317, + "learning_rate": 5e-06, + "loss": 0.1841, + "step": 3108 + }, + { + "epoch": 0.5944550669216061, + "grad_norm": 2.065749168395996, + "learning_rate": 5e-06, + "loss": 0.157, + "step": 3109 + }, + { + "epoch": 0.5946462715105163, + "grad_norm": 0.8944826126098633, + "learning_rate": 5e-06, + "loss": 0.0657, + "step": 3110 + }, + { + "epoch": 0.5948374760994264, + "grad_norm": 1.4383631944656372, + "learning_rate": 5e-06, + "loss": 0.0874, + "step": 3111 + }, + { + "epoch": 0.5950286806883365, + "grad_norm": 1.2038031816482544, + "learning_rate": 5e-06, + "loss": 0.0653, + "step": 3112 + }, + { + "epoch": 0.5952198852772467, + "grad_norm": 1.0844436883926392, + "learning_rate": 5e-06, + "loss": 0.0806, + "step": 3113 + }, + { + "epoch": 0.5954110898661568, + "grad_norm": 1.6793997287750244, + "learning_rate": 5e-06, + "loss": 0.1552, + "step": 3114 + }, + { + "epoch": 0.595602294455067, + "grad_norm": 2.1270453929901123, + "learning_rate": 5e-06, + "loss": 0.3629, + "step": 3115 + }, + { + "epoch": 0.595793499043977, + "grad_norm": 2.146496295928955, + "learning_rate": 5e-06, + "loss": 0.317, + "step": 3116 + }, + { + "epoch": 0.5959847036328871, + "grad_norm": 1.409468173980713, + "learning_rate": 5e-06, + "loss": 0.0948, + "step": 3117 + }, + { + "epoch": 0.5961759082217973, + "grad_norm": 1.9205509424209595, + "learning_rate": 5e-06, + "loss": 0.2199, + "step": 3118 + }, + { + "epoch": 0.5963671128107074, + "grad_norm": 1.7777714729309082, + "learning_rate": 5e-06, + "loss": 0.1007, + "step": 3119 + }, + { + "epoch": 0.5965583173996176, + "grad_norm": 3.087475061416626, + "learning_rate": 5e-06, + "loss": 0.3876, + "step": 3120 + }, + { + "epoch": 0.5967495219885277, + "grad_norm": 2.7254316806793213, + "learning_rate": 5e-06, + "loss": 0.5461, + "step": 3121 + }, + { + "epoch": 0.5969407265774379, + "grad_norm": 1.9217915534973145, + "learning_rate": 5e-06, + "loss": 0.2579, + "step": 3122 + }, + { + "epoch": 0.597131931166348, + "grad_norm": 2.051192283630371, + "learning_rate": 5e-06, + "loss": 0.2243, + "step": 3123 + }, + { + "epoch": 0.5973231357552581, + "grad_norm": 1.4439209699630737, + "learning_rate": 5e-06, + "loss": 0.0946, + "step": 3124 + }, + { + "epoch": 0.5975143403441683, + "grad_norm": 0.8825449347496033, + "learning_rate": 5e-06, + "loss": 0.0568, + "step": 3125 + }, + { + "epoch": 0.5977055449330784, + "grad_norm": 13.12146282196045, + "learning_rate": 5e-06, + "loss": 0.5537, + "step": 3126 + }, + { + "epoch": 0.5978967495219886, + "grad_norm": 1.4262953996658325, + "learning_rate": 5e-06, + "loss": 0.1899, + "step": 3127 + }, + { + "epoch": 0.5980879541108987, + "grad_norm": 2.342287063598633, + "learning_rate": 5e-06, + "loss": 0.312, + "step": 3128 + }, + { + "epoch": 0.5982791586998087, + "grad_norm": 2.072007179260254, + "learning_rate": 5e-06, + "loss": 0.1151, + "step": 3129 + }, + { + "epoch": 0.5984703632887189, + "grad_norm": 4.116969108581543, + "learning_rate": 5e-06, + "loss": 0.257, + "step": 3130 + }, + { + "epoch": 0.598661567877629, + "grad_norm": 1.1240900754928589, + "learning_rate": 5e-06, + "loss": 0.0532, + "step": 3131 + }, + { + "epoch": 0.5988527724665392, + "grad_norm": 1.6373003721237183, + "learning_rate": 5e-06, + "loss": 0.177, + "step": 3132 + }, + { + "epoch": 0.5990439770554493, + "grad_norm": 2.227210521697998, + "learning_rate": 5e-06, + "loss": 0.1885, + "step": 3133 + }, + { + "epoch": 0.5992351816443595, + "grad_norm": 1.2359378337860107, + "learning_rate": 5e-06, + "loss": 0.1457, + "step": 3134 + }, + { + "epoch": 0.5994263862332696, + "grad_norm": 1.923424482345581, + "learning_rate": 5e-06, + "loss": 0.247, + "step": 3135 + }, + { + "epoch": 0.5996175908221797, + "grad_norm": 1.0987998247146606, + "learning_rate": 5e-06, + "loss": 0.1003, + "step": 3136 + }, + { + "epoch": 0.5998087954110899, + "grad_norm": 2.2175567150115967, + "learning_rate": 5e-06, + "loss": 0.1486, + "step": 3137 + }, + { + "epoch": 0.6, + "grad_norm": 2.118872880935669, + "learning_rate": 5e-06, + "loss": 0.1863, + "step": 3138 + }, + { + "epoch": 0.6001912045889102, + "grad_norm": 2.241590738296509, + "learning_rate": 5e-06, + "loss": 0.3768, + "step": 3139 + }, + { + "epoch": 0.6003824091778203, + "grad_norm": 2.5006096363067627, + "learning_rate": 5e-06, + "loss": 0.3921, + "step": 3140 + }, + { + "epoch": 0.6005736137667304, + "grad_norm": 1.5924909114837646, + "learning_rate": 5e-06, + "loss": 0.1555, + "step": 3141 + }, + { + "epoch": 0.6007648183556406, + "grad_norm": 2.0221755504608154, + "learning_rate": 5e-06, + "loss": 0.1312, + "step": 3142 + }, + { + "epoch": 0.6009560229445506, + "grad_norm": 2.870837450027466, + "learning_rate": 5e-06, + "loss": 0.0977, + "step": 3143 + }, + { + "epoch": 0.6011472275334608, + "grad_norm": 2.1391944885253906, + "learning_rate": 5e-06, + "loss": 0.1124, + "step": 3144 + }, + { + "epoch": 0.6013384321223709, + "grad_norm": 2.3611199855804443, + "learning_rate": 5e-06, + "loss": 0.5372, + "step": 3145 + }, + { + "epoch": 0.6015296367112811, + "grad_norm": 2.637998104095459, + "learning_rate": 5e-06, + "loss": 0.4384, + "step": 3146 + }, + { + "epoch": 0.6017208413001912, + "grad_norm": 3.041144609451294, + "learning_rate": 5e-06, + "loss": 0.2068, + "step": 3147 + }, + { + "epoch": 0.6019120458891013, + "grad_norm": 2.0565388202667236, + "learning_rate": 5e-06, + "loss": 0.2483, + "step": 3148 + }, + { + "epoch": 0.6021032504780115, + "grad_norm": 2.326119899749756, + "learning_rate": 5e-06, + "loss": 0.3495, + "step": 3149 + }, + { + "epoch": 0.6022944550669216, + "grad_norm": 2.5809388160705566, + "learning_rate": 5e-06, + "loss": 0.1141, + "step": 3150 + }, + { + "epoch": 0.6024856596558318, + "grad_norm": 2.951171398162842, + "learning_rate": 5e-06, + "loss": 0.4543, + "step": 3151 + }, + { + "epoch": 0.6026768642447419, + "grad_norm": 1.1240613460540771, + "learning_rate": 5e-06, + "loss": 0.0896, + "step": 3152 + }, + { + "epoch": 0.602868068833652, + "grad_norm": 1.6378625631332397, + "learning_rate": 5e-06, + "loss": 0.1978, + "step": 3153 + }, + { + "epoch": 0.6030592734225622, + "grad_norm": 2.002222776412964, + "learning_rate": 5e-06, + "loss": 0.1737, + "step": 3154 + }, + { + "epoch": 0.6032504780114722, + "grad_norm": 1.2147256135940552, + "learning_rate": 5e-06, + "loss": 0.0322, + "step": 3155 + }, + { + "epoch": 0.6034416826003824, + "grad_norm": 1.5315823554992676, + "learning_rate": 5e-06, + "loss": 0.0978, + "step": 3156 + }, + { + "epoch": 0.6036328871892925, + "grad_norm": 2.548656463623047, + "learning_rate": 5e-06, + "loss": 0.376, + "step": 3157 + }, + { + "epoch": 0.6038240917782027, + "grad_norm": 2.330667495727539, + "learning_rate": 5e-06, + "loss": 0.3136, + "step": 3158 + }, + { + "epoch": 0.6040152963671128, + "grad_norm": 3.249701738357544, + "learning_rate": 5e-06, + "loss": 0.7531, + "step": 3159 + }, + { + "epoch": 0.6042065009560229, + "grad_norm": 1.688836932182312, + "learning_rate": 5e-06, + "loss": 0.1473, + "step": 3160 + }, + { + "epoch": 0.6043977055449331, + "grad_norm": 2.461451530456543, + "learning_rate": 5e-06, + "loss": 0.246, + "step": 3161 + }, + { + "epoch": 0.6045889101338432, + "grad_norm": 1.6401633024215698, + "learning_rate": 5e-06, + "loss": 0.094, + "step": 3162 + }, + { + "epoch": 0.6047801147227534, + "grad_norm": 2.8850514888763428, + "learning_rate": 5e-06, + "loss": 0.466, + "step": 3163 + }, + { + "epoch": 0.6049713193116635, + "grad_norm": 3.2717480659484863, + "learning_rate": 5e-06, + "loss": 0.5378, + "step": 3164 + }, + { + "epoch": 0.6051625239005736, + "grad_norm": 3.4596874713897705, + "learning_rate": 5e-06, + "loss": 0.3068, + "step": 3165 + }, + { + "epoch": 0.6053537284894838, + "grad_norm": 1.0679250955581665, + "learning_rate": 5e-06, + "loss": 0.0964, + "step": 3166 + }, + { + "epoch": 0.6055449330783939, + "grad_norm": 1.2040866613388062, + "learning_rate": 5e-06, + "loss": 0.0847, + "step": 3167 + }, + { + "epoch": 0.605736137667304, + "grad_norm": 3.1101861000061035, + "learning_rate": 5e-06, + "loss": 0.2559, + "step": 3168 + }, + { + "epoch": 0.6059273422562141, + "grad_norm": 2.351142168045044, + "learning_rate": 5e-06, + "loss": 0.1671, + "step": 3169 + }, + { + "epoch": 0.6061185468451242, + "grad_norm": 1.722732663154602, + "learning_rate": 5e-06, + "loss": 0.2063, + "step": 3170 + }, + { + "epoch": 0.6063097514340344, + "grad_norm": 2.38865327835083, + "learning_rate": 5e-06, + "loss": 0.4377, + "step": 3171 + }, + { + "epoch": 0.6065009560229445, + "grad_norm": 2.085334300994873, + "learning_rate": 5e-06, + "loss": 0.1241, + "step": 3172 + }, + { + "epoch": 0.6066921606118547, + "grad_norm": 1.9132529497146606, + "learning_rate": 5e-06, + "loss": 0.1219, + "step": 3173 + }, + { + "epoch": 0.6068833652007648, + "grad_norm": 2.437124013900757, + "learning_rate": 5e-06, + "loss": 0.1314, + "step": 3174 + }, + { + "epoch": 0.607074569789675, + "grad_norm": 2.508739471435547, + "learning_rate": 5e-06, + "loss": 0.1578, + "step": 3175 + }, + { + "epoch": 0.6072657743785851, + "grad_norm": 2.6360321044921875, + "learning_rate": 5e-06, + "loss": 0.2388, + "step": 3176 + }, + { + "epoch": 0.6074569789674952, + "grad_norm": 1.8685966730117798, + "learning_rate": 5e-06, + "loss": 0.2663, + "step": 3177 + }, + { + "epoch": 0.6076481835564054, + "grad_norm": 1.755370020866394, + "learning_rate": 5e-06, + "loss": 0.1159, + "step": 3178 + }, + { + "epoch": 0.6078393881453155, + "grad_norm": 1.2698965072631836, + "learning_rate": 5e-06, + "loss": 0.1185, + "step": 3179 + }, + { + "epoch": 0.6080305927342257, + "grad_norm": 3.567136764526367, + "learning_rate": 5e-06, + "loss": 0.2535, + "step": 3180 + }, + { + "epoch": 0.6082217973231357, + "grad_norm": 1.5889770984649658, + "learning_rate": 5e-06, + "loss": 0.0809, + "step": 3181 + }, + { + "epoch": 0.6084130019120458, + "grad_norm": 1.9720900058746338, + "learning_rate": 5e-06, + "loss": 0.1599, + "step": 3182 + }, + { + "epoch": 0.608604206500956, + "grad_norm": 1.6620185375213623, + "learning_rate": 5e-06, + "loss": 0.2844, + "step": 3183 + }, + { + "epoch": 0.6087954110898661, + "grad_norm": 2.424464464187622, + "learning_rate": 5e-06, + "loss": 0.2883, + "step": 3184 + }, + { + "epoch": 0.6089866156787763, + "grad_norm": 3.4349234104156494, + "learning_rate": 5e-06, + "loss": 0.4843, + "step": 3185 + }, + { + "epoch": 0.6091778202676864, + "grad_norm": 1.0751681327819824, + "learning_rate": 5e-06, + "loss": 0.0913, + "step": 3186 + }, + { + "epoch": 0.6093690248565966, + "grad_norm": 1.671789526939392, + "learning_rate": 5e-06, + "loss": 0.1138, + "step": 3187 + }, + { + "epoch": 0.6095602294455067, + "grad_norm": 2.0283384323120117, + "learning_rate": 5e-06, + "loss": 0.1114, + "step": 3188 + }, + { + "epoch": 0.6097514340344168, + "grad_norm": 2.488652229309082, + "learning_rate": 5e-06, + "loss": 0.3425, + "step": 3189 + }, + { + "epoch": 0.609942638623327, + "grad_norm": 1.1259547472000122, + "learning_rate": 5e-06, + "loss": 0.1119, + "step": 3190 + }, + { + "epoch": 0.6101338432122371, + "grad_norm": 5.288424968719482, + "learning_rate": 5e-06, + "loss": 0.3013, + "step": 3191 + }, + { + "epoch": 0.6103250478011473, + "grad_norm": 1.1626574993133545, + "learning_rate": 5e-06, + "loss": 0.1034, + "step": 3192 + }, + { + "epoch": 0.6105162523900574, + "grad_norm": 2.4435818195343018, + "learning_rate": 5e-06, + "loss": 0.2786, + "step": 3193 + }, + { + "epoch": 0.6107074569789674, + "grad_norm": 1.540985107421875, + "learning_rate": 5e-06, + "loss": 0.0953, + "step": 3194 + }, + { + "epoch": 0.6108986615678776, + "grad_norm": 2.901230812072754, + "learning_rate": 5e-06, + "loss": 0.4028, + "step": 3195 + }, + { + "epoch": 0.6110898661567877, + "grad_norm": 1.308781385421753, + "learning_rate": 5e-06, + "loss": 0.1013, + "step": 3196 + }, + { + "epoch": 0.6112810707456979, + "grad_norm": 1.9014428853988647, + "learning_rate": 5e-06, + "loss": 0.2525, + "step": 3197 + }, + { + "epoch": 0.611472275334608, + "grad_norm": 1.3016637563705444, + "learning_rate": 5e-06, + "loss": 0.1119, + "step": 3198 + }, + { + "epoch": 0.6116634799235182, + "grad_norm": 1.2226108312606812, + "learning_rate": 5e-06, + "loss": 0.1074, + "step": 3199 + }, + { + "epoch": 0.6118546845124283, + "grad_norm": 1.6033170223236084, + "learning_rate": 5e-06, + "loss": 0.0845, + "step": 3200 + }, + { + "epoch": 0.6120458891013384, + "grad_norm": 1.5445128679275513, + "learning_rate": 5e-06, + "loss": 0.1334, + "step": 3201 + }, + { + "epoch": 0.6122370936902486, + "grad_norm": 2.2498819828033447, + "learning_rate": 5e-06, + "loss": 0.3817, + "step": 3202 + }, + { + "epoch": 0.6124282982791587, + "grad_norm": 1.170806646347046, + "learning_rate": 5e-06, + "loss": 0.0951, + "step": 3203 + }, + { + "epoch": 0.6126195028680689, + "grad_norm": 1.942797303199768, + "learning_rate": 5e-06, + "loss": 0.179, + "step": 3204 + }, + { + "epoch": 0.612810707456979, + "grad_norm": 1.344512701034546, + "learning_rate": 5e-06, + "loss": 0.081, + "step": 3205 + }, + { + "epoch": 0.613001912045889, + "grad_norm": 1.1535552740097046, + "learning_rate": 5e-06, + "loss": 0.0581, + "step": 3206 + }, + { + "epoch": 0.6131931166347993, + "grad_norm": 1.867598056793213, + "learning_rate": 5e-06, + "loss": 0.1903, + "step": 3207 + }, + { + "epoch": 0.6133843212237093, + "grad_norm": 2.3806843757629395, + "learning_rate": 5e-06, + "loss": 0.3296, + "step": 3208 + }, + { + "epoch": 0.6135755258126195, + "grad_norm": 1.6405065059661865, + "learning_rate": 5e-06, + "loss": 0.161, + "step": 3209 + }, + { + "epoch": 0.6137667304015296, + "grad_norm": 2.7260189056396484, + "learning_rate": 5e-06, + "loss": 0.3575, + "step": 3210 + }, + { + "epoch": 0.6139579349904398, + "grad_norm": 2.125488519668579, + "learning_rate": 5e-06, + "loss": 0.339, + "step": 3211 + }, + { + "epoch": 0.6141491395793499, + "grad_norm": 2.283906936645508, + "learning_rate": 5e-06, + "loss": 0.1835, + "step": 3212 + }, + { + "epoch": 0.61434034416826, + "grad_norm": 2.345285654067993, + "learning_rate": 5e-06, + "loss": 0.1712, + "step": 3213 + }, + { + "epoch": 0.6145315487571702, + "grad_norm": 2.2793378829956055, + "learning_rate": 5e-06, + "loss": 0.3886, + "step": 3214 + }, + { + "epoch": 0.6147227533460803, + "grad_norm": 2.4484992027282715, + "learning_rate": 5e-06, + "loss": 0.4042, + "step": 3215 + }, + { + "epoch": 0.6149139579349905, + "grad_norm": 2.2618656158447266, + "learning_rate": 5e-06, + "loss": 0.2397, + "step": 3216 + }, + { + "epoch": 0.6151051625239006, + "grad_norm": 2.3636066913604736, + "learning_rate": 5e-06, + "loss": 0.1085, + "step": 3217 + }, + { + "epoch": 0.6152963671128107, + "grad_norm": 1.7464470863342285, + "learning_rate": 5e-06, + "loss": 0.0766, + "step": 3218 + }, + { + "epoch": 0.6154875717017209, + "grad_norm": 3.1975414752960205, + "learning_rate": 5e-06, + "loss": 0.1565, + "step": 3219 + }, + { + "epoch": 0.615678776290631, + "grad_norm": 2.530433416366577, + "learning_rate": 5e-06, + "loss": 0.3814, + "step": 3220 + }, + { + "epoch": 0.6158699808795411, + "grad_norm": 1.9266382455825806, + "learning_rate": 5e-06, + "loss": 0.253, + "step": 3221 + }, + { + "epoch": 0.6160611854684512, + "grad_norm": 2.1497132778167725, + "learning_rate": 5e-06, + "loss": 0.2549, + "step": 3222 + }, + { + "epoch": 0.6162523900573614, + "grad_norm": 1.5742179155349731, + "learning_rate": 5e-06, + "loss": 0.1135, + "step": 3223 + }, + { + "epoch": 0.6164435946462715, + "grad_norm": 1.3919093608856201, + "learning_rate": 5e-06, + "loss": 0.0582, + "step": 3224 + }, + { + "epoch": 0.6166347992351816, + "grad_norm": 2.1460893154144287, + "learning_rate": 5e-06, + "loss": 0.1063, + "step": 3225 + }, + { + "epoch": 0.6168260038240918, + "grad_norm": 3.082763671875, + "learning_rate": 5e-06, + "loss": 0.3345, + "step": 3226 + }, + { + "epoch": 0.6170172084130019, + "grad_norm": 2.1403768062591553, + "learning_rate": 5e-06, + "loss": 0.1945, + "step": 3227 + }, + { + "epoch": 0.6172084130019121, + "grad_norm": 2.1468288898468018, + "learning_rate": 5e-06, + "loss": 0.1464, + "step": 3228 + }, + { + "epoch": 0.6173996175908222, + "grad_norm": 1.821694254875183, + "learning_rate": 5e-06, + "loss": 0.1214, + "step": 3229 + }, + { + "epoch": 0.6175908221797323, + "grad_norm": 0.9342910051345825, + "learning_rate": 5e-06, + "loss": 0.0562, + "step": 3230 + }, + { + "epoch": 0.6177820267686425, + "grad_norm": 1.4590644836425781, + "learning_rate": 5e-06, + "loss": 0.0798, + "step": 3231 + }, + { + "epoch": 0.6179732313575526, + "grad_norm": 2.7206311225891113, + "learning_rate": 5e-06, + "loss": 0.3394, + "step": 3232 + }, + { + "epoch": 0.6181644359464628, + "grad_norm": 2.2347638607025146, + "learning_rate": 5e-06, + "loss": 0.152, + "step": 3233 + }, + { + "epoch": 0.6183556405353728, + "grad_norm": 2.2184712886810303, + "learning_rate": 5e-06, + "loss": 0.3338, + "step": 3234 + }, + { + "epoch": 0.6185468451242829, + "grad_norm": 1.4767361879348755, + "learning_rate": 5e-06, + "loss": 0.1558, + "step": 3235 + }, + { + "epoch": 0.6187380497131931, + "grad_norm": 1.7414273023605347, + "learning_rate": 5e-06, + "loss": 0.108, + "step": 3236 + }, + { + "epoch": 0.6189292543021032, + "grad_norm": 2.4318485260009766, + "learning_rate": 5e-06, + "loss": 0.148, + "step": 3237 + }, + { + "epoch": 0.6191204588910134, + "grad_norm": 2.138657808303833, + "learning_rate": 5e-06, + "loss": 0.2567, + "step": 3238 + }, + { + "epoch": 0.6193116634799235, + "grad_norm": 1.6914198398590088, + "learning_rate": 5e-06, + "loss": 0.1906, + "step": 3239 + }, + { + "epoch": 0.6195028680688337, + "grad_norm": 2.306057929992676, + "learning_rate": 5e-06, + "loss": 0.1104, + "step": 3240 + }, + { + "epoch": 0.6196940726577438, + "grad_norm": 2.9438154697418213, + "learning_rate": 5e-06, + "loss": 0.2681, + "step": 3241 + }, + { + "epoch": 0.6198852772466539, + "grad_norm": 0.8288871049880981, + "learning_rate": 5e-06, + "loss": 0.043, + "step": 3242 + }, + { + "epoch": 0.6200764818355641, + "grad_norm": 2.2429580688476562, + "learning_rate": 5e-06, + "loss": 0.2368, + "step": 3243 + }, + { + "epoch": 0.6202676864244742, + "grad_norm": 2.557755708694458, + "learning_rate": 5e-06, + "loss": 0.3005, + "step": 3244 + }, + { + "epoch": 0.6204588910133844, + "grad_norm": 1.9078497886657715, + "learning_rate": 5e-06, + "loss": 0.1692, + "step": 3245 + }, + { + "epoch": 0.6206500956022944, + "grad_norm": 2.475344657897949, + "learning_rate": 5e-06, + "loss": 0.2898, + "step": 3246 + }, + { + "epoch": 0.6208413001912045, + "grad_norm": 2.7360498905181885, + "learning_rate": 5e-06, + "loss": 0.283, + "step": 3247 + }, + { + "epoch": 0.6210325047801147, + "grad_norm": 1.3334991931915283, + "learning_rate": 5e-06, + "loss": 0.1089, + "step": 3248 + }, + { + "epoch": 0.6212237093690248, + "grad_norm": 1.7204349040985107, + "learning_rate": 5e-06, + "loss": 0.1098, + "step": 3249 + }, + { + "epoch": 0.621414913957935, + "grad_norm": 4.411588191986084, + "learning_rate": 5e-06, + "loss": 0.1599, + "step": 3250 + }, + { + "epoch": 0.6216061185468451, + "grad_norm": 1.615220308303833, + "learning_rate": 5e-06, + "loss": 0.128, + "step": 3251 + }, + { + "epoch": 0.6217973231357553, + "grad_norm": 2.4545962810516357, + "learning_rate": 5e-06, + "loss": 0.3416, + "step": 3252 + }, + { + "epoch": 0.6219885277246654, + "grad_norm": 1.9672613143920898, + "learning_rate": 5e-06, + "loss": 0.1455, + "step": 3253 + }, + { + "epoch": 0.6221797323135755, + "grad_norm": 2.6472153663635254, + "learning_rate": 5e-06, + "loss": 0.3045, + "step": 3254 + }, + { + "epoch": 0.6223709369024857, + "grad_norm": 1.7558777332305908, + "learning_rate": 5e-06, + "loss": 0.1715, + "step": 3255 + }, + { + "epoch": 0.6225621414913958, + "grad_norm": 1.1533193588256836, + "learning_rate": 5e-06, + "loss": 0.0723, + "step": 3256 + }, + { + "epoch": 0.622753346080306, + "grad_norm": 2.2439403533935547, + "learning_rate": 5e-06, + "loss": 0.1156, + "step": 3257 + }, + { + "epoch": 0.6229445506692161, + "grad_norm": 2.317619800567627, + "learning_rate": 5e-06, + "loss": 0.2825, + "step": 3258 + }, + { + "epoch": 0.6231357552581261, + "grad_norm": 1.0267956256866455, + "learning_rate": 5e-06, + "loss": 0.0867, + "step": 3259 + }, + { + "epoch": 0.6233269598470363, + "grad_norm": 1.7949508428573608, + "learning_rate": 5e-06, + "loss": 0.1758, + "step": 3260 + }, + { + "epoch": 0.6235181644359464, + "grad_norm": 0.780097246170044, + "learning_rate": 5e-06, + "loss": 0.0627, + "step": 3261 + }, + { + "epoch": 0.6237093690248566, + "grad_norm": 3.0155014991760254, + "learning_rate": 5e-06, + "loss": 0.125, + "step": 3262 + }, + { + "epoch": 0.6239005736137667, + "grad_norm": 1.4573692083358765, + "learning_rate": 5e-06, + "loss": 0.1154, + "step": 3263 + }, + { + "epoch": 0.6240917782026769, + "grad_norm": 2.20967173576355, + "learning_rate": 5e-06, + "loss": 0.265, + "step": 3264 + }, + { + "epoch": 0.624282982791587, + "grad_norm": 2.5910797119140625, + "learning_rate": 5e-06, + "loss": 0.2157, + "step": 3265 + }, + { + "epoch": 0.6244741873804971, + "grad_norm": 1.6436420679092407, + "learning_rate": 5e-06, + "loss": 0.1606, + "step": 3266 + }, + { + "epoch": 0.6246653919694073, + "grad_norm": 1.099359154701233, + "learning_rate": 5e-06, + "loss": 0.0784, + "step": 3267 + }, + { + "epoch": 0.6248565965583174, + "grad_norm": 2.09850811958313, + "learning_rate": 5e-06, + "loss": 0.1143, + "step": 3268 + }, + { + "epoch": 0.6250478011472276, + "grad_norm": 2.030501365661621, + "learning_rate": 5e-06, + "loss": 0.1151, + "step": 3269 + }, + { + "epoch": 0.6252390057361377, + "grad_norm": 1.77583646774292, + "learning_rate": 5e-06, + "loss": 0.2257, + "step": 3270 + }, + { + "epoch": 0.6254302103250478, + "grad_norm": 2.196660280227661, + "learning_rate": 5e-06, + "loss": 0.2526, + "step": 3271 + }, + { + "epoch": 0.625621414913958, + "grad_norm": 2.8073387145996094, + "learning_rate": 5e-06, + "loss": 0.1847, + "step": 3272 + }, + { + "epoch": 0.625812619502868, + "grad_norm": 1.610040307044983, + "learning_rate": 5e-06, + "loss": 0.0844, + "step": 3273 + }, + { + "epoch": 0.6260038240917782, + "grad_norm": 4.642181396484375, + "learning_rate": 5e-06, + "loss": 0.1936, + "step": 3274 + }, + { + "epoch": 0.6261950286806883, + "grad_norm": 1.8837900161743164, + "learning_rate": 5e-06, + "loss": 0.1336, + "step": 3275 + }, + { + "epoch": 0.6263862332695985, + "grad_norm": 2.7366063594818115, + "learning_rate": 5e-06, + "loss": 0.3408, + "step": 3276 + }, + { + "epoch": 0.6265774378585086, + "grad_norm": 2.5217320919036865, + "learning_rate": 5e-06, + "loss": 0.3369, + "step": 3277 + }, + { + "epoch": 0.6267686424474187, + "grad_norm": 2.0307939052581787, + "learning_rate": 5e-06, + "loss": 0.2868, + "step": 3278 + }, + { + "epoch": 0.6269598470363289, + "grad_norm": 2.022230386734009, + "learning_rate": 5e-06, + "loss": 0.2864, + "step": 3279 + }, + { + "epoch": 0.627151051625239, + "grad_norm": 1.771817684173584, + "learning_rate": 5e-06, + "loss": 0.086, + "step": 3280 + }, + { + "epoch": 0.6273422562141492, + "grad_norm": 1.1006555557250977, + "learning_rate": 5e-06, + "loss": 0.036, + "step": 3281 + }, + { + "epoch": 0.6275334608030593, + "grad_norm": 1.6326366662979126, + "learning_rate": 5e-06, + "loss": 0.1208, + "step": 3282 + }, + { + "epoch": 0.6277246653919694, + "grad_norm": 2.166058301925659, + "learning_rate": 5e-06, + "loss": 0.3915, + "step": 3283 + }, + { + "epoch": 0.6279158699808796, + "grad_norm": 1.752289056777954, + "learning_rate": 5e-06, + "loss": 0.1433, + "step": 3284 + }, + { + "epoch": 0.6281070745697896, + "grad_norm": 1.6301438808441162, + "learning_rate": 5e-06, + "loss": 0.1176, + "step": 3285 + }, + { + "epoch": 0.6282982791586998, + "grad_norm": 3.006061553955078, + "learning_rate": 5e-06, + "loss": 0.0893, + "step": 3286 + }, + { + "epoch": 0.6284894837476099, + "grad_norm": 1.6843498945236206, + "learning_rate": 5e-06, + "loss": 0.0768, + "step": 3287 + }, + { + "epoch": 0.62868068833652, + "grad_norm": 2.755248546600342, + "learning_rate": 5e-06, + "loss": 0.4565, + "step": 3288 + }, + { + "epoch": 0.6288718929254302, + "grad_norm": 1.122692584991455, + "learning_rate": 5e-06, + "loss": 0.1002, + "step": 3289 + }, + { + "epoch": 0.6290630975143403, + "grad_norm": 1.585771918296814, + "learning_rate": 5e-06, + "loss": 0.1586, + "step": 3290 + }, + { + "epoch": 0.6292543021032505, + "grad_norm": 1.6943069696426392, + "learning_rate": 5e-06, + "loss": 0.0915, + "step": 3291 + }, + { + "epoch": 0.6294455066921606, + "grad_norm": 1.3940718173980713, + "learning_rate": 5e-06, + "loss": 0.101, + "step": 3292 + }, + { + "epoch": 0.6296367112810708, + "grad_norm": 1.1676661968231201, + "learning_rate": 5e-06, + "loss": 0.0367, + "step": 3293 + }, + { + "epoch": 0.6298279158699809, + "grad_norm": 2.5931499004364014, + "learning_rate": 5e-06, + "loss": 0.2025, + "step": 3294 + }, + { + "epoch": 0.630019120458891, + "grad_norm": 1.825156331062317, + "learning_rate": 5e-06, + "loss": 0.1334, + "step": 3295 + }, + { + "epoch": 0.6302103250478012, + "grad_norm": 0.9374414086341858, + "learning_rate": 5e-06, + "loss": 0.0959, + "step": 3296 + }, + { + "epoch": 0.6304015296367113, + "grad_norm": 1.9733580350875854, + "learning_rate": 5e-06, + "loss": 0.1064, + "step": 3297 + }, + { + "epoch": 0.6305927342256215, + "grad_norm": 1.1101990938186646, + "learning_rate": 5e-06, + "loss": 0.0681, + "step": 3298 + }, + { + "epoch": 0.6307839388145315, + "grad_norm": 1.4817825555801392, + "learning_rate": 5e-06, + "loss": 0.0942, + "step": 3299 + }, + { + "epoch": 0.6309751434034416, + "grad_norm": 1.6432628631591797, + "learning_rate": 5e-06, + "loss": 0.0721, + "step": 3300 + }, + { + "epoch": 0.6311663479923518, + "grad_norm": 3.0500617027282715, + "learning_rate": 5e-06, + "loss": 0.4934, + "step": 3301 + }, + { + "epoch": 0.6313575525812619, + "grad_norm": 1.2397555112838745, + "learning_rate": 5e-06, + "loss": 0.1243, + "step": 3302 + }, + { + "epoch": 0.6315487571701721, + "grad_norm": 1.4868794679641724, + "learning_rate": 5e-06, + "loss": 0.251, + "step": 3303 + }, + { + "epoch": 0.6317399617590822, + "grad_norm": 1.124295711517334, + "learning_rate": 5e-06, + "loss": 0.1263, + "step": 3304 + }, + { + "epoch": 0.6319311663479924, + "grad_norm": 1.9603227376937866, + "learning_rate": 5e-06, + "loss": 0.2923, + "step": 3305 + }, + { + "epoch": 0.6321223709369025, + "grad_norm": 1.851251482963562, + "learning_rate": 5e-06, + "loss": 0.1242, + "step": 3306 + }, + { + "epoch": 0.6323135755258126, + "grad_norm": 2.5106050968170166, + "learning_rate": 5e-06, + "loss": 0.2472, + "step": 3307 + }, + { + "epoch": 0.6325047801147228, + "grad_norm": 1.8182765245437622, + "learning_rate": 5e-06, + "loss": 0.221, + "step": 3308 + }, + { + "epoch": 0.6326959847036329, + "grad_norm": 1.9880139827728271, + "learning_rate": 5e-06, + "loss": 0.075, + "step": 3309 + }, + { + "epoch": 0.6328871892925431, + "grad_norm": 1.8051013946533203, + "learning_rate": 5e-06, + "loss": 0.1106, + "step": 3310 + }, + { + "epoch": 0.6330783938814531, + "grad_norm": 1.6434516906738281, + "learning_rate": 5e-06, + "loss": 0.1318, + "step": 3311 + }, + { + "epoch": 0.6332695984703632, + "grad_norm": 1.805076241493225, + "learning_rate": 5e-06, + "loss": 0.0783, + "step": 3312 + }, + { + "epoch": 0.6334608030592734, + "grad_norm": 2.02512788772583, + "learning_rate": 5e-06, + "loss": 0.1746, + "step": 3313 + }, + { + "epoch": 0.6336520076481835, + "grad_norm": 1.037001132965088, + "learning_rate": 5e-06, + "loss": 0.0982, + "step": 3314 + }, + { + "epoch": 0.6338432122370937, + "grad_norm": 1.426772117614746, + "learning_rate": 5e-06, + "loss": 0.1482, + "step": 3315 + }, + { + "epoch": 0.6340344168260038, + "grad_norm": 1.1725845336914062, + "learning_rate": 5e-06, + "loss": 0.0841, + "step": 3316 + }, + { + "epoch": 0.634225621414914, + "grad_norm": 1.404199242591858, + "learning_rate": 5e-06, + "loss": 0.0808, + "step": 3317 + }, + { + "epoch": 0.6344168260038241, + "grad_norm": 1.5793739557266235, + "learning_rate": 5e-06, + "loss": 0.1145, + "step": 3318 + }, + { + "epoch": 0.6346080305927342, + "grad_norm": 2.362203359603882, + "learning_rate": 5e-06, + "loss": 0.1639, + "step": 3319 + }, + { + "epoch": 0.6347992351816444, + "grad_norm": 2.5650269985198975, + "learning_rate": 5e-06, + "loss": 0.3538, + "step": 3320 + }, + { + "epoch": 0.6349904397705545, + "grad_norm": 1.7669709920883179, + "learning_rate": 5e-06, + "loss": 0.1892, + "step": 3321 + }, + { + "epoch": 0.6351816443594647, + "grad_norm": 3.3054776191711426, + "learning_rate": 5e-06, + "loss": 0.5275, + "step": 3322 + }, + { + "epoch": 0.6353728489483748, + "grad_norm": 1.7212293148040771, + "learning_rate": 5e-06, + "loss": 0.1369, + "step": 3323 + }, + { + "epoch": 0.6355640535372848, + "grad_norm": 2.0429680347442627, + "learning_rate": 5e-06, + "loss": 0.0866, + "step": 3324 + }, + { + "epoch": 0.635755258126195, + "grad_norm": 1.428225040435791, + "learning_rate": 5e-06, + "loss": 0.0705, + "step": 3325 + }, + { + "epoch": 0.6359464627151051, + "grad_norm": 2.1418018341064453, + "learning_rate": 5e-06, + "loss": 0.3357, + "step": 3326 + }, + { + "epoch": 0.6361376673040153, + "grad_norm": 1.53099524974823, + "learning_rate": 5e-06, + "loss": 0.1017, + "step": 3327 + }, + { + "epoch": 0.6363288718929254, + "grad_norm": 1.5691248178482056, + "learning_rate": 5e-06, + "loss": 0.1724, + "step": 3328 + }, + { + "epoch": 0.6365200764818356, + "grad_norm": 2.0633819103240967, + "learning_rate": 5e-06, + "loss": 0.2149, + "step": 3329 + }, + { + "epoch": 0.6367112810707457, + "grad_norm": 2.932884693145752, + "learning_rate": 5e-06, + "loss": 0.3623, + "step": 3330 + }, + { + "epoch": 0.6369024856596558, + "grad_norm": 1.8070765733718872, + "learning_rate": 5e-06, + "loss": 0.097, + "step": 3331 + }, + { + "epoch": 0.637093690248566, + "grad_norm": 2.7510499954223633, + "learning_rate": 5e-06, + "loss": 0.3189, + "step": 3332 + }, + { + "epoch": 0.6372848948374761, + "grad_norm": 2.5223939418792725, + "learning_rate": 5e-06, + "loss": 0.4771, + "step": 3333 + }, + { + "epoch": 0.6374760994263863, + "grad_norm": 2.9238736629486084, + "learning_rate": 5e-06, + "loss": 0.2895, + "step": 3334 + }, + { + "epoch": 0.6376673040152964, + "grad_norm": 0.8285096883773804, + "learning_rate": 5e-06, + "loss": 0.0522, + "step": 3335 + }, + { + "epoch": 0.6378585086042065, + "grad_norm": 1.8238945007324219, + "learning_rate": 5e-06, + "loss": 0.1194, + "step": 3336 + }, + { + "epoch": 0.6380497131931167, + "grad_norm": 2.4051716327667236, + "learning_rate": 5e-06, + "loss": 0.1254, + "step": 3337 + }, + { + "epoch": 0.6382409177820267, + "grad_norm": 3.326117515563965, + "learning_rate": 5e-06, + "loss": 0.5696, + "step": 3338 + }, + { + "epoch": 0.6384321223709369, + "grad_norm": 2.6902754306793213, + "learning_rate": 5e-06, + "loss": 0.211, + "step": 3339 + }, + { + "epoch": 0.638623326959847, + "grad_norm": 2.737630605697632, + "learning_rate": 5e-06, + "loss": 0.3434, + "step": 3340 + }, + { + "epoch": 0.6388145315487572, + "grad_norm": 1.2848268747329712, + "learning_rate": 5e-06, + "loss": 0.0847, + "step": 3341 + }, + { + "epoch": 0.6390057361376673, + "grad_norm": 2.3309013843536377, + "learning_rate": 5e-06, + "loss": 0.1437, + "step": 3342 + }, + { + "epoch": 0.6391969407265774, + "grad_norm": 2.1647744178771973, + "learning_rate": 5e-06, + "loss": 0.0887, + "step": 3343 + }, + { + "epoch": 0.6393881453154876, + "grad_norm": 1.908402681350708, + "learning_rate": 5e-06, + "loss": 0.1112, + "step": 3344 + }, + { + "epoch": 0.6395793499043977, + "grad_norm": 1.631221055984497, + "learning_rate": 5e-06, + "loss": 0.1566, + "step": 3345 + }, + { + "epoch": 0.6397705544933079, + "grad_norm": 3.7973790168762207, + "learning_rate": 5e-06, + "loss": 0.5618, + "step": 3346 + }, + { + "epoch": 0.639961759082218, + "grad_norm": 1.2875862121582031, + "learning_rate": 5e-06, + "loss": 0.0914, + "step": 3347 + }, + { + "epoch": 0.6401529636711281, + "grad_norm": 1.294562578201294, + "learning_rate": 5e-06, + "loss": 0.0698, + "step": 3348 + }, + { + "epoch": 0.6403441682600383, + "grad_norm": 2.602620840072632, + "learning_rate": 5e-06, + "loss": 0.1843, + "step": 3349 + }, + { + "epoch": 0.6405353728489483, + "grad_norm": 2.0691466331481934, + "learning_rate": 5e-06, + "loss": 0.1112, + "step": 3350 + }, + { + "epoch": 0.6407265774378585, + "grad_norm": 1.8351796865463257, + "learning_rate": 5e-06, + "loss": 0.2551, + "step": 3351 + }, + { + "epoch": 0.6409177820267686, + "grad_norm": 2.922478199005127, + "learning_rate": 5e-06, + "loss": 0.3711, + "step": 3352 + }, + { + "epoch": 0.6411089866156787, + "grad_norm": 1.2165088653564453, + "learning_rate": 5e-06, + "loss": 0.0966, + "step": 3353 + }, + { + "epoch": 0.6413001912045889, + "grad_norm": 1.7305443286895752, + "learning_rate": 5e-06, + "loss": 0.0971, + "step": 3354 + }, + { + "epoch": 0.641491395793499, + "grad_norm": 2.2913918495178223, + "learning_rate": 5e-06, + "loss": 0.0785, + "step": 3355 + }, + { + "epoch": 0.6416826003824092, + "grad_norm": 2.2279272079467773, + "learning_rate": 5e-06, + "loss": 0.143, + "step": 3356 + }, + { + "epoch": 0.6418738049713193, + "grad_norm": 2.924514055252075, + "learning_rate": 5e-06, + "loss": 0.4447, + "step": 3357 + }, + { + "epoch": 0.6420650095602295, + "grad_norm": 2.028049945831299, + "learning_rate": 5e-06, + "loss": 0.2733, + "step": 3358 + }, + { + "epoch": 0.6422562141491396, + "grad_norm": 1.2572370767593384, + "learning_rate": 5e-06, + "loss": 0.1236, + "step": 3359 + }, + { + "epoch": 0.6424474187380497, + "grad_norm": 1.216548204421997, + "learning_rate": 5e-06, + "loss": 0.0975, + "step": 3360 + }, + { + "epoch": 0.6426386233269599, + "grad_norm": 1.6940453052520752, + "learning_rate": 5e-06, + "loss": 0.0758, + "step": 3361 + }, + { + "epoch": 0.64282982791587, + "grad_norm": 1.7871954441070557, + "learning_rate": 5e-06, + "loss": 0.0795, + "step": 3362 + }, + { + "epoch": 0.6430210325047802, + "grad_norm": 1.6503570079803467, + "learning_rate": 5e-06, + "loss": 0.1087, + "step": 3363 + }, + { + "epoch": 0.6432122370936902, + "grad_norm": 2.5519793033599854, + "learning_rate": 5e-06, + "loss": 0.4499, + "step": 3364 + }, + { + "epoch": 0.6434034416826003, + "grad_norm": 1.5863988399505615, + "learning_rate": 5e-06, + "loss": 0.085, + "step": 3365 + }, + { + "epoch": 0.6435946462715105, + "grad_norm": 0.7910031080245972, + "learning_rate": 5e-06, + "loss": 0.068, + "step": 3366 + }, + { + "epoch": 0.6437858508604206, + "grad_norm": 1.732580304145813, + "learning_rate": 5e-06, + "loss": 0.1334, + "step": 3367 + }, + { + "epoch": 0.6439770554493308, + "grad_norm": 2.4041748046875, + "learning_rate": 5e-06, + "loss": 0.3406, + "step": 3368 + }, + { + "epoch": 0.6441682600382409, + "grad_norm": 1.4867603778839111, + "learning_rate": 5e-06, + "loss": 0.0677, + "step": 3369 + }, + { + "epoch": 0.6443594646271511, + "grad_norm": 1.6989647150039673, + "learning_rate": 5e-06, + "loss": 0.1288, + "step": 3370 + }, + { + "epoch": 0.6445506692160612, + "grad_norm": 1.4293967485427856, + "learning_rate": 5e-06, + "loss": 0.1531, + "step": 3371 + }, + { + "epoch": 0.6447418738049713, + "grad_norm": 1.3375941514968872, + "learning_rate": 5e-06, + "loss": 0.1033, + "step": 3372 + }, + { + "epoch": 0.6449330783938815, + "grad_norm": 0.9712766408920288, + "learning_rate": 5e-06, + "loss": 0.0382, + "step": 3373 + }, + { + "epoch": 0.6451242829827916, + "grad_norm": 2.1650278568267822, + "learning_rate": 5e-06, + "loss": 0.2237, + "step": 3374 + }, + { + "epoch": 0.6453154875717018, + "grad_norm": 1.0781400203704834, + "learning_rate": 5e-06, + "loss": 0.0441, + "step": 3375 + }, + { + "epoch": 0.6455066921606119, + "grad_norm": 2.823831081390381, + "learning_rate": 5e-06, + "loss": 0.3515, + "step": 3376 + }, + { + "epoch": 0.6456978967495219, + "grad_norm": 1.8595666885375977, + "learning_rate": 5e-06, + "loss": 0.3496, + "step": 3377 + }, + { + "epoch": 0.6458891013384321, + "grad_norm": 2.762394905090332, + "learning_rate": 5e-06, + "loss": 0.4007, + "step": 3378 + }, + { + "epoch": 0.6460803059273422, + "grad_norm": 1.8678456544876099, + "learning_rate": 5e-06, + "loss": 0.2212, + "step": 3379 + }, + { + "epoch": 0.6462715105162524, + "grad_norm": 1.9255629777908325, + "learning_rate": 5e-06, + "loss": 0.2845, + "step": 3380 + }, + { + "epoch": 0.6464627151051625, + "grad_norm": 2.22768497467041, + "learning_rate": 5e-06, + "loss": 0.0916, + "step": 3381 + }, + { + "epoch": 0.6466539196940727, + "grad_norm": 2.3426296710968018, + "learning_rate": 5e-06, + "loss": 0.4442, + "step": 3382 + }, + { + "epoch": 0.6468451242829828, + "grad_norm": 2.8555681705474854, + "learning_rate": 5e-06, + "loss": 0.1754, + "step": 3383 + }, + { + "epoch": 0.6470363288718929, + "grad_norm": 2.99185848236084, + "learning_rate": 5e-06, + "loss": 0.2644, + "step": 3384 + }, + { + "epoch": 0.6472275334608031, + "grad_norm": 2.5336363315582275, + "learning_rate": 5e-06, + "loss": 0.4541, + "step": 3385 + }, + { + "epoch": 0.6474187380497132, + "grad_norm": 2.059492826461792, + "learning_rate": 5e-06, + "loss": 0.2929, + "step": 3386 + }, + { + "epoch": 0.6476099426386234, + "grad_norm": 1.113668441772461, + "learning_rate": 5e-06, + "loss": 0.1024, + "step": 3387 + }, + { + "epoch": 0.6478011472275335, + "grad_norm": 2.365506649017334, + "learning_rate": 5e-06, + "loss": 0.3054, + "step": 3388 + }, + { + "epoch": 0.6479923518164435, + "grad_norm": 2.7506308555603027, + "learning_rate": 5e-06, + "loss": 0.4685, + "step": 3389 + }, + { + "epoch": 0.6481835564053537, + "grad_norm": 1.8486632108688354, + "learning_rate": 5e-06, + "loss": 0.1623, + "step": 3390 + }, + { + "epoch": 0.6483747609942638, + "grad_norm": 3.001056432723999, + "learning_rate": 5e-06, + "loss": 0.1733, + "step": 3391 + }, + { + "epoch": 0.648565965583174, + "grad_norm": 1.8992429971694946, + "learning_rate": 5e-06, + "loss": 0.1681, + "step": 3392 + }, + { + "epoch": 0.6487571701720841, + "grad_norm": 0.8879466652870178, + "learning_rate": 5e-06, + "loss": 0.0552, + "step": 3393 + }, + { + "epoch": 0.6489483747609943, + "grad_norm": 1.9001550674438477, + "learning_rate": 5e-06, + "loss": 0.1817, + "step": 3394 + }, + { + "epoch": 0.6491395793499044, + "grad_norm": 2.546579122543335, + "learning_rate": 5e-06, + "loss": 0.2765, + "step": 3395 + }, + { + "epoch": 0.6493307839388145, + "grad_norm": 1.5954774618148804, + "learning_rate": 5e-06, + "loss": 0.3091, + "step": 3396 + }, + { + "epoch": 0.6495219885277247, + "grad_norm": 2.02475905418396, + "learning_rate": 5e-06, + "loss": 0.2654, + "step": 3397 + }, + { + "epoch": 0.6497131931166348, + "grad_norm": 2.6336565017700195, + "learning_rate": 5e-06, + "loss": 0.1651, + "step": 3398 + }, + { + "epoch": 0.649904397705545, + "grad_norm": 2.454435348510742, + "learning_rate": 5e-06, + "loss": 0.0972, + "step": 3399 + }, + { + "epoch": 0.6500956022944551, + "grad_norm": 1.3472257852554321, + "learning_rate": 5e-06, + "loss": 0.0623, + "step": 3400 + }, + { + "epoch": 0.6502868068833652, + "grad_norm": 2.588073492050171, + "learning_rate": 5e-06, + "loss": 0.3797, + "step": 3401 + }, + { + "epoch": 0.6504780114722754, + "grad_norm": 1.4676856994628906, + "learning_rate": 5e-06, + "loss": 0.1556, + "step": 3402 + }, + { + "epoch": 0.6506692160611854, + "grad_norm": 1.6890233755111694, + "learning_rate": 5e-06, + "loss": 0.1048, + "step": 3403 + }, + { + "epoch": 0.6508604206500956, + "grad_norm": 2.6827495098114014, + "learning_rate": 5e-06, + "loss": 0.223, + "step": 3404 + }, + { + "epoch": 0.6510516252390057, + "grad_norm": 2.6940207481384277, + "learning_rate": 5e-06, + "loss": 0.1186, + "step": 3405 + }, + { + "epoch": 0.6512428298279158, + "grad_norm": 2.2096269130706787, + "learning_rate": 5e-06, + "loss": 0.1084, + "step": 3406 + }, + { + "epoch": 0.651434034416826, + "grad_norm": 1.9896823167800903, + "learning_rate": 5e-06, + "loss": 0.3026, + "step": 3407 + }, + { + "epoch": 0.6516252390057361, + "grad_norm": 2.7920219898223877, + "learning_rate": 5e-06, + "loss": 0.3761, + "step": 3408 + }, + { + "epoch": 0.6518164435946463, + "grad_norm": 1.8184442520141602, + "learning_rate": 5e-06, + "loss": 0.126, + "step": 3409 + }, + { + "epoch": 0.6520076481835564, + "grad_norm": 1.1374013423919678, + "learning_rate": 5e-06, + "loss": 0.0662, + "step": 3410 + }, + { + "epoch": 0.6521988527724666, + "grad_norm": 2.6526119709014893, + "learning_rate": 5e-06, + "loss": 0.3993, + "step": 3411 + }, + { + "epoch": 0.6523900573613767, + "grad_norm": 2.8433001041412354, + "learning_rate": 5e-06, + "loss": 0.1707, + "step": 3412 + }, + { + "epoch": 0.6525812619502868, + "grad_norm": 0.9985430836677551, + "learning_rate": 5e-06, + "loss": 0.08, + "step": 3413 + }, + { + "epoch": 0.652772466539197, + "grad_norm": 2.4228732585906982, + "learning_rate": 5e-06, + "loss": 0.3744, + "step": 3414 + }, + { + "epoch": 0.652963671128107, + "grad_norm": 1.5509470701217651, + "learning_rate": 5e-06, + "loss": 0.2037, + "step": 3415 + }, + { + "epoch": 0.6531548757170172, + "grad_norm": 1.4869673252105713, + "learning_rate": 5e-06, + "loss": 0.093, + "step": 3416 + }, + { + "epoch": 0.6533460803059273, + "grad_norm": 0.611136257648468, + "learning_rate": 5e-06, + "loss": 0.0593, + "step": 3417 + }, + { + "epoch": 0.6535372848948374, + "grad_norm": 1.8549314737319946, + "learning_rate": 5e-06, + "loss": 0.0853, + "step": 3418 + }, + { + "epoch": 0.6537284894837476, + "grad_norm": 1.9247181415557861, + "learning_rate": 5e-06, + "loss": 0.2044, + "step": 3419 + }, + { + "epoch": 0.6539196940726577, + "grad_norm": 2.686331033706665, + "learning_rate": 5e-06, + "loss": 0.18, + "step": 3420 + }, + { + "epoch": 0.6541108986615679, + "grad_norm": 1.9724444150924683, + "learning_rate": 5e-06, + "loss": 0.2007, + "step": 3421 + }, + { + "epoch": 0.654302103250478, + "grad_norm": 1.914251685142517, + "learning_rate": 5e-06, + "loss": 0.2638, + "step": 3422 + }, + { + "epoch": 0.6544933078393882, + "grad_norm": 1.8022854328155518, + "learning_rate": 5e-06, + "loss": 0.1179, + "step": 3423 + }, + { + "epoch": 0.6546845124282983, + "grad_norm": 2.4049770832061768, + "learning_rate": 5e-06, + "loss": 0.1338, + "step": 3424 + }, + { + "epoch": 0.6548757170172084, + "grad_norm": 1.664724349975586, + "learning_rate": 5e-06, + "loss": 0.0854, + "step": 3425 + }, + { + "epoch": 0.6550669216061186, + "grad_norm": 2.4440340995788574, + "learning_rate": 5e-06, + "loss": 0.2504, + "step": 3426 + }, + { + "epoch": 0.6552581261950287, + "grad_norm": 1.7133796215057373, + "learning_rate": 5e-06, + "loss": 0.148, + "step": 3427 + }, + { + "epoch": 0.6554493307839389, + "grad_norm": 2.668142080307007, + "learning_rate": 5e-06, + "loss": 0.2902, + "step": 3428 + }, + { + "epoch": 0.6556405353728489, + "grad_norm": 2.5641353130340576, + "learning_rate": 5e-06, + "loss": 0.3005, + "step": 3429 + }, + { + "epoch": 0.655831739961759, + "grad_norm": 2.3122782707214355, + "learning_rate": 5e-06, + "loss": 0.1071, + "step": 3430 + }, + { + "epoch": 0.6560229445506692, + "grad_norm": 1.7354764938354492, + "learning_rate": 5e-06, + "loss": 0.0937, + "step": 3431 + }, + { + "epoch": 0.6562141491395793, + "grad_norm": 2.0829017162323, + "learning_rate": 5e-06, + "loss": 0.3719, + "step": 3432 + }, + { + "epoch": 0.6564053537284895, + "grad_norm": 1.4630341529846191, + "learning_rate": 5e-06, + "loss": 0.1413, + "step": 3433 + }, + { + "epoch": 0.6565965583173996, + "grad_norm": 1.0866786241531372, + "learning_rate": 5e-06, + "loss": 0.0894, + "step": 3434 + }, + { + "epoch": 0.6567877629063098, + "grad_norm": 2.1426541805267334, + "learning_rate": 5e-06, + "loss": 0.4141, + "step": 3435 + }, + { + "epoch": 0.6569789674952199, + "grad_norm": 3.4362006187438965, + "learning_rate": 5e-06, + "loss": 0.2099, + "step": 3436 + }, + { + "epoch": 0.65717017208413, + "grad_norm": 2.1101267337799072, + "learning_rate": 5e-06, + "loss": 0.1208, + "step": 3437 + }, + { + "epoch": 0.6573613766730402, + "grad_norm": 1.7151868343353271, + "learning_rate": 5e-06, + "loss": 0.1146, + "step": 3438 + }, + { + "epoch": 0.6575525812619503, + "grad_norm": 1.1048760414123535, + "learning_rate": 5e-06, + "loss": 0.1355, + "step": 3439 + }, + { + "epoch": 0.6577437858508605, + "grad_norm": 2.037752628326416, + "learning_rate": 5e-06, + "loss": 0.2758, + "step": 3440 + }, + { + "epoch": 0.6579349904397706, + "grad_norm": 1.7046453952789307, + "learning_rate": 5e-06, + "loss": 0.1341, + "step": 3441 + }, + { + "epoch": 0.6581261950286806, + "grad_norm": 1.204085350036621, + "learning_rate": 5e-06, + "loss": 0.0877, + "step": 3442 + }, + { + "epoch": 0.6583173996175908, + "grad_norm": 1.8280205726623535, + "learning_rate": 5e-06, + "loss": 0.1489, + "step": 3443 + }, + { + "epoch": 0.6585086042065009, + "grad_norm": 1.589545726776123, + "learning_rate": 5e-06, + "loss": 0.0657, + "step": 3444 + }, + { + "epoch": 0.6586998087954111, + "grad_norm": 2.3866069316864014, + "learning_rate": 5e-06, + "loss": 0.2623, + "step": 3445 + }, + { + "epoch": 0.6588910133843212, + "grad_norm": 1.6342370510101318, + "learning_rate": 5e-06, + "loss": 0.336, + "step": 3446 + }, + { + "epoch": 0.6590822179732314, + "grad_norm": 1.841517448425293, + "learning_rate": 5e-06, + "loss": 0.1794, + "step": 3447 + }, + { + "epoch": 0.6592734225621415, + "grad_norm": 1.5720770359039307, + "learning_rate": 5e-06, + "loss": 0.0911, + "step": 3448 + }, + { + "epoch": 0.6594646271510516, + "grad_norm": 3.1381449699401855, + "learning_rate": 5e-06, + "loss": 0.1462, + "step": 3449 + }, + { + "epoch": 0.6596558317399618, + "grad_norm": 2.0292165279388428, + "learning_rate": 5e-06, + "loss": 0.1095, + "step": 3450 + }, + { + "epoch": 0.6598470363288719, + "grad_norm": 2.1982815265655518, + "learning_rate": 5e-06, + "loss": 0.2026, + "step": 3451 + }, + { + "epoch": 0.6600382409177821, + "grad_norm": 3.106961727142334, + "learning_rate": 5e-06, + "loss": 0.0886, + "step": 3452 + }, + { + "epoch": 0.6602294455066922, + "grad_norm": 2.6791203022003174, + "learning_rate": 5e-06, + "loss": 0.1841, + "step": 3453 + }, + { + "epoch": 0.6604206500956022, + "grad_norm": 0.9955388903617859, + "learning_rate": 5e-06, + "loss": 0.0829, + "step": 3454 + }, + { + "epoch": 0.6606118546845124, + "grad_norm": 1.959532618522644, + "learning_rate": 5e-06, + "loss": 0.1379, + "step": 3455 + }, + { + "epoch": 0.6608030592734225, + "grad_norm": 1.4918891191482544, + "learning_rate": 5e-06, + "loss": 0.0437, + "step": 3456 + }, + { + "epoch": 0.6609942638623327, + "grad_norm": 2.387288808822632, + "learning_rate": 5e-06, + "loss": 0.2389, + "step": 3457 + }, + { + "epoch": 0.6611854684512428, + "grad_norm": 2.7315800189971924, + "learning_rate": 5e-06, + "loss": 0.4122, + "step": 3458 + }, + { + "epoch": 0.661376673040153, + "grad_norm": 3.008780002593994, + "learning_rate": 5e-06, + "loss": 0.5461, + "step": 3459 + }, + { + "epoch": 0.6615678776290631, + "grad_norm": 1.4275813102722168, + "learning_rate": 5e-06, + "loss": 0.0968, + "step": 3460 + }, + { + "epoch": 0.6617590822179732, + "grad_norm": 2.498218297958374, + "learning_rate": 5e-06, + "loss": 0.176, + "step": 3461 + }, + { + "epoch": 0.6619502868068834, + "grad_norm": 2.3589062690734863, + "learning_rate": 5e-06, + "loss": 0.1207, + "step": 3462 + }, + { + "epoch": 0.6621414913957935, + "grad_norm": 1.8128048181533813, + "learning_rate": 5e-06, + "loss": 0.1377, + "step": 3463 + }, + { + "epoch": 0.6623326959847037, + "grad_norm": 2.6147453784942627, + "learning_rate": 5e-06, + "loss": 0.4276, + "step": 3464 + }, + { + "epoch": 0.6625239005736138, + "grad_norm": 1.5070366859436035, + "learning_rate": 5e-06, + "loss": 0.1426, + "step": 3465 + }, + { + "epoch": 0.6627151051625239, + "grad_norm": 1.6204683780670166, + "learning_rate": 5e-06, + "loss": 0.0838, + "step": 3466 + }, + { + "epoch": 0.662906309751434, + "grad_norm": 0.9713101983070374, + "learning_rate": 5e-06, + "loss": 0.1332, + "step": 3467 + }, + { + "epoch": 0.6630975143403441, + "grad_norm": 2.181532621383667, + "learning_rate": 5e-06, + "loss": 0.1285, + "step": 3468 + }, + { + "epoch": 0.6632887189292543, + "grad_norm": 2.44734787940979, + "learning_rate": 5e-06, + "loss": 0.1991, + "step": 3469 + }, + { + "epoch": 0.6634799235181644, + "grad_norm": 3.1234562397003174, + "learning_rate": 5e-06, + "loss": 0.6639, + "step": 3470 + }, + { + "epoch": 0.6636711281070745, + "grad_norm": 1.2073078155517578, + "learning_rate": 5e-06, + "loss": 0.123, + "step": 3471 + }, + { + "epoch": 0.6638623326959847, + "grad_norm": 2.172306537628174, + "learning_rate": 5e-06, + "loss": 0.2327, + "step": 3472 + }, + { + "epoch": 0.6640535372848948, + "grad_norm": 1.4012967348098755, + "learning_rate": 5e-06, + "loss": 0.1017, + "step": 3473 + }, + { + "epoch": 0.664244741873805, + "grad_norm": 1.9511357545852661, + "learning_rate": 5e-06, + "loss": 0.1336, + "step": 3474 + }, + { + "epoch": 0.6644359464627151, + "grad_norm": 4.60706090927124, + "learning_rate": 5e-06, + "loss": 0.1754, + "step": 3475 + }, + { + "epoch": 0.6646271510516253, + "grad_norm": 2.3865597248077393, + "learning_rate": 5e-06, + "loss": 0.3496, + "step": 3476 + }, + { + "epoch": 0.6648183556405354, + "grad_norm": 2.6536009311676025, + "learning_rate": 5e-06, + "loss": 0.2316, + "step": 3477 + }, + { + "epoch": 0.6650095602294455, + "grad_norm": 1.3826388120651245, + "learning_rate": 5e-06, + "loss": 0.159, + "step": 3478 + }, + { + "epoch": 0.6652007648183557, + "grad_norm": 1.5564355850219727, + "learning_rate": 5e-06, + "loss": 0.1318, + "step": 3479 + }, + { + "epoch": 0.6653919694072657, + "grad_norm": 1.9521403312683105, + "learning_rate": 5e-06, + "loss": 0.1619, + "step": 3480 + }, + { + "epoch": 0.665583173996176, + "grad_norm": 1.7389386892318726, + "learning_rate": 5e-06, + "loss": 0.0573, + "step": 3481 + }, + { + "epoch": 0.665774378585086, + "grad_norm": 1.4144505262374878, + "learning_rate": 5e-06, + "loss": 0.245, + "step": 3482 + }, + { + "epoch": 0.6659655831739961, + "grad_norm": 1.4570914506912231, + "learning_rate": 5e-06, + "loss": 0.1495, + "step": 3483 + }, + { + "epoch": 0.6661567877629063, + "grad_norm": 1.8888745307922363, + "learning_rate": 5e-06, + "loss": 0.2384, + "step": 3484 + }, + { + "epoch": 0.6663479923518164, + "grad_norm": 2.382516860961914, + "learning_rate": 5e-06, + "loss": 0.2814, + "step": 3485 + }, + { + "epoch": 0.6665391969407266, + "grad_norm": 1.7876882553100586, + "learning_rate": 5e-06, + "loss": 0.1326, + "step": 3486 + }, + { + "epoch": 0.6667304015296367, + "grad_norm": 1.878159999847412, + "learning_rate": 5e-06, + "loss": 0.1178, + "step": 3487 + }, + { + "epoch": 0.6669216061185469, + "grad_norm": 2.022584915161133, + "learning_rate": 5e-06, + "loss": 0.1486, + "step": 3488 + }, + { + "epoch": 0.667112810707457, + "grad_norm": 2.123523473739624, + "learning_rate": 5e-06, + "loss": 0.1952, + "step": 3489 + }, + { + "epoch": 0.6673040152963671, + "grad_norm": 2.8303143978118896, + "learning_rate": 5e-06, + "loss": 0.3464, + "step": 3490 + }, + { + "epoch": 0.6674952198852773, + "grad_norm": 1.546257495880127, + "learning_rate": 5e-06, + "loss": 0.1826, + "step": 3491 + }, + { + "epoch": 0.6676864244741874, + "grad_norm": 1.338302731513977, + "learning_rate": 5e-06, + "loss": 0.109, + "step": 3492 + }, + { + "epoch": 0.6678776290630976, + "grad_norm": 1.4451181888580322, + "learning_rate": 5e-06, + "loss": 0.1015, + "step": 3493 + }, + { + "epoch": 0.6680688336520076, + "grad_norm": 1.242767095565796, + "learning_rate": 5e-06, + "loss": 0.0837, + "step": 3494 + }, + { + "epoch": 0.6682600382409177, + "grad_norm": 2.2983875274658203, + "learning_rate": 5e-06, + "loss": 0.3422, + "step": 3495 + }, + { + "epoch": 0.6684512428298279, + "grad_norm": 2.008904218673706, + "learning_rate": 5e-06, + "loss": 0.2969, + "step": 3496 + }, + { + "epoch": 0.668642447418738, + "grad_norm": 2.3049376010894775, + "learning_rate": 5e-06, + "loss": 0.3266, + "step": 3497 + }, + { + "epoch": 0.6688336520076482, + "grad_norm": 2.297036647796631, + "learning_rate": 5e-06, + "loss": 0.2258, + "step": 3498 + }, + { + "epoch": 0.6690248565965583, + "grad_norm": 1.6770265102386475, + "learning_rate": 5e-06, + "loss": 0.0976, + "step": 3499 + }, + { + "epoch": 0.6692160611854685, + "grad_norm": 0.9409367442131042, + "learning_rate": 5e-06, + "loss": 0.034, + "step": 3500 + }, + { + "epoch": 0.6692160611854685, + "eval_runtime": 786.6312, + "eval_samples_per_second": 1.95, + "eval_steps_per_second": 0.244, + "step": 3500 + }, + { + "epoch": 0.6694072657743786, + "grad_norm": 1.6677579879760742, + "learning_rate": 5e-06, + "loss": 0.1108, + "step": 3501 + }, + { + "epoch": 0.6695984703632887, + "grad_norm": 1.5019170045852661, + "learning_rate": 5e-06, + "loss": 0.2266, + "step": 3502 + }, + { + "epoch": 0.6697896749521989, + "grad_norm": 2.645740509033203, + "learning_rate": 5e-06, + "loss": 0.3297, + "step": 3503 + }, + { + "epoch": 0.669980879541109, + "grad_norm": 2.4888534545898438, + "learning_rate": 5e-06, + "loss": 0.2745, + "step": 3504 + }, + { + "epoch": 0.6701720841300192, + "grad_norm": 1.4102755784988403, + "learning_rate": 5e-06, + "loss": 0.0735, + "step": 3505 + }, + { + "epoch": 0.6703632887189293, + "grad_norm": 11.278008460998535, + "learning_rate": 5e-06, + "loss": 0.1773, + "step": 3506 + }, + { + "epoch": 0.6705544933078393, + "grad_norm": 2.274052381515503, + "learning_rate": 5e-06, + "loss": 0.1174, + "step": 3507 + }, + { + "epoch": 0.6707456978967495, + "grad_norm": 1.99003005027771, + "learning_rate": 5e-06, + "loss": 0.268, + "step": 3508 + }, + { + "epoch": 0.6709369024856596, + "grad_norm": 1.6028283834457397, + "learning_rate": 5e-06, + "loss": 0.1354, + "step": 3509 + }, + { + "epoch": 0.6711281070745698, + "grad_norm": 2.1967668533325195, + "learning_rate": 5e-06, + "loss": 0.1541, + "step": 3510 + }, + { + "epoch": 0.6713193116634799, + "grad_norm": 2.1785519123077393, + "learning_rate": 5e-06, + "loss": 0.1173, + "step": 3511 + }, + { + "epoch": 0.6715105162523901, + "grad_norm": 1.8150266408920288, + "learning_rate": 5e-06, + "loss": 0.0938, + "step": 3512 + }, + { + "epoch": 0.6717017208413002, + "grad_norm": 1.4618399143218994, + "learning_rate": 5e-06, + "loss": 0.1253, + "step": 3513 + }, + { + "epoch": 0.6718929254302103, + "grad_norm": 3.069847822189331, + "learning_rate": 5e-06, + "loss": 0.5254, + "step": 3514 + }, + { + "epoch": 0.6720841300191205, + "grad_norm": 1.6214332580566406, + "learning_rate": 5e-06, + "loss": 0.1171, + "step": 3515 + }, + { + "epoch": 0.6722753346080306, + "grad_norm": 2.7934930324554443, + "learning_rate": 5e-06, + "loss": 0.2811, + "step": 3516 + }, + { + "epoch": 0.6724665391969408, + "grad_norm": 1.2103495597839355, + "learning_rate": 5e-06, + "loss": 0.075, + "step": 3517 + }, + { + "epoch": 0.6726577437858509, + "grad_norm": 1.6577422618865967, + "learning_rate": 5e-06, + "loss": 0.1344, + "step": 3518 + }, + { + "epoch": 0.672848948374761, + "grad_norm": 1.5630927085876465, + "learning_rate": 5e-06, + "loss": 0.1212, + "step": 3519 + }, + { + "epoch": 0.6730401529636711, + "grad_norm": 3.0157723426818848, + "learning_rate": 5e-06, + "loss": 0.5516, + "step": 3520 + }, + { + "epoch": 0.6732313575525812, + "grad_norm": 2.152554988861084, + "learning_rate": 5e-06, + "loss": 0.3016, + "step": 3521 + }, + { + "epoch": 0.6734225621414914, + "grad_norm": 2.146681547164917, + "learning_rate": 5e-06, + "loss": 0.4707, + "step": 3522 + }, + { + "epoch": 0.6736137667304015, + "grad_norm": 2.0331432819366455, + "learning_rate": 5e-06, + "loss": 0.1046, + "step": 3523 + }, + { + "epoch": 0.6738049713193117, + "grad_norm": 1.3789559602737427, + "learning_rate": 5e-06, + "loss": 0.076, + "step": 3524 + }, + { + "epoch": 0.6739961759082218, + "grad_norm": 0.8906503319740295, + "learning_rate": 5e-06, + "loss": 0.0667, + "step": 3525 + }, + { + "epoch": 0.6741873804971319, + "grad_norm": 2.7111520767211914, + "learning_rate": 5e-06, + "loss": 0.3719, + "step": 3526 + }, + { + "epoch": 0.6743785850860421, + "grad_norm": 2.096019744873047, + "learning_rate": 5e-06, + "loss": 0.3089, + "step": 3527 + }, + { + "epoch": 0.6745697896749522, + "grad_norm": 1.231835961341858, + "learning_rate": 5e-06, + "loss": 0.0677, + "step": 3528 + }, + { + "epoch": 0.6747609942638624, + "grad_norm": 1.7469233274459839, + "learning_rate": 5e-06, + "loss": 0.0908, + "step": 3529 + }, + { + "epoch": 0.6749521988527725, + "grad_norm": 0.5927736163139343, + "learning_rate": 5e-06, + "loss": 0.0342, + "step": 3530 + }, + { + "epoch": 0.6751434034416826, + "grad_norm": 3.6534321308135986, + "learning_rate": 5e-06, + "loss": 0.1764, + "step": 3531 + }, + { + "epoch": 0.6753346080305928, + "grad_norm": 1.5184030532836914, + "learning_rate": 5e-06, + "loss": 0.1878, + "step": 3532 + }, + { + "epoch": 0.6755258126195028, + "grad_norm": 1.760200023651123, + "learning_rate": 5e-06, + "loss": 0.1343, + "step": 3533 + }, + { + "epoch": 0.675717017208413, + "grad_norm": 2.823066234588623, + "learning_rate": 5e-06, + "loss": 0.3365, + "step": 3534 + }, + { + "epoch": 0.6759082217973231, + "grad_norm": 1.697743535041809, + "learning_rate": 5e-06, + "loss": 0.1053, + "step": 3535 + }, + { + "epoch": 0.6760994263862332, + "grad_norm": 1.5451792478561401, + "learning_rate": 5e-06, + "loss": 0.1168, + "step": 3536 + }, + { + "epoch": 0.6762906309751434, + "grad_norm": 2.006969690322876, + "learning_rate": 5e-06, + "loss": 0.1252, + "step": 3537 + }, + { + "epoch": 0.6764818355640535, + "grad_norm": 2.5118765830993652, + "learning_rate": 5e-06, + "loss": 0.3674, + "step": 3538 + }, + { + "epoch": 0.6766730401529637, + "grad_norm": 2.0223960876464844, + "learning_rate": 5e-06, + "loss": 0.2248, + "step": 3539 + }, + { + "epoch": 0.6768642447418738, + "grad_norm": 1.5017013549804688, + "learning_rate": 5e-06, + "loss": 0.1731, + "step": 3540 + }, + { + "epoch": 0.677055449330784, + "grad_norm": 1.1571253538131714, + "learning_rate": 5e-06, + "loss": 0.1285, + "step": 3541 + }, + { + "epoch": 0.6772466539196941, + "grad_norm": 0.9846636652946472, + "learning_rate": 5e-06, + "loss": 0.092, + "step": 3542 + }, + { + "epoch": 0.6774378585086042, + "grad_norm": 1.6082611083984375, + "learning_rate": 5e-06, + "loss": 0.0561, + "step": 3543 + }, + { + "epoch": 0.6776290630975144, + "grad_norm": 2.4058167934417725, + "learning_rate": 5e-06, + "loss": 0.1535, + "step": 3544 + }, + { + "epoch": 0.6778202676864244, + "grad_norm": 1.8551628589630127, + "learning_rate": 5e-06, + "loss": 0.227, + "step": 3545 + }, + { + "epoch": 0.6780114722753346, + "grad_norm": 2.5418436527252197, + "learning_rate": 5e-06, + "loss": 0.4509, + "step": 3546 + }, + { + "epoch": 0.6782026768642447, + "grad_norm": 1.8027793169021606, + "learning_rate": 5e-06, + "loss": 0.1219, + "step": 3547 + }, + { + "epoch": 0.6783938814531548, + "grad_norm": 1.5673744678497314, + "learning_rate": 5e-06, + "loss": 0.2043, + "step": 3548 + }, + { + "epoch": 0.678585086042065, + "grad_norm": 1.2510675191879272, + "learning_rate": 5e-06, + "loss": 0.0827, + "step": 3549 + }, + { + "epoch": 0.6787762906309751, + "grad_norm": 1.4993951320648193, + "learning_rate": 5e-06, + "loss": 0.0514, + "step": 3550 + }, + { + "epoch": 0.6789674952198853, + "grad_norm": 2.15187931060791, + "learning_rate": 5e-06, + "loss": 0.2552, + "step": 3551 + }, + { + "epoch": 0.6791586998087954, + "grad_norm": 2.6083855628967285, + "learning_rate": 5e-06, + "loss": 0.4038, + "step": 3552 + }, + { + "epoch": 0.6793499043977056, + "grad_norm": 2.16745924949646, + "learning_rate": 5e-06, + "loss": 0.2588, + "step": 3553 + }, + { + "epoch": 0.6795411089866157, + "grad_norm": 1.6964585781097412, + "learning_rate": 5e-06, + "loss": 0.1775, + "step": 3554 + }, + { + "epoch": 0.6797323135755258, + "grad_norm": 2.437232732772827, + "learning_rate": 5e-06, + "loss": 0.1336, + "step": 3555 + }, + { + "epoch": 0.679923518164436, + "grad_norm": 1.8153268098831177, + "learning_rate": 5e-06, + "loss": 0.095, + "step": 3556 + }, + { + "epoch": 0.6801147227533461, + "grad_norm": 2.4039676189422607, + "learning_rate": 5e-06, + "loss": 0.2803, + "step": 3557 + }, + { + "epoch": 0.6803059273422563, + "grad_norm": 3.8345534801483154, + "learning_rate": 5e-06, + "loss": 0.2945, + "step": 3558 + }, + { + "epoch": 0.6804971319311663, + "grad_norm": 2.759597063064575, + "learning_rate": 5e-06, + "loss": 0.2355, + "step": 3559 + }, + { + "epoch": 0.6806883365200764, + "grad_norm": 1.6006584167480469, + "learning_rate": 5e-06, + "loss": 0.1212, + "step": 3560 + }, + { + "epoch": 0.6808795411089866, + "grad_norm": 1.7173149585723877, + "learning_rate": 5e-06, + "loss": 0.1152, + "step": 3561 + }, + { + "epoch": 0.6810707456978967, + "grad_norm": 1.0151087045669556, + "learning_rate": 5e-06, + "loss": 0.0712, + "step": 3562 + }, + { + "epoch": 0.6812619502868069, + "grad_norm": 1.4937494993209839, + "learning_rate": 5e-06, + "loss": 0.0642, + "step": 3563 + }, + { + "epoch": 0.681453154875717, + "grad_norm": 1.3477442264556885, + "learning_rate": 5e-06, + "loss": 0.1069, + "step": 3564 + }, + { + "epoch": 0.6816443594646272, + "grad_norm": 1.9342656135559082, + "learning_rate": 5e-06, + "loss": 0.4498, + "step": 3565 + }, + { + "epoch": 0.6818355640535373, + "grad_norm": 2.319324254989624, + "learning_rate": 5e-06, + "loss": 0.3148, + "step": 3566 + }, + { + "epoch": 0.6820267686424474, + "grad_norm": 2.8179759979248047, + "learning_rate": 5e-06, + "loss": 0.362, + "step": 3567 + }, + { + "epoch": 0.6822179732313576, + "grad_norm": 2.6950948238372803, + "learning_rate": 5e-06, + "loss": 0.0819, + "step": 3568 + }, + { + "epoch": 0.6824091778202677, + "grad_norm": 2.1264123916625977, + "learning_rate": 5e-06, + "loss": 0.1911, + "step": 3569 + }, + { + "epoch": 0.6826003824091779, + "grad_norm": 2.7540876865386963, + "learning_rate": 5e-06, + "loss": 0.6599, + "step": 3570 + }, + { + "epoch": 0.682791586998088, + "grad_norm": 2.1270487308502197, + "learning_rate": 5e-06, + "loss": 0.3011, + "step": 3571 + }, + { + "epoch": 0.682982791586998, + "grad_norm": 2.540781021118164, + "learning_rate": 5e-06, + "loss": 0.4374, + "step": 3572 + }, + { + "epoch": 0.6831739961759082, + "grad_norm": 1.4416691064834595, + "learning_rate": 5e-06, + "loss": 0.118, + "step": 3573 + }, + { + "epoch": 0.6833652007648183, + "grad_norm": 1.5677932500839233, + "learning_rate": 5e-06, + "loss": 0.0783, + "step": 3574 + }, + { + "epoch": 0.6835564053537285, + "grad_norm": 2.3644917011260986, + "learning_rate": 5e-06, + "loss": 0.0778, + "step": 3575 + }, + { + "epoch": 0.6837476099426386, + "grad_norm": 2.1362829208374023, + "learning_rate": 5e-06, + "loss": 0.2909, + "step": 3576 + }, + { + "epoch": 0.6839388145315488, + "grad_norm": 3.296583890914917, + "learning_rate": 5e-06, + "loss": 0.4958, + "step": 3577 + }, + { + "epoch": 0.6841300191204589, + "grad_norm": 1.5384232997894287, + "learning_rate": 5e-06, + "loss": 0.1698, + "step": 3578 + }, + { + "epoch": 0.684321223709369, + "grad_norm": 2.5473268032073975, + "learning_rate": 5e-06, + "loss": 0.2099, + "step": 3579 + }, + { + "epoch": 0.6845124282982792, + "grad_norm": 1.863061547279358, + "learning_rate": 5e-06, + "loss": 0.1071, + "step": 3580 + }, + { + "epoch": 0.6847036328871893, + "grad_norm": 2.5463814735412598, + "learning_rate": 5e-06, + "loss": 0.0986, + "step": 3581 + }, + { + "epoch": 0.6848948374760995, + "grad_norm": 1.9547677040100098, + "learning_rate": 5e-06, + "loss": 0.2291, + "step": 3582 + }, + { + "epoch": 0.6850860420650096, + "grad_norm": 1.2805721759796143, + "learning_rate": 5e-06, + "loss": 0.1025, + "step": 3583 + }, + { + "epoch": 0.6852772466539196, + "grad_norm": 2.379103183746338, + "learning_rate": 5e-06, + "loss": 0.3174, + "step": 3584 + }, + { + "epoch": 0.6854684512428298, + "grad_norm": 1.308780312538147, + "learning_rate": 5e-06, + "loss": 0.1225, + "step": 3585 + }, + { + "epoch": 0.6856596558317399, + "grad_norm": 2.074080228805542, + "learning_rate": 5e-06, + "loss": 0.1967, + "step": 3586 + }, + { + "epoch": 0.6858508604206501, + "grad_norm": 1.9603567123413086, + "learning_rate": 5e-06, + "loss": 0.1479, + "step": 3587 + }, + { + "epoch": 0.6860420650095602, + "grad_norm": 1.7865123748779297, + "learning_rate": 5e-06, + "loss": 0.1054, + "step": 3588 + }, + { + "epoch": 0.6862332695984703, + "grad_norm": 3.6250503063201904, + "learning_rate": 5e-06, + "loss": 0.4375, + "step": 3589 + }, + { + "epoch": 0.6864244741873805, + "grad_norm": 2.0481324195861816, + "learning_rate": 5e-06, + "loss": 0.1674, + "step": 3590 + }, + { + "epoch": 0.6866156787762906, + "grad_norm": 2.7558176517486572, + "learning_rate": 5e-06, + "loss": 0.1392, + "step": 3591 + }, + { + "epoch": 0.6868068833652008, + "grad_norm": 1.8074321746826172, + "learning_rate": 5e-06, + "loss": 0.1178, + "step": 3592 + }, + { + "epoch": 0.6869980879541109, + "grad_norm": 1.997239351272583, + "learning_rate": 5e-06, + "loss": 0.1253, + "step": 3593 + }, + { + "epoch": 0.6871892925430211, + "grad_norm": 2.378185987472534, + "learning_rate": 5e-06, + "loss": 0.3194, + "step": 3594 + }, + { + "epoch": 0.6873804971319312, + "grad_norm": 2.6446828842163086, + "learning_rate": 5e-06, + "loss": 0.4638, + "step": 3595 + }, + { + "epoch": 0.6875717017208413, + "grad_norm": 2.4682164192199707, + "learning_rate": 5e-06, + "loss": 0.3686, + "step": 3596 + }, + { + "epoch": 0.6877629063097515, + "grad_norm": 1.9417331218719482, + "learning_rate": 5e-06, + "loss": 0.2668, + "step": 3597 + }, + { + "epoch": 0.6879541108986615, + "grad_norm": 1.9014066457748413, + "learning_rate": 5e-06, + "loss": 0.1559, + "step": 3598 + }, + { + "epoch": 0.6881453154875717, + "grad_norm": 1.8162672519683838, + "learning_rate": 5e-06, + "loss": 0.1298, + "step": 3599 + }, + { + "epoch": 0.6883365200764818, + "grad_norm": 1.8843580484390259, + "learning_rate": 5e-06, + "loss": 0.1089, + "step": 3600 + }, + { + "epoch": 0.6885277246653919, + "grad_norm": 1.4576982259750366, + "learning_rate": 5e-06, + "loss": 0.1632, + "step": 3601 + }, + { + "epoch": 0.6887189292543021, + "grad_norm": 3.355142831802368, + "learning_rate": 5e-06, + "loss": 0.2791, + "step": 3602 + }, + { + "epoch": 0.6889101338432122, + "grad_norm": 1.7258144617080688, + "learning_rate": 5e-06, + "loss": 0.2368, + "step": 3603 + }, + { + "epoch": 0.6891013384321224, + "grad_norm": 1.7668720483779907, + "learning_rate": 5e-06, + "loss": 0.132, + "step": 3604 + }, + { + "epoch": 0.6892925430210325, + "grad_norm": 1.8233599662780762, + "learning_rate": 5e-06, + "loss": 0.0762, + "step": 3605 + }, + { + "epoch": 0.6894837476099427, + "grad_norm": 2.0434274673461914, + "learning_rate": 5e-06, + "loss": 0.071, + "step": 3606 + }, + { + "epoch": 0.6896749521988528, + "grad_norm": 2.575824022293091, + "learning_rate": 5e-06, + "loss": 0.4385, + "step": 3607 + }, + { + "epoch": 0.6898661567877629, + "grad_norm": 3.179419755935669, + "learning_rate": 5e-06, + "loss": 0.4052, + "step": 3608 + }, + { + "epoch": 0.6900573613766731, + "grad_norm": 2.368760347366333, + "learning_rate": 5e-06, + "loss": 0.4097, + "step": 3609 + }, + { + "epoch": 0.6902485659655831, + "grad_norm": 1.6637201309204102, + "learning_rate": 5e-06, + "loss": 0.1209, + "step": 3610 + }, + { + "epoch": 0.6904397705544933, + "grad_norm": 1.4172734022140503, + "learning_rate": 5e-06, + "loss": 0.1386, + "step": 3611 + }, + { + "epoch": 0.6906309751434034, + "grad_norm": 2.328533411026001, + "learning_rate": 5e-06, + "loss": 0.2946, + "step": 3612 + }, + { + "epoch": 0.6908221797323135, + "grad_norm": 1.887550711631775, + "learning_rate": 5e-06, + "loss": 0.1478, + "step": 3613 + }, + { + "epoch": 0.6910133843212237, + "grad_norm": 1.8322516679763794, + "learning_rate": 5e-06, + "loss": 0.1951, + "step": 3614 + }, + { + "epoch": 0.6912045889101338, + "grad_norm": 1.0440239906311035, + "learning_rate": 5e-06, + "loss": 0.1293, + "step": 3615 + }, + { + "epoch": 0.691395793499044, + "grad_norm": 2.5914018154144287, + "learning_rate": 5e-06, + "loss": 0.3502, + "step": 3616 + }, + { + "epoch": 0.6915869980879541, + "grad_norm": 2.4446136951446533, + "learning_rate": 5e-06, + "loss": 0.1806, + "step": 3617 + }, + { + "epoch": 0.6917782026768643, + "grad_norm": 1.4871900081634521, + "learning_rate": 5e-06, + "loss": 0.0797, + "step": 3618 + }, + { + "epoch": 0.6919694072657744, + "grad_norm": 2.3120055198669434, + "learning_rate": 5e-06, + "loss": 0.1075, + "step": 3619 + }, + { + "epoch": 0.6921606118546845, + "grad_norm": 1.4232051372528076, + "learning_rate": 5e-06, + "loss": 0.1484, + "step": 3620 + }, + { + "epoch": 0.6923518164435947, + "grad_norm": 3.1616592407226562, + "learning_rate": 5e-06, + "loss": 0.5661, + "step": 3621 + }, + { + "epoch": 0.6925430210325048, + "grad_norm": 0.8112595081329346, + "learning_rate": 5e-06, + "loss": 0.0917, + "step": 3622 + }, + { + "epoch": 0.692734225621415, + "grad_norm": 1.0516867637634277, + "learning_rate": 5e-06, + "loss": 0.0633, + "step": 3623 + }, + { + "epoch": 0.692925430210325, + "grad_norm": 1.4513195753097534, + "learning_rate": 5e-06, + "loss": 0.0825, + "step": 3624 + }, + { + "epoch": 0.6931166347992351, + "grad_norm": 2.015878200531006, + "learning_rate": 5e-06, + "loss": 0.1085, + "step": 3625 + }, + { + "epoch": 0.6933078393881453, + "grad_norm": 2.6140496730804443, + "learning_rate": 5e-06, + "loss": 0.5673, + "step": 3626 + }, + { + "epoch": 0.6934990439770554, + "grad_norm": 2.6710994243621826, + "learning_rate": 5e-06, + "loss": 0.1457, + "step": 3627 + }, + { + "epoch": 0.6936902485659656, + "grad_norm": 1.289347529411316, + "learning_rate": 5e-06, + "loss": 0.0971, + "step": 3628 + }, + { + "epoch": 0.6938814531548757, + "grad_norm": 1.6329401731491089, + "learning_rate": 5e-06, + "loss": 0.1046, + "step": 3629 + }, + { + "epoch": 0.6940726577437859, + "grad_norm": 1.3376904726028442, + "learning_rate": 5e-06, + "loss": 0.0794, + "step": 3630 + }, + { + "epoch": 0.694263862332696, + "grad_norm": 2.8650128841400146, + "learning_rate": 5e-06, + "loss": 0.1426, + "step": 3631 + }, + { + "epoch": 0.6944550669216061, + "grad_norm": 3.3484082221984863, + "learning_rate": 5e-06, + "loss": 0.5369, + "step": 3632 + }, + { + "epoch": 0.6946462715105163, + "grad_norm": 2.710831880569458, + "learning_rate": 5e-06, + "loss": 0.5368, + "step": 3633 + }, + { + "epoch": 0.6948374760994264, + "grad_norm": 1.1810380220413208, + "learning_rate": 5e-06, + "loss": 0.1185, + "step": 3634 + }, + { + "epoch": 0.6950286806883366, + "grad_norm": 0.5851054787635803, + "learning_rate": 5e-06, + "loss": 0.0376, + "step": 3635 + }, + { + "epoch": 0.6952198852772467, + "grad_norm": 3.097205400466919, + "learning_rate": 5e-06, + "loss": 0.2126, + "step": 3636 + }, + { + "epoch": 0.6954110898661567, + "grad_norm": 1.8246523141860962, + "learning_rate": 5e-06, + "loss": 0.1213, + "step": 3637 + }, + { + "epoch": 0.6956022944550669, + "grad_norm": 1.682036280632019, + "learning_rate": 5e-06, + "loss": 0.0875, + "step": 3638 + }, + { + "epoch": 0.695793499043977, + "grad_norm": 3.0946667194366455, + "learning_rate": 5e-06, + "loss": 0.6476, + "step": 3639 + }, + { + "epoch": 0.6959847036328872, + "grad_norm": 1.0556777715682983, + "learning_rate": 5e-06, + "loss": 0.0874, + "step": 3640 + }, + { + "epoch": 0.6961759082217973, + "grad_norm": 1.5672385692596436, + "learning_rate": 5e-06, + "loss": 0.1432, + "step": 3641 + }, + { + "epoch": 0.6963671128107075, + "grad_norm": 1.1081948280334473, + "learning_rate": 5e-06, + "loss": 0.1039, + "step": 3642 + }, + { + "epoch": 0.6965583173996176, + "grad_norm": 3.239100694656372, + "learning_rate": 5e-06, + "loss": 0.3624, + "step": 3643 + }, + { + "epoch": 0.6967495219885277, + "grad_norm": 1.96116304397583, + "learning_rate": 5e-06, + "loss": 0.1417, + "step": 3644 + }, + { + "epoch": 0.6969407265774379, + "grad_norm": 2.3989415168762207, + "learning_rate": 5e-06, + "loss": 0.3667, + "step": 3645 + }, + { + "epoch": 0.697131931166348, + "grad_norm": 1.7304235696792603, + "learning_rate": 5e-06, + "loss": 0.3046, + "step": 3646 + }, + { + "epoch": 0.6973231357552582, + "grad_norm": 1.469014286994934, + "learning_rate": 5e-06, + "loss": 0.1986, + "step": 3647 + }, + { + "epoch": 0.6975143403441683, + "grad_norm": 2.6511895656585693, + "learning_rate": 5e-06, + "loss": 0.1332, + "step": 3648 + }, + { + "epoch": 0.6977055449330783, + "grad_norm": 4.76706600189209, + "learning_rate": 5e-06, + "loss": 0.3297, + "step": 3649 + }, + { + "epoch": 0.6978967495219885, + "grad_norm": 1.9821535348892212, + "learning_rate": 5e-06, + "loss": 0.0866, + "step": 3650 + }, + { + "epoch": 0.6980879541108986, + "grad_norm": 1.3378146886825562, + "learning_rate": 5e-06, + "loss": 0.1156, + "step": 3651 + }, + { + "epoch": 0.6982791586998088, + "grad_norm": 1.85330069065094, + "learning_rate": 5e-06, + "loss": 0.2699, + "step": 3652 + }, + { + "epoch": 0.6984703632887189, + "grad_norm": 2.159931182861328, + "learning_rate": 5e-06, + "loss": 0.3349, + "step": 3653 + }, + { + "epoch": 0.698661567877629, + "grad_norm": 2.206817150115967, + "learning_rate": 5e-06, + "loss": 0.1541, + "step": 3654 + }, + { + "epoch": 0.6988527724665392, + "grad_norm": 2.111424446105957, + "learning_rate": 5e-06, + "loss": 0.2197, + "step": 3655 + }, + { + "epoch": 0.6990439770554493, + "grad_norm": 2.3077478408813477, + "learning_rate": 5e-06, + "loss": 0.3232, + "step": 3656 + }, + { + "epoch": 0.6992351816443595, + "grad_norm": 2.5184547901153564, + "learning_rate": 5e-06, + "loss": 0.1036, + "step": 3657 + }, + { + "epoch": 0.6994263862332696, + "grad_norm": 2.8431849479675293, + "learning_rate": 5e-06, + "loss": 0.5781, + "step": 3658 + }, + { + "epoch": 0.6996175908221798, + "grad_norm": 2.026881694793701, + "learning_rate": 5e-06, + "loss": 0.3044, + "step": 3659 + }, + { + "epoch": 0.6998087954110899, + "grad_norm": 1.5575225353240967, + "learning_rate": 5e-06, + "loss": 0.0619, + "step": 3660 + }, + { + "epoch": 0.7, + "grad_norm": 1.6749697923660278, + "learning_rate": 5e-06, + "loss": 0.1297, + "step": 3661 + }, + { + "epoch": 0.7001912045889102, + "grad_norm": 1.1908769607543945, + "learning_rate": 5e-06, + "loss": 0.0485, + "step": 3662 + }, + { + "epoch": 0.7003824091778202, + "grad_norm": 2.0859763622283936, + "learning_rate": 5e-06, + "loss": 0.2337, + "step": 3663 + }, + { + "epoch": 0.7005736137667304, + "grad_norm": 1.7970099449157715, + "learning_rate": 5e-06, + "loss": 0.2513, + "step": 3664 + }, + { + "epoch": 0.7007648183556405, + "grad_norm": 2.7064900398254395, + "learning_rate": 5e-06, + "loss": 0.3637, + "step": 3665 + }, + { + "epoch": 0.7009560229445506, + "grad_norm": 3.3071577548980713, + "learning_rate": 5e-06, + "loss": 0.3557, + "step": 3666 + }, + { + "epoch": 0.7011472275334608, + "grad_norm": 1.6861001253128052, + "learning_rate": 5e-06, + "loss": 0.1279, + "step": 3667 + }, + { + "epoch": 0.7013384321223709, + "grad_norm": 1.3847708702087402, + "learning_rate": 5e-06, + "loss": 0.0656, + "step": 3668 + }, + { + "epoch": 0.7015296367112811, + "grad_norm": 3.5664327144622803, + "learning_rate": 5e-06, + "loss": 0.1027, + "step": 3669 + }, + { + "epoch": 0.7017208413001912, + "grad_norm": 3.75789213180542, + "learning_rate": 5e-06, + "loss": 0.5611, + "step": 3670 + }, + { + "epoch": 0.7019120458891014, + "grad_norm": 2.001065254211426, + "learning_rate": 5e-06, + "loss": 0.2467, + "step": 3671 + }, + { + "epoch": 0.7021032504780115, + "grad_norm": 2.242720603942871, + "learning_rate": 5e-06, + "loss": 0.1748, + "step": 3672 + }, + { + "epoch": 0.7022944550669216, + "grad_norm": 1.567443609237671, + "learning_rate": 5e-06, + "loss": 0.1055, + "step": 3673 + }, + { + "epoch": 0.7024856596558318, + "grad_norm": 1.640781044960022, + "learning_rate": 5e-06, + "loss": 0.0745, + "step": 3674 + }, + { + "epoch": 0.7026768642447419, + "grad_norm": 1.6443709135055542, + "learning_rate": 5e-06, + "loss": 0.1, + "step": 3675 + }, + { + "epoch": 0.702868068833652, + "grad_norm": 2.157893419265747, + "learning_rate": 5e-06, + "loss": 0.2708, + "step": 3676 + }, + { + "epoch": 0.7030592734225621, + "grad_norm": 1.9968886375427246, + "learning_rate": 5e-06, + "loss": 0.1628, + "step": 3677 + }, + { + "epoch": 0.7032504780114722, + "grad_norm": 1.2432177066802979, + "learning_rate": 5e-06, + "loss": 0.0915, + "step": 3678 + }, + { + "epoch": 0.7034416826003824, + "grad_norm": 1.2381335496902466, + "learning_rate": 5e-06, + "loss": 0.1092, + "step": 3679 + }, + { + "epoch": 0.7036328871892925, + "grad_norm": 1.1749067306518555, + "learning_rate": 5e-06, + "loss": 0.101, + "step": 3680 + }, + { + "epoch": 0.7038240917782027, + "grad_norm": 3.458665370941162, + "learning_rate": 5e-06, + "loss": 0.1514, + "step": 3681 + }, + { + "epoch": 0.7040152963671128, + "grad_norm": 2.4206576347351074, + "learning_rate": 5e-06, + "loss": 0.2704, + "step": 3682 + }, + { + "epoch": 0.704206500956023, + "grad_norm": 1.8550755977630615, + "learning_rate": 5e-06, + "loss": 0.2404, + "step": 3683 + }, + { + "epoch": 0.7043977055449331, + "grad_norm": 1.5968091487884521, + "learning_rate": 5e-06, + "loss": 0.0821, + "step": 3684 + }, + { + "epoch": 0.7045889101338432, + "grad_norm": 2.3588109016418457, + "learning_rate": 5e-06, + "loss": 0.1878, + "step": 3685 + }, + { + "epoch": 0.7047801147227534, + "grad_norm": 1.6067357063293457, + "learning_rate": 5e-06, + "loss": 0.0753, + "step": 3686 + }, + { + "epoch": 0.7049713193116635, + "grad_norm": 1.9464937448501587, + "learning_rate": 5e-06, + "loss": 0.0893, + "step": 3687 + }, + { + "epoch": 0.7051625239005737, + "grad_norm": 2.355674982070923, + "learning_rate": 5e-06, + "loss": 0.1453, + "step": 3688 + }, + { + "epoch": 0.7053537284894837, + "grad_norm": 3.20164155960083, + "learning_rate": 5e-06, + "loss": 0.4407, + "step": 3689 + }, + { + "epoch": 0.7055449330783938, + "grad_norm": 2.1521313190460205, + "learning_rate": 5e-06, + "loss": 0.1517, + "step": 3690 + }, + { + "epoch": 0.705736137667304, + "grad_norm": 1.2731857299804688, + "learning_rate": 5e-06, + "loss": 0.1149, + "step": 3691 + }, + { + "epoch": 0.7059273422562141, + "grad_norm": 1.5699528455734253, + "learning_rate": 5e-06, + "loss": 0.0717, + "step": 3692 + }, + { + "epoch": 0.7061185468451243, + "grad_norm": 1.5166230201721191, + "learning_rate": 5e-06, + "loss": 0.0978, + "step": 3693 + }, + { + "epoch": 0.7063097514340344, + "grad_norm": 1.5976234674453735, + "learning_rate": 5e-06, + "loss": 0.118, + "step": 3694 + }, + { + "epoch": 0.7065009560229446, + "grad_norm": 2.420431137084961, + "learning_rate": 5e-06, + "loss": 0.3126, + "step": 3695 + }, + { + "epoch": 0.7066921606118547, + "grad_norm": 2.7159039974212646, + "learning_rate": 5e-06, + "loss": 0.5264, + "step": 3696 + }, + { + "epoch": 0.7068833652007648, + "grad_norm": 2.4235072135925293, + "learning_rate": 5e-06, + "loss": 0.4054, + "step": 3697 + }, + { + "epoch": 0.707074569789675, + "grad_norm": 1.8437212705612183, + "learning_rate": 5e-06, + "loss": 0.2117, + "step": 3698 + }, + { + "epoch": 0.7072657743785851, + "grad_norm": 1.3941651582717896, + "learning_rate": 5e-06, + "loss": 0.1185, + "step": 3699 + }, + { + "epoch": 0.7074569789674953, + "grad_norm": 1.5819287300109863, + "learning_rate": 5e-06, + "loss": 0.1037, + "step": 3700 + }, + { + "epoch": 0.7076481835564054, + "grad_norm": 3.036449670791626, + "learning_rate": 5e-06, + "loss": 0.4601, + "step": 3701 + }, + { + "epoch": 0.7078393881453154, + "grad_norm": 1.64691960811615, + "learning_rate": 5e-06, + "loss": 0.2143, + "step": 3702 + }, + { + "epoch": 0.7080305927342256, + "grad_norm": 1.3194284439086914, + "learning_rate": 5e-06, + "loss": 0.1746, + "step": 3703 + }, + { + "epoch": 0.7082217973231357, + "grad_norm": 1.3173198699951172, + "learning_rate": 5e-06, + "loss": 0.1449, + "step": 3704 + }, + { + "epoch": 0.7084130019120459, + "grad_norm": 2.3822851181030273, + "learning_rate": 5e-06, + "loss": 0.2857, + "step": 3705 + }, + { + "epoch": 0.708604206500956, + "grad_norm": 0.9149222373962402, + "learning_rate": 5e-06, + "loss": 0.0497, + "step": 3706 + }, + { + "epoch": 0.7087954110898662, + "grad_norm": 2.2636375427246094, + "learning_rate": 5e-06, + "loss": 0.2972, + "step": 3707 + }, + { + "epoch": 0.7089866156787763, + "grad_norm": 1.518246054649353, + "learning_rate": 5e-06, + "loss": 0.1799, + "step": 3708 + }, + { + "epoch": 0.7091778202676864, + "grad_norm": 1.2613885402679443, + "learning_rate": 5e-06, + "loss": 0.1001, + "step": 3709 + }, + { + "epoch": 0.7093690248565966, + "grad_norm": 2.623847723007202, + "learning_rate": 5e-06, + "loss": 0.2827, + "step": 3710 + }, + { + "epoch": 0.7095602294455067, + "grad_norm": 1.9074766635894775, + "learning_rate": 5e-06, + "loss": 0.0718, + "step": 3711 + }, + { + "epoch": 0.7097514340344169, + "grad_norm": 2.9987568855285645, + "learning_rate": 5e-06, + "loss": 0.1303, + "step": 3712 + }, + { + "epoch": 0.709942638623327, + "grad_norm": 1.0691415071487427, + "learning_rate": 5e-06, + "loss": 0.0659, + "step": 3713 + }, + { + "epoch": 0.710133843212237, + "grad_norm": 3.7600460052490234, + "learning_rate": 5e-06, + "loss": 0.86, + "step": 3714 + }, + { + "epoch": 0.7103250478011472, + "grad_norm": 1.6110256910324097, + "learning_rate": 5e-06, + "loss": 0.1318, + "step": 3715 + }, + { + "epoch": 0.7105162523900573, + "grad_norm": 2.0216493606567383, + "learning_rate": 5e-06, + "loss": 0.2925, + "step": 3716 + }, + { + "epoch": 0.7107074569789675, + "grad_norm": 1.7469826936721802, + "learning_rate": 5e-06, + "loss": 0.2517, + "step": 3717 + }, + { + "epoch": 0.7108986615678776, + "grad_norm": 1.7995054721832275, + "learning_rate": 5e-06, + "loss": 0.1195, + "step": 3718 + }, + { + "epoch": 0.7110898661567877, + "grad_norm": 2.31731915473938, + "learning_rate": 5e-06, + "loss": 0.1143, + "step": 3719 + }, + { + "epoch": 0.7112810707456979, + "grad_norm": 3.077112913131714, + "learning_rate": 5e-06, + "loss": 0.7406, + "step": 3720 + }, + { + "epoch": 0.711472275334608, + "grad_norm": 3.16402530670166, + "learning_rate": 5e-06, + "loss": 0.465, + "step": 3721 + }, + { + "epoch": 0.7116634799235182, + "grad_norm": 2.755120038986206, + "learning_rate": 5e-06, + "loss": 0.2713, + "step": 3722 + }, + { + "epoch": 0.7118546845124283, + "grad_norm": 1.240608811378479, + "learning_rate": 5e-06, + "loss": 0.0644, + "step": 3723 + }, + { + "epoch": 0.7120458891013385, + "grad_norm": 1.4552818536758423, + "learning_rate": 5e-06, + "loss": 0.1006, + "step": 3724 + }, + { + "epoch": 0.7122370936902486, + "grad_norm": 1.733351707458496, + "learning_rate": 5e-06, + "loss": 0.1208, + "step": 3725 + }, + { + "epoch": 0.7124282982791587, + "grad_norm": 2.0551533699035645, + "learning_rate": 5e-06, + "loss": 0.3772, + "step": 3726 + }, + { + "epoch": 0.7126195028680689, + "grad_norm": 3.2883100509643555, + "learning_rate": 5e-06, + "loss": 0.5013, + "step": 3727 + }, + { + "epoch": 0.7128107074569789, + "grad_norm": 1.076749563217163, + "learning_rate": 5e-06, + "loss": 0.1152, + "step": 3728 + }, + { + "epoch": 0.7130019120458891, + "grad_norm": 2.451246976852417, + "learning_rate": 5e-06, + "loss": 0.2865, + "step": 3729 + }, + { + "epoch": 0.7131931166347992, + "grad_norm": 1.9702866077423096, + "learning_rate": 5e-06, + "loss": 0.2526, + "step": 3730 + }, + { + "epoch": 0.7133843212237093, + "grad_norm": 2.362886667251587, + "learning_rate": 5e-06, + "loss": 0.1457, + "step": 3731 + }, + { + "epoch": 0.7135755258126195, + "grad_norm": 2.858502149581909, + "learning_rate": 5e-06, + "loss": 0.4665, + "step": 3732 + }, + { + "epoch": 0.7137667304015296, + "grad_norm": 1.5832735300064087, + "learning_rate": 5e-06, + "loss": 0.1224, + "step": 3733 + }, + { + "epoch": 0.7139579349904398, + "grad_norm": 1.0978338718414307, + "learning_rate": 5e-06, + "loss": 0.1259, + "step": 3734 + }, + { + "epoch": 0.7141491395793499, + "grad_norm": 2.182143211364746, + "learning_rate": 5e-06, + "loss": 0.2562, + "step": 3735 + }, + { + "epoch": 0.7143403441682601, + "grad_norm": 2.5188608169555664, + "learning_rate": 5e-06, + "loss": 0.2833, + "step": 3736 + }, + { + "epoch": 0.7145315487571702, + "grad_norm": 2.875762701034546, + "learning_rate": 5e-06, + "loss": 0.163, + "step": 3737 + }, + { + "epoch": 0.7147227533460803, + "grad_norm": 1.844509243965149, + "learning_rate": 5e-06, + "loss": 0.123, + "step": 3738 + }, + { + "epoch": 0.7149139579349905, + "grad_norm": 2.9208295345306396, + "learning_rate": 5e-06, + "loss": 0.3887, + "step": 3739 + }, + { + "epoch": 0.7151051625239006, + "grad_norm": 2.217679977416992, + "learning_rate": 5e-06, + "loss": 0.2439, + "step": 3740 + }, + { + "epoch": 0.7152963671128107, + "grad_norm": 2.5245535373687744, + "learning_rate": 5e-06, + "loss": 0.3163, + "step": 3741 + }, + { + "epoch": 0.7154875717017208, + "grad_norm": 1.9459753036499023, + "learning_rate": 5e-06, + "loss": 0.1344, + "step": 3742 + }, + { + "epoch": 0.7156787762906309, + "grad_norm": 2.5548386573791504, + "learning_rate": 5e-06, + "loss": 0.0823, + "step": 3743 + }, + { + "epoch": 0.7158699808795411, + "grad_norm": 1.1367732286453247, + "learning_rate": 5e-06, + "loss": 0.0603, + "step": 3744 + }, + { + "epoch": 0.7160611854684512, + "grad_norm": 1.7649719715118408, + "learning_rate": 5e-06, + "loss": 0.2555, + "step": 3745 + }, + { + "epoch": 0.7162523900573614, + "grad_norm": 2.476152181625366, + "learning_rate": 5e-06, + "loss": 0.2607, + "step": 3746 + }, + { + "epoch": 0.7164435946462715, + "grad_norm": 1.505000114440918, + "learning_rate": 5e-06, + "loss": 0.1498, + "step": 3747 + }, + { + "epoch": 0.7166347992351817, + "grad_norm": 1.3874759674072266, + "learning_rate": 5e-06, + "loss": 0.0955, + "step": 3748 + }, + { + "epoch": 0.7168260038240918, + "grad_norm": 1.742844581604004, + "learning_rate": 5e-06, + "loss": 0.1141, + "step": 3749 + }, + { + "epoch": 0.7170172084130019, + "grad_norm": 2.4118449687957764, + "learning_rate": 5e-06, + "loss": 0.176, + "step": 3750 + }, + { + "epoch": 0.7172084130019121, + "grad_norm": 1.4940874576568604, + "learning_rate": 5e-06, + "loss": 0.1553, + "step": 3751 + }, + { + "epoch": 0.7173996175908222, + "grad_norm": 1.963732361793518, + "learning_rate": 5e-06, + "loss": 0.1711, + "step": 3752 + }, + { + "epoch": 0.7175908221797324, + "grad_norm": 1.9962869882583618, + "learning_rate": 5e-06, + "loss": 0.2072, + "step": 3753 + }, + { + "epoch": 0.7177820267686424, + "grad_norm": 1.2074509859085083, + "learning_rate": 5e-06, + "loss": 0.1099, + "step": 3754 + }, + { + "epoch": 0.7179732313575525, + "grad_norm": 1.593335509300232, + "learning_rate": 5e-06, + "loss": 0.1583, + "step": 3755 + }, + { + "epoch": 0.7181644359464627, + "grad_norm": 1.2521311044692993, + "learning_rate": 5e-06, + "loss": 0.0686, + "step": 3756 + }, + { + "epoch": 0.7183556405353728, + "grad_norm": 2.19175124168396, + "learning_rate": 5e-06, + "loss": 0.3097, + "step": 3757 + }, + { + "epoch": 0.718546845124283, + "grad_norm": 1.7197211980819702, + "learning_rate": 5e-06, + "loss": 0.1602, + "step": 3758 + }, + { + "epoch": 0.7187380497131931, + "grad_norm": 2.827324867248535, + "learning_rate": 5e-06, + "loss": 0.5046, + "step": 3759 + }, + { + "epoch": 0.7189292543021033, + "grad_norm": 2.1176669597625732, + "learning_rate": 5e-06, + "loss": 0.0945, + "step": 3760 + }, + { + "epoch": 0.7191204588910134, + "grad_norm": 1.07952880859375, + "learning_rate": 5e-06, + "loss": 0.0495, + "step": 3761 + }, + { + "epoch": 0.7193116634799235, + "grad_norm": 2.3675689697265625, + "learning_rate": 5e-06, + "loss": 0.2094, + "step": 3762 + }, + { + "epoch": 0.7195028680688337, + "grad_norm": 2.8417303562164307, + "learning_rate": 5e-06, + "loss": 0.4245, + "step": 3763 + }, + { + "epoch": 0.7196940726577438, + "grad_norm": 3.0691707134246826, + "learning_rate": 5e-06, + "loss": 0.5019, + "step": 3764 + }, + { + "epoch": 0.719885277246654, + "grad_norm": 1.418975591659546, + "learning_rate": 5e-06, + "loss": 0.1174, + "step": 3765 + }, + { + "epoch": 0.720076481835564, + "grad_norm": 2.2532196044921875, + "learning_rate": 5e-06, + "loss": 0.1952, + "step": 3766 + }, + { + "epoch": 0.7202676864244741, + "grad_norm": 1.7203007936477661, + "learning_rate": 5e-06, + "loss": 0.1286, + "step": 3767 + }, + { + "epoch": 0.7204588910133843, + "grad_norm": 2.12467885017395, + "learning_rate": 5e-06, + "loss": 0.155, + "step": 3768 + }, + { + "epoch": 0.7206500956022944, + "grad_norm": 1.4739885330200195, + "learning_rate": 5e-06, + "loss": 0.0862, + "step": 3769 + }, + { + "epoch": 0.7208413001912046, + "grad_norm": 3.242621898651123, + "learning_rate": 5e-06, + "loss": 0.6335, + "step": 3770 + }, + { + "epoch": 0.7210325047801147, + "grad_norm": 2.5275886058807373, + "learning_rate": 5e-06, + "loss": 0.4305, + "step": 3771 + }, + { + "epoch": 0.7212237093690248, + "grad_norm": 1.7171810865402222, + "learning_rate": 5e-06, + "loss": 0.1955, + "step": 3772 + }, + { + "epoch": 0.721414913957935, + "grad_norm": 1.231096625328064, + "learning_rate": 5e-06, + "loss": 0.092, + "step": 3773 + }, + { + "epoch": 0.7216061185468451, + "grad_norm": 1.7485922574996948, + "learning_rate": 5e-06, + "loss": 0.1059, + "step": 3774 + }, + { + "epoch": 0.7217973231357553, + "grad_norm": 2.117539882659912, + "learning_rate": 5e-06, + "loss": 0.1215, + "step": 3775 + }, + { + "epoch": 0.7219885277246654, + "grad_norm": 3.3793411254882812, + "learning_rate": 5e-06, + "loss": 0.5776, + "step": 3776 + }, + { + "epoch": 0.7221797323135756, + "grad_norm": 1.7460490465164185, + "learning_rate": 5e-06, + "loss": 0.1292, + "step": 3777 + }, + { + "epoch": 0.7223709369024857, + "grad_norm": 2.388984441757202, + "learning_rate": 5e-06, + "loss": 0.3883, + "step": 3778 + }, + { + "epoch": 0.7225621414913957, + "grad_norm": 2.096761703491211, + "learning_rate": 5e-06, + "loss": 0.1509, + "step": 3779 + }, + { + "epoch": 0.722753346080306, + "grad_norm": 1.783759593963623, + "learning_rate": 5e-06, + "loss": 0.2602, + "step": 3780 + }, + { + "epoch": 0.722944550669216, + "grad_norm": 1.4529781341552734, + "learning_rate": 5e-06, + "loss": 0.0799, + "step": 3781 + }, + { + "epoch": 0.7231357552581262, + "grad_norm": 2.474672317504883, + "learning_rate": 5e-06, + "loss": 0.2724, + "step": 3782 + }, + { + "epoch": 0.7233269598470363, + "grad_norm": 2.139212131500244, + "learning_rate": 5e-06, + "loss": 0.236, + "step": 3783 + }, + { + "epoch": 0.7235181644359464, + "grad_norm": 2.131075859069824, + "learning_rate": 5e-06, + "loss": 0.3092, + "step": 3784 + }, + { + "epoch": 0.7237093690248566, + "grad_norm": 1.3767306804656982, + "learning_rate": 5e-06, + "loss": 0.1263, + "step": 3785 + }, + { + "epoch": 0.7239005736137667, + "grad_norm": 1.0231573581695557, + "learning_rate": 5e-06, + "loss": 0.0584, + "step": 3786 + }, + { + "epoch": 0.7240917782026769, + "grad_norm": 1.745872139930725, + "learning_rate": 5e-06, + "loss": 0.132, + "step": 3787 + }, + { + "epoch": 0.724282982791587, + "grad_norm": 2.5864417552948, + "learning_rate": 5e-06, + "loss": 0.4019, + "step": 3788 + }, + { + "epoch": 0.7244741873804972, + "grad_norm": 1.4365683794021606, + "learning_rate": 5e-06, + "loss": 0.089, + "step": 3789 + }, + { + "epoch": 0.7246653919694073, + "grad_norm": 2.468015193939209, + "learning_rate": 5e-06, + "loss": 0.3741, + "step": 3790 + }, + { + "epoch": 0.7248565965583174, + "grad_norm": 3.0470778942108154, + "learning_rate": 5e-06, + "loss": 0.4877, + "step": 3791 + }, + { + "epoch": 0.7250478011472276, + "grad_norm": 1.820553183555603, + "learning_rate": 5e-06, + "loss": 0.1056, + "step": 3792 + }, + { + "epoch": 0.7252390057361376, + "grad_norm": 1.5439107418060303, + "learning_rate": 5e-06, + "loss": 0.0992, + "step": 3793 + }, + { + "epoch": 0.7254302103250478, + "grad_norm": 2.6746387481689453, + "learning_rate": 5e-06, + "loss": 0.1281, + "step": 3794 + }, + { + "epoch": 0.7256214149139579, + "grad_norm": 2.123269557952881, + "learning_rate": 5e-06, + "loss": 0.2376, + "step": 3795 + }, + { + "epoch": 0.725812619502868, + "grad_norm": 1.2966785430908203, + "learning_rate": 5e-06, + "loss": 0.107, + "step": 3796 + }, + { + "epoch": 0.7260038240917782, + "grad_norm": 1.3482623100280762, + "learning_rate": 5e-06, + "loss": 0.1186, + "step": 3797 + }, + { + "epoch": 0.7261950286806883, + "grad_norm": 1.8395178318023682, + "learning_rate": 5e-06, + "loss": 0.2555, + "step": 3798 + }, + { + "epoch": 0.7263862332695985, + "grad_norm": 1.2841962575912476, + "learning_rate": 5e-06, + "loss": 0.0857, + "step": 3799 + }, + { + "epoch": 0.7265774378585086, + "grad_norm": 2.664163827896118, + "learning_rate": 5e-06, + "loss": 0.1759, + "step": 3800 + }, + { + "epoch": 0.7267686424474188, + "grad_norm": 2.564634323120117, + "learning_rate": 5e-06, + "loss": 0.4485, + "step": 3801 + }, + { + "epoch": 0.7269598470363289, + "grad_norm": 2.6707558631896973, + "learning_rate": 5e-06, + "loss": 0.4055, + "step": 3802 + }, + { + "epoch": 0.727151051625239, + "grad_norm": 1.4090027809143066, + "learning_rate": 5e-06, + "loss": 0.1511, + "step": 3803 + }, + { + "epoch": 0.7273422562141492, + "grad_norm": 1.7918213605880737, + "learning_rate": 5e-06, + "loss": 0.1137, + "step": 3804 + }, + { + "epoch": 0.7275334608030593, + "grad_norm": 1.4990943670272827, + "learning_rate": 5e-06, + "loss": 0.166, + "step": 3805 + }, + { + "epoch": 0.7277246653919694, + "grad_norm": 2.7204136848449707, + "learning_rate": 5e-06, + "loss": 0.1131, + "step": 3806 + }, + { + "epoch": 0.7279158699808795, + "grad_norm": 2.5874335765838623, + "learning_rate": 5e-06, + "loss": 0.1842, + "step": 3807 + }, + { + "epoch": 0.7281070745697896, + "grad_norm": 2.7016761302948, + "learning_rate": 5e-06, + "loss": 0.3025, + "step": 3808 + }, + { + "epoch": 0.7282982791586998, + "grad_norm": 1.1666982173919678, + "learning_rate": 5e-06, + "loss": 0.1359, + "step": 3809 + }, + { + "epoch": 0.7284894837476099, + "grad_norm": 1.7149765491485596, + "learning_rate": 5e-06, + "loss": 0.0882, + "step": 3810 + }, + { + "epoch": 0.7286806883365201, + "grad_norm": 2.1318461894989014, + "learning_rate": 5e-06, + "loss": 0.1363, + "step": 3811 + }, + { + "epoch": 0.7288718929254302, + "grad_norm": 1.7442179918289185, + "learning_rate": 5e-06, + "loss": 0.1217, + "step": 3812 + }, + { + "epoch": 0.7290630975143404, + "grad_norm": 2.2777628898620605, + "learning_rate": 5e-06, + "loss": 0.1651, + "step": 3813 + }, + { + "epoch": 0.7292543021032505, + "grad_norm": 6.601057052612305, + "learning_rate": 5e-06, + "loss": 0.5386, + "step": 3814 + }, + { + "epoch": 0.7294455066921606, + "grad_norm": 1.6855504512786865, + "learning_rate": 5e-06, + "loss": 0.1608, + "step": 3815 + }, + { + "epoch": 0.7296367112810708, + "grad_norm": 1.998500943183899, + "learning_rate": 5e-06, + "loss": 0.2681, + "step": 3816 + }, + { + "epoch": 0.7298279158699809, + "grad_norm": 1.0050654411315918, + "learning_rate": 5e-06, + "loss": 0.1253, + "step": 3817 + }, + { + "epoch": 0.7300191204588911, + "grad_norm": 2.510183811187744, + "learning_rate": 5e-06, + "loss": 0.2464, + "step": 3818 + }, + { + "epoch": 0.7302103250478011, + "grad_norm": 1.5420417785644531, + "learning_rate": 5e-06, + "loss": 0.0933, + "step": 3819 + }, + { + "epoch": 0.7304015296367112, + "grad_norm": 2.9417569637298584, + "learning_rate": 5e-06, + "loss": 0.5287, + "step": 3820 + }, + { + "epoch": 0.7305927342256214, + "grad_norm": 2.2762291431427, + "learning_rate": 5e-06, + "loss": 0.2584, + "step": 3821 + }, + { + "epoch": 0.7307839388145315, + "grad_norm": 3.564465045928955, + "learning_rate": 5e-06, + "loss": 0.2454, + "step": 3822 + }, + { + "epoch": 0.7309751434034417, + "grad_norm": 1.3632512092590332, + "learning_rate": 5e-06, + "loss": 0.1078, + "step": 3823 + }, + { + "epoch": 0.7311663479923518, + "grad_norm": 2.418616771697998, + "learning_rate": 5e-06, + "loss": 0.1328, + "step": 3824 + }, + { + "epoch": 0.731357552581262, + "grad_norm": 1.0035635232925415, + "learning_rate": 5e-06, + "loss": 0.0504, + "step": 3825 + }, + { + "epoch": 0.7315487571701721, + "grad_norm": 2.8982577323913574, + "learning_rate": 5e-06, + "loss": 0.5397, + "step": 3826 + }, + { + "epoch": 0.7317399617590822, + "grad_norm": 2.096374273300171, + "learning_rate": 5e-06, + "loss": 0.3442, + "step": 3827 + }, + { + "epoch": 0.7319311663479924, + "grad_norm": 2.2000863552093506, + "learning_rate": 5e-06, + "loss": 0.1408, + "step": 3828 + }, + { + "epoch": 0.7321223709369025, + "grad_norm": 2.5556576251983643, + "learning_rate": 5e-06, + "loss": 0.2146, + "step": 3829 + }, + { + "epoch": 0.7323135755258127, + "grad_norm": 1.023587942123413, + "learning_rate": 5e-06, + "loss": 0.0599, + "step": 3830 + }, + { + "epoch": 0.7325047801147228, + "grad_norm": 2.018570899963379, + "learning_rate": 5e-06, + "loss": 0.1337, + "step": 3831 + }, + { + "epoch": 0.7326959847036328, + "grad_norm": 1.6819159984588623, + "learning_rate": 5e-06, + "loss": 0.1293, + "step": 3832 + }, + { + "epoch": 0.732887189292543, + "grad_norm": 10.953661918640137, + "learning_rate": 5e-06, + "loss": 0.1749, + "step": 3833 + }, + { + "epoch": 0.7330783938814531, + "grad_norm": 2.3553245067596436, + "learning_rate": 5e-06, + "loss": 0.1587, + "step": 3834 + }, + { + "epoch": 0.7332695984703633, + "grad_norm": 3.415651798248291, + "learning_rate": 5e-06, + "loss": 0.6602, + "step": 3835 + }, + { + "epoch": 0.7334608030592734, + "grad_norm": 1.051983118057251, + "learning_rate": 5e-06, + "loss": 0.0589, + "step": 3836 + }, + { + "epoch": 0.7336520076481835, + "grad_norm": 1.977689504623413, + "learning_rate": 5e-06, + "loss": 0.1378, + "step": 3837 + }, + { + "epoch": 0.7338432122370937, + "grad_norm": 2.170513391494751, + "learning_rate": 5e-06, + "loss": 0.2598, + "step": 3838 + }, + { + "epoch": 0.7340344168260038, + "grad_norm": 2.0110361576080322, + "learning_rate": 5e-06, + "loss": 0.3082, + "step": 3839 + }, + { + "epoch": 0.734225621414914, + "grad_norm": 1.711444616317749, + "learning_rate": 5e-06, + "loss": 0.159, + "step": 3840 + }, + { + "epoch": 0.7344168260038241, + "grad_norm": 1.1422127485275269, + "learning_rate": 5e-06, + "loss": 0.1196, + "step": 3841 + }, + { + "epoch": 0.7346080305927343, + "grad_norm": 1.628873586654663, + "learning_rate": 5e-06, + "loss": 0.0867, + "step": 3842 + }, + { + "epoch": 0.7347992351816444, + "grad_norm": 2.754671335220337, + "learning_rate": 5e-06, + "loss": 0.2112, + "step": 3843 + }, + { + "epoch": 0.7349904397705544, + "grad_norm": 2.050431728363037, + "learning_rate": 5e-06, + "loss": 0.2173, + "step": 3844 + }, + { + "epoch": 0.7351816443594646, + "grad_norm": 1.6508783102035522, + "learning_rate": 5e-06, + "loss": 0.1271, + "step": 3845 + }, + { + "epoch": 0.7353728489483747, + "grad_norm": 1.9832112789154053, + "learning_rate": 5e-06, + "loss": 0.2658, + "step": 3846 + }, + { + "epoch": 0.7355640535372849, + "grad_norm": 1.4163705110549927, + "learning_rate": 5e-06, + "loss": 0.0685, + "step": 3847 + }, + { + "epoch": 0.735755258126195, + "grad_norm": 2.1025025844573975, + "learning_rate": 5e-06, + "loss": 0.0973, + "step": 3848 + }, + { + "epoch": 0.7359464627151051, + "grad_norm": 1.321691632270813, + "learning_rate": 5e-06, + "loss": 0.0648, + "step": 3849 + }, + { + "epoch": 0.7361376673040153, + "grad_norm": 1.1205233335494995, + "learning_rate": 5e-06, + "loss": 0.0491, + "step": 3850 + }, + { + "epoch": 0.7363288718929254, + "grad_norm": 1.8860313892364502, + "learning_rate": 5e-06, + "loss": 0.246, + "step": 3851 + }, + { + "epoch": 0.7365200764818356, + "grad_norm": 2.046706438064575, + "learning_rate": 5e-06, + "loss": 0.3516, + "step": 3852 + }, + { + "epoch": 0.7367112810707457, + "grad_norm": 1.3845429420471191, + "learning_rate": 5e-06, + "loss": 0.115, + "step": 3853 + }, + { + "epoch": 0.7369024856596559, + "grad_norm": 1.796648383140564, + "learning_rate": 5e-06, + "loss": 0.1873, + "step": 3854 + }, + { + "epoch": 0.737093690248566, + "grad_norm": 1.5792717933654785, + "learning_rate": 5e-06, + "loss": 0.2649, + "step": 3855 + }, + { + "epoch": 0.7372848948374761, + "grad_norm": 1.4286028146743774, + "learning_rate": 5e-06, + "loss": 0.0831, + "step": 3856 + }, + { + "epoch": 0.7374760994263863, + "grad_norm": 1.319799780845642, + "learning_rate": 5e-06, + "loss": 0.1116, + "step": 3857 + }, + { + "epoch": 0.7376673040152963, + "grad_norm": 1.7097197771072388, + "learning_rate": 5e-06, + "loss": 0.1823, + "step": 3858 + }, + { + "epoch": 0.7378585086042065, + "grad_norm": 2.7568390369415283, + "learning_rate": 5e-06, + "loss": 0.1829, + "step": 3859 + }, + { + "epoch": 0.7380497131931166, + "grad_norm": 2.267392873764038, + "learning_rate": 5e-06, + "loss": 0.3423, + "step": 3860 + }, + { + "epoch": 0.7382409177820267, + "grad_norm": 1.5888205766677856, + "learning_rate": 5e-06, + "loss": 0.1232, + "step": 3861 + }, + { + "epoch": 0.7384321223709369, + "grad_norm": 0.9303539991378784, + "learning_rate": 5e-06, + "loss": 0.0497, + "step": 3862 + }, + { + "epoch": 0.738623326959847, + "grad_norm": 3.3895652294158936, + "learning_rate": 5e-06, + "loss": 0.1808, + "step": 3863 + }, + { + "epoch": 0.7388145315487572, + "grad_norm": 3.354686975479126, + "learning_rate": 5e-06, + "loss": 0.3795, + "step": 3864 + }, + { + "epoch": 0.7390057361376673, + "grad_norm": 2.8291988372802734, + "learning_rate": 5e-06, + "loss": 0.1875, + "step": 3865 + }, + { + "epoch": 0.7391969407265775, + "grad_norm": 1.4242098331451416, + "learning_rate": 5e-06, + "loss": 0.1156, + "step": 3866 + }, + { + "epoch": 0.7393881453154876, + "grad_norm": 0.8613669276237488, + "learning_rate": 5e-06, + "loss": 0.0874, + "step": 3867 + }, + { + "epoch": 0.7395793499043977, + "grad_norm": 1.4547524452209473, + "learning_rate": 5e-06, + "loss": 0.1247, + "step": 3868 + }, + { + "epoch": 0.7397705544933079, + "grad_norm": 1.1557694673538208, + "learning_rate": 5e-06, + "loss": 0.0494, + "step": 3869 + }, + { + "epoch": 0.739961759082218, + "grad_norm": 2.8671276569366455, + "learning_rate": 5e-06, + "loss": 0.3717, + "step": 3870 + }, + { + "epoch": 0.7401529636711282, + "grad_norm": 1.4811517000198364, + "learning_rate": 5e-06, + "loss": 0.1741, + "step": 3871 + }, + { + "epoch": 0.7403441682600382, + "grad_norm": 1.3373982906341553, + "learning_rate": 5e-06, + "loss": 0.1384, + "step": 3872 + }, + { + "epoch": 0.7405353728489483, + "grad_norm": 0.8785936832427979, + "learning_rate": 5e-06, + "loss": 0.1139, + "step": 3873 + }, + { + "epoch": 0.7407265774378585, + "grad_norm": 0.9371583461761475, + "learning_rate": 5e-06, + "loss": 0.0445, + "step": 3874 + }, + { + "epoch": 0.7409177820267686, + "grad_norm": 1.681098222732544, + "learning_rate": 5e-06, + "loss": 0.1076, + "step": 3875 + }, + { + "epoch": 0.7411089866156788, + "grad_norm": 2.265328884124756, + "learning_rate": 5e-06, + "loss": 0.4473, + "step": 3876 + }, + { + "epoch": 0.7413001912045889, + "grad_norm": 1.790353536605835, + "learning_rate": 5e-06, + "loss": 0.0827, + "step": 3877 + }, + { + "epoch": 0.7414913957934991, + "grad_norm": 2.781074047088623, + "learning_rate": 5e-06, + "loss": 0.3687, + "step": 3878 + }, + { + "epoch": 0.7416826003824092, + "grad_norm": 1.5163137912750244, + "learning_rate": 5e-06, + "loss": 0.0667, + "step": 3879 + }, + { + "epoch": 0.7418738049713193, + "grad_norm": 1.391534686088562, + "learning_rate": 5e-06, + "loss": 0.0774, + "step": 3880 + }, + { + "epoch": 0.7420650095602295, + "grad_norm": 1.9660178422927856, + "learning_rate": 5e-06, + "loss": 0.1352, + "step": 3881 + }, + { + "epoch": 0.7422562141491396, + "grad_norm": 1.3986618518829346, + "learning_rate": 5e-06, + "loss": 0.1077, + "step": 3882 + }, + { + "epoch": 0.7424474187380498, + "grad_norm": 2.422389507293701, + "learning_rate": 5e-06, + "loss": 0.3266, + "step": 3883 + }, + { + "epoch": 0.7426386233269598, + "grad_norm": 1.2864700555801392, + "learning_rate": 5e-06, + "loss": 0.2191, + "step": 3884 + }, + { + "epoch": 0.7428298279158699, + "grad_norm": 1.7133296728134155, + "learning_rate": 5e-06, + "loss": 0.1452, + "step": 3885 + }, + { + "epoch": 0.7430210325047801, + "grad_norm": 2.243800163269043, + "learning_rate": 5e-06, + "loss": 0.1434, + "step": 3886 + }, + { + "epoch": 0.7432122370936902, + "grad_norm": 2.1287524700164795, + "learning_rate": 5e-06, + "loss": 0.3237, + "step": 3887 + }, + { + "epoch": 0.7434034416826004, + "grad_norm": 3.481367588043213, + "learning_rate": 5e-06, + "loss": 0.5053, + "step": 3888 + }, + { + "epoch": 0.7435946462715105, + "grad_norm": 2.098524332046509, + "learning_rate": 5e-06, + "loss": 0.2776, + "step": 3889 + }, + { + "epoch": 0.7437858508604207, + "grad_norm": 0.9846514463424683, + "learning_rate": 5e-06, + "loss": 0.0791, + "step": 3890 + }, + { + "epoch": 0.7439770554493308, + "grad_norm": 1.8553352355957031, + "learning_rate": 5e-06, + "loss": 0.2831, + "step": 3891 + }, + { + "epoch": 0.7441682600382409, + "grad_norm": 2.703209638595581, + "learning_rate": 5e-06, + "loss": 0.1956, + "step": 3892 + }, + { + "epoch": 0.7443594646271511, + "grad_norm": 0.9680504202842712, + "learning_rate": 5e-06, + "loss": 0.0764, + "step": 3893 + }, + { + "epoch": 0.7445506692160612, + "grad_norm": 1.8443779945373535, + "learning_rate": 5e-06, + "loss": 0.2195, + "step": 3894 + }, + { + "epoch": 0.7447418738049714, + "grad_norm": 2.6461219787597656, + "learning_rate": 5e-06, + "loss": 0.4909, + "step": 3895 + }, + { + "epoch": 0.7449330783938815, + "grad_norm": 1.3912622928619385, + "learning_rate": 5e-06, + "loss": 0.1268, + "step": 3896 + }, + { + "epoch": 0.7451242829827915, + "grad_norm": 1.7276599407196045, + "learning_rate": 5e-06, + "loss": 0.226, + "step": 3897 + }, + { + "epoch": 0.7453154875717017, + "grad_norm": 2.662609815597534, + "learning_rate": 5e-06, + "loss": 0.1942, + "step": 3898 + }, + { + "epoch": 0.7455066921606118, + "grad_norm": 1.6807820796966553, + "learning_rate": 5e-06, + "loss": 0.1768, + "step": 3899 + }, + { + "epoch": 0.745697896749522, + "grad_norm": 1.3572360277175903, + "learning_rate": 5e-06, + "loss": 0.0603, + "step": 3900 + }, + { + "epoch": 0.7458891013384321, + "grad_norm": 1.2281373739242554, + "learning_rate": 5e-06, + "loss": 0.1086, + "step": 3901 + }, + { + "epoch": 0.7460803059273422, + "grad_norm": 1.9232803583145142, + "learning_rate": 5e-06, + "loss": 0.2239, + "step": 3902 + }, + { + "epoch": 0.7462715105162524, + "grad_norm": 2.2120046615600586, + "learning_rate": 5e-06, + "loss": 0.2476, + "step": 3903 + }, + { + "epoch": 0.7464627151051625, + "grad_norm": 2.8236958980560303, + "learning_rate": 5e-06, + "loss": 0.5103, + "step": 3904 + }, + { + "epoch": 0.7466539196940727, + "grad_norm": 2.782604694366455, + "learning_rate": 5e-06, + "loss": 0.3232, + "step": 3905 + }, + { + "epoch": 0.7468451242829828, + "grad_norm": 1.3832602500915527, + "learning_rate": 5e-06, + "loss": 0.15, + "step": 3906 + }, + { + "epoch": 0.747036328871893, + "grad_norm": 3.4751503467559814, + "learning_rate": 5e-06, + "loss": 0.3804, + "step": 3907 + }, + { + "epoch": 0.7472275334608031, + "grad_norm": 1.4347681999206543, + "learning_rate": 5e-06, + "loss": 0.1287, + "step": 3908 + }, + { + "epoch": 0.7474187380497131, + "grad_norm": 1.5916887521743774, + "learning_rate": 5e-06, + "loss": 0.2899, + "step": 3909 + }, + { + "epoch": 0.7476099426386233, + "grad_norm": 1.5356292724609375, + "learning_rate": 5e-06, + "loss": 0.2278, + "step": 3910 + }, + { + "epoch": 0.7478011472275334, + "grad_norm": 1.4358553886413574, + "learning_rate": 5e-06, + "loss": 0.0868, + "step": 3911 + }, + { + "epoch": 0.7479923518164436, + "grad_norm": 2.5048086643218994, + "learning_rate": 5e-06, + "loss": 0.227, + "step": 3912 + }, + { + "epoch": 0.7481835564053537, + "grad_norm": 1.7658495903015137, + "learning_rate": 5e-06, + "loss": 0.1685, + "step": 3913 + }, + { + "epoch": 0.7483747609942638, + "grad_norm": 2.723329782485962, + "learning_rate": 5e-06, + "loss": 0.3353, + "step": 3914 + }, + { + "epoch": 0.748565965583174, + "grad_norm": 2.999716281890869, + "learning_rate": 5e-06, + "loss": 0.3282, + "step": 3915 + }, + { + "epoch": 0.7487571701720841, + "grad_norm": 3.776826858520508, + "learning_rate": 5e-06, + "loss": 0.2084, + "step": 3916 + }, + { + "epoch": 0.7489483747609943, + "grad_norm": 1.5918197631835938, + "learning_rate": 5e-06, + "loss": 0.1376, + "step": 3917 + }, + { + "epoch": 0.7491395793499044, + "grad_norm": 1.6553537845611572, + "learning_rate": 5e-06, + "loss": 0.1036, + "step": 3918 + }, + { + "epoch": 0.7493307839388146, + "grad_norm": 1.9501301050186157, + "learning_rate": 5e-06, + "loss": 0.1137, + "step": 3919 + }, + { + "epoch": 0.7495219885277247, + "grad_norm": 3.1001572608947754, + "learning_rate": 5e-06, + "loss": 0.7937, + "step": 3920 + }, + { + "epoch": 0.7497131931166348, + "grad_norm": 1.8700470924377441, + "learning_rate": 5e-06, + "loss": 0.1598, + "step": 3921 + }, + { + "epoch": 0.749904397705545, + "grad_norm": 1.2993032932281494, + "learning_rate": 5e-06, + "loss": 0.1429, + "step": 3922 + }, + { + "epoch": 0.750095602294455, + "grad_norm": 2.387141466140747, + "learning_rate": 5e-06, + "loss": 0.1336, + "step": 3923 + }, + { + "epoch": 0.7502868068833652, + "grad_norm": 1.0744256973266602, + "learning_rate": 5e-06, + "loss": 0.0685, + "step": 3924 + }, + { + "epoch": 0.7504780114722753, + "grad_norm": 3.1105291843414307, + "learning_rate": 5e-06, + "loss": 0.2699, + "step": 3925 + }, + { + "epoch": 0.7506692160611854, + "grad_norm": 3.3172290325164795, + "learning_rate": 5e-06, + "loss": 0.6928, + "step": 3926 + }, + { + "epoch": 0.7508604206500956, + "grad_norm": 1.4145816564559937, + "learning_rate": 5e-06, + "loss": 0.1114, + "step": 3927 + }, + { + "epoch": 0.7510516252390057, + "grad_norm": 2.017582893371582, + "learning_rate": 5e-06, + "loss": 0.3379, + "step": 3928 + }, + { + "epoch": 0.7512428298279159, + "grad_norm": 2.438030481338501, + "learning_rate": 5e-06, + "loss": 0.273, + "step": 3929 + }, + { + "epoch": 0.751434034416826, + "grad_norm": 2.810269832611084, + "learning_rate": 5e-06, + "loss": 0.1784, + "step": 3930 + }, + { + "epoch": 0.7516252390057362, + "grad_norm": 2.189603328704834, + "learning_rate": 5e-06, + "loss": 0.1627, + "step": 3931 + }, + { + "epoch": 0.7518164435946463, + "grad_norm": 1.3899329900741577, + "learning_rate": 5e-06, + "loss": 0.1443, + "step": 3932 + }, + { + "epoch": 0.7520076481835564, + "grad_norm": 2.109938144683838, + "learning_rate": 5e-06, + "loss": 0.1432, + "step": 3933 + }, + { + "epoch": 0.7521988527724666, + "grad_norm": 2.4114205837249756, + "learning_rate": 5e-06, + "loss": 0.3787, + "step": 3934 + }, + { + "epoch": 0.7523900573613767, + "grad_norm": 1.5396156311035156, + "learning_rate": 5e-06, + "loss": 0.1456, + "step": 3935 + }, + { + "epoch": 0.7525812619502869, + "grad_norm": 1.966928482055664, + "learning_rate": 5e-06, + "loss": 0.2534, + "step": 3936 + }, + { + "epoch": 0.7527724665391969, + "grad_norm": 1.3008625507354736, + "learning_rate": 5e-06, + "loss": 0.0839, + "step": 3937 + }, + { + "epoch": 0.752963671128107, + "grad_norm": 3.0656418800354004, + "learning_rate": 5e-06, + "loss": 0.3491, + "step": 3938 + }, + { + "epoch": 0.7531548757170172, + "grad_norm": 2.63382887840271, + "learning_rate": 5e-06, + "loss": 0.3588, + "step": 3939 + }, + { + "epoch": 0.7533460803059273, + "grad_norm": 1.6612306833267212, + "learning_rate": 5e-06, + "loss": 0.1236, + "step": 3940 + }, + { + "epoch": 0.7535372848948375, + "grad_norm": 2.234178304672241, + "learning_rate": 5e-06, + "loss": 0.1589, + "step": 3941 + }, + { + "epoch": 0.7537284894837476, + "grad_norm": 3.02970814704895, + "learning_rate": 5e-06, + "loss": 0.4124, + "step": 3942 + }, + { + "epoch": 0.7539196940726578, + "grad_norm": 2.506859540939331, + "learning_rate": 5e-06, + "loss": 0.2223, + "step": 3943 + }, + { + "epoch": 0.7541108986615679, + "grad_norm": 1.3613660335540771, + "learning_rate": 5e-06, + "loss": 0.1406, + "step": 3944 + }, + { + "epoch": 0.754302103250478, + "grad_norm": 2.071458578109741, + "learning_rate": 5e-06, + "loss": 0.2519, + "step": 3945 + }, + { + "epoch": 0.7544933078393882, + "grad_norm": 1.1499487161636353, + "learning_rate": 5e-06, + "loss": 0.1091, + "step": 3946 + }, + { + "epoch": 0.7546845124282983, + "grad_norm": 2.7436764240264893, + "learning_rate": 5e-06, + "loss": 0.1509, + "step": 3947 + }, + { + "epoch": 0.7548757170172085, + "grad_norm": 2.140192985534668, + "learning_rate": 5e-06, + "loss": 0.2268, + "step": 3948 + }, + { + "epoch": 0.7550669216061185, + "grad_norm": 1.5796760320663452, + "learning_rate": 5e-06, + "loss": 0.113, + "step": 3949 + }, + { + "epoch": 0.7552581261950286, + "grad_norm": 1.0383970737457275, + "learning_rate": 5e-06, + "loss": 0.0439, + "step": 3950 + }, + { + "epoch": 0.7554493307839388, + "grad_norm": 2.50980281829834, + "learning_rate": 5e-06, + "loss": 0.4777, + "step": 3951 + }, + { + "epoch": 0.7556405353728489, + "grad_norm": 1.7971466779708862, + "learning_rate": 5e-06, + "loss": 0.1112, + "step": 3952 + }, + { + "epoch": 0.7558317399617591, + "grad_norm": 1.9320497512817383, + "learning_rate": 5e-06, + "loss": 0.1991, + "step": 3953 + }, + { + "epoch": 0.7560229445506692, + "grad_norm": 1.246254324913025, + "learning_rate": 5e-06, + "loss": 0.0755, + "step": 3954 + }, + { + "epoch": 0.7562141491395793, + "grad_norm": 1.7969180345535278, + "learning_rate": 5e-06, + "loss": 0.1124, + "step": 3955 + }, + { + "epoch": 0.7564053537284895, + "grad_norm": 1.317395567893982, + "learning_rate": 5e-06, + "loss": 0.0401, + "step": 3956 + }, + { + "epoch": 0.7565965583173996, + "grad_norm": 1.8792531490325928, + "learning_rate": 5e-06, + "loss": 0.308, + "step": 3957 + }, + { + "epoch": 0.7567877629063098, + "grad_norm": 1.601319670677185, + "learning_rate": 5e-06, + "loss": 0.1683, + "step": 3958 + }, + { + "epoch": 0.7569789674952199, + "grad_norm": 1.1691665649414062, + "learning_rate": 5e-06, + "loss": 0.0809, + "step": 3959 + }, + { + "epoch": 0.7571701720841301, + "grad_norm": 1.8878777027130127, + "learning_rate": 5e-06, + "loss": 0.1181, + "step": 3960 + }, + { + "epoch": 0.7573613766730402, + "grad_norm": 1.6922614574432373, + "learning_rate": 5e-06, + "loss": 0.1016, + "step": 3961 + }, + { + "epoch": 0.7575525812619502, + "grad_norm": 2.844416379928589, + "learning_rate": 5e-06, + "loss": 0.1851, + "step": 3962 + }, + { + "epoch": 0.7577437858508604, + "grad_norm": 2.265589952468872, + "learning_rate": 5e-06, + "loss": 0.2272, + "step": 3963 + }, + { + "epoch": 0.7579349904397705, + "grad_norm": 2.574376344680786, + "learning_rate": 5e-06, + "loss": 0.4007, + "step": 3964 + }, + { + "epoch": 0.7581261950286807, + "grad_norm": 2.0190422534942627, + "learning_rate": 5e-06, + "loss": 0.2131, + "step": 3965 + }, + { + "epoch": 0.7583173996175908, + "grad_norm": 1.53504478931427, + "learning_rate": 5e-06, + "loss": 0.1167, + "step": 3966 + }, + { + "epoch": 0.7585086042065009, + "grad_norm": 1.4824988842010498, + "learning_rate": 5e-06, + "loss": 0.16, + "step": 3967 + }, + { + "epoch": 0.7586998087954111, + "grad_norm": 2.1277225017547607, + "learning_rate": 5e-06, + "loss": 0.15, + "step": 3968 + }, + { + "epoch": 0.7588910133843212, + "grad_norm": 0.9543938636779785, + "learning_rate": 5e-06, + "loss": 0.0364, + "step": 3969 + }, + { + "epoch": 0.7590822179732314, + "grad_norm": 3.176586151123047, + "learning_rate": 5e-06, + "loss": 0.5473, + "step": 3970 + }, + { + "epoch": 0.7592734225621415, + "grad_norm": 1.5333970785140991, + "learning_rate": 5e-06, + "loss": 0.1819, + "step": 3971 + }, + { + "epoch": 0.7594646271510517, + "grad_norm": 1.2855944633483887, + "learning_rate": 5e-06, + "loss": 0.1069, + "step": 3972 + }, + { + "epoch": 0.7596558317399618, + "grad_norm": 1.5741631984710693, + "learning_rate": 5e-06, + "loss": 0.0947, + "step": 3973 + }, + { + "epoch": 0.7598470363288718, + "grad_norm": 1.533542513847351, + "learning_rate": 5e-06, + "loss": 0.1809, + "step": 3974 + }, + { + "epoch": 0.760038240917782, + "grad_norm": 2.86518931388855, + "learning_rate": 5e-06, + "loss": 0.2103, + "step": 3975 + }, + { + "epoch": 0.7602294455066921, + "grad_norm": 2.7985222339630127, + "learning_rate": 5e-06, + "loss": 0.2571, + "step": 3976 + }, + { + "epoch": 0.7604206500956023, + "grad_norm": 2.5995070934295654, + "learning_rate": 5e-06, + "loss": 0.4758, + "step": 3977 + }, + { + "epoch": 0.7606118546845124, + "grad_norm": 1.5301446914672852, + "learning_rate": 5e-06, + "loss": 0.1074, + "step": 3978 + }, + { + "epoch": 0.7608030592734225, + "grad_norm": 0.876580536365509, + "learning_rate": 5e-06, + "loss": 0.0367, + "step": 3979 + }, + { + "epoch": 0.7609942638623327, + "grad_norm": 1.7435474395751953, + "learning_rate": 5e-06, + "loss": 0.1279, + "step": 3980 + }, + { + "epoch": 0.7611854684512428, + "grad_norm": 1.7620370388031006, + "learning_rate": 5e-06, + "loss": 0.1008, + "step": 3981 + }, + { + "epoch": 0.761376673040153, + "grad_norm": 1.7652002573013306, + "learning_rate": 5e-06, + "loss": 0.1108, + "step": 3982 + }, + { + "epoch": 0.7615678776290631, + "grad_norm": 3.072636127471924, + "learning_rate": 5e-06, + "loss": 0.4287, + "step": 3983 + }, + { + "epoch": 0.7617590822179733, + "grad_norm": 2.880697727203369, + "learning_rate": 5e-06, + "loss": 0.3342, + "step": 3984 + }, + { + "epoch": 0.7619502868068834, + "grad_norm": 1.7110553979873657, + "learning_rate": 5e-06, + "loss": 0.1209, + "step": 3985 + }, + { + "epoch": 0.7621414913957935, + "grad_norm": 1.7476038932800293, + "learning_rate": 5e-06, + "loss": 0.0955, + "step": 3986 + }, + { + "epoch": 0.7623326959847037, + "grad_norm": 2.2482314109802246, + "learning_rate": 5e-06, + "loss": 0.1621, + "step": 3987 + }, + { + "epoch": 0.7625239005736137, + "grad_norm": 1.754885196685791, + "learning_rate": 5e-06, + "loss": 0.1183, + "step": 3988 + }, + { + "epoch": 0.7627151051625239, + "grad_norm": 2.0199167728424072, + "learning_rate": 5e-06, + "loss": 0.2003, + "step": 3989 + }, + { + "epoch": 0.762906309751434, + "grad_norm": 2.1370720863342285, + "learning_rate": 5e-06, + "loss": 0.2559, + "step": 3990 + }, + { + "epoch": 0.7630975143403441, + "grad_norm": 1.4818934202194214, + "learning_rate": 5e-06, + "loss": 0.119, + "step": 3991 + }, + { + "epoch": 0.7632887189292543, + "grad_norm": 2.2542731761932373, + "learning_rate": 5e-06, + "loss": 0.1821, + "step": 3992 + }, + { + "epoch": 0.7634799235181644, + "grad_norm": 2.6181223392486572, + "learning_rate": 5e-06, + "loss": 0.2197, + "step": 3993 + }, + { + "epoch": 0.7636711281070746, + "grad_norm": 2.795283317565918, + "learning_rate": 5e-06, + "loss": 0.2661, + "step": 3994 + }, + { + "epoch": 0.7638623326959847, + "grad_norm": 4.159665107727051, + "learning_rate": 5e-06, + "loss": 0.8165, + "step": 3995 + }, + { + "epoch": 0.7640535372848949, + "grad_norm": 1.8209826946258545, + "learning_rate": 5e-06, + "loss": 0.221, + "step": 3996 + }, + { + "epoch": 0.764244741873805, + "grad_norm": 1.490784764289856, + "learning_rate": 5e-06, + "loss": 0.1202, + "step": 3997 + }, + { + "epoch": 0.7644359464627151, + "grad_norm": 1.8983914852142334, + "learning_rate": 5e-06, + "loss": 0.2311, + "step": 3998 + }, + { + "epoch": 0.7646271510516253, + "grad_norm": 2.7288661003112793, + "learning_rate": 5e-06, + "loss": 0.1842, + "step": 3999 + }, + { + "epoch": 0.7648183556405354, + "grad_norm": 1.5600886344909668, + "learning_rate": 5e-06, + "loss": 0.0708, + "step": 4000 + }, + { + "epoch": 0.7648183556405354, + "eval_runtime": 831.7494, + "eval_samples_per_second": 1.844, + "eval_steps_per_second": 0.231, + "step": 4000 + }, + { + "epoch": 0.7650095602294456, + "grad_norm": 1.5232917070388794, + "learning_rate": 5e-06, + "loss": 0.1157, + "step": 4001 + }, + { + "epoch": 0.7652007648183556, + "grad_norm": 1.4118472337722778, + "learning_rate": 5e-06, + "loss": 0.1306, + "step": 4002 + }, + { + "epoch": 0.7653919694072657, + "grad_norm": 2.754512310028076, + "learning_rate": 5e-06, + "loss": 0.3549, + "step": 4003 + }, + { + "epoch": 0.7655831739961759, + "grad_norm": 1.406946063041687, + "learning_rate": 5e-06, + "loss": 0.136, + "step": 4004 + }, + { + "epoch": 0.765774378585086, + "grad_norm": 2.2281954288482666, + "learning_rate": 5e-06, + "loss": 0.0896, + "step": 4005 + }, + { + "epoch": 0.7659655831739962, + "grad_norm": 1.819032907485962, + "learning_rate": 5e-06, + "loss": 0.1981, + "step": 4006 + }, + { + "epoch": 0.7661567877629063, + "grad_norm": 2.1007330417633057, + "learning_rate": 5e-06, + "loss": 0.3524, + "step": 4007 + }, + { + "epoch": 0.7663479923518165, + "grad_norm": 2.4521172046661377, + "learning_rate": 5e-06, + "loss": 0.4039, + "step": 4008 + }, + { + "epoch": 0.7665391969407266, + "grad_norm": 2.598778247833252, + "learning_rate": 5e-06, + "loss": 0.2918, + "step": 4009 + }, + { + "epoch": 0.7667304015296367, + "grad_norm": 2.3652114868164062, + "learning_rate": 5e-06, + "loss": 0.2555, + "step": 4010 + }, + { + "epoch": 0.7669216061185469, + "grad_norm": 1.7355620861053467, + "learning_rate": 5e-06, + "loss": 0.1323, + "step": 4011 + }, + { + "epoch": 0.767112810707457, + "grad_norm": 3.2190678119659424, + "learning_rate": 5e-06, + "loss": 0.1842, + "step": 4012 + }, + { + "epoch": 0.7673040152963672, + "grad_norm": 2.945258617401123, + "learning_rate": 5e-06, + "loss": 0.261, + "step": 4013 + }, + { + "epoch": 0.7674952198852772, + "grad_norm": 4.270470142364502, + "learning_rate": 5e-06, + "loss": 0.7201, + "step": 4014 + }, + { + "epoch": 0.7676864244741873, + "grad_norm": 1.7441529035568237, + "learning_rate": 5e-06, + "loss": 0.1405, + "step": 4015 + }, + { + "epoch": 0.7678776290630975, + "grad_norm": 1.0526031255722046, + "learning_rate": 5e-06, + "loss": 0.1045, + "step": 4016 + }, + { + "epoch": 0.7680688336520076, + "grad_norm": 2.0954020023345947, + "learning_rate": 5e-06, + "loss": 0.2734, + "step": 4017 + }, + { + "epoch": 0.7682600382409178, + "grad_norm": 1.8673603534698486, + "learning_rate": 5e-06, + "loss": 0.1173, + "step": 4018 + }, + { + "epoch": 0.7684512428298279, + "grad_norm": 1.6339843273162842, + "learning_rate": 5e-06, + "loss": 0.1095, + "step": 4019 + }, + { + "epoch": 0.768642447418738, + "grad_norm": 1.8186370134353638, + "learning_rate": 5e-06, + "loss": 0.2133, + "step": 4020 + }, + { + "epoch": 0.7688336520076482, + "grad_norm": 2.424779176712036, + "learning_rate": 5e-06, + "loss": 0.2374, + "step": 4021 + }, + { + "epoch": 0.7690248565965583, + "grad_norm": 1.2409683465957642, + "learning_rate": 5e-06, + "loss": 0.0816, + "step": 4022 + }, + { + "epoch": 0.7692160611854685, + "grad_norm": 2.5901854038238525, + "learning_rate": 5e-06, + "loss": 0.134, + "step": 4023 + }, + { + "epoch": 0.7694072657743786, + "grad_norm": 1.4344134330749512, + "learning_rate": 5e-06, + "loss": 0.1287, + "step": 4024 + }, + { + "epoch": 0.7695984703632888, + "grad_norm": 1.9438978433609009, + "learning_rate": 5e-06, + "loss": 0.0898, + "step": 4025 + }, + { + "epoch": 0.7697896749521989, + "grad_norm": 3.3850362300872803, + "learning_rate": 5e-06, + "loss": 0.6324, + "step": 4026 + }, + { + "epoch": 0.7699808795411089, + "grad_norm": 1.7402697801589966, + "learning_rate": 5e-06, + "loss": 0.2097, + "step": 4027 + }, + { + "epoch": 0.7701720841300191, + "grad_norm": 1.9264459609985352, + "learning_rate": 5e-06, + "loss": 0.14, + "step": 4028 + }, + { + "epoch": 0.7703632887189292, + "grad_norm": 1.2523527145385742, + "learning_rate": 5e-06, + "loss": 0.0865, + "step": 4029 + }, + { + "epoch": 0.7705544933078394, + "grad_norm": 2.009114980697632, + "learning_rate": 5e-06, + "loss": 0.1727, + "step": 4030 + }, + { + "epoch": 0.7707456978967495, + "grad_norm": 1.3284012079238892, + "learning_rate": 5e-06, + "loss": 0.0669, + "step": 4031 + }, + { + "epoch": 0.7709369024856596, + "grad_norm": 3.460033893585205, + "learning_rate": 5e-06, + "loss": 0.653, + "step": 4032 + }, + { + "epoch": 0.7711281070745698, + "grad_norm": 1.1250536441802979, + "learning_rate": 5e-06, + "loss": 0.1061, + "step": 4033 + }, + { + "epoch": 0.7713193116634799, + "grad_norm": 2.0778872966766357, + "learning_rate": 5e-06, + "loss": 0.4068, + "step": 4034 + }, + { + "epoch": 0.7715105162523901, + "grad_norm": 2.448514461517334, + "learning_rate": 5e-06, + "loss": 0.1578, + "step": 4035 + }, + { + "epoch": 0.7717017208413002, + "grad_norm": 1.0364723205566406, + "learning_rate": 5e-06, + "loss": 0.0746, + "step": 4036 + }, + { + "epoch": 0.7718929254302104, + "grad_norm": 4.103033065795898, + "learning_rate": 5e-06, + "loss": 0.3191, + "step": 4037 + }, + { + "epoch": 0.7720841300191205, + "grad_norm": 2.208747148513794, + "learning_rate": 5e-06, + "loss": 0.2741, + "step": 4038 + }, + { + "epoch": 0.7722753346080306, + "grad_norm": 3.062417984008789, + "learning_rate": 5e-06, + "loss": 0.6122, + "step": 4039 + }, + { + "epoch": 0.7724665391969407, + "grad_norm": 1.5257611274719238, + "learning_rate": 5e-06, + "loss": 0.1512, + "step": 4040 + }, + { + "epoch": 0.7726577437858508, + "grad_norm": 1.7130473852157593, + "learning_rate": 5e-06, + "loss": 0.1445, + "step": 4041 + }, + { + "epoch": 0.772848948374761, + "grad_norm": 2.802297353744507, + "learning_rate": 5e-06, + "loss": 0.0933, + "step": 4042 + }, + { + "epoch": 0.7730401529636711, + "grad_norm": 1.7665363550186157, + "learning_rate": 5e-06, + "loss": 0.1321, + "step": 4043 + }, + { + "epoch": 0.7732313575525812, + "grad_norm": 1.023848533630371, + "learning_rate": 5e-06, + "loss": 0.0549, + "step": 4044 + }, + { + "epoch": 0.7734225621414914, + "grad_norm": 2.230396270751953, + "learning_rate": 5e-06, + "loss": 0.3784, + "step": 4045 + }, + { + "epoch": 0.7736137667304015, + "grad_norm": 1.3800081014633179, + "learning_rate": 5e-06, + "loss": 0.1241, + "step": 4046 + }, + { + "epoch": 0.7738049713193117, + "grad_norm": 2.2208352088928223, + "learning_rate": 5e-06, + "loss": 0.2697, + "step": 4047 + }, + { + "epoch": 0.7739961759082218, + "grad_norm": 2.5575742721557617, + "learning_rate": 5e-06, + "loss": 0.2564, + "step": 4048 + }, + { + "epoch": 0.774187380497132, + "grad_norm": 1.4058862924575806, + "learning_rate": 5e-06, + "loss": 0.0901, + "step": 4049 + }, + { + "epoch": 0.7743785850860421, + "grad_norm": 2.812058687210083, + "learning_rate": 5e-06, + "loss": 0.1837, + "step": 4050 + }, + { + "epoch": 0.7745697896749522, + "grad_norm": 2.573519706726074, + "learning_rate": 5e-06, + "loss": 0.483, + "step": 4051 + }, + { + "epoch": 0.7747609942638624, + "grad_norm": 3.118896961212158, + "learning_rate": 5e-06, + "loss": 0.3858, + "step": 4052 + }, + { + "epoch": 0.7749521988527724, + "grad_norm": 1.3650435209274292, + "learning_rate": 5e-06, + "loss": 0.0917, + "step": 4053 + }, + { + "epoch": 0.7751434034416826, + "grad_norm": 3.2142133712768555, + "learning_rate": 5e-06, + "loss": 0.1071, + "step": 4054 + }, + { + "epoch": 0.7753346080305927, + "grad_norm": 1.4311920404434204, + "learning_rate": 5e-06, + "loss": 0.095, + "step": 4055 + }, + { + "epoch": 0.7755258126195028, + "grad_norm": 1.5923914909362793, + "learning_rate": 5e-06, + "loss": 0.0758, + "step": 4056 + }, + { + "epoch": 0.775717017208413, + "grad_norm": 1.7863245010375977, + "learning_rate": 5e-06, + "loss": 0.1589, + "step": 4057 + }, + { + "epoch": 0.7759082217973231, + "grad_norm": 2.2520720958709717, + "learning_rate": 5e-06, + "loss": 0.2503, + "step": 4058 + }, + { + "epoch": 0.7760994263862333, + "grad_norm": 3.211054563522339, + "learning_rate": 5e-06, + "loss": 0.6649, + "step": 4059 + }, + { + "epoch": 0.7762906309751434, + "grad_norm": 0.8572031855583191, + "learning_rate": 5e-06, + "loss": 0.0593, + "step": 4060 + }, + { + "epoch": 0.7764818355640536, + "grad_norm": 1.4232878684997559, + "learning_rate": 5e-06, + "loss": 0.0751, + "step": 4061 + }, + { + "epoch": 0.7766730401529637, + "grad_norm": 1.295220971107483, + "learning_rate": 5e-06, + "loss": 0.0767, + "step": 4062 + }, + { + "epoch": 0.7768642447418738, + "grad_norm": 2.2189230918884277, + "learning_rate": 5e-06, + "loss": 0.1053, + "step": 4063 + }, + { + "epoch": 0.777055449330784, + "grad_norm": 2.4356725215911865, + "learning_rate": 5e-06, + "loss": 0.3199, + "step": 4064 + }, + { + "epoch": 0.777246653919694, + "grad_norm": 1.6857635974884033, + "learning_rate": 5e-06, + "loss": 0.1233, + "step": 4065 + }, + { + "epoch": 0.7774378585086043, + "grad_norm": 2.9625089168548584, + "learning_rate": 5e-06, + "loss": 0.3591, + "step": 4066 + }, + { + "epoch": 0.7776290630975143, + "grad_norm": 1.5815578699111938, + "learning_rate": 5e-06, + "loss": 0.0995, + "step": 4067 + }, + { + "epoch": 0.7778202676864244, + "grad_norm": 1.3760156631469727, + "learning_rate": 5e-06, + "loss": 0.1136, + "step": 4068 + }, + { + "epoch": 0.7780114722753346, + "grad_norm": 3.329010486602783, + "learning_rate": 5e-06, + "loss": 0.167, + "step": 4069 + }, + { + "epoch": 0.7782026768642447, + "grad_norm": 2.279073715209961, + "learning_rate": 5e-06, + "loss": 0.3059, + "step": 4070 + }, + { + "epoch": 0.7783938814531549, + "grad_norm": 1.5212337970733643, + "learning_rate": 5e-06, + "loss": 0.1047, + "step": 4071 + }, + { + "epoch": 0.778585086042065, + "grad_norm": 1.8684823513031006, + "learning_rate": 5e-06, + "loss": 0.1369, + "step": 4072 + }, + { + "epoch": 0.7787762906309752, + "grad_norm": 1.487801432609558, + "learning_rate": 5e-06, + "loss": 0.0738, + "step": 4073 + }, + { + "epoch": 0.7789674952198853, + "grad_norm": 2.3601162433624268, + "learning_rate": 5e-06, + "loss": 0.0753, + "step": 4074 + }, + { + "epoch": 0.7791586998087954, + "grad_norm": 1.5325002670288086, + "learning_rate": 5e-06, + "loss": 0.0589, + "step": 4075 + }, + { + "epoch": 0.7793499043977056, + "grad_norm": 2.376420497894287, + "learning_rate": 5e-06, + "loss": 0.3178, + "step": 4076 + }, + { + "epoch": 0.7795411089866157, + "grad_norm": 1.3300342559814453, + "learning_rate": 5e-06, + "loss": 0.1267, + "step": 4077 + }, + { + "epoch": 0.7797323135755259, + "grad_norm": 1.2323758602142334, + "learning_rate": 5e-06, + "loss": 0.1096, + "step": 4078 + }, + { + "epoch": 0.779923518164436, + "grad_norm": 2.7862613201141357, + "learning_rate": 5e-06, + "loss": 0.4678, + "step": 4079 + }, + { + "epoch": 0.780114722753346, + "grad_norm": 1.2939293384552002, + "learning_rate": 5e-06, + "loss": 0.0784, + "step": 4080 + }, + { + "epoch": 0.7803059273422562, + "grad_norm": 2.031770944595337, + "learning_rate": 5e-06, + "loss": 0.076, + "step": 4081 + }, + { + "epoch": 0.7804971319311663, + "grad_norm": 1.987504243850708, + "learning_rate": 5e-06, + "loss": 0.2646, + "step": 4082 + }, + { + "epoch": 0.7806883365200765, + "grad_norm": 3.281132936477661, + "learning_rate": 5e-06, + "loss": 0.4443, + "step": 4083 + }, + { + "epoch": 0.7808795411089866, + "grad_norm": 1.6435940265655518, + "learning_rate": 5e-06, + "loss": 0.1493, + "step": 4084 + }, + { + "epoch": 0.7810707456978967, + "grad_norm": 2.0128467082977295, + "learning_rate": 5e-06, + "loss": 0.1378, + "step": 4085 + }, + { + "epoch": 0.7812619502868069, + "grad_norm": 1.966070532798767, + "learning_rate": 5e-06, + "loss": 0.1004, + "step": 4086 + }, + { + "epoch": 0.781453154875717, + "grad_norm": 2.714599370956421, + "learning_rate": 5e-06, + "loss": 0.1898, + "step": 4087 + }, + { + "epoch": 0.7816443594646272, + "grad_norm": 2.605646848678589, + "learning_rate": 5e-06, + "loss": 0.5377, + "step": 4088 + }, + { + "epoch": 0.7818355640535373, + "grad_norm": 2.5793659687042236, + "learning_rate": 5e-06, + "loss": 0.3054, + "step": 4089 + }, + { + "epoch": 0.7820267686424475, + "grad_norm": 1.0576157569885254, + "learning_rate": 5e-06, + "loss": 0.1205, + "step": 4090 + }, + { + "epoch": 0.7822179732313576, + "grad_norm": 2.084843397140503, + "learning_rate": 5e-06, + "loss": 0.1685, + "step": 4091 + }, + { + "epoch": 0.7824091778202676, + "grad_norm": 1.4531314373016357, + "learning_rate": 5e-06, + "loss": 0.1244, + "step": 4092 + }, + { + "epoch": 0.7826003824091778, + "grad_norm": 1.2267307043075562, + "learning_rate": 5e-06, + "loss": 0.0672, + "step": 4093 + }, + { + "epoch": 0.7827915869980879, + "grad_norm": 2.5020968914031982, + "learning_rate": 5e-06, + "loss": 0.0903, + "step": 4094 + }, + { + "epoch": 0.7829827915869981, + "grad_norm": 2.856722354888916, + "learning_rate": 5e-06, + "loss": 0.6113, + "step": 4095 + }, + { + "epoch": 0.7831739961759082, + "grad_norm": 3.0301995277404785, + "learning_rate": 5e-06, + "loss": 0.3728, + "step": 4096 + }, + { + "epoch": 0.7833652007648183, + "grad_norm": 2.254077911376953, + "learning_rate": 5e-06, + "loss": 0.3342, + "step": 4097 + }, + { + "epoch": 0.7835564053537285, + "grad_norm": 1.5700175762176514, + "learning_rate": 5e-06, + "loss": 0.0846, + "step": 4098 + }, + { + "epoch": 0.7837476099426386, + "grad_norm": 1.795477032661438, + "learning_rate": 5e-06, + "loss": 0.1586, + "step": 4099 + }, + { + "epoch": 0.7839388145315488, + "grad_norm": 3.384065866470337, + "learning_rate": 5e-06, + "loss": 0.1139, + "step": 4100 + }, + { + "epoch": 0.7841300191204589, + "grad_norm": 1.4682124853134155, + "learning_rate": 5e-06, + "loss": 0.185, + "step": 4101 + }, + { + "epoch": 0.7843212237093691, + "grad_norm": 1.558057188987732, + "learning_rate": 5e-06, + "loss": 0.1074, + "step": 4102 + }, + { + "epoch": 0.7845124282982792, + "grad_norm": 1.5690349340438843, + "learning_rate": 5e-06, + "loss": 0.1275, + "step": 4103 + }, + { + "epoch": 0.7847036328871893, + "grad_norm": 1.0313045978546143, + "learning_rate": 5e-06, + "loss": 0.0571, + "step": 4104 + }, + { + "epoch": 0.7848948374760994, + "grad_norm": 1.4557856321334839, + "learning_rate": 5e-06, + "loss": 0.0803, + "step": 4105 + }, + { + "epoch": 0.7850860420650095, + "grad_norm": 1.7461309432983398, + "learning_rate": 5e-06, + "loss": 0.0582, + "step": 4106 + }, + { + "epoch": 0.7852772466539197, + "grad_norm": 3.133112668991089, + "learning_rate": 5e-06, + "loss": 0.4884, + "step": 4107 + }, + { + "epoch": 0.7854684512428298, + "grad_norm": 2.038489580154419, + "learning_rate": 5e-06, + "loss": 0.2294, + "step": 4108 + }, + { + "epoch": 0.7856596558317399, + "grad_norm": 1.714369535446167, + "learning_rate": 5e-06, + "loss": 0.1776, + "step": 4109 + }, + { + "epoch": 0.7858508604206501, + "grad_norm": 2.173279047012329, + "learning_rate": 5e-06, + "loss": 0.1365, + "step": 4110 + }, + { + "epoch": 0.7860420650095602, + "grad_norm": 1.4787670373916626, + "learning_rate": 5e-06, + "loss": 0.0929, + "step": 4111 + }, + { + "epoch": 0.7862332695984704, + "grad_norm": 2.0035345554351807, + "learning_rate": 5e-06, + "loss": 0.1208, + "step": 4112 + }, + { + "epoch": 0.7864244741873805, + "grad_norm": 2.027458906173706, + "learning_rate": 5e-06, + "loss": 0.3061, + "step": 4113 + }, + { + "epoch": 0.7866156787762907, + "grad_norm": 3.2767152786254883, + "learning_rate": 5e-06, + "loss": 0.372, + "step": 4114 + }, + { + "epoch": 0.7868068833652008, + "grad_norm": 2.2546582221984863, + "learning_rate": 5e-06, + "loss": 0.2883, + "step": 4115 + }, + { + "epoch": 0.7869980879541109, + "grad_norm": 1.9276899099349976, + "learning_rate": 5e-06, + "loss": 0.1248, + "step": 4116 + }, + { + "epoch": 0.7871892925430211, + "grad_norm": 1.4487714767456055, + "learning_rate": 5e-06, + "loss": 0.1152, + "step": 4117 + }, + { + "epoch": 0.7873804971319311, + "grad_norm": 0.9173452854156494, + "learning_rate": 5e-06, + "loss": 0.0181, + "step": 4118 + }, + { + "epoch": 0.7875717017208413, + "grad_norm": 2.420795440673828, + "learning_rate": 5e-06, + "loss": 0.1725, + "step": 4119 + }, + { + "epoch": 0.7877629063097514, + "grad_norm": 1.8025798797607422, + "learning_rate": 5e-06, + "loss": 0.2264, + "step": 4120 + }, + { + "epoch": 0.7879541108986615, + "grad_norm": 2.618238925933838, + "learning_rate": 5e-06, + "loss": 0.4072, + "step": 4121 + }, + { + "epoch": 0.7881453154875717, + "grad_norm": 2.343017101287842, + "learning_rate": 5e-06, + "loss": 0.195, + "step": 4122 + }, + { + "epoch": 0.7883365200764818, + "grad_norm": 1.8181397914886475, + "learning_rate": 5e-06, + "loss": 0.0863, + "step": 4123 + }, + { + "epoch": 0.788527724665392, + "grad_norm": 1.8634696006774902, + "learning_rate": 5e-06, + "loss": 0.1055, + "step": 4124 + }, + { + "epoch": 0.7887189292543021, + "grad_norm": 2.9761476516723633, + "learning_rate": 5e-06, + "loss": 0.1362, + "step": 4125 + }, + { + "epoch": 0.7889101338432123, + "grad_norm": 2.6176702976226807, + "learning_rate": 5e-06, + "loss": 0.3955, + "step": 4126 + }, + { + "epoch": 0.7891013384321224, + "grad_norm": 2.2697925567626953, + "learning_rate": 5e-06, + "loss": 0.3743, + "step": 4127 + }, + { + "epoch": 0.7892925430210325, + "grad_norm": 2.294552803039551, + "learning_rate": 5e-06, + "loss": 0.2497, + "step": 4128 + }, + { + "epoch": 0.7894837476099427, + "grad_norm": 3.0859687328338623, + "learning_rate": 5e-06, + "loss": 0.3342, + "step": 4129 + }, + { + "epoch": 0.7896749521988528, + "grad_norm": 3.8342316150665283, + "learning_rate": 5e-06, + "loss": 0.2451, + "step": 4130 + }, + { + "epoch": 0.789866156787763, + "grad_norm": 1.9949150085449219, + "learning_rate": 5e-06, + "loss": 0.1862, + "step": 4131 + }, + { + "epoch": 0.790057361376673, + "grad_norm": 1.5915290117263794, + "learning_rate": 5e-06, + "loss": 0.1456, + "step": 4132 + }, + { + "epoch": 0.7902485659655831, + "grad_norm": 2.4300689697265625, + "learning_rate": 5e-06, + "loss": 0.3243, + "step": 4133 + }, + { + "epoch": 0.7904397705544933, + "grad_norm": 2.693277597427368, + "learning_rate": 5e-06, + "loss": 0.2611, + "step": 4134 + }, + { + "epoch": 0.7906309751434034, + "grad_norm": 1.841639518737793, + "learning_rate": 5e-06, + "loss": 0.1271, + "step": 4135 + }, + { + "epoch": 0.7908221797323136, + "grad_norm": 3.0559420585632324, + "learning_rate": 5e-06, + "loss": 0.25, + "step": 4136 + }, + { + "epoch": 0.7910133843212237, + "grad_norm": 2.3945229053497314, + "learning_rate": 5e-06, + "loss": 0.1167, + "step": 4137 + }, + { + "epoch": 0.7912045889101338, + "grad_norm": 1.6890507936477661, + "learning_rate": 5e-06, + "loss": 0.1109, + "step": 4138 + }, + { + "epoch": 0.791395793499044, + "grad_norm": 4.068086624145508, + "learning_rate": 5e-06, + "loss": 0.4148, + "step": 4139 + }, + { + "epoch": 0.7915869980879541, + "grad_norm": 1.1503350734710693, + "learning_rate": 5e-06, + "loss": 0.1057, + "step": 4140 + }, + { + "epoch": 0.7917782026768643, + "grad_norm": 1.6826403141021729, + "learning_rate": 5e-06, + "loss": 0.2092, + "step": 4141 + }, + { + "epoch": 0.7919694072657744, + "grad_norm": 2.374329090118408, + "learning_rate": 5e-06, + "loss": 0.1302, + "step": 4142 + }, + { + "epoch": 0.7921606118546846, + "grad_norm": 1.413367509841919, + "learning_rate": 5e-06, + "loss": 0.0968, + "step": 4143 + }, + { + "epoch": 0.7923518164435946, + "grad_norm": 0.6716412305831909, + "learning_rate": 5e-06, + "loss": 0.0294, + "step": 4144 + }, + { + "epoch": 0.7925430210325047, + "grad_norm": 1.7345741987228394, + "learning_rate": 5e-06, + "loss": 0.1587, + "step": 4145 + }, + { + "epoch": 0.7927342256214149, + "grad_norm": 2.5227432250976562, + "learning_rate": 5e-06, + "loss": 0.2367, + "step": 4146 + }, + { + "epoch": 0.792925430210325, + "grad_norm": 1.6562448740005493, + "learning_rate": 5e-06, + "loss": 0.1356, + "step": 4147 + }, + { + "epoch": 0.7931166347992352, + "grad_norm": 1.9978289604187012, + "learning_rate": 5e-06, + "loss": 0.0856, + "step": 4148 + }, + { + "epoch": 0.7933078393881453, + "grad_norm": 1.263630747795105, + "learning_rate": 5e-06, + "loss": 0.0879, + "step": 4149 + }, + { + "epoch": 0.7934990439770554, + "grad_norm": 1.8136980533599854, + "learning_rate": 5e-06, + "loss": 0.1034, + "step": 4150 + }, + { + "epoch": 0.7936902485659656, + "grad_norm": 2.311706781387329, + "learning_rate": 5e-06, + "loss": 0.1771, + "step": 4151 + }, + { + "epoch": 0.7938814531548757, + "grad_norm": 2.5121023654937744, + "learning_rate": 5e-06, + "loss": 0.2529, + "step": 4152 + }, + { + "epoch": 0.7940726577437859, + "grad_norm": 2.3066329956054688, + "learning_rate": 5e-06, + "loss": 0.3744, + "step": 4153 + }, + { + "epoch": 0.794263862332696, + "grad_norm": 1.4853479862213135, + "learning_rate": 5e-06, + "loss": 0.1752, + "step": 4154 + }, + { + "epoch": 0.7944550669216062, + "grad_norm": 1.425007939338684, + "learning_rate": 5e-06, + "loss": 0.0867, + "step": 4155 + }, + { + "epoch": 0.7946462715105163, + "grad_norm": 2.0261852741241455, + "learning_rate": 5e-06, + "loss": 0.1272, + "step": 4156 + }, + { + "epoch": 0.7948374760994263, + "grad_norm": 2.156608819961548, + "learning_rate": 5e-06, + "loss": 0.3367, + "step": 4157 + }, + { + "epoch": 0.7950286806883365, + "grad_norm": 1.3627783060073853, + "learning_rate": 5e-06, + "loss": 0.1305, + "step": 4158 + }, + { + "epoch": 0.7952198852772466, + "grad_norm": 2.0210812091827393, + "learning_rate": 5e-06, + "loss": 0.2219, + "step": 4159 + }, + { + "epoch": 0.7954110898661568, + "grad_norm": 1.251874327659607, + "learning_rate": 5e-06, + "loss": 0.0726, + "step": 4160 + }, + { + "epoch": 0.7956022944550669, + "grad_norm": 1.1710023880004883, + "learning_rate": 5e-06, + "loss": 0.0736, + "step": 4161 + }, + { + "epoch": 0.795793499043977, + "grad_norm": 1.9900565147399902, + "learning_rate": 5e-06, + "loss": 0.2058, + "step": 4162 + }, + { + "epoch": 0.7959847036328872, + "grad_norm": 1.1456722021102905, + "learning_rate": 5e-06, + "loss": 0.0785, + "step": 4163 + }, + { + "epoch": 0.7961759082217973, + "grad_norm": 2.3891546726226807, + "learning_rate": 5e-06, + "loss": 0.3063, + "step": 4164 + }, + { + "epoch": 0.7963671128107075, + "grad_norm": 1.7114723920822144, + "learning_rate": 5e-06, + "loss": 0.1208, + "step": 4165 + }, + { + "epoch": 0.7965583173996176, + "grad_norm": 3.918834686279297, + "learning_rate": 5e-06, + "loss": 0.4596, + "step": 4166 + }, + { + "epoch": 0.7967495219885278, + "grad_norm": 1.0303666591644287, + "learning_rate": 5e-06, + "loss": 0.0694, + "step": 4167 + }, + { + "epoch": 0.7969407265774379, + "grad_norm": 1.4666823148727417, + "learning_rate": 5e-06, + "loss": 0.0555, + "step": 4168 + }, + { + "epoch": 0.797131931166348, + "grad_norm": 1.776019811630249, + "learning_rate": 5e-06, + "loss": 0.1054, + "step": 4169 + }, + { + "epoch": 0.7973231357552581, + "grad_norm": 3.0186281204223633, + "learning_rate": 5e-06, + "loss": 0.4206, + "step": 4170 + }, + { + "epoch": 0.7975143403441682, + "grad_norm": 1.698807716369629, + "learning_rate": 5e-06, + "loss": 0.1383, + "step": 4171 + }, + { + "epoch": 0.7977055449330784, + "grad_norm": 3.590824604034424, + "learning_rate": 5e-06, + "loss": 0.5016, + "step": 4172 + }, + { + "epoch": 0.7978967495219885, + "grad_norm": 1.6603933572769165, + "learning_rate": 5e-06, + "loss": 0.1306, + "step": 4173 + }, + { + "epoch": 0.7980879541108986, + "grad_norm": 2.0738401412963867, + "learning_rate": 5e-06, + "loss": 0.1074, + "step": 4174 + }, + { + "epoch": 0.7982791586998088, + "grad_norm": 1.8535012006759644, + "learning_rate": 5e-06, + "loss": 0.1028, + "step": 4175 + }, + { + "epoch": 0.7984703632887189, + "grad_norm": 2.4131641387939453, + "learning_rate": 5e-06, + "loss": 0.3586, + "step": 4176 + }, + { + "epoch": 0.7986615678776291, + "grad_norm": 3.5233068466186523, + "learning_rate": 5e-06, + "loss": 0.6267, + "step": 4177 + }, + { + "epoch": 0.7988527724665392, + "grad_norm": 1.7418206930160522, + "learning_rate": 5e-06, + "loss": 0.1779, + "step": 4178 + }, + { + "epoch": 0.7990439770554494, + "grad_norm": 1.813206672668457, + "learning_rate": 5e-06, + "loss": 0.1225, + "step": 4179 + }, + { + "epoch": 0.7992351816443595, + "grad_norm": 1.5349483489990234, + "learning_rate": 5e-06, + "loss": 0.1117, + "step": 4180 + }, + { + "epoch": 0.7994263862332696, + "grad_norm": 1.7856202125549316, + "learning_rate": 5e-06, + "loss": 0.1138, + "step": 4181 + }, + { + "epoch": 0.7996175908221798, + "grad_norm": 1.9936387538909912, + "learning_rate": 5e-06, + "loss": 0.2355, + "step": 4182 + }, + { + "epoch": 0.7998087954110898, + "grad_norm": 1.6209287643432617, + "learning_rate": 5e-06, + "loss": 0.1824, + "step": 4183 + }, + { + "epoch": 0.8, + "grad_norm": 2.2457656860351562, + "learning_rate": 5e-06, + "loss": 0.3357, + "step": 4184 + }, + { + "epoch": 0.8001912045889101, + "grad_norm": 2.640838861465454, + "learning_rate": 5e-06, + "loss": 0.3455, + "step": 4185 + }, + { + "epoch": 0.8003824091778202, + "grad_norm": 2.3119945526123047, + "learning_rate": 5e-06, + "loss": 0.2474, + "step": 4186 + }, + { + "epoch": 0.8005736137667304, + "grad_norm": 3.9085891246795654, + "learning_rate": 5e-06, + "loss": 0.3996, + "step": 4187 + }, + { + "epoch": 0.8007648183556405, + "grad_norm": 2.8617188930511475, + "learning_rate": 5e-06, + "loss": 0.4351, + "step": 4188 + }, + { + "epoch": 0.8009560229445507, + "grad_norm": 0.908932089805603, + "learning_rate": 5e-06, + "loss": 0.0602, + "step": 4189 + }, + { + "epoch": 0.8011472275334608, + "grad_norm": 1.0495219230651855, + "learning_rate": 5e-06, + "loss": 0.0867, + "step": 4190 + }, + { + "epoch": 0.801338432122371, + "grad_norm": 2.866431951522827, + "learning_rate": 5e-06, + "loss": 0.4275, + "step": 4191 + }, + { + "epoch": 0.8015296367112811, + "grad_norm": 1.4292815923690796, + "learning_rate": 5e-06, + "loss": 0.1008, + "step": 4192 + }, + { + "epoch": 0.8017208413001912, + "grad_norm": 1.7318116426467896, + "learning_rate": 5e-06, + "loss": 0.1159, + "step": 4193 + }, + { + "epoch": 0.8019120458891014, + "grad_norm": 1.6380189657211304, + "learning_rate": 5e-06, + "loss": 0.1392, + "step": 4194 + }, + { + "epoch": 0.8021032504780115, + "grad_norm": 2.122971534729004, + "learning_rate": 5e-06, + "loss": 0.2761, + "step": 4195 + }, + { + "epoch": 0.8022944550669217, + "grad_norm": 2.2278707027435303, + "learning_rate": 5e-06, + "loss": 0.1824, + "step": 4196 + }, + { + "epoch": 0.8024856596558317, + "grad_norm": 1.6206939220428467, + "learning_rate": 5e-06, + "loss": 0.2697, + "step": 4197 + }, + { + "epoch": 0.8026768642447418, + "grad_norm": 2.017160654067993, + "learning_rate": 5e-06, + "loss": 0.1218, + "step": 4198 + }, + { + "epoch": 0.802868068833652, + "grad_norm": 2.2551474571228027, + "learning_rate": 5e-06, + "loss": 0.1287, + "step": 4199 + }, + { + "epoch": 0.8030592734225621, + "grad_norm": 0.9970536231994629, + "learning_rate": 5e-06, + "loss": 0.0528, + "step": 4200 + }, + { + "epoch": 0.8032504780114723, + "grad_norm": 1.8154901266098022, + "learning_rate": 5e-06, + "loss": 0.2272, + "step": 4201 + }, + { + "epoch": 0.8034416826003824, + "grad_norm": 3.230262041091919, + "learning_rate": 5e-06, + "loss": 0.5354, + "step": 4202 + }, + { + "epoch": 0.8036328871892925, + "grad_norm": 2.7765953540802, + "learning_rate": 5e-06, + "loss": 0.247, + "step": 4203 + }, + { + "epoch": 0.8038240917782027, + "grad_norm": 1.6847764253616333, + "learning_rate": 5e-06, + "loss": 0.148, + "step": 4204 + }, + { + "epoch": 0.8040152963671128, + "grad_norm": 1.6412423849105835, + "learning_rate": 5e-06, + "loss": 0.2218, + "step": 4205 + }, + { + "epoch": 0.804206500956023, + "grad_norm": 1.1843851804733276, + "learning_rate": 5e-06, + "loss": 0.0497, + "step": 4206 + }, + { + "epoch": 0.8043977055449331, + "grad_norm": 2.4135520458221436, + "learning_rate": 5e-06, + "loss": 0.3082, + "step": 4207 + }, + { + "epoch": 0.8045889101338433, + "grad_norm": 2.1240572929382324, + "learning_rate": 5e-06, + "loss": 0.3888, + "step": 4208 + }, + { + "epoch": 0.8047801147227533, + "grad_norm": 1.9531352519989014, + "learning_rate": 5e-06, + "loss": 0.183, + "step": 4209 + }, + { + "epoch": 0.8049713193116634, + "grad_norm": 1.7559449672698975, + "learning_rate": 5e-06, + "loss": 0.1177, + "step": 4210 + }, + { + "epoch": 0.8051625239005736, + "grad_norm": 1.8385118246078491, + "learning_rate": 5e-06, + "loss": 0.0969, + "step": 4211 + }, + { + "epoch": 0.8053537284894837, + "grad_norm": 2.0431954860687256, + "learning_rate": 5e-06, + "loss": 0.1699, + "step": 4212 + }, + { + "epoch": 0.8055449330783939, + "grad_norm": 1.8413811922073364, + "learning_rate": 5e-06, + "loss": 0.1222, + "step": 4213 + }, + { + "epoch": 0.805736137667304, + "grad_norm": 2.166095018386841, + "learning_rate": 5e-06, + "loss": 0.2953, + "step": 4214 + }, + { + "epoch": 0.8059273422562141, + "grad_norm": 1.8633686304092407, + "learning_rate": 5e-06, + "loss": 0.1473, + "step": 4215 + }, + { + "epoch": 0.8061185468451243, + "grad_norm": 6.3489861488342285, + "learning_rate": 5e-06, + "loss": 0.417, + "step": 4216 + }, + { + "epoch": 0.8063097514340344, + "grad_norm": 1.1961588859558105, + "learning_rate": 5e-06, + "loss": 0.0894, + "step": 4217 + }, + { + "epoch": 0.8065009560229446, + "grad_norm": 2.456892728805542, + "learning_rate": 5e-06, + "loss": 0.2194, + "step": 4218 + }, + { + "epoch": 0.8066921606118547, + "grad_norm": 1.9836076498031616, + "learning_rate": 5e-06, + "loss": 0.1635, + "step": 4219 + }, + { + "epoch": 0.8068833652007649, + "grad_norm": 2.320347785949707, + "learning_rate": 5e-06, + "loss": 0.3102, + "step": 4220 + }, + { + "epoch": 0.807074569789675, + "grad_norm": 1.4314194917678833, + "learning_rate": 5e-06, + "loss": 0.1227, + "step": 4221 + }, + { + "epoch": 0.807265774378585, + "grad_norm": 2.4354302883148193, + "learning_rate": 5e-06, + "loss": 0.3198, + "step": 4222 + }, + { + "epoch": 0.8074569789674952, + "grad_norm": 3.135202407836914, + "learning_rate": 5e-06, + "loss": 0.2219, + "step": 4223 + }, + { + "epoch": 0.8076481835564053, + "grad_norm": 1.7042864561080933, + "learning_rate": 5e-06, + "loss": 0.1439, + "step": 4224 + }, + { + "epoch": 0.8078393881453155, + "grad_norm": 2.076150417327881, + "learning_rate": 5e-06, + "loss": 0.1322, + "step": 4225 + }, + { + "epoch": 0.8080305927342256, + "grad_norm": 2.117445707321167, + "learning_rate": 5e-06, + "loss": 0.223, + "step": 4226 + }, + { + "epoch": 0.8082217973231357, + "grad_norm": 2.044095993041992, + "learning_rate": 5e-06, + "loss": 0.2822, + "step": 4227 + }, + { + "epoch": 0.8084130019120459, + "grad_norm": 2.046536684036255, + "learning_rate": 5e-06, + "loss": 0.184, + "step": 4228 + }, + { + "epoch": 0.808604206500956, + "grad_norm": 1.4895386695861816, + "learning_rate": 5e-06, + "loss": 0.0805, + "step": 4229 + }, + { + "epoch": 0.8087954110898662, + "grad_norm": 0.9864174723625183, + "learning_rate": 5e-06, + "loss": 0.0903, + "step": 4230 + }, + { + "epoch": 0.8089866156787763, + "grad_norm": 1.7758212089538574, + "learning_rate": 5e-06, + "loss": 0.2118, + "step": 4231 + }, + { + "epoch": 0.8091778202676865, + "grad_norm": 2.2796030044555664, + "learning_rate": 5e-06, + "loss": 0.2636, + "step": 4232 + }, + { + "epoch": 0.8093690248565966, + "grad_norm": 1.5463277101516724, + "learning_rate": 5e-06, + "loss": 0.1616, + "step": 4233 + }, + { + "epoch": 0.8095602294455067, + "grad_norm": 1.383233904838562, + "learning_rate": 5e-06, + "loss": 0.1344, + "step": 4234 + }, + { + "epoch": 0.8097514340344169, + "grad_norm": 1.3084925413131714, + "learning_rate": 5e-06, + "loss": 0.139, + "step": 4235 + }, + { + "epoch": 0.8099426386233269, + "grad_norm": 1.4625215530395508, + "learning_rate": 5e-06, + "loss": 0.1294, + "step": 4236 + }, + { + "epoch": 0.8101338432122371, + "grad_norm": 1.1085083484649658, + "learning_rate": 5e-06, + "loss": 0.0566, + "step": 4237 + }, + { + "epoch": 0.8103250478011472, + "grad_norm": 2.045025110244751, + "learning_rate": 5e-06, + "loss": 0.1817, + "step": 4238 + }, + { + "epoch": 0.8105162523900573, + "grad_norm": 2.7750654220581055, + "learning_rate": 5e-06, + "loss": 0.3498, + "step": 4239 + }, + { + "epoch": 0.8107074569789675, + "grad_norm": 1.906060814857483, + "learning_rate": 5e-06, + "loss": 0.1208, + "step": 4240 + }, + { + "epoch": 0.8108986615678776, + "grad_norm": 3.07199764251709, + "learning_rate": 5e-06, + "loss": 0.3699, + "step": 4241 + }, + { + "epoch": 0.8110898661567878, + "grad_norm": 1.987126350402832, + "learning_rate": 5e-06, + "loss": 0.2444, + "step": 4242 + }, + { + "epoch": 0.8112810707456979, + "grad_norm": 1.435240387916565, + "learning_rate": 5e-06, + "loss": 0.0708, + "step": 4243 + }, + { + "epoch": 0.8114722753346081, + "grad_norm": 1.3612838983535767, + "learning_rate": 5e-06, + "loss": 0.0594, + "step": 4244 + }, + { + "epoch": 0.8116634799235182, + "grad_norm": 2.586632013320923, + "learning_rate": 5e-06, + "loss": 0.3922, + "step": 4245 + }, + { + "epoch": 0.8118546845124283, + "grad_norm": 1.353735327720642, + "learning_rate": 5e-06, + "loss": 0.0882, + "step": 4246 + }, + { + "epoch": 0.8120458891013385, + "grad_norm": 1.7560728788375854, + "learning_rate": 5e-06, + "loss": 0.2315, + "step": 4247 + }, + { + "epoch": 0.8122370936902485, + "grad_norm": 1.8247698545455933, + "learning_rate": 5e-06, + "loss": 0.1895, + "step": 4248 + }, + { + "epoch": 0.8124282982791587, + "grad_norm": 1.327373743057251, + "learning_rate": 5e-06, + "loss": 0.0454, + "step": 4249 + }, + { + "epoch": 0.8126195028680688, + "grad_norm": 1.8750287294387817, + "learning_rate": 5e-06, + "loss": 0.1399, + "step": 4250 + }, + { + "epoch": 0.8128107074569789, + "grad_norm": 2.295496702194214, + "learning_rate": 5e-06, + "loss": 0.2928, + "step": 4251 + }, + { + "epoch": 0.8130019120458891, + "grad_norm": 2.422713041305542, + "learning_rate": 5e-06, + "loss": 0.283, + "step": 4252 + }, + { + "epoch": 0.8131931166347992, + "grad_norm": 1.8091152906417847, + "learning_rate": 5e-06, + "loss": 0.1467, + "step": 4253 + }, + { + "epoch": 0.8133843212237094, + "grad_norm": 1.2801040410995483, + "learning_rate": 5e-06, + "loss": 0.1223, + "step": 4254 + }, + { + "epoch": 0.8135755258126195, + "grad_norm": 1.0623581409454346, + "learning_rate": 5e-06, + "loss": 0.0903, + "step": 4255 + }, + { + "epoch": 0.8137667304015297, + "grad_norm": 2.4953532218933105, + "learning_rate": 5e-06, + "loss": 0.1855, + "step": 4256 + }, + { + "epoch": 0.8139579349904398, + "grad_norm": 2.282299757003784, + "learning_rate": 5e-06, + "loss": 0.4767, + "step": 4257 + }, + { + "epoch": 0.8141491395793499, + "grad_norm": 1.9749537706375122, + "learning_rate": 5e-06, + "loss": 0.1855, + "step": 4258 + }, + { + "epoch": 0.8143403441682601, + "grad_norm": 1.3995250463485718, + "learning_rate": 5e-06, + "loss": 0.1177, + "step": 4259 + }, + { + "epoch": 0.8145315487571702, + "grad_norm": 2.201573133468628, + "learning_rate": 5e-06, + "loss": 0.1921, + "step": 4260 + }, + { + "epoch": 0.8147227533460804, + "grad_norm": 1.34676194190979, + "learning_rate": 5e-06, + "loss": 0.0499, + "step": 4261 + }, + { + "epoch": 0.8149139579349904, + "grad_norm": 1.8636780977249146, + "learning_rate": 5e-06, + "loss": 0.2137, + "step": 4262 + }, + { + "epoch": 0.8151051625239005, + "grad_norm": 1.9577280282974243, + "learning_rate": 5e-06, + "loss": 0.3015, + "step": 4263 + }, + { + "epoch": 0.8152963671128107, + "grad_norm": 2.576096296310425, + "learning_rate": 5e-06, + "loss": 0.2557, + "step": 4264 + }, + { + "epoch": 0.8154875717017208, + "grad_norm": 1.2772091627120972, + "learning_rate": 5e-06, + "loss": 0.1053, + "step": 4265 + }, + { + "epoch": 0.815678776290631, + "grad_norm": 1.6483020782470703, + "learning_rate": 5e-06, + "loss": 0.1241, + "step": 4266 + }, + { + "epoch": 0.8158699808795411, + "grad_norm": 2.198559522628784, + "learning_rate": 5e-06, + "loss": 0.1151, + "step": 4267 + }, + { + "epoch": 0.8160611854684512, + "grad_norm": 0.5627172589302063, + "learning_rate": 5e-06, + "loss": 0.021, + "step": 4268 + }, + { + "epoch": 0.8162523900573614, + "grad_norm": 1.8419286012649536, + "learning_rate": 5e-06, + "loss": 0.1135, + "step": 4269 + }, + { + "epoch": 0.8164435946462715, + "grad_norm": 2.3597888946533203, + "learning_rate": 5e-06, + "loss": 0.5168, + "step": 4270 + }, + { + "epoch": 0.8166347992351817, + "grad_norm": 1.2093944549560547, + "learning_rate": 5e-06, + "loss": 0.1298, + "step": 4271 + }, + { + "epoch": 0.8168260038240918, + "grad_norm": 2.0463786125183105, + "learning_rate": 5e-06, + "loss": 0.2085, + "step": 4272 + }, + { + "epoch": 0.817017208413002, + "grad_norm": 2.7457947731018066, + "learning_rate": 5e-06, + "loss": 0.246, + "step": 4273 + }, + { + "epoch": 0.817208413001912, + "grad_norm": 1.0748988389968872, + "learning_rate": 5e-06, + "loss": 0.0544, + "step": 4274 + }, + { + "epoch": 0.8173996175908221, + "grad_norm": 3.576910972595215, + "learning_rate": 5e-06, + "loss": 0.3724, + "step": 4275 + }, + { + "epoch": 0.8175908221797323, + "grad_norm": 2.98368763923645, + "learning_rate": 5e-06, + "loss": 0.5464, + "step": 4276 + }, + { + "epoch": 0.8177820267686424, + "grad_norm": 2.850346565246582, + "learning_rate": 5e-06, + "loss": 0.4558, + "step": 4277 + }, + { + "epoch": 0.8179732313575526, + "grad_norm": 2.513153553009033, + "learning_rate": 5e-06, + "loss": 0.2441, + "step": 4278 + }, + { + "epoch": 0.8181644359464627, + "grad_norm": 0.7250884175300598, + "learning_rate": 5e-06, + "loss": 0.0649, + "step": 4279 + }, + { + "epoch": 0.8183556405353728, + "grad_norm": 1.4739652872085571, + "learning_rate": 5e-06, + "loss": 0.1276, + "step": 4280 + }, + { + "epoch": 0.818546845124283, + "grad_norm": 2.511843681335449, + "learning_rate": 5e-06, + "loss": 0.1925, + "step": 4281 + }, + { + "epoch": 0.8187380497131931, + "grad_norm": 2.0406453609466553, + "learning_rate": 5e-06, + "loss": 0.3064, + "step": 4282 + }, + { + "epoch": 0.8189292543021033, + "grad_norm": 2.142224073410034, + "learning_rate": 5e-06, + "loss": 0.2985, + "step": 4283 + }, + { + "epoch": 0.8191204588910134, + "grad_norm": 1.9565352201461792, + "learning_rate": 5e-06, + "loss": 0.2398, + "step": 4284 + }, + { + "epoch": 0.8193116634799236, + "grad_norm": 1.1692537069320679, + "learning_rate": 5e-06, + "loss": 0.1201, + "step": 4285 + }, + { + "epoch": 0.8195028680688337, + "grad_norm": 0.9735251665115356, + "learning_rate": 5e-06, + "loss": 0.0589, + "step": 4286 + }, + { + "epoch": 0.8196940726577437, + "grad_norm": 1.0305509567260742, + "learning_rate": 5e-06, + "loss": 0.0485, + "step": 4287 + }, + { + "epoch": 0.8198852772466539, + "grad_norm": 2.014246940612793, + "learning_rate": 5e-06, + "loss": 0.2368, + "step": 4288 + }, + { + "epoch": 0.820076481835564, + "grad_norm": 3.3924779891967773, + "learning_rate": 5e-06, + "loss": 0.381, + "step": 4289 + }, + { + "epoch": 0.8202676864244742, + "grad_norm": 2.973250389099121, + "learning_rate": 5e-06, + "loss": 0.4036, + "step": 4290 + }, + { + "epoch": 0.8204588910133843, + "grad_norm": 1.6813417673110962, + "learning_rate": 5e-06, + "loss": 0.1751, + "step": 4291 + }, + { + "epoch": 0.8206500956022944, + "grad_norm": 1.9835141897201538, + "learning_rate": 5e-06, + "loss": 0.1582, + "step": 4292 + }, + { + "epoch": 0.8208413001912046, + "grad_norm": 2.34795880317688, + "learning_rate": 5e-06, + "loss": 0.2726, + "step": 4293 + }, + { + "epoch": 0.8210325047801147, + "grad_norm": 2.353971004486084, + "learning_rate": 5e-06, + "loss": 0.2274, + "step": 4294 + }, + { + "epoch": 0.8212237093690249, + "grad_norm": 2.977048635482788, + "learning_rate": 5e-06, + "loss": 0.4693, + "step": 4295 + }, + { + "epoch": 0.821414913957935, + "grad_norm": 1.9630651473999023, + "learning_rate": 5e-06, + "loss": 0.1426, + "step": 4296 + }, + { + "epoch": 0.8216061185468452, + "grad_norm": 1.8660008907318115, + "learning_rate": 5e-06, + "loss": 0.1312, + "step": 4297 + }, + { + "epoch": 0.8217973231357553, + "grad_norm": 2.163969039916992, + "learning_rate": 5e-06, + "loss": 0.1436, + "step": 4298 + }, + { + "epoch": 0.8219885277246654, + "grad_norm": 1.4228352308273315, + "learning_rate": 5e-06, + "loss": 0.1172, + "step": 4299 + }, + { + "epoch": 0.8221797323135756, + "grad_norm": 2.8689212799072266, + "learning_rate": 5e-06, + "loss": 0.3061, + "step": 4300 + }, + { + "epoch": 0.8223709369024856, + "grad_norm": 2.3054051399230957, + "learning_rate": 5e-06, + "loss": 0.3133, + "step": 4301 + }, + { + "epoch": 0.8225621414913958, + "grad_norm": 2.6142942905426025, + "learning_rate": 5e-06, + "loss": 0.5324, + "step": 4302 + }, + { + "epoch": 0.8227533460803059, + "grad_norm": 1.669835090637207, + "learning_rate": 5e-06, + "loss": 0.1098, + "step": 4303 + }, + { + "epoch": 0.822944550669216, + "grad_norm": 1.9401994943618774, + "learning_rate": 5e-06, + "loss": 0.2461, + "step": 4304 + }, + { + "epoch": 0.8231357552581262, + "grad_norm": 3.1794354915618896, + "learning_rate": 5e-06, + "loss": 0.3799, + "step": 4305 + }, + { + "epoch": 0.8233269598470363, + "grad_norm": 2.4734315872192383, + "learning_rate": 5e-06, + "loss": 0.0708, + "step": 4306 + }, + { + "epoch": 0.8235181644359465, + "grad_norm": 2.165508508682251, + "learning_rate": 5e-06, + "loss": 0.1704, + "step": 4307 + }, + { + "epoch": 0.8237093690248566, + "grad_norm": 3.454468250274658, + "learning_rate": 5e-06, + "loss": 0.4224, + "step": 4308 + }, + { + "epoch": 0.8239005736137668, + "grad_norm": 1.4881060123443604, + "learning_rate": 5e-06, + "loss": 0.1237, + "step": 4309 + }, + { + "epoch": 0.8240917782026769, + "grad_norm": 2.7141613960266113, + "learning_rate": 5e-06, + "loss": 0.4328, + "step": 4310 + }, + { + "epoch": 0.824282982791587, + "grad_norm": 1.3264292478561401, + "learning_rate": 5e-06, + "loss": 0.0853, + "step": 4311 + }, + { + "epoch": 0.8244741873804972, + "grad_norm": 1.4157055616378784, + "learning_rate": 5e-06, + "loss": 0.0634, + "step": 4312 + }, + { + "epoch": 0.8246653919694072, + "grad_norm": 2.11893630027771, + "learning_rate": 5e-06, + "loss": 0.2645, + "step": 4313 + }, + { + "epoch": 0.8248565965583174, + "grad_norm": 2.659165620803833, + "learning_rate": 5e-06, + "loss": 0.4391, + "step": 4314 + }, + { + "epoch": 0.8250478011472275, + "grad_norm": 3.263123035430908, + "learning_rate": 5e-06, + "loss": 0.1809, + "step": 4315 + }, + { + "epoch": 0.8252390057361376, + "grad_norm": 1.4275321960449219, + "learning_rate": 5e-06, + "loss": 0.1575, + "step": 4316 + }, + { + "epoch": 0.8254302103250478, + "grad_norm": 1.7727454900741577, + "learning_rate": 5e-06, + "loss": 0.0899, + "step": 4317 + }, + { + "epoch": 0.8256214149139579, + "grad_norm": 2.1350901126861572, + "learning_rate": 5e-06, + "loss": 0.1986, + "step": 4318 + }, + { + "epoch": 0.8258126195028681, + "grad_norm": 3.224039316177368, + "learning_rate": 5e-06, + "loss": 0.2131, + "step": 4319 + }, + { + "epoch": 0.8260038240917782, + "grad_norm": 2.624913215637207, + "learning_rate": 5e-06, + "loss": 0.4338, + "step": 4320 + }, + { + "epoch": 0.8261950286806883, + "grad_norm": 1.3268989324569702, + "learning_rate": 5e-06, + "loss": 0.0897, + "step": 4321 + }, + { + "epoch": 0.8263862332695985, + "grad_norm": 1.9256988763809204, + "learning_rate": 5e-06, + "loss": 0.177, + "step": 4322 + }, + { + "epoch": 0.8265774378585086, + "grad_norm": 3.2375950813293457, + "learning_rate": 5e-06, + "loss": 0.2375, + "step": 4323 + }, + { + "epoch": 0.8267686424474188, + "grad_norm": 3.528641700744629, + "learning_rate": 5e-06, + "loss": 0.3476, + "step": 4324 + }, + { + "epoch": 0.8269598470363289, + "grad_norm": 5.507506847381592, + "learning_rate": 5e-06, + "loss": 0.3687, + "step": 4325 + }, + { + "epoch": 0.827151051625239, + "grad_norm": 2.442595958709717, + "learning_rate": 5e-06, + "loss": 0.3488, + "step": 4326 + }, + { + "epoch": 0.8273422562141491, + "grad_norm": 4.454098224639893, + "learning_rate": 5e-06, + "loss": 0.1144, + "step": 4327 + }, + { + "epoch": 0.8275334608030592, + "grad_norm": 2.602275848388672, + "learning_rate": 5e-06, + "loss": 0.3673, + "step": 4328 + }, + { + "epoch": 0.8277246653919694, + "grad_norm": 1.1066206693649292, + "learning_rate": 5e-06, + "loss": 0.0566, + "step": 4329 + }, + { + "epoch": 0.8279158699808795, + "grad_norm": 2.778461217880249, + "learning_rate": 5e-06, + "loss": 0.1148, + "step": 4330 + }, + { + "epoch": 0.8281070745697897, + "grad_norm": 1.5432476997375488, + "learning_rate": 5e-06, + "loss": 0.1084, + "step": 4331 + }, + { + "epoch": 0.8282982791586998, + "grad_norm": 1.3616504669189453, + "learning_rate": 5e-06, + "loss": 0.1148, + "step": 4332 + }, + { + "epoch": 0.8284894837476099, + "grad_norm": 2.906709671020508, + "learning_rate": 5e-06, + "loss": 0.2886, + "step": 4333 + }, + { + "epoch": 0.8286806883365201, + "grad_norm": 3.554781913757324, + "learning_rate": 5e-06, + "loss": 0.1041, + "step": 4334 + }, + { + "epoch": 0.8288718929254302, + "grad_norm": 2.459982395172119, + "learning_rate": 5e-06, + "loss": 0.1131, + "step": 4335 + }, + { + "epoch": 0.8290630975143404, + "grad_norm": 2.0762665271759033, + "learning_rate": 5e-06, + "loss": 0.0706, + "step": 4336 + }, + { + "epoch": 0.8292543021032505, + "grad_norm": 1.5950919389724731, + "learning_rate": 5e-06, + "loss": 0.0903, + "step": 4337 + }, + { + "epoch": 0.8294455066921607, + "grad_norm": 2.4426164627075195, + "learning_rate": 5e-06, + "loss": 0.4334, + "step": 4338 + }, + { + "epoch": 0.8296367112810707, + "grad_norm": 2.9879558086395264, + "learning_rate": 5e-06, + "loss": 0.5294, + "step": 4339 + }, + { + "epoch": 0.8298279158699808, + "grad_norm": 1.552777886390686, + "learning_rate": 5e-06, + "loss": 0.1626, + "step": 4340 + }, + { + "epoch": 0.830019120458891, + "grad_norm": 2.607077121734619, + "learning_rate": 5e-06, + "loss": 0.1706, + "step": 4341 + }, + { + "epoch": 0.8302103250478011, + "grad_norm": 1.6359832286834717, + "learning_rate": 5e-06, + "loss": 0.079, + "step": 4342 + }, + { + "epoch": 0.8304015296367113, + "grad_norm": 1.792439341545105, + "learning_rate": 5e-06, + "loss": 0.1382, + "step": 4343 + }, + { + "epoch": 0.8305927342256214, + "grad_norm": 1.6875145435333252, + "learning_rate": 5e-06, + "loss": 0.0992, + "step": 4344 + }, + { + "epoch": 0.8307839388145315, + "grad_norm": 1.9896464347839355, + "learning_rate": 5e-06, + "loss": 0.2902, + "step": 4345 + }, + { + "epoch": 0.8309751434034417, + "grad_norm": 1.053695797920227, + "learning_rate": 5e-06, + "loss": 0.1258, + "step": 4346 + }, + { + "epoch": 0.8311663479923518, + "grad_norm": 2.7914059162139893, + "learning_rate": 5e-06, + "loss": 0.3776, + "step": 4347 + }, + { + "epoch": 0.831357552581262, + "grad_norm": 2.2700257301330566, + "learning_rate": 5e-06, + "loss": 0.2601, + "step": 4348 + }, + { + "epoch": 0.8315487571701721, + "grad_norm": 2.1097259521484375, + "learning_rate": 5e-06, + "loss": 0.1924, + "step": 4349 + }, + { + "epoch": 0.8317399617590823, + "grad_norm": 2.5981993675231934, + "learning_rate": 5e-06, + "loss": 0.1349, + "step": 4350 + }, + { + "epoch": 0.8319311663479924, + "grad_norm": 2.368746280670166, + "learning_rate": 5e-06, + "loss": 0.3906, + "step": 4351 + }, + { + "epoch": 0.8321223709369024, + "grad_norm": 3.390716075897217, + "learning_rate": 5e-06, + "loss": 0.3522, + "step": 4352 + }, + { + "epoch": 0.8323135755258126, + "grad_norm": 2.0879626274108887, + "learning_rate": 5e-06, + "loss": 0.3756, + "step": 4353 + }, + { + "epoch": 0.8325047801147227, + "grad_norm": 1.396887183189392, + "learning_rate": 5e-06, + "loss": 0.1102, + "step": 4354 + }, + { + "epoch": 0.8326959847036329, + "grad_norm": 1.5137981176376343, + "learning_rate": 5e-06, + "loss": 0.0845, + "step": 4355 + }, + { + "epoch": 0.832887189292543, + "grad_norm": 2.094395637512207, + "learning_rate": 5e-06, + "loss": 0.127, + "step": 4356 + }, + { + "epoch": 0.8330783938814531, + "grad_norm": 2.304523229598999, + "learning_rate": 5e-06, + "loss": 0.309, + "step": 4357 + }, + { + "epoch": 0.8332695984703633, + "grad_norm": 1.592523455619812, + "learning_rate": 5e-06, + "loss": 0.2083, + "step": 4358 + }, + { + "epoch": 0.8334608030592734, + "grad_norm": 1.6386781930923462, + "learning_rate": 5e-06, + "loss": 0.1019, + "step": 4359 + }, + { + "epoch": 0.8336520076481836, + "grad_norm": 1.4175693988800049, + "learning_rate": 5e-06, + "loss": 0.0789, + "step": 4360 + }, + { + "epoch": 0.8338432122370937, + "grad_norm": 1.1679911613464355, + "learning_rate": 5e-06, + "loss": 0.068, + "step": 4361 + }, + { + "epoch": 0.8340344168260039, + "grad_norm": 2.1180410385131836, + "learning_rate": 5e-06, + "loss": 0.1132, + "step": 4362 + }, + { + "epoch": 0.834225621414914, + "grad_norm": 1.151820182800293, + "learning_rate": 5e-06, + "loss": 0.094, + "step": 4363 + }, + { + "epoch": 0.834416826003824, + "grad_norm": 2.254429817199707, + "learning_rate": 5e-06, + "loss": 0.3339, + "step": 4364 + }, + { + "epoch": 0.8346080305927343, + "grad_norm": 1.0687077045440674, + "learning_rate": 5e-06, + "loss": 0.1122, + "step": 4365 + }, + { + "epoch": 0.8347992351816443, + "grad_norm": 3.133803129196167, + "learning_rate": 5e-06, + "loss": 0.32, + "step": 4366 + }, + { + "epoch": 0.8349904397705545, + "grad_norm": 2.1867737770080566, + "learning_rate": 5e-06, + "loss": 0.2895, + "step": 4367 + }, + { + "epoch": 0.8351816443594646, + "grad_norm": 1.6033610105514526, + "learning_rate": 5e-06, + "loss": 0.2681, + "step": 4368 + }, + { + "epoch": 0.8353728489483747, + "grad_norm": 1.4587613344192505, + "learning_rate": 5e-06, + "loss": 0.1197, + "step": 4369 + }, + { + "epoch": 0.8355640535372849, + "grad_norm": 2.3070292472839355, + "learning_rate": 5e-06, + "loss": 0.5036, + "step": 4370 + }, + { + "epoch": 0.835755258126195, + "grad_norm": 1.722882628440857, + "learning_rate": 5e-06, + "loss": 0.2494, + "step": 4371 + }, + { + "epoch": 0.8359464627151052, + "grad_norm": 1.6158150434494019, + "learning_rate": 5e-06, + "loss": 0.1648, + "step": 4372 + }, + { + "epoch": 0.8361376673040153, + "grad_norm": 1.1540052890777588, + "learning_rate": 5e-06, + "loss": 0.0811, + "step": 4373 + }, + { + "epoch": 0.8363288718929255, + "grad_norm": 0.9952983260154724, + "learning_rate": 5e-06, + "loss": 0.0439, + "step": 4374 + }, + { + "epoch": 0.8365200764818356, + "grad_norm": 1.722088098526001, + "learning_rate": 5e-06, + "loss": 0.0845, + "step": 4375 + }, + { + "epoch": 0.8367112810707457, + "grad_norm": 1.8059070110321045, + "learning_rate": 5e-06, + "loss": 0.3103, + "step": 4376 + }, + { + "epoch": 0.8369024856596559, + "grad_norm": 1.7783750295639038, + "learning_rate": 5e-06, + "loss": 0.2965, + "step": 4377 + }, + { + "epoch": 0.837093690248566, + "grad_norm": 1.6145813465118408, + "learning_rate": 5e-06, + "loss": 0.1776, + "step": 4378 + }, + { + "epoch": 0.8372848948374761, + "grad_norm": 3.1882593631744385, + "learning_rate": 5e-06, + "loss": 0.3732, + "step": 4379 + }, + { + "epoch": 0.8374760994263862, + "grad_norm": 1.8107242584228516, + "learning_rate": 5e-06, + "loss": 0.2011, + "step": 4380 + }, + { + "epoch": 0.8376673040152963, + "grad_norm": 2.4885756969451904, + "learning_rate": 5e-06, + "loss": 0.1237, + "step": 4381 + }, + { + "epoch": 0.8378585086042065, + "grad_norm": 3.0721991062164307, + "learning_rate": 5e-06, + "loss": 0.2544, + "step": 4382 + }, + { + "epoch": 0.8380497131931166, + "grad_norm": 2.654618740081787, + "learning_rate": 5e-06, + "loss": 0.3133, + "step": 4383 + }, + { + "epoch": 0.8382409177820268, + "grad_norm": 1.3207848072052002, + "learning_rate": 5e-06, + "loss": 0.2198, + "step": 4384 + }, + { + "epoch": 0.8384321223709369, + "grad_norm": 1.5825202465057373, + "learning_rate": 5e-06, + "loss": 0.1373, + "step": 4385 + }, + { + "epoch": 0.838623326959847, + "grad_norm": 3.0859808921813965, + "learning_rate": 5e-06, + "loss": 0.2396, + "step": 4386 + }, + { + "epoch": 0.8388145315487572, + "grad_norm": 1.6806787252426147, + "learning_rate": 5e-06, + "loss": 0.0929, + "step": 4387 + }, + { + "epoch": 0.8390057361376673, + "grad_norm": 2.5619802474975586, + "learning_rate": 5e-06, + "loss": 0.164, + "step": 4388 + }, + { + "epoch": 0.8391969407265775, + "grad_norm": 2.1878180503845215, + "learning_rate": 5e-06, + "loss": 0.2702, + "step": 4389 + }, + { + "epoch": 0.8393881453154876, + "grad_norm": 1.7535722255706787, + "learning_rate": 5e-06, + "loss": 0.1757, + "step": 4390 + }, + { + "epoch": 0.8395793499043978, + "grad_norm": 2.350954055786133, + "learning_rate": 5e-06, + "loss": 0.1784, + "step": 4391 + }, + { + "epoch": 0.8397705544933078, + "grad_norm": 1.941375494003296, + "learning_rate": 5e-06, + "loss": 0.0602, + "step": 4392 + }, + { + "epoch": 0.8399617590822179, + "grad_norm": 0.9962285161018372, + "learning_rate": 5e-06, + "loss": 0.0257, + "step": 4393 + }, + { + "epoch": 0.8401529636711281, + "grad_norm": 1.4986315965652466, + "learning_rate": 5e-06, + "loss": 0.0806, + "step": 4394 + }, + { + "epoch": 0.8403441682600382, + "grad_norm": 2.466259717941284, + "learning_rate": 5e-06, + "loss": 0.1361, + "step": 4395 + }, + { + "epoch": 0.8405353728489484, + "grad_norm": 1.511971116065979, + "learning_rate": 5e-06, + "loss": 0.146, + "step": 4396 + }, + { + "epoch": 0.8407265774378585, + "grad_norm": 3.6628222465515137, + "learning_rate": 5e-06, + "loss": 0.4908, + "step": 4397 + }, + { + "epoch": 0.8409177820267686, + "grad_norm": 1.4136232137680054, + "learning_rate": 5e-06, + "loss": 0.1436, + "step": 4398 + }, + { + "epoch": 0.8411089866156788, + "grad_norm": 1.857986569404602, + "learning_rate": 5e-06, + "loss": 0.1274, + "step": 4399 + }, + { + "epoch": 0.8413001912045889, + "grad_norm": 1.0941559076309204, + "learning_rate": 5e-06, + "loss": 0.0448, + "step": 4400 + }, + { + "epoch": 0.8414913957934991, + "grad_norm": 2.9076249599456787, + "learning_rate": 5e-06, + "loss": 0.4855, + "step": 4401 + }, + { + "epoch": 0.8416826003824092, + "grad_norm": 2.237715721130371, + "learning_rate": 5e-06, + "loss": 0.2907, + "step": 4402 + }, + { + "epoch": 0.8418738049713194, + "grad_norm": 1.244497537612915, + "learning_rate": 5e-06, + "loss": 0.0821, + "step": 4403 + }, + { + "epoch": 0.8420650095602294, + "grad_norm": 1.071437120437622, + "learning_rate": 5e-06, + "loss": 0.1103, + "step": 4404 + }, + { + "epoch": 0.8422562141491395, + "grad_norm": 1.5743379592895508, + "learning_rate": 5e-06, + "loss": 0.0687, + "step": 4405 + }, + { + "epoch": 0.8424474187380497, + "grad_norm": 2.162822723388672, + "learning_rate": 5e-06, + "loss": 0.1724, + "step": 4406 + }, + { + "epoch": 0.8426386233269598, + "grad_norm": 1.7346101999282837, + "learning_rate": 5e-06, + "loss": 0.2139, + "step": 4407 + }, + { + "epoch": 0.84282982791587, + "grad_norm": 1.8639531135559082, + "learning_rate": 5e-06, + "loss": 0.2507, + "step": 4408 + }, + { + "epoch": 0.8430210325047801, + "grad_norm": 3.444169044494629, + "learning_rate": 5e-06, + "loss": 0.4141, + "step": 4409 + }, + { + "epoch": 0.8432122370936902, + "grad_norm": 2.168269395828247, + "learning_rate": 5e-06, + "loss": 0.2911, + "step": 4410 + }, + { + "epoch": 0.8434034416826004, + "grad_norm": 1.8232005834579468, + "learning_rate": 5e-06, + "loss": 0.1244, + "step": 4411 + }, + { + "epoch": 0.8435946462715105, + "grad_norm": 2.408576488494873, + "learning_rate": 5e-06, + "loss": 0.121, + "step": 4412 + }, + { + "epoch": 0.8437858508604207, + "grad_norm": 3.882044792175293, + "learning_rate": 5e-06, + "loss": 0.3033, + "step": 4413 + }, + { + "epoch": 0.8439770554493308, + "grad_norm": 1.6000193357467651, + "learning_rate": 5e-06, + "loss": 0.1496, + "step": 4414 + }, + { + "epoch": 0.844168260038241, + "grad_norm": 1.694844126701355, + "learning_rate": 5e-06, + "loss": 0.1392, + "step": 4415 + }, + { + "epoch": 0.8443594646271511, + "grad_norm": 2.597773551940918, + "learning_rate": 5e-06, + "loss": 0.4699, + "step": 4416 + }, + { + "epoch": 0.8445506692160611, + "grad_norm": 1.8982315063476562, + "learning_rate": 5e-06, + "loss": 0.0863, + "step": 4417 + }, + { + "epoch": 0.8447418738049713, + "grad_norm": 2.2443645000457764, + "learning_rate": 5e-06, + "loss": 0.0798, + "step": 4418 + }, + { + "epoch": 0.8449330783938814, + "grad_norm": 1.6703152656555176, + "learning_rate": 5e-06, + "loss": 0.117, + "step": 4419 + }, + { + "epoch": 0.8451242829827916, + "grad_norm": 2.743288993835449, + "learning_rate": 5e-06, + "loss": 0.3322, + "step": 4420 + }, + { + "epoch": 0.8453154875717017, + "grad_norm": 1.3997440338134766, + "learning_rate": 5e-06, + "loss": 0.1167, + "step": 4421 + }, + { + "epoch": 0.8455066921606118, + "grad_norm": 1.7278668880462646, + "learning_rate": 5e-06, + "loss": 0.2425, + "step": 4422 + }, + { + "epoch": 0.845697896749522, + "grad_norm": 2.021827459335327, + "learning_rate": 5e-06, + "loss": 0.1857, + "step": 4423 + }, + { + "epoch": 0.8458891013384321, + "grad_norm": 1.941655158996582, + "learning_rate": 5e-06, + "loss": 0.2009, + "step": 4424 + }, + { + "epoch": 0.8460803059273423, + "grad_norm": 2.000415563583374, + "learning_rate": 5e-06, + "loss": 0.1906, + "step": 4425 + }, + { + "epoch": 0.8462715105162524, + "grad_norm": 2.7244679927825928, + "learning_rate": 5e-06, + "loss": 0.3982, + "step": 4426 + }, + { + "epoch": 0.8464627151051626, + "grad_norm": 3.001358985900879, + "learning_rate": 5e-06, + "loss": 0.5675, + "step": 4427 + }, + { + "epoch": 0.8466539196940727, + "grad_norm": 1.9397376775741577, + "learning_rate": 5e-06, + "loss": 0.3761, + "step": 4428 + }, + { + "epoch": 0.8468451242829828, + "grad_norm": 2.951451539993286, + "learning_rate": 5e-06, + "loss": 0.3985, + "step": 4429 + }, + { + "epoch": 0.847036328871893, + "grad_norm": 0.8639160990715027, + "learning_rate": 5e-06, + "loss": 0.0371, + "step": 4430 + }, + { + "epoch": 0.847227533460803, + "grad_norm": 1.7857511043548584, + "learning_rate": 5e-06, + "loss": 0.0694, + "step": 4431 + }, + { + "epoch": 0.8474187380497132, + "grad_norm": 1.9648922681808472, + "learning_rate": 5e-06, + "loss": 0.2294, + "step": 4432 + }, + { + "epoch": 0.8476099426386233, + "grad_norm": 1.5596647262573242, + "learning_rate": 5e-06, + "loss": 0.2072, + "step": 4433 + }, + { + "epoch": 0.8478011472275334, + "grad_norm": 1.354530692100525, + "learning_rate": 5e-06, + "loss": 0.0821, + "step": 4434 + }, + { + "epoch": 0.8479923518164436, + "grad_norm": 2.014345407485962, + "learning_rate": 5e-06, + "loss": 0.1577, + "step": 4435 + }, + { + "epoch": 0.8481835564053537, + "grad_norm": 0.9619423747062683, + "learning_rate": 5e-06, + "loss": 0.0551, + "step": 4436 + }, + { + "epoch": 0.8483747609942639, + "grad_norm": 2.9514081478118896, + "learning_rate": 5e-06, + "loss": 0.2785, + "step": 4437 + }, + { + "epoch": 0.848565965583174, + "grad_norm": 3.460991859436035, + "learning_rate": 5e-06, + "loss": 0.5416, + "step": 4438 + }, + { + "epoch": 0.8487571701720842, + "grad_norm": 1.313177227973938, + "learning_rate": 5e-06, + "loss": 0.1352, + "step": 4439 + }, + { + "epoch": 0.8489483747609943, + "grad_norm": 2.4580676555633545, + "learning_rate": 5e-06, + "loss": 0.3445, + "step": 4440 + }, + { + "epoch": 0.8491395793499044, + "grad_norm": 1.3704016208648682, + "learning_rate": 5e-06, + "loss": 0.0929, + "step": 4441 + }, + { + "epoch": 0.8493307839388146, + "grad_norm": 1.3280479907989502, + "learning_rate": 5e-06, + "loss": 0.0958, + "step": 4442 + }, + { + "epoch": 0.8495219885277246, + "grad_norm": 2.1530704498291016, + "learning_rate": 5e-06, + "loss": 0.1695, + "step": 4443 + }, + { + "epoch": 0.8497131931166348, + "grad_norm": 1.4062306880950928, + "learning_rate": 5e-06, + "loss": 0.0833, + "step": 4444 + }, + { + "epoch": 0.8499043977055449, + "grad_norm": 1.9014475345611572, + "learning_rate": 5e-06, + "loss": 0.2039, + "step": 4445 + }, + { + "epoch": 0.850095602294455, + "grad_norm": 1.7016692161560059, + "learning_rate": 5e-06, + "loss": 0.0979, + "step": 4446 + }, + { + "epoch": 0.8502868068833652, + "grad_norm": 1.8992486000061035, + "learning_rate": 5e-06, + "loss": 0.2124, + "step": 4447 + }, + { + "epoch": 0.8504780114722753, + "grad_norm": 2.011763095855713, + "learning_rate": 5e-06, + "loss": 0.1776, + "step": 4448 + }, + { + "epoch": 0.8506692160611855, + "grad_norm": 0.7455622553825378, + "learning_rate": 5e-06, + "loss": 0.0826, + "step": 4449 + }, + { + "epoch": 0.8508604206500956, + "grad_norm": 1.7753809690475464, + "learning_rate": 5e-06, + "loss": 0.1413, + "step": 4450 + }, + { + "epoch": 0.8510516252390057, + "grad_norm": 1.924593448638916, + "learning_rate": 5e-06, + "loss": 0.3003, + "step": 4451 + }, + { + "epoch": 0.8512428298279159, + "grad_norm": 1.3443734645843506, + "learning_rate": 5e-06, + "loss": 0.1168, + "step": 4452 + }, + { + "epoch": 0.851434034416826, + "grad_norm": 1.712401032447815, + "learning_rate": 5e-06, + "loss": 0.1064, + "step": 4453 + }, + { + "epoch": 0.8516252390057362, + "grad_norm": 2.6731860637664795, + "learning_rate": 5e-06, + "loss": 0.1451, + "step": 4454 + }, + { + "epoch": 0.8518164435946463, + "grad_norm": 1.2535629272460938, + "learning_rate": 5e-06, + "loss": 0.1456, + "step": 4455 + }, + { + "epoch": 0.8520076481835565, + "grad_norm": 1.3341909646987915, + "learning_rate": 5e-06, + "loss": 0.0821, + "step": 4456 + }, + { + "epoch": 0.8521988527724665, + "grad_norm": 2.0949151515960693, + "learning_rate": 5e-06, + "loss": 0.3415, + "step": 4457 + }, + { + "epoch": 0.8523900573613766, + "grad_norm": 2.8425564765930176, + "learning_rate": 5e-06, + "loss": 0.6229, + "step": 4458 + }, + { + "epoch": 0.8525812619502868, + "grad_norm": 2.830179214477539, + "learning_rate": 5e-06, + "loss": 0.4966, + "step": 4459 + }, + { + "epoch": 0.8527724665391969, + "grad_norm": 3.527388334274292, + "learning_rate": 5e-06, + "loss": 0.5026, + "step": 4460 + }, + { + "epoch": 0.8529636711281071, + "grad_norm": 1.1000351905822754, + "learning_rate": 5e-06, + "loss": 0.0993, + "step": 4461 + }, + { + "epoch": 0.8531548757170172, + "grad_norm": 1.7830493450164795, + "learning_rate": 5e-06, + "loss": 0.0735, + "step": 4462 + }, + { + "epoch": 0.8533460803059273, + "grad_norm": 2.3873448371887207, + "learning_rate": 5e-06, + "loss": 0.271, + "step": 4463 + }, + { + "epoch": 0.8535372848948375, + "grad_norm": 2.0163981914520264, + "learning_rate": 5e-06, + "loss": 0.1629, + "step": 4464 + }, + { + "epoch": 0.8537284894837476, + "grad_norm": 1.701276183128357, + "learning_rate": 5e-06, + "loss": 0.2418, + "step": 4465 + }, + { + "epoch": 0.8539196940726578, + "grad_norm": 1.9887462854385376, + "learning_rate": 5e-06, + "loss": 0.2069, + "step": 4466 + }, + { + "epoch": 0.8541108986615679, + "grad_norm": 1.4311776161193848, + "learning_rate": 5e-06, + "loss": 0.0844, + "step": 4467 + }, + { + "epoch": 0.8543021032504781, + "grad_norm": 1.4655697345733643, + "learning_rate": 5e-06, + "loss": 0.1087, + "step": 4468 + }, + { + "epoch": 0.8544933078393881, + "grad_norm": 2.374417781829834, + "learning_rate": 5e-06, + "loss": 0.1252, + "step": 4469 + }, + { + "epoch": 0.8546845124282982, + "grad_norm": 1.744074821472168, + "learning_rate": 5e-06, + "loss": 0.1841, + "step": 4470 + }, + { + "epoch": 0.8548757170172084, + "grad_norm": 1.7160265445709229, + "learning_rate": 5e-06, + "loss": 0.1687, + "step": 4471 + }, + { + "epoch": 0.8550669216061185, + "grad_norm": 2.1769425868988037, + "learning_rate": 5e-06, + "loss": 0.2596, + "step": 4472 + }, + { + "epoch": 0.8552581261950287, + "grad_norm": 2.5814943313598633, + "learning_rate": 5e-06, + "loss": 0.2571, + "step": 4473 + }, + { + "epoch": 0.8554493307839388, + "grad_norm": 1.3457120656967163, + "learning_rate": 5e-06, + "loss": 0.1145, + "step": 4474 + }, + { + "epoch": 0.8556405353728489, + "grad_norm": 1.8106434345245361, + "learning_rate": 5e-06, + "loss": 0.1167, + "step": 4475 + }, + { + "epoch": 0.8558317399617591, + "grad_norm": 1.8618403673171997, + "learning_rate": 5e-06, + "loss": 0.1876, + "step": 4476 + }, + { + "epoch": 0.8560229445506692, + "grad_norm": 1.3731857538223267, + "learning_rate": 5e-06, + "loss": 0.1171, + "step": 4477 + }, + { + "epoch": 0.8562141491395794, + "grad_norm": 3.5358798503875732, + "learning_rate": 5e-06, + "loss": 0.1766, + "step": 4478 + }, + { + "epoch": 0.8564053537284895, + "grad_norm": 2.0287272930145264, + "learning_rate": 5e-06, + "loss": 0.1085, + "step": 4479 + }, + { + "epoch": 0.8565965583173997, + "grad_norm": 1.2044274806976318, + "learning_rate": 5e-06, + "loss": 0.0622, + "step": 4480 + }, + { + "epoch": 0.8567877629063098, + "grad_norm": 1.2688884735107422, + "learning_rate": 5e-06, + "loss": 0.0831, + "step": 4481 + }, + { + "epoch": 0.8569789674952198, + "grad_norm": 3.058166980743408, + "learning_rate": 5e-06, + "loss": 0.5819, + "step": 4482 + }, + { + "epoch": 0.85717017208413, + "grad_norm": 2.472184896469116, + "learning_rate": 5e-06, + "loss": 0.3191, + "step": 4483 + }, + { + "epoch": 0.8573613766730401, + "grad_norm": 2.22625732421875, + "learning_rate": 5e-06, + "loss": 0.2619, + "step": 4484 + }, + { + "epoch": 0.8575525812619503, + "grad_norm": 1.0253074169158936, + "learning_rate": 5e-06, + "loss": 0.0643, + "step": 4485 + }, + { + "epoch": 0.8577437858508604, + "grad_norm": 1.16357421875, + "learning_rate": 5e-06, + "loss": 0.0652, + "step": 4486 + }, + { + "epoch": 0.8579349904397705, + "grad_norm": 1.9855798482894897, + "learning_rate": 5e-06, + "loss": 0.136, + "step": 4487 + }, + { + "epoch": 0.8581261950286807, + "grad_norm": 2.942619800567627, + "learning_rate": 5e-06, + "loss": 0.1555, + "step": 4488 + }, + { + "epoch": 0.8583173996175908, + "grad_norm": 1.8146679401397705, + "learning_rate": 5e-06, + "loss": 0.136, + "step": 4489 + }, + { + "epoch": 0.858508604206501, + "grad_norm": 1.2012311220169067, + "learning_rate": 5e-06, + "loss": 0.0923, + "step": 4490 + }, + { + "epoch": 0.8586998087954111, + "grad_norm": 2.180446147918701, + "learning_rate": 5e-06, + "loss": 0.4023, + "step": 4491 + }, + { + "epoch": 0.8588910133843213, + "grad_norm": 2.272432327270508, + "learning_rate": 5e-06, + "loss": 0.2363, + "step": 4492 + }, + { + "epoch": 0.8590822179732314, + "grad_norm": 1.4137390851974487, + "learning_rate": 5e-06, + "loss": 0.1623, + "step": 4493 + }, + { + "epoch": 0.8592734225621415, + "grad_norm": 3.289259433746338, + "learning_rate": 5e-06, + "loss": 0.3883, + "step": 4494 + }, + { + "epoch": 0.8594646271510517, + "grad_norm": 1.947654128074646, + "learning_rate": 5e-06, + "loss": 0.2005, + "step": 4495 + }, + { + "epoch": 0.8596558317399617, + "grad_norm": 1.1897157430648804, + "learning_rate": 5e-06, + "loss": 0.1192, + "step": 4496 + }, + { + "epoch": 0.8598470363288719, + "grad_norm": 1.1473478078842163, + "learning_rate": 5e-06, + "loss": 0.1185, + "step": 4497 + }, + { + "epoch": 0.860038240917782, + "grad_norm": 1.4698823690414429, + "learning_rate": 5e-06, + "loss": 0.0852, + "step": 4498 + }, + { + "epoch": 0.8602294455066921, + "grad_norm": 3.318509578704834, + "learning_rate": 5e-06, + "loss": 0.3434, + "step": 4499 + }, + { + "epoch": 0.8604206500956023, + "grad_norm": 1.8506419658660889, + "learning_rate": 5e-06, + "loss": 0.1088, + "step": 4500 + }, + { + "epoch": 0.8604206500956023, + "eval_runtime": 769.0971, + "eval_samples_per_second": 1.995, + "eval_steps_per_second": 0.25, + "step": 4500 + }, + { + "epoch": 0.8606118546845124, + "grad_norm": 1.9987014532089233, + "learning_rate": 5e-06, + "loss": 0.276, + "step": 4501 + }, + { + "epoch": 0.8608030592734226, + "grad_norm": 14.492589950561523, + "learning_rate": 5e-06, + "loss": 0.4812, + "step": 4502 + }, + { + "epoch": 0.8609942638623327, + "grad_norm": 2.069856882095337, + "learning_rate": 5e-06, + "loss": 0.2254, + "step": 4503 + }, + { + "epoch": 0.8611854684512428, + "grad_norm": 2.994016647338867, + "learning_rate": 5e-06, + "loss": 0.291, + "step": 4504 + }, + { + "epoch": 0.861376673040153, + "grad_norm": 1.7594068050384521, + "learning_rate": 5e-06, + "loss": 0.0717, + "step": 4505 + }, + { + "epoch": 0.8615678776290631, + "grad_norm": 2.7324862480163574, + "learning_rate": 5e-06, + "loss": 0.0629, + "step": 4506 + }, + { + "epoch": 0.8617590822179733, + "grad_norm": 3.193103075027466, + "learning_rate": 5e-06, + "loss": 0.6214, + "step": 4507 + }, + { + "epoch": 0.8619502868068833, + "grad_norm": 2.8593437671661377, + "learning_rate": 5e-06, + "loss": 0.3593, + "step": 4508 + }, + { + "epoch": 0.8621414913957935, + "grad_norm": 0.9549627304077148, + "learning_rate": 5e-06, + "loss": 0.0506, + "step": 4509 + }, + { + "epoch": 0.8623326959847036, + "grad_norm": 0.934559166431427, + "learning_rate": 5e-06, + "loss": 0.0455, + "step": 4510 + }, + { + "epoch": 0.8625239005736137, + "grad_norm": 1.489575743675232, + "learning_rate": 5e-06, + "loss": 0.1217, + "step": 4511 + }, + { + "epoch": 0.8627151051625239, + "grad_norm": 2.525219202041626, + "learning_rate": 5e-06, + "loss": 0.1192, + "step": 4512 + }, + { + "epoch": 0.862906309751434, + "grad_norm": 2.3394064903259277, + "learning_rate": 5e-06, + "loss": 0.2619, + "step": 4513 + }, + { + "epoch": 0.8630975143403442, + "grad_norm": 3.068735361099243, + "learning_rate": 5e-06, + "loss": 0.497, + "step": 4514 + }, + { + "epoch": 0.8632887189292543, + "grad_norm": 1.4633656740188599, + "learning_rate": 5e-06, + "loss": 0.1086, + "step": 4515 + }, + { + "epoch": 0.8634799235181644, + "grad_norm": 1.276939868927002, + "learning_rate": 5e-06, + "loss": 0.071, + "step": 4516 + }, + { + "epoch": 0.8636711281070746, + "grad_norm": 1.701491355895996, + "learning_rate": 5e-06, + "loss": 0.0944, + "step": 4517 + }, + { + "epoch": 0.8638623326959847, + "grad_norm": 1.7778617143630981, + "learning_rate": 5e-06, + "loss": 0.158, + "step": 4518 + }, + { + "epoch": 0.8640535372848949, + "grad_norm": 2.2561211585998535, + "learning_rate": 5e-06, + "loss": 0.1649, + "step": 4519 + }, + { + "epoch": 0.864244741873805, + "grad_norm": 1.6372724771499634, + "learning_rate": 5e-06, + "loss": 0.165, + "step": 4520 + }, + { + "epoch": 0.8644359464627152, + "grad_norm": 1.8756248950958252, + "learning_rate": 5e-06, + "loss": 0.1654, + "step": 4521 + }, + { + "epoch": 0.8646271510516252, + "grad_norm": 1.4479553699493408, + "learning_rate": 5e-06, + "loss": 0.2137, + "step": 4522 + }, + { + "epoch": 0.8648183556405353, + "grad_norm": 1.0815504789352417, + "learning_rate": 5e-06, + "loss": 0.0721, + "step": 4523 + }, + { + "epoch": 0.8650095602294455, + "grad_norm": 1.7075153589248657, + "learning_rate": 5e-06, + "loss": 0.0744, + "step": 4524 + }, + { + "epoch": 0.8652007648183556, + "grad_norm": 2.4414803981781006, + "learning_rate": 5e-06, + "loss": 0.1563, + "step": 4525 + }, + { + "epoch": 0.8653919694072658, + "grad_norm": 2.04904842376709, + "learning_rate": 5e-06, + "loss": 0.2584, + "step": 4526 + }, + { + "epoch": 0.8655831739961759, + "grad_norm": 2.7054378986358643, + "learning_rate": 5e-06, + "loss": 0.4034, + "step": 4527 + }, + { + "epoch": 0.865774378585086, + "grad_norm": 1.9329419136047363, + "learning_rate": 5e-06, + "loss": 0.1169, + "step": 4528 + }, + { + "epoch": 0.8659655831739962, + "grad_norm": 2.828913450241089, + "learning_rate": 5e-06, + "loss": 0.4183, + "step": 4529 + }, + { + "epoch": 0.8661567877629063, + "grad_norm": 2.64652419090271, + "learning_rate": 5e-06, + "loss": 0.0719, + "step": 4530 + }, + { + "epoch": 0.8663479923518165, + "grad_norm": 1.249855399131775, + "learning_rate": 5e-06, + "loss": 0.0515, + "step": 4531 + }, + { + "epoch": 0.8665391969407266, + "grad_norm": 2.1562163829803467, + "learning_rate": 5e-06, + "loss": 0.1446, + "step": 4532 + }, + { + "epoch": 0.8667304015296368, + "grad_norm": 2.4176135063171387, + "learning_rate": 5e-06, + "loss": 0.4205, + "step": 4533 + }, + { + "epoch": 0.8669216061185469, + "grad_norm": 1.6887050867080688, + "learning_rate": 5e-06, + "loss": 0.1691, + "step": 4534 + }, + { + "epoch": 0.8671128107074569, + "grad_norm": 3.1317782402038574, + "learning_rate": 5e-06, + "loss": 0.2806, + "step": 4535 + }, + { + "epoch": 0.8673040152963671, + "grad_norm": 2.647075891494751, + "learning_rate": 5e-06, + "loss": 0.3814, + "step": 4536 + }, + { + "epoch": 0.8674952198852772, + "grad_norm": 1.211713194847107, + "learning_rate": 5e-06, + "loss": 0.0743, + "step": 4537 + }, + { + "epoch": 0.8676864244741874, + "grad_norm": 1.462576150894165, + "learning_rate": 5e-06, + "loss": 0.1225, + "step": 4538 + }, + { + "epoch": 0.8678776290630975, + "grad_norm": 2.324312925338745, + "learning_rate": 5e-06, + "loss": 0.2452, + "step": 4539 + }, + { + "epoch": 0.8680688336520076, + "grad_norm": 1.4178894758224487, + "learning_rate": 5e-06, + "loss": 0.0916, + "step": 4540 + }, + { + "epoch": 0.8682600382409178, + "grad_norm": 4.281589508056641, + "learning_rate": 5e-06, + "loss": 0.1797, + "step": 4541 + }, + { + "epoch": 0.8684512428298279, + "grad_norm": 2.9054739475250244, + "learning_rate": 5e-06, + "loss": 0.1756, + "step": 4542 + }, + { + "epoch": 0.8686424474187381, + "grad_norm": 1.1191303730010986, + "learning_rate": 5e-06, + "loss": 0.0303, + "step": 4543 + }, + { + "epoch": 0.8688336520076482, + "grad_norm": 4.550809383392334, + "learning_rate": 5e-06, + "loss": 0.2272, + "step": 4544 + }, + { + "epoch": 0.8690248565965584, + "grad_norm": 2.311413526535034, + "learning_rate": 5e-06, + "loss": 0.2753, + "step": 4545 + }, + { + "epoch": 0.8692160611854685, + "grad_norm": 2.0396246910095215, + "learning_rate": 5e-06, + "loss": 0.2035, + "step": 4546 + }, + { + "epoch": 0.8694072657743785, + "grad_norm": 3.7515199184417725, + "learning_rate": 5e-06, + "loss": 0.3386, + "step": 4547 + }, + { + "epoch": 0.8695984703632887, + "grad_norm": 2.69758939743042, + "learning_rate": 5e-06, + "loss": 0.3004, + "step": 4548 + }, + { + "epoch": 0.8697896749521988, + "grad_norm": 1.6221609115600586, + "learning_rate": 5e-06, + "loss": 0.1011, + "step": 4549 + }, + { + "epoch": 0.869980879541109, + "grad_norm": 1.3611334562301636, + "learning_rate": 5e-06, + "loss": 0.0866, + "step": 4550 + }, + { + "epoch": 0.8701720841300191, + "grad_norm": 2.460482120513916, + "learning_rate": 5e-06, + "loss": 0.3597, + "step": 4551 + }, + { + "epoch": 0.8703632887189292, + "grad_norm": 1.2450191974639893, + "learning_rate": 5e-06, + "loss": 0.1289, + "step": 4552 + }, + { + "epoch": 0.8705544933078394, + "grad_norm": 1.1274508237838745, + "learning_rate": 5e-06, + "loss": 0.0882, + "step": 4553 + }, + { + "epoch": 0.8707456978967495, + "grad_norm": 1.571494221687317, + "learning_rate": 5e-06, + "loss": 0.2236, + "step": 4554 + }, + { + "epoch": 0.8709369024856597, + "grad_norm": 1.62301504611969, + "learning_rate": 5e-06, + "loss": 0.0966, + "step": 4555 + }, + { + "epoch": 0.8711281070745698, + "grad_norm": 0.9670308828353882, + "learning_rate": 5e-06, + "loss": 0.0481, + "step": 4556 + }, + { + "epoch": 0.87131931166348, + "grad_norm": 1.0194993019104004, + "learning_rate": 5e-06, + "loss": 0.0995, + "step": 4557 + }, + { + "epoch": 0.8715105162523901, + "grad_norm": 2.630286455154419, + "learning_rate": 5e-06, + "loss": 0.526, + "step": 4558 + }, + { + "epoch": 0.8717017208413002, + "grad_norm": 2.8174970149993896, + "learning_rate": 5e-06, + "loss": 0.1966, + "step": 4559 + }, + { + "epoch": 0.8718929254302104, + "grad_norm": 1.6453908681869507, + "learning_rate": 5e-06, + "loss": 0.1281, + "step": 4560 + }, + { + "epoch": 0.8720841300191204, + "grad_norm": 1.6050821542739868, + "learning_rate": 5e-06, + "loss": 0.1121, + "step": 4561 + }, + { + "epoch": 0.8722753346080306, + "grad_norm": 2.457448959350586, + "learning_rate": 5e-06, + "loss": 0.2686, + "step": 4562 + }, + { + "epoch": 0.8724665391969407, + "grad_norm": 1.3800158500671387, + "learning_rate": 5e-06, + "loss": 0.0852, + "step": 4563 + }, + { + "epoch": 0.8726577437858508, + "grad_norm": 3.040879726409912, + "learning_rate": 5e-06, + "loss": 0.4089, + "step": 4564 + }, + { + "epoch": 0.872848948374761, + "grad_norm": 1.8485065698623657, + "learning_rate": 5e-06, + "loss": 0.1646, + "step": 4565 + }, + { + "epoch": 0.8730401529636711, + "grad_norm": 1.5646363496780396, + "learning_rate": 5e-06, + "loss": 0.1381, + "step": 4566 + }, + { + "epoch": 0.8732313575525813, + "grad_norm": 2.561878204345703, + "learning_rate": 5e-06, + "loss": 0.1572, + "step": 4567 + }, + { + "epoch": 0.8734225621414914, + "grad_norm": 1.187648892402649, + "learning_rate": 5e-06, + "loss": 0.0962, + "step": 4568 + }, + { + "epoch": 0.8736137667304015, + "grad_norm": 2.9988815784454346, + "learning_rate": 5e-06, + "loss": 0.2014, + "step": 4569 + }, + { + "epoch": 0.8738049713193117, + "grad_norm": 1.185423731803894, + "learning_rate": 5e-06, + "loss": 0.1753, + "step": 4570 + }, + { + "epoch": 0.8739961759082218, + "grad_norm": 1.1318641901016235, + "learning_rate": 5e-06, + "loss": 0.0889, + "step": 4571 + }, + { + "epoch": 0.874187380497132, + "grad_norm": 1.3041385412216187, + "learning_rate": 5e-06, + "loss": 0.0902, + "step": 4572 + }, + { + "epoch": 0.874378585086042, + "grad_norm": 2.061596632003784, + "learning_rate": 5e-06, + "loss": 0.3094, + "step": 4573 + }, + { + "epoch": 0.8745697896749522, + "grad_norm": 1.7980538606643677, + "learning_rate": 5e-06, + "loss": 0.2838, + "step": 4574 + }, + { + "epoch": 0.8747609942638623, + "grad_norm": 4.6933135986328125, + "learning_rate": 5e-06, + "loss": 0.1503, + "step": 4575 + }, + { + "epoch": 0.8749521988527724, + "grad_norm": 2.4895479679107666, + "learning_rate": 5e-06, + "loss": 0.385, + "step": 4576 + }, + { + "epoch": 0.8751434034416826, + "grad_norm": 2.2268948554992676, + "learning_rate": 5e-06, + "loss": 0.2536, + "step": 4577 + }, + { + "epoch": 0.8753346080305927, + "grad_norm": 1.4584846496582031, + "learning_rate": 5e-06, + "loss": 0.0736, + "step": 4578 + }, + { + "epoch": 0.8755258126195029, + "grad_norm": 1.1387372016906738, + "learning_rate": 5e-06, + "loss": 0.0894, + "step": 4579 + }, + { + "epoch": 0.875717017208413, + "grad_norm": 1.2333475351333618, + "learning_rate": 5e-06, + "loss": 0.0964, + "step": 4580 + }, + { + "epoch": 0.8759082217973231, + "grad_norm": 1.2063689231872559, + "learning_rate": 5e-06, + "loss": 0.0427, + "step": 4581 + }, + { + "epoch": 0.8760994263862333, + "grad_norm": 2.055612087249756, + "learning_rate": 5e-06, + "loss": 0.3624, + "step": 4582 + }, + { + "epoch": 0.8762906309751434, + "grad_norm": 1.719244360923767, + "learning_rate": 5e-06, + "loss": 0.1258, + "step": 4583 + }, + { + "epoch": 0.8764818355640536, + "grad_norm": 1.0476865768432617, + "learning_rate": 5e-06, + "loss": 0.0592, + "step": 4584 + }, + { + "epoch": 0.8766730401529637, + "grad_norm": 1.689634919166565, + "learning_rate": 5e-06, + "loss": 0.2333, + "step": 4585 + }, + { + "epoch": 0.8768642447418739, + "grad_norm": 1.7837719917297363, + "learning_rate": 5e-06, + "loss": 0.135, + "step": 4586 + }, + { + "epoch": 0.8770554493307839, + "grad_norm": 1.6960902214050293, + "learning_rate": 5e-06, + "loss": 0.0815, + "step": 4587 + }, + { + "epoch": 0.877246653919694, + "grad_norm": 3.9408388137817383, + "learning_rate": 5e-06, + "loss": 0.3912, + "step": 4588 + }, + { + "epoch": 0.8774378585086042, + "grad_norm": 2.848705530166626, + "learning_rate": 5e-06, + "loss": 0.2726, + "step": 4589 + }, + { + "epoch": 0.8776290630975143, + "grad_norm": 1.9206777811050415, + "learning_rate": 5e-06, + "loss": 0.1705, + "step": 4590 + }, + { + "epoch": 0.8778202676864245, + "grad_norm": 1.366650104522705, + "learning_rate": 5e-06, + "loss": 0.1402, + "step": 4591 + }, + { + "epoch": 0.8780114722753346, + "grad_norm": 2.2299587726593018, + "learning_rate": 5e-06, + "loss": 0.1416, + "step": 4592 + }, + { + "epoch": 0.8782026768642447, + "grad_norm": 1.1620739698410034, + "learning_rate": 5e-06, + "loss": 0.0649, + "step": 4593 + }, + { + "epoch": 0.8783938814531549, + "grad_norm": 1.5828089714050293, + "learning_rate": 5e-06, + "loss": 0.129, + "step": 4594 + }, + { + "epoch": 0.878585086042065, + "grad_norm": 1.6714706420898438, + "learning_rate": 5e-06, + "loss": 0.213, + "step": 4595 + }, + { + "epoch": 0.8787762906309752, + "grad_norm": 2.159733533859253, + "learning_rate": 5e-06, + "loss": 0.1566, + "step": 4596 + }, + { + "epoch": 0.8789674952198853, + "grad_norm": 4.393616676330566, + "learning_rate": 5e-06, + "loss": 0.2552, + "step": 4597 + }, + { + "epoch": 0.8791586998087955, + "grad_norm": 1.0370787382125854, + "learning_rate": 5e-06, + "loss": 0.0785, + "step": 4598 + }, + { + "epoch": 0.8793499043977056, + "grad_norm": 1.0146803855895996, + "learning_rate": 5e-06, + "loss": 0.0634, + "step": 4599 + }, + { + "epoch": 0.8795411089866156, + "grad_norm": 2.365654945373535, + "learning_rate": 5e-06, + "loss": 0.1258, + "step": 4600 + }, + { + "epoch": 0.8797323135755258, + "grad_norm": 2.3569753170013428, + "learning_rate": 5e-06, + "loss": 0.4793, + "step": 4601 + }, + { + "epoch": 0.8799235181644359, + "grad_norm": 3.6621952056884766, + "learning_rate": 5e-06, + "loss": 0.6331, + "step": 4602 + }, + { + "epoch": 0.8801147227533461, + "grad_norm": 2.0783820152282715, + "learning_rate": 5e-06, + "loss": 0.3076, + "step": 4603 + }, + { + "epoch": 0.8803059273422562, + "grad_norm": 1.9632409811019897, + "learning_rate": 5e-06, + "loss": 0.0954, + "step": 4604 + }, + { + "epoch": 0.8804971319311663, + "grad_norm": 1.1970844268798828, + "learning_rate": 5e-06, + "loss": 0.0962, + "step": 4605 + }, + { + "epoch": 0.8806883365200765, + "grad_norm": 1.9226810932159424, + "learning_rate": 5e-06, + "loss": 0.0956, + "step": 4606 + }, + { + "epoch": 0.8808795411089866, + "grad_norm": 2.701702833175659, + "learning_rate": 5e-06, + "loss": 0.4784, + "step": 4607 + }, + { + "epoch": 0.8810707456978968, + "grad_norm": 1.3243826627731323, + "learning_rate": 5e-06, + "loss": 0.0847, + "step": 4608 + }, + { + "epoch": 0.8812619502868069, + "grad_norm": 1.84163236618042, + "learning_rate": 5e-06, + "loss": 0.1919, + "step": 4609 + }, + { + "epoch": 0.8814531548757171, + "grad_norm": 2.9473471641540527, + "learning_rate": 5e-06, + "loss": 0.2047, + "step": 4610 + }, + { + "epoch": 0.8816443594646272, + "grad_norm": 1.4090523719787598, + "learning_rate": 5e-06, + "loss": 0.1088, + "step": 4611 + }, + { + "epoch": 0.8818355640535372, + "grad_norm": 1.92579185962677, + "learning_rate": 5e-06, + "loss": 0.0899, + "step": 4612 + }, + { + "epoch": 0.8820267686424474, + "grad_norm": 2.1861867904663086, + "learning_rate": 5e-06, + "loss": 0.283, + "step": 4613 + }, + { + "epoch": 0.8822179732313575, + "grad_norm": 1.9572925567626953, + "learning_rate": 5e-06, + "loss": 0.2118, + "step": 4614 + }, + { + "epoch": 0.8824091778202677, + "grad_norm": 1.4317727088928223, + "learning_rate": 5e-06, + "loss": 0.1227, + "step": 4615 + }, + { + "epoch": 0.8826003824091778, + "grad_norm": 0.7707310914993286, + "learning_rate": 5e-06, + "loss": 0.0597, + "step": 4616 + }, + { + "epoch": 0.8827915869980879, + "grad_norm": 3.6430459022521973, + "learning_rate": 5e-06, + "loss": 0.5254, + "step": 4617 + }, + { + "epoch": 0.8829827915869981, + "grad_norm": 0.7240305542945862, + "learning_rate": 5e-06, + "loss": 0.035, + "step": 4618 + }, + { + "epoch": 0.8831739961759082, + "grad_norm": 1.372766137123108, + "learning_rate": 5e-06, + "loss": 0.0557, + "step": 4619 + }, + { + "epoch": 0.8833652007648184, + "grad_norm": 2.592245578765869, + "learning_rate": 5e-06, + "loss": 0.3585, + "step": 4620 + }, + { + "epoch": 0.8835564053537285, + "grad_norm": 1.7263092994689941, + "learning_rate": 5e-06, + "loss": 0.1779, + "step": 4621 + }, + { + "epoch": 0.8837476099426386, + "grad_norm": 1.6172292232513428, + "learning_rate": 5e-06, + "loss": 0.1043, + "step": 4622 + }, + { + "epoch": 0.8839388145315488, + "grad_norm": 1.8567088842391968, + "learning_rate": 5e-06, + "loss": 0.0783, + "step": 4623 + }, + { + "epoch": 0.8841300191204589, + "grad_norm": 1.0310090780258179, + "learning_rate": 5e-06, + "loss": 0.0906, + "step": 4624 + }, + { + "epoch": 0.884321223709369, + "grad_norm": 2.3422353267669678, + "learning_rate": 5e-06, + "loss": 0.1059, + "step": 4625 + }, + { + "epoch": 0.8845124282982791, + "grad_norm": 2.5427935123443604, + "learning_rate": 5e-06, + "loss": 0.4579, + "step": 4626 + }, + { + "epoch": 0.8847036328871893, + "grad_norm": 1.8456017971038818, + "learning_rate": 5e-06, + "loss": 0.1279, + "step": 4627 + }, + { + "epoch": 0.8848948374760994, + "grad_norm": 1.4140512943267822, + "learning_rate": 5e-06, + "loss": 0.0981, + "step": 4628 + }, + { + "epoch": 0.8850860420650095, + "grad_norm": 1.278416395187378, + "learning_rate": 5e-06, + "loss": 0.1425, + "step": 4629 + }, + { + "epoch": 0.8852772466539197, + "grad_norm": 0.722949206829071, + "learning_rate": 5e-06, + "loss": 0.0267, + "step": 4630 + }, + { + "epoch": 0.8854684512428298, + "grad_norm": 1.8881372213363647, + "learning_rate": 5e-06, + "loss": 0.094, + "step": 4631 + }, + { + "epoch": 0.88565965583174, + "grad_norm": 2.0543901920318604, + "learning_rate": 5e-06, + "loss": 0.1314, + "step": 4632 + }, + { + "epoch": 0.8858508604206501, + "grad_norm": 2.746734619140625, + "learning_rate": 5e-06, + "loss": 0.4295, + "step": 4633 + }, + { + "epoch": 0.8860420650095602, + "grad_norm": 1.3126775026321411, + "learning_rate": 5e-06, + "loss": 0.072, + "step": 4634 + }, + { + "epoch": 0.8862332695984704, + "grad_norm": 2.1847662925720215, + "learning_rate": 5e-06, + "loss": 0.193, + "step": 4635 + }, + { + "epoch": 0.8864244741873805, + "grad_norm": 2.4903340339660645, + "learning_rate": 5e-06, + "loss": 0.0869, + "step": 4636 + }, + { + "epoch": 0.8866156787762907, + "grad_norm": 2.261547803878784, + "learning_rate": 5e-06, + "loss": 0.1426, + "step": 4637 + }, + { + "epoch": 0.8868068833652007, + "grad_norm": 3.0043082237243652, + "learning_rate": 5e-06, + "loss": 0.1545, + "step": 4638 + }, + { + "epoch": 0.886998087954111, + "grad_norm": 1.8434854745864868, + "learning_rate": 5e-06, + "loss": 0.2356, + "step": 4639 + }, + { + "epoch": 0.887189292543021, + "grad_norm": 1.221226453781128, + "learning_rate": 5e-06, + "loss": 0.0681, + "step": 4640 + }, + { + "epoch": 0.8873804971319311, + "grad_norm": 1.7801470756530762, + "learning_rate": 5e-06, + "loss": 0.0785, + "step": 4641 + }, + { + "epoch": 0.8875717017208413, + "grad_norm": 1.2522605657577515, + "learning_rate": 5e-06, + "loss": 0.0611, + "step": 4642 + }, + { + "epoch": 0.8877629063097514, + "grad_norm": 1.8315064907073975, + "learning_rate": 5e-06, + "loss": 0.0613, + "step": 4643 + }, + { + "epoch": 0.8879541108986616, + "grad_norm": 1.2048170566558838, + "learning_rate": 5e-06, + "loss": 0.0696, + "step": 4644 + }, + { + "epoch": 0.8881453154875717, + "grad_norm": 2.3258895874023438, + "learning_rate": 5e-06, + "loss": 0.3081, + "step": 4645 + }, + { + "epoch": 0.8883365200764818, + "grad_norm": 1.9494102001190186, + "learning_rate": 5e-06, + "loss": 0.1162, + "step": 4646 + }, + { + "epoch": 0.888527724665392, + "grad_norm": 3.9364311695098877, + "learning_rate": 5e-06, + "loss": 0.5175, + "step": 4647 + }, + { + "epoch": 0.8887189292543021, + "grad_norm": 2.00209641456604, + "learning_rate": 5e-06, + "loss": 0.2127, + "step": 4648 + }, + { + "epoch": 0.8889101338432123, + "grad_norm": 1.54474675655365, + "learning_rate": 5e-06, + "loss": 0.0865, + "step": 4649 + }, + { + "epoch": 0.8891013384321224, + "grad_norm": 2.311948537826538, + "learning_rate": 5e-06, + "loss": 0.1372, + "step": 4650 + }, + { + "epoch": 0.8892925430210326, + "grad_norm": 2.3398048877716064, + "learning_rate": 5e-06, + "loss": 0.2935, + "step": 4651 + }, + { + "epoch": 0.8894837476099426, + "grad_norm": 3.875178575515747, + "learning_rate": 5e-06, + "loss": 0.4512, + "step": 4652 + }, + { + "epoch": 0.8896749521988527, + "grad_norm": 1.4064953327178955, + "learning_rate": 5e-06, + "loss": 0.1605, + "step": 4653 + }, + { + "epoch": 0.8898661567877629, + "grad_norm": 2.825348138809204, + "learning_rate": 5e-06, + "loss": 0.2991, + "step": 4654 + }, + { + "epoch": 0.890057361376673, + "grad_norm": 1.4716694355010986, + "learning_rate": 5e-06, + "loss": 0.092, + "step": 4655 + }, + { + "epoch": 0.8902485659655832, + "grad_norm": 1.371217966079712, + "learning_rate": 5e-06, + "loss": 0.1255, + "step": 4656 + }, + { + "epoch": 0.8904397705544933, + "grad_norm": 2.445219039916992, + "learning_rate": 5e-06, + "loss": 0.1148, + "step": 4657 + }, + { + "epoch": 0.8906309751434034, + "grad_norm": 2.0857105255126953, + "learning_rate": 5e-06, + "loss": 0.33, + "step": 4658 + }, + { + "epoch": 0.8908221797323136, + "grad_norm": 1.8164504766464233, + "learning_rate": 5e-06, + "loss": 0.256, + "step": 4659 + }, + { + "epoch": 0.8910133843212237, + "grad_norm": 1.9205447435379028, + "learning_rate": 5e-06, + "loss": 0.1559, + "step": 4660 + }, + { + "epoch": 0.8912045889101339, + "grad_norm": 1.0629031658172607, + "learning_rate": 5e-06, + "loss": 0.054, + "step": 4661 + }, + { + "epoch": 0.891395793499044, + "grad_norm": 2.1165101528167725, + "learning_rate": 5e-06, + "loss": 0.2257, + "step": 4662 + }, + { + "epoch": 0.8915869980879542, + "grad_norm": 2.4531354904174805, + "learning_rate": 5e-06, + "loss": 0.2545, + "step": 4663 + }, + { + "epoch": 0.8917782026768643, + "grad_norm": 1.8205480575561523, + "learning_rate": 5e-06, + "loss": 0.2478, + "step": 4664 + }, + { + "epoch": 0.8919694072657743, + "grad_norm": 1.4492504596710205, + "learning_rate": 5e-06, + "loss": 0.1171, + "step": 4665 + }, + { + "epoch": 0.8921606118546845, + "grad_norm": 2.0282204151153564, + "learning_rate": 5e-06, + "loss": 0.109, + "step": 4666 + }, + { + "epoch": 0.8923518164435946, + "grad_norm": 1.1359578371047974, + "learning_rate": 5e-06, + "loss": 0.0532, + "step": 4667 + }, + { + "epoch": 0.8925430210325048, + "grad_norm": 1.3664461374282837, + "learning_rate": 5e-06, + "loss": 0.1644, + "step": 4668 + }, + { + "epoch": 0.8927342256214149, + "grad_norm": 2.3731942176818848, + "learning_rate": 5e-06, + "loss": 0.1219, + "step": 4669 + }, + { + "epoch": 0.892925430210325, + "grad_norm": 1.5064061880111694, + "learning_rate": 5e-06, + "loss": 0.157, + "step": 4670 + }, + { + "epoch": 0.8931166347992352, + "grad_norm": 3.036389112472534, + "learning_rate": 5e-06, + "loss": 0.3978, + "step": 4671 + }, + { + "epoch": 0.8933078393881453, + "grad_norm": 1.1229474544525146, + "learning_rate": 5e-06, + "loss": 0.1008, + "step": 4672 + }, + { + "epoch": 0.8934990439770555, + "grad_norm": 1.6844687461853027, + "learning_rate": 5e-06, + "loss": 0.2174, + "step": 4673 + }, + { + "epoch": 0.8936902485659656, + "grad_norm": 1.0797251462936401, + "learning_rate": 5e-06, + "loss": 0.0696, + "step": 4674 + }, + { + "epoch": 0.8938814531548758, + "grad_norm": 1.7389681339263916, + "learning_rate": 5e-06, + "loss": 0.1154, + "step": 4675 + }, + { + "epoch": 0.8940726577437859, + "grad_norm": 1.4130902290344238, + "learning_rate": 5e-06, + "loss": 0.144, + "step": 4676 + }, + { + "epoch": 0.894263862332696, + "grad_norm": 1.8311725854873657, + "learning_rate": 5e-06, + "loss": 0.2501, + "step": 4677 + }, + { + "epoch": 0.8944550669216061, + "grad_norm": 3.1457982063293457, + "learning_rate": 5e-06, + "loss": 0.2693, + "step": 4678 + }, + { + "epoch": 0.8946462715105162, + "grad_norm": 1.2318836450576782, + "learning_rate": 5e-06, + "loss": 0.1091, + "step": 4679 + }, + { + "epoch": 0.8948374760994264, + "grad_norm": 1.2631968259811401, + "learning_rate": 5e-06, + "loss": 0.0692, + "step": 4680 + }, + { + "epoch": 0.8950286806883365, + "grad_norm": 1.3267889022827148, + "learning_rate": 5e-06, + "loss": 0.1024, + "step": 4681 + }, + { + "epoch": 0.8952198852772466, + "grad_norm": 3.25459361076355, + "learning_rate": 5e-06, + "loss": 0.4319, + "step": 4682 + }, + { + "epoch": 0.8954110898661568, + "grad_norm": 3.177501678466797, + "learning_rate": 5e-06, + "loss": 0.7296, + "step": 4683 + }, + { + "epoch": 0.8956022944550669, + "grad_norm": 2.4800193309783936, + "learning_rate": 5e-06, + "loss": 0.309, + "step": 4684 + }, + { + "epoch": 0.8957934990439771, + "grad_norm": 1.386919379234314, + "learning_rate": 5e-06, + "loss": 0.1178, + "step": 4685 + }, + { + "epoch": 0.8959847036328872, + "grad_norm": 0.9860485196113586, + "learning_rate": 5e-06, + "loss": 0.0742, + "step": 4686 + }, + { + "epoch": 0.8961759082217973, + "grad_norm": 1.5006732940673828, + "learning_rate": 5e-06, + "loss": 0.0625, + "step": 4687 + }, + { + "epoch": 0.8963671128107075, + "grad_norm": 1.8717820644378662, + "learning_rate": 5e-06, + "loss": 0.1307, + "step": 4688 + }, + { + "epoch": 0.8965583173996176, + "grad_norm": 2.3897619247436523, + "learning_rate": 5e-06, + "loss": 0.6113, + "step": 4689 + }, + { + "epoch": 0.8967495219885278, + "grad_norm": 1.6034753322601318, + "learning_rate": 5e-06, + "loss": 0.1415, + "step": 4690 + }, + { + "epoch": 0.8969407265774378, + "grad_norm": 2.3642208576202393, + "learning_rate": 5e-06, + "loss": 0.3357, + "step": 4691 + }, + { + "epoch": 0.897131931166348, + "grad_norm": 2.514803409576416, + "learning_rate": 5e-06, + "loss": 0.1529, + "step": 4692 + }, + { + "epoch": 0.8973231357552581, + "grad_norm": 1.097195029258728, + "learning_rate": 5e-06, + "loss": 0.0548, + "step": 4693 + }, + { + "epoch": 0.8975143403441682, + "grad_norm": 1.1098480224609375, + "learning_rate": 5e-06, + "loss": 0.0912, + "step": 4694 + }, + { + "epoch": 0.8977055449330784, + "grad_norm": 2.1470210552215576, + "learning_rate": 5e-06, + "loss": 0.3154, + "step": 4695 + }, + { + "epoch": 0.8978967495219885, + "grad_norm": 1.509371042251587, + "learning_rate": 5e-06, + "loss": 0.1229, + "step": 4696 + }, + { + "epoch": 0.8980879541108987, + "grad_norm": 1.2485581636428833, + "learning_rate": 5e-06, + "loss": 0.1053, + "step": 4697 + }, + { + "epoch": 0.8982791586998088, + "grad_norm": 1.2133026123046875, + "learning_rate": 5e-06, + "loss": 0.1103, + "step": 4698 + }, + { + "epoch": 0.8984703632887189, + "grad_norm": 1.5444940328598022, + "learning_rate": 5e-06, + "loss": 0.1142, + "step": 4699 + }, + { + "epoch": 0.8986615678776291, + "grad_norm": 2.6265218257904053, + "learning_rate": 5e-06, + "loss": 0.2122, + "step": 4700 + }, + { + "epoch": 0.8988527724665392, + "grad_norm": 2.756988286972046, + "learning_rate": 5e-06, + "loss": 0.5817, + "step": 4701 + }, + { + "epoch": 0.8990439770554494, + "grad_norm": 2.6874217987060547, + "learning_rate": 5e-06, + "loss": 0.3232, + "step": 4702 + }, + { + "epoch": 0.8992351816443594, + "grad_norm": 2.7615904808044434, + "learning_rate": 5e-06, + "loss": 0.3782, + "step": 4703 + }, + { + "epoch": 0.8994263862332696, + "grad_norm": 2.0286366939544678, + "learning_rate": 5e-06, + "loss": 0.2288, + "step": 4704 + }, + { + "epoch": 0.8996175908221797, + "grad_norm": 1.6421332359313965, + "learning_rate": 5e-06, + "loss": 0.1077, + "step": 4705 + }, + { + "epoch": 0.8998087954110898, + "grad_norm": 1.2805842161178589, + "learning_rate": 5e-06, + "loss": 0.0755, + "step": 4706 + }, + { + "epoch": 0.9, + "grad_norm": 1.985251545906067, + "learning_rate": 5e-06, + "loss": 0.3243, + "step": 4707 + }, + { + "epoch": 0.9001912045889101, + "grad_norm": 1.3914923667907715, + "learning_rate": 5e-06, + "loss": 0.2003, + "step": 4708 + }, + { + "epoch": 0.9003824091778203, + "grad_norm": 2.9044227600097656, + "learning_rate": 5e-06, + "loss": 0.3824, + "step": 4709 + }, + { + "epoch": 0.9005736137667304, + "grad_norm": 1.989436149597168, + "learning_rate": 5e-06, + "loss": 0.0969, + "step": 4710 + }, + { + "epoch": 0.9007648183556405, + "grad_norm": 3.248147487640381, + "learning_rate": 5e-06, + "loss": 0.1397, + "step": 4711 + }, + { + "epoch": 0.9009560229445507, + "grad_norm": 3.772764205932617, + "learning_rate": 5e-06, + "loss": 0.1007, + "step": 4712 + }, + { + "epoch": 0.9011472275334608, + "grad_norm": 1.53439199924469, + "learning_rate": 5e-06, + "loss": 0.1392, + "step": 4713 + }, + { + "epoch": 0.901338432122371, + "grad_norm": 2.0962002277374268, + "learning_rate": 5e-06, + "loss": 0.2381, + "step": 4714 + }, + { + "epoch": 0.9015296367112811, + "grad_norm": 1.7860227823257446, + "learning_rate": 5e-06, + "loss": 0.0828, + "step": 4715 + }, + { + "epoch": 0.9017208413001913, + "grad_norm": 2.519381284713745, + "learning_rate": 5e-06, + "loss": 0.2643, + "step": 4716 + }, + { + "epoch": 0.9019120458891013, + "grad_norm": 2.8200933933258057, + "learning_rate": 5e-06, + "loss": 0.3501, + "step": 4717 + }, + { + "epoch": 0.9021032504780114, + "grad_norm": 1.7289233207702637, + "learning_rate": 5e-06, + "loss": 0.1181, + "step": 4718 + }, + { + "epoch": 0.9022944550669216, + "grad_norm": 2.348267078399658, + "learning_rate": 5e-06, + "loss": 0.2862, + "step": 4719 + }, + { + "epoch": 0.9024856596558317, + "grad_norm": 1.9853914976119995, + "learning_rate": 5e-06, + "loss": 0.3011, + "step": 4720 + }, + { + "epoch": 0.9026768642447419, + "grad_norm": 1.2755216360092163, + "learning_rate": 5e-06, + "loss": 0.1263, + "step": 4721 + }, + { + "epoch": 0.902868068833652, + "grad_norm": 2.737816095352173, + "learning_rate": 5e-06, + "loss": 0.3493, + "step": 4722 + }, + { + "epoch": 0.9030592734225621, + "grad_norm": 2.026895523071289, + "learning_rate": 5e-06, + "loss": 0.2523, + "step": 4723 + }, + { + "epoch": 0.9032504780114723, + "grad_norm": 1.8737328052520752, + "learning_rate": 5e-06, + "loss": 0.1153, + "step": 4724 + }, + { + "epoch": 0.9034416826003824, + "grad_norm": 2.0649120807647705, + "learning_rate": 5e-06, + "loss": 0.1239, + "step": 4725 + }, + { + "epoch": 0.9036328871892926, + "grad_norm": 2.1456220149993896, + "learning_rate": 5e-06, + "loss": 0.1975, + "step": 4726 + }, + { + "epoch": 0.9038240917782027, + "grad_norm": 1.953724980354309, + "learning_rate": 5e-06, + "loss": 0.1403, + "step": 4727 + }, + { + "epoch": 0.9040152963671129, + "grad_norm": 1.8601813316345215, + "learning_rate": 5e-06, + "loss": 0.1244, + "step": 4728 + }, + { + "epoch": 0.904206500956023, + "grad_norm": 2.1330983638763428, + "learning_rate": 5e-06, + "loss": 0.1379, + "step": 4729 + }, + { + "epoch": 0.904397705544933, + "grad_norm": 2.6561925411224365, + "learning_rate": 5e-06, + "loss": 0.277, + "step": 4730 + }, + { + "epoch": 0.9045889101338432, + "grad_norm": 2.5746819972991943, + "learning_rate": 5e-06, + "loss": 0.0965, + "step": 4731 + }, + { + "epoch": 0.9047801147227533, + "grad_norm": 2.185016632080078, + "learning_rate": 5e-06, + "loss": 0.2095, + "step": 4732 + }, + { + "epoch": 0.9049713193116635, + "grad_norm": 2.8116836547851562, + "learning_rate": 5e-06, + "loss": 0.5521, + "step": 4733 + }, + { + "epoch": 0.9051625239005736, + "grad_norm": 2.427065134048462, + "learning_rate": 5e-06, + "loss": 0.2564, + "step": 4734 + }, + { + "epoch": 0.9053537284894837, + "grad_norm": 2.053335666656494, + "learning_rate": 5e-06, + "loss": 0.272, + "step": 4735 + }, + { + "epoch": 0.9055449330783939, + "grad_norm": 2.481733798980713, + "learning_rate": 5e-06, + "loss": 0.3331, + "step": 4736 + }, + { + "epoch": 0.905736137667304, + "grad_norm": 2.1682722568511963, + "learning_rate": 5e-06, + "loss": 0.0942, + "step": 4737 + }, + { + "epoch": 0.9059273422562142, + "grad_norm": 2.0562095642089844, + "learning_rate": 5e-06, + "loss": 0.1516, + "step": 4738 + }, + { + "epoch": 0.9061185468451243, + "grad_norm": 1.898068904876709, + "learning_rate": 5e-06, + "loss": 0.2335, + "step": 4739 + }, + { + "epoch": 0.9063097514340345, + "grad_norm": 1.218976378440857, + "learning_rate": 5e-06, + "loss": 0.085, + "step": 4740 + }, + { + "epoch": 0.9065009560229446, + "grad_norm": 1.4753073453903198, + "learning_rate": 5e-06, + "loss": 0.181, + "step": 4741 + }, + { + "epoch": 0.9066921606118546, + "grad_norm": 1.8766474723815918, + "learning_rate": 5e-06, + "loss": 0.1318, + "step": 4742 + }, + { + "epoch": 0.9068833652007648, + "grad_norm": 1.521669626235962, + "learning_rate": 5e-06, + "loss": 0.0619, + "step": 4743 + }, + { + "epoch": 0.9070745697896749, + "grad_norm": 1.3307064771652222, + "learning_rate": 5e-06, + "loss": 0.1082, + "step": 4744 + }, + { + "epoch": 0.9072657743785851, + "grad_norm": 2.4466915130615234, + "learning_rate": 5e-06, + "loss": 0.6762, + "step": 4745 + }, + { + "epoch": 0.9074569789674952, + "grad_norm": 1.5775731801986694, + "learning_rate": 5e-06, + "loss": 0.261, + "step": 4746 + }, + { + "epoch": 0.9076481835564053, + "grad_norm": 1.7367063760757446, + "learning_rate": 5e-06, + "loss": 0.217, + "step": 4747 + }, + { + "epoch": 0.9078393881453155, + "grad_norm": 2.7421231269836426, + "learning_rate": 5e-06, + "loss": 0.2712, + "step": 4748 + }, + { + "epoch": 0.9080305927342256, + "grad_norm": 1.4664653539657593, + "learning_rate": 5e-06, + "loss": 0.1045, + "step": 4749 + }, + { + "epoch": 0.9082217973231358, + "grad_norm": 6.154154300689697, + "learning_rate": 5e-06, + "loss": 0.1511, + "step": 4750 + }, + { + "epoch": 0.9084130019120459, + "grad_norm": 1.722429871559143, + "learning_rate": 5e-06, + "loss": 0.1382, + "step": 4751 + }, + { + "epoch": 0.908604206500956, + "grad_norm": 3.028075695037842, + "learning_rate": 5e-06, + "loss": 0.5589, + "step": 4752 + }, + { + "epoch": 0.9087954110898662, + "grad_norm": 1.7719171047210693, + "learning_rate": 5e-06, + "loss": 0.2481, + "step": 4753 + }, + { + "epoch": 0.9089866156787763, + "grad_norm": 2.284649610519409, + "learning_rate": 5e-06, + "loss": 0.3038, + "step": 4754 + }, + { + "epoch": 0.9091778202676865, + "grad_norm": 1.697432041168213, + "learning_rate": 5e-06, + "loss": 0.093, + "step": 4755 + }, + { + "epoch": 0.9093690248565965, + "grad_norm": 1.257247805595398, + "learning_rate": 5e-06, + "loss": 0.1112, + "step": 4756 + }, + { + "epoch": 0.9095602294455067, + "grad_norm": 4.2492523193359375, + "learning_rate": 5e-06, + "loss": 0.2155, + "step": 4757 + }, + { + "epoch": 0.9097514340344168, + "grad_norm": 1.3657695055007935, + "learning_rate": 5e-06, + "loss": 0.2232, + "step": 4758 + }, + { + "epoch": 0.9099426386233269, + "grad_norm": 1.7062994241714478, + "learning_rate": 5e-06, + "loss": 0.2433, + "step": 4759 + }, + { + "epoch": 0.9101338432122371, + "grad_norm": 1.1820368766784668, + "learning_rate": 5e-06, + "loss": 0.0536, + "step": 4760 + }, + { + "epoch": 0.9103250478011472, + "grad_norm": 1.561043620109558, + "learning_rate": 5e-06, + "loss": 0.0761, + "step": 4761 + }, + { + "epoch": 0.9105162523900574, + "grad_norm": 2.44998836517334, + "learning_rate": 5e-06, + "loss": 0.2631, + "step": 4762 + }, + { + "epoch": 0.9107074569789675, + "grad_norm": 2.2695345878601074, + "learning_rate": 5e-06, + "loss": 0.2851, + "step": 4763 + }, + { + "epoch": 0.9108986615678776, + "grad_norm": 2.3107171058654785, + "learning_rate": 5e-06, + "loss": 0.3215, + "step": 4764 + }, + { + "epoch": 0.9110898661567878, + "grad_norm": 2.1133556365966797, + "learning_rate": 5e-06, + "loss": 0.2423, + "step": 4765 + }, + { + "epoch": 0.9112810707456979, + "grad_norm": 1.5830267667770386, + "learning_rate": 5e-06, + "loss": 0.1177, + "step": 4766 + }, + { + "epoch": 0.9114722753346081, + "grad_norm": 2.6378262042999268, + "learning_rate": 5e-06, + "loss": 0.1486, + "step": 4767 + }, + { + "epoch": 0.9116634799235181, + "grad_norm": 1.1338980197906494, + "learning_rate": 5e-06, + "loss": 0.0795, + "step": 4768 + }, + { + "epoch": 0.9118546845124283, + "grad_norm": 2.1762497425079346, + "learning_rate": 5e-06, + "loss": 0.1608, + "step": 4769 + }, + { + "epoch": 0.9120458891013384, + "grad_norm": 2.5121426582336426, + "learning_rate": 5e-06, + "loss": 0.3784, + "step": 4770 + }, + { + "epoch": 0.9122370936902485, + "grad_norm": 2.436002731323242, + "learning_rate": 5e-06, + "loss": 0.3143, + "step": 4771 + }, + { + "epoch": 0.9124282982791587, + "grad_norm": 2.774428129196167, + "learning_rate": 5e-06, + "loss": 0.3945, + "step": 4772 + }, + { + "epoch": 0.9126195028680688, + "grad_norm": 1.4498441219329834, + "learning_rate": 5e-06, + "loss": 0.1016, + "step": 4773 + }, + { + "epoch": 0.912810707456979, + "grad_norm": 2.4954347610473633, + "learning_rate": 5e-06, + "loss": 0.0956, + "step": 4774 + }, + { + "epoch": 0.9130019120458891, + "grad_norm": 1.8431687355041504, + "learning_rate": 5e-06, + "loss": 0.0724, + "step": 4775 + }, + { + "epoch": 0.9131931166347992, + "grad_norm": 1.2787989377975464, + "learning_rate": 5e-06, + "loss": 0.1331, + "step": 4776 + }, + { + "epoch": 0.9133843212237094, + "grad_norm": 1.2174687385559082, + "learning_rate": 5e-06, + "loss": 0.1211, + "step": 4777 + }, + { + "epoch": 0.9135755258126195, + "grad_norm": 2.3031578063964844, + "learning_rate": 5e-06, + "loss": 0.3992, + "step": 4778 + }, + { + "epoch": 0.9137667304015297, + "grad_norm": 2.224651336669922, + "learning_rate": 5e-06, + "loss": 0.3414, + "step": 4779 + }, + { + "epoch": 0.9139579349904398, + "grad_norm": 1.978184700012207, + "learning_rate": 5e-06, + "loss": 0.2514, + "step": 4780 + }, + { + "epoch": 0.91414913957935, + "grad_norm": 1.6605628728866577, + "learning_rate": 5e-06, + "loss": 0.0844, + "step": 4781 + }, + { + "epoch": 0.91434034416826, + "grad_norm": 1.5570906400680542, + "learning_rate": 5e-06, + "loss": 0.1116, + "step": 4782 + }, + { + "epoch": 0.9145315487571701, + "grad_norm": 1.553684949874878, + "learning_rate": 5e-06, + "loss": 0.1289, + "step": 4783 + }, + { + "epoch": 0.9147227533460803, + "grad_norm": 1.4999451637268066, + "learning_rate": 5e-06, + "loss": 0.0977, + "step": 4784 + }, + { + "epoch": 0.9149139579349904, + "grad_norm": 1.102944254875183, + "learning_rate": 5e-06, + "loss": 0.0922, + "step": 4785 + }, + { + "epoch": 0.9151051625239006, + "grad_norm": 1.5913718938827515, + "learning_rate": 5e-06, + "loss": 0.0704, + "step": 4786 + }, + { + "epoch": 0.9152963671128107, + "grad_norm": 1.003926396369934, + "learning_rate": 5e-06, + "loss": 0.0447, + "step": 4787 + }, + { + "epoch": 0.9154875717017208, + "grad_norm": 3.418393135070801, + "learning_rate": 5e-06, + "loss": 0.5752, + "step": 4788 + }, + { + "epoch": 0.915678776290631, + "grad_norm": 1.964396595954895, + "learning_rate": 5e-06, + "loss": 0.374, + "step": 4789 + }, + { + "epoch": 0.9158699808795411, + "grad_norm": 2.352846145629883, + "learning_rate": 5e-06, + "loss": 0.2557, + "step": 4790 + }, + { + "epoch": 0.9160611854684513, + "grad_norm": 1.851499319076538, + "learning_rate": 5e-06, + "loss": 0.1095, + "step": 4791 + }, + { + "epoch": 0.9162523900573614, + "grad_norm": 2.526007652282715, + "learning_rate": 5e-06, + "loss": 0.1511, + "step": 4792 + }, + { + "epoch": 0.9164435946462716, + "grad_norm": 2.6899187564849854, + "learning_rate": 5e-06, + "loss": 0.2373, + "step": 4793 + }, + { + "epoch": 0.9166347992351817, + "grad_norm": 1.4613804817199707, + "learning_rate": 5e-06, + "loss": 0.0741, + "step": 4794 + }, + { + "epoch": 0.9168260038240917, + "grad_norm": 1.7826400995254517, + "learning_rate": 5e-06, + "loss": 0.1562, + "step": 4795 + }, + { + "epoch": 0.9170172084130019, + "grad_norm": 2.50441575050354, + "learning_rate": 5e-06, + "loss": 0.1662, + "step": 4796 + }, + { + "epoch": 0.917208413001912, + "grad_norm": 1.1559553146362305, + "learning_rate": 5e-06, + "loss": 0.0645, + "step": 4797 + }, + { + "epoch": 0.9173996175908222, + "grad_norm": 1.493688702583313, + "learning_rate": 5e-06, + "loss": 0.1094, + "step": 4798 + }, + { + "epoch": 0.9175908221797323, + "grad_norm": 2.530729055404663, + "learning_rate": 5e-06, + "loss": 0.127, + "step": 4799 + }, + { + "epoch": 0.9177820267686424, + "grad_norm": 2.9667983055114746, + "learning_rate": 5e-06, + "loss": 0.1215, + "step": 4800 + }, + { + "epoch": 0.9179732313575526, + "grad_norm": 1.1197313070297241, + "learning_rate": 5e-06, + "loss": 0.0769, + "step": 4801 + }, + { + "epoch": 0.9181644359464627, + "grad_norm": 1.4685611724853516, + "learning_rate": 5e-06, + "loss": 0.1881, + "step": 4802 + }, + { + "epoch": 0.9183556405353729, + "grad_norm": 1.4080411195755005, + "learning_rate": 5e-06, + "loss": 0.1195, + "step": 4803 + }, + { + "epoch": 0.918546845124283, + "grad_norm": 3.9118635654449463, + "learning_rate": 5e-06, + "loss": 0.3449, + "step": 4804 + }, + { + "epoch": 0.9187380497131931, + "grad_norm": 1.7937930822372437, + "learning_rate": 5e-06, + "loss": 0.2372, + "step": 4805 + }, + { + "epoch": 0.9189292543021033, + "grad_norm": 1.3490447998046875, + "learning_rate": 5e-06, + "loss": 0.0773, + "step": 4806 + }, + { + "epoch": 0.9191204588910133, + "grad_norm": 2.1110494136810303, + "learning_rate": 5e-06, + "loss": 0.2215, + "step": 4807 + }, + { + "epoch": 0.9193116634799235, + "grad_norm": 2.0199520587921143, + "learning_rate": 5e-06, + "loss": 0.2441, + "step": 4808 + }, + { + "epoch": 0.9195028680688336, + "grad_norm": 1.7356966733932495, + "learning_rate": 5e-06, + "loss": 0.1303, + "step": 4809 + }, + { + "epoch": 0.9196940726577438, + "grad_norm": 1.952366828918457, + "learning_rate": 5e-06, + "loss": 0.1433, + "step": 4810 + }, + { + "epoch": 0.9198852772466539, + "grad_norm": 2.1498913764953613, + "learning_rate": 5e-06, + "loss": 0.1087, + "step": 4811 + }, + { + "epoch": 0.920076481835564, + "grad_norm": 1.1379492282867432, + "learning_rate": 5e-06, + "loss": 0.0524, + "step": 4812 + }, + { + "epoch": 0.9202676864244742, + "grad_norm": 2.057628870010376, + "learning_rate": 5e-06, + "loss": 0.1946, + "step": 4813 + }, + { + "epoch": 0.9204588910133843, + "grad_norm": 2.6569430828094482, + "learning_rate": 5e-06, + "loss": 0.4691, + "step": 4814 + }, + { + "epoch": 0.9206500956022945, + "grad_norm": 2.3530871868133545, + "learning_rate": 5e-06, + "loss": 0.2668, + "step": 4815 + }, + { + "epoch": 0.9208413001912046, + "grad_norm": 2.5906331539154053, + "learning_rate": 5e-06, + "loss": 0.2775, + "step": 4816 + }, + { + "epoch": 0.9210325047801147, + "grad_norm": 1.04031503200531, + "learning_rate": 5e-06, + "loss": 0.0453, + "step": 4817 + }, + { + "epoch": 0.9212237093690249, + "grad_norm": 1.8329519033432007, + "learning_rate": 5e-06, + "loss": 0.0703, + "step": 4818 + }, + { + "epoch": 0.921414913957935, + "grad_norm": 1.4083631038665771, + "learning_rate": 5e-06, + "loss": 0.0891, + "step": 4819 + }, + { + "epoch": 0.9216061185468452, + "grad_norm": 2.5658862590789795, + "learning_rate": 5e-06, + "loss": 0.4482, + "step": 4820 + }, + { + "epoch": 0.9217973231357552, + "grad_norm": 2.2376391887664795, + "learning_rate": 5e-06, + "loss": 0.2053, + "step": 4821 + }, + { + "epoch": 0.9219885277246654, + "grad_norm": 1.5132383108139038, + "learning_rate": 5e-06, + "loss": 0.1199, + "step": 4822 + }, + { + "epoch": 0.9221797323135755, + "grad_norm": 1.9439269304275513, + "learning_rate": 5e-06, + "loss": 0.2446, + "step": 4823 + }, + { + "epoch": 0.9223709369024856, + "grad_norm": 1.8772526979446411, + "learning_rate": 5e-06, + "loss": 0.0855, + "step": 4824 + }, + { + "epoch": 0.9225621414913958, + "grad_norm": 0.8960617184638977, + "learning_rate": 5e-06, + "loss": 0.0342, + "step": 4825 + }, + { + "epoch": 0.9227533460803059, + "grad_norm": 2.050640106201172, + "learning_rate": 5e-06, + "loss": 0.3478, + "step": 4826 + }, + { + "epoch": 0.9229445506692161, + "grad_norm": 1.714992880821228, + "learning_rate": 5e-06, + "loss": 0.1056, + "step": 4827 + }, + { + "epoch": 0.9231357552581262, + "grad_norm": 1.1834768056869507, + "learning_rate": 5e-06, + "loss": 0.0615, + "step": 4828 + }, + { + "epoch": 0.9233269598470363, + "grad_norm": 1.3316147327423096, + "learning_rate": 5e-06, + "loss": 0.0867, + "step": 4829 + }, + { + "epoch": 0.9235181644359465, + "grad_norm": 2.6052935123443604, + "learning_rate": 5e-06, + "loss": 0.1191, + "step": 4830 + }, + { + "epoch": 0.9237093690248566, + "grad_norm": 4.509915828704834, + "learning_rate": 5e-06, + "loss": 0.3938, + "step": 4831 + }, + { + "epoch": 0.9239005736137668, + "grad_norm": 2.1398138999938965, + "learning_rate": 5e-06, + "loss": 0.3432, + "step": 4832 + }, + { + "epoch": 0.9240917782026769, + "grad_norm": 2.1571106910705566, + "learning_rate": 5e-06, + "loss": 0.3267, + "step": 4833 + }, + { + "epoch": 0.924282982791587, + "grad_norm": 1.6129310131072998, + "learning_rate": 5e-06, + "loss": 0.1886, + "step": 4834 + }, + { + "epoch": 0.9244741873804971, + "grad_norm": 1.8231936693191528, + "learning_rate": 5e-06, + "loss": 0.1287, + "step": 4835 + }, + { + "epoch": 0.9246653919694072, + "grad_norm": 1.4259830713272095, + "learning_rate": 5e-06, + "loss": 0.065, + "step": 4836 + }, + { + "epoch": 0.9248565965583174, + "grad_norm": 1.6335722208023071, + "learning_rate": 5e-06, + "loss": 0.0964, + "step": 4837 + }, + { + "epoch": 0.9250478011472275, + "grad_norm": 2.2979252338409424, + "learning_rate": 5e-06, + "loss": 0.0973, + "step": 4838 + }, + { + "epoch": 0.9252390057361377, + "grad_norm": 2.0486056804656982, + "learning_rate": 5e-06, + "loss": 0.4276, + "step": 4839 + }, + { + "epoch": 0.9254302103250478, + "grad_norm": 0.9143030047416687, + "learning_rate": 5e-06, + "loss": 0.0769, + "step": 4840 + }, + { + "epoch": 0.9256214149139579, + "grad_norm": 2.2786619663238525, + "learning_rate": 5e-06, + "loss": 0.2726, + "step": 4841 + }, + { + "epoch": 0.9258126195028681, + "grad_norm": 0.9805214405059814, + "learning_rate": 5e-06, + "loss": 0.0744, + "step": 4842 + }, + { + "epoch": 0.9260038240917782, + "grad_norm": 1.4829126596450806, + "learning_rate": 5e-06, + "loss": 0.0954, + "step": 4843 + }, + { + "epoch": 0.9261950286806884, + "grad_norm": 1.7729511260986328, + "learning_rate": 5e-06, + "loss": 0.1219, + "step": 4844 + }, + { + "epoch": 0.9263862332695985, + "grad_norm": 2.1169512271881104, + "learning_rate": 5e-06, + "loss": 0.2112, + "step": 4845 + }, + { + "epoch": 0.9265774378585087, + "grad_norm": 3.1143903732299805, + "learning_rate": 5e-06, + "loss": 0.1733, + "step": 4846 + }, + { + "epoch": 0.9267686424474187, + "grad_norm": 1.8943371772766113, + "learning_rate": 5e-06, + "loss": 0.3117, + "step": 4847 + }, + { + "epoch": 0.9269598470363288, + "grad_norm": 1.6305577754974365, + "learning_rate": 5e-06, + "loss": 0.169, + "step": 4848 + }, + { + "epoch": 0.927151051625239, + "grad_norm": 0.9189224243164062, + "learning_rate": 5e-06, + "loss": 0.0763, + "step": 4849 + }, + { + "epoch": 0.9273422562141491, + "grad_norm": 2.8332042694091797, + "learning_rate": 5e-06, + "loss": 0.2152, + "step": 4850 + }, + { + "epoch": 0.9275334608030593, + "grad_norm": 5.684455394744873, + "learning_rate": 5e-06, + "loss": 0.2779, + "step": 4851 + }, + { + "epoch": 0.9277246653919694, + "grad_norm": 1.2276694774627686, + "learning_rate": 5e-06, + "loss": 0.1409, + "step": 4852 + }, + { + "epoch": 0.9279158699808795, + "grad_norm": 2.8665270805358887, + "learning_rate": 5e-06, + "loss": 0.3275, + "step": 4853 + }, + { + "epoch": 0.9281070745697897, + "grad_norm": 1.772354245185852, + "learning_rate": 5e-06, + "loss": 0.1607, + "step": 4854 + }, + { + "epoch": 0.9282982791586998, + "grad_norm": 2.3981382846832275, + "learning_rate": 5e-06, + "loss": 0.1545, + "step": 4855 + }, + { + "epoch": 0.92848948374761, + "grad_norm": 1.4854987859725952, + "learning_rate": 5e-06, + "loss": 0.0896, + "step": 4856 + }, + { + "epoch": 0.9286806883365201, + "grad_norm": 1.3724586963653564, + "learning_rate": 5e-06, + "loss": 0.1103, + "step": 4857 + }, + { + "epoch": 0.9288718929254303, + "grad_norm": 1.819861650466919, + "learning_rate": 5e-06, + "loss": 0.2387, + "step": 4858 + }, + { + "epoch": 0.9290630975143404, + "grad_norm": 1.742345929145813, + "learning_rate": 5e-06, + "loss": 0.1194, + "step": 4859 + }, + { + "epoch": 0.9292543021032504, + "grad_norm": 1.5727949142456055, + "learning_rate": 5e-06, + "loss": 0.1547, + "step": 4860 + }, + { + "epoch": 0.9294455066921606, + "grad_norm": 1.9011776447296143, + "learning_rate": 5e-06, + "loss": 0.2739, + "step": 4861 + }, + { + "epoch": 0.9296367112810707, + "grad_norm": 1.4754587411880493, + "learning_rate": 5e-06, + "loss": 0.111, + "step": 4862 + }, + { + "epoch": 0.9298279158699809, + "grad_norm": 1.5760308504104614, + "learning_rate": 5e-06, + "loss": 0.1089, + "step": 4863 + }, + { + "epoch": 0.930019120458891, + "grad_norm": 2.719432830810547, + "learning_rate": 5e-06, + "loss": 0.4185, + "step": 4864 + }, + { + "epoch": 0.9302103250478011, + "grad_norm": 2.2570371627807617, + "learning_rate": 5e-06, + "loss": 0.1839, + "step": 4865 + }, + { + "epoch": 0.9304015296367113, + "grad_norm": 1.3954803943634033, + "learning_rate": 5e-06, + "loss": 0.099, + "step": 4866 + }, + { + "epoch": 0.9305927342256214, + "grad_norm": 1.2225748300552368, + "learning_rate": 5e-06, + "loss": 0.1137, + "step": 4867 + }, + { + "epoch": 0.9307839388145316, + "grad_norm": 0.926919162273407, + "learning_rate": 5e-06, + "loss": 0.0477, + "step": 4868 + }, + { + "epoch": 0.9309751434034417, + "grad_norm": 1.8387597799301147, + "learning_rate": 5e-06, + "loss": 0.0733, + "step": 4869 + }, + { + "epoch": 0.9311663479923518, + "grad_norm": 1.1521694660186768, + "learning_rate": 5e-06, + "loss": 0.1333, + "step": 4870 + }, + { + "epoch": 0.931357552581262, + "grad_norm": 1.0635244846343994, + "learning_rate": 5e-06, + "loss": 0.1047, + "step": 4871 + }, + { + "epoch": 0.931548757170172, + "grad_norm": 1.6397044658660889, + "learning_rate": 5e-06, + "loss": 0.1682, + "step": 4872 + }, + { + "epoch": 0.9317399617590822, + "grad_norm": 1.338733434677124, + "learning_rate": 5e-06, + "loss": 0.1432, + "step": 4873 + }, + { + "epoch": 0.9319311663479923, + "grad_norm": 1.307789921760559, + "learning_rate": 5e-06, + "loss": 0.0916, + "step": 4874 + }, + { + "epoch": 0.9321223709369025, + "grad_norm": 2.0790135860443115, + "learning_rate": 5e-06, + "loss": 0.119, + "step": 4875 + }, + { + "epoch": 0.9323135755258126, + "grad_norm": 1.5820248126983643, + "learning_rate": 5e-06, + "loss": 0.206, + "step": 4876 + }, + { + "epoch": 0.9325047801147227, + "grad_norm": 1.5666821002960205, + "learning_rate": 5e-06, + "loss": 0.1286, + "step": 4877 + }, + { + "epoch": 0.9326959847036329, + "grad_norm": 1.4068495035171509, + "learning_rate": 5e-06, + "loss": 0.2069, + "step": 4878 + }, + { + "epoch": 0.932887189292543, + "grad_norm": 1.985937237739563, + "learning_rate": 5e-06, + "loss": 0.1667, + "step": 4879 + }, + { + "epoch": 0.9330783938814532, + "grad_norm": 1.2681323289871216, + "learning_rate": 5e-06, + "loss": 0.0669, + "step": 4880 + }, + { + "epoch": 0.9332695984703633, + "grad_norm": 1.4360939264297485, + "learning_rate": 5e-06, + "loss": 0.0561, + "step": 4881 + }, + { + "epoch": 0.9334608030592734, + "grad_norm": 1.2678735256195068, + "learning_rate": 5e-06, + "loss": 0.1019, + "step": 4882 + }, + { + "epoch": 0.9336520076481836, + "grad_norm": 2.8025400638580322, + "learning_rate": 5e-06, + "loss": 0.4316, + "step": 4883 + }, + { + "epoch": 0.9338432122370937, + "grad_norm": 2.1860554218292236, + "learning_rate": 5e-06, + "loss": 0.1563, + "step": 4884 + }, + { + "epoch": 0.9340344168260039, + "grad_norm": 1.1986353397369385, + "learning_rate": 5e-06, + "loss": 0.099, + "step": 4885 + }, + { + "epoch": 0.9342256214149139, + "grad_norm": 0.7431411743164062, + "learning_rate": 5e-06, + "loss": 0.0325, + "step": 4886 + }, + { + "epoch": 0.9344168260038241, + "grad_norm": 1.0779439210891724, + "learning_rate": 5e-06, + "loss": 0.0742, + "step": 4887 + }, + { + "epoch": 0.9346080305927342, + "grad_norm": 1.574372410774231, + "learning_rate": 5e-06, + "loss": 0.1254, + "step": 4888 + }, + { + "epoch": 0.9347992351816443, + "grad_norm": 1.4904820919036865, + "learning_rate": 5e-06, + "loss": 0.1166, + "step": 4889 + }, + { + "epoch": 0.9349904397705545, + "grad_norm": 3.761897325515747, + "learning_rate": 5e-06, + "loss": 0.5072, + "step": 4890 + }, + { + "epoch": 0.9351816443594646, + "grad_norm": 2.5792999267578125, + "learning_rate": 5e-06, + "loss": 0.3623, + "step": 4891 + }, + { + "epoch": 0.9353728489483748, + "grad_norm": 2.6418819427490234, + "learning_rate": 5e-06, + "loss": 0.1922, + "step": 4892 + }, + { + "epoch": 0.9355640535372849, + "grad_norm": 2.635206937789917, + "learning_rate": 5e-06, + "loss": 0.1172, + "step": 4893 + }, + { + "epoch": 0.935755258126195, + "grad_norm": 2.728708028793335, + "learning_rate": 5e-06, + "loss": 0.3097, + "step": 4894 + }, + { + "epoch": 0.9359464627151052, + "grad_norm": 2.1992111206054688, + "learning_rate": 5e-06, + "loss": 0.2703, + "step": 4895 + }, + { + "epoch": 0.9361376673040153, + "grad_norm": 1.2282865047454834, + "learning_rate": 5e-06, + "loss": 0.1054, + "step": 4896 + }, + { + "epoch": 0.9363288718929255, + "grad_norm": 1.8844170570373535, + "learning_rate": 5e-06, + "loss": 0.1161, + "step": 4897 + }, + { + "epoch": 0.9365200764818356, + "grad_norm": 1.5336863994598389, + "learning_rate": 5e-06, + "loss": 0.109, + "step": 4898 + }, + { + "epoch": 0.9367112810707457, + "grad_norm": 1.703996181488037, + "learning_rate": 5e-06, + "loss": 0.081, + "step": 4899 + }, + { + "epoch": 0.9369024856596558, + "grad_norm": 1.7110828161239624, + "learning_rate": 5e-06, + "loss": 0.1016, + "step": 4900 + }, + { + "epoch": 0.9370936902485659, + "grad_norm": 1.9495599269866943, + "learning_rate": 5e-06, + "loss": 0.3473, + "step": 4901 + }, + { + "epoch": 0.9372848948374761, + "grad_norm": 2.5797476768493652, + "learning_rate": 5e-06, + "loss": 0.4117, + "step": 4902 + }, + { + "epoch": 0.9374760994263862, + "grad_norm": 2.9593513011932373, + "learning_rate": 5e-06, + "loss": 0.3348, + "step": 4903 + }, + { + "epoch": 0.9376673040152964, + "grad_norm": 1.2573286294937134, + "learning_rate": 5e-06, + "loss": 0.1025, + "step": 4904 + }, + { + "epoch": 0.9378585086042065, + "grad_norm": 0.9126336574554443, + "learning_rate": 5e-06, + "loss": 0.0479, + "step": 4905 + }, + { + "epoch": 0.9380497131931166, + "grad_norm": 2.2401199340820312, + "learning_rate": 5e-06, + "loss": 0.1684, + "step": 4906 + }, + { + "epoch": 0.9382409177820268, + "grad_norm": 1.2739428281784058, + "learning_rate": 5e-06, + "loss": 0.1452, + "step": 4907 + }, + { + "epoch": 0.9384321223709369, + "grad_norm": 1.4765453338623047, + "learning_rate": 5e-06, + "loss": 0.1207, + "step": 4908 + }, + { + "epoch": 0.9386233269598471, + "grad_norm": 2.1147773265838623, + "learning_rate": 5e-06, + "loss": 0.1677, + "step": 4909 + }, + { + "epoch": 0.9388145315487572, + "grad_norm": 1.5339463949203491, + "learning_rate": 5e-06, + "loss": 0.1036, + "step": 4910 + }, + { + "epoch": 0.9390057361376674, + "grad_norm": 2.7874417304992676, + "learning_rate": 5e-06, + "loss": 0.1291, + "step": 4911 + }, + { + "epoch": 0.9391969407265774, + "grad_norm": 1.52443265914917, + "learning_rate": 5e-06, + "loss": 0.0438, + "step": 4912 + }, + { + "epoch": 0.9393881453154875, + "grad_norm": 2.6554689407348633, + "learning_rate": 5e-06, + "loss": 0.4022, + "step": 4913 + }, + { + "epoch": 0.9395793499043977, + "grad_norm": 2.2543184757232666, + "learning_rate": 5e-06, + "loss": 0.3544, + "step": 4914 + }, + { + "epoch": 0.9397705544933078, + "grad_norm": 3.414179563522339, + "learning_rate": 5e-06, + "loss": 0.5646, + "step": 4915 + }, + { + "epoch": 0.939961759082218, + "grad_norm": 2.3218886852264404, + "learning_rate": 5e-06, + "loss": 0.2771, + "step": 4916 + }, + { + "epoch": 0.9401529636711281, + "grad_norm": 1.0181642770767212, + "learning_rate": 5e-06, + "loss": 0.0543, + "step": 4917 + }, + { + "epoch": 0.9403441682600382, + "grad_norm": 2.524704933166504, + "learning_rate": 5e-06, + "loss": 0.1449, + "step": 4918 + }, + { + "epoch": 0.9405353728489484, + "grad_norm": 2.3251535892486572, + "learning_rate": 5e-06, + "loss": 0.1364, + "step": 4919 + }, + { + "epoch": 0.9407265774378585, + "grad_norm": 2.4270453453063965, + "learning_rate": 5e-06, + "loss": 0.2367, + "step": 4920 + }, + { + "epoch": 0.9409177820267687, + "grad_norm": 1.9268070459365845, + "learning_rate": 5e-06, + "loss": 0.2149, + "step": 4921 + }, + { + "epoch": 0.9411089866156788, + "grad_norm": 1.8743796348571777, + "learning_rate": 5e-06, + "loss": 0.3339, + "step": 4922 + }, + { + "epoch": 0.941300191204589, + "grad_norm": 1.756379246711731, + "learning_rate": 5e-06, + "loss": 0.119, + "step": 4923 + }, + { + "epoch": 0.941491395793499, + "grad_norm": 0.8560100793838501, + "learning_rate": 5e-06, + "loss": 0.0507, + "step": 4924 + }, + { + "epoch": 0.9416826003824091, + "grad_norm": 2.884312391281128, + "learning_rate": 5e-06, + "loss": 0.2991, + "step": 4925 + }, + { + "epoch": 0.9418738049713193, + "grad_norm": 1.6877045631408691, + "learning_rate": 5e-06, + "loss": 0.2015, + "step": 4926 + }, + { + "epoch": 0.9420650095602294, + "grad_norm": 1.8372879028320312, + "learning_rate": 5e-06, + "loss": 0.1594, + "step": 4927 + }, + { + "epoch": 0.9422562141491396, + "grad_norm": 1.6219450235366821, + "learning_rate": 5e-06, + "loss": 0.1066, + "step": 4928 + }, + { + "epoch": 0.9424474187380497, + "grad_norm": 3.6380860805511475, + "learning_rate": 5e-06, + "loss": 0.541, + "step": 4929 + }, + { + "epoch": 0.9426386233269598, + "grad_norm": 2.0586066246032715, + "learning_rate": 5e-06, + "loss": 0.1329, + "step": 4930 + }, + { + "epoch": 0.94282982791587, + "grad_norm": 1.7319679260253906, + "learning_rate": 5e-06, + "loss": 0.0845, + "step": 4931 + }, + { + "epoch": 0.9430210325047801, + "grad_norm": 2.5363359451293945, + "learning_rate": 5e-06, + "loss": 0.4112, + "step": 4932 + }, + { + "epoch": 0.9432122370936903, + "grad_norm": 1.9458513259887695, + "learning_rate": 5e-06, + "loss": 0.2529, + "step": 4933 + }, + { + "epoch": 0.9434034416826004, + "grad_norm": 3.23052716255188, + "learning_rate": 5e-06, + "loss": 0.5106, + "step": 4934 + }, + { + "epoch": 0.9435946462715105, + "grad_norm": 1.5255218744277954, + "learning_rate": 5e-06, + "loss": 0.1049, + "step": 4935 + }, + { + "epoch": 0.9437858508604207, + "grad_norm": 2.298287868499756, + "learning_rate": 5e-06, + "loss": 0.3477, + "step": 4936 + }, + { + "epoch": 0.9439770554493307, + "grad_norm": 1.1250810623168945, + "learning_rate": 5e-06, + "loss": 0.054, + "step": 4937 + }, + { + "epoch": 0.944168260038241, + "grad_norm": 3.244180679321289, + "learning_rate": 5e-06, + "loss": 0.2005, + "step": 4938 + }, + { + "epoch": 0.944359464627151, + "grad_norm": 3.1849043369293213, + "learning_rate": 5e-06, + "loss": 0.355, + "step": 4939 + }, + { + "epoch": 0.9445506692160612, + "grad_norm": 2.582594156265259, + "learning_rate": 5e-06, + "loss": 0.1716, + "step": 4940 + }, + { + "epoch": 0.9447418738049713, + "grad_norm": 1.0963287353515625, + "learning_rate": 5e-06, + "loss": 0.0854, + "step": 4941 + }, + { + "epoch": 0.9449330783938814, + "grad_norm": 2.1446595191955566, + "learning_rate": 5e-06, + "loss": 0.2562, + "step": 4942 + }, + { + "epoch": 0.9451242829827916, + "grad_norm": 1.8604848384857178, + "learning_rate": 5e-06, + "loss": 0.0994, + "step": 4943 + }, + { + "epoch": 0.9453154875717017, + "grad_norm": 1.9539889097213745, + "learning_rate": 5e-06, + "loss": 0.0991, + "step": 4944 + }, + { + "epoch": 0.9455066921606119, + "grad_norm": 2.2747185230255127, + "learning_rate": 5e-06, + "loss": 0.2482, + "step": 4945 + }, + { + "epoch": 0.945697896749522, + "grad_norm": 1.741715669631958, + "learning_rate": 5e-06, + "loss": 0.2089, + "step": 4946 + }, + { + "epoch": 0.9458891013384321, + "grad_norm": 1.2032427787780762, + "learning_rate": 5e-06, + "loss": 0.095, + "step": 4947 + }, + { + "epoch": 0.9460803059273423, + "grad_norm": 1.8849596977233887, + "learning_rate": 5e-06, + "loss": 0.0805, + "step": 4948 + }, + { + "epoch": 0.9462715105162524, + "grad_norm": 1.4403064250946045, + "learning_rate": 5e-06, + "loss": 0.1019, + "step": 4949 + }, + { + "epoch": 0.9464627151051626, + "grad_norm": 1.0823032855987549, + "learning_rate": 5e-06, + "loss": 0.059, + "step": 4950 + }, + { + "epoch": 0.9466539196940726, + "grad_norm": 3.9184749126434326, + "learning_rate": 5e-06, + "loss": 0.7392, + "step": 4951 + }, + { + "epoch": 0.9468451242829828, + "grad_norm": 10.959403991699219, + "learning_rate": 5e-06, + "loss": 0.1341, + "step": 4952 + }, + { + "epoch": 0.9470363288718929, + "grad_norm": 2.1564931869506836, + "learning_rate": 5e-06, + "loss": 0.2236, + "step": 4953 + }, + { + "epoch": 0.947227533460803, + "grad_norm": 1.5129090547561646, + "learning_rate": 5e-06, + "loss": 0.1154, + "step": 4954 + }, + { + "epoch": 0.9474187380497132, + "grad_norm": 0.9726706147193909, + "learning_rate": 5e-06, + "loss": 0.0421, + "step": 4955 + }, + { + "epoch": 0.9476099426386233, + "grad_norm": 1.2113652229309082, + "learning_rate": 5e-06, + "loss": 0.0896, + "step": 4956 + }, + { + "epoch": 0.9478011472275335, + "grad_norm": 1.5728042125701904, + "learning_rate": 5e-06, + "loss": 0.1368, + "step": 4957 + }, + { + "epoch": 0.9479923518164436, + "grad_norm": 1.3641796112060547, + "learning_rate": 5e-06, + "loss": 0.1675, + "step": 4958 + }, + { + "epoch": 0.9481835564053537, + "grad_norm": 1.7657511234283447, + "learning_rate": 5e-06, + "loss": 0.2036, + "step": 4959 + }, + { + "epoch": 0.9483747609942639, + "grad_norm": 0.9892101883888245, + "learning_rate": 5e-06, + "loss": 0.0903, + "step": 4960 + }, + { + "epoch": 0.948565965583174, + "grad_norm": 2.8380050659179688, + "learning_rate": 5e-06, + "loss": 0.3084, + "step": 4961 + }, + { + "epoch": 0.9487571701720842, + "grad_norm": 0.8977944850921631, + "learning_rate": 5e-06, + "loss": 0.0524, + "step": 4962 + }, + { + "epoch": 0.9489483747609943, + "grad_norm": 2.5370876789093018, + "learning_rate": 5e-06, + "loss": 0.407, + "step": 4963 + }, + { + "epoch": 0.9491395793499044, + "grad_norm": 2.922842502593994, + "learning_rate": 5e-06, + "loss": 0.4577, + "step": 4964 + }, + { + "epoch": 0.9493307839388145, + "grad_norm": 1.981605887413025, + "learning_rate": 5e-06, + "loss": 0.2596, + "step": 4965 + }, + { + "epoch": 0.9495219885277246, + "grad_norm": 2.6923184394836426, + "learning_rate": 5e-06, + "loss": 0.4727, + "step": 4966 + }, + { + "epoch": 0.9497131931166348, + "grad_norm": 1.4957002401351929, + "learning_rate": 5e-06, + "loss": 0.1353, + "step": 4967 + }, + { + "epoch": 0.9499043977055449, + "grad_norm": 1.2848210334777832, + "learning_rate": 5e-06, + "loss": 0.078, + "step": 4968 + }, + { + "epoch": 0.9500956022944551, + "grad_norm": 1.8343178033828735, + "learning_rate": 5e-06, + "loss": 0.1058, + "step": 4969 + }, + { + "epoch": 0.9502868068833652, + "grad_norm": 1.764131784439087, + "learning_rate": 5e-06, + "loss": 0.2088, + "step": 4970 + }, + { + "epoch": 0.9504780114722753, + "grad_norm": 1.4648293256759644, + "learning_rate": 5e-06, + "loss": 0.1017, + "step": 4971 + }, + { + "epoch": 0.9506692160611855, + "grad_norm": 1.3409184217453003, + "learning_rate": 5e-06, + "loss": 0.0881, + "step": 4972 + }, + { + "epoch": 0.9508604206500956, + "grad_norm": 1.3016523122787476, + "learning_rate": 5e-06, + "loss": 0.0451, + "step": 4973 + }, + { + "epoch": 0.9510516252390058, + "grad_norm": 0.7061133980751038, + "learning_rate": 5e-06, + "loss": 0.0428, + "step": 4974 + }, + { + "epoch": 0.9512428298279159, + "grad_norm": 3.5251235961914062, + "learning_rate": 5e-06, + "loss": 0.2068, + "step": 4975 + }, + { + "epoch": 0.9514340344168261, + "grad_norm": 2.1218931674957275, + "learning_rate": 5e-06, + "loss": 0.3303, + "step": 4976 + }, + { + "epoch": 0.9516252390057361, + "grad_norm": 2.1984875202178955, + "learning_rate": 5e-06, + "loss": 0.3954, + "step": 4977 + }, + { + "epoch": 0.9518164435946462, + "grad_norm": 1.5826531648635864, + "learning_rate": 5e-06, + "loss": 0.1482, + "step": 4978 + }, + { + "epoch": 0.9520076481835564, + "grad_norm": 1.824207067489624, + "learning_rate": 5e-06, + "loss": 0.1272, + "step": 4979 + }, + { + "epoch": 0.9521988527724665, + "grad_norm": 1.0942171812057495, + "learning_rate": 5e-06, + "loss": 0.0596, + "step": 4980 + }, + { + "epoch": 0.9523900573613767, + "grad_norm": 2.183330535888672, + "learning_rate": 5e-06, + "loss": 0.1098, + "step": 4981 + }, + { + "epoch": 0.9525812619502868, + "grad_norm": 1.866705060005188, + "learning_rate": 5e-06, + "loss": 0.1239, + "step": 4982 + }, + { + "epoch": 0.9527724665391969, + "grad_norm": 2.1280243396759033, + "learning_rate": 5e-06, + "loss": 0.2009, + "step": 4983 + }, + { + "epoch": 0.9529636711281071, + "grad_norm": 1.8956713676452637, + "learning_rate": 5e-06, + "loss": 0.1752, + "step": 4984 + }, + { + "epoch": 0.9531548757170172, + "grad_norm": 2.302192449569702, + "learning_rate": 5e-06, + "loss": 0.2851, + "step": 4985 + }, + { + "epoch": 0.9533460803059274, + "grad_norm": 4.471398830413818, + "learning_rate": 5e-06, + "loss": 0.1626, + "step": 4986 + }, + { + "epoch": 0.9535372848948375, + "grad_norm": 2.243290424346924, + "learning_rate": 5e-06, + "loss": 0.1037, + "step": 4987 + }, + { + "epoch": 0.9537284894837476, + "grad_norm": 2.0457358360290527, + "learning_rate": 5e-06, + "loss": 0.2751, + "step": 4988 + }, + { + "epoch": 0.9539196940726578, + "grad_norm": 1.7639656066894531, + "learning_rate": 5e-06, + "loss": 0.2408, + "step": 4989 + }, + { + "epoch": 0.9541108986615678, + "grad_norm": 2.313894748687744, + "learning_rate": 5e-06, + "loss": 0.2289, + "step": 4990 + }, + { + "epoch": 0.954302103250478, + "grad_norm": 1.8488069772720337, + "learning_rate": 5e-06, + "loss": 0.1356, + "step": 4991 + }, + { + "epoch": 0.9544933078393881, + "grad_norm": 1.261997938156128, + "learning_rate": 5e-06, + "loss": 0.1232, + "step": 4992 + }, + { + "epoch": 0.9546845124282983, + "grad_norm": 0.8408591747283936, + "learning_rate": 5e-06, + "loss": 0.0461, + "step": 4993 + }, + { + "epoch": 0.9548757170172084, + "grad_norm": 2.0298595428466797, + "learning_rate": 5e-06, + "loss": 0.0984, + "step": 4994 + }, + { + "epoch": 0.9550669216061185, + "grad_norm": 2.701509714126587, + "learning_rate": 5e-06, + "loss": 0.3663, + "step": 4995 + }, + { + "epoch": 0.9552581261950287, + "grad_norm": 1.558736801147461, + "learning_rate": 5e-06, + "loss": 0.1501, + "step": 4996 + }, + { + "epoch": 0.9554493307839388, + "grad_norm": 2.992225170135498, + "learning_rate": 5e-06, + "loss": 0.4301, + "step": 4997 + }, + { + "epoch": 0.955640535372849, + "grad_norm": 1.2962523698806763, + "learning_rate": 5e-06, + "loss": 0.0935, + "step": 4998 + }, + { + "epoch": 0.9558317399617591, + "grad_norm": 1.3460264205932617, + "learning_rate": 5e-06, + "loss": 0.0787, + "step": 4999 + }, + { + "epoch": 0.9560229445506692, + "grad_norm": 2.1929922103881836, + "learning_rate": 5e-06, + "loss": 0.1547, + "step": 5000 + }, + { + "epoch": 0.9560229445506692, + "eval_runtime": 742.0542, + "eval_samples_per_second": 2.067, + "eval_steps_per_second": 0.259, + "step": 5000 + }, + { + "epoch": 0.9562141491395794, + "grad_norm": 1.8363579511642456, + "learning_rate": 5e-06, + "loss": 0.1593, + "step": 5001 + }, + { + "epoch": 0.9564053537284894, + "grad_norm": 2.0634498596191406, + "learning_rate": 5e-06, + "loss": 0.1911, + "step": 5002 + }, + { + "epoch": 0.9565965583173996, + "grad_norm": 1.5266684293746948, + "learning_rate": 5e-06, + "loss": 0.1115, + "step": 5003 + }, + { + "epoch": 0.9567877629063097, + "grad_norm": 1.1533805131912231, + "learning_rate": 5e-06, + "loss": 0.1106, + "step": 5004 + }, + { + "epoch": 0.9569789674952199, + "grad_norm": 2.2362139225006104, + "learning_rate": 5e-06, + "loss": 0.1925, + "step": 5005 + }, + { + "epoch": 0.95717017208413, + "grad_norm": 2.0843067169189453, + "learning_rate": 5e-06, + "loss": 0.1154, + "step": 5006 + }, + { + "epoch": 0.9573613766730401, + "grad_norm": 2.134740114212036, + "learning_rate": 5e-06, + "loss": 0.3089, + "step": 5007 + }, + { + "epoch": 0.9575525812619503, + "grad_norm": 2.9222230911254883, + "learning_rate": 5e-06, + "loss": 0.4906, + "step": 5008 + }, + { + "epoch": 0.9577437858508604, + "grad_norm": 2.486271619796753, + "learning_rate": 5e-06, + "loss": 0.3661, + "step": 5009 + }, + { + "epoch": 0.9579349904397706, + "grad_norm": 0.9269048571586609, + "learning_rate": 5e-06, + "loss": 0.1277, + "step": 5010 + }, + { + "epoch": 0.9581261950286807, + "grad_norm": 2.4355266094207764, + "learning_rate": 5e-06, + "loss": 0.1951, + "step": 5011 + }, + { + "epoch": 0.9583173996175908, + "grad_norm": 2.3811750411987305, + "learning_rate": 5e-06, + "loss": 0.144, + "step": 5012 + }, + { + "epoch": 0.958508604206501, + "grad_norm": 2.0431880950927734, + "learning_rate": 5e-06, + "loss": 0.1872, + "step": 5013 + }, + { + "epoch": 0.9586998087954111, + "grad_norm": 3.104672908782959, + "learning_rate": 5e-06, + "loss": 0.4546, + "step": 5014 + }, + { + "epoch": 0.9588910133843213, + "grad_norm": 2.163524866104126, + "learning_rate": 5e-06, + "loss": 0.1894, + "step": 5015 + }, + { + "epoch": 0.9590822179732313, + "grad_norm": 2.0325043201446533, + "learning_rate": 5e-06, + "loss": 0.1353, + "step": 5016 + }, + { + "epoch": 0.9592734225621415, + "grad_norm": 2.2608044147491455, + "learning_rate": 5e-06, + "loss": 0.2509, + "step": 5017 + }, + { + "epoch": 0.9594646271510516, + "grad_norm": 1.2994120121002197, + "learning_rate": 5e-06, + "loss": 0.0878, + "step": 5018 + }, + { + "epoch": 0.9596558317399617, + "grad_norm": 1.0766092538833618, + "learning_rate": 5e-06, + "loss": 0.0688, + "step": 5019 + }, + { + "epoch": 0.9598470363288719, + "grad_norm": 2.5063343048095703, + "learning_rate": 5e-06, + "loss": 0.3423, + "step": 5020 + }, + { + "epoch": 0.960038240917782, + "grad_norm": 2.1994948387145996, + "learning_rate": 5e-06, + "loss": 0.2923, + "step": 5021 + }, + { + "epoch": 0.9602294455066922, + "grad_norm": 3.1960678100585938, + "learning_rate": 5e-06, + "loss": 0.4548, + "step": 5022 + }, + { + "epoch": 0.9604206500956023, + "grad_norm": 1.9123889207839966, + "learning_rate": 5e-06, + "loss": 0.1785, + "step": 5023 + }, + { + "epoch": 0.9606118546845124, + "grad_norm": 1.6679797172546387, + "learning_rate": 5e-06, + "loss": 0.1141, + "step": 5024 + }, + { + "epoch": 0.9608030592734226, + "grad_norm": 1.2722249031066895, + "learning_rate": 5e-06, + "loss": 0.0643, + "step": 5025 + }, + { + "epoch": 0.9609942638623327, + "grad_norm": 1.5316091775894165, + "learning_rate": 5e-06, + "loss": 0.2274, + "step": 5026 + }, + { + "epoch": 0.9611854684512429, + "grad_norm": 1.5301028490066528, + "learning_rate": 5e-06, + "loss": 0.1516, + "step": 5027 + }, + { + "epoch": 0.961376673040153, + "grad_norm": 1.963606357574463, + "learning_rate": 5e-06, + "loss": 0.2119, + "step": 5028 + }, + { + "epoch": 0.9615678776290632, + "grad_norm": 1.5402876138687134, + "learning_rate": 5e-06, + "loss": 0.1275, + "step": 5029 + }, + { + "epoch": 0.9617590822179732, + "grad_norm": 1.1363723278045654, + "learning_rate": 5e-06, + "loss": 0.0777, + "step": 5030 + }, + { + "epoch": 0.9619502868068833, + "grad_norm": 1.909873604774475, + "learning_rate": 5e-06, + "loss": 0.1227, + "step": 5031 + }, + { + "epoch": 0.9621414913957935, + "grad_norm": 1.3693710565567017, + "learning_rate": 5e-06, + "loss": 0.0944, + "step": 5032 + }, + { + "epoch": 0.9623326959847036, + "grad_norm": 2.0859200954437256, + "learning_rate": 5e-06, + "loss": 0.2549, + "step": 5033 + }, + { + "epoch": 0.9625239005736138, + "grad_norm": 1.5054364204406738, + "learning_rate": 5e-06, + "loss": 0.1717, + "step": 5034 + }, + { + "epoch": 0.9627151051625239, + "grad_norm": 1.8584833145141602, + "learning_rate": 5e-06, + "loss": 0.1093, + "step": 5035 + }, + { + "epoch": 0.962906309751434, + "grad_norm": 1.1875016689300537, + "learning_rate": 5e-06, + "loss": 0.0729, + "step": 5036 + }, + { + "epoch": 0.9630975143403442, + "grad_norm": 1.1567950248718262, + "learning_rate": 5e-06, + "loss": 0.0653, + "step": 5037 + }, + { + "epoch": 0.9632887189292543, + "grad_norm": 1.8108049631118774, + "learning_rate": 5e-06, + "loss": 0.1345, + "step": 5038 + }, + { + "epoch": 0.9634799235181645, + "grad_norm": 3.5861775875091553, + "learning_rate": 5e-06, + "loss": 0.5882, + "step": 5039 + }, + { + "epoch": 0.9636711281070746, + "grad_norm": 2.6124978065490723, + "learning_rate": 5e-06, + "loss": 0.1039, + "step": 5040 + }, + { + "epoch": 0.9638623326959848, + "grad_norm": 1.3670365810394287, + "learning_rate": 5e-06, + "loss": 0.1041, + "step": 5041 + }, + { + "epoch": 0.9640535372848948, + "grad_norm": 1.332395315170288, + "learning_rate": 5e-06, + "loss": 0.1791, + "step": 5042 + }, + { + "epoch": 0.9642447418738049, + "grad_norm": 1.1727179288864136, + "learning_rate": 5e-06, + "loss": 0.1109, + "step": 5043 + }, + { + "epoch": 0.9644359464627151, + "grad_norm": 1.3848689794540405, + "learning_rate": 5e-06, + "loss": 0.1115, + "step": 5044 + }, + { + "epoch": 0.9646271510516252, + "grad_norm": 1.4246430397033691, + "learning_rate": 5e-06, + "loss": 0.099, + "step": 5045 + }, + { + "epoch": 0.9648183556405354, + "grad_norm": 2.0380055904388428, + "learning_rate": 5e-06, + "loss": 0.2963, + "step": 5046 + }, + { + "epoch": 0.9650095602294455, + "grad_norm": 2.6432945728302, + "learning_rate": 5e-06, + "loss": 0.236, + "step": 5047 + }, + { + "epoch": 0.9652007648183556, + "grad_norm": 1.9849876165390015, + "learning_rate": 5e-06, + "loss": 0.2692, + "step": 5048 + }, + { + "epoch": 0.9653919694072658, + "grad_norm": 1.9646875858306885, + "learning_rate": 5e-06, + "loss": 0.3539, + "step": 5049 + }, + { + "epoch": 0.9655831739961759, + "grad_norm": 2.007697820663452, + "learning_rate": 5e-06, + "loss": 0.1226, + "step": 5050 + }, + { + "epoch": 0.9657743785850861, + "grad_norm": 1.224379062652588, + "learning_rate": 5e-06, + "loss": 0.1355, + "step": 5051 + }, + { + "epoch": 0.9659655831739962, + "grad_norm": 0.585542619228363, + "learning_rate": 5e-06, + "loss": 0.0688, + "step": 5052 + }, + { + "epoch": 0.9661567877629063, + "grad_norm": 1.7798622846603394, + "learning_rate": 5e-06, + "loss": 0.2512, + "step": 5053 + }, + { + "epoch": 0.9663479923518165, + "grad_norm": 1.9933887720108032, + "learning_rate": 5e-06, + "loss": 0.1716, + "step": 5054 + }, + { + "epoch": 0.9665391969407265, + "grad_norm": 1.6793018579483032, + "learning_rate": 5e-06, + "loss": 0.1758, + "step": 5055 + }, + { + "epoch": 0.9667304015296367, + "grad_norm": 1.5060217380523682, + "learning_rate": 5e-06, + "loss": 0.1141, + "step": 5056 + }, + { + "epoch": 0.9669216061185468, + "grad_norm": 2.19502329826355, + "learning_rate": 5e-06, + "loss": 0.2314, + "step": 5057 + }, + { + "epoch": 0.967112810707457, + "grad_norm": 1.7284197807312012, + "learning_rate": 5e-06, + "loss": 0.1213, + "step": 5058 + }, + { + "epoch": 0.9673040152963671, + "grad_norm": 1.6519883871078491, + "learning_rate": 5e-06, + "loss": 0.1296, + "step": 5059 + }, + { + "epoch": 0.9674952198852772, + "grad_norm": 2.585042715072632, + "learning_rate": 5e-06, + "loss": 0.2157, + "step": 5060 + }, + { + "epoch": 0.9676864244741874, + "grad_norm": 2.122485637664795, + "learning_rate": 5e-06, + "loss": 0.3705, + "step": 5061 + }, + { + "epoch": 0.9678776290630975, + "grad_norm": 1.6645944118499756, + "learning_rate": 5e-06, + "loss": 0.1524, + "step": 5062 + }, + { + "epoch": 0.9680688336520077, + "grad_norm": 2.446328639984131, + "learning_rate": 5e-06, + "loss": 0.1761, + "step": 5063 + }, + { + "epoch": 0.9682600382409178, + "grad_norm": 3.5966954231262207, + "learning_rate": 5e-06, + "loss": 0.5335, + "step": 5064 + }, + { + "epoch": 0.9684512428298279, + "grad_norm": 1.5934321880340576, + "learning_rate": 5e-06, + "loss": 0.1176, + "step": 5065 + }, + { + "epoch": 0.9686424474187381, + "grad_norm": 2.7553305625915527, + "learning_rate": 5e-06, + "loss": 0.2518, + "step": 5066 + }, + { + "epoch": 0.9688336520076481, + "grad_norm": 1.3165737390518188, + "learning_rate": 5e-06, + "loss": 0.0789, + "step": 5067 + }, + { + "epoch": 0.9690248565965583, + "grad_norm": 1.7967562675476074, + "learning_rate": 5e-06, + "loss": 0.1001, + "step": 5068 + }, + { + "epoch": 0.9692160611854684, + "grad_norm": 1.7139383554458618, + "learning_rate": 5e-06, + "loss": 0.1541, + "step": 5069 + }, + { + "epoch": 0.9694072657743786, + "grad_norm": 2.3538708686828613, + "learning_rate": 5e-06, + "loss": 0.3276, + "step": 5070 + }, + { + "epoch": 0.9695984703632887, + "grad_norm": 2.8249590396881104, + "learning_rate": 5e-06, + "loss": 0.446, + "step": 5071 + }, + { + "epoch": 0.9697896749521988, + "grad_norm": 1.2258639335632324, + "learning_rate": 5e-06, + "loss": 0.1059, + "step": 5072 + }, + { + "epoch": 0.969980879541109, + "grad_norm": 1.419464111328125, + "learning_rate": 5e-06, + "loss": 0.0757, + "step": 5073 + }, + { + "epoch": 0.9701720841300191, + "grad_norm": 1.7254623174667358, + "learning_rate": 5e-06, + "loss": 0.0864, + "step": 5074 + }, + { + "epoch": 0.9703632887189293, + "grad_norm": 2.9186739921569824, + "learning_rate": 5e-06, + "loss": 0.2495, + "step": 5075 + }, + { + "epoch": 0.9705544933078394, + "grad_norm": 1.3285423517227173, + "learning_rate": 5e-06, + "loss": 0.1859, + "step": 5076 + }, + { + "epoch": 0.9707456978967495, + "grad_norm": 1.7535793781280518, + "learning_rate": 5e-06, + "loss": 0.2977, + "step": 5077 + }, + { + "epoch": 0.9709369024856597, + "grad_norm": 1.533988356590271, + "learning_rate": 5e-06, + "loss": 0.1335, + "step": 5078 + }, + { + "epoch": 0.9711281070745698, + "grad_norm": 1.5041394233703613, + "learning_rate": 5e-06, + "loss": 0.142, + "step": 5079 + }, + { + "epoch": 0.97131931166348, + "grad_norm": 1.4372128248214722, + "learning_rate": 5e-06, + "loss": 0.0904, + "step": 5080 + }, + { + "epoch": 0.97151051625239, + "grad_norm": 2.476388454437256, + "learning_rate": 5e-06, + "loss": 0.1434, + "step": 5081 + }, + { + "epoch": 0.9717017208413002, + "grad_norm": 2.5645976066589355, + "learning_rate": 5e-06, + "loss": 0.2597, + "step": 5082 + }, + { + "epoch": 0.9718929254302103, + "grad_norm": 1.5782102346420288, + "learning_rate": 5e-06, + "loss": 0.1355, + "step": 5083 + }, + { + "epoch": 0.9720841300191204, + "grad_norm": 1.2604018449783325, + "learning_rate": 5e-06, + "loss": 0.2281, + "step": 5084 + }, + { + "epoch": 0.9722753346080306, + "grad_norm": 1.2508944272994995, + "learning_rate": 5e-06, + "loss": 0.0719, + "step": 5085 + }, + { + "epoch": 0.9724665391969407, + "grad_norm": 2.0485973358154297, + "learning_rate": 5e-06, + "loss": 0.0766, + "step": 5086 + }, + { + "epoch": 0.9726577437858509, + "grad_norm": 1.1528031826019287, + "learning_rate": 5e-06, + "loss": 0.0832, + "step": 5087 + }, + { + "epoch": 0.972848948374761, + "grad_norm": 1.789041519165039, + "learning_rate": 5e-06, + "loss": 0.1181, + "step": 5088 + }, + { + "epoch": 0.9730401529636711, + "grad_norm": 1.92593514919281, + "learning_rate": 5e-06, + "loss": 0.2047, + "step": 5089 + }, + { + "epoch": 0.9732313575525813, + "grad_norm": 1.1563007831573486, + "learning_rate": 5e-06, + "loss": 0.1133, + "step": 5090 + }, + { + "epoch": 0.9734225621414914, + "grad_norm": 1.1158252954483032, + "learning_rate": 5e-06, + "loss": 0.0862, + "step": 5091 + }, + { + "epoch": 0.9736137667304016, + "grad_norm": 1.5522339344024658, + "learning_rate": 5e-06, + "loss": 0.103, + "step": 5092 + }, + { + "epoch": 0.9738049713193117, + "grad_norm": 2.535926342010498, + "learning_rate": 5e-06, + "loss": 0.2977, + "step": 5093 + }, + { + "epoch": 0.9739961759082219, + "grad_norm": 1.2164846658706665, + "learning_rate": 5e-06, + "loss": 0.0662, + "step": 5094 + }, + { + "epoch": 0.9741873804971319, + "grad_norm": 2.7936418056488037, + "learning_rate": 5e-06, + "loss": 0.4989, + "step": 5095 + }, + { + "epoch": 0.974378585086042, + "grad_norm": 2.6002554893493652, + "learning_rate": 5e-06, + "loss": 0.3435, + "step": 5096 + }, + { + "epoch": 0.9745697896749522, + "grad_norm": 2.3612749576568604, + "learning_rate": 5e-06, + "loss": 0.2797, + "step": 5097 + }, + { + "epoch": 0.9747609942638623, + "grad_norm": 2.0107052326202393, + "learning_rate": 5e-06, + "loss": 0.1331, + "step": 5098 + }, + { + "epoch": 0.9749521988527725, + "grad_norm": 1.1361372470855713, + "learning_rate": 5e-06, + "loss": 0.0784, + "step": 5099 + }, + { + "epoch": 0.9751434034416826, + "grad_norm": 1.9277843236923218, + "learning_rate": 5e-06, + "loss": 0.1257, + "step": 5100 + }, + { + "epoch": 0.9753346080305927, + "grad_norm": 1.3512511253356934, + "learning_rate": 5e-06, + "loss": 0.119, + "step": 5101 + }, + { + "epoch": 0.9755258126195029, + "grad_norm": 3.0426383018493652, + "learning_rate": 5e-06, + "loss": 0.4038, + "step": 5102 + }, + { + "epoch": 0.975717017208413, + "grad_norm": 2.047166347503662, + "learning_rate": 5e-06, + "loss": 0.1507, + "step": 5103 + }, + { + "epoch": 0.9759082217973232, + "grad_norm": 1.3602712154388428, + "learning_rate": 5e-06, + "loss": 0.0953, + "step": 5104 + }, + { + "epoch": 0.9760994263862333, + "grad_norm": 2.696289539337158, + "learning_rate": 5e-06, + "loss": 0.1088, + "step": 5105 + }, + { + "epoch": 0.9762906309751435, + "grad_norm": 1.646152138710022, + "learning_rate": 5e-06, + "loss": 0.1026, + "step": 5106 + }, + { + "epoch": 0.9764818355640535, + "grad_norm": 1.7276148796081543, + "learning_rate": 5e-06, + "loss": 0.1875, + "step": 5107 + }, + { + "epoch": 0.9766730401529636, + "grad_norm": 2.928664445877075, + "learning_rate": 5e-06, + "loss": 0.1254, + "step": 5108 + }, + { + "epoch": 0.9768642447418738, + "grad_norm": 2.494978904724121, + "learning_rate": 5e-06, + "loss": 0.3259, + "step": 5109 + }, + { + "epoch": 0.9770554493307839, + "grad_norm": 1.8363969326019287, + "learning_rate": 5e-06, + "loss": 0.106, + "step": 5110 + }, + { + "epoch": 0.9772466539196941, + "grad_norm": 3.1696293354034424, + "learning_rate": 5e-06, + "loss": 0.2118, + "step": 5111 + }, + { + "epoch": 0.9774378585086042, + "grad_norm": 1.7869844436645508, + "learning_rate": 5e-06, + "loss": 0.1343, + "step": 5112 + }, + { + "epoch": 0.9776290630975143, + "grad_norm": 1.5095562934875488, + "learning_rate": 5e-06, + "loss": 0.1248, + "step": 5113 + }, + { + "epoch": 0.9778202676864245, + "grad_norm": 1.0771520137786865, + "learning_rate": 5e-06, + "loss": 0.1536, + "step": 5114 + }, + { + "epoch": 0.9780114722753346, + "grad_norm": 1.9261468648910522, + "learning_rate": 5e-06, + "loss": 0.2512, + "step": 5115 + }, + { + "epoch": 0.9782026768642448, + "grad_norm": 2.3080861568450928, + "learning_rate": 5e-06, + "loss": 0.1425, + "step": 5116 + }, + { + "epoch": 0.9783938814531549, + "grad_norm": 2.382253885269165, + "learning_rate": 5e-06, + "loss": 0.1637, + "step": 5117 + }, + { + "epoch": 0.978585086042065, + "grad_norm": 2.896066665649414, + "learning_rate": 5e-06, + "loss": 0.3333, + "step": 5118 + }, + { + "epoch": 0.9787762906309752, + "grad_norm": 1.2175503969192505, + "learning_rate": 5e-06, + "loss": 0.0933, + "step": 5119 + }, + { + "epoch": 0.9789674952198852, + "grad_norm": 1.7644296884536743, + "learning_rate": 5e-06, + "loss": 0.1718, + "step": 5120 + }, + { + "epoch": 0.9791586998087954, + "grad_norm": 1.8412610292434692, + "learning_rate": 5e-06, + "loss": 0.1739, + "step": 5121 + }, + { + "epoch": 0.9793499043977055, + "grad_norm": 1.3663194179534912, + "learning_rate": 5e-06, + "loss": 0.1117, + "step": 5122 + }, + { + "epoch": 0.9795411089866157, + "grad_norm": 1.2034631967544556, + "learning_rate": 5e-06, + "loss": 0.1127, + "step": 5123 + }, + { + "epoch": 0.9797323135755258, + "grad_norm": 2.052450180053711, + "learning_rate": 5e-06, + "loss": 0.1142, + "step": 5124 + }, + { + "epoch": 0.9799235181644359, + "grad_norm": 2.3145527839660645, + "learning_rate": 5e-06, + "loss": 0.1296, + "step": 5125 + }, + { + "epoch": 0.9801147227533461, + "grad_norm": 1.6712604761123657, + "learning_rate": 5e-06, + "loss": 0.2101, + "step": 5126 + }, + { + "epoch": 0.9803059273422562, + "grad_norm": 2.388617992401123, + "learning_rate": 5e-06, + "loss": 0.2405, + "step": 5127 + }, + { + "epoch": 0.9804971319311664, + "grad_norm": 3.332637071609497, + "learning_rate": 5e-06, + "loss": 0.286, + "step": 5128 + }, + { + "epoch": 0.9806883365200765, + "grad_norm": 2.245945930480957, + "learning_rate": 5e-06, + "loss": 0.1992, + "step": 5129 + }, + { + "epoch": 0.9808795411089866, + "grad_norm": 2.0014476776123047, + "learning_rate": 5e-06, + "loss": 0.2369, + "step": 5130 + }, + { + "epoch": 0.9810707456978968, + "grad_norm": 2.185051441192627, + "learning_rate": 5e-06, + "loss": 0.124, + "step": 5131 + }, + { + "epoch": 0.9812619502868068, + "grad_norm": 3.205618381500244, + "learning_rate": 5e-06, + "loss": 0.1862, + "step": 5132 + }, + { + "epoch": 0.981453154875717, + "grad_norm": 2.2920567989349365, + "learning_rate": 5e-06, + "loss": 0.3075, + "step": 5133 + }, + { + "epoch": 0.9816443594646271, + "grad_norm": 1.5987259149551392, + "learning_rate": 5e-06, + "loss": 0.0987, + "step": 5134 + }, + { + "epoch": 0.9818355640535373, + "grad_norm": 1.4586559534072876, + "learning_rate": 5e-06, + "loss": 0.1095, + "step": 5135 + }, + { + "epoch": 0.9820267686424474, + "grad_norm": 1.3265559673309326, + "learning_rate": 5e-06, + "loss": 0.0885, + "step": 5136 + }, + { + "epoch": 0.9822179732313575, + "grad_norm": 1.8111308813095093, + "learning_rate": 5e-06, + "loss": 0.098, + "step": 5137 + }, + { + "epoch": 0.9824091778202677, + "grad_norm": 1.9559201002120972, + "learning_rate": 5e-06, + "loss": 0.1827, + "step": 5138 + }, + { + "epoch": 0.9826003824091778, + "grad_norm": 2.519045829772949, + "learning_rate": 5e-06, + "loss": 0.4279, + "step": 5139 + }, + { + "epoch": 0.982791586998088, + "grad_norm": 1.0115805864334106, + "learning_rate": 5e-06, + "loss": 0.0554, + "step": 5140 + }, + { + "epoch": 0.9829827915869981, + "grad_norm": 1.5444289445877075, + "learning_rate": 5e-06, + "loss": 0.1956, + "step": 5141 + }, + { + "epoch": 0.9831739961759082, + "grad_norm": 1.1781493425369263, + "learning_rate": 5e-06, + "loss": 0.1124, + "step": 5142 + }, + { + "epoch": 0.9833652007648184, + "grad_norm": 1.6144378185272217, + "learning_rate": 5e-06, + "loss": 0.1011, + "step": 5143 + }, + { + "epoch": 0.9835564053537285, + "grad_norm": 1.777444839477539, + "learning_rate": 5e-06, + "loss": 0.3058, + "step": 5144 + }, + { + "epoch": 0.9837476099426387, + "grad_norm": 2.543937921524048, + "learning_rate": 5e-06, + "loss": 0.3751, + "step": 5145 + }, + { + "epoch": 0.9839388145315487, + "grad_norm": 2.032430648803711, + "learning_rate": 5e-06, + "loss": 0.2294, + "step": 5146 + }, + { + "epoch": 0.9841300191204589, + "grad_norm": 1.2935893535614014, + "learning_rate": 5e-06, + "loss": 0.1531, + "step": 5147 + }, + { + "epoch": 0.984321223709369, + "grad_norm": 1.9025208950042725, + "learning_rate": 5e-06, + "loss": 0.1284, + "step": 5148 + }, + { + "epoch": 0.9845124282982791, + "grad_norm": 3.2536699771881104, + "learning_rate": 5e-06, + "loss": 0.3226, + "step": 5149 + }, + { + "epoch": 0.9847036328871893, + "grad_norm": 2.4361836910247803, + "learning_rate": 5e-06, + "loss": 0.1494, + "step": 5150 + }, + { + "epoch": 0.9848948374760994, + "grad_norm": 1.2773830890655518, + "learning_rate": 5e-06, + "loss": 0.102, + "step": 5151 + }, + { + "epoch": 0.9850860420650096, + "grad_norm": 2.558335542678833, + "learning_rate": 5e-06, + "loss": 0.2727, + "step": 5152 + }, + { + "epoch": 0.9852772466539197, + "grad_norm": 2.8196585178375244, + "learning_rate": 5e-06, + "loss": 0.2036, + "step": 5153 + }, + { + "epoch": 0.9854684512428298, + "grad_norm": 1.9609150886535645, + "learning_rate": 5e-06, + "loss": 0.1723, + "step": 5154 + }, + { + "epoch": 0.98565965583174, + "grad_norm": 3.2695369720458984, + "learning_rate": 5e-06, + "loss": 0.0569, + "step": 5155 + }, + { + "epoch": 0.9858508604206501, + "grad_norm": 1.6373162269592285, + "learning_rate": 5e-06, + "loss": 0.0689, + "step": 5156 + }, + { + "epoch": 0.9860420650095603, + "grad_norm": 1.8946233987808228, + "learning_rate": 5e-06, + "loss": 0.2049, + "step": 5157 + }, + { + "epoch": 0.9862332695984704, + "grad_norm": 2.2586464881896973, + "learning_rate": 5e-06, + "loss": 0.1795, + "step": 5158 + }, + { + "epoch": 0.9864244741873806, + "grad_norm": 2.046656370162964, + "learning_rate": 5e-06, + "loss": 0.2424, + "step": 5159 + }, + { + "epoch": 0.9866156787762906, + "grad_norm": 0.6712673902511597, + "learning_rate": 5e-06, + "loss": 0.0386, + "step": 5160 + }, + { + "epoch": 0.9868068833652007, + "grad_norm": 1.4954816102981567, + "learning_rate": 5e-06, + "loss": 0.1202, + "step": 5161 + }, + { + "epoch": 0.9869980879541109, + "grad_norm": 2.4143035411834717, + "learning_rate": 5e-06, + "loss": 0.0619, + "step": 5162 + }, + { + "epoch": 0.987189292543021, + "grad_norm": 1.1633849143981934, + "learning_rate": 5e-06, + "loss": 0.0919, + "step": 5163 + }, + { + "epoch": 0.9873804971319312, + "grad_norm": 3.078310966491699, + "learning_rate": 5e-06, + "loss": 0.5286, + "step": 5164 + }, + { + "epoch": 0.9875717017208413, + "grad_norm": 3.7422358989715576, + "learning_rate": 5e-06, + "loss": 0.7243, + "step": 5165 + }, + { + "epoch": 0.9877629063097514, + "grad_norm": 2.2467732429504395, + "learning_rate": 5e-06, + "loss": 0.2758, + "step": 5166 + }, + { + "epoch": 0.9879541108986616, + "grad_norm": 1.4942512512207031, + "learning_rate": 5e-06, + "loss": 0.1251, + "step": 5167 + }, + { + "epoch": 0.9881453154875717, + "grad_norm": 3.4213297367095947, + "learning_rate": 5e-06, + "loss": 0.1562, + "step": 5168 + }, + { + "epoch": 0.9883365200764819, + "grad_norm": 0.8668385744094849, + "learning_rate": 5e-06, + "loss": 0.0532, + "step": 5169 + }, + { + "epoch": 0.988527724665392, + "grad_norm": 4.07417106628418, + "learning_rate": 5e-06, + "loss": 0.2916, + "step": 5170 + }, + { + "epoch": 0.988718929254302, + "grad_norm": 1.8834352493286133, + "learning_rate": 5e-06, + "loss": 0.2522, + "step": 5171 + }, + { + "epoch": 0.9889101338432122, + "grad_norm": 1.5943543910980225, + "learning_rate": 5e-06, + "loss": 0.1172, + "step": 5172 + }, + { + "epoch": 0.9891013384321223, + "grad_norm": 2.434356927871704, + "learning_rate": 5e-06, + "loss": 0.1855, + "step": 5173 + }, + { + "epoch": 0.9892925430210325, + "grad_norm": 2.505683422088623, + "learning_rate": 5e-06, + "loss": 0.1302, + "step": 5174 + }, + { + "epoch": 0.9894837476099426, + "grad_norm": 2.152951240539551, + "learning_rate": 5e-06, + "loss": 0.1351, + "step": 5175 + }, + { + "epoch": 0.9896749521988528, + "grad_norm": 1.6887750625610352, + "learning_rate": 5e-06, + "loss": 0.1331, + "step": 5176 + }, + { + "epoch": 0.9898661567877629, + "grad_norm": 1.2444119453430176, + "learning_rate": 5e-06, + "loss": 0.1079, + "step": 5177 + }, + { + "epoch": 0.990057361376673, + "grad_norm": 2.2396018505096436, + "learning_rate": 5e-06, + "loss": 0.3107, + "step": 5178 + }, + { + "epoch": 0.9902485659655832, + "grad_norm": 2.357428550720215, + "learning_rate": 5e-06, + "loss": 0.3262, + "step": 5179 + }, + { + "epoch": 0.9904397705544933, + "grad_norm": 1.2109572887420654, + "learning_rate": 5e-06, + "loss": 0.0421, + "step": 5180 + }, + { + "epoch": 0.9906309751434035, + "grad_norm": 3.045706272125244, + "learning_rate": 5e-06, + "loss": 0.0984, + "step": 5181 + }, + { + "epoch": 0.9908221797323136, + "grad_norm": 3.762176990509033, + "learning_rate": 5e-06, + "loss": 0.2968, + "step": 5182 + }, + { + "epoch": 0.9910133843212237, + "grad_norm": 1.7476848363876343, + "learning_rate": 5e-06, + "loss": 0.1861, + "step": 5183 + }, + { + "epoch": 0.9912045889101339, + "grad_norm": 1.974594235420227, + "learning_rate": 5e-06, + "loss": 0.1671, + "step": 5184 + }, + { + "epoch": 0.9913957934990439, + "grad_norm": 1.9838353395462036, + "learning_rate": 5e-06, + "loss": 0.1467, + "step": 5185 + }, + { + "epoch": 0.9915869980879541, + "grad_norm": 1.257691502571106, + "learning_rate": 5e-06, + "loss": 0.0803, + "step": 5186 + }, + { + "epoch": 0.9917782026768642, + "grad_norm": 1.7940335273742676, + "learning_rate": 5e-06, + "loss": 0.0902, + "step": 5187 + }, + { + "epoch": 0.9919694072657744, + "grad_norm": 2.434577226638794, + "learning_rate": 5e-06, + "loss": 0.1678, + "step": 5188 + }, + { + "epoch": 0.9921606118546845, + "grad_norm": 2.495274543762207, + "learning_rate": 5e-06, + "loss": 0.3524, + "step": 5189 + }, + { + "epoch": 0.9923518164435946, + "grad_norm": 2.122135639190674, + "learning_rate": 5e-06, + "loss": 0.334, + "step": 5190 + }, + { + "epoch": 0.9925430210325048, + "grad_norm": 3.421670436859131, + "learning_rate": 5e-06, + "loss": 0.3898, + "step": 5191 + }, + { + "epoch": 0.9927342256214149, + "grad_norm": 0.9619971513748169, + "learning_rate": 5e-06, + "loss": 0.0719, + "step": 5192 + }, + { + "epoch": 0.9929254302103251, + "grad_norm": 0.8660413026809692, + "learning_rate": 5e-06, + "loss": 0.0436, + "step": 5193 + }, + { + "epoch": 0.9931166347992352, + "grad_norm": 1.1905746459960938, + "learning_rate": 5e-06, + "loss": 0.0579, + "step": 5194 + }, + { + "epoch": 0.9933078393881453, + "grad_norm": 3.714054822921753, + "learning_rate": 5e-06, + "loss": 0.4012, + "step": 5195 + }, + { + "epoch": 0.9934990439770555, + "grad_norm": 1.5647445917129517, + "learning_rate": 5e-06, + "loss": 0.1707, + "step": 5196 + }, + { + "epoch": 0.9936902485659656, + "grad_norm": 1.8715081214904785, + "learning_rate": 5e-06, + "loss": 0.236, + "step": 5197 + }, + { + "epoch": 0.9938814531548757, + "grad_norm": 1.3924305438995361, + "learning_rate": 5e-06, + "loss": 0.1171, + "step": 5198 + }, + { + "epoch": 0.9940726577437858, + "grad_norm": 1.4491206407546997, + "learning_rate": 5e-06, + "loss": 0.1173, + "step": 5199 + }, + { + "epoch": 0.994263862332696, + "grad_norm": 3.2908480167388916, + "learning_rate": 5e-06, + "loss": 0.241, + "step": 5200 + }, + { + "epoch": 0.9944550669216061, + "grad_norm": 1.7284696102142334, + "learning_rate": 5e-06, + "loss": 0.1932, + "step": 5201 + }, + { + "epoch": 0.9946462715105162, + "grad_norm": 3.488659381866455, + "learning_rate": 5e-06, + "loss": 0.3217, + "step": 5202 + }, + { + "epoch": 0.9948374760994264, + "grad_norm": 1.9990845918655396, + "learning_rate": 5e-06, + "loss": 0.4941, + "step": 5203 + }, + { + "epoch": 0.9950286806883365, + "grad_norm": 1.8910921812057495, + "learning_rate": 5e-06, + "loss": 0.1171, + "step": 5204 + }, + { + "epoch": 0.9952198852772467, + "grad_norm": 1.4411567449569702, + "learning_rate": 5e-06, + "loss": 0.1537, + "step": 5205 + }, + { + "epoch": 0.9954110898661568, + "grad_norm": 1.101347804069519, + "learning_rate": 5e-06, + "loss": 0.0547, + "step": 5206 + }, + { + "epoch": 0.9956022944550669, + "grad_norm": 1.8866088390350342, + "learning_rate": 5e-06, + "loss": 0.1649, + "step": 5207 + }, + { + "epoch": 0.9957934990439771, + "grad_norm": 2.1902003288269043, + "learning_rate": 5e-06, + "loss": 0.281, + "step": 5208 + }, + { + "epoch": 0.9959847036328872, + "grad_norm": 2.494462490081787, + "learning_rate": 5e-06, + "loss": 0.1644, + "step": 5209 + }, + { + "epoch": 0.9961759082217974, + "grad_norm": 1.2047346830368042, + "learning_rate": 5e-06, + "loss": 0.1172, + "step": 5210 + }, + { + "epoch": 0.9963671128107074, + "grad_norm": 0.9425076246261597, + "learning_rate": 5e-06, + "loss": 0.059, + "step": 5211 + }, + { + "epoch": 0.9965583173996176, + "grad_norm": 1.435917615890503, + "learning_rate": 5e-06, + "loss": 0.0519, + "step": 5212 + }, + { + "epoch": 0.9967495219885277, + "grad_norm": 1.1277811527252197, + "learning_rate": 5e-06, + "loss": 0.0978, + "step": 5213 + }, + { + "epoch": 0.9969407265774378, + "grad_norm": 2.782651901245117, + "learning_rate": 5e-06, + "loss": 0.3652, + "step": 5214 + }, + { + "epoch": 0.997131931166348, + "grad_norm": 1.5946991443634033, + "learning_rate": 5e-06, + "loss": 0.1018, + "step": 5215 + }, + { + "epoch": 0.9973231357552581, + "grad_norm": 0.9089525938034058, + "learning_rate": 5e-06, + "loss": 0.0902, + "step": 5216 + }, + { + "epoch": 0.9975143403441683, + "grad_norm": 2.2287213802337646, + "learning_rate": 5e-06, + "loss": 0.1174, + "step": 5217 + }, + { + "epoch": 0.9977055449330784, + "grad_norm": 2.977839231491089, + "learning_rate": 5e-06, + "loss": 0.2146, + "step": 5218 + }, + { + "epoch": 0.9978967495219885, + "grad_norm": 1.4728442430496216, + "learning_rate": 5e-06, + "loss": 0.2433, + "step": 5219 + }, + { + "epoch": 0.9980879541108987, + "grad_norm": 2.0566630363464355, + "learning_rate": 5e-06, + "loss": 0.2769, + "step": 5220 + }, + { + "epoch": 0.9982791586998088, + "grad_norm": 1.2457760572433472, + "learning_rate": 5e-06, + "loss": 0.0825, + "step": 5221 + }, + { + "epoch": 0.998470363288719, + "grad_norm": 1.6305168867111206, + "learning_rate": 5e-06, + "loss": 0.1614, + "step": 5222 + }, + { + "epoch": 0.998661567877629, + "grad_norm": 1.9912116527557373, + "learning_rate": 5e-06, + "loss": 0.0982, + "step": 5223 + }, + { + "epoch": 0.9988527724665393, + "grad_norm": 4.200954437255859, + "learning_rate": 5e-06, + "loss": 0.2215, + "step": 5224 + }, + { + "epoch": 0.9990439770554493, + "grad_norm": 1.4355394840240479, + "learning_rate": 5e-06, + "loss": 0.0419, + "step": 5225 + }, + { + "epoch": 0.9992351816443594, + "grad_norm": 1.8551381826400757, + "learning_rate": 5e-06, + "loss": 0.2277, + "step": 5226 + }, + { + "epoch": 0.9994263862332696, + "grad_norm": 2.7024409770965576, + "learning_rate": 5e-06, + "loss": 0.5278, + "step": 5227 + }, + { + "epoch": 0.9996175908221797, + "grad_norm": 1.0480506420135498, + "learning_rate": 5e-06, + "loss": 0.0521, + "step": 5228 + }, + { + "epoch": 0.9998087954110899, + "grad_norm": 2.2718355655670166, + "learning_rate": 5e-06, + "loss": 0.1477, + "step": 5229 + }, + { + "epoch": 1.0, + "grad_norm": 3.5123586654663086, + "learning_rate": 5e-06, + "loss": 0.3213, + "step": 5230 + }, + { + "epoch": 1.00019120458891, + "grad_norm": 1.7297927141189575, + "learning_rate": 5e-06, + "loss": 0.2801, + "step": 5231 + }, + { + "epoch": 1.0003824091778202, + "grad_norm": 1.3347065448760986, + "learning_rate": 5e-06, + "loss": 0.0934, + "step": 5232 + }, + { + "epoch": 1.0005736137667305, + "grad_norm": 1.1105912923812866, + "learning_rate": 5e-06, + "loss": 0.094, + "step": 5233 + }, + { + "epoch": 1.0007648183556406, + "grad_norm": 0.9613377451896667, + "learning_rate": 5e-06, + "loss": 0.0577, + "step": 5234 + }, + { + "epoch": 1.0009560229445507, + "grad_norm": 1.0919339656829834, + "learning_rate": 5e-06, + "loss": 0.099, + "step": 5235 + }, + { + "epoch": 1.0011472275334607, + "grad_norm": 0.9715074300765991, + "learning_rate": 5e-06, + "loss": 0.063, + "step": 5236 + }, + { + "epoch": 1.0013384321223708, + "grad_norm": 1.8640201091766357, + "learning_rate": 5e-06, + "loss": 0.2279, + "step": 5237 + }, + { + "epoch": 1.0015296367112811, + "grad_norm": 2.2503163814544678, + "learning_rate": 5e-06, + "loss": 0.2751, + "step": 5238 + }, + { + "epoch": 1.0017208413001912, + "grad_norm": 1.2504420280456543, + "learning_rate": 5e-06, + "loss": 0.0953, + "step": 5239 + }, + { + "epoch": 1.0019120458891013, + "grad_norm": 1.6367700099945068, + "learning_rate": 5e-06, + "loss": 0.103, + "step": 5240 + }, + { + "epoch": 1.0021032504780114, + "grad_norm": 1.514311671257019, + "learning_rate": 5e-06, + "loss": 0.144, + "step": 5241 + }, + { + "epoch": 1.0022944550669215, + "grad_norm": 1.054772973060608, + "learning_rate": 5e-06, + "loss": 0.0299, + "step": 5242 + }, + { + "epoch": 1.0024856596558318, + "grad_norm": 2.3778746128082275, + "learning_rate": 5e-06, + "loss": 0.3053, + "step": 5243 + }, + { + "epoch": 1.002676864244742, + "grad_norm": 2.213348150253296, + "learning_rate": 5e-06, + "loss": 0.2055, + "step": 5244 + }, + { + "epoch": 1.002868068833652, + "grad_norm": 1.6743370294570923, + "learning_rate": 5e-06, + "loss": 0.0935, + "step": 5245 + }, + { + "epoch": 1.003059273422562, + "grad_norm": 1.9218884706497192, + "learning_rate": 5e-06, + "loss": 0.1872, + "step": 5246 + }, + { + "epoch": 1.0032504780114724, + "grad_norm": 0.5441107153892517, + "learning_rate": 5e-06, + "loss": 0.0418, + "step": 5247 + }, + { + "epoch": 1.0034416826003825, + "grad_norm": 1.4847966432571411, + "learning_rate": 5e-06, + "loss": 0.0598, + "step": 5248 + }, + { + "epoch": 1.0036328871892926, + "grad_norm": 1.8190624713897705, + "learning_rate": 5e-06, + "loss": 0.0375, + "step": 5249 + }, + { + "epoch": 1.0038240917782026, + "grad_norm": 2.3524463176727295, + "learning_rate": 5e-06, + "loss": 0.1668, + "step": 5250 + }, + { + "epoch": 1.0040152963671127, + "grad_norm": 2.548030376434326, + "learning_rate": 5e-06, + "loss": 0.3332, + "step": 5251 + }, + { + "epoch": 1.004206500956023, + "grad_norm": 1.4580950736999512, + "learning_rate": 5e-06, + "loss": 0.1052, + "step": 5252 + }, + { + "epoch": 1.0043977055449331, + "grad_norm": 1.2642977237701416, + "learning_rate": 5e-06, + "loss": 0.0873, + "step": 5253 + }, + { + "epoch": 1.0045889101338432, + "grad_norm": 1.0301834344863892, + "learning_rate": 5e-06, + "loss": 0.0373, + "step": 5254 + }, + { + "epoch": 1.0047801147227533, + "grad_norm": 1.7443534135818481, + "learning_rate": 5e-06, + "loss": 0.1106, + "step": 5255 + }, + { + "epoch": 1.0049713193116634, + "grad_norm": 1.4613491296768188, + "learning_rate": 5e-06, + "loss": 0.1496, + "step": 5256 + }, + { + "epoch": 1.0051625239005737, + "grad_norm": 1.6461591720581055, + "learning_rate": 5e-06, + "loss": 0.1837, + "step": 5257 + }, + { + "epoch": 1.0053537284894838, + "grad_norm": 1.1870779991149902, + "learning_rate": 5e-06, + "loss": 0.0809, + "step": 5258 + }, + { + "epoch": 1.0055449330783939, + "grad_norm": 0.8499864935874939, + "learning_rate": 5e-06, + "loss": 0.0478, + "step": 5259 + }, + { + "epoch": 1.005736137667304, + "grad_norm": 1.4943770170211792, + "learning_rate": 5e-06, + "loss": 0.0532, + "step": 5260 + }, + { + "epoch": 1.005927342256214, + "grad_norm": 1.3564469814300537, + "learning_rate": 5e-06, + "loss": 0.0571, + "step": 5261 + }, + { + "epoch": 1.0061185468451244, + "grad_norm": 1.2749664783477783, + "learning_rate": 5e-06, + "loss": 0.066, + "step": 5262 + }, + { + "epoch": 1.0063097514340344, + "grad_norm": 2.179372549057007, + "learning_rate": 5e-06, + "loss": 0.1723, + "step": 5263 + }, + { + "epoch": 1.0065009560229445, + "grad_norm": 1.2617064714431763, + "learning_rate": 5e-06, + "loss": 0.0835, + "step": 5264 + }, + { + "epoch": 1.0066921606118546, + "grad_norm": 0.9173052310943604, + "learning_rate": 5e-06, + "loss": 0.0922, + "step": 5265 + }, + { + "epoch": 1.0068833652007647, + "grad_norm": 1.5798940658569336, + "learning_rate": 5e-06, + "loss": 0.0287, + "step": 5266 + }, + { + "epoch": 1.007074569789675, + "grad_norm": 1.8742361068725586, + "learning_rate": 5e-06, + "loss": 0.088, + "step": 5267 + }, + { + "epoch": 1.007265774378585, + "grad_norm": 2.5780484676361084, + "learning_rate": 5e-06, + "loss": 0.087, + "step": 5268 + }, + { + "epoch": 1.0074569789674952, + "grad_norm": 2.258584499359131, + "learning_rate": 5e-06, + "loss": 0.2834, + "step": 5269 + }, + { + "epoch": 1.0076481835564053, + "grad_norm": 0.6943368315696716, + "learning_rate": 5e-06, + "loss": 0.0394, + "step": 5270 + }, + { + "epoch": 1.0078393881453156, + "grad_norm": 1.9142544269561768, + "learning_rate": 5e-06, + "loss": 0.0845, + "step": 5271 + }, + { + "epoch": 1.0080305927342257, + "grad_norm": 0.8416318297386169, + "learning_rate": 5e-06, + "loss": 0.0664, + "step": 5272 + }, + { + "epoch": 1.0082217973231358, + "grad_norm": 1.94535493850708, + "learning_rate": 5e-06, + "loss": 0.1711, + "step": 5273 + }, + { + "epoch": 1.0084130019120459, + "grad_norm": 1.2606523036956787, + "learning_rate": 5e-06, + "loss": 0.0534, + "step": 5274 + }, + { + "epoch": 1.008604206500956, + "grad_norm": 2.3237664699554443, + "learning_rate": 5e-06, + "loss": 0.2517, + "step": 5275 + }, + { + "epoch": 1.0087954110898663, + "grad_norm": 2.197056293487549, + "learning_rate": 5e-06, + "loss": 0.3001, + "step": 5276 + }, + { + "epoch": 1.0089866156787763, + "grad_norm": 1.2944321632385254, + "learning_rate": 5e-06, + "loss": 0.103, + "step": 5277 + }, + { + "epoch": 1.0091778202676864, + "grad_norm": 1.5966204404830933, + "learning_rate": 5e-06, + "loss": 0.1872, + "step": 5278 + }, + { + "epoch": 1.0093690248565965, + "grad_norm": 1.0905331373214722, + "learning_rate": 5e-06, + "loss": 0.0653, + "step": 5279 + }, + { + "epoch": 1.0095602294455066, + "grad_norm": 1.1597042083740234, + "learning_rate": 5e-06, + "loss": 0.0651, + "step": 5280 + }, + { + "epoch": 1.009751434034417, + "grad_norm": 2.1614441871643066, + "learning_rate": 5e-06, + "loss": 0.1937, + "step": 5281 + }, + { + "epoch": 1.009942638623327, + "grad_norm": 2.6233420372009277, + "learning_rate": 5e-06, + "loss": 0.3533, + "step": 5282 + }, + { + "epoch": 1.010133843212237, + "grad_norm": 1.4536938667297363, + "learning_rate": 5e-06, + "loss": 0.1249, + "step": 5283 + }, + { + "epoch": 1.0103250478011472, + "grad_norm": 0.9678513407707214, + "learning_rate": 5e-06, + "loss": 0.0769, + "step": 5284 + }, + { + "epoch": 1.0105162523900573, + "grad_norm": 1.2624231576919556, + "learning_rate": 5e-06, + "loss": 0.0625, + "step": 5285 + }, + { + "epoch": 1.0107074569789676, + "grad_norm": 1.1822149753570557, + "learning_rate": 5e-06, + "loss": 0.0367, + "step": 5286 + }, + { + "epoch": 1.0108986615678777, + "grad_norm": 1.7562121152877808, + "learning_rate": 5e-06, + "loss": 0.1614, + "step": 5287 + }, + { + "epoch": 1.0110898661567878, + "grad_norm": 2.224581003189087, + "learning_rate": 5e-06, + "loss": 0.3048, + "step": 5288 + }, + { + "epoch": 1.0112810707456978, + "grad_norm": 1.7163106203079224, + "learning_rate": 5e-06, + "loss": 0.2074, + "step": 5289 + }, + { + "epoch": 1.011472275334608, + "grad_norm": 1.9225854873657227, + "learning_rate": 5e-06, + "loss": 0.2347, + "step": 5290 + }, + { + "epoch": 1.0116634799235182, + "grad_norm": 0.8340242505073547, + "learning_rate": 5e-06, + "loss": 0.0415, + "step": 5291 + }, + { + "epoch": 1.0118546845124283, + "grad_norm": 0.9325177073478699, + "learning_rate": 5e-06, + "loss": 0.0336, + "step": 5292 + }, + { + "epoch": 1.0120458891013384, + "grad_norm": 1.3594424724578857, + "learning_rate": 5e-06, + "loss": 0.0575, + "step": 5293 + }, + { + "epoch": 1.0122370936902485, + "grad_norm": 2.631110429763794, + "learning_rate": 5e-06, + "loss": 0.4276, + "step": 5294 + }, + { + "epoch": 1.0124282982791586, + "grad_norm": 1.8750988245010376, + "learning_rate": 5e-06, + "loss": 0.3076, + "step": 5295 + }, + { + "epoch": 1.012619502868069, + "grad_norm": 1.2204387187957764, + "learning_rate": 5e-06, + "loss": 0.074, + "step": 5296 + }, + { + "epoch": 1.012810707456979, + "grad_norm": 1.7058926820755005, + "learning_rate": 5e-06, + "loss": 0.1998, + "step": 5297 + }, + { + "epoch": 1.013001912045889, + "grad_norm": 1.6197351217269897, + "learning_rate": 5e-06, + "loss": 0.071, + "step": 5298 + }, + { + "epoch": 1.0131931166347992, + "grad_norm": 1.8550605773925781, + "learning_rate": 5e-06, + "loss": 0.0786, + "step": 5299 + }, + { + "epoch": 1.0133843212237095, + "grad_norm": 1.1241230964660645, + "learning_rate": 5e-06, + "loss": 0.0807, + "step": 5300 + }, + { + "epoch": 1.0135755258126196, + "grad_norm": 2.6108241081237793, + "learning_rate": 5e-06, + "loss": 0.2384, + "step": 5301 + }, + { + "epoch": 1.0137667304015296, + "grad_norm": 1.967355728149414, + "learning_rate": 5e-06, + "loss": 0.1577, + "step": 5302 + }, + { + "epoch": 1.0139579349904397, + "grad_norm": 1.6688017845153809, + "learning_rate": 5e-06, + "loss": 0.0525, + "step": 5303 + }, + { + "epoch": 1.0141491395793498, + "grad_norm": 1.1315393447875977, + "learning_rate": 5e-06, + "loss": 0.0706, + "step": 5304 + }, + { + "epoch": 1.0143403441682601, + "grad_norm": 1.7984042167663574, + "learning_rate": 5e-06, + "loss": 0.078, + "step": 5305 + }, + { + "epoch": 1.0145315487571702, + "grad_norm": 1.8164480924606323, + "learning_rate": 5e-06, + "loss": 0.1293, + "step": 5306 + }, + { + "epoch": 1.0147227533460803, + "grad_norm": 1.3991438150405884, + "learning_rate": 5e-06, + "loss": 0.1551, + "step": 5307 + }, + { + "epoch": 1.0149139579349904, + "grad_norm": 1.8388389348983765, + "learning_rate": 5e-06, + "loss": 0.0591, + "step": 5308 + }, + { + "epoch": 1.0151051625239005, + "grad_norm": 1.4206583499908447, + "learning_rate": 5e-06, + "loss": 0.0782, + "step": 5309 + }, + { + "epoch": 1.0152963671128108, + "grad_norm": 0.5835526585578918, + "learning_rate": 5e-06, + "loss": 0.0423, + "step": 5310 + }, + { + "epoch": 1.0154875717017209, + "grad_norm": 1.2603877782821655, + "learning_rate": 5e-06, + "loss": 0.0581, + "step": 5311 + }, + { + "epoch": 1.015678776290631, + "grad_norm": 2.069904327392578, + "learning_rate": 5e-06, + "loss": 0.2226, + "step": 5312 + }, + { + "epoch": 1.015869980879541, + "grad_norm": 1.6898385286331177, + "learning_rate": 5e-06, + "loss": 0.1273, + "step": 5313 + }, + { + "epoch": 1.0160611854684511, + "grad_norm": 2.38932466506958, + "learning_rate": 5e-06, + "loss": 0.1309, + "step": 5314 + }, + { + "epoch": 1.0162523900573615, + "grad_norm": 1.2808679342269897, + "learning_rate": 5e-06, + "loss": 0.0763, + "step": 5315 + }, + { + "epoch": 1.0164435946462715, + "grad_norm": 1.2353378534317017, + "learning_rate": 5e-06, + "loss": 0.0659, + "step": 5316 + }, + { + "epoch": 1.0166347992351816, + "grad_norm": 1.1403536796569824, + "learning_rate": 5e-06, + "loss": 0.059, + "step": 5317 + }, + { + "epoch": 1.0168260038240917, + "grad_norm": 1.8925150632858276, + "learning_rate": 5e-06, + "loss": 0.1917, + "step": 5318 + }, + { + "epoch": 1.0170172084130018, + "grad_norm": 1.5492744445800781, + "learning_rate": 5e-06, + "loss": 0.1937, + "step": 5319 + }, + { + "epoch": 1.0172084130019121, + "grad_norm": 2.6364314556121826, + "learning_rate": 5e-06, + "loss": 0.2993, + "step": 5320 + }, + { + "epoch": 1.0173996175908222, + "grad_norm": 1.9068900346755981, + "learning_rate": 5e-06, + "loss": 0.2033, + "step": 5321 + }, + { + "epoch": 1.0175908221797323, + "grad_norm": 1.244210958480835, + "learning_rate": 5e-06, + "loss": 0.084, + "step": 5322 + }, + { + "epoch": 1.0177820267686424, + "grad_norm": 1.4095242023468018, + "learning_rate": 5e-06, + "loss": 0.0952, + "step": 5323 + }, + { + "epoch": 1.0179732313575527, + "grad_norm": 1.3021609783172607, + "learning_rate": 5e-06, + "loss": 0.0732, + "step": 5324 + }, + { + "epoch": 1.0181644359464628, + "grad_norm": 2.593829870223999, + "learning_rate": 5e-06, + "loss": 0.3372, + "step": 5325 + }, + { + "epoch": 1.0183556405353729, + "grad_norm": 0.7937406897544861, + "learning_rate": 5e-06, + "loss": 0.0694, + "step": 5326 + }, + { + "epoch": 1.018546845124283, + "grad_norm": 1.4523568153381348, + "learning_rate": 5e-06, + "loss": 0.0889, + "step": 5327 + }, + { + "epoch": 1.018738049713193, + "grad_norm": 2.000311851501465, + "learning_rate": 5e-06, + "loss": 0.1579, + "step": 5328 + }, + { + "epoch": 1.0189292543021033, + "grad_norm": 1.0171622037887573, + "learning_rate": 5e-06, + "loss": 0.0692, + "step": 5329 + }, + { + "epoch": 1.0191204588910134, + "grad_norm": 0.9312133193016052, + "learning_rate": 5e-06, + "loss": 0.0239, + "step": 5330 + }, + { + "epoch": 1.0193116634799235, + "grad_norm": 2.9251790046691895, + "learning_rate": 5e-06, + "loss": 0.1988, + "step": 5331 + }, + { + "epoch": 1.0195028680688336, + "grad_norm": 1.3772673606872559, + "learning_rate": 5e-06, + "loss": 0.0688, + "step": 5332 + }, + { + "epoch": 1.0196940726577437, + "grad_norm": 1.559841275215149, + "learning_rate": 5e-06, + "loss": 0.2109, + "step": 5333 + }, + { + "epoch": 1.019885277246654, + "grad_norm": 1.385416865348816, + "learning_rate": 5e-06, + "loss": 0.0531, + "step": 5334 + }, + { + "epoch": 1.020076481835564, + "grad_norm": 1.2152140140533447, + "learning_rate": 5e-06, + "loss": 0.0514, + "step": 5335 + }, + { + "epoch": 1.0202676864244742, + "grad_norm": 1.0661227703094482, + "learning_rate": 5e-06, + "loss": 0.0496, + "step": 5336 + }, + { + "epoch": 1.0204588910133843, + "grad_norm": 2.223980188369751, + "learning_rate": 5e-06, + "loss": 0.2001, + "step": 5337 + }, + { + "epoch": 1.0206500956022944, + "grad_norm": 2.8563883304595947, + "learning_rate": 5e-06, + "loss": 0.3093, + "step": 5338 + }, + { + "epoch": 1.0208413001912047, + "grad_norm": 1.7817109823226929, + "learning_rate": 5e-06, + "loss": 0.0834, + "step": 5339 + }, + { + "epoch": 1.0210325047801148, + "grad_norm": 1.39713716506958, + "learning_rate": 5e-06, + "loss": 0.1341, + "step": 5340 + }, + { + "epoch": 1.0212237093690248, + "grad_norm": 0.9216711521148682, + "learning_rate": 5e-06, + "loss": 0.046, + "step": 5341 + }, + { + "epoch": 1.021414913957935, + "grad_norm": 3.6287028789520264, + "learning_rate": 5e-06, + "loss": 0.066, + "step": 5342 + }, + { + "epoch": 1.021606118546845, + "grad_norm": 1.79201078414917, + "learning_rate": 5e-06, + "loss": 0.1809, + "step": 5343 + }, + { + "epoch": 1.0217973231357553, + "grad_norm": 1.3624271154403687, + "learning_rate": 5e-06, + "loss": 0.0847, + "step": 5344 + }, + { + "epoch": 1.0219885277246654, + "grad_norm": 1.809230089187622, + "learning_rate": 5e-06, + "loss": 0.0799, + "step": 5345 + }, + { + "epoch": 1.0221797323135755, + "grad_norm": 1.3011666536331177, + "learning_rate": 5e-06, + "loss": 0.0849, + "step": 5346 + }, + { + "epoch": 1.0223709369024856, + "grad_norm": 0.8237725496292114, + "learning_rate": 5e-06, + "loss": 0.0524, + "step": 5347 + }, + { + "epoch": 1.0225621414913957, + "grad_norm": 1.9484095573425293, + "learning_rate": 5e-06, + "loss": 0.0832, + "step": 5348 + }, + { + "epoch": 1.022753346080306, + "grad_norm": 1.2306699752807617, + "learning_rate": 5e-06, + "loss": 0.0433, + "step": 5349 + }, + { + "epoch": 1.022944550669216, + "grad_norm": 2.3627052307128906, + "learning_rate": 5e-06, + "loss": 0.2586, + "step": 5350 + }, + { + "epoch": 1.0231357552581262, + "grad_norm": 1.5180178880691528, + "learning_rate": 5e-06, + "loss": 0.1013, + "step": 5351 + }, + { + "epoch": 1.0233269598470363, + "grad_norm": 1.259223222732544, + "learning_rate": 5e-06, + "loss": 0.1714, + "step": 5352 + }, + { + "epoch": 1.0235181644359466, + "grad_norm": 1.2246848344802856, + "learning_rate": 5e-06, + "loss": 0.0907, + "step": 5353 + }, + { + "epoch": 1.0237093690248567, + "grad_norm": 2.1600708961486816, + "learning_rate": 5e-06, + "loss": 0.1361, + "step": 5354 + }, + { + "epoch": 1.0239005736137667, + "grad_norm": 1.1809910535812378, + "learning_rate": 5e-06, + "loss": 0.0521, + "step": 5355 + }, + { + "epoch": 1.0240917782026768, + "grad_norm": 2.0541279315948486, + "learning_rate": 5e-06, + "loss": 0.1937, + "step": 5356 + }, + { + "epoch": 1.024282982791587, + "grad_norm": 1.9363659620285034, + "learning_rate": 5e-06, + "loss": 0.1494, + "step": 5357 + }, + { + "epoch": 1.0244741873804972, + "grad_norm": 2.4680871963500977, + "learning_rate": 5e-06, + "loss": 0.1925, + "step": 5358 + }, + { + "epoch": 1.0246653919694073, + "grad_norm": 1.0142306089401245, + "learning_rate": 5e-06, + "loss": 0.0611, + "step": 5359 + }, + { + "epoch": 1.0248565965583174, + "grad_norm": 1.288448452949524, + "learning_rate": 5e-06, + "loss": 0.0642, + "step": 5360 + }, + { + "epoch": 1.0250478011472275, + "grad_norm": 1.4450470209121704, + "learning_rate": 5e-06, + "loss": 0.0648, + "step": 5361 + }, + { + "epoch": 1.0252390057361376, + "grad_norm": 1.5391541719436646, + "learning_rate": 5e-06, + "loss": 0.1351, + "step": 5362 + }, + { + "epoch": 1.0254302103250479, + "grad_norm": 2.144197940826416, + "learning_rate": 5e-06, + "loss": 0.3029, + "step": 5363 + }, + { + "epoch": 1.025621414913958, + "grad_norm": 1.9811673164367676, + "learning_rate": 5e-06, + "loss": 0.1664, + "step": 5364 + }, + { + "epoch": 1.025812619502868, + "grad_norm": 0.9573968648910522, + "learning_rate": 5e-06, + "loss": 0.0651, + "step": 5365 + }, + { + "epoch": 1.0260038240917781, + "grad_norm": 1.0169429779052734, + "learning_rate": 5e-06, + "loss": 0.0364, + "step": 5366 + }, + { + "epoch": 1.0261950286806882, + "grad_norm": 0.7411577701568604, + "learning_rate": 5e-06, + "loss": 0.0185, + "step": 5367 + }, + { + "epoch": 1.0263862332695985, + "grad_norm": 1.0757877826690674, + "learning_rate": 5e-06, + "loss": 0.082, + "step": 5368 + }, + { + "epoch": 1.0265774378585086, + "grad_norm": 2.224330425262451, + "learning_rate": 5e-06, + "loss": 0.2543, + "step": 5369 + }, + { + "epoch": 1.0267686424474187, + "grad_norm": 1.7419105768203735, + "learning_rate": 5e-06, + "loss": 0.1179, + "step": 5370 + }, + { + "epoch": 1.0269598470363288, + "grad_norm": 1.4337900876998901, + "learning_rate": 5e-06, + "loss": 0.0853, + "step": 5371 + }, + { + "epoch": 1.027151051625239, + "grad_norm": 1.0431238412857056, + "learning_rate": 5e-06, + "loss": 0.0712, + "step": 5372 + }, + { + "epoch": 1.0273422562141492, + "grad_norm": 1.8076082468032837, + "learning_rate": 5e-06, + "loss": 0.0804, + "step": 5373 + }, + { + "epoch": 1.0275334608030593, + "grad_norm": 1.998808741569519, + "learning_rate": 5e-06, + "loss": 0.0978, + "step": 5374 + }, + { + "epoch": 1.0277246653919694, + "grad_norm": 1.9513105154037476, + "learning_rate": 5e-06, + "loss": 0.1732, + "step": 5375 + }, + { + "epoch": 1.0279158699808795, + "grad_norm": 0.6905821561813354, + "learning_rate": 5e-06, + "loss": 0.0502, + "step": 5376 + }, + { + "epoch": 1.0281070745697898, + "grad_norm": 1.0172103643417358, + "learning_rate": 5e-06, + "loss": 0.0641, + "step": 5377 + }, + { + "epoch": 1.0282982791586999, + "grad_norm": 1.166339635848999, + "learning_rate": 5e-06, + "loss": 0.0766, + "step": 5378 + }, + { + "epoch": 1.02848948374761, + "grad_norm": 1.3585234880447388, + "learning_rate": 5e-06, + "loss": 0.0646, + "step": 5379 + }, + { + "epoch": 1.02868068833652, + "grad_norm": 1.65544593334198, + "learning_rate": 5e-06, + "loss": 0.1481, + "step": 5380 + }, + { + "epoch": 1.0288718929254301, + "grad_norm": 2.576430082321167, + "learning_rate": 5e-06, + "loss": 0.2197, + "step": 5381 + }, + { + "epoch": 1.0290630975143404, + "grad_norm": 2.2190842628479004, + "learning_rate": 5e-06, + "loss": 0.1957, + "step": 5382 + }, + { + "epoch": 1.0292543021032505, + "grad_norm": 1.092215895652771, + "learning_rate": 5e-06, + "loss": 0.0609, + "step": 5383 + }, + { + "epoch": 1.0294455066921606, + "grad_norm": 1.6553555727005005, + "learning_rate": 5e-06, + "loss": 0.085, + "step": 5384 + }, + { + "epoch": 1.0296367112810707, + "grad_norm": 1.6299556493759155, + "learning_rate": 5e-06, + "loss": 0.0454, + "step": 5385 + }, + { + "epoch": 1.0298279158699808, + "grad_norm": 1.2839386463165283, + "learning_rate": 5e-06, + "loss": 0.068, + "step": 5386 + }, + { + "epoch": 1.030019120458891, + "grad_norm": 1.5672410726547241, + "learning_rate": 5e-06, + "loss": 0.0897, + "step": 5387 + }, + { + "epoch": 1.0302103250478012, + "grad_norm": 1.3381305932998657, + "learning_rate": 5e-06, + "loss": 0.0716, + "step": 5388 + }, + { + "epoch": 1.0304015296367113, + "grad_norm": 0.9112756252288818, + "learning_rate": 5e-06, + "loss": 0.0466, + "step": 5389 + }, + { + "epoch": 1.0305927342256214, + "grad_norm": 0.6958617568016052, + "learning_rate": 5e-06, + "loss": 0.0528, + "step": 5390 + }, + { + "epoch": 1.0307839388145315, + "grad_norm": 1.5371332168579102, + "learning_rate": 5e-06, + "loss": 0.0767, + "step": 5391 + }, + { + "epoch": 1.0309751434034418, + "grad_norm": 2.49212646484375, + "learning_rate": 5e-06, + "loss": 0.1944, + "step": 5392 + }, + { + "epoch": 1.0311663479923519, + "grad_norm": 1.2017213106155396, + "learning_rate": 5e-06, + "loss": 0.1326, + "step": 5393 + }, + { + "epoch": 1.031357552581262, + "grad_norm": 1.103096604347229, + "learning_rate": 5e-06, + "loss": 0.0552, + "step": 5394 + }, + { + "epoch": 1.031548757170172, + "grad_norm": 2.4898500442504883, + "learning_rate": 5e-06, + "loss": 0.1824, + "step": 5395 + }, + { + "epoch": 1.0317399617590821, + "grad_norm": 1.9159975051879883, + "learning_rate": 5e-06, + "loss": 0.1253, + "step": 5396 + }, + { + "epoch": 1.0319311663479924, + "grad_norm": 1.6219099760055542, + "learning_rate": 5e-06, + "loss": 0.0913, + "step": 5397 + }, + { + "epoch": 1.0321223709369025, + "grad_norm": 1.076381802558899, + "learning_rate": 5e-06, + "loss": 0.0744, + "step": 5398 + }, + { + "epoch": 1.0323135755258126, + "grad_norm": 1.1917436122894287, + "learning_rate": 5e-06, + "loss": 0.0657, + "step": 5399 + }, + { + "epoch": 1.0325047801147227, + "grad_norm": 1.8527624607086182, + "learning_rate": 5e-06, + "loss": 0.16, + "step": 5400 + }, + { + "epoch": 1.0326959847036328, + "grad_norm": 2.29830002784729, + "learning_rate": 5e-06, + "loss": 0.1488, + "step": 5401 + }, + { + "epoch": 1.032887189292543, + "grad_norm": 2.4546074867248535, + "learning_rate": 5e-06, + "loss": 0.2168, + "step": 5402 + }, + { + "epoch": 1.0330783938814532, + "grad_norm": 0.7161755561828613, + "learning_rate": 5e-06, + "loss": 0.0753, + "step": 5403 + }, + { + "epoch": 1.0332695984703633, + "grad_norm": 1.9103176593780518, + "learning_rate": 5e-06, + "loss": 0.0824, + "step": 5404 + }, + { + "epoch": 1.0334608030592733, + "grad_norm": 2.567988634109497, + "learning_rate": 5e-06, + "loss": 0.1142, + "step": 5405 + }, + { + "epoch": 1.0336520076481837, + "grad_norm": 2.4080469608306885, + "learning_rate": 5e-06, + "loss": 0.3055, + "step": 5406 + }, + { + "epoch": 1.0338432122370937, + "grad_norm": 2.5664243698120117, + "learning_rate": 5e-06, + "loss": 0.3235, + "step": 5407 + }, + { + "epoch": 1.0340344168260038, + "grad_norm": 0.7243027091026306, + "learning_rate": 5e-06, + "loss": 0.1029, + "step": 5408 + }, + { + "epoch": 1.034225621414914, + "grad_norm": 2.28975510597229, + "learning_rate": 5e-06, + "loss": 0.089, + "step": 5409 + }, + { + "epoch": 1.034416826003824, + "grad_norm": 1.4908415079116821, + "learning_rate": 5e-06, + "loss": 0.0705, + "step": 5410 + }, + { + "epoch": 1.0346080305927343, + "grad_norm": 0.9343728423118591, + "learning_rate": 5e-06, + "loss": 0.0416, + "step": 5411 + }, + { + "epoch": 1.0347992351816444, + "grad_norm": 2.183262586593628, + "learning_rate": 5e-06, + "loss": 0.1471, + "step": 5412 + }, + { + "epoch": 1.0349904397705545, + "grad_norm": 1.1275417804718018, + "learning_rate": 5e-06, + "loss": 0.0581, + "step": 5413 + }, + { + "epoch": 1.0351816443594646, + "grad_norm": 3.412479877471924, + "learning_rate": 5e-06, + "loss": 0.4229, + "step": 5414 + }, + { + "epoch": 1.0353728489483747, + "grad_norm": 0.81615149974823, + "learning_rate": 5e-06, + "loss": 0.0412, + "step": 5415 + }, + { + "epoch": 1.035564053537285, + "grad_norm": 1.7186781167984009, + "learning_rate": 5e-06, + "loss": 0.1121, + "step": 5416 + }, + { + "epoch": 1.035755258126195, + "grad_norm": 3.903059482574463, + "learning_rate": 5e-06, + "loss": 0.0223, + "step": 5417 + }, + { + "epoch": 1.0359464627151052, + "grad_norm": 1.3878508806228638, + "learning_rate": 5e-06, + "loss": 0.069, + "step": 5418 + }, + { + "epoch": 1.0361376673040152, + "grad_norm": 2.2635209560394287, + "learning_rate": 5e-06, + "loss": 0.1977, + "step": 5419 + }, + { + "epoch": 1.0363288718929253, + "grad_norm": 1.8340933322906494, + "learning_rate": 5e-06, + "loss": 0.1436, + "step": 5420 + }, + { + "epoch": 1.0365200764818356, + "grad_norm": 1.2530016899108887, + "learning_rate": 5e-06, + "loss": 0.1269, + "step": 5421 + }, + { + "epoch": 1.0367112810707457, + "grad_norm": 0.9224182367324829, + "learning_rate": 5e-06, + "loss": 0.0661, + "step": 5422 + }, + { + "epoch": 1.0369024856596558, + "grad_norm": 1.306754231452942, + "learning_rate": 5e-06, + "loss": 0.0599, + "step": 5423 + }, + { + "epoch": 1.037093690248566, + "grad_norm": 1.311036467552185, + "learning_rate": 5e-06, + "loss": 0.0821, + "step": 5424 + }, + { + "epoch": 1.0372848948374762, + "grad_norm": 1.1835675239562988, + "learning_rate": 5e-06, + "loss": 0.0955, + "step": 5425 + }, + { + "epoch": 1.0374760994263863, + "grad_norm": 0.7402036190032959, + "learning_rate": 5e-06, + "loss": 0.0427, + "step": 5426 + }, + { + "epoch": 1.0376673040152964, + "grad_norm": 2.02817964553833, + "learning_rate": 5e-06, + "loss": 0.1171, + "step": 5427 + }, + { + "epoch": 1.0378585086042065, + "grad_norm": 2.148780107498169, + "learning_rate": 5e-06, + "loss": 0.1064, + "step": 5428 + }, + { + "epoch": 1.0380497131931166, + "grad_norm": 1.8620522022247314, + "learning_rate": 5e-06, + "loss": 0.1644, + "step": 5429 + }, + { + "epoch": 1.0382409177820269, + "grad_norm": 0.7214341759681702, + "learning_rate": 5e-06, + "loss": 0.0191, + "step": 5430 + }, + { + "epoch": 1.038432122370937, + "grad_norm": 1.828162670135498, + "learning_rate": 5e-06, + "loss": 0.1798, + "step": 5431 + }, + { + "epoch": 1.038623326959847, + "grad_norm": 2.594289541244507, + "learning_rate": 5e-06, + "loss": 0.2456, + "step": 5432 + }, + { + "epoch": 1.0388145315487571, + "grad_norm": 0.8717902302742004, + "learning_rate": 5e-06, + "loss": 0.0624, + "step": 5433 + }, + { + "epoch": 1.0390057361376672, + "grad_norm": 2.1278491020202637, + "learning_rate": 5e-06, + "loss": 0.1108, + "step": 5434 + }, + { + "epoch": 1.0391969407265775, + "grad_norm": 1.1939607858657837, + "learning_rate": 5e-06, + "loss": 0.0389, + "step": 5435 + }, + { + "epoch": 1.0393881453154876, + "grad_norm": 2.5707457065582275, + "learning_rate": 5e-06, + "loss": 0.2028, + "step": 5436 + }, + { + "epoch": 1.0395793499043977, + "grad_norm": 1.869125247001648, + "learning_rate": 5e-06, + "loss": 0.2081, + "step": 5437 + }, + { + "epoch": 1.0397705544933078, + "grad_norm": 3.4773709774017334, + "learning_rate": 5e-06, + "loss": 0.4955, + "step": 5438 + }, + { + "epoch": 1.0399617590822179, + "grad_norm": 1.6218843460083008, + "learning_rate": 5e-06, + "loss": 0.0909, + "step": 5439 + }, + { + "epoch": 1.0401529636711282, + "grad_norm": 2.7059316635131836, + "learning_rate": 5e-06, + "loss": 0.3133, + "step": 5440 + }, + { + "epoch": 1.0403441682600383, + "grad_norm": 1.0128536224365234, + "learning_rate": 5e-06, + "loss": 0.1095, + "step": 5441 + }, + { + "epoch": 1.0405353728489484, + "grad_norm": 6.157577037811279, + "learning_rate": 5e-06, + "loss": 0.2846, + "step": 5442 + }, + { + "epoch": 1.0407265774378585, + "grad_norm": 1.409836769104004, + "learning_rate": 5e-06, + "loss": 0.0739, + "step": 5443 + }, + { + "epoch": 1.0409177820267685, + "grad_norm": 2.545416831970215, + "learning_rate": 5e-06, + "loss": 0.3916, + "step": 5444 + }, + { + "epoch": 1.0411089866156789, + "grad_norm": 1.2188429832458496, + "learning_rate": 5e-06, + "loss": 0.0897, + "step": 5445 + }, + { + "epoch": 1.041300191204589, + "grad_norm": 3.023515224456787, + "learning_rate": 5e-06, + "loss": 0.4011, + "step": 5446 + }, + { + "epoch": 1.041491395793499, + "grad_norm": 2.3562259674072266, + "learning_rate": 5e-06, + "loss": 0.1399, + "step": 5447 + }, + { + "epoch": 1.0416826003824091, + "grad_norm": 1.0878733396530151, + "learning_rate": 5e-06, + "loss": 0.0712, + "step": 5448 + }, + { + "epoch": 1.0418738049713192, + "grad_norm": 2.7172553539276123, + "learning_rate": 5e-06, + "loss": 0.1926, + "step": 5449 + }, + { + "epoch": 1.0420650095602295, + "grad_norm": 2.0290565490722656, + "learning_rate": 5e-06, + "loss": 0.2468, + "step": 5450 + }, + { + "epoch": 1.0422562141491396, + "grad_norm": 0.8205150961875916, + "learning_rate": 5e-06, + "loss": 0.0483, + "step": 5451 + }, + { + "epoch": 1.0424474187380497, + "grad_norm": 1.924567699432373, + "learning_rate": 5e-06, + "loss": 0.2065, + "step": 5452 + }, + { + "epoch": 1.0426386233269598, + "grad_norm": 0.8093776702880859, + "learning_rate": 5e-06, + "loss": 0.0619, + "step": 5453 + }, + { + "epoch": 1.0428298279158699, + "grad_norm": 1.8981060981750488, + "learning_rate": 5e-06, + "loss": 0.1608, + "step": 5454 + }, + { + "epoch": 1.0430210325047802, + "grad_norm": 0.724181056022644, + "learning_rate": 5e-06, + "loss": 0.0284, + "step": 5455 + }, + { + "epoch": 1.0432122370936903, + "grad_norm": 1.9111442565917969, + "learning_rate": 5e-06, + "loss": 0.1779, + "step": 5456 + }, + { + "epoch": 1.0434034416826004, + "grad_norm": 1.8289096355438232, + "learning_rate": 5e-06, + "loss": 0.1876, + "step": 5457 + }, + { + "epoch": 1.0435946462715104, + "grad_norm": 2.535822629928589, + "learning_rate": 5e-06, + "loss": 0.2191, + "step": 5458 + }, + { + "epoch": 1.0437858508604207, + "grad_norm": 1.2734262943267822, + "learning_rate": 5e-06, + "loss": 0.0492, + "step": 5459 + }, + { + "epoch": 1.0439770554493308, + "grad_norm": 0.6029118895530701, + "learning_rate": 5e-06, + "loss": 0.0392, + "step": 5460 + }, + { + "epoch": 1.044168260038241, + "grad_norm": 1.2994762659072876, + "learning_rate": 5e-06, + "loss": 0.058, + "step": 5461 + }, + { + "epoch": 1.044359464627151, + "grad_norm": 1.6217557191848755, + "learning_rate": 5e-06, + "loss": 0.1581, + "step": 5462 + }, + { + "epoch": 1.044550669216061, + "grad_norm": 1.316620111465454, + "learning_rate": 5e-06, + "loss": 0.0736, + "step": 5463 + }, + { + "epoch": 1.0447418738049714, + "grad_norm": 1.5537855625152588, + "learning_rate": 5e-06, + "loss": 0.1094, + "step": 5464 + }, + { + "epoch": 1.0449330783938815, + "grad_norm": 0.6778096556663513, + "learning_rate": 5e-06, + "loss": 0.0312, + "step": 5465 + }, + { + "epoch": 1.0451242829827916, + "grad_norm": 1.1639436483383179, + "learning_rate": 5e-06, + "loss": 0.0931, + "step": 5466 + }, + { + "epoch": 1.0453154875717017, + "grad_norm": 1.2169827222824097, + "learning_rate": 5e-06, + "loss": 0.035, + "step": 5467 + }, + { + "epoch": 1.0455066921606118, + "grad_norm": 3.1080920696258545, + "learning_rate": 5e-06, + "loss": 0.2154, + "step": 5468 + }, + { + "epoch": 1.045697896749522, + "grad_norm": 1.4423012733459473, + "learning_rate": 5e-06, + "loss": 0.1196, + "step": 5469 + }, + { + "epoch": 1.0458891013384322, + "grad_norm": 1.3293002843856812, + "learning_rate": 5e-06, + "loss": 0.112, + "step": 5470 + }, + { + "epoch": 1.0460803059273422, + "grad_norm": 4.677468299865723, + "learning_rate": 5e-06, + "loss": 0.1826, + "step": 5471 + }, + { + "epoch": 1.0462715105162523, + "grad_norm": 1.4991474151611328, + "learning_rate": 5e-06, + "loss": 0.1735, + "step": 5472 + }, + { + "epoch": 1.0464627151051624, + "grad_norm": 0.8098369836807251, + "learning_rate": 5e-06, + "loss": 0.0329, + "step": 5473 + }, + { + "epoch": 1.0466539196940727, + "grad_norm": 1.0252405405044556, + "learning_rate": 5e-06, + "loss": 0.0512, + "step": 5474 + }, + { + "epoch": 1.0468451242829828, + "grad_norm": 2.2641191482543945, + "learning_rate": 5e-06, + "loss": 0.263, + "step": 5475 + }, + { + "epoch": 1.047036328871893, + "grad_norm": 2.4479317665100098, + "learning_rate": 5e-06, + "loss": 0.1826, + "step": 5476 + }, + { + "epoch": 1.047227533460803, + "grad_norm": 1.0218003988265991, + "learning_rate": 5e-06, + "loss": 0.0605, + "step": 5477 + }, + { + "epoch": 1.0474187380497133, + "grad_norm": 1.1280349493026733, + "learning_rate": 5e-06, + "loss": 0.0742, + "step": 5478 + }, + { + "epoch": 1.0476099426386234, + "grad_norm": 2.0644752979278564, + "learning_rate": 5e-06, + "loss": 0.1082, + "step": 5479 + }, + { + "epoch": 1.0478011472275335, + "grad_norm": 1.3084979057312012, + "learning_rate": 5e-06, + "loss": 0.0601, + "step": 5480 + }, + { + "epoch": 1.0479923518164436, + "grad_norm": 1.2070064544677734, + "learning_rate": 5e-06, + "loss": 0.0784, + "step": 5481 + }, + { + "epoch": 1.0481835564053537, + "grad_norm": 1.7970086336135864, + "learning_rate": 5e-06, + "loss": 0.1671, + "step": 5482 + }, + { + "epoch": 1.048374760994264, + "grad_norm": 2.143139362335205, + "learning_rate": 5e-06, + "loss": 0.1785, + "step": 5483 + }, + { + "epoch": 1.048565965583174, + "grad_norm": 1.9227676391601562, + "learning_rate": 5e-06, + "loss": 0.1499, + "step": 5484 + }, + { + "epoch": 1.0487571701720841, + "grad_norm": 1.6543242931365967, + "learning_rate": 5e-06, + "loss": 0.0493, + "step": 5485 + }, + { + "epoch": 1.0489483747609942, + "grad_norm": 0.5971057415008545, + "learning_rate": 5e-06, + "loss": 0.0124, + "step": 5486 + }, + { + "epoch": 1.0491395793499043, + "grad_norm": 2.2405505180358887, + "learning_rate": 5e-06, + "loss": 0.2293, + "step": 5487 + }, + { + "epoch": 1.0493307839388146, + "grad_norm": 2.6099843978881836, + "learning_rate": 5e-06, + "loss": 0.2646, + "step": 5488 + }, + { + "epoch": 1.0495219885277247, + "grad_norm": 1.8190488815307617, + "learning_rate": 5e-06, + "loss": 0.1664, + "step": 5489 + }, + { + "epoch": 1.0497131931166348, + "grad_norm": 2.8577523231506348, + "learning_rate": 5e-06, + "loss": 0.2463, + "step": 5490 + }, + { + "epoch": 1.049904397705545, + "grad_norm": 1.3947685956954956, + "learning_rate": 5e-06, + "loss": 0.115, + "step": 5491 + }, + { + "epoch": 1.050095602294455, + "grad_norm": 1.2213077545166016, + "learning_rate": 5e-06, + "loss": 0.0604, + "step": 5492 + }, + { + "epoch": 1.0502868068833653, + "grad_norm": 1.0234959125518799, + "learning_rate": 5e-06, + "loss": 0.0541, + "step": 5493 + }, + { + "epoch": 1.0504780114722754, + "grad_norm": 2.8558273315429688, + "learning_rate": 5e-06, + "loss": 0.266, + "step": 5494 + }, + { + "epoch": 1.0506692160611855, + "grad_norm": 1.4367798566818237, + "learning_rate": 5e-06, + "loss": 0.1352, + "step": 5495 + }, + { + "epoch": 1.0508604206500956, + "grad_norm": 1.1142265796661377, + "learning_rate": 5e-06, + "loss": 0.0389, + "step": 5496 + }, + { + "epoch": 1.0510516252390056, + "grad_norm": 2.527897596359253, + "learning_rate": 5e-06, + "loss": 0.1458, + "step": 5497 + }, + { + "epoch": 1.051242829827916, + "grad_norm": 1.8113329410552979, + "learning_rate": 5e-06, + "loss": 0.1046, + "step": 5498 + }, + { + "epoch": 1.051434034416826, + "grad_norm": 1.4811005592346191, + "learning_rate": 5e-06, + "loss": 0.1069, + "step": 5499 + }, + { + "epoch": 1.0516252390057361, + "grad_norm": 1.7655647993087769, + "learning_rate": 5e-06, + "loss": 0.1356, + "step": 5500 + }, + { + "epoch": 1.0516252390057361, + "eval_runtime": 834.4534, + "eval_samples_per_second": 1.838, + "eval_steps_per_second": 0.23, + "step": 5500 + }, + { + "epoch": 1.0518164435946462, + "grad_norm": 1.7707321643829346, + "learning_rate": 5e-06, + "loss": 0.1367, + "step": 5501 + }, + { + "epoch": 1.0520076481835563, + "grad_norm": 1.482149362564087, + "learning_rate": 5e-06, + "loss": 0.0581, + "step": 5502 + }, + { + "epoch": 1.0521988527724666, + "grad_norm": 0.7621336579322815, + "learning_rate": 5e-06, + "loss": 0.057, + "step": 5503 + }, + { + "epoch": 1.0523900573613767, + "grad_norm": 1.2675679922103882, + "learning_rate": 5e-06, + "loss": 0.0503, + "step": 5504 + }, + { + "epoch": 1.0525812619502868, + "grad_norm": 1.968985676765442, + "learning_rate": 5e-06, + "loss": 0.0581, + "step": 5505 + }, + { + "epoch": 1.0527724665391969, + "grad_norm": 2.8859829902648926, + "learning_rate": 5e-06, + "loss": 0.2671, + "step": 5506 + }, + { + "epoch": 1.0529636711281072, + "grad_norm": 0.8013049960136414, + "learning_rate": 5e-06, + "loss": 0.045, + "step": 5507 + }, + { + "epoch": 1.0531548757170173, + "grad_norm": 1.266425609588623, + "learning_rate": 5e-06, + "loss": 0.0874, + "step": 5508 + }, + { + "epoch": 1.0533460803059274, + "grad_norm": 2.2374284267425537, + "learning_rate": 5e-06, + "loss": 0.0577, + "step": 5509 + }, + { + "epoch": 1.0535372848948374, + "grad_norm": 1.1194673776626587, + "learning_rate": 5e-06, + "loss": 0.0354, + "step": 5510 + }, + { + "epoch": 1.0537284894837475, + "grad_norm": 1.0510578155517578, + "learning_rate": 5e-06, + "loss": 0.03, + "step": 5511 + }, + { + "epoch": 1.0539196940726578, + "grad_norm": 1.7931699752807617, + "learning_rate": 5e-06, + "loss": 0.1108, + "step": 5512 + }, + { + "epoch": 1.054110898661568, + "grad_norm": 1.9386789798736572, + "learning_rate": 5e-06, + "loss": 0.0997, + "step": 5513 + }, + { + "epoch": 1.054302103250478, + "grad_norm": 0.9754608869552612, + "learning_rate": 5e-06, + "loss": 0.0676, + "step": 5514 + }, + { + "epoch": 1.054493307839388, + "grad_norm": 1.582482099533081, + "learning_rate": 5e-06, + "loss": 0.0941, + "step": 5515 + }, + { + "epoch": 1.0546845124282982, + "grad_norm": 0.8954532742500305, + "learning_rate": 5e-06, + "loss": 0.0351, + "step": 5516 + }, + { + "epoch": 1.0548757170172085, + "grad_norm": 1.9438811540603638, + "learning_rate": 5e-06, + "loss": 0.1878, + "step": 5517 + }, + { + "epoch": 1.0550669216061186, + "grad_norm": 2.3192455768585205, + "learning_rate": 5e-06, + "loss": 0.1108, + "step": 5518 + }, + { + "epoch": 1.0552581261950287, + "grad_norm": 2.396188974380493, + "learning_rate": 5e-06, + "loss": 0.2317, + "step": 5519 + }, + { + "epoch": 1.0554493307839388, + "grad_norm": 1.4901149272918701, + "learning_rate": 5e-06, + "loss": 0.1238, + "step": 5520 + }, + { + "epoch": 1.0556405353728489, + "grad_norm": 1.5792475938796997, + "learning_rate": 5e-06, + "loss": 0.1507, + "step": 5521 + }, + { + "epoch": 1.0558317399617592, + "grad_norm": 1.3586615324020386, + "learning_rate": 5e-06, + "loss": 0.1084, + "step": 5522 + }, + { + "epoch": 1.0560229445506693, + "grad_norm": 1.5009024143218994, + "learning_rate": 5e-06, + "loss": 0.0418, + "step": 5523 + }, + { + "epoch": 1.0562141491395793, + "grad_norm": 1.5317716598510742, + "learning_rate": 5e-06, + "loss": 0.0772, + "step": 5524 + }, + { + "epoch": 1.0564053537284894, + "grad_norm": 1.119912028312683, + "learning_rate": 5e-06, + "loss": 0.1205, + "step": 5525 + }, + { + "epoch": 1.0565965583173995, + "grad_norm": 1.4832195043563843, + "learning_rate": 5e-06, + "loss": 0.1263, + "step": 5526 + }, + { + "epoch": 1.0567877629063098, + "grad_norm": 1.1946576833724976, + "learning_rate": 5e-06, + "loss": 0.0627, + "step": 5527 + }, + { + "epoch": 1.05697896749522, + "grad_norm": 1.3606984615325928, + "learning_rate": 5e-06, + "loss": 0.0645, + "step": 5528 + }, + { + "epoch": 1.05717017208413, + "grad_norm": 0.7441090941429138, + "learning_rate": 5e-06, + "loss": 0.0384, + "step": 5529 + }, + { + "epoch": 1.05736137667304, + "grad_norm": 1.1623265743255615, + "learning_rate": 5e-06, + "loss": 0.0436, + "step": 5530 + }, + { + "epoch": 1.0575525812619504, + "grad_norm": 1.458216667175293, + "learning_rate": 5e-06, + "loss": 0.1655, + "step": 5531 + }, + { + "epoch": 1.0577437858508605, + "grad_norm": 1.6108118295669556, + "learning_rate": 5e-06, + "loss": 0.1881, + "step": 5532 + }, + { + "epoch": 1.0579349904397706, + "grad_norm": 1.7551403045654297, + "learning_rate": 5e-06, + "loss": 0.1642, + "step": 5533 + }, + { + "epoch": 1.0581261950286807, + "grad_norm": 1.0185822248458862, + "learning_rate": 5e-06, + "loss": 0.055, + "step": 5534 + }, + { + "epoch": 1.0583173996175907, + "grad_norm": 0.6421904563903809, + "learning_rate": 5e-06, + "loss": 0.0271, + "step": 5535 + }, + { + "epoch": 1.058508604206501, + "grad_norm": 1.085301160812378, + "learning_rate": 5e-06, + "loss": 0.0458, + "step": 5536 + }, + { + "epoch": 1.0586998087954111, + "grad_norm": 1.3988016843795776, + "learning_rate": 5e-06, + "loss": 0.0921, + "step": 5537 + }, + { + "epoch": 1.0588910133843212, + "grad_norm": 3.1126415729522705, + "learning_rate": 5e-06, + "loss": 0.2017, + "step": 5538 + }, + { + "epoch": 1.0590822179732313, + "grad_norm": 1.5114021301269531, + "learning_rate": 5e-06, + "loss": 0.0989, + "step": 5539 + }, + { + "epoch": 1.0592734225621414, + "grad_norm": 1.0168532133102417, + "learning_rate": 5e-06, + "loss": 0.0519, + "step": 5540 + }, + { + "epoch": 1.0594646271510517, + "grad_norm": 0.8101119995117188, + "learning_rate": 5e-06, + "loss": 0.0505, + "step": 5541 + }, + { + "epoch": 1.0596558317399618, + "grad_norm": 1.7537641525268555, + "learning_rate": 5e-06, + "loss": 0.0264, + "step": 5542 + }, + { + "epoch": 1.059847036328872, + "grad_norm": 1.9035402536392212, + "learning_rate": 5e-06, + "loss": 0.168, + "step": 5543 + }, + { + "epoch": 1.060038240917782, + "grad_norm": 2.1393039226531982, + "learning_rate": 5e-06, + "loss": 0.2744, + "step": 5544 + }, + { + "epoch": 1.060229445506692, + "grad_norm": 1.2142505645751953, + "learning_rate": 5e-06, + "loss": 0.0952, + "step": 5545 + }, + { + "epoch": 1.0604206500956024, + "grad_norm": 1.868961215019226, + "learning_rate": 5e-06, + "loss": 0.0702, + "step": 5546 + }, + { + "epoch": 1.0606118546845125, + "grad_norm": 1.7776259183883667, + "learning_rate": 5e-06, + "loss": 0.198, + "step": 5547 + }, + { + "epoch": 1.0608030592734226, + "grad_norm": 0.7262550592422485, + "learning_rate": 5e-06, + "loss": 0.0306, + "step": 5548 + }, + { + "epoch": 1.0609942638623326, + "grad_norm": 1.554850697517395, + "learning_rate": 5e-06, + "loss": 0.0685, + "step": 5549 + }, + { + "epoch": 1.0611854684512427, + "grad_norm": 2.132505178451538, + "learning_rate": 5e-06, + "loss": 0.3059, + "step": 5550 + }, + { + "epoch": 1.061376673040153, + "grad_norm": 0.8302657008171082, + "learning_rate": 5e-06, + "loss": 0.0978, + "step": 5551 + }, + { + "epoch": 1.0615678776290631, + "grad_norm": 1.9379445314407349, + "learning_rate": 5e-06, + "loss": 0.1882, + "step": 5552 + }, + { + "epoch": 1.0617590822179732, + "grad_norm": 1.474511742591858, + "learning_rate": 5e-06, + "loss": 0.1236, + "step": 5553 + }, + { + "epoch": 1.0619502868068833, + "grad_norm": 2.032550573348999, + "learning_rate": 5e-06, + "loss": 0.1122, + "step": 5554 + }, + { + "epoch": 1.0621414913957934, + "grad_norm": 1.345967411994934, + "learning_rate": 5e-06, + "loss": 0.0447, + "step": 5555 + }, + { + "epoch": 1.0623326959847037, + "grad_norm": 2.1029436588287354, + "learning_rate": 5e-06, + "loss": 0.2457, + "step": 5556 + }, + { + "epoch": 1.0625239005736138, + "grad_norm": 1.188673734664917, + "learning_rate": 5e-06, + "loss": 0.0634, + "step": 5557 + }, + { + "epoch": 1.0627151051625239, + "grad_norm": 0.6115692257881165, + "learning_rate": 5e-06, + "loss": 0.0683, + "step": 5558 + }, + { + "epoch": 1.062906309751434, + "grad_norm": 1.1898462772369385, + "learning_rate": 5e-06, + "loss": 0.042, + "step": 5559 + }, + { + "epoch": 1.063097514340344, + "grad_norm": 2.9459636211395264, + "learning_rate": 5e-06, + "loss": 0.1648, + "step": 5560 + }, + { + "epoch": 1.0632887189292544, + "grad_norm": 1.1826893091201782, + "learning_rate": 5e-06, + "loss": 0.0368, + "step": 5561 + }, + { + "epoch": 1.0634799235181644, + "grad_norm": 2.073742389678955, + "learning_rate": 5e-06, + "loss": 0.2756, + "step": 5562 + }, + { + "epoch": 1.0636711281070745, + "grad_norm": 1.3164788484573364, + "learning_rate": 5e-06, + "loss": 0.1287, + "step": 5563 + }, + { + "epoch": 1.0638623326959846, + "grad_norm": 1.7094725370407104, + "learning_rate": 5e-06, + "loss": 0.2081, + "step": 5564 + }, + { + "epoch": 1.064053537284895, + "grad_norm": 1.2266261577606201, + "learning_rate": 5e-06, + "loss": 0.0791, + "step": 5565 + }, + { + "epoch": 1.064244741873805, + "grad_norm": 0.5611611008644104, + "learning_rate": 5e-06, + "loss": 0.0304, + "step": 5566 + }, + { + "epoch": 1.064435946462715, + "grad_norm": 1.0190863609313965, + "learning_rate": 5e-06, + "loss": 0.0428, + "step": 5567 + }, + { + "epoch": 1.0646271510516252, + "grad_norm": 1.5948188304901123, + "learning_rate": 5e-06, + "loss": 0.1038, + "step": 5568 + }, + { + "epoch": 1.0648183556405353, + "grad_norm": 1.694000005722046, + "learning_rate": 5e-06, + "loss": 0.1146, + "step": 5569 + }, + { + "epoch": 1.0650095602294456, + "grad_norm": 2.386840581893921, + "learning_rate": 5e-06, + "loss": 0.1727, + "step": 5570 + }, + { + "epoch": 1.0652007648183557, + "grad_norm": 2.1206300258636475, + "learning_rate": 5e-06, + "loss": 0.1712, + "step": 5571 + }, + { + "epoch": 1.0653919694072658, + "grad_norm": 1.195825457572937, + "learning_rate": 5e-06, + "loss": 0.0482, + "step": 5572 + }, + { + "epoch": 1.0655831739961759, + "grad_norm": 0.9524969458580017, + "learning_rate": 5e-06, + "loss": 0.0435, + "step": 5573 + }, + { + "epoch": 1.065774378585086, + "grad_norm": 1.50506591796875, + "learning_rate": 5e-06, + "loss": 0.1009, + "step": 5574 + }, + { + "epoch": 1.0659655831739963, + "grad_norm": 2.291487216949463, + "learning_rate": 5e-06, + "loss": 0.2958, + "step": 5575 + }, + { + "epoch": 1.0661567877629063, + "grad_norm": 1.8258750438690186, + "learning_rate": 5e-06, + "loss": 0.2111, + "step": 5576 + }, + { + "epoch": 1.0663479923518164, + "grad_norm": 1.87051260471344, + "learning_rate": 5e-06, + "loss": 0.1976, + "step": 5577 + }, + { + "epoch": 1.0665391969407265, + "grad_norm": 2.5715811252593994, + "learning_rate": 5e-06, + "loss": 0.1482, + "step": 5578 + }, + { + "epoch": 1.0667304015296368, + "grad_norm": 0.7487394213676453, + "learning_rate": 5e-06, + "loss": 0.0376, + "step": 5579 + }, + { + "epoch": 1.066921606118547, + "grad_norm": 2.3859152793884277, + "learning_rate": 5e-06, + "loss": 0.0333, + "step": 5580 + }, + { + "epoch": 1.067112810707457, + "grad_norm": 1.7255477905273438, + "learning_rate": 5e-06, + "loss": 0.072, + "step": 5581 + }, + { + "epoch": 1.067304015296367, + "grad_norm": 2.3104193210601807, + "learning_rate": 5e-06, + "loss": 0.3214, + "step": 5582 + }, + { + "epoch": 1.0674952198852772, + "grad_norm": 1.6485390663146973, + "learning_rate": 5e-06, + "loss": 0.1561, + "step": 5583 + }, + { + "epoch": 1.0676864244741875, + "grad_norm": 0.9040722846984863, + "learning_rate": 5e-06, + "loss": 0.0759, + "step": 5584 + }, + { + "epoch": 1.0678776290630976, + "grad_norm": 0.904682993888855, + "learning_rate": 5e-06, + "loss": 0.0422, + "step": 5585 + }, + { + "epoch": 1.0680688336520077, + "grad_norm": 1.8263367414474487, + "learning_rate": 5e-06, + "loss": 0.0747, + "step": 5586 + }, + { + "epoch": 1.0682600382409178, + "grad_norm": 2.2079832553863525, + "learning_rate": 5e-06, + "loss": 0.3012, + "step": 5587 + }, + { + "epoch": 1.0684512428298278, + "grad_norm": 1.0028715133666992, + "learning_rate": 5e-06, + "loss": 0.0642, + "step": 5588 + }, + { + "epoch": 1.0686424474187382, + "grad_norm": 2.1110336780548096, + "learning_rate": 5e-06, + "loss": 0.1711, + "step": 5589 + }, + { + "epoch": 1.0688336520076482, + "grad_norm": 1.1553751230239868, + "learning_rate": 5e-06, + "loss": 0.0648, + "step": 5590 + }, + { + "epoch": 1.0690248565965583, + "grad_norm": 1.4524354934692383, + "learning_rate": 5e-06, + "loss": 0.0671, + "step": 5591 + }, + { + "epoch": 1.0692160611854684, + "grad_norm": 1.1088978052139282, + "learning_rate": 5e-06, + "loss": 0.0488, + "step": 5592 + }, + { + "epoch": 1.0694072657743785, + "grad_norm": 1.0848307609558105, + "learning_rate": 5e-06, + "loss": 0.0353, + "step": 5593 + }, + { + "epoch": 1.0695984703632888, + "grad_norm": 1.9532465934753418, + "learning_rate": 5e-06, + "loss": 0.2058, + "step": 5594 + }, + { + "epoch": 1.069789674952199, + "grad_norm": 2.134166955947876, + "learning_rate": 5e-06, + "loss": 0.1638, + "step": 5595 + }, + { + "epoch": 1.069980879541109, + "grad_norm": 0.9889646172523499, + "learning_rate": 5e-06, + "loss": 0.076, + "step": 5596 + }, + { + "epoch": 1.070172084130019, + "grad_norm": 4.0119452476501465, + "learning_rate": 5e-06, + "loss": 0.2381, + "step": 5597 + }, + { + "epoch": 1.0703632887189292, + "grad_norm": 1.1719545125961304, + "learning_rate": 5e-06, + "loss": 0.0532, + "step": 5598 + }, + { + "epoch": 1.0705544933078395, + "grad_norm": 0.7966266870498657, + "learning_rate": 5e-06, + "loss": 0.0488, + "step": 5599 + }, + { + "epoch": 1.0707456978967496, + "grad_norm": 2.092799186706543, + "learning_rate": 5e-06, + "loss": 0.2781, + "step": 5600 + }, + { + "epoch": 1.0709369024856596, + "grad_norm": 1.770919680595398, + "learning_rate": 5e-06, + "loss": 0.2382, + "step": 5601 + }, + { + "epoch": 1.0711281070745697, + "grad_norm": 1.2337734699249268, + "learning_rate": 5e-06, + "loss": 0.1008, + "step": 5602 + }, + { + "epoch": 1.0713193116634798, + "grad_norm": 1.2285529375076294, + "learning_rate": 5e-06, + "loss": 0.0923, + "step": 5603 + }, + { + "epoch": 1.0715105162523901, + "grad_norm": 0.8008958101272583, + "learning_rate": 5e-06, + "loss": 0.0347, + "step": 5604 + }, + { + "epoch": 1.0717017208413002, + "grad_norm": 1.2167035341262817, + "learning_rate": 5e-06, + "loss": 0.0495, + "step": 5605 + }, + { + "epoch": 1.0718929254302103, + "grad_norm": 2.4168663024902344, + "learning_rate": 5e-06, + "loss": 0.3622, + "step": 5606 + }, + { + "epoch": 1.0720841300191204, + "grad_norm": 2.050773859024048, + "learning_rate": 5e-06, + "loss": 0.3046, + "step": 5607 + }, + { + "epoch": 1.0722753346080305, + "grad_norm": 2.666419267654419, + "learning_rate": 5e-06, + "loss": 0.1967, + "step": 5608 + }, + { + "epoch": 1.0724665391969408, + "grad_norm": 0.6062523722648621, + "learning_rate": 5e-06, + "loss": 0.0457, + "step": 5609 + }, + { + "epoch": 1.0726577437858509, + "grad_norm": 1.7348066568374634, + "learning_rate": 5e-06, + "loss": 0.0553, + "step": 5610 + }, + { + "epoch": 1.072848948374761, + "grad_norm": 2.2407853603363037, + "learning_rate": 5e-06, + "loss": 0.0718, + "step": 5611 + }, + { + "epoch": 1.073040152963671, + "grad_norm": 2.270958662033081, + "learning_rate": 5e-06, + "loss": 0.1383, + "step": 5612 + }, + { + "epoch": 1.0732313575525811, + "grad_norm": 2.6858391761779785, + "learning_rate": 5e-06, + "loss": 0.3042, + "step": 5613 + }, + { + "epoch": 1.0734225621414915, + "grad_norm": 2.1360089778900146, + "learning_rate": 5e-06, + "loss": 0.2963, + "step": 5614 + }, + { + "epoch": 1.0736137667304015, + "grad_norm": 1.7626913785934448, + "learning_rate": 5e-06, + "loss": 0.1656, + "step": 5615 + }, + { + "epoch": 1.0738049713193116, + "grad_norm": 0.7246912717819214, + "learning_rate": 5e-06, + "loss": 0.0466, + "step": 5616 + }, + { + "epoch": 1.0739961759082217, + "grad_norm": 1.5797908306121826, + "learning_rate": 5e-06, + "loss": 0.0513, + "step": 5617 + }, + { + "epoch": 1.074187380497132, + "grad_norm": 1.0937782526016235, + "learning_rate": 5e-06, + "loss": 0.066, + "step": 5618 + }, + { + "epoch": 1.0743785850860421, + "grad_norm": 1.6355293989181519, + "learning_rate": 5e-06, + "loss": 0.2058, + "step": 5619 + }, + { + "epoch": 1.0745697896749522, + "grad_norm": 1.8476656675338745, + "learning_rate": 5e-06, + "loss": 0.2098, + "step": 5620 + }, + { + "epoch": 1.0747609942638623, + "grad_norm": 1.877700686454773, + "learning_rate": 5e-06, + "loss": 0.0347, + "step": 5621 + }, + { + "epoch": 1.0749521988527724, + "grad_norm": 1.5795397758483887, + "learning_rate": 5e-06, + "loss": 0.092, + "step": 5622 + }, + { + "epoch": 1.0751434034416827, + "grad_norm": 1.5501025915145874, + "learning_rate": 5e-06, + "loss": 0.1344, + "step": 5623 + }, + { + "epoch": 1.0753346080305928, + "grad_norm": 1.7347816228866577, + "learning_rate": 5e-06, + "loss": 0.0974, + "step": 5624 + }, + { + "epoch": 1.0755258126195029, + "grad_norm": 2.559631109237671, + "learning_rate": 5e-06, + "loss": 0.3785, + "step": 5625 + }, + { + "epoch": 1.075717017208413, + "grad_norm": 1.182547688484192, + "learning_rate": 5e-06, + "loss": 0.0788, + "step": 5626 + }, + { + "epoch": 1.075908221797323, + "grad_norm": 1.3121072053909302, + "learning_rate": 5e-06, + "loss": 0.0665, + "step": 5627 + }, + { + "epoch": 1.0760994263862333, + "grad_norm": 1.6444857120513916, + "learning_rate": 5e-06, + "loss": 0.0654, + "step": 5628 + }, + { + "epoch": 1.0762906309751434, + "grad_norm": 1.5076884031295776, + "learning_rate": 5e-06, + "loss": 0.0763, + "step": 5629 + }, + { + "epoch": 1.0764818355640535, + "grad_norm": 1.1843534708023071, + "learning_rate": 5e-06, + "loss": 0.037, + "step": 5630 + }, + { + "epoch": 1.0766730401529636, + "grad_norm": 1.5834217071533203, + "learning_rate": 5e-06, + "loss": 0.1003, + "step": 5631 + }, + { + "epoch": 1.076864244741874, + "grad_norm": 2.579793691635132, + "learning_rate": 5e-06, + "loss": 0.2568, + "step": 5632 + }, + { + "epoch": 1.077055449330784, + "grad_norm": 1.1158941984176636, + "learning_rate": 5e-06, + "loss": 0.0999, + "step": 5633 + }, + { + "epoch": 1.077246653919694, + "grad_norm": 1.8821864128112793, + "learning_rate": 5e-06, + "loss": 0.1314, + "step": 5634 + }, + { + "epoch": 1.0774378585086042, + "grad_norm": 1.1883410215377808, + "learning_rate": 5e-06, + "loss": 0.0513, + "step": 5635 + }, + { + "epoch": 1.0776290630975143, + "grad_norm": 2.1233952045440674, + "learning_rate": 5e-06, + "loss": 0.0766, + "step": 5636 + }, + { + "epoch": 1.0778202676864246, + "grad_norm": 2.0192222595214844, + "learning_rate": 5e-06, + "loss": 0.2886, + "step": 5637 + }, + { + "epoch": 1.0780114722753347, + "grad_norm": 1.116824746131897, + "learning_rate": 5e-06, + "loss": 0.081, + "step": 5638 + }, + { + "epoch": 1.0782026768642448, + "grad_norm": 1.6397048234939575, + "learning_rate": 5e-06, + "loss": 0.0968, + "step": 5639 + }, + { + "epoch": 1.0783938814531548, + "grad_norm": 0.804574191570282, + "learning_rate": 5e-06, + "loss": 0.0296, + "step": 5640 + }, + { + "epoch": 1.078585086042065, + "grad_norm": 0.6931762099266052, + "learning_rate": 5e-06, + "loss": 0.0159, + "step": 5641 + }, + { + "epoch": 1.0787762906309752, + "grad_norm": 2.571634531021118, + "learning_rate": 5e-06, + "loss": 0.1144, + "step": 5642 + }, + { + "epoch": 1.0789674952198853, + "grad_norm": 1.3164654970169067, + "learning_rate": 5e-06, + "loss": 0.0838, + "step": 5643 + }, + { + "epoch": 1.0791586998087954, + "grad_norm": 1.9440586566925049, + "learning_rate": 5e-06, + "loss": 0.2427, + "step": 5644 + }, + { + "epoch": 1.0793499043977055, + "grad_norm": 1.7131543159484863, + "learning_rate": 5e-06, + "loss": 0.1543, + "step": 5645 + }, + { + "epoch": 1.0795411089866156, + "grad_norm": 0.7819735407829285, + "learning_rate": 5e-06, + "loss": 0.062, + "step": 5646 + }, + { + "epoch": 1.079732313575526, + "grad_norm": 1.6629984378814697, + "learning_rate": 5e-06, + "loss": 0.0666, + "step": 5647 + }, + { + "epoch": 1.079923518164436, + "grad_norm": 1.143560528755188, + "learning_rate": 5e-06, + "loss": 0.0335, + "step": 5648 + }, + { + "epoch": 1.080114722753346, + "grad_norm": 1.9708220958709717, + "learning_rate": 5e-06, + "loss": 0.1641, + "step": 5649 + }, + { + "epoch": 1.0803059273422562, + "grad_norm": 1.5629487037658691, + "learning_rate": 5e-06, + "loss": 0.2152, + "step": 5650 + }, + { + "epoch": 1.0804971319311663, + "grad_norm": 1.2981854677200317, + "learning_rate": 5e-06, + "loss": 0.0818, + "step": 5651 + }, + { + "epoch": 1.0806883365200766, + "grad_norm": 2.244119644165039, + "learning_rate": 5e-06, + "loss": 0.2363, + "step": 5652 + }, + { + "epoch": 1.0808795411089867, + "grad_norm": 0.7663077116012573, + "learning_rate": 5e-06, + "loss": 0.0666, + "step": 5653 + }, + { + "epoch": 1.0810707456978967, + "grad_norm": 0.9932133555412292, + "learning_rate": 5e-06, + "loss": 0.0601, + "step": 5654 + }, + { + "epoch": 1.0812619502868068, + "grad_norm": 0.8187389969825745, + "learning_rate": 5e-06, + "loss": 0.0233, + "step": 5655 + }, + { + "epoch": 1.081453154875717, + "grad_norm": 1.5282580852508545, + "learning_rate": 5e-06, + "loss": 0.1785, + "step": 5656 + }, + { + "epoch": 1.0816443594646272, + "grad_norm": 1.7210948467254639, + "learning_rate": 5e-06, + "loss": 0.1786, + "step": 5657 + }, + { + "epoch": 1.0818355640535373, + "grad_norm": 0.6434665322303772, + "learning_rate": 5e-06, + "loss": 0.0408, + "step": 5658 + }, + { + "epoch": 1.0820267686424474, + "grad_norm": 1.558206558227539, + "learning_rate": 5e-06, + "loss": 0.1026, + "step": 5659 + }, + { + "epoch": 1.0822179732313575, + "grad_norm": 1.6600244045257568, + "learning_rate": 5e-06, + "loss": 0.0921, + "step": 5660 + }, + { + "epoch": 1.0824091778202676, + "grad_norm": 1.1153446435928345, + "learning_rate": 5e-06, + "loss": 0.024, + "step": 5661 + }, + { + "epoch": 1.0826003824091779, + "grad_norm": 2.0310747623443604, + "learning_rate": 5e-06, + "loss": 0.1647, + "step": 5662 + }, + { + "epoch": 1.082791586998088, + "grad_norm": 1.8215316534042358, + "learning_rate": 5e-06, + "loss": 0.1737, + "step": 5663 + }, + { + "epoch": 1.082982791586998, + "grad_norm": 1.404730200767517, + "learning_rate": 5e-06, + "loss": 0.0599, + "step": 5664 + }, + { + "epoch": 1.0831739961759081, + "grad_norm": 5.3708176612854, + "learning_rate": 5e-06, + "loss": 0.0709, + "step": 5665 + }, + { + "epoch": 1.0833652007648185, + "grad_norm": 2.5323567390441895, + "learning_rate": 5e-06, + "loss": 0.1791, + "step": 5666 + }, + { + "epoch": 1.0835564053537285, + "grad_norm": 0.777966320514679, + "learning_rate": 5e-06, + "loss": 0.0265, + "step": 5667 + }, + { + "epoch": 1.0837476099426386, + "grad_norm": 1.7438549995422363, + "learning_rate": 5e-06, + "loss": 0.0811, + "step": 5668 + }, + { + "epoch": 1.0839388145315487, + "grad_norm": 1.896105408668518, + "learning_rate": 5e-06, + "loss": 0.0971, + "step": 5669 + }, + { + "epoch": 1.0841300191204588, + "grad_norm": 1.2311056852340698, + "learning_rate": 5e-06, + "loss": 0.0739, + "step": 5670 + }, + { + "epoch": 1.0843212237093691, + "grad_norm": 1.2870060205459595, + "learning_rate": 5e-06, + "loss": 0.0791, + "step": 5671 + }, + { + "epoch": 1.0845124282982792, + "grad_norm": 1.5939801931381226, + "learning_rate": 5e-06, + "loss": 0.035, + "step": 5672 + }, + { + "epoch": 1.0847036328871893, + "grad_norm": 1.4833265542984009, + "learning_rate": 5e-06, + "loss": 0.0654, + "step": 5673 + }, + { + "epoch": 1.0848948374760994, + "grad_norm": 2.980525255203247, + "learning_rate": 5e-06, + "loss": 0.3089, + "step": 5674 + }, + { + "epoch": 1.0850860420650095, + "grad_norm": 2.04789137840271, + "learning_rate": 5e-06, + "loss": 0.1638, + "step": 5675 + }, + { + "epoch": 1.0852772466539198, + "grad_norm": 1.1447137594223022, + "learning_rate": 5e-06, + "loss": 0.1109, + "step": 5676 + }, + { + "epoch": 1.0854684512428299, + "grad_norm": 2.3989462852478027, + "learning_rate": 5e-06, + "loss": 0.2504, + "step": 5677 + }, + { + "epoch": 1.08565965583174, + "grad_norm": 1.2400035858154297, + "learning_rate": 5e-06, + "loss": 0.0553, + "step": 5678 + }, + { + "epoch": 1.08585086042065, + "grad_norm": 0.6254027485847473, + "learning_rate": 5e-06, + "loss": 0.0353, + "step": 5679 + }, + { + "epoch": 1.0860420650095601, + "grad_norm": 0.8050343990325928, + "learning_rate": 5e-06, + "loss": 0.0261, + "step": 5680 + }, + { + "epoch": 1.0862332695984704, + "grad_norm": 2.053995132446289, + "learning_rate": 5e-06, + "loss": 0.3031, + "step": 5681 + }, + { + "epoch": 1.0864244741873805, + "grad_norm": 2.055222272872925, + "learning_rate": 5e-06, + "loss": 0.0618, + "step": 5682 + }, + { + "epoch": 1.0866156787762906, + "grad_norm": 1.6199257373809814, + "learning_rate": 5e-06, + "loss": 0.1672, + "step": 5683 + }, + { + "epoch": 1.0868068833652007, + "grad_norm": 1.113048791885376, + "learning_rate": 5e-06, + "loss": 0.0616, + "step": 5684 + }, + { + "epoch": 1.086998087954111, + "grad_norm": 0.8362628817558289, + "learning_rate": 5e-06, + "loss": 0.0341, + "step": 5685 + }, + { + "epoch": 1.087189292543021, + "grad_norm": 5.398261070251465, + "learning_rate": 5e-06, + "loss": 0.1019, + "step": 5686 + }, + { + "epoch": 1.0873804971319312, + "grad_norm": 1.4154852628707886, + "learning_rate": 5e-06, + "loss": 0.0897, + "step": 5687 + }, + { + "epoch": 1.0875717017208413, + "grad_norm": 1.3424711227416992, + "learning_rate": 5e-06, + "loss": 0.1103, + "step": 5688 + }, + { + "epoch": 1.0877629063097514, + "grad_norm": 1.525865912437439, + "learning_rate": 5e-06, + "loss": 0.0945, + "step": 5689 + }, + { + "epoch": 1.0879541108986617, + "grad_norm": 1.0463107824325562, + "learning_rate": 5e-06, + "loss": 0.0452, + "step": 5690 + }, + { + "epoch": 1.0881453154875718, + "grad_norm": 2.542323589324951, + "learning_rate": 5e-06, + "loss": 0.2381, + "step": 5691 + }, + { + "epoch": 1.0883365200764819, + "grad_norm": 1.3651113510131836, + "learning_rate": 5e-06, + "loss": 0.0745, + "step": 5692 + }, + { + "epoch": 1.088527724665392, + "grad_norm": 1.5302486419677734, + "learning_rate": 5e-06, + "loss": 0.1264, + "step": 5693 + }, + { + "epoch": 1.088718929254302, + "grad_norm": 1.7708359956741333, + "learning_rate": 5e-06, + "loss": 0.1359, + "step": 5694 + }, + { + "epoch": 1.0889101338432123, + "grad_norm": 1.8701834678649902, + "learning_rate": 5e-06, + "loss": 0.2137, + "step": 5695 + }, + { + "epoch": 1.0891013384321224, + "grad_norm": 2.5269713401794434, + "learning_rate": 5e-06, + "loss": 0.0765, + "step": 5696 + }, + { + "epoch": 1.0892925430210325, + "grad_norm": 1.7278193235397339, + "learning_rate": 5e-06, + "loss": 0.0885, + "step": 5697 + }, + { + "epoch": 1.0894837476099426, + "grad_norm": 1.453526496887207, + "learning_rate": 5e-06, + "loss": 0.0511, + "step": 5698 + }, + { + "epoch": 1.0896749521988527, + "grad_norm": 5.5870819091796875, + "learning_rate": 5e-06, + "loss": 0.2771, + "step": 5699 + }, + { + "epoch": 1.089866156787763, + "grad_norm": 1.4349433183670044, + "learning_rate": 5e-06, + "loss": 0.1705, + "step": 5700 + }, + { + "epoch": 1.090057361376673, + "grad_norm": 1.717117190361023, + "learning_rate": 5e-06, + "loss": 0.1773, + "step": 5701 + }, + { + "epoch": 1.0902485659655832, + "grad_norm": 1.0715373754501343, + "learning_rate": 5e-06, + "loss": 0.0745, + "step": 5702 + }, + { + "epoch": 1.0904397705544933, + "grad_norm": 1.4231574535369873, + "learning_rate": 5e-06, + "loss": 0.0805, + "step": 5703 + }, + { + "epoch": 1.0906309751434033, + "grad_norm": 2.196079730987549, + "learning_rate": 5e-06, + "loss": 0.1501, + "step": 5704 + }, + { + "epoch": 1.0908221797323137, + "grad_norm": 1.016573429107666, + "learning_rate": 5e-06, + "loss": 0.0582, + "step": 5705 + }, + { + "epoch": 1.0910133843212237, + "grad_norm": 1.7154649496078491, + "learning_rate": 5e-06, + "loss": 0.2169, + "step": 5706 + }, + { + "epoch": 1.0912045889101338, + "grad_norm": 1.9732598066329956, + "learning_rate": 5e-06, + "loss": 0.1264, + "step": 5707 + }, + { + "epoch": 1.091395793499044, + "grad_norm": 1.975980281829834, + "learning_rate": 5e-06, + "loss": 0.1932, + "step": 5708 + }, + { + "epoch": 1.091586998087954, + "grad_norm": 1.5534021854400635, + "learning_rate": 5e-06, + "loss": 0.0715, + "step": 5709 + }, + { + "epoch": 1.0917782026768643, + "grad_norm": 1.1966547966003418, + "learning_rate": 5e-06, + "loss": 0.0663, + "step": 5710 + }, + { + "epoch": 1.0919694072657744, + "grad_norm": 1.5349960327148438, + "learning_rate": 5e-06, + "loss": 0.0343, + "step": 5711 + }, + { + "epoch": 1.0921606118546845, + "grad_norm": 1.9171141386032104, + "learning_rate": 5e-06, + "loss": 0.0983, + "step": 5712 + }, + { + "epoch": 1.0923518164435946, + "grad_norm": 2.5431854724884033, + "learning_rate": 5e-06, + "loss": 0.4502, + "step": 5713 + }, + { + "epoch": 1.0925430210325047, + "grad_norm": 1.5429447889328003, + "learning_rate": 5e-06, + "loss": 0.1223, + "step": 5714 + }, + { + "epoch": 1.092734225621415, + "grad_norm": 2.018566131591797, + "learning_rate": 5e-06, + "loss": 0.2262, + "step": 5715 + }, + { + "epoch": 1.092925430210325, + "grad_norm": 1.1679967641830444, + "learning_rate": 5e-06, + "loss": 0.0731, + "step": 5716 + }, + { + "epoch": 1.0931166347992352, + "grad_norm": 1.0205624103546143, + "learning_rate": 5e-06, + "loss": 0.0543, + "step": 5717 + }, + { + "epoch": 1.0933078393881452, + "grad_norm": 1.0302752256393433, + "learning_rate": 5e-06, + "loss": 0.0479, + "step": 5718 + }, + { + "epoch": 1.0934990439770556, + "grad_norm": 2.270318031311035, + "learning_rate": 5e-06, + "loss": 0.251, + "step": 5719 + }, + { + "epoch": 1.0936902485659656, + "grad_norm": 1.8095113039016724, + "learning_rate": 5e-06, + "loss": 0.16, + "step": 5720 + }, + { + "epoch": 1.0938814531548757, + "grad_norm": 0.7488591074943542, + "learning_rate": 5e-06, + "loss": 0.0514, + "step": 5721 + }, + { + "epoch": 1.0940726577437858, + "grad_norm": 1.4656505584716797, + "learning_rate": 5e-06, + "loss": 0.0985, + "step": 5722 + }, + { + "epoch": 1.094263862332696, + "grad_norm": 0.7714589238166809, + "learning_rate": 5e-06, + "loss": 0.0423, + "step": 5723 + }, + { + "epoch": 1.0944550669216062, + "grad_norm": 2.106182813644409, + "learning_rate": 5e-06, + "loss": 0.2728, + "step": 5724 + }, + { + "epoch": 1.0946462715105163, + "grad_norm": 2.305906057357788, + "learning_rate": 5e-06, + "loss": 0.2627, + "step": 5725 + }, + { + "epoch": 1.0948374760994264, + "grad_norm": 0.6955013275146484, + "learning_rate": 5e-06, + "loss": 0.0651, + "step": 5726 + }, + { + "epoch": 1.0950286806883365, + "grad_norm": 1.7944916486740112, + "learning_rate": 5e-06, + "loss": 0.173, + "step": 5727 + }, + { + "epoch": 1.0952198852772466, + "grad_norm": 2.637842893600464, + "learning_rate": 5e-06, + "loss": 0.1017, + "step": 5728 + }, + { + "epoch": 1.0954110898661569, + "grad_norm": 0.9952343106269836, + "learning_rate": 5e-06, + "loss": 0.0618, + "step": 5729 + }, + { + "epoch": 1.095602294455067, + "grad_norm": 1.2145094871520996, + "learning_rate": 5e-06, + "loss": 0.0363, + "step": 5730 + }, + { + "epoch": 1.095793499043977, + "grad_norm": 2.777398109436035, + "learning_rate": 5e-06, + "loss": 0.1873, + "step": 5731 + }, + { + "epoch": 1.0959847036328871, + "grad_norm": 0.9324513673782349, + "learning_rate": 5e-06, + "loss": 0.069, + "step": 5732 + }, + { + "epoch": 1.0961759082217972, + "grad_norm": 1.6317458152770996, + "learning_rate": 5e-06, + "loss": 0.1003, + "step": 5733 + }, + { + "epoch": 1.0963671128107075, + "grad_norm": 1.5389405488967896, + "learning_rate": 5e-06, + "loss": 0.0722, + "step": 5734 + }, + { + "epoch": 1.0965583173996176, + "grad_norm": 3.062220335006714, + "learning_rate": 5e-06, + "loss": 0.1294, + "step": 5735 + }, + { + "epoch": 1.0967495219885277, + "grad_norm": 1.457374930381775, + "learning_rate": 5e-06, + "loss": 0.0802, + "step": 5736 + }, + { + "epoch": 1.0969407265774378, + "grad_norm": 2.036764144897461, + "learning_rate": 5e-06, + "loss": 0.1513, + "step": 5737 + }, + { + "epoch": 1.097131931166348, + "grad_norm": 2.1664798259735107, + "learning_rate": 5e-06, + "loss": 0.2764, + "step": 5738 + }, + { + "epoch": 1.0973231357552582, + "grad_norm": 1.5670701265335083, + "learning_rate": 5e-06, + "loss": 0.0703, + "step": 5739 + }, + { + "epoch": 1.0975143403441683, + "grad_norm": 0.9358064532279968, + "learning_rate": 5e-06, + "loss": 0.0345, + "step": 5740 + }, + { + "epoch": 1.0977055449330784, + "grad_norm": 1.4006162881851196, + "learning_rate": 5e-06, + "loss": 0.0829, + "step": 5741 + }, + { + "epoch": 1.0978967495219885, + "grad_norm": 1.403589129447937, + "learning_rate": 5e-06, + "loss": 0.061, + "step": 5742 + }, + { + "epoch": 1.0980879541108988, + "grad_norm": 1.3776750564575195, + "learning_rate": 5e-06, + "loss": 0.0641, + "step": 5743 + }, + { + "epoch": 1.0982791586998089, + "grad_norm": 1.898293375968933, + "learning_rate": 5e-06, + "loss": 0.1924, + "step": 5744 + }, + { + "epoch": 1.098470363288719, + "grad_norm": 1.39720618724823, + "learning_rate": 5e-06, + "loss": 0.0802, + "step": 5745 + }, + { + "epoch": 1.098661567877629, + "grad_norm": 1.3132373094558716, + "learning_rate": 5e-06, + "loss": 0.063, + "step": 5746 + }, + { + "epoch": 1.0988527724665391, + "grad_norm": 1.5557122230529785, + "learning_rate": 5e-06, + "loss": 0.175, + "step": 5747 + }, + { + "epoch": 1.0990439770554494, + "grad_norm": 2.059504508972168, + "learning_rate": 5e-06, + "loss": 0.1491, + "step": 5748 + }, + { + "epoch": 1.0992351816443595, + "grad_norm": 1.1423454284667969, + "learning_rate": 5e-06, + "loss": 0.0325, + "step": 5749 + }, + { + "epoch": 1.0994263862332696, + "grad_norm": 2.8969271183013916, + "learning_rate": 5e-06, + "loss": 0.4175, + "step": 5750 + }, + { + "epoch": 1.0996175908221797, + "grad_norm": 1.9927480220794678, + "learning_rate": 5e-06, + "loss": 0.0938, + "step": 5751 + }, + { + "epoch": 1.0998087954110898, + "grad_norm": 1.3723220825195312, + "learning_rate": 5e-06, + "loss": 0.1006, + "step": 5752 + }, + { + "epoch": 1.1, + "grad_norm": 0.7870752811431885, + "learning_rate": 5e-06, + "loss": 0.0486, + "step": 5753 + }, + { + "epoch": 1.1001912045889102, + "grad_norm": 0.7222342491149902, + "learning_rate": 5e-06, + "loss": 0.0125, + "step": 5754 + }, + { + "epoch": 1.1003824091778203, + "grad_norm": 1.8501266241073608, + "learning_rate": 5e-06, + "loss": 0.0527, + "step": 5755 + }, + { + "epoch": 1.1005736137667304, + "grad_norm": 2.527247905731201, + "learning_rate": 5e-06, + "loss": 0.2819, + "step": 5756 + }, + { + "epoch": 1.1007648183556404, + "grad_norm": 2.091769218444824, + "learning_rate": 5e-06, + "loss": 0.2612, + "step": 5757 + }, + { + "epoch": 1.1009560229445507, + "grad_norm": 1.6816740036010742, + "learning_rate": 5e-06, + "loss": 0.1683, + "step": 5758 + }, + { + "epoch": 1.1011472275334608, + "grad_norm": 1.4403516054153442, + "learning_rate": 5e-06, + "loss": 0.0814, + "step": 5759 + }, + { + "epoch": 1.101338432122371, + "grad_norm": 0.9857239723205566, + "learning_rate": 5e-06, + "loss": 0.0488, + "step": 5760 + }, + { + "epoch": 1.101529636711281, + "grad_norm": 2.1389448642730713, + "learning_rate": 5e-06, + "loss": 0.0507, + "step": 5761 + }, + { + "epoch": 1.101720841300191, + "grad_norm": 1.8587839603424072, + "learning_rate": 5e-06, + "loss": 0.1574, + "step": 5762 + }, + { + "epoch": 1.1019120458891014, + "grad_norm": 1.851593017578125, + "learning_rate": 5e-06, + "loss": 0.1472, + "step": 5763 + }, + { + "epoch": 1.1021032504780115, + "grad_norm": 1.861345648765564, + "learning_rate": 5e-06, + "loss": 0.1222, + "step": 5764 + }, + { + "epoch": 1.1022944550669216, + "grad_norm": 1.249029278755188, + "learning_rate": 5e-06, + "loss": 0.1045, + "step": 5765 + }, + { + "epoch": 1.1024856596558317, + "grad_norm": 2.417471408843994, + "learning_rate": 5e-06, + "loss": 0.2049, + "step": 5766 + }, + { + "epoch": 1.1026768642447418, + "grad_norm": 1.392234206199646, + "learning_rate": 5e-06, + "loss": 0.0759, + "step": 5767 + }, + { + "epoch": 1.102868068833652, + "grad_norm": 1.4488508701324463, + "learning_rate": 5e-06, + "loss": 0.0945, + "step": 5768 + }, + { + "epoch": 1.1030592734225622, + "grad_norm": 1.731885552406311, + "learning_rate": 5e-06, + "loss": 0.1366, + "step": 5769 + }, + { + "epoch": 1.1032504780114722, + "grad_norm": 1.2528032064437866, + "learning_rate": 5e-06, + "loss": 0.0849, + "step": 5770 + }, + { + "epoch": 1.1034416826003823, + "grad_norm": 1.366706132888794, + "learning_rate": 5e-06, + "loss": 0.1138, + "step": 5771 + }, + { + "epoch": 1.1036328871892926, + "grad_norm": 1.9706212282180786, + "learning_rate": 5e-06, + "loss": 0.1266, + "step": 5772 + }, + { + "epoch": 1.1038240917782027, + "grad_norm": 6.170831680297852, + "learning_rate": 5e-06, + "loss": 0.1795, + "step": 5773 + }, + { + "epoch": 1.1040152963671128, + "grad_norm": 1.9874886274337769, + "learning_rate": 5e-06, + "loss": 0.1312, + "step": 5774 + }, + { + "epoch": 1.104206500956023, + "grad_norm": 2.058870553970337, + "learning_rate": 5e-06, + "loss": 0.2049, + "step": 5775 + }, + { + "epoch": 1.104397705544933, + "grad_norm": 1.8878477811813354, + "learning_rate": 5e-06, + "loss": 0.2064, + "step": 5776 + }, + { + "epoch": 1.1045889101338433, + "grad_norm": 0.7636244893074036, + "learning_rate": 5e-06, + "loss": 0.0408, + "step": 5777 + }, + { + "epoch": 1.1047801147227534, + "grad_norm": 1.490784764289856, + "learning_rate": 5e-06, + "loss": 0.1765, + "step": 5778 + }, + { + "epoch": 1.1049713193116635, + "grad_norm": 0.795039713382721, + "learning_rate": 5e-06, + "loss": 0.0424, + "step": 5779 + }, + { + "epoch": 1.1051625239005736, + "grad_norm": 1.5659568309783936, + "learning_rate": 5e-06, + "loss": 0.0788, + "step": 5780 + }, + { + "epoch": 1.1053537284894837, + "grad_norm": 0.8034716844558716, + "learning_rate": 5e-06, + "loss": 0.0485, + "step": 5781 + }, + { + "epoch": 1.105544933078394, + "grad_norm": 1.0298188924789429, + "learning_rate": 5e-06, + "loss": 0.0677, + "step": 5782 + }, + { + "epoch": 1.105736137667304, + "grad_norm": 1.8984624147415161, + "learning_rate": 5e-06, + "loss": 0.1402, + "step": 5783 + }, + { + "epoch": 1.1059273422562141, + "grad_norm": 1.9967960119247437, + "learning_rate": 5e-06, + "loss": 0.1297, + "step": 5784 + }, + { + "epoch": 1.1061185468451242, + "grad_norm": 1.45951247215271, + "learning_rate": 5e-06, + "loss": 0.0652, + "step": 5785 + }, + { + "epoch": 1.1063097514340343, + "grad_norm": 1.4071576595306396, + "learning_rate": 5e-06, + "loss": 0.0426, + "step": 5786 + }, + { + "epoch": 1.1065009560229446, + "grad_norm": 3.093616247177124, + "learning_rate": 5e-06, + "loss": 0.2499, + "step": 5787 + }, + { + "epoch": 1.1066921606118547, + "grad_norm": 1.2860229015350342, + "learning_rate": 5e-06, + "loss": 0.0862, + "step": 5788 + }, + { + "epoch": 1.1068833652007648, + "grad_norm": 1.9504213333129883, + "learning_rate": 5e-06, + "loss": 0.2004, + "step": 5789 + }, + { + "epoch": 1.107074569789675, + "grad_norm": 2.4501025676727295, + "learning_rate": 5e-06, + "loss": 0.1979, + "step": 5790 + }, + { + "epoch": 1.1072657743785852, + "grad_norm": 3.6061577796936035, + "learning_rate": 5e-06, + "loss": 0.1549, + "step": 5791 + }, + { + "epoch": 1.1074569789674953, + "grad_norm": 0.46892136335372925, + "learning_rate": 5e-06, + "loss": 0.0101, + "step": 5792 + }, + { + "epoch": 1.1076481835564054, + "grad_norm": 1.5113166570663452, + "learning_rate": 5e-06, + "loss": 0.0745, + "step": 5793 + }, + { + "epoch": 1.1078393881453155, + "grad_norm": 2.696958303451538, + "learning_rate": 5e-06, + "loss": 0.3514, + "step": 5794 + }, + { + "epoch": 1.1080305927342256, + "grad_norm": 2.0832767486572266, + "learning_rate": 5e-06, + "loss": 0.1931, + "step": 5795 + }, + { + "epoch": 1.1082217973231359, + "grad_norm": 1.0346436500549316, + "learning_rate": 5e-06, + "loss": 0.0442, + "step": 5796 + }, + { + "epoch": 1.108413001912046, + "grad_norm": 2.7505767345428467, + "learning_rate": 5e-06, + "loss": 0.134, + "step": 5797 + }, + { + "epoch": 1.108604206500956, + "grad_norm": 1.7536537647247314, + "learning_rate": 5e-06, + "loss": 0.1032, + "step": 5798 + }, + { + "epoch": 1.1087954110898661, + "grad_norm": 0.7006078362464905, + "learning_rate": 5e-06, + "loss": 0.0423, + "step": 5799 + }, + { + "epoch": 1.1089866156787762, + "grad_norm": 1.4535229206085205, + "learning_rate": 5e-06, + "loss": 0.0865, + "step": 5800 + }, + { + "epoch": 1.1091778202676865, + "grad_norm": 2.1496787071228027, + "learning_rate": 5e-06, + "loss": 0.2779, + "step": 5801 + }, + { + "epoch": 1.1093690248565966, + "grad_norm": 2.135427474975586, + "learning_rate": 5e-06, + "loss": 0.2009, + "step": 5802 + }, + { + "epoch": 1.1095602294455067, + "grad_norm": 2.1016106605529785, + "learning_rate": 5e-06, + "loss": 0.111, + "step": 5803 + }, + { + "epoch": 1.1097514340344168, + "grad_norm": 0.7748586535453796, + "learning_rate": 5e-06, + "loss": 0.064, + "step": 5804 + }, + { + "epoch": 1.1099426386233269, + "grad_norm": 1.6331055164337158, + "learning_rate": 5e-06, + "loss": 0.0694, + "step": 5805 + }, + { + "epoch": 1.1101338432122372, + "grad_norm": 1.2938159704208374, + "learning_rate": 5e-06, + "loss": 0.0924, + "step": 5806 + }, + { + "epoch": 1.1103250478011473, + "grad_norm": 1.352454662322998, + "learning_rate": 5e-06, + "loss": 0.074, + "step": 5807 + }, + { + "epoch": 1.1105162523900574, + "grad_norm": 1.8348606824874878, + "learning_rate": 5e-06, + "loss": 0.2089, + "step": 5808 + }, + { + "epoch": 1.1107074569789674, + "grad_norm": 1.0210434198379517, + "learning_rate": 5e-06, + "loss": 0.0382, + "step": 5809 + }, + { + "epoch": 1.1108986615678775, + "grad_norm": 2.106792449951172, + "learning_rate": 5e-06, + "loss": 0.1903, + "step": 5810 + }, + { + "epoch": 1.1110898661567878, + "grad_norm": 1.2394472360610962, + "learning_rate": 5e-06, + "loss": 0.0562, + "step": 5811 + }, + { + "epoch": 1.111281070745698, + "grad_norm": 1.395771861076355, + "learning_rate": 5e-06, + "loss": 0.1461, + "step": 5812 + }, + { + "epoch": 1.111472275334608, + "grad_norm": 3.2685039043426514, + "learning_rate": 5e-06, + "loss": 0.1339, + "step": 5813 + }, + { + "epoch": 1.111663479923518, + "grad_norm": 1.9594659805297852, + "learning_rate": 5e-06, + "loss": 0.2694, + "step": 5814 + }, + { + "epoch": 1.1118546845124282, + "grad_norm": 1.7652355432510376, + "learning_rate": 5e-06, + "loss": 0.0963, + "step": 5815 + }, + { + "epoch": 1.1120458891013385, + "grad_norm": 2.3200769424438477, + "learning_rate": 5e-06, + "loss": 0.2094, + "step": 5816 + }, + { + "epoch": 1.1122370936902486, + "grad_norm": 1.5320895910263062, + "learning_rate": 5e-06, + "loss": 0.0662, + "step": 5817 + }, + { + "epoch": 1.1124282982791587, + "grad_norm": 0.7863326668739319, + "learning_rate": 5e-06, + "loss": 0.044, + "step": 5818 + }, + { + "epoch": 1.1126195028680688, + "grad_norm": 1.5986088514328003, + "learning_rate": 5e-06, + "loss": 0.2212, + "step": 5819 + }, + { + "epoch": 1.1128107074569789, + "grad_norm": 0.8004074096679688, + "learning_rate": 5e-06, + "loss": 0.0729, + "step": 5820 + }, + { + "epoch": 1.1130019120458892, + "grad_norm": 1.8215408325195312, + "learning_rate": 5e-06, + "loss": 0.15, + "step": 5821 + }, + { + "epoch": 1.1131931166347993, + "grad_norm": 0.8063507080078125, + "learning_rate": 5e-06, + "loss": 0.0511, + "step": 5822 + }, + { + "epoch": 1.1133843212237093, + "grad_norm": 0.7384288311004639, + "learning_rate": 5e-06, + "loss": 0.0304, + "step": 5823 + }, + { + "epoch": 1.1135755258126194, + "grad_norm": 0.910090446472168, + "learning_rate": 5e-06, + "loss": 0.0327, + "step": 5824 + }, + { + "epoch": 1.1137667304015297, + "grad_norm": 2.1890387535095215, + "learning_rate": 5e-06, + "loss": 0.2288, + "step": 5825 + }, + { + "epoch": 1.1139579349904398, + "grad_norm": 0.9365002512931824, + "learning_rate": 5e-06, + "loss": 0.074, + "step": 5826 + }, + { + "epoch": 1.11414913957935, + "grad_norm": 1.6376757621765137, + "learning_rate": 5e-06, + "loss": 0.1314, + "step": 5827 + }, + { + "epoch": 1.11434034416826, + "grad_norm": 0.6365956664085388, + "learning_rate": 5e-06, + "loss": 0.0318, + "step": 5828 + }, + { + "epoch": 1.11453154875717, + "grad_norm": 2.4572699069976807, + "learning_rate": 5e-06, + "loss": 0.1096, + "step": 5829 + }, + { + "epoch": 1.1147227533460804, + "grad_norm": 0.6337371468544006, + "learning_rate": 5e-06, + "loss": 0.0204, + "step": 5830 + }, + { + "epoch": 1.1149139579349905, + "grad_norm": 1.9503843784332275, + "learning_rate": 5e-06, + "loss": 0.2206, + "step": 5831 + }, + { + "epoch": 1.1151051625239006, + "grad_norm": 2.674628973007202, + "learning_rate": 5e-06, + "loss": 0.2115, + "step": 5832 + }, + { + "epoch": 1.1152963671128107, + "grad_norm": 1.2188706398010254, + "learning_rate": 5e-06, + "loss": 0.0726, + "step": 5833 + }, + { + "epoch": 1.1154875717017207, + "grad_norm": 4.067685127258301, + "learning_rate": 5e-06, + "loss": 0.1759, + "step": 5834 + }, + { + "epoch": 1.115678776290631, + "grad_norm": 1.1010500192642212, + "learning_rate": 5e-06, + "loss": 0.0777, + "step": 5835 + }, + { + "epoch": 1.1158699808795411, + "grad_norm": 1.6503252983093262, + "learning_rate": 5e-06, + "loss": 0.101, + "step": 5836 + }, + { + "epoch": 1.1160611854684512, + "grad_norm": 1.9464383125305176, + "learning_rate": 5e-06, + "loss": 0.2607, + "step": 5837 + }, + { + "epoch": 1.1162523900573613, + "grad_norm": 2.022468090057373, + "learning_rate": 5e-06, + "loss": 0.2771, + "step": 5838 + }, + { + "epoch": 1.1164435946462714, + "grad_norm": 1.6697824001312256, + "learning_rate": 5e-06, + "loss": 0.1383, + "step": 5839 + }, + { + "epoch": 1.1166347992351817, + "grad_norm": 2.806473731994629, + "learning_rate": 5e-06, + "loss": 0.282, + "step": 5840 + }, + { + "epoch": 1.1168260038240918, + "grad_norm": 1.8859554529190063, + "learning_rate": 5e-06, + "loss": 0.1404, + "step": 5841 + }, + { + "epoch": 1.117017208413002, + "grad_norm": 2.372626543045044, + "learning_rate": 5e-06, + "loss": 0.1272, + "step": 5842 + }, + { + "epoch": 1.117208413001912, + "grad_norm": 1.7864800691604614, + "learning_rate": 5e-06, + "loss": 0.093, + "step": 5843 + }, + { + "epoch": 1.1173996175908223, + "grad_norm": 2.1923136711120605, + "learning_rate": 5e-06, + "loss": 0.175, + "step": 5844 + }, + { + "epoch": 1.1175908221797324, + "grad_norm": 1.0703550577163696, + "learning_rate": 5e-06, + "loss": 0.0707, + "step": 5845 + }, + { + "epoch": 1.1177820267686425, + "grad_norm": 0.8329720497131348, + "learning_rate": 5e-06, + "loss": 0.073, + "step": 5846 + }, + { + "epoch": 1.1179732313575526, + "grad_norm": 1.4551035165786743, + "learning_rate": 5e-06, + "loss": 0.0402, + "step": 5847 + }, + { + "epoch": 1.1181644359464626, + "grad_norm": 1.0404424667358398, + "learning_rate": 5e-06, + "loss": 0.0406, + "step": 5848 + }, + { + "epoch": 1.118355640535373, + "grad_norm": 1.1678119897842407, + "learning_rate": 5e-06, + "loss": 0.0683, + "step": 5849 + }, + { + "epoch": 1.118546845124283, + "grad_norm": 3.122912883758545, + "learning_rate": 5e-06, + "loss": 0.496, + "step": 5850 + }, + { + "epoch": 1.1187380497131931, + "grad_norm": 1.2659235000610352, + "learning_rate": 5e-06, + "loss": 0.0904, + "step": 5851 + }, + { + "epoch": 1.1189292543021032, + "grad_norm": 2.43969988822937, + "learning_rate": 5e-06, + "loss": 0.308, + "step": 5852 + }, + { + "epoch": 1.1191204588910133, + "grad_norm": 1.8691391944885254, + "learning_rate": 5e-06, + "loss": 0.1474, + "step": 5853 + }, + { + "epoch": 1.1193116634799236, + "grad_norm": 2.168302059173584, + "learning_rate": 5e-06, + "loss": 0.0874, + "step": 5854 + }, + { + "epoch": 1.1195028680688337, + "grad_norm": 1.7108036279678345, + "learning_rate": 5e-06, + "loss": 0.0426, + "step": 5855 + }, + { + "epoch": 1.1196940726577438, + "grad_norm": 2.807419538497925, + "learning_rate": 5e-06, + "loss": 0.3353, + "step": 5856 + }, + { + "epoch": 1.1198852772466539, + "grad_norm": 1.1391634941101074, + "learning_rate": 5e-06, + "loss": 0.0533, + "step": 5857 + }, + { + "epoch": 1.120076481835564, + "grad_norm": 0.9980553984642029, + "learning_rate": 5e-06, + "loss": 0.0557, + "step": 5858 + }, + { + "epoch": 1.1202676864244743, + "grad_norm": 1.4949246644973755, + "learning_rate": 5e-06, + "loss": 0.1786, + "step": 5859 + }, + { + "epoch": 1.1204588910133844, + "grad_norm": 0.8271896839141846, + "learning_rate": 5e-06, + "loss": 0.0383, + "step": 5860 + }, + { + "epoch": 1.1206500956022944, + "grad_norm": 1.2064502239227295, + "learning_rate": 5e-06, + "loss": 0.0586, + "step": 5861 + }, + { + "epoch": 1.1208413001912045, + "grad_norm": 1.5888252258300781, + "learning_rate": 5e-06, + "loss": 0.1418, + "step": 5862 + }, + { + "epoch": 1.1210325047801146, + "grad_norm": 1.441205382347107, + "learning_rate": 5e-06, + "loss": 0.1375, + "step": 5863 + }, + { + "epoch": 1.121223709369025, + "grad_norm": 1.0657639503479004, + "learning_rate": 5e-06, + "loss": 0.0368, + "step": 5864 + }, + { + "epoch": 1.121414913957935, + "grad_norm": 1.1127941608428955, + "learning_rate": 5e-06, + "loss": 0.0692, + "step": 5865 + }, + { + "epoch": 1.121606118546845, + "grad_norm": 1.5667933225631714, + "learning_rate": 5e-06, + "loss": 0.0617, + "step": 5866 + }, + { + "epoch": 1.1217973231357552, + "grad_norm": 1.4817070960998535, + "learning_rate": 5e-06, + "loss": 0.0875, + "step": 5867 + }, + { + "epoch": 1.1219885277246653, + "grad_norm": 2.7602312564849854, + "learning_rate": 5e-06, + "loss": 0.2847, + "step": 5868 + }, + { + "epoch": 1.1221797323135756, + "grad_norm": 2.3214738368988037, + "learning_rate": 5e-06, + "loss": 0.3105, + "step": 5869 + }, + { + "epoch": 1.1223709369024857, + "grad_norm": 1.2657291889190674, + "learning_rate": 5e-06, + "loss": 0.0898, + "step": 5870 + }, + { + "epoch": 1.1225621414913958, + "grad_norm": 1.5459098815917969, + "learning_rate": 5e-06, + "loss": 0.1353, + "step": 5871 + }, + { + "epoch": 1.1227533460803059, + "grad_norm": 1.9266481399536133, + "learning_rate": 5e-06, + "loss": 0.1243, + "step": 5872 + }, + { + "epoch": 1.122944550669216, + "grad_norm": 1.1586101055145264, + "learning_rate": 5e-06, + "loss": 0.0829, + "step": 5873 + }, + { + "epoch": 1.1231357552581263, + "grad_norm": 1.7313776016235352, + "learning_rate": 5e-06, + "loss": 0.0654, + "step": 5874 + }, + { + "epoch": 1.1233269598470363, + "grad_norm": 2.0912880897521973, + "learning_rate": 5e-06, + "loss": 0.1978, + "step": 5875 + }, + { + "epoch": 1.1235181644359464, + "grad_norm": 2.500584840774536, + "learning_rate": 5e-06, + "loss": 0.2266, + "step": 5876 + }, + { + "epoch": 1.1237093690248565, + "grad_norm": 1.7160296440124512, + "learning_rate": 5e-06, + "loss": 0.153, + "step": 5877 + }, + { + "epoch": 1.1239005736137668, + "grad_norm": 0.9689812660217285, + "learning_rate": 5e-06, + "loss": 0.0418, + "step": 5878 + }, + { + "epoch": 1.124091778202677, + "grad_norm": 1.4203402996063232, + "learning_rate": 5e-06, + "loss": 0.0807, + "step": 5879 + }, + { + "epoch": 1.124282982791587, + "grad_norm": 3.106449842453003, + "learning_rate": 5e-06, + "loss": 0.0981, + "step": 5880 + }, + { + "epoch": 1.124474187380497, + "grad_norm": 2.113834857940674, + "learning_rate": 5e-06, + "loss": 0.2381, + "step": 5881 + }, + { + "epoch": 1.1246653919694072, + "grad_norm": 1.5136511325836182, + "learning_rate": 5e-06, + "loss": 0.1507, + "step": 5882 + }, + { + "epoch": 1.1248565965583175, + "grad_norm": 2.4438841342926025, + "learning_rate": 5e-06, + "loss": 0.1835, + "step": 5883 + }, + { + "epoch": 1.1250478011472276, + "grad_norm": 1.4762717485427856, + "learning_rate": 5e-06, + "loss": 0.0685, + "step": 5884 + }, + { + "epoch": 1.1252390057361377, + "grad_norm": 0.3290434181690216, + "learning_rate": 5e-06, + "loss": 0.013, + "step": 5885 + }, + { + "epoch": 1.1254302103250478, + "grad_norm": 1.6333457231521606, + "learning_rate": 5e-06, + "loss": 0.048, + "step": 5886 + }, + { + "epoch": 1.1256214149139578, + "grad_norm": 2.1950032711029053, + "learning_rate": 5e-06, + "loss": 0.192, + "step": 5887 + }, + { + "epoch": 1.1258126195028682, + "grad_norm": 0.8317578434944153, + "learning_rate": 5e-06, + "loss": 0.0635, + "step": 5888 + }, + { + "epoch": 1.1260038240917782, + "grad_norm": 1.20002281665802, + "learning_rate": 5e-06, + "loss": 0.0849, + "step": 5889 + }, + { + "epoch": 1.1261950286806883, + "grad_norm": 1.6884593963623047, + "learning_rate": 5e-06, + "loss": 0.0486, + "step": 5890 + }, + { + "epoch": 1.1263862332695984, + "grad_norm": 2.1091883182525635, + "learning_rate": 5e-06, + "loss": 0.1591, + "step": 5891 + }, + { + "epoch": 1.1265774378585087, + "grad_norm": 1.4711389541625977, + "learning_rate": 5e-06, + "loss": 0.0747, + "step": 5892 + }, + { + "epoch": 1.1267686424474188, + "grad_norm": 2.467928409576416, + "learning_rate": 5e-06, + "loss": 0.3312, + "step": 5893 + }, + { + "epoch": 1.126959847036329, + "grad_norm": 1.4449323415756226, + "learning_rate": 5e-06, + "loss": 0.1376, + "step": 5894 + }, + { + "epoch": 1.127151051625239, + "grad_norm": 2.1099445819854736, + "learning_rate": 5e-06, + "loss": 0.1635, + "step": 5895 + }, + { + "epoch": 1.127342256214149, + "grad_norm": 2.1566996574401855, + "learning_rate": 5e-06, + "loss": 0.1596, + "step": 5896 + }, + { + "epoch": 1.1275334608030594, + "grad_norm": 1.25776207447052, + "learning_rate": 5e-06, + "loss": 0.0662, + "step": 5897 + }, + { + "epoch": 1.1277246653919695, + "grad_norm": 1.7440266609191895, + "learning_rate": 5e-06, + "loss": 0.1401, + "step": 5898 + }, + { + "epoch": 1.1279158699808796, + "grad_norm": 1.6530354022979736, + "learning_rate": 5e-06, + "loss": 0.0837, + "step": 5899 + }, + { + "epoch": 1.1281070745697896, + "grad_norm": 2.073582649230957, + "learning_rate": 5e-06, + "loss": 0.209, + "step": 5900 + }, + { + "epoch": 1.1282982791586997, + "grad_norm": 1.2013795375823975, + "learning_rate": 5e-06, + "loss": 0.1344, + "step": 5901 + }, + { + "epoch": 1.12848948374761, + "grad_norm": 0.7417505979537964, + "learning_rate": 5e-06, + "loss": 0.061, + "step": 5902 + }, + { + "epoch": 1.1286806883365201, + "grad_norm": 0.7994497418403625, + "learning_rate": 5e-06, + "loss": 0.0461, + "step": 5903 + }, + { + "epoch": 1.1288718929254302, + "grad_norm": 0.9240248203277588, + "learning_rate": 5e-06, + "loss": 0.0244, + "step": 5904 + }, + { + "epoch": 1.1290630975143403, + "grad_norm": 1.514775276184082, + "learning_rate": 5e-06, + "loss": 0.0728, + "step": 5905 + }, + { + "epoch": 1.1292543021032504, + "grad_norm": 2.3339710235595703, + "learning_rate": 5e-06, + "loss": 0.2318, + "step": 5906 + }, + { + "epoch": 1.1294455066921607, + "grad_norm": 1.739105463027954, + "learning_rate": 5e-06, + "loss": 0.1055, + "step": 5907 + }, + { + "epoch": 1.1296367112810708, + "grad_norm": 1.3210444450378418, + "learning_rate": 5e-06, + "loss": 0.1018, + "step": 5908 + }, + { + "epoch": 1.1298279158699809, + "grad_norm": 1.95649254322052, + "learning_rate": 5e-06, + "loss": 0.127, + "step": 5909 + }, + { + "epoch": 1.130019120458891, + "grad_norm": 1.2586040496826172, + "learning_rate": 5e-06, + "loss": 0.0602, + "step": 5910 + }, + { + "epoch": 1.130210325047801, + "grad_norm": 0.6684005260467529, + "learning_rate": 5e-06, + "loss": 0.0093, + "step": 5911 + }, + { + "epoch": 1.1304015296367114, + "grad_norm": 1.8613158464431763, + "learning_rate": 5e-06, + "loss": 0.166, + "step": 5912 + }, + { + "epoch": 1.1305927342256215, + "grad_norm": 1.138454794883728, + "learning_rate": 5e-06, + "loss": 0.0658, + "step": 5913 + }, + { + "epoch": 1.1307839388145315, + "grad_norm": 1.3337939977645874, + "learning_rate": 5e-06, + "loss": 0.1279, + "step": 5914 + }, + { + "epoch": 1.1309751434034416, + "grad_norm": 1.7277913093566895, + "learning_rate": 5e-06, + "loss": 0.0672, + "step": 5915 + }, + { + "epoch": 1.1311663479923517, + "grad_norm": 1.699109673500061, + "learning_rate": 5e-06, + "loss": 0.0633, + "step": 5916 + }, + { + "epoch": 1.131357552581262, + "grad_norm": 2.2800168991088867, + "learning_rate": 5e-06, + "loss": 0.1858, + "step": 5917 + }, + { + "epoch": 1.1315487571701721, + "grad_norm": 2.4426426887512207, + "learning_rate": 5e-06, + "loss": 0.1289, + "step": 5918 + }, + { + "epoch": 1.1317399617590822, + "grad_norm": 1.9887288808822632, + "learning_rate": 5e-06, + "loss": 0.2119, + "step": 5919 + }, + { + "epoch": 1.1319311663479923, + "grad_norm": 2.3811888694763184, + "learning_rate": 5e-06, + "loss": 0.2509, + "step": 5920 + }, + { + "epoch": 1.1321223709369024, + "grad_norm": 2.353497266769409, + "learning_rate": 5e-06, + "loss": 0.2354, + "step": 5921 + }, + { + "epoch": 1.1323135755258127, + "grad_norm": 0.9172454476356506, + "learning_rate": 5e-06, + "loss": 0.0662, + "step": 5922 + }, + { + "epoch": 1.1325047801147228, + "grad_norm": 2.1203384399414062, + "learning_rate": 5e-06, + "loss": 0.085, + "step": 5923 + }, + { + "epoch": 1.1326959847036329, + "grad_norm": 2.3185367584228516, + "learning_rate": 5e-06, + "loss": 0.0868, + "step": 5924 + }, + { + "epoch": 1.132887189292543, + "grad_norm": 2.7123560905456543, + "learning_rate": 5e-06, + "loss": 0.3535, + "step": 5925 + }, + { + "epoch": 1.133078393881453, + "grad_norm": 2.161722183227539, + "learning_rate": 5e-06, + "loss": 0.2252, + "step": 5926 + }, + { + "epoch": 1.1332695984703633, + "grad_norm": 1.7932413816452026, + "learning_rate": 5e-06, + "loss": 0.1072, + "step": 5927 + }, + { + "epoch": 1.1334608030592734, + "grad_norm": 1.5227441787719727, + "learning_rate": 5e-06, + "loss": 0.0524, + "step": 5928 + }, + { + "epoch": 1.1336520076481835, + "grad_norm": 1.1363903284072876, + "learning_rate": 5e-06, + "loss": 0.065, + "step": 5929 + }, + { + "epoch": 1.1338432122370936, + "grad_norm": 2.1632423400878906, + "learning_rate": 5e-06, + "loss": 0.0894, + "step": 5930 + }, + { + "epoch": 1.1340344168260037, + "grad_norm": 0.6506508588790894, + "learning_rate": 5e-06, + "loss": 0.0442, + "step": 5931 + }, + { + "epoch": 1.134225621414914, + "grad_norm": 3.3030409812927246, + "learning_rate": 5e-06, + "loss": 0.3152, + "step": 5932 + }, + { + "epoch": 1.134416826003824, + "grad_norm": 1.9436111450195312, + "learning_rate": 5e-06, + "loss": 0.0738, + "step": 5933 + }, + { + "epoch": 1.1346080305927342, + "grad_norm": 3.1385626792907715, + "learning_rate": 5e-06, + "loss": 0.3247, + "step": 5934 + }, + { + "epoch": 1.1347992351816443, + "grad_norm": 3.9077584743499756, + "learning_rate": 5e-06, + "loss": 0.0565, + "step": 5935 + }, + { + "epoch": 1.1349904397705546, + "grad_norm": 1.5972917079925537, + "learning_rate": 5e-06, + "loss": 0.0843, + "step": 5936 + }, + { + "epoch": 1.1351816443594647, + "grad_norm": 1.5931931734085083, + "learning_rate": 5e-06, + "loss": 0.1674, + "step": 5937 + }, + { + "epoch": 1.1353728489483748, + "grad_norm": 1.6486834287643433, + "learning_rate": 5e-06, + "loss": 0.1208, + "step": 5938 + }, + { + "epoch": 1.1355640535372848, + "grad_norm": 2.0563056468963623, + "learning_rate": 5e-06, + "loss": 0.2244, + "step": 5939 + }, + { + "epoch": 1.135755258126195, + "grad_norm": 1.0756323337554932, + "learning_rate": 5e-06, + "loss": 0.0842, + "step": 5940 + }, + { + "epoch": 1.1359464627151052, + "grad_norm": 1.615752935409546, + "learning_rate": 5e-06, + "loss": 0.1011, + "step": 5941 + }, + { + "epoch": 1.1361376673040153, + "grad_norm": 1.9002844095230103, + "learning_rate": 5e-06, + "loss": 0.0837, + "step": 5942 + }, + { + "epoch": 1.1363288718929254, + "grad_norm": 1.3055564165115356, + "learning_rate": 5e-06, + "loss": 0.0799, + "step": 5943 + }, + { + "epoch": 1.1365200764818355, + "grad_norm": 2.0098092555999756, + "learning_rate": 5e-06, + "loss": 0.2034, + "step": 5944 + }, + { + "epoch": 1.1367112810707458, + "grad_norm": 1.3458549976348877, + "learning_rate": 5e-06, + "loss": 0.157, + "step": 5945 + }, + { + "epoch": 1.136902485659656, + "grad_norm": 1.6178194284439087, + "learning_rate": 5e-06, + "loss": 0.1039, + "step": 5946 + }, + { + "epoch": 1.137093690248566, + "grad_norm": 2.4097132682800293, + "learning_rate": 5e-06, + "loss": 0.0646, + "step": 5947 + }, + { + "epoch": 1.137284894837476, + "grad_norm": 1.505638599395752, + "learning_rate": 5e-06, + "loss": 0.0537, + "step": 5948 + }, + { + "epoch": 1.1374760994263862, + "grad_norm": 1.2082053422927856, + "learning_rate": 5e-06, + "loss": 0.047, + "step": 5949 + }, + { + "epoch": 1.1376673040152965, + "grad_norm": 1.2429389953613281, + "learning_rate": 5e-06, + "loss": 0.0812, + "step": 5950 + }, + { + "epoch": 1.1378585086042066, + "grad_norm": 2.585109233856201, + "learning_rate": 5e-06, + "loss": 0.069, + "step": 5951 + }, + { + "epoch": 1.1380497131931167, + "grad_norm": 2.0243451595306396, + "learning_rate": 5e-06, + "loss": 0.1475, + "step": 5952 + }, + { + "epoch": 1.1382409177820267, + "grad_norm": 1.217773199081421, + "learning_rate": 5e-06, + "loss": 0.0684, + "step": 5953 + }, + { + "epoch": 1.1384321223709368, + "grad_norm": 0.9870648384094238, + "learning_rate": 5e-06, + "loss": 0.0429, + "step": 5954 + }, + { + "epoch": 1.1386233269598471, + "grad_norm": 1.2159374952316284, + "learning_rate": 5e-06, + "loss": 0.0458, + "step": 5955 + }, + { + "epoch": 1.1388145315487572, + "grad_norm": 1.7573827505111694, + "learning_rate": 5e-06, + "loss": 0.2002, + "step": 5956 + }, + { + "epoch": 1.1390057361376673, + "grad_norm": 1.0106160640716553, + "learning_rate": 5e-06, + "loss": 0.1065, + "step": 5957 + }, + { + "epoch": 1.1391969407265774, + "grad_norm": 1.294363260269165, + "learning_rate": 5e-06, + "loss": 0.0982, + "step": 5958 + }, + { + "epoch": 1.1393881453154875, + "grad_norm": 1.239507794380188, + "learning_rate": 5e-06, + "loss": 0.1389, + "step": 5959 + }, + { + "epoch": 1.1395793499043978, + "grad_norm": 1.3383917808532715, + "learning_rate": 5e-06, + "loss": 0.0797, + "step": 5960 + }, + { + "epoch": 1.1397705544933079, + "grad_norm": 1.403836965560913, + "learning_rate": 5e-06, + "loss": 0.0702, + "step": 5961 + }, + { + "epoch": 1.139961759082218, + "grad_norm": 2.712651491165161, + "learning_rate": 5e-06, + "loss": 0.0612, + "step": 5962 + }, + { + "epoch": 1.140152963671128, + "grad_norm": 2.9551162719726562, + "learning_rate": 5e-06, + "loss": 0.2241, + "step": 5963 + }, + { + "epoch": 1.1403441682600381, + "grad_norm": 0.7462462186813354, + "learning_rate": 5e-06, + "loss": 0.0658, + "step": 5964 + }, + { + "epoch": 1.1405353728489485, + "grad_norm": 0.8238846659660339, + "learning_rate": 5e-06, + "loss": 0.0566, + "step": 5965 + }, + { + "epoch": 1.1407265774378585, + "grad_norm": 1.3812743425369263, + "learning_rate": 5e-06, + "loss": 0.0468, + "step": 5966 + }, + { + "epoch": 1.1409177820267686, + "grad_norm": 1.1476503610610962, + "learning_rate": 5e-06, + "loss": 0.0558, + "step": 5967 + }, + { + "epoch": 1.1411089866156787, + "grad_norm": 1.7411783933639526, + "learning_rate": 5e-06, + "loss": 0.1208, + "step": 5968 + }, + { + "epoch": 1.1413001912045888, + "grad_norm": 2.1010475158691406, + "learning_rate": 5e-06, + "loss": 0.176, + "step": 5969 + }, + { + "epoch": 1.1414913957934991, + "grad_norm": 0.9700941443443298, + "learning_rate": 5e-06, + "loss": 0.0863, + "step": 5970 + }, + { + "epoch": 1.1416826003824092, + "grad_norm": 1.139196753501892, + "learning_rate": 5e-06, + "loss": 0.0632, + "step": 5971 + }, + { + "epoch": 1.1418738049713193, + "grad_norm": 1.9784907102584839, + "learning_rate": 5e-06, + "loss": 0.1047, + "step": 5972 + }, + { + "epoch": 1.1420650095602294, + "grad_norm": 1.6605274677276611, + "learning_rate": 5e-06, + "loss": 0.1016, + "step": 5973 + }, + { + "epoch": 1.1422562141491395, + "grad_norm": 2.2134573459625244, + "learning_rate": 5e-06, + "loss": 0.1446, + "step": 5974 + }, + { + "epoch": 1.1424474187380498, + "grad_norm": 2.1722609996795654, + "learning_rate": 5e-06, + "loss": 0.3741, + "step": 5975 + }, + { + "epoch": 1.1426386233269599, + "grad_norm": 2.2831599712371826, + "learning_rate": 5e-06, + "loss": 0.3514, + "step": 5976 + }, + { + "epoch": 1.14282982791587, + "grad_norm": 3.3943796157836914, + "learning_rate": 5e-06, + "loss": 0.4212, + "step": 5977 + }, + { + "epoch": 1.14302103250478, + "grad_norm": 1.330841302871704, + "learning_rate": 5e-06, + "loss": 0.0732, + "step": 5978 + }, + { + "epoch": 1.1432122370936901, + "grad_norm": 1.9645999670028687, + "learning_rate": 5e-06, + "loss": 0.0853, + "step": 5979 + }, + { + "epoch": 1.1434034416826004, + "grad_norm": 1.6076546907424927, + "learning_rate": 5e-06, + "loss": 0.0658, + "step": 5980 + }, + { + "epoch": 1.1435946462715105, + "grad_norm": 2.182152509689331, + "learning_rate": 5e-06, + "loss": 0.1959, + "step": 5981 + }, + { + "epoch": 1.1437858508604206, + "grad_norm": 2.1681408882141113, + "learning_rate": 5e-06, + "loss": 0.2228, + "step": 5982 + }, + { + "epoch": 1.1439770554493307, + "grad_norm": 2.4408507347106934, + "learning_rate": 5e-06, + "loss": 0.2301, + "step": 5983 + }, + { + "epoch": 1.144168260038241, + "grad_norm": 1.2057291269302368, + "learning_rate": 5e-06, + "loss": 0.107, + "step": 5984 + }, + { + "epoch": 1.144359464627151, + "grad_norm": 0.7931608557701111, + "learning_rate": 5e-06, + "loss": 0.0782, + "step": 5985 + }, + { + "epoch": 1.1445506692160612, + "grad_norm": 1.4083235263824463, + "learning_rate": 5e-06, + "loss": 0.0425, + "step": 5986 + }, + { + "epoch": 1.1447418738049713, + "grad_norm": 3.754728317260742, + "learning_rate": 5e-06, + "loss": 0.32, + "step": 5987 + }, + { + "epoch": 1.1449330783938814, + "grad_norm": 2.447702407836914, + "learning_rate": 5e-06, + "loss": 0.2783, + "step": 5988 + }, + { + "epoch": 1.1451242829827917, + "grad_norm": 1.0956696271896362, + "learning_rate": 5e-06, + "loss": 0.1021, + "step": 5989 + }, + { + "epoch": 1.1453154875717018, + "grad_norm": 0.831962525844574, + "learning_rate": 5e-06, + "loss": 0.0399, + "step": 5990 + }, + { + "epoch": 1.1455066921606119, + "grad_norm": 1.3386517763137817, + "learning_rate": 5e-06, + "loss": 0.0901, + "step": 5991 + }, + { + "epoch": 1.145697896749522, + "grad_norm": 2.1000680923461914, + "learning_rate": 5e-06, + "loss": 0.0823, + "step": 5992 + }, + { + "epoch": 1.145889101338432, + "grad_norm": 1.4930262565612793, + "learning_rate": 5e-06, + "loss": 0.0767, + "step": 5993 + }, + { + "epoch": 1.1460803059273423, + "grad_norm": 1.3432178497314453, + "learning_rate": 5e-06, + "loss": 0.1057, + "step": 5994 + }, + { + "epoch": 1.1462715105162524, + "grad_norm": 1.8772666454315186, + "learning_rate": 5e-06, + "loss": 0.14, + "step": 5995 + }, + { + "epoch": 1.1464627151051625, + "grad_norm": 0.8968759179115295, + "learning_rate": 5e-06, + "loss": 0.0477, + "step": 5996 + }, + { + "epoch": 1.1466539196940726, + "grad_norm": 1.3898628950119019, + "learning_rate": 5e-06, + "loss": 0.067, + "step": 5997 + }, + { + "epoch": 1.146845124282983, + "grad_norm": 0.6423681974411011, + "learning_rate": 5e-06, + "loss": 0.0118, + "step": 5998 + }, + { + "epoch": 1.147036328871893, + "grad_norm": 1.6495819091796875, + "learning_rate": 5e-06, + "loss": 0.1907, + "step": 5999 + }, + { + "epoch": 1.147227533460803, + "grad_norm": 1.3852123022079468, + "learning_rate": 5e-06, + "loss": 0.0823, + "step": 6000 + }, + { + "epoch": 1.147227533460803, + "eval_runtime": 801.3896, + "eval_samples_per_second": 1.914, + "eval_steps_per_second": 0.24, + "step": 6000 + }, + { + "epoch": 1.1474187380497132, + "grad_norm": 1.957743763923645, + "learning_rate": 5e-06, + "loss": 0.1578, + "step": 6001 + }, + { + "epoch": 1.1476099426386233, + "grad_norm": 2.059922218322754, + "learning_rate": 5e-06, + "loss": 0.15, + "step": 6002 + }, + { + "epoch": 1.1478011472275336, + "grad_norm": 1.291016936302185, + "learning_rate": 5e-06, + "loss": 0.0634, + "step": 6003 + }, + { + "epoch": 1.1479923518164437, + "grad_norm": 1.9278820753097534, + "learning_rate": 5e-06, + "loss": 0.2147, + "step": 6004 + }, + { + "epoch": 1.1481835564053537, + "grad_norm": 1.496976613998413, + "learning_rate": 5e-06, + "loss": 0.0778, + "step": 6005 + }, + { + "epoch": 1.1483747609942638, + "grad_norm": 1.579020619392395, + "learning_rate": 5e-06, + "loss": 0.0946, + "step": 6006 + }, + { + "epoch": 1.148565965583174, + "grad_norm": 1.9187579154968262, + "learning_rate": 5e-06, + "loss": 0.1447, + "step": 6007 + }, + { + "epoch": 1.1487571701720842, + "grad_norm": 1.770951271057129, + "learning_rate": 5e-06, + "loss": 0.1904, + "step": 6008 + }, + { + "epoch": 1.1489483747609943, + "grad_norm": 1.5869866609573364, + "learning_rate": 5e-06, + "loss": 0.1424, + "step": 6009 + }, + { + "epoch": 1.1491395793499044, + "grad_norm": 2.293226718902588, + "learning_rate": 5e-06, + "loss": 0.0983, + "step": 6010 + }, + { + "epoch": 1.1493307839388145, + "grad_norm": 1.4532525539398193, + "learning_rate": 5e-06, + "loss": 0.0784, + "step": 6011 + }, + { + "epoch": 1.1495219885277246, + "grad_norm": 1.6799381971359253, + "learning_rate": 5e-06, + "loss": 0.0964, + "step": 6012 + }, + { + "epoch": 1.149713193116635, + "grad_norm": 1.4979642629623413, + "learning_rate": 5e-06, + "loss": 0.0636, + "step": 6013 + }, + { + "epoch": 1.149904397705545, + "grad_norm": 2.968336582183838, + "learning_rate": 5e-06, + "loss": 0.3399, + "step": 6014 + }, + { + "epoch": 1.150095602294455, + "grad_norm": 2.201011896133423, + "learning_rate": 5e-06, + "loss": 0.1948, + "step": 6015 + }, + { + "epoch": 1.1502868068833652, + "grad_norm": 1.8662936687469482, + "learning_rate": 5e-06, + "loss": 0.0909, + "step": 6016 + }, + { + "epoch": 1.1504780114722752, + "grad_norm": 1.6262683868408203, + "learning_rate": 5e-06, + "loss": 0.153, + "step": 6017 + }, + { + "epoch": 1.1506692160611856, + "grad_norm": 1.8292471170425415, + "learning_rate": 5e-06, + "loss": 0.1355, + "step": 6018 + }, + { + "epoch": 1.1508604206500956, + "grad_norm": 1.0284324884414673, + "learning_rate": 5e-06, + "loss": 0.0604, + "step": 6019 + }, + { + "epoch": 1.1510516252390057, + "grad_norm": 2.682225465774536, + "learning_rate": 5e-06, + "loss": 0.2026, + "step": 6020 + }, + { + "epoch": 1.1512428298279158, + "grad_norm": 0.996833086013794, + "learning_rate": 5e-06, + "loss": 0.06, + "step": 6021 + }, + { + "epoch": 1.151434034416826, + "grad_norm": 1.5025444030761719, + "learning_rate": 5e-06, + "loss": 0.0522, + "step": 6022 + }, + { + "epoch": 1.1516252390057362, + "grad_norm": 1.4137760400772095, + "learning_rate": 5e-06, + "loss": 0.079, + "step": 6023 + }, + { + "epoch": 1.1518164435946463, + "grad_norm": 2.3708930015563965, + "learning_rate": 5e-06, + "loss": 0.1059, + "step": 6024 + }, + { + "epoch": 1.1520076481835564, + "grad_norm": 1.4922761917114258, + "learning_rate": 5e-06, + "loss": 0.1294, + "step": 6025 + }, + { + "epoch": 1.1521988527724665, + "grad_norm": 0.9097602367401123, + "learning_rate": 5e-06, + "loss": 0.0681, + "step": 6026 + }, + { + "epoch": 1.1523900573613766, + "grad_norm": 1.8793368339538574, + "learning_rate": 5e-06, + "loss": 0.1962, + "step": 6027 + }, + { + "epoch": 1.1525812619502869, + "grad_norm": 3.2023956775665283, + "learning_rate": 5e-06, + "loss": 0.0744, + "step": 6028 + }, + { + "epoch": 1.152772466539197, + "grad_norm": 1.3651431798934937, + "learning_rate": 5e-06, + "loss": 0.0934, + "step": 6029 + }, + { + "epoch": 1.152963671128107, + "grad_norm": 2.276491641998291, + "learning_rate": 5e-06, + "loss": 0.1804, + "step": 6030 + }, + { + "epoch": 1.1531548757170171, + "grad_norm": 1.6140533685684204, + "learning_rate": 5e-06, + "loss": 0.1421, + "step": 6031 + }, + { + "epoch": 1.1533460803059272, + "grad_norm": 1.3192780017852783, + "learning_rate": 5e-06, + "loss": 0.0654, + "step": 6032 + }, + { + "epoch": 1.1535372848948375, + "grad_norm": 2.071134567260742, + "learning_rate": 5e-06, + "loss": 0.2245, + "step": 6033 + }, + { + "epoch": 1.1537284894837476, + "grad_norm": 1.0834347009658813, + "learning_rate": 5e-06, + "loss": 0.0735, + "step": 6034 + }, + { + "epoch": 1.1539196940726577, + "grad_norm": 5.82857084274292, + "learning_rate": 5e-06, + "loss": 0.1003, + "step": 6035 + }, + { + "epoch": 1.1541108986615678, + "grad_norm": 1.3361073732376099, + "learning_rate": 5e-06, + "loss": 0.0476, + "step": 6036 + }, + { + "epoch": 1.154302103250478, + "grad_norm": 2.2594399452209473, + "learning_rate": 5e-06, + "loss": 0.1533, + "step": 6037 + }, + { + "epoch": 1.1544933078393882, + "grad_norm": 1.668912649154663, + "learning_rate": 5e-06, + "loss": 0.1638, + "step": 6038 + }, + { + "epoch": 1.1546845124282983, + "grad_norm": 1.6249210834503174, + "learning_rate": 5e-06, + "loss": 0.0849, + "step": 6039 + }, + { + "epoch": 1.1548757170172084, + "grad_norm": 1.301710605621338, + "learning_rate": 5e-06, + "loss": 0.081, + "step": 6040 + }, + { + "epoch": 1.1550669216061185, + "grad_norm": 0.9434700608253479, + "learning_rate": 5e-06, + "loss": 0.0392, + "step": 6041 + }, + { + "epoch": 1.1552581261950288, + "grad_norm": 0.9116209745407104, + "learning_rate": 5e-06, + "loss": 0.0235, + "step": 6042 + }, + { + "epoch": 1.1554493307839389, + "grad_norm": 0.9934965968132019, + "learning_rate": 5e-06, + "loss": 0.0644, + "step": 6043 + }, + { + "epoch": 1.155640535372849, + "grad_norm": 1.8826981782913208, + "learning_rate": 5e-06, + "loss": 0.2933, + "step": 6044 + }, + { + "epoch": 1.155831739961759, + "grad_norm": 1.0682721138000488, + "learning_rate": 5e-06, + "loss": 0.0483, + "step": 6045 + }, + { + "epoch": 1.1560229445506693, + "grad_norm": 1.3939552307128906, + "learning_rate": 5e-06, + "loss": 0.0893, + "step": 6046 + }, + { + "epoch": 1.1562141491395794, + "grad_norm": 0.7219886183738708, + "learning_rate": 5e-06, + "loss": 0.0433, + "step": 6047 + }, + { + "epoch": 1.1564053537284895, + "grad_norm": 0.8082029819488525, + "learning_rate": 5e-06, + "loss": 0.0271, + "step": 6048 + }, + { + "epoch": 1.1565965583173996, + "grad_norm": 0.8083641529083252, + "learning_rate": 5e-06, + "loss": 0.0503, + "step": 6049 + }, + { + "epoch": 1.1567877629063097, + "grad_norm": 1.0870314836502075, + "learning_rate": 5e-06, + "loss": 0.0769, + "step": 6050 + }, + { + "epoch": 1.15697896749522, + "grad_norm": 0.9009522795677185, + "learning_rate": 5e-06, + "loss": 0.0477, + "step": 6051 + }, + { + "epoch": 1.15717017208413, + "grad_norm": 0.9931294322013855, + "learning_rate": 5e-06, + "loss": 0.0635, + "step": 6052 + }, + { + "epoch": 1.1573613766730402, + "grad_norm": 1.5237797498703003, + "learning_rate": 5e-06, + "loss": 0.0961, + "step": 6053 + }, + { + "epoch": 1.1575525812619503, + "grad_norm": 1.4311531782150269, + "learning_rate": 5e-06, + "loss": 0.0464, + "step": 6054 + }, + { + "epoch": 1.1577437858508604, + "grad_norm": 2.5143849849700928, + "learning_rate": 5e-06, + "loss": 0.2268, + "step": 6055 + }, + { + "epoch": 1.1579349904397707, + "grad_norm": 1.216488003730774, + "learning_rate": 5e-06, + "loss": 0.0839, + "step": 6056 + }, + { + "epoch": 1.1581261950286807, + "grad_norm": 2.2803962230682373, + "learning_rate": 5e-06, + "loss": 0.3013, + "step": 6057 + }, + { + "epoch": 1.1583173996175908, + "grad_norm": 0.7981217503547668, + "learning_rate": 5e-06, + "loss": 0.0436, + "step": 6058 + }, + { + "epoch": 1.158508604206501, + "grad_norm": 1.780248761177063, + "learning_rate": 5e-06, + "loss": 0.1795, + "step": 6059 + }, + { + "epoch": 1.158699808795411, + "grad_norm": 5.465449810028076, + "learning_rate": 5e-06, + "loss": 0.1149, + "step": 6060 + }, + { + "epoch": 1.1588910133843213, + "grad_norm": 1.3702939748764038, + "learning_rate": 5e-06, + "loss": 0.0291, + "step": 6061 + }, + { + "epoch": 1.1590822179732314, + "grad_norm": 2.034011125564575, + "learning_rate": 5e-06, + "loss": 0.2144, + "step": 6062 + }, + { + "epoch": 1.1592734225621415, + "grad_norm": 1.4905356168746948, + "learning_rate": 5e-06, + "loss": 0.1838, + "step": 6063 + }, + { + "epoch": 1.1594646271510516, + "grad_norm": 2.3215534687042236, + "learning_rate": 5e-06, + "loss": 0.1431, + "step": 6064 + }, + { + "epoch": 1.1596558317399617, + "grad_norm": 0.8124056458473206, + "learning_rate": 5e-06, + "loss": 0.0456, + "step": 6065 + }, + { + "epoch": 1.159847036328872, + "grad_norm": 1.8607640266418457, + "learning_rate": 5e-06, + "loss": 0.131, + "step": 6066 + }, + { + "epoch": 1.160038240917782, + "grad_norm": 1.4982339143753052, + "learning_rate": 5e-06, + "loss": 0.0986, + "step": 6067 + }, + { + "epoch": 1.1602294455066922, + "grad_norm": 2.7232859134674072, + "learning_rate": 5e-06, + "loss": 0.2743, + "step": 6068 + }, + { + "epoch": 1.1604206500956022, + "grad_norm": 2.740915536880493, + "learning_rate": 5e-06, + "loss": 0.4415, + "step": 6069 + }, + { + "epoch": 1.1606118546845123, + "grad_norm": 1.1084314584732056, + "learning_rate": 5e-06, + "loss": 0.0626, + "step": 6070 + }, + { + "epoch": 1.1608030592734226, + "grad_norm": 3.3199756145477295, + "learning_rate": 5e-06, + "loss": 0.2035, + "step": 6071 + }, + { + "epoch": 1.1609942638623327, + "grad_norm": 0.5316591262817383, + "learning_rate": 5e-06, + "loss": 0.0395, + "step": 6072 + }, + { + "epoch": 1.1611854684512428, + "grad_norm": 0.8929632306098938, + "learning_rate": 5e-06, + "loss": 0.0379, + "step": 6073 + }, + { + "epoch": 1.161376673040153, + "grad_norm": 1.1094083786010742, + "learning_rate": 5e-06, + "loss": 0.0883, + "step": 6074 + }, + { + "epoch": 1.161567877629063, + "grad_norm": 3.199720621109009, + "learning_rate": 5e-06, + "loss": 0.4122, + "step": 6075 + }, + { + "epoch": 1.1617590822179733, + "grad_norm": 2.4385669231414795, + "learning_rate": 5e-06, + "loss": 0.3021, + "step": 6076 + }, + { + "epoch": 1.1619502868068834, + "grad_norm": 1.3131438493728638, + "learning_rate": 5e-06, + "loss": 0.0734, + "step": 6077 + }, + { + "epoch": 1.1621414913957935, + "grad_norm": 2.2846639156341553, + "learning_rate": 5e-06, + "loss": 0.162, + "step": 6078 + }, + { + "epoch": 1.1623326959847036, + "grad_norm": 1.474809169769287, + "learning_rate": 5e-06, + "loss": 0.0868, + "step": 6079 + }, + { + "epoch": 1.1625239005736137, + "grad_norm": 1.7511301040649414, + "learning_rate": 5e-06, + "loss": 0.0742, + "step": 6080 + }, + { + "epoch": 1.162715105162524, + "grad_norm": 1.1560462713241577, + "learning_rate": 5e-06, + "loss": 0.1053, + "step": 6081 + }, + { + "epoch": 1.162906309751434, + "grad_norm": 1.9263542890548706, + "learning_rate": 5e-06, + "loss": 0.151, + "step": 6082 + }, + { + "epoch": 1.1630975143403441, + "grad_norm": 1.997341275215149, + "learning_rate": 5e-06, + "loss": 0.0731, + "step": 6083 + }, + { + "epoch": 1.1632887189292542, + "grad_norm": 2.5195326805114746, + "learning_rate": 5e-06, + "loss": 0.1253, + "step": 6084 + }, + { + "epoch": 1.1634799235181643, + "grad_norm": 1.3920161724090576, + "learning_rate": 5e-06, + "loss": 0.0689, + "step": 6085 + }, + { + "epoch": 1.1636711281070746, + "grad_norm": 1.848002552986145, + "learning_rate": 5e-06, + "loss": 0.0813, + "step": 6086 + }, + { + "epoch": 1.1638623326959847, + "grad_norm": 1.6859074831008911, + "learning_rate": 5e-06, + "loss": 0.2863, + "step": 6087 + }, + { + "epoch": 1.1640535372848948, + "grad_norm": 2.795480251312256, + "learning_rate": 5e-06, + "loss": 0.3935, + "step": 6088 + }, + { + "epoch": 1.164244741873805, + "grad_norm": 2.764321804046631, + "learning_rate": 5e-06, + "loss": 0.268, + "step": 6089 + }, + { + "epoch": 1.1644359464627152, + "grad_norm": 0.9207529425621033, + "learning_rate": 5e-06, + "loss": 0.0395, + "step": 6090 + }, + { + "epoch": 1.1646271510516253, + "grad_norm": 2.6257948875427246, + "learning_rate": 5e-06, + "loss": 0.2537, + "step": 6091 + }, + { + "epoch": 1.1648183556405354, + "grad_norm": 1.9078866243362427, + "learning_rate": 5e-06, + "loss": 0.1349, + "step": 6092 + }, + { + "epoch": 1.1650095602294455, + "grad_norm": 6.173279285430908, + "learning_rate": 5e-06, + "loss": 0.1253, + "step": 6093 + }, + { + "epoch": 1.1652007648183555, + "grad_norm": 2.563831090927124, + "learning_rate": 5e-06, + "loss": 0.1948, + "step": 6094 + }, + { + "epoch": 1.1653919694072659, + "grad_norm": 1.040425181388855, + "learning_rate": 5e-06, + "loss": 0.1154, + "step": 6095 + }, + { + "epoch": 1.165583173996176, + "grad_norm": 0.45935672521591187, + "learning_rate": 5e-06, + "loss": 0.0246, + "step": 6096 + }, + { + "epoch": 1.165774378585086, + "grad_norm": 1.1373692750930786, + "learning_rate": 5e-06, + "loss": 0.0963, + "step": 6097 + }, + { + "epoch": 1.1659655831739961, + "grad_norm": 2.903848886489868, + "learning_rate": 5e-06, + "loss": 0.1996, + "step": 6098 + }, + { + "epoch": 1.1661567877629064, + "grad_norm": 1.5372850894927979, + "learning_rate": 5e-06, + "loss": 0.075, + "step": 6099 + }, + { + "epoch": 1.1663479923518165, + "grad_norm": 2.0959360599517822, + "learning_rate": 5e-06, + "loss": 0.2309, + "step": 6100 + }, + { + "epoch": 1.1665391969407266, + "grad_norm": 3.3226559162139893, + "learning_rate": 5e-06, + "loss": 0.233, + "step": 6101 + }, + { + "epoch": 1.1667304015296367, + "grad_norm": 1.1195683479309082, + "learning_rate": 5e-06, + "loss": 0.0845, + "step": 6102 + }, + { + "epoch": 1.1669216061185468, + "grad_norm": 1.387656569480896, + "learning_rate": 5e-06, + "loss": 0.0717, + "step": 6103 + }, + { + "epoch": 1.167112810707457, + "grad_norm": 1.3395695686340332, + "learning_rate": 5e-06, + "loss": 0.0479, + "step": 6104 + }, + { + "epoch": 1.1673040152963672, + "grad_norm": 1.6233667135238647, + "learning_rate": 5e-06, + "loss": 0.0703, + "step": 6105 + }, + { + "epoch": 1.1674952198852773, + "grad_norm": 2.596073627471924, + "learning_rate": 5e-06, + "loss": 0.1554, + "step": 6106 + }, + { + "epoch": 1.1676864244741874, + "grad_norm": 1.9220967292785645, + "learning_rate": 5e-06, + "loss": 0.2205, + "step": 6107 + }, + { + "epoch": 1.1678776290630974, + "grad_norm": 1.6835957765579224, + "learning_rate": 5e-06, + "loss": 0.1507, + "step": 6108 + }, + { + "epoch": 1.1680688336520078, + "grad_norm": 1.530069351196289, + "learning_rate": 5e-06, + "loss": 0.0651, + "step": 6109 + }, + { + "epoch": 1.1682600382409178, + "grad_norm": 0.737148106098175, + "learning_rate": 5e-06, + "loss": 0.0299, + "step": 6110 + }, + { + "epoch": 1.168451242829828, + "grad_norm": 1.4814426898956299, + "learning_rate": 5e-06, + "loss": 0.0882, + "step": 6111 + }, + { + "epoch": 1.168642447418738, + "grad_norm": 0.973702609539032, + "learning_rate": 5e-06, + "loss": 0.0567, + "step": 6112 + }, + { + "epoch": 1.168833652007648, + "grad_norm": 2.017190933227539, + "learning_rate": 5e-06, + "loss": 0.2373, + "step": 6113 + }, + { + "epoch": 1.1690248565965584, + "grad_norm": 1.150895595550537, + "learning_rate": 5e-06, + "loss": 0.0863, + "step": 6114 + }, + { + "epoch": 1.1692160611854685, + "grad_norm": 2.035149335861206, + "learning_rate": 5e-06, + "loss": 0.0599, + "step": 6115 + }, + { + "epoch": 1.1694072657743786, + "grad_norm": 1.2829310894012451, + "learning_rate": 5e-06, + "loss": 0.0662, + "step": 6116 + }, + { + "epoch": 1.1695984703632887, + "grad_norm": 0.9252585172653198, + "learning_rate": 5e-06, + "loss": 0.0334, + "step": 6117 + }, + { + "epoch": 1.1697896749521988, + "grad_norm": 2.3354899883270264, + "learning_rate": 5e-06, + "loss": 0.1792, + "step": 6118 + }, + { + "epoch": 1.169980879541109, + "grad_norm": 1.619714617729187, + "learning_rate": 5e-06, + "loss": 0.2189, + "step": 6119 + }, + { + "epoch": 1.1701720841300192, + "grad_norm": 1.9781533479690552, + "learning_rate": 5e-06, + "loss": 0.179, + "step": 6120 + }, + { + "epoch": 1.1703632887189293, + "grad_norm": 1.8020018339157104, + "learning_rate": 5e-06, + "loss": 0.2103, + "step": 6121 + }, + { + "epoch": 1.1705544933078393, + "grad_norm": 1.7455902099609375, + "learning_rate": 5e-06, + "loss": 0.1837, + "step": 6122 + }, + { + "epoch": 1.1707456978967494, + "grad_norm": 1.8223216533660889, + "learning_rate": 5e-06, + "loss": 0.0997, + "step": 6123 + }, + { + "epoch": 1.1709369024856597, + "grad_norm": 1.6536144018173218, + "learning_rate": 5e-06, + "loss": 0.0922, + "step": 6124 + }, + { + "epoch": 1.1711281070745698, + "grad_norm": 2.5655362606048584, + "learning_rate": 5e-06, + "loss": 0.3305, + "step": 6125 + }, + { + "epoch": 1.17131931166348, + "grad_norm": 1.325499415397644, + "learning_rate": 5e-06, + "loss": 0.105, + "step": 6126 + }, + { + "epoch": 1.17151051625239, + "grad_norm": 1.0477885007858276, + "learning_rate": 5e-06, + "loss": 0.082, + "step": 6127 + }, + { + "epoch": 1.1717017208413, + "grad_norm": 1.0732998847961426, + "learning_rate": 5e-06, + "loss": 0.0464, + "step": 6128 + }, + { + "epoch": 1.1718929254302104, + "grad_norm": 1.3447692394256592, + "learning_rate": 5e-06, + "loss": 0.0626, + "step": 6129 + }, + { + "epoch": 1.1720841300191205, + "grad_norm": 2.3556089401245117, + "learning_rate": 5e-06, + "loss": 0.0672, + "step": 6130 + }, + { + "epoch": 1.1722753346080306, + "grad_norm": 2.144721031188965, + "learning_rate": 5e-06, + "loss": 0.2276, + "step": 6131 + }, + { + "epoch": 1.1724665391969407, + "grad_norm": 1.2007111310958862, + "learning_rate": 5e-06, + "loss": 0.0624, + "step": 6132 + }, + { + "epoch": 1.1726577437858507, + "grad_norm": 1.4135570526123047, + "learning_rate": 5e-06, + "loss": 0.082, + "step": 6133 + }, + { + "epoch": 1.172848948374761, + "grad_norm": 1.207506537437439, + "learning_rate": 5e-06, + "loss": 0.0531, + "step": 6134 + }, + { + "epoch": 1.1730401529636711, + "grad_norm": 0.7473751306533813, + "learning_rate": 5e-06, + "loss": 0.0336, + "step": 6135 + }, + { + "epoch": 1.1732313575525812, + "grad_norm": 1.5870224237442017, + "learning_rate": 5e-06, + "loss": 0.0666, + "step": 6136 + }, + { + "epoch": 1.1734225621414913, + "grad_norm": 2.2782132625579834, + "learning_rate": 5e-06, + "loss": 0.2869, + "step": 6137 + }, + { + "epoch": 1.1736137667304014, + "grad_norm": 2.2009999752044678, + "learning_rate": 5e-06, + "loss": 0.1322, + "step": 6138 + }, + { + "epoch": 1.1738049713193117, + "grad_norm": 2.3219454288482666, + "learning_rate": 5e-06, + "loss": 0.2503, + "step": 6139 + }, + { + "epoch": 1.1739961759082218, + "grad_norm": 1.4381366968154907, + "learning_rate": 5e-06, + "loss": 0.0768, + "step": 6140 + }, + { + "epoch": 1.174187380497132, + "grad_norm": 2.0978338718414307, + "learning_rate": 5e-06, + "loss": 0.1613, + "step": 6141 + }, + { + "epoch": 1.174378585086042, + "grad_norm": 2.1573426723480225, + "learning_rate": 5e-06, + "loss": 0.083, + "step": 6142 + }, + { + "epoch": 1.1745697896749523, + "grad_norm": 1.2375134229660034, + "learning_rate": 5e-06, + "loss": 0.0635, + "step": 6143 + }, + { + "epoch": 1.1747609942638624, + "grad_norm": 1.8515455722808838, + "learning_rate": 5e-06, + "loss": 0.2427, + "step": 6144 + }, + { + "epoch": 1.1749521988527725, + "grad_norm": 1.246639609336853, + "learning_rate": 5e-06, + "loss": 0.0717, + "step": 6145 + }, + { + "epoch": 1.1751434034416826, + "grad_norm": 1.7270658016204834, + "learning_rate": 5e-06, + "loss": 0.0917, + "step": 6146 + }, + { + "epoch": 1.1753346080305926, + "grad_norm": 1.3657946586608887, + "learning_rate": 5e-06, + "loss": 0.1012, + "step": 6147 + }, + { + "epoch": 1.175525812619503, + "grad_norm": 1.2972500324249268, + "learning_rate": 5e-06, + "loss": 0.0726, + "step": 6148 + }, + { + "epoch": 1.175717017208413, + "grad_norm": 2.3606135845184326, + "learning_rate": 5e-06, + "loss": 0.1246, + "step": 6149 + }, + { + "epoch": 1.1759082217973231, + "grad_norm": 1.981109619140625, + "learning_rate": 5e-06, + "loss": 0.2327, + "step": 6150 + }, + { + "epoch": 1.1760994263862332, + "grad_norm": 2.298509120941162, + "learning_rate": 5e-06, + "loss": 0.172, + "step": 6151 + }, + { + "epoch": 1.1762906309751435, + "grad_norm": 2.005854606628418, + "learning_rate": 5e-06, + "loss": 0.1567, + "step": 6152 + }, + { + "epoch": 1.1764818355640536, + "grad_norm": 0.9500100612640381, + "learning_rate": 5e-06, + "loss": 0.0708, + "step": 6153 + }, + { + "epoch": 1.1766730401529637, + "grad_norm": 1.3746157884597778, + "learning_rate": 5e-06, + "loss": 0.0448, + "step": 6154 + }, + { + "epoch": 1.1768642447418738, + "grad_norm": 0.5043768882751465, + "learning_rate": 5e-06, + "loss": 0.0102, + "step": 6155 + }, + { + "epoch": 1.1770554493307839, + "grad_norm": 1.2846918106079102, + "learning_rate": 5e-06, + "loss": 0.1466, + "step": 6156 + }, + { + "epoch": 1.1772466539196942, + "grad_norm": 2.024791955947876, + "learning_rate": 5e-06, + "loss": 0.2269, + "step": 6157 + }, + { + "epoch": 1.1774378585086043, + "grad_norm": 0.7490910887718201, + "learning_rate": 5e-06, + "loss": 0.0589, + "step": 6158 + }, + { + "epoch": 1.1776290630975144, + "grad_norm": 1.5239231586456299, + "learning_rate": 5e-06, + "loss": 0.1146, + "step": 6159 + }, + { + "epoch": 1.1778202676864244, + "grad_norm": 1.2455819845199585, + "learning_rate": 5e-06, + "loss": 0.0616, + "step": 6160 + }, + { + "epoch": 1.1780114722753345, + "grad_norm": 1.4771140813827515, + "learning_rate": 5e-06, + "loss": 0.0756, + "step": 6161 + }, + { + "epoch": 1.1782026768642448, + "grad_norm": 2.541170120239258, + "learning_rate": 5e-06, + "loss": 0.1628, + "step": 6162 + }, + { + "epoch": 1.178393881453155, + "grad_norm": 2.1921584606170654, + "learning_rate": 5e-06, + "loss": 0.2259, + "step": 6163 + }, + { + "epoch": 1.178585086042065, + "grad_norm": 1.606255292892456, + "learning_rate": 5e-06, + "loss": 0.0649, + "step": 6164 + }, + { + "epoch": 1.178776290630975, + "grad_norm": 3.125506639480591, + "learning_rate": 5e-06, + "loss": 0.2574, + "step": 6165 + }, + { + "epoch": 1.1789674952198852, + "grad_norm": 1.2741800546646118, + "learning_rate": 5e-06, + "loss": 0.0475, + "step": 6166 + }, + { + "epoch": 1.1791586998087955, + "grad_norm": 2.3493738174438477, + "learning_rate": 5e-06, + "loss": 0.059, + "step": 6167 + }, + { + "epoch": 1.1793499043977056, + "grad_norm": 1.9525309801101685, + "learning_rate": 5e-06, + "loss": 0.1746, + "step": 6168 + }, + { + "epoch": 1.1795411089866157, + "grad_norm": 2.7668604850769043, + "learning_rate": 5e-06, + "loss": 0.309, + "step": 6169 + }, + { + "epoch": 1.1797323135755258, + "grad_norm": 2.444951057434082, + "learning_rate": 5e-06, + "loss": 0.2414, + "step": 6170 + }, + { + "epoch": 1.1799235181644359, + "grad_norm": 1.980841040611267, + "learning_rate": 5e-06, + "loss": 0.0647, + "step": 6171 + }, + { + "epoch": 1.1801147227533462, + "grad_norm": 1.0069383382797241, + "learning_rate": 5e-06, + "loss": 0.0172, + "step": 6172 + }, + { + "epoch": 1.1803059273422563, + "grad_norm": 1.7199982404708862, + "learning_rate": 5e-06, + "loss": 0.1625, + "step": 6173 + }, + { + "epoch": 1.1804971319311663, + "grad_norm": 1.0968600511550903, + "learning_rate": 5e-06, + "loss": 0.0659, + "step": 6174 + }, + { + "epoch": 1.1806883365200764, + "grad_norm": 2.4999165534973145, + "learning_rate": 5e-06, + "loss": 0.3111, + "step": 6175 + }, + { + "epoch": 1.1808795411089865, + "grad_norm": 1.1034135818481445, + "learning_rate": 5e-06, + "loss": 0.0808, + "step": 6176 + }, + { + "epoch": 1.1810707456978968, + "grad_norm": 2.0255119800567627, + "learning_rate": 5e-06, + "loss": 0.1643, + "step": 6177 + }, + { + "epoch": 1.181261950286807, + "grad_norm": 1.7352789640426636, + "learning_rate": 5e-06, + "loss": 0.1539, + "step": 6178 + }, + { + "epoch": 1.181453154875717, + "grad_norm": 0.796521782875061, + "learning_rate": 5e-06, + "loss": 0.0369, + "step": 6179 + }, + { + "epoch": 1.181644359464627, + "grad_norm": 2.0245773792266846, + "learning_rate": 5e-06, + "loss": 0.0833, + "step": 6180 + }, + { + "epoch": 1.1818355640535372, + "grad_norm": 1.772589921951294, + "learning_rate": 5e-06, + "loss": 0.2376, + "step": 6181 + }, + { + "epoch": 1.1820267686424475, + "grad_norm": 2.6077778339385986, + "learning_rate": 5e-06, + "loss": 0.3178, + "step": 6182 + }, + { + "epoch": 1.1822179732313576, + "grad_norm": 1.4697093963623047, + "learning_rate": 5e-06, + "loss": 0.0894, + "step": 6183 + }, + { + "epoch": 1.1824091778202677, + "grad_norm": 1.1189794540405273, + "learning_rate": 5e-06, + "loss": 0.0712, + "step": 6184 + }, + { + "epoch": 1.1826003824091778, + "grad_norm": 1.7616618871688843, + "learning_rate": 5e-06, + "loss": 0.1014, + "step": 6185 + }, + { + "epoch": 1.1827915869980878, + "grad_norm": 1.4431734085083008, + "learning_rate": 5e-06, + "loss": 0.0509, + "step": 6186 + }, + { + "epoch": 1.1829827915869982, + "grad_norm": 1.4921287298202515, + "learning_rate": 5e-06, + "loss": 0.0924, + "step": 6187 + }, + { + "epoch": 1.1831739961759082, + "grad_norm": 3.1014909744262695, + "learning_rate": 5e-06, + "loss": 0.3719, + "step": 6188 + }, + { + "epoch": 1.1833652007648183, + "grad_norm": 1.158164620399475, + "learning_rate": 5e-06, + "loss": 0.0296, + "step": 6189 + }, + { + "epoch": 1.1835564053537284, + "grad_norm": 1.8710097074508667, + "learning_rate": 5e-06, + "loss": 0.1908, + "step": 6190 + }, + { + "epoch": 1.1837476099426385, + "grad_norm": 1.8682113885879517, + "learning_rate": 5e-06, + "loss": 0.1711, + "step": 6191 + }, + { + "epoch": 1.1839388145315488, + "grad_norm": 0.9372738003730774, + "learning_rate": 5e-06, + "loss": 0.0528, + "step": 6192 + }, + { + "epoch": 1.184130019120459, + "grad_norm": 1.8785148859024048, + "learning_rate": 5e-06, + "loss": 0.3054, + "step": 6193 + }, + { + "epoch": 1.184321223709369, + "grad_norm": 1.2328784465789795, + "learning_rate": 5e-06, + "loss": 0.082, + "step": 6194 + }, + { + "epoch": 1.184512428298279, + "grad_norm": 1.1626081466674805, + "learning_rate": 5e-06, + "loss": 0.0991, + "step": 6195 + }, + { + "epoch": 1.1847036328871894, + "grad_norm": 2.7644124031066895, + "learning_rate": 5e-06, + "loss": 0.2075, + "step": 6196 + }, + { + "epoch": 1.1848948374760995, + "grad_norm": 1.14145827293396, + "learning_rate": 5e-06, + "loss": 0.0526, + "step": 6197 + }, + { + "epoch": 1.1850860420650096, + "grad_norm": 1.1169764995574951, + "learning_rate": 5e-06, + "loss": 0.0239, + "step": 6198 + }, + { + "epoch": 1.1852772466539196, + "grad_norm": 2.4677770137786865, + "learning_rate": 5e-06, + "loss": 0.0712, + "step": 6199 + }, + { + "epoch": 1.1854684512428297, + "grad_norm": 2.896726369857788, + "learning_rate": 5e-06, + "loss": 0.1501, + "step": 6200 + }, + { + "epoch": 1.18565965583174, + "grad_norm": 1.0080317258834839, + "learning_rate": 5e-06, + "loss": 0.0729, + "step": 6201 + }, + { + "epoch": 1.1858508604206501, + "grad_norm": 1.406341791152954, + "learning_rate": 5e-06, + "loss": 0.1634, + "step": 6202 + }, + { + "epoch": 1.1860420650095602, + "grad_norm": 2.233196258544922, + "learning_rate": 5e-06, + "loss": 0.168, + "step": 6203 + }, + { + "epoch": 1.1862332695984703, + "grad_norm": 0.7236731052398682, + "learning_rate": 5e-06, + "loss": 0.0436, + "step": 6204 + }, + { + "epoch": 1.1864244741873806, + "grad_norm": 1.104647159576416, + "learning_rate": 5e-06, + "loss": 0.0803, + "step": 6205 + }, + { + "epoch": 1.1866156787762907, + "grad_norm": 1.6566107273101807, + "learning_rate": 5e-06, + "loss": 0.1847, + "step": 6206 + }, + { + "epoch": 1.1868068833652008, + "grad_norm": 2.225419282913208, + "learning_rate": 5e-06, + "loss": 0.1847, + "step": 6207 + }, + { + "epoch": 1.1869980879541109, + "grad_norm": 1.3101707696914673, + "learning_rate": 5e-06, + "loss": 0.0773, + "step": 6208 + }, + { + "epoch": 1.187189292543021, + "grad_norm": 1.1030720472335815, + "learning_rate": 5e-06, + "loss": 0.0684, + "step": 6209 + }, + { + "epoch": 1.1873804971319313, + "grad_norm": 1.0501564741134644, + "learning_rate": 5e-06, + "loss": 0.0591, + "step": 6210 + }, + { + "epoch": 1.1875717017208414, + "grad_norm": 1.0577927827835083, + "learning_rate": 5e-06, + "loss": 0.0496, + "step": 6211 + }, + { + "epoch": 1.1877629063097515, + "grad_norm": 2.4910309314727783, + "learning_rate": 5e-06, + "loss": 0.3094, + "step": 6212 + }, + { + "epoch": 1.1879541108986615, + "grad_norm": 1.5930724143981934, + "learning_rate": 5e-06, + "loss": 0.1574, + "step": 6213 + }, + { + "epoch": 1.1881453154875716, + "grad_norm": 1.2082481384277344, + "learning_rate": 5e-06, + "loss": 0.0612, + "step": 6214 + }, + { + "epoch": 1.188336520076482, + "grad_norm": 1.5550799369812012, + "learning_rate": 5e-06, + "loss": 0.0683, + "step": 6215 + }, + { + "epoch": 1.188527724665392, + "grad_norm": 1.548312783241272, + "learning_rate": 5e-06, + "loss": 0.0548, + "step": 6216 + }, + { + "epoch": 1.1887189292543021, + "grad_norm": 1.306344747543335, + "learning_rate": 5e-06, + "loss": 0.0559, + "step": 6217 + }, + { + "epoch": 1.1889101338432122, + "grad_norm": 1.2293124198913574, + "learning_rate": 5e-06, + "loss": 0.065, + "step": 6218 + }, + { + "epoch": 1.1891013384321223, + "grad_norm": 3.0053391456604004, + "learning_rate": 5e-06, + "loss": 0.4573, + "step": 6219 + }, + { + "epoch": 1.1892925430210326, + "grad_norm": 2.9051673412323, + "learning_rate": 5e-06, + "loss": 0.0862, + "step": 6220 + }, + { + "epoch": 1.1894837476099427, + "grad_norm": 1.1124510765075684, + "learning_rate": 5e-06, + "loss": 0.0373, + "step": 6221 + }, + { + "epoch": 1.1896749521988528, + "grad_norm": 6.297783374786377, + "learning_rate": 5e-06, + "loss": 0.1291, + "step": 6222 + }, + { + "epoch": 1.1898661567877629, + "grad_norm": 3.238295316696167, + "learning_rate": 5e-06, + "loss": 0.1066, + "step": 6223 + }, + { + "epoch": 1.190057361376673, + "grad_norm": 1.506626844406128, + "learning_rate": 5e-06, + "loss": 0.0639, + "step": 6224 + }, + { + "epoch": 1.1902485659655833, + "grad_norm": 1.8110731840133667, + "learning_rate": 5e-06, + "loss": 0.1739, + "step": 6225 + }, + { + "epoch": 1.1904397705544933, + "grad_norm": 1.8189138174057007, + "learning_rate": 5e-06, + "loss": 0.2069, + "step": 6226 + }, + { + "epoch": 1.1906309751434034, + "grad_norm": 1.4828901290893555, + "learning_rate": 5e-06, + "loss": 0.155, + "step": 6227 + }, + { + "epoch": 1.1908221797323135, + "grad_norm": 1.3421870470046997, + "learning_rate": 5e-06, + "loss": 0.0718, + "step": 6228 + }, + { + "epoch": 1.1910133843212236, + "grad_norm": 1.5782335996627808, + "learning_rate": 5e-06, + "loss": 0.1227, + "step": 6229 + }, + { + "epoch": 1.191204588910134, + "grad_norm": 0.5779447555541992, + "learning_rate": 5e-06, + "loss": 0.0196, + "step": 6230 + }, + { + "epoch": 1.191395793499044, + "grad_norm": 3.198089599609375, + "learning_rate": 5e-06, + "loss": 0.4032, + "step": 6231 + }, + { + "epoch": 1.191586998087954, + "grad_norm": 2.1662838459014893, + "learning_rate": 5e-06, + "loss": 0.1112, + "step": 6232 + }, + { + "epoch": 1.1917782026768642, + "grad_norm": 0.6701384782791138, + "learning_rate": 5e-06, + "loss": 0.0796, + "step": 6233 + }, + { + "epoch": 1.1919694072657743, + "grad_norm": 3.48876953125, + "learning_rate": 5e-06, + "loss": 0.3998, + "step": 6234 + }, + { + "epoch": 1.1921606118546846, + "grad_norm": 1.7155407667160034, + "learning_rate": 5e-06, + "loss": 0.0601, + "step": 6235 + }, + { + "epoch": 1.1923518164435947, + "grad_norm": 1.6817251443862915, + "learning_rate": 5e-06, + "loss": 0.0674, + "step": 6236 + }, + { + "epoch": 1.1925430210325048, + "grad_norm": 1.244370460510254, + "learning_rate": 5e-06, + "loss": 0.0725, + "step": 6237 + }, + { + "epoch": 1.1927342256214148, + "grad_norm": 0.7383753657341003, + "learning_rate": 5e-06, + "loss": 0.0387, + "step": 6238 + }, + { + "epoch": 1.192925430210325, + "grad_norm": 3.243826150894165, + "learning_rate": 5e-06, + "loss": 0.2708, + "step": 6239 + }, + { + "epoch": 1.1931166347992352, + "grad_norm": 1.0814484357833862, + "learning_rate": 5e-06, + "loss": 0.0429, + "step": 6240 + }, + { + "epoch": 1.1933078393881453, + "grad_norm": 1.800237774848938, + "learning_rate": 5e-06, + "loss": 0.1282, + "step": 6241 + }, + { + "epoch": 1.1934990439770554, + "grad_norm": 1.2890852689743042, + "learning_rate": 5e-06, + "loss": 0.0736, + "step": 6242 + }, + { + "epoch": 1.1936902485659655, + "grad_norm": 1.504570722579956, + "learning_rate": 5e-06, + "loss": 0.1157, + "step": 6243 + }, + { + "epoch": 1.1938814531548756, + "grad_norm": 3.0813817977905273, + "learning_rate": 5e-06, + "loss": 0.2378, + "step": 6244 + }, + { + "epoch": 1.194072657743786, + "grad_norm": 2.5581462383270264, + "learning_rate": 5e-06, + "loss": 0.2248, + "step": 6245 + }, + { + "epoch": 1.194263862332696, + "grad_norm": 1.9704996347427368, + "learning_rate": 5e-06, + "loss": 0.1308, + "step": 6246 + }, + { + "epoch": 1.194455066921606, + "grad_norm": 1.2662582397460938, + "learning_rate": 5e-06, + "loss": 0.0757, + "step": 6247 + }, + { + "epoch": 1.1946462715105162, + "grad_norm": 1.3485127687454224, + "learning_rate": 5e-06, + "loss": 0.0613, + "step": 6248 + }, + { + "epoch": 1.1948374760994265, + "grad_norm": 1.4152460098266602, + "learning_rate": 5e-06, + "loss": 0.0603, + "step": 6249 + }, + { + "epoch": 1.1950286806883366, + "grad_norm": 2.333681583404541, + "learning_rate": 5e-06, + "loss": 0.1836, + "step": 6250 + }, + { + "epoch": 1.1952198852772467, + "grad_norm": 1.0232439041137695, + "learning_rate": 5e-06, + "loss": 0.0979, + "step": 6251 + }, + { + "epoch": 1.1954110898661567, + "grad_norm": 1.8188303709030151, + "learning_rate": 5e-06, + "loss": 0.088, + "step": 6252 + }, + { + "epoch": 1.1956022944550668, + "grad_norm": 2.2055118083953857, + "learning_rate": 5e-06, + "loss": 0.1996, + "step": 6253 + }, + { + "epoch": 1.1957934990439771, + "grad_norm": 1.0319563150405884, + "learning_rate": 5e-06, + "loss": 0.0685, + "step": 6254 + }, + { + "epoch": 1.1959847036328872, + "grad_norm": 1.0295190811157227, + "learning_rate": 5e-06, + "loss": 0.0491, + "step": 6255 + }, + { + "epoch": 1.1961759082217973, + "grad_norm": 1.7119534015655518, + "learning_rate": 5e-06, + "loss": 0.1958, + "step": 6256 + }, + { + "epoch": 1.1963671128107074, + "grad_norm": 2.6020867824554443, + "learning_rate": 5e-06, + "loss": 0.3227, + "step": 6257 + }, + { + "epoch": 1.1965583173996177, + "grad_norm": 0.8010663390159607, + "learning_rate": 5e-06, + "loss": 0.0648, + "step": 6258 + }, + { + "epoch": 1.1967495219885278, + "grad_norm": 1.836167335510254, + "learning_rate": 5e-06, + "loss": 0.1377, + "step": 6259 + }, + { + "epoch": 1.1969407265774379, + "grad_norm": 2.1391921043395996, + "learning_rate": 5e-06, + "loss": 0.2105, + "step": 6260 + }, + { + "epoch": 1.197131931166348, + "grad_norm": 1.2611051797866821, + "learning_rate": 5e-06, + "loss": 0.0711, + "step": 6261 + }, + { + "epoch": 1.197323135755258, + "grad_norm": 1.799573540687561, + "learning_rate": 5e-06, + "loss": 0.1808, + "step": 6262 + }, + { + "epoch": 1.1975143403441684, + "grad_norm": 1.2242969274520874, + "learning_rate": 5e-06, + "loss": 0.1472, + "step": 6263 + }, + { + "epoch": 1.1977055449330785, + "grad_norm": 1.190840721130371, + "learning_rate": 5e-06, + "loss": 0.0509, + "step": 6264 + }, + { + "epoch": 1.1978967495219885, + "grad_norm": 1.2675566673278809, + "learning_rate": 5e-06, + "loss": 0.0601, + "step": 6265 + }, + { + "epoch": 1.1980879541108986, + "grad_norm": 1.62624990940094, + "learning_rate": 5e-06, + "loss": 0.0855, + "step": 6266 + }, + { + "epoch": 1.1982791586998087, + "grad_norm": 0.9653342366218567, + "learning_rate": 5e-06, + "loss": 0.0298, + "step": 6267 + }, + { + "epoch": 1.198470363288719, + "grad_norm": 2.015363931655884, + "learning_rate": 5e-06, + "loss": 0.1699, + "step": 6268 + }, + { + "epoch": 1.1986615678776291, + "grad_norm": 3.233055591583252, + "learning_rate": 5e-06, + "loss": 0.3099, + "step": 6269 + }, + { + "epoch": 1.1988527724665392, + "grad_norm": 3.2268948554992676, + "learning_rate": 5e-06, + "loss": 0.4068, + "step": 6270 + }, + { + "epoch": 1.1990439770554493, + "grad_norm": 2.4663195610046387, + "learning_rate": 5e-06, + "loss": 0.1524, + "step": 6271 + }, + { + "epoch": 1.1992351816443594, + "grad_norm": 1.4187419414520264, + "learning_rate": 5e-06, + "loss": 0.0471, + "step": 6272 + }, + { + "epoch": 1.1994263862332697, + "grad_norm": 0.7417177557945251, + "learning_rate": 5e-06, + "loss": 0.0204, + "step": 6273 + }, + { + "epoch": 1.1996175908221798, + "grad_norm": 1.6655139923095703, + "learning_rate": 5e-06, + "loss": 0.0895, + "step": 6274 + }, + { + "epoch": 1.1998087954110899, + "grad_norm": 2.4011824131011963, + "learning_rate": 5e-06, + "loss": 0.2601, + "step": 6275 + }, + { + "epoch": 1.2, + "grad_norm": 1.6423674821853638, + "learning_rate": 5e-06, + "loss": 0.1036, + "step": 6276 + }, + { + "epoch": 1.20019120458891, + "grad_norm": 1.1955245733261108, + "learning_rate": 5e-06, + "loss": 0.1115, + "step": 6277 + }, + { + "epoch": 1.2003824091778204, + "grad_norm": 2.667212724685669, + "learning_rate": 5e-06, + "loss": 0.2089, + "step": 6278 + }, + { + "epoch": 1.2005736137667304, + "grad_norm": 1.371573567390442, + "learning_rate": 5e-06, + "loss": 0.0467, + "step": 6279 + }, + { + "epoch": 1.2007648183556405, + "grad_norm": 0.9084519147872925, + "learning_rate": 5e-06, + "loss": 0.0358, + "step": 6280 + }, + { + "epoch": 1.2009560229445506, + "grad_norm": 2.8557240962982178, + "learning_rate": 5e-06, + "loss": 0.3219, + "step": 6281 + }, + { + "epoch": 1.2011472275334607, + "grad_norm": 3.1929121017456055, + "learning_rate": 5e-06, + "loss": 0.369, + "step": 6282 + }, + { + "epoch": 1.201338432122371, + "grad_norm": 1.8422592878341675, + "learning_rate": 5e-06, + "loss": 0.1406, + "step": 6283 + }, + { + "epoch": 1.201529636711281, + "grad_norm": 1.644729733467102, + "learning_rate": 5e-06, + "loss": 0.0833, + "step": 6284 + }, + { + "epoch": 1.2017208413001912, + "grad_norm": 1.2755012512207031, + "learning_rate": 5e-06, + "loss": 0.0553, + "step": 6285 + }, + { + "epoch": 1.2019120458891013, + "grad_norm": 0.9884834885597229, + "learning_rate": 5e-06, + "loss": 0.034, + "step": 6286 + }, + { + "epoch": 1.2021032504780114, + "grad_norm": 1.7062479257583618, + "learning_rate": 5e-06, + "loss": 0.1823, + "step": 6287 + }, + { + "epoch": 1.2022944550669217, + "grad_norm": 1.1571954488754272, + "learning_rate": 5e-06, + "loss": 0.0542, + "step": 6288 + }, + { + "epoch": 1.2024856596558318, + "grad_norm": 2.0687966346740723, + "learning_rate": 5e-06, + "loss": 0.2359, + "step": 6289 + }, + { + "epoch": 1.2026768642447419, + "grad_norm": 1.4236472845077515, + "learning_rate": 5e-06, + "loss": 0.0533, + "step": 6290 + }, + { + "epoch": 1.202868068833652, + "grad_norm": 1.979692816734314, + "learning_rate": 5e-06, + "loss": 0.0762, + "step": 6291 + }, + { + "epoch": 1.203059273422562, + "grad_norm": 0.6412269473075867, + "learning_rate": 5e-06, + "loss": 0.0188, + "step": 6292 + }, + { + "epoch": 1.2032504780114723, + "grad_norm": 2.0749592781066895, + "learning_rate": 5e-06, + "loss": 0.1959, + "step": 6293 + }, + { + "epoch": 1.2034416826003824, + "grad_norm": 1.9312831163406372, + "learning_rate": 5e-06, + "loss": 0.1836, + "step": 6294 + }, + { + "epoch": 1.2036328871892925, + "grad_norm": 1.3121813535690308, + "learning_rate": 5e-06, + "loss": 0.0944, + "step": 6295 + }, + { + "epoch": 1.2038240917782026, + "grad_norm": 1.8166366815567017, + "learning_rate": 5e-06, + "loss": 0.1763, + "step": 6296 + }, + { + "epoch": 1.2040152963671127, + "grad_norm": 2.4472837448120117, + "learning_rate": 5e-06, + "loss": 0.2342, + "step": 6297 + }, + { + "epoch": 1.204206500956023, + "grad_norm": 1.2451283931732178, + "learning_rate": 5e-06, + "loss": 0.0518, + "step": 6298 + }, + { + "epoch": 1.204397705544933, + "grad_norm": 1.7306233644485474, + "learning_rate": 5e-06, + "loss": 0.0941, + "step": 6299 + }, + { + "epoch": 1.2045889101338432, + "grad_norm": 1.8587627410888672, + "learning_rate": 5e-06, + "loss": 0.1325, + "step": 6300 + }, + { + "epoch": 1.2047801147227533, + "grad_norm": 1.9993771314620972, + "learning_rate": 5e-06, + "loss": 0.2328, + "step": 6301 + }, + { + "epoch": 1.2049713193116636, + "grad_norm": 1.5691461563110352, + "learning_rate": 5e-06, + "loss": 0.1121, + "step": 6302 + }, + { + "epoch": 1.2051625239005737, + "grad_norm": 1.0496647357940674, + "learning_rate": 5e-06, + "loss": 0.0498, + "step": 6303 + }, + { + "epoch": 1.2053537284894837, + "grad_norm": 6.206069469451904, + "learning_rate": 5e-06, + "loss": 0.0789, + "step": 6304 + }, + { + "epoch": 1.2055449330783938, + "grad_norm": 1.0517714023590088, + "learning_rate": 5e-06, + "loss": 0.0533, + "step": 6305 + }, + { + "epoch": 1.205736137667304, + "grad_norm": 1.230154275894165, + "learning_rate": 5e-06, + "loss": 0.0728, + "step": 6306 + }, + { + "epoch": 1.2059273422562142, + "grad_norm": 1.2463685274124146, + "learning_rate": 5e-06, + "loss": 0.0973, + "step": 6307 + }, + { + "epoch": 1.2061185468451243, + "grad_norm": 2.0279324054718018, + "learning_rate": 5e-06, + "loss": 0.1416, + "step": 6308 + }, + { + "epoch": 1.2063097514340344, + "grad_norm": 1.0040415525436401, + "learning_rate": 5e-06, + "loss": 0.0787, + "step": 6309 + }, + { + "epoch": 1.2065009560229445, + "grad_norm": 0.8246625065803528, + "learning_rate": 5e-06, + "loss": 0.0398, + "step": 6310 + }, + { + "epoch": 1.2066921606118548, + "grad_norm": 1.2535613775253296, + "learning_rate": 5e-06, + "loss": 0.0478, + "step": 6311 + }, + { + "epoch": 1.206883365200765, + "grad_norm": 1.6264280080795288, + "learning_rate": 5e-06, + "loss": 0.0806, + "step": 6312 + }, + { + "epoch": 1.207074569789675, + "grad_norm": 2.006242275238037, + "learning_rate": 5e-06, + "loss": 0.1946, + "step": 6313 + }, + { + "epoch": 1.207265774378585, + "grad_norm": 1.1853926181793213, + "learning_rate": 5e-06, + "loss": 0.0983, + "step": 6314 + }, + { + "epoch": 1.2074569789674952, + "grad_norm": 1.7839405536651611, + "learning_rate": 5e-06, + "loss": 0.1751, + "step": 6315 + }, + { + "epoch": 1.2076481835564055, + "grad_norm": 1.2711913585662842, + "learning_rate": 5e-06, + "loss": 0.0652, + "step": 6316 + }, + { + "epoch": 1.2078393881453156, + "grad_norm": 3.0699965953826904, + "learning_rate": 5e-06, + "loss": 0.0587, + "step": 6317 + }, + { + "epoch": 1.2080305927342256, + "grad_norm": 1.580519199371338, + "learning_rate": 5e-06, + "loss": 0.1163, + "step": 6318 + }, + { + "epoch": 1.2082217973231357, + "grad_norm": 1.9091516733169556, + "learning_rate": 5e-06, + "loss": 0.2536, + "step": 6319 + }, + { + "epoch": 1.2084130019120458, + "grad_norm": 1.5716497898101807, + "learning_rate": 5e-06, + "loss": 0.1696, + "step": 6320 + }, + { + "epoch": 1.2086042065009561, + "grad_norm": 1.1605068445205688, + "learning_rate": 5e-06, + "loss": 0.0474, + "step": 6321 + }, + { + "epoch": 1.2087954110898662, + "grad_norm": 1.2060127258300781, + "learning_rate": 5e-06, + "loss": 0.0683, + "step": 6322 + }, + { + "epoch": 1.2089866156787763, + "grad_norm": 1.9277080297470093, + "learning_rate": 5e-06, + "loss": 0.196, + "step": 6323 + }, + { + "epoch": 1.2091778202676864, + "grad_norm": 2.57090163230896, + "learning_rate": 5e-06, + "loss": 0.0813, + "step": 6324 + }, + { + "epoch": 1.2093690248565965, + "grad_norm": 2.1226699352264404, + "learning_rate": 5e-06, + "loss": 0.249, + "step": 6325 + }, + { + "epoch": 1.2095602294455068, + "grad_norm": 1.5362803936004639, + "learning_rate": 5e-06, + "loss": 0.183, + "step": 6326 + }, + { + "epoch": 1.2097514340344169, + "grad_norm": 2.6527233123779297, + "learning_rate": 5e-06, + "loss": 0.2657, + "step": 6327 + }, + { + "epoch": 1.209942638623327, + "grad_norm": 1.9042359590530396, + "learning_rate": 5e-06, + "loss": 0.1758, + "step": 6328 + }, + { + "epoch": 1.210133843212237, + "grad_norm": 0.6970901489257812, + "learning_rate": 5e-06, + "loss": 0.0262, + "step": 6329 + }, + { + "epoch": 1.2103250478011471, + "grad_norm": 1.6701407432556152, + "learning_rate": 5e-06, + "loss": 0.0508, + "step": 6330 + }, + { + "epoch": 1.2105162523900574, + "grad_norm": 2.219998836517334, + "learning_rate": 5e-06, + "loss": 0.376, + "step": 6331 + }, + { + "epoch": 1.2107074569789675, + "grad_norm": 1.266753077507019, + "learning_rate": 5e-06, + "loss": 0.078, + "step": 6332 + }, + { + "epoch": 1.2108986615678776, + "grad_norm": 1.639849066734314, + "learning_rate": 5e-06, + "loss": 0.1245, + "step": 6333 + }, + { + "epoch": 1.2110898661567877, + "grad_norm": 1.036638855934143, + "learning_rate": 5e-06, + "loss": 0.043, + "step": 6334 + }, + { + "epoch": 1.2112810707456978, + "grad_norm": 1.1263424158096313, + "learning_rate": 5e-06, + "loss": 0.0321, + "step": 6335 + }, + { + "epoch": 1.211472275334608, + "grad_norm": 1.8749959468841553, + "learning_rate": 5e-06, + "loss": 0.0971, + "step": 6336 + }, + { + "epoch": 1.2116634799235182, + "grad_norm": 0.9465709924697876, + "learning_rate": 5e-06, + "loss": 0.0425, + "step": 6337 + }, + { + "epoch": 1.2118546845124283, + "grad_norm": 2.4710988998413086, + "learning_rate": 5e-06, + "loss": 0.274, + "step": 6338 + }, + { + "epoch": 1.2120458891013384, + "grad_norm": 1.7354429960250854, + "learning_rate": 5e-06, + "loss": 0.1027, + "step": 6339 + }, + { + "epoch": 1.2122370936902485, + "grad_norm": 1.3433597087860107, + "learning_rate": 5e-06, + "loss": 0.0579, + "step": 6340 + }, + { + "epoch": 1.2124282982791588, + "grad_norm": 1.5866605043411255, + "learning_rate": 5e-06, + "loss": 0.0936, + "step": 6341 + }, + { + "epoch": 1.2126195028680689, + "grad_norm": 2.422170400619507, + "learning_rate": 5e-06, + "loss": 0.0584, + "step": 6342 + }, + { + "epoch": 1.212810707456979, + "grad_norm": 2.6649551391601562, + "learning_rate": 5e-06, + "loss": 0.1525, + "step": 6343 + }, + { + "epoch": 1.213001912045889, + "grad_norm": 1.7258577346801758, + "learning_rate": 5e-06, + "loss": 0.1976, + "step": 6344 + }, + { + "epoch": 1.2131931166347991, + "grad_norm": 2.841470241546631, + "learning_rate": 5e-06, + "loss": 0.3982, + "step": 6345 + }, + { + "epoch": 1.2133843212237094, + "grad_norm": 1.485338807106018, + "learning_rate": 5e-06, + "loss": 0.044, + "step": 6346 + }, + { + "epoch": 1.2135755258126195, + "grad_norm": 1.383081316947937, + "learning_rate": 5e-06, + "loss": 0.2076, + "step": 6347 + }, + { + "epoch": 1.2137667304015296, + "grad_norm": 0.8841148018836975, + "learning_rate": 5e-06, + "loss": 0.0334, + "step": 6348 + }, + { + "epoch": 1.2139579349904397, + "grad_norm": 1.4105879068374634, + "learning_rate": 5e-06, + "loss": 0.043, + "step": 6349 + }, + { + "epoch": 1.21414913957935, + "grad_norm": 2.799672842025757, + "learning_rate": 5e-06, + "loss": 0.2811, + "step": 6350 + }, + { + "epoch": 1.21434034416826, + "grad_norm": 1.106540322303772, + "learning_rate": 5e-06, + "loss": 0.0508, + "step": 6351 + }, + { + "epoch": 1.2145315487571702, + "grad_norm": 1.1798187494277954, + "learning_rate": 5e-06, + "loss": 0.071, + "step": 6352 + }, + { + "epoch": 1.2147227533460803, + "grad_norm": 1.4219733476638794, + "learning_rate": 5e-06, + "loss": 0.0609, + "step": 6353 + }, + { + "epoch": 1.2149139579349904, + "grad_norm": 0.49215996265411377, + "learning_rate": 5e-06, + "loss": 0.013, + "step": 6354 + }, + { + "epoch": 1.2151051625239007, + "grad_norm": 1.4540727138519287, + "learning_rate": 5e-06, + "loss": 0.066, + "step": 6355 + }, + { + "epoch": 1.2152963671128107, + "grad_norm": 2.5020601749420166, + "learning_rate": 5e-06, + "loss": 0.16, + "step": 6356 + }, + { + "epoch": 1.2154875717017208, + "grad_norm": 1.5029916763305664, + "learning_rate": 5e-06, + "loss": 0.1123, + "step": 6357 + }, + { + "epoch": 1.215678776290631, + "grad_norm": 1.260717749595642, + "learning_rate": 5e-06, + "loss": 0.0678, + "step": 6358 + }, + { + "epoch": 1.215869980879541, + "grad_norm": 1.8185780048370361, + "learning_rate": 5e-06, + "loss": 0.1582, + "step": 6359 + }, + { + "epoch": 1.2160611854684513, + "grad_norm": 0.7864851951599121, + "learning_rate": 5e-06, + "loss": 0.0319, + "step": 6360 + }, + { + "epoch": 1.2162523900573614, + "grad_norm": 1.0918238162994385, + "learning_rate": 5e-06, + "loss": 0.0447, + "step": 6361 + }, + { + "epoch": 1.2164435946462715, + "grad_norm": 2.916642427444458, + "learning_rate": 5e-06, + "loss": 0.2256, + "step": 6362 + }, + { + "epoch": 1.2166347992351816, + "grad_norm": 2.603426218032837, + "learning_rate": 5e-06, + "loss": 0.1929, + "step": 6363 + }, + { + "epoch": 1.216826003824092, + "grad_norm": 1.888663411140442, + "learning_rate": 5e-06, + "loss": 0.1862, + "step": 6364 + }, + { + "epoch": 1.217017208413002, + "grad_norm": 5.557328701019287, + "learning_rate": 5e-06, + "loss": 0.1513, + "step": 6365 + }, + { + "epoch": 1.217208413001912, + "grad_norm": 0.6387971043586731, + "learning_rate": 5e-06, + "loss": 0.0625, + "step": 6366 + }, + { + "epoch": 1.2173996175908222, + "grad_norm": 0.80973881483078, + "learning_rate": 5e-06, + "loss": 0.0612, + "step": 6367 + }, + { + "epoch": 1.2175908221797322, + "grad_norm": 1.4961767196655273, + "learning_rate": 5e-06, + "loss": 0.0794, + "step": 6368 + }, + { + "epoch": 1.2177820267686426, + "grad_norm": 1.3402931690216064, + "learning_rate": 5e-06, + "loss": 0.1085, + "step": 6369 + }, + { + "epoch": 1.2179732313575526, + "grad_norm": 1.615916132926941, + "learning_rate": 5e-06, + "loss": 0.1275, + "step": 6370 + }, + { + "epoch": 1.2181644359464627, + "grad_norm": 0.9041286110877991, + "learning_rate": 5e-06, + "loss": 0.0461, + "step": 6371 + }, + { + "epoch": 1.2183556405353728, + "grad_norm": 1.5138685703277588, + "learning_rate": 5e-06, + "loss": 0.0752, + "step": 6372 + }, + { + "epoch": 1.218546845124283, + "grad_norm": 2.4668264389038086, + "learning_rate": 5e-06, + "loss": 0.1996, + "step": 6373 + }, + { + "epoch": 1.2187380497131932, + "grad_norm": 1.754960060119629, + "learning_rate": 5e-06, + "loss": 0.0677, + "step": 6374 + }, + { + "epoch": 1.2189292543021033, + "grad_norm": 1.3340129852294922, + "learning_rate": 5e-06, + "loss": 0.0818, + "step": 6375 + }, + { + "epoch": 1.2191204588910134, + "grad_norm": 2.1007907390594482, + "learning_rate": 5e-06, + "loss": 0.1689, + "step": 6376 + }, + { + "epoch": 1.2193116634799235, + "grad_norm": 2.982419729232788, + "learning_rate": 5e-06, + "loss": 0.217, + "step": 6377 + }, + { + "epoch": 1.2195028680688336, + "grad_norm": 0.9394291043281555, + "learning_rate": 5e-06, + "loss": 0.079, + "step": 6378 + }, + { + "epoch": 1.2196940726577439, + "grad_norm": 0.9420931935310364, + "learning_rate": 5e-06, + "loss": 0.0518, + "step": 6379 + }, + { + "epoch": 1.219885277246654, + "grad_norm": 1.3374733924865723, + "learning_rate": 5e-06, + "loss": 0.0267, + "step": 6380 + }, + { + "epoch": 1.220076481835564, + "grad_norm": 3.1424825191497803, + "learning_rate": 5e-06, + "loss": 0.2596, + "step": 6381 + }, + { + "epoch": 1.2202676864244741, + "grad_norm": 1.0249494314193726, + "learning_rate": 5e-06, + "loss": 0.0696, + "step": 6382 + }, + { + "epoch": 1.2204588910133842, + "grad_norm": 1.8867278099060059, + "learning_rate": 5e-06, + "loss": 0.0802, + "step": 6383 + }, + { + "epoch": 1.2206500956022945, + "grad_norm": 1.559537649154663, + "learning_rate": 5e-06, + "loss": 0.0624, + "step": 6384 + }, + { + "epoch": 1.2208413001912046, + "grad_norm": 0.673812747001648, + "learning_rate": 5e-06, + "loss": 0.0599, + "step": 6385 + }, + { + "epoch": 1.2210325047801147, + "grad_norm": 0.6307937502861023, + "learning_rate": 5e-06, + "loss": 0.0229, + "step": 6386 + }, + { + "epoch": 1.2212237093690248, + "grad_norm": 1.3093122243881226, + "learning_rate": 5e-06, + "loss": 0.1147, + "step": 6387 + }, + { + "epoch": 1.221414913957935, + "grad_norm": 2.759263277053833, + "learning_rate": 5e-06, + "loss": 0.4477, + "step": 6388 + }, + { + "epoch": 1.2216061185468452, + "grad_norm": 2.4608709812164307, + "learning_rate": 5e-06, + "loss": 0.313, + "step": 6389 + }, + { + "epoch": 1.2217973231357553, + "grad_norm": 1.8315072059631348, + "learning_rate": 5e-06, + "loss": 0.089, + "step": 6390 + }, + { + "epoch": 1.2219885277246654, + "grad_norm": 0.7214317321777344, + "learning_rate": 5e-06, + "loss": 0.0515, + "step": 6391 + }, + { + "epoch": 1.2221797323135755, + "grad_norm": 0.8006981611251831, + "learning_rate": 5e-06, + "loss": 0.0421, + "step": 6392 + }, + { + "epoch": 1.2223709369024855, + "grad_norm": 1.7554163932800293, + "learning_rate": 5e-06, + "loss": 0.147, + "step": 6393 + }, + { + "epoch": 1.2225621414913959, + "grad_norm": 2.516019344329834, + "learning_rate": 5e-06, + "loss": 0.3259, + "step": 6394 + }, + { + "epoch": 1.222753346080306, + "grad_norm": 1.6270321607589722, + "learning_rate": 5e-06, + "loss": 0.0723, + "step": 6395 + }, + { + "epoch": 1.222944550669216, + "grad_norm": 0.9091405272483826, + "learning_rate": 5e-06, + "loss": 0.1058, + "step": 6396 + }, + { + "epoch": 1.2231357552581261, + "grad_norm": 0.7934021353721619, + "learning_rate": 5e-06, + "loss": 0.0766, + "step": 6397 + }, + { + "epoch": 1.2233269598470362, + "grad_norm": 0.9105468988418579, + "learning_rate": 5e-06, + "loss": 0.0267, + "step": 6398 + }, + { + "epoch": 1.2235181644359465, + "grad_norm": 1.7011659145355225, + "learning_rate": 5e-06, + "loss": 0.1063, + "step": 6399 + }, + { + "epoch": 1.2237093690248566, + "grad_norm": 1.6316510438919067, + "learning_rate": 5e-06, + "loss": 0.1647, + "step": 6400 + }, + { + "epoch": 1.2239005736137667, + "grad_norm": 1.9868152141571045, + "learning_rate": 5e-06, + "loss": 0.1778, + "step": 6401 + }, + { + "epoch": 1.2240917782026768, + "grad_norm": 1.912008285522461, + "learning_rate": 5e-06, + "loss": 0.1873, + "step": 6402 + }, + { + "epoch": 1.224282982791587, + "grad_norm": 1.5919640064239502, + "learning_rate": 5e-06, + "loss": 0.1538, + "step": 6403 + }, + { + "epoch": 1.2244741873804972, + "grad_norm": 1.775577425956726, + "learning_rate": 5e-06, + "loss": 0.0358, + "step": 6404 + }, + { + "epoch": 1.2246653919694073, + "grad_norm": 0.8517948985099792, + "learning_rate": 5e-06, + "loss": 0.0253, + "step": 6405 + }, + { + "epoch": 1.2248565965583174, + "grad_norm": 2.3561553955078125, + "learning_rate": 5e-06, + "loss": 0.2985, + "step": 6406 + }, + { + "epoch": 1.2250478011472274, + "grad_norm": 1.6562501192092896, + "learning_rate": 5e-06, + "loss": 0.2061, + "step": 6407 + }, + { + "epoch": 1.2252390057361378, + "grad_norm": 1.6132574081420898, + "learning_rate": 5e-06, + "loss": 0.1679, + "step": 6408 + }, + { + "epoch": 1.2254302103250478, + "grad_norm": 1.9934953451156616, + "learning_rate": 5e-06, + "loss": 0.2289, + "step": 6409 + }, + { + "epoch": 1.225621414913958, + "grad_norm": 1.7691314220428467, + "learning_rate": 5e-06, + "loss": 0.1696, + "step": 6410 + }, + { + "epoch": 1.225812619502868, + "grad_norm": 1.1167519092559814, + "learning_rate": 5e-06, + "loss": 0.0377, + "step": 6411 + }, + { + "epoch": 1.2260038240917783, + "grad_norm": 1.710638165473938, + "learning_rate": 5e-06, + "loss": 0.2259, + "step": 6412 + }, + { + "epoch": 1.2261950286806884, + "grad_norm": 2.857348680496216, + "learning_rate": 5e-06, + "loss": 0.442, + "step": 6413 + }, + { + "epoch": 1.2263862332695985, + "grad_norm": 2.343129873275757, + "learning_rate": 5e-06, + "loss": 0.2763, + "step": 6414 + }, + { + "epoch": 1.2265774378585086, + "grad_norm": 0.8242167830467224, + "learning_rate": 5e-06, + "loss": 0.0545, + "step": 6415 + }, + { + "epoch": 1.2267686424474187, + "grad_norm": 1.5538809299468994, + "learning_rate": 5e-06, + "loss": 0.0966, + "step": 6416 + }, + { + "epoch": 1.226959847036329, + "grad_norm": 1.174586534500122, + "learning_rate": 5e-06, + "loss": 0.0375, + "step": 6417 + }, + { + "epoch": 1.227151051625239, + "grad_norm": 2.826963186264038, + "learning_rate": 5e-06, + "loss": 0.3052, + "step": 6418 + }, + { + "epoch": 1.2273422562141492, + "grad_norm": 2.5396435260772705, + "learning_rate": 5e-06, + "loss": 0.2724, + "step": 6419 + }, + { + "epoch": 1.2275334608030593, + "grad_norm": 2.1552212238311768, + "learning_rate": 5e-06, + "loss": 0.2733, + "step": 6420 + }, + { + "epoch": 1.2277246653919693, + "grad_norm": 2.812382936477661, + "learning_rate": 5e-06, + "loss": 0.2159, + "step": 6421 + }, + { + "epoch": 1.2279158699808796, + "grad_norm": 1.1729477643966675, + "learning_rate": 5e-06, + "loss": 0.1066, + "step": 6422 + }, + { + "epoch": 1.2281070745697897, + "grad_norm": 1.4861737489700317, + "learning_rate": 5e-06, + "loss": 0.1204, + "step": 6423 + }, + { + "epoch": 1.2282982791586998, + "grad_norm": 0.9737563729286194, + "learning_rate": 5e-06, + "loss": 0.0359, + "step": 6424 + }, + { + "epoch": 1.22848948374761, + "grad_norm": 2.244095802307129, + "learning_rate": 5e-06, + "loss": 0.3383, + "step": 6425 + }, + { + "epoch": 1.22868068833652, + "grad_norm": 1.4030261039733887, + "learning_rate": 5e-06, + "loss": 0.0753, + "step": 6426 + }, + { + "epoch": 1.2288718929254303, + "grad_norm": 1.439172625541687, + "learning_rate": 5e-06, + "loss": 0.0642, + "step": 6427 + }, + { + "epoch": 1.2290630975143404, + "grad_norm": 0.6380961537361145, + "learning_rate": 5e-06, + "loss": 0.0633, + "step": 6428 + }, + { + "epoch": 1.2292543021032505, + "grad_norm": 0.9303730726242065, + "learning_rate": 5e-06, + "loss": 0.0676, + "step": 6429 + }, + { + "epoch": 1.2294455066921606, + "grad_norm": 1.2828363180160522, + "learning_rate": 5e-06, + "loss": 0.0468, + "step": 6430 + }, + { + "epoch": 1.2296367112810707, + "grad_norm": 2.6689090728759766, + "learning_rate": 5e-06, + "loss": 0.3648, + "step": 6431 + }, + { + "epoch": 1.229827915869981, + "grad_norm": 2.0346622467041016, + "learning_rate": 5e-06, + "loss": 0.2261, + "step": 6432 + }, + { + "epoch": 1.230019120458891, + "grad_norm": 1.939487338066101, + "learning_rate": 5e-06, + "loss": 0.193, + "step": 6433 + }, + { + "epoch": 1.2302103250478011, + "grad_norm": 1.3966894149780273, + "learning_rate": 5e-06, + "loss": 0.0372, + "step": 6434 + }, + { + "epoch": 1.2304015296367112, + "grad_norm": 1.1000654697418213, + "learning_rate": 5e-06, + "loss": 0.0814, + "step": 6435 + }, + { + "epoch": 1.2305927342256213, + "grad_norm": 4.047434329986572, + "learning_rate": 5e-06, + "loss": 0.1489, + "step": 6436 + }, + { + "epoch": 1.2307839388145316, + "grad_norm": 1.382005214691162, + "learning_rate": 5e-06, + "loss": 0.0809, + "step": 6437 + }, + { + "epoch": 1.2309751434034417, + "grad_norm": 1.00590181350708, + "learning_rate": 5e-06, + "loss": 0.0726, + "step": 6438 + }, + { + "epoch": 1.2311663479923518, + "grad_norm": 1.4503659009933472, + "learning_rate": 5e-06, + "loss": 0.1095, + "step": 6439 + }, + { + "epoch": 1.231357552581262, + "grad_norm": 1.0048933029174805, + "learning_rate": 5e-06, + "loss": 0.0531, + "step": 6440 + }, + { + "epoch": 1.231548757170172, + "grad_norm": 1.0850677490234375, + "learning_rate": 5e-06, + "loss": 0.0485, + "step": 6441 + }, + { + "epoch": 1.2317399617590823, + "grad_norm": 2.6655921936035156, + "learning_rate": 5e-06, + "loss": 0.1078, + "step": 6442 + }, + { + "epoch": 1.2319311663479924, + "grad_norm": 0.8888630867004395, + "learning_rate": 5e-06, + "loss": 0.0555, + "step": 6443 + }, + { + "epoch": 1.2321223709369025, + "grad_norm": 2.1980793476104736, + "learning_rate": 5e-06, + "loss": 0.3322, + "step": 6444 + }, + { + "epoch": 1.2323135755258126, + "grad_norm": 1.1009891033172607, + "learning_rate": 5e-06, + "loss": 0.0593, + "step": 6445 + }, + { + "epoch": 1.2325047801147226, + "grad_norm": 2.154589891433716, + "learning_rate": 5e-06, + "loss": 0.1893, + "step": 6446 + }, + { + "epoch": 1.232695984703633, + "grad_norm": 2.7047064304351807, + "learning_rate": 5e-06, + "loss": 0.178, + "step": 6447 + }, + { + "epoch": 1.232887189292543, + "grad_norm": 1.372484803199768, + "learning_rate": 5e-06, + "loss": 0.0953, + "step": 6448 + }, + { + "epoch": 1.2330783938814531, + "grad_norm": 1.1343607902526855, + "learning_rate": 5e-06, + "loss": 0.0352, + "step": 6449 + }, + { + "epoch": 1.2332695984703632, + "grad_norm": 2.3495070934295654, + "learning_rate": 5e-06, + "loss": 0.2928, + "step": 6450 + }, + { + "epoch": 1.2334608030592733, + "grad_norm": 1.3010920286178589, + "learning_rate": 5e-06, + "loss": 0.0774, + "step": 6451 + }, + { + "epoch": 1.2336520076481836, + "grad_norm": 1.11711585521698, + "learning_rate": 5e-06, + "loss": 0.0818, + "step": 6452 + }, + { + "epoch": 1.2338432122370937, + "grad_norm": 2.2713661193847656, + "learning_rate": 5e-06, + "loss": 0.1201, + "step": 6453 + }, + { + "epoch": 1.2340344168260038, + "grad_norm": 1.1702678203582764, + "learning_rate": 5e-06, + "loss": 0.0606, + "step": 6454 + }, + { + "epoch": 1.2342256214149139, + "grad_norm": 1.4318171739578247, + "learning_rate": 5e-06, + "loss": 0.0504, + "step": 6455 + }, + { + "epoch": 1.2344168260038242, + "grad_norm": 1.7486087083816528, + "learning_rate": 5e-06, + "loss": 0.209, + "step": 6456 + }, + { + "epoch": 1.2346080305927343, + "grad_norm": 2.538674831390381, + "learning_rate": 5e-06, + "loss": 0.2618, + "step": 6457 + }, + { + "epoch": 1.2347992351816444, + "grad_norm": 1.6931205987930298, + "learning_rate": 5e-06, + "loss": 0.1153, + "step": 6458 + }, + { + "epoch": 1.2349904397705544, + "grad_norm": 2.5916154384613037, + "learning_rate": 5e-06, + "loss": 0.2899, + "step": 6459 + }, + { + "epoch": 1.2351816443594645, + "grad_norm": 0.5500595569610596, + "learning_rate": 5e-06, + "loss": 0.0244, + "step": 6460 + }, + { + "epoch": 1.2353728489483748, + "grad_norm": 1.0323785543441772, + "learning_rate": 5e-06, + "loss": 0.0267, + "step": 6461 + }, + { + "epoch": 1.235564053537285, + "grad_norm": 2.1304259300231934, + "learning_rate": 5e-06, + "loss": 0.2838, + "step": 6462 + }, + { + "epoch": 1.235755258126195, + "grad_norm": 1.0863720178604126, + "learning_rate": 5e-06, + "loss": 0.0669, + "step": 6463 + }, + { + "epoch": 1.235946462715105, + "grad_norm": 1.7608684301376343, + "learning_rate": 5e-06, + "loss": 0.1987, + "step": 6464 + }, + { + "epoch": 1.2361376673040154, + "grad_norm": 2.235177516937256, + "learning_rate": 5e-06, + "loss": 0.2417, + "step": 6465 + }, + { + "epoch": 1.2363288718929255, + "grad_norm": 0.9923659563064575, + "learning_rate": 5e-06, + "loss": 0.0576, + "step": 6466 + }, + { + "epoch": 1.2365200764818356, + "grad_norm": 0.47206011414527893, + "learning_rate": 5e-06, + "loss": 0.0123, + "step": 6467 + }, + { + "epoch": 1.2367112810707457, + "grad_norm": 1.7471652030944824, + "learning_rate": 5e-06, + "loss": 0.1048, + "step": 6468 + }, + { + "epoch": 1.2369024856596558, + "grad_norm": 2.4238178730010986, + "learning_rate": 5e-06, + "loss": 0.3712, + "step": 6469 + }, + { + "epoch": 1.237093690248566, + "grad_norm": 1.817383050918579, + "learning_rate": 5e-06, + "loss": 0.173, + "step": 6470 + }, + { + "epoch": 1.2372848948374762, + "grad_norm": 0.883569598197937, + "learning_rate": 5e-06, + "loss": 0.0748, + "step": 6471 + }, + { + "epoch": 1.2374760994263863, + "grad_norm": 1.4984726905822754, + "learning_rate": 5e-06, + "loss": 0.1212, + "step": 6472 + }, + { + "epoch": 1.2376673040152963, + "grad_norm": 1.6612300872802734, + "learning_rate": 5e-06, + "loss": 0.1338, + "step": 6473 + }, + { + "epoch": 1.2378585086042064, + "grad_norm": 1.7457292079925537, + "learning_rate": 5e-06, + "loss": 0.0619, + "step": 6474 + }, + { + "epoch": 1.2380497131931167, + "grad_norm": 2.305072784423828, + "learning_rate": 5e-06, + "loss": 0.2619, + "step": 6475 + }, + { + "epoch": 1.2382409177820268, + "grad_norm": 0.9911337494850159, + "learning_rate": 5e-06, + "loss": 0.0859, + "step": 6476 + }, + { + "epoch": 1.238432122370937, + "grad_norm": 1.6552451848983765, + "learning_rate": 5e-06, + "loss": 0.1726, + "step": 6477 + }, + { + "epoch": 1.238623326959847, + "grad_norm": 2.003192901611328, + "learning_rate": 5e-06, + "loss": 0.1219, + "step": 6478 + }, + { + "epoch": 1.238814531548757, + "grad_norm": 1.888157844543457, + "learning_rate": 5e-06, + "loss": 0.0987, + "step": 6479 + }, + { + "epoch": 1.2390057361376674, + "grad_norm": 1.2396515607833862, + "learning_rate": 5e-06, + "loss": 0.0398, + "step": 6480 + }, + { + "epoch": 1.2391969407265775, + "grad_norm": 0.947607159614563, + "learning_rate": 5e-06, + "loss": 0.0804, + "step": 6481 + }, + { + "epoch": 1.2393881453154876, + "grad_norm": 2.7749950885772705, + "learning_rate": 5e-06, + "loss": 0.1755, + "step": 6482 + }, + { + "epoch": 1.2395793499043977, + "grad_norm": 2.7286288738250732, + "learning_rate": 5e-06, + "loss": 0.3329, + "step": 6483 + }, + { + "epoch": 1.2397705544933078, + "grad_norm": 2.670870065689087, + "learning_rate": 5e-06, + "loss": 0.0679, + "step": 6484 + }, + { + "epoch": 1.239961759082218, + "grad_norm": 0.9692270159721375, + "learning_rate": 5e-06, + "loss": 0.0543, + "step": 6485 + }, + { + "epoch": 1.2401529636711282, + "grad_norm": 1.525709867477417, + "learning_rate": 5e-06, + "loss": 0.0621, + "step": 6486 + }, + { + "epoch": 1.2403441682600382, + "grad_norm": 2.301687479019165, + "learning_rate": 5e-06, + "loss": 0.1212, + "step": 6487 + }, + { + "epoch": 1.2405353728489483, + "grad_norm": 1.8523778915405273, + "learning_rate": 5e-06, + "loss": 0.1042, + "step": 6488 + }, + { + "epoch": 1.2407265774378584, + "grad_norm": 1.0919690132141113, + "learning_rate": 5e-06, + "loss": 0.0533, + "step": 6489 + }, + { + "epoch": 1.2409177820267687, + "grad_norm": 0.7931455373764038, + "learning_rate": 5e-06, + "loss": 0.0574, + "step": 6490 + }, + { + "epoch": 1.2411089866156788, + "grad_norm": 0.8481590747833252, + "learning_rate": 5e-06, + "loss": 0.0492, + "step": 6491 + }, + { + "epoch": 1.241300191204589, + "grad_norm": 1.4083970785140991, + "learning_rate": 5e-06, + "loss": 0.0413, + "step": 6492 + }, + { + "epoch": 1.241491395793499, + "grad_norm": 0.9851438999176025, + "learning_rate": 5e-06, + "loss": 0.0501, + "step": 6493 + }, + { + "epoch": 1.241682600382409, + "grad_norm": 2.721176862716675, + "learning_rate": 5e-06, + "loss": 0.4178, + "step": 6494 + }, + { + "epoch": 1.2418738049713194, + "grad_norm": 1.2428650856018066, + "learning_rate": 5e-06, + "loss": 0.1112, + "step": 6495 + }, + { + "epoch": 1.2420650095602295, + "grad_norm": 3.327238082885742, + "learning_rate": 5e-06, + "loss": 0.3819, + "step": 6496 + }, + { + "epoch": 1.2422562141491396, + "grad_norm": 1.2107603549957275, + "learning_rate": 5e-06, + "loss": 0.0878, + "step": 6497 + }, + { + "epoch": 1.2424474187380496, + "grad_norm": 1.4254592657089233, + "learning_rate": 5e-06, + "loss": 0.0726, + "step": 6498 + }, + { + "epoch": 1.2426386233269597, + "grad_norm": 0.8053144216537476, + "learning_rate": 5e-06, + "loss": 0.0288, + "step": 6499 + }, + { + "epoch": 1.24282982791587, + "grad_norm": 2.3608970642089844, + "learning_rate": 5e-06, + "loss": 0.2719, + "step": 6500 + }, + { + "epoch": 1.24282982791587, + "eval_runtime": 801.4109, + "eval_samples_per_second": 1.914, + "eval_steps_per_second": 0.24, + "step": 6500 + } + ], + "logging_steps": 1.0, + "max_steps": 26150, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.592210007442588e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}