diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,68847 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 9831, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 25.774307163187583, + "learning_rate": 6.779661016949153e-08, + "loss": 2.4261, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 26.21884882896487, + "learning_rate": 1.3559322033898305e-07, + "loss": 2.4695, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 16.482975314694887, + "learning_rate": 2.0338983050847458e-07, + "loss": 1.8327, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 21.85451201104805, + "learning_rate": 2.711864406779661e-07, + "loss": 2.0734, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 23.270952415339217, + "learning_rate": 3.3898305084745766e-07, + "loss": 2.236, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 22.87877942813057, + "learning_rate": 4.0677966101694916e-07, + "loss": 2.2076, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 21.776727913056515, + "learning_rate": 4.745762711864407e-07, + "loss": 2.1196, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 23.25224020480987, + "learning_rate": 5.423728813559322e-07, + "loss": 2.345, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 27.33668116834911, + "learning_rate": 6.101694915254238e-07, + "loss": 2.3076, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 19.181907330178706, + "learning_rate": 6.779661016949153e-07, + "loss": 2.141, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 23.615130567617697, + "learning_rate": 7.457627118644069e-07, + "loss": 2.2639, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 26.579337894350388, + "learning_rate": 8.135593220338983e-07, + "loss": 2.2511, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 23.051745481180966, + "learning_rate": 8.813559322033899e-07, + "loss": 2.0932, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 23.07477432582864, + "learning_rate": 9.491525423728814e-07, + "loss": 2.1369, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 21.56621520785092, + "learning_rate": 1.016949152542373e-06, + "loss": 2.1692, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 21.349853220880675, + "learning_rate": 1.0847457627118644e-06, + "loss": 1.9588, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 13.978393855508065, + "learning_rate": 1.152542372881356e-06, + "loss": 1.7003, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 15.129076262634308, + "learning_rate": 1.2203389830508477e-06, + "loss": 1.9801, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 13.20744557420552, + "learning_rate": 1.288135593220339e-06, + "loss": 1.7939, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 11.203685104742611, + "learning_rate": 1.3559322033898307e-06, + "loss": 1.5948, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 10.654869222087148, + "learning_rate": 1.4237288135593222e-06, + "loss": 1.6056, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 11.814571724474261, + "learning_rate": 1.4915254237288139e-06, + "loss": 1.7941, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 8.530069326697427, + "learning_rate": 1.5593220338983054e-06, + "loss": 1.2375, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 10.32660567477036, + "learning_rate": 1.6271186440677967e-06, + "loss": 1.2946, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 10.990179490365163, + "learning_rate": 1.6949152542372882e-06, + "loss": 1.1305, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 6.380671795900115, + "learning_rate": 1.7627118644067799e-06, + "loss": 0.9932, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 7.844307063285582, + "learning_rate": 1.8305084745762714e-06, + "loss": 1.0589, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 5.994303435945046, + "learning_rate": 1.8983050847457629e-06, + "loss": 0.9073, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 6.551427417951352, + "learning_rate": 1.9661016949152544e-06, + "loss": 1.0221, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 5.42154176934472, + "learning_rate": 2.033898305084746e-06, + "loss": 0.9045, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 3.6207875594724706, + "learning_rate": 2.1016949152542374e-06, + "loss": 0.8139, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 2.1824465930868917, + "learning_rate": 2.169491525423729e-06, + "loss": 0.669, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 2.706012292777276, + "learning_rate": 2.2372881355932204e-06, + "loss": 0.8022, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 3.8251403517668487, + "learning_rate": 2.305084745762712e-06, + "loss": 0.8254, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 2.984073023660238, + "learning_rate": 2.372881355932204e-06, + "loss": 0.9779, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 2.4188064951429475, + "learning_rate": 2.4406779661016953e-06, + "loss": 0.7361, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 2.3516733355792843, + "learning_rate": 2.5084745762711864e-06, + "loss": 0.8315, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 2.2496309758042417, + "learning_rate": 2.576271186440678e-06, + "loss": 0.8022, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 2.420808155220749, + "learning_rate": 2.64406779661017e-06, + "loss": 0.8303, + "step": 39 + }, + { + "epoch": 0.0, + "grad_norm": 2.235898259469345, + "learning_rate": 2.7118644067796613e-06, + "loss": 0.8828, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 2.259722995408095, + "learning_rate": 2.779661016949153e-06, + "loss": 0.7345, + "step": 41 + }, + { + "epoch": 0.0, + "grad_norm": 2.5220575903145717, + "learning_rate": 2.8474576271186443e-06, + "loss": 0.8884, + "step": 42 + }, + { + "epoch": 0.0, + "grad_norm": 2.01432745835319, + "learning_rate": 2.915254237288136e-06, + "loss": 0.716, + "step": 43 + }, + { + "epoch": 0.0, + "grad_norm": 2.3912698546304783, + "learning_rate": 2.9830508474576277e-06, + "loss": 0.8186, + "step": 44 + }, + { + "epoch": 0.0, + "grad_norm": 2.2176167549704413, + "learning_rate": 3.0508474576271192e-06, + "loss": 0.8315, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 2.3542559647297248, + "learning_rate": 3.1186440677966107e-06, + "loss": 0.8938, + "step": 46 + }, + { + "epoch": 0.0, + "grad_norm": 2.408850666287898, + "learning_rate": 3.186440677966102e-06, + "loss": 0.7443, + "step": 47 + }, + { + "epoch": 0.0, + "grad_norm": 1.9403993418379064, + "learning_rate": 3.2542372881355933e-06, + "loss": 0.7316, + "step": 48 + }, + { + "epoch": 0.0, + "grad_norm": 2.2266120044077855, + "learning_rate": 3.322033898305085e-06, + "loss": 0.823, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 2.2149796287947052, + "learning_rate": 3.3898305084745763e-06, + "loss": 0.8493, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 1.964248261933294, + "learning_rate": 3.457627118644068e-06, + "loss": 0.7657, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 2.1369395484319402, + "learning_rate": 3.5254237288135597e-06, + "loss": 0.7717, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 1.9324624073386334, + "learning_rate": 3.5932203389830512e-06, + "loss": 0.7885, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 1.7645971001845546, + "learning_rate": 3.6610169491525427e-06, + "loss": 0.6827, + "step": 54 + }, + { + "epoch": 0.01, + "grad_norm": 1.9794442176736449, + "learning_rate": 3.7288135593220342e-06, + "loss": 0.7341, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 2.205253413175247, + "learning_rate": 3.7966101694915257e-06, + "loss": 0.8477, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 1.9368774918084077, + "learning_rate": 3.864406779661018e-06, + "loss": 0.7182, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 1.8331552352287894, + "learning_rate": 3.932203389830509e-06, + "loss": 0.6232, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 2.1664795950928943, + "learning_rate": 4.000000000000001e-06, + "loss": 0.7919, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 1.9510344143430243, + "learning_rate": 4.067796610169492e-06, + "loss": 0.7199, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 2.1394368935709833, + "learning_rate": 4.135593220338983e-06, + "loss": 0.846, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 1.9579721708838056, + "learning_rate": 4.203389830508475e-06, + "loss": 0.7141, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 2.257998109160227, + "learning_rate": 4.271186440677967e-06, + "loss": 0.7374, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 1.9561971563274394, + "learning_rate": 4.338983050847458e-06, + "loss": 0.6967, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 1.961780054829146, + "learning_rate": 4.40677966101695e-06, + "loss": 0.8134, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 1.9708755542520096, + "learning_rate": 4.474576271186441e-06, + "loss": 0.6124, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 2.1606992603731294, + "learning_rate": 4.542372881355933e-06, + "loss": 0.729, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 2.0759273723089726, + "learning_rate": 4.610169491525424e-06, + "loss": 0.7483, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 2.19293232912226, + "learning_rate": 4.677966101694916e-06, + "loss": 0.7522, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 1.8359859772491813, + "learning_rate": 4.745762711864408e-06, + "loss": 0.7899, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 2.3845629041250707, + "learning_rate": 4.813559322033899e-06, + "loss": 0.7246, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 2.1085590957497846, + "learning_rate": 4.881355932203391e-06, + "loss": 0.7739, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 1.8918037708327846, + "learning_rate": 4.949152542372882e-06, + "loss": 0.7698, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 2.0067874509675687, + "learning_rate": 5.016949152542373e-06, + "loss": 0.7346, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 2.119502972604516, + "learning_rate": 5.084745762711865e-06, + "loss": 0.8479, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 1.861663786810975, + "learning_rate": 5.152542372881356e-06, + "loss": 0.6784, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 1.9695486836646774, + "learning_rate": 5.220338983050848e-06, + "loss": 0.7107, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 2.171762088984204, + "learning_rate": 5.28813559322034e-06, + "loss": 0.7572, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 2.1380903015319648, + "learning_rate": 5.355932203389831e-06, + "loss": 0.7518, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 2.020115592796084, + "learning_rate": 5.423728813559323e-06, + "loss": 0.7172, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 2.3246993299029914, + "learning_rate": 5.491525423728814e-06, + "loss": 0.7228, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 2.0860284304875973, + "learning_rate": 5.559322033898306e-06, + "loss": 0.7337, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 2.105468963098628, + "learning_rate": 5.6271186440677975e-06, + "loss": 0.8502, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 1.9810307968538619, + "learning_rate": 5.694915254237289e-06, + "loss": 0.7639, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 2.42547864581517, + "learning_rate": 5.7627118644067805e-06, + "loss": 0.8198, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 1.9843324749634157, + "learning_rate": 5.830508474576272e-06, + "loss": 0.6272, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 1.8295596514924568, + "learning_rate": 5.8983050847457635e-06, + "loss": 0.6614, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 2.2018819268618697, + "learning_rate": 5.9661016949152555e-06, + "loss": 0.8396, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 2.014841623849869, + "learning_rate": 6.0338983050847465e-06, + "loss": 0.7825, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 2.1391290710966953, + "learning_rate": 6.1016949152542385e-06, + "loss": 0.6737, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 1.844927353698065, + "learning_rate": 6.1694915254237295e-06, + "loss": 0.7386, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 2.4827254198905573, + "learning_rate": 6.2372881355932215e-06, + "loss": 0.9183, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 2.1865083435662775, + "learning_rate": 6.3050847457627125e-06, + "loss": 0.6657, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 2.22914258959008, + "learning_rate": 6.372881355932204e-06, + "loss": 0.7878, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 2.0465478945502737, + "learning_rate": 6.440677966101695e-06, + "loss": 0.7384, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 2.150182007226782, + "learning_rate": 6.508474576271187e-06, + "loss": 0.7675, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 2.20138713129849, + "learning_rate": 6.576271186440678e-06, + "loss": 0.7957, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 2.1717654881518405, + "learning_rate": 6.64406779661017e-06, + "loss": 0.7656, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 2.1766599305935683, + "learning_rate": 6.7118644067796615e-06, + "loss": 0.8074, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 2.250178867344064, + "learning_rate": 6.779661016949153e-06, + "loss": 0.7486, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 2.1036428186662457, + "learning_rate": 6.8474576271186445e-06, + "loss": 0.7276, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 2.231120104885962, + "learning_rate": 6.915254237288136e-06, + "loss": 0.7694, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 1.8320331250168351, + "learning_rate": 6.9830508474576275e-06, + "loss": 0.7384, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 2.07040534417232, + "learning_rate": 7.0508474576271195e-06, + "loss": 0.6566, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 2.052812058503903, + "learning_rate": 7.1186440677966106e-06, + "loss": 0.761, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 2.1483979394263417, + "learning_rate": 7.1864406779661025e-06, + "loss": 0.7733, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 2.029441020419923, + "learning_rate": 7.2542372881355936e-06, + "loss": 0.6775, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 2.0568628127678794, + "learning_rate": 7.3220338983050855e-06, + "loss": 0.6425, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 2.179569873526594, + "learning_rate": 7.3898305084745766e-06, + "loss": 0.7068, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 1.9555856066898845, + "learning_rate": 7.4576271186440685e-06, + "loss": 0.7595, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 2.3202948797607554, + "learning_rate": 7.52542372881356e-06, + "loss": 0.8602, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 2.0643122915003107, + "learning_rate": 7.5932203389830515e-06, + "loss": 0.791, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 2.2126598533146677, + "learning_rate": 7.661016949152543e-06, + "loss": 0.7064, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 2.4298478880835463, + "learning_rate": 7.728813559322035e-06, + "loss": 0.7869, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 2.288531605902557, + "learning_rate": 7.796610169491526e-06, + "loss": 0.896, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 2.2745017660805855, + "learning_rate": 7.864406779661017e-06, + "loss": 0.6802, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 1.6570737256402928, + "learning_rate": 7.93220338983051e-06, + "loss": 0.6276, + "step": 117 + }, + { + "epoch": 0.01, + "grad_norm": 2.1733978454998923, + "learning_rate": 8.000000000000001e-06, + "loss": 0.7266, + "step": 118 + }, + { + "epoch": 0.01, + "grad_norm": 2.142950183320776, + "learning_rate": 8.067796610169492e-06, + "loss": 0.7994, + "step": 119 + }, + { + "epoch": 0.01, + "grad_norm": 2.1131892760021556, + "learning_rate": 8.135593220338983e-06, + "loss": 0.7306, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 1.9104523566550864, + "learning_rate": 8.203389830508475e-06, + "loss": 0.7925, + "step": 121 + }, + { + "epoch": 0.01, + "grad_norm": 1.9144516822731146, + "learning_rate": 8.271186440677966e-06, + "loss": 0.7236, + "step": 122 + }, + { + "epoch": 0.01, + "grad_norm": 2.1479589283189173, + "learning_rate": 8.338983050847458e-06, + "loss": 0.7482, + "step": 123 + }, + { + "epoch": 0.01, + "grad_norm": 2.154494552349858, + "learning_rate": 8.40677966101695e-06, + "loss": 0.689, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 1.9714017351330468, + "learning_rate": 8.47457627118644e-06, + "loss": 0.7369, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 1.988551787371157, + "learning_rate": 8.542372881355933e-06, + "loss": 0.7643, + "step": 126 + }, + { + "epoch": 0.01, + "grad_norm": 2.325041578076891, + "learning_rate": 8.610169491525424e-06, + "loss": 0.7964, + "step": 127 + }, + { + "epoch": 0.01, + "grad_norm": 2.176814754004798, + "learning_rate": 8.677966101694915e-06, + "loss": 0.7643, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 2.0824812157544645, + "learning_rate": 8.745762711864407e-06, + "loss": 0.7471, + "step": 129 + }, + { + "epoch": 0.01, + "grad_norm": 1.9684242759673636, + "learning_rate": 8.8135593220339e-06, + "loss": 0.8099, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 2.0678745627639197, + "learning_rate": 8.88135593220339e-06, + "loss": 0.8663, + "step": 131 + }, + { + "epoch": 0.01, + "grad_norm": 2.060252010222962, + "learning_rate": 8.949152542372881e-06, + "loss": 0.7509, + "step": 132 + }, + { + "epoch": 0.01, + "grad_norm": 1.8677177304612413, + "learning_rate": 9.016949152542374e-06, + "loss": 0.719, + "step": 133 + }, + { + "epoch": 0.01, + "grad_norm": 2.0650299093701965, + "learning_rate": 9.084745762711865e-06, + "loss": 0.6587, + "step": 134 + }, + { + "epoch": 0.01, + "grad_norm": 2.157297646435543, + "learning_rate": 9.152542372881356e-06, + "loss": 0.7644, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 1.9208876885040105, + "learning_rate": 9.220338983050847e-06, + "loss": 0.7299, + "step": 136 + }, + { + "epoch": 0.01, + "grad_norm": 2.1636200591275476, + "learning_rate": 9.28813559322034e-06, + "loss": 0.8205, + "step": 137 + }, + { + "epoch": 0.01, + "grad_norm": 2.240165955523122, + "learning_rate": 9.355932203389831e-06, + "loss": 0.7958, + "step": 138 + }, + { + "epoch": 0.01, + "grad_norm": 2.3038562909608458, + "learning_rate": 9.423728813559322e-06, + "loss": 0.7342, + "step": 139 + }, + { + "epoch": 0.01, + "grad_norm": 1.8045909570080936, + "learning_rate": 9.491525423728815e-06, + "loss": 0.6016, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 1.8861330770480502, + "learning_rate": 9.559322033898306e-06, + "loss": 0.7197, + "step": 141 + }, + { + "epoch": 0.01, + "grad_norm": 1.973267688604536, + "learning_rate": 9.627118644067797e-06, + "loss": 0.7593, + "step": 142 + }, + { + "epoch": 0.01, + "grad_norm": 1.8698474554660958, + "learning_rate": 9.69491525423729e-06, + "loss": 0.6507, + "step": 143 + }, + { + "epoch": 0.01, + "grad_norm": 2.0010518197546987, + "learning_rate": 9.762711864406781e-06, + "loss": 0.7075, + "step": 144 + }, + { + "epoch": 0.01, + "grad_norm": 1.9194243510439397, + "learning_rate": 9.830508474576272e-06, + "loss": 0.802, + "step": 145 + }, + { + "epoch": 0.01, + "grad_norm": 2.168846656888089, + "learning_rate": 9.898305084745763e-06, + "loss": 0.7935, + "step": 146 + }, + { + "epoch": 0.01, + "grad_norm": 2.125833442803335, + "learning_rate": 9.966101694915256e-06, + "loss": 0.7864, + "step": 147 + }, + { + "epoch": 0.02, + "grad_norm": 2.102996254972869, + "learning_rate": 1.0033898305084746e-05, + "loss": 0.7078, + "step": 148 + }, + { + "epoch": 0.02, + "grad_norm": 2.1987112323642615, + "learning_rate": 1.0101694915254238e-05, + "loss": 0.7534, + "step": 149 + }, + { + "epoch": 0.02, + "grad_norm": 1.815570288794822, + "learning_rate": 1.016949152542373e-05, + "loss": 0.8229, + "step": 150 + }, + { + "epoch": 0.02, + "grad_norm": 2.1419109745500293, + "learning_rate": 1.0237288135593222e-05, + "loss": 0.7117, + "step": 151 + }, + { + "epoch": 0.02, + "grad_norm": 2.1575782595640476, + "learning_rate": 1.0305084745762712e-05, + "loss": 0.8649, + "step": 152 + }, + { + "epoch": 0.02, + "grad_norm": 2.3369718017851002, + "learning_rate": 1.0372881355932204e-05, + "loss": 0.841, + "step": 153 + }, + { + "epoch": 0.02, + "grad_norm": 1.89934679393495, + "learning_rate": 1.0440677966101695e-05, + "loss": 0.6265, + "step": 154 + }, + { + "epoch": 0.02, + "grad_norm": 2.116565213986925, + "learning_rate": 1.0508474576271188e-05, + "loss": 0.724, + "step": 155 + }, + { + "epoch": 0.02, + "grad_norm": 1.982021753415292, + "learning_rate": 1.057627118644068e-05, + "loss": 0.789, + "step": 156 + }, + { + "epoch": 0.02, + "grad_norm": 2.1139135488892666, + "learning_rate": 1.0644067796610172e-05, + "loss": 0.8074, + "step": 157 + }, + { + "epoch": 0.02, + "grad_norm": 2.118524566834072, + "learning_rate": 1.0711864406779661e-05, + "loss": 0.7203, + "step": 158 + }, + { + "epoch": 0.02, + "grad_norm": 2.12820506314938, + "learning_rate": 1.0779661016949154e-05, + "loss": 0.7984, + "step": 159 + }, + { + "epoch": 0.02, + "grad_norm": 1.9423385020630755, + "learning_rate": 1.0847457627118645e-05, + "loss": 0.6801, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 2.1069618355533097, + "learning_rate": 1.0915254237288135e-05, + "loss": 0.6958, + "step": 161 + }, + { + "epoch": 0.02, + "grad_norm": 2.2371552107869843, + "learning_rate": 1.0983050847457627e-05, + "loss": 0.5289, + "step": 162 + }, + { + "epoch": 0.02, + "grad_norm": 1.904024031354274, + "learning_rate": 1.1050847457627118e-05, + "loss": 0.6713, + "step": 163 + }, + { + "epoch": 0.02, + "grad_norm": 2.127282166318957, + "learning_rate": 1.1118644067796611e-05, + "loss": 0.7227, + "step": 164 + }, + { + "epoch": 0.02, + "grad_norm": 2.142497901612626, + "learning_rate": 1.1186440677966102e-05, + "loss": 0.5906, + "step": 165 + }, + { + "epoch": 0.02, + "grad_norm": 2.057330453800443, + "learning_rate": 1.1254237288135595e-05, + "loss": 0.6901, + "step": 166 + }, + { + "epoch": 0.02, + "grad_norm": 2.093010939189739, + "learning_rate": 1.1322033898305084e-05, + "loss": 0.7951, + "step": 167 + }, + { + "epoch": 0.02, + "grad_norm": 1.9107896944652738, + "learning_rate": 1.1389830508474577e-05, + "loss": 0.7762, + "step": 168 + }, + { + "epoch": 0.02, + "grad_norm": 2.27565600735753, + "learning_rate": 1.1457627118644068e-05, + "loss": 0.7201, + "step": 169 + }, + { + "epoch": 0.02, + "grad_norm": 2.078256587268634, + "learning_rate": 1.1525423728813561e-05, + "loss": 0.7271, + "step": 170 + }, + { + "epoch": 0.02, + "grad_norm": 1.976498895581692, + "learning_rate": 1.159322033898305e-05, + "loss": 0.7205, + "step": 171 + }, + { + "epoch": 0.02, + "grad_norm": 2.0274252035867497, + "learning_rate": 1.1661016949152543e-05, + "loss": 0.7542, + "step": 172 + }, + { + "epoch": 0.02, + "grad_norm": 2.248882635591796, + "learning_rate": 1.1728813559322034e-05, + "loss": 0.7563, + "step": 173 + }, + { + "epoch": 0.02, + "grad_norm": 2.152946416343347, + "learning_rate": 1.1796610169491527e-05, + "loss": 0.8891, + "step": 174 + }, + { + "epoch": 0.02, + "grad_norm": 1.9614188493029023, + "learning_rate": 1.1864406779661018e-05, + "loss": 0.6927, + "step": 175 + }, + { + "epoch": 0.02, + "grad_norm": 1.9561016698575437, + "learning_rate": 1.1932203389830511e-05, + "loss": 0.7323, + "step": 176 + }, + { + "epoch": 0.02, + "grad_norm": 2.2060339927740333, + "learning_rate": 1.2e-05, + "loss": 0.7635, + "step": 177 + }, + { + "epoch": 0.02, + "grad_norm": 2.361699969994298, + "learning_rate": 1.2067796610169493e-05, + "loss": 0.7074, + "step": 178 + }, + { + "epoch": 0.02, + "grad_norm": 2.2826526992705576, + "learning_rate": 1.2135593220338984e-05, + "loss": 0.8152, + "step": 179 + }, + { + "epoch": 0.02, + "grad_norm": 2.3383409650671783, + "learning_rate": 1.2203389830508477e-05, + "loss": 0.7099, + "step": 180 + }, + { + "epoch": 0.02, + "grad_norm": 2.314928794545433, + "learning_rate": 1.2271186440677966e-05, + "loss": 0.8114, + "step": 181 + }, + { + "epoch": 0.02, + "grad_norm": 2.1182102763924564, + "learning_rate": 1.2338983050847459e-05, + "loss": 0.7584, + "step": 182 + }, + { + "epoch": 0.02, + "grad_norm": 1.9427488854499315, + "learning_rate": 1.240677966101695e-05, + "loss": 0.6497, + "step": 183 + }, + { + "epoch": 0.02, + "grad_norm": 2.044035298732104, + "learning_rate": 1.2474576271186443e-05, + "loss": 0.8107, + "step": 184 + }, + { + "epoch": 0.02, + "grad_norm": 2.000921224831445, + "learning_rate": 1.2542372881355932e-05, + "loss": 0.7334, + "step": 185 + }, + { + "epoch": 0.02, + "grad_norm": 2.148842701328798, + "learning_rate": 1.2610169491525425e-05, + "loss": 0.8103, + "step": 186 + }, + { + "epoch": 0.02, + "grad_norm": 2.2665184439248676, + "learning_rate": 1.2677966101694916e-05, + "loss": 0.8093, + "step": 187 + }, + { + "epoch": 0.02, + "grad_norm": 1.9357363672299526, + "learning_rate": 1.2745762711864407e-05, + "loss": 0.7027, + "step": 188 + }, + { + "epoch": 0.02, + "grad_norm": 1.7570604626310313, + "learning_rate": 1.28135593220339e-05, + "loss": 0.6212, + "step": 189 + }, + { + "epoch": 0.02, + "grad_norm": 2.2773162054308607, + "learning_rate": 1.288135593220339e-05, + "loss": 0.7666, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 1.7761063488705422, + "learning_rate": 1.2949152542372882e-05, + "loss": 0.6996, + "step": 191 + }, + { + "epoch": 0.02, + "grad_norm": 1.8767194093067885, + "learning_rate": 1.3016949152542373e-05, + "loss": 0.7106, + "step": 192 + }, + { + "epoch": 0.02, + "grad_norm": 2.0091109182168436, + "learning_rate": 1.3084745762711866e-05, + "loss": 0.6987, + "step": 193 + }, + { + "epoch": 0.02, + "grad_norm": 1.8537469691014754, + "learning_rate": 1.3152542372881355e-05, + "loss": 0.6861, + "step": 194 + }, + { + "epoch": 0.02, + "grad_norm": 2.017545763530024, + "learning_rate": 1.3220338983050848e-05, + "loss": 0.7587, + "step": 195 + }, + { + "epoch": 0.02, + "grad_norm": 2.1514998030012236, + "learning_rate": 1.328813559322034e-05, + "loss": 0.828, + "step": 196 + }, + { + "epoch": 0.02, + "grad_norm": 1.9143327758731226, + "learning_rate": 1.3355932203389832e-05, + "loss": 0.7598, + "step": 197 + }, + { + "epoch": 0.02, + "grad_norm": 1.9042594463315574, + "learning_rate": 1.3423728813559323e-05, + "loss": 0.7754, + "step": 198 + }, + { + "epoch": 0.02, + "grad_norm": 1.8556994255207242, + "learning_rate": 1.3491525423728816e-05, + "loss": 0.7414, + "step": 199 + }, + { + "epoch": 0.02, + "grad_norm": 2.1774086553474232, + "learning_rate": 1.3559322033898305e-05, + "loss": 0.7429, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 1.9337554264476329, + "learning_rate": 1.3627118644067798e-05, + "loss": 0.6986, + "step": 201 + }, + { + "epoch": 0.02, + "grad_norm": 1.718424449386759, + "learning_rate": 1.3694915254237289e-05, + "loss": 0.68, + "step": 202 + }, + { + "epoch": 0.02, + "grad_norm": 1.906706540088502, + "learning_rate": 1.3762711864406782e-05, + "loss": 0.822, + "step": 203 + }, + { + "epoch": 0.02, + "grad_norm": 2.098824628782942, + "learning_rate": 1.3830508474576271e-05, + "loss": 0.7159, + "step": 204 + }, + { + "epoch": 0.02, + "grad_norm": 1.798103139341794, + "learning_rate": 1.3898305084745764e-05, + "loss": 0.817, + "step": 205 + }, + { + "epoch": 0.02, + "grad_norm": 1.6708618914113391, + "learning_rate": 1.3966101694915255e-05, + "loss": 0.7206, + "step": 206 + }, + { + "epoch": 0.02, + "grad_norm": 1.8960631122303189, + "learning_rate": 1.4033898305084748e-05, + "loss": 0.7623, + "step": 207 + }, + { + "epoch": 0.02, + "grad_norm": 1.941715727270628, + "learning_rate": 1.4101694915254239e-05, + "loss": 0.664, + "step": 208 + }, + { + "epoch": 0.02, + "grad_norm": 1.964233432476786, + "learning_rate": 1.416949152542373e-05, + "loss": 0.6237, + "step": 209 + }, + { + "epoch": 0.02, + "grad_norm": 1.7513382787513723, + "learning_rate": 1.4237288135593221e-05, + "loss": 0.6544, + "step": 210 + }, + { + "epoch": 0.02, + "grad_norm": 1.9443552456303226, + "learning_rate": 1.4305084745762714e-05, + "loss": 0.6672, + "step": 211 + }, + { + "epoch": 0.02, + "grad_norm": 1.9412006314237946, + "learning_rate": 1.4372881355932205e-05, + "loss": 0.8645, + "step": 212 + }, + { + "epoch": 0.02, + "grad_norm": 2.117636061000563, + "learning_rate": 1.4440677966101698e-05, + "loss": 0.686, + "step": 213 + }, + { + "epoch": 0.02, + "grad_norm": 2.0391785941599845, + "learning_rate": 1.4508474576271187e-05, + "loss": 0.8448, + "step": 214 + }, + { + "epoch": 0.02, + "grad_norm": 1.950652111059504, + "learning_rate": 1.4576271186440678e-05, + "loss": 0.6526, + "step": 215 + }, + { + "epoch": 0.02, + "grad_norm": 1.987977720735387, + "learning_rate": 1.4644067796610171e-05, + "loss": 0.7114, + "step": 216 + }, + { + "epoch": 0.02, + "grad_norm": 2.173671919832719, + "learning_rate": 1.4711864406779662e-05, + "loss": 0.706, + "step": 217 + }, + { + "epoch": 0.02, + "grad_norm": 2.0170009363485395, + "learning_rate": 1.4779661016949153e-05, + "loss": 0.6986, + "step": 218 + }, + { + "epoch": 0.02, + "grad_norm": 1.7783761996339809, + "learning_rate": 1.4847457627118644e-05, + "loss": 0.7819, + "step": 219 + }, + { + "epoch": 0.02, + "grad_norm": 2.120675334171391, + "learning_rate": 1.4915254237288137e-05, + "loss": 0.7375, + "step": 220 + }, + { + "epoch": 0.02, + "grad_norm": 1.8559328669266935, + "learning_rate": 1.4983050847457628e-05, + "loss": 0.7511, + "step": 221 + }, + { + "epoch": 0.02, + "grad_norm": 1.9057535498740088, + "learning_rate": 1.505084745762712e-05, + "loss": 0.7453, + "step": 222 + }, + { + "epoch": 0.02, + "grad_norm": 1.9176427912162302, + "learning_rate": 1.511864406779661e-05, + "loss": 0.65, + "step": 223 + }, + { + "epoch": 0.02, + "grad_norm": 1.967862907768235, + "learning_rate": 1.5186440677966103e-05, + "loss": 0.7469, + "step": 224 + }, + { + "epoch": 0.02, + "grad_norm": 1.8613239875337846, + "learning_rate": 1.5254237288135594e-05, + "loss": 0.8355, + "step": 225 + }, + { + "epoch": 0.02, + "grad_norm": 1.6798452172801586, + "learning_rate": 1.5322033898305085e-05, + "loss": 0.7247, + "step": 226 + }, + { + "epoch": 0.02, + "grad_norm": 1.9657751054133528, + "learning_rate": 1.5389830508474578e-05, + "loss": 0.6851, + "step": 227 + }, + { + "epoch": 0.02, + "grad_norm": 1.8171542649675818, + "learning_rate": 1.545762711864407e-05, + "loss": 0.7742, + "step": 228 + }, + { + "epoch": 0.02, + "grad_norm": 2.010455167366286, + "learning_rate": 1.552542372881356e-05, + "loss": 0.8376, + "step": 229 + }, + { + "epoch": 0.02, + "grad_norm": 1.891522214459134, + "learning_rate": 1.5593220338983053e-05, + "loss": 0.8006, + "step": 230 + }, + { + "epoch": 0.02, + "grad_norm": 1.8244670277804427, + "learning_rate": 1.5661016949152542e-05, + "loss": 0.6862, + "step": 231 + }, + { + "epoch": 0.02, + "grad_norm": 2.035296790223969, + "learning_rate": 1.5728813559322035e-05, + "loss": 0.7342, + "step": 232 + }, + { + "epoch": 0.02, + "grad_norm": 2.036716471677796, + "learning_rate": 1.5796610169491528e-05, + "loss": 0.7172, + "step": 233 + }, + { + "epoch": 0.02, + "grad_norm": 1.966447074428311, + "learning_rate": 1.586440677966102e-05, + "loss": 0.778, + "step": 234 + }, + { + "epoch": 0.02, + "grad_norm": 1.8457730222414168, + "learning_rate": 1.593220338983051e-05, + "loss": 0.628, + "step": 235 + }, + { + "epoch": 0.02, + "grad_norm": 1.8050930389522784, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.7722, + "step": 236 + }, + { + "epoch": 0.02, + "grad_norm": 2.531322108969687, + "learning_rate": 1.6067796610169492e-05, + "loss": 0.8066, + "step": 237 + }, + { + "epoch": 0.02, + "grad_norm": 2.0927576851100174, + "learning_rate": 1.6135593220338985e-05, + "loss": 0.7249, + "step": 238 + }, + { + "epoch": 0.02, + "grad_norm": 2.029523839237554, + "learning_rate": 1.6203389830508474e-05, + "loss": 0.8032, + "step": 239 + }, + { + "epoch": 0.02, + "grad_norm": 2.0478471468460615, + "learning_rate": 1.6271186440677967e-05, + "loss": 0.7326, + "step": 240 + }, + { + "epoch": 0.02, + "grad_norm": 1.829411642842363, + "learning_rate": 1.633898305084746e-05, + "loss": 0.817, + "step": 241 + }, + { + "epoch": 0.02, + "grad_norm": 1.8488495872202233, + "learning_rate": 1.640677966101695e-05, + "loss": 0.6522, + "step": 242 + }, + { + "epoch": 0.02, + "grad_norm": 1.9832710283695125, + "learning_rate": 1.6474576271186442e-05, + "loss": 0.7512, + "step": 243 + }, + { + "epoch": 0.02, + "grad_norm": 2.100562680767454, + "learning_rate": 1.654237288135593e-05, + "loss": 0.8037, + "step": 244 + }, + { + "epoch": 0.02, + "grad_norm": 2.035367004658631, + "learning_rate": 1.6610169491525424e-05, + "loss": 0.8428, + "step": 245 + }, + { + "epoch": 0.03, + "grad_norm": 1.7483550173167273, + "learning_rate": 1.6677966101694917e-05, + "loss": 0.75, + "step": 246 + }, + { + "epoch": 0.03, + "grad_norm": 1.930898045070027, + "learning_rate": 1.674576271186441e-05, + "loss": 0.6774, + "step": 247 + }, + { + "epoch": 0.03, + "grad_norm": 1.8365756306413494, + "learning_rate": 1.68135593220339e-05, + "loss": 0.723, + "step": 248 + }, + { + "epoch": 0.03, + "grad_norm": 1.701138383260526, + "learning_rate": 1.6881355932203392e-05, + "loss": 0.7625, + "step": 249 + }, + { + "epoch": 0.03, + "grad_norm": 1.7105626859044865, + "learning_rate": 1.694915254237288e-05, + "loss": 0.6939, + "step": 250 + }, + { + "epoch": 0.03, + "grad_norm": 2.059412608350948, + "learning_rate": 1.7016949152542374e-05, + "loss": 0.6739, + "step": 251 + }, + { + "epoch": 0.03, + "grad_norm": 1.712149442270916, + "learning_rate": 1.7084745762711867e-05, + "loss": 0.744, + "step": 252 + }, + { + "epoch": 0.03, + "grad_norm": 2.015776887334701, + "learning_rate": 1.715254237288136e-05, + "loss": 0.7223, + "step": 253 + }, + { + "epoch": 0.03, + "grad_norm": 1.685804526129291, + "learning_rate": 1.722033898305085e-05, + "loss": 0.7029, + "step": 254 + }, + { + "epoch": 0.03, + "grad_norm": 1.9446296967732069, + "learning_rate": 1.728813559322034e-05, + "loss": 0.7818, + "step": 255 + }, + { + "epoch": 0.03, + "grad_norm": 1.791749565441212, + "learning_rate": 1.735593220338983e-05, + "loss": 0.6184, + "step": 256 + }, + { + "epoch": 0.03, + "grad_norm": 1.7154113226867052, + "learning_rate": 1.7423728813559324e-05, + "loss": 0.7036, + "step": 257 + }, + { + "epoch": 0.03, + "grad_norm": 1.883567618364835, + "learning_rate": 1.7491525423728813e-05, + "loss": 0.7517, + "step": 258 + }, + { + "epoch": 0.03, + "grad_norm": 1.8940636144580816, + "learning_rate": 1.7559322033898306e-05, + "loss": 0.7078, + "step": 259 + }, + { + "epoch": 0.03, + "grad_norm": 1.9268538873409744, + "learning_rate": 1.76271186440678e-05, + "loss": 0.7563, + "step": 260 + }, + { + "epoch": 0.03, + "grad_norm": 1.7748203123085324, + "learning_rate": 1.769491525423729e-05, + "loss": 0.7079, + "step": 261 + }, + { + "epoch": 0.03, + "grad_norm": 1.76578312180332, + "learning_rate": 1.776271186440678e-05, + "loss": 0.6875, + "step": 262 + }, + { + "epoch": 0.03, + "grad_norm": 1.8139793782816562, + "learning_rate": 1.7830508474576274e-05, + "loss": 0.7254, + "step": 263 + }, + { + "epoch": 0.03, + "grad_norm": 1.7496909186973184, + "learning_rate": 1.7898305084745763e-05, + "loss": 0.7714, + "step": 264 + }, + { + "epoch": 0.03, + "grad_norm": 1.9619860178080115, + "learning_rate": 1.7966101694915256e-05, + "loss": 0.6743, + "step": 265 + }, + { + "epoch": 0.03, + "grad_norm": 1.9633822472809355, + "learning_rate": 1.803389830508475e-05, + "loss": 0.7198, + "step": 266 + }, + { + "epoch": 0.03, + "grad_norm": 1.8938463788084676, + "learning_rate": 1.810169491525424e-05, + "loss": 0.8072, + "step": 267 + }, + { + "epoch": 0.03, + "grad_norm": 1.7910793721294165, + "learning_rate": 1.816949152542373e-05, + "loss": 0.7417, + "step": 268 + }, + { + "epoch": 0.03, + "grad_norm": 1.8385244007491037, + "learning_rate": 1.823728813559322e-05, + "loss": 0.7399, + "step": 269 + }, + { + "epoch": 0.03, + "grad_norm": 1.8299310983186288, + "learning_rate": 1.8305084745762713e-05, + "loss": 0.8197, + "step": 270 + }, + { + "epoch": 0.03, + "grad_norm": 1.6938430742018853, + "learning_rate": 1.8372881355932202e-05, + "loss": 0.7432, + "step": 271 + }, + { + "epoch": 0.03, + "grad_norm": 1.97932361867903, + "learning_rate": 1.8440677966101695e-05, + "loss": 0.7969, + "step": 272 + }, + { + "epoch": 0.03, + "grad_norm": 1.7016617079758896, + "learning_rate": 1.8508474576271188e-05, + "loss": 0.7311, + "step": 273 + }, + { + "epoch": 0.03, + "grad_norm": 1.7402288852531422, + "learning_rate": 1.857627118644068e-05, + "loss": 0.8583, + "step": 274 + }, + { + "epoch": 0.03, + "grad_norm": 1.8009302958397473, + "learning_rate": 1.864406779661017e-05, + "loss": 0.7303, + "step": 275 + }, + { + "epoch": 0.03, + "grad_norm": 1.8701338309527233, + "learning_rate": 1.8711864406779663e-05, + "loss": 0.7549, + "step": 276 + }, + { + "epoch": 0.03, + "grad_norm": 2.0312194476891876, + "learning_rate": 1.8779661016949152e-05, + "loss": 0.7222, + "step": 277 + }, + { + "epoch": 0.03, + "grad_norm": 1.8677846365652164, + "learning_rate": 1.8847457627118645e-05, + "loss": 0.7295, + "step": 278 + }, + { + "epoch": 0.03, + "grad_norm": 1.949694415094439, + "learning_rate": 1.8915254237288138e-05, + "loss": 0.7984, + "step": 279 + }, + { + "epoch": 0.03, + "grad_norm": 1.7111163200955561, + "learning_rate": 1.898305084745763e-05, + "loss": 0.7203, + "step": 280 + }, + { + "epoch": 0.03, + "grad_norm": 1.9717239024751367, + "learning_rate": 1.905084745762712e-05, + "loss": 0.6803, + "step": 281 + }, + { + "epoch": 0.03, + "grad_norm": 1.579686370395743, + "learning_rate": 1.9118644067796613e-05, + "loss": 0.7824, + "step": 282 + }, + { + "epoch": 0.03, + "grad_norm": 1.678149645995694, + "learning_rate": 1.9186440677966102e-05, + "loss": 0.6746, + "step": 283 + }, + { + "epoch": 0.03, + "grad_norm": 1.8213188558433615, + "learning_rate": 1.9254237288135595e-05, + "loss": 0.8051, + "step": 284 + }, + { + "epoch": 0.03, + "grad_norm": 1.7725977550117176, + "learning_rate": 1.9322033898305087e-05, + "loss": 0.7281, + "step": 285 + }, + { + "epoch": 0.03, + "grad_norm": 1.9372054768281428, + "learning_rate": 1.938983050847458e-05, + "loss": 0.6533, + "step": 286 + }, + { + "epoch": 0.03, + "grad_norm": 1.7535765837740318, + "learning_rate": 1.945762711864407e-05, + "loss": 0.8288, + "step": 287 + }, + { + "epoch": 0.03, + "grad_norm": 1.859536473734018, + "learning_rate": 1.9525423728813562e-05, + "loss": 0.8189, + "step": 288 + }, + { + "epoch": 0.03, + "grad_norm": 1.843961540210964, + "learning_rate": 1.9593220338983052e-05, + "loss": 0.7668, + "step": 289 + }, + { + "epoch": 0.03, + "grad_norm": 1.8461464381997479, + "learning_rate": 1.9661016949152545e-05, + "loss": 0.7714, + "step": 290 + }, + { + "epoch": 0.03, + "grad_norm": 1.7238192387959088, + "learning_rate": 1.9728813559322034e-05, + "loss": 0.7324, + "step": 291 + }, + { + "epoch": 0.03, + "grad_norm": 1.6427586720320793, + "learning_rate": 1.9796610169491527e-05, + "loss": 0.7335, + "step": 292 + }, + { + "epoch": 0.03, + "grad_norm": 1.6800342906853636, + "learning_rate": 1.986440677966102e-05, + "loss": 0.7549, + "step": 293 + }, + { + "epoch": 0.03, + "grad_norm": 1.7832389725065179, + "learning_rate": 1.9932203389830512e-05, + "loss": 0.8058, + "step": 294 + }, + { + "epoch": 0.03, + "grad_norm": 1.727775985861393, + "learning_rate": 2e-05, + "loss": 0.6759, + "step": 295 + }, + { + "epoch": 0.03, + "grad_norm": 1.7977238751907165, + "learning_rate": 1.999999945732819e-05, + "loss": 0.7674, + "step": 296 + }, + { + "epoch": 0.03, + "grad_norm": 1.7390933788321141, + "learning_rate": 1.9999997829312825e-05, + "loss": 0.8395, + "step": 297 + }, + { + "epoch": 0.03, + "grad_norm": 1.7764290549311015, + "learning_rate": 1.9999995115954075e-05, + "loss": 0.711, + "step": 298 + }, + { + "epoch": 0.03, + "grad_norm": 1.6826307129531393, + "learning_rate": 1.999999131725224e-05, + "loss": 0.6686, + "step": 299 + }, + { + "epoch": 0.03, + "grad_norm": 1.664467172792983, + "learning_rate": 1.9999986433207727e-05, + "loss": 0.6859, + "step": 300 + }, + { + "epoch": 0.03, + "grad_norm": 1.91798902000967, + "learning_rate": 1.9999980463821073e-05, + "loss": 0.7042, + "step": 301 + }, + { + "epoch": 0.03, + "grad_norm": 1.6449618209624421, + "learning_rate": 1.9999973409092916e-05, + "loss": 0.6718, + "step": 302 + }, + { + "epoch": 0.03, + "grad_norm": 1.7583224606564245, + "learning_rate": 1.999996526902403e-05, + "loss": 0.7526, + "step": 303 + }, + { + "epoch": 0.03, + "grad_norm": 1.7418882410175618, + "learning_rate": 1.99999560436153e-05, + "loss": 0.8077, + "step": 304 + }, + { + "epoch": 0.03, + "grad_norm": 1.6483278016364271, + "learning_rate": 1.9999945732867717e-05, + "loss": 0.6834, + "step": 305 + }, + { + "epoch": 0.03, + "grad_norm": 1.8758827011232249, + "learning_rate": 1.999993433678241e-05, + "loss": 0.6097, + "step": 306 + }, + { + "epoch": 0.03, + "grad_norm": 1.9950744094762918, + "learning_rate": 1.999992185536061e-05, + "loss": 0.8323, + "step": 307 + }, + { + "epoch": 0.03, + "grad_norm": 1.9600308793601466, + "learning_rate": 1.9999908288603678e-05, + "loss": 0.7898, + "step": 308 + }, + { + "epoch": 0.03, + "grad_norm": 1.6362825731715926, + "learning_rate": 1.9999893636513078e-05, + "loss": 0.715, + "step": 309 + }, + { + "epoch": 0.03, + "grad_norm": 1.9191632580569564, + "learning_rate": 1.9999877899090408e-05, + "loss": 0.8983, + "step": 310 + }, + { + "epoch": 0.03, + "grad_norm": 2.2077252767571283, + "learning_rate": 1.999986107633737e-05, + "loss": 0.8502, + "step": 311 + }, + { + "epoch": 0.03, + "grad_norm": 1.7211750642056591, + "learning_rate": 1.9999843168255792e-05, + "loss": 0.6766, + "step": 312 + }, + { + "epoch": 0.03, + "grad_norm": 1.8405066522264266, + "learning_rate": 1.999982417484762e-05, + "loss": 0.7717, + "step": 313 + }, + { + "epoch": 0.03, + "grad_norm": 1.7584962556190842, + "learning_rate": 1.9999804096114914e-05, + "loss": 0.7634, + "step": 314 + }, + { + "epoch": 0.03, + "grad_norm": 1.9067370561909576, + "learning_rate": 1.999978293205985e-05, + "loss": 0.7854, + "step": 315 + }, + { + "epoch": 0.03, + "grad_norm": 1.650263611438605, + "learning_rate": 1.9999760682684732e-05, + "loss": 0.8554, + "step": 316 + }, + { + "epoch": 0.03, + "grad_norm": 1.9239042342081574, + "learning_rate": 1.999973734799197e-05, + "loss": 0.7182, + "step": 317 + }, + { + "epoch": 0.03, + "grad_norm": 1.914889513011852, + "learning_rate": 1.9999712927984097e-05, + "loss": 0.8431, + "step": 318 + }, + { + "epoch": 0.03, + "grad_norm": 1.7596818561456582, + "learning_rate": 1.999968742266376e-05, + "loss": 0.7526, + "step": 319 + }, + { + "epoch": 0.03, + "grad_norm": 1.9152360275383022, + "learning_rate": 1.9999660832033733e-05, + "loss": 0.8691, + "step": 320 + }, + { + "epoch": 0.03, + "grad_norm": 1.7674570266140242, + "learning_rate": 1.99996331560969e-05, + "loss": 0.7009, + "step": 321 + }, + { + "epoch": 0.03, + "grad_norm": 1.8346671747992218, + "learning_rate": 1.9999604394856265e-05, + "loss": 0.7458, + "step": 322 + }, + { + "epoch": 0.03, + "grad_norm": 1.7024441673305173, + "learning_rate": 1.999957454831495e-05, + "loss": 0.6926, + "step": 323 + }, + { + "epoch": 0.03, + "grad_norm": 1.8858862630231061, + "learning_rate": 1.9999543616476195e-05, + "loss": 0.7739, + "step": 324 + }, + { + "epoch": 0.03, + "grad_norm": 1.918639765230697, + "learning_rate": 1.999951159934335e-05, + "loss": 0.7835, + "step": 325 + }, + { + "epoch": 0.03, + "grad_norm": 1.713705802835458, + "learning_rate": 1.99994784969199e-05, + "loss": 0.8373, + "step": 326 + }, + { + "epoch": 0.03, + "grad_norm": 2.125556618290798, + "learning_rate": 1.9999444309209432e-05, + "loss": 0.7237, + "step": 327 + }, + { + "epoch": 0.03, + "grad_norm": 1.8747153782294466, + "learning_rate": 1.999940903621566e-05, + "loss": 0.8242, + "step": 328 + }, + { + "epoch": 0.03, + "grad_norm": 1.8541269896488926, + "learning_rate": 1.999937267794241e-05, + "loss": 0.8078, + "step": 329 + }, + { + "epoch": 0.03, + "grad_norm": 1.8097939569533714, + "learning_rate": 1.999933523439363e-05, + "loss": 0.6798, + "step": 330 + }, + { + "epoch": 0.03, + "grad_norm": 1.6707764941842447, + "learning_rate": 1.999929670557338e-05, + "loss": 0.7481, + "step": 331 + }, + { + "epoch": 0.03, + "grad_norm": 1.5408400944729324, + "learning_rate": 1.9999257091485842e-05, + "loss": 0.6834, + "step": 332 + }, + { + "epoch": 0.03, + "grad_norm": 1.8105877361881173, + "learning_rate": 1.999921639213532e-05, + "loss": 0.7798, + "step": 333 + }, + { + "epoch": 0.03, + "grad_norm": 2.0245928171672127, + "learning_rate": 1.9999174607526232e-05, + "loss": 0.8071, + "step": 334 + }, + { + "epoch": 0.03, + "grad_norm": 1.7126779698650814, + "learning_rate": 1.9999131737663106e-05, + "loss": 0.7318, + "step": 335 + }, + { + "epoch": 0.03, + "grad_norm": 1.8799372221054982, + "learning_rate": 1.9999087782550596e-05, + "loss": 0.7915, + "step": 336 + }, + { + "epoch": 0.03, + "grad_norm": 1.8302157825287524, + "learning_rate": 1.999904274219348e-05, + "loss": 0.8132, + "step": 337 + }, + { + "epoch": 0.03, + "grad_norm": 1.8873097603497921, + "learning_rate": 1.9998996616596643e-05, + "loss": 0.818, + "step": 338 + }, + { + "epoch": 0.03, + "grad_norm": 1.7616359225670495, + "learning_rate": 1.9998949405765086e-05, + "loss": 0.6288, + "step": 339 + }, + { + "epoch": 0.03, + "grad_norm": 1.9961891473127835, + "learning_rate": 1.9998901109703942e-05, + "loss": 0.7461, + "step": 340 + }, + { + "epoch": 0.03, + "grad_norm": 1.9590109916459193, + "learning_rate": 1.9998851728418443e-05, + "loss": 0.7262, + "step": 341 + }, + { + "epoch": 0.03, + "grad_norm": 1.7017357772857418, + "learning_rate": 1.9998801261913956e-05, + "loss": 0.697, + "step": 342 + }, + { + "epoch": 0.03, + "grad_norm": 1.8180055536057091, + "learning_rate": 1.9998749710195957e-05, + "loss": 0.7893, + "step": 343 + }, + { + "epoch": 0.03, + "grad_norm": 2.0549880336112016, + "learning_rate": 1.9998697073270038e-05, + "loss": 0.7717, + "step": 344 + }, + { + "epoch": 0.04, + "grad_norm": 1.6411992346494544, + "learning_rate": 1.9998643351141912e-05, + "loss": 0.7945, + "step": 345 + }, + { + "epoch": 0.04, + "grad_norm": 1.8553473962687779, + "learning_rate": 1.9998588543817415e-05, + "loss": 0.8276, + "step": 346 + }, + { + "epoch": 0.04, + "grad_norm": 1.8223722140799334, + "learning_rate": 1.999853265130249e-05, + "loss": 0.842, + "step": 347 + }, + { + "epoch": 0.04, + "grad_norm": 1.7858229071600773, + "learning_rate": 1.9998475673603205e-05, + "loss": 0.6635, + "step": 348 + }, + { + "epoch": 0.04, + "grad_norm": 1.5519682746904044, + "learning_rate": 1.9998417610725745e-05, + "loss": 0.6665, + "step": 349 + }, + { + "epoch": 0.04, + "grad_norm": 1.9122831115958798, + "learning_rate": 1.999835846267641e-05, + "loss": 0.7812, + "step": 350 + }, + { + "epoch": 0.04, + "grad_norm": 1.7934216535596592, + "learning_rate": 1.9998298229461624e-05, + "loss": 0.7411, + "step": 351 + }, + { + "epoch": 0.04, + "grad_norm": 1.714151647311422, + "learning_rate": 1.999823691108792e-05, + "loss": 0.7088, + "step": 352 + }, + { + "epoch": 0.04, + "grad_norm": 1.8445691735054452, + "learning_rate": 1.9998174507561952e-05, + "loss": 0.8371, + "step": 353 + }, + { + "epoch": 0.04, + "grad_norm": 1.946917458223072, + "learning_rate": 1.9998111018890496e-05, + "loss": 0.8158, + "step": 354 + }, + { + "epoch": 0.04, + "grad_norm": 1.9627402550609958, + "learning_rate": 1.999804644508044e-05, + "loss": 0.816, + "step": 355 + }, + { + "epoch": 0.04, + "grad_norm": 1.684490996381524, + "learning_rate": 1.99979807861388e-05, + "loss": 0.7906, + "step": 356 + }, + { + "epoch": 0.04, + "grad_norm": 1.7292982143939393, + "learning_rate": 1.999791404207269e-05, + "loss": 0.8777, + "step": 357 + }, + { + "epoch": 0.04, + "grad_norm": 1.6990863259567761, + "learning_rate": 1.9997846212889367e-05, + "loss": 0.7051, + "step": 358 + }, + { + "epoch": 0.04, + "grad_norm": 1.6806371114663146, + "learning_rate": 1.999777729859618e-05, + "loss": 0.6675, + "step": 359 + }, + { + "epoch": 0.04, + "grad_norm": 1.891811437481674, + "learning_rate": 1.9997707299200622e-05, + "loss": 0.8882, + "step": 360 + }, + { + "epoch": 0.04, + "grad_norm": 1.7243799607340802, + "learning_rate": 1.999763621471028e-05, + "loss": 0.7244, + "step": 361 + }, + { + "epoch": 0.04, + "grad_norm": 1.8932374823398974, + "learning_rate": 1.999756404513287e-05, + "loss": 0.7592, + "step": 362 + }, + { + "epoch": 0.04, + "grad_norm": 1.7673934736131074, + "learning_rate": 1.999749079047623e-05, + "loss": 0.7581, + "step": 363 + }, + { + "epoch": 0.04, + "grad_norm": 1.688678447260584, + "learning_rate": 1.9997416450748308e-05, + "loss": 0.7357, + "step": 364 + }, + { + "epoch": 0.04, + "grad_norm": 1.8030241010019676, + "learning_rate": 1.999734102595717e-05, + "loss": 0.8179, + "step": 365 + }, + { + "epoch": 0.04, + "grad_norm": 1.6616197218353204, + "learning_rate": 1.9997264516111006e-05, + "loss": 0.6571, + "step": 366 + }, + { + "epoch": 0.04, + "grad_norm": 1.7328480272335298, + "learning_rate": 1.9997186921218123e-05, + "loss": 0.8508, + "step": 367 + }, + { + "epoch": 0.04, + "grad_norm": 1.591325686668673, + "learning_rate": 1.999710824128693e-05, + "loss": 0.7019, + "step": 368 + }, + { + "epoch": 0.04, + "grad_norm": 1.7290334221881392, + "learning_rate": 1.9997028476325978e-05, + "loss": 0.8135, + "step": 369 + }, + { + "epoch": 0.04, + "grad_norm": 1.9651039231608671, + "learning_rate": 1.9996947626343924e-05, + "loss": 0.8036, + "step": 370 + }, + { + "epoch": 0.04, + "grad_norm": 1.8616771729798702, + "learning_rate": 1.9996865691349538e-05, + "loss": 0.7986, + "step": 371 + }, + { + "epoch": 0.04, + "grad_norm": 1.656935611345808, + "learning_rate": 1.9996782671351713e-05, + "loss": 0.7317, + "step": 372 + }, + { + "epoch": 0.04, + "grad_norm": 1.8377731073141668, + "learning_rate": 1.9996698566359462e-05, + "loss": 0.7378, + "step": 373 + }, + { + "epoch": 0.04, + "grad_norm": 1.8112813049650685, + "learning_rate": 1.9996613376381916e-05, + "loss": 0.7948, + "step": 374 + }, + { + "epoch": 0.04, + "grad_norm": 2.0118770931741934, + "learning_rate": 1.999652710142831e-05, + "loss": 0.8042, + "step": 375 + }, + { + "epoch": 0.04, + "grad_norm": 1.7478808769772007, + "learning_rate": 1.999643974150802e-05, + "loss": 0.7676, + "step": 376 + }, + { + "epoch": 0.04, + "grad_norm": 1.7676338042836226, + "learning_rate": 1.9996351296630525e-05, + "loss": 0.7365, + "step": 377 + }, + { + "epoch": 0.04, + "grad_norm": 1.7180183778756248, + "learning_rate": 1.999626176680542e-05, + "loss": 0.6893, + "step": 378 + }, + { + "epoch": 0.04, + "grad_norm": 1.50083822299388, + "learning_rate": 1.9996171152042425e-05, + "loss": 0.6976, + "step": 379 + }, + { + "epoch": 0.04, + "grad_norm": 2.003558525467704, + "learning_rate": 1.999607945235137e-05, + "loss": 0.8067, + "step": 380 + }, + { + "epoch": 0.04, + "grad_norm": 1.8041757070189628, + "learning_rate": 1.9995986667742217e-05, + "loss": 0.7977, + "step": 381 + }, + { + "epoch": 0.04, + "grad_norm": 1.5599959651379252, + "learning_rate": 1.999589279822503e-05, + "loss": 0.6672, + "step": 382 + }, + { + "epoch": 0.04, + "grad_norm": 1.7795825546643322, + "learning_rate": 1.9995797843809998e-05, + "loss": 0.7974, + "step": 383 + }, + { + "epoch": 0.04, + "grad_norm": 1.802976071998248, + "learning_rate": 1.9995701804507423e-05, + "loss": 0.7719, + "step": 384 + }, + { + "epoch": 0.04, + "grad_norm": 1.5941440848302881, + "learning_rate": 1.9995604680327735e-05, + "loss": 0.7913, + "step": 385 + }, + { + "epoch": 0.04, + "grad_norm": 1.753323779430216, + "learning_rate": 1.9995506471281473e-05, + "loss": 0.77, + "step": 386 + }, + { + "epoch": 0.04, + "grad_norm": 1.6683717471946458, + "learning_rate": 1.9995407177379295e-05, + "loss": 0.8528, + "step": 387 + }, + { + "epoch": 0.04, + "grad_norm": 1.6751255034212773, + "learning_rate": 1.9995306798631978e-05, + "loss": 0.7806, + "step": 388 + }, + { + "epoch": 0.04, + "grad_norm": 1.7968947270822646, + "learning_rate": 1.999520533505042e-05, + "loss": 0.7733, + "step": 389 + }, + { + "epoch": 0.04, + "grad_norm": 2.025330454366504, + "learning_rate": 1.999510278664563e-05, + "loss": 0.7973, + "step": 390 + }, + { + "epoch": 0.04, + "grad_norm": 1.5378582514197812, + "learning_rate": 1.9994999153428737e-05, + "loss": 0.6657, + "step": 391 + }, + { + "epoch": 0.04, + "grad_norm": 1.583531116731558, + "learning_rate": 1.999489443541099e-05, + "loss": 0.7282, + "step": 392 + }, + { + "epoch": 0.04, + "grad_norm": 1.6735520561192077, + "learning_rate": 1.9994788632603756e-05, + "loss": 0.7124, + "step": 393 + }, + { + "epoch": 0.04, + "grad_norm": 1.7029836297903527, + "learning_rate": 1.9994681745018516e-05, + "loss": 0.734, + "step": 394 + }, + { + "epoch": 0.04, + "grad_norm": 1.664386176676482, + "learning_rate": 1.999457377266687e-05, + "loss": 0.6538, + "step": 395 + }, + { + "epoch": 0.04, + "grad_norm": 1.7530802960298322, + "learning_rate": 1.9994464715560543e-05, + "loss": 0.6985, + "step": 396 + }, + { + "epoch": 0.04, + "grad_norm": 1.6860590312819281, + "learning_rate": 1.9994354573711363e-05, + "loss": 0.7206, + "step": 397 + }, + { + "epoch": 0.04, + "grad_norm": 2.0901210609686722, + "learning_rate": 1.999424334713129e-05, + "loss": 0.7572, + "step": 398 + }, + { + "epoch": 0.04, + "grad_norm": 1.8874014333140492, + "learning_rate": 1.9994131035832394e-05, + "loss": 0.782, + "step": 399 + }, + { + "epoch": 0.04, + "grad_norm": 1.6553991557065093, + "learning_rate": 1.9994017639826867e-05, + "loss": 0.9557, + "step": 400 + }, + { + "epoch": 0.04, + "grad_norm": 1.6718485788810615, + "learning_rate": 1.999390315912701e-05, + "loss": 0.7127, + "step": 401 + }, + { + "epoch": 0.04, + "grad_norm": 1.7304276426396776, + "learning_rate": 1.9993787593745254e-05, + "loss": 0.7887, + "step": 402 + }, + { + "epoch": 0.04, + "grad_norm": 1.8547453550237238, + "learning_rate": 1.999367094369414e-05, + "loss": 0.7698, + "step": 403 + }, + { + "epoch": 0.04, + "grad_norm": 2.0608110132252917, + "learning_rate": 1.999355320898633e-05, + "loss": 0.7771, + "step": 404 + }, + { + "epoch": 0.04, + "grad_norm": 1.6305518417499105, + "learning_rate": 1.99934343896346e-05, + "loss": 0.6427, + "step": 405 + }, + { + "epoch": 0.04, + "grad_norm": 1.6217405497359352, + "learning_rate": 1.9993314485651848e-05, + "loss": 0.7834, + "step": 406 + }, + { + "epoch": 0.04, + "grad_norm": 1.620804556556247, + "learning_rate": 1.9993193497051084e-05, + "loss": 0.8264, + "step": 407 + }, + { + "epoch": 0.04, + "grad_norm": 1.7909076263383104, + "learning_rate": 1.999307142384544e-05, + "loss": 0.8092, + "step": 408 + }, + { + "epoch": 0.04, + "grad_norm": 1.70064491469979, + "learning_rate": 1.9992948266048174e-05, + "loss": 0.821, + "step": 409 + }, + { + "epoch": 0.04, + "grad_norm": 1.7059610993511671, + "learning_rate": 1.999282402367264e-05, + "loss": 0.7432, + "step": 410 + }, + { + "epoch": 0.04, + "grad_norm": 1.5692469342874094, + "learning_rate": 1.999269869673233e-05, + "loss": 0.6727, + "step": 411 + }, + { + "epoch": 0.04, + "grad_norm": 1.7378065054799623, + "learning_rate": 1.9992572285240848e-05, + "loss": 0.841, + "step": 412 + }, + { + "epoch": 0.04, + "grad_norm": 1.8668129805207883, + "learning_rate": 1.999244478921191e-05, + "loss": 0.7503, + "step": 413 + }, + { + "epoch": 0.04, + "grad_norm": 1.9060504553017656, + "learning_rate": 1.9992316208659352e-05, + "loss": 0.7054, + "step": 414 + }, + { + "epoch": 0.04, + "grad_norm": 1.8212929976354297, + "learning_rate": 1.9992186543597133e-05, + "loss": 0.8107, + "step": 415 + }, + { + "epoch": 0.04, + "grad_norm": 1.809192974400173, + "learning_rate": 1.9992055794039325e-05, + "loss": 0.7067, + "step": 416 + }, + { + "epoch": 0.04, + "grad_norm": 1.5465827774015835, + "learning_rate": 1.9991923960000116e-05, + "loss": 0.6662, + "step": 417 + }, + { + "epoch": 0.04, + "grad_norm": 1.5850884512167587, + "learning_rate": 1.999179104149382e-05, + "loss": 0.7838, + "step": 418 + }, + { + "epoch": 0.04, + "grad_norm": 1.6622022535939016, + "learning_rate": 1.999165703853486e-05, + "loss": 0.7488, + "step": 419 + }, + { + "epoch": 0.04, + "grad_norm": 1.6903905078454566, + "learning_rate": 1.9991521951137783e-05, + "loss": 0.7443, + "step": 420 + }, + { + "epoch": 0.04, + "grad_norm": 1.6687306292307453, + "learning_rate": 1.9991385779317246e-05, + "loss": 0.723, + "step": 421 + }, + { + "epoch": 0.04, + "grad_norm": 1.7219109927338176, + "learning_rate": 1.999124852308803e-05, + "loss": 0.78, + "step": 422 + }, + { + "epoch": 0.04, + "grad_norm": 1.8243785936412005, + "learning_rate": 1.9991110182465032e-05, + "loss": 0.6784, + "step": 423 + }, + { + "epoch": 0.04, + "grad_norm": 1.6490323132123654, + "learning_rate": 1.999097075746327e-05, + "loss": 0.6888, + "step": 424 + }, + { + "epoch": 0.04, + "grad_norm": 1.7346088865508633, + "learning_rate": 1.999083024809787e-05, + "loss": 0.785, + "step": 425 + }, + { + "epoch": 0.04, + "grad_norm": 1.6767037997897496, + "learning_rate": 1.999068865438409e-05, + "loss": 0.746, + "step": 426 + }, + { + "epoch": 0.04, + "grad_norm": 1.7701958205287336, + "learning_rate": 1.999054597633729e-05, + "loss": 0.7477, + "step": 427 + }, + { + "epoch": 0.04, + "grad_norm": 1.6835840347014255, + "learning_rate": 1.999040221397296e-05, + "loss": 0.7634, + "step": 428 + }, + { + "epoch": 0.04, + "grad_norm": 1.7769492248134329, + "learning_rate": 1.99902573673067e-05, + "loss": 0.7342, + "step": 429 + }, + { + "epoch": 0.04, + "grad_norm": 1.7048105843168924, + "learning_rate": 1.9990111436354237e-05, + "loss": 0.6987, + "step": 430 + }, + { + "epoch": 0.04, + "grad_norm": 1.5761305862194244, + "learning_rate": 1.99899644211314e-05, + "loss": 0.7776, + "step": 431 + }, + { + "epoch": 0.04, + "grad_norm": 1.74082648576072, + "learning_rate": 1.9989816321654155e-05, + "loss": 0.751, + "step": 432 + }, + { + "epoch": 0.04, + "grad_norm": 1.8126076461111138, + "learning_rate": 1.9989667137938573e-05, + "loss": 0.7662, + "step": 433 + }, + { + "epoch": 0.04, + "grad_norm": 1.6477682572902752, + "learning_rate": 1.998951687000084e-05, + "loss": 0.7225, + "step": 434 + }, + { + "epoch": 0.04, + "grad_norm": 1.7702013292110663, + "learning_rate": 1.998936551785727e-05, + "loss": 0.828, + "step": 435 + }, + { + "epoch": 0.04, + "grad_norm": 1.7479554654572946, + "learning_rate": 1.9989213081524293e-05, + "loss": 0.7605, + "step": 436 + }, + { + "epoch": 0.04, + "grad_norm": 1.7333751540111162, + "learning_rate": 1.9989059561018448e-05, + "loss": 0.7445, + "step": 437 + }, + { + "epoch": 0.04, + "grad_norm": 1.8159978064450342, + "learning_rate": 1.9988904956356395e-05, + "loss": 0.7398, + "step": 438 + }, + { + "epoch": 0.04, + "grad_norm": 1.6659874749639998, + "learning_rate": 1.998874926755492e-05, + "loss": 0.8073, + "step": 439 + }, + { + "epoch": 0.04, + "grad_norm": 1.7995914872407046, + "learning_rate": 1.9988592494630922e-05, + "loss": 0.9021, + "step": 440 + }, + { + "epoch": 0.04, + "grad_norm": 1.7213002946596805, + "learning_rate": 1.998843463760141e-05, + "loss": 0.7034, + "step": 441 + }, + { + "epoch": 0.04, + "grad_norm": 1.7732611112379235, + "learning_rate": 1.998827569648352e-05, + "loss": 0.738, + "step": 442 + }, + { + "epoch": 0.05, + "grad_norm": 2.0593757365062655, + "learning_rate": 1.9988115671294502e-05, + "loss": 0.8003, + "step": 443 + }, + { + "epoch": 0.05, + "grad_norm": 1.5199583294558163, + "learning_rate": 1.9987954562051724e-05, + "loss": 0.7742, + "step": 444 + }, + { + "epoch": 0.05, + "grad_norm": 1.6694614497357176, + "learning_rate": 1.9987792368772677e-05, + "loss": 0.7096, + "step": 445 + }, + { + "epoch": 0.05, + "grad_norm": 1.757071457535054, + "learning_rate": 1.9987629091474955e-05, + "loss": 0.7648, + "step": 446 + }, + { + "epoch": 0.05, + "grad_norm": 1.4926165767089656, + "learning_rate": 1.9987464730176285e-05, + "loss": 0.6738, + "step": 447 + }, + { + "epoch": 0.05, + "grad_norm": 1.6278470687032995, + "learning_rate": 1.9987299284894505e-05, + "loss": 0.6468, + "step": 448 + }, + { + "epoch": 0.05, + "grad_norm": 1.5928480530517506, + "learning_rate": 1.9987132755647574e-05, + "loss": 0.733, + "step": 449 + }, + { + "epoch": 0.05, + "grad_norm": 1.954406246297614, + "learning_rate": 1.9986965142453562e-05, + "loss": 0.7253, + "step": 450 + }, + { + "epoch": 0.05, + "grad_norm": 1.677578009697896, + "learning_rate": 1.998679644533066e-05, + "loss": 0.7313, + "step": 451 + }, + { + "epoch": 0.05, + "grad_norm": 1.756632357254643, + "learning_rate": 1.998662666429718e-05, + "loss": 0.7917, + "step": 452 + }, + { + "epoch": 0.05, + "grad_norm": 1.6989063531350133, + "learning_rate": 1.9986455799371555e-05, + "loss": 0.7377, + "step": 453 + }, + { + "epoch": 0.05, + "grad_norm": 1.5734631073088516, + "learning_rate": 1.9986283850572317e-05, + "loss": 0.7689, + "step": 454 + }, + { + "epoch": 0.05, + "grad_norm": 1.6438195775524247, + "learning_rate": 1.998611081791814e-05, + "loss": 0.6276, + "step": 455 + }, + { + "epoch": 0.05, + "grad_norm": 1.7470830793024756, + "learning_rate": 1.9985936701427797e-05, + "loss": 0.8255, + "step": 456 + }, + { + "epoch": 0.05, + "grad_norm": 1.627502692382246, + "learning_rate": 1.998576150112019e-05, + "loss": 0.8686, + "step": 457 + }, + { + "epoch": 0.05, + "grad_norm": 1.6742927725469114, + "learning_rate": 1.9985585217014326e-05, + "loss": 0.756, + "step": 458 + }, + { + "epoch": 0.05, + "grad_norm": 1.806975380520805, + "learning_rate": 1.9985407849129346e-05, + "loss": 0.8736, + "step": 459 + }, + { + "epoch": 0.05, + "grad_norm": 1.6601591312990973, + "learning_rate": 1.9985229397484504e-05, + "loss": 0.7521, + "step": 460 + }, + { + "epoch": 0.05, + "grad_norm": 1.793242802316679, + "learning_rate": 1.9985049862099156e-05, + "loss": 0.7921, + "step": 461 + }, + { + "epoch": 0.05, + "grad_norm": 1.6875996812673835, + "learning_rate": 1.9984869242992798e-05, + "loss": 0.8031, + "step": 462 + }, + { + "epoch": 0.05, + "grad_norm": 1.5964038305874215, + "learning_rate": 1.9984687540185026e-05, + "loss": 0.63, + "step": 463 + }, + { + "epoch": 0.05, + "grad_norm": 1.7710598946573337, + "learning_rate": 1.9984504753695567e-05, + "loss": 0.7269, + "step": 464 + }, + { + "epoch": 0.05, + "grad_norm": 1.5836628328169005, + "learning_rate": 1.9984320883544254e-05, + "loss": 0.7293, + "step": 465 + }, + { + "epoch": 0.05, + "grad_norm": 1.5838641530722382, + "learning_rate": 1.998413592975105e-05, + "loss": 0.6755, + "step": 466 + }, + { + "epoch": 0.05, + "grad_norm": 1.4998685973317545, + "learning_rate": 1.9983949892336024e-05, + "loss": 0.6839, + "step": 467 + }, + { + "epoch": 0.05, + "grad_norm": 1.8559871084679145, + "learning_rate": 1.998376277131937e-05, + "loss": 0.8468, + "step": 468 + }, + { + "epoch": 0.05, + "grad_norm": 1.794841563871742, + "learning_rate": 1.998357456672139e-05, + "loss": 0.7864, + "step": 469 + }, + { + "epoch": 0.05, + "grad_norm": 1.6493411230580337, + "learning_rate": 1.9983385278562524e-05, + "loss": 0.7253, + "step": 470 + }, + { + "epoch": 0.05, + "grad_norm": 1.8178196745533752, + "learning_rate": 1.9983194906863305e-05, + "loss": 0.8109, + "step": 471 + }, + { + "epoch": 0.05, + "grad_norm": 1.6890060467396713, + "learning_rate": 1.99830034516444e-05, + "loss": 0.744, + "step": 472 + }, + { + "epoch": 0.05, + "grad_norm": 1.7568201017705833, + "learning_rate": 1.9982810912926586e-05, + "loss": 0.8342, + "step": 473 + }, + { + "epoch": 0.05, + "grad_norm": 1.6989251017754607, + "learning_rate": 1.998261729073076e-05, + "loss": 0.77, + "step": 474 + }, + { + "epoch": 0.05, + "grad_norm": 1.673235968940527, + "learning_rate": 1.998242258507794e-05, + "loss": 0.8354, + "step": 475 + }, + { + "epoch": 0.05, + "grad_norm": 1.777489272545835, + "learning_rate": 1.9982226795989255e-05, + "loss": 0.7293, + "step": 476 + }, + { + "epoch": 0.05, + "grad_norm": 1.9007324938155739, + "learning_rate": 1.9982029923485957e-05, + "loss": 0.8945, + "step": 477 + }, + { + "epoch": 0.05, + "grad_norm": 1.695960720362699, + "learning_rate": 1.9981831967589412e-05, + "loss": 0.7604, + "step": 478 + }, + { + "epoch": 0.05, + "grad_norm": 1.546497291074308, + "learning_rate": 1.9981632928321103e-05, + "loss": 0.747, + "step": 479 + }, + { + "epoch": 0.05, + "grad_norm": 1.7995388123021736, + "learning_rate": 1.9981432805702638e-05, + "loss": 0.7974, + "step": 480 + }, + { + "epoch": 0.05, + "grad_norm": 1.8867834481521102, + "learning_rate": 1.998123159975573e-05, + "loss": 0.7991, + "step": 481 + }, + { + "epoch": 0.05, + "grad_norm": 1.8366610120829818, + "learning_rate": 1.9981029310502227e-05, + "loss": 0.762, + "step": 482 + }, + { + "epoch": 0.05, + "grad_norm": 1.5535863887360595, + "learning_rate": 1.9980825937964074e-05, + "loss": 0.8253, + "step": 483 + }, + { + "epoch": 0.05, + "grad_norm": 1.6730007484155052, + "learning_rate": 1.9980621482163348e-05, + "loss": 0.6544, + "step": 484 + }, + { + "epoch": 0.05, + "grad_norm": 1.828936812673665, + "learning_rate": 1.998041594312224e-05, + "loss": 0.7863, + "step": 485 + }, + { + "epoch": 0.05, + "grad_norm": 1.669970570478486, + "learning_rate": 1.998020932086306e-05, + "loss": 0.8412, + "step": 486 + }, + { + "epoch": 0.05, + "grad_norm": 1.730258104372564, + "learning_rate": 1.9980001615408228e-05, + "loss": 0.8279, + "step": 487 + }, + { + "epoch": 0.05, + "grad_norm": 1.7588302456093907, + "learning_rate": 1.997979282678029e-05, + "loss": 0.691, + "step": 488 + }, + { + "epoch": 0.05, + "grad_norm": 1.5050103571137134, + "learning_rate": 1.9979582955001908e-05, + "loss": 0.6758, + "step": 489 + }, + { + "epoch": 0.05, + "grad_norm": 1.8806382255625589, + "learning_rate": 1.997937200009586e-05, + "loss": 0.7779, + "step": 490 + }, + { + "epoch": 0.05, + "grad_norm": 1.7261954107457222, + "learning_rate": 1.9979159962085044e-05, + "loss": 0.7572, + "step": 491 + }, + { + "epoch": 0.05, + "grad_norm": 1.7225866766844808, + "learning_rate": 1.997894684099247e-05, + "loss": 0.8224, + "step": 492 + }, + { + "epoch": 0.05, + "grad_norm": 1.6829420585393222, + "learning_rate": 1.9978732636841264e-05, + "loss": 0.6425, + "step": 493 + }, + { + "epoch": 0.05, + "grad_norm": 1.7874342491112851, + "learning_rate": 1.9978517349654684e-05, + "loss": 0.7899, + "step": 494 + }, + { + "epoch": 0.05, + "grad_norm": 1.94124149740623, + "learning_rate": 1.9978300979456094e-05, + "loss": 0.7637, + "step": 495 + }, + { + "epoch": 0.05, + "grad_norm": 1.798068985507405, + "learning_rate": 1.9978083526268974e-05, + "loss": 0.9313, + "step": 496 + }, + { + "epoch": 0.05, + "grad_norm": 1.7847619085647561, + "learning_rate": 1.9977864990116926e-05, + "loss": 0.8486, + "step": 497 + }, + { + "epoch": 0.05, + "grad_norm": 1.6565790972074341, + "learning_rate": 1.9977645371023673e-05, + "loss": 0.807, + "step": 498 + }, + { + "epoch": 0.05, + "grad_norm": 1.654235902274819, + "learning_rate": 1.9977424669013044e-05, + "loss": 0.724, + "step": 499 + }, + { + "epoch": 0.05, + "grad_norm": 1.6431609076499223, + "learning_rate": 1.9977202884109e-05, + "loss": 0.6685, + "step": 500 + }, + { + "epoch": 0.05, + "grad_norm": 1.643346718045025, + "learning_rate": 1.9976980016335607e-05, + "loss": 0.7285, + "step": 501 + }, + { + "epoch": 0.05, + "grad_norm": 1.833324399036458, + "learning_rate": 1.9976756065717056e-05, + "loss": 0.8175, + "step": 502 + }, + { + "epoch": 0.05, + "grad_norm": 1.6383228878544507, + "learning_rate": 1.9976531032277653e-05, + "loss": 0.736, + "step": 503 + }, + { + "epoch": 0.05, + "grad_norm": 1.9735369455703395, + "learning_rate": 1.9976304916041824e-05, + "loss": 0.6961, + "step": 504 + }, + { + "epoch": 0.05, + "grad_norm": 1.7905953530647531, + "learning_rate": 1.9976077717034105e-05, + "loss": 0.8517, + "step": 505 + }, + { + "epoch": 0.05, + "grad_norm": 1.588535210734372, + "learning_rate": 1.997584943527916e-05, + "loss": 0.804, + "step": 506 + }, + { + "epoch": 0.05, + "grad_norm": 1.627547386451281, + "learning_rate": 1.9975620070801763e-05, + "loss": 0.7761, + "step": 507 + }, + { + "epoch": 0.05, + "grad_norm": 1.7265931965712278, + "learning_rate": 1.997538962362681e-05, + "loss": 0.7825, + "step": 508 + }, + { + "epoch": 0.05, + "grad_norm": 1.847361586714884, + "learning_rate": 1.997515809377931e-05, + "loss": 0.7746, + "step": 509 + }, + { + "epoch": 0.05, + "grad_norm": 1.759148489609264, + "learning_rate": 1.997492548128439e-05, + "loss": 0.8152, + "step": 510 + }, + { + "epoch": 0.05, + "grad_norm": 1.822969884000541, + "learning_rate": 1.9974691786167303e-05, + "loss": 0.7323, + "step": 511 + }, + { + "epoch": 0.05, + "grad_norm": 1.9328903420977737, + "learning_rate": 1.9974457008453408e-05, + "loss": 0.791, + "step": 512 + }, + { + "epoch": 0.05, + "grad_norm": 1.687975853272304, + "learning_rate": 1.997422114816819e-05, + "loss": 0.7652, + "step": 513 + }, + { + "epoch": 0.05, + "grad_norm": 1.8072781375412006, + "learning_rate": 1.9973984205337244e-05, + "loss": 0.7159, + "step": 514 + }, + { + "epoch": 0.05, + "grad_norm": 1.7775120206705086, + "learning_rate": 1.997374617998629e-05, + "loss": 0.8137, + "step": 515 + }, + { + "epoch": 0.05, + "grad_norm": 1.6592282302457346, + "learning_rate": 1.997350707214116e-05, + "loss": 0.7775, + "step": 516 + }, + { + "epoch": 0.05, + "grad_norm": 1.6719635951730791, + "learning_rate": 1.99732668818278e-05, + "loss": 0.827, + "step": 517 + }, + { + "epoch": 0.05, + "grad_norm": 1.65300911080359, + "learning_rate": 1.997302560907229e-05, + "loss": 0.7261, + "step": 518 + }, + { + "epoch": 0.05, + "grad_norm": 1.76543409775746, + "learning_rate": 1.997278325390081e-05, + "loss": 0.8259, + "step": 519 + }, + { + "epoch": 0.05, + "grad_norm": 1.6797023411856506, + "learning_rate": 1.9972539816339664e-05, + "loss": 0.6834, + "step": 520 + }, + { + "epoch": 0.05, + "grad_norm": 1.816213182694269, + "learning_rate": 1.997229529641527e-05, + "loss": 0.8897, + "step": 521 + }, + { + "epoch": 0.05, + "grad_norm": 1.583562441314376, + "learning_rate": 1.9972049694154175e-05, + "loss": 0.7364, + "step": 522 + }, + { + "epoch": 0.05, + "grad_norm": 1.5629931433036204, + "learning_rate": 1.997180300958303e-05, + "loss": 0.7036, + "step": 523 + }, + { + "epoch": 0.05, + "grad_norm": 1.7244295918749826, + "learning_rate": 1.997155524272861e-05, + "loss": 0.8148, + "step": 524 + }, + { + "epoch": 0.05, + "grad_norm": 1.9409193617437879, + "learning_rate": 1.997130639361781e-05, + "loss": 0.8473, + "step": 525 + }, + { + "epoch": 0.05, + "grad_norm": 1.7157934465546114, + "learning_rate": 1.9971056462277632e-05, + "loss": 0.7863, + "step": 526 + }, + { + "epoch": 0.05, + "grad_norm": 1.646802444838358, + "learning_rate": 1.9970805448735205e-05, + "loss": 0.7295, + "step": 527 + }, + { + "epoch": 0.05, + "grad_norm": 1.625090042380927, + "learning_rate": 1.9970553353017772e-05, + "loss": 0.7481, + "step": 528 + }, + { + "epoch": 0.05, + "grad_norm": 1.7295431401631154, + "learning_rate": 1.9970300175152696e-05, + "loss": 0.8117, + "step": 529 + }, + { + "epoch": 0.05, + "grad_norm": 1.6140197901048001, + "learning_rate": 1.997004591516745e-05, + "loss": 0.7754, + "step": 530 + }, + { + "epoch": 0.05, + "grad_norm": 1.6560281350322912, + "learning_rate": 1.9969790573089637e-05, + "loss": 0.6868, + "step": 531 + }, + { + "epoch": 0.05, + "grad_norm": 1.7529556342448862, + "learning_rate": 1.9969534148946965e-05, + "loss": 0.6683, + "step": 532 + }, + { + "epoch": 0.05, + "grad_norm": 1.7068734128772407, + "learning_rate": 1.996927664276727e-05, + "loss": 0.6886, + "step": 533 + }, + { + "epoch": 0.05, + "grad_norm": 1.636282086916384, + "learning_rate": 1.9969018054578497e-05, + "loss": 0.707, + "step": 534 + }, + { + "epoch": 0.05, + "grad_norm": 1.703668964082737, + "learning_rate": 1.9968758384408715e-05, + "loss": 0.8091, + "step": 535 + }, + { + "epoch": 0.05, + "grad_norm": 1.6294150121655904, + "learning_rate": 1.99684976322861e-05, + "loss": 0.6922, + "step": 536 + }, + { + "epoch": 0.05, + "grad_norm": 1.7678562963178264, + "learning_rate": 1.9968235798238956e-05, + "loss": 0.8531, + "step": 537 + }, + { + "epoch": 0.05, + "grad_norm": 1.7785400771636022, + "learning_rate": 1.9967972882295704e-05, + "loss": 0.759, + "step": 538 + }, + { + "epoch": 0.05, + "grad_norm": 2.028098506449123, + "learning_rate": 1.9967708884484875e-05, + "loss": 0.9118, + "step": 539 + }, + { + "epoch": 0.05, + "grad_norm": 1.5347148064228826, + "learning_rate": 1.996744380483513e-05, + "loss": 0.6544, + "step": 540 + }, + { + "epoch": 0.06, + "grad_norm": 1.820079444384609, + "learning_rate": 1.9967177643375227e-05, + "loss": 0.8159, + "step": 541 + }, + { + "epoch": 0.06, + "grad_norm": 1.6671394749171597, + "learning_rate": 1.9966910400134062e-05, + "loss": 0.7221, + "step": 542 + }, + { + "epoch": 0.06, + "grad_norm": 1.6788173634731034, + "learning_rate": 1.996664207514064e-05, + "loss": 0.7436, + "step": 543 + }, + { + "epoch": 0.06, + "grad_norm": 1.5098704414985629, + "learning_rate": 1.9966372668424083e-05, + "loss": 0.6714, + "step": 544 + }, + { + "epoch": 0.06, + "grad_norm": 1.5418486733686516, + "learning_rate": 1.9966102180013625e-05, + "loss": 0.758, + "step": 545 + }, + { + "epoch": 0.06, + "grad_norm": 1.7685637436623223, + "learning_rate": 1.9965830609938627e-05, + "loss": 0.7747, + "step": 546 + }, + { + "epoch": 0.06, + "grad_norm": 1.7274011670181888, + "learning_rate": 1.996555795822857e-05, + "loss": 0.8325, + "step": 547 + }, + { + "epoch": 0.06, + "grad_norm": 1.6679061402459576, + "learning_rate": 1.9965284224913034e-05, + "loss": 0.8263, + "step": 548 + }, + { + "epoch": 0.06, + "grad_norm": 1.573807291130018, + "learning_rate": 1.9965009410021742e-05, + "loss": 0.713, + "step": 549 + }, + { + "epoch": 0.06, + "grad_norm": 1.8231463328916808, + "learning_rate": 1.9964733513584508e-05, + "loss": 0.7005, + "step": 550 + }, + { + "epoch": 0.06, + "grad_norm": 1.6741907583067228, + "learning_rate": 1.9964456535631287e-05, + "loss": 0.7334, + "step": 551 + }, + { + "epoch": 0.06, + "grad_norm": 1.79918584488658, + "learning_rate": 1.9964178476192132e-05, + "loss": 0.7676, + "step": 552 + }, + { + "epoch": 0.06, + "grad_norm": 1.811502444265952, + "learning_rate": 1.9963899335297227e-05, + "loss": 0.7321, + "step": 553 + }, + { + "epoch": 0.06, + "grad_norm": 1.7755800902867478, + "learning_rate": 1.9963619112976867e-05, + "loss": 0.7925, + "step": 554 + }, + { + "epoch": 0.06, + "grad_norm": 1.5597918230704464, + "learning_rate": 1.9963337809261465e-05, + "loss": 0.7272, + "step": 555 + }, + { + "epoch": 0.06, + "grad_norm": 1.864093123130176, + "learning_rate": 1.9963055424181556e-05, + "loss": 0.7669, + "step": 556 + }, + { + "epoch": 0.06, + "grad_norm": 2.389358668807301, + "learning_rate": 1.9962771957767782e-05, + "loss": 0.7292, + "step": 557 + }, + { + "epoch": 0.06, + "grad_norm": 1.74596830181998, + "learning_rate": 1.9962487410050915e-05, + "loss": 0.7526, + "step": 558 + }, + { + "epoch": 0.06, + "grad_norm": 1.8385868687010367, + "learning_rate": 1.9962201781061833e-05, + "loss": 0.8057, + "step": 559 + }, + { + "epoch": 0.06, + "grad_norm": 1.8668615928182508, + "learning_rate": 1.996191507083154e-05, + "loss": 0.792, + "step": 560 + }, + { + "epoch": 0.06, + "grad_norm": 1.7652860661908676, + "learning_rate": 1.9961627279391154e-05, + "loss": 0.7552, + "step": 561 + }, + { + "epoch": 0.06, + "grad_norm": 1.739278918825244, + "learning_rate": 1.9961338406771908e-05, + "loss": 0.8162, + "step": 562 + }, + { + "epoch": 0.06, + "grad_norm": 1.6244770953157561, + "learning_rate": 1.996104845300516e-05, + "loss": 0.7304, + "step": 563 + }, + { + "epoch": 0.06, + "grad_norm": 1.7582338674779954, + "learning_rate": 1.996075741812237e-05, + "loss": 0.7494, + "step": 564 + }, + { + "epoch": 0.06, + "grad_norm": 1.7029228717071425, + "learning_rate": 1.9960465302155132e-05, + "loss": 0.7869, + "step": 565 + }, + { + "epoch": 0.06, + "grad_norm": 1.6435160351803817, + "learning_rate": 1.996017210513515e-05, + "loss": 0.7812, + "step": 566 + }, + { + "epoch": 0.06, + "grad_norm": 1.6801997517535343, + "learning_rate": 1.9959877827094248e-05, + "loss": 0.765, + "step": 567 + }, + { + "epoch": 0.06, + "grad_norm": 1.835359427744583, + "learning_rate": 1.995958246806436e-05, + "loss": 0.8702, + "step": 568 + }, + { + "epoch": 0.06, + "grad_norm": 1.7548856870112344, + "learning_rate": 1.995928602807755e-05, + "loss": 0.891, + "step": 569 + }, + { + "epoch": 0.06, + "grad_norm": 1.6249171481144946, + "learning_rate": 1.9958988507165985e-05, + "loss": 0.7405, + "step": 570 + }, + { + "epoch": 0.06, + "grad_norm": 1.8747074662765073, + "learning_rate": 1.9958689905361956e-05, + "loss": 0.8896, + "step": 571 + }, + { + "epoch": 0.06, + "grad_norm": 1.685467232191064, + "learning_rate": 1.9958390222697875e-05, + "loss": 0.8109, + "step": 572 + }, + { + "epoch": 0.06, + "grad_norm": 1.5586476170652768, + "learning_rate": 1.9958089459206272e-05, + "loss": 0.8215, + "step": 573 + }, + { + "epoch": 0.06, + "grad_norm": 1.9110312441730324, + "learning_rate": 1.9957787614919782e-05, + "loss": 0.683, + "step": 574 + }, + { + "epoch": 0.06, + "grad_norm": 1.795968523861863, + "learning_rate": 1.9957484689871167e-05, + "loss": 0.7298, + "step": 575 + }, + { + "epoch": 0.06, + "grad_norm": 1.619585532371068, + "learning_rate": 1.995718068409331e-05, + "loss": 0.8326, + "step": 576 + }, + { + "epoch": 0.06, + "grad_norm": 1.686148657322329, + "learning_rate": 1.99568755976192e-05, + "loss": 0.7904, + "step": 577 + }, + { + "epoch": 0.06, + "grad_norm": 1.8980209427585717, + "learning_rate": 1.9956569430481954e-05, + "loss": 0.8522, + "step": 578 + }, + { + "epoch": 0.06, + "grad_norm": 1.8223249537789892, + "learning_rate": 1.99562621827148e-05, + "loss": 0.7409, + "step": 579 + }, + { + "epoch": 0.06, + "grad_norm": 1.6249663857855754, + "learning_rate": 1.9955953854351083e-05, + "loss": 0.7908, + "step": 580 + }, + { + "epoch": 0.06, + "grad_norm": 1.6680154013641537, + "learning_rate": 1.995564444542427e-05, + "loss": 0.8155, + "step": 581 + }, + { + "epoch": 0.06, + "grad_norm": 1.7437358002377406, + "learning_rate": 1.9955333955967936e-05, + "loss": 0.7572, + "step": 582 + }, + { + "epoch": 0.06, + "grad_norm": 1.7442385392988993, + "learning_rate": 1.9955022386015792e-05, + "loss": 0.7381, + "step": 583 + }, + { + "epoch": 0.06, + "grad_norm": 1.6455585855805404, + "learning_rate": 1.995470973560164e-05, + "loss": 0.7625, + "step": 584 + }, + { + "epoch": 0.06, + "grad_norm": 1.5922361876502813, + "learning_rate": 1.995439600475943e-05, + "loss": 0.7293, + "step": 585 + }, + { + "epoch": 0.06, + "grad_norm": 1.5827693035694907, + "learning_rate": 1.9954081193523197e-05, + "loss": 0.8177, + "step": 586 + }, + { + "epoch": 0.06, + "grad_norm": 1.7369402384800399, + "learning_rate": 1.9953765301927116e-05, + "loss": 0.761, + "step": 587 + }, + { + "epoch": 0.06, + "grad_norm": 1.7496504255649807, + "learning_rate": 1.9953448330005472e-05, + "loss": 0.7633, + "step": 588 + }, + { + "epoch": 0.06, + "grad_norm": 1.6992475664068267, + "learning_rate": 1.9953130277792664e-05, + "loss": 0.7109, + "step": 589 + }, + { + "epoch": 0.06, + "grad_norm": 1.7758089119133673, + "learning_rate": 1.9952811145323213e-05, + "loss": 0.8003, + "step": 590 + }, + { + "epoch": 0.06, + "grad_norm": 1.5105006477745304, + "learning_rate": 1.995249093263176e-05, + "loss": 0.6333, + "step": 591 + }, + { + "epoch": 0.06, + "grad_norm": 1.5506652898849551, + "learning_rate": 1.9952169639753055e-05, + "loss": 0.7387, + "step": 592 + }, + { + "epoch": 0.06, + "grad_norm": 1.6406097273031046, + "learning_rate": 1.995184726672197e-05, + "loss": 0.754, + "step": 593 + }, + { + "epoch": 0.06, + "grad_norm": 1.6937146704480206, + "learning_rate": 1.9951523813573496e-05, + "loss": 0.6996, + "step": 594 + }, + { + "epoch": 0.06, + "grad_norm": 1.6863369020459251, + "learning_rate": 1.9951199280342732e-05, + "loss": 0.7959, + "step": 595 + }, + { + "epoch": 0.06, + "grad_norm": 1.601016536813647, + "learning_rate": 1.9950873667064906e-05, + "loss": 0.7021, + "step": 596 + }, + { + "epoch": 0.06, + "grad_norm": 1.7116189222241849, + "learning_rate": 1.995054697377536e-05, + "loss": 0.7783, + "step": 597 + }, + { + "epoch": 0.06, + "grad_norm": 1.7443096421429085, + "learning_rate": 1.9950219200509547e-05, + "loss": 0.6914, + "step": 598 + }, + { + "epoch": 0.06, + "grad_norm": 1.8021785056836013, + "learning_rate": 1.9949890347303047e-05, + "loss": 0.7945, + "step": 599 + }, + { + "epoch": 0.06, + "grad_norm": 1.6555910571792856, + "learning_rate": 1.9949560414191546e-05, + "loss": 0.7046, + "step": 600 + }, + { + "epoch": 0.06, + "grad_norm": 1.7326941535085139, + "learning_rate": 1.9949229401210855e-05, + "loss": 0.8461, + "step": 601 + }, + { + "epoch": 0.06, + "grad_norm": 1.6370922161012353, + "learning_rate": 1.9948897308396903e-05, + "loss": 0.6904, + "step": 602 + }, + { + "epoch": 0.06, + "grad_norm": 1.6119600425769653, + "learning_rate": 1.9948564135785733e-05, + "loss": 0.8214, + "step": 603 + }, + { + "epoch": 0.06, + "grad_norm": 1.7636336681905245, + "learning_rate": 1.9948229883413503e-05, + "loss": 0.8049, + "step": 604 + }, + { + "epoch": 0.06, + "grad_norm": 1.766219115503329, + "learning_rate": 1.994789455131649e-05, + "loss": 0.6965, + "step": 605 + }, + { + "epoch": 0.06, + "grad_norm": 1.7177562818571706, + "learning_rate": 1.994755813953109e-05, + "loss": 0.8001, + "step": 606 + }, + { + "epoch": 0.06, + "grad_norm": 1.720211764446672, + "learning_rate": 1.994722064809382e-05, + "loss": 0.8151, + "step": 607 + }, + { + "epoch": 0.06, + "grad_norm": 1.7469479452575274, + "learning_rate": 1.9946882077041304e-05, + "loss": 0.7806, + "step": 608 + }, + { + "epoch": 0.06, + "grad_norm": 1.85793386115385, + "learning_rate": 1.9946542426410295e-05, + "loss": 0.7966, + "step": 609 + }, + { + "epoch": 0.06, + "grad_norm": 1.7629818781691207, + "learning_rate": 1.9946201696237645e-05, + "loss": 0.7873, + "step": 610 + }, + { + "epoch": 0.06, + "grad_norm": 1.6830394408562603, + "learning_rate": 1.9945859886560346e-05, + "loss": 0.7506, + "step": 611 + }, + { + "epoch": 0.06, + "grad_norm": 1.7860842728889232, + "learning_rate": 1.9945516997415493e-05, + "loss": 0.7918, + "step": 612 + }, + { + "epoch": 0.06, + "grad_norm": 1.6513821786445477, + "learning_rate": 1.9945173028840296e-05, + "loss": 0.8549, + "step": 613 + }, + { + "epoch": 0.06, + "grad_norm": 1.9218388455242161, + "learning_rate": 1.9944827980872094e-05, + "loss": 0.8566, + "step": 614 + }, + { + "epoch": 0.06, + "grad_norm": 1.7239023872462365, + "learning_rate": 1.9944481853548335e-05, + "loss": 0.7107, + "step": 615 + }, + { + "epoch": 0.06, + "grad_norm": 1.7916304919841761, + "learning_rate": 1.9944134646906588e-05, + "loss": 0.8648, + "step": 616 + }, + { + "epoch": 0.06, + "grad_norm": 1.4579672832201422, + "learning_rate": 1.994378636098453e-05, + "loss": 0.7886, + "step": 617 + }, + { + "epoch": 0.06, + "grad_norm": 1.5409736030803547, + "learning_rate": 1.9943436995819968e-05, + "loss": 0.6151, + "step": 618 + }, + { + "epoch": 0.06, + "grad_norm": 1.7127356435439465, + "learning_rate": 1.9943086551450816e-05, + "loss": 0.7307, + "step": 619 + }, + { + "epoch": 0.06, + "grad_norm": 1.5792128695492997, + "learning_rate": 1.9942735027915113e-05, + "loss": 0.7638, + "step": 620 + }, + { + "epoch": 0.06, + "grad_norm": 1.7555416285049137, + "learning_rate": 1.994238242525101e-05, + "loss": 0.8322, + "step": 621 + }, + { + "epoch": 0.06, + "grad_norm": 1.805621761550607, + "learning_rate": 1.9942028743496773e-05, + "loss": 0.744, + "step": 622 + }, + { + "epoch": 0.06, + "grad_norm": 1.8466682699690558, + "learning_rate": 1.9941673982690795e-05, + "loss": 0.8617, + "step": 623 + }, + { + "epoch": 0.06, + "grad_norm": 1.6590526970977735, + "learning_rate": 1.9941318142871576e-05, + "loss": 0.6979, + "step": 624 + }, + { + "epoch": 0.06, + "grad_norm": 1.7786400594149645, + "learning_rate": 1.9940961224077736e-05, + "loss": 0.7248, + "step": 625 + }, + { + "epoch": 0.06, + "grad_norm": 1.6788965960428635, + "learning_rate": 1.9940603226348015e-05, + "loss": 0.7699, + "step": 626 + }, + { + "epoch": 0.06, + "grad_norm": 1.9265619143980974, + "learning_rate": 1.994024414972127e-05, + "loss": 0.8173, + "step": 627 + }, + { + "epoch": 0.06, + "grad_norm": 1.715611931609657, + "learning_rate": 1.9939883994236468e-05, + "loss": 0.6624, + "step": 628 + }, + { + "epoch": 0.06, + "grad_norm": 1.5987849831467449, + "learning_rate": 1.99395227599327e-05, + "loss": 0.8136, + "step": 629 + }, + { + "epoch": 0.06, + "grad_norm": 1.6986730658985771, + "learning_rate": 1.9939160446849176e-05, + "loss": 0.7376, + "step": 630 + }, + { + "epoch": 0.06, + "grad_norm": 1.6545697033122497, + "learning_rate": 1.9938797055025213e-05, + "loss": 0.7149, + "step": 631 + }, + { + "epoch": 0.06, + "grad_norm": 1.994617700256768, + "learning_rate": 1.993843258450026e-05, + "loss": 0.8161, + "step": 632 + }, + { + "epoch": 0.06, + "grad_norm": 1.8732615461651878, + "learning_rate": 1.9938067035313865e-05, + "loss": 0.7736, + "step": 633 + }, + { + "epoch": 0.06, + "grad_norm": 1.6800973995399204, + "learning_rate": 1.993770040750571e-05, + "loss": 0.8434, + "step": 634 + }, + { + "epoch": 0.06, + "grad_norm": 1.6925647661553942, + "learning_rate": 1.9937332701115585e-05, + "loss": 0.788, + "step": 635 + }, + { + "epoch": 0.06, + "grad_norm": 1.7217431415270548, + "learning_rate": 1.993696391618339e-05, + "loss": 0.6426, + "step": 636 + }, + { + "epoch": 0.06, + "grad_norm": 1.6743741825967342, + "learning_rate": 1.993659405274917e-05, + "loss": 0.7904, + "step": 637 + }, + { + "epoch": 0.06, + "grad_norm": 1.7108692982520572, + "learning_rate": 1.993622311085305e-05, + "loss": 0.6979, + "step": 638 + }, + { + "epoch": 0.06, + "grad_norm": 1.6256885715656448, + "learning_rate": 1.9935851090535295e-05, + "loss": 0.7019, + "step": 639 + }, + { + "epoch": 0.07, + "grad_norm": 1.6229470491886617, + "learning_rate": 1.993547799183629e-05, + "loss": 0.7252, + "step": 640 + }, + { + "epoch": 0.07, + "grad_norm": 1.5160911920409792, + "learning_rate": 1.9935103814796516e-05, + "loss": 0.7873, + "step": 641 + }, + { + "epoch": 0.07, + "grad_norm": 1.5804423279608668, + "learning_rate": 1.9934728559456592e-05, + "loss": 0.8012, + "step": 642 + }, + { + "epoch": 0.07, + "grad_norm": 1.4519598120061032, + "learning_rate": 1.9934352225857245e-05, + "loss": 0.7251, + "step": 643 + }, + { + "epoch": 0.07, + "grad_norm": 1.7263353286073957, + "learning_rate": 1.993397481403932e-05, + "loss": 0.8427, + "step": 644 + }, + { + "epoch": 0.07, + "grad_norm": 1.637774815064037, + "learning_rate": 1.993359632404378e-05, + "loss": 0.7888, + "step": 645 + }, + { + "epoch": 0.07, + "grad_norm": 1.7467785786117918, + "learning_rate": 1.9933216755911706e-05, + "loss": 0.7857, + "step": 646 + }, + { + "epoch": 0.07, + "grad_norm": 1.5876144203148541, + "learning_rate": 1.9932836109684287e-05, + "loss": 0.7362, + "step": 647 + }, + { + "epoch": 0.07, + "grad_norm": 1.697451802836231, + "learning_rate": 1.993245438540284e-05, + "loss": 0.7894, + "step": 648 + }, + { + "epoch": 0.07, + "grad_norm": 1.7166828340233518, + "learning_rate": 1.9932071583108796e-05, + "loss": 0.7418, + "step": 649 + }, + { + "epoch": 0.07, + "grad_norm": 1.4878913435621886, + "learning_rate": 1.9931687702843706e-05, + "loss": 0.7117, + "step": 650 + }, + { + "epoch": 0.07, + "grad_norm": 1.9177940612995616, + "learning_rate": 1.9931302744649224e-05, + "loss": 0.8025, + "step": 651 + }, + { + "epoch": 0.07, + "grad_norm": 1.4139266215501305, + "learning_rate": 1.993091670856714e-05, + "loss": 0.7023, + "step": 652 + }, + { + "epoch": 0.07, + "grad_norm": 1.6439224416147458, + "learning_rate": 1.993052959463935e-05, + "loss": 0.7698, + "step": 653 + }, + { + "epoch": 0.07, + "grad_norm": 1.5886834224506095, + "learning_rate": 1.9930141402907867e-05, + "loss": 0.7759, + "step": 654 + }, + { + "epoch": 0.07, + "grad_norm": 1.5458636504703784, + "learning_rate": 1.9929752133414827e-05, + "loss": 0.7419, + "step": 655 + }, + { + "epoch": 0.07, + "grad_norm": 1.5464792132429968, + "learning_rate": 1.9929361786202476e-05, + "loss": 0.6632, + "step": 656 + }, + { + "epoch": 0.07, + "grad_norm": 1.525133701113065, + "learning_rate": 1.9928970361313182e-05, + "loss": 0.8723, + "step": 657 + }, + { + "epoch": 0.07, + "grad_norm": 1.6124831659371417, + "learning_rate": 1.9928577858789424e-05, + "loss": 0.806, + "step": 658 + }, + { + "epoch": 0.07, + "grad_norm": 1.4701753543539475, + "learning_rate": 1.992818427867381e-05, + "loss": 0.6918, + "step": 659 + }, + { + "epoch": 0.07, + "grad_norm": 1.6849551194071541, + "learning_rate": 1.9927789621009045e-05, + "loss": 0.8426, + "step": 660 + }, + { + "epoch": 0.07, + "grad_norm": 1.8884047795845, + "learning_rate": 1.9927393885837975e-05, + "loss": 0.695, + "step": 661 + }, + { + "epoch": 0.07, + "grad_norm": 1.8116947046420835, + "learning_rate": 1.9926997073203544e-05, + "loss": 0.8473, + "step": 662 + }, + { + "epoch": 0.07, + "grad_norm": 1.8689748277748406, + "learning_rate": 1.9926599183148822e-05, + "loss": 0.8169, + "step": 663 + }, + { + "epoch": 0.07, + "grad_norm": 1.994878462441299, + "learning_rate": 1.9926200215716993e-05, + "loss": 0.8405, + "step": 664 + }, + { + "epoch": 0.07, + "grad_norm": 1.7987595227302102, + "learning_rate": 1.992580017095136e-05, + "loss": 0.7267, + "step": 665 + }, + { + "epoch": 0.07, + "grad_norm": 1.5843737449052286, + "learning_rate": 1.992539904889534e-05, + "loss": 0.8127, + "step": 666 + }, + { + "epoch": 0.07, + "grad_norm": 1.8387810874585708, + "learning_rate": 1.992499684959247e-05, + "loss": 0.7091, + "step": 667 + }, + { + "epoch": 0.07, + "grad_norm": 1.7093730466761763, + "learning_rate": 1.9924593573086398e-05, + "loss": 0.7642, + "step": 668 + }, + { + "epoch": 0.07, + "grad_norm": 1.7669829291305252, + "learning_rate": 1.99241892194209e-05, + "loss": 0.7177, + "step": 669 + }, + { + "epoch": 0.07, + "grad_norm": 1.7859363978876226, + "learning_rate": 1.9923783788639862e-05, + "loss": 0.7864, + "step": 670 + }, + { + "epoch": 0.07, + "grad_norm": 1.5913637995967829, + "learning_rate": 1.992337728078728e-05, + "loss": 0.7669, + "step": 671 + }, + { + "epoch": 0.07, + "grad_norm": 1.9006919289403583, + "learning_rate": 1.9922969695907278e-05, + "loss": 0.8227, + "step": 672 + }, + { + "epoch": 0.07, + "grad_norm": 1.5707279920402772, + "learning_rate": 1.9922561034044095e-05, + "loss": 0.7132, + "step": 673 + }, + { + "epoch": 0.07, + "grad_norm": 2.0511635474237364, + "learning_rate": 1.9922151295242085e-05, + "loss": 0.777, + "step": 674 + }, + { + "epoch": 0.07, + "grad_norm": 1.378420795935105, + "learning_rate": 1.9921740479545716e-05, + "loss": 0.6924, + "step": 675 + }, + { + "epoch": 0.07, + "grad_norm": 1.3751596448402028, + "learning_rate": 1.9921328586999574e-05, + "loss": 0.7471, + "step": 676 + }, + { + "epoch": 0.07, + "grad_norm": 1.6308793722944452, + "learning_rate": 1.992091561764837e-05, + "loss": 0.7539, + "step": 677 + }, + { + "epoch": 0.07, + "grad_norm": 1.6200136862142986, + "learning_rate": 1.9920501571536917e-05, + "loss": 0.71, + "step": 678 + }, + { + "epoch": 0.07, + "grad_norm": 1.6981092405574794, + "learning_rate": 1.9920086448710162e-05, + "loss": 0.7512, + "step": 679 + }, + { + "epoch": 0.07, + "grad_norm": 1.731471177175868, + "learning_rate": 1.9919670249213152e-05, + "loss": 0.8579, + "step": 680 + }, + { + "epoch": 0.07, + "grad_norm": 1.6461346547255145, + "learning_rate": 1.9919252973091067e-05, + "loss": 0.7589, + "step": 681 + }, + { + "epoch": 0.07, + "grad_norm": 1.7313768240322136, + "learning_rate": 1.991883462038919e-05, + "loss": 0.8429, + "step": 682 + }, + { + "epoch": 0.07, + "grad_norm": 1.6033143635975582, + "learning_rate": 1.9918415191152927e-05, + "loss": 0.7439, + "step": 683 + }, + { + "epoch": 0.07, + "grad_norm": 1.6283606197499367, + "learning_rate": 1.99179946854278e-05, + "loss": 0.7209, + "step": 684 + }, + { + "epoch": 0.07, + "grad_norm": 1.6895382664512686, + "learning_rate": 1.9917573103259452e-05, + "loss": 0.8396, + "step": 685 + }, + { + "epoch": 0.07, + "grad_norm": 1.5021274979284158, + "learning_rate": 1.9917150444693635e-05, + "loss": 0.809, + "step": 686 + }, + { + "epoch": 0.07, + "grad_norm": 1.5898326262688758, + "learning_rate": 1.9916726709776228e-05, + "loss": 0.7391, + "step": 687 + }, + { + "epoch": 0.07, + "grad_norm": 1.7143323205939232, + "learning_rate": 1.9916301898553215e-05, + "loss": 0.8389, + "step": 688 + }, + { + "epoch": 0.07, + "grad_norm": 1.6601964607064166, + "learning_rate": 1.9915876011070705e-05, + "loss": 0.7384, + "step": 689 + }, + { + "epoch": 0.07, + "grad_norm": 1.6997295899037483, + "learning_rate": 1.991544904737492e-05, + "loss": 0.6985, + "step": 690 + }, + { + "epoch": 0.07, + "grad_norm": 1.6338386462513637, + "learning_rate": 1.9915021007512202e-05, + "loss": 0.8418, + "step": 691 + }, + { + "epoch": 0.07, + "grad_norm": 1.6619689013467975, + "learning_rate": 1.991459189152901e-05, + "loss": 0.828, + "step": 692 + }, + { + "epoch": 0.07, + "grad_norm": 1.5415233753313728, + "learning_rate": 1.991416169947191e-05, + "loss": 0.7842, + "step": 693 + }, + { + "epoch": 0.07, + "grad_norm": 1.7594928368716753, + "learning_rate": 1.9913730431387603e-05, + "loss": 0.7791, + "step": 694 + }, + { + "epoch": 0.07, + "grad_norm": 1.8285621513513672, + "learning_rate": 1.9913298087322886e-05, + "loss": 0.8216, + "step": 695 + }, + { + "epoch": 0.07, + "grad_norm": 1.6189878588401247, + "learning_rate": 1.991286466732469e-05, + "loss": 0.7305, + "step": 696 + }, + { + "epoch": 0.07, + "grad_norm": 1.6523217899208054, + "learning_rate": 1.9912430171440053e-05, + "loss": 0.8201, + "step": 697 + }, + { + "epoch": 0.07, + "grad_norm": 1.6392344267199934, + "learning_rate": 1.9911994599716137e-05, + "loss": 0.7456, + "step": 698 + }, + { + "epoch": 0.07, + "grad_norm": 1.687768478994549, + "learning_rate": 1.9911557952200212e-05, + "loss": 0.7575, + "step": 699 + }, + { + "epoch": 0.07, + "grad_norm": 1.7800920776702511, + "learning_rate": 1.9911120228939668e-05, + "loss": 0.7296, + "step": 700 + }, + { + "epoch": 0.07, + "grad_norm": 1.7376436911092747, + "learning_rate": 1.991068142998202e-05, + "loss": 0.8336, + "step": 701 + }, + { + "epoch": 0.07, + "grad_norm": 1.6366755441602803, + "learning_rate": 1.991024155537489e-05, + "loss": 0.6899, + "step": 702 + }, + { + "epoch": 0.07, + "grad_norm": 1.6311542633007308, + "learning_rate": 1.9909800605166013e-05, + "loss": 0.7963, + "step": 703 + }, + { + "epoch": 0.07, + "grad_norm": 1.6271019133596079, + "learning_rate": 1.990935857940325e-05, + "loss": 0.7225, + "step": 704 + }, + { + "epoch": 0.07, + "grad_norm": 1.730023263842185, + "learning_rate": 1.9908915478134584e-05, + "loss": 0.8107, + "step": 705 + }, + { + "epoch": 0.07, + "grad_norm": 1.7223462659250526, + "learning_rate": 1.9908471301408097e-05, + "loss": 0.8464, + "step": 706 + }, + { + "epoch": 0.07, + "grad_norm": 1.766101862175367, + "learning_rate": 1.9908026049272006e-05, + "loss": 0.7489, + "step": 707 + }, + { + "epoch": 0.07, + "grad_norm": 1.5577791461925623, + "learning_rate": 1.990757972177463e-05, + "loss": 0.7676, + "step": 708 + }, + { + "epoch": 0.07, + "grad_norm": 1.7987244153202877, + "learning_rate": 1.990713231896441e-05, + "loss": 0.7746, + "step": 709 + }, + { + "epoch": 0.07, + "grad_norm": 1.7553912474460873, + "learning_rate": 1.990668384088991e-05, + "loss": 0.8975, + "step": 710 + }, + { + "epoch": 0.07, + "grad_norm": 1.7169730984255511, + "learning_rate": 1.99062342875998e-05, + "loss": 0.7652, + "step": 711 + }, + { + "epoch": 0.07, + "grad_norm": 1.826842707928925, + "learning_rate": 1.9905783659142878e-05, + "loss": 0.7199, + "step": 712 + }, + { + "epoch": 0.07, + "grad_norm": 1.7082093614480132, + "learning_rate": 1.9905331955568045e-05, + "loss": 0.694, + "step": 713 + }, + { + "epoch": 0.07, + "grad_norm": 1.5829316805497582, + "learning_rate": 1.990487917692433e-05, + "loss": 0.7717, + "step": 714 + }, + { + "epoch": 0.07, + "grad_norm": 1.6451440728127658, + "learning_rate": 1.9904425323260875e-05, + "loss": 0.7858, + "step": 715 + }, + { + "epoch": 0.07, + "grad_norm": 1.652820322856562, + "learning_rate": 1.990397039462694e-05, + "loss": 0.7959, + "step": 716 + }, + { + "epoch": 0.07, + "grad_norm": 1.7584740049172933, + "learning_rate": 1.99035143910719e-05, + "loss": 0.7561, + "step": 717 + }, + { + "epoch": 0.07, + "grad_norm": 1.7493478475508397, + "learning_rate": 1.990305731264525e-05, + "loss": 0.7176, + "step": 718 + }, + { + "epoch": 0.07, + "grad_norm": 1.618316837166389, + "learning_rate": 1.990259915939659e-05, + "loss": 0.7485, + "step": 719 + }, + { + "epoch": 0.07, + "grad_norm": 1.6385372900613149, + "learning_rate": 1.9902139931375654e-05, + "loss": 0.8057, + "step": 720 + }, + { + "epoch": 0.07, + "grad_norm": 1.5803614398859005, + "learning_rate": 1.9901679628632277e-05, + "loss": 0.6364, + "step": 721 + }, + { + "epoch": 0.07, + "grad_norm": 1.620284724869963, + "learning_rate": 1.9901218251216424e-05, + "loss": 0.7402, + "step": 722 + }, + { + "epoch": 0.07, + "grad_norm": 1.762040099161046, + "learning_rate": 1.9900755799178165e-05, + "loss": 0.7896, + "step": 723 + }, + { + "epoch": 0.07, + "grad_norm": 1.6331588276201974, + "learning_rate": 1.99002922725677e-05, + "loss": 0.7898, + "step": 724 + }, + { + "epoch": 0.07, + "grad_norm": 1.650574167609646, + "learning_rate": 1.989982767143533e-05, + "loss": 0.7415, + "step": 725 + }, + { + "epoch": 0.07, + "grad_norm": 1.8140348040677017, + "learning_rate": 1.9899361995831477e-05, + "loss": 0.9059, + "step": 726 + }, + { + "epoch": 0.07, + "grad_norm": 1.5401630519652945, + "learning_rate": 1.989889524580669e-05, + "loss": 0.6807, + "step": 727 + }, + { + "epoch": 0.07, + "grad_norm": 1.6906440729063186, + "learning_rate": 1.9898427421411627e-05, + "loss": 0.6651, + "step": 728 + }, + { + "epoch": 0.07, + "grad_norm": 1.7156359455773944, + "learning_rate": 1.9897958522697066e-05, + "loss": 0.7354, + "step": 729 + }, + { + "epoch": 0.07, + "grad_norm": 1.6778297011427266, + "learning_rate": 1.9897488549713887e-05, + "loss": 0.853, + "step": 730 + }, + { + "epoch": 0.07, + "grad_norm": 1.5822425468702135, + "learning_rate": 1.9897017502513107e-05, + "loss": 0.8437, + "step": 731 + }, + { + "epoch": 0.07, + "grad_norm": 1.5697794417278896, + "learning_rate": 1.9896545381145854e-05, + "loss": 0.6893, + "step": 732 + }, + { + "epoch": 0.07, + "grad_norm": 1.6609595195224915, + "learning_rate": 1.9896072185663358e-05, + "loss": 0.8496, + "step": 733 + }, + { + "epoch": 0.07, + "grad_norm": 1.7210583504941446, + "learning_rate": 1.989559791611699e-05, + "loss": 0.7671, + "step": 734 + }, + { + "epoch": 0.07, + "grad_norm": 1.6351024582932636, + "learning_rate": 1.989512257255821e-05, + "loss": 0.7643, + "step": 735 + }, + { + "epoch": 0.07, + "grad_norm": 1.5373211139173386, + "learning_rate": 1.9894646155038624e-05, + "loss": 0.7101, + "step": 736 + }, + { + "epoch": 0.07, + "grad_norm": 1.6909968283815922, + "learning_rate": 1.989416866360993e-05, + "loss": 0.7985, + "step": 737 + }, + { + "epoch": 0.08, + "grad_norm": 1.5172315909593315, + "learning_rate": 1.9893690098323955e-05, + "loss": 0.6585, + "step": 738 + }, + { + "epoch": 0.08, + "grad_norm": 1.6726015484769146, + "learning_rate": 1.989321045923264e-05, + "loss": 0.7843, + "step": 739 + }, + { + "epoch": 0.08, + "grad_norm": 1.5799583071761338, + "learning_rate": 1.989272974638804e-05, + "loss": 0.8198, + "step": 740 + }, + { + "epoch": 0.08, + "grad_norm": 1.6190253159018553, + "learning_rate": 1.9892247959842338e-05, + "loss": 0.7887, + "step": 741 + }, + { + "epoch": 0.08, + "grad_norm": 1.6597510473945443, + "learning_rate": 1.989176509964781e-05, + "loss": 0.8144, + "step": 742 + }, + { + "epoch": 0.08, + "grad_norm": 1.535999847235126, + "learning_rate": 1.9891281165856876e-05, + "loss": 0.6152, + "step": 743 + }, + { + "epoch": 0.08, + "grad_norm": 1.766018263265675, + "learning_rate": 1.989079615852205e-05, + "loss": 0.7636, + "step": 744 + }, + { + "epoch": 0.08, + "grad_norm": 1.5858962987014023, + "learning_rate": 1.9890310077695976e-05, + "loss": 0.7285, + "step": 745 + }, + { + "epoch": 0.08, + "grad_norm": 1.5638476394295513, + "learning_rate": 1.988982292343141e-05, + "loss": 0.7238, + "step": 746 + }, + { + "epoch": 0.08, + "grad_norm": 1.5665371791225642, + "learning_rate": 1.9889334695781227e-05, + "loss": 0.6451, + "step": 747 + }, + { + "epoch": 0.08, + "grad_norm": 1.629557149390182, + "learning_rate": 1.9888845394798416e-05, + "loss": 0.7883, + "step": 748 + }, + { + "epoch": 0.08, + "grad_norm": 1.7545548160299196, + "learning_rate": 1.9888355020536078e-05, + "loss": 0.7631, + "step": 749 + }, + { + "epoch": 0.08, + "grad_norm": 1.9006888455912045, + "learning_rate": 1.988786357304744e-05, + "loss": 0.7871, + "step": 750 + }, + { + "epoch": 0.08, + "grad_norm": 1.7168503900092866, + "learning_rate": 1.9887371052385843e-05, + "loss": 0.8515, + "step": 751 + }, + { + "epoch": 0.08, + "grad_norm": 1.8927518735252673, + "learning_rate": 1.9886877458604737e-05, + "loss": 0.8392, + "step": 752 + }, + { + "epoch": 0.08, + "grad_norm": 1.7011730949471735, + "learning_rate": 1.9886382791757695e-05, + "loss": 0.7202, + "step": 753 + }, + { + "epoch": 0.08, + "grad_norm": 1.637575948589387, + "learning_rate": 1.9885887051898407e-05, + "loss": 0.74, + "step": 754 + }, + { + "epoch": 0.08, + "grad_norm": 1.8131790572190998, + "learning_rate": 1.988539023908068e-05, + "loss": 0.7383, + "step": 755 + }, + { + "epoch": 0.08, + "grad_norm": 1.6681631940285306, + "learning_rate": 1.988489235335843e-05, + "loss": 0.7128, + "step": 756 + }, + { + "epoch": 0.08, + "grad_norm": 1.7593396966304202, + "learning_rate": 1.98843933947857e-05, + "loss": 0.8314, + "step": 757 + }, + { + "epoch": 0.08, + "grad_norm": 1.7039767523301115, + "learning_rate": 1.988389336341664e-05, + "loss": 0.7352, + "step": 758 + }, + { + "epoch": 0.08, + "grad_norm": 1.679183477039409, + "learning_rate": 1.988339225930552e-05, + "loss": 0.617, + "step": 759 + }, + { + "epoch": 0.08, + "grad_norm": 1.6711954196811725, + "learning_rate": 1.9882890082506733e-05, + "loss": 0.7709, + "step": 760 + }, + { + "epoch": 0.08, + "grad_norm": 1.7192406625162853, + "learning_rate": 1.988238683307478e-05, + "loss": 0.9232, + "step": 761 + }, + { + "epoch": 0.08, + "grad_norm": 1.5921764606177538, + "learning_rate": 1.9881882511064275e-05, + "loss": 0.7469, + "step": 762 + }, + { + "epoch": 0.08, + "grad_norm": 1.6184206958606004, + "learning_rate": 1.9881377116529964e-05, + "loss": 0.681, + "step": 763 + }, + { + "epoch": 0.08, + "grad_norm": 1.6011320408604426, + "learning_rate": 1.988087064952669e-05, + "loss": 0.7478, + "step": 764 + }, + { + "epoch": 0.08, + "grad_norm": 1.4149338242204557, + "learning_rate": 1.9880363110109427e-05, + "loss": 0.7489, + "step": 765 + }, + { + "epoch": 0.08, + "grad_norm": 1.6112484116223573, + "learning_rate": 1.987985449833326e-05, + "loss": 0.7996, + "step": 766 + }, + { + "epoch": 0.08, + "grad_norm": 1.7167345566634746, + "learning_rate": 1.987934481425339e-05, + "loss": 0.8087, + "step": 767 + }, + { + "epoch": 0.08, + "grad_norm": 1.5677451466214758, + "learning_rate": 1.987883405792514e-05, + "loss": 0.7465, + "step": 768 + }, + { + "epoch": 0.08, + "grad_norm": 1.5804647890281336, + "learning_rate": 1.9878322229403938e-05, + "loss": 0.626, + "step": 769 + }, + { + "epoch": 0.08, + "grad_norm": 1.616554155661927, + "learning_rate": 1.9877809328745338e-05, + "loss": 0.6658, + "step": 770 + }, + { + "epoch": 0.08, + "grad_norm": 1.5867376321137878, + "learning_rate": 1.987729535600501e-05, + "loss": 0.7254, + "step": 771 + }, + { + "epoch": 0.08, + "grad_norm": 1.7469693174480554, + "learning_rate": 1.9876780311238727e-05, + "loss": 0.7281, + "step": 772 + }, + { + "epoch": 0.08, + "grad_norm": 1.6938507016832554, + "learning_rate": 1.9876264194502403e-05, + "loss": 0.7879, + "step": 773 + }, + { + "epoch": 0.08, + "grad_norm": 1.6085928749683316, + "learning_rate": 1.9875747005852048e-05, + "loss": 0.8294, + "step": 774 + }, + { + "epoch": 0.08, + "grad_norm": 1.6144461068818, + "learning_rate": 1.987522874534379e-05, + "loss": 0.6786, + "step": 775 + }, + { + "epoch": 0.08, + "grad_norm": 1.6544216920426305, + "learning_rate": 1.987470941303389e-05, + "loss": 0.7309, + "step": 776 + }, + { + "epoch": 0.08, + "grad_norm": 1.7767428032177268, + "learning_rate": 1.9874189008978702e-05, + "loss": 0.7362, + "step": 777 + }, + { + "epoch": 0.08, + "grad_norm": 1.735644721679841, + "learning_rate": 1.9873667533234714e-05, + "loss": 0.8189, + "step": 778 + }, + { + "epoch": 0.08, + "grad_norm": 1.7759896504317103, + "learning_rate": 1.987314498585852e-05, + "loss": 0.7809, + "step": 779 + }, + { + "epoch": 0.08, + "grad_norm": 1.6443310875392836, + "learning_rate": 1.987262136690684e-05, + "loss": 0.7772, + "step": 780 + }, + { + "epoch": 0.08, + "grad_norm": 1.6914137805454288, + "learning_rate": 1.9872096676436498e-05, + "loss": 0.6338, + "step": 781 + }, + { + "epoch": 0.08, + "grad_norm": 1.9227888727294091, + "learning_rate": 1.9871570914504447e-05, + "loss": 0.8005, + "step": 782 + }, + { + "epoch": 0.08, + "grad_norm": 1.795882620074113, + "learning_rate": 1.9871044081167742e-05, + "loss": 0.7707, + "step": 783 + }, + { + "epoch": 0.08, + "grad_norm": 1.6930358644809467, + "learning_rate": 1.9870516176483573e-05, + "loss": 0.8259, + "step": 784 + }, + { + "epoch": 0.08, + "grad_norm": 1.7819340442518423, + "learning_rate": 1.9869987200509228e-05, + "loss": 0.6661, + "step": 785 + }, + { + "epoch": 0.08, + "grad_norm": 1.790035538160907, + "learning_rate": 1.9869457153302124e-05, + "loss": 0.829, + "step": 786 + }, + { + "epoch": 0.08, + "grad_norm": 1.5202353379476825, + "learning_rate": 1.9868926034919787e-05, + "loss": 0.8845, + "step": 787 + }, + { + "epoch": 0.08, + "grad_norm": 1.863990678141754, + "learning_rate": 1.986839384541986e-05, + "loss": 0.7813, + "step": 788 + }, + { + "epoch": 0.08, + "grad_norm": 1.5088174748801766, + "learning_rate": 1.9867860584860106e-05, + "loss": 0.8197, + "step": 789 + }, + { + "epoch": 0.08, + "grad_norm": 1.569596018469031, + "learning_rate": 1.98673262532984e-05, + "loss": 0.8397, + "step": 790 + }, + { + "epoch": 0.08, + "grad_norm": 1.76684943786916, + "learning_rate": 1.986679085079274e-05, + "loss": 0.8621, + "step": 791 + }, + { + "epoch": 0.08, + "grad_norm": 1.909957245712091, + "learning_rate": 1.986625437740123e-05, + "loss": 0.9255, + "step": 792 + }, + { + "epoch": 0.08, + "grad_norm": 1.51112619591038, + "learning_rate": 1.98657168331821e-05, + "loss": 0.7438, + "step": 793 + }, + { + "epoch": 0.08, + "grad_norm": 1.7600103242211305, + "learning_rate": 1.986517821819369e-05, + "loss": 0.762, + "step": 794 + }, + { + "epoch": 0.08, + "grad_norm": 1.629549944021964, + "learning_rate": 1.986463853249446e-05, + "loss": 0.8457, + "step": 795 + }, + { + "epoch": 0.08, + "grad_norm": 1.6322779733282207, + "learning_rate": 1.9864097776142978e-05, + "loss": 0.7376, + "step": 796 + }, + { + "epoch": 0.08, + "grad_norm": 1.6427720947747648, + "learning_rate": 1.9863555949197942e-05, + "loss": 0.7868, + "step": 797 + }, + { + "epoch": 0.08, + "grad_norm": 1.5070344335988768, + "learning_rate": 1.986301305171816e-05, + "loss": 0.7177, + "step": 798 + }, + { + "epoch": 0.08, + "grad_norm": 1.6517460173464724, + "learning_rate": 1.9862469083762546e-05, + "loss": 0.83, + "step": 799 + }, + { + "epoch": 0.08, + "grad_norm": 1.862254713203288, + "learning_rate": 1.9861924045390147e-05, + "loss": 0.8391, + "step": 800 + }, + { + "epoch": 0.08, + "grad_norm": 1.840773968509843, + "learning_rate": 1.986137793666012e-05, + "loss": 0.7189, + "step": 801 + }, + { + "epoch": 0.08, + "grad_norm": 1.7091886103939837, + "learning_rate": 1.9860830757631727e-05, + "loss": 0.7196, + "step": 802 + }, + { + "epoch": 0.08, + "grad_norm": 1.939228414732462, + "learning_rate": 1.9860282508364365e-05, + "loss": 0.7769, + "step": 803 + }, + { + "epoch": 0.08, + "grad_norm": 1.7527357282829756, + "learning_rate": 1.9859733188917532e-05, + "loss": 0.7744, + "step": 804 + }, + { + "epoch": 0.08, + "grad_norm": 1.8700242525423052, + "learning_rate": 1.9859182799350852e-05, + "loss": 0.7007, + "step": 805 + }, + { + "epoch": 0.08, + "grad_norm": 1.595136671416076, + "learning_rate": 1.985863133972406e-05, + "loss": 0.7394, + "step": 806 + }, + { + "epoch": 0.08, + "grad_norm": 1.666270275523058, + "learning_rate": 1.9858078810097004e-05, + "loss": 0.7128, + "step": 807 + }, + { + "epoch": 0.08, + "grad_norm": 1.6526420386804836, + "learning_rate": 1.985752521052966e-05, + "loss": 0.793, + "step": 808 + }, + { + "epoch": 0.08, + "grad_norm": 2.0164674424468023, + "learning_rate": 1.9856970541082107e-05, + "loss": 0.7952, + "step": 809 + }, + { + "epoch": 0.08, + "grad_norm": 1.7422692497713812, + "learning_rate": 1.985641480181455e-05, + "loss": 0.7611, + "step": 810 + }, + { + "epoch": 0.08, + "grad_norm": 1.8988225567936234, + "learning_rate": 1.98558579927873e-05, + "loss": 0.8972, + "step": 811 + }, + { + "epoch": 0.08, + "grad_norm": 1.5986381889416141, + "learning_rate": 1.9855300114060795e-05, + "loss": 0.7646, + "step": 812 + }, + { + "epoch": 0.08, + "grad_norm": 1.6483180495163923, + "learning_rate": 1.9854741165695583e-05, + "loss": 0.7311, + "step": 813 + }, + { + "epoch": 0.08, + "grad_norm": 1.8063581944455203, + "learning_rate": 1.9854181147752326e-05, + "loss": 0.9311, + "step": 814 + }, + { + "epoch": 0.08, + "grad_norm": 1.6504812770449069, + "learning_rate": 1.9853620060291812e-05, + "loss": 0.767, + "step": 815 + }, + { + "epoch": 0.08, + "grad_norm": 1.4128816928019157, + "learning_rate": 1.985305790337493e-05, + "loss": 0.669, + "step": 816 + }, + { + "epoch": 0.08, + "grad_norm": 1.733825753404519, + "learning_rate": 1.98524946770627e-05, + "loss": 0.8098, + "step": 817 + }, + { + "epoch": 0.08, + "grad_norm": 1.6299749868856428, + "learning_rate": 1.9851930381416243e-05, + "loss": 0.748, + "step": 818 + }, + { + "epoch": 0.08, + "grad_norm": 1.6517357971409825, + "learning_rate": 1.9851365016496812e-05, + "loss": 0.8209, + "step": 819 + }, + { + "epoch": 0.08, + "grad_norm": 1.583252522074743, + "learning_rate": 1.985079858236577e-05, + "loss": 0.6843, + "step": 820 + }, + { + "epoch": 0.08, + "grad_norm": 1.6536302219640018, + "learning_rate": 1.9850231079084593e-05, + "loss": 0.7518, + "step": 821 + }, + { + "epoch": 0.08, + "grad_norm": 1.7454823923954026, + "learning_rate": 1.9849662506714865e-05, + "loss": 0.7312, + "step": 822 + }, + { + "epoch": 0.08, + "grad_norm": 1.914739489812267, + "learning_rate": 1.984909286531831e-05, + "loss": 0.8605, + "step": 823 + }, + { + "epoch": 0.08, + "grad_norm": 1.6639931309404836, + "learning_rate": 1.9848522154956744e-05, + "loss": 0.751, + "step": 824 + }, + { + "epoch": 0.08, + "grad_norm": 1.5651952261950397, + "learning_rate": 1.984795037569211e-05, + "loss": 0.7202, + "step": 825 + }, + { + "epoch": 0.08, + "grad_norm": 1.5688629583006275, + "learning_rate": 1.984737752758647e-05, + "loss": 0.665, + "step": 826 + }, + { + "epoch": 0.08, + "grad_norm": 1.4482723619902158, + "learning_rate": 1.9846803610701996e-05, + "loss": 0.861, + "step": 827 + }, + { + "epoch": 0.08, + "grad_norm": 1.7229690482399078, + "learning_rate": 1.9846228625100977e-05, + "loss": 0.756, + "step": 828 + }, + { + "epoch": 0.08, + "grad_norm": 1.5180781551592952, + "learning_rate": 1.9845652570845818e-05, + "loss": 0.8151, + "step": 829 + }, + { + "epoch": 0.08, + "grad_norm": 1.655586251600461, + "learning_rate": 1.9845075447999042e-05, + "loss": 0.8247, + "step": 830 + }, + { + "epoch": 0.08, + "grad_norm": 1.6566059901082226, + "learning_rate": 1.9844497256623283e-05, + "loss": 0.7896, + "step": 831 + }, + { + "epoch": 0.08, + "grad_norm": 1.6097210602345795, + "learning_rate": 1.98439179967813e-05, + "loss": 0.7025, + "step": 832 + }, + { + "epoch": 0.08, + "grad_norm": 1.6245002946759295, + "learning_rate": 1.9843337668535958e-05, + "loss": 0.7641, + "step": 833 + }, + { + "epoch": 0.08, + "grad_norm": 1.5594374812846619, + "learning_rate": 1.9842756271950247e-05, + "loss": 0.6164, + "step": 834 + }, + { + "epoch": 0.08, + "grad_norm": 1.662029547316825, + "learning_rate": 1.9842173807087264e-05, + "loss": 0.7241, + "step": 835 + }, + { + "epoch": 0.09, + "grad_norm": 1.7204550225697404, + "learning_rate": 1.9841590274010228e-05, + "loss": 0.8256, + "step": 836 + }, + { + "epoch": 0.09, + "grad_norm": 1.712325268991625, + "learning_rate": 1.9841005672782473e-05, + "loss": 0.8094, + "step": 837 + }, + { + "epoch": 0.09, + "grad_norm": 1.6558537995946654, + "learning_rate": 1.984042000346745e-05, + "loss": 0.8182, + "step": 838 + }, + { + "epoch": 0.09, + "grad_norm": 1.4574707909678533, + "learning_rate": 1.9839833266128726e-05, + "loss": 0.7029, + "step": 839 + }, + { + "epoch": 0.09, + "grad_norm": 1.5151941195461256, + "learning_rate": 1.9839245460829974e-05, + "loss": 0.8322, + "step": 840 + }, + { + "epoch": 0.09, + "grad_norm": 1.578641414260665, + "learning_rate": 1.9838656587634996e-05, + "loss": 0.6918, + "step": 841 + }, + { + "epoch": 0.09, + "grad_norm": 1.583223765478341, + "learning_rate": 1.983806664660771e-05, + "loss": 0.6128, + "step": 842 + }, + { + "epoch": 0.09, + "grad_norm": 1.35378598549046, + "learning_rate": 1.9837475637812132e-05, + "loss": 0.6721, + "step": 843 + }, + { + "epoch": 0.09, + "grad_norm": 1.5124064601223453, + "learning_rate": 1.983688356131242e-05, + "loss": 0.7138, + "step": 844 + }, + { + "epoch": 0.09, + "grad_norm": 1.6007621242000454, + "learning_rate": 1.9836290417172825e-05, + "loss": 0.7293, + "step": 845 + }, + { + "epoch": 0.09, + "grad_norm": 1.532861934209684, + "learning_rate": 1.983569620545773e-05, + "loss": 0.7525, + "step": 846 + }, + { + "epoch": 0.09, + "grad_norm": 1.5919170282152117, + "learning_rate": 1.9835100926231625e-05, + "loss": 0.7039, + "step": 847 + }, + { + "epoch": 0.09, + "grad_norm": 1.613310000105187, + "learning_rate": 1.9834504579559118e-05, + "loss": 0.7526, + "step": 848 + }, + { + "epoch": 0.09, + "grad_norm": 1.5091381434192657, + "learning_rate": 1.9833907165504935e-05, + "loss": 0.674, + "step": 849 + }, + { + "epoch": 0.09, + "grad_norm": 1.6288949471182157, + "learning_rate": 1.9833308684133913e-05, + "loss": 0.6881, + "step": 850 + }, + { + "epoch": 0.09, + "grad_norm": 1.7035249813071198, + "learning_rate": 1.9832709135511006e-05, + "loss": 0.7646, + "step": 851 + }, + { + "epoch": 0.09, + "grad_norm": 1.6085841109244907, + "learning_rate": 1.983210851970129e-05, + "loss": 0.7064, + "step": 852 + }, + { + "epoch": 0.09, + "grad_norm": 1.5416377825068288, + "learning_rate": 1.9831506836769955e-05, + "loss": 0.6925, + "step": 853 + }, + { + "epoch": 0.09, + "grad_norm": 1.6407129976757333, + "learning_rate": 1.9830904086782298e-05, + "loss": 0.8426, + "step": 854 + }, + { + "epoch": 0.09, + "grad_norm": 1.711869418845939, + "learning_rate": 1.983030026980374e-05, + "loss": 0.8236, + "step": 855 + }, + { + "epoch": 0.09, + "grad_norm": 1.511105480844578, + "learning_rate": 1.9829695385899816e-05, + "loss": 0.7293, + "step": 856 + }, + { + "epoch": 0.09, + "grad_norm": 1.727960597652724, + "learning_rate": 1.9829089435136176e-05, + "loss": 0.7806, + "step": 857 + }, + { + "epoch": 0.09, + "grad_norm": 1.8756242422118425, + "learning_rate": 1.982848241757859e-05, + "loss": 0.8235, + "step": 858 + }, + { + "epoch": 0.09, + "grad_norm": 1.751065062669428, + "learning_rate": 1.982787433329294e-05, + "loss": 0.8057, + "step": 859 + }, + { + "epoch": 0.09, + "grad_norm": 1.5458333978687038, + "learning_rate": 1.9827265182345218e-05, + "loss": 0.7308, + "step": 860 + }, + { + "epoch": 0.09, + "grad_norm": 1.5226977001977733, + "learning_rate": 1.9826654964801544e-05, + "loss": 0.7669, + "step": 861 + }, + { + "epoch": 0.09, + "grad_norm": 1.6839225185773306, + "learning_rate": 1.9826043680728142e-05, + "loss": 0.8051, + "step": 862 + }, + { + "epoch": 0.09, + "grad_norm": 1.5793830832920863, + "learning_rate": 1.9825431330191366e-05, + "loss": 0.8132, + "step": 863 + }, + { + "epoch": 0.09, + "grad_norm": 1.6937724188454104, + "learning_rate": 1.9824817913257666e-05, + "loss": 0.6887, + "step": 864 + }, + { + "epoch": 0.09, + "grad_norm": 1.873772989542665, + "learning_rate": 1.9824203429993627e-05, + "loss": 0.8393, + "step": 865 + }, + { + "epoch": 0.09, + "grad_norm": 1.6477348786228707, + "learning_rate": 1.982358788046594e-05, + "loss": 0.8163, + "step": 866 + }, + { + "epoch": 0.09, + "grad_norm": 1.52524445412708, + "learning_rate": 1.9822971264741412e-05, + "loss": 0.7488, + "step": 867 + }, + { + "epoch": 0.09, + "grad_norm": 1.6276327048556951, + "learning_rate": 1.9822353582886963e-05, + "loss": 0.6686, + "step": 868 + }, + { + "epoch": 0.09, + "grad_norm": 1.634620053938659, + "learning_rate": 1.9821734834969643e-05, + "loss": 0.8702, + "step": 869 + }, + { + "epoch": 0.09, + "grad_norm": 1.5984651127834377, + "learning_rate": 1.98211150210566e-05, + "loss": 0.6718, + "step": 870 + }, + { + "epoch": 0.09, + "grad_norm": 1.5710084200353578, + "learning_rate": 1.98204941412151e-05, + "loss": 0.6835, + "step": 871 + }, + { + "epoch": 0.09, + "grad_norm": 1.6624153524943914, + "learning_rate": 1.9819872195512545e-05, + "loss": 0.7358, + "step": 872 + }, + { + "epoch": 0.09, + "grad_norm": 1.590680428823555, + "learning_rate": 1.9819249184016426e-05, + "loss": 0.7424, + "step": 873 + }, + { + "epoch": 0.09, + "grad_norm": 1.6679560046276667, + "learning_rate": 1.9818625106794363e-05, + "loss": 0.7768, + "step": 874 + }, + { + "epoch": 0.09, + "grad_norm": 1.576313274431883, + "learning_rate": 1.981799996391409e-05, + "loss": 0.758, + "step": 875 + }, + { + "epoch": 0.09, + "grad_norm": 1.6518281038340517, + "learning_rate": 1.981737375544346e-05, + "loss": 0.7233, + "step": 876 + }, + { + "epoch": 0.09, + "grad_norm": 1.4274151645512967, + "learning_rate": 1.9816746481450436e-05, + "loss": 0.7168, + "step": 877 + }, + { + "epoch": 0.09, + "grad_norm": 1.939294035861291, + "learning_rate": 1.9816118142003096e-05, + "loss": 0.8503, + "step": 878 + }, + { + "epoch": 0.09, + "grad_norm": 1.5387673753810336, + "learning_rate": 1.981548873716964e-05, + "loss": 0.7362, + "step": 879 + }, + { + "epoch": 0.09, + "grad_norm": 1.4805195280344556, + "learning_rate": 1.9814858267018376e-05, + "loss": 0.6436, + "step": 880 + }, + { + "epoch": 0.09, + "grad_norm": 1.5446391199954803, + "learning_rate": 1.9814226731617734e-05, + "loss": 0.6923, + "step": 881 + }, + { + "epoch": 0.09, + "grad_norm": 1.4813312189994743, + "learning_rate": 1.981359413103626e-05, + "loss": 0.7571, + "step": 882 + }, + { + "epoch": 0.09, + "grad_norm": 1.6640726456178598, + "learning_rate": 1.981296046534261e-05, + "loss": 0.851, + "step": 883 + }, + { + "epoch": 0.09, + "grad_norm": 1.6633157436957888, + "learning_rate": 1.981232573460556e-05, + "loss": 0.7917, + "step": 884 + }, + { + "epoch": 0.09, + "grad_norm": 1.6746676607899385, + "learning_rate": 1.9811689938893998e-05, + "loss": 0.7473, + "step": 885 + }, + { + "epoch": 0.09, + "grad_norm": 1.7031648068997371, + "learning_rate": 1.9811053078276933e-05, + "loss": 0.7654, + "step": 886 + }, + { + "epoch": 0.09, + "grad_norm": 1.5023261382337536, + "learning_rate": 1.981041515282348e-05, + "loss": 0.6898, + "step": 887 + }, + { + "epoch": 0.09, + "grad_norm": 1.5128621826976616, + "learning_rate": 1.980977616260288e-05, + "loss": 0.6342, + "step": 888 + }, + { + "epoch": 0.09, + "grad_norm": 1.7107190654157496, + "learning_rate": 1.980913610768449e-05, + "loss": 0.8767, + "step": 889 + }, + { + "epoch": 0.09, + "grad_norm": 1.7018440255203298, + "learning_rate": 1.9808494988137766e-05, + "loss": 0.8087, + "step": 890 + }, + { + "epoch": 0.09, + "grad_norm": 1.748811212451861, + "learning_rate": 1.9807852804032306e-05, + "loss": 0.7345, + "step": 891 + }, + { + "epoch": 0.09, + "grad_norm": 1.7272712489265187, + "learning_rate": 1.98072095554378e-05, + "loss": 0.8743, + "step": 892 + }, + { + "epoch": 0.09, + "grad_norm": 1.8069453963982343, + "learning_rate": 1.9806565242424064e-05, + "loss": 0.8003, + "step": 893 + }, + { + "epoch": 0.09, + "grad_norm": 1.5389870763989857, + "learning_rate": 1.980591986506103e-05, + "loss": 0.7899, + "step": 894 + }, + { + "epoch": 0.09, + "grad_norm": 1.3762396601862672, + "learning_rate": 1.9805273423418737e-05, + "loss": 0.7024, + "step": 895 + }, + { + "epoch": 0.09, + "grad_norm": 1.46629201856903, + "learning_rate": 1.980462591756735e-05, + "loss": 0.7164, + "step": 896 + }, + { + "epoch": 0.09, + "grad_norm": 1.6408767957620132, + "learning_rate": 1.9803977347577154e-05, + "loss": 0.7415, + "step": 897 + }, + { + "epoch": 0.09, + "grad_norm": 1.5881132334026928, + "learning_rate": 1.980332771351853e-05, + "loss": 0.8025, + "step": 898 + }, + { + "epoch": 0.09, + "grad_norm": 1.5763669524638544, + "learning_rate": 1.980267701546199e-05, + "loss": 0.8883, + "step": 899 + }, + { + "epoch": 0.09, + "grad_norm": 1.4978780824228621, + "learning_rate": 1.980202525347816e-05, + "loss": 0.7626, + "step": 900 + }, + { + "epoch": 0.09, + "grad_norm": 1.4887098145165154, + "learning_rate": 1.980137242763777e-05, + "loss": 0.8167, + "step": 901 + }, + { + "epoch": 0.09, + "grad_norm": 1.7711545128982842, + "learning_rate": 1.9800718538011683e-05, + "loss": 0.7537, + "step": 902 + }, + { + "epoch": 0.09, + "grad_norm": 1.5460885788878576, + "learning_rate": 1.9800063584670864e-05, + "loss": 0.7343, + "step": 903 + }, + { + "epoch": 0.09, + "grad_norm": 1.7386866230563274, + "learning_rate": 1.97994075676864e-05, + "loss": 0.8492, + "step": 904 + }, + { + "epoch": 0.09, + "grad_norm": 1.5807749512434859, + "learning_rate": 1.9798750487129486e-05, + "loss": 0.6922, + "step": 905 + }, + { + "epoch": 0.09, + "grad_norm": 1.5324171436351097, + "learning_rate": 1.9798092343071445e-05, + "loss": 0.7796, + "step": 906 + }, + { + "epoch": 0.09, + "grad_norm": 1.7562281371712432, + "learning_rate": 1.9797433135583705e-05, + "loss": 0.7693, + "step": 907 + }, + { + "epoch": 0.09, + "grad_norm": 1.6217028713794182, + "learning_rate": 1.979677286473781e-05, + "loss": 0.7588, + "step": 908 + }, + { + "epoch": 0.09, + "grad_norm": 1.565090135593504, + "learning_rate": 1.9796111530605428e-05, + "loss": 0.7376, + "step": 909 + }, + { + "epoch": 0.09, + "grad_norm": 1.603068989358705, + "learning_rate": 1.9795449133258335e-05, + "loss": 0.7798, + "step": 910 + }, + { + "epoch": 0.09, + "grad_norm": 1.6396418489523752, + "learning_rate": 1.979478567276842e-05, + "loss": 0.6676, + "step": 911 + }, + { + "epoch": 0.09, + "grad_norm": 1.6372104385022896, + "learning_rate": 1.979412114920769e-05, + "loss": 0.7513, + "step": 912 + }, + { + "epoch": 0.09, + "grad_norm": 1.675396810421105, + "learning_rate": 1.9793455562648276e-05, + "loss": 0.7631, + "step": 913 + }, + { + "epoch": 0.09, + "grad_norm": 1.897849675524275, + "learning_rate": 1.9792788913162414e-05, + "loss": 0.7721, + "step": 914 + }, + { + "epoch": 0.09, + "grad_norm": 1.5554235700494816, + "learning_rate": 1.9792121200822456e-05, + "loss": 0.81, + "step": 915 + }, + { + "epoch": 0.09, + "grad_norm": 1.4373647032594903, + "learning_rate": 1.9791452425700874e-05, + "loss": 0.5976, + "step": 916 + }, + { + "epoch": 0.09, + "grad_norm": 1.7491234619263865, + "learning_rate": 1.9790782587870252e-05, + "loss": 0.7904, + "step": 917 + }, + { + "epoch": 0.09, + "grad_norm": 1.6486451154873125, + "learning_rate": 1.9790111687403292e-05, + "loss": 0.7869, + "step": 918 + }, + { + "epoch": 0.09, + "grad_norm": 1.5310571807878468, + "learning_rate": 1.978943972437281e-05, + "loss": 0.7273, + "step": 919 + }, + { + "epoch": 0.09, + "grad_norm": 1.7047828597241863, + "learning_rate": 1.978876669885173e-05, + "loss": 0.682, + "step": 920 + }, + { + "epoch": 0.09, + "grad_norm": 1.6301078272480116, + "learning_rate": 1.9788092610913107e-05, + "loss": 0.799, + "step": 921 + }, + { + "epoch": 0.09, + "grad_norm": 1.5601134386377202, + "learning_rate": 1.97874174606301e-05, + "loss": 0.8432, + "step": 922 + }, + { + "epoch": 0.09, + "grad_norm": 1.7210324057395259, + "learning_rate": 1.9786741248075984e-05, + "loss": 0.8726, + "step": 923 + }, + { + "epoch": 0.09, + "grad_norm": 1.394576608403927, + "learning_rate": 1.9786063973324156e-05, + "loss": 0.7333, + "step": 924 + }, + { + "epoch": 0.09, + "grad_norm": 1.5976279760603629, + "learning_rate": 1.978538563644812e-05, + "loss": 0.7835, + "step": 925 + }, + { + "epoch": 0.09, + "grad_norm": 1.5422919699984112, + "learning_rate": 1.97847062375215e-05, + "loss": 0.772, + "step": 926 + }, + { + "epoch": 0.09, + "grad_norm": 1.6557365404347686, + "learning_rate": 1.978402577661803e-05, + "loss": 0.7868, + "step": 927 + }, + { + "epoch": 0.09, + "grad_norm": 1.674019352909184, + "learning_rate": 1.978334425381157e-05, + "loss": 0.7464, + "step": 928 + }, + { + "epoch": 0.09, + "grad_norm": 1.6255566180377932, + "learning_rate": 1.9782661669176086e-05, + "loss": 0.7936, + "step": 929 + }, + { + "epoch": 0.09, + "grad_norm": 1.6899360192659847, + "learning_rate": 1.978197802278566e-05, + "loss": 0.8837, + "step": 930 + }, + { + "epoch": 0.09, + "grad_norm": 1.5427340894121977, + "learning_rate": 1.9781293314714493e-05, + "loss": 0.7157, + "step": 931 + }, + { + "epoch": 0.09, + "grad_norm": 1.5486328001346643, + "learning_rate": 1.97806075450369e-05, + "loss": 0.6634, + "step": 932 + }, + { + "epoch": 0.09, + "grad_norm": 1.676171028632494, + "learning_rate": 1.9779920713827307e-05, + "loss": 0.6629, + "step": 933 + }, + { + "epoch": 0.1, + "grad_norm": 1.6169928978186605, + "learning_rate": 1.9779232821160264e-05, + "loss": 0.7855, + "step": 934 + }, + { + "epoch": 0.1, + "grad_norm": 1.6601303576342932, + "learning_rate": 1.9778543867110428e-05, + "loss": 0.7458, + "step": 935 + }, + { + "epoch": 0.1, + "grad_norm": 1.572233190447626, + "learning_rate": 1.9777853851752575e-05, + "loss": 0.7343, + "step": 936 + }, + { + "epoch": 0.1, + "grad_norm": 1.5786433225350176, + "learning_rate": 1.9777162775161592e-05, + "loss": 0.7364, + "step": 937 + }, + { + "epoch": 0.1, + "grad_norm": 1.6711676113294462, + "learning_rate": 1.977647063741249e-05, + "loss": 0.6002, + "step": 938 + }, + { + "epoch": 0.1, + "grad_norm": 1.6719456821034266, + "learning_rate": 1.9775777438580387e-05, + "loss": 0.7643, + "step": 939 + }, + { + "epoch": 0.1, + "grad_norm": 1.6821802539166844, + "learning_rate": 1.977508317874052e-05, + "loss": 0.8292, + "step": 940 + }, + { + "epoch": 0.1, + "grad_norm": 1.5626244212813953, + "learning_rate": 1.9774387857968237e-05, + "loss": 0.6857, + "step": 941 + }, + { + "epoch": 0.1, + "grad_norm": 1.621394503878368, + "learning_rate": 1.977369147633901e-05, + "loss": 0.6925, + "step": 942 + }, + { + "epoch": 0.1, + "grad_norm": 1.5671624337676704, + "learning_rate": 1.977299403392841e-05, + "loss": 0.7112, + "step": 943 + }, + { + "epoch": 0.1, + "grad_norm": 2.004929948433543, + "learning_rate": 1.9772295530812143e-05, + "loss": 0.7504, + "step": 944 + }, + { + "epoch": 0.1, + "grad_norm": 1.5805032467684188, + "learning_rate": 1.977159596706602e-05, + "loss": 0.7705, + "step": 945 + }, + { + "epoch": 0.1, + "grad_norm": 1.564866358149519, + "learning_rate": 1.9770895342765964e-05, + "loss": 0.6859, + "step": 946 + }, + { + "epoch": 0.1, + "grad_norm": 1.5104736001360075, + "learning_rate": 1.977019365798802e-05, + "loss": 0.6725, + "step": 947 + }, + { + "epoch": 0.1, + "grad_norm": 1.7279061948601635, + "learning_rate": 1.976949091280834e-05, + "loss": 0.8203, + "step": 948 + }, + { + "epoch": 0.1, + "grad_norm": 1.6987389794038148, + "learning_rate": 1.97687871073032e-05, + "loss": 0.7356, + "step": 949 + }, + { + "epoch": 0.1, + "grad_norm": 1.5683557461074862, + "learning_rate": 1.976808224154899e-05, + "loss": 0.7154, + "step": 950 + }, + { + "epoch": 0.1, + "grad_norm": 1.6291551489710554, + "learning_rate": 1.9767376315622204e-05, + "loss": 0.6979, + "step": 951 + }, + { + "epoch": 0.1, + "grad_norm": 1.7429785759185206, + "learning_rate": 1.9766669329599466e-05, + "loss": 0.9519, + "step": 952 + }, + { + "epoch": 0.1, + "grad_norm": 1.7201420018698663, + "learning_rate": 1.9765961283557503e-05, + "loss": 0.8157, + "step": 953 + }, + { + "epoch": 0.1, + "grad_norm": 1.6340952247661684, + "learning_rate": 1.976525217757317e-05, + "loss": 0.8053, + "step": 954 + }, + { + "epoch": 0.1, + "grad_norm": 1.7497449502423785, + "learning_rate": 1.976454201172342e-05, + "loss": 0.6892, + "step": 955 + }, + { + "epoch": 0.1, + "grad_norm": 1.8490925913199021, + "learning_rate": 1.976383078608534e-05, + "loss": 0.7282, + "step": 956 + }, + { + "epoch": 0.1, + "grad_norm": 1.571142660205738, + "learning_rate": 1.976311850073611e-05, + "loss": 0.7532, + "step": 957 + }, + { + "epoch": 0.1, + "grad_norm": 1.490756284704002, + "learning_rate": 1.976240515575305e-05, + "loss": 0.7327, + "step": 958 + }, + { + "epoch": 0.1, + "grad_norm": 1.6843788399658988, + "learning_rate": 1.976169075121358e-05, + "loss": 0.7708, + "step": 959 + }, + { + "epoch": 0.1, + "grad_norm": 1.7748135897626862, + "learning_rate": 1.976097528719523e-05, + "loss": 0.8446, + "step": 960 + }, + { + "epoch": 0.1, + "grad_norm": 1.6707936681278435, + "learning_rate": 1.9760258763775656e-05, + "loss": 0.7498, + "step": 961 + }, + { + "epoch": 0.1, + "grad_norm": 1.7033717941830635, + "learning_rate": 1.975954118103263e-05, + "loss": 0.6827, + "step": 962 + }, + { + "epoch": 0.1, + "grad_norm": 1.6250184779189945, + "learning_rate": 1.975882253904403e-05, + "loss": 0.8026, + "step": 963 + }, + { + "epoch": 0.1, + "grad_norm": 1.6361132028766527, + "learning_rate": 1.9758102837887853e-05, + "loss": 0.6545, + "step": 964 + }, + { + "epoch": 0.1, + "grad_norm": 1.6965149086218916, + "learning_rate": 1.9757382077642214e-05, + "loss": 0.8264, + "step": 965 + }, + { + "epoch": 0.1, + "grad_norm": 1.6633583574231603, + "learning_rate": 1.9756660258385338e-05, + "loss": 0.8511, + "step": 966 + }, + { + "epoch": 0.1, + "grad_norm": 1.5977541239908342, + "learning_rate": 1.975593738019557e-05, + "loss": 0.7447, + "step": 967 + }, + { + "epoch": 0.1, + "grad_norm": 1.56727108125653, + "learning_rate": 1.975521344315136e-05, + "loss": 0.7605, + "step": 968 + }, + { + "epoch": 0.1, + "grad_norm": 1.622962864888057, + "learning_rate": 1.9754488447331292e-05, + "loss": 0.7268, + "step": 969 + }, + { + "epoch": 0.1, + "grad_norm": 1.5541156726666745, + "learning_rate": 1.9753762392814043e-05, + "loss": 0.7134, + "step": 970 + }, + { + "epoch": 0.1, + "grad_norm": 1.610775363823247, + "learning_rate": 1.975303527967842e-05, + "loss": 0.7554, + "step": 971 + }, + { + "epoch": 0.1, + "grad_norm": 1.7499914820423736, + "learning_rate": 1.9752307108003334e-05, + "loss": 0.8552, + "step": 972 + }, + { + "epoch": 0.1, + "grad_norm": 1.5534873957880497, + "learning_rate": 1.9751577877867823e-05, + "loss": 0.753, + "step": 973 + }, + { + "epoch": 0.1, + "grad_norm": 1.808776873744801, + "learning_rate": 1.975084758935103e-05, + "loss": 0.7819, + "step": 974 + }, + { + "epoch": 0.1, + "grad_norm": 1.7700864769108424, + "learning_rate": 1.9750116242532217e-05, + "loss": 0.9487, + "step": 975 + }, + { + "epoch": 0.1, + "grad_norm": 1.6943439977730848, + "learning_rate": 1.974938383749076e-05, + "loss": 0.7968, + "step": 976 + }, + { + "epoch": 0.1, + "grad_norm": 1.6407491012547737, + "learning_rate": 1.9748650374306156e-05, + "loss": 0.7705, + "step": 977 + }, + { + "epoch": 0.1, + "grad_norm": 1.70469858711636, + "learning_rate": 1.9747915853058003e-05, + "loss": 0.7693, + "step": 978 + }, + { + "epoch": 0.1, + "grad_norm": 1.582707324059266, + "learning_rate": 1.974718027382602e-05, + "loss": 0.7874, + "step": 979 + }, + { + "epoch": 0.1, + "grad_norm": 1.6073999705030813, + "learning_rate": 1.974644363669005e-05, + "loss": 0.799, + "step": 980 + }, + { + "epoch": 0.1, + "grad_norm": 1.6479941375707847, + "learning_rate": 1.974570594173004e-05, + "loss": 0.7785, + "step": 981 + }, + { + "epoch": 0.1, + "grad_norm": 1.6646530813302085, + "learning_rate": 1.974496718902606e-05, + "loss": 0.8484, + "step": 982 + }, + { + "epoch": 0.1, + "grad_norm": 1.506444754411822, + "learning_rate": 1.9744227378658283e-05, + "loss": 0.7749, + "step": 983 + }, + { + "epoch": 0.1, + "grad_norm": 1.6978448175283514, + "learning_rate": 1.9743486510707006e-05, + "loss": 0.9159, + "step": 984 + }, + { + "epoch": 0.1, + "grad_norm": 1.542376548702729, + "learning_rate": 1.974274458525264e-05, + "loss": 0.7079, + "step": 985 + }, + { + "epoch": 0.1, + "grad_norm": 1.6363243613427414, + "learning_rate": 1.9742001602375708e-05, + "loss": 0.802, + "step": 986 + }, + { + "epoch": 0.1, + "grad_norm": 1.596726103662305, + "learning_rate": 1.9741257562156854e-05, + "loss": 0.8283, + "step": 987 + }, + { + "epoch": 0.1, + "grad_norm": 1.552072210203679, + "learning_rate": 1.974051246467682e-05, + "loss": 0.7828, + "step": 988 + }, + { + "epoch": 0.1, + "grad_norm": 1.7601591894326951, + "learning_rate": 1.973976631001649e-05, + "loss": 0.6845, + "step": 989 + }, + { + "epoch": 0.1, + "grad_norm": 1.751601699761324, + "learning_rate": 1.9739019098256835e-05, + "loss": 0.7943, + "step": 990 + }, + { + "epoch": 0.1, + "grad_norm": 1.5746857081818262, + "learning_rate": 1.973827082947896e-05, + "loss": 0.7584, + "step": 991 + }, + { + "epoch": 0.1, + "grad_norm": 1.6786168546308895, + "learning_rate": 1.9737521503764076e-05, + "loss": 0.704, + "step": 992 + }, + { + "epoch": 0.1, + "grad_norm": 1.7689656599100716, + "learning_rate": 1.9736771121193513e-05, + "loss": 0.8223, + "step": 993 + }, + { + "epoch": 0.1, + "grad_norm": 1.6628613066396887, + "learning_rate": 1.9736019681848706e-05, + "loss": 0.7639, + "step": 994 + }, + { + "epoch": 0.1, + "grad_norm": 1.6507086963589646, + "learning_rate": 1.973526718581122e-05, + "loss": 0.7448, + "step": 995 + }, + { + "epoch": 0.1, + "grad_norm": 1.4891511625875535, + "learning_rate": 1.9734513633162723e-05, + "loss": 0.7129, + "step": 996 + }, + { + "epoch": 0.1, + "grad_norm": 1.801981972115585, + "learning_rate": 1.9733759023985e-05, + "loss": 0.7863, + "step": 997 + }, + { + "epoch": 0.1, + "grad_norm": 1.6483362524488605, + "learning_rate": 1.9733003358359955e-05, + "loss": 0.6735, + "step": 998 + }, + { + "epoch": 0.1, + "grad_norm": 1.5831381134582354, + "learning_rate": 1.9732246636369605e-05, + "loss": 0.74, + "step": 999 + }, + { + "epoch": 0.1, + "grad_norm": 1.7761677747689284, + "learning_rate": 1.9731488858096078e-05, + "loss": 0.7121, + "step": 1000 + }, + { + "epoch": 0.1, + "grad_norm": 1.5907097418790517, + "learning_rate": 1.9730730023621617e-05, + "loss": 0.7825, + "step": 1001 + }, + { + "epoch": 0.1, + "grad_norm": 1.6103489754656453, + "learning_rate": 1.972997013302858e-05, + "loss": 0.8179, + "step": 1002 + }, + { + "epoch": 0.1, + "grad_norm": 1.5474773072539885, + "learning_rate": 1.9729209186399448e-05, + "loss": 0.7446, + "step": 1003 + }, + { + "epoch": 0.1, + "grad_norm": 1.522546952331723, + "learning_rate": 1.972844718381681e-05, + "loss": 0.7528, + "step": 1004 + }, + { + "epoch": 0.1, + "grad_norm": 1.78490021522784, + "learning_rate": 1.9727684125363364e-05, + "loss": 0.887, + "step": 1005 + }, + { + "epoch": 0.1, + "grad_norm": 1.5860785797184678, + "learning_rate": 1.972692001112193e-05, + "loss": 0.71, + "step": 1006 + }, + { + "epoch": 0.1, + "grad_norm": 1.5648733823815915, + "learning_rate": 1.972615484117544e-05, + "loss": 0.7583, + "step": 1007 + }, + { + "epoch": 0.1, + "grad_norm": 1.707739890292607, + "learning_rate": 1.972538861560694e-05, + "loss": 0.7862, + "step": 1008 + }, + { + "epoch": 0.1, + "grad_norm": 1.659828837091845, + "learning_rate": 1.9724621334499597e-05, + "loss": 0.7663, + "step": 1009 + }, + { + "epoch": 0.1, + "grad_norm": 1.6810106139429293, + "learning_rate": 1.9723852997936683e-05, + "loss": 0.7745, + "step": 1010 + }, + { + "epoch": 0.1, + "grad_norm": 1.5375832455600957, + "learning_rate": 1.972308360600159e-05, + "loss": 0.7442, + "step": 1011 + }, + { + "epoch": 0.1, + "grad_norm": 1.8275172545359188, + "learning_rate": 1.9722313158777825e-05, + "loss": 0.8308, + "step": 1012 + }, + { + "epoch": 0.1, + "grad_norm": 1.5558075737762853, + "learning_rate": 1.9721541656349005e-05, + "loss": 0.7765, + "step": 1013 + }, + { + "epoch": 0.1, + "grad_norm": 1.7634963149801517, + "learning_rate": 1.9720769098798866e-05, + "loss": 0.7983, + "step": 1014 + }, + { + "epoch": 0.1, + "grad_norm": 1.5644790225602312, + "learning_rate": 1.9719995486211258e-05, + "loss": 0.7695, + "step": 1015 + }, + { + "epoch": 0.1, + "grad_norm": 1.5350140296556833, + "learning_rate": 1.9719220818670144e-05, + "loss": 0.8209, + "step": 1016 + }, + { + "epoch": 0.1, + "grad_norm": 1.716896406028085, + "learning_rate": 1.97184450962596e-05, + "loss": 0.8061, + "step": 1017 + }, + { + "epoch": 0.1, + "grad_norm": 1.5783961891694578, + "learning_rate": 1.971766831906382e-05, + "loss": 0.6355, + "step": 1018 + }, + { + "epoch": 0.1, + "grad_norm": 1.8176065622672664, + "learning_rate": 1.9716890487167114e-05, + "loss": 0.7767, + "step": 1019 + }, + { + "epoch": 0.1, + "grad_norm": 1.5373269041490218, + "learning_rate": 1.97161116006539e-05, + "loss": 0.8069, + "step": 1020 + }, + { + "epoch": 0.1, + "grad_norm": 1.779558739717626, + "learning_rate": 1.9715331659608716e-05, + "loss": 0.9195, + "step": 1021 + }, + { + "epoch": 0.1, + "grad_norm": 1.5333585600073345, + "learning_rate": 1.971455066411621e-05, + "loss": 0.762, + "step": 1022 + }, + { + "epoch": 0.1, + "grad_norm": 1.568751380846371, + "learning_rate": 1.9713768614261144e-05, + "loss": 0.7352, + "step": 1023 + }, + { + "epoch": 0.1, + "grad_norm": 1.4626406800636604, + "learning_rate": 1.9712985510128406e-05, + "loss": 0.7571, + "step": 1024 + }, + { + "epoch": 0.1, + "grad_norm": 1.6115763573627133, + "learning_rate": 1.9712201351802985e-05, + "loss": 0.8721, + "step": 1025 + }, + { + "epoch": 0.1, + "grad_norm": 1.4664746057207612, + "learning_rate": 1.9711416139369984e-05, + "loss": 0.6534, + "step": 1026 + }, + { + "epoch": 0.1, + "grad_norm": 1.834484523255519, + "learning_rate": 1.971062987291464e-05, + "loss": 0.7143, + "step": 1027 + }, + { + "epoch": 0.1, + "grad_norm": 1.5306636799352586, + "learning_rate": 1.9709842552522272e-05, + "loss": 0.7261, + "step": 1028 + }, + { + "epoch": 0.1, + "grad_norm": 1.480494949221711, + "learning_rate": 1.9709054178278343e-05, + "loss": 0.7775, + "step": 1029 + }, + { + "epoch": 0.1, + "grad_norm": 1.4159653760073085, + "learning_rate": 1.9708264750268416e-05, + "loss": 0.6398, + "step": 1030 + }, + { + "epoch": 0.1, + "grad_norm": 1.4677707044356991, + "learning_rate": 1.9707474268578172e-05, + "loss": 0.8, + "step": 1031 + }, + { + "epoch": 0.1, + "grad_norm": 1.6534741622675229, + "learning_rate": 1.97066827332934e-05, + "loss": 0.802, + "step": 1032 + }, + { + "epoch": 0.11, + "grad_norm": 1.5389456821966196, + "learning_rate": 1.9705890144500012e-05, + "loss": 0.764, + "step": 1033 + }, + { + "epoch": 0.11, + "grad_norm": 1.58608861866648, + "learning_rate": 1.9705096502284037e-05, + "loss": 0.6967, + "step": 1034 + }, + { + "epoch": 0.11, + "grad_norm": 1.7745355167995958, + "learning_rate": 1.9704301806731604e-05, + "loss": 0.7554, + "step": 1035 + }, + { + "epoch": 0.11, + "grad_norm": 1.6537554040171483, + "learning_rate": 1.970350605792897e-05, + "loss": 0.7989, + "step": 1036 + }, + { + "epoch": 0.11, + "grad_norm": 1.49864081600279, + "learning_rate": 1.97027092559625e-05, + "loss": 0.6163, + "step": 1037 + }, + { + "epoch": 0.11, + "grad_norm": 1.6541958709909896, + "learning_rate": 1.9701911400918673e-05, + "loss": 0.7903, + "step": 1038 + }, + { + "epoch": 0.11, + "grad_norm": 1.7003266196100684, + "learning_rate": 1.9701112492884084e-05, + "loss": 0.8901, + "step": 1039 + }, + { + "epoch": 0.11, + "grad_norm": 1.6807657449170552, + "learning_rate": 1.9700312531945444e-05, + "loss": 0.6864, + "step": 1040 + }, + { + "epoch": 0.11, + "grad_norm": 1.614703409405987, + "learning_rate": 1.969951151818957e-05, + "loss": 0.839, + "step": 1041 + }, + { + "epoch": 0.11, + "grad_norm": 1.7806689503190112, + "learning_rate": 1.9698709451703405e-05, + "loss": 0.7984, + "step": 1042 + }, + { + "epoch": 0.11, + "grad_norm": 1.8040362335997895, + "learning_rate": 1.9697906332574005e-05, + "loss": 0.8109, + "step": 1043 + }, + { + "epoch": 0.11, + "grad_norm": 1.7385609277311922, + "learning_rate": 1.9697102160888525e-05, + "loss": 0.7235, + "step": 1044 + }, + { + "epoch": 0.11, + "grad_norm": 1.6972535125809556, + "learning_rate": 1.9696296936734254e-05, + "loss": 0.7874, + "step": 1045 + }, + { + "epoch": 0.11, + "grad_norm": 1.8012082058922911, + "learning_rate": 1.9695490660198584e-05, + "loss": 0.7251, + "step": 1046 + }, + { + "epoch": 0.11, + "grad_norm": 1.7649284575263353, + "learning_rate": 1.9694683331369023e-05, + "loss": 0.7602, + "step": 1047 + }, + { + "epoch": 0.11, + "grad_norm": 1.6764194604703575, + "learning_rate": 1.9693874950333196e-05, + "loss": 0.8882, + "step": 1048 + }, + { + "epoch": 0.11, + "grad_norm": 1.6350072171486167, + "learning_rate": 1.9693065517178836e-05, + "loss": 0.6515, + "step": 1049 + }, + { + "epoch": 0.11, + "grad_norm": 1.4989978096130447, + "learning_rate": 1.9692255031993794e-05, + "loss": 0.7783, + "step": 1050 + }, + { + "epoch": 0.11, + "grad_norm": 1.3890840164822351, + "learning_rate": 1.9691443494866043e-05, + "loss": 0.5872, + "step": 1051 + }, + { + "epoch": 0.11, + "grad_norm": 1.63102381931439, + "learning_rate": 1.9690630905883658e-05, + "loss": 0.7515, + "step": 1052 + }, + { + "epoch": 0.11, + "grad_norm": 1.6734613888623586, + "learning_rate": 1.968981726513483e-05, + "loss": 0.7333, + "step": 1053 + }, + { + "epoch": 0.11, + "grad_norm": 1.643404966679851, + "learning_rate": 1.9689002572707873e-05, + "loss": 0.7412, + "step": 1054 + }, + { + "epoch": 0.11, + "grad_norm": 1.601773214091372, + "learning_rate": 1.96881868286912e-05, + "loss": 0.7213, + "step": 1055 + }, + { + "epoch": 0.11, + "grad_norm": 1.6130619273018145, + "learning_rate": 1.9687370033173356e-05, + "loss": 0.7485, + "step": 1056 + }, + { + "epoch": 0.11, + "grad_norm": 1.7587685900576053, + "learning_rate": 1.968655218624299e-05, + "loss": 0.767, + "step": 1057 + }, + { + "epoch": 0.11, + "grad_norm": 1.6537766624915502, + "learning_rate": 1.968573328798886e-05, + "loss": 0.6481, + "step": 1058 + }, + { + "epoch": 0.11, + "grad_norm": 1.7185152830818846, + "learning_rate": 1.9684913338499855e-05, + "loss": 0.8093, + "step": 1059 + }, + { + "epoch": 0.11, + "grad_norm": 1.9319626256997462, + "learning_rate": 1.968409233786496e-05, + "loss": 0.6413, + "step": 1060 + }, + { + "epoch": 0.11, + "grad_norm": 1.547975529846523, + "learning_rate": 1.9683270286173286e-05, + "loss": 0.749, + "step": 1061 + }, + { + "epoch": 0.11, + "grad_norm": 1.6573271155926854, + "learning_rate": 1.968244718351405e-05, + "loss": 0.644, + "step": 1062 + }, + { + "epoch": 0.11, + "grad_norm": 1.5989053921550134, + "learning_rate": 1.968162302997659e-05, + "loss": 0.818, + "step": 1063 + }, + { + "epoch": 0.11, + "grad_norm": 1.7268069657969647, + "learning_rate": 1.968079782565035e-05, + "loss": 0.7992, + "step": 1064 + }, + { + "epoch": 0.11, + "grad_norm": 1.862673950648877, + "learning_rate": 1.9679971570624902e-05, + "loss": 0.7751, + "step": 1065 + }, + { + "epoch": 0.11, + "grad_norm": 1.6890329705049392, + "learning_rate": 1.9679144264989914e-05, + "loss": 0.7416, + "step": 1066 + }, + { + "epoch": 0.11, + "grad_norm": 1.6130513551926382, + "learning_rate": 1.967831590883518e-05, + "loss": 0.7199, + "step": 1067 + }, + { + "epoch": 0.11, + "grad_norm": 1.653095405272077, + "learning_rate": 1.967748650225061e-05, + "loss": 0.7696, + "step": 1068 + }, + { + "epoch": 0.11, + "grad_norm": 1.445353967928603, + "learning_rate": 1.9676656045326217e-05, + "loss": 0.7718, + "step": 1069 + }, + { + "epoch": 0.11, + "grad_norm": 1.6023578260531115, + "learning_rate": 1.967582453815214e-05, + "loss": 0.8398, + "step": 1070 + }, + { + "epoch": 0.11, + "grad_norm": 1.6055120340482538, + "learning_rate": 1.967499198081862e-05, + "loss": 0.8376, + "step": 1071 + }, + { + "epoch": 0.11, + "grad_norm": 1.5238960023480306, + "learning_rate": 1.9674158373416017e-05, + "loss": 0.619, + "step": 1072 + }, + { + "epoch": 0.11, + "grad_norm": 1.7324188075148437, + "learning_rate": 1.9673323716034812e-05, + "loss": 0.9763, + "step": 1073 + }, + { + "epoch": 0.11, + "grad_norm": 1.5746141870115677, + "learning_rate": 1.967248800876559e-05, + "loss": 0.7339, + "step": 1074 + }, + { + "epoch": 0.11, + "grad_norm": 1.4853638045612656, + "learning_rate": 1.967165125169906e-05, + "loss": 0.7768, + "step": 1075 + }, + { + "epoch": 0.11, + "grad_norm": 1.7272333668866946, + "learning_rate": 1.9670813444926028e-05, + "loss": 0.8803, + "step": 1076 + }, + { + "epoch": 0.11, + "grad_norm": 1.7776062101599435, + "learning_rate": 1.9669974588537437e-05, + "loss": 0.7372, + "step": 1077 + }, + { + "epoch": 0.11, + "grad_norm": 1.7740405473527927, + "learning_rate": 1.9669134682624324e-05, + "loss": 0.7725, + "step": 1078 + }, + { + "epoch": 0.11, + "grad_norm": 1.6327544110262224, + "learning_rate": 1.9668293727277847e-05, + "loss": 0.8023, + "step": 1079 + }, + { + "epoch": 0.11, + "grad_norm": 1.6830258720590778, + "learning_rate": 1.9667451722589283e-05, + "loss": 0.8351, + "step": 1080 + }, + { + "epoch": 0.11, + "grad_norm": 1.632865983465764, + "learning_rate": 1.966660866865002e-05, + "loss": 0.8502, + "step": 1081 + }, + { + "epoch": 0.11, + "grad_norm": 1.8501165148542782, + "learning_rate": 1.9665764565551548e-05, + "loss": 0.8346, + "step": 1082 + }, + { + "epoch": 0.11, + "grad_norm": 1.643792627111494, + "learning_rate": 1.966491941338549e-05, + "loss": 0.6841, + "step": 1083 + }, + { + "epoch": 0.11, + "grad_norm": 1.7869989049285269, + "learning_rate": 1.9664073212243576e-05, + "loss": 0.7794, + "step": 1084 + }, + { + "epoch": 0.11, + "grad_norm": 1.5272749364519826, + "learning_rate": 1.966322596221764e-05, + "loss": 0.6827, + "step": 1085 + }, + { + "epoch": 0.11, + "grad_norm": 1.548034982280869, + "learning_rate": 1.9662377663399647e-05, + "loss": 0.7078, + "step": 1086 + }, + { + "epoch": 0.11, + "grad_norm": 1.7707301221752616, + "learning_rate": 1.9661528315881654e-05, + "loss": 0.8737, + "step": 1087 + }, + { + "epoch": 0.11, + "grad_norm": 1.738629441499874, + "learning_rate": 1.9660677919755855e-05, + "loss": 0.9194, + "step": 1088 + }, + { + "epoch": 0.11, + "grad_norm": 1.6275322704385589, + "learning_rate": 1.9659826475114548e-05, + "loss": 0.7833, + "step": 1089 + }, + { + "epoch": 0.11, + "grad_norm": 1.6804171948269853, + "learning_rate": 1.9658973982050133e-05, + "loss": 0.776, + "step": 1090 + }, + { + "epoch": 0.11, + "grad_norm": 1.575697569245944, + "learning_rate": 1.9658120440655147e-05, + "loss": 0.7907, + "step": 1091 + }, + { + "epoch": 0.11, + "grad_norm": 1.570696801563854, + "learning_rate": 1.965726585102222e-05, + "loss": 0.7582, + "step": 1092 + }, + { + "epoch": 0.11, + "grad_norm": 1.6602049450510845, + "learning_rate": 1.965641021324411e-05, + "loss": 0.718, + "step": 1093 + }, + { + "epoch": 0.11, + "grad_norm": 1.6935903967630492, + "learning_rate": 1.965555352741368e-05, + "loss": 0.8315, + "step": 1094 + }, + { + "epoch": 0.11, + "grad_norm": 1.795982914953094, + "learning_rate": 1.9654695793623908e-05, + "loss": 0.7485, + "step": 1095 + }, + { + "epoch": 0.11, + "grad_norm": 1.5328332724743226, + "learning_rate": 1.9653837011967894e-05, + "loss": 0.7178, + "step": 1096 + }, + { + "epoch": 0.11, + "grad_norm": 1.5640940025238694, + "learning_rate": 1.9652977182538836e-05, + "loss": 0.7228, + "step": 1097 + }, + { + "epoch": 0.11, + "grad_norm": 1.556543305513743, + "learning_rate": 1.9652116305430067e-05, + "loss": 0.8408, + "step": 1098 + }, + { + "epoch": 0.11, + "grad_norm": 1.5417627207071147, + "learning_rate": 1.965125438073501e-05, + "loss": 0.7884, + "step": 1099 + }, + { + "epoch": 0.11, + "grad_norm": 1.526098291521299, + "learning_rate": 1.9650391408547222e-05, + "loss": 0.8267, + "step": 1100 + }, + { + "epoch": 0.11, + "grad_norm": 1.7507073343126842, + "learning_rate": 1.9649527388960365e-05, + "loss": 0.891, + "step": 1101 + }, + { + "epoch": 0.11, + "grad_norm": 1.5063836112608697, + "learning_rate": 1.9648662322068205e-05, + "loss": 0.8313, + "step": 1102 + }, + { + "epoch": 0.11, + "grad_norm": 1.6884409011943324, + "learning_rate": 1.964779620796464e-05, + "loss": 0.7544, + "step": 1103 + }, + { + "epoch": 0.11, + "grad_norm": 1.7060068858942647, + "learning_rate": 1.9646929046743675e-05, + "loss": 0.727, + "step": 1104 + }, + { + "epoch": 0.11, + "grad_norm": 1.3902547802147753, + "learning_rate": 1.9646060838499418e-05, + "loss": 0.7339, + "step": 1105 + }, + { + "epoch": 0.11, + "grad_norm": 1.4974045504752067, + "learning_rate": 1.964519158332611e-05, + "loss": 0.8947, + "step": 1106 + }, + { + "epoch": 0.11, + "grad_norm": 1.627185223593396, + "learning_rate": 1.9644321281318085e-05, + "loss": 0.7223, + "step": 1107 + }, + { + "epoch": 0.11, + "grad_norm": 1.8650431831681953, + "learning_rate": 1.9643449932569803e-05, + "loss": 0.8473, + "step": 1108 + }, + { + "epoch": 0.11, + "grad_norm": 1.5295919963150972, + "learning_rate": 1.9642577537175845e-05, + "loss": 0.6756, + "step": 1109 + }, + { + "epoch": 0.11, + "grad_norm": 1.827817218480238, + "learning_rate": 1.9641704095230884e-05, + "loss": 0.7595, + "step": 1110 + }, + { + "epoch": 0.11, + "grad_norm": 1.8572345935296157, + "learning_rate": 1.9640829606829724e-05, + "loss": 0.8726, + "step": 1111 + }, + { + "epoch": 0.11, + "grad_norm": 1.9229900375324138, + "learning_rate": 1.9639954072067273e-05, + "loss": 0.6821, + "step": 1112 + }, + { + "epoch": 0.11, + "grad_norm": 1.529709612228805, + "learning_rate": 1.9639077491038562e-05, + "loss": 0.7728, + "step": 1113 + }, + { + "epoch": 0.11, + "grad_norm": 1.7816911587916253, + "learning_rate": 1.9638199863838726e-05, + "loss": 0.6983, + "step": 1114 + }, + { + "epoch": 0.11, + "grad_norm": 1.5904056602001073, + "learning_rate": 1.9637321190563023e-05, + "loss": 0.7543, + "step": 1115 + }, + { + "epoch": 0.11, + "grad_norm": 1.764043464464645, + "learning_rate": 1.963644147130681e-05, + "loss": 0.8764, + "step": 1116 + }, + { + "epoch": 0.11, + "grad_norm": 1.581725970823919, + "learning_rate": 1.9635560706165577e-05, + "loss": 0.7722, + "step": 1117 + }, + { + "epoch": 0.11, + "grad_norm": 1.483170971872246, + "learning_rate": 1.9634678895234912e-05, + "loss": 0.6994, + "step": 1118 + }, + { + "epoch": 0.11, + "grad_norm": 1.607073912269842, + "learning_rate": 1.963379603861052e-05, + "loss": 0.7012, + "step": 1119 + }, + { + "epoch": 0.11, + "grad_norm": 1.5788678057053491, + "learning_rate": 1.9632912136388228e-05, + "loss": 0.6462, + "step": 1120 + }, + { + "epoch": 0.11, + "grad_norm": 1.736077450846907, + "learning_rate": 1.9632027188663963e-05, + "loss": 0.7634, + "step": 1121 + }, + { + "epoch": 0.11, + "grad_norm": 1.553299674016011, + "learning_rate": 1.9631141195533773e-05, + "loss": 0.7819, + "step": 1122 + }, + { + "epoch": 0.11, + "grad_norm": 1.4890833928925393, + "learning_rate": 1.9630254157093822e-05, + "loss": 0.6707, + "step": 1123 + }, + { + "epoch": 0.11, + "grad_norm": 1.7022152845461826, + "learning_rate": 1.9629366073440385e-05, + "loss": 0.7831, + "step": 1124 + }, + { + "epoch": 0.11, + "grad_norm": 1.574794997754779, + "learning_rate": 1.9628476944669847e-05, + "loss": 0.6835, + "step": 1125 + }, + { + "epoch": 0.11, + "grad_norm": 1.6830359629247464, + "learning_rate": 1.9627586770878705e-05, + "loss": 0.7132, + "step": 1126 + }, + { + "epoch": 0.11, + "grad_norm": 1.6489366566650268, + "learning_rate": 1.962669555216358e-05, + "loss": 0.8291, + "step": 1127 + }, + { + "epoch": 0.11, + "grad_norm": 1.5845939269600287, + "learning_rate": 1.9625803288621194e-05, + "loss": 0.7917, + "step": 1128 + }, + { + "epoch": 0.11, + "grad_norm": 1.4958457966941456, + "learning_rate": 1.96249099803484e-05, + "loss": 0.7724, + "step": 1129 + }, + { + "epoch": 0.11, + "grad_norm": 1.6052748434750355, + "learning_rate": 1.9624015627442134e-05, + "loss": 0.7444, + "step": 1130 + }, + { + "epoch": 0.12, + "grad_norm": 1.5071294567920583, + "learning_rate": 1.962312022999948e-05, + "loss": 0.6666, + "step": 1131 + }, + { + "epoch": 0.12, + "grad_norm": 1.7281784144036083, + "learning_rate": 1.9622223788117614e-05, + "loss": 0.7376, + "step": 1132 + }, + { + "epoch": 0.12, + "grad_norm": 1.6185475613897287, + "learning_rate": 1.962132630189383e-05, + "loss": 0.7193, + "step": 1133 + }, + { + "epoch": 0.12, + "grad_norm": 1.7260084083877627, + "learning_rate": 1.962042777142554e-05, + "loss": 0.7864, + "step": 1134 + }, + { + "epoch": 0.12, + "grad_norm": 1.5682532904630293, + "learning_rate": 1.9619528196810254e-05, + "loss": 0.7225, + "step": 1135 + }, + { + "epoch": 0.12, + "grad_norm": 1.6391159674374254, + "learning_rate": 1.9618627578145624e-05, + "loss": 0.7729, + "step": 1136 + }, + { + "epoch": 0.12, + "grad_norm": 1.5784690928165177, + "learning_rate": 1.9617725915529383e-05, + "loss": 0.8187, + "step": 1137 + }, + { + "epoch": 0.12, + "grad_norm": 1.7088910412836227, + "learning_rate": 1.9616823209059398e-05, + "loss": 0.784, + "step": 1138 + }, + { + "epoch": 0.12, + "grad_norm": 1.6571345846430185, + "learning_rate": 1.9615919458833646e-05, + "loss": 0.6829, + "step": 1139 + }, + { + "epoch": 0.12, + "grad_norm": 1.6319202664587704, + "learning_rate": 1.9615014664950214e-05, + "loss": 0.75, + "step": 1140 + }, + { + "epoch": 0.12, + "grad_norm": 1.4303103285508485, + "learning_rate": 1.9614108827507304e-05, + "loss": 0.6818, + "step": 1141 + }, + { + "epoch": 0.12, + "grad_norm": 1.5585376987437913, + "learning_rate": 1.9613201946603225e-05, + "loss": 0.8776, + "step": 1142 + }, + { + "epoch": 0.12, + "grad_norm": 1.6595202034800935, + "learning_rate": 1.9612294022336407e-05, + "loss": 0.8057, + "step": 1143 + }, + { + "epoch": 0.12, + "grad_norm": 1.8047651958531572, + "learning_rate": 1.9611385054805396e-05, + "loss": 0.8138, + "step": 1144 + }, + { + "epoch": 0.12, + "grad_norm": 1.5737996224499786, + "learning_rate": 1.9610475044108837e-05, + "loss": 0.6541, + "step": 1145 + }, + { + "epoch": 0.12, + "grad_norm": 1.5709415443250103, + "learning_rate": 1.960956399034551e-05, + "loss": 0.7679, + "step": 1146 + }, + { + "epoch": 0.12, + "grad_norm": 1.58783726351291, + "learning_rate": 1.9608651893614284e-05, + "loss": 0.772, + "step": 1147 + }, + { + "epoch": 0.12, + "grad_norm": 1.6528514722614565, + "learning_rate": 1.960773875401416e-05, + "loss": 0.6852, + "step": 1148 + }, + { + "epoch": 0.12, + "grad_norm": 1.6218521782436417, + "learning_rate": 1.960682457164424e-05, + "loss": 0.7883, + "step": 1149 + }, + { + "epoch": 0.12, + "grad_norm": 1.4396576356085227, + "learning_rate": 1.960590934660375e-05, + "loss": 0.6858, + "step": 1150 + }, + { + "epoch": 0.12, + "grad_norm": 1.6058391146974962, + "learning_rate": 1.9604993078992016e-05, + "loss": 0.6883, + "step": 1151 + }, + { + "epoch": 0.12, + "grad_norm": 1.752370009104151, + "learning_rate": 1.960407576890849e-05, + "loss": 0.8184, + "step": 1152 + }, + { + "epoch": 0.12, + "grad_norm": 1.4172074465364604, + "learning_rate": 1.9603157416452732e-05, + "loss": 0.6789, + "step": 1153 + }, + { + "epoch": 0.12, + "grad_norm": 1.843728995407731, + "learning_rate": 1.960223802172441e-05, + "loss": 0.7007, + "step": 1154 + }, + { + "epoch": 0.12, + "grad_norm": 1.631743660794764, + "learning_rate": 1.9601317584823315e-05, + "loss": 0.764, + "step": 1155 + }, + { + "epoch": 0.12, + "grad_norm": 1.484817707471324, + "learning_rate": 1.9600396105849346e-05, + "loss": 0.7767, + "step": 1156 + }, + { + "epoch": 0.12, + "grad_norm": 1.6070133386997218, + "learning_rate": 1.9599473584902512e-05, + "loss": 0.8539, + "step": 1157 + }, + { + "epoch": 0.12, + "grad_norm": 1.695249667390617, + "learning_rate": 1.9598550022082937e-05, + "loss": 0.7938, + "step": 1158 + }, + { + "epoch": 0.12, + "grad_norm": 1.6234052698473143, + "learning_rate": 1.9597625417490863e-05, + "loss": 0.757, + "step": 1159 + }, + { + "epoch": 0.12, + "grad_norm": 1.6293595346347398, + "learning_rate": 1.9596699771226638e-05, + "loss": 0.7461, + "step": 1160 + }, + { + "epoch": 0.12, + "grad_norm": 1.662704230882139, + "learning_rate": 1.959577308339073e-05, + "loss": 0.926, + "step": 1161 + }, + { + "epoch": 0.12, + "grad_norm": 1.756056604825178, + "learning_rate": 1.9594845354083716e-05, + "loss": 0.8179, + "step": 1162 + }, + { + "epoch": 0.12, + "grad_norm": 1.6719212295165158, + "learning_rate": 1.9593916583406285e-05, + "loss": 0.7861, + "step": 1163 + }, + { + "epoch": 0.12, + "grad_norm": 1.6184719245446453, + "learning_rate": 1.959298677145924e-05, + "loss": 0.8487, + "step": 1164 + }, + { + "epoch": 0.12, + "grad_norm": 1.7206162783405348, + "learning_rate": 1.95920559183435e-05, + "loss": 0.7311, + "step": 1165 + }, + { + "epoch": 0.12, + "grad_norm": 1.8359997424317043, + "learning_rate": 1.9591124024160092e-05, + "loss": 0.7866, + "step": 1166 + }, + { + "epoch": 0.12, + "grad_norm": 1.624456920621825, + "learning_rate": 1.959019108901016e-05, + "loss": 0.747, + "step": 1167 + }, + { + "epoch": 0.12, + "grad_norm": 1.6260806359564066, + "learning_rate": 1.9589257112994955e-05, + "loss": 0.69, + "step": 1168 + }, + { + "epoch": 0.12, + "grad_norm": 1.667073327058875, + "learning_rate": 1.958832209621586e-05, + "loss": 0.7738, + "step": 1169 + }, + { + "epoch": 0.12, + "grad_norm": 1.6441841038248202, + "learning_rate": 1.9587386038774336e-05, + "loss": 0.7193, + "step": 1170 + }, + { + "epoch": 0.12, + "grad_norm": 1.621706152411868, + "learning_rate": 1.958644894077199e-05, + "loss": 0.8084, + "step": 1171 + }, + { + "epoch": 0.12, + "grad_norm": 1.586482320038094, + "learning_rate": 1.9585510802310527e-05, + "loss": 0.7456, + "step": 1172 + }, + { + "epoch": 0.12, + "grad_norm": 1.692881629428406, + "learning_rate": 1.9584571623491767e-05, + "loss": 0.8248, + "step": 1173 + }, + { + "epoch": 0.12, + "grad_norm": 1.7294424016547736, + "learning_rate": 1.9583631404417644e-05, + "loss": 0.7689, + "step": 1174 + }, + { + "epoch": 0.12, + "grad_norm": 1.6193899899241604, + "learning_rate": 1.9582690145190203e-05, + "loss": 0.8272, + "step": 1175 + }, + { + "epoch": 0.12, + "grad_norm": 1.7799529369465272, + "learning_rate": 1.9581747845911606e-05, + "loss": 0.8271, + "step": 1176 + }, + { + "epoch": 0.12, + "grad_norm": 1.493448273391503, + "learning_rate": 1.958080450668412e-05, + "loss": 0.6895, + "step": 1177 + }, + { + "epoch": 0.12, + "grad_norm": 1.5369375142395612, + "learning_rate": 1.9579860127610127e-05, + "loss": 0.7262, + "step": 1178 + }, + { + "epoch": 0.12, + "grad_norm": 1.5421982874414422, + "learning_rate": 1.9578914708792137e-05, + "loss": 0.7941, + "step": 1179 + }, + { + "epoch": 0.12, + "grad_norm": 1.595212877359824, + "learning_rate": 1.9577968250332748e-05, + "loss": 0.8563, + "step": 1180 + }, + { + "epoch": 0.12, + "grad_norm": 1.55371549416413, + "learning_rate": 1.957702075233469e-05, + "loss": 0.7626, + "step": 1181 + }, + { + "epoch": 0.12, + "grad_norm": 1.4194941949887323, + "learning_rate": 1.9576072214900797e-05, + "loss": 0.783, + "step": 1182 + }, + { + "epoch": 0.12, + "grad_norm": 1.5571594964699573, + "learning_rate": 1.9575122638134018e-05, + "loss": 0.6418, + "step": 1183 + }, + { + "epoch": 0.12, + "grad_norm": 1.6443867895825042, + "learning_rate": 1.9574172022137416e-05, + "loss": 0.7109, + "step": 1184 + }, + { + "epoch": 0.12, + "grad_norm": 1.5964204320215944, + "learning_rate": 1.9573220367014164e-05, + "loss": 0.6518, + "step": 1185 + }, + { + "epoch": 0.12, + "grad_norm": 1.593349684628786, + "learning_rate": 1.9572267672867546e-05, + "loss": 0.7596, + "step": 1186 + }, + { + "epoch": 0.12, + "grad_norm": 1.616352916585672, + "learning_rate": 1.957131393980097e-05, + "loss": 0.6699, + "step": 1187 + }, + { + "epoch": 0.12, + "grad_norm": 1.623342101148859, + "learning_rate": 1.9570359167917942e-05, + "loss": 0.753, + "step": 1188 + }, + { + "epoch": 0.12, + "grad_norm": 1.5982626120798151, + "learning_rate": 1.956940335732209e-05, + "loss": 0.774, + "step": 1189 + }, + { + "epoch": 0.12, + "grad_norm": 1.5780506501342173, + "learning_rate": 1.9568446508117155e-05, + "loss": 0.6698, + "step": 1190 + }, + { + "epoch": 0.12, + "grad_norm": 1.6191888496783715, + "learning_rate": 1.9567488620406984e-05, + "loss": 0.7069, + "step": 1191 + }, + { + "epoch": 0.12, + "grad_norm": 3.128084240737797, + "learning_rate": 1.956652969429554e-05, + "loss": 0.8099, + "step": 1192 + }, + { + "epoch": 0.12, + "grad_norm": 1.654400131567404, + "learning_rate": 1.95655697298869e-05, + "loss": 0.8374, + "step": 1193 + }, + { + "epoch": 0.12, + "grad_norm": 1.5711157493370986, + "learning_rate": 1.9564608727285258e-05, + "loss": 0.6901, + "step": 1194 + }, + { + "epoch": 0.12, + "grad_norm": 1.458281055944847, + "learning_rate": 1.956364668659491e-05, + "loss": 0.6599, + "step": 1195 + }, + { + "epoch": 0.12, + "grad_norm": 1.6305506241103176, + "learning_rate": 1.9562683607920278e-05, + "loss": 0.8076, + "step": 1196 + }, + { + "epoch": 0.12, + "grad_norm": 1.6685254009668504, + "learning_rate": 1.956171949136588e-05, + "loss": 0.7673, + "step": 1197 + }, + { + "epoch": 0.12, + "grad_norm": 1.5926444627846572, + "learning_rate": 1.956075433703636e-05, + "loss": 0.6434, + "step": 1198 + }, + { + "epoch": 0.12, + "grad_norm": 1.5334041797983586, + "learning_rate": 1.955978814503647e-05, + "loss": 0.6648, + "step": 1199 + }, + { + "epoch": 0.12, + "grad_norm": 1.6348793124114884, + "learning_rate": 1.9558820915471076e-05, + "loss": 0.8082, + "step": 1200 + }, + { + "epoch": 0.12, + "grad_norm": 1.6650518092286242, + "learning_rate": 1.9557852648445155e-05, + "loss": 0.7144, + "step": 1201 + }, + { + "epoch": 0.12, + "grad_norm": 1.7730364636444929, + "learning_rate": 1.95568833440638e-05, + "loss": 0.8038, + "step": 1202 + }, + { + "epoch": 0.12, + "grad_norm": 1.4817291628359017, + "learning_rate": 1.9555913002432207e-05, + "loss": 0.7215, + "step": 1203 + }, + { + "epoch": 0.12, + "grad_norm": 1.5437673000618928, + "learning_rate": 1.9554941623655695e-05, + "loss": 0.7794, + "step": 1204 + }, + { + "epoch": 0.12, + "grad_norm": 1.4958564172666966, + "learning_rate": 1.9553969207839694e-05, + "loss": 0.7292, + "step": 1205 + }, + { + "epoch": 0.12, + "grad_norm": 1.4671382704484248, + "learning_rate": 1.955299575508974e-05, + "loss": 0.718, + "step": 1206 + }, + { + "epoch": 0.12, + "grad_norm": 1.5760005500160656, + "learning_rate": 1.9552021265511492e-05, + "loss": 0.7143, + "step": 1207 + }, + { + "epoch": 0.12, + "grad_norm": 1.5823479034445453, + "learning_rate": 1.955104573921071e-05, + "loss": 0.7174, + "step": 1208 + }, + { + "epoch": 0.12, + "grad_norm": 1.7896892822695007, + "learning_rate": 1.955006917629328e-05, + "loss": 0.8204, + "step": 1209 + }, + { + "epoch": 0.12, + "grad_norm": 1.609040548686237, + "learning_rate": 1.9549091576865186e-05, + "loss": 0.7619, + "step": 1210 + }, + { + "epoch": 0.12, + "grad_norm": 1.5210302834438783, + "learning_rate": 1.954811294103253e-05, + "loss": 0.6917, + "step": 1211 + }, + { + "epoch": 0.12, + "grad_norm": 1.668606437182086, + "learning_rate": 1.9547133268901527e-05, + "loss": 0.7154, + "step": 1212 + }, + { + "epoch": 0.12, + "grad_norm": 1.7072837999942514, + "learning_rate": 1.954615256057851e-05, + "loss": 0.7634, + "step": 1213 + }, + { + "epoch": 0.12, + "grad_norm": 1.8353300138895317, + "learning_rate": 1.954517081616992e-05, + "loss": 0.7178, + "step": 1214 + }, + { + "epoch": 0.12, + "grad_norm": 1.5972113260052339, + "learning_rate": 1.954418803578231e-05, + "loss": 0.7412, + "step": 1215 + }, + { + "epoch": 0.12, + "grad_norm": 1.6412684612277957, + "learning_rate": 1.9543204219522338e-05, + "loss": 0.728, + "step": 1216 + }, + { + "epoch": 0.12, + "grad_norm": 1.609735472945734, + "learning_rate": 1.954221936749679e-05, + "loss": 0.8719, + "step": 1217 + }, + { + "epoch": 0.12, + "grad_norm": 1.5573591820655903, + "learning_rate": 1.9541233479812552e-05, + "loss": 0.7166, + "step": 1218 + }, + { + "epoch": 0.12, + "grad_norm": 1.7781813815209297, + "learning_rate": 1.9540246556576624e-05, + "loss": 0.7628, + "step": 1219 + }, + { + "epoch": 0.12, + "grad_norm": 1.683038497796432, + "learning_rate": 1.9539258597896132e-05, + "loss": 0.9519, + "step": 1220 + }, + { + "epoch": 0.12, + "grad_norm": 1.6477867242414523, + "learning_rate": 1.9538269603878293e-05, + "loss": 0.7761, + "step": 1221 + }, + { + "epoch": 0.12, + "grad_norm": 1.6247177428236637, + "learning_rate": 1.953727957463045e-05, + "loss": 0.8392, + "step": 1222 + }, + { + "epoch": 0.12, + "grad_norm": 1.7641582896967056, + "learning_rate": 1.953628851026006e-05, + "loss": 0.8251, + "step": 1223 + }, + { + "epoch": 0.12, + "grad_norm": 1.651347180107763, + "learning_rate": 1.9535296410874678e-05, + "loss": 0.7091, + "step": 1224 + }, + { + "epoch": 0.12, + "grad_norm": 1.596795753990509, + "learning_rate": 1.953430327658199e-05, + "loss": 0.6776, + "step": 1225 + }, + { + "epoch": 0.12, + "grad_norm": 1.72023293687916, + "learning_rate": 1.9533309107489775e-05, + "loss": 0.7708, + "step": 1226 + }, + { + "epoch": 0.12, + "grad_norm": 1.8641043621843565, + "learning_rate": 1.9532313903705945e-05, + "loss": 0.8081, + "step": 1227 + }, + { + "epoch": 0.12, + "grad_norm": 1.6892750119679534, + "learning_rate": 1.953131766533851e-05, + "loss": 0.8025, + "step": 1228 + }, + { + "epoch": 0.13, + "grad_norm": 1.6979908514604627, + "learning_rate": 1.9530320392495592e-05, + "loss": 0.8839, + "step": 1229 + }, + { + "epoch": 0.13, + "grad_norm": 1.5614345333542738, + "learning_rate": 1.9529322085285436e-05, + "loss": 0.8074, + "step": 1230 + }, + { + "epoch": 0.13, + "grad_norm": 1.5462404519794781, + "learning_rate": 1.952832274381639e-05, + "loss": 0.7008, + "step": 1231 + }, + { + "epoch": 0.13, + "grad_norm": 1.6801337840039725, + "learning_rate": 1.9527322368196913e-05, + "loss": 0.7842, + "step": 1232 + }, + { + "epoch": 0.13, + "grad_norm": 1.4877774460670745, + "learning_rate": 1.952632095853559e-05, + "loss": 0.7126, + "step": 1233 + }, + { + "epoch": 0.13, + "grad_norm": 1.470108974836532, + "learning_rate": 1.9525318514941097e-05, + "loss": 0.7865, + "step": 1234 + }, + { + "epoch": 0.13, + "grad_norm": 1.6233458319969059, + "learning_rate": 1.9524315037522238e-05, + "loss": 0.7215, + "step": 1235 + }, + { + "epoch": 0.13, + "grad_norm": 1.6854954491546525, + "learning_rate": 1.9523310526387928e-05, + "loss": 0.7766, + "step": 1236 + }, + { + "epoch": 0.13, + "grad_norm": 1.71560710244287, + "learning_rate": 1.9522304981647186e-05, + "loss": 0.8284, + "step": 1237 + }, + { + "epoch": 0.13, + "grad_norm": 1.5165468607991677, + "learning_rate": 1.952129840340915e-05, + "loss": 0.7682, + "step": 1238 + }, + { + "epoch": 0.13, + "grad_norm": 1.5311283978080743, + "learning_rate": 1.952029079178307e-05, + "loss": 0.7429, + "step": 1239 + }, + { + "epoch": 0.13, + "grad_norm": 1.5454786126577658, + "learning_rate": 1.951928214687831e-05, + "loss": 0.7592, + "step": 1240 + }, + { + "epoch": 0.13, + "grad_norm": 1.7566208659424567, + "learning_rate": 1.9518272468804333e-05, + "loss": 0.7453, + "step": 1241 + }, + { + "epoch": 0.13, + "grad_norm": 1.6901545265480546, + "learning_rate": 1.9517261757670728e-05, + "loss": 0.8635, + "step": 1242 + }, + { + "epoch": 0.13, + "grad_norm": 1.7770640309026147, + "learning_rate": 1.9516250013587193e-05, + "loss": 0.7617, + "step": 1243 + }, + { + "epoch": 0.13, + "grad_norm": 1.5424774446781067, + "learning_rate": 1.9515237236663537e-05, + "loss": 0.7539, + "step": 1244 + }, + { + "epoch": 0.13, + "grad_norm": 1.6504202538179722, + "learning_rate": 1.951422342700968e-05, + "loss": 0.7253, + "step": 1245 + }, + { + "epoch": 0.13, + "grad_norm": 1.479830693335698, + "learning_rate": 1.9513208584735656e-05, + "loss": 0.7184, + "step": 1246 + }, + { + "epoch": 0.13, + "grad_norm": 1.610053631904774, + "learning_rate": 1.9512192709951613e-05, + "loss": 0.8248, + "step": 1247 + }, + { + "epoch": 0.13, + "grad_norm": 1.647736990225513, + "learning_rate": 1.95111758027678e-05, + "loss": 0.7153, + "step": 1248 + }, + { + "epoch": 0.13, + "grad_norm": 1.6762468882847545, + "learning_rate": 1.9510157863294595e-05, + "loss": 0.8218, + "step": 1249 + }, + { + "epoch": 0.13, + "grad_norm": 1.5976749648522812, + "learning_rate": 1.9509138891642476e-05, + "loss": 0.7339, + "step": 1250 + }, + { + "epoch": 0.13, + "grad_norm": 1.6550024682871, + "learning_rate": 1.950811888792204e-05, + "loss": 0.754, + "step": 1251 + }, + { + "epoch": 0.13, + "grad_norm": 1.8202097251988651, + "learning_rate": 1.9507097852243982e-05, + "loss": 0.7897, + "step": 1252 + }, + { + "epoch": 0.13, + "grad_norm": 1.5527467626965263, + "learning_rate": 1.950607578471913e-05, + "loss": 0.7427, + "step": 1253 + }, + { + "epoch": 0.13, + "grad_norm": 1.431786500229432, + "learning_rate": 1.950505268545841e-05, + "loss": 0.6891, + "step": 1254 + }, + { + "epoch": 0.13, + "grad_norm": 1.5255211241113107, + "learning_rate": 1.9504028554572865e-05, + "loss": 0.6895, + "step": 1255 + }, + { + "epoch": 0.13, + "grad_norm": 1.573716102171307, + "learning_rate": 1.9503003392173643e-05, + "loss": 0.7392, + "step": 1256 + }, + { + "epoch": 0.13, + "grad_norm": 1.7500915942835833, + "learning_rate": 1.9501977198372017e-05, + "loss": 0.8491, + "step": 1257 + }, + { + "epoch": 0.13, + "grad_norm": 1.6546658273474368, + "learning_rate": 1.9500949973279358e-05, + "loss": 0.6988, + "step": 1258 + }, + { + "epoch": 0.13, + "grad_norm": 1.6133269018317669, + "learning_rate": 1.949992171700716e-05, + "loss": 0.6442, + "step": 1259 + }, + { + "epoch": 0.13, + "grad_norm": 1.6354850724656633, + "learning_rate": 1.949889242966702e-05, + "loss": 0.8259, + "step": 1260 + }, + { + "epoch": 0.13, + "grad_norm": 1.7682967900115991, + "learning_rate": 1.9497862111370654e-05, + "loss": 0.8345, + "step": 1261 + }, + { + "epoch": 0.13, + "grad_norm": 1.6029947100999593, + "learning_rate": 1.9496830762229884e-05, + "loss": 0.8014, + "step": 1262 + }, + { + "epoch": 0.13, + "grad_norm": 1.6075492800959517, + "learning_rate": 1.949579838235665e-05, + "loss": 0.7311, + "step": 1263 + }, + { + "epoch": 0.13, + "grad_norm": 1.6998815244012917, + "learning_rate": 1.9494764971862998e-05, + "loss": 0.7245, + "step": 1264 + }, + { + "epoch": 0.13, + "grad_norm": 1.4951211694261486, + "learning_rate": 1.949373053086109e-05, + "loss": 0.8232, + "step": 1265 + }, + { + "epoch": 0.13, + "grad_norm": 1.5067111018633088, + "learning_rate": 1.9492695059463197e-05, + "loss": 0.7533, + "step": 1266 + }, + { + "epoch": 0.13, + "grad_norm": 1.6037628206146815, + "learning_rate": 1.949165855778171e-05, + "loss": 0.7363, + "step": 1267 + }, + { + "epoch": 0.13, + "grad_norm": 1.5768105452349963, + "learning_rate": 1.9490621025929112e-05, + "loss": 0.7573, + "step": 1268 + }, + { + "epoch": 0.13, + "grad_norm": 2.01346038116964, + "learning_rate": 1.9489582464018023e-05, + "loss": 0.8341, + "step": 1269 + }, + { + "epoch": 0.13, + "grad_norm": 1.7239135570187674, + "learning_rate": 1.948854287216116e-05, + "loss": 0.7889, + "step": 1270 + }, + { + "epoch": 0.13, + "grad_norm": 1.4433082279559541, + "learning_rate": 1.9487502250471347e-05, + "loss": 0.6745, + "step": 1271 + }, + { + "epoch": 0.13, + "grad_norm": 1.6967479347098455, + "learning_rate": 1.9486460599061536e-05, + "loss": 0.6947, + "step": 1272 + }, + { + "epoch": 0.13, + "grad_norm": 1.6368410087098855, + "learning_rate": 1.948541791804478e-05, + "loss": 0.7861, + "step": 1273 + }, + { + "epoch": 0.13, + "grad_norm": 1.7263202574572991, + "learning_rate": 1.948437420753424e-05, + "loss": 0.7657, + "step": 1274 + }, + { + "epoch": 0.13, + "grad_norm": 1.4448531385811203, + "learning_rate": 1.94833294676432e-05, + "loss": 0.5998, + "step": 1275 + }, + { + "epoch": 0.13, + "grad_norm": 1.5124154039368294, + "learning_rate": 1.948228369848505e-05, + "loss": 0.7219, + "step": 1276 + }, + { + "epoch": 0.13, + "grad_norm": 1.702412187559942, + "learning_rate": 1.948123690017329e-05, + "loss": 0.781, + "step": 1277 + }, + { + "epoch": 0.13, + "grad_norm": 1.555786535491567, + "learning_rate": 1.9480189072821537e-05, + "loss": 0.7099, + "step": 1278 + }, + { + "epoch": 0.13, + "grad_norm": 1.6032400807672151, + "learning_rate": 1.947914021654351e-05, + "loss": 0.6508, + "step": 1279 + }, + { + "epoch": 0.13, + "grad_norm": 1.3784201916828709, + "learning_rate": 1.9478090331453054e-05, + "loss": 0.7026, + "step": 1280 + }, + { + "epoch": 0.13, + "grad_norm": 1.5784513045776165, + "learning_rate": 1.947703941766411e-05, + "loss": 0.8092, + "step": 1281 + }, + { + "epoch": 0.13, + "grad_norm": 1.5666128770715946, + "learning_rate": 1.947598747529074e-05, + "loss": 0.7278, + "step": 1282 + }, + { + "epoch": 0.13, + "grad_norm": 1.6376187966620492, + "learning_rate": 1.947493450444712e-05, + "loss": 0.7695, + "step": 1283 + }, + { + "epoch": 0.13, + "grad_norm": 1.406052287009246, + "learning_rate": 1.9473880505247532e-05, + "loss": 0.7641, + "step": 1284 + }, + { + "epoch": 0.13, + "grad_norm": 1.7269431831784723, + "learning_rate": 1.947282547780637e-05, + "loss": 0.8017, + "step": 1285 + }, + { + "epoch": 0.13, + "grad_norm": 1.5161138349707055, + "learning_rate": 1.947176942223814e-05, + "loss": 0.7157, + "step": 1286 + }, + { + "epoch": 0.13, + "grad_norm": 1.5678540961023142, + "learning_rate": 1.947071233865746e-05, + "loss": 0.767, + "step": 1287 + }, + { + "epoch": 0.13, + "grad_norm": 1.5527071981054203, + "learning_rate": 1.946965422717906e-05, + "loss": 0.8231, + "step": 1288 + }, + { + "epoch": 0.13, + "grad_norm": 1.5907878060262768, + "learning_rate": 1.9468595087917786e-05, + "loss": 0.6807, + "step": 1289 + }, + { + "epoch": 0.13, + "grad_norm": 1.5158981766324053, + "learning_rate": 1.9467534920988586e-05, + "loss": 0.697, + "step": 1290 + }, + { + "epoch": 0.13, + "grad_norm": 1.59870920201292, + "learning_rate": 1.9466473726506522e-05, + "loss": 0.7714, + "step": 1291 + }, + { + "epoch": 0.13, + "grad_norm": 1.7794305184289632, + "learning_rate": 1.9465411504586778e-05, + "loss": 0.734, + "step": 1292 + }, + { + "epoch": 0.13, + "grad_norm": 1.6392493833071595, + "learning_rate": 1.9464348255344642e-05, + "loss": 0.7791, + "step": 1293 + }, + { + "epoch": 0.13, + "grad_norm": 1.6509845458827024, + "learning_rate": 1.9463283978895503e-05, + "loss": 0.7268, + "step": 1294 + }, + { + "epoch": 0.13, + "grad_norm": 1.7374581302193655, + "learning_rate": 1.9462218675354877e-05, + "loss": 0.8593, + "step": 1295 + }, + { + "epoch": 0.13, + "grad_norm": 1.5227355426200806, + "learning_rate": 1.9461152344838386e-05, + "loss": 0.766, + "step": 1296 + }, + { + "epoch": 0.13, + "grad_norm": 1.546737758110503, + "learning_rate": 1.9460084987461767e-05, + "loss": 0.7758, + "step": 1297 + }, + { + "epoch": 0.13, + "grad_norm": 1.5361827428965045, + "learning_rate": 1.945901660334086e-05, + "loss": 0.7971, + "step": 1298 + }, + { + "epoch": 0.13, + "grad_norm": 1.7399009926320979, + "learning_rate": 1.9457947192591623e-05, + "loss": 0.6787, + "step": 1299 + }, + { + "epoch": 0.13, + "grad_norm": 1.5621018072678603, + "learning_rate": 1.945687675533013e-05, + "loss": 0.7643, + "step": 1300 + }, + { + "epoch": 0.13, + "grad_norm": 1.5677030577309845, + "learning_rate": 1.9455805291672544e-05, + "loss": 0.7244, + "step": 1301 + }, + { + "epoch": 0.13, + "grad_norm": 1.8324847460117326, + "learning_rate": 1.9454732801735173e-05, + "loss": 0.8323, + "step": 1302 + }, + { + "epoch": 0.13, + "grad_norm": 1.8054740103348892, + "learning_rate": 1.945365928563441e-05, + "loss": 0.86, + "step": 1303 + }, + { + "epoch": 0.13, + "grad_norm": 1.5267902312222363, + "learning_rate": 1.9452584743486768e-05, + "loss": 0.7199, + "step": 1304 + }, + { + "epoch": 0.13, + "grad_norm": 1.6207235099791248, + "learning_rate": 1.9451509175408875e-05, + "loss": 0.8241, + "step": 1305 + }, + { + "epoch": 0.13, + "grad_norm": 1.6311789356149866, + "learning_rate": 1.9450432581517466e-05, + "loss": 0.8855, + "step": 1306 + }, + { + "epoch": 0.13, + "grad_norm": 1.6012394050190988, + "learning_rate": 1.944935496192939e-05, + "loss": 0.7032, + "step": 1307 + }, + { + "epoch": 0.13, + "grad_norm": 1.6940041699409758, + "learning_rate": 1.9448276316761604e-05, + "loss": 0.795, + "step": 1308 + }, + { + "epoch": 0.13, + "grad_norm": 1.6439835630134267, + "learning_rate": 1.9447196646131177e-05, + "loss": 0.8516, + "step": 1309 + }, + { + "epoch": 0.13, + "grad_norm": 1.563633479841595, + "learning_rate": 1.9446115950155293e-05, + "loss": 0.855, + "step": 1310 + }, + { + "epoch": 0.13, + "grad_norm": 1.6668575947195545, + "learning_rate": 1.9445034228951245e-05, + "loss": 0.7242, + "step": 1311 + }, + { + "epoch": 0.13, + "grad_norm": 1.573876033648897, + "learning_rate": 1.944395148263643e-05, + "loss": 0.7852, + "step": 1312 + }, + { + "epoch": 0.13, + "grad_norm": 1.77770874892001, + "learning_rate": 1.9442867711328372e-05, + "loss": 0.8421, + "step": 1313 + }, + { + "epoch": 0.13, + "grad_norm": 1.6470437342507795, + "learning_rate": 1.9441782915144694e-05, + "loss": 0.762, + "step": 1314 + }, + { + "epoch": 0.13, + "grad_norm": 1.5012094606088324, + "learning_rate": 1.9440697094203135e-05, + "loss": 0.7325, + "step": 1315 + }, + { + "epoch": 0.13, + "grad_norm": 1.624074833699309, + "learning_rate": 1.943961024862154e-05, + "loss": 0.7439, + "step": 1316 + }, + { + "epoch": 0.13, + "grad_norm": 1.853394164880661, + "learning_rate": 1.943852237851787e-05, + "loss": 0.7825, + "step": 1317 + }, + { + "epoch": 0.13, + "grad_norm": 1.8797453479583488, + "learning_rate": 1.9437433484010203e-05, + "loss": 0.769, + "step": 1318 + }, + { + "epoch": 0.13, + "grad_norm": 1.6375299522173172, + "learning_rate": 1.943634356521671e-05, + "loss": 0.7554, + "step": 1319 + }, + { + "epoch": 0.13, + "grad_norm": 1.5538090573787875, + "learning_rate": 1.9435252622255694e-05, + "loss": 0.8113, + "step": 1320 + }, + { + "epoch": 0.13, + "grad_norm": 1.5303415868657724, + "learning_rate": 1.9434160655245557e-05, + "loss": 0.777, + "step": 1321 + }, + { + "epoch": 0.13, + "grad_norm": 1.6464673627238309, + "learning_rate": 1.9433067664304818e-05, + "loss": 0.7287, + "step": 1322 + }, + { + "epoch": 0.13, + "grad_norm": 1.687666449674919, + "learning_rate": 1.9431973649552094e-05, + "loss": 0.7274, + "step": 1323 + }, + { + "epoch": 0.13, + "grad_norm": 1.8097115392477354, + "learning_rate": 1.9430878611106133e-05, + "loss": 0.7362, + "step": 1324 + }, + { + "epoch": 0.13, + "grad_norm": 1.5710783797225716, + "learning_rate": 1.942978254908578e-05, + "loss": 0.73, + "step": 1325 + }, + { + "epoch": 0.13, + "grad_norm": 1.4434197275009946, + "learning_rate": 1.942868546361e-05, + "loss": 0.6966, + "step": 1326 + }, + { + "epoch": 0.13, + "grad_norm": 1.7744289801449897, + "learning_rate": 1.9427587354797857e-05, + "loss": 0.8461, + "step": 1327 + }, + { + "epoch": 0.14, + "grad_norm": 1.5746501758483216, + "learning_rate": 1.9426488222768542e-05, + "loss": 0.7306, + "step": 1328 + }, + { + "epoch": 0.14, + "grad_norm": 1.6972017007527762, + "learning_rate": 1.942538806764134e-05, + "loss": 0.9094, + "step": 1329 + }, + { + "epoch": 0.14, + "grad_norm": 1.6418540267949544, + "learning_rate": 1.9424286889535663e-05, + "loss": 0.7503, + "step": 1330 + }, + { + "epoch": 0.14, + "grad_norm": 1.5414145441351583, + "learning_rate": 1.942318468857102e-05, + "loss": 0.809, + "step": 1331 + }, + { + "epoch": 0.14, + "grad_norm": 1.4873908203642654, + "learning_rate": 1.9422081464867043e-05, + "loss": 0.7576, + "step": 1332 + }, + { + "epoch": 0.14, + "grad_norm": 1.5279253204351688, + "learning_rate": 1.942097721854347e-05, + "loss": 0.7851, + "step": 1333 + }, + { + "epoch": 0.14, + "grad_norm": 1.687191963317494, + "learning_rate": 1.9419871949720145e-05, + "loss": 0.8089, + "step": 1334 + }, + { + "epoch": 0.14, + "grad_norm": 1.4740624992934195, + "learning_rate": 1.9418765658517032e-05, + "loss": 0.8128, + "step": 1335 + }, + { + "epoch": 0.14, + "grad_norm": 1.62078576616334, + "learning_rate": 1.9417658345054196e-05, + "loss": 0.7414, + "step": 1336 + }, + { + "epoch": 0.14, + "grad_norm": 1.4588705309609198, + "learning_rate": 1.9416550009451827e-05, + "loss": 0.6019, + "step": 1337 + }, + { + "epoch": 0.14, + "grad_norm": 1.5758310763195056, + "learning_rate": 1.941544065183021e-05, + "loss": 0.7815, + "step": 1338 + }, + { + "epoch": 0.14, + "grad_norm": 1.5708714704534004, + "learning_rate": 1.9414330272309753e-05, + "loss": 0.6871, + "step": 1339 + }, + { + "epoch": 0.14, + "grad_norm": 1.5364979929950109, + "learning_rate": 1.9413218871010964e-05, + "loss": 0.7195, + "step": 1340 + }, + { + "epoch": 0.14, + "grad_norm": 1.7385744275514265, + "learning_rate": 1.9412106448054476e-05, + "loss": 0.8279, + "step": 1341 + }, + { + "epoch": 0.14, + "grad_norm": 1.731954304641849, + "learning_rate": 1.9410993003561027e-05, + "loss": 0.8354, + "step": 1342 + }, + { + "epoch": 0.14, + "grad_norm": 1.6744216970597119, + "learning_rate": 1.940987853765145e-05, + "loss": 0.7414, + "step": 1343 + }, + { + "epoch": 0.14, + "grad_norm": 1.6391277634441834, + "learning_rate": 1.9408763050446716e-05, + "loss": 0.7927, + "step": 1344 + }, + { + "epoch": 0.14, + "grad_norm": 1.7315724333267457, + "learning_rate": 1.9407646542067892e-05, + "loss": 0.8221, + "step": 1345 + }, + { + "epoch": 0.14, + "grad_norm": 1.6881225736292127, + "learning_rate": 1.9406529012636153e-05, + "loss": 0.7886, + "step": 1346 + }, + { + "epoch": 0.14, + "grad_norm": 1.581344862065677, + "learning_rate": 1.940541046227279e-05, + "loss": 0.6217, + "step": 1347 + }, + { + "epoch": 0.14, + "grad_norm": 1.589644667566598, + "learning_rate": 1.9404290891099205e-05, + "loss": 0.8205, + "step": 1348 + }, + { + "epoch": 0.14, + "grad_norm": 1.5314874491401882, + "learning_rate": 1.940317029923691e-05, + "loss": 0.7064, + "step": 1349 + }, + { + "epoch": 0.14, + "grad_norm": 1.6299195296123066, + "learning_rate": 1.9402048686807534e-05, + "loss": 0.6926, + "step": 1350 + }, + { + "epoch": 0.14, + "grad_norm": 1.6570600149638093, + "learning_rate": 1.94009260539328e-05, + "loss": 0.7882, + "step": 1351 + }, + { + "epoch": 0.14, + "grad_norm": 1.4966120899156123, + "learning_rate": 1.939980240073456e-05, + "loss": 0.8876, + "step": 1352 + }, + { + "epoch": 0.14, + "grad_norm": 1.5556698302348833, + "learning_rate": 1.939867772733476e-05, + "loss": 0.731, + "step": 1353 + }, + { + "epoch": 0.14, + "grad_norm": 1.733950508696902, + "learning_rate": 1.9397552033855474e-05, + "loss": 0.8104, + "step": 1354 + }, + { + "epoch": 0.14, + "grad_norm": 1.6408561640363308, + "learning_rate": 1.939642532041888e-05, + "loss": 0.7891, + "step": 1355 + }, + { + "epoch": 0.14, + "grad_norm": 1.6736879527988986, + "learning_rate": 1.9395297587147256e-05, + "loss": 0.8729, + "step": 1356 + }, + { + "epoch": 0.14, + "grad_norm": 1.6896580180672505, + "learning_rate": 1.9394168834163007e-05, + "loss": 0.9141, + "step": 1357 + }, + { + "epoch": 0.14, + "grad_norm": 1.6713377955139828, + "learning_rate": 1.939303906158864e-05, + "loss": 0.8107, + "step": 1358 + }, + { + "epoch": 0.14, + "grad_norm": 1.6503043741757688, + "learning_rate": 1.9391908269546773e-05, + "loss": 0.8003, + "step": 1359 + }, + { + "epoch": 0.14, + "grad_norm": 1.47962070900276, + "learning_rate": 1.9390776458160137e-05, + "loss": 0.6367, + "step": 1360 + }, + { + "epoch": 0.14, + "grad_norm": 1.7702743050628662, + "learning_rate": 1.938964362755157e-05, + "loss": 0.8017, + "step": 1361 + }, + { + "epoch": 0.14, + "grad_norm": 1.7824582581508035, + "learning_rate": 1.9388509777844027e-05, + "loss": 0.8004, + "step": 1362 + }, + { + "epoch": 0.14, + "grad_norm": 1.5668170478152565, + "learning_rate": 1.9387374909160567e-05, + "loss": 0.7007, + "step": 1363 + }, + { + "epoch": 0.14, + "grad_norm": 1.560949917574267, + "learning_rate": 1.9386239021624362e-05, + "loss": 0.7871, + "step": 1364 + }, + { + "epoch": 0.14, + "grad_norm": 1.7083113146742042, + "learning_rate": 1.9385102115358695e-05, + "loss": 0.7337, + "step": 1365 + }, + { + "epoch": 0.14, + "grad_norm": 1.6388928807175198, + "learning_rate": 1.938396419048696e-05, + "loss": 0.7306, + "step": 1366 + }, + { + "epoch": 0.14, + "grad_norm": 1.8150081650050511, + "learning_rate": 1.9382825247132662e-05, + "loss": 0.7636, + "step": 1367 + }, + { + "epoch": 0.14, + "grad_norm": 1.6574189204465755, + "learning_rate": 1.9381685285419417e-05, + "loss": 0.7559, + "step": 1368 + }, + { + "epoch": 0.14, + "grad_norm": 1.6478090214963608, + "learning_rate": 1.9380544305470942e-05, + "loss": 0.868, + "step": 1369 + }, + { + "epoch": 0.14, + "grad_norm": 1.585684010108357, + "learning_rate": 1.937940230741108e-05, + "loss": 0.6967, + "step": 1370 + }, + { + "epoch": 0.14, + "grad_norm": 1.5524270780817286, + "learning_rate": 1.9378259291363774e-05, + "loss": 0.7274, + "step": 1371 + }, + { + "epoch": 0.14, + "grad_norm": 1.7849853674264955, + "learning_rate": 1.9377115257453085e-05, + "loss": 0.7452, + "step": 1372 + }, + { + "epoch": 0.14, + "grad_norm": 1.5268152637742454, + "learning_rate": 1.9375970205803175e-05, + "loss": 0.7099, + "step": 1373 + }, + { + "epoch": 0.14, + "grad_norm": 1.5911324780952263, + "learning_rate": 1.937482413653832e-05, + "loss": 0.8437, + "step": 1374 + }, + { + "epoch": 0.14, + "grad_norm": 1.4835029992058266, + "learning_rate": 1.9373677049782916e-05, + "loss": 0.5901, + "step": 1375 + }, + { + "epoch": 0.14, + "grad_norm": 1.5101360380804878, + "learning_rate": 1.937252894566145e-05, + "loss": 0.7386, + "step": 1376 + }, + { + "epoch": 0.14, + "grad_norm": 1.4448014915967933, + "learning_rate": 1.937137982429854e-05, + "loss": 0.7943, + "step": 1377 + }, + { + "epoch": 0.14, + "grad_norm": 1.5081393545005264, + "learning_rate": 1.9370229685818903e-05, + "loss": 0.663, + "step": 1378 + }, + { + "epoch": 0.14, + "grad_norm": 1.5379845123993552, + "learning_rate": 1.936907853034737e-05, + "loss": 0.7651, + "step": 1379 + }, + { + "epoch": 0.14, + "grad_norm": 1.7229609662472323, + "learning_rate": 1.9367926358008872e-05, + "loss": 0.624, + "step": 1380 + }, + { + "epoch": 0.14, + "grad_norm": 1.5857171073643888, + "learning_rate": 1.936677316892847e-05, + "loss": 0.7273, + "step": 1381 + }, + { + "epoch": 0.14, + "grad_norm": 1.8265782778745185, + "learning_rate": 1.936561896323132e-05, + "loss": 0.831, + "step": 1382 + }, + { + "epoch": 0.14, + "grad_norm": 1.5858906216474162, + "learning_rate": 1.9364463741042694e-05, + "loss": 0.8427, + "step": 1383 + }, + { + "epoch": 0.14, + "grad_norm": 1.6444591766781325, + "learning_rate": 1.936330750248797e-05, + "loss": 0.7696, + "step": 1384 + }, + { + "epoch": 0.14, + "grad_norm": 1.7707256531790037, + "learning_rate": 1.9362150247692646e-05, + "loss": 0.7645, + "step": 1385 + }, + { + "epoch": 0.14, + "grad_norm": 1.510474176362667, + "learning_rate": 1.9360991976782317e-05, + "loss": 0.5463, + "step": 1386 + }, + { + "epoch": 0.14, + "grad_norm": 1.616219735344741, + "learning_rate": 1.93598326898827e-05, + "loss": 0.6828, + "step": 1387 + }, + { + "epoch": 0.14, + "grad_norm": 1.5900808236080102, + "learning_rate": 1.935867238711962e-05, + "loss": 0.7644, + "step": 1388 + }, + { + "epoch": 0.14, + "grad_norm": 1.5447266911351711, + "learning_rate": 1.9357511068619e-05, + "loss": 0.7823, + "step": 1389 + }, + { + "epoch": 0.14, + "grad_norm": 1.443662221459821, + "learning_rate": 1.9356348734506888e-05, + "loss": 0.6705, + "step": 1390 + }, + { + "epoch": 0.14, + "grad_norm": 1.6858155970178428, + "learning_rate": 1.935518538490944e-05, + "loss": 0.7775, + "step": 1391 + }, + { + "epoch": 0.14, + "grad_norm": 1.6260463865702544, + "learning_rate": 1.9354021019952917e-05, + "loss": 0.6997, + "step": 1392 + }, + { + "epoch": 0.14, + "grad_norm": 1.6151462386185187, + "learning_rate": 1.9352855639763693e-05, + "loss": 0.7388, + "step": 1393 + }, + { + "epoch": 0.14, + "grad_norm": 1.6226745231864121, + "learning_rate": 1.935168924446825e-05, + "loss": 0.7637, + "step": 1394 + }, + { + "epoch": 0.14, + "grad_norm": 1.736869739132547, + "learning_rate": 1.935052183419319e-05, + "loss": 0.8231, + "step": 1395 + }, + { + "epoch": 0.14, + "grad_norm": 1.5521494478169477, + "learning_rate": 1.9349353409065203e-05, + "loss": 0.8538, + "step": 1396 + }, + { + "epoch": 0.14, + "grad_norm": 1.4936808683766414, + "learning_rate": 1.9348183969211113e-05, + "loss": 0.7137, + "step": 1397 + }, + { + "epoch": 0.14, + "grad_norm": 1.6405737267346916, + "learning_rate": 1.9347013514757845e-05, + "loss": 0.7905, + "step": 1398 + }, + { + "epoch": 0.14, + "grad_norm": 1.621940251119487, + "learning_rate": 1.934584204583243e-05, + "loss": 0.828, + "step": 1399 + }, + { + "epoch": 0.14, + "grad_norm": 1.6037284742563462, + "learning_rate": 1.934466956256201e-05, + "loss": 0.6796, + "step": 1400 + }, + { + "epoch": 0.14, + "grad_norm": 1.567145940022025, + "learning_rate": 1.9343496065073846e-05, + "loss": 0.7317, + "step": 1401 + }, + { + "epoch": 0.14, + "grad_norm": 1.9771936077668393, + "learning_rate": 1.93423215534953e-05, + "loss": 0.7907, + "step": 1402 + }, + { + "epoch": 0.14, + "grad_norm": 1.6312584772073946, + "learning_rate": 1.934114602795385e-05, + "loss": 0.6874, + "step": 1403 + }, + { + "epoch": 0.14, + "grad_norm": 1.4670109880400073, + "learning_rate": 1.9339969488577074e-05, + "loss": 0.7118, + "step": 1404 + }, + { + "epoch": 0.14, + "grad_norm": 1.5236031544065936, + "learning_rate": 1.933879193549267e-05, + "loss": 0.7507, + "step": 1405 + }, + { + "epoch": 0.14, + "grad_norm": 1.6547095135909067, + "learning_rate": 1.9337613368828443e-05, + "loss": 0.7915, + "step": 1406 + }, + { + "epoch": 0.14, + "grad_norm": 1.6994751897935607, + "learning_rate": 1.9336433788712313e-05, + "loss": 0.7358, + "step": 1407 + }, + { + "epoch": 0.14, + "grad_norm": 1.6317326459822472, + "learning_rate": 1.9335253195272298e-05, + "loss": 0.7229, + "step": 1408 + }, + { + "epoch": 0.14, + "grad_norm": 1.4262997590885145, + "learning_rate": 1.9334071588636537e-05, + "loss": 0.6226, + "step": 1409 + }, + { + "epoch": 0.14, + "grad_norm": 1.650951039532934, + "learning_rate": 1.9332888968933273e-05, + "loss": 0.7597, + "step": 1410 + }, + { + "epoch": 0.14, + "grad_norm": 1.431463986073521, + "learning_rate": 1.933170533629086e-05, + "loss": 0.7399, + "step": 1411 + }, + { + "epoch": 0.14, + "grad_norm": 1.7834899114222995, + "learning_rate": 1.933052069083777e-05, + "loss": 0.7878, + "step": 1412 + }, + { + "epoch": 0.14, + "grad_norm": 1.6821249271791365, + "learning_rate": 1.9329335032702564e-05, + "loss": 0.7578, + "step": 1413 + }, + { + "epoch": 0.14, + "grad_norm": 1.6168283852908498, + "learning_rate": 1.932814836201394e-05, + "loss": 0.8011, + "step": 1414 + }, + { + "epoch": 0.14, + "grad_norm": 1.5598551085056578, + "learning_rate": 1.932696067890069e-05, + "loss": 0.7733, + "step": 1415 + }, + { + "epoch": 0.14, + "grad_norm": 1.4618766523100533, + "learning_rate": 1.9325771983491708e-05, + "loss": 0.6826, + "step": 1416 + }, + { + "epoch": 0.14, + "grad_norm": 1.3812200411588107, + "learning_rate": 1.932458227591602e-05, + "loss": 0.7218, + "step": 1417 + }, + { + "epoch": 0.14, + "grad_norm": 1.5380172482866934, + "learning_rate": 1.9323391556302743e-05, + "loss": 0.8418, + "step": 1418 + }, + { + "epoch": 0.14, + "grad_norm": 1.6412778862474402, + "learning_rate": 1.9322199824781117e-05, + "loss": 0.826, + "step": 1419 + }, + { + "epoch": 0.14, + "grad_norm": 1.5275748605652015, + "learning_rate": 1.9321007081480486e-05, + "loss": 0.7854, + "step": 1420 + }, + { + "epoch": 0.14, + "grad_norm": 1.4810290185183532, + "learning_rate": 1.9319813326530296e-05, + "loss": 0.7001, + "step": 1421 + }, + { + "epoch": 0.14, + "grad_norm": 1.5757069867975266, + "learning_rate": 1.9318618560060117e-05, + "loss": 0.7207, + "step": 1422 + }, + { + "epoch": 0.14, + "grad_norm": 1.4581775676110735, + "learning_rate": 1.931742278219962e-05, + "loss": 0.7473, + "step": 1423 + }, + { + "epoch": 0.14, + "grad_norm": 1.5452482775653402, + "learning_rate": 1.9316225993078586e-05, + "loss": 0.7197, + "step": 1424 + }, + { + "epoch": 0.14, + "grad_norm": 1.5196283043899141, + "learning_rate": 1.9315028192826912e-05, + "loss": 0.7792, + "step": 1425 + }, + { + "epoch": 0.15, + "grad_norm": 1.490523064687223, + "learning_rate": 1.9313829381574594e-05, + "loss": 0.714, + "step": 1426 + }, + { + "epoch": 0.15, + "grad_norm": 1.5986736451592876, + "learning_rate": 1.9312629559451755e-05, + "loss": 0.8192, + "step": 1427 + }, + { + "epoch": 0.15, + "grad_norm": 1.6455596522866387, + "learning_rate": 1.931142872658861e-05, + "loss": 0.8674, + "step": 1428 + }, + { + "epoch": 0.15, + "grad_norm": 1.690847570583257, + "learning_rate": 1.9310226883115488e-05, + "loss": 0.7916, + "step": 1429 + }, + { + "epoch": 0.15, + "grad_norm": 1.7743202508358853, + "learning_rate": 1.9309024029162833e-05, + "loss": 0.5859, + "step": 1430 + }, + { + "epoch": 0.15, + "grad_norm": 1.5466754395817315, + "learning_rate": 1.93078201648612e-05, + "loss": 0.7705, + "step": 1431 + }, + { + "epoch": 0.15, + "grad_norm": 1.5247144984670906, + "learning_rate": 1.9306615290341244e-05, + "loss": 0.7685, + "step": 1432 + }, + { + "epoch": 0.15, + "grad_norm": 1.6077323413863682, + "learning_rate": 1.9305409405733736e-05, + "loss": 0.7231, + "step": 1433 + }, + { + "epoch": 0.15, + "grad_norm": 1.6702697638591097, + "learning_rate": 1.930420251116956e-05, + "loss": 0.7152, + "step": 1434 + }, + { + "epoch": 0.15, + "grad_norm": 1.4667964469637342, + "learning_rate": 1.9302994606779704e-05, + "loss": 0.6875, + "step": 1435 + }, + { + "epoch": 0.15, + "grad_norm": 1.525503816062669, + "learning_rate": 1.930178569269526e-05, + "loss": 0.684, + "step": 1436 + }, + { + "epoch": 0.15, + "grad_norm": 1.8024932861274463, + "learning_rate": 1.9300575769047448e-05, + "loss": 0.7442, + "step": 1437 + }, + { + "epoch": 0.15, + "grad_norm": 1.562052851367724, + "learning_rate": 1.929936483596758e-05, + "loss": 0.7246, + "step": 1438 + }, + { + "epoch": 0.15, + "grad_norm": 1.6614152158765623, + "learning_rate": 1.9298152893587084e-05, + "loss": 0.7837, + "step": 1439 + }, + { + "epoch": 0.15, + "grad_norm": 1.7276983476208452, + "learning_rate": 1.92969399420375e-05, + "loss": 0.732, + "step": 1440 + }, + { + "epoch": 0.15, + "grad_norm": 1.5647712919266812, + "learning_rate": 1.9295725981450473e-05, + "loss": 0.7849, + "step": 1441 + }, + { + "epoch": 0.15, + "grad_norm": 1.6277974927232288, + "learning_rate": 1.9294511011957757e-05, + "loss": 0.7406, + "step": 1442 + }, + { + "epoch": 0.15, + "grad_norm": 1.6582286211160586, + "learning_rate": 1.9293295033691223e-05, + "loss": 0.7728, + "step": 1443 + }, + { + "epoch": 0.15, + "grad_norm": 1.581468131712894, + "learning_rate": 1.9292078046782844e-05, + "loss": 0.7779, + "step": 1444 + }, + { + "epoch": 0.15, + "grad_norm": 1.7121934073770229, + "learning_rate": 1.9290860051364704e-05, + "loss": 0.8609, + "step": 1445 + }, + { + "epoch": 0.15, + "grad_norm": 1.549302609178488, + "learning_rate": 1.9289641047569e-05, + "loss": 0.6958, + "step": 1446 + }, + { + "epoch": 0.15, + "grad_norm": 1.6043051626537128, + "learning_rate": 1.928842103552803e-05, + "loss": 0.7551, + "step": 1447 + }, + { + "epoch": 0.15, + "grad_norm": 1.6362496985196373, + "learning_rate": 1.928720001537421e-05, + "loss": 0.8159, + "step": 1448 + }, + { + "epoch": 0.15, + "grad_norm": 1.8244786394221024, + "learning_rate": 1.928597798724007e-05, + "loss": 0.8213, + "step": 1449 + }, + { + "epoch": 0.15, + "grad_norm": 1.670704556030598, + "learning_rate": 1.9284754951258233e-05, + "loss": 0.8299, + "step": 1450 + }, + { + "epoch": 0.15, + "grad_norm": 1.511410122208224, + "learning_rate": 1.9283530907561445e-05, + "loss": 0.7247, + "step": 1451 + }, + { + "epoch": 0.15, + "grad_norm": 1.6005491815394173, + "learning_rate": 1.9282305856282554e-05, + "loss": 0.7206, + "step": 1452 + }, + { + "epoch": 0.15, + "grad_norm": 1.623120296961341, + "learning_rate": 1.928107979755452e-05, + "loss": 0.669, + "step": 1453 + }, + { + "epoch": 0.15, + "grad_norm": 1.7108125044473872, + "learning_rate": 1.9279852731510417e-05, + "loss": 0.752, + "step": 1454 + }, + { + "epoch": 0.15, + "grad_norm": 1.5740653272856195, + "learning_rate": 1.927862465828342e-05, + "loss": 0.7685, + "step": 1455 + }, + { + "epoch": 0.15, + "grad_norm": 1.5311910116271827, + "learning_rate": 1.9277395578006815e-05, + "loss": 0.7808, + "step": 1456 + }, + { + "epoch": 0.15, + "grad_norm": 1.7568559916243593, + "learning_rate": 1.9276165490814e-05, + "loss": 0.7578, + "step": 1457 + }, + { + "epoch": 0.15, + "grad_norm": 1.6287705974238318, + "learning_rate": 1.927493439683849e-05, + "loss": 0.7119, + "step": 1458 + }, + { + "epoch": 0.15, + "grad_norm": 1.7208541800835253, + "learning_rate": 1.9273702296213893e-05, + "loss": 0.8291, + "step": 1459 + }, + { + "epoch": 0.15, + "grad_norm": 1.5970064188840418, + "learning_rate": 1.9272469189073936e-05, + "loss": 0.7491, + "step": 1460 + }, + { + "epoch": 0.15, + "grad_norm": 1.591605260808786, + "learning_rate": 1.9271235075552454e-05, + "loss": 0.6758, + "step": 1461 + }, + { + "epoch": 0.15, + "grad_norm": 1.5906047294247754, + "learning_rate": 1.926999995578339e-05, + "loss": 0.7827, + "step": 1462 + }, + { + "epoch": 0.15, + "grad_norm": 1.8630511855892309, + "learning_rate": 1.9268763829900798e-05, + "loss": 0.7945, + "step": 1463 + }, + { + "epoch": 0.15, + "grad_norm": 1.793429611608883, + "learning_rate": 1.9267526698038838e-05, + "loss": 0.8592, + "step": 1464 + }, + { + "epoch": 0.15, + "grad_norm": 1.803266576136559, + "learning_rate": 1.9266288560331782e-05, + "loss": 0.7465, + "step": 1465 + }, + { + "epoch": 0.15, + "grad_norm": 1.5244435320341803, + "learning_rate": 1.9265049416914015e-05, + "loss": 0.7006, + "step": 1466 + }, + { + "epoch": 0.15, + "grad_norm": 1.7823386142447417, + "learning_rate": 1.9263809267920023e-05, + "loss": 0.8138, + "step": 1467 + }, + { + "epoch": 0.15, + "grad_norm": 1.546354254225523, + "learning_rate": 1.92625681134844e-05, + "loss": 0.6401, + "step": 1468 + }, + { + "epoch": 0.15, + "grad_norm": 1.5043994110416876, + "learning_rate": 1.9261325953741863e-05, + "loss": 0.8051, + "step": 1469 + }, + { + "epoch": 0.15, + "grad_norm": 1.6999701502405948, + "learning_rate": 1.9260082788827225e-05, + "loss": 0.8232, + "step": 1470 + }, + { + "epoch": 0.15, + "grad_norm": 1.6116709705664414, + "learning_rate": 1.9258838618875407e-05, + "loss": 0.7465, + "step": 1471 + }, + { + "epoch": 0.15, + "grad_norm": 1.5700867235311593, + "learning_rate": 1.9257593444021455e-05, + "loss": 0.6726, + "step": 1472 + }, + { + "epoch": 0.15, + "grad_norm": 1.6404232682439661, + "learning_rate": 1.92563472644005e-05, + "loss": 0.8057, + "step": 1473 + }, + { + "epoch": 0.15, + "grad_norm": 1.609198643059564, + "learning_rate": 1.9255100080147807e-05, + "loss": 0.8679, + "step": 1474 + }, + { + "epoch": 0.15, + "grad_norm": 1.61013255267516, + "learning_rate": 1.9253851891398735e-05, + "loss": 0.732, + "step": 1475 + }, + { + "epoch": 0.15, + "grad_norm": 1.5979956087776157, + "learning_rate": 1.9252602698288752e-05, + "loss": 0.7424, + "step": 1476 + }, + { + "epoch": 0.15, + "grad_norm": 1.5554224156137497, + "learning_rate": 1.925135250095344e-05, + "loss": 0.7008, + "step": 1477 + }, + { + "epoch": 0.15, + "grad_norm": 1.6167967536593908, + "learning_rate": 1.9250101299528495e-05, + "loss": 0.8089, + "step": 1478 + }, + { + "epoch": 0.15, + "grad_norm": 1.6375834796003947, + "learning_rate": 1.9248849094149703e-05, + "loss": 0.7496, + "step": 1479 + }, + { + "epoch": 0.15, + "grad_norm": 1.5682760543403722, + "learning_rate": 1.9247595884952977e-05, + "loss": 0.8036, + "step": 1480 + }, + { + "epoch": 0.15, + "grad_norm": 1.6028509146470817, + "learning_rate": 1.924634167207434e-05, + "loss": 0.7174, + "step": 1481 + }, + { + "epoch": 0.15, + "grad_norm": 1.6240045575203728, + "learning_rate": 1.9245086455649903e-05, + "loss": 0.7451, + "step": 1482 + }, + { + "epoch": 0.15, + "grad_norm": 1.619210407596221, + "learning_rate": 1.9243830235815913e-05, + "loss": 0.7987, + "step": 1483 + }, + { + "epoch": 0.15, + "grad_norm": 1.4698223414285452, + "learning_rate": 1.924257301270871e-05, + "loss": 0.7055, + "step": 1484 + }, + { + "epoch": 0.15, + "grad_norm": 1.5073455061374148, + "learning_rate": 1.924131478646474e-05, + "loss": 0.8317, + "step": 1485 + }, + { + "epoch": 0.15, + "grad_norm": 1.6407134053525472, + "learning_rate": 1.9240055557220573e-05, + "loss": 0.7548, + "step": 1486 + }, + { + "epoch": 0.15, + "grad_norm": 1.8031573777028704, + "learning_rate": 1.9238795325112867e-05, + "loss": 0.7956, + "step": 1487 + }, + { + "epoch": 0.15, + "grad_norm": 1.582598949302052, + "learning_rate": 1.923753409027841e-05, + "loss": 0.7401, + "step": 1488 + }, + { + "epoch": 0.15, + "grad_norm": 1.4703010435992137, + "learning_rate": 1.9236271852854088e-05, + "loss": 0.8035, + "step": 1489 + }, + { + "epoch": 0.15, + "grad_norm": 1.4954150702770912, + "learning_rate": 1.9235008612976897e-05, + "loss": 0.7635, + "step": 1490 + }, + { + "epoch": 0.15, + "grad_norm": 1.9250287248373963, + "learning_rate": 1.9233744370783935e-05, + "loss": 0.7855, + "step": 1491 + }, + { + "epoch": 0.15, + "grad_norm": 1.6460987951004957, + "learning_rate": 1.9232479126412425e-05, + "loss": 0.8874, + "step": 1492 + }, + { + "epoch": 0.15, + "grad_norm": 1.563877453344683, + "learning_rate": 1.9231212879999686e-05, + "loss": 0.8022, + "step": 1493 + }, + { + "epoch": 0.15, + "grad_norm": 1.7544917369182025, + "learning_rate": 1.922994563168315e-05, + "loss": 0.7461, + "step": 1494 + }, + { + "epoch": 0.15, + "grad_norm": 1.6902299666720255, + "learning_rate": 1.9228677381600352e-05, + "loss": 0.7271, + "step": 1495 + }, + { + "epoch": 0.15, + "grad_norm": 1.672354406035893, + "learning_rate": 1.9227408129888945e-05, + "loss": 0.7699, + "step": 1496 + }, + { + "epoch": 0.15, + "grad_norm": 1.396315478816546, + "learning_rate": 1.9226137876686686e-05, + "loss": 0.7364, + "step": 1497 + }, + { + "epoch": 0.15, + "grad_norm": 1.7579916394018387, + "learning_rate": 1.9224866622131442e-05, + "loss": 0.7713, + "step": 1498 + }, + { + "epoch": 0.15, + "grad_norm": 1.5314102029803376, + "learning_rate": 1.9223594366361188e-05, + "loss": 0.8544, + "step": 1499 + }, + { + "epoch": 0.15, + "grad_norm": 1.5980160334993416, + "learning_rate": 1.9222321109514006e-05, + "loss": 0.8614, + "step": 1500 + }, + { + "epoch": 0.15, + "grad_norm": 1.5790504981956814, + "learning_rate": 1.922104685172809e-05, + "loss": 0.6682, + "step": 1501 + }, + { + "epoch": 0.15, + "grad_norm": 1.506221154074456, + "learning_rate": 1.9219771593141736e-05, + "loss": 0.827, + "step": 1502 + }, + { + "epoch": 0.15, + "grad_norm": 1.6994180118450408, + "learning_rate": 1.921849533389336e-05, + "loss": 0.7882, + "step": 1503 + }, + { + "epoch": 0.15, + "grad_norm": 1.5998130523759928, + "learning_rate": 1.9217218074121474e-05, + "loss": 0.6596, + "step": 1504 + }, + { + "epoch": 0.15, + "grad_norm": 1.524449341063304, + "learning_rate": 1.9215939813964707e-05, + "loss": 0.7048, + "step": 1505 + }, + { + "epoch": 0.15, + "grad_norm": 1.5581478522339887, + "learning_rate": 1.9214660553561796e-05, + "loss": 0.7535, + "step": 1506 + }, + { + "epoch": 0.15, + "grad_norm": 1.616475689899037, + "learning_rate": 1.921338029305158e-05, + "loss": 0.8237, + "step": 1507 + }, + { + "epoch": 0.15, + "grad_norm": 1.540256961106526, + "learning_rate": 1.9212099032573017e-05, + "loss": 0.7855, + "step": 1508 + }, + { + "epoch": 0.15, + "grad_norm": 1.6530635115493926, + "learning_rate": 1.9210816772265166e-05, + "loss": 0.7577, + "step": 1509 + }, + { + "epoch": 0.15, + "grad_norm": 1.6069626479514616, + "learning_rate": 1.9209533512267193e-05, + "loss": 0.793, + "step": 1510 + }, + { + "epoch": 0.15, + "grad_norm": 1.726638072569584, + "learning_rate": 1.920824925271838e-05, + "loss": 0.8787, + "step": 1511 + }, + { + "epoch": 0.15, + "grad_norm": 1.622170668313893, + "learning_rate": 1.920696399375811e-05, + "loss": 0.6719, + "step": 1512 + }, + { + "epoch": 0.15, + "grad_norm": 1.5849164974116032, + "learning_rate": 1.9205677735525883e-05, + "loss": 0.7252, + "step": 1513 + }, + { + "epoch": 0.15, + "grad_norm": 1.6194613219944933, + "learning_rate": 1.9204390478161294e-05, + "loss": 0.8939, + "step": 1514 + }, + { + "epoch": 0.15, + "grad_norm": 1.5088878684059277, + "learning_rate": 1.9203102221804062e-05, + "loss": 0.7617, + "step": 1515 + }, + { + "epoch": 0.15, + "grad_norm": 1.7035212373349042, + "learning_rate": 1.9201812966594e-05, + "loss": 0.7593, + "step": 1516 + }, + { + "epoch": 0.15, + "grad_norm": 1.6703938590394398, + "learning_rate": 1.920052271267105e-05, + "loss": 0.6431, + "step": 1517 + }, + { + "epoch": 0.15, + "grad_norm": 1.5769045147068257, + "learning_rate": 1.9199231460175232e-05, + "loss": 0.6884, + "step": 1518 + }, + { + "epoch": 0.15, + "grad_norm": 1.5809397366575757, + "learning_rate": 1.9197939209246697e-05, + "loss": 0.7772, + "step": 1519 + }, + { + "epoch": 0.15, + "grad_norm": 1.6391801398503676, + "learning_rate": 1.9196645960025707e-05, + "loss": 0.7642, + "step": 1520 + }, + { + "epoch": 0.15, + "grad_norm": 1.8299512941006664, + "learning_rate": 1.9195351712652615e-05, + "loss": 0.7433, + "step": 1521 + }, + { + "epoch": 0.15, + "grad_norm": 1.5095893517785566, + "learning_rate": 1.9194056467267893e-05, + "loss": 0.7138, + "step": 1522 + }, + { + "epoch": 0.15, + "grad_norm": 1.6093122134592646, + "learning_rate": 1.919276022401212e-05, + "loss": 0.8123, + "step": 1523 + }, + { + "epoch": 0.16, + "grad_norm": 1.6161664662430497, + "learning_rate": 1.9191462983025984e-05, + "loss": 0.7022, + "step": 1524 + }, + { + "epoch": 0.16, + "grad_norm": 1.761508380087029, + "learning_rate": 1.919016474445028e-05, + "loss": 0.7325, + "step": 1525 + }, + { + "epoch": 0.16, + "grad_norm": 1.6202857694097255, + "learning_rate": 1.9188865508425912e-05, + "loss": 0.7897, + "step": 1526 + }, + { + "epoch": 0.16, + "grad_norm": 1.566351498765691, + "learning_rate": 1.918756527509389e-05, + "loss": 0.7244, + "step": 1527 + }, + { + "epoch": 0.16, + "grad_norm": 1.5388170986926715, + "learning_rate": 1.9186264044595334e-05, + "loss": 0.7408, + "step": 1528 + }, + { + "epoch": 0.16, + "grad_norm": 2.0411475883686667, + "learning_rate": 1.9184961817071474e-05, + "loss": 0.8856, + "step": 1529 + }, + { + "epoch": 0.16, + "grad_norm": 1.537923172996495, + "learning_rate": 1.9183658592663646e-05, + "loss": 0.7629, + "step": 1530 + }, + { + "epoch": 0.16, + "grad_norm": 1.5649059968966519, + "learning_rate": 1.918235437151329e-05, + "loss": 0.8007, + "step": 1531 + }, + { + "epoch": 0.16, + "grad_norm": 1.4841845007351928, + "learning_rate": 1.9181049153761968e-05, + "loss": 0.7615, + "step": 1532 + }, + { + "epoch": 0.16, + "grad_norm": 1.5810852008291814, + "learning_rate": 1.9179742939551334e-05, + "loss": 0.7283, + "step": 1533 + }, + { + "epoch": 0.16, + "grad_norm": 1.6433971912621534, + "learning_rate": 1.917843572902316e-05, + "loss": 0.7719, + "step": 1534 + }, + { + "epoch": 0.16, + "grad_norm": 1.6686771191429626, + "learning_rate": 1.917712752231932e-05, + "loss": 0.7082, + "step": 1535 + }, + { + "epoch": 0.16, + "grad_norm": 1.5992105637180958, + "learning_rate": 1.9175818319581804e-05, + "loss": 0.8333, + "step": 1536 + }, + { + "epoch": 0.16, + "grad_norm": 1.5199822157361877, + "learning_rate": 1.9174508120952703e-05, + "loss": 0.6218, + "step": 1537 + }, + { + "epoch": 0.16, + "grad_norm": 1.696677016341126, + "learning_rate": 1.9173196926574217e-05, + "loss": 0.774, + "step": 1538 + }, + { + "epoch": 0.16, + "grad_norm": 1.5505851914086217, + "learning_rate": 1.9171884736588658e-05, + "loss": 0.8452, + "step": 1539 + }, + { + "epoch": 0.16, + "grad_norm": 1.6196310318726466, + "learning_rate": 1.9170571551138443e-05, + "loss": 0.7345, + "step": 1540 + }, + { + "epoch": 0.16, + "grad_norm": 1.711535964064723, + "learning_rate": 1.91692573703661e-05, + "loss": 0.8005, + "step": 1541 + }, + { + "epoch": 0.16, + "grad_norm": 1.602974087584989, + "learning_rate": 1.916794219441426e-05, + "loss": 0.7703, + "step": 1542 + }, + { + "epoch": 0.16, + "grad_norm": 1.803421607568332, + "learning_rate": 1.9166626023425663e-05, + "loss": 0.7639, + "step": 1543 + }, + { + "epoch": 0.16, + "grad_norm": 1.6301320994866801, + "learning_rate": 1.9165308857543162e-05, + "loss": 0.7741, + "step": 1544 + }, + { + "epoch": 0.16, + "grad_norm": 1.923659935329634, + "learning_rate": 1.9163990696909714e-05, + "loss": 0.83, + "step": 1545 + }, + { + "epoch": 0.16, + "grad_norm": 1.833057279021541, + "learning_rate": 1.9162671541668384e-05, + "loss": 0.8549, + "step": 1546 + }, + { + "epoch": 0.16, + "grad_norm": 1.5499426415563595, + "learning_rate": 1.9161351391962347e-05, + "loss": 0.7802, + "step": 1547 + }, + { + "epoch": 0.16, + "grad_norm": 1.61928782607135, + "learning_rate": 1.9160030247934887e-05, + "loss": 0.8191, + "step": 1548 + }, + { + "epoch": 0.16, + "grad_norm": 1.454897568051772, + "learning_rate": 1.9158708109729387e-05, + "loss": 0.6563, + "step": 1549 + }, + { + "epoch": 0.16, + "grad_norm": 1.6785044647038443, + "learning_rate": 1.9157384977489347e-05, + "loss": 0.7766, + "step": 1550 + }, + { + "epoch": 0.16, + "grad_norm": 1.7302652329541985, + "learning_rate": 1.9156060851358377e-05, + "loss": 0.709, + "step": 1551 + }, + { + "epoch": 0.16, + "grad_norm": 1.4544831620456349, + "learning_rate": 1.9154735731480185e-05, + "loss": 0.6414, + "step": 1552 + }, + { + "epoch": 0.16, + "grad_norm": 1.6795340410621473, + "learning_rate": 1.915340961799859e-05, + "loss": 0.8081, + "step": 1553 + }, + { + "epoch": 0.16, + "grad_norm": 1.4616387300706588, + "learning_rate": 1.9152082511057525e-05, + "loss": 0.8863, + "step": 1554 + }, + { + "epoch": 0.16, + "grad_norm": 1.522043264691204, + "learning_rate": 1.915075441080103e-05, + "loss": 0.7951, + "step": 1555 + }, + { + "epoch": 0.16, + "grad_norm": 1.567166649488013, + "learning_rate": 1.9149425317373242e-05, + "loss": 0.8348, + "step": 1556 + }, + { + "epoch": 0.16, + "grad_norm": 1.651202931472597, + "learning_rate": 1.914809523091842e-05, + "loss": 0.7499, + "step": 1557 + }, + { + "epoch": 0.16, + "grad_norm": 1.6437075117820903, + "learning_rate": 1.9146764151580916e-05, + "loss": 0.7034, + "step": 1558 + }, + { + "epoch": 0.16, + "grad_norm": 1.955079683704659, + "learning_rate": 1.9145432079505205e-05, + "loss": 0.802, + "step": 1559 + }, + { + "epoch": 0.16, + "grad_norm": 1.5064482039439775, + "learning_rate": 1.9144099014835863e-05, + "loss": 0.7824, + "step": 1560 + }, + { + "epoch": 0.16, + "grad_norm": 1.5106371142000838, + "learning_rate": 1.914276495771757e-05, + "loss": 0.6852, + "step": 1561 + }, + { + "epoch": 0.16, + "grad_norm": 1.4133764252186334, + "learning_rate": 1.9141429908295115e-05, + "loss": 0.7393, + "step": 1562 + }, + { + "epoch": 0.16, + "grad_norm": 1.5175862671123022, + "learning_rate": 1.91400938667134e-05, + "loss": 0.652, + "step": 1563 + }, + { + "epoch": 0.16, + "grad_norm": 1.8668591354206043, + "learning_rate": 1.9138756833117433e-05, + "loss": 0.8072, + "step": 1564 + }, + { + "epoch": 0.16, + "grad_norm": 1.7148653540172474, + "learning_rate": 1.9137418807652322e-05, + "loss": 0.7976, + "step": 1565 + }, + { + "epoch": 0.16, + "grad_norm": 1.3971811107739365, + "learning_rate": 1.9136079790463297e-05, + "loss": 0.6894, + "step": 1566 + }, + { + "epoch": 0.16, + "grad_norm": 1.558703271920691, + "learning_rate": 1.913473978169568e-05, + "loss": 0.7824, + "step": 1567 + }, + { + "epoch": 0.16, + "grad_norm": 1.6105428293661352, + "learning_rate": 1.9133398781494915e-05, + "loss": 0.8303, + "step": 1568 + }, + { + "epoch": 0.16, + "grad_norm": 1.443076696573244, + "learning_rate": 1.9132056790006538e-05, + "loss": 0.6699, + "step": 1569 + }, + { + "epoch": 0.16, + "grad_norm": 1.5389324497104773, + "learning_rate": 1.9130713807376208e-05, + "loss": 0.7501, + "step": 1570 + }, + { + "epoch": 0.16, + "grad_norm": 1.418175886216392, + "learning_rate": 1.9129369833749684e-05, + "loss": 0.7416, + "step": 1571 + }, + { + "epoch": 0.16, + "grad_norm": 1.5503745273715697, + "learning_rate": 1.9128024869272827e-05, + "loss": 0.8613, + "step": 1572 + }, + { + "epoch": 0.16, + "grad_norm": 1.5373599502009552, + "learning_rate": 1.912667891409162e-05, + "loss": 0.6855, + "step": 1573 + }, + { + "epoch": 0.16, + "grad_norm": 1.778854879435199, + "learning_rate": 1.9125331968352144e-05, + "loss": 0.838, + "step": 1574 + }, + { + "epoch": 0.16, + "grad_norm": 1.6617599172340474, + "learning_rate": 1.9123984032200586e-05, + "loss": 0.8029, + "step": 1575 + }, + { + "epoch": 0.16, + "grad_norm": 1.5626142896457125, + "learning_rate": 1.9122635105783246e-05, + "loss": 0.7714, + "step": 1576 + }, + { + "epoch": 0.16, + "grad_norm": 1.5953250529453096, + "learning_rate": 1.9121285189246522e-05, + "loss": 0.7895, + "step": 1577 + }, + { + "epoch": 0.16, + "grad_norm": 1.8414569875633697, + "learning_rate": 1.9119934282736937e-05, + "loss": 0.9218, + "step": 1578 + }, + { + "epoch": 0.16, + "grad_norm": 1.4706065042563319, + "learning_rate": 1.91185823864011e-05, + "loss": 0.6972, + "step": 1579 + }, + { + "epoch": 0.16, + "grad_norm": 1.5299081578716949, + "learning_rate": 1.9117229500385747e-05, + "loss": 0.7931, + "step": 1580 + }, + { + "epoch": 0.16, + "grad_norm": 1.3789090369504793, + "learning_rate": 1.9115875624837712e-05, + "loss": 0.7851, + "step": 1581 + }, + { + "epoch": 0.16, + "grad_norm": 1.4746021435408434, + "learning_rate": 1.911452075990393e-05, + "loss": 0.8104, + "step": 1582 + }, + { + "epoch": 0.16, + "grad_norm": 1.6450850823417302, + "learning_rate": 1.9113164905731456e-05, + "loss": 0.7881, + "step": 1583 + }, + { + "epoch": 0.16, + "grad_norm": 1.5010962716226026, + "learning_rate": 1.9111808062467444e-05, + "loss": 0.7363, + "step": 1584 + }, + { + "epoch": 0.16, + "grad_norm": 1.6404627346035592, + "learning_rate": 1.9110450230259164e-05, + "loss": 0.7816, + "step": 1585 + }, + { + "epoch": 0.16, + "grad_norm": 1.6060822901587557, + "learning_rate": 1.910909140925398e-05, + "loss": 0.8176, + "step": 1586 + }, + { + "epoch": 0.16, + "grad_norm": 1.51717087233064, + "learning_rate": 1.910773159959937e-05, + "loss": 0.7383, + "step": 1587 + }, + { + "epoch": 0.16, + "grad_norm": 1.61362295729002, + "learning_rate": 1.9106370801442932e-05, + "loss": 0.7415, + "step": 1588 + }, + { + "epoch": 0.16, + "grad_norm": 1.4683775876196694, + "learning_rate": 1.9105009014932346e-05, + "loss": 0.6588, + "step": 1589 + }, + { + "epoch": 0.16, + "grad_norm": 1.770666499944069, + "learning_rate": 1.9103646240215417e-05, + "loss": 0.7953, + "step": 1590 + }, + { + "epoch": 0.16, + "grad_norm": 1.738033048483459, + "learning_rate": 1.9102282477440054e-05, + "loss": 0.7054, + "step": 1591 + }, + { + "epoch": 0.16, + "grad_norm": 1.5252116881641011, + "learning_rate": 1.9100917726754274e-05, + "loss": 0.8349, + "step": 1592 + }, + { + "epoch": 0.16, + "grad_norm": 1.628771819360274, + "learning_rate": 1.9099551988306196e-05, + "loss": 0.7939, + "step": 1593 + }, + { + "epoch": 0.16, + "grad_norm": 1.5997324728092637, + "learning_rate": 1.9098185262244052e-05, + "loss": 0.7922, + "step": 1594 + }, + { + "epoch": 0.16, + "grad_norm": 1.577128392588311, + "learning_rate": 1.9096817548716176e-05, + "loss": 0.7836, + "step": 1595 + }, + { + "epoch": 0.16, + "grad_norm": 1.7620820504603598, + "learning_rate": 1.9095448847871017e-05, + "loss": 0.6764, + "step": 1596 + }, + { + "epoch": 0.16, + "grad_norm": 1.4430637543169647, + "learning_rate": 1.909407915985712e-05, + "loss": 0.7046, + "step": 1597 + }, + { + "epoch": 0.16, + "grad_norm": 1.5525390305375857, + "learning_rate": 1.9092708484823146e-05, + "loss": 0.778, + "step": 1598 + }, + { + "epoch": 0.16, + "grad_norm": 1.6033162887065793, + "learning_rate": 1.9091336822917858e-05, + "loss": 0.734, + "step": 1599 + }, + { + "epoch": 0.16, + "grad_norm": 1.373049548183506, + "learning_rate": 1.908996417429013e-05, + "loss": 0.7226, + "step": 1600 + }, + { + "epoch": 0.16, + "grad_norm": 1.515664936780327, + "learning_rate": 1.9088590539088944e-05, + "loss": 0.8194, + "step": 1601 + }, + { + "epoch": 0.16, + "grad_norm": 1.676295501712208, + "learning_rate": 1.9087215917463388e-05, + "loss": 0.7845, + "step": 1602 + }, + { + "epoch": 0.16, + "grad_norm": 1.7349913040217095, + "learning_rate": 1.9085840309562647e-05, + "loss": 0.7594, + "step": 1603 + }, + { + "epoch": 0.16, + "grad_norm": 1.4118887464545329, + "learning_rate": 1.9084463715536028e-05, + "loss": 0.7238, + "step": 1604 + }, + { + "epoch": 0.16, + "grad_norm": 1.5514927958243494, + "learning_rate": 1.908308613553294e-05, + "loss": 0.8336, + "step": 1605 + }, + { + "epoch": 0.16, + "grad_norm": 1.7937303832748122, + "learning_rate": 1.9081707569702895e-05, + "loss": 0.8283, + "step": 1606 + }, + { + "epoch": 0.16, + "grad_norm": 1.6048560592421042, + "learning_rate": 1.9080328018195512e-05, + "loss": 0.9193, + "step": 1607 + }, + { + "epoch": 0.16, + "grad_norm": 1.5251342315015357, + "learning_rate": 1.9078947481160525e-05, + "loss": 0.7064, + "step": 1608 + }, + { + "epoch": 0.16, + "grad_norm": 1.5915408336271724, + "learning_rate": 1.907756595874777e-05, + "loss": 0.8293, + "step": 1609 + }, + { + "epoch": 0.16, + "grad_norm": 1.5361474379210114, + "learning_rate": 1.9076183451107185e-05, + "loss": 0.7543, + "step": 1610 + }, + { + "epoch": 0.16, + "grad_norm": 1.6348641221810145, + "learning_rate": 1.9074799958388824e-05, + "loss": 0.8217, + "step": 1611 + }, + { + "epoch": 0.16, + "grad_norm": 1.6079219437209098, + "learning_rate": 1.907341548074284e-05, + "loss": 0.8163, + "step": 1612 + }, + { + "epoch": 0.16, + "grad_norm": 1.5291210741782348, + "learning_rate": 1.9072030018319498e-05, + "loss": 0.8774, + "step": 1613 + }, + { + "epoch": 0.16, + "grad_norm": 1.6444665406797316, + "learning_rate": 1.9070643571269168e-05, + "loss": 0.7758, + "step": 1614 + }, + { + "epoch": 0.16, + "grad_norm": 1.4431843276318772, + "learning_rate": 1.906925613974233e-05, + "loss": 0.6653, + "step": 1615 + }, + { + "epoch": 0.16, + "grad_norm": 1.567521467193932, + "learning_rate": 1.9067867723889563e-05, + "loss": 0.8517, + "step": 1616 + }, + { + "epoch": 0.16, + "grad_norm": 1.5714406321988241, + "learning_rate": 1.9066478323861558e-05, + "loss": 0.7835, + "step": 1617 + }, + { + "epoch": 0.16, + "grad_norm": 1.5959228661639966, + "learning_rate": 1.906508793980912e-05, + "loss": 0.7141, + "step": 1618 + }, + { + "epoch": 0.16, + "grad_norm": 1.4847876302277403, + "learning_rate": 1.9063696571883145e-05, + "loss": 0.7785, + "step": 1619 + }, + { + "epoch": 0.16, + "grad_norm": 1.5503075253886878, + "learning_rate": 1.9062304220234648e-05, + "loss": 0.7935, + "step": 1620 + }, + { + "epoch": 0.16, + "grad_norm": 1.6556741264913504, + "learning_rate": 1.9060910885014745e-05, + "loss": 0.6974, + "step": 1621 + }, + { + "epoch": 0.16, + "grad_norm": 1.4290189452587512, + "learning_rate": 1.9059516566374662e-05, + "loss": 0.6936, + "step": 1622 + }, + { + "epoch": 0.17, + "grad_norm": 1.5478177845334817, + "learning_rate": 1.9058121264465733e-05, + "loss": 0.7785, + "step": 1623 + }, + { + "epoch": 0.17, + "grad_norm": 1.5658364846871906, + "learning_rate": 1.9056724979439395e-05, + "loss": 0.8645, + "step": 1624 + }, + { + "epoch": 0.17, + "grad_norm": 1.5477169944101723, + "learning_rate": 1.9055327711447188e-05, + "loss": 0.7858, + "step": 1625 + }, + { + "epoch": 0.17, + "grad_norm": 1.4657396785160544, + "learning_rate": 1.905392946064077e-05, + "loss": 0.7117, + "step": 1626 + }, + { + "epoch": 0.17, + "grad_norm": 1.5708545119734512, + "learning_rate": 1.90525302271719e-05, + "loss": 0.7021, + "step": 1627 + }, + { + "epoch": 0.17, + "grad_norm": 1.6476076431039985, + "learning_rate": 1.9051130011192432e-05, + "loss": 0.7394, + "step": 1628 + }, + { + "epoch": 0.17, + "grad_norm": 1.4400462499040607, + "learning_rate": 1.904972881285435e-05, + "loss": 0.8131, + "step": 1629 + }, + { + "epoch": 0.17, + "grad_norm": 1.6174406456486523, + "learning_rate": 1.9048326632309725e-05, + "loss": 0.7308, + "step": 1630 + }, + { + "epoch": 0.17, + "grad_norm": 1.5193393003943425, + "learning_rate": 1.9046923469710745e-05, + "loss": 0.7133, + "step": 1631 + }, + { + "epoch": 0.17, + "grad_norm": 1.7774121870566917, + "learning_rate": 1.90455193252097e-05, + "loss": 0.8559, + "step": 1632 + }, + { + "epoch": 0.17, + "grad_norm": 1.526356917860105, + "learning_rate": 1.904411419895899e-05, + "loss": 0.6728, + "step": 1633 + }, + { + "epoch": 0.17, + "grad_norm": 1.869316998443361, + "learning_rate": 1.9042708091111118e-05, + "loss": 0.8109, + "step": 1634 + }, + { + "epoch": 0.17, + "grad_norm": 1.6306679434589662, + "learning_rate": 1.9041301001818694e-05, + "loss": 0.7945, + "step": 1635 + }, + { + "epoch": 0.17, + "grad_norm": 1.757448536351054, + "learning_rate": 1.9039892931234434e-05, + "loss": 0.7686, + "step": 1636 + }, + { + "epoch": 0.17, + "grad_norm": 1.544620177922369, + "learning_rate": 1.9038483879511166e-05, + "loss": 0.6817, + "step": 1637 + }, + { + "epoch": 0.17, + "grad_norm": 1.7468142129124404, + "learning_rate": 1.903707384680182e-05, + "loss": 0.7455, + "step": 1638 + }, + { + "epoch": 0.17, + "grad_norm": 1.6848255747831735, + "learning_rate": 1.9035662833259433e-05, + "loss": 0.8196, + "step": 1639 + }, + { + "epoch": 0.17, + "grad_norm": 1.5329377292764257, + "learning_rate": 1.9034250839037144e-05, + "loss": 0.7906, + "step": 1640 + }, + { + "epoch": 0.17, + "grad_norm": 1.5415068985564901, + "learning_rate": 1.903283786428821e-05, + "loss": 0.6799, + "step": 1641 + }, + { + "epoch": 0.17, + "grad_norm": 1.4666601543044273, + "learning_rate": 1.903142390916598e-05, + "loss": 0.7268, + "step": 1642 + }, + { + "epoch": 0.17, + "grad_norm": 1.575405053556528, + "learning_rate": 1.903000897382392e-05, + "loss": 0.7831, + "step": 1643 + }, + { + "epoch": 0.17, + "grad_norm": 1.6124679892613942, + "learning_rate": 1.9028593058415604e-05, + "loss": 0.7744, + "step": 1644 + }, + { + "epoch": 0.17, + "grad_norm": 1.4810041986168434, + "learning_rate": 1.90271761630947e-05, + "loss": 0.7654, + "step": 1645 + }, + { + "epoch": 0.17, + "grad_norm": 1.5817930470576074, + "learning_rate": 1.9025758288014992e-05, + "loss": 0.7471, + "step": 1646 + }, + { + "epoch": 0.17, + "grad_norm": 1.7806556950470092, + "learning_rate": 1.9024339433330374e-05, + "loss": 0.7575, + "step": 1647 + }, + { + "epoch": 0.17, + "grad_norm": 1.7388350084069517, + "learning_rate": 1.902291959919483e-05, + "loss": 0.7252, + "step": 1648 + }, + { + "epoch": 0.17, + "grad_norm": 1.6049185515811917, + "learning_rate": 1.9021498785762465e-05, + "loss": 0.71, + "step": 1649 + }, + { + "epoch": 0.17, + "grad_norm": 1.5496687398901938, + "learning_rate": 1.902007699318749e-05, + "loss": 0.8395, + "step": 1650 + }, + { + "epoch": 0.17, + "grad_norm": 1.5859664455327087, + "learning_rate": 1.9018654221624215e-05, + "loss": 0.8465, + "step": 1651 + }, + { + "epoch": 0.17, + "grad_norm": 1.6860476567662297, + "learning_rate": 1.9017230471227065e-05, + "loss": 0.8093, + "step": 1652 + }, + { + "epoch": 0.17, + "grad_norm": 1.5255507157557995, + "learning_rate": 1.9015805742150555e-05, + "loss": 0.7641, + "step": 1653 + }, + { + "epoch": 0.17, + "grad_norm": 1.5576595287920239, + "learning_rate": 1.9014380034549326e-05, + "loss": 0.7399, + "step": 1654 + }, + { + "epoch": 0.17, + "grad_norm": 1.5716277986521965, + "learning_rate": 1.901295334857811e-05, + "loss": 0.7475, + "step": 1655 + }, + { + "epoch": 0.17, + "grad_norm": 1.5940348916804257, + "learning_rate": 1.901152568439176e-05, + "loss": 0.8035, + "step": 1656 + }, + { + "epoch": 0.17, + "grad_norm": 1.652806276890107, + "learning_rate": 1.9010097042145217e-05, + "loss": 0.7669, + "step": 1657 + }, + { + "epoch": 0.17, + "grad_norm": 1.5652094056502321, + "learning_rate": 1.9008667421993542e-05, + "loss": 0.7551, + "step": 1658 + }, + { + "epoch": 0.17, + "grad_norm": 1.4934518396928138, + "learning_rate": 1.9007236824091902e-05, + "loss": 0.8183, + "step": 1659 + }, + { + "epoch": 0.17, + "grad_norm": 1.4623729202198175, + "learning_rate": 1.9005805248595558e-05, + "loss": 0.7898, + "step": 1660 + }, + { + "epoch": 0.17, + "grad_norm": 1.572320496115777, + "learning_rate": 1.9004372695659893e-05, + "loss": 0.7187, + "step": 1661 + }, + { + "epoch": 0.17, + "grad_norm": 1.5236632692717045, + "learning_rate": 1.9002939165440382e-05, + "loss": 0.7906, + "step": 1662 + }, + { + "epoch": 0.17, + "grad_norm": 1.6567306846632446, + "learning_rate": 1.9001504658092614e-05, + "loss": 0.7019, + "step": 1663 + }, + { + "epoch": 0.17, + "grad_norm": 1.7033509945844678, + "learning_rate": 1.900006917377229e-05, + "loss": 0.8409, + "step": 1664 + }, + { + "epoch": 0.17, + "grad_norm": 1.5112174633108457, + "learning_rate": 1.8998632712635197e-05, + "loss": 0.6675, + "step": 1665 + }, + { + "epoch": 0.17, + "grad_norm": 1.6077579566638796, + "learning_rate": 1.8997195274837246e-05, + "loss": 0.7659, + "step": 1666 + }, + { + "epoch": 0.17, + "grad_norm": 1.6200552954296608, + "learning_rate": 1.8995756860534447e-05, + "loss": 0.7704, + "step": 1667 + }, + { + "epoch": 0.17, + "grad_norm": 1.7219390404142412, + "learning_rate": 1.899431746988292e-05, + "loss": 0.6929, + "step": 1668 + }, + { + "epoch": 0.17, + "grad_norm": 1.5052480569089988, + "learning_rate": 1.8992877103038888e-05, + "loss": 0.7769, + "step": 1669 + }, + { + "epoch": 0.17, + "grad_norm": 1.7106222019790918, + "learning_rate": 1.8991435760158678e-05, + "loss": 0.7939, + "step": 1670 + }, + { + "epoch": 0.17, + "grad_norm": 1.6093050462228884, + "learning_rate": 1.8989993441398725e-05, + "loss": 0.7808, + "step": 1671 + }, + { + "epoch": 0.17, + "grad_norm": 1.6456229403185043, + "learning_rate": 1.8988550146915577e-05, + "loss": 0.7685, + "step": 1672 + }, + { + "epoch": 0.17, + "grad_norm": 1.6032421563969286, + "learning_rate": 1.8987105876865875e-05, + "loss": 0.769, + "step": 1673 + }, + { + "epoch": 0.17, + "grad_norm": 1.583807363251475, + "learning_rate": 1.898566063140637e-05, + "loss": 0.7032, + "step": 1674 + }, + { + "epoch": 0.17, + "grad_norm": 1.493731079727449, + "learning_rate": 1.898421441069392e-05, + "loss": 0.7107, + "step": 1675 + }, + { + "epoch": 0.17, + "grad_norm": 1.5749307014664113, + "learning_rate": 1.89827672148855e-05, + "loss": 0.8638, + "step": 1676 + }, + { + "epoch": 0.17, + "grad_norm": 1.6771481017193612, + "learning_rate": 1.898131904413817e-05, + "loss": 0.7711, + "step": 1677 + }, + { + "epoch": 0.17, + "grad_norm": 1.4599515555865261, + "learning_rate": 1.897986989860911e-05, + "loss": 0.7718, + "step": 1678 + }, + { + "epoch": 0.17, + "grad_norm": 1.748238922772858, + "learning_rate": 1.8978419778455604e-05, + "loss": 0.7038, + "step": 1679 + }, + { + "epoch": 0.17, + "grad_norm": 1.7105558563254843, + "learning_rate": 1.8976968683835035e-05, + "loss": 0.7343, + "step": 1680 + }, + { + "epoch": 0.17, + "grad_norm": 1.53133673055251, + "learning_rate": 1.89755166149049e-05, + "loss": 0.7529, + "step": 1681 + }, + { + "epoch": 0.17, + "grad_norm": 1.5783781823685694, + "learning_rate": 1.8974063571822802e-05, + "loss": 0.8669, + "step": 1682 + }, + { + "epoch": 0.17, + "grad_norm": 1.657796324266244, + "learning_rate": 1.8972609554746438e-05, + "loss": 0.8415, + "step": 1683 + }, + { + "epoch": 0.17, + "grad_norm": 1.864844220133073, + "learning_rate": 1.8971154563833624e-05, + "loss": 0.7533, + "step": 1684 + }, + { + "epoch": 0.17, + "grad_norm": 1.6854041586824038, + "learning_rate": 1.896969859924227e-05, + "loss": 0.7676, + "step": 1685 + }, + { + "epoch": 0.17, + "grad_norm": 1.5650090207778384, + "learning_rate": 1.896824166113041e-05, + "loss": 0.8221, + "step": 1686 + }, + { + "epoch": 0.17, + "grad_norm": 1.7722938808951092, + "learning_rate": 1.8966783749656162e-05, + "loss": 0.6842, + "step": 1687 + }, + { + "epoch": 0.17, + "grad_norm": 1.4914344699971838, + "learning_rate": 1.8965324864977766e-05, + "loss": 0.6963, + "step": 1688 + }, + { + "epoch": 0.17, + "grad_norm": 1.5272008983944905, + "learning_rate": 1.8963865007253557e-05, + "loss": 0.8026, + "step": 1689 + }, + { + "epoch": 0.17, + "grad_norm": 1.482223785650939, + "learning_rate": 1.8962404176641976e-05, + "loss": 0.8048, + "step": 1690 + }, + { + "epoch": 0.17, + "grad_norm": 1.5593816026928582, + "learning_rate": 1.8960942373301584e-05, + "loss": 0.6894, + "step": 1691 + }, + { + "epoch": 0.17, + "grad_norm": 1.5617523696611526, + "learning_rate": 1.895947959739103e-05, + "loss": 0.8203, + "step": 1692 + }, + { + "epoch": 0.17, + "grad_norm": 1.814077619268731, + "learning_rate": 1.8958015849069074e-05, + "loss": 0.8205, + "step": 1693 + }, + { + "epoch": 0.17, + "grad_norm": 1.6045784749385548, + "learning_rate": 1.8956551128494583e-05, + "loss": 0.7982, + "step": 1694 + }, + { + "epoch": 0.17, + "grad_norm": 1.7260765769995472, + "learning_rate": 1.8955085435826535e-05, + "loss": 0.6918, + "step": 1695 + }, + { + "epoch": 0.17, + "grad_norm": 1.5504805127465078, + "learning_rate": 1.8953618771224003e-05, + "loss": 0.7217, + "step": 1696 + }, + { + "epoch": 0.17, + "grad_norm": 1.720269284682822, + "learning_rate": 1.895215113484618e-05, + "loss": 0.9367, + "step": 1697 + }, + { + "epoch": 0.17, + "grad_norm": 1.5714592051560299, + "learning_rate": 1.895068252685234e-05, + "loss": 0.7853, + "step": 1698 + }, + { + "epoch": 0.17, + "grad_norm": 1.7019456115766567, + "learning_rate": 1.8949212947401884e-05, + "loss": 0.8011, + "step": 1699 + }, + { + "epoch": 0.17, + "grad_norm": 1.884939067526523, + "learning_rate": 1.8947742396654318e-05, + "loss": 0.8446, + "step": 1700 + }, + { + "epoch": 0.17, + "grad_norm": 1.6279141034164122, + "learning_rate": 1.8946270874769234e-05, + "loss": 0.6527, + "step": 1701 + }, + { + "epoch": 0.17, + "grad_norm": 1.530506408713068, + "learning_rate": 1.8944798381906356e-05, + "loss": 0.7613, + "step": 1702 + }, + { + "epoch": 0.17, + "grad_norm": 1.6168212969929507, + "learning_rate": 1.8943324918225495e-05, + "loss": 0.7585, + "step": 1703 + }, + { + "epoch": 0.17, + "grad_norm": 1.6534578465353755, + "learning_rate": 1.894185048388657e-05, + "loss": 0.7905, + "step": 1704 + }, + { + "epoch": 0.17, + "grad_norm": 1.5523776193348853, + "learning_rate": 1.894037507904961e-05, + "loss": 0.8521, + "step": 1705 + }, + { + "epoch": 0.17, + "grad_norm": 1.551937987120993, + "learning_rate": 1.8938898703874747e-05, + "loss": 0.8972, + "step": 1706 + }, + { + "epoch": 0.17, + "grad_norm": 1.69364613713285, + "learning_rate": 1.893742135852222e-05, + "loss": 0.767, + "step": 1707 + }, + { + "epoch": 0.17, + "grad_norm": 1.6210580238903587, + "learning_rate": 1.893594304315237e-05, + "loss": 0.6865, + "step": 1708 + }, + { + "epoch": 0.17, + "grad_norm": 1.451273066156775, + "learning_rate": 1.8934463757925642e-05, + "loss": 0.7857, + "step": 1709 + }, + { + "epoch": 0.17, + "grad_norm": 1.683278709031471, + "learning_rate": 1.8932983503002598e-05, + "loss": 0.7911, + "step": 1710 + }, + { + "epoch": 0.17, + "grad_norm": 1.5290904299097812, + "learning_rate": 1.8931502278543887e-05, + "loss": 0.6458, + "step": 1711 + }, + { + "epoch": 0.17, + "grad_norm": 1.5786586347808211, + "learning_rate": 1.8930020084710276e-05, + "loss": 0.727, + "step": 1712 + }, + { + "epoch": 0.17, + "grad_norm": 1.4938119669135754, + "learning_rate": 1.8928536921662637e-05, + "loss": 0.6879, + "step": 1713 + }, + { + "epoch": 0.17, + "grad_norm": 1.5091020987453612, + "learning_rate": 1.892705278956194e-05, + "loss": 0.6316, + "step": 1714 + }, + { + "epoch": 0.17, + "grad_norm": 1.5694048656046364, + "learning_rate": 1.892556768856927e-05, + "loss": 0.8505, + "step": 1715 + }, + { + "epoch": 0.17, + "grad_norm": 1.4532551755755019, + "learning_rate": 1.8924081618845804e-05, + "loss": 0.6874, + "step": 1716 + }, + { + "epoch": 0.17, + "grad_norm": 1.4379667556058273, + "learning_rate": 1.892259458055284e-05, + "loss": 0.6783, + "step": 1717 + }, + { + "epoch": 0.17, + "grad_norm": 1.5265403642367932, + "learning_rate": 1.892110657385176e-05, + "loss": 0.7493, + "step": 1718 + }, + { + "epoch": 0.17, + "grad_norm": 1.5479382570256124, + "learning_rate": 1.891961759890408e-05, + "loss": 0.7337, + "step": 1719 + }, + { + "epoch": 0.17, + "grad_norm": 1.559889119676712, + "learning_rate": 1.891812765587139e-05, + "loss": 0.8697, + "step": 1720 + }, + { + "epoch": 0.18, + "grad_norm": 1.593218655173124, + "learning_rate": 1.891663674491541e-05, + "loss": 0.7259, + "step": 1721 + }, + { + "epoch": 0.18, + "grad_norm": 1.4592818229059854, + "learning_rate": 1.8915144866197954e-05, + "loss": 0.7734, + "step": 1722 + }, + { + "epoch": 0.18, + "grad_norm": 1.648324646449686, + "learning_rate": 1.8913652019880938e-05, + "loss": 0.8242, + "step": 1723 + }, + { + "epoch": 0.18, + "grad_norm": 1.4729500894677385, + "learning_rate": 1.8912158206126386e-05, + "loss": 0.6893, + "step": 1724 + }, + { + "epoch": 0.18, + "grad_norm": 1.6400655969345412, + "learning_rate": 1.8910663425096435e-05, + "loss": 0.8344, + "step": 1725 + }, + { + "epoch": 0.18, + "grad_norm": 1.497654805250299, + "learning_rate": 1.8909167676953317e-05, + "loss": 0.7566, + "step": 1726 + }, + { + "epoch": 0.18, + "grad_norm": 1.6871982575991455, + "learning_rate": 1.890767096185937e-05, + "loss": 0.7531, + "step": 1727 + }, + { + "epoch": 0.18, + "grad_norm": 1.5138489141587188, + "learning_rate": 1.8906173279977042e-05, + "loss": 0.6804, + "step": 1728 + }, + { + "epoch": 0.18, + "grad_norm": 1.6122397020645634, + "learning_rate": 1.890467463146888e-05, + "loss": 0.804, + "step": 1729 + }, + { + "epoch": 0.18, + "grad_norm": 1.6764245244750045, + "learning_rate": 1.890317501649754e-05, + "loss": 0.8541, + "step": 1730 + }, + { + "epoch": 0.18, + "grad_norm": 1.5251612599991828, + "learning_rate": 1.8901674435225784e-05, + "loss": 0.7366, + "step": 1731 + }, + { + "epoch": 0.18, + "grad_norm": 1.719713507717278, + "learning_rate": 1.890017288781647e-05, + "loss": 0.7156, + "step": 1732 + }, + { + "epoch": 0.18, + "grad_norm": 1.5326149533609268, + "learning_rate": 1.889867037443258e-05, + "loss": 0.8467, + "step": 1733 + }, + { + "epoch": 0.18, + "grad_norm": 1.686840873303595, + "learning_rate": 1.8897166895237172e-05, + "loss": 0.7844, + "step": 1734 + }, + { + "epoch": 0.18, + "grad_norm": 1.6371997798982931, + "learning_rate": 1.889566245039344e-05, + "loss": 0.7474, + "step": 1735 + }, + { + "epoch": 0.18, + "grad_norm": 1.611010263494007, + "learning_rate": 1.8894157040064657e-05, + "loss": 0.6348, + "step": 1736 + }, + { + "epoch": 0.18, + "grad_norm": 1.6232898835545204, + "learning_rate": 1.889265066441422e-05, + "loss": 0.7371, + "step": 1737 + }, + { + "epoch": 0.18, + "grad_norm": 1.6149901594670062, + "learning_rate": 1.8891143323605618e-05, + "loss": 0.7951, + "step": 1738 + }, + { + "epoch": 0.18, + "grad_norm": 1.4939178979273533, + "learning_rate": 1.888963501780245e-05, + "loss": 0.7611, + "step": 1739 + }, + { + "epoch": 0.18, + "grad_norm": 1.569457200983501, + "learning_rate": 1.8888125747168418e-05, + "loss": 0.7955, + "step": 1740 + }, + { + "epoch": 0.18, + "grad_norm": 1.7213407027487273, + "learning_rate": 1.8886615511867334e-05, + "loss": 0.607, + "step": 1741 + }, + { + "epoch": 0.18, + "grad_norm": 1.5760183368741416, + "learning_rate": 1.88851043120631e-05, + "loss": 0.8016, + "step": 1742 + }, + { + "epoch": 0.18, + "grad_norm": 1.7611761834273227, + "learning_rate": 1.888359214791975e-05, + "loss": 0.8343, + "step": 1743 + }, + { + "epoch": 0.18, + "grad_norm": 1.5535745253840973, + "learning_rate": 1.888207901960139e-05, + "loss": 0.7834, + "step": 1744 + }, + { + "epoch": 0.18, + "grad_norm": 1.6699865529676274, + "learning_rate": 1.8880564927272255e-05, + "loss": 0.7145, + "step": 1745 + }, + { + "epoch": 0.18, + "grad_norm": 1.5987541438124577, + "learning_rate": 1.887904987109667e-05, + "loss": 0.8974, + "step": 1746 + }, + { + "epoch": 0.18, + "grad_norm": 1.5848509287747299, + "learning_rate": 1.887753385123908e-05, + "loss": 0.8937, + "step": 1747 + }, + { + "epoch": 0.18, + "grad_norm": 1.5965473543295567, + "learning_rate": 1.8876016867864012e-05, + "loss": 0.643, + "step": 1748 + }, + { + "epoch": 0.18, + "grad_norm": 1.474355510667469, + "learning_rate": 1.8874498921136128e-05, + "loss": 0.7083, + "step": 1749 + }, + { + "epoch": 0.18, + "grad_norm": 1.6473044578139473, + "learning_rate": 1.887298001122016e-05, + "loss": 0.8028, + "step": 1750 + }, + { + "epoch": 0.18, + "grad_norm": 1.6837953191941109, + "learning_rate": 1.8871460138280972e-05, + "loss": 0.6944, + "step": 1751 + }, + { + "epoch": 0.18, + "grad_norm": 1.6999399008927014, + "learning_rate": 1.8869939302483523e-05, + "loss": 0.7527, + "step": 1752 + }, + { + "epoch": 0.18, + "grad_norm": 1.580640208318216, + "learning_rate": 1.8868417503992868e-05, + "loss": 0.7821, + "step": 1753 + }, + { + "epoch": 0.18, + "grad_norm": 1.7608845706069354, + "learning_rate": 1.8866894742974185e-05, + "loss": 0.735, + "step": 1754 + }, + { + "epoch": 0.18, + "grad_norm": 1.783871397792197, + "learning_rate": 1.8865371019592734e-05, + "loss": 0.6882, + "step": 1755 + }, + { + "epoch": 0.18, + "grad_norm": 4.106234520745424, + "learning_rate": 1.8863846334013903e-05, + "loss": 0.9047, + "step": 1756 + }, + { + "epoch": 0.18, + "grad_norm": 1.6261963712487317, + "learning_rate": 1.8862320686403163e-05, + "loss": 0.8233, + "step": 1757 + }, + { + "epoch": 0.18, + "grad_norm": 1.5592849660065935, + "learning_rate": 1.886079407692611e-05, + "loss": 0.7271, + "step": 1758 + }, + { + "epoch": 0.18, + "grad_norm": 1.696634596163249, + "learning_rate": 1.8859266505748422e-05, + "loss": 0.8247, + "step": 1759 + }, + { + "epoch": 0.18, + "grad_norm": 1.5583376186257532, + "learning_rate": 1.88577379730359e-05, + "loss": 0.753, + "step": 1760 + }, + { + "epoch": 0.18, + "grad_norm": 1.6329960586391725, + "learning_rate": 1.885620847895444e-05, + "loss": 0.8578, + "step": 1761 + }, + { + "epoch": 0.18, + "grad_norm": 1.669480616922605, + "learning_rate": 1.885467802367005e-05, + "loss": 0.7505, + "step": 1762 + }, + { + "epoch": 0.18, + "grad_norm": 1.657482857375443, + "learning_rate": 1.8853146607348824e-05, + "loss": 0.769, + "step": 1763 + }, + { + "epoch": 0.18, + "grad_norm": 1.5154300129562621, + "learning_rate": 1.8851614230156984e-05, + "loss": 0.749, + "step": 1764 + }, + { + "epoch": 0.18, + "grad_norm": 1.6318632109908695, + "learning_rate": 1.8850080892260844e-05, + "loss": 0.7529, + "step": 1765 + }, + { + "epoch": 0.18, + "grad_norm": 1.7001848525559118, + "learning_rate": 1.8848546593826827e-05, + "loss": 0.8489, + "step": 1766 + }, + { + "epoch": 0.18, + "grad_norm": 1.589134711427665, + "learning_rate": 1.8847011335021447e-05, + "loss": 0.848, + "step": 1767 + }, + { + "epoch": 0.18, + "grad_norm": 1.5385234932969856, + "learning_rate": 1.884547511601134e-05, + "loss": 0.7197, + "step": 1768 + }, + { + "epoch": 0.18, + "grad_norm": 1.5689035447257587, + "learning_rate": 1.884393793696324e-05, + "loss": 0.7098, + "step": 1769 + }, + { + "epoch": 0.18, + "grad_norm": 1.6426536233783764, + "learning_rate": 1.884239979804398e-05, + "loss": 0.7266, + "step": 1770 + }, + { + "epoch": 0.18, + "grad_norm": 1.677291392580847, + "learning_rate": 1.8840860699420497e-05, + "loss": 0.8098, + "step": 1771 + }, + { + "epoch": 0.18, + "grad_norm": 1.7249955607522947, + "learning_rate": 1.8839320641259844e-05, + "loss": 0.8916, + "step": 1772 + }, + { + "epoch": 0.18, + "grad_norm": 1.6209383073724422, + "learning_rate": 1.8837779623729167e-05, + "loss": 0.8117, + "step": 1773 + }, + { + "epoch": 0.18, + "grad_norm": 1.593803315763351, + "learning_rate": 1.8836237646995717e-05, + "loss": 0.8198, + "step": 1774 + }, + { + "epoch": 0.18, + "grad_norm": 1.6593476747026599, + "learning_rate": 1.883469471122686e-05, + "loss": 0.721, + "step": 1775 + }, + { + "epoch": 0.18, + "grad_norm": 1.5354092753657136, + "learning_rate": 1.8833150816590042e-05, + "loss": 0.774, + "step": 1776 + }, + { + "epoch": 0.18, + "grad_norm": 1.4254979394593619, + "learning_rate": 1.8831605963252845e-05, + "loss": 0.6664, + "step": 1777 + }, + { + "epoch": 0.18, + "grad_norm": 1.7161205041025949, + "learning_rate": 1.883006015138293e-05, + "loss": 0.8563, + "step": 1778 + }, + { + "epoch": 0.18, + "grad_norm": 1.7088321727886966, + "learning_rate": 1.8828513381148072e-05, + "loss": 0.8043, + "step": 1779 + }, + { + "epoch": 0.18, + "grad_norm": 1.5486816559944219, + "learning_rate": 1.8826965652716148e-05, + "loss": 0.7855, + "step": 1780 + }, + { + "epoch": 0.18, + "grad_norm": 1.3385051756675685, + "learning_rate": 1.882541696625514e-05, + "loss": 0.7483, + "step": 1781 + }, + { + "epoch": 0.18, + "grad_norm": 1.498188058539235, + "learning_rate": 1.8823867321933135e-05, + "loss": 0.7488, + "step": 1782 + }, + { + "epoch": 0.18, + "grad_norm": 1.736701746116013, + "learning_rate": 1.882231671991832e-05, + "loss": 0.805, + "step": 1783 + }, + { + "epoch": 0.18, + "grad_norm": 1.7067130208010808, + "learning_rate": 1.8820765160378996e-05, + "loss": 0.7416, + "step": 1784 + }, + { + "epoch": 0.18, + "grad_norm": 1.5944026426589217, + "learning_rate": 1.881921264348355e-05, + "loss": 0.863, + "step": 1785 + }, + { + "epoch": 0.18, + "grad_norm": 1.63057640894758, + "learning_rate": 1.8817659169400493e-05, + "loss": 0.8167, + "step": 1786 + }, + { + "epoch": 0.18, + "grad_norm": 1.4701737654650993, + "learning_rate": 1.8816104738298425e-05, + "loss": 0.7732, + "step": 1787 + }, + { + "epoch": 0.18, + "grad_norm": 1.7659716645257424, + "learning_rate": 1.8814549350346052e-05, + "loss": 0.8413, + "step": 1788 + }, + { + "epoch": 0.18, + "grad_norm": 1.6703190187917394, + "learning_rate": 1.8812993005712196e-05, + "loss": 0.8808, + "step": 1789 + }, + { + "epoch": 0.18, + "grad_norm": 1.5804918076804337, + "learning_rate": 1.881143570456577e-05, + "loss": 0.8396, + "step": 1790 + }, + { + "epoch": 0.18, + "grad_norm": 1.502015060915155, + "learning_rate": 1.880987744707579e-05, + "loss": 0.7767, + "step": 1791 + }, + { + "epoch": 0.18, + "grad_norm": 1.5100557185000367, + "learning_rate": 1.8808318233411384e-05, + "loss": 0.837, + "step": 1792 + }, + { + "epoch": 0.18, + "grad_norm": 1.4980699724605326, + "learning_rate": 1.880675806374178e-05, + "loss": 0.8265, + "step": 1793 + }, + { + "epoch": 0.18, + "grad_norm": 1.6145024202929508, + "learning_rate": 1.880519693823631e-05, + "loss": 0.8522, + "step": 1794 + }, + { + "epoch": 0.18, + "grad_norm": 1.7060963762719894, + "learning_rate": 1.8803634857064413e-05, + "loss": 0.6858, + "step": 1795 + }, + { + "epoch": 0.18, + "grad_norm": 1.66699583908313, + "learning_rate": 1.8802071820395626e-05, + "loss": 0.8265, + "step": 1796 + }, + { + "epoch": 0.18, + "grad_norm": 1.5088819587635958, + "learning_rate": 1.880050782839959e-05, + "loss": 0.8256, + "step": 1797 + }, + { + "epoch": 0.18, + "grad_norm": 1.8248098806211075, + "learning_rate": 1.8798942881246057e-05, + "loss": 0.7441, + "step": 1798 + }, + { + "epoch": 0.18, + "grad_norm": 1.497823717757942, + "learning_rate": 1.8797376979104874e-05, + "loss": 0.6212, + "step": 1799 + }, + { + "epoch": 0.18, + "grad_norm": 1.5313709881390287, + "learning_rate": 1.879581012214599e-05, + "loss": 0.7585, + "step": 1800 + }, + { + "epoch": 0.18, + "grad_norm": 1.6533852967630702, + "learning_rate": 1.8794242310539475e-05, + "loss": 0.706, + "step": 1801 + }, + { + "epoch": 0.18, + "grad_norm": 1.57322286965956, + "learning_rate": 1.879267354445548e-05, + "loss": 0.7315, + "step": 1802 + }, + { + "epoch": 0.18, + "grad_norm": 1.4688226723378812, + "learning_rate": 1.8791103824064275e-05, + "loss": 0.6824, + "step": 1803 + }, + { + "epoch": 0.18, + "grad_norm": 1.6606304786948904, + "learning_rate": 1.8789533149536227e-05, + "loss": 0.774, + "step": 1804 + }, + { + "epoch": 0.18, + "grad_norm": 1.4170226955215965, + "learning_rate": 1.8787961521041808e-05, + "loss": 0.7616, + "step": 1805 + }, + { + "epoch": 0.18, + "grad_norm": 1.6319528691892806, + "learning_rate": 1.8786388938751593e-05, + "loss": 0.7277, + "step": 1806 + }, + { + "epoch": 0.18, + "grad_norm": 1.7368883126240444, + "learning_rate": 1.8784815402836264e-05, + "loss": 0.7857, + "step": 1807 + }, + { + "epoch": 0.18, + "grad_norm": 1.6956446906840685, + "learning_rate": 1.87832409134666e-05, + "loss": 0.8243, + "step": 1808 + }, + { + "epoch": 0.18, + "grad_norm": 1.5661135765652143, + "learning_rate": 1.8781665470813493e-05, + "loss": 0.7309, + "step": 1809 + }, + { + "epoch": 0.18, + "grad_norm": 1.6488344290849406, + "learning_rate": 1.878008907504793e-05, + "loss": 0.741, + "step": 1810 + }, + { + "epoch": 0.18, + "grad_norm": 1.748004174675965, + "learning_rate": 1.8778511726341e-05, + "loss": 0.812, + "step": 1811 + }, + { + "epoch": 0.18, + "grad_norm": 1.6500412921236254, + "learning_rate": 1.8776933424863903e-05, + "loss": 0.75, + "step": 1812 + }, + { + "epoch": 0.18, + "grad_norm": 1.5783811495293991, + "learning_rate": 1.8775354170787938e-05, + "loss": 0.7833, + "step": 1813 + }, + { + "epoch": 0.18, + "grad_norm": 1.6438030612284624, + "learning_rate": 1.8773773964284512e-05, + "loss": 0.8352, + "step": 1814 + }, + { + "epoch": 0.18, + "grad_norm": 1.7254538324598188, + "learning_rate": 1.8772192805525125e-05, + "loss": 0.7913, + "step": 1815 + }, + { + "epoch": 0.18, + "grad_norm": 1.4609001559387844, + "learning_rate": 1.8770610694681393e-05, + "loss": 0.7164, + "step": 1816 + }, + { + "epoch": 0.18, + "grad_norm": 1.5638189485710678, + "learning_rate": 1.8769027631925027e-05, + "loss": 0.741, + "step": 1817 + }, + { + "epoch": 0.18, + "grad_norm": 1.5970980794499663, + "learning_rate": 1.876744361742784e-05, + "loss": 0.752, + "step": 1818 + }, + { + "epoch": 0.19, + "grad_norm": 1.3691889410857434, + "learning_rate": 1.876585865136176e-05, + "loss": 0.7374, + "step": 1819 + }, + { + "epoch": 0.19, + "grad_norm": 1.7025448759231918, + "learning_rate": 1.8764272733898808e-05, + "loss": 0.7367, + "step": 1820 + }, + { + "epoch": 0.19, + "grad_norm": 1.6286596973353795, + "learning_rate": 1.8762685865211106e-05, + "loss": 0.7936, + "step": 1821 + }, + { + "epoch": 0.19, + "grad_norm": 1.8002277034115608, + "learning_rate": 1.8761098045470887e-05, + "loss": 0.8227, + "step": 1822 + }, + { + "epoch": 0.19, + "grad_norm": 1.6901918456825955, + "learning_rate": 1.875950927485048e-05, + "loss": 0.7645, + "step": 1823 + }, + { + "epoch": 0.19, + "grad_norm": 1.5603243747675015, + "learning_rate": 1.875791955352233e-05, + "loss": 0.7471, + "step": 1824 + }, + { + "epoch": 0.19, + "grad_norm": 1.638513369552182, + "learning_rate": 1.8756328881658968e-05, + "loss": 0.86, + "step": 1825 + }, + { + "epoch": 0.19, + "grad_norm": 1.4728899465231686, + "learning_rate": 1.875473725943304e-05, + "loss": 0.7207, + "step": 1826 + }, + { + "epoch": 0.19, + "grad_norm": 1.6494863709917857, + "learning_rate": 1.8753144687017292e-05, + "loss": 0.755, + "step": 1827 + }, + { + "epoch": 0.19, + "grad_norm": 1.4154536873183075, + "learning_rate": 1.8751551164584568e-05, + "loss": 0.7739, + "step": 1828 + }, + { + "epoch": 0.19, + "grad_norm": 1.58099440095728, + "learning_rate": 1.8749956692307825e-05, + "loss": 0.7177, + "step": 1829 + }, + { + "epoch": 0.19, + "grad_norm": 1.526424937636552, + "learning_rate": 1.874836127036012e-05, + "loss": 0.6252, + "step": 1830 + }, + { + "epoch": 0.19, + "grad_norm": 1.7385896248430377, + "learning_rate": 1.874676489891461e-05, + "loss": 0.75, + "step": 1831 + }, + { + "epoch": 0.19, + "grad_norm": 1.6335448597236224, + "learning_rate": 1.8745167578144552e-05, + "loss": 0.8474, + "step": 1832 + }, + { + "epoch": 0.19, + "grad_norm": 1.5534145226465848, + "learning_rate": 1.8743569308223312e-05, + "loss": 0.7913, + "step": 1833 + }, + { + "epoch": 0.19, + "grad_norm": 1.5581638165153364, + "learning_rate": 1.8741970089324357e-05, + "loss": 0.6453, + "step": 1834 + }, + { + "epoch": 0.19, + "grad_norm": 1.6011044528243155, + "learning_rate": 1.874036992162126e-05, + "loss": 0.7914, + "step": 1835 + }, + { + "epoch": 0.19, + "grad_norm": 1.768395282832198, + "learning_rate": 1.873876880528769e-05, + "loss": 0.7309, + "step": 1836 + }, + { + "epoch": 0.19, + "grad_norm": 1.5493951226784552, + "learning_rate": 1.8737166740497427e-05, + "loss": 0.7553, + "step": 1837 + }, + { + "epoch": 0.19, + "grad_norm": 1.5239649939874127, + "learning_rate": 1.873556372742435e-05, + "loss": 0.7589, + "step": 1838 + }, + { + "epoch": 0.19, + "grad_norm": 1.634050189294019, + "learning_rate": 1.8733959766242435e-05, + "loss": 0.897, + "step": 1839 + }, + { + "epoch": 0.19, + "grad_norm": 1.6207931351883975, + "learning_rate": 1.8732354857125773e-05, + "loss": 0.814, + "step": 1840 + }, + { + "epoch": 0.19, + "grad_norm": 1.507728990025924, + "learning_rate": 1.873074900024855e-05, + "loss": 0.6828, + "step": 1841 + }, + { + "epoch": 0.19, + "grad_norm": 1.5626504219753878, + "learning_rate": 1.8729142195785057e-05, + "loss": 0.8172, + "step": 1842 + }, + { + "epoch": 0.19, + "grad_norm": 1.6739510888151425, + "learning_rate": 1.8727534443909686e-05, + "loss": 0.7512, + "step": 1843 + }, + { + "epoch": 0.19, + "grad_norm": 1.5598058732542022, + "learning_rate": 1.872592574479694e-05, + "loss": 0.7194, + "step": 1844 + }, + { + "epoch": 0.19, + "grad_norm": 1.6005303628723884, + "learning_rate": 1.8724316098621405e-05, + "loss": 0.7972, + "step": 1845 + }, + { + "epoch": 0.19, + "grad_norm": 1.600654059697692, + "learning_rate": 1.872270550555779e-05, + "loss": 0.9066, + "step": 1846 + }, + { + "epoch": 0.19, + "grad_norm": 1.5518927935254088, + "learning_rate": 1.872109396578091e-05, + "loss": 0.6793, + "step": 1847 + }, + { + "epoch": 0.19, + "grad_norm": 1.5799506697248105, + "learning_rate": 1.8719481479465657e-05, + "loss": 0.6856, + "step": 1848 + }, + { + "epoch": 0.19, + "grad_norm": 1.6159705842129906, + "learning_rate": 1.8717868046787046e-05, + "loss": 0.8038, + "step": 1849 + }, + { + "epoch": 0.19, + "grad_norm": 1.4684918042317172, + "learning_rate": 1.871625366792019e-05, + "loss": 0.6801, + "step": 1850 + }, + { + "epoch": 0.19, + "grad_norm": 1.6412423287402154, + "learning_rate": 1.8714638343040306e-05, + "loss": 0.7523, + "step": 1851 + }, + { + "epoch": 0.19, + "grad_norm": 1.6437306891728323, + "learning_rate": 1.871302207232271e-05, + "loss": 0.7218, + "step": 1852 + }, + { + "epoch": 0.19, + "grad_norm": 1.4601714173931502, + "learning_rate": 1.871140485594283e-05, + "loss": 0.7227, + "step": 1853 + }, + { + "epoch": 0.19, + "grad_norm": 1.6513526696248817, + "learning_rate": 1.8709786694076178e-05, + "loss": 0.8656, + "step": 1854 + }, + { + "epoch": 0.19, + "grad_norm": 1.4853149556610323, + "learning_rate": 1.870816758689839e-05, + "loss": 0.7308, + "step": 1855 + }, + { + "epoch": 0.19, + "grad_norm": 1.6802466029990593, + "learning_rate": 1.870654753458519e-05, + "loss": 0.8212, + "step": 1856 + }, + { + "epoch": 0.19, + "grad_norm": 1.8500457105225216, + "learning_rate": 1.8704926537312408e-05, + "loss": 0.8009, + "step": 1857 + }, + { + "epoch": 0.19, + "grad_norm": 1.5972154532993916, + "learning_rate": 1.8703304595255983e-05, + "loss": 0.6746, + "step": 1858 + }, + { + "epoch": 0.19, + "grad_norm": 1.6662701350599647, + "learning_rate": 1.8701681708591947e-05, + "loss": 0.8041, + "step": 1859 + }, + { + "epoch": 0.19, + "grad_norm": 1.5730426403183335, + "learning_rate": 1.8700057877496443e-05, + "loss": 0.9202, + "step": 1860 + }, + { + "epoch": 0.19, + "grad_norm": 1.4563945916902619, + "learning_rate": 1.8698433102145706e-05, + "loss": 0.7369, + "step": 1861 + }, + { + "epoch": 0.19, + "grad_norm": 1.4702139366238451, + "learning_rate": 1.8696807382716085e-05, + "loss": 0.7116, + "step": 1862 + }, + { + "epoch": 0.19, + "grad_norm": 1.5621107416457056, + "learning_rate": 1.869518071938403e-05, + "loss": 0.7758, + "step": 1863 + }, + { + "epoch": 0.19, + "grad_norm": 1.5895626444726687, + "learning_rate": 1.8693553112326084e-05, + "loss": 0.8106, + "step": 1864 + }, + { + "epoch": 0.19, + "grad_norm": 1.7607266030575186, + "learning_rate": 1.8691924561718897e-05, + "loss": 0.844, + "step": 1865 + }, + { + "epoch": 0.19, + "grad_norm": 1.4933064378757956, + "learning_rate": 1.8690295067739226e-05, + "loss": 0.7138, + "step": 1866 + }, + { + "epoch": 0.19, + "grad_norm": 1.712265806011729, + "learning_rate": 1.8688664630563928e-05, + "loss": 0.8637, + "step": 1867 + }, + { + "epoch": 0.19, + "grad_norm": 1.6019635887840478, + "learning_rate": 1.8687033250369955e-05, + "loss": 0.7653, + "step": 1868 + }, + { + "epoch": 0.19, + "grad_norm": 1.612276802897441, + "learning_rate": 1.8685400927334377e-05, + "loss": 0.8336, + "step": 1869 + }, + { + "epoch": 0.19, + "grad_norm": 1.670180632071749, + "learning_rate": 1.8683767661634354e-05, + "loss": 0.8118, + "step": 1870 + }, + { + "epoch": 0.19, + "grad_norm": 1.3743265251696855, + "learning_rate": 1.8682133453447147e-05, + "loss": 0.82, + "step": 1871 + }, + { + "epoch": 0.19, + "grad_norm": 1.683875985160119, + "learning_rate": 1.868049830295013e-05, + "loss": 0.9025, + "step": 1872 + }, + { + "epoch": 0.19, + "grad_norm": 1.6648127443357852, + "learning_rate": 1.867886221032077e-05, + "loss": 0.8204, + "step": 1873 + }, + { + "epoch": 0.19, + "grad_norm": 1.6458275270596272, + "learning_rate": 1.8677225175736636e-05, + "loss": 0.7923, + "step": 1874 + }, + { + "epoch": 0.19, + "grad_norm": 1.5987414009006289, + "learning_rate": 1.8675587199375407e-05, + "loss": 0.6926, + "step": 1875 + }, + { + "epoch": 0.19, + "grad_norm": 1.801676353452884, + "learning_rate": 1.8673948281414857e-05, + "loss": 0.8069, + "step": 1876 + }, + { + "epoch": 0.19, + "grad_norm": 1.5174117497935236, + "learning_rate": 1.867230842203287e-05, + "loss": 0.6795, + "step": 1877 + }, + { + "epoch": 0.19, + "grad_norm": 1.447289592777005, + "learning_rate": 1.8670667621407423e-05, + "loss": 0.7696, + "step": 1878 + }, + { + "epoch": 0.19, + "grad_norm": 1.543875258571506, + "learning_rate": 1.8669025879716597e-05, + "loss": 0.7519, + "step": 1879 + }, + { + "epoch": 0.19, + "grad_norm": 1.6415775747071424, + "learning_rate": 1.866738319713858e-05, + "loss": 0.7268, + "step": 1880 + }, + { + "epoch": 0.19, + "grad_norm": 1.7785108692805012, + "learning_rate": 1.8665739573851662e-05, + "loss": 0.7699, + "step": 1881 + }, + { + "epoch": 0.19, + "grad_norm": 1.7857674261403411, + "learning_rate": 1.866409501003423e-05, + "loss": 0.8229, + "step": 1882 + }, + { + "epoch": 0.19, + "grad_norm": 1.9847860216254964, + "learning_rate": 1.8662449505864776e-05, + "loss": 0.7704, + "step": 1883 + }, + { + "epoch": 0.19, + "grad_norm": 1.7227834593591063, + "learning_rate": 1.8660803061521894e-05, + "loss": 0.7932, + "step": 1884 + }, + { + "epoch": 0.19, + "grad_norm": 1.6255355103927502, + "learning_rate": 1.865915567718428e-05, + "loss": 0.682, + "step": 1885 + }, + { + "epoch": 0.19, + "grad_norm": 1.5679640202331104, + "learning_rate": 1.865750735303073e-05, + "loss": 0.812, + "step": 1886 + }, + { + "epoch": 0.19, + "grad_norm": 1.5657907851284936, + "learning_rate": 1.8655858089240143e-05, + "loss": 0.7679, + "step": 1887 + }, + { + "epoch": 0.19, + "grad_norm": 1.3940385718601986, + "learning_rate": 1.8654207885991527e-05, + "loss": 0.6367, + "step": 1888 + }, + { + "epoch": 0.19, + "grad_norm": 1.570281967703175, + "learning_rate": 1.865255674346398e-05, + "loss": 0.7764, + "step": 1889 + }, + { + "epoch": 0.19, + "grad_norm": 1.6446730773550722, + "learning_rate": 1.8650904661836707e-05, + "loss": 0.7509, + "step": 1890 + }, + { + "epoch": 0.19, + "grad_norm": 1.7466286168189615, + "learning_rate": 1.864925164128902e-05, + "loss": 0.7999, + "step": 1891 + }, + { + "epoch": 0.19, + "grad_norm": 1.5994431575638575, + "learning_rate": 1.864759768200033e-05, + "loss": 0.7346, + "step": 1892 + }, + { + "epoch": 0.19, + "grad_norm": 1.6980032406179928, + "learning_rate": 1.864594278415014e-05, + "loss": 0.8921, + "step": 1893 + }, + { + "epoch": 0.19, + "grad_norm": 1.6641202788730192, + "learning_rate": 1.864428694791807e-05, + "loss": 0.7413, + "step": 1894 + }, + { + "epoch": 0.19, + "grad_norm": 1.4534612756573948, + "learning_rate": 1.8642630173483832e-05, + "loss": 0.7873, + "step": 1895 + }, + { + "epoch": 0.19, + "grad_norm": 1.6806786748699862, + "learning_rate": 1.8640972461027246e-05, + "loss": 0.6926, + "step": 1896 + }, + { + "epoch": 0.19, + "grad_norm": 1.7567742933825417, + "learning_rate": 1.863931381072823e-05, + "loss": 0.806, + "step": 1897 + }, + { + "epoch": 0.19, + "grad_norm": 1.5208842194437813, + "learning_rate": 1.8637654222766802e-05, + "loss": 0.7438, + "step": 1898 + }, + { + "epoch": 0.19, + "grad_norm": 1.5697337897387504, + "learning_rate": 1.8635993697323086e-05, + "loss": 0.8382, + "step": 1899 + }, + { + "epoch": 0.19, + "grad_norm": 1.5281416666058911, + "learning_rate": 1.8634332234577307e-05, + "loss": 0.7687, + "step": 1900 + }, + { + "epoch": 0.19, + "grad_norm": 1.5959906743154828, + "learning_rate": 1.8632669834709787e-05, + "loss": 0.7747, + "step": 1901 + }, + { + "epoch": 0.19, + "grad_norm": 1.553663965084361, + "learning_rate": 1.8631006497900957e-05, + "loss": 0.7009, + "step": 1902 + }, + { + "epoch": 0.19, + "grad_norm": 1.5721802721916782, + "learning_rate": 1.862934222433135e-05, + "loss": 0.7484, + "step": 1903 + }, + { + "epoch": 0.19, + "grad_norm": 1.523293199579708, + "learning_rate": 1.8627677014181586e-05, + "loss": 0.6594, + "step": 1904 + }, + { + "epoch": 0.19, + "grad_norm": 1.615424841576138, + "learning_rate": 1.862601086763241e-05, + "loss": 0.7287, + "step": 1905 + }, + { + "epoch": 0.19, + "grad_norm": 1.4620235693583123, + "learning_rate": 1.8624343784864644e-05, + "loss": 0.6631, + "step": 1906 + }, + { + "epoch": 0.19, + "grad_norm": 1.609922327362643, + "learning_rate": 1.8622675766059232e-05, + "loss": 0.6958, + "step": 1907 + }, + { + "epoch": 0.19, + "grad_norm": 1.7415547413413075, + "learning_rate": 1.862100681139721e-05, + "loss": 0.8045, + "step": 1908 + }, + { + "epoch": 0.19, + "grad_norm": 1.642979877233161, + "learning_rate": 1.861933692105972e-05, + "loss": 0.8172, + "step": 1909 + }, + { + "epoch": 0.19, + "grad_norm": 1.5667632726209306, + "learning_rate": 1.8617666095227994e-05, + "loss": 0.7247, + "step": 1910 + }, + { + "epoch": 0.19, + "grad_norm": 1.4583686501782427, + "learning_rate": 1.8615994334083377e-05, + "loss": 0.81, + "step": 1911 + }, + { + "epoch": 0.19, + "grad_norm": 1.4396267855158282, + "learning_rate": 1.8614321637807315e-05, + "loss": 0.7329, + "step": 1912 + }, + { + "epoch": 0.19, + "grad_norm": 1.6642235119219537, + "learning_rate": 1.8612648006581354e-05, + "loss": 0.8072, + "step": 1913 + }, + { + "epoch": 0.19, + "grad_norm": 1.754213812581633, + "learning_rate": 1.861097344058714e-05, + "loss": 0.8149, + "step": 1914 + }, + { + "epoch": 0.19, + "grad_norm": 1.6761790132590595, + "learning_rate": 1.8609297940006418e-05, + "loss": 0.8206, + "step": 1915 + }, + { + "epoch": 0.19, + "grad_norm": 1.560191341344239, + "learning_rate": 1.8607621505021035e-05, + "loss": 0.7154, + "step": 1916 + }, + { + "epoch": 0.19, + "grad_norm": 1.4918257116316898, + "learning_rate": 1.860594413581295e-05, + "loss": 0.7098, + "step": 1917 + }, + { + "epoch": 0.2, + "grad_norm": 1.5958708627476872, + "learning_rate": 1.860426583256421e-05, + "loss": 0.8249, + "step": 1918 + }, + { + "epoch": 0.2, + "grad_norm": 1.4247394782586766, + "learning_rate": 1.8602586595456974e-05, + "loss": 0.8818, + "step": 1919 + }, + { + "epoch": 0.2, + "grad_norm": 1.7522243225471708, + "learning_rate": 1.8600906424673487e-05, + "loss": 0.8321, + "step": 1920 + }, + { + "epoch": 0.2, + "grad_norm": 1.5649963909661708, + "learning_rate": 1.8599225320396113e-05, + "loss": 0.7594, + "step": 1921 + }, + { + "epoch": 0.2, + "grad_norm": 1.6616037284068703, + "learning_rate": 1.8597543282807303e-05, + "loss": 0.808, + "step": 1922 + }, + { + "epoch": 0.2, + "grad_norm": 1.6696455666594916, + "learning_rate": 1.8595860312089625e-05, + "loss": 0.7538, + "step": 1923 + }, + { + "epoch": 0.2, + "grad_norm": 1.6723167707677005, + "learning_rate": 1.859417640842573e-05, + "loss": 0.768, + "step": 1924 + }, + { + "epoch": 0.2, + "grad_norm": 1.6174241472922297, + "learning_rate": 1.859249157199839e-05, + "loss": 0.7958, + "step": 1925 + }, + { + "epoch": 0.2, + "grad_norm": 1.4890001345357784, + "learning_rate": 1.8590805802990458e-05, + "loss": 0.797, + "step": 1926 + }, + { + "epoch": 0.2, + "grad_norm": 1.5636449533652899, + "learning_rate": 1.8589119101584902e-05, + "loss": 0.7467, + "step": 1927 + }, + { + "epoch": 0.2, + "grad_norm": 1.7449087745880458, + "learning_rate": 1.8587431467964784e-05, + "loss": 0.8519, + "step": 1928 + }, + { + "epoch": 0.2, + "grad_norm": 1.6217747543112395, + "learning_rate": 1.8585742902313274e-05, + "loss": 0.7736, + "step": 1929 + }, + { + "epoch": 0.2, + "grad_norm": 1.5118301003504073, + "learning_rate": 1.858405340481364e-05, + "loss": 0.6824, + "step": 1930 + }, + { + "epoch": 0.2, + "grad_norm": 1.4925272292019434, + "learning_rate": 1.8582362975649245e-05, + "loss": 0.678, + "step": 1931 + }, + { + "epoch": 0.2, + "grad_norm": 1.6573874004589289, + "learning_rate": 1.8580671615003566e-05, + "loss": 0.8158, + "step": 1932 + }, + { + "epoch": 0.2, + "grad_norm": 1.5882708936318732, + "learning_rate": 1.8578979323060164e-05, + "loss": 0.703, + "step": 1933 + }, + { + "epoch": 0.2, + "grad_norm": 1.747937980589991, + "learning_rate": 1.8577286100002723e-05, + "loss": 0.8432, + "step": 1934 + }, + { + "epoch": 0.2, + "grad_norm": 1.5880321948553646, + "learning_rate": 1.8575591946015006e-05, + "loss": 0.8102, + "step": 1935 + }, + { + "epoch": 0.2, + "grad_norm": 1.67263280842425, + "learning_rate": 1.8573896861280893e-05, + "loss": 0.8489, + "step": 1936 + }, + { + "epoch": 0.2, + "grad_norm": 1.616183957035531, + "learning_rate": 1.8572200845984352e-05, + "loss": 0.6341, + "step": 1937 + }, + { + "epoch": 0.2, + "grad_norm": 1.6906920012154358, + "learning_rate": 1.857050390030947e-05, + "loss": 0.8366, + "step": 1938 + }, + { + "epoch": 0.2, + "grad_norm": 1.5695251712587637, + "learning_rate": 1.8568806024440415e-05, + "loss": 0.6616, + "step": 1939 + }, + { + "epoch": 0.2, + "grad_norm": 1.531032808164511, + "learning_rate": 1.8567107218561463e-05, + "loss": 0.7847, + "step": 1940 + }, + { + "epoch": 0.2, + "grad_norm": 1.5605863172120555, + "learning_rate": 1.8565407482857e-05, + "loss": 0.7061, + "step": 1941 + }, + { + "epoch": 0.2, + "grad_norm": 1.6497185295835817, + "learning_rate": 1.85637068175115e-05, + "loss": 0.7621, + "step": 1942 + }, + { + "epoch": 0.2, + "grad_norm": 1.6176729022633565, + "learning_rate": 1.856200522270955e-05, + "loss": 0.7856, + "step": 1943 + }, + { + "epoch": 0.2, + "grad_norm": 1.9537736544492905, + "learning_rate": 1.856030269863583e-05, + "loss": 0.88, + "step": 1944 + }, + { + "epoch": 0.2, + "grad_norm": 1.5430187767275976, + "learning_rate": 1.8558599245475115e-05, + "loss": 0.6248, + "step": 1945 + }, + { + "epoch": 0.2, + "grad_norm": 1.3672398031794841, + "learning_rate": 1.8556894863412297e-05, + "loss": 0.6678, + "step": 1946 + }, + { + "epoch": 0.2, + "grad_norm": 1.6851967538360875, + "learning_rate": 1.8555189552632353e-05, + "loss": 0.7628, + "step": 1947 + }, + { + "epoch": 0.2, + "grad_norm": 1.308925529168711, + "learning_rate": 1.8553483313320372e-05, + "loss": 0.7125, + "step": 1948 + }, + { + "epoch": 0.2, + "grad_norm": 1.5482666375476917, + "learning_rate": 1.855177614566154e-05, + "loss": 0.7105, + "step": 1949 + }, + { + "epoch": 0.2, + "grad_norm": 1.6451222261319363, + "learning_rate": 1.8550068049841143e-05, + "loss": 0.754, + "step": 1950 + }, + { + "epoch": 0.2, + "grad_norm": 1.530070822063962, + "learning_rate": 1.8548359026044567e-05, + "loss": 0.8445, + "step": 1951 + }, + { + "epoch": 0.2, + "grad_norm": 1.6759016729134624, + "learning_rate": 1.85466490744573e-05, + "loss": 0.8776, + "step": 1952 + }, + { + "epoch": 0.2, + "grad_norm": 1.6564499377083384, + "learning_rate": 1.854493819526493e-05, + "loss": 0.8513, + "step": 1953 + }, + { + "epoch": 0.2, + "grad_norm": 1.6158099814548028, + "learning_rate": 1.854322638865315e-05, + "loss": 0.8304, + "step": 1954 + }, + { + "epoch": 0.2, + "grad_norm": 1.6128948769079847, + "learning_rate": 1.854151365480774e-05, + "loss": 0.7835, + "step": 1955 + }, + { + "epoch": 0.2, + "grad_norm": 1.6422859708402293, + "learning_rate": 1.8539799993914602e-05, + "loss": 0.7945, + "step": 1956 + }, + { + "epoch": 0.2, + "grad_norm": 1.5442670681243345, + "learning_rate": 1.8538085406159722e-05, + "loss": 0.7909, + "step": 1957 + }, + { + "epoch": 0.2, + "grad_norm": 1.6477871595113218, + "learning_rate": 1.8536369891729188e-05, + "loss": 0.7269, + "step": 1958 + }, + { + "epoch": 0.2, + "grad_norm": 1.6111243464475271, + "learning_rate": 1.85346534508092e-05, + "loss": 0.7188, + "step": 1959 + }, + { + "epoch": 0.2, + "grad_norm": 1.528940302181452, + "learning_rate": 1.8532936083586047e-05, + "loss": 0.757, + "step": 1960 + }, + { + "epoch": 0.2, + "grad_norm": 1.6760006322163905, + "learning_rate": 1.853121779024612e-05, + "loss": 0.6481, + "step": 1961 + }, + { + "epoch": 0.2, + "grad_norm": 1.5850301760975005, + "learning_rate": 1.8529498570975918e-05, + "loss": 0.8071, + "step": 1962 + }, + { + "epoch": 0.2, + "grad_norm": 1.439301043406329, + "learning_rate": 1.852777842596203e-05, + "loss": 0.6926, + "step": 1963 + }, + { + "epoch": 0.2, + "grad_norm": 1.4913418883840095, + "learning_rate": 1.8526057355391153e-05, + "loss": 0.7353, + "step": 1964 + }, + { + "epoch": 0.2, + "grad_norm": 1.4632243798516378, + "learning_rate": 1.8524335359450084e-05, + "loss": 0.6683, + "step": 1965 + }, + { + "epoch": 0.2, + "grad_norm": 1.6979225750276388, + "learning_rate": 1.852261243832572e-05, + "loss": 0.7964, + "step": 1966 + }, + { + "epoch": 0.2, + "grad_norm": 1.6935771936979542, + "learning_rate": 1.852088859220505e-05, + "loss": 0.8466, + "step": 1967 + }, + { + "epoch": 0.2, + "grad_norm": 1.6235813178922434, + "learning_rate": 1.851916382127518e-05, + "loss": 0.7163, + "step": 1968 + }, + { + "epoch": 0.2, + "grad_norm": 1.6952450683272433, + "learning_rate": 1.85174381257233e-05, + "loss": 0.8911, + "step": 1969 + }, + { + "epoch": 0.2, + "grad_norm": 1.6217638700128432, + "learning_rate": 1.8515711505736708e-05, + "loss": 0.8231, + "step": 1970 + }, + { + "epoch": 0.2, + "grad_norm": 1.5461320142762094, + "learning_rate": 1.8513983961502802e-05, + "loss": 0.6862, + "step": 1971 + }, + { + "epoch": 0.2, + "grad_norm": 1.8345041401665254, + "learning_rate": 1.851225549320908e-05, + "loss": 0.7441, + "step": 1972 + }, + { + "epoch": 0.2, + "grad_norm": 1.5711831067423128, + "learning_rate": 1.8510526101043146e-05, + "loss": 0.869, + "step": 1973 + }, + { + "epoch": 0.2, + "grad_norm": 1.5191116397427378, + "learning_rate": 1.850879578519269e-05, + "loss": 0.7696, + "step": 1974 + }, + { + "epoch": 0.2, + "grad_norm": 1.5090778444514343, + "learning_rate": 1.8507064545845513e-05, + "loss": 0.7709, + "step": 1975 + }, + { + "epoch": 0.2, + "grad_norm": 1.5043235651481828, + "learning_rate": 1.8505332383189518e-05, + "loss": 0.6482, + "step": 1976 + }, + { + "epoch": 0.2, + "grad_norm": 1.7494370121789027, + "learning_rate": 1.85035992974127e-05, + "loss": 0.659, + "step": 1977 + }, + { + "epoch": 0.2, + "grad_norm": 1.6977092603202606, + "learning_rate": 1.850186528870316e-05, + "loss": 0.8217, + "step": 1978 + }, + { + "epoch": 0.2, + "grad_norm": 1.4858417843522007, + "learning_rate": 1.85001303572491e-05, + "loss": 0.7733, + "step": 1979 + }, + { + "epoch": 0.2, + "grad_norm": 1.3671668705036, + "learning_rate": 1.8498394503238814e-05, + "loss": 0.7278, + "step": 1980 + }, + { + "epoch": 0.2, + "grad_norm": 1.6825392973325513, + "learning_rate": 1.84966577268607e-05, + "loss": 0.7464, + "step": 1981 + }, + { + "epoch": 0.2, + "grad_norm": 1.555401729857369, + "learning_rate": 1.849492002830327e-05, + "loss": 0.766, + "step": 1982 + }, + { + "epoch": 0.2, + "grad_norm": 1.6489193347651052, + "learning_rate": 1.8493181407755117e-05, + "loss": 0.8089, + "step": 1983 + }, + { + "epoch": 0.2, + "grad_norm": 1.555459995400608, + "learning_rate": 1.849144186540494e-05, + "loss": 0.7948, + "step": 1984 + }, + { + "epoch": 0.2, + "grad_norm": 1.6696539094363, + "learning_rate": 1.8489701401441534e-05, + "loss": 0.7851, + "step": 1985 + }, + { + "epoch": 0.2, + "grad_norm": 1.599201865291812, + "learning_rate": 1.848796001605381e-05, + "loss": 0.8093, + "step": 1986 + }, + { + "epoch": 0.2, + "grad_norm": 1.6274974091557912, + "learning_rate": 1.8486217709430757e-05, + "loss": 0.7926, + "step": 1987 + }, + { + "epoch": 0.2, + "grad_norm": 1.3623804172978458, + "learning_rate": 1.848447448176149e-05, + "loss": 0.7262, + "step": 1988 + }, + { + "epoch": 0.2, + "grad_norm": 1.629211820742282, + "learning_rate": 1.8482730333235196e-05, + "loss": 0.8294, + "step": 1989 + }, + { + "epoch": 0.2, + "grad_norm": 1.7541029673914632, + "learning_rate": 1.8480985264041176e-05, + "loss": 0.8159, + "step": 1990 + }, + { + "epoch": 0.2, + "grad_norm": 1.586614609609062, + "learning_rate": 1.847923927436884e-05, + "loss": 0.6823, + "step": 1991 + }, + { + "epoch": 0.2, + "grad_norm": 1.5419850233142978, + "learning_rate": 1.8477492364407677e-05, + "loss": 0.6992, + "step": 1992 + }, + { + "epoch": 0.2, + "grad_norm": 1.472491430080609, + "learning_rate": 1.8475744534347293e-05, + "loss": 0.7535, + "step": 1993 + }, + { + "epoch": 0.2, + "grad_norm": 1.5645681274063772, + "learning_rate": 1.8473995784377384e-05, + "loss": 0.7991, + "step": 1994 + }, + { + "epoch": 0.2, + "grad_norm": 1.6389601170363615, + "learning_rate": 1.8472246114687754e-05, + "loss": 0.7141, + "step": 1995 + }, + { + "epoch": 0.2, + "grad_norm": 1.6616292369048578, + "learning_rate": 1.8470495525468295e-05, + "loss": 0.7879, + "step": 1996 + }, + { + "epoch": 0.2, + "grad_norm": 1.4737315897382532, + "learning_rate": 1.8468744016909012e-05, + "loss": 0.7577, + "step": 1997 + }, + { + "epoch": 0.2, + "grad_norm": 1.549146981818495, + "learning_rate": 1.8466991589200004e-05, + "loss": 0.6605, + "step": 1998 + }, + { + "epoch": 0.2, + "grad_norm": 1.4417487564219365, + "learning_rate": 1.8465238242531467e-05, + "loss": 0.7124, + "step": 1999 + }, + { + "epoch": 0.2, + "grad_norm": 1.6835586343638251, + "learning_rate": 1.84634839770937e-05, + "loss": 0.7829, + "step": 2000 + }, + { + "epoch": 0.2, + "grad_norm": 1.7115119058166468, + "learning_rate": 1.8461728793077104e-05, + "loss": 0.7586, + "step": 2001 + }, + { + "epoch": 0.2, + "grad_norm": 1.4947857186720843, + "learning_rate": 1.8459972690672172e-05, + "loss": 0.7328, + "step": 2002 + }, + { + "epoch": 0.2, + "grad_norm": 1.4280227740430842, + "learning_rate": 1.8458215670069502e-05, + "loss": 0.7671, + "step": 2003 + }, + { + "epoch": 0.2, + "grad_norm": 1.6151555542402964, + "learning_rate": 1.8456457731459795e-05, + "loss": 0.848, + "step": 2004 + }, + { + "epoch": 0.2, + "grad_norm": 1.479691535263727, + "learning_rate": 1.8454698875033843e-05, + "loss": 0.8455, + "step": 2005 + }, + { + "epoch": 0.2, + "grad_norm": 1.4177021763508002, + "learning_rate": 1.8452939100982547e-05, + "loss": 0.7119, + "step": 2006 + }, + { + "epoch": 0.2, + "grad_norm": 1.6199388078799257, + "learning_rate": 1.8451178409496903e-05, + "loss": 0.6344, + "step": 2007 + }, + { + "epoch": 0.2, + "grad_norm": 1.5825791681632486, + "learning_rate": 1.8449416800767998e-05, + "loss": 0.7744, + "step": 2008 + }, + { + "epoch": 0.2, + "grad_norm": 1.5318746508145062, + "learning_rate": 1.8447654274987038e-05, + "loss": 0.8559, + "step": 2009 + }, + { + "epoch": 0.2, + "grad_norm": 1.6233848458956928, + "learning_rate": 1.844589083234531e-05, + "loss": 0.7345, + "step": 2010 + }, + { + "epoch": 0.2, + "grad_norm": 1.994553363314384, + "learning_rate": 1.8444126473034212e-05, + "loss": 0.7971, + "step": 2011 + }, + { + "epoch": 0.2, + "grad_norm": 1.6031955233721904, + "learning_rate": 1.844236119724524e-05, + "loss": 0.7934, + "step": 2012 + }, + { + "epoch": 0.2, + "grad_norm": 1.4580459780767323, + "learning_rate": 1.8440595005169985e-05, + "loss": 0.7894, + "step": 2013 + }, + { + "epoch": 0.2, + "grad_norm": 1.7207833492032534, + "learning_rate": 1.843882789700013e-05, + "loss": 0.8159, + "step": 2014 + }, + { + "epoch": 0.2, + "grad_norm": 1.538572416456644, + "learning_rate": 1.843705987292748e-05, + "loss": 0.8205, + "step": 2015 + }, + { + "epoch": 0.21, + "grad_norm": 1.6077079023161813, + "learning_rate": 1.8435290933143925e-05, + "loss": 0.712, + "step": 2016 + }, + { + "epoch": 0.21, + "grad_norm": 1.4914180648119735, + "learning_rate": 1.8433521077841447e-05, + "loss": 0.705, + "step": 2017 + }, + { + "epoch": 0.21, + "grad_norm": 1.5844200282792549, + "learning_rate": 1.8431750307212143e-05, + "loss": 0.747, + "step": 2018 + }, + { + "epoch": 0.21, + "grad_norm": 1.5512967626223364, + "learning_rate": 1.84299786214482e-05, + "loss": 0.7392, + "step": 2019 + }, + { + "epoch": 0.21, + "grad_norm": 1.6445683986761288, + "learning_rate": 1.8428206020741913e-05, + "loss": 0.8185, + "step": 2020 + }, + { + "epoch": 0.21, + "grad_norm": 1.5331188402336782, + "learning_rate": 1.8426432505285658e-05, + "loss": 0.6476, + "step": 2021 + }, + { + "epoch": 0.21, + "grad_norm": 1.5679496580351822, + "learning_rate": 1.8424658075271934e-05, + "loss": 0.6425, + "step": 2022 + }, + { + "epoch": 0.21, + "grad_norm": 1.5806644443693592, + "learning_rate": 1.8422882730893323e-05, + "loss": 0.815, + "step": 2023 + }, + { + "epoch": 0.21, + "grad_norm": 1.8126513267054625, + "learning_rate": 1.8421106472342507e-05, + "loss": 0.7919, + "step": 2024 + }, + { + "epoch": 0.21, + "grad_norm": 1.8010077865252379, + "learning_rate": 1.841932929981228e-05, + "loss": 0.792, + "step": 2025 + }, + { + "epoch": 0.21, + "grad_norm": 1.6176071870755673, + "learning_rate": 1.8417551213495516e-05, + "loss": 0.681, + "step": 2026 + }, + { + "epoch": 0.21, + "grad_norm": 1.6194212121101104, + "learning_rate": 1.8415772213585206e-05, + "loss": 0.781, + "step": 2027 + }, + { + "epoch": 0.21, + "grad_norm": 1.4615424765257328, + "learning_rate": 1.8413992300274432e-05, + "loss": 0.7386, + "step": 2028 + }, + { + "epoch": 0.21, + "grad_norm": 1.345972388499806, + "learning_rate": 1.8412211473756366e-05, + "loss": 0.6836, + "step": 2029 + }, + { + "epoch": 0.21, + "grad_norm": 1.420925984429223, + "learning_rate": 1.8410429734224305e-05, + "loss": 0.7123, + "step": 2030 + }, + { + "epoch": 0.21, + "grad_norm": 1.4714918647469502, + "learning_rate": 1.8408647081871617e-05, + "loss": 0.7548, + "step": 2031 + }, + { + "epoch": 0.21, + "grad_norm": 1.2880187247176165, + "learning_rate": 1.8406863516891787e-05, + "loss": 0.6705, + "step": 2032 + }, + { + "epoch": 0.21, + "grad_norm": 1.5561890649523946, + "learning_rate": 1.840507903947839e-05, + "loss": 0.8041, + "step": 2033 + }, + { + "epoch": 0.21, + "grad_norm": 1.8240869988638713, + "learning_rate": 1.8403293649825105e-05, + "loss": 0.8352, + "step": 2034 + }, + { + "epoch": 0.21, + "grad_norm": 1.6843780737067835, + "learning_rate": 1.8401507348125706e-05, + "loss": 0.8211, + "step": 2035 + }, + { + "epoch": 0.21, + "grad_norm": 1.8174509816114721, + "learning_rate": 1.8399720134574068e-05, + "loss": 0.8882, + "step": 2036 + }, + { + "epoch": 0.21, + "grad_norm": 1.8972402872295238, + "learning_rate": 1.839793200936417e-05, + "loss": 0.8591, + "step": 2037 + }, + { + "epoch": 0.21, + "grad_norm": 1.7514471330221437, + "learning_rate": 1.8396142972690075e-05, + "loss": 0.6459, + "step": 2038 + }, + { + "epoch": 0.21, + "grad_norm": 1.8239195890199404, + "learning_rate": 1.8394353024745965e-05, + "loss": 0.7782, + "step": 2039 + }, + { + "epoch": 0.21, + "grad_norm": 1.5675086725687128, + "learning_rate": 1.839256216572611e-05, + "loss": 0.8358, + "step": 2040 + }, + { + "epoch": 0.21, + "grad_norm": 1.5085805660216012, + "learning_rate": 1.8390770395824874e-05, + "loss": 0.735, + "step": 2041 + }, + { + "epoch": 0.21, + "grad_norm": 1.547175912059753, + "learning_rate": 1.8388977715236728e-05, + "loss": 0.7459, + "step": 2042 + }, + { + "epoch": 0.21, + "grad_norm": 1.7356081253311848, + "learning_rate": 1.838718412415624e-05, + "loss": 0.8296, + "step": 2043 + }, + { + "epoch": 0.21, + "grad_norm": 1.655669317138079, + "learning_rate": 1.8385389622778076e-05, + "loss": 0.9693, + "step": 2044 + }, + { + "epoch": 0.21, + "grad_norm": 1.6362615478232747, + "learning_rate": 1.8383594211297002e-05, + "loss": 0.8121, + "step": 2045 + }, + { + "epoch": 0.21, + "grad_norm": 1.5372469664900423, + "learning_rate": 1.838179788990788e-05, + "loss": 0.7378, + "step": 2046 + }, + { + "epoch": 0.21, + "grad_norm": 1.7855729505540894, + "learning_rate": 1.838000065880568e-05, + "loss": 0.8149, + "step": 2047 + }, + { + "epoch": 0.21, + "grad_norm": 1.6119552242718482, + "learning_rate": 1.837820251818545e-05, + "loss": 0.7797, + "step": 2048 + }, + { + "epoch": 0.21, + "grad_norm": 1.6814490296118239, + "learning_rate": 1.837640346824236e-05, + "loss": 0.6792, + "step": 2049 + }, + { + "epoch": 0.21, + "grad_norm": 1.5907274351368677, + "learning_rate": 1.837460350917166e-05, + "loss": 0.6379, + "step": 2050 + }, + { + "epoch": 0.21, + "grad_norm": 1.582028706018697, + "learning_rate": 1.837280264116872e-05, + "loss": 0.87, + "step": 2051 + }, + { + "epoch": 0.21, + "grad_norm": 1.5896426368682781, + "learning_rate": 1.837100086442899e-05, + "loss": 0.7454, + "step": 2052 + }, + { + "epoch": 0.21, + "grad_norm": 1.515693029722567, + "learning_rate": 1.8369198179148022e-05, + "loss": 0.5782, + "step": 2053 + }, + { + "epoch": 0.21, + "grad_norm": 1.6823168519943408, + "learning_rate": 1.836739458552147e-05, + "loss": 0.7646, + "step": 2054 + }, + { + "epoch": 0.21, + "grad_norm": 1.5807384127341129, + "learning_rate": 1.8365590083745085e-05, + "loss": 0.7864, + "step": 2055 + }, + { + "epoch": 0.21, + "grad_norm": 1.5014690904678236, + "learning_rate": 1.8363784674014726e-05, + "loss": 0.7983, + "step": 2056 + }, + { + "epoch": 0.21, + "grad_norm": 1.527163447694033, + "learning_rate": 1.836197835652633e-05, + "loss": 0.7369, + "step": 2057 + }, + { + "epoch": 0.21, + "grad_norm": 1.4761009440956374, + "learning_rate": 1.8360171131475954e-05, + "loss": 0.7562, + "step": 2058 + }, + { + "epoch": 0.21, + "grad_norm": 1.5807536940605051, + "learning_rate": 1.8358362999059738e-05, + "loss": 0.7055, + "step": 2059 + }, + { + "epoch": 0.21, + "grad_norm": 1.7363606674901761, + "learning_rate": 1.835655395947393e-05, + "loss": 0.8579, + "step": 2060 + }, + { + "epoch": 0.21, + "grad_norm": 1.5892285709310214, + "learning_rate": 1.835474401291487e-05, + "loss": 0.76, + "step": 2061 + }, + { + "epoch": 0.21, + "grad_norm": 1.96710212615678, + "learning_rate": 1.8352933159579e-05, + "loss": 0.7318, + "step": 2062 + }, + { + "epoch": 0.21, + "grad_norm": 1.656354040552658, + "learning_rate": 1.8351121399662862e-05, + "loss": 0.7117, + "step": 2063 + }, + { + "epoch": 0.21, + "grad_norm": 1.4360631305910538, + "learning_rate": 1.8349308733363093e-05, + "loss": 0.6603, + "step": 2064 + }, + { + "epoch": 0.21, + "grad_norm": 1.646228813200624, + "learning_rate": 1.8347495160876432e-05, + "loss": 0.7216, + "step": 2065 + }, + { + "epoch": 0.21, + "grad_norm": 1.446053013317963, + "learning_rate": 1.834568068239971e-05, + "loss": 0.7162, + "step": 2066 + }, + { + "epoch": 0.21, + "grad_norm": 1.4323835655155246, + "learning_rate": 1.8343865298129858e-05, + "loss": 0.7047, + "step": 2067 + }, + { + "epoch": 0.21, + "grad_norm": 1.5035102946772407, + "learning_rate": 1.8342049008263917e-05, + "loss": 0.8284, + "step": 2068 + }, + { + "epoch": 0.21, + "grad_norm": 1.6444469347806443, + "learning_rate": 1.8340231812999007e-05, + "loss": 0.7184, + "step": 2069 + }, + { + "epoch": 0.21, + "grad_norm": 1.6596641180910732, + "learning_rate": 1.8338413712532365e-05, + "loss": 0.8087, + "step": 2070 + }, + { + "epoch": 0.21, + "grad_norm": 1.5875512346960872, + "learning_rate": 1.833659470706131e-05, + "loss": 0.7649, + "step": 2071 + }, + { + "epoch": 0.21, + "grad_norm": 1.5843463610866617, + "learning_rate": 1.8334774796783268e-05, + "loss": 0.7858, + "step": 2072 + }, + { + "epoch": 0.21, + "grad_norm": 1.4945176583565407, + "learning_rate": 1.833295398189576e-05, + "loss": 0.7108, + "step": 2073 + }, + { + "epoch": 0.21, + "grad_norm": 1.812069632882091, + "learning_rate": 1.8331132262596418e-05, + "loss": 0.8371, + "step": 2074 + }, + { + "epoch": 0.21, + "grad_norm": 1.4764660495567634, + "learning_rate": 1.832930963908295e-05, + "loss": 0.6777, + "step": 2075 + }, + { + "epoch": 0.21, + "grad_norm": 1.5515759927728716, + "learning_rate": 1.8327486111553174e-05, + "loss": 0.8229, + "step": 2076 + }, + { + "epoch": 0.21, + "grad_norm": 1.4658649863388442, + "learning_rate": 1.832566168020501e-05, + "loss": 0.7533, + "step": 2077 + }, + { + "epoch": 0.21, + "grad_norm": 1.5142633214554027, + "learning_rate": 1.832383634523647e-05, + "loss": 0.6924, + "step": 2078 + }, + { + "epoch": 0.21, + "grad_norm": 1.5895282587876711, + "learning_rate": 1.8322010106845663e-05, + "loss": 0.8646, + "step": 2079 + }, + { + "epoch": 0.21, + "grad_norm": 1.4819718119255312, + "learning_rate": 1.8320182965230803e-05, + "loss": 0.6515, + "step": 2080 + }, + { + "epoch": 0.21, + "grad_norm": 1.5681585082866136, + "learning_rate": 1.8318354920590195e-05, + "loss": 0.8381, + "step": 2081 + }, + { + "epoch": 0.21, + "grad_norm": 1.4810855763092055, + "learning_rate": 1.8316525973122243e-05, + "loss": 0.769, + "step": 2082 + }, + { + "epoch": 0.21, + "grad_norm": 1.7651158078259417, + "learning_rate": 1.8314696123025456e-05, + "loss": 0.8944, + "step": 2083 + }, + { + "epoch": 0.21, + "grad_norm": 1.3824607726643825, + "learning_rate": 1.8312865370498428e-05, + "loss": 0.7574, + "step": 2084 + }, + { + "epoch": 0.21, + "grad_norm": 1.723480098849916, + "learning_rate": 1.8311033715739864e-05, + "loss": 0.7932, + "step": 2085 + }, + { + "epoch": 0.21, + "grad_norm": 1.7779798532759492, + "learning_rate": 1.830920115894856e-05, + "loss": 0.7958, + "step": 2086 + }, + { + "epoch": 0.21, + "grad_norm": 1.5820203885718955, + "learning_rate": 1.8307367700323412e-05, + "loss": 0.6929, + "step": 2087 + }, + { + "epoch": 0.21, + "grad_norm": 1.6031859652237008, + "learning_rate": 1.8305533340063416e-05, + "loss": 0.7878, + "step": 2088 + }, + { + "epoch": 0.21, + "grad_norm": 1.5935160645728348, + "learning_rate": 1.8303698078367654e-05, + "loss": 0.7755, + "step": 2089 + }, + { + "epoch": 0.21, + "grad_norm": 1.5403578121776387, + "learning_rate": 1.8301861915435325e-05, + "loss": 0.8191, + "step": 2090 + }, + { + "epoch": 0.21, + "grad_norm": 1.8028348646006056, + "learning_rate": 1.830002485146571e-05, + "loss": 0.8042, + "step": 2091 + }, + { + "epoch": 0.21, + "grad_norm": 1.553546451754014, + "learning_rate": 1.8298186886658194e-05, + "loss": 0.8077, + "step": 2092 + }, + { + "epoch": 0.21, + "grad_norm": 1.5730451787934852, + "learning_rate": 1.8296348021212264e-05, + "loss": 0.8645, + "step": 2093 + }, + { + "epoch": 0.21, + "grad_norm": 1.635196346349874, + "learning_rate": 1.8294508255327495e-05, + "loss": 0.8325, + "step": 2094 + }, + { + "epoch": 0.21, + "grad_norm": 1.7375633128416428, + "learning_rate": 1.8292667589203567e-05, + "loss": 0.7546, + "step": 2095 + }, + { + "epoch": 0.21, + "grad_norm": 1.4748248166175115, + "learning_rate": 1.8290826023040257e-05, + "loss": 0.8443, + "step": 2096 + }, + { + "epoch": 0.21, + "grad_norm": 1.8397460998187334, + "learning_rate": 1.8288983557037432e-05, + "loss": 0.7881, + "step": 2097 + }, + { + "epoch": 0.21, + "grad_norm": 1.4051773976204776, + "learning_rate": 1.8287140191395066e-05, + "loss": 0.7506, + "step": 2098 + }, + { + "epoch": 0.21, + "grad_norm": 1.5543239613177329, + "learning_rate": 1.8285295926313234e-05, + "loss": 0.7805, + "step": 2099 + }, + { + "epoch": 0.21, + "grad_norm": 1.6267215636888006, + "learning_rate": 1.8283450761992095e-05, + "loss": 0.7499, + "step": 2100 + }, + { + "epoch": 0.21, + "grad_norm": 1.6634829018566752, + "learning_rate": 1.8281604698631913e-05, + "loss": 0.702, + "step": 2101 + }, + { + "epoch": 0.21, + "grad_norm": 1.458900948499807, + "learning_rate": 1.827975773643305e-05, + "loss": 0.7928, + "step": 2102 + }, + { + "epoch": 0.21, + "grad_norm": 1.7096776108046234, + "learning_rate": 1.8277909875595967e-05, + "loss": 0.7865, + "step": 2103 + }, + { + "epoch": 0.21, + "grad_norm": 1.4549477637499242, + "learning_rate": 1.827606111632122e-05, + "loss": 0.7086, + "step": 2104 + }, + { + "epoch": 0.21, + "grad_norm": 1.6106913000137875, + "learning_rate": 1.827421145880946e-05, + "loss": 0.7765, + "step": 2105 + }, + { + "epoch": 0.21, + "grad_norm": 1.522056592191057, + "learning_rate": 1.8272360903261443e-05, + "loss": 0.7613, + "step": 2106 + }, + { + "epoch": 0.21, + "grad_norm": 1.539075161245667, + "learning_rate": 1.8270509449878015e-05, + "loss": 0.724, + "step": 2107 + }, + { + "epoch": 0.21, + "grad_norm": 1.4949035601058638, + "learning_rate": 1.8268657098860118e-05, + "loss": 0.8125, + "step": 2108 + }, + { + "epoch": 0.21, + "grad_norm": 1.4720359002309913, + "learning_rate": 1.82668038504088e-05, + "loss": 0.6817, + "step": 2109 + }, + { + "epoch": 0.21, + "grad_norm": 1.733004339510359, + "learning_rate": 1.826494970472521e-05, + "loss": 0.8364, + "step": 2110 + }, + { + "epoch": 0.21, + "grad_norm": 1.5290000198905853, + "learning_rate": 1.8263094662010575e-05, + "loss": 0.7266, + "step": 2111 + }, + { + "epoch": 0.21, + "grad_norm": 1.6943982181306592, + "learning_rate": 1.8261238722466233e-05, + "loss": 0.6214, + "step": 2112 + }, + { + "epoch": 0.21, + "grad_norm": 1.7396644301300892, + "learning_rate": 1.825938188629362e-05, + "loss": 0.7582, + "step": 2113 + }, + { + "epoch": 0.22, + "grad_norm": 1.756893544882042, + "learning_rate": 1.8257524153694265e-05, + "loss": 0.7242, + "step": 2114 + }, + { + "epoch": 0.22, + "grad_norm": 1.6405496596283244, + "learning_rate": 1.82556655248698e-05, + "loss": 0.8054, + "step": 2115 + }, + { + "epoch": 0.22, + "grad_norm": 1.6438093570265306, + "learning_rate": 1.8253806000021943e-05, + "loss": 0.726, + "step": 2116 + }, + { + "epoch": 0.22, + "grad_norm": 1.564672755650021, + "learning_rate": 1.825194557935252e-05, + "loss": 0.6933, + "step": 2117 + }, + { + "epoch": 0.22, + "grad_norm": 1.7318313364847981, + "learning_rate": 1.825008426306345e-05, + "loss": 0.7744, + "step": 2118 + }, + { + "epoch": 0.22, + "grad_norm": 1.6157523650895218, + "learning_rate": 1.8248222051356756e-05, + "loss": 0.7796, + "step": 2119 + }, + { + "epoch": 0.22, + "grad_norm": 1.382492070976764, + "learning_rate": 1.824635894443454e-05, + "loss": 0.7445, + "step": 2120 + }, + { + "epoch": 0.22, + "grad_norm": 1.6583871301219149, + "learning_rate": 1.8244494942499017e-05, + "loss": 0.7904, + "step": 2121 + }, + { + "epoch": 0.22, + "grad_norm": 1.4231276239769954, + "learning_rate": 1.8242630045752504e-05, + "loss": 0.6558, + "step": 2122 + }, + { + "epoch": 0.22, + "grad_norm": 1.455061423838774, + "learning_rate": 1.8240764254397392e-05, + "loss": 0.7436, + "step": 2123 + }, + { + "epoch": 0.22, + "grad_norm": 1.703724941352681, + "learning_rate": 1.8238897568636197e-05, + "loss": 0.6682, + "step": 2124 + }, + { + "epoch": 0.22, + "grad_norm": 1.6735807021258513, + "learning_rate": 1.8237029988671514e-05, + "loss": 0.763, + "step": 2125 + }, + { + "epoch": 0.22, + "grad_norm": 1.548686319328144, + "learning_rate": 1.8235161514706036e-05, + "loss": 0.7035, + "step": 2126 + }, + { + "epoch": 0.22, + "grad_norm": 1.6797818763836223, + "learning_rate": 1.823329214694256e-05, + "loss": 0.6695, + "step": 2127 + }, + { + "epoch": 0.22, + "grad_norm": 1.6404203109199544, + "learning_rate": 1.8231421885583972e-05, + "loss": 0.8632, + "step": 2128 + }, + { + "epoch": 0.22, + "grad_norm": 1.5372973402500991, + "learning_rate": 1.822955073083327e-05, + "loss": 0.7308, + "step": 2129 + }, + { + "epoch": 0.22, + "grad_norm": 1.6315504556007998, + "learning_rate": 1.822767868289353e-05, + "loss": 0.8769, + "step": 2130 + }, + { + "epoch": 0.22, + "grad_norm": 1.6043084151034726, + "learning_rate": 1.8225805741967934e-05, + "loss": 0.7148, + "step": 2131 + }, + { + "epoch": 0.22, + "grad_norm": 1.5401015017426092, + "learning_rate": 1.822393190825976e-05, + "loss": 0.7348, + "step": 2132 + }, + { + "epoch": 0.22, + "grad_norm": 1.4478526089039574, + "learning_rate": 1.8222057181972386e-05, + "loss": 0.7506, + "step": 2133 + }, + { + "epoch": 0.22, + "grad_norm": 1.7731832083632584, + "learning_rate": 1.8220181563309284e-05, + "loss": 0.7261, + "step": 2134 + }, + { + "epoch": 0.22, + "grad_norm": 1.510479871553798, + "learning_rate": 1.8218305052474025e-05, + "loss": 0.6962, + "step": 2135 + }, + { + "epoch": 0.22, + "grad_norm": 1.6077436722899787, + "learning_rate": 1.821642764967027e-05, + "loss": 0.7813, + "step": 2136 + }, + { + "epoch": 0.22, + "grad_norm": 1.6144631090233805, + "learning_rate": 1.8214549355101786e-05, + "loss": 0.7784, + "step": 2137 + }, + { + "epoch": 0.22, + "grad_norm": 1.9735813719971183, + "learning_rate": 1.8212670168972428e-05, + "loss": 0.7713, + "step": 2138 + }, + { + "epoch": 0.22, + "grad_norm": 1.6638275430587353, + "learning_rate": 1.8210790091486156e-05, + "loss": 0.7142, + "step": 2139 + }, + { + "epoch": 0.22, + "grad_norm": 1.5424127528689335, + "learning_rate": 1.8208909122847024e-05, + "loss": 0.7102, + "step": 2140 + }, + { + "epoch": 0.22, + "grad_norm": 1.597958992848855, + "learning_rate": 1.8207027263259176e-05, + "loss": 0.8374, + "step": 2141 + }, + { + "epoch": 0.22, + "grad_norm": 1.565074333708335, + "learning_rate": 1.8205144512926866e-05, + "loss": 0.754, + "step": 2142 + }, + { + "epoch": 0.22, + "grad_norm": 1.5897706230321191, + "learning_rate": 1.8203260872054432e-05, + "loss": 0.8083, + "step": 2143 + }, + { + "epoch": 0.22, + "grad_norm": 1.4999155604503212, + "learning_rate": 1.8201376340846315e-05, + "loss": 0.6513, + "step": 2144 + }, + { + "epoch": 0.22, + "grad_norm": 1.4601575616164244, + "learning_rate": 1.819949091950705e-05, + "loss": 0.6396, + "step": 2145 + }, + { + "epoch": 0.22, + "grad_norm": 1.5653866484142276, + "learning_rate": 1.8197604608241272e-05, + "loss": 0.6518, + "step": 2146 + }, + { + "epoch": 0.22, + "grad_norm": 1.6287065297840895, + "learning_rate": 1.819571740725371e-05, + "loss": 0.7207, + "step": 2147 + }, + { + "epoch": 0.22, + "grad_norm": 1.5469126251614622, + "learning_rate": 1.819382931674919e-05, + "loss": 0.6479, + "step": 2148 + }, + { + "epoch": 0.22, + "grad_norm": 1.6232718164950395, + "learning_rate": 1.8191940336932638e-05, + "loss": 0.8188, + "step": 2149 + }, + { + "epoch": 0.22, + "grad_norm": 1.8874968240656473, + "learning_rate": 1.8190050468009065e-05, + "loss": 0.8426, + "step": 2150 + }, + { + "epoch": 0.22, + "grad_norm": 1.6375878574874134, + "learning_rate": 1.8188159710183595e-05, + "loss": 0.831, + "step": 2151 + }, + { + "epoch": 0.22, + "grad_norm": 1.6805671634638826, + "learning_rate": 1.8186268063661432e-05, + "loss": 0.7414, + "step": 2152 + }, + { + "epoch": 0.22, + "grad_norm": 1.6881636311339634, + "learning_rate": 1.8184375528647896e-05, + "loss": 0.785, + "step": 2153 + }, + { + "epoch": 0.22, + "grad_norm": 1.5103174375177986, + "learning_rate": 1.8182482105348383e-05, + "loss": 0.789, + "step": 2154 + }, + { + "epoch": 0.22, + "grad_norm": 1.5687022363735545, + "learning_rate": 1.8180587793968398e-05, + "loss": 0.7227, + "step": 2155 + }, + { + "epoch": 0.22, + "grad_norm": 1.6928307729934189, + "learning_rate": 1.817869259471354e-05, + "loss": 0.7008, + "step": 2156 + }, + { + "epoch": 0.22, + "grad_norm": 1.2934833517593187, + "learning_rate": 1.8176796507789497e-05, + "loss": 0.6371, + "step": 2157 + }, + { + "epoch": 0.22, + "grad_norm": 1.5592241220325302, + "learning_rate": 1.8174899533402066e-05, + "loss": 0.7327, + "step": 2158 + }, + { + "epoch": 0.22, + "grad_norm": 1.8484178493317083, + "learning_rate": 1.8173001671757127e-05, + "loss": 0.8249, + "step": 2159 + }, + { + "epoch": 0.22, + "grad_norm": 1.688186929197883, + "learning_rate": 1.8171102923060675e-05, + "loss": 0.8771, + "step": 2160 + }, + { + "epoch": 0.22, + "grad_norm": 1.5226220759645985, + "learning_rate": 1.8169203287518778e-05, + "loss": 0.7016, + "step": 2161 + }, + { + "epoch": 0.22, + "grad_norm": 1.5748867957539676, + "learning_rate": 1.816730276533762e-05, + "loss": 0.7682, + "step": 2162 + }, + { + "epoch": 0.22, + "grad_norm": 1.5209777167596974, + "learning_rate": 1.8165401356723467e-05, + "loss": 0.7089, + "step": 2163 + }, + { + "epoch": 0.22, + "grad_norm": 1.5028113310786382, + "learning_rate": 1.816349906188269e-05, + "loss": 0.6731, + "step": 2164 + }, + { + "epoch": 0.22, + "grad_norm": 1.572176249001475, + "learning_rate": 1.8161595881021756e-05, + "loss": 0.8238, + "step": 2165 + }, + { + "epoch": 0.22, + "grad_norm": 1.5882665731586676, + "learning_rate": 1.8159691814347217e-05, + "loss": 0.8786, + "step": 2166 + }, + { + "epoch": 0.22, + "grad_norm": 1.5656827837258642, + "learning_rate": 1.8157786862065735e-05, + "loss": 0.7759, + "step": 2167 + }, + { + "epoch": 0.22, + "grad_norm": 1.4514364376988993, + "learning_rate": 1.8155881024384064e-05, + "loss": 0.835, + "step": 2168 + }, + { + "epoch": 0.22, + "grad_norm": 1.588171207975576, + "learning_rate": 1.8153974301509053e-05, + "loss": 0.7176, + "step": 2169 + }, + { + "epoch": 0.22, + "grad_norm": 1.7084577912367054, + "learning_rate": 1.815206669364764e-05, + "loss": 0.8163, + "step": 2170 + }, + { + "epoch": 0.22, + "grad_norm": 1.5668159027685062, + "learning_rate": 1.8150158201006878e-05, + "loss": 0.8995, + "step": 2171 + }, + { + "epoch": 0.22, + "grad_norm": 1.8305587750700543, + "learning_rate": 1.8148248823793895e-05, + "loss": 0.7976, + "step": 2172 + }, + { + "epoch": 0.22, + "grad_norm": 1.5635711829004972, + "learning_rate": 1.8146338562215927e-05, + "loss": 0.8354, + "step": 2173 + }, + { + "epoch": 0.22, + "grad_norm": 1.569805517173065, + "learning_rate": 1.81444274164803e-05, + "loss": 0.8663, + "step": 2174 + }, + { + "epoch": 0.22, + "grad_norm": 1.583628255647632, + "learning_rate": 1.8142515386794443e-05, + "loss": 0.7238, + "step": 2175 + }, + { + "epoch": 0.22, + "grad_norm": 1.5298895392029468, + "learning_rate": 1.814060247336588e-05, + "loss": 0.7605, + "step": 2176 + }, + { + "epoch": 0.22, + "grad_norm": 1.6327355633469391, + "learning_rate": 1.8138688676402212e-05, + "loss": 0.7277, + "step": 2177 + }, + { + "epoch": 0.22, + "grad_norm": 1.4512353475385125, + "learning_rate": 1.8136773996111175e-05, + "loss": 0.6717, + "step": 2178 + }, + { + "epoch": 0.22, + "grad_norm": 1.7808283545662742, + "learning_rate": 1.813485843270056e-05, + "loss": 0.8072, + "step": 2179 + }, + { + "epoch": 0.22, + "grad_norm": 1.6894019878441329, + "learning_rate": 1.8132941986378276e-05, + "loss": 0.6918, + "step": 2180 + }, + { + "epoch": 0.22, + "grad_norm": 1.2985355768151867, + "learning_rate": 1.8131024657352328e-05, + "loss": 0.7033, + "step": 2181 + }, + { + "epoch": 0.22, + "grad_norm": 1.3873095945563672, + "learning_rate": 1.8129106445830807e-05, + "loss": 0.8286, + "step": 2182 + }, + { + "epoch": 0.22, + "grad_norm": 1.7586578364971428, + "learning_rate": 1.8127187352021908e-05, + "loss": 0.8945, + "step": 2183 + }, + { + "epoch": 0.22, + "grad_norm": 1.6220564789546164, + "learning_rate": 1.8125267376133912e-05, + "loss": 0.81, + "step": 2184 + }, + { + "epoch": 0.22, + "grad_norm": 1.6101531248573264, + "learning_rate": 1.812334651837521e-05, + "loss": 0.7452, + "step": 2185 + }, + { + "epoch": 0.22, + "grad_norm": 1.5883461039783242, + "learning_rate": 1.8121424778954278e-05, + "loss": 0.8675, + "step": 2186 + }, + { + "epoch": 0.22, + "grad_norm": 1.4514916706150318, + "learning_rate": 1.8119502158079693e-05, + "loss": 0.7217, + "step": 2187 + }, + { + "epoch": 0.22, + "grad_norm": 1.6195550361174957, + "learning_rate": 1.8117578655960123e-05, + "loss": 0.9525, + "step": 2188 + }, + { + "epoch": 0.22, + "grad_norm": 1.5460650451901794, + "learning_rate": 1.8115654272804333e-05, + "loss": 0.6632, + "step": 2189 + }, + { + "epoch": 0.22, + "grad_norm": 1.5250281746069403, + "learning_rate": 1.811372900882119e-05, + "loss": 0.6274, + "step": 2190 + }, + { + "epoch": 0.22, + "grad_norm": 1.5412167993310468, + "learning_rate": 1.8111802864219643e-05, + "loss": 0.7328, + "step": 2191 + }, + { + "epoch": 0.22, + "grad_norm": 1.637075226598729, + "learning_rate": 1.8109875839208754e-05, + "loss": 0.7426, + "step": 2192 + }, + { + "epoch": 0.22, + "grad_norm": 1.6717734330324852, + "learning_rate": 1.8107947933997663e-05, + "loss": 0.8137, + "step": 2193 + }, + { + "epoch": 0.22, + "grad_norm": 1.530759629670791, + "learning_rate": 1.810601914879562e-05, + "loss": 0.7175, + "step": 2194 + }, + { + "epoch": 0.22, + "grad_norm": 1.526483843980927, + "learning_rate": 1.8104089483811963e-05, + "loss": 0.7751, + "step": 2195 + }, + { + "epoch": 0.22, + "grad_norm": 1.5835310478088402, + "learning_rate": 1.8102158939256122e-05, + "loss": 0.7437, + "step": 2196 + }, + { + "epoch": 0.22, + "grad_norm": 1.6780789997117656, + "learning_rate": 1.8100227515337634e-05, + "loss": 0.7519, + "step": 2197 + }, + { + "epoch": 0.22, + "grad_norm": 1.6293693684949369, + "learning_rate": 1.8098295212266123e-05, + "loss": 0.7471, + "step": 2198 + }, + { + "epoch": 0.22, + "grad_norm": 1.5963838193341875, + "learning_rate": 1.8096362030251312e-05, + "loss": 0.8278, + "step": 2199 + }, + { + "epoch": 0.22, + "grad_norm": 1.641469573490994, + "learning_rate": 1.8094427969503013e-05, + "loss": 0.772, + "step": 2200 + }, + { + "epoch": 0.22, + "grad_norm": 1.634557774448412, + "learning_rate": 1.8092493030231142e-05, + "loss": 0.8353, + "step": 2201 + }, + { + "epoch": 0.22, + "grad_norm": 1.565183800023239, + "learning_rate": 1.8090557212645702e-05, + "loss": 0.7399, + "step": 2202 + }, + { + "epoch": 0.22, + "grad_norm": 1.7881324029802745, + "learning_rate": 1.8088620516956804e-05, + "loss": 0.8626, + "step": 2203 + }, + { + "epoch": 0.22, + "grad_norm": 1.4944277512903883, + "learning_rate": 1.808668294337464e-05, + "loss": 0.7774, + "step": 2204 + }, + { + "epoch": 0.22, + "grad_norm": 1.527912053127469, + "learning_rate": 1.8084744492109497e-05, + "loss": 0.7545, + "step": 2205 + }, + { + "epoch": 0.22, + "grad_norm": 1.592145236674976, + "learning_rate": 1.8082805163371777e-05, + "loss": 0.7721, + "step": 2206 + }, + { + "epoch": 0.22, + "grad_norm": 1.6812339574309858, + "learning_rate": 1.8080864957371958e-05, + "loss": 0.7945, + "step": 2207 + }, + { + "epoch": 0.22, + "grad_norm": 1.6211980361220193, + "learning_rate": 1.8078923874320615e-05, + "loss": 0.7518, + "step": 2208 + }, + { + "epoch": 0.22, + "grad_norm": 1.614036868443832, + "learning_rate": 1.8076981914428426e-05, + "loss": 0.868, + "step": 2209 + }, + { + "epoch": 0.22, + "grad_norm": 1.6792941191195823, + "learning_rate": 1.8075039077906163e-05, + "loss": 0.8067, + "step": 2210 + }, + { + "epoch": 0.22, + "grad_norm": 1.5699987593174205, + "learning_rate": 1.8073095364964682e-05, + "loss": 0.7725, + "step": 2211 + }, + { + "epoch": 0.23, + "grad_norm": 1.5710191725635028, + "learning_rate": 1.8071150775814956e-05, + "loss": 0.8535, + "step": 2212 + }, + { + "epoch": 0.23, + "grad_norm": 1.6947804590122968, + "learning_rate": 1.8069205310668026e-05, + "loss": 0.7741, + "step": 2213 + }, + { + "epoch": 0.23, + "grad_norm": 1.4980756469697138, + "learning_rate": 1.8067258969735052e-05, + "loss": 0.727, + "step": 2214 + }, + { + "epoch": 0.23, + "grad_norm": 1.452475006388767, + "learning_rate": 1.8065311753227272e-05, + "loss": 0.6818, + "step": 2215 + }, + { + "epoch": 0.23, + "grad_norm": 1.5605286651538655, + "learning_rate": 1.8063363661356027e-05, + "loss": 0.7793, + "step": 2216 + }, + { + "epoch": 0.23, + "grad_norm": 1.6668571113325545, + "learning_rate": 1.8061414694332756e-05, + "loss": 0.6445, + "step": 2217 + }, + { + "epoch": 0.23, + "grad_norm": 1.5436010718479045, + "learning_rate": 1.8059464852368987e-05, + "loss": 0.8266, + "step": 2218 + }, + { + "epoch": 0.23, + "grad_norm": 1.5529862587308387, + "learning_rate": 1.8057514135676344e-05, + "loss": 0.6925, + "step": 2219 + }, + { + "epoch": 0.23, + "grad_norm": 1.673046218654071, + "learning_rate": 1.8055562544466544e-05, + "loss": 0.7221, + "step": 2220 + }, + { + "epoch": 0.23, + "grad_norm": 1.6071811351863372, + "learning_rate": 1.8053610078951406e-05, + "loss": 0.7382, + "step": 2221 + }, + { + "epoch": 0.23, + "grad_norm": 1.697397817656138, + "learning_rate": 1.805165673934284e-05, + "loss": 0.8095, + "step": 2222 + }, + { + "epoch": 0.23, + "grad_norm": 1.494326366333854, + "learning_rate": 1.804970252585285e-05, + "loss": 0.7678, + "step": 2223 + }, + { + "epoch": 0.23, + "grad_norm": 1.460052000381731, + "learning_rate": 1.804774743869353e-05, + "loss": 0.6809, + "step": 2224 + }, + { + "epoch": 0.23, + "grad_norm": 1.6463863753897041, + "learning_rate": 1.804579147807708e-05, + "loss": 0.9014, + "step": 2225 + }, + { + "epoch": 0.23, + "grad_norm": 1.4916566202877668, + "learning_rate": 1.8043834644215788e-05, + "loss": 0.7232, + "step": 2226 + }, + { + "epoch": 0.23, + "grad_norm": 1.4792196205038426, + "learning_rate": 1.8041876937322037e-05, + "loss": 0.7147, + "step": 2227 + }, + { + "epoch": 0.23, + "grad_norm": 1.6681977826488052, + "learning_rate": 1.8039918357608304e-05, + "loss": 0.7517, + "step": 2228 + }, + { + "epoch": 0.23, + "grad_norm": 1.6038026551810642, + "learning_rate": 1.803795890528716e-05, + "loss": 0.7602, + "step": 2229 + }, + { + "epoch": 0.23, + "grad_norm": 1.6556802077399546, + "learning_rate": 1.8035998580571282e-05, + "loss": 0.6506, + "step": 2230 + }, + { + "epoch": 0.23, + "grad_norm": 1.738570205110402, + "learning_rate": 1.8034037383673428e-05, + "loss": 0.7763, + "step": 2231 + }, + { + "epoch": 0.23, + "grad_norm": 1.5473612742580263, + "learning_rate": 1.803207531480645e-05, + "loss": 0.6568, + "step": 2232 + }, + { + "epoch": 0.23, + "grad_norm": 1.6939965516749653, + "learning_rate": 1.8030112374183308e-05, + "loss": 0.7322, + "step": 2233 + }, + { + "epoch": 0.23, + "grad_norm": 1.528914442618967, + "learning_rate": 1.802814856201704e-05, + "loss": 0.7585, + "step": 2234 + }, + { + "epoch": 0.23, + "grad_norm": 1.4757461250675585, + "learning_rate": 1.8026183878520794e-05, + "loss": 0.7902, + "step": 2235 + }, + { + "epoch": 0.23, + "grad_norm": 1.5769224453410504, + "learning_rate": 1.8024218323907807e-05, + "loss": 0.6148, + "step": 2236 + }, + { + "epoch": 0.23, + "grad_norm": 1.4609111499910967, + "learning_rate": 1.8022251898391402e-05, + "loss": 0.6959, + "step": 2237 + }, + { + "epoch": 0.23, + "grad_norm": 1.434729618799995, + "learning_rate": 1.8020284602185006e-05, + "loss": 0.7305, + "step": 2238 + }, + { + "epoch": 0.23, + "grad_norm": 1.4238232846508847, + "learning_rate": 1.8018316435502143e-05, + "loss": 0.6863, + "step": 2239 + }, + { + "epoch": 0.23, + "grad_norm": 1.5818552847046934, + "learning_rate": 1.8016347398556423e-05, + "loss": 0.7345, + "step": 2240 + }, + { + "epoch": 0.23, + "grad_norm": 1.7039010405467567, + "learning_rate": 1.8014377491561553e-05, + "loss": 0.7224, + "step": 2241 + }, + { + "epoch": 0.23, + "grad_norm": 1.6340705954608221, + "learning_rate": 1.8012406714731338e-05, + "loss": 0.8247, + "step": 2242 + }, + { + "epoch": 0.23, + "grad_norm": 1.5826030085527707, + "learning_rate": 1.8010435068279673e-05, + "loss": 0.8059, + "step": 2243 + }, + { + "epoch": 0.23, + "grad_norm": 1.5847457915547827, + "learning_rate": 1.800846255242055e-05, + "loss": 0.7552, + "step": 2244 + }, + { + "epoch": 0.23, + "grad_norm": 1.7361257789569242, + "learning_rate": 1.800648916736806e-05, + "loss": 0.7518, + "step": 2245 + }, + { + "epoch": 0.23, + "grad_norm": 1.7586600117640085, + "learning_rate": 1.8004514913336374e-05, + "loss": 0.9412, + "step": 2246 + }, + { + "epoch": 0.23, + "grad_norm": 1.6696786408496551, + "learning_rate": 1.8002539790539772e-05, + "loss": 0.8753, + "step": 2247 + }, + { + "epoch": 0.23, + "grad_norm": 1.559155976771375, + "learning_rate": 1.8000563799192622e-05, + "loss": 0.7045, + "step": 2248 + }, + { + "epoch": 0.23, + "grad_norm": 1.5477534172888618, + "learning_rate": 1.7998586939509385e-05, + "loss": 0.7251, + "step": 2249 + }, + { + "epoch": 0.23, + "grad_norm": 1.7697877319548974, + "learning_rate": 1.799660921170462e-05, + "loss": 0.8147, + "step": 2250 + }, + { + "epoch": 0.23, + "grad_norm": 1.629767247929377, + "learning_rate": 1.799463061599298e-05, + "loss": 0.9108, + "step": 2251 + }, + { + "epoch": 0.23, + "grad_norm": 1.596274521142993, + "learning_rate": 1.7992651152589205e-05, + "loss": 0.7334, + "step": 2252 + }, + { + "epoch": 0.23, + "grad_norm": 1.6516715835230245, + "learning_rate": 1.799067082170814e-05, + "loss": 0.7745, + "step": 2253 + }, + { + "epoch": 0.23, + "grad_norm": 1.8505464121141373, + "learning_rate": 1.798868962356472e-05, + "loss": 0.7958, + "step": 2254 + }, + { + "epoch": 0.23, + "grad_norm": 1.5896082359240096, + "learning_rate": 1.798670755837397e-05, + "loss": 0.7953, + "step": 2255 + }, + { + "epoch": 0.23, + "grad_norm": 1.574272206265239, + "learning_rate": 1.7984724626351008e-05, + "loss": 0.8106, + "step": 2256 + }, + { + "epoch": 0.23, + "grad_norm": 1.558125079934259, + "learning_rate": 1.7982740827711058e-05, + "loss": 0.6858, + "step": 2257 + }, + { + "epoch": 0.23, + "grad_norm": 1.5459963520716786, + "learning_rate": 1.7980756162669427e-05, + "loss": 0.7438, + "step": 2258 + }, + { + "epoch": 0.23, + "grad_norm": 1.5282081192747206, + "learning_rate": 1.797877063144152e-05, + "loss": 0.7825, + "step": 2259 + }, + { + "epoch": 0.23, + "grad_norm": 1.523370901061267, + "learning_rate": 1.7976784234242838e-05, + "loss": 0.6955, + "step": 2260 + }, + { + "epoch": 0.23, + "grad_norm": 1.5460023446118578, + "learning_rate": 1.7974796971288966e-05, + "loss": 0.7258, + "step": 2261 + }, + { + "epoch": 0.23, + "grad_norm": 1.3836104999840484, + "learning_rate": 1.7972808842795594e-05, + "loss": 0.8183, + "step": 2262 + }, + { + "epoch": 0.23, + "grad_norm": 1.6467488935328332, + "learning_rate": 1.7970819848978503e-05, + "loss": 0.7974, + "step": 2263 + }, + { + "epoch": 0.23, + "grad_norm": 1.513920497223266, + "learning_rate": 1.7968829990053572e-05, + "loss": 0.7312, + "step": 2264 + }, + { + "epoch": 0.23, + "grad_norm": 1.5416125278602086, + "learning_rate": 1.796683926623676e-05, + "loss": 0.8179, + "step": 2265 + }, + { + "epoch": 0.23, + "grad_norm": 1.6108320417733, + "learning_rate": 1.7964847677744136e-05, + "loss": 0.719, + "step": 2266 + }, + { + "epoch": 0.23, + "grad_norm": 1.5754808386958055, + "learning_rate": 1.7962855224791852e-05, + "loss": 0.7511, + "step": 2267 + }, + { + "epoch": 0.23, + "grad_norm": 1.4482906664508932, + "learning_rate": 1.7960861907596158e-05, + "loss": 0.7618, + "step": 2268 + }, + { + "epoch": 0.23, + "grad_norm": 1.5903384355437244, + "learning_rate": 1.7958867726373393e-05, + "loss": 0.7502, + "step": 2269 + }, + { + "epoch": 0.23, + "grad_norm": 1.3129301228336305, + "learning_rate": 1.7956872681340005e-05, + "loss": 0.7442, + "step": 2270 + }, + { + "epoch": 0.23, + "grad_norm": 1.60673569630705, + "learning_rate": 1.795487677271252e-05, + "loss": 0.7776, + "step": 2271 + }, + { + "epoch": 0.23, + "grad_norm": 1.655070352685222, + "learning_rate": 1.7952880000707558e-05, + "loss": 0.6787, + "step": 2272 + }, + { + "epoch": 0.23, + "grad_norm": 1.5340526168662216, + "learning_rate": 1.795088236554184e-05, + "loss": 0.7754, + "step": 2273 + }, + { + "epoch": 0.23, + "grad_norm": 1.525057887506206, + "learning_rate": 1.7948883867432184e-05, + "loss": 0.8401, + "step": 2274 + }, + { + "epoch": 0.23, + "grad_norm": 1.6525453707981543, + "learning_rate": 1.7946884506595487e-05, + "loss": 0.7541, + "step": 2275 + }, + { + "epoch": 0.23, + "grad_norm": 1.6554115210034996, + "learning_rate": 1.794488428324875e-05, + "loss": 0.7034, + "step": 2276 + }, + { + "epoch": 0.23, + "grad_norm": 1.4414860762539807, + "learning_rate": 1.7942883197609074e-05, + "loss": 0.6702, + "step": 2277 + }, + { + "epoch": 0.23, + "grad_norm": 1.594305792624561, + "learning_rate": 1.7940881249893636e-05, + "loss": 0.7537, + "step": 2278 + }, + { + "epoch": 0.23, + "grad_norm": 1.6752368175096224, + "learning_rate": 1.7938878440319722e-05, + "loss": 0.8536, + "step": 2279 + }, + { + "epoch": 0.23, + "grad_norm": 1.3880591018903257, + "learning_rate": 1.7936874769104703e-05, + "loss": 0.5761, + "step": 2280 + }, + { + "epoch": 0.23, + "grad_norm": 1.6830662990808718, + "learning_rate": 1.7934870236466047e-05, + "loss": 0.7947, + "step": 2281 + }, + { + "epoch": 0.23, + "grad_norm": 1.5170592285262723, + "learning_rate": 1.7932864842621312e-05, + "loss": 0.7356, + "step": 2282 + }, + { + "epoch": 0.23, + "grad_norm": 1.590286210609126, + "learning_rate": 1.7930858587788156e-05, + "loss": 0.7456, + "step": 2283 + }, + { + "epoch": 0.23, + "grad_norm": 1.5899046321067252, + "learning_rate": 1.7928851472184323e-05, + "loss": 0.7471, + "step": 2284 + }, + { + "epoch": 0.23, + "grad_norm": 1.830304910240319, + "learning_rate": 1.792684349602766e-05, + "loss": 0.8573, + "step": 2285 + }, + { + "epoch": 0.23, + "grad_norm": 1.5360155713568722, + "learning_rate": 1.7924834659536093e-05, + "loss": 0.899, + "step": 2286 + }, + { + "epoch": 0.23, + "grad_norm": 1.503268262966366, + "learning_rate": 1.7922824962927657e-05, + "loss": 0.808, + "step": 2287 + }, + { + "epoch": 0.23, + "grad_norm": 1.6637303000309078, + "learning_rate": 1.792081440642047e-05, + "loss": 0.8694, + "step": 2288 + }, + { + "epoch": 0.23, + "grad_norm": 1.4930490566181709, + "learning_rate": 1.7918802990232748e-05, + "loss": 0.7238, + "step": 2289 + }, + { + "epoch": 0.23, + "grad_norm": 1.6580291703719623, + "learning_rate": 1.7916790714582795e-05, + "loss": 0.7738, + "step": 2290 + }, + { + "epoch": 0.23, + "grad_norm": 1.5967606389216442, + "learning_rate": 1.7914777579689017e-05, + "loss": 0.7241, + "step": 2291 + }, + { + "epoch": 0.23, + "grad_norm": 1.6330398542636713, + "learning_rate": 1.7912763585769905e-05, + "loss": 0.8487, + "step": 2292 + }, + { + "epoch": 0.23, + "grad_norm": 1.4311002448815942, + "learning_rate": 1.791074873304405e-05, + "loss": 0.7219, + "step": 2293 + }, + { + "epoch": 0.23, + "grad_norm": 1.738071137245182, + "learning_rate": 1.790873302173013e-05, + "loss": 0.7444, + "step": 2294 + }, + { + "epoch": 0.23, + "grad_norm": 1.5850740120003257, + "learning_rate": 1.7906716452046915e-05, + "loss": 0.7367, + "step": 2295 + }, + { + "epoch": 0.23, + "grad_norm": 1.5592420662588853, + "learning_rate": 1.790469902421328e-05, + "loss": 0.7313, + "step": 2296 + }, + { + "epoch": 0.23, + "grad_norm": 1.4806536703150404, + "learning_rate": 1.7902680738448178e-05, + "loss": 0.8097, + "step": 2297 + }, + { + "epoch": 0.23, + "grad_norm": 1.489296277259454, + "learning_rate": 1.790066159497067e-05, + "loss": 0.7146, + "step": 2298 + }, + { + "epoch": 0.23, + "grad_norm": 1.558413346803114, + "learning_rate": 1.7898641593999893e-05, + "loss": 0.7599, + "step": 2299 + }, + { + "epoch": 0.23, + "grad_norm": 1.4735137697579415, + "learning_rate": 1.78966207357551e-05, + "loss": 0.8103, + "step": 2300 + }, + { + "epoch": 0.23, + "grad_norm": 1.4504295184011107, + "learning_rate": 1.789459902045561e-05, + "loss": 0.6457, + "step": 2301 + }, + { + "epoch": 0.23, + "grad_norm": 1.4983891556182687, + "learning_rate": 1.7892576448320854e-05, + "loss": 0.7301, + "step": 2302 + }, + { + "epoch": 0.23, + "grad_norm": 1.635941334049674, + "learning_rate": 1.7890553019570353e-05, + "loss": 0.7768, + "step": 2303 + }, + { + "epoch": 0.23, + "grad_norm": 1.3864888565813458, + "learning_rate": 1.7888528734423715e-05, + "loss": 0.6055, + "step": 2304 + }, + { + "epoch": 0.23, + "grad_norm": 1.4703550242509769, + "learning_rate": 1.7886503593100646e-05, + "loss": 0.7608, + "step": 2305 + }, + { + "epoch": 0.23, + "grad_norm": 1.6246563516863506, + "learning_rate": 1.788447759582094e-05, + "loss": 0.7621, + "step": 2306 + }, + { + "epoch": 0.23, + "grad_norm": 1.577237508115445, + "learning_rate": 1.7882450742804493e-05, + "loss": 0.7709, + "step": 2307 + }, + { + "epoch": 0.23, + "grad_norm": 1.690658860404219, + "learning_rate": 1.7880423034271287e-05, + "loss": 0.7846, + "step": 2308 + }, + { + "epoch": 0.23, + "grad_norm": 1.6931476627904156, + "learning_rate": 1.7878394470441396e-05, + "loss": 0.6558, + "step": 2309 + }, + { + "epoch": 0.23, + "grad_norm": 1.6204463011198105, + "learning_rate": 1.7876365051534987e-05, + "loss": 0.8227, + "step": 2310 + }, + { + "epoch": 0.24, + "grad_norm": 1.522142949127196, + "learning_rate": 1.7874334777772326e-05, + "loss": 0.7449, + "step": 2311 + }, + { + "epoch": 0.24, + "grad_norm": 1.6461329249430225, + "learning_rate": 1.7872303649373767e-05, + "loss": 0.7682, + "step": 2312 + }, + { + "epoch": 0.24, + "grad_norm": 1.5824110708824122, + "learning_rate": 1.7870271666559755e-05, + "loss": 0.7866, + "step": 2313 + }, + { + "epoch": 0.24, + "grad_norm": 1.6400074273266572, + "learning_rate": 1.786823882955083e-05, + "loss": 0.8164, + "step": 2314 + }, + { + "epoch": 0.24, + "grad_norm": 1.502158375278795, + "learning_rate": 1.7866205138567626e-05, + "loss": 0.7833, + "step": 2315 + }, + { + "epoch": 0.24, + "grad_norm": 1.4295665349426534, + "learning_rate": 1.7864170593830868e-05, + "loss": 0.7662, + "step": 2316 + }, + { + "epoch": 0.24, + "grad_norm": 1.516678854068183, + "learning_rate": 1.7862135195561373e-05, + "loss": 0.6889, + "step": 2317 + }, + { + "epoch": 0.24, + "grad_norm": 1.546924044213888, + "learning_rate": 1.7860098943980056e-05, + "loss": 0.6845, + "step": 2318 + }, + { + "epoch": 0.24, + "grad_norm": 1.5686659936839316, + "learning_rate": 1.7858061839307913e-05, + "loss": 0.7132, + "step": 2319 + }, + { + "epoch": 0.24, + "grad_norm": 1.5550197845969858, + "learning_rate": 1.7856023881766048e-05, + "loss": 0.7091, + "step": 2320 + }, + { + "epoch": 0.24, + "grad_norm": 1.529551763484397, + "learning_rate": 1.785398507157564e-05, + "loss": 0.7918, + "step": 2321 + }, + { + "epoch": 0.24, + "grad_norm": 1.6061716867769542, + "learning_rate": 1.7851945408957977e-05, + "loss": 0.788, + "step": 2322 + }, + { + "epoch": 0.24, + "grad_norm": 1.556076449285782, + "learning_rate": 1.784990489413443e-05, + "loss": 0.8446, + "step": 2323 + }, + { + "epoch": 0.24, + "grad_norm": 1.6842789691197135, + "learning_rate": 1.7847863527326468e-05, + "loss": 0.7131, + "step": 2324 + }, + { + "epoch": 0.24, + "grad_norm": 1.5732131853316018, + "learning_rate": 1.7845821308755644e-05, + "loss": 0.7275, + "step": 2325 + }, + { + "epoch": 0.24, + "grad_norm": 1.6710459182156756, + "learning_rate": 1.7843778238643615e-05, + "loss": 0.7791, + "step": 2326 + }, + { + "epoch": 0.24, + "grad_norm": 1.767202341467178, + "learning_rate": 1.784173431721212e-05, + "loss": 0.801, + "step": 2327 + }, + { + "epoch": 0.24, + "grad_norm": 1.5185138048879634, + "learning_rate": 1.7839689544682993e-05, + "loss": 0.8229, + "step": 2328 + }, + { + "epoch": 0.24, + "grad_norm": 1.6198534151063857, + "learning_rate": 1.7837643921278168e-05, + "loss": 0.763, + "step": 2329 + }, + { + "epoch": 0.24, + "grad_norm": 1.3949632519487039, + "learning_rate": 1.783559744721966e-05, + "loss": 0.7574, + "step": 2330 + }, + { + "epoch": 0.24, + "grad_norm": 1.6805417262534503, + "learning_rate": 1.7833550122729587e-05, + "loss": 0.728, + "step": 2331 + }, + { + "epoch": 0.24, + "grad_norm": 1.5531944312041643, + "learning_rate": 1.783150194803015e-05, + "loss": 0.7335, + "step": 2332 + }, + { + "epoch": 0.24, + "grad_norm": 1.7159353418847874, + "learning_rate": 1.7829452923343648e-05, + "loss": 0.632, + "step": 2333 + }, + { + "epoch": 0.24, + "grad_norm": 1.491814756386104, + "learning_rate": 1.782740304889247e-05, + "loss": 0.7295, + "step": 2334 + }, + { + "epoch": 0.24, + "grad_norm": 1.6304216530256852, + "learning_rate": 1.7825352324899093e-05, + "loss": 0.7432, + "step": 2335 + }, + { + "epoch": 0.24, + "grad_norm": 1.5059743877915686, + "learning_rate": 1.7823300751586103e-05, + "loss": 0.8248, + "step": 2336 + }, + { + "epoch": 0.24, + "grad_norm": 1.5372983039722445, + "learning_rate": 1.7821248329176156e-05, + "loss": 0.7279, + "step": 2337 + }, + { + "epoch": 0.24, + "grad_norm": 1.634906198136157, + "learning_rate": 1.7819195057892015e-05, + "loss": 0.8099, + "step": 2338 + }, + { + "epoch": 0.24, + "grad_norm": 1.395598233074499, + "learning_rate": 1.781714093795653e-05, + "loss": 0.7467, + "step": 2339 + }, + { + "epoch": 0.24, + "grad_norm": 1.387677729120744, + "learning_rate": 1.781508596959264e-05, + "loss": 0.6657, + "step": 2340 + }, + { + "epoch": 0.24, + "grad_norm": 1.665462662562675, + "learning_rate": 1.7813030153023382e-05, + "loss": 0.7382, + "step": 2341 + }, + { + "epoch": 0.24, + "grad_norm": 1.4919226889245358, + "learning_rate": 1.7810973488471882e-05, + "loss": 0.8811, + "step": 2342 + }, + { + "epoch": 0.24, + "grad_norm": 1.570893636073215, + "learning_rate": 1.7808915976161364e-05, + "loss": 0.6923, + "step": 2343 + }, + { + "epoch": 0.24, + "grad_norm": 1.574635587846645, + "learning_rate": 1.780685761631513e-05, + "loss": 0.7827, + "step": 2344 + }, + { + "epoch": 0.24, + "grad_norm": 1.6669870967013605, + "learning_rate": 1.7804798409156592e-05, + "loss": 0.8147, + "step": 2345 + }, + { + "epoch": 0.24, + "grad_norm": 1.7019734330167111, + "learning_rate": 1.7802738354909236e-05, + "loss": 0.8692, + "step": 2346 + }, + { + "epoch": 0.24, + "grad_norm": 1.5679652330468707, + "learning_rate": 1.7800677453796656e-05, + "loss": 0.7307, + "step": 2347 + }, + { + "epoch": 0.24, + "grad_norm": 1.5464299583191818, + "learning_rate": 1.7798615706042525e-05, + "loss": 0.6894, + "step": 2348 + }, + { + "epoch": 0.24, + "grad_norm": 1.5136573641374516, + "learning_rate": 1.7796553111870616e-05, + "loss": 0.7219, + "step": 2349 + }, + { + "epoch": 0.24, + "grad_norm": 1.4193934300294413, + "learning_rate": 1.7794489671504793e-05, + "loss": 0.7493, + "step": 2350 + }, + { + "epoch": 0.24, + "grad_norm": 1.4403542637862665, + "learning_rate": 1.7792425385169006e-05, + "loss": 0.7527, + "step": 2351 + }, + { + "epoch": 0.24, + "grad_norm": 1.7784718114779616, + "learning_rate": 1.7790360253087304e-05, + "loss": 0.6761, + "step": 2352 + }, + { + "epoch": 0.24, + "grad_norm": 1.5741525022384846, + "learning_rate": 1.7788294275483826e-05, + "loss": 0.7737, + "step": 2353 + }, + { + "epoch": 0.24, + "grad_norm": 1.5300340210520655, + "learning_rate": 1.77862274525828e-05, + "loss": 0.7701, + "step": 2354 + }, + { + "epoch": 0.24, + "grad_norm": 1.668962554632878, + "learning_rate": 1.7784159784608543e-05, + "loss": 0.7207, + "step": 2355 + }, + { + "epoch": 0.24, + "grad_norm": 1.4913381646840573, + "learning_rate": 1.7782091271785475e-05, + "loss": 0.6585, + "step": 2356 + }, + { + "epoch": 0.24, + "grad_norm": 1.6479277771557135, + "learning_rate": 1.7780021914338097e-05, + "loss": 0.7769, + "step": 2357 + }, + { + "epoch": 0.24, + "grad_norm": 1.5844220072583104, + "learning_rate": 1.7777951712491007e-05, + "loss": 0.7385, + "step": 2358 + }, + { + "epoch": 0.24, + "grad_norm": 1.5974427028035825, + "learning_rate": 1.7775880666468893e-05, + "loss": 0.7192, + "step": 2359 + }, + { + "epoch": 0.24, + "grad_norm": 1.5697418827414715, + "learning_rate": 1.777380877649653e-05, + "loss": 0.7989, + "step": 2360 + }, + { + "epoch": 0.24, + "grad_norm": 1.6440250115104977, + "learning_rate": 1.7771736042798796e-05, + "loss": 0.8351, + "step": 2361 + }, + { + "epoch": 0.24, + "grad_norm": 1.641336175166559, + "learning_rate": 1.7769662465600648e-05, + "loss": 0.9197, + "step": 2362 + }, + { + "epoch": 0.24, + "grad_norm": 1.4982313511845302, + "learning_rate": 1.776758804512715e-05, + "loss": 0.783, + "step": 2363 + }, + { + "epoch": 0.24, + "grad_norm": 1.6681412333189058, + "learning_rate": 1.7765512781603434e-05, + "loss": 0.8278, + "step": 2364 + }, + { + "epoch": 0.24, + "grad_norm": 1.5493644062862706, + "learning_rate": 1.776343667525475e-05, + "loss": 0.6787, + "step": 2365 + }, + { + "epoch": 0.24, + "grad_norm": 1.6440060561290857, + "learning_rate": 1.7761359726306416e-05, + "loss": 0.682, + "step": 2366 + }, + { + "epoch": 0.24, + "grad_norm": 1.6511456873028476, + "learning_rate": 1.775928193498386e-05, + "loss": 0.8085, + "step": 2367 + }, + { + "epoch": 0.24, + "grad_norm": 1.6045551984799362, + "learning_rate": 1.775720330151259e-05, + "loss": 0.7526, + "step": 2368 + }, + { + "epoch": 0.24, + "grad_norm": 1.5439942621819842, + "learning_rate": 1.7755123826118215e-05, + "loss": 0.8021, + "step": 2369 + }, + { + "epoch": 0.24, + "grad_norm": 1.4369959230768674, + "learning_rate": 1.7753043509026423e-05, + "loss": 0.7637, + "step": 2370 + }, + { + "epoch": 0.24, + "grad_norm": 1.6486282948075461, + "learning_rate": 1.7750962350463005e-05, + "loss": 0.8355, + "step": 2371 + }, + { + "epoch": 0.24, + "grad_norm": 1.6284762608456098, + "learning_rate": 1.774888035065383e-05, + "loss": 0.8257, + "step": 2372 + }, + { + "epoch": 0.24, + "grad_norm": 1.5193088564965311, + "learning_rate": 1.7746797509824875e-05, + "loss": 0.7028, + "step": 2373 + }, + { + "epoch": 0.24, + "grad_norm": 1.4951644348597182, + "learning_rate": 1.7744713828202197e-05, + "loss": 0.7792, + "step": 2374 + }, + { + "epoch": 0.24, + "grad_norm": 1.6373593823681687, + "learning_rate": 1.7742629306011944e-05, + "loss": 0.7483, + "step": 2375 + }, + { + "epoch": 0.24, + "grad_norm": 1.5588424733699444, + "learning_rate": 1.7740543943480366e-05, + "loss": 0.7492, + "step": 2376 + }, + { + "epoch": 0.24, + "grad_norm": 1.5793556324853542, + "learning_rate": 1.7738457740833785e-05, + "loss": 0.7173, + "step": 2377 + }, + { + "epoch": 0.24, + "grad_norm": 1.5671484025185511, + "learning_rate": 1.7736370698298637e-05, + "loss": 0.7549, + "step": 2378 + }, + { + "epoch": 0.24, + "grad_norm": 1.6090619943711126, + "learning_rate": 1.773428281610143e-05, + "loss": 0.8098, + "step": 2379 + }, + { + "epoch": 0.24, + "grad_norm": 1.5578260997102495, + "learning_rate": 1.7732194094468774e-05, + "loss": 0.7443, + "step": 2380 + }, + { + "epoch": 0.24, + "grad_norm": 1.4643681684258554, + "learning_rate": 1.773010453362737e-05, + "loss": 0.6952, + "step": 2381 + }, + { + "epoch": 0.24, + "grad_norm": 1.6443043569457474, + "learning_rate": 1.7728014133804004e-05, + "loss": 0.7982, + "step": 2382 + }, + { + "epoch": 0.24, + "grad_norm": 1.7237283774975798, + "learning_rate": 1.7725922895225554e-05, + "loss": 0.7161, + "step": 2383 + }, + { + "epoch": 0.24, + "grad_norm": 1.5062277487135245, + "learning_rate": 1.7723830818118997e-05, + "loss": 0.8251, + "step": 2384 + }, + { + "epoch": 0.24, + "grad_norm": 1.5078731457102554, + "learning_rate": 1.772173790271139e-05, + "loss": 0.6404, + "step": 2385 + }, + { + "epoch": 0.24, + "grad_norm": 1.5321573853321941, + "learning_rate": 1.7719644149229888e-05, + "loss": 0.7963, + "step": 2386 + }, + { + "epoch": 0.24, + "grad_norm": 1.4672533371132384, + "learning_rate": 1.7717549557901735e-05, + "loss": 0.6917, + "step": 2387 + }, + { + "epoch": 0.24, + "grad_norm": 1.6271594580379487, + "learning_rate": 1.7715454128954265e-05, + "loss": 0.8336, + "step": 2388 + }, + { + "epoch": 0.24, + "grad_norm": 1.5602583919191477, + "learning_rate": 1.771335786261491e-05, + "loss": 0.7374, + "step": 2389 + }, + { + "epoch": 0.24, + "grad_norm": 1.5818657874167723, + "learning_rate": 1.7711260759111176e-05, + "loss": 0.828, + "step": 2390 + }, + { + "epoch": 0.24, + "grad_norm": 1.558116204908089, + "learning_rate": 1.7709162818670682e-05, + "loss": 0.6835, + "step": 2391 + }, + { + "epoch": 0.24, + "grad_norm": 1.7432452033050492, + "learning_rate": 1.770706404152112e-05, + "loss": 0.8202, + "step": 2392 + }, + { + "epoch": 0.24, + "grad_norm": 1.5211509608944633, + "learning_rate": 1.7704964427890283e-05, + "loss": 0.7055, + "step": 2393 + }, + { + "epoch": 0.24, + "grad_norm": 1.5952756302093576, + "learning_rate": 1.770286397800605e-05, + "loss": 0.6794, + "step": 2394 + }, + { + "epoch": 0.24, + "grad_norm": 1.5888107698469187, + "learning_rate": 1.770076269209639e-05, + "loss": 0.7322, + "step": 2395 + }, + { + "epoch": 0.24, + "grad_norm": 1.5430566750829608, + "learning_rate": 1.7698660570389362e-05, + "loss": 0.79, + "step": 2396 + }, + { + "epoch": 0.24, + "grad_norm": 1.657367611936941, + "learning_rate": 1.769655761311313e-05, + "loss": 0.7225, + "step": 2397 + }, + { + "epoch": 0.24, + "grad_norm": 1.5974172027320854, + "learning_rate": 1.7694453820495926e-05, + "loss": 0.8598, + "step": 2398 + }, + { + "epoch": 0.24, + "grad_norm": 1.5034867091666928, + "learning_rate": 1.769234919276609e-05, + "loss": 0.7784, + "step": 2399 + }, + { + "epoch": 0.24, + "grad_norm": 1.5627926366672371, + "learning_rate": 1.7690243730152043e-05, + "loss": 0.8084, + "step": 2400 + }, + { + "epoch": 0.24, + "grad_norm": 1.5433255789976013, + "learning_rate": 1.76881374328823e-05, + "loss": 0.7259, + "step": 2401 + }, + { + "epoch": 0.24, + "grad_norm": 1.4802198654414553, + "learning_rate": 1.7686030301185467e-05, + "loss": 0.7845, + "step": 2402 + }, + { + "epoch": 0.24, + "grad_norm": 1.5150080798011463, + "learning_rate": 1.768392233529024e-05, + "loss": 0.8206, + "step": 2403 + }, + { + "epoch": 0.24, + "grad_norm": 1.443068138380811, + "learning_rate": 1.768181353542541e-05, + "loss": 0.7574, + "step": 2404 + }, + { + "epoch": 0.24, + "grad_norm": 1.6784448140480828, + "learning_rate": 1.7679703901819846e-05, + "loss": 0.7368, + "step": 2405 + }, + { + "epoch": 0.24, + "grad_norm": 1.6090442224123616, + "learning_rate": 1.7677593434702525e-05, + "loss": 0.7393, + "step": 2406 + }, + { + "epoch": 0.24, + "grad_norm": 1.5542583164937114, + "learning_rate": 1.7675482134302503e-05, + "loss": 0.8111, + "step": 2407 + }, + { + "epoch": 0.24, + "grad_norm": 1.471717033418376, + "learning_rate": 1.767337000084892e-05, + "loss": 0.8439, + "step": 2408 + }, + { + "epoch": 0.25, + "grad_norm": 1.7737988262489541, + "learning_rate": 1.7671257034571027e-05, + "loss": 0.726, + "step": 2409 + }, + { + "epoch": 0.25, + "grad_norm": 1.5441726699663372, + "learning_rate": 1.7669143235698147e-05, + "loss": 0.7305, + "step": 2410 + }, + { + "epoch": 0.25, + "grad_norm": 1.6664292775600074, + "learning_rate": 1.7667028604459698e-05, + "loss": 0.8451, + "step": 2411 + }, + { + "epoch": 0.25, + "grad_norm": 1.6825398377815297, + "learning_rate": 1.7664913141085192e-05, + "loss": 0.8154, + "step": 2412 + }, + { + "epoch": 0.25, + "grad_norm": 1.6625355468496055, + "learning_rate": 1.7662796845804237e-05, + "loss": 0.7677, + "step": 2413 + }, + { + "epoch": 0.25, + "grad_norm": 1.6267473749888721, + "learning_rate": 1.766067971884651e-05, + "loss": 0.7411, + "step": 2414 + }, + { + "epoch": 0.25, + "grad_norm": 1.5825823341562621, + "learning_rate": 1.7658561760441803e-05, + "loss": 0.7802, + "step": 2415 + }, + { + "epoch": 0.25, + "grad_norm": 1.6240718740048647, + "learning_rate": 1.765644297081998e-05, + "loss": 0.7313, + "step": 2416 + }, + { + "epoch": 0.25, + "grad_norm": 1.6696875876375654, + "learning_rate": 1.7654323350211008e-05, + "loss": 0.7813, + "step": 2417 + }, + { + "epoch": 0.25, + "grad_norm": 1.5501823094293559, + "learning_rate": 1.7652202898844935e-05, + "loss": 0.7431, + "step": 2418 + }, + { + "epoch": 0.25, + "grad_norm": 1.50758234605498, + "learning_rate": 1.765008161695191e-05, + "loss": 0.6745, + "step": 2419 + }, + { + "epoch": 0.25, + "grad_norm": 1.4204801932106466, + "learning_rate": 1.7647959504762155e-05, + "loss": 0.6802, + "step": 2420 + }, + { + "epoch": 0.25, + "grad_norm": 1.5743749498683806, + "learning_rate": 1.7645836562506e-05, + "loss": 0.7562, + "step": 2421 + }, + { + "epoch": 0.25, + "grad_norm": 1.3687332132205343, + "learning_rate": 1.7643712790413848e-05, + "loss": 0.5944, + "step": 2422 + }, + { + "epoch": 0.25, + "grad_norm": 1.7182075046051772, + "learning_rate": 1.764158818871621e-05, + "loss": 0.7734, + "step": 2423 + }, + { + "epoch": 0.25, + "grad_norm": 1.5626744617095825, + "learning_rate": 1.7639462757643672e-05, + "loss": 0.75, + "step": 2424 + }, + { + "epoch": 0.25, + "grad_norm": 1.5197050336323787, + "learning_rate": 1.7637336497426925e-05, + "loss": 0.7943, + "step": 2425 + }, + { + "epoch": 0.25, + "grad_norm": 1.3619574222041446, + "learning_rate": 1.7635209408296733e-05, + "loss": 0.6312, + "step": 2426 + }, + { + "epoch": 0.25, + "grad_norm": 1.6305415758528246, + "learning_rate": 1.763308149048396e-05, + "loss": 0.888, + "step": 2427 + }, + { + "epoch": 0.25, + "grad_norm": 1.5546492520862543, + "learning_rate": 1.763095274421956e-05, + "loss": 0.8019, + "step": 2428 + }, + { + "epoch": 0.25, + "grad_norm": 1.555215142648058, + "learning_rate": 1.7628823169734573e-05, + "loss": 0.7817, + "step": 2429 + }, + { + "epoch": 0.25, + "grad_norm": 1.511727865036425, + "learning_rate": 1.7626692767260136e-05, + "loss": 0.7247, + "step": 2430 + }, + { + "epoch": 0.25, + "grad_norm": 1.804382355917543, + "learning_rate": 1.7624561537027465e-05, + "loss": 0.8425, + "step": 2431 + }, + { + "epoch": 0.25, + "grad_norm": 1.6648886739946853, + "learning_rate": 1.7622429479267876e-05, + "loss": 0.8356, + "step": 2432 + }, + { + "epoch": 0.25, + "grad_norm": 1.7135054130486669, + "learning_rate": 1.7620296594212766e-05, + "loss": 0.7348, + "step": 2433 + }, + { + "epoch": 0.25, + "grad_norm": 1.4794522885096486, + "learning_rate": 1.761816288209363e-05, + "loss": 0.7983, + "step": 2434 + }, + { + "epoch": 0.25, + "grad_norm": 1.7887338748656285, + "learning_rate": 1.7616028343142048e-05, + "loss": 0.8899, + "step": 2435 + }, + { + "epoch": 0.25, + "grad_norm": 1.630679636137063, + "learning_rate": 1.7613892977589692e-05, + "loss": 0.8243, + "step": 2436 + }, + { + "epoch": 0.25, + "grad_norm": 1.625913634437561, + "learning_rate": 1.7611756785668313e-05, + "loss": 0.823, + "step": 2437 + }, + { + "epoch": 0.25, + "grad_norm": 1.4996521582223123, + "learning_rate": 1.7609619767609782e-05, + "loss": 0.7152, + "step": 2438 + }, + { + "epoch": 0.25, + "grad_norm": 1.6813915627504108, + "learning_rate": 1.760748192364602e-05, + "loss": 0.8351, + "step": 2439 + }, + { + "epoch": 0.25, + "grad_norm": 1.563039082175078, + "learning_rate": 1.7605343254009063e-05, + "loss": 0.82, + "step": 2440 + }, + { + "epoch": 0.25, + "grad_norm": 1.5668253334856597, + "learning_rate": 1.7603203758931027e-05, + "loss": 0.6734, + "step": 2441 + }, + { + "epoch": 0.25, + "grad_norm": 1.64218612244895, + "learning_rate": 1.760106343864413e-05, + "loss": 0.7117, + "step": 2442 + }, + { + "epoch": 0.25, + "grad_norm": 1.4172340689668361, + "learning_rate": 1.759892229338066e-05, + "loss": 0.7138, + "step": 2443 + }, + { + "epoch": 0.25, + "grad_norm": 1.5161626935295773, + "learning_rate": 1.759678032337301e-05, + "loss": 0.7911, + "step": 2444 + }, + { + "epoch": 0.25, + "grad_norm": 1.6555376454889454, + "learning_rate": 1.7594637528853654e-05, + "loss": 0.7448, + "step": 2445 + }, + { + "epoch": 0.25, + "grad_norm": 1.6448450279141316, + "learning_rate": 1.7592493910055165e-05, + "loss": 0.8029, + "step": 2446 + }, + { + "epoch": 0.25, + "grad_norm": 1.6811458558176975, + "learning_rate": 1.759034946721019e-05, + "loss": 0.8106, + "step": 2447 + }, + { + "epoch": 0.25, + "grad_norm": 1.4864518399180122, + "learning_rate": 1.7588204200551486e-05, + "loss": 0.8088, + "step": 2448 + }, + { + "epoch": 0.25, + "grad_norm": 1.685284783331612, + "learning_rate": 1.7586058110311882e-05, + "loss": 0.7551, + "step": 2449 + }, + { + "epoch": 0.25, + "grad_norm": 1.7535255994342807, + "learning_rate": 1.75839111967243e-05, + "loss": 0.7497, + "step": 2450 + }, + { + "epoch": 0.25, + "grad_norm": 1.519266239438577, + "learning_rate": 1.7581763460021758e-05, + "loss": 0.7114, + "step": 2451 + }, + { + "epoch": 0.25, + "grad_norm": 1.5205905936797746, + "learning_rate": 1.757961490043736e-05, + "loss": 0.7277, + "step": 2452 + }, + { + "epoch": 0.25, + "grad_norm": 1.4832968313005075, + "learning_rate": 1.757746551820429e-05, + "loss": 0.6313, + "step": 2453 + }, + { + "epoch": 0.25, + "grad_norm": 1.5906751619604762, + "learning_rate": 1.7575315313555844e-05, + "loss": 0.7671, + "step": 2454 + }, + { + "epoch": 0.25, + "grad_norm": 1.5306138592197545, + "learning_rate": 1.757316428672538e-05, + "loss": 0.8233, + "step": 2455 + }, + { + "epoch": 0.25, + "grad_norm": 1.5420758259052774, + "learning_rate": 1.7571012437946365e-05, + "loss": 0.8207, + "step": 2456 + }, + { + "epoch": 0.25, + "grad_norm": 1.4863632131027524, + "learning_rate": 1.7568859767452347e-05, + "loss": 0.6853, + "step": 2457 + }, + { + "epoch": 0.25, + "grad_norm": 1.5257917166953734, + "learning_rate": 1.7566706275476967e-05, + "loss": 0.6817, + "step": 2458 + }, + { + "epoch": 0.25, + "grad_norm": 1.4743887619029097, + "learning_rate": 1.7564551962253952e-05, + "loss": 0.7811, + "step": 2459 + }, + { + "epoch": 0.25, + "grad_norm": 1.5472620487297755, + "learning_rate": 1.756239682801711e-05, + "loss": 0.7322, + "step": 2460 + }, + { + "epoch": 0.25, + "grad_norm": 1.4691655852007381, + "learning_rate": 1.7560240873000363e-05, + "loss": 0.7562, + "step": 2461 + }, + { + "epoch": 0.25, + "grad_norm": 1.4818530953336322, + "learning_rate": 1.7558084097437697e-05, + "loss": 0.6782, + "step": 2462 + }, + { + "epoch": 0.25, + "grad_norm": 1.5238066160364558, + "learning_rate": 1.7555926501563198e-05, + "loss": 0.749, + "step": 2463 + }, + { + "epoch": 0.25, + "grad_norm": 1.7467509201649687, + "learning_rate": 1.7553768085611033e-05, + "loss": 0.6228, + "step": 2464 + }, + { + "epoch": 0.25, + "grad_norm": 1.6127320429072924, + "learning_rate": 1.7551608849815473e-05, + "loss": 0.796, + "step": 2465 + }, + { + "epoch": 0.25, + "grad_norm": 1.4307340600762173, + "learning_rate": 1.754944879441087e-05, + "loss": 0.7548, + "step": 2466 + }, + { + "epoch": 0.25, + "grad_norm": 1.7693917555197356, + "learning_rate": 1.7547287919631655e-05, + "loss": 0.8992, + "step": 2467 + }, + { + "epoch": 0.25, + "grad_norm": 1.4706406131606666, + "learning_rate": 1.7545126225712366e-05, + "loss": 0.7496, + "step": 2468 + }, + { + "epoch": 0.25, + "grad_norm": 1.6957896599223548, + "learning_rate": 1.7542963712887618e-05, + "loss": 0.6873, + "step": 2469 + }, + { + "epoch": 0.25, + "grad_norm": 1.5260931443330856, + "learning_rate": 1.7540800381392116e-05, + "loss": 0.7522, + "step": 2470 + }, + { + "epoch": 0.25, + "grad_norm": 1.543857764791894, + "learning_rate": 1.753863623146066e-05, + "loss": 0.6656, + "step": 2471 + }, + { + "epoch": 0.25, + "grad_norm": 1.4427949153903654, + "learning_rate": 1.753647126332813e-05, + "loss": 0.6484, + "step": 2472 + }, + { + "epoch": 0.25, + "grad_norm": 1.541434903268351, + "learning_rate": 1.7534305477229502e-05, + "loss": 0.6468, + "step": 2473 + }, + { + "epoch": 0.25, + "grad_norm": 1.5649722223214777, + "learning_rate": 1.7532138873399838e-05, + "loss": 0.7066, + "step": 2474 + }, + { + "epoch": 0.25, + "grad_norm": 1.453940354232897, + "learning_rate": 1.7529971452074288e-05, + "loss": 0.7706, + "step": 2475 + }, + { + "epoch": 0.25, + "grad_norm": 1.4539802973508147, + "learning_rate": 1.752780321348809e-05, + "loss": 0.635, + "step": 2476 + }, + { + "epoch": 0.25, + "grad_norm": 1.5904603944217826, + "learning_rate": 1.752563415787658e-05, + "loss": 0.8014, + "step": 2477 + }, + { + "epoch": 0.25, + "grad_norm": 1.6679468593874847, + "learning_rate": 1.752346428547517e-05, + "loss": 0.8044, + "step": 2478 + }, + { + "epoch": 0.25, + "grad_norm": 1.5745163149114132, + "learning_rate": 1.7521293596519364e-05, + "loss": 0.835, + "step": 2479 + }, + { + "epoch": 0.25, + "grad_norm": 1.6925493416084263, + "learning_rate": 1.7519122091244757e-05, + "loss": 0.7898, + "step": 2480 + }, + { + "epoch": 0.25, + "grad_norm": 1.511155784957144, + "learning_rate": 1.7516949769887032e-05, + "loss": 0.7317, + "step": 2481 + }, + { + "epoch": 0.25, + "grad_norm": 1.6708420971092535, + "learning_rate": 1.7514776632681964e-05, + "loss": 0.8107, + "step": 2482 + }, + { + "epoch": 0.25, + "grad_norm": 1.6091074751165642, + "learning_rate": 1.751260267986541e-05, + "loss": 0.7963, + "step": 2483 + }, + { + "epoch": 0.25, + "grad_norm": 1.6040076390401958, + "learning_rate": 1.751042791167332e-05, + "loss": 0.7685, + "step": 2484 + }, + { + "epoch": 0.25, + "grad_norm": 1.6669016961142142, + "learning_rate": 1.7508252328341726e-05, + "loss": 0.8528, + "step": 2485 + }, + { + "epoch": 0.25, + "grad_norm": 1.5555218509357984, + "learning_rate": 1.7506075930106757e-05, + "loss": 0.7461, + "step": 2486 + }, + { + "epoch": 0.25, + "grad_norm": 1.5085029479729102, + "learning_rate": 1.750389871720463e-05, + "loss": 0.8095, + "step": 2487 + }, + { + "epoch": 0.25, + "grad_norm": 1.647440048436809, + "learning_rate": 1.750172068987165e-05, + "loss": 0.8776, + "step": 2488 + }, + { + "epoch": 0.25, + "grad_norm": 1.5388843511885144, + "learning_rate": 1.7499541848344198e-05, + "loss": 0.7754, + "step": 2489 + }, + { + "epoch": 0.25, + "grad_norm": 1.608332838142146, + "learning_rate": 1.749736219285876e-05, + "loss": 0.8276, + "step": 2490 + }, + { + "epoch": 0.25, + "grad_norm": 1.4894710731185878, + "learning_rate": 1.7495181723651898e-05, + "loss": 0.7, + "step": 2491 + }, + { + "epoch": 0.25, + "grad_norm": 1.4209838347335078, + "learning_rate": 1.7493000440960277e-05, + "loss": 0.6782, + "step": 2492 + }, + { + "epoch": 0.25, + "grad_norm": 1.506093175873103, + "learning_rate": 1.749081834502063e-05, + "loss": 0.6989, + "step": 2493 + }, + { + "epoch": 0.25, + "grad_norm": 1.5791699181274723, + "learning_rate": 1.74886354360698e-05, + "loss": 0.7106, + "step": 2494 + }, + { + "epoch": 0.25, + "grad_norm": 1.5764451475291832, + "learning_rate": 1.74864517143447e-05, + "loss": 0.7963, + "step": 2495 + }, + { + "epoch": 0.25, + "grad_norm": 1.5289101233823514, + "learning_rate": 1.7484267180082343e-05, + "loss": 0.7024, + "step": 2496 + }, + { + "epoch": 0.25, + "grad_norm": 1.605164422817265, + "learning_rate": 1.7482081833519822e-05, + "loss": 0.847, + "step": 2497 + }, + { + "epoch": 0.25, + "grad_norm": 1.730092743675462, + "learning_rate": 1.7479895674894326e-05, + "loss": 0.6953, + "step": 2498 + }, + { + "epoch": 0.25, + "grad_norm": 1.6454716877649915, + "learning_rate": 1.7477708704443125e-05, + "loss": 0.7739, + "step": 2499 + }, + { + "epoch": 0.25, + "grad_norm": 1.8375789056356429, + "learning_rate": 1.7475520922403582e-05, + "loss": 0.877, + "step": 2500 + }, + { + "epoch": 0.25, + "grad_norm": 1.5146743080412426, + "learning_rate": 1.7473332329013152e-05, + "loss": 0.7364, + "step": 2501 + }, + { + "epoch": 0.25, + "grad_norm": 1.4101525134673571, + "learning_rate": 1.7471142924509365e-05, + "loss": 0.6998, + "step": 2502 + }, + { + "epoch": 0.25, + "grad_norm": 1.4413863461289211, + "learning_rate": 1.7468952709129848e-05, + "loss": 0.7836, + "step": 2503 + }, + { + "epoch": 0.25, + "grad_norm": 1.4701548661669526, + "learning_rate": 1.7466761683112316e-05, + "loss": 0.6526, + "step": 2504 + }, + { + "epoch": 0.25, + "grad_norm": 1.529401860380887, + "learning_rate": 1.746456984669457e-05, + "loss": 0.7717, + "step": 2505 + }, + { + "epoch": 0.25, + "grad_norm": 1.3710047924443467, + "learning_rate": 1.7462377200114503e-05, + "loss": 0.6129, + "step": 2506 + }, + { + "epoch": 0.26, + "grad_norm": 1.7241252121013668, + "learning_rate": 1.7460183743610085e-05, + "loss": 0.7578, + "step": 2507 + }, + { + "epoch": 0.26, + "grad_norm": 1.6915304109956375, + "learning_rate": 1.745798947741939e-05, + "loss": 0.7747, + "step": 2508 + }, + { + "epoch": 0.26, + "grad_norm": 1.5089746597984957, + "learning_rate": 1.7455794401780565e-05, + "loss": 0.7323, + "step": 2509 + }, + { + "epoch": 0.26, + "grad_norm": 1.4422751178761635, + "learning_rate": 1.745359851693185e-05, + "loss": 0.7499, + "step": 2510 + }, + { + "epoch": 0.26, + "grad_norm": 1.5399503613199372, + "learning_rate": 1.7451401823111585e-05, + "loss": 0.7636, + "step": 2511 + }, + { + "epoch": 0.26, + "grad_norm": 1.606901545574598, + "learning_rate": 1.7449204320558172e-05, + "loss": 0.7686, + "step": 2512 + }, + { + "epoch": 0.26, + "grad_norm": 1.5511141257902685, + "learning_rate": 1.7447006009510127e-05, + "loss": 0.6931, + "step": 2513 + }, + { + "epoch": 0.26, + "grad_norm": 1.425351819376338, + "learning_rate": 1.7444806890206036e-05, + "loss": 0.7941, + "step": 2514 + }, + { + "epoch": 0.26, + "grad_norm": 1.3726498678601882, + "learning_rate": 1.7442606962884582e-05, + "loss": 0.7392, + "step": 2515 + }, + { + "epoch": 0.26, + "grad_norm": 1.430601606264957, + "learning_rate": 1.744040622778453e-05, + "loss": 0.6102, + "step": 2516 + }, + { + "epoch": 0.26, + "grad_norm": 1.5338334677524086, + "learning_rate": 1.7438204685144733e-05, + "loss": 0.8332, + "step": 2517 + }, + { + "epoch": 0.26, + "grad_norm": 1.5960453889328998, + "learning_rate": 1.7436002335204144e-05, + "loss": 0.8608, + "step": 2518 + }, + { + "epoch": 0.26, + "grad_norm": 1.7945166823710936, + "learning_rate": 1.7433799178201786e-05, + "loss": 0.8269, + "step": 2519 + }, + { + "epoch": 0.26, + "grad_norm": 1.630442107761875, + "learning_rate": 1.743159521437678e-05, + "loss": 0.8059, + "step": 2520 + }, + { + "epoch": 0.26, + "grad_norm": 1.5696632337208958, + "learning_rate": 1.7429390443968327e-05, + "loss": 0.847, + "step": 2521 + }, + { + "epoch": 0.26, + "grad_norm": 1.7247206084773075, + "learning_rate": 1.742718486721573e-05, + "loss": 0.868, + "step": 2522 + }, + { + "epoch": 0.26, + "grad_norm": 1.6226798368416193, + "learning_rate": 1.742497848435836e-05, + "loss": 0.8623, + "step": 2523 + }, + { + "epoch": 0.26, + "grad_norm": 1.5800909522421087, + "learning_rate": 1.742277129563569e-05, + "loss": 0.7399, + "step": 2524 + }, + { + "epoch": 0.26, + "grad_norm": 2.6787372496634005, + "learning_rate": 1.742056330128727e-05, + "loss": 0.6171, + "step": 2525 + }, + { + "epoch": 0.26, + "grad_norm": 1.6313396899700607, + "learning_rate": 1.7418354501552755e-05, + "loss": 0.8866, + "step": 2526 + }, + { + "epoch": 0.26, + "grad_norm": 1.6581527447564648, + "learning_rate": 1.7416144896671868e-05, + "loss": 0.7926, + "step": 2527 + }, + { + "epoch": 0.26, + "grad_norm": 1.4389725304207477, + "learning_rate": 1.7413934486884424e-05, + "loss": 0.751, + "step": 2528 + }, + { + "epoch": 0.26, + "grad_norm": 1.626069977682484, + "learning_rate": 1.7411723272430337e-05, + "loss": 0.7008, + "step": 2529 + }, + { + "epoch": 0.26, + "grad_norm": 1.5656126801302932, + "learning_rate": 1.7409511253549592e-05, + "loss": 0.7889, + "step": 2530 + }, + { + "epoch": 0.26, + "grad_norm": 1.5549251618392606, + "learning_rate": 1.7407298430482272e-05, + "loss": 0.702, + "step": 2531 + }, + { + "epoch": 0.26, + "grad_norm": 1.7298135822967657, + "learning_rate": 1.7405084803468545e-05, + "loss": 0.7869, + "step": 2532 + }, + { + "epoch": 0.26, + "grad_norm": 1.5801692734885895, + "learning_rate": 1.7402870372748667e-05, + "loss": 0.7014, + "step": 2533 + }, + { + "epoch": 0.26, + "grad_norm": 1.585716907263995, + "learning_rate": 1.7400655138562977e-05, + "loss": 0.7708, + "step": 2534 + }, + { + "epoch": 0.26, + "grad_norm": 1.6201214371053156, + "learning_rate": 1.7398439101151908e-05, + "loss": 0.7475, + "step": 2535 + }, + { + "epoch": 0.26, + "grad_norm": 1.5630473760032266, + "learning_rate": 1.7396222260755967e-05, + "loss": 0.7259, + "step": 2536 + }, + { + "epoch": 0.26, + "grad_norm": 1.6177675004741257, + "learning_rate": 1.739400461761577e-05, + "loss": 0.8223, + "step": 2537 + }, + { + "epoch": 0.26, + "grad_norm": 1.5956924130781553, + "learning_rate": 1.7391786171971995e-05, + "loss": 0.8004, + "step": 2538 + }, + { + "epoch": 0.26, + "grad_norm": 1.548599637727936, + "learning_rate": 1.738956692406543e-05, + "loss": 0.7373, + "step": 2539 + }, + { + "epoch": 0.26, + "grad_norm": 1.8006817203863534, + "learning_rate": 1.7387346874136936e-05, + "loss": 0.8084, + "step": 2540 + }, + { + "epoch": 0.26, + "grad_norm": 1.5053069784925723, + "learning_rate": 1.7385126022427462e-05, + "loss": 0.7325, + "step": 2541 + }, + { + "epoch": 0.26, + "grad_norm": 1.6085547890387042, + "learning_rate": 1.738290436917805e-05, + "loss": 0.8145, + "step": 2542 + }, + { + "epoch": 0.26, + "grad_norm": 1.5814550489473806, + "learning_rate": 1.738068191462982e-05, + "loss": 0.7558, + "step": 2543 + }, + { + "epoch": 0.26, + "grad_norm": 1.5549657373912378, + "learning_rate": 1.7378458659023995e-05, + "loss": 0.7423, + "step": 2544 + }, + { + "epoch": 0.26, + "grad_norm": 1.5182984340635106, + "learning_rate": 1.7376234602601862e-05, + "loss": 0.7716, + "step": 2545 + }, + { + "epoch": 0.26, + "grad_norm": 1.6084968643569708, + "learning_rate": 1.737400974560482e-05, + "loss": 0.8375, + "step": 2546 + }, + { + "epoch": 0.26, + "grad_norm": 1.562666012399755, + "learning_rate": 1.7371784088274336e-05, + "loss": 0.7752, + "step": 2547 + }, + { + "epoch": 0.26, + "grad_norm": 1.4105680133899585, + "learning_rate": 1.7369557630851968e-05, + "loss": 0.7848, + "step": 2548 + }, + { + "epoch": 0.26, + "grad_norm": 1.6011744787383677, + "learning_rate": 1.736733037357937e-05, + "loss": 0.8372, + "step": 2549 + }, + { + "epoch": 0.26, + "grad_norm": 1.5803871758323422, + "learning_rate": 1.736510231669827e-05, + "loss": 0.7333, + "step": 2550 + }, + { + "epoch": 0.26, + "grad_norm": 1.5883478183700042, + "learning_rate": 1.736287346045049e-05, + "loss": 0.8632, + "step": 2551 + }, + { + "epoch": 0.26, + "grad_norm": 1.6146855441975676, + "learning_rate": 1.736064380507794e-05, + "loss": 0.7427, + "step": 2552 + }, + { + "epoch": 0.26, + "grad_norm": 1.8021113066776384, + "learning_rate": 1.735841335082261e-05, + "loss": 0.8252, + "step": 2553 + }, + { + "epoch": 0.26, + "grad_norm": 1.5866143850955474, + "learning_rate": 1.7356182097926586e-05, + "loss": 0.7464, + "step": 2554 + }, + { + "epoch": 0.26, + "grad_norm": 1.4130768308137258, + "learning_rate": 1.7353950046632034e-05, + "loss": 0.7745, + "step": 2555 + }, + { + "epoch": 0.26, + "grad_norm": 1.707661990330429, + "learning_rate": 1.7351717197181205e-05, + "loss": 0.7891, + "step": 2556 + }, + { + "epoch": 0.26, + "grad_norm": 1.7951534499962143, + "learning_rate": 1.7349483549816442e-05, + "loss": 0.7286, + "step": 2557 + }, + { + "epoch": 0.26, + "grad_norm": 1.499878405477149, + "learning_rate": 1.7347249104780174e-05, + "loss": 0.6814, + "step": 2558 + }, + { + "epoch": 0.26, + "grad_norm": 1.441729542410146, + "learning_rate": 1.7345013862314915e-05, + "loss": 0.7117, + "step": 2559 + }, + { + "epoch": 0.26, + "grad_norm": 1.485469000960217, + "learning_rate": 1.734277782266326e-05, + "loss": 0.7257, + "step": 2560 + }, + { + "epoch": 0.26, + "grad_norm": 1.6231370649812686, + "learning_rate": 1.7340540986067907e-05, + "loss": 0.8767, + "step": 2561 + }, + { + "epoch": 0.26, + "grad_norm": 1.5556532622432997, + "learning_rate": 1.733830335277162e-05, + "loss": 0.6962, + "step": 2562 + }, + { + "epoch": 0.26, + "grad_norm": 1.5465549577351825, + "learning_rate": 1.733606492301726e-05, + "loss": 0.6955, + "step": 2563 + }, + { + "epoch": 0.26, + "grad_norm": 1.5457692336086464, + "learning_rate": 1.7333825697047778e-05, + "loss": 0.7819, + "step": 2564 + }, + { + "epoch": 0.26, + "grad_norm": 1.5236657144163215, + "learning_rate": 1.7331585675106205e-05, + "loss": 0.8064, + "step": 2565 + }, + { + "epoch": 0.26, + "grad_norm": 1.482979384172734, + "learning_rate": 1.732934485743566e-05, + "loss": 0.6854, + "step": 2566 + }, + { + "epoch": 0.26, + "grad_norm": 1.5310665557094747, + "learning_rate": 1.732710324427935e-05, + "loss": 0.7453, + "step": 2567 + }, + { + "epoch": 0.26, + "grad_norm": 1.5265915332766808, + "learning_rate": 1.7324860835880564e-05, + "loss": 0.7083, + "step": 2568 + }, + { + "epoch": 0.26, + "grad_norm": 1.5640457528269724, + "learning_rate": 1.7322617632482682e-05, + "loss": 0.8216, + "step": 2569 + }, + { + "epoch": 0.26, + "grad_norm": 1.5862358089141146, + "learning_rate": 1.732037363432917e-05, + "loss": 0.7527, + "step": 2570 + }, + { + "epoch": 0.26, + "grad_norm": 1.5229343485201696, + "learning_rate": 1.731812884166358e-05, + "loss": 0.7896, + "step": 2571 + }, + { + "epoch": 0.26, + "grad_norm": 1.4177542192456623, + "learning_rate": 1.7315883254729545e-05, + "loss": 0.7879, + "step": 2572 + }, + { + "epoch": 0.26, + "grad_norm": 1.6326061490316859, + "learning_rate": 1.731363687377079e-05, + "loss": 0.7037, + "step": 2573 + }, + { + "epoch": 0.26, + "grad_norm": 1.601629614231217, + "learning_rate": 1.7311389699031123e-05, + "loss": 0.6777, + "step": 2574 + }, + { + "epoch": 0.26, + "grad_norm": 1.6520831590575529, + "learning_rate": 1.7309141730754445e-05, + "loss": 0.7052, + "step": 2575 + }, + { + "epoch": 0.26, + "grad_norm": 1.619518747814848, + "learning_rate": 1.730689296918473e-05, + "loss": 0.7202, + "step": 2576 + }, + { + "epoch": 0.26, + "grad_norm": 1.4296269965485597, + "learning_rate": 1.7304643414566054e-05, + "loss": 0.6845, + "step": 2577 + }, + { + "epoch": 0.26, + "grad_norm": 1.489642595553068, + "learning_rate": 1.7302393067142567e-05, + "loss": 0.7439, + "step": 2578 + }, + { + "epoch": 0.26, + "grad_norm": 1.5089579003609654, + "learning_rate": 1.7300141927158506e-05, + "loss": 0.6531, + "step": 2579 + }, + { + "epoch": 0.26, + "grad_norm": 1.5543973690671014, + "learning_rate": 1.7297889994858207e-05, + "loss": 0.7204, + "step": 2580 + }, + { + "epoch": 0.26, + "grad_norm": 1.5813435578709207, + "learning_rate": 1.729563727048607e-05, + "loss": 0.7328, + "step": 2581 + }, + { + "epoch": 0.26, + "grad_norm": 1.79196216418814, + "learning_rate": 1.7293383754286597e-05, + "loss": 0.8422, + "step": 2582 + }, + { + "epoch": 0.26, + "grad_norm": 1.4605108163184226, + "learning_rate": 1.7291129446504376e-05, + "loss": 0.7435, + "step": 2583 + }, + { + "epoch": 0.26, + "grad_norm": 1.5836072342624712, + "learning_rate": 1.7288874347384074e-05, + "loss": 0.7172, + "step": 2584 + }, + { + "epoch": 0.26, + "grad_norm": 1.5132036770627308, + "learning_rate": 1.7286618457170445e-05, + "loss": 0.733, + "step": 2585 + }, + { + "epoch": 0.26, + "grad_norm": 1.4698388154169335, + "learning_rate": 1.7284361776108334e-05, + "loss": 0.7593, + "step": 2586 + }, + { + "epoch": 0.26, + "grad_norm": 1.6601052501100197, + "learning_rate": 1.7282104304442665e-05, + "loss": 0.7853, + "step": 2587 + }, + { + "epoch": 0.26, + "grad_norm": 1.6290132199377816, + "learning_rate": 1.7279846042418457e-05, + "loss": 0.8818, + "step": 2588 + }, + { + "epoch": 0.26, + "grad_norm": 1.5030303854427125, + "learning_rate": 1.72775869902808e-05, + "loss": 0.7677, + "step": 2589 + }, + { + "epoch": 0.26, + "grad_norm": 1.7115445257183575, + "learning_rate": 1.727532714827489e-05, + "loss": 0.7636, + "step": 2590 + }, + { + "epoch": 0.26, + "grad_norm": 1.6349488084588633, + "learning_rate": 1.727306651664599e-05, + "loss": 0.85, + "step": 2591 + }, + { + "epoch": 0.26, + "grad_norm": 1.5229765706901655, + "learning_rate": 1.7270805095639453e-05, + "loss": 0.7667, + "step": 2592 + }, + { + "epoch": 0.26, + "grad_norm": 1.4680567877934216, + "learning_rate": 1.726854288550073e-05, + "loss": 0.741, + "step": 2593 + }, + { + "epoch": 0.26, + "grad_norm": 1.5454570479760592, + "learning_rate": 1.726627988647534e-05, + "loss": 0.7619, + "step": 2594 + }, + { + "epoch": 0.26, + "grad_norm": 1.4331477716947236, + "learning_rate": 1.7264016098808904e-05, + "loss": 0.7499, + "step": 2595 + }, + { + "epoch": 0.26, + "grad_norm": 1.6029816212533754, + "learning_rate": 1.7261751522747116e-05, + "loss": 0.735, + "step": 2596 + }, + { + "epoch": 0.26, + "grad_norm": 1.6261320064664497, + "learning_rate": 1.725948615853576e-05, + "loss": 0.8171, + "step": 2597 + }, + { + "epoch": 0.26, + "grad_norm": 1.5537449429626073, + "learning_rate": 1.725722000642071e-05, + "loss": 0.7344, + "step": 2598 + }, + { + "epoch": 0.26, + "grad_norm": 1.7498220500226265, + "learning_rate": 1.7254953066647915e-05, + "loss": 0.8478, + "step": 2599 + }, + { + "epoch": 0.26, + "grad_norm": 1.4349222353392714, + "learning_rate": 1.7252685339463423e-05, + "loss": 0.7856, + "step": 2600 + }, + { + "epoch": 0.26, + "grad_norm": 1.5870205146425946, + "learning_rate": 1.7250416825113355e-05, + "loss": 0.7084, + "step": 2601 + }, + { + "epoch": 0.26, + "grad_norm": 1.5105277145464684, + "learning_rate": 1.7248147523843925e-05, + "loss": 0.7371, + "step": 2602 + }, + { + "epoch": 0.26, + "grad_norm": 1.5539051554492644, + "learning_rate": 1.7245877435901428e-05, + "loss": 0.7413, + "step": 2603 + }, + { + "epoch": 0.26, + "grad_norm": 1.5027853678248158, + "learning_rate": 1.7243606561532247e-05, + "loss": 0.8116, + "step": 2604 + }, + { + "epoch": 0.26, + "grad_norm": 1.6730095843999155, + "learning_rate": 1.7241334900982854e-05, + "loss": 0.7399, + "step": 2605 + }, + { + "epoch": 0.27, + "grad_norm": 1.582166574687787, + "learning_rate": 1.72390624544998e-05, + "loss": 0.7094, + "step": 2606 + }, + { + "epoch": 0.27, + "grad_norm": 1.6502380774099639, + "learning_rate": 1.723678922232972e-05, + "loss": 0.7626, + "step": 2607 + }, + { + "epoch": 0.27, + "grad_norm": 1.666325386956416, + "learning_rate": 1.7234515204719342e-05, + "loss": 0.7526, + "step": 2608 + }, + { + "epoch": 0.27, + "grad_norm": 1.5806104358843491, + "learning_rate": 1.7232240401915473e-05, + "loss": 0.7783, + "step": 2609 + }, + { + "epoch": 0.27, + "grad_norm": 1.5524286203165254, + "learning_rate": 1.722996481416501e-05, + "loss": 0.7713, + "step": 2610 + }, + { + "epoch": 0.27, + "grad_norm": 1.5354166257297275, + "learning_rate": 1.722768844171493e-05, + "loss": 0.7797, + "step": 2611 + }, + { + "epoch": 0.27, + "grad_norm": 1.648540577003657, + "learning_rate": 1.7225411284812294e-05, + "loss": 0.8315, + "step": 2612 + }, + { + "epoch": 0.27, + "grad_norm": 1.4579288968385267, + "learning_rate": 1.722313334370426e-05, + "loss": 0.8666, + "step": 2613 + }, + { + "epoch": 0.27, + "grad_norm": 1.4702550601601378, + "learning_rate": 1.722085461863806e-05, + "loss": 0.6591, + "step": 2614 + }, + { + "epoch": 0.27, + "grad_norm": 1.6201922996801192, + "learning_rate": 1.721857510986101e-05, + "loss": 0.7371, + "step": 2615 + }, + { + "epoch": 0.27, + "grad_norm": 1.5990364634319834, + "learning_rate": 1.7216294817620518e-05, + "loss": 0.8072, + "step": 2616 + }, + { + "epoch": 0.27, + "grad_norm": 1.5437191298534145, + "learning_rate": 1.7214013742164072e-05, + "loss": 0.846, + "step": 2617 + }, + { + "epoch": 0.27, + "grad_norm": 1.5671733942399428, + "learning_rate": 1.7211731883739254e-05, + "loss": 0.7216, + "step": 2618 + }, + { + "epoch": 0.27, + "grad_norm": 1.5207270159074848, + "learning_rate": 1.7209449242593715e-05, + "loss": 0.8302, + "step": 2619 + }, + { + "epoch": 0.27, + "grad_norm": 1.4966851389973288, + "learning_rate": 1.7207165818975206e-05, + "loss": 0.7077, + "step": 2620 + }, + { + "epoch": 0.27, + "grad_norm": 1.6020631870179807, + "learning_rate": 1.720488161313155e-05, + "loss": 0.7177, + "step": 2621 + }, + { + "epoch": 0.27, + "grad_norm": 1.6527642932203361, + "learning_rate": 1.720259662531067e-05, + "loss": 0.8524, + "step": 2622 + }, + { + "epoch": 0.27, + "grad_norm": 1.5292848541794821, + "learning_rate": 1.7200310855760564e-05, + "loss": 0.6932, + "step": 2623 + }, + { + "epoch": 0.27, + "grad_norm": 1.5040037411224556, + "learning_rate": 1.7198024304729314e-05, + "loss": 0.7989, + "step": 2624 + }, + { + "epoch": 0.27, + "grad_norm": 1.6330275027530194, + "learning_rate": 1.7195736972465087e-05, + "loss": 0.7362, + "step": 2625 + }, + { + "epoch": 0.27, + "grad_norm": 1.4973184520025868, + "learning_rate": 1.7193448859216146e-05, + "loss": 0.7976, + "step": 2626 + }, + { + "epoch": 0.27, + "grad_norm": 1.5210898000562127, + "learning_rate": 1.719115996523082e-05, + "loss": 0.8209, + "step": 2627 + }, + { + "epoch": 0.27, + "grad_norm": 2.823776645172069, + "learning_rate": 1.7188870290757536e-05, + "loss": 0.6101, + "step": 2628 + }, + { + "epoch": 0.27, + "grad_norm": 1.3714876037564456, + "learning_rate": 1.7186579836044804e-05, + "loss": 0.7237, + "step": 2629 + }, + { + "epoch": 0.27, + "grad_norm": 1.5517520331930001, + "learning_rate": 1.718428860134122e-05, + "loss": 0.7674, + "step": 2630 + }, + { + "epoch": 0.27, + "grad_norm": 1.5261112683689515, + "learning_rate": 1.7181996586895456e-05, + "loss": 0.7371, + "step": 2631 + }, + { + "epoch": 0.27, + "grad_norm": 1.546182807578598, + "learning_rate": 1.7179703792956276e-05, + "loss": 0.6897, + "step": 2632 + }, + { + "epoch": 0.27, + "grad_norm": 1.6211807619267422, + "learning_rate": 1.7177410219772527e-05, + "loss": 0.7503, + "step": 2633 + }, + { + "epoch": 0.27, + "grad_norm": 1.6640469296407736, + "learning_rate": 1.7175115867593143e-05, + "loss": 0.8524, + "step": 2634 + }, + { + "epoch": 0.27, + "grad_norm": 1.5480571031288723, + "learning_rate": 1.7172820736667133e-05, + "loss": 0.7468, + "step": 2635 + }, + { + "epoch": 0.27, + "grad_norm": 1.685157331961948, + "learning_rate": 1.7170524827243608e-05, + "loss": 0.7039, + "step": 2636 + }, + { + "epoch": 0.27, + "grad_norm": 1.5492580721226499, + "learning_rate": 1.7168228139571744e-05, + "loss": 0.8109, + "step": 2637 + }, + { + "epoch": 0.27, + "grad_norm": 1.4887599314971207, + "learning_rate": 1.7165930673900812e-05, + "loss": 0.7612, + "step": 2638 + }, + { + "epoch": 0.27, + "grad_norm": 1.6865716116863976, + "learning_rate": 1.7163632430480172e-05, + "loss": 0.7688, + "step": 2639 + }, + { + "epoch": 0.27, + "grad_norm": 1.6204207493635825, + "learning_rate": 1.7161333409559256e-05, + "loss": 0.7999, + "step": 2640 + }, + { + "epoch": 0.27, + "grad_norm": 1.5023197048656975, + "learning_rate": 1.715903361138759e-05, + "loss": 0.7147, + "step": 2641 + }, + { + "epoch": 0.27, + "grad_norm": 1.4427185908059887, + "learning_rate": 1.715673303621478e-05, + "loss": 0.6728, + "step": 2642 + }, + { + "epoch": 0.27, + "grad_norm": 1.4589691465012895, + "learning_rate": 1.715443168429052e-05, + "loss": 0.8163, + "step": 2643 + }, + { + "epoch": 0.27, + "grad_norm": 1.622314886789072, + "learning_rate": 1.7152129555864583e-05, + "loss": 0.7668, + "step": 2644 + }, + { + "epoch": 0.27, + "grad_norm": 1.5875141396662342, + "learning_rate": 1.7149826651186828e-05, + "loss": 0.7138, + "step": 2645 + }, + { + "epoch": 0.27, + "grad_norm": 1.5819705613379111, + "learning_rate": 1.71475229705072e-05, + "loss": 0.7547, + "step": 2646 + }, + { + "epoch": 0.27, + "grad_norm": 1.5286856655549894, + "learning_rate": 1.714521851407573e-05, + "loss": 0.6983, + "step": 2647 + }, + { + "epoch": 0.27, + "grad_norm": 1.6680424585241245, + "learning_rate": 1.7142913282142528e-05, + "loss": 0.8304, + "step": 2648 + }, + { + "epoch": 0.27, + "grad_norm": 1.3949753968844358, + "learning_rate": 1.7140607274957793e-05, + "loss": 0.7176, + "step": 2649 + }, + { + "epoch": 0.27, + "grad_norm": 1.7513081690313692, + "learning_rate": 1.7138300492771803e-05, + "loss": 0.7674, + "step": 2650 + }, + { + "epoch": 0.27, + "grad_norm": 1.5134706541956857, + "learning_rate": 1.7135992935834927e-05, + "loss": 0.6783, + "step": 2651 + }, + { + "epoch": 0.27, + "grad_norm": 1.749163817172862, + "learning_rate": 1.7133684604397613e-05, + "loss": 0.7608, + "step": 2652 + }, + { + "epoch": 0.27, + "grad_norm": 1.4252020954515696, + "learning_rate": 1.713137549871039e-05, + "loss": 0.764, + "step": 2653 + }, + { + "epoch": 0.27, + "grad_norm": 1.6358995582805185, + "learning_rate": 1.712906561902388e-05, + "loss": 0.7249, + "step": 2654 + }, + { + "epoch": 0.27, + "grad_norm": 1.535740650980999, + "learning_rate": 1.7126754965588785e-05, + "loss": 0.6888, + "step": 2655 + }, + { + "epoch": 0.27, + "grad_norm": 1.2716763178954538, + "learning_rate": 1.7124443538655887e-05, + "loss": 0.6396, + "step": 2656 + }, + { + "epoch": 0.27, + "grad_norm": 1.5575463059511079, + "learning_rate": 1.7122131338476058e-05, + "loss": 0.6889, + "step": 2657 + }, + { + "epoch": 0.27, + "grad_norm": 1.5997207855607138, + "learning_rate": 1.7119818365300246e-05, + "loss": 0.735, + "step": 2658 + }, + { + "epoch": 0.27, + "grad_norm": 1.65220121537614, + "learning_rate": 1.7117504619379498e-05, + "loss": 0.7919, + "step": 2659 + }, + { + "epoch": 0.27, + "grad_norm": 1.5541586302274732, + "learning_rate": 1.7115190100964926e-05, + "loss": 0.7166, + "step": 2660 + }, + { + "epoch": 0.27, + "grad_norm": 1.4452623372732751, + "learning_rate": 1.7112874810307736e-05, + "loss": 0.68, + "step": 2661 + }, + { + "epoch": 0.27, + "grad_norm": 1.5159176357495028, + "learning_rate": 1.711055874765922e-05, + "loss": 0.8157, + "step": 2662 + }, + { + "epoch": 0.27, + "grad_norm": 1.5363560553715614, + "learning_rate": 1.710824191327075e-05, + "loss": 0.7492, + "step": 2663 + }, + { + "epoch": 0.27, + "grad_norm": 1.7141688792271914, + "learning_rate": 1.7105924307393776e-05, + "loss": 0.7959, + "step": 2664 + }, + { + "epoch": 0.27, + "grad_norm": 1.6963032092016188, + "learning_rate": 1.7103605930279847e-05, + "loss": 0.789, + "step": 2665 + }, + { + "epoch": 0.27, + "grad_norm": 1.570563090841888, + "learning_rate": 1.7101286782180585e-05, + "loss": 0.7145, + "step": 2666 + }, + { + "epoch": 0.27, + "grad_norm": 1.5015102151992468, + "learning_rate": 1.7098966863347687e-05, + "loss": 0.7126, + "step": 2667 + }, + { + "epoch": 0.27, + "grad_norm": 1.523283582923896, + "learning_rate": 1.7096646174032955e-05, + "loss": 0.7493, + "step": 2668 + }, + { + "epoch": 0.27, + "grad_norm": 1.6155367999750465, + "learning_rate": 1.7094324714488258e-05, + "loss": 0.7459, + "step": 2669 + }, + { + "epoch": 0.27, + "grad_norm": 1.4925118200717544, + "learning_rate": 1.709200248496556e-05, + "loss": 0.8062, + "step": 2670 + }, + { + "epoch": 0.27, + "grad_norm": 1.4778170137928905, + "learning_rate": 1.7089679485716897e-05, + "loss": 0.7196, + "step": 2671 + }, + { + "epoch": 0.27, + "grad_norm": 1.4904397859832723, + "learning_rate": 1.7087355716994398e-05, + "loss": 0.6981, + "step": 2672 + }, + { + "epoch": 0.27, + "grad_norm": 1.7317550031169902, + "learning_rate": 1.7085031179050268e-05, + "loss": 0.7211, + "step": 2673 + }, + { + "epoch": 0.27, + "grad_norm": 1.489607713671086, + "learning_rate": 1.7082705872136797e-05, + "loss": 0.7409, + "step": 2674 + }, + { + "epoch": 0.27, + "grad_norm": 1.516574391694995, + "learning_rate": 1.708037979650637e-05, + "loss": 0.7761, + "step": 2675 + }, + { + "epoch": 0.27, + "grad_norm": 1.6355128601376654, + "learning_rate": 1.707805295241144e-05, + "loss": 0.8915, + "step": 2676 + }, + { + "epoch": 0.27, + "grad_norm": 1.5304123605760016, + "learning_rate": 1.707572534010455e-05, + "loss": 0.849, + "step": 2677 + }, + { + "epoch": 0.27, + "grad_norm": 1.5575421176033872, + "learning_rate": 1.7073396959838326e-05, + "loss": 0.724, + "step": 2678 + }, + { + "epoch": 0.27, + "grad_norm": 1.696531813138858, + "learning_rate": 1.7071067811865477e-05, + "loss": 0.7578, + "step": 2679 + }, + { + "epoch": 0.27, + "grad_norm": 1.6790816568746896, + "learning_rate": 1.7068737896438796e-05, + "loss": 0.8616, + "step": 2680 + }, + { + "epoch": 0.27, + "grad_norm": 1.7031711209522635, + "learning_rate": 1.706640721381116e-05, + "loss": 0.7563, + "step": 2681 + }, + { + "epoch": 0.27, + "grad_norm": 1.4519377943836558, + "learning_rate": 1.7064075764235527e-05, + "loss": 0.6649, + "step": 2682 + }, + { + "epoch": 0.27, + "grad_norm": 1.6649570253386337, + "learning_rate": 1.706174354796494e-05, + "loss": 0.8154, + "step": 2683 + }, + { + "epoch": 0.27, + "grad_norm": 1.620028413852496, + "learning_rate": 1.7059410565252525e-05, + "loss": 0.7565, + "step": 2684 + }, + { + "epoch": 0.27, + "grad_norm": 1.355628318623081, + "learning_rate": 1.7057076816351487e-05, + "loss": 0.5766, + "step": 2685 + }, + { + "epoch": 0.27, + "grad_norm": 1.4905400260550263, + "learning_rate": 1.7054742301515123e-05, + "loss": 0.6522, + "step": 2686 + }, + { + "epoch": 0.27, + "grad_norm": 1.6314227432654287, + "learning_rate": 1.7052407020996804e-05, + "loss": 0.7687, + "step": 2687 + }, + { + "epoch": 0.27, + "grad_norm": 1.6353452003075364, + "learning_rate": 1.7050070975049993e-05, + "loss": 0.7895, + "step": 2688 + }, + { + "epoch": 0.27, + "grad_norm": 1.3951006709312348, + "learning_rate": 1.7047734163928227e-05, + "loss": 0.7829, + "step": 2689 + }, + { + "epoch": 0.27, + "grad_norm": 1.6440774612884574, + "learning_rate": 1.704539658788513e-05, + "loss": 0.8283, + "step": 2690 + }, + { + "epoch": 0.27, + "grad_norm": 1.4537194274268685, + "learning_rate": 1.7043058247174414e-05, + "loss": 0.639, + "step": 2691 + }, + { + "epoch": 0.27, + "grad_norm": 1.6083947454220908, + "learning_rate": 1.7040719142049864e-05, + "loss": 0.6641, + "step": 2692 + }, + { + "epoch": 0.27, + "grad_norm": 1.6615547545200424, + "learning_rate": 1.7038379272765357e-05, + "loss": 0.7293, + "step": 2693 + }, + { + "epoch": 0.27, + "grad_norm": 1.5466180268091978, + "learning_rate": 1.703603863957485e-05, + "loss": 0.7146, + "step": 2694 + }, + { + "epoch": 0.27, + "grad_norm": 1.5668440062466291, + "learning_rate": 1.7033697242732376e-05, + "loss": 0.8263, + "step": 2695 + }, + { + "epoch": 0.27, + "grad_norm": 1.7239135169274935, + "learning_rate": 1.7031355082492066e-05, + "loss": 0.8717, + "step": 2696 + }, + { + "epoch": 0.27, + "grad_norm": 1.484778336143205, + "learning_rate": 1.7029012159108114e-05, + "loss": 0.7309, + "step": 2697 + }, + { + "epoch": 0.27, + "grad_norm": 1.550338169624176, + "learning_rate": 1.7026668472834818e-05, + "loss": 0.7148, + "step": 2698 + }, + { + "epoch": 0.27, + "grad_norm": 1.4474445970065117, + "learning_rate": 1.702432402392654e-05, + "loss": 0.7862, + "step": 2699 + }, + { + "epoch": 0.27, + "grad_norm": 1.61847983913502, + "learning_rate": 1.7021978812637742e-05, + "loss": 0.7613, + "step": 2700 + }, + { + "epoch": 0.27, + "grad_norm": 1.5619155946150487, + "learning_rate": 1.7019632839222954e-05, + "loss": 0.8718, + "step": 2701 + }, + { + "epoch": 0.27, + "grad_norm": 1.5338774499992636, + "learning_rate": 1.7017286103936793e-05, + "loss": 0.8118, + "step": 2702 + }, + { + "epoch": 0.27, + "grad_norm": 1.3798929312784296, + "learning_rate": 1.7014938607033967e-05, + "loss": 0.7618, + "step": 2703 + }, + { + "epoch": 0.28, + "grad_norm": 1.6002922978466065, + "learning_rate": 1.7012590348769255e-05, + "loss": 0.7193, + "step": 2704 + }, + { + "epoch": 0.28, + "grad_norm": 1.4831134418629623, + "learning_rate": 1.7010241329397524e-05, + "loss": 0.7847, + "step": 2705 + }, + { + "epoch": 0.28, + "grad_norm": 1.4187030989217948, + "learning_rate": 1.700789154917373e-05, + "loss": 0.6872, + "step": 2706 + }, + { + "epoch": 0.28, + "grad_norm": 1.5502049567007963, + "learning_rate": 1.7005541008352894e-05, + "loss": 0.6629, + "step": 2707 + }, + { + "epoch": 0.28, + "grad_norm": 1.562381202598881, + "learning_rate": 1.700318970719014e-05, + "loss": 0.7573, + "step": 2708 + }, + { + "epoch": 0.28, + "grad_norm": 1.4838134273125427, + "learning_rate": 1.7000837645940654e-05, + "loss": 0.7006, + "step": 2709 + }, + { + "epoch": 0.28, + "grad_norm": 1.562643798332321, + "learning_rate": 1.6998484824859728e-05, + "loss": 0.8275, + "step": 2710 + }, + { + "epoch": 0.28, + "grad_norm": 1.4447304429418308, + "learning_rate": 1.699613124420272e-05, + "loss": 0.7452, + "step": 2711 + }, + { + "epoch": 0.28, + "grad_norm": 1.4520248132390094, + "learning_rate": 1.699377690422507e-05, + "loss": 0.794, + "step": 2712 + }, + { + "epoch": 0.28, + "grad_norm": 1.5650615558288783, + "learning_rate": 1.6991421805182305e-05, + "loss": 0.8639, + "step": 2713 + }, + { + "epoch": 0.28, + "grad_norm": 1.5385647957238642, + "learning_rate": 1.6989065947330038e-05, + "loss": 0.7816, + "step": 2714 + }, + { + "epoch": 0.28, + "grad_norm": 1.605033676576137, + "learning_rate": 1.6986709330923964e-05, + "loss": 0.7617, + "step": 2715 + }, + { + "epoch": 0.28, + "grad_norm": 1.5254040652379102, + "learning_rate": 1.6984351956219847e-05, + "loss": 0.7744, + "step": 2716 + }, + { + "epoch": 0.28, + "grad_norm": 1.5370041367141265, + "learning_rate": 1.698199382347355e-05, + "loss": 0.7817, + "step": 2717 + }, + { + "epoch": 0.28, + "grad_norm": 1.5591871015222685, + "learning_rate": 1.697963493294101e-05, + "loss": 0.7608, + "step": 2718 + }, + { + "epoch": 0.28, + "grad_norm": 1.5268100390967951, + "learning_rate": 1.6977275284878243e-05, + "loss": 0.7932, + "step": 2719 + }, + { + "epoch": 0.28, + "grad_norm": 1.5310722382386455, + "learning_rate": 1.697491487954136e-05, + "loss": 0.7739, + "step": 2720 + }, + { + "epoch": 0.28, + "grad_norm": 1.5020818896731902, + "learning_rate": 1.6972553717186543e-05, + "loss": 0.7654, + "step": 2721 + }, + { + "epoch": 0.28, + "grad_norm": 1.6285379518225092, + "learning_rate": 1.6970191798070056e-05, + "loss": 0.8342, + "step": 2722 + }, + { + "epoch": 0.28, + "grad_norm": 1.5214473900346936, + "learning_rate": 1.6967829122448255e-05, + "loss": 0.7637, + "step": 2723 + }, + { + "epoch": 0.28, + "grad_norm": 1.6437973758314557, + "learning_rate": 1.6965465690577562e-05, + "loss": 0.6452, + "step": 2724 + }, + { + "epoch": 0.28, + "grad_norm": 1.5907319901063022, + "learning_rate": 1.69631015027145e-05, + "loss": 0.7918, + "step": 2725 + }, + { + "epoch": 0.28, + "grad_norm": 1.6153610345004459, + "learning_rate": 1.696073655911566e-05, + "loss": 0.6448, + "step": 2726 + }, + { + "epoch": 0.28, + "grad_norm": 1.6265533051979708, + "learning_rate": 1.695837086003772e-05, + "loss": 0.7247, + "step": 2727 + }, + { + "epoch": 0.28, + "grad_norm": 1.7536067425885253, + "learning_rate": 1.695600440573744e-05, + "loss": 0.8441, + "step": 2728 + }, + { + "epoch": 0.28, + "grad_norm": 1.4307599485709788, + "learning_rate": 1.6953637196471658e-05, + "loss": 0.6191, + "step": 2729 + }, + { + "epoch": 0.28, + "grad_norm": 1.369399874389617, + "learning_rate": 1.6951269232497307e-05, + "loss": 0.7959, + "step": 2730 + }, + { + "epoch": 0.28, + "grad_norm": 1.5347249065403918, + "learning_rate": 1.6948900514071383e-05, + "loss": 0.7254, + "step": 2731 + }, + { + "epoch": 0.28, + "grad_norm": 1.569237032596756, + "learning_rate": 1.6946531041450976e-05, + "loss": 0.843, + "step": 2732 + }, + { + "epoch": 0.28, + "grad_norm": 1.4274347398074667, + "learning_rate": 1.694416081489326e-05, + "loss": 0.7344, + "step": 2733 + }, + { + "epoch": 0.28, + "grad_norm": 1.5881966084737997, + "learning_rate": 1.694178983465548e-05, + "loss": 0.7619, + "step": 2734 + }, + { + "epoch": 0.28, + "grad_norm": 1.4842518701692677, + "learning_rate": 1.6939418100994972e-05, + "loss": 0.6843, + "step": 2735 + }, + { + "epoch": 0.28, + "grad_norm": 1.409863595614964, + "learning_rate": 1.6937045614169147e-05, + "loss": 0.6616, + "step": 2736 + }, + { + "epoch": 0.28, + "grad_norm": 1.7392852185664112, + "learning_rate": 1.6934672374435504e-05, + "loss": 0.7943, + "step": 2737 + }, + { + "epoch": 0.28, + "grad_norm": 1.6435124089413926, + "learning_rate": 1.693229838205162e-05, + "loss": 0.7713, + "step": 2738 + }, + { + "epoch": 0.28, + "grad_norm": 1.4599316988101887, + "learning_rate": 1.692992363727516e-05, + "loss": 0.8101, + "step": 2739 + }, + { + "epoch": 0.28, + "grad_norm": 1.489997692489692, + "learning_rate": 1.6927548140363857e-05, + "loss": 0.7271, + "step": 2740 + }, + { + "epoch": 0.28, + "grad_norm": 1.62368929242668, + "learning_rate": 1.692517189157554e-05, + "loss": 0.8319, + "step": 2741 + }, + { + "epoch": 0.28, + "grad_norm": 1.6038961839172665, + "learning_rate": 1.6922794891168114e-05, + "loss": 0.6931, + "step": 2742 + }, + { + "epoch": 0.28, + "grad_norm": 1.6660621188481828, + "learning_rate": 1.6920417139399558e-05, + "loss": 0.7983, + "step": 2743 + }, + { + "epoch": 0.28, + "grad_norm": 1.6005730767318995, + "learning_rate": 1.6918038636527947e-05, + "loss": 0.7519, + "step": 2744 + }, + { + "epoch": 0.28, + "grad_norm": 1.5824965838797156, + "learning_rate": 1.691565938281143e-05, + "loss": 0.8057, + "step": 2745 + }, + { + "epoch": 0.28, + "grad_norm": 1.6253194656473673, + "learning_rate": 1.691327937850823e-05, + "loss": 0.776, + "step": 2746 + }, + { + "epoch": 0.28, + "grad_norm": 1.5283801314123524, + "learning_rate": 1.691089862387667e-05, + "loss": 0.6774, + "step": 2747 + }, + { + "epoch": 0.28, + "grad_norm": 1.4364607171702077, + "learning_rate": 1.690851711917514e-05, + "loss": 0.7053, + "step": 2748 + }, + { + "epoch": 0.28, + "grad_norm": 1.4031659843312712, + "learning_rate": 1.690613486466211e-05, + "loss": 0.7023, + "step": 2749 + }, + { + "epoch": 0.28, + "grad_norm": 1.5841263671334511, + "learning_rate": 1.6903751860596142e-05, + "loss": 0.7127, + "step": 2750 + }, + { + "epoch": 0.28, + "grad_norm": 1.6482604897965214, + "learning_rate": 1.690136810723587e-05, + "loss": 0.8546, + "step": 2751 + }, + { + "epoch": 0.28, + "grad_norm": 1.4450313742005596, + "learning_rate": 1.689898360484002e-05, + "loss": 0.8229, + "step": 2752 + }, + { + "epoch": 0.28, + "grad_norm": 1.6073793491502968, + "learning_rate": 1.6896598353667384e-05, + "loss": 0.7387, + "step": 2753 + }, + { + "epoch": 0.28, + "grad_norm": 1.7321541961311, + "learning_rate": 1.689421235397685e-05, + "loss": 0.7184, + "step": 2754 + }, + { + "epoch": 0.28, + "grad_norm": 1.595811929094905, + "learning_rate": 1.6891825606027376e-05, + "loss": 0.8199, + "step": 2755 + }, + { + "epoch": 0.28, + "grad_norm": 1.653256316090929, + "learning_rate": 1.688943811007801e-05, + "loss": 0.7459, + "step": 2756 + }, + { + "epoch": 0.28, + "grad_norm": 1.3943982750909278, + "learning_rate": 1.6887049866387874e-05, + "loss": 0.7861, + "step": 2757 + }, + { + "epoch": 0.28, + "grad_norm": 1.833818403056107, + "learning_rate": 1.6884660875216178e-05, + "loss": 0.8094, + "step": 2758 + }, + { + "epoch": 0.28, + "grad_norm": 1.62709212438958, + "learning_rate": 1.6882271136822204e-05, + "loss": 0.8247, + "step": 2759 + }, + { + "epoch": 0.28, + "grad_norm": 1.4804818862878864, + "learning_rate": 1.687988065146533e-05, + "loss": 0.7697, + "step": 2760 + }, + { + "epoch": 0.28, + "grad_norm": 1.535185488000713, + "learning_rate": 1.6877489419405e-05, + "loss": 0.7532, + "step": 2761 + }, + { + "epoch": 0.28, + "grad_norm": 1.4658145091412107, + "learning_rate": 1.6875097440900746e-05, + "loss": 0.7117, + "step": 2762 + }, + { + "epoch": 0.28, + "grad_norm": 1.476554530620898, + "learning_rate": 1.6872704716212178e-05, + "loss": 0.696, + "step": 2763 + }, + { + "epoch": 0.28, + "grad_norm": 1.4988545882011124, + "learning_rate": 1.6870311245598992e-05, + "loss": 0.8044, + "step": 2764 + }, + { + "epoch": 0.28, + "grad_norm": 1.4127927069516595, + "learning_rate": 1.6867917029320958e-05, + "loss": 0.8519, + "step": 2765 + }, + { + "epoch": 0.28, + "grad_norm": 1.3757450216152427, + "learning_rate": 1.6865522067637932e-05, + "loss": 0.6608, + "step": 2766 + }, + { + "epoch": 0.28, + "grad_norm": 1.4416391583154726, + "learning_rate": 1.686312636080985e-05, + "loss": 0.6738, + "step": 2767 + }, + { + "epoch": 0.28, + "grad_norm": 1.4930160707420024, + "learning_rate": 1.686072990909673e-05, + "loss": 0.7284, + "step": 2768 + }, + { + "epoch": 0.28, + "grad_norm": 1.6919012095067527, + "learning_rate": 1.6858332712758667e-05, + "loss": 0.7883, + "step": 2769 + }, + { + "epoch": 0.28, + "grad_norm": 1.4702667160247174, + "learning_rate": 1.6855934772055843e-05, + "loss": 0.7797, + "step": 2770 + }, + { + "epoch": 0.28, + "grad_norm": 1.4828486258168854, + "learning_rate": 1.6853536087248512e-05, + "loss": 0.8487, + "step": 2771 + }, + { + "epoch": 0.28, + "grad_norm": 1.6255330339109715, + "learning_rate": 1.6851136658597014e-05, + "loss": 0.7718, + "step": 2772 + }, + { + "epoch": 0.28, + "grad_norm": 1.5772887860309583, + "learning_rate": 1.6848736486361777e-05, + "loss": 0.6948, + "step": 2773 + }, + { + "epoch": 0.28, + "grad_norm": 1.7697523320924025, + "learning_rate": 1.6846335570803293e-05, + "loss": 0.8107, + "step": 2774 + }, + { + "epoch": 0.28, + "grad_norm": 1.575249049219454, + "learning_rate": 1.6843933912182148e-05, + "loss": 0.7582, + "step": 2775 + }, + { + "epoch": 0.28, + "grad_norm": 1.6035336350257257, + "learning_rate": 1.6841531510759007e-05, + "loss": 0.7807, + "step": 2776 + }, + { + "epoch": 0.28, + "grad_norm": 1.5277565031061031, + "learning_rate": 1.6839128366794603e-05, + "loss": 0.8003, + "step": 2777 + }, + { + "epoch": 0.28, + "grad_norm": 1.665939020688003, + "learning_rate": 1.6836724480549773e-05, + "loss": 0.7279, + "step": 2778 + }, + { + "epoch": 0.28, + "grad_norm": 1.649265018826581, + "learning_rate": 1.6834319852285413e-05, + "loss": 0.7026, + "step": 2779 + }, + { + "epoch": 0.28, + "grad_norm": 1.5585497816292266, + "learning_rate": 1.6831914482262514e-05, + "loss": 0.7922, + "step": 2780 + }, + { + "epoch": 0.28, + "grad_norm": 1.5025288970479587, + "learning_rate": 1.682950837074213e-05, + "loss": 0.6813, + "step": 2781 + }, + { + "epoch": 0.28, + "grad_norm": 1.703071819086007, + "learning_rate": 1.6827101517985417e-05, + "loss": 0.8524, + "step": 2782 + }, + { + "epoch": 0.28, + "grad_norm": 1.483095775477807, + "learning_rate": 1.6824693924253596e-05, + "loss": 0.6962, + "step": 2783 + }, + { + "epoch": 0.28, + "grad_norm": 1.48203059260217, + "learning_rate": 1.682228558980798e-05, + "loss": 0.768, + "step": 2784 + }, + { + "epoch": 0.28, + "grad_norm": 1.6182378066924379, + "learning_rate": 1.681987651490995e-05, + "loss": 0.7338, + "step": 2785 + }, + { + "epoch": 0.28, + "grad_norm": 1.5946567346322944, + "learning_rate": 1.6817466699820976e-05, + "loss": 0.7571, + "step": 2786 + }, + { + "epoch": 0.28, + "grad_norm": 1.4595787577320416, + "learning_rate": 1.6815056144802604e-05, + "loss": 0.7257, + "step": 2787 + }, + { + "epoch": 0.28, + "grad_norm": 1.7789185563151684, + "learning_rate": 1.681264485011646e-05, + "loss": 0.6834, + "step": 2788 + }, + { + "epoch": 0.28, + "grad_norm": 1.5248080417971017, + "learning_rate": 1.681023281602426e-05, + "loss": 0.7149, + "step": 2789 + }, + { + "epoch": 0.28, + "grad_norm": 1.425715654156925, + "learning_rate": 1.6807820042787788e-05, + "loss": 0.7185, + "step": 2790 + }, + { + "epoch": 0.28, + "grad_norm": 1.4767606646263185, + "learning_rate": 1.680540653066891e-05, + "loss": 0.8509, + "step": 2791 + }, + { + "epoch": 0.28, + "grad_norm": 1.694444133436615, + "learning_rate": 1.6802992279929583e-05, + "loss": 0.8359, + "step": 2792 + }, + { + "epoch": 0.28, + "grad_norm": 1.6278746176394097, + "learning_rate": 1.6800577290831825e-05, + "loss": 0.698, + "step": 2793 + }, + { + "epoch": 0.28, + "grad_norm": 1.4507274176318135, + "learning_rate": 1.6798161563637754e-05, + "loss": 0.689, + "step": 2794 + }, + { + "epoch": 0.28, + "grad_norm": 1.4968155686337008, + "learning_rate": 1.6795745098609555e-05, + "loss": 0.6953, + "step": 2795 + }, + { + "epoch": 0.28, + "grad_norm": 1.4824801508895227, + "learning_rate": 1.6793327896009498e-05, + "loss": 0.6898, + "step": 2796 + }, + { + "epoch": 0.28, + "grad_norm": 1.6046740497702996, + "learning_rate": 1.6790909956099935e-05, + "loss": 0.6818, + "step": 2797 + }, + { + "epoch": 0.28, + "grad_norm": 1.5459818527277547, + "learning_rate": 1.6788491279143298e-05, + "loss": 0.682, + "step": 2798 + }, + { + "epoch": 0.28, + "grad_norm": 1.4972000594763024, + "learning_rate": 1.6786071865402086e-05, + "loss": 0.7576, + "step": 2799 + }, + { + "epoch": 0.28, + "grad_norm": 1.560378258512186, + "learning_rate": 1.6783651715138902e-05, + "loss": 0.7076, + "step": 2800 + }, + { + "epoch": 0.28, + "grad_norm": 1.6391118154486723, + "learning_rate": 1.6781230828616404e-05, + "loss": 0.8201, + "step": 2801 + }, + { + "epoch": 0.29, + "grad_norm": 1.6916916979306973, + "learning_rate": 1.6778809206097347e-05, + "loss": 0.8081, + "step": 2802 + }, + { + "epoch": 0.29, + "grad_norm": 1.6874205828232405, + "learning_rate": 1.6776386847844563e-05, + "loss": 0.7184, + "step": 2803 + }, + { + "epoch": 0.29, + "grad_norm": 1.458018963147143, + "learning_rate": 1.6773963754120952e-05, + "loss": 0.7259, + "step": 2804 + }, + { + "epoch": 0.29, + "grad_norm": 1.6070999871208667, + "learning_rate": 1.6771539925189514e-05, + "loss": 0.7739, + "step": 2805 + }, + { + "epoch": 0.29, + "grad_norm": 1.666769768470216, + "learning_rate": 1.6769115361313308e-05, + "loss": 0.7305, + "step": 2806 + }, + { + "epoch": 0.29, + "grad_norm": 1.513886999979369, + "learning_rate": 1.6766690062755488e-05, + "loss": 0.6507, + "step": 2807 + }, + { + "epoch": 0.29, + "grad_norm": 1.5865848169441699, + "learning_rate": 1.676426402977928e-05, + "loss": 0.6846, + "step": 2808 + }, + { + "epoch": 0.29, + "grad_norm": 1.5473956402439422, + "learning_rate": 1.6761837262647994e-05, + "loss": 0.7615, + "step": 2809 + }, + { + "epoch": 0.29, + "grad_norm": 1.7061180332352512, + "learning_rate": 1.6759409761625015e-05, + "loss": 0.7268, + "step": 2810 + }, + { + "epoch": 0.29, + "grad_norm": 1.5375806434832613, + "learning_rate": 1.6756981526973813e-05, + "loss": 0.7344, + "step": 2811 + }, + { + "epoch": 0.29, + "grad_norm": 1.626081660308829, + "learning_rate": 1.6754552558957936e-05, + "loss": 0.7418, + "step": 2812 + }, + { + "epoch": 0.29, + "grad_norm": 1.4755972980689747, + "learning_rate": 1.6752122857841005e-05, + "loss": 0.7499, + "step": 2813 + }, + { + "epoch": 0.29, + "grad_norm": 1.6415653421485688, + "learning_rate": 1.6749692423886727e-05, + "loss": 0.7489, + "step": 2814 + }, + { + "epoch": 0.29, + "grad_norm": 1.5273140977187238, + "learning_rate": 1.6747261257358894e-05, + "loss": 0.8002, + "step": 2815 + }, + { + "epoch": 0.29, + "grad_norm": 1.5374915960526312, + "learning_rate": 1.6744829358521368e-05, + "loss": 0.7432, + "step": 2816 + }, + { + "epoch": 0.29, + "grad_norm": 1.488772374315502, + "learning_rate": 1.674239672763809e-05, + "loss": 0.6842, + "step": 2817 + }, + { + "epoch": 0.29, + "grad_norm": 1.4171710004641913, + "learning_rate": 1.6739963364973084e-05, + "loss": 0.6569, + "step": 2818 + }, + { + "epoch": 0.29, + "grad_norm": 1.6203595465668792, + "learning_rate": 1.673752927079046e-05, + "loss": 0.7785, + "step": 2819 + }, + { + "epoch": 0.29, + "grad_norm": 1.493613756350887, + "learning_rate": 1.6735094445354395e-05, + "loss": 0.8533, + "step": 2820 + }, + { + "epoch": 0.29, + "grad_norm": 1.5122966135044509, + "learning_rate": 1.6732658888929153e-05, + "loss": 0.8206, + "step": 2821 + }, + { + "epoch": 0.29, + "grad_norm": 1.4885803944048208, + "learning_rate": 1.6730222601779075e-05, + "loss": 0.6961, + "step": 2822 + }, + { + "epoch": 0.29, + "grad_norm": 1.6084817399112654, + "learning_rate": 1.6727785584168583e-05, + "loss": 0.7868, + "step": 2823 + }, + { + "epoch": 0.29, + "grad_norm": 1.5461636352471053, + "learning_rate": 1.6725347836362178e-05, + "loss": 0.6473, + "step": 2824 + }, + { + "epoch": 0.29, + "grad_norm": 1.5190104114024647, + "learning_rate": 1.6722909358624436e-05, + "loss": 0.7424, + "step": 2825 + }, + { + "epoch": 0.29, + "grad_norm": 1.5882905731886237, + "learning_rate": 1.6720470151220018e-05, + "loss": 0.7879, + "step": 2826 + }, + { + "epoch": 0.29, + "grad_norm": 1.6363752670735936, + "learning_rate": 1.671803021441366e-05, + "loss": 0.7221, + "step": 2827 + }, + { + "epoch": 0.29, + "grad_norm": 1.7099595353026695, + "learning_rate": 1.6715589548470187e-05, + "loss": 0.8382, + "step": 2828 + }, + { + "epoch": 0.29, + "grad_norm": 1.6410804954346245, + "learning_rate": 1.6713148153654483e-05, + "loss": 0.8053, + "step": 2829 + }, + { + "epoch": 0.29, + "grad_norm": 1.440973928004605, + "learning_rate": 1.6710706030231533e-05, + "loss": 0.733, + "step": 2830 + }, + { + "epoch": 0.29, + "grad_norm": 1.45683336566356, + "learning_rate": 1.670826317846638e-05, + "loss": 0.7442, + "step": 2831 + }, + { + "epoch": 0.29, + "grad_norm": 1.5222653629474279, + "learning_rate": 1.6705819598624168e-05, + "loss": 0.7067, + "step": 2832 + }, + { + "epoch": 0.29, + "grad_norm": 1.5092845480116759, + "learning_rate": 1.6703375290970107e-05, + "loss": 0.6241, + "step": 2833 + }, + { + "epoch": 0.29, + "grad_norm": 1.673406426838146, + "learning_rate": 1.6700930255769486e-05, + "loss": 0.9131, + "step": 2834 + }, + { + "epoch": 0.29, + "grad_norm": 1.4722136796614742, + "learning_rate": 1.6698484493287678e-05, + "loss": 0.774, + "step": 2835 + }, + { + "epoch": 0.29, + "grad_norm": 1.7138172095048119, + "learning_rate": 1.6696038003790128e-05, + "loss": 0.8088, + "step": 2836 + }, + { + "epoch": 0.29, + "grad_norm": 1.4999575241202465, + "learning_rate": 1.6693590787542372e-05, + "loss": 0.8001, + "step": 2837 + }, + { + "epoch": 0.29, + "grad_norm": 1.596426220710093, + "learning_rate": 1.6691142844810013e-05, + "loss": 0.7445, + "step": 2838 + }, + { + "epoch": 0.29, + "grad_norm": 1.5321179361818746, + "learning_rate": 1.668869417585873e-05, + "loss": 0.7331, + "step": 2839 + }, + { + "epoch": 0.29, + "grad_norm": 1.4175973021799633, + "learning_rate": 1.6686244780954294e-05, + "loss": 0.6561, + "step": 2840 + }, + { + "epoch": 0.29, + "grad_norm": 1.6761302463380996, + "learning_rate": 1.668379466036255e-05, + "loss": 0.7373, + "step": 2841 + }, + { + "epoch": 0.29, + "grad_norm": 1.6008508892660749, + "learning_rate": 1.668134381434942e-05, + "loss": 0.7636, + "step": 2842 + }, + { + "epoch": 0.29, + "grad_norm": 1.7527217040594256, + "learning_rate": 1.66788922431809e-05, + "loss": 0.8217, + "step": 2843 + }, + { + "epoch": 0.29, + "grad_norm": 1.49467108389566, + "learning_rate": 1.6676439947123075e-05, + "loss": 0.7853, + "step": 2844 + }, + { + "epoch": 0.29, + "grad_norm": 1.6393797745455427, + "learning_rate": 1.66739869264421e-05, + "loss": 0.8308, + "step": 2845 + }, + { + "epoch": 0.29, + "grad_norm": 1.4602628736605359, + "learning_rate": 1.667153318140422e-05, + "loss": 0.7054, + "step": 2846 + }, + { + "epoch": 0.29, + "grad_norm": 1.579963462481767, + "learning_rate": 1.666907871227574e-05, + "loss": 0.8356, + "step": 2847 + }, + { + "epoch": 0.29, + "grad_norm": 1.6868043052083224, + "learning_rate": 1.6666623519323056e-05, + "loss": 0.7385, + "step": 2848 + }, + { + "epoch": 0.29, + "grad_norm": 1.5060864355496348, + "learning_rate": 1.6664167602812646e-05, + "loss": 0.7126, + "step": 2849 + }, + { + "epoch": 0.29, + "grad_norm": 1.369964772802513, + "learning_rate": 1.6661710963011057e-05, + "loss": 0.7056, + "step": 2850 + }, + { + "epoch": 0.29, + "grad_norm": 1.5409934582076479, + "learning_rate": 1.665925360018492e-05, + "loss": 0.7766, + "step": 2851 + }, + { + "epoch": 0.29, + "grad_norm": 1.5099368755717504, + "learning_rate": 1.6656795514600948e-05, + "loss": 0.8421, + "step": 2852 + }, + { + "epoch": 0.29, + "grad_norm": 1.4299528658124159, + "learning_rate": 1.665433670652592e-05, + "loss": 0.7769, + "step": 2853 + }, + { + "epoch": 0.29, + "grad_norm": 1.519976430392634, + "learning_rate": 1.665187717622671e-05, + "loss": 0.7765, + "step": 2854 + }, + { + "epoch": 0.29, + "grad_norm": 1.4572693694812913, + "learning_rate": 1.6649416923970248e-05, + "loss": 0.7355, + "step": 2855 + }, + { + "epoch": 0.29, + "grad_norm": 1.630645817094188, + "learning_rate": 1.664695595002357e-05, + "loss": 0.7639, + "step": 2856 + }, + { + "epoch": 0.29, + "grad_norm": 1.7757479522459132, + "learning_rate": 1.6644494254653767e-05, + "loss": 0.8451, + "step": 2857 + }, + { + "epoch": 0.29, + "grad_norm": 1.558065617309748, + "learning_rate": 1.664203183812802e-05, + "loss": 0.774, + "step": 2858 + }, + { + "epoch": 0.29, + "grad_norm": 1.5455113506164675, + "learning_rate": 1.6639568700713587e-05, + "loss": 0.7113, + "step": 2859 + }, + { + "epoch": 0.29, + "grad_norm": 1.699674973561388, + "learning_rate": 1.6637104842677807e-05, + "loss": 0.6718, + "step": 2860 + }, + { + "epoch": 0.29, + "grad_norm": 1.5690022382016071, + "learning_rate": 1.6634640264288087e-05, + "loss": 0.8394, + "step": 2861 + }, + { + "epoch": 0.29, + "grad_norm": 1.5342184544437765, + "learning_rate": 1.663217496581192e-05, + "loss": 0.795, + "step": 2862 + }, + { + "epoch": 0.29, + "grad_norm": 1.5959621667132664, + "learning_rate": 1.6629708947516875e-05, + "loss": 0.8409, + "step": 2863 + }, + { + "epoch": 0.29, + "grad_norm": 1.3836758638315834, + "learning_rate": 1.6627242209670606e-05, + "loss": 0.7884, + "step": 2864 + }, + { + "epoch": 0.29, + "grad_norm": 1.7082254637709893, + "learning_rate": 1.6624774752540828e-05, + "loss": 0.848, + "step": 2865 + }, + { + "epoch": 0.29, + "grad_norm": 1.4559456057698992, + "learning_rate": 1.6622306576395355e-05, + "loss": 0.7191, + "step": 2866 + }, + { + "epoch": 0.29, + "grad_norm": 1.491862156638078, + "learning_rate": 1.661983768150206e-05, + "loss": 0.779, + "step": 2867 + }, + { + "epoch": 0.29, + "grad_norm": 1.58902897489258, + "learning_rate": 1.661736806812891e-05, + "loss": 0.7634, + "step": 2868 + }, + { + "epoch": 0.29, + "grad_norm": 1.452695928938561, + "learning_rate": 1.661489773654394e-05, + "loss": 0.6678, + "step": 2869 + }, + { + "epoch": 0.29, + "grad_norm": 1.4349696118715132, + "learning_rate": 1.6612426687015266e-05, + "loss": 0.7108, + "step": 2870 + }, + { + "epoch": 0.29, + "grad_norm": 1.578562608016945, + "learning_rate": 1.6609954919811078e-05, + "loss": 0.7473, + "step": 2871 + }, + { + "epoch": 0.29, + "grad_norm": 1.5552003153913527, + "learning_rate": 1.6607482435199654e-05, + "loss": 0.7015, + "step": 2872 + }, + { + "epoch": 0.29, + "grad_norm": 1.5501887712427833, + "learning_rate": 1.6605009233449344e-05, + "loss": 0.8025, + "step": 2873 + }, + { + "epoch": 0.29, + "grad_norm": 1.5822623419828576, + "learning_rate": 1.6602535314828572e-05, + "loss": 0.7478, + "step": 2874 + }, + { + "epoch": 0.29, + "grad_norm": 1.563036489716803, + "learning_rate": 1.6600060679605844e-05, + "loss": 0.7182, + "step": 2875 + }, + { + "epoch": 0.29, + "grad_norm": 1.5476161809098843, + "learning_rate": 1.6597585328049736e-05, + "loss": 0.6821, + "step": 2876 + }, + { + "epoch": 0.29, + "grad_norm": 1.496579865043439, + "learning_rate": 1.6595109260428924e-05, + "loss": 0.6769, + "step": 2877 + }, + { + "epoch": 0.29, + "grad_norm": 1.595038129995531, + "learning_rate": 1.6592632477012132e-05, + "loss": 0.8206, + "step": 2878 + }, + { + "epoch": 0.29, + "grad_norm": 1.6403593358151038, + "learning_rate": 1.6590154978068185e-05, + "loss": 0.7059, + "step": 2879 + }, + { + "epoch": 0.29, + "grad_norm": 1.4723084470416206, + "learning_rate": 1.6587676763865972e-05, + "loss": 0.7159, + "step": 2880 + }, + { + "epoch": 0.29, + "grad_norm": 1.4455832157117119, + "learning_rate": 1.6585197834674466e-05, + "loss": 0.6425, + "step": 2881 + }, + { + "epoch": 0.29, + "grad_norm": 1.559593603945744, + "learning_rate": 1.6582718190762718e-05, + "loss": 0.7431, + "step": 2882 + }, + { + "epoch": 0.29, + "grad_norm": 1.6447593020281035, + "learning_rate": 1.658023783239985e-05, + "loss": 0.7389, + "step": 2883 + }, + { + "epoch": 0.29, + "grad_norm": 1.7274509720157214, + "learning_rate": 1.657775675985507e-05, + "loss": 0.7268, + "step": 2884 + }, + { + "epoch": 0.29, + "grad_norm": 1.6186783596171195, + "learning_rate": 1.657527497339766e-05, + "loss": 0.8189, + "step": 2885 + }, + { + "epoch": 0.29, + "grad_norm": 1.6366378590676298, + "learning_rate": 1.6572792473296972e-05, + "loss": 0.7788, + "step": 2886 + }, + { + "epoch": 0.29, + "grad_norm": 1.7420947209611584, + "learning_rate": 1.6570309259822455e-05, + "loss": 0.7549, + "step": 2887 + }, + { + "epoch": 0.29, + "grad_norm": 1.723075188665114, + "learning_rate": 1.656782533324361e-05, + "loss": 0.8223, + "step": 2888 + }, + { + "epoch": 0.29, + "grad_norm": 1.6003846657928615, + "learning_rate": 1.6565340693830035e-05, + "loss": 0.685, + "step": 2889 + }, + { + "epoch": 0.29, + "grad_norm": 1.509000318762168, + "learning_rate": 1.6562855341851402e-05, + "loss": 0.6158, + "step": 2890 + }, + { + "epoch": 0.29, + "grad_norm": 1.531501881671066, + "learning_rate": 1.6560369277577454e-05, + "loss": 0.7816, + "step": 2891 + }, + { + "epoch": 0.29, + "grad_norm": 1.574341951527415, + "learning_rate": 1.6557882501278012e-05, + "loss": 0.8377, + "step": 2892 + }, + { + "epoch": 0.29, + "grad_norm": 1.4853515813642724, + "learning_rate": 1.6555395013222978e-05, + "loss": 0.7056, + "step": 2893 + }, + { + "epoch": 0.29, + "grad_norm": 1.4195032069063818, + "learning_rate": 1.6552906813682324e-05, + "loss": 0.6399, + "step": 2894 + }, + { + "epoch": 0.29, + "grad_norm": 1.4676046827308649, + "learning_rate": 1.655041790292612e-05, + "loss": 0.7103, + "step": 2895 + }, + { + "epoch": 0.29, + "grad_norm": 1.5535140253853619, + "learning_rate": 1.6547928281224484e-05, + "loss": 0.6995, + "step": 2896 + }, + { + "epoch": 0.29, + "grad_norm": 1.4573585724428806, + "learning_rate": 1.654543794884764e-05, + "loss": 0.793, + "step": 2897 + }, + { + "epoch": 0.29, + "grad_norm": 1.695730920554382, + "learning_rate": 1.6542946906065854e-05, + "loss": 0.7337, + "step": 2898 + }, + { + "epoch": 0.29, + "grad_norm": 1.5311418599951014, + "learning_rate": 1.654045515314951e-05, + "loss": 0.8071, + "step": 2899 + }, + { + "epoch": 0.29, + "grad_norm": 1.6182734204706768, + "learning_rate": 1.653796269036904e-05, + "loss": 0.8456, + "step": 2900 + }, + { + "epoch": 0.3, + "grad_norm": 1.6533053637642925, + "learning_rate": 1.653546951799496e-05, + "loss": 0.8067, + "step": 2901 + }, + { + "epoch": 0.3, + "grad_norm": 1.6011248753907634, + "learning_rate": 1.6532975636297863e-05, + "loss": 0.7606, + "step": 2902 + }, + { + "epoch": 0.3, + "grad_norm": 1.7192000277990367, + "learning_rate": 1.6530481045548426e-05, + "loss": 0.8073, + "step": 2903 + }, + { + "epoch": 0.3, + "grad_norm": 1.543114296940595, + "learning_rate": 1.6527985746017405e-05, + "loss": 0.7372, + "step": 2904 + }, + { + "epoch": 0.3, + "grad_norm": 1.431147193220211, + "learning_rate": 1.652548973797561e-05, + "loss": 0.6621, + "step": 2905 + }, + { + "epoch": 0.3, + "grad_norm": 1.4846835227313837, + "learning_rate": 1.6522993021693955e-05, + "loss": 0.6698, + "step": 2906 + }, + { + "epoch": 0.3, + "grad_norm": 1.6667734247359371, + "learning_rate": 1.6520495597443415e-05, + "loss": 0.7361, + "step": 2907 + }, + { + "epoch": 0.3, + "grad_norm": 1.531655652290317, + "learning_rate": 1.6517997465495046e-05, + "loss": 0.7556, + "step": 2908 + }, + { + "epoch": 0.3, + "grad_norm": 1.6498541138511267, + "learning_rate": 1.651549862611998e-05, + "loss": 0.8484, + "step": 2909 + }, + { + "epoch": 0.3, + "grad_norm": 1.4380001397040072, + "learning_rate": 1.6512999079589435e-05, + "loss": 0.8083, + "step": 2910 + }, + { + "epoch": 0.3, + "grad_norm": 1.5522328190326016, + "learning_rate": 1.6510498826174685e-05, + "loss": 0.6948, + "step": 2911 + }, + { + "epoch": 0.3, + "grad_norm": 1.4467565956725026, + "learning_rate": 1.6507997866147107e-05, + "loss": 0.644, + "step": 2912 + }, + { + "epoch": 0.3, + "grad_norm": 1.5220169878385472, + "learning_rate": 1.650549619977813e-05, + "loss": 0.747, + "step": 2913 + }, + { + "epoch": 0.3, + "grad_norm": 1.5977989579510004, + "learning_rate": 1.6502993827339274e-05, + "loss": 0.7749, + "step": 2914 + }, + { + "epoch": 0.3, + "grad_norm": 1.4482793528657738, + "learning_rate": 1.6500490749102137e-05, + "loss": 0.7466, + "step": 2915 + }, + { + "epoch": 0.3, + "grad_norm": 1.653687165964818, + "learning_rate": 1.6497986965338383e-05, + "loss": 0.7703, + "step": 2916 + }, + { + "epoch": 0.3, + "grad_norm": 1.681617825976571, + "learning_rate": 1.649548247631976e-05, + "loss": 0.7186, + "step": 2917 + }, + { + "epoch": 0.3, + "grad_norm": 1.6218352088541688, + "learning_rate": 1.64929772823181e-05, + "loss": 0.7309, + "step": 2918 + }, + { + "epoch": 0.3, + "grad_norm": 1.6085825395587305, + "learning_rate": 1.649047138360529e-05, + "loss": 0.7282, + "step": 2919 + }, + { + "epoch": 0.3, + "grad_norm": 1.5622700051177003, + "learning_rate": 1.648796478045331e-05, + "loss": 0.6535, + "step": 2920 + }, + { + "epoch": 0.3, + "grad_norm": 1.664828775585605, + "learning_rate": 1.648545747313421e-05, + "loss": 0.7428, + "step": 2921 + }, + { + "epoch": 0.3, + "grad_norm": 1.5657215537870217, + "learning_rate": 1.648294946192013e-05, + "loss": 0.9065, + "step": 2922 + }, + { + "epoch": 0.3, + "grad_norm": 1.4834078087685076, + "learning_rate": 1.648044074708326e-05, + "loss": 0.6748, + "step": 2923 + }, + { + "epoch": 0.3, + "grad_norm": 1.6883500257594792, + "learning_rate": 1.6477931328895895e-05, + "loss": 0.8581, + "step": 2924 + }, + { + "epoch": 0.3, + "grad_norm": 1.6501700073000836, + "learning_rate": 1.647542120763039e-05, + "loss": 0.783, + "step": 2925 + }, + { + "epoch": 0.3, + "grad_norm": 1.629511650923731, + "learning_rate": 1.647291038355917e-05, + "loss": 0.8634, + "step": 2926 + }, + { + "epoch": 0.3, + "grad_norm": 1.5041014480365311, + "learning_rate": 1.6470398856954758e-05, + "loss": 0.7276, + "step": 2927 + }, + { + "epoch": 0.3, + "grad_norm": 1.6518055797582596, + "learning_rate": 1.6467886628089734e-05, + "loss": 0.7761, + "step": 2928 + }, + { + "epoch": 0.3, + "grad_norm": 1.3929239385366425, + "learning_rate": 1.6465373697236762e-05, + "loss": 0.6874, + "step": 2929 + }, + { + "epoch": 0.3, + "grad_norm": 1.5387621505367852, + "learning_rate": 1.6462860064668582e-05, + "loss": 0.8216, + "step": 2930 + }, + { + "epoch": 0.3, + "grad_norm": 1.7292060414509942, + "learning_rate": 1.6460345730658015e-05, + "loss": 0.7825, + "step": 2931 + }, + { + "epoch": 0.3, + "grad_norm": 1.5574284883149774, + "learning_rate": 1.6457830695477942e-05, + "loss": 0.7583, + "step": 2932 + }, + { + "epoch": 0.3, + "grad_norm": 1.7204118790035463, + "learning_rate": 1.6455314959401337e-05, + "loss": 0.7306, + "step": 2933 + }, + { + "epoch": 0.3, + "grad_norm": 1.5051589890511696, + "learning_rate": 1.6452798522701244e-05, + "loss": 0.6778, + "step": 2934 + }, + { + "epoch": 0.3, + "grad_norm": 1.6934088049746245, + "learning_rate": 1.6450281385650783e-05, + "loss": 0.7215, + "step": 2935 + }, + { + "epoch": 0.3, + "grad_norm": 1.5681140824743331, + "learning_rate": 1.6447763548523145e-05, + "loss": 0.8215, + "step": 2936 + }, + { + "epoch": 0.3, + "grad_norm": 1.410247781162616, + "learning_rate": 1.644524501159161e-05, + "loss": 0.6756, + "step": 2937 + }, + { + "epoch": 0.3, + "grad_norm": 1.6823167629308104, + "learning_rate": 1.644272577512952e-05, + "loss": 0.7749, + "step": 2938 + }, + { + "epoch": 0.3, + "grad_norm": 1.630703164194887, + "learning_rate": 1.64402058394103e-05, + "loss": 0.6726, + "step": 2939 + }, + { + "epoch": 0.3, + "grad_norm": 1.446999359948708, + "learning_rate": 1.643768520470745e-05, + "loss": 0.6298, + "step": 2940 + }, + { + "epoch": 0.3, + "grad_norm": 1.6801001999822267, + "learning_rate": 1.6435163871294543e-05, + "loss": 0.7354, + "step": 2941 + }, + { + "epoch": 0.3, + "grad_norm": 1.5506128809974706, + "learning_rate": 1.6432641839445234e-05, + "loss": 0.7504, + "step": 2942 + }, + { + "epoch": 0.3, + "grad_norm": 1.5430153520759022, + "learning_rate": 1.6430119109433247e-05, + "loss": 0.7373, + "step": 2943 + }, + { + "epoch": 0.3, + "grad_norm": 1.463452113269399, + "learning_rate": 1.6427595681532387e-05, + "loss": 0.7108, + "step": 2944 + }, + { + "epoch": 0.3, + "grad_norm": 1.4432730436194878, + "learning_rate": 1.6425071556016533e-05, + "loss": 0.8079, + "step": 2945 + }, + { + "epoch": 0.3, + "grad_norm": 1.5186131828255842, + "learning_rate": 1.642254673315964e-05, + "loss": 0.7656, + "step": 2946 + }, + { + "epoch": 0.3, + "grad_norm": 1.4850811230291339, + "learning_rate": 1.6420021213235736e-05, + "loss": 0.7535, + "step": 2947 + }, + { + "epoch": 0.3, + "grad_norm": 1.5396786472697646, + "learning_rate": 1.6417494996518925e-05, + "loss": 0.7964, + "step": 2948 + }, + { + "epoch": 0.3, + "grad_norm": 1.572772703812896, + "learning_rate": 1.641496808328339e-05, + "loss": 0.7796, + "step": 2949 + }, + { + "epoch": 0.3, + "grad_norm": 1.621635568935317, + "learning_rate": 1.641244047380339e-05, + "loss": 0.7015, + "step": 2950 + }, + { + "epoch": 0.3, + "grad_norm": 1.6564223885148655, + "learning_rate": 1.640991216835326e-05, + "loss": 0.7155, + "step": 2951 + }, + { + "epoch": 0.3, + "grad_norm": 1.5100958840388738, + "learning_rate": 1.64073831672074e-05, + "loss": 0.785, + "step": 2952 + }, + { + "epoch": 0.3, + "grad_norm": 1.6905292713911557, + "learning_rate": 1.6404853470640295e-05, + "loss": 0.7858, + "step": 2953 + }, + { + "epoch": 0.3, + "grad_norm": 1.4028782823432704, + "learning_rate": 1.640232307892651e-05, + "loss": 0.7429, + "step": 2954 + }, + { + "epoch": 0.3, + "grad_norm": 1.5817976361332047, + "learning_rate": 1.6399791992340674e-05, + "loss": 0.7154, + "step": 2955 + }, + { + "epoch": 0.3, + "grad_norm": 1.5391480351313862, + "learning_rate": 1.63972602111575e-05, + "loss": 0.7061, + "step": 2956 + }, + { + "epoch": 0.3, + "grad_norm": 1.6468298584133225, + "learning_rate": 1.6394727735651772e-05, + "loss": 0.8179, + "step": 2957 + }, + { + "epoch": 0.3, + "grad_norm": 1.6514204198522735, + "learning_rate": 1.6392194566098352e-05, + "loss": 0.7793, + "step": 2958 + }, + { + "epoch": 0.3, + "grad_norm": 1.5621916853219573, + "learning_rate": 1.638966070277217e-05, + "loss": 0.7411, + "step": 2959 + }, + { + "epoch": 0.3, + "grad_norm": 1.5437305047711751, + "learning_rate": 1.6387126145948248e-05, + "loss": 0.8206, + "step": 2960 + }, + { + "epoch": 0.3, + "grad_norm": 1.5740494144417758, + "learning_rate": 1.638459089590166e-05, + "loss": 0.6844, + "step": 2961 + }, + { + "epoch": 0.3, + "grad_norm": 1.4071516788183378, + "learning_rate": 1.638205495290758e-05, + "loss": 0.7201, + "step": 2962 + }, + { + "epoch": 0.3, + "grad_norm": 1.566424857115309, + "learning_rate": 1.6379518317241236e-05, + "loss": 0.616, + "step": 2963 + }, + { + "epoch": 0.3, + "grad_norm": 1.6035080522044143, + "learning_rate": 1.6376980989177942e-05, + "loss": 0.8079, + "step": 2964 + }, + { + "epoch": 0.3, + "grad_norm": 1.6479440300773858, + "learning_rate": 1.6374442968993086e-05, + "loss": 0.7135, + "step": 2965 + }, + { + "epoch": 0.3, + "grad_norm": 1.4576916199655903, + "learning_rate": 1.6371904256962135e-05, + "loss": 0.675, + "step": 2966 + }, + { + "epoch": 0.3, + "grad_norm": 1.5816551028695465, + "learning_rate": 1.636936485336062e-05, + "loss": 0.7392, + "step": 2967 + }, + { + "epoch": 0.3, + "grad_norm": 1.6615128946836017, + "learning_rate": 1.6366824758464156e-05, + "loss": 0.7514, + "step": 2968 + }, + { + "epoch": 0.3, + "grad_norm": 1.6986189045921958, + "learning_rate": 1.636428397254843e-05, + "loss": 0.7831, + "step": 2969 + }, + { + "epoch": 0.3, + "grad_norm": 1.6285478666931792, + "learning_rate": 1.636174249588921e-05, + "loss": 0.8359, + "step": 2970 + }, + { + "epoch": 0.3, + "grad_norm": 1.6220292651856945, + "learning_rate": 1.635920032876232e-05, + "loss": 0.8139, + "step": 2971 + }, + { + "epoch": 0.3, + "grad_norm": 1.5218810852144655, + "learning_rate": 1.6356657471443686e-05, + "loss": 0.7399, + "step": 2972 + }, + { + "epoch": 0.3, + "grad_norm": 1.60756563674869, + "learning_rate": 1.635411392420929e-05, + "loss": 0.719, + "step": 2973 + }, + { + "epoch": 0.3, + "grad_norm": 1.5665830216671, + "learning_rate": 1.6351569687335195e-05, + "loss": 0.7577, + "step": 2974 + }, + { + "epoch": 0.3, + "grad_norm": 1.5933734297319644, + "learning_rate": 1.6349024761097532e-05, + "loss": 0.7912, + "step": 2975 + }, + { + "epoch": 0.3, + "grad_norm": 1.5034799372445287, + "learning_rate": 1.634647914577253e-05, + "loss": 0.7415, + "step": 2976 + }, + { + "epoch": 0.3, + "grad_norm": 1.4852458185738506, + "learning_rate": 1.6343932841636455e-05, + "loss": 0.6613, + "step": 2977 + }, + { + "epoch": 0.3, + "grad_norm": 1.473874116603972, + "learning_rate": 1.634138584896568e-05, + "loss": 0.6285, + "step": 2978 + }, + { + "epoch": 0.3, + "grad_norm": 1.438166471562597, + "learning_rate": 1.6338838168036643e-05, + "loss": 0.6949, + "step": 2979 + }, + { + "epoch": 0.3, + "grad_norm": 1.3987278868685178, + "learning_rate": 1.633628979912585e-05, + "loss": 0.7365, + "step": 2980 + }, + { + "epoch": 0.3, + "grad_norm": 1.6983099446864078, + "learning_rate": 1.6333740742509886e-05, + "loss": 0.6947, + "step": 2981 + }, + { + "epoch": 0.3, + "grad_norm": 1.6141816785702128, + "learning_rate": 1.6331190998465416e-05, + "loss": 0.7373, + "step": 2982 + }, + { + "epoch": 0.3, + "grad_norm": 1.5452457901687615, + "learning_rate": 1.632864056726917e-05, + "loss": 0.6975, + "step": 2983 + }, + { + "epoch": 0.3, + "grad_norm": 1.6608140326198042, + "learning_rate": 1.6326089449197956e-05, + "loss": 0.7476, + "step": 2984 + }, + { + "epoch": 0.3, + "grad_norm": 1.4572606282215208, + "learning_rate": 1.6323537644528666e-05, + "loss": 0.6997, + "step": 2985 + }, + { + "epoch": 0.3, + "grad_norm": 1.5716370053847366, + "learning_rate": 1.6320985153538255e-05, + "loss": 0.7826, + "step": 2986 + }, + { + "epoch": 0.3, + "grad_norm": 1.6592991363088199, + "learning_rate": 1.6318431976503754e-05, + "loss": 0.7428, + "step": 2987 + }, + { + "epoch": 0.3, + "grad_norm": 1.8237988750533487, + "learning_rate": 1.6315878113702264e-05, + "loss": 0.7558, + "step": 2988 + }, + { + "epoch": 0.3, + "grad_norm": 1.6716469257192015, + "learning_rate": 1.631332356541098e-05, + "loss": 0.7054, + "step": 2989 + }, + { + "epoch": 0.3, + "grad_norm": 1.585305425503702, + "learning_rate": 1.6310768331907152e-05, + "loss": 0.8262, + "step": 2990 + }, + { + "epoch": 0.3, + "grad_norm": 1.5738139196841392, + "learning_rate": 1.6308212413468112e-05, + "loss": 0.7449, + "step": 2991 + }, + { + "epoch": 0.3, + "grad_norm": 1.5846646205419024, + "learning_rate": 1.6305655810371263e-05, + "loss": 0.7675, + "step": 2992 + }, + { + "epoch": 0.3, + "grad_norm": 1.5475748322806462, + "learning_rate": 1.630309852289408e-05, + "loss": 0.7844, + "step": 2993 + }, + { + "epoch": 0.3, + "grad_norm": 1.4521587234662767, + "learning_rate": 1.630054055131413e-05, + "loss": 0.9161, + "step": 2994 + }, + { + "epoch": 0.3, + "grad_norm": 1.5509347372810725, + "learning_rate": 1.629798189590903e-05, + "loss": 0.6987, + "step": 2995 + }, + { + "epoch": 0.3, + "grad_norm": 1.508791759708456, + "learning_rate": 1.6295422556956482e-05, + "loss": 0.7036, + "step": 2996 + }, + { + "epoch": 0.3, + "grad_norm": 1.6009516480774482, + "learning_rate": 1.6292862534734265e-05, + "loss": 0.7279, + "step": 2997 + }, + { + "epoch": 0.3, + "grad_norm": 1.4819643251473897, + "learning_rate": 1.629030182952023e-05, + "loss": 0.7282, + "step": 2998 + }, + { + "epoch": 0.31, + "grad_norm": 1.457259734300266, + "learning_rate": 1.6287740441592302e-05, + "loss": 0.7124, + "step": 2999 + }, + { + "epoch": 0.31, + "grad_norm": 1.503453385065571, + "learning_rate": 1.6285178371228477e-05, + "loss": 0.7833, + "step": 3000 + }, + { + "epoch": 0.31, + "grad_norm": 1.4651584475008543, + "learning_rate": 1.628261561870683e-05, + "loss": 0.7883, + "step": 3001 + }, + { + "epoch": 0.31, + "grad_norm": 1.5518497406157812, + "learning_rate": 1.6280052184305505e-05, + "loss": 0.7794, + "step": 3002 + }, + { + "epoch": 0.31, + "grad_norm": 1.487130618612337, + "learning_rate": 1.627748806830273e-05, + "loss": 0.7968, + "step": 3003 + }, + { + "epoch": 0.31, + "grad_norm": 1.8566924238421831, + "learning_rate": 1.627492327097679e-05, + "loss": 0.7767, + "step": 3004 + }, + { + "epoch": 0.31, + "grad_norm": 1.5397235409358518, + "learning_rate": 1.627235779260606e-05, + "loss": 0.8144, + "step": 3005 + }, + { + "epoch": 0.31, + "grad_norm": 1.688856929862697, + "learning_rate": 1.6269791633468975e-05, + "loss": 0.8169, + "step": 3006 + }, + { + "epoch": 0.31, + "grad_norm": 1.4271495517722819, + "learning_rate": 1.626722479384406e-05, + "loss": 0.8141, + "step": 3007 + }, + { + "epoch": 0.31, + "grad_norm": 1.6141490822849023, + "learning_rate": 1.62646572740099e-05, + "loss": 0.828, + "step": 3008 + }, + { + "epoch": 0.31, + "grad_norm": 1.3862109919366226, + "learning_rate": 1.6262089074245163e-05, + "loss": 0.6934, + "step": 3009 + }, + { + "epoch": 0.31, + "grad_norm": 1.7264246506749796, + "learning_rate": 1.6259520194828586e-05, + "loss": 0.8084, + "step": 3010 + }, + { + "epoch": 0.31, + "grad_norm": 1.6831492410996842, + "learning_rate": 1.625695063603898e-05, + "loss": 0.7705, + "step": 3011 + }, + { + "epoch": 0.31, + "grad_norm": 1.7346547899690183, + "learning_rate": 1.6254380398155226e-05, + "loss": 0.794, + "step": 3012 + }, + { + "epoch": 0.31, + "grad_norm": 1.5155030467406767, + "learning_rate": 1.6251809481456292e-05, + "loss": 0.7796, + "step": 3013 + }, + { + "epoch": 0.31, + "grad_norm": 1.5258784519917301, + "learning_rate": 1.6249237886221206e-05, + "loss": 0.6925, + "step": 3014 + }, + { + "epoch": 0.31, + "grad_norm": 1.7016953394241054, + "learning_rate": 1.6246665612729074e-05, + "loss": 0.8212, + "step": 3015 + }, + { + "epoch": 0.31, + "grad_norm": 1.6296541627190022, + "learning_rate": 1.6244092661259073e-05, + "loss": 0.7483, + "step": 3016 + }, + { + "epoch": 0.31, + "grad_norm": 1.512531790713189, + "learning_rate": 1.6241519032090466e-05, + "loss": 0.7023, + "step": 3017 + }, + { + "epoch": 0.31, + "grad_norm": 1.6388795320351859, + "learning_rate": 1.623894472550257e-05, + "loss": 0.8662, + "step": 3018 + }, + { + "epoch": 0.31, + "grad_norm": 1.5834486433720267, + "learning_rate": 1.623636974177479e-05, + "loss": 0.9036, + "step": 3019 + }, + { + "epoch": 0.31, + "grad_norm": 1.608536353251349, + "learning_rate": 1.62337940811866e-05, + "loss": 0.8445, + "step": 3020 + }, + { + "epoch": 0.31, + "grad_norm": 1.6247451048664159, + "learning_rate": 1.6231217744017557e-05, + "loss": 0.7967, + "step": 3021 + }, + { + "epoch": 0.31, + "grad_norm": 1.4633740641195936, + "learning_rate": 1.6228640730547264e-05, + "loss": 0.8128, + "step": 3022 + }, + { + "epoch": 0.31, + "grad_norm": 1.7411844443965356, + "learning_rate": 1.622606304105543e-05, + "loss": 0.7927, + "step": 3023 + }, + { + "epoch": 0.31, + "grad_norm": 1.4140442778396463, + "learning_rate": 1.6223484675821813e-05, + "loss": 0.6619, + "step": 3024 + }, + { + "epoch": 0.31, + "grad_norm": 1.6603407739311253, + "learning_rate": 1.622090563512626e-05, + "loss": 0.8268, + "step": 3025 + }, + { + "epoch": 0.31, + "grad_norm": 1.457021585971081, + "learning_rate": 1.6218325919248688e-05, + "loss": 0.7776, + "step": 3026 + }, + { + "epoch": 0.31, + "grad_norm": 1.5941908538126683, + "learning_rate": 1.6215745528469078e-05, + "loss": 0.7932, + "step": 3027 + }, + { + "epoch": 0.31, + "grad_norm": 1.6131401763391802, + "learning_rate": 1.6213164463067495e-05, + "loss": 0.7959, + "step": 3028 + }, + { + "epoch": 0.31, + "grad_norm": 1.5040078612577223, + "learning_rate": 1.621058272332407e-05, + "loss": 0.7749, + "step": 3029 + }, + { + "epoch": 0.31, + "grad_norm": 1.4877261332506513, + "learning_rate": 1.6208000309519017e-05, + "loss": 0.7778, + "step": 3030 + }, + { + "epoch": 0.31, + "grad_norm": 1.7057119690601854, + "learning_rate": 1.6205417221932612e-05, + "loss": 0.8156, + "step": 3031 + }, + { + "epoch": 0.31, + "grad_norm": 1.5713357933890586, + "learning_rate": 1.620283346084521e-05, + "loss": 0.7903, + "step": 3032 + }, + { + "epoch": 0.31, + "grad_norm": 1.5128948987964903, + "learning_rate": 1.6200249026537232e-05, + "loss": 0.6547, + "step": 3033 + }, + { + "epoch": 0.31, + "grad_norm": 1.5813368080784047, + "learning_rate": 1.619766391928919e-05, + "loss": 0.8104, + "step": 3034 + }, + { + "epoch": 0.31, + "grad_norm": 1.5360058392310687, + "learning_rate": 1.6195078139381647e-05, + "loss": 0.7894, + "step": 3035 + }, + { + "epoch": 0.31, + "grad_norm": 1.472545203396127, + "learning_rate": 1.619249168709525e-05, + "loss": 0.6958, + "step": 3036 + }, + { + "epoch": 0.31, + "grad_norm": 1.4597438853991118, + "learning_rate": 1.6189904562710722e-05, + "loss": 0.6809, + "step": 3037 + }, + { + "epoch": 0.31, + "grad_norm": 1.6564332088860414, + "learning_rate": 1.6187316766508856e-05, + "loss": 0.659, + "step": 3038 + }, + { + "epoch": 0.31, + "grad_norm": 1.526292436196685, + "learning_rate": 1.618472829877051e-05, + "loss": 0.768, + "step": 3039 + }, + { + "epoch": 0.31, + "grad_norm": 1.6314143560380923, + "learning_rate": 1.6182139159776627e-05, + "loss": 0.8162, + "step": 3040 + }, + { + "epoch": 0.31, + "grad_norm": 1.702836373014158, + "learning_rate": 1.6179549349808216e-05, + "loss": 0.7858, + "step": 3041 + }, + { + "epoch": 0.31, + "grad_norm": 1.328996213321001, + "learning_rate": 1.6176958869146358e-05, + "loss": 0.7156, + "step": 3042 + }, + { + "epoch": 0.31, + "grad_norm": 1.567602922565799, + "learning_rate": 1.6174367718072213e-05, + "loss": 0.729, + "step": 3043 + }, + { + "epoch": 0.31, + "grad_norm": 1.5843451063905833, + "learning_rate": 1.617177589686701e-05, + "loss": 0.8005, + "step": 3044 + }, + { + "epoch": 0.31, + "grad_norm": 1.6711460527473359, + "learning_rate": 1.6169183405812053e-05, + "loss": 0.652, + "step": 3045 + }, + { + "epoch": 0.31, + "grad_norm": 1.4961663881076657, + "learning_rate": 1.6166590245188708e-05, + "loss": 0.6315, + "step": 3046 + }, + { + "epoch": 0.31, + "grad_norm": 1.565609963546912, + "learning_rate": 1.6163996415278423e-05, + "loss": 0.8349, + "step": 3047 + }, + { + "epoch": 0.31, + "grad_norm": 1.3998288323033399, + "learning_rate": 1.6161401916362723e-05, + "loss": 0.6608, + "step": 3048 + }, + { + "epoch": 0.31, + "grad_norm": 1.6135613014089765, + "learning_rate": 1.6158806748723205e-05, + "loss": 0.7993, + "step": 3049 + }, + { + "epoch": 0.31, + "grad_norm": 1.5912260549170647, + "learning_rate": 1.6156210912641524e-05, + "loss": 0.825, + "step": 3050 + }, + { + "epoch": 0.31, + "grad_norm": 1.4735378808073134, + "learning_rate": 1.6153614408399418e-05, + "loss": 0.6708, + "step": 3051 + }, + { + "epoch": 0.31, + "grad_norm": 1.6086824751987387, + "learning_rate": 1.61510172362787e-05, + "loss": 0.7271, + "step": 3052 + }, + { + "epoch": 0.31, + "grad_norm": 1.3443880247958508, + "learning_rate": 1.6148419396561254e-05, + "loss": 0.7419, + "step": 3053 + }, + { + "epoch": 0.31, + "grad_norm": 1.5674892031057017, + "learning_rate": 1.6145820889529033e-05, + "loss": 0.7514, + "step": 3054 + }, + { + "epoch": 0.31, + "grad_norm": 1.5110324653313316, + "learning_rate": 1.6143221715464067e-05, + "loss": 0.7184, + "step": 3055 + }, + { + "epoch": 0.31, + "grad_norm": 1.7002384489332842, + "learning_rate": 1.6140621874648447e-05, + "loss": 0.7133, + "step": 3056 + }, + { + "epoch": 0.31, + "grad_norm": 1.625221892701305, + "learning_rate": 1.6138021367364353e-05, + "loss": 0.7698, + "step": 3057 + }, + { + "epoch": 0.31, + "grad_norm": 1.614223008622113, + "learning_rate": 1.613542019389403e-05, + "loss": 0.8406, + "step": 3058 + }, + { + "epoch": 0.31, + "grad_norm": 1.6200348606542283, + "learning_rate": 1.613281835451979e-05, + "loss": 0.8781, + "step": 3059 + }, + { + "epoch": 0.31, + "grad_norm": 1.5587315808852187, + "learning_rate": 1.6130215849524025e-05, + "loss": 0.7167, + "step": 3060 + }, + { + "epoch": 0.31, + "grad_norm": 1.7338243906017512, + "learning_rate": 1.6127612679189195e-05, + "loss": 0.7895, + "step": 3061 + }, + { + "epoch": 0.31, + "grad_norm": 1.7108718796736928, + "learning_rate": 1.6125008843797835e-05, + "loss": 0.7477, + "step": 3062 + }, + { + "epoch": 0.31, + "grad_norm": 1.592167593368202, + "learning_rate": 1.6122404343632547e-05, + "loss": 0.7709, + "step": 3063 + }, + { + "epoch": 0.31, + "grad_norm": 1.4668296830135767, + "learning_rate": 1.6119799178976014e-05, + "loss": 0.6438, + "step": 3064 + }, + { + "epoch": 0.31, + "grad_norm": 1.6321493961782922, + "learning_rate": 1.6117193350110982e-05, + "loss": 0.7311, + "step": 3065 + }, + { + "epoch": 0.31, + "grad_norm": 1.6487251263538025, + "learning_rate": 1.6114586857320272e-05, + "loss": 0.675, + "step": 3066 + }, + { + "epoch": 0.31, + "grad_norm": 1.7908476169567467, + "learning_rate": 1.611197970088678e-05, + "loss": 0.8056, + "step": 3067 + }, + { + "epoch": 0.31, + "grad_norm": 1.3986976644629658, + "learning_rate": 1.6109371881093476e-05, + "loss": 0.7328, + "step": 3068 + }, + { + "epoch": 0.31, + "grad_norm": 1.6928601199095388, + "learning_rate": 1.6106763398223394e-05, + "loss": 0.7561, + "step": 3069 + }, + { + "epoch": 0.31, + "grad_norm": 1.4871254686923177, + "learning_rate": 1.610415425255964e-05, + "loss": 0.6538, + "step": 3070 + }, + { + "epoch": 0.31, + "grad_norm": 1.506830990770424, + "learning_rate": 1.6101544444385406e-05, + "loss": 0.7115, + "step": 3071 + }, + { + "epoch": 0.31, + "grad_norm": 1.4004733856804321, + "learning_rate": 1.6098933973983934e-05, + "loss": 0.7236, + "step": 3072 + }, + { + "epoch": 0.31, + "grad_norm": 1.5034428955072574, + "learning_rate": 1.609632284163856e-05, + "loss": 0.7715, + "step": 3073 + }, + { + "epoch": 0.31, + "grad_norm": 1.7482891338939817, + "learning_rate": 1.6093711047632676e-05, + "loss": 0.7237, + "step": 3074 + }, + { + "epoch": 0.31, + "grad_norm": 1.4696133304283667, + "learning_rate": 1.6091098592249754e-05, + "loss": 0.8095, + "step": 3075 + }, + { + "epoch": 0.31, + "grad_norm": 1.6176646068560052, + "learning_rate": 1.608848547577333e-05, + "loss": 0.7347, + "step": 3076 + }, + { + "epoch": 0.31, + "grad_norm": 1.568225549711669, + "learning_rate": 1.6085871698487023e-05, + "loss": 0.7641, + "step": 3077 + }, + { + "epoch": 0.31, + "grad_norm": 1.508505589717331, + "learning_rate": 1.608325726067452e-05, + "loss": 0.7548, + "step": 3078 + }, + { + "epoch": 0.31, + "grad_norm": 1.54588442431525, + "learning_rate": 1.6080642162619567e-05, + "loss": 0.7671, + "step": 3079 + }, + { + "epoch": 0.31, + "grad_norm": 1.6584741334511337, + "learning_rate": 1.6078026404605998e-05, + "loss": 0.8191, + "step": 3080 + }, + { + "epoch": 0.31, + "grad_norm": 1.4583145165427174, + "learning_rate": 1.6075409986917714e-05, + "loss": 0.7521, + "step": 3081 + }, + { + "epoch": 0.31, + "grad_norm": 1.694055870736409, + "learning_rate": 1.6072792909838686e-05, + "loss": 0.7129, + "step": 3082 + }, + { + "epoch": 0.31, + "grad_norm": 1.6988886090864983, + "learning_rate": 1.6070175173652954e-05, + "loss": 0.8416, + "step": 3083 + }, + { + "epoch": 0.31, + "grad_norm": 1.4580745191676912, + "learning_rate": 1.6067556778644633e-05, + "loss": 0.6665, + "step": 3084 + }, + { + "epoch": 0.31, + "grad_norm": 1.7544466094328668, + "learning_rate": 1.606493772509791e-05, + "loss": 0.7951, + "step": 3085 + }, + { + "epoch": 0.31, + "grad_norm": 1.53367296871484, + "learning_rate": 1.6062318013297045e-05, + "loss": 0.8168, + "step": 3086 + }, + { + "epoch": 0.31, + "grad_norm": 1.5875483622870659, + "learning_rate": 1.6059697643526363e-05, + "loss": 0.7529, + "step": 3087 + }, + { + "epoch": 0.31, + "grad_norm": 1.4879290031335537, + "learning_rate": 1.605707661607026e-05, + "loss": 0.7073, + "step": 3088 + }, + { + "epoch": 0.31, + "grad_norm": 1.5101875171658965, + "learning_rate": 1.6054454931213217e-05, + "loss": 0.7343, + "step": 3089 + }, + { + "epoch": 0.31, + "grad_norm": 1.4990092041474783, + "learning_rate": 1.605183258923977e-05, + "loss": 0.7026, + "step": 3090 + }, + { + "epoch": 0.31, + "grad_norm": 1.7472358291731696, + "learning_rate": 1.6049209590434538e-05, + "loss": 0.8645, + "step": 3091 + }, + { + "epoch": 0.31, + "grad_norm": 1.5695152177694733, + "learning_rate": 1.60465859350822e-05, + "loss": 0.7457, + "step": 3092 + }, + { + "epoch": 0.31, + "grad_norm": 1.4429394717863138, + "learning_rate": 1.6043961623467523e-05, + "loss": 0.7753, + "step": 3093 + }, + { + "epoch": 0.31, + "grad_norm": 1.4815833238007763, + "learning_rate": 1.6041336655875324e-05, + "loss": 0.7611, + "step": 3094 + }, + { + "epoch": 0.31, + "grad_norm": 1.653173088460423, + "learning_rate": 1.6038711032590507e-05, + "loss": 0.7596, + "step": 3095 + }, + { + "epoch": 0.31, + "grad_norm": 1.557819086276001, + "learning_rate": 1.6036084753898046e-05, + "loss": 0.7968, + "step": 3096 + }, + { + "epoch": 0.32, + "grad_norm": 1.4276364100852235, + "learning_rate": 1.6033457820082975e-05, + "loss": 0.6869, + "step": 3097 + }, + { + "epoch": 0.32, + "grad_norm": 1.5354139858886717, + "learning_rate": 1.6030830231430412e-05, + "loss": 0.8235, + "step": 3098 + }, + { + "epoch": 0.32, + "grad_norm": 1.6050709658162097, + "learning_rate": 1.6028201988225536e-05, + "loss": 0.8106, + "step": 3099 + }, + { + "epoch": 0.32, + "grad_norm": 1.6729719675167836, + "learning_rate": 1.6025573090753608e-05, + "loss": 0.7784, + "step": 3100 + }, + { + "epoch": 0.32, + "grad_norm": 1.599429966910005, + "learning_rate": 1.6022943539299948e-05, + "loss": 0.7485, + "step": 3101 + }, + { + "epoch": 0.32, + "grad_norm": 1.4492624752991685, + "learning_rate": 1.602031333414996e-05, + "loss": 0.6903, + "step": 3102 + }, + { + "epoch": 0.32, + "grad_norm": 1.5228689547530554, + "learning_rate": 1.6017682475589103e-05, + "loss": 0.7714, + "step": 3103 + }, + { + "epoch": 0.32, + "grad_norm": 1.4370981140658128, + "learning_rate": 1.601505096390292e-05, + "loss": 0.6991, + "step": 3104 + }, + { + "epoch": 0.32, + "grad_norm": 1.8961969950247142, + "learning_rate": 1.601241879937702e-05, + "loss": 0.6681, + "step": 3105 + }, + { + "epoch": 0.32, + "grad_norm": 1.554051609425092, + "learning_rate": 1.600978598229708e-05, + "loss": 0.7787, + "step": 3106 + }, + { + "epoch": 0.32, + "grad_norm": 1.427848239361892, + "learning_rate": 1.6007152512948855e-05, + "loss": 0.7268, + "step": 3107 + }, + { + "epoch": 0.32, + "grad_norm": 1.5025791642754298, + "learning_rate": 1.600451839161817e-05, + "loss": 0.7459, + "step": 3108 + }, + { + "epoch": 0.32, + "grad_norm": 1.6643850750285802, + "learning_rate": 1.6001883618590913e-05, + "loss": 0.8121, + "step": 3109 + }, + { + "epoch": 0.32, + "grad_norm": 1.418192732050395, + "learning_rate": 1.599924819415305e-05, + "loss": 0.7477, + "step": 3110 + }, + { + "epoch": 0.32, + "grad_norm": 1.4618962142506864, + "learning_rate": 1.5996612118590604e-05, + "loss": 0.7099, + "step": 3111 + }, + { + "epoch": 0.32, + "grad_norm": 1.526946571378853, + "learning_rate": 1.5993975392189697e-05, + "loss": 0.7344, + "step": 3112 + }, + { + "epoch": 0.32, + "grad_norm": 1.4919376879669402, + "learning_rate": 1.5991338015236494e-05, + "loss": 0.8031, + "step": 3113 + }, + { + "epoch": 0.32, + "grad_norm": 1.6745537838724252, + "learning_rate": 1.5988699988017243e-05, + "loss": 0.808, + "step": 3114 + }, + { + "epoch": 0.32, + "grad_norm": 1.5838066227108833, + "learning_rate": 1.598606131081826e-05, + "loss": 0.7917, + "step": 3115 + }, + { + "epoch": 0.32, + "grad_norm": 1.5775363708969412, + "learning_rate": 1.5983421983925937e-05, + "loss": 0.6163, + "step": 3116 + }, + { + "epoch": 0.32, + "grad_norm": 1.5748608622317408, + "learning_rate": 1.598078200762673e-05, + "loss": 0.7859, + "step": 3117 + }, + { + "epoch": 0.32, + "grad_norm": 1.5423950467094711, + "learning_rate": 1.597814138220716e-05, + "loss": 0.7963, + "step": 3118 + }, + { + "epoch": 0.32, + "grad_norm": 1.658696812208415, + "learning_rate": 1.597550010795383e-05, + "loss": 0.7569, + "step": 3119 + }, + { + "epoch": 0.32, + "grad_norm": 1.3567834085980208, + "learning_rate": 1.5972858185153412e-05, + "loss": 0.6929, + "step": 3120 + }, + { + "epoch": 0.32, + "grad_norm": 1.6085663273116662, + "learning_rate": 1.5970215614092642e-05, + "loss": 0.6935, + "step": 3121 + }, + { + "epoch": 0.32, + "grad_norm": 1.3961019816420073, + "learning_rate": 1.5967572395058334e-05, + "loss": 0.7607, + "step": 3122 + }, + { + "epoch": 0.32, + "grad_norm": 1.6165689879778342, + "learning_rate": 1.5964928528337363e-05, + "loss": 0.8189, + "step": 3123 + }, + { + "epoch": 0.32, + "grad_norm": 1.570775241467543, + "learning_rate": 1.596228401421668e-05, + "loss": 0.7161, + "step": 3124 + }, + { + "epoch": 0.32, + "grad_norm": 1.6292724209145448, + "learning_rate": 1.5959638852983306e-05, + "loss": 0.7867, + "step": 3125 + }, + { + "epoch": 0.32, + "grad_norm": 1.5400550233485224, + "learning_rate": 1.5956993044924334e-05, + "loss": 0.7101, + "step": 3126 + }, + { + "epoch": 0.32, + "grad_norm": 1.7324267822730282, + "learning_rate": 1.5954346590326927e-05, + "loss": 0.8033, + "step": 3127 + }, + { + "epoch": 0.32, + "grad_norm": 1.5495496863750844, + "learning_rate": 1.595169948947831e-05, + "loss": 0.7431, + "step": 3128 + }, + { + "epoch": 0.32, + "grad_norm": 1.6223883659624856, + "learning_rate": 1.5949051742665788e-05, + "loss": 0.795, + "step": 3129 + }, + { + "epoch": 0.32, + "grad_norm": 1.4697103408900418, + "learning_rate": 1.594640335017673e-05, + "loss": 0.703, + "step": 3130 + }, + { + "epoch": 0.32, + "grad_norm": 1.7101405485345647, + "learning_rate": 1.5943754312298583e-05, + "loss": 0.7555, + "step": 3131 + }, + { + "epoch": 0.32, + "grad_norm": 1.5690228773213615, + "learning_rate": 1.5941104629318856e-05, + "loss": 0.7266, + "step": 3132 + }, + { + "epoch": 0.32, + "grad_norm": 1.5873101127509042, + "learning_rate": 1.5938454301525126e-05, + "loss": 0.7022, + "step": 3133 + }, + { + "epoch": 0.32, + "grad_norm": 1.439819522741593, + "learning_rate": 1.593580332920505e-05, + "loss": 0.6614, + "step": 3134 + }, + { + "epoch": 0.32, + "grad_norm": 1.6108536230936394, + "learning_rate": 1.593315171264635e-05, + "loss": 0.8291, + "step": 3135 + }, + { + "epoch": 0.32, + "grad_norm": 1.4550865485826752, + "learning_rate": 1.5930499452136816e-05, + "loss": 0.6884, + "step": 3136 + }, + { + "epoch": 0.32, + "grad_norm": 1.6520085948866068, + "learning_rate": 1.592784654796431e-05, + "loss": 0.794, + "step": 3137 + }, + { + "epoch": 0.32, + "grad_norm": 1.5170627473468863, + "learning_rate": 1.5925193000416756e-05, + "loss": 0.6584, + "step": 3138 + }, + { + "epoch": 0.32, + "grad_norm": 1.7376109545630536, + "learning_rate": 1.5922538809782166e-05, + "loss": 0.7054, + "step": 3139 + }, + { + "epoch": 0.32, + "grad_norm": 1.5238517632758721, + "learning_rate": 1.591988397634861e-05, + "loss": 0.7448, + "step": 3140 + }, + { + "epoch": 0.32, + "grad_norm": 1.7180058889898717, + "learning_rate": 1.591722850040422e-05, + "loss": 0.6657, + "step": 3141 + }, + { + "epoch": 0.32, + "grad_norm": 1.562494608534868, + "learning_rate": 1.591457238223721e-05, + "loss": 0.6182, + "step": 3142 + }, + { + "epoch": 0.32, + "grad_norm": 1.5083445543382505, + "learning_rate": 1.5911915622135864e-05, + "loss": 0.7609, + "step": 3143 + }, + { + "epoch": 0.32, + "grad_norm": 1.5367934569040897, + "learning_rate": 1.590925822038853e-05, + "loss": 0.7238, + "step": 3144 + }, + { + "epoch": 0.32, + "grad_norm": 1.5691078613734306, + "learning_rate": 1.5906600177283628e-05, + "loss": 0.8175, + "step": 3145 + }, + { + "epoch": 0.32, + "grad_norm": 1.690059752667436, + "learning_rate": 1.590394149310964e-05, + "loss": 0.7669, + "step": 3146 + }, + { + "epoch": 0.32, + "grad_norm": 1.701910126920164, + "learning_rate": 1.5901282168155136e-05, + "loss": 0.6637, + "step": 3147 + }, + { + "epoch": 0.32, + "grad_norm": 1.4938188942808153, + "learning_rate": 1.5898622202708734e-05, + "loss": 0.7786, + "step": 3148 + }, + { + "epoch": 0.32, + "grad_norm": 1.4882713244830919, + "learning_rate": 1.5895961597059137e-05, + "loss": 0.6403, + "step": 3149 + }, + { + "epoch": 0.32, + "grad_norm": 1.652969857758579, + "learning_rate": 1.5893300351495115e-05, + "loss": 0.8245, + "step": 3150 + }, + { + "epoch": 0.32, + "grad_norm": 1.4934941684919685, + "learning_rate": 1.5890638466305495e-05, + "loss": 0.7107, + "step": 3151 + }, + { + "epoch": 0.32, + "grad_norm": 1.5580866347387254, + "learning_rate": 1.5887975941779196e-05, + "loss": 0.7022, + "step": 3152 + }, + { + "epoch": 0.32, + "grad_norm": 1.3971726739844001, + "learning_rate": 1.588531277820518e-05, + "loss": 0.6722, + "step": 3153 + }, + { + "epoch": 0.32, + "grad_norm": 1.5983800866485998, + "learning_rate": 1.58826489758725e-05, + "loss": 0.7353, + "step": 3154 + }, + { + "epoch": 0.32, + "grad_norm": 1.4430310300597617, + "learning_rate": 1.587998453507027e-05, + "loss": 0.6149, + "step": 3155 + }, + { + "epoch": 0.32, + "grad_norm": 1.5549077993086349, + "learning_rate": 1.587731945608767e-05, + "loss": 0.7428, + "step": 3156 + }, + { + "epoch": 0.32, + "grad_norm": 1.6302877656070816, + "learning_rate": 1.5874653739213948e-05, + "loss": 0.7145, + "step": 3157 + }, + { + "epoch": 0.32, + "grad_norm": 1.4814762359067342, + "learning_rate": 1.587198738473844e-05, + "loss": 0.7209, + "step": 3158 + }, + { + "epoch": 0.32, + "grad_norm": 1.5449844910463155, + "learning_rate": 1.5869320392950526e-05, + "loss": 0.7601, + "step": 3159 + }, + { + "epoch": 0.32, + "grad_norm": 1.5479632840600481, + "learning_rate": 1.5866652764139667e-05, + "loss": 0.6991, + "step": 3160 + }, + { + "epoch": 0.32, + "grad_norm": 1.5665482425525807, + "learning_rate": 1.58639844985954e-05, + "loss": 0.7769, + "step": 3161 + }, + { + "epoch": 0.32, + "grad_norm": 1.4987697650410985, + "learning_rate": 1.5861315596607315e-05, + "loss": 0.788, + "step": 3162 + }, + { + "epoch": 0.32, + "grad_norm": 1.4443483167803732, + "learning_rate": 1.585864605846508e-05, + "loss": 0.7179, + "step": 3163 + }, + { + "epoch": 0.32, + "grad_norm": 1.5872236675585665, + "learning_rate": 1.585597588445844e-05, + "loss": 0.6736, + "step": 3164 + }, + { + "epoch": 0.32, + "grad_norm": 1.5611647818715682, + "learning_rate": 1.585330507487719e-05, + "loss": 0.8164, + "step": 3165 + }, + { + "epoch": 0.32, + "grad_norm": 1.673138476399332, + "learning_rate": 1.585063363001121e-05, + "loss": 0.7495, + "step": 3166 + }, + { + "epoch": 0.32, + "grad_norm": 1.7099893089128935, + "learning_rate": 1.584796155015045e-05, + "loss": 0.8088, + "step": 3167 + }, + { + "epoch": 0.32, + "grad_norm": 1.4623288158796532, + "learning_rate": 1.584528883558491e-05, + "loss": 0.7705, + "step": 3168 + }, + { + "epoch": 0.32, + "grad_norm": 1.509934012381232, + "learning_rate": 1.5842615486604674e-05, + "loss": 0.7781, + "step": 3169 + }, + { + "epoch": 0.32, + "grad_norm": 1.5273599096580284, + "learning_rate": 1.58399415034999e-05, + "loss": 0.823, + "step": 3170 + }, + { + "epoch": 0.32, + "grad_norm": 1.5482870979077432, + "learning_rate": 1.5837266886560802e-05, + "loss": 0.774, + "step": 3171 + }, + { + "epoch": 0.32, + "grad_norm": 1.4253493680036589, + "learning_rate": 1.583459163607767e-05, + "loss": 0.6898, + "step": 3172 + }, + { + "epoch": 0.32, + "grad_norm": 1.4850118758223274, + "learning_rate": 1.5831915752340855e-05, + "loss": 0.7483, + "step": 3173 + }, + { + "epoch": 0.32, + "grad_norm": 1.7053278015269768, + "learning_rate": 1.5829239235640782e-05, + "loss": 0.8189, + "step": 3174 + }, + { + "epoch": 0.32, + "grad_norm": 1.4857371360723388, + "learning_rate": 1.5826562086267956e-05, + "loss": 0.7126, + "step": 3175 + }, + { + "epoch": 0.32, + "grad_norm": 1.5745791094329897, + "learning_rate": 1.5823884304512934e-05, + "loss": 0.7374, + "step": 3176 + }, + { + "epoch": 0.32, + "grad_norm": 1.792000476054425, + "learning_rate": 1.582120589066634e-05, + "loss": 0.7418, + "step": 3177 + }, + { + "epoch": 0.32, + "grad_norm": 1.4355059010070181, + "learning_rate": 1.581852684501888e-05, + "loss": 0.7174, + "step": 3178 + }, + { + "epoch": 0.32, + "grad_norm": 1.6095170706882729, + "learning_rate": 1.5815847167861327e-05, + "loss": 0.866, + "step": 3179 + }, + { + "epoch": 0.32, + "grad_norm": 1.426623584531756, + "learning_rate": 1.5813166859484515e-05, + "loss": 0.6396, + "step": 3180 + }, + { + "epoch": 0.32, + "grad_norm": 1.3991244579834483, + "learning_rate": 1.5810485920179344e-05, + "loss": 0.6319, + "step": 3181 + }, + { + "epoch": 0.32, + "grad_norm": 1.4587604377004648, + "learning_rate": 1.5807804350236793e-05, + "loss": 0.6378, + "step": 3182 + }, + { + "epoch": 0.32, + "grad_norm": 1.426280154165666, + "learning_rate": 1.5805122149947904e-05, + "loss": 0.7459, + "step": 3183 + }, + { + "epoch": 0.32, + "grad_norm": 1.5217399746727973, + "learning_rate": 1.5802439319603786e-05, + "loss": 0.753, + "step": 3184 + }, + { + "epoch": 0.32, + "grad_norm": 1.4164338393551477, + "learning_rate": 1.5799755859495625e-05, + "loss": 0.617, + "step": 3185 + }, + { + "epoch": 0.32, + "grad_norm": 1.5097480099291565, + "learning_rate": 1.579707176991466e-05, + "loss": 0.8471, + "step": 3186 + }, + { + "epoch": 0.32, + "grad_norm": 1.72903010776992, + "learning_rate": 1.5794387051152208e-05, + "loss": 0.768, + "step": 3187 + }, + { + "epoch": 0.32, + "grad_norm": 1.6044825868083277, + "learning_rate": 1.5791701703499656e-05, + "loss": 0.7313, + "step": 3188 + }, + { + "epoch": 0.32, + "grad_norm": 1.5420586309894015, + "learning_rate": 1.578901572724846e-05, + "loss": 0.6848, + "step": 3189 + }, + { + "epoch": 0.32, + "grad_norm": 1.6985781115712704, + "learning_rate": 1.5786329122690135e-05, + "loss": 0.728, + "step": 3190 + }, + { + "epoch": 0.32, + "grad_norm": 1.3593546053483965, + "learning_rate": 1.5783641890116273e-05, + "loss": 0.5939, + "step": 3191 + }, + { + "epoch": 0.32, + "grad_norm": 1.444491418044976, + "learning_rate": 1.578095402981853e-05, + "loss": 0.6696, + "step": 3192 + }, + { + "epoch": 0.32, + "grad_norm": 1.4372557697332757, + "learning_rate": 1.577826554208863e-05, + "loss": 0.7297, + "step": 3193 + }, + { + "epoch": 0.32, + "grad_norm": 1.6857280272203305, + "learning_rate": 1.577557642721837e-05, + "loss": 0.7403, + "step": 3194 + }, + { + "epoch": 0.32, + "grad_norm": 1.5915123912985911, + "learning_rate": 1.5772886685499605e-05, + "loss": 0.767, + "step": 3195 + }, + { + "epoch": 0.33, + "grad_norm": 1.5693985857062513, + "learning_rate": 1.5770196317224267e-05, + "loss": 0.8271, + "step": 3196 + }, + { + "epoch": 0.33, + "grad_norm": 1.7371800022626132, + "learning_rate": 1.576750532268436e-05, + "loss": 0.7539, + "step": 3197 + }, + { + "epoch": 0.33, + "grad_norm": 1.4895456395950455, + "learning_rate": 1.5764813702171937e-05, + "loss": 0.7717, + "step": 3198 + }, + { + "epoch": 0.33, + "grad_norm": 1.5575816426789546, + "learning_rate": 1.5762121455979144e-05, + "loss": 0.7825, + "step": 3199 + }, + { + "epoch": 0.33, + "grad_norm": 1.5366001344613465, + "learning_rate": 1.5759428584398175e-05, + "loss": 0.7271, + "step": 3200 + }, + { + "epoch": 0.33, + "grad_norm": 1.4564051361437806, + "learning_rate": 1.5756735087721297e-05, + "loss": 0.7732, + "step": 3201 + }, + { + "epoch": 0.33, + "grad_norm": 1.4784322680239337, + "learning_rate": 1.5754040966240856e-05, + "loss": 0.6973, + "step": 3202 + }, + { + "epoch": 0.33, + "grad_norm": 1.547436781038291, + "learning_rate": 1.575134622024925e-05, + "loss": 0.7008, + "step": 3203 + }, + { + "epoch": 0.33, + "grad_norm": 1.6936529434866159, + "learning_rate": 1.5748650850038952e-05, + "loss": 0.8086, + "step": 3204 + }, + { + "epoch": 0.33, + "grad_norm": 1.5107765210344077, + "learning_rate": 1.57459548559025e-05, + "loss": 0.6434, + "step": 3205 + }, + { + "epoch": 0.33, + "grad_norm": 1.4124165547616998, + "learning_rate": 1.5743258238132506e-05, + "loss": 0.7412, + "step": 3206 + }, + { + "epoch": 0.33, + "grad_norm": 1.6505896344500248, + "learning_rate": 1.574056099702165e-05, + "loss": 0.7154, + "step": 3207 + }, + { + "epoch": 0.33, + "grad_norm": 1.3868823830634551, + "learning_rate": 1.5737863132862667e-05, + "loss": 0.6848, + "step": 3208 + }, + { + "epoch": 0.33, + "grad_norm": 1.5852957586226968, + "learning_rate": 1.5735164645948365e-05, + "loss": 0.719, + "step": 3209 + }, + { + "epoch": 0.33, + "grad_norm": 1.5387482663631675, + "learning_rate": 1.5732465536571635e-05, + "loss": 0.7347, + "step": 3210 + }, + { + "epoch": 0.33, + "grad_norm": 1.5964984312736819, + "learning_rate": 1.5729765805025416e-05, + "loss": 0.7555, + "step": 3211 + }, + { + "epoch": 0.33, + "grad_norm": 1.472043261569749, + "learning_rate": 1.5727065451602722e-05, + "loss": 0.737, + "step": 3212 + }, + { + "epoch": 0.33, + "grad_norm": 1.7394590989294285, + "learning_rate": 1.5724364476596637e-05, + "loss": 0.8139, + "step": 3213 + }, + { + "epoch": 0.33, + "grad_norm": 1.4609690623304736, + "learning_rate": 1.5721662880300303e-05, + "loss": 0.8106, + "step": 3214 + }, + { + "epoch": 0.33, + "grad_norm": 1.4984659414493768, + "learning_rate": 1.5718960663006942e-05, + "loss": 0.8126, + "step": 3215 + }, + { + "epoch": 0.33, + "grad_norm": 1.5078764678998093, + "learning_rate": 1.5716257825009836e-05, + "loss": 0.7072, + "step": 3216 + }, + { + "epoch": 0.33, + "grad_norm": 1.6217296271864472, + "learning_rate": 1.5713554366602333e-05, + "loss": 0.808, + "step": 3217 + }, + { + "epoch": 0.33, + "grad_norm": 1.6107267709731687, + "learning_rate": 1.571085028807786e-05, + "loss": 0.7409, + "step": 3218 + }, + { + "epoch": 0.33, + "grad_norm": 1.496304836221755, + "learning_rate": 1.5708145589729887e-05, + "loss": 0.7075, + "step": 3219 + }, + { + "epoch": 0.33, + "grad_norm": 1.6943857719430757, + "learning_rate": 1.5705440271851982e-05, + "loss": 0.82, + "step": 3220 + }, + { + "epoch": 0.33, + "grad_norm": 1.5916792170173406, + "learning_rate": 1.570273433473776e-05, + "loss": 0.7467, + "step": 3221 + }, + { + "epoch": 0.33, + "grad_norm": 1.5585203351560093, + "learning_rate": 1.57000277786809e-05, + "loss": 0.7451, + "step": 3222 + }, + { + "epoch": 0.33, + "grad_norm": 1.5284408972175045, + "learning_rate": 1.569732060397517e-05, + "loss": 0.7689, + "step": 3223 + }, + { + "epoch": 0.33, + "grad_norm": 1.593389917270574, + "learning_rate": 1.5694612810914378e-05, + "loss": 0.7555, + "step": 3224 + }, + { + "epoch": 0.33, + "grad_norm": 1.5395338356559685, + "learning_rate": 1.5691904399792425e-05, + "loss": 0.7338, + "step": 3225 + }, + { + "epoch": 0.33, + "grad_norm": 1.6093287327816626, + "learning_rate": 1.5689195370903258e-05, + "loss": 0.7608, + "step": 3226 + }, + { + "epoch": 0.33, + "grad_norm": 1.5416980247313807, + "learning_rate": 1.56864857245409e-05, + "loss": 0.6709, + "step": 3227 + }, + { + "epoch": 0.33, + "grad_norm": 1.6869407406650818, + "learning_rate": 1.5683775460999446e-05, + "loss": 0.7468, + "step": 3228 + }, + { + "epoch": 0.33, + "grad_norm": 1.5516293625736395, + "learning_rate": 1.568106458057305e-05, + "loss": 0.7228, + "step": 3229 + }, + { + "epoch": 0.33, + "grad_norm": 1.5785437584600943, + "learning_rate": 1.5678353083555938e-05, + "loss": 0.743, + "step": 3230 + }, + { + "epoch": 0.33, + "grad_norm": 1.5095253750379387, + "learning_rate": 1.5675640970242393e-05, + "loss": 0.7638, + "step": 3231 + }, + { + "epoch": 0.33, + "grad_norm": 1.6052697059376566, + "learning_rate": 1.5672928240926782e-05, + "loss": 0.7752, + "step": 3232 + }, + { + "epoch": 0.33, + "grad_norm": 1.6343953052917846, + "learning_rate": 1.5670214895903522e-05, + "loss": 0.7411, + "step": 3233 + }, + { + "epoch": 0.33, + "grad_norm": 1.5285162580731912, + "learning_rate": 1.5667500935467112e-05, + "loss": 0.866, + "step": 3234 + }, + { + "epoch": 0.33, + "grad_norm": 1.698977154811488, + "learning_rate": 1.56647863599121e-05, + "loss": 0.7947, + "step": 3235 + }, + { + "epoch": 0.33, + "grad_norm": 1.4866475904993897, + "learning_rate": 1.566207116953312e-05, + "loss": 0.718, + "step": 3236 + }, + { + "epoch": 0.33, + "grad_norm": 1.7092189587646751, + "learning_rate": 1.5659355364624856e-05, + "loss": 0.653, + "step": 3237 + }, + { + "epoch": 0.33, + "grad_norm": 1.503865433297925, + "learning_rate": 1.565663894548207e-05, + "loss": 0.7405, + "step": 3238 + }, + { + "epoch": 0.33, + "grad_norm": 1.4975521811870942, + "learning_rate": 1.565392191239959e-05, + "loss": 0.7793, + "step": 3239 + }, + { + "epoch": 0.33, + "grad_norm": 1.669709188715031, + "learning_rate": 1.5651204265672305e-05, + "loss": 0.8376, + "step": 3240 + }, + { + "epoch": 0.33, + "grad_norm": 1.6396041162136814, + "learning_rate": 1.5648486005595167e-05, + "loss": 0.6777, + "step": 3241 + }, + { + "epoch": 0.33, + "grad_norm": 1.6140187825941346, + "learning_rate": 1.5645767132463205e-05, + "loss": 0.7828, + "step": 3242 + }, + { + "epoch": 0.33, + "grad_norm": 1.646485113867051, + "learning_rate": 1.5643047646571515e-05, + "loss": 0.7514, + "step": 3243 + }, + { + "epoch": 0.33, + "grad_norm": 1.451040577376004, + "learning_rate": 1.5640327548215245e-05, + "loss": 0.7091, + "step": 3244 + }, + { + "epoch": 0.33, + "grad_norm": 1.5039928013791208, + "learning_rate": 1.5637606837689632e-05, + "loss": 0.6726, + "step": 3245 + }, + { + "epoch": 0.33, + "grad_norm": 1.6029000330987817, + "learning_rate": 1.563488551528995e-05, + "loss": 0.8024, + "step": 3246 + }, + { + "epoch": 0.33, + "grad_norm": 1.5792729716232257, + "learning_rate": 1.563216358131157e-05, + "loss": 0.8164, + "step": 3247 + }, + { + "epoch": 0.33, + "grad_norm": 1.570838961503552, + "learning_rate": 1.5629441036049914e-05, + "loss": 0.7266, + "step": 3248 + }, + { + "epoch": 0.33, + "grad_norm": 1.4417182766874617, + "learning_rate": 1.5626717879800465e-05, + "loss": 0.7966, + "step": 3249 + }, + { + "epoch": 0.33, + "grad_norm": 1.5969231950371279, + "learning_rate": 1.562399411285878e-05, + "loss": 0.753, + "step": 3250 + }, + { + "epoch": 0.33, + "grad_norm": 1.5993321650315608, + "learning_rate": 1.5621269735520485e-05, + "loss": 0.7534, + "step": 3251 + }, + { + "epoch": 0.33, + "grad_norm": 1.721250803274356, + "learning_rate": 1.5618544748081264e-05, + "loss": 0.8658, + "step": 3252 + }, + { + "epoch": 0.33, + "grad_norm": 1.6157898329866196, + "learning_rate": 1.561581915083688e-05, + "loss": 0.7707, + "step": 3253 + }, + { + "epoch": 0.33, + "grad_norm": 1.4024982634493652, + "learning_rate": 1.561309294408315e-05, + "loss": 0.6503, + "step": 3254 + }, + { + "epoch": 0.33, + "grad_norm": 1.6343740262884703, + "learning_rate": 1.5610366128115955e-05, + "loss": 0.8926, + "step": 3255 + }, + { + "epoch": 0.33, + "grad_norm": 1.6057197627756363, + "learning_rate": 1.5607638703231252e-05, + "loss": 0.7579, + "step": 3256 + }, + { + "epoch": 0.33, + "grad_norm": 1.4338536433313815, + "learning_rate": 1.5604910669725066e-05, + "loss": 0.7252, + "step": 3257 + }, + { + "epoch": 0.33, + "grad_norm": 1.590164273276705, + "learning_rate": 1.5602182027893475e-05, + "loss": 0.8045, + "step": 3258 + }, + { + "epoch": 0.33, + "grad_norm": 1.7041979020864662, + "learning_rate": 1.5599452778032634e-05, + "loss": 0.689, + "step": 3259 + }, + { + "epoch": 0.33, + "grad_norm": 1.4874237126040357, + "learning_rate": 1.559672292043876e-05, + "loss": 0.7374, + "step": 3260 + }, + { + "epoch": 0.33, + "grad_norm": 1.4925645152044493, + "learning_rate": 1.5593992455408136e-05, + "loss": 0.6849, + "step": 3261 + }, + { + "epoch": 0.33, + "grad_norm": 1.5962062522842229, + "learning_rate": 1.559126138323711e-05, + "loss": 0.8662, + "step": 3262 + }, + { + "epoch": 0.33, + "grad_norm": 1.4143071400598322, + "learning_rate": 1.55885297042221e-05, + "loss": 0.6973, + "step": 3263 + }, + { + "epoch": 0.33, + "grad_norm": 1.4838263696618768, + "learning_rate": 1.5585797418659584e-05, + "loss": 0.6286, + "step": 3264 + }, + { + "epoch": 0.33, + "grad_norm": 1.5525624545464243, + "learning_rate": 1.558306452684611e-05, + "loss": 0.7891, + "step": 3265 + }, + { + "epoch": 0.33, + "grad_norm": 1.4785889855698298, + "learning_rate": 1.5580331029078294e-05, + "loss": 0.7668, + "step": 3266 + }, + { + "epoch": 0.33, + "grad_norm": 1.5508249032227592, + "learning_rate": 1.5577596925652808e-05, + "loss": 0.6957, + "step": 3267 + }, + { + "epoch": 0.33, + "grad_norm": 1.3613392374759203, + "learning_rate": 1.5574862216866403e-05, + "loss": 0.7151, + "step": 3268 + }, + { + "epoch": 0.33, + "grad_norm": 1.5786788804197516, + "learning_rate": 1.557212690301588e-05, + "loss": 0.6864, + "step": 3269 + }, + { + "epoch": 0.33, + "grad_norm": 1.681522707673013, + "learning_rate": 1.5569390984398127e-05, + "loss": 0.794, + "step": 3270 + }, + { + "epoch": 0.33, + "grad_norm": 1.682966125140333, + "learning_rate": 1.5566654461310073e-05, + "loss": 0.7273, + "step": 3271 + }, + { + "epoch": 0.33, + "grad_norm": 1.5554113078407963, + "learning_rate": 1.556391733404873e-05, + "loss": 0.7791, + "step": 3272 + }, + { + "epoch": 0.33, + "grad_norm": 1.6006383408949836, + "learning_rate": 1.5561179602911173e-05, + "loss": 0.7161, + "step": 3273 + }, + { + "epoch": 0.33, + "grad_norm": 1.5815849883781743, + "learning_rate": 1.5558441268194537e-05, + "loss": 0.6785, + "step": 3274 + }, + { + "epoch": 0.33, + "grad_norm": 1.4786161669319569, + "learning_rate": 1.5555702330196024e-05, + "loss": 0.6685, + "step": 3275 + }, + { + "epoch": 0.33, + "grad_norm": 1.5620239060191075, + "learning_rate": 1.5552962789212904e-05, + "loss": 0.7152, + "step": 3276 + }, + { + "epoch": 0.33, + "grad_norm": 1.4996886244762353, + "learning_rate": 1.5550222645542517e-05, + "loss": 0.7554, + "step": 3277 + }, + { + "epoch": 0.33, + "grad_norm": 1.6078257952546522, + "learning_rate": 1.5547481899482252e-05, + "loss": 0.7861, + "step": 3278 + }, + { + "epoch": 0.33, + "grad_norm": 1.5363310435057185, + "learning_rate": 1.554474055132958e-05, + "loss": 0.7803, + "step": 3279 + }, + { + "epoch": 0.33, + "grad_norm": 1.5009334366953122, + "learning_rate": 1.5541998601382035e-05, + "loss": 0.7894, + "step": 3280 + }, + { + "epoch": 0.33, + "grad_norm": 1.5874154896475607, + "learning_rate": 1.5539256049937205e-05, + "loss": 0.6396, + "step": 3281 + }, + { + "epoch": 0.33, + "grad_norm": 1.5775813348209935, + "learning_rate": 1.5536512897292757e-05, + "loss": 0.7288, + "step": 3282 + }, + { + "epoch": 0.33, + "grad_norm": 1.6513279054317311, + "learning_rate": 1.5533769143746416e-05, + "loss": 0.7524, + "step": 3283 + }, + { + "epoch": 0.33, + "grad_norm": 1.3792290072180375, + "learning_rate": 1.5531024789595968e-05, + "loss": 0.6658, + "step": 3284 + }, + { + "epoch": 0.33, + "grad_norm": 1.4989661405348915, + "learning_rate": 1.552827983513928e-05, + "loss": 0.7818, + "step": 3285 + }, + { + "epoch": 0.33, + "grad_norm": 1.4834997754072567, + "learning_rate": 1.552553428067427e-05, + "loss": 0.8217, + "step": 3286 + }, + { + "epoch": 0.33, + "grad_norm": 1.584344077792234, + "learning_rate": 1.5522788126498916e-05, + "loss": 0.8188, + "step": 3287 + }, + { + "epoch": 0.33, + "grad_norm": 1.4302097903103494, + "learning_rate": 1.5520041372911286e-05, + "loss": 0.7642, + "step": 3288 + }, + { + "epoch": 0.33, + "grad_norm": 1.6986211356952792, + "learning_rate": 1.5517294020209483e-05, + "loss": 0.796, + "step": 3289 + }, + { + "epoch": 0.33, + "grad_norm": 1.670903396086338, + "learning_rate": 1.5514546068691697e-05, + "loss": 0.8377, + "step": 3290 + }, + { + "epoch": 0.33, + "grad_norm": 1.6445195419654928, + "learning_rate": 1.5511797518656174e-05, + "loss": 0.868, + "step": 3291 + }, + { + "epoch": 0.33, + "grad_norm": 1.6439578769044951, + "learning_rate": 1.5509048370401224e-05, + "loss": 0.7765, + "step": 3292 + }, + { + "epoch": 0.33, + "grad_norm": 1.608729331314093, + "learning_rate": 1.5506298624225225e-05, + "loss": 0.7792, + "step": 3293 + }, + { + "epoch": 0.34, + "grad_norm": 1.5707601205056936, + "learning_rate": 1.550354828042662e-05, + "loss": 0.7926, + "step": 3294 + }, + { + "epoch": 0.34, + "grad_norm": 1.493563824960474, + "learning_rate": 1.5500797339303913e-05, + "loss": 0.7154, + "step": 3295 + }, + { + "epoch": 0.34, + "grad_norm": 1.5552815430878002, + "learning_rate": 1.549804580115568e-05, + "loss": 0.647, + "step": 3296 + }, + { + "epoch": 0.34, + "grad_norm": 1.3313103567796118, + "learning_rate": 1.5495293666280556e-05, + "loss": 0.7092, + "step": 3297 + }, + { + "epoch": 0.34, + "grad_norm": 1.8987395905196398, + "learning_rate": 1.549254093497724e-05, + "loss": 0.8331, + "step": 3298 + }, + { + "epoch": 0.34, + "grad_norm": 1.6075348752193146, + "learning_rate": 1.5489787607544498e-05, + "loss": 0.8554, + "step": 3299 + }, + { + "epoch": 0.34, + "grad_norm": 1.6698223611640848, + "learning_rate": 1.5487033684281163e-05, + "loss": 0.7023, + "step": 3300 + }, + { + "epoch": 0.34, + "grad_norm": 1.5546756934383337, + "learning_rate": 1.548427916548613e-05, + "loss": 0.882, + "step": 3301 + }, + { + "epoch": 0.34, + "grad_norm": 1.5788479756963842, + "learning_rate": 1.5481524051458356e-05, + "loss": 0.6948, + "step": 3302 + }, + { + "epoch": 0.34, + "grad_norm": 1.5474807971285485, + "learning_rate": 1.5478768342496872e-05, + "loss": 0.8409, + "step": 3303 + }, + { + "epoch": 0.34, + "grad_norm": 1.6709675283696996, + "learning_rate": 1.547601203890076e-05, + "loss": 0.7155, + "step": 3304 + }, + { + "epoch": 0.34, + "grad_norm": 1.5105573645817394, + "learning_rate": 1.5473255140969176e-05, + "loss": 0.6846, + "step": 3305 + }, + { + "epoch": 0.34, + "grad_norm": 1.5219539065398735, + "learning_rate": 1.5470497649001336e-05, + "loss": 0.7485, + "step": 3306 + }, + { + "epoch": 0.34, + "grad_norm": 1.5137795824059195, + "learning_rate": 1.5467739563296528e-05, + "loss": 0.8512, + "step": 3307 + }, + { + "epoch": 0.34, + "grad_norm": 1.5675663536182731, + "learning_rate": 1.54649808841541e-05, + "loss": 0.8345, + "step": 3308 + }, + { + "epoch": 0.34, + "grad_norm": 1.583690197417576, + "learning_rate": 1.546222161187345e-05, + "loss": 0.7969, + "step": 3309 + }, + { + "epoch": 0.34, + "grad_norm": 1.4902741507582222, + "learning_rate": 1.5459461746754075e-05, + "loss": 0.6601, + "step": 3310 + }, + { + "epoch": 0.34, + "grad_norm": 1.5923296984248694, + "learning_rate": 1.5456701289095496e-05, + "loss": 0.7259, + "step": 3311 + }, + { + "epoch": 0.34, + "grad_norm": 1.569763023547656, + "learning_rate": 1.5453940239197328e-05, + "loss": 0.7625, + "step": 3312 + }, + { + "epoch": 0.34, + "grad_norm": 1.588716506288604, + "learning_rate": 1.5451178597359237e-05, + "loss": 0.7954, + "step": 3313 + }, + { + "epoch": 0.34, + "grad_norm": 1.458630633504389, + "learning_rate": 1.5448416363880956e-05, + "loss": 0.7124, + "step": 3314 + }, + { + "epoch": 0.34, + "grad_norm": 1.4807992133698338, + "learning_rate": 1.5445653539062283e-05, + "loss": 0.8119, + "step": 3315 + }, + { + "epoch": 0.34, + "grad_norm": 1.6550269578144803, + "learning_rate": 1.5442890123203077e-05, + "loss": 0.8211, + "step": 3316 + }, + { + "epoch": 0.34, + "grad_norm": 1.6472584346267662, + "learning_rate": 1.544012611660327e-05, + "loss": 0.7719, + "step": 3317 + }, + { + "epoch": 0.34, + "grad_norm": 1.6462001997415807, + "learning_rate": 1.5437361519562843e-05, + "loss": 0.9011, + "step": 3318 + }, + { + "epoch": 0.34, + "grad_norm": 1.6298769830925262, + "learning_rate": 1.5434596332381853e-05, + "loss": 0.8106, + "step": 3319 + }, + { + "epoch": 0.34, + "grad_norm": 1.6925339646508504, + "learning_rate": 1.543183055536042e-05, + "loss": 0.7336, + "step": 3320 + }, + { + "epoch": 0.34, + "grad_norm": 1.5500185607465797, + "learning_rate": 1.5429064188798727e-05, + "loss": 0.6836, + "step": 3321 + }, + { + "epoch": 0.34, + "grad_norm": 1.6311566638530315, + "learning_rate": 1.5426297232997016e-05, + "loss": 0.6382, + "step": 3322 + }, + { + "epoch": 0.34, + "grad_norm": 1.482678790650085, + "learning_rate": 1.5423529688255593e-05, + "loss": 0.7339, + "step": 3323 + }, + { + "epoch": 0.34, + "grad_norm": 1.4653855741666746, + "learning_rate": 1.542076155487484e-05, + "loss": 0.8448, + "step": 3324 + }, + { + "epoch": 0.34, + "grad_norm": 1.428470162563532, + "learning_rate": 1.54179928331552e-05, + "loss": 0.7418, + "step": 3325 + }, + { + "epoch": 0.34, + "grad_norm": 1.5905801823778656, + "learning_rate": 1.5415223523397153e-05, + "loss": 0.7248, + "step": 3326 + }, + { + "epoch": 0.34, + "grad_norm": 1.7581095189115765, + "learning_rate": 1.541245362590128e-05, + "loss": 0.7834, + "step": 3327 + }, + { + "epoch": 0.34, + "grad_norm": 1.5621095736065527, + "learning_rate": 1.5409683140968213e-05, + "loss": 0.7208, + "step": 3328 + }, + { + "epoch": 0.34, + "grad_norm": 1.7278372548733352, + "learning_rate": 1.540691206889864e-05, + "loss": 0.6603, + "step": 3329 + }, + { + "epoch": 0.34, + "grad_norm": 1.4516693914835002, + "learning_rate": 1.540414040999331e-05, + "loss": 0.76, + "step": 3330 + }, + { + "epoch": 0.34, + "grad_norm": 1.4918138555527303, + "learning_rate": 1.5401368164553054e-05, + "loss": 0.7142, + "step": 3331 + }, + { + "epoch": 0.34, + "grad_norm": 1.567648550614192, + "learning_rate": 1.539859533287875e-05, + "loss": 0.6528, + "step": 3332 + }, + { + "epoch": 0.34, + "grad_norm": 1.5803270851499127, + "learning_rate": 1.5395821915271344e-05, + "loss": 0.7476, + "step": 3333 + }, + { + "epoch": 0.34, + "grad_norm": 1.5112348972644198, + "learning_rate": 1.539304791203186e-05, + "loss": 0.7673, + "step": 3334 + }, + { + "epoch": 0.34, + "grad_norm": 1.6631107341011613, + "learning_rate": 1.5390273323461354e-05, + "loss": 0.7598, + "step": 3335 + }, + { + "epoch": 0.34, + "grad_norm": 1.656394575426843, + "learning_rate": 1.5387498149860975e-05, + "loss": 0.8525, + "step": 3336 + }, + { + "epoch": 0.34, + "grad_norm": 1.4690253202750414, + "learning_rate": 1.5384722391531922e-05, + "loss": 0.6093, + "step": 3337 + }, + { + "epoch": 0.34, + "grad_norm": 1.6346574420144158, + "learning_rate": 1.5381946048775462e-05, + "loss": 0.7501, + "step": 3338 + }, + { + "epoch": 0.34, + "grad_norm": 1.446909212096057, + "learning_rate": 1.5379169121892925e-05, + "loss": 0.7256, + "step": 3339 + }, + { + "epoch": 0.34, + "grad_norm": 1.4950797036537102, + "learning_rate": 1.5376391611185703e-05, + "loss": 0.6672, + "step": 3340 + }, + { + "epoch": 0.34, + "grad_norm": 1.6664378639633073, + "learning_rate": 1.5373613516955243e-05, + "loss": 0.7275, + "step": 3341 + }, + { + "epoch": 0.34, + "grad_norm": 1.5257888571405214, + "learning_rate": 1.537083483950307e-05, + "loss": 0.7199, + "step": 3342 + }, + { + "epoch": 0.34, + "grad_norm": 1.484698622122737, + "learning_rate": 1.5368055579130768e-05, + "loss": 0.7321, + "step": 3343 + }, + { + "epoch": 0.34, + "grad_norm": 1.6038707286398843, + "learning_rate": 1.5365275736139978e-05, + "loss": 0.7752, + "step": 3344 + }, + { + "epoch": 0.34, + "grad_norm": 1.484225545623433, + "learning_rate": 1.536249531083241e-05, + "loss": 0.655, + "step": 3345 + }, + { + "epoch": 0.34, + "grad_norm": 1.4082424654428607, + "learning_rate": 1.5359714303509838e-05, + "loss": 0.7996, + "step": 3346 + }, + { + "epoch": 0.34, + "grad_norm": 1.6178201732198476, + "learning_rate": 1.5356932714474096e-05, + "loss": 0.7231, + "step": 3347 + }, + { + "epoch": 0.34, + "grad_norm": 1.5148520011513111, + "learning_rate": 1.535415054402708e-05, + "loss": 0.7029, + "step": 3348 + }, + { + "epoch": 0.34, + "grad_norm": 1.6893975573942526, + "learning_rate": 1.535136779247075e-05, + "loss": 0.7314, + "step": 3349 + }, + { + "epoch": 0.34, + "grad_norm": 1.5313278686857816, + "learning_rate": 1.5348584460107132e-05, + "loss": 0.8194, + "step": 3350 + }, + { + "epoch": 0.34, + "grad_norm": 1.6152196301716772, + "learning_rate": 1.5345800547238315e-05, + "loss": 0.7202, + "step": 3351 + }, + { + "epoch": 0.34, + "grad_norm": 1.3996437024777064, + "learning_rate": 1.534301605416645e-05, + "loss": 0.7516, + "step": 3352 + }, + { + "epoch": 0.34, + "grad_norm": 1.3860359296983389, + "learning_rate": 1.5340230981193745e-05, + "loss": 0.6924, + "step": 3353 + }, + { + "epoch": 0.34, + "grad_norm": 1.5472239021599388, + "learning_rate": 1.5337445328622478e-05, + "loss": 0.6571, + "step": 3354 + }, + { + "epoch": 0.34, + "grad_norm": 1.6599870981410105, + "learning_rate": 1.533465909675499e-05, + "loss": 0.7952, + "step": 3355 + }, + { + "epoch": 0.34, + "grad_norm": 1.5075921361400568, + "learning_rate": 1.5331872285893682e-05, + "loss": 0.6703, + "step": 3356 + }, + { + "epoch": 0.34, + "grad_norm": 1.4520303260348506, + "learning_rate": 1.5329084896341017e-05, + "loss": 0.6917, + "step": 3357 + }, + { + "epoch": 0.34, + "grad_norm": 1.509243271112511, + "learning_rate": 1.5326296928399527e-05, + "loss": 0.7536, + "step": 3358 + }, + { + "epoch": 0.34, + "grad_norm": 1.4780434353646639, + "learning_rate": 1.5323508382371796e-05, + "loss": 0.8208, + "step": 3359 + }, + { + "epoch": 0.34, + "grad_norm": 1.7156394695200816, + "learning_rate": 1.5320719258560482e-05, + "loss": 0.6873, + "step": 3360 + }, + { + "epoch": 0.34, + "grad_norm": 1.569804942520179, + "learning_rate": 1.53179295572683e-05, + "loss": 0.6311, + "step": 3361 + }, + { + "epoch": 0.34, + "grad_norm": 1.43936954024699, + "learning_rate": 1.5315139278798027e-05, + "loss": 0.7471, + "step": 3362 + }, + { + "epoch": 0.34, + "grad_norm": 1.531157207106405, + "learning_rate": 1.5312348423452506e-05, + "loss": 0.7707, + "step": 3363 + }, + { + "epoch": 0.34, + "grad_norm": 1.6599684182504466, + "learning_rate": 1.5309556991534636e-05, + "loss": 0.8001, + "step": 3364 + }, + { + "epoch": 0.34, + "grad_norm": 1.7921342018549509, + "learning_rate": 1.530676498334739e-05, + "loss": 0.7317, + "step": 3365 + }, + { + "epoch": 0.34, + "grad_norm": 1.6102687290223747, + "learning_rate": 1.5303972399193795e-05, + "loss": 0.8126, + "step": 3366 + }, + { + "epoch": 0.34, + "grad_norm": 1.7942876571579665, + "learning_rate": 1.5301179239376936e-05, + "loss": 0.8059, + "step": 3367 + }, + { + "epoch": 0.34, + "grad_norm": 1.5295088337332308, + "learning_rate": 1.5298385504199975e-05, + "loss": 0.6891, + "step": 3368 + }, + { + "epoch": 0.34, + "grad_norm": 1.5894517073762104, + "learning_rate": 1.5295591193966125e-05, + "loss": 0.8017, + "step": 3369 + }, + { + "epoch": 0.34, + "grad_norm": 1.5526743477678926, + "learning_rate": 1.5292796308978662e-05, + "loss": 0.6994, + "step": 3370 + }, + { + "epoch": 0.34, + "grad_norm": 1.5052162467039785, + "learning_rate": 1.529000084954093e-05, + "loss": 0.7152, + "step": 3371 + }, + { + "epoch": 0.34, + "grad_norm": 1.6780524182232355, + "learning_rate": 1.5287204815956334e-05, + "loss": 0.7928, + "step": 3372 + }, + { + "epoch": 0.34, + "grad_norm": 1.683032929660594, + "learning_rate": 1.5284408208528342e-05, + "loss": 0.755, + "step": 3373 + }, + { + "epoch": 0.34, + "grad_norm": 1.6660135300486254, + "learning_rate": 1.5281611027560473e-05, + "loss": 0.697, + "step": 3374 + }, + { + "epoch": 0.34, + "grad_norm": 1.5766118490644017, + "learning_rate": 1.527881327335632e-05, + "loss": 0.6787, + "step": 3375 + }, + { + "epoch": 0.34, + "grad_norm": 1.6779578367810388, + "learning_rate": 1.5276014946219546e-05, + "loss": 0.919, + "step": 3376 + }, + { + "epoch": 0.34, + "grad_norm": 1.6525866035261925, + "learning_rate": 1.5273216046453847e-05, + "loss": 0.8289, + "step": 3377 + }, + { + "epoch": 0.34, + "grad_norm": 1.6010039332657084, + "learning_rate": 1.5270416574363015e-05, + "loss": 0.7289, + "step": 3378 + }, + { + "epoch": 0.34, + "grad_norm": 1.7230029616329539, + "learning_rate": 1.5267616530250885e-05, + "loss": 0.7253, + "step": 3379 + }, + { + "epoch": 0.34, + "grad_norm": 1.4597544106926883, + "learning_rate": 1.5264815914421355e-05, + "loss": 0.6624, + "step": 3380 + }, + { + "epoch": 0.34, + "grad_norm": 1.6520878222696886, + "learning_rate": 1.526201472717839e-05, + "loss": 0.7278, + "step": 3381 + }, + { + "epoch": 0.34, + "grad_norm": 1.455043108180653, + "learning_rate": 1.5259212968826013e-05, + "loss": 0.7888, + "step": 3382 + }, + { + "epoch": 0.34, + "grad_norm": 1.4135719140088696, + "learning_rate": 1.5256410639668317e-05, + "loss": 0.6792, + "step": 3383 + }, + { + "epoch": 0.34, + "grad_norm": 1.5148945144636952, + "learning_rate": 1.5253607740009447e-05, + "loss": 0.7648, + "step": 3384 + }, + { + "epoch": 0.34, + "grad_norm": 1.3845562724566336, + "learning_rate": 1.5250804270153614e-05, + "loss": 0.7052, + "step": 3385 + }, + { + "epoch": 0.34, + "grad_norm": 1.4782533714227502, + "learning_rate": 1.5248000230405086e-05, + "loss": 0.8368, + "step": 3386 + }, + { + "epoch": 0.34, + "grad_norm": 1.6029221711674988, + "learning_rate": 1.5245195621068207e-05, + "loss": 0.8218, + "step": 3387 + }, + { + "epoch": 0.34, + "grad_norm": 1.7594558175947395, + "learning_rate": 1.524239044244737e-05, + "loss": 0.8015, + "step": 3388 + }, + { + "epoch": 0.34, + "grad_norm": 1.5221214843094437, + "learning_rate": 1.5239584694847032e-05, + "loss": 0.6736, + "step": 3389 + }, + { + "epoch": 0.34, + "grad_norm": 1.6120502289852638, + "learning_rate": 1.523677837857171e-05, + "loss": 0.7201, + "step": 3390 + }, + { + "epoch": 0.34, + "grad_norm": 1.6352747168282848, + "learning_rate": 1.523397149392599e-05, + "loss": 0.7127, + "step": 3391 + }, + { + "epoch": 0.35, + "grad_norm": 1.5988918051963217, + "learning_rate": 1.5231164041214515e-05, + "loss": 0.868, + "step": 3392 + }, + { + "epoch": 0.35, + "grad_norm": 1.5813185864787247, + "learning_rate": 1.5228356020741991e-05, + "loss": 0.867, + "step": 3393 + }, + { + "epoch": 0.35, + "grad_norm": 1.5204761871068275, + "learning_rate": 1.5225547432813184e-05, + "loss": 0.7359, + "step": 3394 + }, + { + "epoch": 0.35, + "grad_norm": 1.4075043465821404, + "learning_rate": 1.5222738277732918e-05, + "loss": 0.7087, + "step": 3395 + }, + { + "epoch": 0.35, + "grad_norm": 1.5446258138687168, + "learning_rate": 1.5219928555806087e-05, + "loss": 0.7042, + "step": 3396 + }, + { + "epoch": 0.35, + "grad_norm": 1.4503770486789294, + "learning_rate": 1.5217118267337645e-05, + "loss": 0.6952, + "step": 3397 + }, + { + "epoch": 0.35, + "grad_norm": 1.5964787717962494, + "learning_rate": 1.52143074126326e-05, + "loss": 0.709, + "step": 3398 + }, + { + "epoch": 0.35, + "grad_norm": 1.6755609053141203, + "learning_rate": 1.5211495991996029e-05, + "loss": 0.798, + "step": 3399 + }, + { + "epoch": 0.35, + "grad_norm": 1.7113459909179223, + "learning_rate": 1.5208684005733063e-05, + "loss": 0.7412, + "step": 3400 + }, + { + "epoch": 0.35, + "grad_norm": 1.4563857410337533, + "learning_rate": 1.5205871454148909e-05, + "loss": 0.716, + "step": 3401 + }, + { + "epoch": 0.35, + "grad_norm": 1.6121262428835532, + "learning_rate": 1.5203058337548813e-05, + "loss": 0.7882, + "step": 3402 + }, + { + "epoch": 0.35, + "grad_norm": 1.6361418926139473, + "learning_rate": 1.5200244656238106e-05, + "loss": 0.7199, + "step": 3403 + }, + { + "epoch": 0.35, + "grad_norm": 1.6940134297858211, + "learning_rate": 1.519743041052216e-05, + "loss": 0.7211, + "step": 3404 + }, + { + "epoch": 0.35, + "grad_norm": 1.51968529644561, + "learning_rate": 1.5194615600706426e-05, + "loss": 0.7086, + "step": 3405 + }, + { + "epoch": 0.35, + "grad_norm": 1.4702030040840666, + "learning_rate": 1.51918002270964e-05, + "loss": 0.6855, + "step": 3406 + }, + { + "epoch": 0.35, + "grad_norm": 1.454815808229764, + "learning_rate": 1.5188984289997653e-05, + "loss": 0.6909, + "step": 3407 + }, + { + "epoch": 0.35, + "grad_norm": 1.5142798550349588, + "learning_rate": 1.5186167789715805e-05, + "loss": 0.7542, + "step": 3408 + }, + { + "epoch": 0.35, + "grad_norm": 1.3422009964047386, + "learning_rate": 1.5183350726556545e-05, + "loss": 0.747, + "step": 3409 + }, + { + "epoch": 0.35, + "grad_norm": 1.3183729374682407, + "learning_rate": 1.5180533100825625e-05, + "loss": 0.7534, + "step": 3410 + }, + { + "epoch": 0.35, + "grad_norm": 1.6271445029553226, + "learning_rate": 1.5177714912828849e-05, + "loss": 0.8038, + "step": 3411 + }, + { + "epoch": 0.35, + "grad_norm": 1.4796260691101912, + "learning_rate": 1.5174896162872093e-05, + "loss": 0.748, + "step": 3412 + }, + { + "epoch": 0.35, + "grad_norm": 1.4917898706741968, + "learning_rate": 1.5172076851261284e-05, + "loss": 0.7499, + "step": 3413 + }, + { + "epoch": 0.35, + "grad_norm": 1.4965042302274694, + "learning_rate": 1.5169256978302414e-05, + "loss": 0.7906, + "step": 3414 + }, + { + "epoch": 0.35, + "grad_norm": 1.5864985396719788, + "learning_rate": 1.5166436544301537e-05, + "loss": 0.7249, + "step": 3415 + }, + { + "epoch": 0.35, + "grad_norm": 1.60300863633716, + "learning_rate": 1.5163615549564767e-05, + "loss": 0.848, + "step": 3416 + }, + { + "epoch": 0.35, + "grad_norm": 1.522315570698071, + "learning_rate": 1.5160793994398279e-05, + "loss": 0.7615, + "step": 3417 + }, + { + "epoch": 0.35, + "grad_norm": 1.638295121816351, + "learning_rate": 1.5157971879108312e-05, + "loss": 0.7377, + "step": 3418 + }, + { + "epoch": 0.35, + "grad_norm": 1.6293378893887394, + "learning_rate": 1.5155149204001154e-05, + "loss": 0.8073, + "step": 3419 + }, + { + "epoch": 0.35, + "grad_norm": 1.6763894967757471, + "learning_rate": 1.5152325969383173e-05, + "loss": 0.6717, + "step": 3420 + }, + { + "epoch": 0.35, + "grad_norm": 1.6242355549321144, + "learning_rate": 1.5149502175560777e-05, + "loss": 0.7134, + "step": 3421 + }, + { + "epoch": 0.35, + "grad_norm": 1.4804791823585846, + "learning_rate": 1.514667782284045e-05, + "loss": 0.6685, + "step": 3422 + }, + { + "epoch": 0.35, + "grad_norm": 1.3690241377962513, + "learning_rate": 1.5143852911528729e-05, + "loss": 0.6693, + "step": 3423 + }, + { + "epoch": 0.35, + "grad_norm": 1.44855052107555, + "learning_rate": 1.5141027441932217e-05, + "loss": 0.7307, + "step": 3424 + }, + { + "epoch": 0.35, + "grad_norm": 1.7367703094119369, + "learning_rate": 1.5138201414357574e-05, + "loss": 0.6859, + "step": 3425 + }, + { + "epoch": 0.35, + "grad_norm": 1.680051274739284, + "learning_rate": 1.513537482911152e-05, + "loss": 0.8612, + "step": 3426 + }, + { + "epoch": 0.35, + "grad_norm": 1.678594781233372, + "learning_rate": 1.5132547686500834e-05, + "loss": 0.7895, + "step": 3427 + }, + { + "epoch": 0.35, + "grad_norm": 1.5156833085619492, + "learning_rate": 1.5129719986832363e-05, + "loss": 0.7615, + "step": 3428 + }, + { + "epoch": 0.35, + "grad_norm": 1.5973240474554833, + "learning_rate": 1.5126891730413005e-05, + "loss": 0.7608, + "step": 3429 + }, + { + "epoch": 0.35, + "grad_norm": 1.5179787385027927, + "learning_rate": 1.5124062917549728e-05, + "loss": 0.7311, + "step": 3430 + }, + { + "epoch": 0.35, + "grad_norm": 1.4531619208786193, + "learning_rate": 1.512123354854955e-05, + "loss": 0.7504, + "step": 3431 + }, + { + "epoch": 0.35, + "grad_norm": 1.5130580888869969, + "learning_rate": 1.511840362371956e-05, + "loss": 0.6863, + "step": 3432 + }, + { + "epoch": 0.35, + "grad_norm": 1.4289391942985183, + "learning_rate": 1.51155731433669e-05, + "loss": 0.7857, + "step": 3433 + }, + { + "epoch": 0.35, + "grad_norm": 1.5482410360919439, + "learning_rate": 1.511274210779877e-05, + "loss": 0.7745, + "step": 3434 + }, + { + "epoch": 0.35, + "grad_norm": 1.6338364710983555, + "learning_rate": 1.5109910517322442e-05, + "loss": 0.7405, + "step": 3435 + }, + { + "epoch": 0.35, + "grad_norm": 1.5065075264891135, + "learning_rate": 1.5107078372245237e-05, + "loss": 0.6536, + "step": 3436 + }, + { + "epoch": 0.35, + "grad_norm": 1.561399673765195, + "learning_rate": 1.5104245672874538e-05, + "loss": 0.7642, + "step": 3437 + }, + { + "epoch": 0.35, + "grad_norm": 1.516983977202264, + "learning_rate": 1.5101412419517796e-05, + "loss": 0.7723, + "step": 3438 + }, + { + "epoch": 0.35, + "grad_norm": 1.5629150934760518, + "learning_rate": 1.509857861248251e-05, + "loss": 0.7311, + "step": 3439 + }, + { + "epoch": 0.35, + "grad_norm": 1.642574367936479, + "learning_rate": 1.509574425207625e-05, + "loss": 0.7182, + "step": 3440 + }, + { + "epoch": 0.35, + "grad_norm": 1.57635743355489, + "learning_rate": 1.509290933860664e-05, + "loss": 0.7569, + "step": 3441 + }, + { + "epoch": 0.35, + "grad_norm": 1.541793303046407, + "learning_rate": 1.5090073872381363e-05, + "loss": 0.7027, + "step": 3442 + }, + { + "epoch": 0.35, + "grad_norm": 1.5103972648127917, + "learning_rate": 1.5087237853708171e-05, + "loss": 0.7425, + "step": 3443 + }, + { + "epoch": 0.35, + "grad_norm": 1.4817589947354464, + "learning_rate": 1.5084401282894864e-05, + "loss": 0.7552, + "step": 3444 + }, + { + "epoch": 0.35, + "grad_norm": 1.542431128929272, + "learning_rate": 1.5081564160249307e-05, + "loss": 0.8015, + "step": 3445 + }, + { + "epoch": 0.35, + "grad_norm": 1.5698521794455202, + "learning_rate": 1.507872648607943e-05, + "loss": 0.6561, + "step": 3446 + }, + { + "epoch": 0.35, + "grad_norm": 1.4648673126928915, + "learning_rate": 1.5075888260693214e-05, + "loss": 0.7357, + "step": 3447 + }, + { + "epoch": 0.35, + "grad_norm": 1.4953230949375165, + "learning_rate": 1.5073049484398707e-05, + "loss": 0.7219, + "step": 3448 + }, + { + "epoch": 0.35, + "grad_norm": 1.497409323484993, + "learning_rate": 1.5070210157504009e-05, + "loss": 0.7111, + "step": 3449 + }, + { + "epoch": 0.35, + "grad_norm": 1.4666121459888177, + "learning_rate": 1.5067370280317289e-05, + "loss": 0.7245, + "step": 3450 + }, + { + "epoch": 0.35, + "grad_norm": 1.7367927555042613, + "learning_rate": 1.506452985314677e-05, + "loss": 0.7632, + "step": 3451 + }, + { + "epoch": 0.35, + "grad_norm": 1.4721834836468148, + "learning_rate": 1.5061688876300738e-05, + "loss": 0.7348, + "step": 3452 + }, + { + "epoch": 0.35, + "grad_norm": 1.584192423035963, + "learning_rate": 1.505884735008753e-05, + "loss": 0.739, + "step": 3453 + }, + { + "epoch": 0.35, + "grad_norm": 1.5366616006137925, + "learning_rate": 1.5056005274815557e-05, + "loss": 0.7103, + "step": 3454 + }, + { + "epoch": 0.35, + "grad_norm": 1.5065621493020562, + "learning_rate": 1.5053162650793277e-05, + "loss": 0.8539, + "step": 3455 + }, + { + "epoch": 0.35, + "grad_norm": 1.5153782193173004, + "learning_rate": 1.5050319478329213e-05, + "loss": 0.7514, + "step": 3456 + }, + { + "epoch": 0.35, + "grad_norm": 1.37023829246634, + "learning_rate": 1.504747575773195e-05, + "loss": 0.7087, + "step": 3457 + }, + { + "epoch": 0.35, + "grad_norm": 1.5598410742579687, + "learning_rate": 1.5044631489310127e-05, + "loss": 0.6599, + "step": 3458 + }, + { + "epoch": 0.35, + "grad_norm": 1.4924135931050373, + "learning_rate": 1.5041786673372445e-05, + "loss": 0.7987, + "step": 3459 + }, + { + "epoch": 0.35, + "grad_norm": 1.4892305193365434, + "learning_rate": 1.5038941310227663e-05, + "loss": 0.7199, + "step": 3460 + }, + { + "epoch": 0.35, + "grad_norm": 1.4600408002651917, + "learning_rate": 1.5036095400184603e-05, + "loss": 0.7402, + "step": 3461 + }, + { + "epoch": 0.35, + "grad_norm": 1.489791988488241, + "learning_rate": 1.5033248943552141e-05, + "loss": 0.6495, + "step": 3462 + }, + { + "epoch": 0.35, + "grad_norm": 1.5042633182167775, + "learning_rate": 1.5030401940639221e-05, + "loss": 0.7515, + "step": 3463 + }, + { + "epoch": 0.35, + "grad_norm": 1.608923764011562, + "learning_rate": 1.5027554391754838e-05, + "loss": 0.7703, + "step": 3464 + }, + { + "epoch": 0.35, + "grad_norm": 1.4888772998343753, + "learning_rate": 1.5024706297208042e-05, + "loss": 0.6357, + "step": 3465 + }, + { + "epoch": 0.35, + "grad_norm": 1.3819111163671756, + "learning_rate": 1.5021857657307957e-05, + "loss": 0.6705, + "step": 3466 + }, + { + "epoch": 0.35, + "grad_norm": 1.5237889483123492, + "learning_rate": 1.5019008472363759e-05, + "loss": 0.7462, + "step": 3467 + }, + { + "epoch": 0.35, + "grad_norm": 1.5342490853257287, + "learning_rate": 1.5016158742684677e-05, + "loss": 0.6554, + "step": 3468 + }, + { + "epoch": 0.35, + "grad_norm": 1.7384068499426892, + "learning_rate": 1.501330846858001e-05, + "loss": 0.8208, + "step": 3469 + }, + { + "epoch": 0.35, + "grad_norm": 1.5652607294497758, + "learning_rate": 1.5010457650359107e-05, + "loss": 0.7814, + "step": 3470 + }, + { + "epoch": 0.35, + "grad_norm": 1.6478352433592374, + "learning_rate": 1.5007606288331382e-05, + "loss": 0.7972, + "step": 3471 + }, + { + "epoch": 0.35, + "grad_norm": 1.634555272718592, + "learning_rate": 1.50047543828063e-05, + "loss": 0.6908, + "step": 3472 + }, + { + "epoch": 0.35, + "grad_norm": 1.5720221466031237, + "learning_rate": 1.5001901934093401e-05, + "loss": 0.7597, + "step": 3473 + }, + { + "epoch": 0.35, + "grad_norm": 2.5344693193411723, + "learning_rate": 1.4999048942502266e-05, + "loss": 0.6344, + "step": 3474 + }, + { + "epoch": 0.35, + "grad_norm": 1.4777960019305205, + "learning_rate": 1.4996195408342544e-05, + "loss": 0.693, + "step": 3475 + }, + { + "epoch": 0.35, + "grad_norm": 1.7370996677258854, + "learning_rate": 1.4993341331923941e-05, + "loss": 0.7798, + "step": 3476 + }, + { + "epoch": 0.35, + "grad_norm": 1.5785841948824484, + "learning_rate": 1.4990486713556225e-05, + "loss": 0.7868, + "step": 3477 + }, + { + "epoch": 0.35, + "grad_norm": 1.6036574659873122, + "learning_rate": 1.498763155354922e-05, + "loss": 0.7637, + "step": 3478 + }, + { + "epoch": 0.35, + "grad_norm": 1.7302510182696837, + "learning_rate": 1.4984775852212807e-05, + "loss": 0.7649, + "step": 3479 + }, + { + "epoch": 0.35, + "grad_norm": 1.664703370876807, + "learning_rate": 1.4981919609856927e-05, + "loss": 0.7293, + "step": 3480 + }, + { + "epoch": 0.35, + "grad_norm": 1.649517852767688, + "learning_rate": 1.4979062826791584e-05, + "loss": 0.7203, + "step": 3481 + }, + { + "epoch": 0.35, + "grad_norm": 1.4861988697765522, + "learning_rate": 1.4976205503326833e-05, + "loss": 0.7522, + "step": 3482 + }, + { + "epoch": 0.35, + "grad_norm": 1.536728007961462, + "learning_rate": 1.4973347639772796e-05, + "loss": 0.6858, + "step": 3483 + }, + { + "epoch": 0.35, + "grad_norm": 1.3665924982565034, + "learning_rate": 1.4970489236439645e-05, + "loss": 0.7136, + "step": 3484 + }, + { + "epoch": 0.35, + "grad_norm": 1.7127713453643445, + "learning_rate": 1.4967630293637618e-05, + "loss": 0.7941, + "step": 3485 + }, + { + "epoch": 0.35, + "grad_norm": 1.523515736800205, + "learning_rate": 1.4964770811677007e-05, + "loss": 0.7964, + "step": 3486 + }, + { + "epoch": 0.35, + "grad_norm": 1.906325251267563, + "learning_rate": 1.4961910790868167e-05, + "loss": 0.7163, + "step": 3487 + }, + { + "epoch": 0.35, + "grad_norm": 1.4949145245196809, + "learning_rate": 1.4959050231521503e-05, + "loss": 0.7391, + "step": 3488 + }, + { + "epoch": 0.35, + "grad_norm": 1.4994096945124373, + "learning_rate": 1.4956189133947492e-05, + "loss": 0.664, + "step": 3489 + }, + { + "epoch": 0.35, + "grad_norm": 1.5604623367643393, + "learning_rate": 1.4953327498456652e-05, + "loss": 0.7454, + "step": 3490 + }, + { + "epoch": 0.36, + "grad_norm": 1.4918625748558811, + "learning_rate": 1.4950465325359573e-05, + "loss": 0.8073, + "step": 3491 + }, + { + "epoch": 0.36, + "grad_norm": 1.4786728621419427, + "learning_rate": 1.4947602614966904e-05, + "loss": 0.6621, + "step": 3492 + }, + { + "epoch": 0.36, + "grad_norm": 1.5629535108757933, + "learning_rate": 1.494473936758934e-05, + "loss": 0.8043, + "step": 3493 + }, + { + "epoch": 0.36, + "grad_norm": 1.591950029518263, + "learning_rate": 1.4941875583537643e-05, + "loss": 0.705, + "step": 3494 + }, + { + "epoch": 0.36, + "grad_norm": 1.5452538099171806, + "learning_rate": 1.4939011263122635e-05, + "loss": 0.6806, + "step": 3495 + }, + { + "epoch": 0.36, + "grad_norm": 1.5899349971218648, + "learning_rate": 1.4936146406655194e-05, + "loss": 0.8044, + "step": 3496 + }, + { + "epoch": 0.36, + "grad_norm": 1.5583774506657113, + "learning_rate": 1.4933281014446253e-05, + "loss": 0.7003, + "step": 3497 + }, + { + "epoch": 0.36, + "grad_norm": 1.6291525500407988, + "learning_rate": 1.4930415086806806e-05, + "loss": 0.7825, + "step": 3498 + }, + { + "epoch": 0.36, + "grad_norm": 1.845175889444008, + "learning_rate": 1.49275486240479e-05, + "loss": 0.8185, + "step": 3499 + }, + { + "epoch": 0.36, + "grad_norm": 1.5155979978094534, + "learning_rate": 1.4924681626480654e-05, + "loss": 0.7676, + "step": 3500 + }, + { + "epoch": 0.36, + "grad_norm": 1.5870897903048031, + "learning_rate": 1.492181409441623e-05, + "loss": 0.7441, + "step": 3501 + }, + { + "epoch": 0.36, + "grad_norm": 1.629353091866416, + "learning_rate": 1.4918946028165854e-05, + "loss": 0.7711, + "step": 3502 + }, + { + "epoch": 0.36, + "grad_norm": 1.418929425121082, + "learning_rate": 1.4916077428040812e-05, + "loss": 0.6599, + "step": 3503 + }, + { + "epoch": 0.36, + "grad_norm": 1.78318020605609, + "learning_rate": 1.491320829435244e-05, + "loss": 0.7912, + "step": 3504 + }, + { + "epoch": 0.36, + "grad_norm": 1.582679788025773, + "learning_rate": 1.4910338627412145e-05, + "loss": 0.7817, + "step": 3505 + }, + { + "epoch": 0.36, + "grad_norm": 1.6395184005956496, + "learning_rate": 1.4907468427531378e-05, + "loss": 0.7486, + "step": 3506 + }, + { + "epoch": 0.36, + "grad_norm": 1.595727446077858, + "learning_rate": 1.4904597695021661e-05, + "loss": 0.7169, + "step": 3507 + }, + { + "epoch": 0.36, + "grad_norm": 1.7410899348133368, + "learning_rate": 1.4901726430194559e-05, + "loss": 0.697, + "step": 3508 + }, + { + "epoch": 0.36, + "grad_norm": 1.6369873661667074, + "learning_rate": 1.489885463336171e-05, + "loss": 0.7974, + "step": 3509 + }, + { + "epoch": 0.36, + "grad_norm": 1.6216919985187985, + "learning_rate": 1.4895982304834801e-05, + "loss": 0.8268, + "step": 3510 + }, + { + "epoch": 0.36, + "grad_norm": 1.5314031214487973, + "learning_rate": 1.4893109444925578e-05, + "loss": 0.6685, + "step": 3511 + }, + { + "epoch": 0.36, + "grad_norm": 1.6248727782910444, + "learning_rate": 1.4890236053945843e-05, + "loss": 0.7778, + "step": 3512 + }, + { + "epoch": 0.36, + "grad_norm": 1.4707089643167897, + "learning_rate": 1.4887362132207459e-05, + "loss": 0.7438, + "step": 3513 + }, + { + "epoch": 0.36, + "grad_norm": 1.7110333572039853, + "learning_rate": 1.4884487680022345e-05, + "loss": 0.7783, + "step": 3514 + }, + { + "epoch": 0.36, + "grad_norm": 1.5233667532722868, + "learning_rate": 1.4881612697702478e-05, + "loss": 0.8079, + "step": 3515 + }, + { + "epoch": 0.36, + "grad_norm": 1.6387517929079978, + "learning_rate": 1.4878737185559892e-05, + "loss": 0.7378, + "step": 3516 + }, + { + "epoch": 0.36, + "grad_norm": 1.7368485789115762, + "learning_rate": 1.4875861143906682e-05, + "loss": 0.7348, + "step": 3517 + }, + { + "epoch": 0.36, + "grad_norm": 1.6443544631394493, + "learning_rate": 1.4872984573054993e-05, + "loss": 0.8044, + "step": 3518 + }, + { + "epoch": 0.36, + "grad_norm": 1.6264571228684506, + "learning_rate": 1.4870107473317035e-05, + "loss": 0.6913, + "step": 3519 + }, + { + "epoch": 0.36, + "grad_norm": 1.4394243588179805, + "learning_rate": 1.4867229845005069e-05, + "loss": 0.7788, + "step": 3520 + }, + { + "epoch": 0.36, + "grad_norm": 1.5243776042754629, + "learning_rate": 1.486435168843142e-05, + "loss": 0.8644, + "step": 3521 + }, + { + "epoch": 0.36, + "grad_norm": 1.39511558840125, + "learning_rate": 1.4861473003908464e-05, + "loss": 0.7615, + "step": 3522 + }, + { + "epoch": 0.36, + "grad_norm": 1.5237359383852438, + "learning_rate": 1.4858593791748636e-05, + "loss": 0.7109, + "step": 3523 + }, + { + "epoch": 0.36, + "grad_norm": 1.4141768796323457, + "learning_rate": 1.4855714052264436e-05, + "loss": 0.6682, + "step": 3524 + }, + { + "epoch": 0.36, + "grad_norm": 1.6013691548953137, + "learning_rate": 1.4852833785768411e-05, + "loss": 0.7491, + "step": 3525 + }, + { + "epoch": 0.36, + "grad_norm": 1.6514226529816114, + "learning_rate": 1.4849952992573164e-05, + "loss": 0.7559, + "step": 3526 + }, + { + "epoch": 0.36, + "grad_norm": 1.6338844754434587, + "learning_rate": 1.4847071672991366e-05, + "loss": 0.8517, + "step": 3527 + }, + { + "epoch": 0.36, + "grad_norm": 1.5079211705287743, + "learning_rate": 1.4844189827335739e-05, + "loss": 0.6482, + "step": 3528 + }, + { + "epoch": 0.36, + "grad_norm": 1.3343098488815823, + "learning_rate": 1.484130745591906e-05, + "loss": 0.6869, + "step": 3529 + }, + { + "epoch": 0.36, + "grad_norm": 1.6681546973582473, + "learning_rate": 1.4838424559054167e-05, + "loss": 0.7828, + "step": 3530 + }, + { + "epoch": 0.36, + "grad_norm": 1.7164511219007077, + "learning_rate": 1.4835541137053952e-05, + "loss": 0.7786, + "step": 3531 + }, + { + "epoch": 0.36, + "grad_norm": 1.5107616051889503, + "learning_rate": 1.4832657190231366e-05, + "loss": 0.7285, + "step": 3532 + }, + { + "epoch": 0.36, + "grad_norm": 1.397482883918753, + "learning_rate": 1.4829772718899417e-05, + "loss": 0.6134, + "step": 3533 + }, + { + "epoch": 0.36, + "grad_norm": 1.71295749159559, + "learning_rate": 1.4826887723371168e-05, + "loss": 0.7616, + "step": 3534 + }, + { + "epoch": 0.36, + "grad_norm": 1.6666440172844008, + "learning_rate": 1.482400220395974e-05, + "loss": 0.7533, + "step": 3535 + }, + { + "epoch": 0.36, + "grad_norm": 1.4688735439647813, + "learning_rate": 1.4821116160978313e-05, + "loss": 0.7034, + "step": 3536 + }, + { + "epoch": 0.36, + "grad_norm": 1.6611288071734818, + "learning_rate": 1.481822959474012e-05, + "loss": 0.7442, + "step": 3537 + }, + { + "epoch": 0.36, + "grad_norm": 1.434539771228801, + "learning_rate": 1.4815342505558454e-05, + "loss": 0.7714, + "step": 3538 + }, + { + "epoch": 0.36, + "grad_norm": 1.838877994694268, + "learning_rate": 1.4812454893746662e-05, + "loss": 0.8659, + "step": 3539 + }, + { + "epoch": 0.36, + "grad_norm": 1.6113005589235647, + "learning_rate": 1.4809566759618148e-05, + "loss": 0.8147, + "step": 3540 + }, + { + "epoch": 0.36, + "grad_norm": 1.5726735797445943, + "learning_rate": 1.4806678103486376e-05, + "loss": 0.7031, + "step": 3541 + }, + { + "epoch": 0.36, + "grad_norm": 1.4865889652910735, + "learning_rate": 1.4803788925664867e-05, + "loss": 0.7913, + "step": 3542 + }, + { + "epoch": 0.36, + "grad_norm": 1.603946966940541, + "learning_rate": 1.480089922646719e-05, + "loss": 0.8, + "step": 3543 + }, + { + "epoch": 0.36, + "grad_norm": 1.5696449987816368, + "learning_rate": 1.4798009006206979e-05, + "loss": 0.892, + "step": 3544 + }, + { + "epoch": 0.36, + "grad_norm": 1.565736236104443, + "learning_rate": 1.4795118265197926e-05, + "loss": 0.7458, + "step": 3545 + }, + { + "epoch": 0.36, + "grad_norm": 1.5380747057397648, + "learning_rate": 1.479222700375377e-05, + "loss": 0.8117, + "step": 3546 + }, + { + "epoch": 0.36, + "grad_norm": 1.5974693929548376, + "learning_rate": 1.4789335222188314e-05, + "loss": 0.7076, + "step": 3547 + }, + { + "epoch": 0.36, + "grad_norm": 1.6337144633545488, + "learning_rate": 1.478644292081542e-05, + "loss": 0.6971, + "step": 3548 + }, + { + "epoch": 0.36, + "grad_norm": 1.4854328456568768, + "learning_rate": 1.4783550099948995e-05, + "loss": 0.6628, + "step": 3549 + }, + { + "epoch": 0.36, + "grad_norm": 1.3421312597167916, + "learning_rate": 1.4780656759903015e-05, + "loss": 0.6936, + "step": 3550 + }, + { + "epoch": 0.36, + "grad_norm": 1.5433572587003273, + "learning_rate": 1.4777762900991506e-05, + "loss": 0.6593, + "step": 3551 + }, + { + "epoch": 0.36, + "grad_norm": 1.524569316576266, + "learning_rate": 1.4774868523528548e-05, + "loss": 0.7604, + "step": 3552 + }, + { + "epoch": 0.36, + "grad_norm": 1.4645164651882978, + "learning_rate": 1.4771973627828283e-05, + "loss": 0.7331, + "step": 3553 + }, + { + "epoch": 0.36, + "grad_norm": 1.6152049411395204, + "learning_rate": 1.4769078214204904e-05, + "loss": 0.6323, + "step": 3554 + }, + { + "epoch": 0.36, + "grad_norm": 1.6252699943112658, + "learning_rate": 1.4766182282972667e-05, + "loss": 0.8333, + "step": 3555 + }, + { + "epoch": 0.36, + "grad_norm": 1.593900253019558, + "learning_rate": 1.4763285834445877e-05, + "loss": 0.7247, + "step": 3556 + }, + { + "epoch": 0.36, + "grad_norm": 1.4427449082602886, + "learning_rate": 1.4760388868938902e-05, + "loss": 0.7594, + "step": 3557 + }, + { + "epoch": 0.36, + "grad_norm": 1.556240541058109, + "learning_rate": 1.4757491386766154e-05, + "loss": 0.8283, + "step": 3558 + }, + { + "epoch": 0.36, + "grad_norm": 1.5031696053700674, + "learning_rate": 1.4754593388242117e-05, + "loss": 0.7985, + "step": 3559 + }, + { + "epoch": 0.36, + "grad_norm": 1.4533310803694022, + "learning_rate": 1.4751694873681324e-05, + "loss": 0.7481, + "step": 3560 + }, + { + "epoch": 0.36, + "grad_norm": 1.5333323453412875, + "learning_rate": 1.4748795843398361e-05, + "loss": 0.6943, + "step": 3561 + }, + { + "epoch": 0.36, + "grad_norm": 1.389091874942304, + "learning_rate": 1.4745896297707867e-05, + "loss": 0.6686, + "step": 3562 + }, + { + "epoch": 0.36, + "grad_norm": 1.5586342886882658, + "learning_rate": 1.4742996236924551e-05, + "loss": 0.7302, + "step": 3563 + }, + { + "epoch": 0.36, + "grad_norm": 1.478618598982181, + "learning_rate": 1.4740095661363165e-05, + "loss": 0.7922, + "step": 3564 + }, + { + "epoch": 0.36, + "grad_norm": 1.626860733600897, + "learning_rate": 1.4737194571338523e-05, + "loss": 0.7162, + "step": 3565 + }, + { + "epoch": 0.36, + "grad_norm": 1.4377768151388717, + "learning_rate": 1.4734292967165491e-05, + "loss": 0.7658, + "step": 3566 + }, + { + "epoch": 0.36, + "grad_norm": 1.4427609709577702, + "learning_rate": 1.473139084915899e-05, + "loss": 0.7086, + "step": 3567 + }, + { + "epoch": 0.36, + "grad_norm": 1.607204509238383, + "learning_rate": 1.4728488217634008e-05, + "loss": 0.7084, + "step": 3568 + }, + { + "epoch": 0.36, + "grad_norm": 1.3988616333901625, + "learning_rate": 1.4725585072905573e-05, + "loss": 0.7179, + "step": 3569 + }, + { + "epoch": 0.36, + "grad_norm": 1.5123617215419016, + "learning_rate": 1.4722681415288775e-05, + "loss": 0.7475, + "step": 3570 + }, + { + "epoch": 0.36, + "grad_norm": 1.497428536624791, + "learning_rate": 1.471977724509877e-05, + "loss": 0.7024, + "step": 3571 + }, + { + "epoch": 0.36, + "grad_norm": 1.6175948288740258, + "learning_rate": 1.471687256265075e-05, + "loss": 0.7575, + "step": 3572 + }, + { + "epoch": 0.36, + "grad_norm": 1.5769398718160916, + "learning_rate": 1.4713967368259981e-05, + "loss": 0.7064, + "step": 3573 + }, + { + "epoch": 0.36, + "grad_norm": 1.5223116988772043, + "learning_rate": 1.4711061662241765e-05, + "loss": 0.8851, + "step": 3574 + }, + { + "epoch": 0.36, + "grad_norm": 1.6202519356094804, + "learning_rate": 1.4708155444911485e-05, + "loss": 0.6783, + "step": 3575 + }, + { + "epoch": 0.36, + "grad_norm": 1.5682800582853023, + "learning_rate": 1.4705248716584556e-05, + "loss": 0.8026, + "step": 3576 + }, + { + "epoch": 0.36, + "grad_norm": 1.4970671852419102, + "learning_rate": 1.4702341477576461e-05, + "loss": 0.7771, + "step": 3577 + }, + { + "epoch": 0.36, + "grad_norm": 1.5979231371091105, + "learning_rate": 1.4699433728202736e-05, + "loss": 0.7805, + "step": 3578 + }, + { + "epoch": 0.36, + "grad_norm": 1.58291483435768, + "learning_rate": 1.469652546877897e-05, + "loss": 0.8282, + "step": 3579 + }, + { + "epoch": 0.36, + "grad_norm": 1.6557268300441905, + "learning_rate": 1.4693616699620808e-05, + "loss": 0.7751, + "step": 3580 + }, + { + "epoch": 0.36, + "grad_norm": 1.5562287512074653, + "learning_rate": 1.4690707421043956e-05, + "loss": 0.6575, + "step": 3581 + }, + { + "epoch": 0.36, + "grad_norm": 1.4175640438046877, + "learning_rate": 1.4687797633364167e-05, + "loss": 0.6543, + "step": 3582 + }, + { + "epoch": 0.36, + "grad_norm": 1.6040932810491193, + "learning_rate": 1.4684887336897255e-05, + "loss": 0.763, + "step": 3583 + }, + { + "epoch": 0.36, + "grad_norm": 1.6058731823369146, + "learning_rate": 1.4681976531959085e-05, + "loss": 0.8224, + "step": 3584 + }, + { + "epoch": 0.36, + "grad_norm": 1.8304388759577574, + "learning_rate": 1.4679065218865577e-05, + "loss": 0.8874, + "step": 3585 + }, + { + "epoch": 0.36, + "grad_norm": 1.4640845041467676, + "learning_rate": 1.4676153397932718e-05, + "loss": 0.6027, + "step": 3586 + }, + { + "epoch": 0.36, + "grad_norm": 1.6107944122500681, + "learning_rate": 1.4673241069476532e-05, + "loss": 0.7631, + "step": 3587 + }, + { + "epoch": 0.36, + "grad_norm": 1.476859364163601, + "learning_rate": 1.4670328233813109e-05, + "loss": 0.7166, + "step": 3588 + }, + { + "epoch": 0.37, + "grad_norm": 1.62445852773536, + "learning_rate": 1.4667414891258592e-05, + "loss": 0.726, + "step": 3589 + }, + { + "epoch": 0.37, + "grad_norm": 1.451038371551322, + "learning_rate": 1.4664501042129179e-05, + "loss": 0.7385, + "step": 3590 + }, + { + "epoch": 0.37, + "grad_norm": 1.567794040163018, + "learning_rate": 1.466158668674112e-05, + "loss": 0.6489, + "step": 3591 + }, + { + "epoch": 0.37, + "grad_norm": 1.3653304786147358, + "learning_rate": 1.4658671825410729e-05, + "loss": 0.7955, + "step": 3592 + }, + { + "epoch": 0.37, + "grad_norm": 1.524032132670125, + "learning_rate": 1.4655756458454364e-05, + "loss": 0.7514, + "step": 3593 + }, + { + "epoch": 0.37, + "grad_norm": 1.4538196894282756, + "learning_rate": 1.4652840586188442e-05, + "loss": 0.7715, + "step": 3594 + }, + { + "epoch": 0.37, + "grad_norm": 1.687136784830419, + "learning_rate": 1.4649924208929436e-05, + "loss": 0.7911, + "step": 3595 + }, + { + "epoch": 0.37, + "grad_norm": 1.4995055304016083, + "learning_rate": 1.4647007326993876e-05, + "loss": 0.8462, + "step": 3596 + }, + { + "epoch": 0.37, + "grad_norm": 1.403670122516953, + "learning_rate": 1.4644089940698341e-05, + "loss": 0.6554, + "step": 3597 + }, + { + "epoch": 0.37, + "grad_norm": 1.4977699227582575, + "learning_rate": 1.4641172050359466e-05, + "loss": 0.7081, + "step": 3598 + }, + { + "epoch": 0.37, + "grad_norm": 1.6177170296718928, + "learning_rate": 1.4638253656293948e-05, + "loss": 0.7322, + "step": 3599 + }, + { + "epoch": 0.37, + "grad_norm": 1.562436002230096, + "learning_rate": 1.463533475881853e-05, + "loss": 0.7545, + "step": 3600 + }, + { + "epoch": 0.37, + "grad_norm": 1.5961450775953814, + "learning_rate": 1.463241535825001e-05, + "loss": 0.7949, + "step": 3601 + }, + { + "epoch": 0.37, + "grad_norm": 1.575564927707248, + "learning_rate": 1.4629495454905249e-05, + "loss": 0.7422, + "step": 3602 + }, + { + "epoch": 0.37, + "grad_norm": 1.604384087743517, + "learning_rate": 1.4626575049101148e-05, + "loss": 0.6647, + "step": 3603 + }, + { + "epoch": 0.37, + "grad_norm": 1.6632558311955634, + "learning_rate": 1.4623654141154682e-05, + "loss": 0.8267, + "step": 3604 + }, + { + "epoch": 0.37, + "grad_norm": 1.5379881536216085, + "learning_rate": 1.4620732731382863e-05, + "loss": 0.701, + "step": 3605 + }, + { + "epoch": 0.37, + "grad_norm": 1.562272895182808, + "learning_rate": 1.4617810820102766e-05, + "loss": 0.7102, + "step": 3606 + }, + { + "epoch": 0.37, + "grad_norm": 1.6436077048898663, + "learning_rate": 1.461488840763152e-05, + "loss": 0.8055, + "step": 3607 + }, + { + "epoch": 0.37, + "grad_norm": 1.4710283252621488, + "learning_rate": 1.4611965494286304e-05, + "loss": 0.7057, + "step": 3608 + }, + { + "epoch": 0.37, + "grad_norm": 1.6459419220368325, + "learning_rate": 1.4609042080384359e-05, + "loss": 0.764, + "step": 3609 + }, + { + "epoch": 0.37, + "grad_norm": 1.618306370841053, + "learning_rate": 1.4606118166242974e-05, + "loss": 0.8026, + "step": 3610 + }, + { + "epoch": 0.37, + "grad_norm": 1.7448311208782217, + "learning_rate": 1.4603193752179494e-05, + "loss": 0.7705, + "step": 3611 + }, + { + "epoch": 0.37, + "grad_norm": 1.538104497557957, + "learning_rate": 1.4600268838511314e-05, + "loss": 0.8236, + "step": 3612 + }, + { + "epoch": 0.37, + "grad_norm": 1.5925145637942848, + "learning_rate": 1.4597343425555894e-05, + "loss": 0.737, + "step": 3613 + }, + { + "epoch": 0.37, + "grad_norm": 1.5889299388290077, + "learning_rate": 1.459441751363074e-05, + "loss": 0.7804, + "step": 3614 + }, + { + "epoch": 0.37, + "grad_norm": 1.6711569124333012, + "learning_rate": 1.4591491103053414e-05, + "loss": 0.8212, + "step": 3615 + }, + { + "epoch": 0.37, + "grad_norm": 1.669951186642647, + "learning_rate": 1.458856419414153e-05, + "loss": 0.8123, + "step": 3616 + }, + { + "epoch": 0.37, + "grad_norm": 1.5160115842064925, + "learning_rate": 1.458563678721276e-05, + "loss": 0.6931, + "step": 3617 + }, + { + "epoch": 0.37, + "grad_norm": 1.577245360536689, + "learning_rate": 1.4582708882584831e-05, + "loss": 0.8215, + "step": 3618 + }, + { + "epoch": 0.37, + "grad_norm": 1.546826739829446, + "learning_rate": 1.4579780480575516e-05, + "loss": 0.6896, + "step": 3619 + }, + { + "epoch": 0.37, + "grad_norm": 1.6012972025578633, + "learning_rate": 1.4576851581502652e-05, + "loss": 0.7108, + "step": 3620 + }, + { + "epoch": 0.37, + "grad_norm": 1.4358221012271974, + "learning_rate": 1.4573922185684118e-05, + "loss": 0.7926, + "step": 3621 + }, + { + "epoch": 0.37, + "grad_norm": 1.5724954684563723, + "learning_rate": 1.4570992293437862e-05, + "loss": 0.6527, + "step": 3622 + }, + { + "epoch": 0.37, + "grad_norm": 1.4796895310716867, + "learning_rate": 1.4568061905081874e-05, + "loss": 0.7353, + "step": 3623 + }, + { + "epoch": 0.37, + "grad_norm": 1.4753205823693467, + "learning_rate": 1.4565131020934203e-05, + "loss": 0.7481, + "step": 3624 + }, + { + "epoch": 0.37, + "grad_norm": 1.4559061357050045, + "learning_rate": 1.456219964131295e-05, + "loss": 0.8166, + "step": 3625 + }, + { + "epoch": 0.37, + "grad_norm": 1.5246544530422408, + "learning_rate": 1.4559267766536272e-05, + "loss": 0.8174, + "step": 3626 + }, + { + "epoch": 0.37, + "grad_norm": 1.3934216510143873, + "learning_rate": 1.4556335396922376e-05, + "loss": 0.8245, + "step": 3627 + }, + { + "epoch": 0.37, + "grad_norm": 1.622348369929527, + "learning_rate": 1.4553402532789525e-05, + "loss": 0.7072, + "step": 3628 + }, + { + "epoch": 0.37, + "grad_norm": 1.6293277403360602, + "learning_rate": 1.4550469174456038e-05, + "loss": 0.8638, + "step": 3629 + }, + { + "epoch": 0.37, + "grad_norm": 1.487813163535852, + "learning_rate": 1.4547535322240283e-05, + "loss": 0.7255, + "step": 3630 + }, + { + "epoch": 0.37, + "grad_norm": 1.5659994707847569, + "learning_rate": 1.4544600976460684e-05, + "loss": 0.8259, + "step": 3631 + }, + { + "epoch": 0.37, + "grad_norm": 1.6386233973682773, + "learning_rate": 1.4541666137435717e-05, + "loss": 0.896, + "step": 3632 + }, + { + "epoch": 0.37, + "grad_norm": 1.4945777648735155, + "learning_rate": 1.4538730805483918e-05, + "loss": 0.6776, + "step": 3633 + }, + { + "epoch": 0.37, + "grad_norm": 1.3869665607423542, + "learning_rate": 1.4535794980923863e-05, + "loss": 0.6188, + "step": 3634 + }, + { + "epoch": 0.37, + "grad_norm": 1.5267935588868653, + "learning_rate": 1.45328586640742e-05, + "loss": 0.7501, + "step": 3635 + }, + { + "epoch": 0.37, + "grad_norm": 1.6141746140610456, + "learning_rate": 1.452992185525361e-05, + "loss": 0.7758, + "step": 3636 + }, + { + "epoch": 0.37, + "grad_norm": 1.7863954147640309, + "learning_rate": 1.4526984554780847e-05, + "loss": 0.8292, + "step": 3637 + }, + { + "epoch": 0.37, + "grad_norm": 1.437857154083615, + "learning_rate": 1.4524046762974705e-05, + "loss": 0.7079, + "step": 3638 + }, + { + "epoch": 0.37, + "grad_norm": 1.4963813486501298, + "learning_rate": 1.4521108480154032e-05, + "loss": 0.8067, + "step": 3639 + }, + { + "epoch": 0.37, + "grad_norm": 1.6212269153066883, + "learning_rate": 1.4518169706637736e-05, + "loss": 0.8064, + "step": 3640 + }, + { + "epoch": 0.37, + "grad_norm": 1.4496354534684153, + "learning_rate": 1.4515230442744774e-05, + "loss": 0.6204, + "step": 3641 + }, + { + "epoch": 0.37, + "grad_norm": 1.6003373215463383, + "learning_rate": 1.4512290688794161e-05, + "loss": 0.8154, + "step": 3642 + }, + { + "epoch": 0.37, + "grad_norm": 1.4260596812680963, + "learning_rate": 1.4509350445104955e-05, + "loss": 0.6393, + "step": 3643 + }, + { + "epoch": 0.37, + "grad_norm": 1.7932749293783161, + "learning_rate": 1.4506409711996278e-05, + "loss": 0.7551, + "step": 3644 + }, + { + "epoch": 0.37, + "grad_norm": 1.5188216802858843, + "learning_rate": 1.45034684897873e-05, + "loss": 0.7528, + "step": 3645 + }, + { + "epoch": 0.37, + "grad_norm": 1.5663984399125837, + "learning_rate": 1.4500526778797244e-05, + "loss": 0.7476, + "step": 3646 + }, + { + "epoch": 0.37, + "grad_norm": 1.6327098199457324, + "learning_rate": 1.4497584579345385e-05, + "loss": 0.7634, + "step": 3647 + }, + { + "epoch": 0.37, + "grad_norm": 1.5901216906836784, + "learning_rate": 1.4494641891751053e-05, + "loss": 0.873, + "step": 3648 + }, + { + "epoch": 0.37, + "grad_norm": 1.3081865288850023, + "learning_rate": 1.4491698716333632e-05, + "loss": 0.7266, + "step": 3649 + }, + { + "epoch": 0.37, + "grad_norm": 1.4269513171508648, + "learning_rate": 1.4488755053412562e-05, + "loss": 0.6676, + "step": 3650 + }, + { + "epoch": 0.37, + "grad_norm": 1.844540740181958, + "learning_rate": 1.4485810903307324e-05, + "loss": 0.8679, + "step": 3651 + }, + { + "epoch": 0.37, + "grad_norm": 1.6034428084923098, + "learning_rate": 1.4482866266337462e-05, + "loss": 0.7452, + "step": 3652 + }, + { + "epoch": 0.37, + "grad_norm": 1.6166846651844484, + "learning_rate": 1.4479921142822573e-05, + "loss": 0.6864, + "step": 3653 + }, + { + "epoch": 0.37, + "grad_norm": 1.464754271434382, + "learning_rate": 1.44769755330823e-05, + "loss": 0.6319, + "step": 3654 + }, + { + "epoch": 0.37, + "grad_norm": 1.5621261510649411, + "learning_rate": 1.4474029437436347e-05, + "loss": 0.7819, + "step": 3655 + }, + { + "epoch": 0.37, + "grad_norm": 1.5159227509879454, + "learning_rate": 1.4471082856204464e-05, + "loss": 0.7048, + "step": 3656 + }, + { + "epoch": 0.37, + "grad_norm": 1.528622911675774, + "learning_rate": 1.4468135789706454e-05, + "loss": 0.7776, + "step": 3657 + }, + { + "epoch": 0.37, + "grad_norm": 1.5907679003343815, + "learning_rate": 1.4465188238262183e-05, + "loss": 0.7024, + "step": 3658 + }, + { + "epoch": 0.37, + "grad_norm": 1.4917979339422152, + "learning_rate": 1.4462240202191553e-05, + "loss": 0.8161, + "step": 3659 + }, + { + "epoch": 0.37, + "grad_norm": 1.418213101349719, + "learning_rate": 1.4459291681814534e-05, + "loss": 0.7786, + "step": 3660 + }, + { + "epoch": 0.37, + "grad_norm": 1.4640771784646323, + "learning_rate": 1.4456342677451135e-05, + "loss": 0.5936, + "step": 3661 + }, + { + "epoch": 0.37, + "grad_norm": 1.4022523187744633, + "learning_rate": 1.4453393189421428e-05, + "loss": 0.7231, + "step": 3662 + }, + { + "epoch": 0.37, + "grad_norm": 2.7894244277851525, + "learning_rate": 1.4450443218045533e-05, + "loss": 0.6724, + "step": 3663 + }, + { + "epoch": 0.37, + "grad_norm": 1.5693836473898657, + "learning_rate": 1.4447492763643628e-05, + "loss": 0.6606, + "step": 3664 + }, + { + "epoch": 0.37, + "grad_norm": 1.4248358569829045, + "learning_rate": 1.4444541826535932e-05, + "loss": 0.7743, + "step": 3665 + }, + { + "epoch": 0.37, + "grad_norm": 1.6878677574918814, + "learning_rate": 1.4441590407042723e-05, + "loss": 0.8351, + "step": 3666 + }, + { + "epoch": 0.37, + "grad_norm": 1.5392881799461169, + "learning_rate": 1.4438638505484335e-05, + "loss": 0.6838, + "step": 3667 + }, + { + "epoch": 0.37, + "grad_norm": 1.5517144493417998, + "learning_rate": 1.4435686122181152e-05, + "loss": 0.7951, + "step": 3668 + }, + { + "epoch": 0.37, + "grad_norm": 1.4617789852879355, + "learning_rate": 1.4432733257453606e-05, + "loss": 0.7206, + "step": 3669 + }, + { + "epoch": 0.37, + "grad_norm": 1.5475756651496593, + "learning_rate": 1.4429779911622185e-05, + "loss": 0.6955, + "step": 3670 + }, + { + "epoch": 0.37, + "grad_norm": 1.586135028957735, + "learning_rate": 1.4426826085007429e-05, + "loss": 0.7057, + "step": 3671 + }, + { + "epoch": 0.37, + "grad_norm": 1.4336651729302952, + "learning_rate": 1.442387177792993e-05, + "loss": 0.683, + "step": 3672 + }, + { + "epoch": 0.37, + "grad_norm": 1.541184065036639, + "learning_rate": 1.442091699071033e-05, + "loss": 0.7663, + "step": 3673 + }, + { + "epoch": 0.37, + "grad_norm": 1.623271185521219, + "learning_rate": 1.4417961723669325e-05, + "loss": 0.7979, + "step": 3674 + }, + { + "epoch": 0.37, + "grad_norm": 1.678579444230614, + "learning_rate": 1.4415005977127666e-05, + "loss": 0.8188, + "step": 3675 + }, + { + "epoch": 0.37, + "grad_norm": 1.5397455104574125, + "learning_rate": 1.4412049751406149e-05, + "loss": 0.7839, + "step": 3676 + }, + { + "epoch": 0.37, + "grad_norm": 1.858473486282819, + "learning_rate": 1.4409093046825628e-05, + "loss": 0.9055, + "step": 3677 + }, + { + "epoch": 0.37, + "grad_norm": 1.45726580316041, + "learning_rate": 1.4406135863707011e-05, + "loss": 0.7296, + "step": 3678 + }, + { + "epoch": 0.37, + "grad_norm": 1.4610251095613365, + "learning_rate": 1.4403178202371246e-05, + "loss": 0.6918, + "step": 3679 + }, + { + "epoch": 0.37, + "grad_norm": 1.4127601977062236, + "learning_rate": 1.4400220063139348e-05, + "loss": 0.7537, + "step": 3680 + }, + { + "epoch": 0.37, + "grad_norm": 1.60772054879461, + "learning_rate": 1.4397261446332374e-05, + "loss": 0.713, + "step": 3681 + }, + { + "epoch": 0.37, + "grad_norm": 1.6570686371270857, + "learning_rate": 1.4394302352271432e-05, + "loss": 0.7406, + "step": 3682 + }, + { + "epoch": 0.37, + "grad_norm": 1.6827887598500695, + "learning_rate": 1.4391342781277694e-05, + "loss": 0.7145, + "step": 3683 + }, + { + "epoch": 0.37, + "grad_norm": 1.4378929980810253, + "learning_rate": 1.4388382733672366e-05, + "loss": 0.7492, + "step": 3684 + }, + { + "epoch": 0.37, + "grad_norm": 1.5246600418398752, + "learning_rate": 1.4385422209776718e-05, + "loss": 0.7944, + "step": 3685 + }, + { + "epoch": 0.37, + "grad_norm": 1.4014568904240385, + "learning_rate": 1.4382461209912073e-05, + "loss": 0.6436, + "step": 3686 + }, + { + "epoch": 0.38, + "grad_norm": 1.4428423629394982, + "learning_rate": 1.4379499734399797e-05, + "loss": 0.6961, + "step": 3687 + }, + { + "epoch": 0.38, + "grad_norm": 1.623447139815587, + "learning_rate": 1.4376537783561312e-05, + "loss": 0.775, + "step": 3688 + }, + { + "epoch": 0.38, + "grad_norm": 1.6461117377577357, + "learning_rate": 1.4373575357718091e-05, + "loss": 0.8546, + "step": 3689 + }, + { + "epoch": 0.38, + "grad_norm": 1.570365184009539, + "learning_rate": 1.4370612457191661e-05, + "loss": 0.7562, + "step": 3690 + }, + { + "epoch": 0.38, + "grad_norm": 1.525833349657673, + "learning_rate": 1.4367649082303598e-05, + "loss": 0.6829, + "step": 3691 + }, + { + "epoch": 0.38, + "grad_norm": 1.651551129448428, + "learning_rate": 1.4364685233375531e-05, + "loss": 0.7618, + "step": 3692 + }, + { + "epoch": 0.38, + "grad_norm": 1.8057838334810443, + "learning_rate": 1.4361720910729133e-05, + "loss": 0.7877, + "step": 3693 + }, + { + "epoch": 0.38, + "grad_norm": 1.6881366741148849, + "learning_rate": 1.4358756114686144e-05, + "loss": 0.7429, + "step": 3694 + }, + { + "epoch": 0.38, + "grad_norm": 1.5659478374429834, + "learning_rate": 1.4355790845568341e-05, + "loss": 0.7668, + "step": 3695 + }, + { + "epoch": 0.38, + "grad_norm": 1.8136241690524513, + "learning_rate": 1.4352825103697559e-05, + "loss": 0.8258, + "step": 3696 + }, + { + "epoch": 0.38, + "grad_norm": 1.547061746400421, + "learning_rate": 1.4349858889395682e-05, + "loss": 0.8155, + "step": 3697 + }, + { + "epoch": 0.38, + "grad_norm": 1.6797520121295821, + "learning_rate": 1.4346892202984645e-05, + "loss": 0.6994, + "step": 3698 + }, + { + "epoch": 0.38, + "grad_norm": 1.4798017009668816, + "learning_rate": 1.434392504478644e-05, + "loss": 0.7518, + "step": 3699 + }, + { + "epoch": 0.38, + "grad_norm": 1.8504089558554946, + "learning_rate": 1.4340957415123102e-05, + "loss": 0.7717, + "step": 3700 + }, + { + "epoch": 0.38, + "grad_norm": 1.4628113248243737, + "learning_rate": 1.4337989314316722e-05, + "loss": 0.7195, + "step": 3701 + }, + { + "epoch": 0.38, + "grad_norm": 1.718890806307345, + "learning_rate": 1.4335020742689439e-05, + "loss": 0.777, + "step": 3702 + }, + { + "epoch": 0.38, + "grad_norm": 1.6301303689478914, + "learning_rate": 1.4332051700563448e-05, + "loss": 0.6558, + "step": 3703 + }, + { + "epoch": 0.38, + "grad_norm": 1.7492602602785114, + "learning_rate": 1.432908218826099e-05, + "loss": 0.7529, + "step": 3704 + }, + { + "epoch": 0.38, + "grad_norm": 1.5101433076297917, + "learning_rate": 1.4326112206104359e-05, + "loss": 0.833, + "step": 3705 + }, + { + "epoch": 0.38, + "grad_norm": 1.741038527985199, + "learning_rate": 1.4323141754415904e-05, + "loss": 0.797, + "step": 3706 + }, + { + "epoch": 0.38, + "grad_norm": 1.3241571181008938, + "learning_rate": 1.4320170833518012e-05, + "loss": 0.632, + "step": 3707 + }, + { + "epoch": 0.38, + "grad_norm": 1.5259498416902493, + "learning_rate": 1.431719944373314e-05, + "loss": 0.7218, + "step": 3708 + }, + { + "epoch": 0.38, + "grad_norm": 1.6374207928378792, + "learning_rate": 1.4314227585383782e-05, + "loss": 0.7591, + "step": 3709 + }, + { + "epoch": 0.38, + "grad_norm": 1.4740806885171056, + "learning_rate": 1.4311255258792487e-05, + "loss": 0.8105, + "step": 3710 + }, + { + "epoch": 0.38, + "grad_norm": 1.3374152615652346, + "learning_rate": 1.430828246428185e-05, + "loss": 0.6014, + "step": 3711 + }, + { + "epoch": 0.38, + "grad_norm": 1.535286610220723, + "learning_rate": 1.4305309202174531e-05, + "loss": 0.6523, + "step": 3712 + }, + { + "epoch": 0.38, + "grad_norm": 1.62827588808955, + "learning_rate": 1.4302335472793223e-05, + "loss": 0.8442, + "step": 3713 + }, + { + "epoch": 0.38, + "grad_norm": 1.6613625625280515, + "learning_rate": 1.4299361276460682e-05, + "loss": 0.7732, + "step": 3714 + }, + { + "epoch": 0.38, + "grad_norm": 1.37174038097834, + "learning_rate": 1.4296386613499707e-05, + "loss": 0.6459, + "step": 3715 + }, + { + "epoch": 0.38, + "grad_norm": 1.5986286169846708, + "learning_rate": 1.429341148423315e-05, + "loss": 0.7608, + "step": 3716 + }, + { + "epoch": 0.38, + "grad_norm": 1.488244744833797, + "learning_rate": 1.4290435888983925e-05, + "loss": 0.7629, + "step": 3717 + }, + { + "epoch": 0.38, + "grad_norm": 1.6315928375643658, + "learning_rate": 1.4287459828074974e-05, + "loss": 0.7264, + "step": 3718 + }, + { + "epoch": 0.38, + "grad_norm": 1.5400859331905221, + "learning_rate": 1.4284483301829308e-05, + "loss": 0.7096, + "step": 3719 + }, + { + "epoch": 0.38, + "grad_norm": 1.3762063201954269, + "learning_rate": 1.4281506310569982e-05, + "loss": 0.6705, + "step": 3720 + }, + { + "epoch": 0.38, + "grad_norm": 1.4196649835515989, + "learning_rate": 1.4278528854620101e-05, + "loss": 0.6865, + "step": 3721 + }, + { + "epoch": 0.38, + "grad_norm": 1.6112878945831668, + "learning_rate": 1.4275550934302822e-05, + "loss": 0.8048, + "step": 3722 + }, + { + "epoch": 0.38, + "grad_norm": 1.42243294129432, + "learning_rate": 1.4272572549941353e-05, + "loss": 0.7125, + "step": 3723 + }, + { + "epoch": 0.38, + "grad_norm": 1.5105908639260996, + "learning_rate": 1.4269593701858946e-05, + "loss": 0.7069, + "step": 3724 + }, + { + "epoch": 0.38, + "grad_norm": 1.5253097055310054, + "learning_rate": 1.4266614390378912e-05, + "loss": 0.7579, + "step": 3725 + }, + { + "epoch": 0.38, + "grad_norm": 1.6273141589271638, + "learning_rate": 1.4263634615824611e-05, + "loss": 0.688, + "step": 3726 + }, + { + "epoch": 0.38, + "grad_norm": 1.536711430900752, + "learning_rate": 1.4260654378519445e-05, + "loss": 0.7457, + "step": 3727 + }, + { + "epoch": 0.38, + "grad_norm": 1.6207216793297654, + "learning_rate": 1.4257673678786878e-05, + "loss": 0.7035, + "step": 3728 + }, + { + "epoch": 0.38, + "grad_norm": 1.5322103079932716, + "learning_rate": 1.4254692516950415e-05, + "loss": 0.6968, + "step": 3729 + }, + { + "epoch": 0.38, + "grad_norm": 1.606998978040395, + "learning_rate": 1.4251710893333615e-05, + "loss": 0.7688, + "step": 3730 + }, + { + "epoch": 0.38, + "grad_norm": 1.6404842672847377, + "learning_rate": 1.4248728808260086e-05, + "loss": 0.7266, + "step": 3731 + }, + { + "epoch": 0.38, + "grad_norm": 1.5499251500162907, + "learning_rate": 1.4245746262053489e-05, + "loss": 0.815, + "step": 3732 + }, + { + "epoch": 0.38, + "grad_norm": 1.4590827206827173, + "learning_rate": 1.4242763255037528e-05, + "loss": 0.7333, + "step": 3733 + }, + { + "epoch": 0.38, + "grad_norm": 1.520366668123699, + "learning_rate": 1.4239779787535969e-05, + "loss": 0.7919, + "step": 3734 + }, + { + "epoch": 0.38, + "grad_norm": 1.599862226499546, + "learning_rate": 1.4236795859872613e-05, + "loss": 0.7795, + "step": 3735 + }, + { + "epoch": 0.38, + "grad_norm": 1.5905915358036564, + "learning_rate": 1.4233811472371326e-05, + "loss": 0.6292, + "step": 3736 + }, + { + "epoch": 0.38, + "grad_norm": 1.6291003406413689, + "learning_rate": 1.4230826625356011e-05, + "loss": 0.7277, + "step": 3737 + }, + { + "epoch": 0.38, + "grad_norm": 1.4272235179354023, + "learning_rate": 1.422784131915063e-05, + "loss": 0.6668, + "step": 3738 + }, + { + "epoch": 0.38, + "grad_norm": 1.832901248039077, + "learning_rate": 1.422485555407919e-05, + "loss": 0.8227, + "step": 3739 + }, + { + "epoch": 0.38, + "grad_norm": 1.7708106990296313, + "learning_rate": 1.4221869330465747e-05, + "loss": 0.7496, + "step": 3740 + }, + { + "epoch": 0.38, + "grad_norm": 1.6247255897769424, + "learning_rate": 1.4218882648634413e-05, + "loss": 0.7707, + "step": 3741 + }, + { + "epoch": 0.38, + "grad_norm": 1.5910916475377026, + "learning_rate": 1.4215895508909343e-05, + "loss": 0.7974, + "step": 3742 + }, + { + "epoch": 0.38, + "grad_norm": 1.4224597783707875, + "learning_rate": 1.4212907911614744e-05, + "loss": 0.6212, + "step": 3743 + }, + { + "epoch": 0.38, + "grad_norm": 2.7028252775857555, + "learning_rate": 1.4209919857074875e-05, + "loss": 0.7943, + "step": 3744 + }, + { + "epoch": 0.38, + "grad_norm": 1.6161012343232768, + "learning_rate": 1.420693134561404e-05, + "loss": 0.7725, + "step": 3745 + }, + { + "epoch": 0.38, + "grad_norm": 1.678927989956609, + "learning_rate": 1.4203942377556596e-05, + "loss": 0.846, + "step": 3746 + }, + { + "epoch": 0.38, + "grad_norm": 1.3767408598905295, + "learning_rate": 1.4200952953226949e-05, + "loss": 0.7024, + "step": 3747 + }, + { + "epoch": 0.38, + "grad_norm": 1.545315952518722, + "learning_rate": 1.4197963072949556e-05, + "loss": 0.7767, + "step": 3748 + }, + { + "epoch": 0.38, + "grad_norm": 1.6255543421501542, + "learning_rate": 1.419497273704892e-05, + "loss": 0.8155, + "step": 3749 + }, + { + "epoch": 0.38, + "grad_norm": 1.2918856773106884, + "learning_rate": 1.4191981945849595e-05, + "loss": 0.6786, + "step": 3750 + }, + { + "epoch": 0.38, + "grad_norm": 1.5229528852163654, + "learning_rate": 1.4188990699676186e-05, + "loss": 0.6854, + "step": 3751 + }, + { + "epoch": 0.38, + "grad_norm": 1.62475224522315, + "learning_rate": 1.4185998998853343e-05, + "loss": 0.7444, + "step": 3752 + }, + { + "epoch": 0.38, + "grad_norm": 1.458221762720607, + "learning_rate": 1.4183006843705774e-05, + "loss": 0.702, + "step": 3753 + }, + { + "epoch": 0.38, + "grad_norm": 1.4584481686773072, + "learning_rate": 1.4180014234558224e-05, + "loss": 0.7233, + "step": 3754 + }, + { + "epoch": 0.38, + "grad_norm": 1.6129813492541507, + "learning_rate": 1.4177021171735499e-05, + "loss": 0.6746, + "step": 3755 + }, + { + "epoch": 0.38, + "grad_norm": 1.576899186873065, + "learning_rate": 1.4174027655562443e-05, + "loss": 0.7028, + "step": 3756 + }, + { + "epoch": 0.38, + "grad_norm": 1.4883587909747937, + "learning_rate": 1.4171033686363962e-05, + "loss": 0.795, + "step": 3757 + }, + { + "epoch": 0.38, + "grad_norm": 1.5501029021198707, + "learning_rate": 1.4168039264465003e-05, + "loss": 0.7463, + "step": 3758 + }, + { + "epoch": 0.38, + "grad_norm": 1.589529021865067, + "learning_rate": 1.4165044390190563e-05, + "loss": 0.7619, + "step": 3759 + }, + { + "epoch": 0.38, + "grad_norm": 1.6718136233085048, + "learning_rate": 1.4162049063865686e-05, + "loss": 0.7743, + "step": 3760 + }, + { + "epoch": 0.38, + "grad_norm": 1.6352469921507213, + "learning_rate": 1.4159053285815472e-05, + "loss": 0.7488, + "step": 3761 + }, + { + "epoch": 0.38, + "grad_norm": 1.728331817314099, + "learning_rate": 1.4156057056365064e-05, + "loss": 0.6152, + "step": 3762 + }, + { + "epoch": 0.38, + "grad_norm": 1.5786144672305662, + "learning_rate": 1.4153060375839656e-05, + "loss": 0.8472, + "step": 3763 + }, + { + "epoch": 0.38, + "grad_norm": 1.5559091282283846, + "learning_rate": 1.4150063244564491e-05, + "loss": 0.7672, + "step": 3764 + }, + { + "epoch": 0.38, + "grad_norm": 1.6212637710342384, + "learning_rate": 1.4147065662864859e-05, + "loss": 0.765, + "step": 3765 + }, + { + "epoch": 0.38, + "grad_norm": 1.5200602689627594, + "learning_rate": 1.4144067631066102e-05, + "loss": 0.7064, + "step": 3766 + }, + { + "epoch": 0.38, + "grad_norm": 1.534721277780881, + "learning_rate": 1.4141069149493612e-05, + "loss": 0.691, + "step": 3767 + }, + { + "epoch": 0.38, + "grad_norm": 1.592590452367276, + "learning_rate": 1.4138070218472825e-05, + "loss": 0.8375, + "step": 3768 + }, + { + "epoch": 0.38, + "grad_norm": 1.4903446030462624, + "learning_rate": 1.4135070838329227e-05, + "loss": 0.649, + "step": 3769 + }, + { + "epoch": 0.38, + "grad_norm": 1.6113849496475525, + "learning_rate": 1.4132071009388353e-05, + "loss": 0.5889, + "step": 3770 + }, + { + "epoch": 0.38, + "grad_norm": 1.5290589043034744, + "learning_rate": 1.4129070731975791e-05, + "loss": 0.7634, + "step": 3771 + }, + { + "epoch": 0.38, + "grad_norm": 1.5352165322416986, + "learning_rate": 1.4126070006417174e-05, + "loss": 0.7416, + "step": 3772 + }, + { + "epoch": 0.38, + "grad_norm": 1.5023754903404172, + "learning_rate": 1.412306883303818e-05, + "loss": 0.689, + "step": 3773 + }, + { + "epoch": 0.38, + "grad_norm": 1.528870029355672, + "learning_rate": 1.4120067212164542e-05, + "loss": 0.6352, + "step": 3774 + }, + { + "epoch": 0.38, + "grad_norm": 1.4173567433584682, + "learning_rate": 1.4117065144122038e-05, + "loss": 0.649, + "step": 3775 + }, + { + "epoch": 0.38, + "grad_norm": 1.5576072638287743, + "learning_rate": 1.4114062629236497e-05, + "loss": 0.7539, + "step": 3776 + }, + { + "epoch": 0.38, + "grad_norm": 1.5399350550920947, + "learning_rate": 1.4111059667833797e-05, + "loss": 0.7383, + "step": 3777 + }, + { + "epoch": 0.38, + "grad_norm": 1.574872734967755, + "learning_rate": 1.4108056260239858e-05, + "loss": 0.7656, + "step": 3778 + }, + { + "epoch": 0.38, + "grad_norm": 1.5001125430872018, + "learning_rate": 1.4105052406780653e-05, + "loss": 0.6585, + "step": 3779 + }, + { + "epoch": 0.38, + "grad_norm": 1.7144549791102108, + "learning_rate": 1.4102048107782206e-05, + "loss": 0.7931, + "step": 3780 + }, + { + "epoch": 0.38, + "grad_norm": 1.6329338489197691, + "learning_rate": 1.4099043363570588e-05, + "loss": 0.8194, + "step": 3781 + }, + { + "epoch": 0.38, + "grad_norm": 1.524154261575747, + "learning_rate": 1.4096038174471913e-05, + "loss": 0.7184, + "step": 3782 + }, + { + "epoch": 0.38, + "grad_norm": 1.5219843735215859, + "learning_rate": 1.4093032540812347e-05, + "loss": 0.8048, + "step": 3783 + }, + { + "epoch": 0.38, + "grad_norm": 1.4340792225579542, + "learning_rate": 1.4090026462918109e-05, + "loss": 0.7044, + "step": 3784 + }, + { + "epoch": 0.39, + "grad_norm": 1.5796502903346024, + "learning_rate": 1.408701994111546e-05, + "loss": 0.7784, + "step": 3785 + }, + { + "epoch": 0.39, + "grad_norm": 1.665890566670337, + "learning_rate": 1.408401297573071e-05, + "loss": 0.7158, + "step": 3786 + }, + { + "epoch": 0.39, + "grad_norm": 1.4960904548272018, + "learning_rate": 1.4081005567090217e-05, + "loss": 0.6829, + "step": 3787 + }, + { + "epoch": 0.39, + "grad_norm": 1.4380137590155226, + "learning_rate": 1.4077997715520389e-05, + "loss": 0.6871, + "step": 3788 + }, + { + "epoch": 0.39, + "grad_norm": 1.6775201967511228, + "learning_rate": 1.4074989421347683e-05, + "loss": 0.7099, + "step": 3789 + }, + { + "epoch": 0.39, + "grad_norm": 1.4047990814731268, + "learning_rate": 1.4071980684898599e-05, + "loss": 0.6937, + "step": 3790 + }, + { + "epoch": 0.39, + "grad_norm": 1.5564919382081719, + "learning_rate": 1.4068971506499693e-05, + "loss": 0.6568, + "step": 3791 + }, + { + "epoch": 0.39, + "grad_norm": 1.3846321477736974, + "learning_rate": 1.4065961886477561e-05, + "loss": 0.6756, + "step": 3792 + }, + { + "epoch": 0.39, + "grad_norm": 1.795205437629554, + "learning_rate": 1.4062951825158848e-05, + "loss": 0.8043, + "step": 3793 + }, + { + "epoch": 0.39, + "grad_norm": 1.496177531214818, + "learning_rate": 1.4059941322870255e-05, + "loss": 0.665, + "step": 3794 + }, + { + "epoch": 0.39, + "grad_norm": 1.6592131448302958, + "learning_rate": 1.405693037993852e-05, + "loss": 0.7308, + "step": 3795 + }, + { + "epoch": 0.39, + "grad_norm": 1.587738465551338, + "learning_rate": 1.4053918996690437e-05, + "loss": 0.8211, + "step": 3796 + }, + { + "epoch": 0.39, + "grad_norm": 1.3009250251491375, + "learning_rate": 1.4050907173452841e-05, + "loss": 0.6029, + "step": 3797 + }, + { + "epoch": 0.39, + "grad_norm": 1.596963960134962, + "learning_rate": 1.4047894910552624e-05, + "loss": 0.6934, + "step": 3798 + }, + { + "epoch": 0.39, + "grad_norm": 1.3562784172634255, + "learning_rate": 1.4044882208316714e-05, + "loss": 0.7474, + "step": 3799 + }, + { + "epoch": 0.39, + "grad_norm": 1.6531533546302477, + "learning_rate": 1.4041869067072096e-05, + "loss": 0.7492, + "step": 3800 + }, + { + "epoch": 0.39, + "grad_norm": 1.6285548654829045, + "learning_rate": 1.40388554871458e-05, + "loss": 0.8076, + "step": 3801 + }, + { + "epoch": 0.39, + "grad_norm": 1.583551870150882, + "learning_rate": 1.4035841468864897e-05, + "loss": 0.7962, + "step": 3802 + }, + { + "epoch": 0.39, + "grad_norm": 1.47241936234266, + "learning_rate": 1.403282701255652e-05, + "loss": 0.7323, + "step": 3803 + }, + { + "epoch": 0.39, + "grad_norm": 1.516670769299008, + "learning_rate": 1.4029812118547836e-05, + "loss": 0.7883, + "step": 3804 + }, + { + "epoch": 0.39, + "grad_norm": 1.5406833413888608, + "learning_rate": 1.4026796787166066e-05, + "loss": 0.6896, + "step": 3805 + }, + { + "epoch": 0.39, + "grad_norm": 1.521719402197105, + "learning_rate": 1.4023781018738474e-05, + "loss": 0.6659, + "step": 3806 + }, + { + "epoch": 0.39, + "grad_norm": 1.4315441440696073, + "learning_rate": 1.402076481359238e-05, + "loss": 0.6849, + "step": 3807 + }, + { + "epoch": 0.39, + "grad_norm": 1.3956933687096298, + "learning_rate": 1.4017748172055146e-05, + "loss": 0.7567, + "step": 3808 + }, + { + "epoch": 0.39, + "grad_norm": 1.589580251407946, + "learning_rate": 1.4014731094454175e-05, + "loss": 0.7499, + "step": 3809 + }, + { + "epoch": 0.39, + "grad_norm": 1.4449082249852578, + "learning_rate": 1.4011713581116929e-05, + "loss": 0.7198, + "step": 3810 + }, + { + "epoch": 0.39, + "grad_norm": 1.7331598584411358, + "learning_rate": 1.4008695632370905e-05, + "loss": 0.771, + "step": 3811 + }, + { + "epoch": 0.39, + "grad_norm": 1.5165282302359995, + "learning_rate": 1.4005677248543664e-05, + "loss": 0.6751, + "step": 3812 + }, + { + "epoch": 0.39, + "grad_norm": 1.4090930328374909, + "learning_rate": 1.4002658429962797e-05, + "loss": 0.7594, + "step": 3813 + }, + { + "epoch": 0.39, + "grad_norm": 1.4536552389420134, + "learning_rate": 1.3999639176955954e-05, + "loss": 0.6469, + "step": 3814 + }, + { + "epoch": 0.39, + "grad_norm": 1.6771064874091852, + "learning_rate": 1.3996619489850822e-05, + "loss": 0.7147, + "step": 3815 + }, + { + "epoch": 0.39, + "grad_norm": 1.6093967262983275, + "learning_rate": 1.3993599368975148e-05, + "loss": 0.7282, + "step": 3816 + }, + { + "epoch": 0.39, + "grad_norm": 1.6028915140892204, + "learning_rate": 1.3990578814656716e-05, + "loss": 0.7347, + "step": 3817 + }, + { + "epoch": 0.39, + "grad_norm": 1.5816198883110975, + "learning_rate": 1.398755782722336e-05, + "loss": 0.6194, + "step": 3818 + }, + { + "epoch": 0.39, + "grad_norm": 1.5576591775681445, + "learning_rate": 1.3984536407002958e-05, + "loss": 0.7161, + "step": 3819 + }, + { + "epoch": 0.39, + "grad_norm": 1.528736190571755, + "learning_rate": 1.398151455432344e-05, + "loss": 0.7431, + "step": 3820 + }, + { + "epoch": 0.39, + "grad_norm": 1.5997533389755794, + "learning_rate": 1.397849226951278e-05, + "loss": 0.6505, + "step": 3821 + }, + { + "epoch": 0.39, + "grad_norm": 1.5242466138313542, + "learning_rate": 1.3975469552899003e-05, + "loss": 0.7331, + "step": 3822 + }, + { + "epoch": 0.39, + "grad_norm": 1.42476330834485, + "learning_rate": 1.3972446404810176e-05, + "loss": 0.7084, + "step": 3823 + }, + { + "epoch": 0.39, + "grad_norm": 1.553639315764046, + "learning_rate": 1.396942282557441e-05, + "loss": 0.828, + "step": 3824 + }, + { + "epoch": 0.39, + "grad_norm": 1.709618287320766, + "learning_rate": 1.3966398815519874e-05, + "loss": 0.8233, + "step": 3825 + }, + { + "epoch": 0.39, + "grad_norm": 1.5133739711007284, + "learning_rate": 1.3963374374974774e-05, + "loss": 0.688, + "step": 3826 + }, + { + "epoch": 0.39, + "grad_norm": 1.6350687600513938, + "learning_rate": 1.3960349504267367e-05, + "loss": 0.7473, + "step": 3827 + }, + { + "epoch": 0.39, + "grad_norm": 1.6231874561776136, + "learning_rate": 1.3957324203725952e-05, + "loss": 0.7765, + "step": 3828 + }, + { + "epoch": 0.39, + "grad_norm": 1.4225649096261395, + "learning_rate": 1.395429847367888e-05, + "loss": 0.7304, + "step": 3829 + }, + { + "epoch": 0.39, + "grad_norm": 1.524648245289213, + "learning_rate": 1.3951272314454549e-05, + "loss": 0.7341, + "step": 3830 + }, + { + "epoch": 0.39, + "grad_norm": 1.580200458978781, + "learning_rate": 1.39482457263814e-05, + "loss": 0.8039, + "step": 3831 + }, + { + "epoch": 0.39, + "grad_norm": 1.6674960493863458, + "learning_rate": 1.394521870978792e-05, + "loss": 0.7352, + "step": 3832 + }, + { + "epoch": 0.39, + "grad_norm": 1.5538310665859416, + "learning_rate": 1.3942191265002642e-05, + "loss": 0.6707, + "step": 3833 + }, + { + "epoch": 0.39, + "grad_norm": 1.4450896961970803, + "learning_rate": 1.3939163392354156e-05, + "loss": 0.623, + "step": 3834 + }, + { + "epoch": 0.39, + "grad_norm": 1.7013094745437254, + "learning_rate": 1.3936135092171084e-05, + "loss": 0.7985, + "step": 3835 + }, + { + "epoch": 0.39, + "grad_norm": 1.5946591799629912, + "learning_rate": 1.3933106364782103e-05, + "loss": 0.7847, + "step": 3836 + }, + { + "epoch": 0.39, + "grad_norm": 1.6543035325745643, + "learning_rate": 1.393007721051593e-05, + "loss": 0.839, + "step": 3837 + }, + { + "epoch": 0.39, + "grad_norm": 1.3017152342489924, + "learning_rate": 1.3927047629701336e-05, + "loss": 0.7176, + "step": 3838 + }, + { + "epoch": 0.39, + "grad_norm": 1.4315199274203088, + "learning_rate": 1.3924017622667134e-05, + "loss": 0.7293, + "step": 3839 + }, + { + "epoch": 0.39, + "grad_norm": 1.6881075867618345, + "learning_rate": 1.3920987189742186e-05, + "loss": 0.7541, + "step": 3840 + }, + { + "epoch": 0.39, + "grad_norm": 1.5138340574527112, + "learning_rate": 1.3917956331255393e-05, + "loss": 0.7273, + "step": 3841 + }, + { + "epoch": 0.39, + "grad_norm": 1.6414630893059443, + "learning_rate": 1.3914925047535712e-05, + "loss": 0.7024, + "step": 3842 + }, + { + "epoch": 0.39, + "grad_norm": 1.5294447073641448, + "learning_rate": 1.3911893338912142e-05, + "loss": 0.7604, + "step": 3843 + }, + { + "epoch": 0.39, + "grad_norm": 1.510644801294499, + "learning_rate": 1.390886120571372e-05, + "loss": 0.7265, + "step": 3844 + }, + { + "epoch": 0.39, + "grad_norm": 1.4877613287490439, + "learning_rate": 1.3905828648269544e-05, + "loss": 0.6287, + "step": 3845 + }, + { + "epoch": 0.39, + "grad_norm": 1.5412889660007723, + "learning_rate": 1.390279566690875e-05, + "loss": 0.7313, + "step": 3846 + }, + { + "epoch": 0.39, + "grad_norm": 1.6011530524353756, + "learning_rate": 1.3899762261960519e-05, + "loss": 0.7371, + "step": 3847 + }, + { + "epoch": 0.39, + "grad_norm": 1.530472811667168, + "learning_rate": 1.3896728433754078e-05, + "loss": 0.6527, + "step": 3848 + }, + { + "epoch": 0.39, + "grad_norm": 1.4986886699284894, + "learning_rate": 1.3893694182618705e-05, + "loss": 0.8456, + "step": 3849 + }, + { + "epoch": 0.39, + "grad_norm": 1.5593222557486242, + "learning_rate": 1.3890659508883719e-05, + "loss": 0.7395, + "step": 3850 + }, + { + "epoch": 0.39, + "grad_norm": 1.5585342555525534, + "learning_rate": 1.3887624412878485e-05, + "loss": 0.7369, + "step": 3851 + }, + { + "epoch": 0.39, + "grad_norm": 1.6438767824159255, + "learning_rate": 1.3884588894932418e-05, + "loss": 0.6615, + "step": 3852 + }, + { + "epoch": 0.39, + "grad_norm": 1.5879575751740531, + "learning_rate": 1.3881552955374975e-05, + "loss": 0.7852, + "step": 3853 + }, + { + "epoch": 0.39, + "grad_norm": 1.5550148776799544, + "learning_rate": 1.3878516594535661e-05, + "loss": 0.739, + "step": 3854 + }, + { + "epoch": 0.39, + "grad_norm": 1.5185103880024398, + "learning_rate": 1.3875479812744022e-05, + "loss": 0.803, + "step": 3855 + }, + { + "epoch": 0.39, + "grad_norm": 1.532457978963268, + "learning_rate": 1.3872442610329652e-05, + "loss": 0.7316, + "step": 3856 + }, + { + "epoch": 0.39, + "grad_norm": 1.6130164917900631, + "learning_rate": 1.38694049876222e-05, + "loss": 0.693, + "step": 3857 + }, + { + "epoch": 0.39, + "grad_norm": 1.5063801970280357, + "learning_rate": 1.3866366944951344e-05, + "loss": 0.7135, + "step": 3858 + }, + { + "epoch": 0.39, + "grad_norm": 1.5886342956960429, + "learning_rate": 1.386332848264682e-05, + "loss": 0.8027, + "step": 3859 + }, + { + "epoch": 0.39, + "grad_norm": 1.7263521408368807, + "learning_rate": 1.3860289601038406e-05, + "loss": 0.7198, + "step": 3860 + }, + { + "epoch": 0.39, + "grad_norm": 1.5607732274716484, + "learning_rate": 1.3857250300455922e-05, + "loss": 0.776, + "step": 3861 + }, + { + "epoch": 0.39, + "grad_norm": 1.6904536538120865, + "learning_rate": 1.3854210581229242e-05, + "loss": 0.7018, + "step": 3862 + }, + { + "epoch": 0.39, + "grad_norm": 1.6716662376200253, + "learning_rate": 1.3851170443688274e-05, + "loss": 0.7247, + "step": 3863 + }, + { + "epoch": 0.39, + "grad_norm": 1.4706608496989764, + "learning_rate": 1.384812988816298e-05, + "loss": 0.6567, + "step": 3864 + }, + { + "epoch": 0.39, + "grad_norm": 1.6684571671032502, + "learning_rate": 1.3845088914983365e-05, + "loss": 0.7876, + "step": 3865 + }, + { + "epoch": 0.39, + "grad_norm": 1.5257979153878254, + "learning_rate": 1.3842047524479478e-05, + "loss": 0.833, + "step": 3866 + }, + { + "epoch": 0.39, + "grad_norm": 1.428668645500322, + "learning_rate": 1.3839005716981416e-05, + "loss": 0.7895, + "step": 3867 + }, + { + "epoch": 0.39, + "grad_norm": 1.437318096501984, + "learning_rate": 1.383596349281932e-05, + "loss": 0.7416, + "step": 3868 + }, + { + "epoch": 0.39, + "grad_norm": 1.5649894008799563, + "learning_rate": 1.383292085232337e-05, + "loss": 0.7262, + "step": 3869 + }, + { + "epoch": 0.39, + "grad_norm": 1.6585877060908665, + "learning_rate": 1.3829877795823805e-05, + "loss": 0.679, + "step": 3870 + }, + { + "epoch": 0.39, + "grad_norm": 1.6236608561530104, + "learning_rate": 1.3826834323650899e-05, + "loss": 0.6862, + "step": 3871 + }, + { + "epoch": 0.39, + "grad_norm": 1.4800539140742295, + "learning_rate": 1.3823790436134971e-05, + "loss": 0.7337, + "step": 3872 + }, + { + "epoch": 0.39, + "grad_norm": 1.4511731411899458, + "learning_rate": 1.3820746133606388e-05, + "loss": 0.7061, + "step": 3873 + }, + { + "epoch": 0.39, + "grad_norm": 1.5955067901139126, + "learning_rate": 1.3817701416395562e-05, + "loss": 0.7108, + "step": 3874 + }, + { + "epoch": 0.39, + "grad_norm": 1.5956323928795686, + "learning_rate": 1.381465628483295e-05, + "loss": 0.836, + "step": 3875 + }, + { + "epoch": 0.39, + "grad_norm": 1.5328985880733839, + "learning_rate": 1.3811610739249052e-05, + "loss": 0.7159, + "step": 3876 + }, + { + "epoch": 0.39, + "grad_norm": 1.449031123788358, + "learning_rate": 1.3808564779974418e-05, + "loss": 0.7684, + "step": 3877 + }, + { + "epoch": 0.39, + "grad_norm": 1.413321842781258, + "learning_rate": 1.3805518407339633e-05, + "loss": 0.7434, + "step": 3878 + }, + { + "epoch": 0.39, + "grad_norm": 1.6163216631466955, + "learning_rate": 1.3802471621675337e-05, + "loss": 0.7836, + "step": 3879 + }, + { + "epoch": 0.39, + "grad_norm": 1.5284123049299065, + "learning_rate": 1.3799424423312213e-05, + "loss": 0.8221, + "step": 3880 + }, + { + "epoch": 0.39, + "grad_norm": 1.459195116401371, + "learning_rate": 1.3796376812580983e-05, + "loss": 0.6967, + "step": 3881 + }, + { + "epoch": 0.39, + "grad_norm": 1.6636941098201041, + "learning_rate": 1.379332878981242e-05, + "loss": 0.8194, + "step": 3882 + }, + { + "epoch": 0.39, + "grad_norm": 1.7712865910914184, + "learning_rate": 1.3790280355337332e-05, + "loss": 0.7863, + "step": 3883 + }, + { + "epoch": 0.4, + "grad_norm": 1.340039087969083, + "learning_rate": 1.378723150948659e-05, + "loss": 0.6541, + "step": 3884 + }, + { + "epoch": 0.4, + "grad_norm": 1.6035814459079745, + "learning_rate": 1.3784182252591091e-05, + "loss": 0.8015, + "step": 3885 + }, + { + "epoch": 0.4, + "grad_norm": 1.4591240737646074, + "learning_rate": 1.3781132584981789e-05, + "loss": 0.5822, + "step": 3886 + }, + { + "epoch": 0.4, + "grad_norm": 1.4467307457492842, + "learning_rate": 1.3778082506989673e-05, + "loss": 0.7766, + "step": 3887 + }, + { + "epoch": 0.4, + "grad_norm": 1.495377031381774, + "learning_rate": 1.3775032018945784e-05, + "loss": 0.6177, + "step": 3888 + }, + { + "epoch": 0.4, + "grad_norm": 1.6590640615190357, + "learning_rate": 1.3771981121181207e-05, + "loss": 0.814, + "step": 3889 + }, + { + "epoch": 0.4, + "grad_norm": 1.5529252860548812, + "learning_rate": 1.3768929814027064e-05, + "loss": 0.7258, + "step": 3890 + }, + { + "epoch": 0.4, + "grad_norm": 1.4806719936737924, + "learning_rate": 1.376587809781453e-05, + "loss": 0.7873, + "step": 3891 + }, + { + "epoch": 0.4, + "grad_norm": 1.6248051659767457, + "learning_rate": 1.3762825972874816e-05, + "loss": 0.6624, + "step": 3892 + }, + { + "epoch": 0.4, + "grad_norm": 1.715735022035157, + "learning_rate": 1.375977343953919e-05, + "loss": 0.7183, + "step": 3893 + }, + { + "epoch": 0.4, + "grad_norm": 1.5299224025682734, + "learning_rate": 1.3756720498138956e-05, + "loss": 0.7039, + "step": 3894 + }, + { + "epoch": 0.4, + "grad_norm": 1.4701266266479829, + "learning_rate": 1.3753667149005457e-05, + "loss": 0.794, + "step": 3895 + }, + { + "epoch": 0.4, + "grad_norm": 1.567391393980187, + "learning_rate": 1.375061339247009e-05, + "loss": 0.8436, + "step": 3896 + }, + { + "epoch": 0.4, + "grad_norm": 1.6305836041831285, + "learning_rate": 1.3747559228864294e-05, + "loss": 0.7146, + "step": 3897 + }, + { + "epoch": 0.4, + "grad_norm": 1.5033273499462547, + "learning_rate": 1.3744504658519545e-05, + "loss": 0.707, + "step": 3898 + }, + { + "epoch": 0.4, + "grad_norm": 1.610271975696023, + "learning_rate": 1.3741449681767376e-05, + "loss": 0.809, + "step": 3899 + }, + { + "epoch": 0.4, + "grad_norm": 1.6805097925785062, + "learning_rate": 1.3738394298939354e-05, + "loss": 0.7389, + "step": 3900 + }, + { + "epoch": 0.4, + "grad_norm": 1.5444366150685034, + "learning_rate": 1.3735338510367093e-05, + "loss": 0.7933, + "step": 3901 + }, + { + "epoch": 0.4, + "grad_norm": 1.488665112098971, + "learning_rate": 1.3732282316382249e-05, + "loss": 0.7044, + "step": 3902 + }, + { + "epoch": 0.4, + "grad_norm": 1.5498518567347663, + "learning_rate": 1.3729225717316528e-05, + "loss": 0.767, + "step": 3903 + }, + { + "epoch": 0.4, + "grad_norm": 1.465550090058791, + "learning_rate": 1.3726168713501673e-05, + "loss": 0.7504, + "step": 3904 + }, + { + "epoch": 0.4, + "grad_norm": 2.0252785952349286, + "learning_rate": 1.3723111305269474e-05, + "loss": 0.7166, + "step": 3905 + }, + { + "epoch": 0.4, + "grad_norm": 1.7429264453882414, + "learning_rate": 1.3720053492951766e-05, + "loss": 0.7712, + "step": 3906 + }, + { + "epoch": 0.4, + "grad_norm": 1.5456938227408854, + "learning_rate": 1.3716995276880428e-05, + "loss": 0.6899, + "step": 3907 + }, + { + "epoch": 0.4, + "grad_norm": 1.504687922160811, + "learning_rate": 1.3713936657387379e-05, + "loss": 0.7122, + "step": 3908 + }, + { + "epoch": 0.4, + "grad_norm": 1.5962339911396888, + "learning_rate": 1.3710877634804587e-05, + "loss": 0.709, + "step": 3909 + }, + { + "epoch": 0.4, + "grad_norm": 1.4784206664096584, + "learning_rate": 1.3707818209464057e-05, + "loss": 0.7649, + "step": 3910 + }, + { + "epoch": 0.4, + "grad_norm": 1.498108568389412, + "learning_rate": 1.3704758381697845e-05, + "loss": 0.7926, + "step": 3911 + }, + { + "epoch": 0.4, + "grad_norm": 1.5681712841089852, + "learning_rate": 1.3701698151838048e-05, + "loss": 0.6128, + "step": 3912 + }, + { + "epoch": 0.4, + "grad_norm": 1.5598006504989548, + "learning_rate": 1.3698637520216802e-05, + "loss": 0.7507, + "step": 3913 + }, + { + "epoch": 0.4, + "grad_norm": 1.4329336123610517, + "learning_rate": 1.3695576487166293e-05, + "loss": 0.7848, + "step": 3914 + }, + { + "epoch": 0.4, + "grad_norm": 1.6529935580307296, + "learning_rate": 1.369251505301875e-05, + "loss": 0.7724, + "step": 3915 + }, + { + "epoch": 0.4, + "grad_norm": 1.4943906668863864, + "learning_rate": 1.3689453218106445e-05, + "loss": 0.7686, + "step": 3916 + }, + { + "epoch": 0.4, + "grad_norm": 1.6263525308946214, + "learning_rate": 1.3686390982761689e-05, + "loss": 0.7711, + "step": 3917 + }, + { + "epoch": 0.4, + "grad_norm": 1.590883672321078, + "learning_rate": 1.368332834731684e-05, + "loss": 0.7328, + "step": 3918 + }, + { + "epoch": 0.4, + "grad_norm": 1.5011123483847482, + "learning_rate": 1.3680265312104297e-05, + "loss": 0.6958, + "step": 3919 + }, + { + "epoch": 0.4, + "grad_norm": 1.4704528756404118, + "learning_rate": 1.367720187745651e-05, + "loss": 0.7051, + "step": 3920 + }, + { + "epoch": 0.4, + "grad_norm": 1.6128133730605272, + "learning_rate": 1.3674138043705962e-05, + "loss": 0.9151, + "step": 3921 + }, + { + "epoch": 0.4, + "grad_norm": 1.672605327710779, + "learning_rate": 1.367107381118519e-05, + "loss": 0.7019, + "step": 3922 + }, + { + "epoch": 0.4, + "grad_norm": 1.4984658369655424, + "learning_rate": 1.366800918022676e-05, + "loss": 0.6323, + "step": 3923 + }, + { + "epoch": 0.4, + "grad_norm": 2.185466425690861, + "learning_rate": 1.3664944151163298e-05, + "loss": 0.7865, + "step": 3924 + }, + { + "epoch": 0.4, + "grad_norm": 1.5925705912270587, + "learning_rate": 1.3661878724327462e-05, + "loss": 0.7435, + "step": 3925 + }, + { + "epoch": 0.4, + "grad_norm": 1.7039222421624558, + "learning_rate": 1.3658812900051956e-05, + "loss": 0.6535, + "step": 3926 + }, + { + "epoch": 0.4, + "grad_norm": 1.5512031904450436, + "learning_rate": 1.3655746678669526e-05, + "loss": 0.7673, + "step": 3927 + }, + { + "epoch": 0.4, + "grad_norm": 1.7229401174639263, + "learning_rate": 1.3652680060512965e-05, + "loss": 0.7888, + "step": 3928 + }, + { + "epoch": 0.4, + "grad_norm": 1.5228373082528688, + "learning_rate": 1.3649613045915103e-05, + "loss": 0.7251, + "step": 3929 + }, + { + "epoch": 0.4, + "grad_norm": 1.6710105183424213, + "learning_rate": 1.3646545635208818e-05, + "loss": 0.6803, + "step": 3930 + }, + { + "epoch": 0.4, + "grad_norm": 1.8352157434527323, + "learning_rate": 1.3643477828727032e-05, + "loss": 0.7465, + "step": 3931 + }, + { + "epoch": 0.4, + "grad_norm": 1.566225925123017, + "learning_rate": 1.3640409626802704e-05, + "loss": 0.8061, + "step": 3932 + }, + { + "epoch": 0.4, + "grad_norm": 1.732142431695408, + "learning_rate": 1.363734102976884e-05, + "loss": 0.8112, + "step": 3933 + }, + { + "epoch": 0.4, + "grad_norm": 1.7342905642809994, + "learning_rate": 1.3634272037958492e-05, + "loss": 0.8466, + "step": 3934 + }, + { + "epoch": 0.4, + "grad_norm": 1.705404804091433, + "learning_rate": 1.3631202651704745e-05, + "loss": 0.7414, + "step": 3935 + }, + { + "epoch": 0.4, + "grad_norm": 1.481497381352257, + "learning_rate": 1.3628132871340738e-05, + "loss": 0.7467, + "step": 3936 + }, + { + "epoch": 0.4, + "grad_norm": 1.3577307892769122, + "learning_rate": 1.3625062697199643e-05, + "loss": 0.7035, + "step": 3937 + }, + { + "epoch": 0.4, + "grad_norm": 1.536636098337762, + "learning_rate": 1.3621992129614683e-05, + "loss": 0.6956, + "step": 3938 + }, + { + "epoch": 0.4, + "grad_norm": 1.5967946850115433, + "learning_rate": 1.3618921168919119e-05, + "loss": 0.7888, + "step": 3939 + }, + { + "epoch": 0.4, + "grad_norm": 1.6370373459453844, + "learning_rate": 1.3615849815446255e-05, + "loss": 0.6748, + "step": 3940 + }, + { + "epoch": 0.4, + "grad_norm": 1.4152368310294077, + "learning_rate": 1.3612778069529439e-05, + "loss": 0.7403, + "step": 3941 + }, + { + "epoch": 0.4, + "grad_norm": 1.4061541912594002, + "learning_rate": 1.3609705931502063e-05, + "loss": 0.6308, + "step": 3942 + }, + { + "epoch": 0.4, + "grad_norm": 1.5958246079468728, + "learning_rate": 1.3606633401697557e-05, + "loss": 0.7026, + "step": 3943 + }, + { + "epoch": 0.4, + "grad_norm": 1.5512193775463972, + "learning_rate": 1.3603560480449395e-05, + "loss": 0.7159, + "step": 3944 + }, + { + "epoch": 0.4, + "grad_norm": 1.8355451876396274, + "learning_rate": 1.3600487168091099e-05, + "loss": 0.8106, + "step": 3945 + }, + { + "epoch": 0.4, + "grad_norm": 1.6634340855586849, + "learning_rate": 1.3597413464956222e-05, + "loss": 0.8114, + "step": 3946 + }, + { + "epoch": 0.4, + "grad_norm": 1.5560974007212274, + "learning_rate": 1.3594339371378373e-05, + "loss": 0.5911, + "step": 3947 + }, + { + "epoch": 0.4, + "grad_norm": 1.7429364246328933, + "learning_rate": 1.3591264887691198e-05, + "loss": 0.7712, + "step": 3948 + }, + { + "epoch": 0.4, + "grad_norm": 1.3727463391852266, + "learning_rate": 1.3588190014228376e-05, + "loss": 0.5769, + "step": 3949 + }, + { + "epoch": 0.4, + "grad_norm": 1.5220570359255545, + "learning_rate": 1.358511475132364e-05, + "loss": 0.7909, + "step": 3950 + }, + { + "epoch": 0.4, + "grad_norm": 1.6723979110942109, + "learning_rate": 1.3582039099310767e-05, + "loss": 0.6782, + "step": 3951 + }, + { + "epoch": 0.4, + "grad_norm": 1.430894102045827, + "learning_rate": 1.3578963058523565e-05, + "loss": 0.7322, + "step": 3952 + }, + { + "epoch": 0.4, + "grad_norm": 1.5004514717420763, + "learning_rate": 1.357588662929589e-05, + "loss": 0.6998, + "step": 3953 + }, + { + "epoch": 0.4, + "grad_norm": 1.4512332645002124, + "learning_rate": 1.3572809811961642e-05, + "loss": 0.7361, + "step": 3954 + }, + { + "epoch": 0.4, + "grad_norm": 1.5840456135063126, + "learning_rate": 1.3569732606854764e-05, + "loss": 0.7795, + "step": 3955 + }, + { + "epoch": 0.4, + "grad_norm": 1.5977379919396377, + "learning_rate": 1.3566655014309231e-05, + "loss": 0.7411, + "step": 3956 + }, + { + "epoch": 0.4, + "grad_norm": 1.6264944495338678, + "learning_rate": 1.3563577034659078e-05, + "loss": 0.6683, + "step": 3957 + }, + { + "epoch": 0.4, + "grad_norm": 1.5164155158232762, + "learning_rate": 1.3560498668238364e-05, + "loss": 0.7513, + "step": 3958 + }, + { + "epoch": 0.4, + "grad_norm": 1.640008203893221, + "learning_rate": 1.35574199153812e-05, + "loss": 0.7755, + "step": 3959 + }, + { + "epoch": 0.4, + "grad_norm": 1.6039825251834356, + "learning_rate": 1.3554340776421736e-05, + "loss": 0.6642, + "step": 3960 + }, + { + "epoch": 0.4, + "grad_norm": 1.7130422927765, + "learning_rate": 1.3551261251694162e-05, + "loss": 0.8338, + "step": 3961 + }, + { + "epoch": 0.4, + "grad_norm": 1.4141896215299843, + "learning_rate": 1.3548181341532715e-05, + "loss": 0.7731, + "step": 3962 + }, + { + "epoch": 0.4, + "grad_norm": 1.3793364177003844, + "learning_rate": 1.3545101046271673e-05, + "loss": 0.6984, + "step": 3963 + }, + { + "epoch": 0.4, + "grad_norm": 1.457833073055294, + "learning_rate": 1.354202036624535e-05, + "loss": 0.7104, + "step": 3964 + }, + { + "epoch": 0.4, + "grad_norm": 1.4252227929541943, + "learning_rate": 1.353893930178811e-05, + "loss": 0.7564, + "step": 3965 + }, + { + "epoch": 0.4, + "grad_norm": 1.476133945396026, + "learning_rate": 1.353585785323435e-05, + "loss": 0.7568, + "step": 3966 + }, + { + "epoch": 0.4, + "grad_norm": 1.5951424524109457, + "learning_rate": 1.3532776020918515e-05, + "loss": 0.7453, + "step": 3967 + }, + { + "epoch": 0.4, + "grad_norm": 1.7376181756282787, + "learning_rate": 1.3529693805175087e-05, + "loss": 0.7995, + "step": 3968 + }, + { + "epoch": 0.4, + "grad_norm": 1.6823680223599915, + "learning_rate": 1.3526611206338595e-05, + "loss": 0.8328, + "step": 3969 + }, + { + "epoch": 0.4, + "grad_norm": 1.5942370022385746, + "learning_rate": 1.352352822474361e-05, + "loss": 0.6895, + "step": 3970 + }, + { + "epoch": 0.4, + "grad_norm": 1.734311346453895, + "learning_rate": 1.3520444860724737e-05, + "loss": 0.7768, + "step": 3971 + }, + { + "epoch": 0.4, + "grad_norm": 1.4420902721836801, + "learning_rate": 1.3517361114616628e-05, + "loss": 0.7815, + "step": 3972 + }, + { + "epoch": 0.4, + "grad_norm": 1.6730404476991343, + "learning_rate": 1.351427698675397e-05, + "loss": 0.8037, + "step": 3973 + }, + { + "epoch": 0.4, + "grad_norm": 1.7654437159713978, + "learning_rate": 1.3511192477471509e-05, + "loss": 0.7146, + "step": 3974 + }, + { + "epoch": 0.4, + "grad_norm": 1.645189569094433, + "learning_rate": 1.350810758710401e-05, + "loss": 0.7678, + "step": 3975 + }, + { + "epoch": 0.4, + "grad_norm": 1.5397826998041777, + "learning_rate": 1.3505022315986295e-05, + "loss": 0.6837, + "step": 3976 + }, + { + "epoch": 0.4, + "grad_norm": 1.6013430729945768, + "learning_rate": 1.350193666445322e-05, + "loss": 0.7314, + "step": 3977 + }, + { + "epoch": 0.4, + "grad_norm": 1.53250292796185, + "learning_rate": 1.3498850632839683e-05, + "loss": 0.6441, + "step": 3978 + }, + { + "epoch": 0.4, + "grad_norm": 1.4275075859196327, + "learning_rate": 1.3495764221480625e-05, + "loss": 0.6272, + "step": 3979 + }, + { + "epoch": 0.4, + "grad_norm": 1.4213206179979412, + "learning_rate": 1.349267743071103e-05, + "loss": 0.759, + "step": 3980 + }, + { + "epoch": 0.4, + "grad_norm": 1.5861061657113282, + "learning_rate": 1.3489590260865919e-05, + "loss": 0.7647, + "step": 3981 + }, + { + "epoch": 0.41, + "grad_norm": 1.4740057360364553, + "learning_rate": 1.3486502712280354e-05, + "loss": 0.8151, + "step": 3982 + }, + { + "epoch": 0.41, + "grad_norm": 1.7059409786907052, + "learning_rate": 1.3483414785289447e-05, + "loss": 0.7177, + "step": 3983 + }, + { + "epoch": 0.41, + "grad_norm": 1.5460171824416302, + "learning_rate": 1.3480326480228336e-05, + "loss": 0.7102, + "step": 3984 + }, + { + "epoch": 0.41, + "grad_norm": 1.6288133991658376, + "learning_rate": 1.3477237797432214e-05, + "loss": 0.7615, + "step": 3985 + }, + { + "epoch": 0.41, + "grad_norm": 1.5816596029004617, + "learning_rate": 1.3474148737236305e-05, + "loss": 0.7862, + "step": 3986 + }, + { + "epoch": 0.41, + "grad_norm": 1.472868552210121, + "learning_rate": 1.347105929997588e-05, + "loss": 0.7079, + "step": 3987 + }, + { + "epoch": 0.41, + "grad_norm": 1.5274707854704879, + "learning_rate": 1.346796948598625e-05, + "loss": 0.749, + "step": 3988 + }, + { + "epoch": 0.41, + "grad_norm": 1.6334326446322402, + "learning_rate": 1.3464879295602767e-05, + "loss": 0.8399, + "step": 3989 + }, + { + "epoch": 0.41, + "grad_norm": 1.5472172851604107, + "learning_rate": 1.346178872916082e-05, + "loss": 0.7571, + "step": 3990 + }, + { + "epoch": 0.41, + "grad_norm": 1.4613981141692318, + "learning_rate": 1.345869778699584e-05, + "loss": 0.6745, + "step": 3991 + }, + { + "epoch": 0.41, + "grad_norm": 1.7461178368925054, + "learning_rate": 1.3455606469443308e-05, + "loss": 0.724, + "step": 3992 + }, + { + "epoch": 0.41, + "grad_norm": 1.5461884023147348, + "learning_rate": 1.345251477683873e-05, + "loss": 0.7484, + "step": 3993 + }, + { + "epoch": 0.41, + "grad_norm": 1.5143082004134223, + "learning_rate": 1.3449422709517665e-05, + "loss": 0.6859, + "step": 3994 + }, + { + "epoch": 0.41, + "grad_norm": 1.472662926327967, + "learning_rate": 1.344633026781571e-05, + "loss": 0.7049, + "step": 3995 + }, + { + "epoch": 0.41, + "grad_norm": 1.565687265737648, + "learning_rate": 1.3443237452068496e-05, + "loss": 0.7657, + "step": 3996 + }, + { + "epoch": 0.41, + "grad_norm": 1.4049062205494047, + "learning_rate": 1.3440144262611704e-05, + "loss": 0.677, + "step": 3997 + }, + { + "epoch": 0.41, + "grad_norm": 1.4465927947849393, + "learning_rate": 1.3437050699781052e-05, + "loss": 0.6983, + "step": 3998 + }, + { + "epoch": 0.41, + "grad_norm": 1.4138060959994558, + "learning_rate": 1.3433956763912293e-05, + "loss": 0.6678, + "step": 3999 + }, + { + "epoch": 0.41, + "grad_norm": 1.5221747354708852, + "learning_rate": 1.3430862455341228e-05, + "loss": 0.7398, + "step": 4000 + }, + { + "epoch": 0.41, + "grad_norm": 1.6519117347548133, + "learning_rate": 1.3427767774403697e-05, + "loss": 0.8253, + "step": 4001 + }, + { + "epoch": 0.41, + "grad_norm": 1.421518040175894, + "learning_rate": 1.3424672721435582e-05, + "loss": 0.7104, + "step": 4002 + }, + { + "epoch": 0.41, + "grad_norm": 1.6079084155329708, + "learning_rate": 1.3421577296772795e-05, + "loss": 0.7406, + "step": 4003 + }, + { + "epoch": 0.41, + "grad_norm": 1.5137819432029258, + "learning_rate": 1.34184815007513e-05, + "loss": 0.7776, + "step": 4004 + }, + { + "epoch": 0.41, + "grad_norm": 1.7366776196818696, + "learning_rate": 1.3415385333707096e-05, + "loss": 0.8128, + "step": 4005 + }, + { + "epoch": 0.41, + "grad_norm": 1.5077195369673975, + "learning_rate": 1.3412288795976228e-05, + "loss": 0.705, + "step": 4006 + }, + { + "epoch": 0.41, + "grad_norm": 1.6593668158452635, + "learning_rate": 1.340919188789477e-05, + "loss": 0.7579, + "step": 4007 + }, + { + "epoch": 0.41, + "grad_norm": 1.5030958680366602, + "learning_rate": 1.340609460979885e-05, + "loss": 0.7195, + "step": 4008 + }, + { + "epoch": 0.41, + "grad_norm": 1.5556440737593225, + "learning_rate": 1.3402996962024623e-05, + "loss": 0.7334, + "step": 4009 + }, + { + "epoch": 0.41, + "grad_norm": 1.5110906348055217, + "learning_rate": 1.3399898944908292e-05, + "loss": 0.7625, + "step": 4010 + }, + { + "epoch": 0.41, + "grad_norm": 1.5609678400026048, + "learning_rate": 1.3396800558786102e-05, + "loss": 0.6784, + "step": 4011 + }, + { + "epoch": 0.41, + "grad_norm": 1.687003664548803, + "learning_rate": 1.339370180399433e-05, + "loss": 0.812, + "step": 4012 + }, + { + "epoch": 0.41, + "grad_norm": 1.5049571514814664, + "learning_rate": 1.33906026808693e-05, + "loss": 0.7418, + "step": 4013 + }, + { + "epoch": 0.41, + "grad_norm": 1.639106490446256, + "learning_rate": 1.3387503189747369e-05, + "loss": 0.6906, + "step": 4014 + }, + { + "epoch": 0.41, + "grad_norm": 1.4993464719464538, + "learning_rate": 1.3384403330964944e-05, + "loss": 0.7478, + "step": 4015 + }, + { + "epoch": 0.41, + "grad_norm": 1.471817374722621, + "learning_rate": 1.3381303104858462e-05, + "loss": 0.686, + "step": 4016 + }, + { + "epoch": 0.41, + "grad_norm": 1.588559439772156, + "learning_rate": 1.3378202511764409e-05, + "loss": 0.7877, + "step": 4017 + }, + { + "epoch": 0.41, + "grad_norm": 1.623103714420601, + "learning_rate": 1.3375101552019296e-05, + "loss": 0.7324, + "step": 4018 + }, + { + "epoch": 0.41, + "grad_norm": 1.5576566124316074, + "learning_rate": 1.3372000225959696e-05, + "loss": 0.7979, + "step": 4019 + }, + { + "epoch": 0.41, + "grad_norm": 1.6544913152819087, + "learning_rate": 1.3368898533922202e-05, + "loss": 0.7104, + "step": 4020 + }, + { + "epoch": 0.41, + "grad_norm": 1.7688287844335833, + "learning_rate": 1.3365796476243455e-05, + "loss": 0.731, + "step": 4021 + }, + { + "epoch": 0.41, + "grad_norm": 1.535589463303318, + "learning_rate": 1.3362694053260136e-05, + "loss": 0.781, + "step": 4022 + }, + { + "epoch": 0.41, + "grad_norm": 1.4515536576064387, + "learning_rate": 1.3359591265308965e-05, + "loss": 0.6553, + "step": 4023 + }, + { + "epoch": 0.41, + "grad_norm": 1.6016329573953065, + "learning_rate": 1.33564881127267e-05, + "loss": 0.7897, + "step": 4024 + }, + { + "epoch": 0.41, + "grad_norm": 1.3707604948899, + "learning_rate": 1.3353384595850139e-05, + "loss": 0.7084, + "step": 4025 + }, + { + "epoch": 0.41, + "grad_norm": 1.5991205748896737, + "learning_rate": 1.3350280715016122e-05, + "loss": 0.7636, + "step": 4026 + }, + { + "epoch": 0.41, + "grad_norm": 1.49161456735389, + "learning_rate": 1.3347176470561525e-05, + "loss": 0.7854, + "step": 4027 + }, + { + "epoch": 0.41, + "grad_norm": 1.426486035192875, + "learning_rate": 1.3344071862823267e-05, + "loss": 0.7637, + "step": 4028 + }, + { + "epoch": 0.41, + "grad_norm": 1.5661573435509604, + "learning_rate": 1.3340966892138306e-05, + "loss": 0.6711, + "step": 4029 + }, + { + "epoch": 0.41, + "grad_norm": 1.5588267225607866, + "learning_rate": 1.3337861558843632e-05, + "loss": 0.775, + "step": 4030 + }, + { + "epoch": 0.41, + "grad_norm": 1.6100450258007952, + "learning_rate": 1.3334755863276288e-05, + "loss": 0.7643, + "step": 4031 + }, + { + "epoch": 0.41, + "grad_norm": 1.506099579082188, + "learning_rate": 1.3331649805773339e-05, + "loss": 0.7668, + "step": 4032 + }, + { + "epoch": 0.41, + "grad_norm": 1.6918366583759896, + "learning_rate": 1.3328543386671908e-05, + "loss": 0.6847, + "step": 4033 + }, + { + "epoch": 0.41, + "grad_norm": 1.4958834953303939, + "learning_rate": 1.3325436606309146e-05, + "loss": 0.6859, + "step": 4034 + }, + { + "epoch": 0.41, + "grad_norm": 1.5302344329469577, + "learning_rate": 1.3322329465022244e-05, + "loss": 0.7167, + "step": 4035 + }, + { + "epoch": 0.41, + "grad_norm": 1.4107656193790257, + "learning_rate": 1.3319221963148432e-05, + "loss": 0.7237, + "step": 4036 + }, + { + "epoch": 0.41, + "grad_norm": 1.6684130305180658, + "learning_rate": 1.3316114101024984e-05, + "loss": 0.7189, + "step": 4037 + }, + { + "epoch": 0.41, + "grad_norm": 1.6413025034363493, + "learning_rate": 1.331300587898921e-05, + "loss": 0.7789, + "step": 4038 + }, + { + "epoch": 0.41, + "grad_norm": 1.7096488103304526, + "learning_rate": 1.3309897297378456e-05, + "loss": 0.8057, + "step": 4039 + }, + { + "epoch": 0.41, + "grad_norm": 1.527070743940658, + "learning_rate": 1.3306788356530111e-05, + "loss": 0.7914, + "step": 4040 + }, + { + "epoch": 0.41, + "grad_norm": 1.704771614310355, + "learning_rate": 1.3303679056781603e-05, + "loss": 0.8515, + "step": 4041 + }, + { + "epoch": 0.41, + "grad_norm": 1.6883261564971928, + "learning_rate": 1.3300569398470398e-05, + "loss": 0.8103, + "step": 4042 + }, + { + "epoch": 0.41, + "grad_norm": 1.6306291070874364, + "learning_rate": 1.3297459381933999e-05, + "loss": 0.8424, + "step": 4043 + }, + { + "epoch": 0.41, + "grad_norm": 1.6226850730174835, + "learning_rate": 1.329434900750995e-05, + "loss": 0.6827, + "step": 4044 + }, + { + "epoch": 0.41, + "grad_norm": 1.592943110845101, + "learning_rate": 1.3291238275535831e-05, + "loss": 0.7318, + "step": 4045 + }, + { + "epoch": 0.41, + "grad_norm": 1.6276586615517499, + "learning_rate": 1.3288127186349274e-05, + "loss": 0.8086, + "step": 4046 + }, + { + "epoch": 0.41, + "grad_norm": 1.5809120260098595, + "learning_rate": 1.3285015740287926e-05, + "loss": 0.6867, + "step": 4047 + }, + { + "epoch": 0.41, + "grad_norm": 1.60571809164613, + "learning_rate": 1.3281903937689493e-05, + "loss": 0.6884, + "step": 4048 + }, + { + "epoch": 0.41, + "grad_norm": 1.4376966388706274, + "learning_rate": 1.327879177889171e-05, + "loss": 0.6801, + "step": 4049 + }, + { + "epoch": 0.41, + "grad_norm": 1.7896708274759923, + "learning_rate": 1.3275679264232353e-05, + "loss": 0.822, + "step": 4050 + }, + { + "epoch": 0.41, + "grad_norm": 1.5150771194204846, + "learning_rate": 1.327256639404924e-05, + "loss": 0.7674, + "step": 4051 + }, + { + "epoch": 0.41, + "grad_norm": 1.5066318786705863, + "learning_rate": 1.3269453168680223e-05, + "loss": 0.7316, + "step": 4052 + }, + { + "epoch": 0.41, + "grad_norm": 1.5224427225579769, + "learning_rate": 1.3266339588463191e-05, + "loss": 0.7884, + "step": 4053 + }, + { + "epoch": 0.41, + "grad_norm": 1.4163623206171054, + "learning_rate": 1.3263225653736075e-05, + "loss": 0.8494, + "step": 4054 + }, + { + "epoch": 0.41, + "grad_norm": 1.5620742435587964, + "learning_rate": 1.326011136483685e-05, + "loss": 0.7807, + "step": 4055 + }, + { + "epoch": 0.41, + "grad_norm": 1.6402310828409679, + "learning_rate": 1.3256996722103517e-05, + "loss": 0.8428, + "step": 4056 + }, + { + "epoch": 0.41, + "grad_norm": 1.4431914952890472, + "learning_rate": 1.3253881725874122e-05, + "loss": 0.6633, + "step": 4057 + }, + { + "epoch": 0.41, + "grad_norm": 1.4191342409961611, + "learning_rate": 1.325076637648675e-05, + "loss": 0.7194, + "step": 4058 + }, + { + "epoch": 0.41, + "grad_norm": 1.6401724364523487, + "learning_rate": 1.3247650674279526e-05, + "loss": 0.7146, + "step": 4059 + }, + { + "epoch": 0.41, + "grad_norm": 2.955684699842525, + "learning_rate": 1.3244534619590608e-05, + "loss": 0.7624, + "step": 4060 + }, + { + "epoch": 0.41, + "grad_norm": 1.7062570829972066, + "learning_rate": 1.3241418212758194e-05, + "loss": 0.7219, + "step": 4061 + }, + { + "epoch": 0.41, + "grad_norm": 1.3868359029573687, + "learning_rate": 1.3238301454120527e-05, + "loss": 0.674, + "step": 4062 + }, + { + "epoch": 0.41, + "grad_norm": 1.835307203260098, + "learning_rate": 1.3235184344015877e-05, + "loss": 0.8416, + "step": 4063 + }, + { + "epoch": 0.41, + "grad_norm": 1.9181633969275953, + "learning_rate": 1.3232066882782557e-05, + "loss": 0.6275, + "step": 4064 + }, + { + "epoch": 0.41, + "grad_norm": 1.4805190237294978, + "learning_rate": 1.3228949070758925e-05, + "loss": 0.6706, + "step": 4065 + }, + { + "epoch": 0.41, + "grad_norm": 1.6628355377811277, + "learning_rate": 1.3225830908283363e-05, + "loss": 0.7272, + "step": 4066 + }, + { + "epoch": 0.41, + "grad_norm": 1.525776708814196, + "learning_rate": 1.3222712395694303e-05, + "loss": 0.7371, + "step": 4067 + }, + { + "epoch": 0.41, + "grad_norm": 1.5025346264582065, + "learning_rate": 1.3219593533330211e-05, + "loss": 0.8229, + "step": 4068 + }, + { + "epoch": 0.41, + "grad_norm": 1.7879124391361025, + "learning_rate": 1.3216474321529589e-05, + "loss": 0.7747, + "step": 4069 + }, + { + "epoch": 0.41, + "grad_norm": 1.6357297829478563, + "learning_rate": 1.3213354760630981e-05, + "loss": 0.6958, + "step": 4070 + }, + { + "epoch": 0.41, + "grad_norm": 1.5215139989575535, + "learning_rate": 1.3210234850972966e-05, + "loss": 0.7636, + "step": 4071 + }, + { + "epoch": 0.41, + "grad_norm": 1.3885791150732956, + "learning_rate": 1.3207114592894155e-05, + "loss": 0.6559, + "step": 4072 + }, + { + "epoch": 0.41, + "grad_norm": 1.5713248008518697, + "learning_rate": 1.3203993986733212e-05, + "loss": 0.719, + "step": 4073 + }, + { + "epoch": 0.41, + "grad_norm": 1.5446063043547251, + "learning_rate": 1.3200873032828827e-05, + "loss": 0.7433, + "step": 4074 + }, + { + "epoch": 0.41, + "grad_norm": 1.564066324834634, + "learning_rate": 1.319775173151973e-05, + "loss": 0.6929, + "step": 4075 + }, + { + "epoch": 0.41, + "grad_norm": 1.6681039151887862, + "learning_rate": 1.319463008314469e-05, + "loss": 0.811, + "step": 4076 + }, + { + "epoch": 0.41, + "grad_norm": 1.7337500036471722, + "learning_rate": 1.3191508088042511e-05, + "loss": 0.7997, + "step": 4077 + }, + { + "epoch": 0.41, + "grad_norm": 1.5590561891101518, + "learning_rate": 1.3188385746552039e-05, + "loss": 0.8167, + "step": 4078 + }, + { + "epoch": 0.41, + "grad_norm": 1.6649219606880605, + "learning_rate": 1.3185263059012158e-05, + "loss": 0.7256, + "step": 4079 + }, + { + "epoch": 0.42, + "grad_norm": 1.4916284993073423, + "learning_rate": 1.3182140025761782e-05, + "loss": 0.7404, + "step": 4080 + }, + { + "epoch": 0.42, + "grad_norm": 1.5809091122227412, + "learning_rate": 1.3179016647139866e-05, + "loss": 0.6229, + "step": 4081 + }, + { + "epoch": 0.42, + "grad_norm": 1.577754747739406, + "learning_rate": 1.3175892923485411e-05, + "loss": 0.7157, + "step": 4082 + }, + { + "epoch": 0.42, + "grad_norm": 1.494271837332332, + "learning_rate": 1.3172768855137446e-05, + "loss": 0.866, + "step": 4083 + }, + { + "epoch": 0.42, + "grad_norm": 1.591720798787326, + "learning_rate": 1.3169644442435036e-05, + "loss": 0.7013, + "step": 4084 + }, + { + "epoch": 0.42, + "grad_norm": 1.5553585860498986, + "learning_rate": 1.316651968571729e-05, + "loss": 0.8126, + "step": 4085 + }, + { + "epoch": 0.42, + "grad_norm": 1.4891715960594787, + "learning_rate": 1.316339458532335e-05, + "loss": 0.7398, + "step": 4086 + }, + { + "epoch": 0.42, + "grad_norm": 1.542762990549468, + "learning_rate": 1.31602691415924e-05, + "loss": 0.7326, + "step": 4087 + }, + { + "epoch": 0.42, + "grad_norm": 1.490050148493316, + "learning_rate": 1.3157143354863653e-05, + "loss": 0.7312, + "step": 4088 + }, + { + "epoch": 0.42, + "grad_norm": 1.7648769404474558, + "learning_rate": 1.315401722547637e-05, + "loss": 0.7986, + "step": 4089 + }, + { + "epoch": 0.42, + "grad_norm": 1.719709018178479, + "learning_rate": 1.3150890753769836e-05, + "loss": 0.7715, + "step": 4090 + }, + { + "epoch": 0.42, + "grad_norm": 1.5627320403532918, + "learning_rate": 1.314776394008339e-05, + "loss": 0.7837, + "step": 4091 + }, + { + "epoch": 0.42, + "grad_norm": 1.453463710193572, + "learning_rate": 1.3144636784756391e-05, + "loss": 0.6246, + "step": 4092 + }, + { + "epoch": 0.42, + "grad_norm": 1.650974176196651, + "learning_rate": 1.3141509288128248e-05, + "loss": 0.786, + "step": 4093 + }, + { + "epoch": 0.42, + "grad_norm": 1.5484033252730638, + "learning_rate": 1.3138381450538398e-05, + "loss": 0.7412, + "step": 4094 + }, + { + "epoch": 0.42, + "grad_norm": 1.6037113882082326, + "learning_rate": 1.3135253272326316e-05, + "loss": 0.6865, + "step": 4095 + }, + { + "epoch": 0.42, + "grad_norm": 1.5918578191023562, + "learning_rate": 1.3132124753831525e-05, + "loss": 0.8283, + "step": 4096 + }, + { + "epoch": 0.42, + "grad_norm": 1.4520708558181288, + "learning_rate": 1.3128995895393576e-05, + "loss": 0.7158, + "step": 4097 + }, + { + "epoch": 0.42, + "grad_norm": 1.4518469912628422, + "learning_rate": 1.3125866697352052e-05, + "loss": 0.7374, + "step": 4098 + }, + { + "epoch": 0.42, + "grad_norm": 1.5362404511239431, + "learning_rate": 1.312273716004658e-05, + "loss": 0.735, + "step": 4099 + }, + { + "epoch": 0.42, + "grad_norm": 1.5893081455973872, + "learning_rate": 1.3119607283816823e-05, + "loss": 0.7949, + "step": 4100 + }, + { + "epoch": 0.42, + "grad_norm": 1.5451259765624377, + "learning_rate": 1.3116477069002483e-05, + "loss": 0.7979, + "step": 4101 + }, + { + "epoch": 0.42, + "grad_norm": 1.6486244621849588, + "learning_rate": 1.3113346515943292e-05, + "loss": 0.7615, + "step": 4102 + }, + { + "epoch": 0.42, + "grad_norm": 1.8091766023002607, + "learning_rate": 1.3110215624979026e-05, + "loss": 0.7761, + "step": 4103 + }, + { + "epoch": 0.42, + "grad_norm": 1.6659017005621068, + "learning_rate": 1.3107084396449488e-05, + "loss": 0.768, + "step": 4104 + }, + { + "epoch": 0.42, + "grad_norm": 1.6204191281973657, + "learning_rate": 1.3103952830694531e-05, + "loss": 0.8416, + "step": 4105 + }, + { + "epoch": 0.42, + "grad_norm": 1.5668821827815804, + "learning_rate": 1.3100820928054034e-05, + "loss": 0.7551, + "step": 4106 + }, + { + "epoch": 0.42, + "grad_norm": 1.643785128436163, + "learning_rate": 1.3097688688867917e-05, + "loss": 0.899, + "step": 4107 + }, + { + "epoch": 0.42, + "grad_norm": 1.4816697433407158, + "learning_rate": 1.3094556113476138e-05, + "loss": 0.7447, + "step": 4108 + }, + { + "epoch": 0.42, + "grad_norm": 1.3628423656563957, + "learning_rate": 1.309142320221868e-05, + "loss": 0.7281, + "step": 4109 + }, + { + "epoch": 0.42, + "grad_norm": 1.493882064536198, + "learning_rate": 1.3088289955435583e-05, + "loss": 0.7529, + "step": 4110 + }, + { + "epoch": 0.42, + "grad_norm": 1.5185950409449684, + "learning_rate": 1.3085156373466905e-05, + "loss": 0.6877, + "step": 4111 + }, + { + "epoch": 0.42, + "grad_norm": 1.552483946978183, + "learning_rate": 1.3082022456652748e-05, + "loss": 0.619, + "step": 4112 + }, + { + "epoch": 0.42, + "grad_norm": 1.6203458777816349, + "learning_rate": 1.3078888205333251e-05, + "loss": 0.7527, + "step": 4113 + }, + { + "epoch": 0.42, + "grad_norm": 1.6555482537000537, + "learning_rate": 1.307575361984859e-05, + "loss": 0.8066, + "step": 4114 + }, + { + "epoch": 0.42, + "grad_norm": 1.561415647287691, + "learning_rate": 1.307261870053897e-05, + "loss": 0.7591, + "step": 4115 + }, + { + "epoch": 0.42, + "grad_norm": 1.5575464787991349, + "learning_rate": 1.3069483447744644e-05, + "loss": 0.7045, + "step": 4116 + }, + { + "epoch": 0.42, + "grad_norm": 1.601909026948332, + "learning_rate": 1.306634786180589e-05, + "loss": 0.7526, + "step": 4117 + }, + { + "epoch": 0.42, + "grad_norm": 1.4804890751107598, + "learning_rate": 1.3063211943063028e-05, + "loss": 0.7148, + "step": 4118 + }, + { + "epoch": 0.42, + "grad_norm": 1.4231408741569105, + "learning_rate": 1.3060075691856408e-05, + "loss": 0.7549, + "step": 4119 + }, + { + "epoch": 0.42, + "grad_norm": 1.6377303109941803, + "learning_rate": 1.3056939108526432e-05, + "loss": 0.8093, + "step": 4120 + }, + { + "epoch": 0.42, + "grad_norm": 1.4810282169201698, + "learning_rate": 1.3053802193413518e-05, + "loss": 0.6746, + "step": 4121 + }, + { + "epoch": 0.42, + "grad_norm": 1.586189039013135, + "learning_rate": 1.3050664946858132e-05, + "loss": 0.7905, + "step": 4122 + }, + { + "epoch": 0.42, + "grad_norm": 1.5164378866528998, + "learning_rate": 1.3047527369200774e-05, + "loss": 0.703, + "step": 4123 + }, + { + "epoch": 0.42, + "grad_norm": 1.652431828924487, + "learning_rate": 1.304438946078198e-05, + "loss": 0.7871, + "step": 4124 + }, + { + "epoch": 0.42, + "grad_norm": 1.5698746425610295, + "learning_rate": 1.3041251221942316e-05, + "loss": 0.7083, + "step": 4125 + }, + { + "epoch": 0.42, + "grad_norm": 1.550651917500886, + "learning_rate": 1.3038112653022394e-05, + "loss": 0.8533, + "step": 4126 + }, + { + "epoch": 0.42, + "grad_norm": 1.6512048825368992, + "learning_rate": 1.3034973754362851e-05, + "loss": 0.9085, + "step": 4127 + }, + { + "epoch": 0.42, + "grad_norm": 1.4969760442828155, + "learning_rate": 1.303183452630437e-05, + "loss": 0.7446, + "step": 4128 + }, + { + "epoch": 0.42, + "grad_norm": 1.5726190460076614, + "learning_rate": 1.3028694969187665e-05, + "loss": 0.743, + "step": 4129 + }, + { + "epoch": 0.42, + "grad_norm": 1.4195619694904704, + "learning_rate": 1.3025555083353485e-05, + "loss": 0.7286, + "step": 4130 + }, + { + "epoch": 0.42, + "grad_norm": 1.4969874031232395, + "learning_rate": 1.3022414869142612e-05, + "loss": 0.6673, + "step": 4131 + }, + { + "epoch": 0.42, + "grad_norm": 1.5093059970994227, + "learning_rate": 1.3019274326895874e-05, + "loss": 0.7385, + "step": 4132 + }, + { + "epoch": 0.42, + "grad_norm": 1.5271500356756018, + "learning_rate": 1.3016133456954123e-05, + "loss": 0.7508, + "step": 4133 + }, + { + "epoch": 0.42, + "grad_norm": 1.520571634239225, + "learning_rate": 1.301299225965825e-05, + "loss": 0.7333, + "step": 4134 + }, + { + "epoch": 0.42, + "grad_norm": 1.5627731896035457, + "learning_rate": 1.300985073534919e-05, + "loss": 0.7891, + "step": 4135 + }, + { + "epoch": 0.42, + "grad_norm": 1.5326098612589172, + "learning_rate": 1.3006708884367895e-05, + "loss": 0.7618, + "step": 4136 + }, + { + "epoch": 0.42, + "grad_norm": 1.8468622614861878, + "learning_rate": 1.3003566707055375e-05, + "loss": 0.7747, + "step": 4137 + }, + { + "epoch": 0.42, + "grad_norm": 1.4174643847168273, + "learning_rate": 1.3000424203752658e-05, + "loss": 0.694, + "step": 4138 + }, + { + "epoch": 0.42, + "grad_norm": 1.4514857062771434, + "learning_rate": 1.2997281374800817e-05, + "loss": 0.6628, + "step": 4139 + }, + { + "epoch": 0.42, + "grad_norm": 1.5595809922253188, + "learning_rate": 1.299413822054095e-05, + "loss": 0.85, + "step": 4140 + }, + { + "epoch": 0.42, + "grad_norm": 1.4476139519602225, + "learning_rate": 1.2990994741314204e-05, + "loss": 0.7068, + "step": 4141 + }, + { + "epoch": 0.42, + "grad_norm": 1.5251837298148312, + "learning_rate": 1.2987850937461757e-05, + "loss": 0.7197, + "step": 4142 + }, + { + "epoch": 0.42, + "grad_norm": 1.4801726487169606, + "learning_rate": 1.2984706809324812e-05, + "loss": 0.7069, + "step": 4143 + }, + { + "epoch": 0.42, + "grad_norm": 1.5355823466076746, + "learning_rate": 1.298156235724462e-05, + "loss": 0.684, + "step": 4144 + }, + { + "epoch": 0.42, + "grad_norm": 1.7080840468328307, + "learning_rate": 1.2978417581562456e-05, + "loss": 0.8024, + "step": 4145 + }, + { + "epoch": 0.42, + "grad_norm": 1.5326888897079554, + "learning_rate": 1.2975272482619645e-05, + "loss": 0.7033, + "step": 4146 + }, + { + "epoch": 0.42, + "grad_norm": 1.52255665500569, + "learning_rate": 1.2972127060757536e-05, + "loss": 0.7515, + "step": 4147 + }, + { + "epoch": 0.42, + "grad_norm": 1.6721411657315013, + "learning_rate": 1.296898131631751e-05, + "loss": 0.8513, + "step": 4148 + }, + { + "epoch": 0.42, + "grad_norm": 1.4392424729686444, + "learning_rate": 1.296583524964099e-05, + "loss": 0.7486, + "step": 4149 + }, + { + "epoch": 0.42, + "grad_norm": 1.579784131290916, + "learning_rate": 1.296268886106944e-05, + "loss": 0.7839, + "step": 4150 + }, + { + "epoch": 0.42, + "grad_norm": 1.442856162503546, + "learning_rate": 1.295954215094434e-05, + "loss": 0.7541, + "step": 4151 + }, + { + "epoch": 0.42, + "grad_norm": 1.5161520779023678, + "learning_rate": 1.2956395119607225e-05, + "loss": 0.6876, + "step": 4152 + }, + { + "epoch": 0.42, + "grad_norm": 1.6270966963220517, + "learning_rate": 1.2953247767399653e-05, + "loss": 0.7555, + "step": 4153 + }, + { + "epoch": 0.42, + "grad_norm": 1.4074602171467065, + "learning_rate": 1.2950100094663216e-05, + "loss": 0.6801, + "step": 4154 + }, + { + "epoch": 0.42, + "grad_norm": 1.485430352222668, + "learning_rate": 1.294695210173955e-05, + "loss": 0.6834, + "step": 4155 + }, + { + "epoch": 0.42, + "grad_norm": 1.5971365760873109, + "learning_rate": 1.2943803788970319e-05, + "loss": 0.6976, + "step": 4156 + }, + { + "epoch": 0.42, + "grad_norm": 1.5877675607054489, + "learning_rate": 1.294065515669722e-05, + "loss": 0.7516, + "step": 4157 + }, + { + "epoch": 0.42, + "grad_norm": 1.5611545308976658, + "learning_rate": 1.2937506205261991e-05, + "loss": 0.7916, + "step": 4158 + }, + { + "epoch": 0.42, + "grad_norm": 1.5146360432539503, + "learning_rate": 1.2934356935006405e-05, + "loss": 0.7435, + "step": 4159 + }, + { + "epoch": 0.42, + "grad_norm": 1.5328419599085918, + "learning_rate": 1.293120734627226e-05, + "loss": 0.6888, + "step": 4160 + }, + { + "epoch": 0.42, + "grad_norm": 1.6270060946823581, + "learning_rate": 1.2928057439401396e-05, + "loss": 0.6567, + "step": 4161 + }, + { + "epoch": 0.42, + "grad_norm": 1.5686762641960714, + "learning_rate": 1.2924907214735686e-05, + "loss": 0.7013, + "step": 4162 + }, + { + "epoch": 0.42, + "grad_norm": 1.679353724413454, + "learning_rate": 1.2921756672617039e-05, + "loss": 0.8016, + "step": 4163 + }, + { + "epoch": 0.42, + "grad_norm": 1.7116083356462308, + "learning_rate": 1.2918605813387395e-05, + "loss": 0.8348, + "step": 4164 + }, + { + "epoch": 0.42, + "grad_norm": 1.3332388029642228, + "learning_rate": 1.2915454637388733e-05, + "loss": 0.6438, + "step": 4165 + }, + { + "epoch": 0.42, + "grad_norm": 1.5179907748585992, + "learning_rate": 1.2912303144963062e-05, + "loss": 0.7783, + "step": 4166 + }, + { + "epoch": 0.42, + "grad_norm": 1.5280775014974233, + "learning_rate": 1.290915133645243e-05, + "loss": 0.7175, + "step": 4167 + }, + { + "epoch": 0.42, + "grad_norm": 1.6248733792262533, + "learning_rate": 1.2905999212198911e-05, + "loss": 0.8058, + "step": 4168 + }, + { + "epoch": 0.42, + "grad_norm": 1.5849368736277467, + "learning_rate": 1.2902846772544625e-05, + "loss": 0.7131, + "step": 4169 + }, + { + "epoch": 0.42, + "grad_norm": 1.4466438034909208, + "learning_rate": 1.2899694017831717e-05, + "loss": 0.6041, + "step": 4170 + }, + { + "epoch": 0.42, + "grad_norm": 1.4385438213485517, + "learning_rate": 1.289654094840237e-05, + "loss": 0.6044, + "step": 4171 + }, + { + "epoch": 0.42, + "grad_norm": 1.5468408761996495, + "learning_rate": 1.2893387564598798e-05, + "loss": 0.7942, + "step": 4172 + }, + { + "epoch": 0.42, + "grad_norm": 1.6063767743257966, + "learning_rate": 1.2890233866763253e-05, + "loss": 0.7492, + "step": 4173 + }, + { + "epoch": 0.42, + "grad_norm": 1.5397754358691504, + "learning_rate": 1.2887079855238023e-05, + "loss": 0.7214, + "step": 4174 + }, + { + "epoch": 0.42, + "grad_norm": 1.5545908044398078, + "learning_rate": 1.2883925530365422e-05, + "loss": 0.7593, + "step": 4175 + }, + { + "epoch": 0.42, + "grad_norm": 1.669074462325384, + "learning_rate": 1.2880770892487801e-05, + "loss": 0.7603, + "step": 4176 + }, + { + "epoch": 0.42, + "grad_norm": 1.564607040217321, + "learning_rate": 1.2877615941947554e-05, + "loss": 0.723, + "step": 4177 + }, + { + "epoch": 0.42, + "grad_norm": 1.4498352227742795, + "learning_rate": 1.2874460679087097e-05, + "loss": 0.7274, + "step": 4178 + }, + { + "epoch": 0.43, + "grad_norm": 1.8793408935895344, + "learning_rate": 1.2871305104248885e-05, + "loss": 0.7631, + "step": 4179 + }, + { + "epoch": 0.43, + "grad_norm": 1.5812346101843897, + "learning_rate": 1.2868149217775405e-05, + "loss": 0.8137, + "step": 4180 + }, + { + "epoch": 0.43, + "grad_norm": 1.521026332929309, + "learning_rate": 1.286499302000918e-05, + "loss": 0.705, + "step": 4181 + }, + { + "epoch": 0.43, + "grad_norm": 1.5306191672334164, + "learning_rate": 1.2861836511292766e-05, + "loss": 0.6851, + "step": 4182 + }, + { + "epoch": 0.43, + "grad_norm": 1.5073339226386875, + "learning_rate": 1.2858679691968754e-05, + "loss": 0.7127, + "step": 4183 + }, + { + "epoch": 0.43, + "grad_norm": 1.564368783420573, + "learning_rate": 1.2855522562379768e-05, + "loss": 0.7618, + "step": 4184 + }, + { + "epoch": 0.43, + "grad_norm": 1.4856689898896016, + "learning_rate": 1.2852365122868458e-05, + "loss": 0.5892, + "step": 4185 + }, + { + "epoch": 0.43, + "grad_norm": 1.5290378168110395, + "learning_rate": 1.2849207373777523e-05, + "loss": 0.8697, + "step": 4186 + }, + { + "epoch": 0.43, + "grad_norm": 1.6287002115251614, + "learning_rate": 1.2846049315449687e-05, + "loss": 0.7245, + "step": 4187 + }, + { + "epoch": 0.43, + "grad_norm": 1.5121503034940078, + "learning_rate": 1.2842890948227702e-05, + "loss": 0.7317, + "step": 4188 + }, + { + "epoch": 0.43, + "grad_norm": 1.716114438295996, + "learning_rate": 1.2839732272454364e-05, + "loss": 0.7015, + "step": 4189 + }, + { + "epoch": 0.43, + "grad_norm": 1.4890403805664274, + "learning_rate": 1.283657328847249e-05, + "loss": 0.7445, + "step": 4190 + }, + { + "epoch": 0.43, + "grad_norm": 1.604534877043379, + "learning_rate": 1.2833413996624953e-05, + "loss": 0.73, + "step": 4191 + }, + { + "epoch": 0.43, + "grad_norm": 1.661337652846229, + "learning_rate": 1.2830254397254631e-05, + "loss": 0.7738, + "step": 4192 + }, + { + "epoch": 0.43, + "grad_norm": 1.522246190980118, + "learning_rate": 1.2827094490704457e-05, + "loss": 0.6997, + "step": 4193 + }, + { + "epoch": 0.43, + "grad_norm": 1.607181524710579, + "learning_rate": 1.2823934277317385e-05, + "loss": 0.8442, + "step": 4194 + }, + { + "epoch": 0.43, + "grad_norm": 1.5441419295022962, + "learning_rate": 1.2820773757436408e-05, + "loss": 0.7235, + "step": 4195 + }, + { + "epoch": 0.43, + "grad_norm": 1.4943906211090183, + "learning_rate": 1.2817612931404554e-05, + "loss": 0.7262, + "step": 4196 + }, + { + "epoch": 0.43, + "grad_norm": 1.5578734075719667, + "learning_rate": 1.281445179956488e-05, + "loss": 0.7286, + "step": 4197 + }, + { + "epoch": 0.43, + "grad_norm": 1.444465296199738, + "learning_rate": 1.2811290362260475e-05, + "loss": 0.7434, + "step": 4198 + }, + { + "epoch": 0.43, + "grad_norm": 1.3774053157762296, + "learning_rate": 1.280812861983446e-05, + "loss": 0.6669, + "step": 4199 + }, + { + "epoch": 0.43, + "grad_norm": 1.6867537085739222, + "learning_rate": 1.2804966572630004e-05, + "loss": 0.718, + "step": 4200 + }, + { + "epoch": 0.43, + "grad_norm": 1.7171504576021996, + "learning_rate": 1.280180422099029e-05, + "loss": 0.7925, + "step": 4201 + }, + { + "epoch": 0.43, + "grad_norm": 1.5436301822788967, + "learning_rate": 1.2798641565258542e-05, + "loss": 0.7213, + "step": 4202 + }, + { + "epoch": 0.43, + "grad_norm": 1.5605055613090992, + "learning_rate": 1.2795478605778016e-05, + "loss": 0.6631, + "step": 4203 + }, + { + "epoch": 0.43, + "grad_norm": 1.5283818360565202, + "learning_rate": 1.2792315342892007e-05, + "loss": 0.7638, + "step": 4204 + }, + { + "epoch": 0.43, + "grad_norm": 1.7239811766109319, + "learning_rate": 1.2789151776943833e-05, + "loss": 0.8045, + "step": 4205 + }, + { + "epoch": 0.43, + "grad_norm": 1.6300973652859465, + "learning_rate": 1.2785987908276853e-05, + "loss": 0.6882, + "step": 4206 + }, + { + "epoch": 0.43, + "grad_norm": 1.4571204296952587, + "learning_rate": 1.2782823737234452e-05, + "loss": 0.7022, + "step": 4207 + }, + { + "epoch": 0.43, + "grad_norm": 1.5581818868739223, + "learning_rate": 1.2779659264160052e-05, + "loss": 0.7965, + "step": 4208 + }, + { + "epoch": 0.43, + "grad_norm": 1.5640531713720136, + "learning_rate": 1.2776494489397106e-05, + "loss": 0.7584, + "step": 4209 + }, + { + "epoch": 0.43, + "grad_norm": 1.5477868655715468, + "learning_rate": 1.2773329413289108e-05, + "loss": 0.6989, + "step": 4210 + }, + { + "epoch": 0.43, + "grad_norm": 1.5960710113205359, + "learning_rate": 1.2770164036179569e-05, + "loss": 0.7414, + "step": 4211 + }, + { + "epoch": 0.43, + "grad_norm": 1.4883667796547893, + "learning_rate": 1.2766998358412044e-05, + "loss": 0.7878, + "step": 4212 + }, + { + "epoch": 0.43, + "grad_norm": 1.5053209268600225, + "learning_rate": 1.2763832380330118e-05, + "loss": 0.7385, + "step": 4213 + }, + { + "epoch": 0.43, + "grad_norm": 1.365287466500688, + "learning_rate": 1.276066610227741e-05, + "loss": 0.7644, + "step": 4214 + }, + { + "epoch": 0.43, + "grad_norm": 1.617074598757576, + "learning_rate": 1.275749952459757e-05, + "loss": 0.8595, + "step": 4215 + }, + { + "epoch": 0.43, + "grad_norm": 1.5454203983494088, + "learning_rate": 1.2754332647634278e-05, + "loss": 0.7681, + "step": 4216 + }, + { + "epoch": 0.43, + "grad_norm": 1.5272545256382615, + "learning_rate": 1.2751165471731248e-05, + "loss": 0.6502, + "step": 4217 + }, + { + "epoch": 0.43, + "grad_norm": 1.4746003923918978, + "learning_rate": 1.274799799723223e-05, + "loss": 0.7119, + "step": 4218 + }, + { + "epoch": 0.43, + "grad_norm": 1.4663198618339883, + "learning_rate": 1.2744830224481004e-05, + "loss": 0.6739, + "step": 4219 + }, + { + "epoch": 0.43, + "grad_norm": 1.4428619255434645, + "learning_rate": 1.2741662153821381e-05, + "loss": 0.7681, + "step": 4220 + }, + { + "epoch": 0.43, + "grad_norm": 1.4664268102529439, + "learning_rate": 1.2738493785597208e-05, + "loss": 0.6864, + "step": 4221 + }, + { + "epoch": 0.43, + "grad_norm": 1.5853029515488561, + "learning_rate": 1.2735325120152357e-05, + "loss": 0.7223, + "step": 4222 + }, + { + "epoch": 0.43, + "grad_norm": 1.5644251327546417, + "learning_rate": 1.2732156157830744e-05, + "loss": 0.6766, + "step": 4223 + }, + { + "epoch": 0.43, + "grad_norm": 1.4276214621906658, + "learning_rate": 1.27289868989763e-05, + "loss": 0.6797, + "step": 4224 + }, + { + "epoch": 0.43, + "grad_norm": 1.4727346588567345, + "learning_rate": 1.272581734393301e-05, + "loss": 0.7309, + "step": 4225 + }, + { + "epoch": 0.43, + "grad_norm": 1.5771596386172593, + "learning_rate": 1.2722647493044873e-05, + "loss": 0.779, + "step": 4226 + }, + { + "epoch": 0.43, + "grad_norm": 1.540483221373571, + "learning_rate": 1.2719477346655926e-05, + "loss": 0.7378, + "step": 4227 + }, + { + "epoch": 0.43, + "grad_norm": 1.6694464546919359, + "learning_rate": 1.2716306905110243e-05, + "loss": 0.7269, + "step": 4228 + }, + { + "epoch": 0.43, + "grad_norm": 1.6459659204994768, + "learning_rate": 1.2713136168751923e-05, + "loss": 0.682, + "step": 4229 + }, + { + "epoch": 0.43, + "grad_norm": 1.4297368947921283, + "learning_rate": 1.27099651379251e-05, + "loss": 0.6766, + "step": 4230 + }, + { + "epoch": 0.43, + "grad_norm": 1.5360672121688423, + "learning_rate": 1.2706793812973941e-05, + "loss": 0.7731, + "step": 4231 + }, + { + "epoch": 0.43, + "grad_norm": 1.6790490187862654, + "learning_rate": 1.2703622194242644e-05, + "loss": 0.7898, + "step": 4232 + }, + { + "epoch": 0.43, + "grad_norm": 1.6940426223260903, + "learning_rate": 1.2700450282075439e-05, + "loss": 0.7994, + "step": 4233 + }, + { + "epoch": 0.43, + "grad_norm": 1.6035809726333352, + "learning_rate": 1.2697278076816584e-05, + "loss": 0.8182, + "step": 4234 + }, + { + "epoch": 0.43, + "grad_norm": 1.911690966187699, + "learning_rate": 1.2694105578810372e-05, + "loss": 0.7525, + "step": 4235 + }, + { + "epoch": 0.43, + "grad_norm": 1.5921485670353903, + "learning_rate": 1.2690932788401135e-05, + "loss": 0.7356, + "step": 4236 + }, + { + "epoch": 0.43, + "grad_norm": 1.68167081984641, + "learning_rate": 1.2687759705933225e-05, + "loss": 0.7508, + "step": 4237 + }, + { + "epoch": 0.43, + "grad_norm": 1.6075358036783265, + "learning_rate": 1.2684586331751027e-05, + "loss": 0.734, + "step": 4238 + }, + { + "epoch": 0.43, + "grad_norm": 1.4133034871638013, + "learning_rate": 1.2681412666198967e-05, + "loss": 0.7107, + "step": 4239 + }, + { + "epoch": 0.43, + "grad_norm": 1.4271432982156649, + "learning_rate": 1.2678238709621495e-05, + "loss": 0.6239, + "step": 4240 + }, + { + "epoch": 0.43, + "grad_norm": 1.4874506931674698, + "learning_rate": 1.2675064462363095e-05, + "loss": 0.7471, + "step": 4241 + }, + { + "epoch": 0.43, + "grad_norm": 1.6615279803875191, + "learning_rate": 1.267188992476828e-05, + "loss": 0.7187, + "step": 4242 + }, + { + "epoch": 0.43, + "grad_norm": 1.7055012784103516, + "learning_rate": 1.2668715097181598e-05, + "loss": 0.7781, + "step": 4243 + }, + { + "epoch": 0.43, + "grad_norm": 1.5392643779564632, + "learning_rate": 1.2665539979947625e-05, + "loss": 0.8275, + "step": 4244 + }, + { + "epoch": 0.43, + "grad_norm": 1.6392295533469456, + "learning_rate": 1.2662364573410972e-05, + "loss": 0.7079, + "step": 4245 + }, + { + "epoch": 0.43, + "grad_norm": 1.467439358859539, + "learning_rate": 1.265918887791628e-05, + "loss": 0.8107, + "step": 4246 + }, + { + "epoch": 0.43, + "grad_norm": 1.5340914320267773, + "learning_rate": 1.265601289380822e-05, + "loss": 0.6401, + "step": 4247 + }, + { + "epoch": 0.43, + "grad_norm": 1.6326046668917402, + "learning_rate": 1.2652836621431494e-05, + "loss": 0.6579, + "step": 4248 + }, + { + "epoch": 0.43, + "grad_norm": 1.6060006722081543, + "learning_rate": 1.264966006113084e-05, + "loss": 0.7607, + "step": 4249 + }, + { + "epoch": 0.43, + "grad_norm": 1.6301556197140081, + "learning_rate": 1.2646483213251024e-05, + "loss": 0.7725, + "step": 4250 + }, + { + "epoch": 0.43, + "grad_norm": 1.4477180043687317, + "learning_rate": 1.2643306078136839e-05, + "loss": 0.6654, + "step": 4251 + }, + { + "epoch": 0.43, + "grad_norm": 1.7183853097079613, + "learning_rate": 1.264012865613312e-05, + "loss": 0.757, + "step": 4252 + }, + { + "epoch": 0.43, + "grad_norm": 1.4991277481962177, + "learning_rate": 1.2636950947584716e-05, + "loss": 0.7827, + "step": 4253 + }, + { + "epoch": 0.43, + "grad_norm": 1.3682709122047572, + "learning_rate": 1.2633772952836527e-05, + "loss": 0.6954, + "step": 4254 + }, + { + "epoch": 0.43, + "grad_norm": 1.5264634061223141, + "learning_rate": 1.2630594672233474e-05, + "loss": 0.6219, + "step": 4255 + }, + { + "epoch": 0.43, + "grad_norm": 1.6081603775260733, + "learning_rate": 1.2627416106120505e-05, + "loss": 0.7464, + "step": 4256 + }, + { + "epoch": 0.43, + "grad_norm": 1.6664012905975172, + "learning_rate": 1.2624237254842606e-05, + "loss": 0.8268, + "step": 4257 + }, + { + "epoch": 0.43, + "grad_norm": 1.7236809343198751, + "learning_rate": 1.2621058118744789e-05, + "loss": 0.7237, + "step": 4258 + }, + { + "epoch": 0.43, + "grad_norm": 1.7199094659192433, + "learning_rate": 1.2617878698172106e-05, + "loss": 0.8653, + "step": 4259 + }, + { + "epoch": 0.43, + "grad_norm": 1.5702803700581136, + "learning_rate": 1.2614698993469627e-05, + "loss": 0.7597, + "step": 4260 + }, + { + "epoch": 0.43, + "grad_norm": 1.6599947985801582, + "learning_rate": 1.2611519004982463e-05, + "loss": 0.72, + "step": 4261 + }, + { + "epoch": 0.43, + "grad_norm": 1.5007726034218898, + "learning_rate": 1.2608338733055746e-05, + "loss": 0.7291, + "step": 4262 + }, + { + "epoch": 0.43, + "grad_norm": 1.6180141696571795, + "learning_rate": 1.2605158178034656e-05, + "loss": 0.8082, + "step": 4263 + }, + { + "epoch": 0.43, + "grad_norm": 1.5370907818119977, + "learning_rate": 1.2601977340264385e-05, + "loss": 0.7279, + "step": 4264 + }, + { + "epoch": 0.43, + "grad_norm": 1.6017277705458968, + "learning_rate": 1.2598796220090162e-05, + "loss": 0.7468, + "step": 4265 + }, + { + "epoch": 0.43, + "grad_norm": 1.470357468363117, + "learning_rate": 1.259561481785725e-05, + "loss": 0.7744, + "step": 4266 + }, + { + "epoch": 0.43, + "grad_norm": 1.5011085967488755, + "learning_rate": 1.2592433133910937e-05, + "loss": 0.7894, + "step": 4267 + }, + { + "epoch": 0.43, + "grad_norm": 1.8058621580428456, + "learning_rate": 1.2589251168596554e-05, + "loss": 0.6833, + "step": 4268 + }, + { + "epoch": 0.43, + "grad_norm": 1.5790531507207646, + "learning_rate": 1.2586068922259446e-05, + "loss": 0.807, + "step": 4269 + }, + { + "epoch": 0.43, + "grad_norm": 1.6129616607668176, + "learning_rate": 1.2582886395245e-05, + "loss": 0.7582, + "step": 4270 + }, + { + "epoch": 0.43, + "grad_norm": 1.5636248037220053, + "learning_rate": 1.2579703587898622e-05, + "loss": 0.7492, + "step": 4271 + }, + { + "epoch": 0.43, + "grad_norm": 1.6477268242323033, + "learning_rate": 1.2576520500565767e-05, + "loss": 0.8129, + "step": 4272 + }, + { + "epoch": 0.43, + "grad_norm": 1.53427061103123, + "learning_rate": 1.2573337133591904e-05, + "loss": 0.7485, + "step": 4273 + }, + { + "epoch": 0.43, + "grad_norm": 1.2760812692182046, + "learning_rate": 1.2570153487322534e-05, + "loss": 0.6305, + "step": 4274 + }, + { + "epoch": 0.43, + "grad_norm": 1.4831290425463768, + "learning_rate": 1.2566969562103197e-05, + "loss": 0.6692, + "step": 4275 + }, + { + "epoch": 0.43, + "grad_norm": 1.4653955773573912, + "learning_rate": 1.2563785358279459e-05, + "loss": 0.6619, + "step": 4276 + }, + { + "epoch": 0.44, + "grad_norm": 1.3376784691184405, + "learning_rate": 1.2560600876196911e-05, + "loss": 0.6905, + "step": 4277 + }, + { + "epoch": 0.44, + "grad_norm": 1.5494933539636409, + "learning_rate": 1.255741611620118e-05, + "loss": 0.7156, + "step": 4278 + }, + { + "epoch": 0.44, + "grad_norm": 1.8980840079305699, + "learning_rate": 1.2554231078637926e-05, + "loss": 0.7758, + "step": 4279 + }, + { + "epoch": 0.44, + "grad_norm": 1.5326597254174636, + "learning_rate": 1.2551045763852832e-05, + "loss": 0.7281, + "step": 4280 + }, + { + "epoch": 0.44, + "grad_norm": 1.6493596040522875, + "learning_rate": 1.2547860172191613e-05, + "loss": 0.8091, + "step": 4281 + }, + { + "epoch": 0.44, + "grad_norm": 1.6204455055952354, + "learning_rate": 1.2544674304000017e-05, + "loss": 0.7701, + "step": 4282 + }, + { + "epoch": 0.44, + "grad_norm": 1.5362361176412207, + "learning_rate": 1.254148815962382e-05, + "loss": 0.6546, + "step": 4283 + }, + { + "epoch": 0.44, + "grad_norm": 1.5859203814985514, + "learning_rate": 1.2538301739408827e-05, + "loss": 0.7461, + "step": 4284 + }, + { + "epoch": 0.44, + "grad_norm": 1.6904326299259982, + "learning_rate": 1.2535115043700873e-05, + "loss": 0.7867, + "step": 4285 + }, + { + "epoch": 0.44, + "grad_norm": 1.49829461015147, + "learning_rate": 1.2531928072845829e-05, + "loss": 0.7796, + "step": 4286 + }, + { + "epoch": 0.44, + "grad_norm": 1.5082088395185627, + "learning_rate": 1.2528740827189588e-05, + "loss": 0.7096, + "step": 4287 + }, + { + "epoch": 0.44, + "grad_norm": 1.6725549597277543, + "learning_rate": 1.2525553307078074e-05, + "loss": 0.7854, + "step": 4288 + }, + { + "epoch": 0.44, + "grad_norm": 1.4817730777320866, + "learning_rate": 1.2522365512857241e-05, + "loss": 0.6682, + "step": 4289 + }, + { + "epoch": 0.44, + "grad_norm": 1.535915561788933, + "learning_rate": 1.251917744487308e-05, + "loss": 0.7609, + "step": 4290 + }, + { + "epoch": 0.44, + "grad_norm": 1.5623924070870114, + "learning_rate": 1.2515989103471602e-05, + "loss": 0.6874, + "step": 4291 + }, + { + "epoch": 0.44, + "grad_norm": 1.525575165734888, + "learning_rate": 1.2512800488998856e-05, + "loss": 0.8236, + "step": 4292 + }, + { + "epoch": 0.44, + "grad_norm": 1.7352730208482219, + "learning_rate": 1.2509611601800908e-05, + "loss": 0.6995, + "step": 4293 + }, + { + "epoch": 0.44, + "grad_norm": 1.6429301062021924, + "learning_rate": 1.2506422442223867e-05, + "loss": 0.8077, + "step": 4294 + }, + { + "epoch": 0.44, + "grad_norm": 1.4617991681259823, + "learning_rate": 1.2503233010613866e-05, + "loss": 0.6972, + "step": 4295 + }, + { + "epoch": 0.44, + "grad_norm": 1.5839803085209656, + "learning_rate": 1.2500043307317072e-05, + "loss": 0.7635, + "step": 4296 + }, + { + "epoch": 0.44, + "grad_norm": 1.4985757263506088, + "learning_rate": 1.2496853332679668e-05, + "loss": 0.68, + "step": 4297 + }, + { + "epoch": 0.44, + "grad_norm": 1.876952140170346, + "learning_rate": 1.2493663087047883e-05, + "loss": 0.8143, + "step": 4298 + }, + { + "epoch": 0.44, + "grad_norm": 1.4912297726382522, + "learning_rate": 1.2490472570767966e-05, + "loss": 0.7566, + "step": 4299 + }, + { + "epoch": 0.44, + "grad_norm": 1.4135576446856426, + "learning_rate": 1.24872817841862e-05, + "loss": 0.7676, + "step": 4300 + }, + { + "epoch": 0.44, + "grad_norm": 1.6894482298761555, + "learning_rate": 1.2484090727648892e-05, + "loss": 0.7346, + "step": 4301 + }, + { + "epoch": 0.44, + "grad_norm": 1.410949297535782, + "learning_rate": 1.2480899401502384e-05, + "loss": 0.698, + "step": 4302 + }, + { + "epoch": 0.44, + "grad_norm": 1.4645565318403775, + "learning_rate": 1.247770780609304e-05, + "loss": 0.6811, + "step": 4303 + }, + { + "epoch": 0.44, + "grad_norm": 1.6854348277084623, + "learning_rate": 1.2474515941767262e-05, + "loss": 0.7574, + "step": 4304 + }, + { + "epoch": 0.44, + "grad_norm": 1.7077740397861376, + "learning_rate": 1.2471323808871475e-05, + "loss": 0.7648, + "step": 4305 + }, + { + "epoch": 0.44, + "grad_norm": 1.670997861295573, + "learning_rate": 1.2468131407752138e-05, + "loss": 0.7125, + "step": 4306 + }, + { + "epoch": 0.44, + "grad_norm": 1.6537332834926746, + "learning_rate": 1.246493873875573e-05, + "loss": 0.9453, + "step": 4307 + }, + { + "epoch": 0.44, + "grad_norm": 1.6298292725373442, + "learning_rate": 1.2461745802228771e-05, + "loss": 0.6995, + "step": 4308 + }, + { + "epoch": 0.44, + "grad_norm": 1.6336178608807248, + "learning_rate": 1.2458552598517804e-05, + "loss": 0.9437, + "step": 4309 + }, + { + "epoch": 0.44, + "grad_norm": 1.6605536767879754, + "learning_rate": 1.24553591279694e-05, + "loss": 0.7589, + "step": 4310 + }, + { + "epoch": 0.44, + "grad_norm": 1.6214145562153597, + "learning_rate": 1.2452165390930159e-05, + "loss": 0.756, + "step": 4311 + }, + { + "epoch": 0.44, + "grad_norm": 1.7693944283318281, + "learning_rate": 1.244897138774671e-05, + "loss": 0.776, + "step": 4312 + }, + { + "epoch": 0.44, + "grad_norm": 1.9429667641152428, + "learning_rate": 1.244577711876572e-05, + "loss": 0.7921, + "step": 4313 + }, + { + "epoch": 0.44, + "grad_norm": 1.4719654131064106, + "learning_rate": 1.2442582584333867e-05, + "loss": 0.7164, + "step": 4314 + }, + { + "epoch": 0.44, + "grad_norm": 1.6998681620739857, + "learning_rate": 1.2439387784797873e-05, + "loss": 0.7358, + "step": 4315 + }, + { + "epoch": 0.44, + "grad_norm": 1.5367814354024754, + "learning_rate": 1.2436192720504483e-05, + "loss": 0.7949, + "step": 4316 + }, + { + "epoch": 0.44, + "grad_norm": 1.5814752360852835, + "learning_rate": 1.2432997391800471e-05, + "loss": 0.7588, + "step": 4317 + }, + { + "epoch": 0.44, + "grad_norm": 1.5482592457995743, + "learning_rate": 1.242980179903264e-05, + "loss": 0.7803, + "step": 4318 + }, + { + "epoch": 0.44, + "grad_norm": 1.543902286716255, + "learning_rate": 1.2426605942547822e-05, + "loss": 0.7623, + "step": 4319 + }, + { + "epoch": 0.44, + "grad_norm": 1.5364496774447027, + "learning_rate": 1.2423409822692878e-05, + "loss": 0.7359, + "step": 4320 + }, + { + "epoch": 0.44, + "grad_norm": 1.7470060336826811, + "learning_rate": 1.2420213439814693e-05, + "loss": 0.8224, + "step": 4321 + }, + { + "epoch": 0.44, + "grad_norm": 1.7947145930976374, + "learning_rate": 1.2417016794260188e-05, + "loss": 0.9604, + "step": 4322 + }, + { + "epoch": 0.44, + "grad_norm": 1.4318612043412737, + "learning_rate": 1.2413819886376308e-05, + "loss": 0.7188, + "step": 4323 + }, + { + "epoch": 0.44, + "grad_norm": 1.578719074009928, + "learning_rate": 1.241062271651003e-05, + "loss": 0.7632, + "step": 4324 + }, + { + "epoch": 0.44, + "grad_norm": 1.4222966215231232, + "learning_rate": 1.2407425285008348e-05, + "loss": 0.727, + "step": 4325 + }, + { + "epoch": 0.44, + "grad_norm": 1.7037074418102436, + "learning_rate": 1.2404227592218302e-05, + "loss": 0.7614, + "step": 4326 + }, + { + "epoch": 0.44, + "grad_norm": 1.5560684096552946, + "learning_rate": 1.2401029638486952e-05, + "loss": 0.7566, + "step": 4327 + }, + { + "epoch": 0.44, + "grad_norm": 1.5496149562355548, + "learning_rate": 1.2397831424161379e-05, + "loss": 0.6464, + "step": 4328 + }, + { + "epoch": 0.44, + "grad_norm": 1.5661260611233834, + "learning_rate": 1.2394632949588702e-05, + "loss": 0.7027, + "step": 4329 + }, + { + "epoch": 0.44, + "grad_norm": 1.647435871699377, + "learning_rate": 1.239143421511607e-05, + "loss": 0.7214, + "step": 4330 + }, + { + "epoch": 0.44, + "grad_norm": 1.6376179783423268, + "learning_rate": 1.238823522109065e-05, + "loss": 0.7569, + "step": 4331 + }, + { + "epoch": 0.44, + "grad_norm": 1.7213715524105555, + "learning_rate": 1.2385035967859644e-05, + "loss": 0.8125, + "step": 4332 + }, + { + "epoch": 0.44, + "grad_norm": 1.5625709746455267, + "learning_rate": 1.2381836455770281e-05, + "loss": 0.696, + "step": 4333 + }, + { + "epoch": 0.44, + "grad_norm": 1.5850692435817693, + "learning_rate": 1.237863668516982e-05, + "loss": 0.8419, + "step": 4334 + }, + { + "epoch": 0.44, + "grad_norm": 1.6029512722585655, + "learning_rate": 1.2375436656405543e-05, + "loss": 0.8223, + "step": 4335 + }, + { + "epoch": 0.44, + "grad_norm": 1.4897526621851649, + "learning_rate": 1.2372236369824766e-05, + "loss": 0.7425, + "step": 4336 + }, + { + "epoch": 0.44, + "grad_norm": 1.4048163565710574, + "learning_rate": 1.236903582577483e-05, + "loss": 0.6398, + "step": 4337 + }, + { + "epoch": 0.44, + "grad_norm": 1.5031420071231552, + "learning_rate": 1.2365835024603099e-05, + "loss": 0.8474, + "step": 4338 + }, + { + "epoch": 0.44, + "grad_norm": 1.3533265566661303, + "learning_rate": 1.2362633966656974e-05, + "loss": 0.5881, + "step": 4339 + }, + { + "epoch": 0.44, + "grad_norm": 1.6080843502117637, + "learning_rate": 1.2359432652283882e-05, + "loss": 0.7397, + "step": 4340 + }, + { + "epoch": 0.44, + "grad_norm": 1.6916691645649253, + "learning_rate": 1.2356231081831274e-05, + "loss": 0.835, + "step": 4341 + }, + { + "epoch": 0.44, + "grad_norm": 1.541941556966986, + "learning_rate": 1.2353029255646627e-05, + "loss": 0.7793, + "step": 4342 + }, + { + "epoch": 0.44, + "grad_norm": 1.3431183490118175, + "learning_rate": 1.234982717407745e-05, + "loss": 0.6793, + "step": 4343 + }, + { + "epoch": 0.44, + "grad_norm": 1.5996485332722739, + "learning_rate": 1.2346624837471282e-05, + "loss": 0.8009, + "step": 4344 + }, + { + "epoch": 0.44, + "grad_norm": 1.5334897538387262, + "learning_rate": 1.2343422246175687e-05, + "loss": 0.6806, + "step": 4345 + }, + { + "epoch": 0.44, + "grad_norm": 1.5390653761364153, + "learning_rate": 1.2340219400538251e-05, + "loss": 0.7227, + "step": 4346 + }, + { + "epoch": 0.44, + "grad_norm": 1.5577072860816348, + "learning_rate": 1.2337016300906599e-05, + "loss": 0.7152, + "step": 4347 + }, + { + "epoch": 0.44, + "grad_norm": 1.4338529859628366, + "learning_rate": 1.2333812947628372e-05, + "loss": 0.6529, + "step": 4348 + }, + { + "epoch": 0.44, + "grad_norm": 1.5103611554283487, + "learning_rate": 1.2330609341051248e-05, + "loss": 0.5906, + "step": 4349 + }, + { + "epoch": 0.44, + "grad_norm": 1.5452087394046694, + "learning_rate": 1.2327405481522925e-05, + "loss": 0.7806, + "step": 4350 + }, + { + "epoch": 0.44, + "grad_norm": 1.506080493851666, + "learning_rate": 1.2324201369391135e-05, + "loss": 0.7061, + "step": 4351 + }, + { + "epoch": 0.44, + "grad_norm": 1.611522681979579, + "learning_rate": 1.232099700500363e-05, + "loss": 0.7171, + "step": 4352 + }, + { + "epoch": 0.44, + "grad_norm": 1.4512234024915196, + "learning_rate": 1.23177923887082e-05, + "loss": 0.7174, + "step": 4353 + }, + { + "epoch": 0.44, + "grad_norm": 1.4573865532669146, + "learning_rate": 1.2314587520852652e-05, + "loss": 0.6301, + "step": 4354 + }, + { + "epoch": 0.44, + "grad_norm": 1.639497091761245, + "learning_rate": 1.2311382401784823e-05, + "loss": 0.7631, + "step": 4355 + }, + { + "epoch": 0.44, + "grad_norm": 1.502122098326732, + "learning_rate": 1.2308177031852581e-05, + "loss": 0.7225, + "step": 4356 + }, + { + "epoch": 0.44, + "grad_norm": 1.6607915454021849, + "learning_rate": 1.2304971411403818e-05, + "loss": 0.7495, + "step": 4357 + }, + { + "epoch": 0.44, + "grad_norm": 1.4264459129384013, + "learning_rate": 1.2301765540786454e-05, + "loss": 0.6694, + "step": 4358 + }, + { + "epoch": 0.44, + "grad_norm": 1.4441931508298085, + "learning_rate": 1.2298559420348437e-05, + "loss": 0.6286, + "step": 4359 + }, + { + "epoch": 0.44, + "grad_norm": 1.6350542372791959, + "learning_rate": 1.229535305043774e-05, + "loss": 0.7763, + "step": 4360 + }, + { + "epoch": 0.44, + "grad_norm": 1.5076741786633405, + "learning_rate": 1.2292146431402364e-05, + "loss": 0.7056, + "step": 4361 + }, + { + "epoch": 0.44, + "grad_norm": 1.4946756140117825, + "learning_rate": 1.2288939563590336e-05, + "loss": 0.7498, + "step": 4362 + }, + { + "epoch": 0.44, + "grad_norm": 1.4954116089047855, + "learning_rate": 1.2285732447349718e-05, + "loss": 0.7052, + "step": 4363 + }, + { + "epoch": 0.44, + "grad_norm": 1.539129278892199, + "learning_rate": 1.2282525083028585e-05, + "loss": 0.7045, + "step": 4364 + }, + { + "epoch": 0.44, + "grad_norm": 1.5616495206143877, + "learning_rate": 1.227931747097505e-05, + "loss": 0.7171, + "step": 4365 + }, + { + "epoch": 0.44, + "grad_norm": 1.541181730746174, + "learning_rate": 1.2276109611537246e-05, + "loss": 0.6921, + "step": 4366 + }, + { + "epoch": 0.44, + "grad_norm": 1.574039031315957, + "learning_rate": 1.2272901505063339e-05, + "loss": 0.7405, + "step": 4367 + }, + { + "epoch": 0.44, + "grad_norm": 1.6611638774575088, + "learning_rate": 1.226969315190152e-05, + "loss": 0.7466, + "step": 4368 + }, + { + "epoch": 0.44, + "grad_norm": 1.795872991044649, + "learning_rate": 1.2266484552400002e-05, + "loss": 0.7829, + "step": 4369 + }, + { + "epoch": 0.44, + "grad_norm": 1.5984338918264582, + "learning_rate": 1.2263275706907028e-05, + "loss": 0.6836, + "step": 4370 + }, + { + "epoch": 0.44, + "grad_norm": 1.6082604218617218, + "learning_rate": 1.2260066615770872e-05, + "loss": 0.7268, + "step": 4371 + }, + { + "epoch": 0.44, + "grad_norm": 1.5863700764160853, + "learning_rate": 1.2256857279339826e-05, + "loss": 0.7271, + "step": 4372 + }, + { + "epoch": 0.44, + "grad_norm": 1.502654497794165, + "learning_rate": 1.2253647697962219e-05, + "loss": 0.7056, + "step": 4373 + }, + { + "epoch": 0.44, + "grad_norm": 1.455642030024103, + "learning_rate": 1.2250437871986396e-05, + "loss": 0.6944, + "step": 4374 + }, + { + "epoch": 0.45, + "grad_norm": 1.732673603633385, + "learning_rate": 1.2247227801760733e-05, + "loss": 0.8043, + "step": 4375 + }, + { + "epoch": 0.45, + "grad_norm": 1.5572231314499265, + "learning_rate": 1.2244017487633637e-05, + "loss": 0.6938, + "step": 4376 + }, + { + "epoch": 0.45, + "grad_norm": 1.687209376387509, + "learning_rate": 1.2240806929953535e-05, + "loss": 0.703, + "step": 4377 + }, + { + "epoch": 0.45, + "grad_norm": 1.5609158173524036, + "learning_rate": 1.2237596129068883e-05, + "loss": 0.812, + "step": 4378 + }, + { + "epoch": 0.45, + "grad_norm": 1.530954813512489, + "learning_rate": 1.223438508532816e-05, + "loss": 0.77, + "step": 4379 + }, + { + "epoch": 0.45, + "grad_norm": 1.5362119262865612, + "learning_rate": 1.2231173799079881e-05, + "loss": 0.718, + "step": 4380 + }, + { + "epoch": 0.45, + "grad_norm": 1.4563236326328328, + "learning_rate": 1.2227962270672578e-05, + "loss": 0.7434, + "step": 4381 + }, + { + "epoch": 0.45, + "grad_norm": 1.6876187955506705, + "learning_rate": 1.2224750500454812e-05, + "loss": 0.7205, + "step": 4382 + }, + { + "epoch": 0.45, + "grad_norm": 1.5422731338231965, + "learning_rate": 1.2221538488775169e-05, + "loss": 0.7501, + "step": 4383 + }, + { + "epoch": 0.45, + "grad_norm": 1.6188008465025745, + "learning_rate": 1.2218326235982265e-05, + "loss": 0.7096, + "step": 4384 + }, + { + "epoch": 0.45, + "grad_norm": 1.5020069685350754, + "learning_rate": 1.2215113742424737e-05, + "loss": 0.7346, + "step": 4385 + }, + { + "epoch": 0.45, + "grad_norm": 1.5813327112219508, + "learning_rate": 1.2211901008451254e-05, + "loss": 0.6556, + "step": 4386 + }, + { + "epoch": 0.45, + "grad_norm": 1.666893881592953, + "learning_rate": 1.2208688034410508e-05, + "loss": 0.7044, + "step": 4387 + }, + { + "epoch": 0.45, + "grad_norm": 1.5003835672667813, + "learning_rate": 1.2205474820651216e-05, + "loss": 0.7246, + "step": 4388 + }, + { + "epoch": 0.45, + "grad_norm": 1.549861347647712, + "learning_rate": 1.2202261367522119e-05, + "loss": 0.6512, + "step": 4389 + }, + { + "epoch": 0.45, + "grad_norm": 1.5602212910735438, + "learning_rate": 1.2199047675371993e-05, + "loss": 0.7465, + "step": 4390 + }, + { + "epoch": 0.45, + "grad_norm": 1.753807997087651, + "learning_rate": 1.219583374454963e-05, + "loss": 0.7398, + "step": 4391 + }, + { + "epoch": 0.45, + "grad_norm": 1.6286670549259223, + "learning_rate": 1.2192619575403855e-05, + "loss": 0.7511, + "step": 4392 + }, + { + "epoch": 0.45, + "grad_norm": 1.3647542817459297, + "learning_rate": 1.218940516828351e-05, + "loss": 0.6561, + "step": 4393 + }, + { + "epoch": 0.45, + "grad_norm": 1.5529890621235534, + "learning_rate": 1.2186190523537476e-05, + "loss": 0.695, + "step": 4394 + }, + { + "epoch": 0.45, + "grad_norm": 1.5510040334936726, + "learning_rate": 1.2182975641514646e-05, + "loss": 0.6774, + "step": 4395 + }, + { + "epoch": 0.45, + "grad_norm": 1.5926843629490448, + "learning_rate": 1.217976052256395e-05, + "loss": 0.7498, + "step": 4396 + }, + { + "epoch": 0.45, + "grad_norm": 1.5187507805011788, + "learning_rate": 1.2176545167034337e-05, + "loss": 0.7326, + "step": 4397 + }, + { + "epoch": 0.45, + "grad_norm": 1.6532601994941343, + "learning_rate": 1.2173329575274781e-05, + "loss": 0.6889, + "step": 4398 + }, + { + "epoch": 0.45, + "grad_norm": 1.5301841899089985, + "learning_rate": 1.217011374763429e-05, + "loss": 0.8719, + "step": 4399 + }, + { + "epoch": 0.45, + "grad_norm": 1.5979785297825504, + "learning_rate": 1.2166897684461887e-05, + "loss": 0.6897, + "step": 4400 + }, + { + "epoch": 0.45, + "grad_norm": 1.4936498800599838, + "learning_rate": 1.2163681386106628e-05, + "loss": 0.7131, + "step": 4401 + }, + { + "epoch": 0.45, + "grad_norm": 1.561516544759848, + "learning_rate": 1.2160464852917586e-05, + "loss": 0.7337, + "step": 4402 + }, + { + "epoch": 0.45, + "grad_norm": 1.6101791995119874, + "learning_rate": 1.2157248085243875e-05, + "loss": 0.7181, + "step": 4403 + }, + { + "epoch": 0.45, + "grad_norm": 1.4056365477488921, + "learning_rate": 1.2154031083434618e-05, + "loss": 0.7246, + "step": 4404 + }, + { + "epoch": 0.45, + "grad_norm": 1.4791446978711529, + "learning_rate": 1.2150813847838973e-05, + "loss": 0.7785, + "step": 4405 + }, + { + "epoch": 0.45, + "grad_norm": 1.6116695930351572, + "learning_rate": 1.2147596378806122e-05, + "loss": 0.8783, + "step": 4406 + }, + { + "epoch": 0.45, + "grad_norm": 1.525736879948389, + "learning_rate": 1.2144378676685265e-05, + "loss": 0.6726, + "step": 4407 + }, + { + "epoch": 0.45, + "grad_norm": 1.704644176726741, + "learning_rate": 1.2141160741825637e-05, + "loss": 0.8096, + "step": 4408 + }, + { + "epoch": 0.45, + "grad_norm": 1.6201455401565257, + "learning_rate": 1.2137942574576498e-05, + "loss": 0.7243, + "step": 4409 + }, + { + "epoch": 0.45, + "grad_norm": 1.5974565990510263, + "learning_rate": 1.2134724175287124e-05, + "loss": 0.7345, + "step": 4410 + }, + { + "epoch": 0.45, + "grad_norm": 1.5472920340894925, + "learning_rate": 1.213150554430682e-05, + "loss": 0.6502, + "step": 4411 + }, + { + "epoch": 0.45, + "grad_norm": 1.6689059927391285, + "learning_rate": 1.2128286681984929e-05, + "loss": 0.8014, + "step": 4412 + }, + { + "epoch": 0.45, + "grad_norm": 1.7726442768700028, + "learning_rate": 1.2125067588670798e-05, + "loss": 0.6601, + "step": 4413 + }, + { + "epoch": 0.45, + "grad_norm": 1.6477284044940803, + "learning_rate": 1.2121848264713813e-05, + "loss": 0.7439, + "step": 4414 + }, + { + "epoch": 0.45, + "grad_norm": 1.4965439315138036, + "learning_rate": 1.2118628710463383e-05, + "loss": 0.6484, + "step": 4415 + }, + { + "epoch": 0.45, + "grad_norm": 1.5165985892615441, + "learning_rate": 1.2115408926268935e-05, + "loss": 0.7808, + "step": 4416 + }, + { + "epoch": 0.45, + "grad_norm": 1.5558271212569084, + "learning_rate": 1.211218891247993e-05, + "loss": 0.6624, + "step": 4417 + }, + { + "epoch": 0.45, + "grad_norm": 1.5968364577633354, + "learning_rate": 1.210896866944585e-05, + "loss": 0.7942, + "step": 4418 + }, + { + "epoch": 0.45, + "grad_norm": 1.5649442392983401, + "learning_rate": 1.2105748197516202e-05, + "loss": 0.7289, + "step": 4419 + }, + { + "epoch": 0.45, + "grad_norm": 1.4583243392481153, + "learning_rate": 1.2102527497040518e-05, + "loss": 0.7746, + "step": 4420 + }, + { + "epoch": 0.45, + "grad_norm": 1.4272608738704586, + "learning_rate": 1.2099306568368351e-05, + "loss": 0.645, + "step": 4421 + }, + { + "epoch": 0.45, + "grad_norm": 1.4106183228923768, + "learning_rate": 1.2096085411849288e-05, + "loss": 0.7285, + "step": 4422 + }, + { + "epoch": 0.45, + "grad_norm": 1.5010831315856243, + "learning_rate": 1.2092864027832934e-05, + "loss": 0.6688, + "step": 4423 + }, + { + "epoch": 0.45, + "grad_norm": 1.6271030020091606, + "learning_rate": 1.2089642416668917e-05, + "loss": 0.7768, + "step": 4424 + }, + { + "epoch": 0.45, + "grad_norm": 1.5235151617778604, + "learning_rate": 1.2086420578706892e-05, + "loss": 0.6403, + "step": 4425 + }, + { + "epoch": 0.45, + "grad_norm": 1.5076423948411202, + "learning_rate": 1.2083198514296543e-05, + "loss": 0.7494, + "step": 4426 + }, + { + "epoch": 0.45, + "grad_norm": 1.5464772791066064, + "learning_rate": 1.2079976223787572e-05, + "loss": 0.889, + "step": 4427 + }, + { + "epoch": 0.45, + "grad_norm": 1.5354979787731868, + "learning_rate": 1.2076753707529712e-05, + "loss": 0.6382, + "step": 4428 + }, + { + "epoch": 0.45, + "grad_norm": 1.423218504781596, + "learning_rate": 1.207353096587271e-05, + "loss": 0.7312, + "step": 4429 + }, + { + "epoch": 0.45, + "grad_norm": 1.3970447157524866, + "learning_rate": 1.2070307999166349e-05, + "loss": 0.6877, + "step": 4430 + }, + { + "epoch": 0.45, + "grad_norm": 1.6599140309374598, + "learning_rate": 1.2067084807760432e-05, + "loss": 0.7027, + "step": 4431 + }, + { + "epoch": 0.45, + "grad_norm": 1.6651396370667126, + "learning_rate": 1.2063861392004786e-05, + "loss": 0.7474, + "step": 4432 + }, + { + "epoch": 0.45, + "grad_norm": 1.430089832277469, + "learning_rate": 1.2060637752249258e-05, + "loss": 0.7243, + "step": 4433 + }, + { + "epoch": 0.45, + "grad_norm": 1.614481881170845, + "learning_rate": 1.2057413888843725e-05, + "loss": 0.8231, + "step": 4434 + }, + { + "epoch": 0.45, + "grad_norm": 1.4348838527996164, + "learning_rate": 1.2054189802138092e-05, + "loss": 0.7713, + "step": 4435 + }, + { + "epoch": 0.45, + "grad_norm": 1.7212586577126978, + "learning_rate": 1.205096549248228e-05, + "loss": 0.658, + "step": 4436 + }, + { + "epoch": 0.45, + "grad_norm": 1.658407315132644, + "learning_rate": 1.2047740960226237e-05, + "loss": 0.7476, + "step": 4437 + }, + { + "epoch": 0.45, + "grad_norm": 1.5328008396841144, + "learning_rate": 1.2044516205719936e-05, + "loss": 0.8208, + "step": 4438 + }, + { + "epoch": 0.45, + "grad_norm": 1.4121317489710041, + "learning_rate": 1.2041291229313372e-05, + "loss": 0.7737, + "step": 4439 + }, + { + "epoch": 0.45, + "grad_norm": 1.6338576991003722, + "learning_rate": 1.2038066031356568e-05, + "loss": 0.777, + "step": 4440 + }, + { + "epoch": 0.45, + "grad_norm": 1.6169747349261165, + "learning_rate": 1.203484061219957e-05, + "loss": 0.7694, + "step": 4441 + }, + { + "epoch": 0.45, + "grad_norm": 1.534574166088398, + "learning_rate": 1.2031614972192443e-05, + "loss": 0.7543, + "step": 4442 + }, + { + "epoch": 0.45, + "grad_norm": 1.5235420808364872, + "learning_rate": 1.2028389111685283e-05, + "loss": 0.766, + "step": 4443 + }, + { + "epoch": 0.45, + "grad_norm": 1.505387947763993, + "learning_rate": 1.2025163031028203e-05, + "loss": 0.8059, + "step": 4444 + }, + { + "epoch": 0.45, + "grad_norm": 1.5351796430514244, + "learning_rate": 1.202193673057135e-05, + "loss": 0.6711, + "step": 4445 + }, + { + "epoch": 0.45, + "grad_norm": 1.5112091190626977, + "learning_rate": 1.201871021066488e-05, + "loss": 0.8201, + "step": 4446 + }, + { + "epoch": 0.45, + "grad_norm": 1.627302058235011, + "learning_rate": 1.2015483471658986e-05, + "loss": 0.7077, + "step": 4447 + }, + { + "epoch": 0.45, + "grad_norm": 2.0278561697736004, + "learning_rate": 1.2012256513903881e-05, + "loss": 0.7857, + "step": 4448 + }, + { + "epoch": 0.45, + "grad_norm": 1.611940844306635, + "learning_rate": 1.2009029337749803e-05, + "loss": 0.7612, + "step": 4449 + }, + { + "epoch": 0.45, + "grad_norm": 1.8121778869497533, + "learning_rate": 1.2005801943547004e-05, + "loss": 0.78, + "step": 4450 + }, + { + "epoch": 0.45, + "grad_norm": 1.5552713757449876, + "learning_rate": 1.2002574331645774e-05, + "loss": 0.7561, + "step": 4451 + }, + { + "epoch": 0.45, + "grad_norm": 1.6294178143747788, + "learning_rate": 1.1999346502396411e-05, + "loss": 0.6489, + "step": 4452 + }, + { + "epoch": 0.45, + "grad_norm": 1.5805010400813575, + "learning_rate": 1.1996118456149258e-05, + "loss": 0.6256, + "step": 4453 + }, + { + "epoch": 0.45, + "grad_norm": 1.6173180486478613, + "learning_rate": 1.199289019325466e-05, + "loss": 0.7834, + "step": 4454 + }, + { + "epoch": 0.45, + "grad_norm": 1.4599722225184395, + "learning_rate": 1.1989661714063e-05, + "loss": 0.7161, + "step": 4455 + }, + { + "epoch": 0.45, + "grad_norm": 1.5629592431617074, + "learning_rate": 1.198643301892467e-05, + "loss": 0.6841, + "step": 4456 + }, + { + "epoch": 0.45, + "grad_norm": 1.6195847291825776, + "learning_rate": 1.1983204108190102e-05, + "loss": 0.7359, + "step": 4457 + }, + { + "epoch": 0.45, + "grad_norm": 1.6672599164503639, + "learning_rate": 1.1979974982209745e-05, + "loss": 0.8227, + "step": 4458 + }, + { + "epoch": 0.45, + "grad_norm": 1.6956380914634728, + "learning_rate": 1.1976745641334064e-05, + "loss": 0.7724, + "step": 4459 + }, + { + "epoch": 0.45, + "grad_norm": 1.3448791245225744, + "learning_rate": 1.197351608591356e-05, + "loss": 0.6883, + "step": 4460 + }, + { + "epoch": 0.45, + "grad_norm": 1.4919968482707506, + "learning_rate": 1.1970286316298743e-05, + "loss": 0.8661, + "step": 4461 + }, + { + "epoch": 0.45, + "grad_norm": 1.5888723787256789, + "learning_rate": 1.1967056332840159e-05, + "loss": 0.6612, + "step": 4462 + }, + { + "epoch": 0.45, + "grad_norm": 1.500626958840491, + "learning_rate": 1.1963826135888374e-05, + "loss": 0.6686, + "step": 4463 + }, + { + "epoch": 0.45, + "grad_norm": 1.3928861170099849, + "learning_rate": 1.196059572579397e-05, + "loss": 0.7959, + "step": 4464 + }, + { + "epoch": 0.45, + "grad_norm": 1.6296766789886576, + "learning_rate": 1.1957365102907562e-05, + "loss": 0.8137, + "step": 4465 + }, + { + "epoch": 0.45, + "grad_norm": 2.0328265489817867, + "learning_rate": 1.1954134267579781e-05, + "loss": 0.8245, + "step": 4466 + }, + { + "epoch": 0.45, + "grad_norm": 1.654234354125284, + "learning_rate": 1.1950903220161286e-05, + "loss": 0.6642, + "step": 4467 + }, + { + "epoch": 0.45, + "grad_norm": 1.5800488068146095, + "learning_rate": 1.1947671961002753e-05, + "loss": 0.7177, + "step": 4468 + }, + { + "epoch": 0.45, + "grad_norm": 1.4788972818110588, + "learning_rate": 1.1944440490454887e-05, + "loss": 0.764, + "step": 4469 + }, + { + "epoch": 0.45, + "grad_norm": 1.7483788792193522, + "learning_rate": 1.1941208808868411e-05, + "loss": 0.7508, + "step": 4470 + }, + { + "epoch": 0.45, + "grad_norm": 1.518687096393544, + "learning_rate": 1.193797691659408e-05, + "loss": 0.8022, + "step": 4471 + }, + { + "epoch": 0.45, + "grad_norm": 1.5851103825987014, + "learning_rate": 1.1934744813982662e-05, + "loss": 0.7285, + "step": 4472 + }, + { + "epoch": 0.45, + "grad_norm": 1.5147747934349225, + "learning_rate": 1.193151250138495e-05, + "loss": 0.6713, + "step": 4473 + }, + { + "epoch": 0.46, + "grad_norm": 1.7252009780168112, + "learning_rate": 1.1928279979151757e-05, + "loss": 0.8059, + "step": 4474 + }, + { + "epoch": 0.46, + "grad_norm": 1.6550933483637722, + "learning_rate": 1.1925047247633931e-05, + "loss": 0.7824, + "step": 4475 + }, + { + "epoch": 0.46, + "grad_norm": 1.537864083464159, + "learning_rate": 1.1921814307182331e-05, + "loss": 0.7198, + "step": 4476 + }, + { + "epoch": 0.46, + "grad_norm": 1.6247694014405787, + "learning_rate": 1.1918581158147842e-05, + "loss": 0.7996, + "step": 4477 + }, + { + "epoch": 0.46, + "grad_norm": 1.5040273170796579, + "learning_rate": 1.1915347800881369e-05, + "loss": 0.7318, + "step": 4478 + }, + { + "epoch": 0.46, + "grad_norm": 1.624008872920982, + "learning_rate": 1.1912114235733846e-05, + "loss": 0.8635, + "step": 4479 + }, + { + "epoch": 0.46, + "grad_norm": 1.6783046655630303, + "learning_rate": 1.1908880463056225e-05, + "loss": 0.7072, + "step": 4480 + }, + { + "epoch": 0.46, + "grad_norm": 1.5987382924783546, + "learning_rate": 1.1905646483199481e-05, + "loss": 0.737, + "step": 4481 + }, + { + "epoch": 0.46, + "grad_norm": 1.4190839686862713, + "learning_rate": 1.1902412296514614e-05, + "loss": 0.6069, + "step": 4482 + }, + { + "epoch": 0.46, + "grad_norm": 1.4717341097697567, + "learning_rate": 1.1899177903352639e-05, + "loss": 0.6212, + "step": 4483 + }, + { + "epoch": 0.46, + "grad_norm": 1.5764180668532193, + "learning_rate": 1.1895943304064606e-05, + "loss": 0.6834, + "step": 4484 + }, + { + "epoch": 0.46, + "grad_norm": 1.545690402929286, + "learning_rate": 1.1892708499001576e-05, + "loss": 0.613, + "step": 4485 + }, + { + "epoch": 0.46, + "grad_norm": 1.5153286028527162, + "learning_rate": 1.1889473488514636e-05, + "loss": 0.7848, + "step": 4486 + }, + { + "epoch": 0.46, + "grad_norm": 1.5809882519403804, + "learning_rate": 1.1886238272954897e-05, + "loss": 0.7388, + "step": 4487 + }, + { + "epoch": 0.46, + "grad_norm": 1.5637085518869727, + "learning_rate": 1.1883002852673489e-05, + "loss": 0.7386, + "step": 4488 + }, + { + "epoch": 0.46, + "grad_norm": 1.522366166224289, + "learning_rate": 1.187976722802157e-05, + "loss": 0.6202, + "step": 4489 + }, + { + "epoch": 0.46, + "grad_norm": 1.6818519644180367, + "learning_rate": 1.1876531399350316e-05, + "loss": 0.7759, + "step": 4490 + }, + { + "epoch": 0.46, + "grad_norm": 1.552904746172006, + "learning_rate": 1.1873295367010923e-05, + "loss": 0.7709, + "step": 4491 + }, + { + "epoch": 0.46, + "grad_norm": 1.4481819506527602, + "learning_rate": 1.1870059131354611e-05, + "loss": 0.7444, + "step": 4492 + }, + { + "epoch": 0.46, + "grad_norm": 1.5800787460222114, + "learning_rate": 1.186682269273263e-05, + "loss": 0.8605, + "step": 4493 + }, + { + "epoch": 0.46, + "grad_norm": 1.6270808444334217, + "learning_rate": 1.1863586051496235e-05, + "loss": 0.6664, + "step": 4494 + }, + { + "epoch": 0.46, + "grad_norm": 1.3324751236800856, + "learning_rate": 1.1860349207996717e-05, + "loss": 0.7568, + "step": 4495 + }, + { + "epoch": 0.46, + "grad_norm": 1.6101269155158606, + "learning_rate": 1.1857112162585387e-05, + "loss": 0.811, + "step": 4496 + }, + { + "epoch": 0.46, + "grad_norm": 1.5018243352019984, + "learning_rate": 1.1853874915613575e-05, + "loss": 0.816, + "step": 4497 + }, + { + "epoch": 0.46, + "grad_norm": 1.6115192399287763, + "learning_rate": 1.1850637467432629e-05, + "loss": 0.7215, + "step": 4498 + }, + { + "epoch": 0.46, + "grad_norm": 1.5961199154311783, + "learning_rate": 1.184739981839393e-05, + "loss": 0.6597, + "step": 4499 + }, + { + "epoch": 0.46, + "grad_norm": 1.577479387062586, + "learning_rate": 1.1844161968848868e-05, + "loss": 0.7565, + "step": 4500 + }, + { + "epoch": 0.46, + "grad_norm": 1.5304523479785352, + "learning_rate": 1.1840923919148863e-05, + "loss": 0.6509, + "step": 4501 + }, + { + "epoch": 0.46, + "grad_norm": 1.318352712290105, + "learning_rate": 1.1837685669645358e-05, + "loss": 0.6931, + "step": 4502 + }, + { + "epoch": 0.46, + "grad_norm": 1.5895914554412998, + "learning_rate": 1.1834447220689813e-05, + "loss": 0.737, + "step": 4503 + }, + { + "epoch": 0.46, + "grad_norm": 1.5971370797130637, + "learning_rate": 1.1831208572633708e-05, + "loss": 0.7396, + "step": 4504 + }, + { + "epoch": 0.46, + "grad_norm": 1.544909155827977, + "learning_rate": 1.182796972582855e-05, + "loss": 0.6391, + "step": 4505 + }, + { + "epoch": 0.46, + "grad_norm": 1.5076713528789134, + "learning_rate": 1.1824730680625862e-05, + "loss": 0.7217, + "step": 4506 + }, + { + "epoch": 0.46, + "grad_norm": 1.6015860072889743, + "learning_rate": 1.1821491437377198e-05, + "loss": 0.7712, + "step": 4507 + }, + { + "epoch": 0.46, + "grad_norm": 1.559399696293891, + "learning_rate": 1.181825199643412e-05, + "loss": 0.682, + "step": 4508 + }, + { + "epoch": 0.46, + "grad_norm": 1.3669347461286494, + "learning_rate": 1.1815012358148223e-05, + "loss": 0.5674, + "step": 4509 + }, + { + "epoch": 0.46, + "grad_norm": 1.7240826706457286, + "learning_rate": 1.1811772522871119e-05, + "loss": 0.7303, + "step": 4510 + }, + { + "epoch": 0.46, + "grad_norm": 1.6573139580065175, + "learning_rate": 1.1808532490954438e-05, + "loss": 0.7783, + "step": 4511 + }, + { + "epoch": 0.46, + "grad_norm": 1.6018607990043154, + "learning_rate": 1.180529226274984e-05, + "loss": 0.8296, + "step": 4512 + }, + { + "epoch": 0.46, + "grad_norm": 1.6779980575994735, + "learning_rate": 1.1802051838609e-05, + "loss": 0.7266, + "step": 4513 + }, + { + "epoch": 0.46, + "grad_norm": 1.4748993331800264, + "learning_rate": 1.1798811218883613e-05, + "loss": 0.7311, + "step": 4514 + }, + { + "epoch": 0.46, + "grad_norm": 1.6332739658424689, + "learning_rate": 1.1795570403925397e-05, + "loss": 0.6996, + "step": 4515 + }, + { + "epoch": 0.46, + "grad_norm": 1.520362224377642, + "learning_rate": 1.1792329394086094e-05, + "loss": 0.6282, + "step": 4516 + }, + { + "epoch": 0.46, + "grad_norm": 1.3075697117316751, + "learning_rate": 1.1789088189717463e-05, + "loss": 0.7129, + "step": 4517 + }, + { + "epoch": 0.46, + "grad_norm": 1.6161533444931127, + "learning_rate": 1.178584679117129e-05, + "loss": 0.6684, + "step": 4518 + }, + { + "epoch": 0.46, + "grad_norm": 1.5433187962870167, + "learning_rate": 1.1782605198799371e-05, + "loss": 0.721, + "step": 4519 + }, + { + "epoch": 0.46, + "grad_norm": 1.4472065593881311, + "learning_rate": 1.1779363412953537e-05, + "loss": 0.806, + "step": 4520 + }, + { + "epoch": 0.46, + "grad_norm": 1.4993530836356177, + "learning_rate": 1.177612143398563e-05, + "loss": 0.7325, + "step": 4521 + }, + { + "epoch": 0.46, + "grad_norm": 1.7396800214504256, + "learning_rate": 1.1772879262247519e-05, + "loss": 0.7748, + "step": 4522 + }, + { + "epoch": 0.46, + "grad_norm": 1.7334103979363331, + "learning_rate": 1.1769636898091089e-05, + "loss": 0.6951, + "step": 4523 + }, + { + "epoch": 0.46, + "grad_norm": 1.5340003104618167, + "learning_rate": 1.1766394341868242e-05, + "loss": 0.6365, + "step": 4524 + }, + { + "epoch": 0.46, + "grad_norm": 1.5822547649750762, + "learning_rate": 1.1763151593930918e-05, + "loss": 0.7985, + "step": 4525 + }, + { + "epoch": 0.46, + "grad_norm": 1.4388485063230003, + "learning_rate": 1.1759908654631057e-05, + "loss": 0.7047, + "step": 4526 + }, + { + "epoch": 0.46, + "grad_norm": 1.6219002551817887, + "learning_rate": 1.1756665524320638e-05, + "loss": 0.6781, + "step": 4527 + }, + { + "epoch": 0.46, + "grad_norm": 1.5640124456440354, + "learning_rate": 1.1753422203351643e-05, + "loss": 0.7798, + "step": 4528 + }, + { + "epoch": 0.46, + "grad_norm": 1.3788120020476304, + "learning_rate": 1.1750178692076092e-05, + "loss": 0.7017, + "step": 4529 + }, + { + "epoch": 0.46, + "grad_norm": 1.7685053143291274, + "learning_rate": 1.1746934990846013e-05, + "loss": 0.7724, + "step": 4530 + }, + { + "epoch": 0.46, + "grad_norm": 1.590333764801969, + "learning_rate": 1.1743691100013459e-05, + "loss": 0.7066, + "step": 4531 + }, + { + "epoch": 0.46, + "grad_norm": 1.556934087076597, + "learning_rate": 1.1740447019930506e-05, + "loss": 0.686, + "step": 4532 + }, + { + "epoch": 0.46, + "grad_norm": 1.4977064506551216, + "learning_rate": 1.1737202750949244e-05, + "loss": 0.7134, + "step": 4533 + }, + { + "epoch": 0.46, + "grad_norm": 1.5293106757310524, + "learning_rate": 1.173395829342179e-05, + "loss": 0.6573, + "step": 4534 + }, + { + "epoch": 0.46, + "grad_norm": 1.4865247698778525, + "learning_rate": 1.1730713647700282e-05, + "loss": 0.6099, + "step": 4535 + }, + { + "epoch": 0.46, + "grad_norm": 1.5621144654474877, + "learning_rate": 1.1727468814136871e-05, + "loss": 0.7107, + "step": 4536 + }, + { + "epoch": 0.46, + "grad_norm": 1.9113889775095725, + "learning_rate": 1.1724223793083733e-05, + "loss": 0.81, + "step": 4537 + }, + { + "epoch": 0.46, + "grad_norm": 1.3475550233522957, + "learning_rate": 1.172097858489307e-05, + "loss": 0.6461, + "step": 4538 + }, + { + "epoch": 0.46, + "grad_norm": 1.6564171669585754, + "learning_rate": 1.1717733189917092e-05, + "loss": 0.722, + "step": 4539 + }, + { + "epoch": 0.46, + "grad_norm": 1.6240145152483296, + "learning_rate": 1.1714487608508043e-05, + "loss": 0.699, + "step": 4540 + }, + { + "epoch": 0.46, + "grad_norm": 1.7102134687250472, + "learning_rate": 1.1711241841018173e-05, + "loss": 0.6937, + "step": 4541 + }, + { + "epoch": 0.46, + "grad_norm": 1.6969571762175641, + "learning_rate": 1.1707995887799759e-05, + "loss": 0.7627, + "step": 4542 + }, + { + "epoch": 0.46, + "grad_norm": 1.7822040158464596, + "learning_rate": 1.1704749749205105e-05, + "loss": 0.8299, + "step": 4543 + }, + { + "epoch": 0.46, + "grad_norm": 1.5846248801060048, + "learning_rate": 1.1701503425586522e-05, + "loss": 0.8152, + "step": 4544 + }, + { + "epoch": 0.46, + "grad_norm": 1.458804753241937, + "learning_rate": 1.1698256917296354e-05, + "loss": 0.7306, + "step": 4545 + }, + { + "epoch": 0.46, + "grad_norm": 1.509308899737421, + "learning_rate": 1.1695010224686951e-05, + "loss": 0.7077, + "step": 4546 + }, + { + "epoch": 0.46, + "grad_norm": 1.5690076410012654, + "learning_rate": 1.1691763348110698e-05, + "loss": 0.7536, + "step": 4547 + }, + { + "epoch": 0.46, + "grad_norm": 1.6779952948032286, + "learning_rate": 1.168851628791999e-05, + "loss": 0.7712, + "step": 4548 + }, + { + "epoch": 0.46, + "grad_norm": 1.7875031142218436, + "learning_rate": 1.1685269044467244e-05, + "loss": 0.8445, + "step": 4549 + }, + { + "epoch": 0.46, + "grad_norm": 1.4464393868159005, + "learning_rate": 1.16820216181049e-05, + "loss": 0.6798, + "step": 4550 + }, + { + "epoch": 0.46, + "grad_norm": 1.5728041249582039, + "learning_rate": 1.167877400918541e-05, + "loss": 0.6674, + "step": 4551 + }, + { + "epoch": 0.46, + "grad_norm": 1.9005455827057687, + "learning_rate": 1.1675526218061254e-05, + "loss": 0.7243, + "step": 4552 + }, + { + "epoch": 0.46, + "grad_norm": 1.5418841385192381, + "learning_rate": 1.1672278245084931e-05, + "loss": 0.6592, + "step": 4553 + }, + { + "epoch": 0.46, + "grad_norm": 1.495491500895745, + "learning_rate": 1.1669030090608957e-05, + "loss": 0.6856, + "step": 4554 + }, + { + "epoch": 0.46, + "grad_norm": 1.6104076342307279, + "learning_rate": 1.1665781754985867e-05, + "loss": 0.6973, + "step": 4555 + }, + { + "epoch": 0.46, + "grad_norm": 1.5121867638957396, + "learning_rate": 1.1662533238568214e-05, + "loss": 0.7295, + "step": 4556 + }, + { + "epoch": 0.46, + "grad_norm": 1.5661736979533423, + "learning_rate": 1.1659284541708581e-05, + "loss": 0.7195, + "step": 4557 + }, + { + "epoch": 0.46, + "grad_norm": 1.454697725959841, + "learning_rate": 1.165603566475956e-05, + "loss": 0.6999, + "step": 4558 + }, + { + "epoch": 0.46, + "grad_norm": 1.4835668066546128, + "learning_rate": 1.1652786608073763e-05, + "loss": 0.7271, + "step": 4559 + }, + { + "epoch": 0.46, + "grad_norm": 1.3864305015981893, + "learning_rate": 1.1649537372003825e-05, + "loss": 0.6977, + "step": 4560 + }, + { + "epoch": 0.46, + "grad_norm": 1.5254952489151536, + "learning_rate": 1.1646287956902402e-05, + "loss": 0.6152, + "step": 4561 + }, + { + "epoch": 0.46, + "grad_norm": 1.6481920827138161, + "learning_rate": 1.1643038363122168e-05, + "loss": 0.7651, + "step": 4562 + }, + { + "epoch": 0.46, + "grad_norm": 1.4195624179129456, + "learning_rate": 1.1639788591015815e-05, + "loss": 0.6878, + "step": 4563 + }, + { + "epoch": 0.46, + "grad_norm": 1.6036746756352342, + "learning_rate": 1.1636538640936052e-05, + "loss": 0.7982, + "step": 4564 + }, + { + "epoch": 0.46, + "grad_norm": 1.5207564745319613, + "learning_rate": 1.163328851323561e-05, + "loss": 0.7535, + "step": 4565 + }, + { + "epoch": 0.46, + "grad_norm": 1.5653334921119917, + "learning_rate": 1.1630038208267244e-05, + "loss": 0.7109, + "step": 4566 + }, + { + "epoch": 0.46, + "grad_norm": 1.5478786441455192, + "learning_rate": 1.1626787726383722e-05, + "loss": 0.6924, + "step": 4567 + }, + { + "epoch": 0.46, + "grad_norm": 1.5811167412057043, + "learning_rate": 1.1623537067937832e-05, + "loss": 0.766, + "step": 4568 + }, + { + "epoch": 0.46, + "grad_norm": 1.5202293492541648, + "learning_rate": 1.162028623328238e-05, + "loss": 0.709, + "step": 4569 + }, + { + "epoch": 0.46, + "grad_norm": 1.538960478366557, + "learning_rate": 1.1617035222770199e-05, + "loss": 0.7687, + "step": 4570 + }, + { + "epoch": 0.46, + "grad_norm": 1.4244896266657237, + "learning_rate": 1.1613784036754132e-05, + "loss": 0.6506, + "step": 4571 + }, + { + "epoch": 0.47, + "grad_norm": 1.5037212119134407, + "learning_rate": 1.1610532675587042e-05, + "loss": 0.6561, + "step": 4572 + }, + { + "epoch": 0.47, + "grad_norm": 1.5084780381210663, + "learning_rate": 1.160728113962182e-05, + "loss": 0.6705, + "step": 4573 + }, + { + "epoch": 0.47, + "grad_norm": 1.7686645441932545, + "learning_rate": 1.1604029429211361e-05, + "loss": 0.7333, + "step": 4574 + }, + { + "epoch": 0.47, + "grad_norm": 1.4847152737520022, + "learning_rate": 1.1600777544708595e-05, + "loss": 0.7147, + "step": 4575 + }, + { + "epoch": 0.47, + "grad_norm": 1.550495765299229, + "learning_rate": 1.1597525486466458e-05, + "loss": 0.68, + "step": 4576 + }, + { + "epoch": 0.47, + "grad_norm": 1.5757475793774263, + "learning_rate": 1.1594273254837914e-05, + "loss": 0.7538, + "step": 4577 + }, + { + "epoch": 0.47, + "grad_norm": 1.5793316860033737, + "learning_rate": 1.1591020850175935e-05, + "loss": 0.7194, + "step": 4578 + }, + { + "epoch": 0.47, + "grad_norm": 1.5312094094225086, + "learning_rate": 1.1587768272833529e-05, + "loss": 0.795, + "step": 4579 + }, + { + "epoch": 0.47, + "grad_norm": 1.6037209281036378, + "learning_rate": 1.1584515523163705e-05, + "loss": 0.6588, + "step": 4580 + }, + { + "epoch": 0.47, + "grad_norm": 1.4918056952223486, + "learning_rate": 1.15812626015195e-05, + "loss": 0.6309, + "step": 4581 + }, + { + "epoch": 0.47, + "grad_norm": 1.605269547265067, + "learning_rate": 1.1578009508253966e-05, + "loss": 0.7243, + "step": 4582 + }, + { + "epoch": 0.47, + "grad_norm": 1.4666143394088873, + "learning_rate": 1.157475624372018e-05, + "loss": 0.6769, + "step": 4583 + }, + { + "epoch": 0.47, + "grad_norm": 1.6405211541922342, + "learning_rate": 1.1571502808271229e-05, + "loss": 0.7228, + "step": 4584 + }, + { + "epoch": 0.47, + "grad_norm": 1.6232228814453615, + "learning_rate": 1.1568249202260224e-05, + "loss": 0.8883, + "step": 4585 + }, + { + "epoch": 0.47, + "grad_norm": 1.5683314189351627, + "learning_rate": 1.1564995426040293e-05, + "loss": 0.672, + "step": 4586 + }, + { + "epoch": 0.47, + "grad_norm": 1.548145977674433, + "learning_rate": 1.1561741479964579e-05, + "loss": 0.6571, + "step": 4587 + }, + { + "epoch": 0.47, + "grad_norm": 1.6133717388005364, + "learning_rate": 1.1558487364386255e-05, + "loss": 0.7443, + "step": 4588 + }, + { + "epoch": 0.47, + "grad_norm": 1.5584829486616945, + "learning_rate": 1.1555233079658496e-05, + "loss": 0.7495, + "step": 4589 + }, + { + "epoch": 0.47, + "grad_norm": 1.5753794669064949, + "learning_rate": 1.155197862613451e-05, + "loss": 0.7855, + "step": 4590 + }, + { + "epoch": 0.47, + "grad_norm": 1.4472244902213225, + "learning_rate": 1.1548724004167513e-05, + "loss": 0.7718, + "step": 4591 + }, + { + "epoch": 0.47, + "grad_norm": 1.5367327064558058, + "learning_rate": 1.1545469214110745e-05, + "loss": 0.8578, + "step": 4592 + }, + { + "epoch": 0.47, + "grad_norm": 1.5334893194368162, + "learning_rate": 1.1542214256317461e-05, + "loss": 0.83, + "step": 4593 + }, + { + "epoch": 0.47, + "grad_norm": 1.5910044241350005, + "learning_rate": 1.153895913114094e-05, + "loss": 0.6814, + "step": 4594 + }, + { + "epoch": 0.47, + "grad_norm": 1.6807368860123768, + "learning_rate": 1.153570383893447e-05, + "loss": 0.733, + "step": 4595 + }, + { + "epoch": 0.47, + "grad_norm": 1.5584840199996832, + "learning_rate": 1.1532448380051362e-05, + "loss": 0.6921, + "step": 4596 + }, + { + "epoch": 0.47, + "grad_norm": 1.5458643754720789, + "learning_rate": 1.152919275484495e-05, + "loss": 0.7061, + "step": 4597 + }, + { + "epoch": 0.47, + "grad_norm": 1.7702393765678885, + "learning_rate": 1.1525936963668579e-05, + "loss": 0.7618, + "step": 4598 + }, + { + "epoch": 0.47, + "grad_norm": 1.4375965231055472, + "learning_rate": 1.1522681006875614e-05, + "loss": 0.6703, + "step": 4599 + }, + { + "epoch": 0.47, + "grad_norm": 1.697375977900443, + "learning_rate": 1.1519424884819432e-05, + "loss": 0.7234, + "step": 4600 + }, + { + "epoch": 0.47, + "grad_norm": 1.4384659116884588, + "learning_rate": 1.1516168597853446e-05, + "loss": 0.7087, + "step": 4601 + }, + { + "epoch": 0.47, + "grad_norm": 1.495867458271716, + "learning_rate": 1.1512912146331066e-05, + "loss": 0.6975, + "step": 4602 + }, + { + "epoch": 0.47, + "grad_norm": 1.5537088310355118, + "learning_rate": 1.1509655530605732e-05, + "loss": 0.8397, + "step": 4603 + }, + { + "epoch": 0.47, + "grad_norm": 1.374743892023546, + "learning_rate": 1.1506398751030899e-05, + "loss": 0.6131, + "step": 4604 + }, + { + "epoch": 0.47, + "grad_norm": 1.677575822578111, + "learning_rate": 1.150314180796004e-05, + "loss": 0.6829, + "step": 4605 + }, + { + "epoch": 0.47, + "grad_norm": 1.4445203506416073, + "learning_rate": 1.1499884701746642e-05, + "loss": 0.798, + "step": 4606 + }, + { + "epoch": 0.47, + "grad_norm": 1.712464858100849, + "learning_rate": 1.1496627432744216e-05, + "loss": 0.7282, + "step": 4607 + }, + { + "epoch": 0.47, + "grad_norm": 1.5985773032288149, + "learning_rate": 1.1493370001306288e-05, + "loss": 0.7807, + "step": 4608 + }, + { + "epoch": 0.47, + "grad_norm": 1.486347003648851, + "learning_rate": 1.14901124077864e-05, + "loss": 0.7325, + "step": 4609 + }, + { + "epoch": 0.47, + "grad_norm": 1.5134227711225614, + "learning_rate": 1.148685465253811e-05, + "loss": 0.6873, + "step": 4610 + }, + { + "epoch": 0.47, + "grad_norm": 1.4965204314771694, + "learning_rate": 1.1483596735915003e-05, + "loss": 0.6983, + "step": 4611 + }, + { + "epoch": 0.47, + "grad_norm": 1.6605822099109793, + "learning_rate": 1.148033865827067e-05, + "loss": 0.6704, + "step": 4612 + }, + { + "epoch": 0.47, + "grad_norm": 1.2937035044904677, + "learning_rate": 1.1477080419958726e-05, + "loss": 0.6363, + "step": 4613 + }, + { + "epoch": 0.47, + "grad_norm": 1.8160972538634474, + "learning_rate": 1.14738220213328e-05, + "loss": 0.7374, + "step": 4614 + }, + { + "epoch": 0.47, + "grad_norm": 1.4711744385294088, + "learning_rate": 1.1470563462746542e-05, + "loss": 0.7394, + "step": 4615 + }, + { + "epoch": 0.47, + "grad_norm": 1.8100258979621198, + "learning_rate": 1.1467304744553618e-05, + "loss": 0.7842, + "step": 4616 + }, + { + "epoch": 0.47, + "grad_norm": 1.4886306908892508, + "learning_rate": 1.1464045867107712e-05, + "loss": 0.7585, + "step": 4617 + }, + { + "epoch": 0.47, + "grad_norm": 1.4741574652866123, + "learning_rate": 1.1460786830762519e-05, + "loss": 0.6208, + "step": 4618 + }, + { + "epoch": 0.47, + "grad_norm": 1.5586042387688461, + "learning_rate": 1.1457527635871759e-05, + "loss": 0.6356, + "step": 4619 + }, + { + "epoch": 0.47, + "grad_norm": 1.6055025072357336, + "learning_rate": 1.145426828278917e-05, + "loss": 0.8323, + "step": 4620 + }, + { + "epoch": 0.47, + "grad_norm": 1.6099059419948964, + "learning_rate": 1.14510087718685e-05, + "loss": 0.7571, + "step": 4621 + }, + { + "epoch": 0.47, + "grad_norm": 1.6203016711208587, + "learning_rate": 1.1447749103463519e-05, + "loss": 0.7162, + "step": 4622 + }, + { + "epoch": 0.47, + "grad_norm": 1.7047316755504125, + "learning_rate": 1.144448927792801e-05, + "loss": 0.8016, + "step": 4623 + }, + { + "epoch": 0.47, + "grad_norm": 1.5676650614066094, + "learning_rate": 1.1441229295615784e-05, + "loss": 0.7296, + "step": 4624 + }, + { + "epoch": 0.47, + "grad_norm": 1.622912975230112, + "learning_rate": 1.1437969156880656e-05, + "loss": 0.764, + "step": 4625 + }, + { + "epoch": 0.47, + "grad_norm": 1.4111662466560349, + "learning_rate": 1.143470886207646e-05, + "loss": 0.6159, + "step": 4626 + }, + { + "epoch": 0.47, + "grad_norm": 1.5863421762962944, + "learning_rate": 1.1431448411557056e-05, + "loss": 0.7665, + "step": 4627 + }, + { + "epoch": 0.47, + "grad_norm": 1.6377625649613345, + "learning_rate": 1.1428187805676309e-05, + "loss": 0.7062, + "step": 4628 + }, + { + "epoch": 0.47, + "grad_norm": 1.4509682195293687, + "learning_rate": 1.1424927044788114e-05, + "loss": 0.6903, + "step": 4629 + }, + { + "epoch": 0.47, + "grad_norm": 1.5453389143146479, + "learning_rate": 1.1421666129246372e-05, + "loss": 0.7763, + "step": 4630 + }, + { + "epoch": 0.47, + "grad_norm": 1.483135906834292, + "learning_rate": 1.1418405059405002e-05, + "loss": 0.797, + "step": 4631 + }, + { + "epoch": 0.47, + "grad_norm": 1.7483319893470177, + "learning_rate": 1.1415143835617943e-05, + "loss": 0.8703, + "step": 4632 + }, + { + "epoch": 0.47, + "grad_norm": 1.5270226220355356, + "learning_rate": 1.1411882458239152e-05, + "loss": 0.7648, + "step": 4633 + }, + { + "epoch": 0.47, + "grad_norm": 1.6431099516940701, + "learning_rate": 1.14086209276226e-05, + "loss": 0.7689, + "step": 4634 + }, + { + "epoch": 0.47, + "grad_norm": 1.4282775689777298, + "learning_rate": 1.1405359244122275e-05, + "loss": 0.6884, + "step": 4635 + }, + { + "epoch": 0.47, + "grad_norm": 1.6107562592794664, + "learning_rate": 1.1402097408092184e-05, + "loss": 0.7794, + "step": 4636 + }, + { + "epoch": 0.47, + "grad_norm": 1.7817765520103868, + "learning_rate": 1.1398835419886339e-05, + "loss": 0.7781, + "step": 4637 + }, + { + "epoch": 0.47, + "grad_norm": 1.7683257244565183, + "learning_rate": 1.1395573279858789e-05, + "loss": 0.7045, + "step": 4638 + }, + { + "epoch": 0.47, + "grad_norm": 1.6521730801992396, + "learning_rate": 1.1392310988363584e-05, + "loss": 0.75, + "step": 4639 + }, + { + "epoch": 0.47, + "grad_norm": 1.521210187770528, + "learning_rate": 1.1389048545754794e-05, + "loss": 0.7056, + "step": 4640 + }, + { + "epoch": 0.47, + "grad_norm": 1.5150310415725645, + "learning_rate": 1.1385785952386503e-05, + "loss": 0.7595, + "step": 4641 + }, + { + "epoch": 0.47, + "grad_norm": 1.3661564856473303, + "learning_rate": 1.1382523208612823e-05, + "loss": 0.7069, + "step": 4642 + }, + { + "epoch": 0.47, + "grad_norm": 1.6708812769448553, + "learning_rate": 1.1379260314787867e-05, + "loss": 0.7898, + "step": 4643 + }, + { + "epoch": 0.47, + "grad_norm": 1.6704991799723237, + "learning_rate": 1.1375997271265775e-05, + "loss": 0.77, + "step": 4644 + }, + { + "epoch": 0.47, + "grad_norm": 1.5237140033724268, + "learning_rate": 1.1372734078400695e-05, + "loss": 0.7653, + "step": 4645 + }, + { + "epoch": 0.47, + "grad_norm": 1.5858634760779247, + "learning_rate": 1.1369470736546798e-05, + "loss": 0.8662, + "step": 4646 + }, + { + "epoch": 0.47, + "grad_norm": 1.4565672294570617, + "learning_rate": 1.1366207246058269e-05, + "loss": 0.7507, + "step": 4647 + }, + { + "epoch": 0.47, + "grad_norm": 1.4991843578593989, + "learning_rate": 1.136294360728931e-05, + "loss": 0.7551, + "step": 4648 + }, + { + "epoch": 0.47, + "grad_norm": 1.5417235142162893, + "learning_rate": 1.1359679820594137e-05, + "loss": 0.6552, + "step": 4649 + }, + { + "epoch": 0.47, + "grad_norm": 1.548589989786535, + "learning_rate": 1.135641588632698e-05, + "loss": 0.611, + "step": 4650 + }, + { + "epoch": 0.47, + "grad_norm": 1.4696288433018039, + "learning_rate": 1.1353151804842088e-05, + "loss": 0.7258, + "step": 4651 + }, + { + "epoch": 0.47, + "grad_norm": 1.4592926096276786, + "learning_rate": 1.1349887576493735e-05, + "loss": 0.6607, + "step": 4652 + }, + { + "epoch": 0.47, + "grad_norm": 1.6058655036855474, + "learning_rate": 1.1346623201636192e-05, + "loss": 0.7338, + "step": 4653 + }, + { + "epoch": 0.47, + "grad_norm": 1.580983795741087, + "learning_rate": 1.1343358680623757e-05, + "loss": 0.8141, + "step": 4654 + }, + { + "epoch": 0.47, + "grad_norm": 1.3749699175665049, + "learning_rate": 1.1340094013810749e-05, + "loss": 0.6731, + "step": 4655 + }, + { + "epoch": 0.47, + "grad_norm": 1.6096032470138852, + "learning_rate": 1.1336829201551492e-05, + "loss": 0.7535, + "step": 4656 + }, + { + "epoch": 0.47, + "grad_norm": 1.6262893926298707, + "learning_rate": 1.1333564244200328e-05, + "loss": 0.7377, + "step": 4657 + }, + { + "epoch": 0.47, + "grad_norm": 1.5410648418608643, + "learning_rate": 1.1330299142111623e-05, + "loss": 0.739, + "step": 4658 + }, + { + "epoch": 0.47, + "grad_norm": 1.5485513987128794, + "learning_rate": 1.1327033895639747e-05, + "loss": 0.8029, + "step": 4659 + }, + { + "epoch": 0.47, + "grad_norm": 1.410059273776502, + "learning_rate": 1.1323768505139095e-05, + "loss": 0.7232, + "step": 4660 + }, + { + "epoch": 0.47, + "grad_norm": 1.5624646887742166, + "learning_rate": 1.1320502970964076e-05, + "loss": 0.7231, + "step": 4661 + }, + { + "epoch": 0.47, + "grad_norm": 1.5372926149461579, + "learning_rate": 1.1317237293469108e-05, + "loss": 0.7501, + "step": 4662 + }, + { + "epoch": 0.47, + "grad_norm": 1.5681818631952515, + "learning_rate": 1.1313971473008628e-05, + "loss": 0.7218, + "step": 4663 + }, + { + "epoch": 0.47, + "grad_norm": 1.482830029646937, + "learning_rate": 1.1310705509937096e-05, + "loss": 0.7101, + "step": 4664 + }, + { + "epoch": 0.47, + "grad_norm": 1.562764113175667, + "learning_rate": 1.1307439404608978e-05, + "loss": 0.7555, + "step": 4665 + }, + { + "epoch": 0.47, + "grad_norm": 1.4911542811063758, + "learning_rate": 1.130417315737876e-05, + "loss": 0.6715, + "step": 4666 + }, + { + "epoch": 0.47, + "grad_norm": 1.5254406078226133, + "learning_rate": 1.1300906768600939e-05, + "loss": 0.6447, + "step": 4667 + }, + { + "epoch": 0.47, + "grad_norm": 1.4471085397289045, + "learning_rate": 1.129764023863003e-05, + "loss": 0.7193, + "step": 4668 + }, + { + "epoch": 0.47, + "grad_norm": 1.5608343873438544, + "learning_rate": 1.1294373567820568e-05, + "loss": 0.661, + "step": 4669 + }, + { + "epoch": 0.48, + "grad_norm": 1.4437985820957768, + "learning_rate": 1.12911067565271e-05, + "loss": 0.624, + "step": 4670 + }, + { + "epoch": 0.48, + "grad_norm": 1.4246038270133796, + "learning_rate": 1.1287839805104182e-05, + "loss": 0.691, + "step": 4671 + }, + { + "epoch": 0.48, + "grad_norm": 1.4721353611911032, + "learning_rate": 1.1284572713906392e-05, + "loss": 0.7062, + "step": 4672 + }, + { + "epoch": 0.48, + "grad_norm": 1.7658949422577648, + "learning_rate": 1.1281305483288321e-05, + "loss": 0.7626, + "step": 4673 + }, + { + "epoch": 0.48, + "grad_norm": 1.504533207614064, + "learning_rate": 1.1278038113604579e-05, + "loss": 0.7198, + "step": 4674 + }, + { + "epoch": 0.48, + "grad_norm": 1.5931060163297304, + "learning_rate": 1.1274770605209786e-05, + "loss": 0.8055, + "step": 4675 + }, + { + "epoch": 0.48, + "grad_norm": 1.5542388836057903, + "learning_rate": 1.1271502958458577e-05, + "loss": 0.7602, + "step": 4676 + }, + { + "epoch": 0.48, + "grad_norm": 1.575193549763372, + "learning_rate": 1.1268235173705608e-05, + "loss": 0.8088, + "step": 4677 + }, + { + "epoch": 0.48, + "grad_norm": 1.4411760933441318, + "learning_rate": 1.1264967251305542e-05, + "loss": 0.6863, + "step": 4678 + }, + { + "epoch": 0.48, + "grad_norm": 1.6648858012128047, + "learning_rate": 1.1261699191613067e-05, + "loss": 0.7772, + "step": 4679 + }, + { + "epoch": 0.48, + "grad_norm": 1.7199672360791733, + "learning_rate": 1.1258430994982872e-05, + "loss": 0.8195, + "step": 4680 + }, + { + "epoch": 0.48, + "grad_norm": 1.4568557243100269, + "learning_rate": 1.1255162661769674e-05, + "loss": 0.6591, + "step": 4681 + }, + { + "epoch": 0.48, + "grad_norm": 1.520795016619549, + "learning_rate": 1.1251894192328194e-05, + "loss": 0.7553, + "step": 4682 + }, + { + "epoch": 0.48, + "grad_norm": 1.6015181193109884, + "learning_rate": 1.124862558701318e-05, + "loss": 0.7741, + "step": 4683 + }, + { + "epoch": 0.48, + "grad_norm": 1.4447539094155426, + "learning_rate": 1.1245356846179384e-05, + "loss": 0.7089, + "step": 4684 + }, + { + "epoch": 0.48, + "grad_norm": 1.4584951955485221, + "learning_rate": 1.1242087970181578e-05, + "loss": 0.7395, + "step": 4685 + }, + { + "epoch": 0.48, + "grad_norm": 1.4461603196919004, + "learning_rate": 1.1238818959374547e-05, + "loss": 0.6715, + "step": 4686 + }, + { + "epoch": 0.48, + "grad_norm": 1.7311798026421528, + "learning_rate": 1.1235549814113092e-05, + "loss": 0.8515, + "step": 4687 + }, + { + "epoch": 0.48, + "grad_norm": 1.4048144176782515, + "learning_rate": 1.1232280534752025e-05, + "loss": 0.5922, + "step": 4688 + }, + { + "epoch": 0.48, + "grad_norm": 1.5481973447074155, + "learning_rate": 1.1229011121646176e-05, + "loss": 0.7715, + "step": 4689 + }, + { + "epoch": 0.48, + "grad_norm": 1.6958381976979935, + "learning_rate": 1.1225741575150391e-05, + "loss": 0.7143, + "step": 4690 + }, + { + "epoch": 0.48, + "grad_norm": 1.4660906832812384, + "learning_rate": 1.1222471895619525e-05, + "loss": 0.7651, + "step": 4691 + }, + { + "epoch": 0.48, + "grad_norm": 1.514977911309017, + "learning_rate": 1.1219202083408454e-05, + "loss": 0.7018, + "step": 4692 + }, + { + "epoch": 0.48, + "grad_norm": 1.7532823984411445, + "learning_rate": 1.1215932138872061e-05, + "loss": 0.7323, + "step": 4693 + }, + { + "epoch": 0.48, + "grad_norm": 1.4676790184219108, + "learning_rate": 1.1212662062365252e-05, + "loss": 0.7031, + "step": 4694 + }, + { + "epoch": 0.48, + "grad_norm": 1.6054557795695394, + "learning_rate": 1.1209391854242938e-05, + "loss": 0.8304, + "step": 4695 + }, + { + "epoch": 0.48, + "grad_norm": 1.5991701867348644, + "learning_rate": 1.120612151486005e-05, + "loss": 0.6987, + "step": 4696 + }, + { + "epoch": 0.48, + "grad_norm": 1.5170179933755257, + "learning_rate": 1.1202851044571533e-05, + "loss": 0.6792, + "step": 4697 + }, + { + "epoch": 0.48, + "grad_norm": 1.7157827956741485, + "learning_rate": 1.1199580443732347e-05, + "loss": 0.7554, + "step": 4698 + }, + { + "epoch": 0.48, + "grad_norm": 1.4753493754003022, + "learning_rate": 1.1196309712697463e-05, + "loss": 0.6668, + "step": 4699 + }, + { + "epoch": 0.48, + "grad_norm": 1.4924475104111545, + "learning_rate": 1.1193038851821867e-05, + "loss": 0.7118, + "step": 4700 + }, + { + "epoch": 0.48, + "grad_norm": 1.5430843931734841, + "learning_rate": 1.1189767861460563e-05, + "loss": 0.7068, + "step": 4701 + }, + { + "epoch": 0.48, + "grad_norm": 1.5076245973794509, + "learning_rate": 1.1186496741968562e-05, + "loss": 0.6831, + "step": 4702 + }, + { + "epoch": 0.48, + "grad_norm": 1.7370968200999768, + "learning_rate": 1.1183225493700895e-05, + "loss": 0.7394, + "step": 4703 + }, + { + "epoch": 0.48, + "grad_norm": 1.800654570183203, + "learning_rate": 1.1179954117012603e-05, + "loss": 0.7211, + "step": 4704 + }, + { + "epoch": 0.48, + "grad_norm": 1.5017002098978107, + "learning_rate": 1.1176682612258745e-05, + "loss": 0.7532, + "step": 4705 + }, + { + "epoch": 0.48, + "grad_norm": 1.6617996193529947, + "learning_rate": 1.117341097979439e-05, + "loss": 0.7171, + "step": 4706 + }, + { + "epoch": 0.48, + "grad_norm": 1.7063874397456549, + "learning_rate": 1.1170139219974626e-05, + "loss": 0.7891, + "step": 4707 + }, + { + "epoch": 0.48, + "grad_norm": 1.6636920434588713, + "learning_rate": 1.1166867333154543e-05, + "loss": 0.7299, + "step": 4708 + }, + { + "epoch": 0.48, + "grad_norm": 1.5654818487601652, + "learning_rate": 1.1163595319689264e-05, + "loss": 0.6421, + "step": 4709 + }, + { + "epoch": 0.48, + "grad_norm": 1.4218850872715825, + "learning_rate": 1.1160323179933908e-05, + "loss": 0.6249, + "step": 4710 + }, + { + "epoch": 0.48, + "grad_norm": 1.5576364310269057, + "learning_rate": 1.1157050914243614e-05, + "loss": 0.7484, + "step": 4711 + }, + { + "epoch": 0.48, + "grad_norm": 1.6310208232754702, + "learning_rate": 1.115377852297354e-05, + "loss": 0.7097, + "step": 4712 + }, + { + "epoch": 0.48, + "grad_norm": 1.6373612010247085, + "learning_rate": 1.1150506006478849e-05, + "loss": 0.7567, + "step": 4713 + }, + { + "epoch": 0.48, + "grad_norm": 1.764893872771686, + "learning_rate": 1.1147233365114725e-05, + "loss": 0.7486, + "step": 4714 + }, + { + "epoch": 0.48, + "grad_norm": 1.4628182364711966, + "learning_rate": 1.1143960599236358e-05, + "loss": 0.6277, + "step": 4715 + }, + { + "epoch": 0.48, + "grad_norm": 1.7369395077416692, + "learning_rate": 1.1140687709198958e-05, + "loss": 0.7288, + "step": 4716 + }, + { + "epoch": 0.48, + "grad_norm": 1.635015653974785, + "learning_rate": 1.1137414695357747e-05, + "loss": 0.7613, + "step": 4717 + }, + { + "epoch": 0.48, + "grad_norm": 1.5503660871383618, + "learning_rate": 1.1134141558067955e-05, + "loss": 0.7288, + "step": 4718 + }, + { + "epoch": 0.48, + "grad_norm": 1.5538599617254092, + "learning_rate": 1.1130868297684833e-05, + "loss": 0.7645, + "step": 4719 + }, + { + "epoch": 0.48, + "grad_norm": 1.511799580256174, + "learning_rate": 1.1127594914563647e-05, + "loss": 0.8259, + "step": 4720 + }, + { + "epoch": 0.48, + "grad_norm": 1.557814325422162, + "learning_rate": 1.1124321409059661e-05, + "loss": 0.7098, + "step": 4721 + }, + { + "epoch": 0.48, + "grad_norm": 1.5253434446450445, + "learning_rate": 1.1121047781528174e-05, + "loss": 0.7664, + "step": 4722 + }, + { + "epoch": 0.48, + "grad_norm": 1.7265330902888985, + "learning_rate": 1.1117774032324476e-05, + "loss": 0.7295, + "step": 4723 + }, + { + "epoch": 0.48, + "grad_norm": 1.5518678435153568, + "learning_rate": 1.111450016180389e-05, + "loss": 0.7724, + "step": 4724 + }, + { + "epoch": 0.48, + "grad_norm": 1.5664907417938099, + "learning_rate": 1.1111226170321738e-05, + "loss": 0.7906, + "step": 4725 + }, + { + "epoch": 0.48, + "grad_norm": 1.5130135727104113, + "learning_rate": 1.1107952058233367e-05, + "loss": 0.766, + "step": 4726 + }, + { + "epoch": 0.48, + "grad_norm": 2.2548589391419385, + "learning_rate": 1.1104677825894121e-05, + "loss": 0.7856, + "step": 4727 + }, + { + "epoch": 0.48, + "grad_norm": 1.4289814666356178, + "learning_rate": 1.1101403473659376e-05, + "loss": 0.7476, + "step": 4728 + }, + { + "epoch": 0.48, + "grad_norm": 1.6747946340533135, + "learning_rate": 1.1098129001884508e-05, + "loss": 0.8052, + "step": 4729 + }, + { + "epoch": 0.48, + "grad_norm": 1.5331014347391687, + "learning_rate": 1.1094854410924909e-05, + "loss": 0.8508, + "step": 4730 + }, + { + "epoch": 0.48, + "grad_norm": 1.8579945737436387, + "learning_rate": 1.1091579701135986e-05, + "loss": 0.7624, + "step": 4731 + }, + { + "epoch": 0.48, + "grad_norm": 1.566768370211104, + "learning_rate": 1.1088304872873152e-05, + "loss": 0.7611, + "step": 4732 + }, + { + "epoch": 0.48, + "grad_norm": 1.9112271223739234, + "learning_rate": 1.1085029926491848e-05, + "loss": 0.8016, + "step": 4733 + }, + { + "epoch": 0.48, + "grad_norm": 1.507987571004608, + "learning_rate": 1.1081754862347513e-05, + "loss": 0.7083, + "step": 4734 + }, + { + "epoch": 0.48, + "grad_norm": 1.4228665977224153, + "learning_rate": 1.1078479680795604e-05, + "loss": 0.7893, + "step": 4735 + }, + { + "epoch": 0.48, + "grad_norm": 1.6040397098394448, + "learning_rate": 1.107520438219159e-05, + "loss": 0.7328, + "step": 4736 + }, + { + "epoch": 0.48, + "grad_norm": 1.5392882633332718, + "learning_rate": 1.1071928966890957e-05, + "loss": 0.781, + "step": 4737 + }, + { + "epoch": 0.48, + "grad_norm": 1.666983043219811, + "learning_rate": 1.1068653435249197e-05, + "loss": 0.7516, + "step": 4738 + }, + { + "epoch": 0.48, + "grad_norm": 1.6269538017114316, + "learning_rate": 1.1065377787621819e-05, + "loss": 0.7179, + "step": 4739 + }, + { + "epoch": 0.48, + "grad_norm": 1.714058517254383, + "learning_rate": 1.1062102024364341e-05, + "loss": 0.786, + "step": 4740 + }, + { + "epoch": 0.48, + "grad_norm": 1.6659851540381478, + "learning_rate": 1.1058826145832298e-05, + "loss": 0.8467, + "step": 4741 + }, + { + "epoch": 0.48, + "grad_norm": 1.6685789483122337, + "learning_rate": 1.1055550152381235e-05, + "loss": 0.7775, + "step": 4742 + }, + { + "epoch": 0.48, + "grad_norm": 1.6167305758786248, + "learning_rate": 1.1052274044366711e-05, + "loss": 0.7274, + "step": 4743 + }, + { + "epoch": 0.48, + "grad_norm": 1.4746836673435093, + "learning_rate": 1.1048997822144296e-05, + "loss": 0.8207, + "step": 4744 + }, + { + "epoch": 0.48, + "grad_norm": 1.633586522725332, + "learning_rate": 1.1045721486069568e-05, + "loss": 0.7326, + "step": 4745 + }, + { + "epoch": 0.48, + "grad_norm": 1.4809266073703506, + "learning_rate": 1.1042445036498129e-05, + "loss": 0.7137, + "step": 4746 + }, + { + "epoch": 0.48, + "grad_norm": 1.5195680421319833, + "learning_rate": 1.1039168473785584e-05, + "loss": 0.7911, + "step": 4747 + }, + { + "epoch": 0.48, + "grad_norm": 1.6137137131166324, + "learning_rate": 1.103589179828755e-05, + "loss": 0.7207, + "step": 4748 + }, + { + "epoch": 0.48, + "grad_norm": 1.4873281881742182, + "learning_rate": 1.1032615010359661e-05, + "loss": 0.672, + "step": 4749 + }, + { + "epoch": 0.48, + "grad_norm": 1.5131039557796713, + "learning_rate": 1.1029338110357561e-05, + "loss": 0.5865, + "step": 4750 + }, + { + "epoch": 0.48, + "grad_norm": 1.4201459792639004, + "learning_rate": 1.1026061098636906e-05, + "loss": 0.6513, + "step": 4751 + }, + { + "epoch": 0.48, + "grad_norm": 1.486349912355769, + "learning_rate": 1.1022783975553366e-05, + "loss": 0.6578, + "step": 4752 + }, + { + "epoch": 0.48, + "grad_norm": 1.5176489180640584, + "learning_rate": 1.101950674146262e-05, + "loss": 0.6687, + "step": 4753 + }, + { + "epoch": 0.48, + "grad_norm": 1.6306745452948275, + "learning_rate": 1.101622939672036e-05, + "loss": 0.6677, + "step": 4754 + }, + { + "epoch": 0.48, + "grad_norm": 1.78403339360376, + "learning_rate": 1.1012951941682291e-05, + "loss": 0.747, + "step": 4755 + }, + { + "epoch": 0.48, + "grad_norm": 1.5074978240557007, + "learning_rate": 1.1009674376704128e-05, + "loss": 0.751, + "step": 4756 + }, + { + "epoch": 0.48, + "grad_norm": 1.539491194858523, + "learning_rate": 1.1006396702141605e-05, + "loss": 0.7455, + "step": 4757 + }, + { + "epoch": 0.48, + "grad_norm": 1.7450620289002257, + "learning_rate": 1.1003118918350456e-05, + "loss": 0.7322, + "step": 4758 + }, + { + "epoch": 0.48, + "grad_norm": 1.5733533171495808, + "learning_rate": 1.0999841025686431e-05, + "loss": 0.5972, + "step": 4759 + }, + { + "epoch": 0.48, + "grad_norm": 1.5719877764421808, + "learning_rate": 1.0996563024505303e-05, + "loss": 0.7433, + "step": 4760 + }, + { + "epoch": 0.48, + "grad_norm": 1.3363325039311442, + "learning_rate": 1.0993284915162843e-05, + "loss": 0.6251, + "step": 4761 + }, + { + "epoch": 0.48, + "grad_norm": 1.4101295343750289, + "learning_rate": 1.0990006698014837e-05, + "loss": 0.6571, + "step": 4762 + }, + { + "epoch": 0.48, + "grad_norm": 1.3324377019912343, + "learning_rate": 1.0986728373417088e-05, + "loss": 0.5947, + "step": 4763 + }, + { + "epoch": 0.48, + "grad_norm": 1.6333620766873393, + "learning_rate": 1.0983449941725404e-05, + "loss": 0.7579, + "step": 4764 + }, + { + "epoch": 0.48, + "grad_norm": 1.6989820977183576, + "learning_rate": 1.098017140329561e-05, + "loss": 0.6614, + "step": 4765 + }, + { + "epoch": 0.48, + "grad_norm": 1.5800683448962918, + "learning_rate": 1.0976892758483533e-05, + "loss": 0.6856, + "step": 4766 + }, + { + "epoch": 0.48, + "grad_norm": 1.5916909083774833, + "learning_rate": 1.0973614007645027e-05, + "loss": 0.7061, + "step": 4767 + }, + { + "epoch": 0.48, + "grad_norm": 1.4914111133780676, + "learning_rate": 1.097033515113595e-05, + "loss": 0.6408, + "step": 4768 + }, + { + "epoch": 0.49, + "grad_norm": 1.657441182352292, + "learning_rate": 1.096705618931216e-05, + "loss": 0.6737, + "step": 4769 + }, + { + "epoch": 0.49, + "grad_norm": 1.6302751638129835, + "learning_rate": 1.0963777122529547e-05, + "loss": 0.6715, + "step": 4770 + }, + { + "epoch": 0.49, + "grad_norm": 1.5698054269632449, + "learning_rate": 1.0960497951144001e-05, + "loss": 0.8629, + "step": 4771 + }, + { + "epoch": 0.49, + "grad_norm": 1.5812627655597422, + "learning_rate": 1.0957218675511418e-05, + "loss": 0.7924, + "step": 4772 + }, + { + "epoch": 0.49, + "grad_norm": 1.6161084006625803, + "learning_rate": 1.0953939295987722e-05, + "loss": 0.7627, + "step": 4773 + }, + { + "epoch": 0.49, + "grad_norm": 1.6299841409594045, + "learning_rate": 1.095065981292883e-05, + "loss": 0.6798, + "step": 4774 + }, + { + "epoch": 0.49, + "grad_norm": 1.588289422502915, + "learning_rate": 1.0947380226690686e-05, + "loss": 0.6632, + "step": 4775 + }, + { + "epoch": 0.49, + "grad_norm": 1.7083918017260773, + "learning_rate": 1.0944100537629229e-05, + "loss": 0.7163, + "step": 4776 + }, + { + "epoch": 0.49, + "grad_norm": 1.4988568694539077, + "learning_rate": 1.0940820746100425e-05, + "loss": 0.758, + "step": 4777 + }, + { + "epoch": 0.49, + "grad_norm": 1.4764122582404338, + "learning_rate": 1.0937540852460242e-05, + "loss": 0.7082, + "step": 4778 + }, + { + "epoch": 0.49, + "grad_norm": 1.4720694527200804, + "learning_rate": 1.093426085706466e-05, + "loss": 0.7218, + "step": 4779 + }, + { + "epoch": 0.49, + "grad_norm": 1.5462059060882711, + "learning_rate": 1.0930980760269673e-05, + "loss": 0.7314, + "step": 4780 + }, + { + "epoch": 0.49, + "grad_norm": 1.4346137380421373, + "learning_rate": 1.0927700562431283e-05, + "loss": 0.695, + "step": 4781 + }, + { + "epoch": 0.49, + "grad_norm": 1.5832976035826196, + "learning_rate": 1.0924420263905505e-05, + "loss": 0.7294, + "step": 4782 + }, + { + "epoch": 0.49, + "grad_norm": 1.557316426835322, + "learning_rate": 1.0921139865048362e-05, + "loss": 0.7435, + "step": 4783 + }, + { + "epoch": 0.49, + "grad_norm": 1.4448128642118385, + "learning_rate": 1.0917859366215895e-05, + "loss": 0.7079, + "step": 4784 + }, + { + "epoch": 0.49, + "grad_norm": 1.5853405092750676, + "learning_rate": 1.0914578767764145e-05, + "loss": 0.7592, + "step": 4785 + }, + { + "epoch": 0.49, + "grad_norm": 1.4668078137037874, + "learning_rate": 1.0911298070049172e-05, + "loss": 0.7091, + "step": 4786 + }, + { + "epoch": 0.49, + "grad_norm": 1.4979091711758297, + "learning_rate": 1.0908017273427044e-05, + "loss": 0.7898, + "step": 4787 + }, + { + "epoch": 0.49, + "grad_norm": 1.4492199846891123, + "learning_rate": 1.0904736378253844e-05, + "loss": 0.6762, + "step": 4788 + }, + { + "epoch": 0.49, + "grad_norm": 1.4721752479841992, + "learning_rate": 1.0901455384885658e-05, + "loss": 0.6579, + "step": 4789 + }, + { + "epoch": 0.49, + "grad_norm": 1.428659963881077, + "learning_rate": 1.0898174293678583e-05, + "loss": 0.6705, + "step": 4790 + }, + { + "epoch": 0.49, + "grad_norm": 1.6378135027870262, + "learning_rate": 1.0894893104988738e-05, + "loss": 0.7132, + "step": 4791 + }, + { + "epoch": 0.49, + "grad_norm": 1.556497741824382, + "learning_rate": 1.089161181917224e-05, + "loss": 0.7206, + "step": 4792 + }, + { + "epoch": 0.49, + "grad_norm": 1.7230568439952965, + "learning_rate": 1.0888330436585224e-05, + "loss": 0.6613, + "step": 4793 + }, + { + "epoch": 0.49, + "grad_norm": 1.612851513748318, + "learning_rate": 1.088504895758383e-05, + "loss": 0.7709, + "step": 4794 + }, + { + "epoch": 0.49, + "grad_norm": 1.4659086390595735, + "learning_rate": 1.088176738252421e-05, + "loss": 0.6839, + "step": 4795 + }, + { + "epoch": 0.49, + "grad_norm": 1.9422400062295213, + "learning_rate": 1.0878485711762533e-05, + "loss": 0.8199, + "step": 4796 + }, + { + "epoch": 0.49, + "grad_norm": 1.5666072973905, + "learning_rate": 1.0875203945654969e-05, + "loss": 0.758, + "step": 4797 + }, + { + "epoch": 0.49, + "grad_norm": 1.4006583596790678, + "learning_rate": 1.0871922084557702e-05, + "loss": 0.6683, + "step": 4798 + }, + { + "epoch": 0.49, + "grad_norm": 1.5592372404936563, + "learning_rate": 1.0868640128826929e-05, + "loss": 0.7184, + "step": 4799 + }, + { + "epoch": 0.49, + "grad_norm": 1.392316596997383, + "learning_rate": 1.0865358078818855e-05, + "loss": 0.6242, + "step": 4800 + }, + { + "epoch": 0.49, + "grad_norm": 1.4557220760954837, + "learning_rate": 1.0862075934889694e-05, + "loss": 0.6217, + "step": 4801 + }, + { + "epoch": 0.49, + "grad_norm": 1.6556912744299246, + "learning_rate": 1.085879369739567e-05, + "loss": 0.6765, + "step": 4802 + }, + { + "epoch": 0.49, + "grad_norm": 1.5322411610345907, + "learning_rate": 1.085551136669302e-05, + "loss": 0.6711, + "step": 4803 + }, + { + "epoch": 0.49, + "grad_norm": 1.5823221318504805, + "learning_rate": 1.0852228943137992e-05, + "loss": 0.703, + "step": 4804 + }, + { + "epoch": 0.49, + "grad_norm": 1.5657096226939102, + "learning_rate": 1.0848946427086839e-05, + "loss": 0.7833, + "step": 4805 + }, + { + "epoch": 0.49, + "grad_norm": 1.452125827239233, + "learning_rate": 1.0845663818895826e-05, + "loss": 0.5953, + "step": 4806 + }, + { + "epoch": 0.49, + "grad_norm": 1.4984899358281312, + "learning_rate": 1.0842381118921233e-05, + "loss": 0.6292, + "step": 4807 + }, + { + "epoch": 0.49, + "grad_norm": 1.705995215029523, + "learning_rate": 1.083909832751934e-05, + "loss": 0.7315, + "step": 4808 + }, + { + "epoch": 0.49, + "grad_norm": 1.6352719630144974, + "learning_rate": 1.0835815445046447e-05, + "loss": 0.7375, + "step": 4809 + }, + { + "epoch": 0.49, + "grad_norm": 1.6271663906870726, + "learning_rate": 1.0832532471858857e-05, + "loss": 0.8057, + "step": 4810 + }, + { + "epoch": 0.49, + "grad_norm": 1.4259725059873003, + "learning_rate": 1.0829249408312888e-05, + "loss": 0.6011, + "step": 4811 + }, + { + "epoch": 0.49, + "grad_norm": 1.5507813456773818, + "learning_rate": 1.0825966254764864e-05, + "loss": 0.7963, + "step": 4812 + }, + { + "epoch": 0.49, + "grad_norm": 2.4161423181330814, + "learning_rate": 1.0822683011571118e-05, + "loss": 0.7603, + "step": 4813 + }, + { + "epoch": 0.49, + "grad_norm": 1.7237254932378825, + "learning_rate": 1.0819399679087997e-05, + "loss": 0.7259, + "step": 4814 + }, + { + "epoch": 0.49, + "grad_norm": 1.672925153363037, + "learning_rate": 1.0816116257671856e-05, + "loss": 0.7901, + "step": 4815 + }, + { + "epoch": 0.49, + "grad_norm": 1.6042468592684607, + "learning_rate": 1.0812832747679054e-05, + "loss": 0.6788, + "step": 4816 + }, + { + "epoch": 0.49, + "grad_norm": 1.496471070682988, + "learning_rate": 1.0809549149465971e-05, + "loss": 0.7245, + "step": 4817 + }, + { + "epoch": 0.49, + "grad_norm": 1.6801939660509613, + "learning_rate": 1.0806265463388989e-05, + "loss": 0.6754, + "step": 4818 + }, + { + "epoch": 0.49, + "grad_norm": 1.7168952904835155, + "learning_rate": 1.0802981689804499e-05, + "loss": 0.7401, + "step": 4819 + }, + { + "epoch": 0.49, + "grad_norm": 1.5862059151970802, + "learning_rate": 1.0799697829068903e-05, + "loss": 0.8646, + "step": 4820 + }, + { + "epoch": 0.49, + "grad_norm": 1.504679180024265, + "learning_rate": 1.0796413881538614e-05, + "loss": 0.7113, + "step": 4821 + }, + { + "epoch": 0.49, + "grad_norm": 1.5559146837534314, + "learning_rate": 1.0793129847570052e-05, + "loss": 0.6912, + "step": 4822 + }, + { + "epoch": 0.49, + "grad_norm": 1.4792854093478405, + "learning_rate": 1.0789845727519647e-05, + "loss": 0.6655, + "step": 4823 + }, + { + "epoch": 0.49, + "grad_norm": 1.3621064320761462, + "learning_rate": 1.0786561521743843e-05, + "loss": 0.7083, + "step": 4824 + }, + { + "epoch": 0.49, + "grad_norm": 1.5724233675494086, + "learning_rate": 1.0783277230599088e-05, + "loss": 0.6914, + "step": 4825 + }, + { + "epoch": 0.49, + "grad_norm": 1.6449096777501475, + "learning_rate": 1.0779992854441833e-05, + "loss": 0.7233, + "step": 4826 + }, + { + "epoch": 0.49, + "grad_norm": 1.5395166521358816, + "learning_rate": 1.0776708393628555e-05, + "loss": 0.7435, + "step": 4827 + }, + { + "epoch": 0.49, + "grad_norm": 1.5538379810180913, + "learning_rate": 1.0773423848515728e-05, + "loss": 0.6461, + "step": 4828 + }, + { + "epoch": 0.49, + "grad_norm": 1.7966878586338726, + "learning_rate": 1.0770139219459834e-05, + "loss": 0.7655, + "step": 4829 + }, + { + "epoch": 0.49, + "grad_norm": 1.6554512917828683, + "learning_rate": 1.0766854506817374e-05, + "loss": 0.7496, + "step": 4830 + }, + { + "epoch": 0.49, + "grad_norm": 1.5582376751493443, + "learning_rate": 1.0763569710944848e-05, + "loss": 0.7463, + "step": 4831 + }, + { + "epoch": 0.49, + "grad_norm": 1.9045453022232466, + "learning_rate": 1.076028483219877e-05, + "loss": 0.7948, + "step": 4832 + }, + { + "epoch": 0.49, + "grad_norm": 1.6515591767481628, + "learning_rate": 1.0756999870935668e-05, + "loss": 0.8395, + "step": 4833 + }, + { + "epoch": 0.49, + "grad_norm": 1.4945025057050394, + "learning_rate": 1.0753714827512063e-05, + "loss": 0.7287, + "step": 4834 + }, + { + "epoch": 0.49, + "grad_norm": 1.586248043237468, + "learning_rate": 1.0750429702284499e-05, + "loss": 0.7354, + "step": 4835 + }, + { + "epoch": 0.49, + "grad_norm": 1.567076394776895, + "learning_rate": 1.0747144495609531e-05, + "loss": 0.7669, + "step": 4836 + }, + { + "epoch": 0.49, + "grad_norm": 1.6765578455669068, + "learning_rate": 1.074385920784371e-05, + "loss": 0.7558, + "step": 4837 + }, + { + "epoch": 0.49, + "grad_norm": 1.512168260010115, + "learning_rate": 1.0740573839343607e-05, + "loss": 0.829, + "step": 4838 + }, + { + "epoch": 0.49, + "grad_norm": 1.4896477079673143, + "learning_rate": 1.0737288390465792e-05, + "loss": 0.6725, + "step": 4839 + }, + { + "epoch": 0.49, + "grad_norm": 1.4702480151410497, + "learning_rate": 1.073400286156685e-05, + "loss": 0.7041, + "step": 4840 + }, + { + "epoch": 0.49, + "grad_norm": 1.6569811631390055, + "learning_rate": 1.073071725300338e-05, + "loss": 0.7568, + "step": 4841 + }, + { + "epoch": 0.49, + "grad_norm": 1.6490024318489755, + "learning_rate": 1.0727431565131978e-05, + "loss": 0.7976, + "step": 4842 + }, + { + "epoch": 0.49, + "grad_norm": 1.5752260202004427, + "learning_rate": 1.0724145798309258e-05, + "loss": 0.714, + "step": 4843 + }, + { + "epoch": 0.49, + "grad_norm": 1.5449459528099991, + "learning_rate": 1.0720859952891834e-05, + "loss": 0.7015, + "step": 4844 + }, + { + "epoch": 0.49, + "grad_norm": 1.6503976761089003, + "learning_rate": 1.0717574029236333e-05, + "loss": 0.8448, + "step": 4845 + }, + { + "epoch": 0.49, + "grad_norm": 1.612022678078976, + "learning_rate": 1.0714288027699392e-05, + "loss": 0.7325, + "step": 4846 + }, + { + "epoch": 0.49, + "grad_norm": 1.6225715275901382, + "learning_rate": 1.071100194863766e-05, + "loss": 0.7276, + "step": 4847 + }, + { + "epoch": 0.49, + "grad_norm": 1.476340849753411, + "learning_rate": 1.0707715792407784e-05, + "loss": 0.726, + "step": 4848 + }, + { + "epoch": 0.49, + "grad_norm": 1.4837181460190332, + "learning_rate": 1.0704429559366424e-05, + "loss": 0.6483, + "step": 4849 + }, + { + "epoch": 0.49, + "grad_norm": 1.5986402635920125, + "learning_rate": 1.0701143249870253e-05, + "loss": 0.6338, + "step": 4850 + }, + { + "epoch": 0.49, + "grad_norm": 1.7729055343245756, + "learning_rate": 1.0697856864275949e-05, + "loss": 0.8581, + "step": 4851 + }, + { + "epoch": 0.49, + "grad_norm": 1.4999791982867494, + "learning_rate": 1.0694570402940192e-05, + "loss": 0.756, + "step": 4852 + }, + { + "epoch": 0.49, + "grad_norm": 1.4845152414841658, + "learning_rate": 1.0691283866219682e-05, + "loss": 0.7603, + "step": 4853 + }, + { + "epoch": 0.49, + "grad_norm": 1.5932376783084876, + "learning_rate": 1.0687997254471117e-05, + "loss": 0.6744, + "step": 4854 + }, + { + "epoch": 0.49, + "grad_norm": 1.685232234665782, + "learning_rate": 1.0684710568051211e-05, + "loss": 0.734, + "step": 4855 + }, + { + "epoch": 0.49, + "grad_norm": 1.7060391227861584, + "learning_rate": 1.068142380731668e-05, + "loss": 0.764, + "step": 4856 + }, + { + "epoch": 0.49, + "grad_norm": 1.53478915495137, + "learning_rate": 1.0678136972624249e-05, + "loss": 0.7804, + "step": 4857 + }, + { + "epoch": 0.49, + "grad_norm": 1.5942102451563043, + "learning_rate": 1.0674850064330655e-05, + "loss": 0.7312, + "step": 4858 + }, + { + "epoch": 0.49, + "grad_norm": 1.5305582484928222, + "learning_rate": 1.0671563082792643e-05, + "loss": 0.624, + "step": 4859 + }, + { + "epoch": 0.49, + "grad_norm": 1.7287157430425841, + "learning_rate": 1.0668276028366957e-05, + "loss": 0.7495, + "step": 4860 + }, + { + "epoch": 0.49, + "grad_norm": 1.5015860213975631, + "learning_rate": 1.0664988901410361e-05, + "loss": 0.6169, + "step": 4861 + }, + { + "epoch": 0.49, + "grad_norm": 1.5295651950629567, + "learning_rate": 1.0661701702279619e-05, + "loss": 0.7288, + "step": 4862 + }, + { + "epoch": 0.49, + "grad_norm": 1.3833428272326505, + "learning_rate": 1.0658414431331502e-05, + "loss": 0.7572, + "step": 4863 + }, + { + "epoch": 0.49, + "grad_norm": 1.8452863333294058, + "learning_rate": 1.0655127088922799e-05, + "loss": 0.7759, + "step": 4864 + }, + { + "epoch": 0.49, + "grad_norm": 1.5262039228128124, + "learning_rate": 1.0651839675410295e-05, + "loss": 0.7951, + "step": 4865 + }, + { + "epoch": 0.49, + "grad_norm": 1.5133758309522023, + "learning_rate": 1.0648552191150784e-05, + "loss": 0.5965, + "step": 4866 + }, + { + "epoch": 0.5, + "grad_norm": 1.473671675400503, + "learning_rate": 1.0645264636501078e-05, + "loss": 0.7426, + "step": 4867 + }, + { + "epoch": 0.5, + "grad_norm": 1.5734654354890787, + "learning_rate": 1.0641977011817986e-05, + "loss": 0.7293, + "step": 4868 + }, + { + "epoch": 0.5, + "grad_norm": 1.5238583307710616, + "learning_rate": 1.0638689317458328e-05, + "loss": 0.667, + "step": 4869 + }, + { + "epoch": 0.5, + "grad_norm": 1.5674873789425725, + "learning_rate": 1.0635401553778934e-05, + "loss": 0.7048, + "step": 4870 + }, + { + "epoch": 0.5, + "grad_norm": 1.476556899148976, + "learning_rate": 1.0632113721136636e-05, + "loss": 0.693, + "step": 4871 + }, + { + "epoch": 0.5, + "grad_norm": 1.5318947628197657, + "learning_rate": 1.0628825819888281e-05, + "loss": 0.7025, + "step": 4872 + }, + { + "epoch": 0.5, + "grad_norm": 1.317050099437974, + "learning_rate": 1.0625537850390718e-05, + "loss": 0.7165, + "step": 4873 + }, + { + "epoch": 0.5, + "grad_norm": 1.591593406876283, + "learning_rate": 1.0622249813000799e-05, + "loss": 0.6269, + "step": 4874 + }, + { + "epoch": 0.5, + "grad_norm": 1.521353505316856, + "learning_rate": 1.0618961708075398e-05, + "loss": 0.6831, + "step": 4875 + }, + { + "epoch": 0.5, + "grad_norm": 1.6774101897798614, + "learning_rate": 1.061567353597138e-05, + "loss": 0.8003, + "step": 4876 + }, + { + "epoch": 0.5, + "grad_norm": 1.5712174893359296, + "learning_rate": 1.061238529704563e-05, + "loss": 0.7023, + "step": 4877 + }, + { + "epoch": 0.5, + "grad_norm": 1.5083574288941524, + "learning_rate": 1.060909699165503e-05, + "loss": 0.6789, + "step": 4878 + }, + { + "epoch": 0.5, + "grad_norm": 1.7912885880405833, + "learning_rate": 1.060580862015648e-05, + "loss": 0.7922, + "step": 4879 + }, + { + "epoch": 0.5, + "grad_norm": 1.4282792013974326, + "learning_rate": 1.0602520182906877e-05, + "loss": 0.6505, + "step": 4880 + }, + { + "epoch": 0.5, + "grad_norm": 1.579379700276147, + "learning_rate": 1.0599231680263127e-05, + "loss": 0.688, + "step": 4881 + }, + { + "epoch": 0.5, + "grad_norm": 1.4453470872146301, + "learning_rate": 1.0595943112582152e-05, + "loss": 0.6834, + "step": 4882 + }, + { + "epoch": 0.5, + "grad_norm": 1.5607007989926776, + "learning_rate": 1.0592654480220874e-05, + "loss": 0.5767, + "step": 4883 + }, + { + "epoch": 0.5, + "grad_norm": 1.5797955334370553, + "learning_rate": 1.0589365783536218e-05, + "loss": 0.6384, + "step": 4884 + }, + { + "epoch": 0.5, + "grad_norm": 1.5695344367475006, + "learning_rate": 1.0586077022885122e-05, + "loss": 0.7588, + "step": 4885 + }, + { + "epoch": 0.5, + "grad_norm": 1.6856965266457415, + "learning_rate": 1.0582788198624532e-05, + "loss": 0.7736, + "step": 4886 + }, + { + "epoch": 0.5, + "grad_norm": 1.5489583302375258, + "learning_rate": 1.0579499311111397e-05, + "loss": 0.7299, + "step": 4887 + }, + { + "epoch": 0.5, + "grad_norm": 1.5112208970054677, + "learning_rate": 1.0576210360702673e-05, + "loss": 0.7585, + "step": 4888 + }, + { + "epoch": 0.5, + "grad_norm": 1.630106670056698, + "learning_rate": 1.0572921347755325e-05, + "loss": 0.6544, + "step": 4889 + }, + { + "epoch": 0.5, + "grad_norm": 1.4713329795106127, + "learning_rate": 1.0569632272626323e-05, + "loss": 0.677, + "step": 4890 + }, + { + "epoch": 0.5, + "grad_norm": 1.5021811281953732, + "learning_rate": 1.0566343135672647e-05, + "loss": 0.7436, + "step": 4891 + }, + { + "epoch": 0.5, + "grad_norm": 1.6665198712378295, + "learning_rate": 1.0563053937251282e-05, + "loss": 0.7593, + "step": 4892 + }, + { + "epoch": 0.5, + "grad_norm": 1.5413742120634135, + "learning_rate": 1.0559764677719218e-05, + "loss": 0.731, + "step": 4893 + }, + { + "epoch": 0.5, + "grad_norm": 1.694579223239616, + "learning_rate": 1.0556475357433446e-05, + "loss": 0.6751, + "step": 4894 + }, + { + "epoch": 0.5, + "grad_norm": 1.66881771693106, + "learning_rate": 1.0553185976750981e-05, + "loss": 0.7052, + "step": 4895 + }, + { + "epoch": 0.5, + "grad_norm": 1.570220760256492, + "learning_rate": 1.054989653602883e-05, + "loss": 0.633, + "step": 4896 + }, + { + "epoch": 0.5, + "grad_norm": 1.6084551011215464, + "learning_rate": 1.0546607035624008e-05, + "loss": 0.842, + "step": 4897 + }, + { + "epoch": 0.5, + "grad_norm": 1.5322621024852836, + "learning_rate": 1.0543317475893541e-05, + "loss": 0.7698, + "step": 4898 + }, + { + "epoch": 0.5, + "grad_norm": 1.6600600782491488, + "learning_rate": 1.0540027857194457e-05, + "loss": 0.7044, + "step": 4899 + }, + { + "epoch": 0.5, + "grad_norm": 1.381638149166171, + "learning_rate": 1.0536738179883795e-05, + "loss": 0.6612, + "step": 4900 + }, + { + "epoch": 0.5, + "grad_norm": 1.419555536907255, + "learning_rate": 1.0533448444318598e-05, + "loss": 0.6967, + "step": 4901 + }, + { + "epoch": 0.5, + "grad_norm": 1.6555120053495183, + "learning_rate": 1.0530158650855913e-05, + "loss": 0.7314, + "step": 4902 + }, + { + "epoch": 0.5, + "grad_norm": 1.6046414041018124, + "learning_rate": 1.0526868799852797e-05, + "loss": 0.744, + "step": 4903 + }, + { + "epoch": 0.5, + "grad_norm": 1.7339299401857255, + "learning_rate": 1.0523578891666313e-05, + "loss": 0.7227, + "step": 4904 + }, + { + "epoch": 0.5, + "grad_norm": 1.5164595375565013, + "learning_rate": 1.0520288926653528e-05, + "loss": 0.7927, + "step": 4905 + }, + { + "epoch": 0.5, + "grad_norm": 1.5925848482022713, + "learning_rate": 1.0516998905171516e-05, + "loss": 0.6613, + "step": 4906 + }, + { + "epoch": 0.5, + "grad_norm": 1.614041586169326, + "learning_rate": 1.051370882757736e-05, + "loss": 0.7167, + "step": 4907 + }, + { + "epoch": 0.5, + "grad_norm": 1.690770155753182, + "learning_rate": 1.0510418694228139e-05, + "loss": 0.7274, + "step": 4908 + }, + { + "epoch": 0.5, + "grad_norm": 1.558572494279657, + "learning_rate": 1.0507128505480956e-05, + "loss": 0.6946, + "step": 4909 + }, + { + "epoch": 0.5, + "grad_norm": 1.64031812904918, + "learning_rate": 1.0503838261692902e-05, + "loss": 0.7701, + "step": 4910 + }, + { + "epoch": 0.5, + "grad_norm": 1.461769504781515, + "learning_rate": 1.0500547963221086e-05, + "loss": 0.6864, + "step": 4911 + }, + { + "epoch": 0.5, + "grad_norm": 1.5102708091137018, + "learning_rate": 1.0497257610422614e-05, + "loss": 0.6578, + "step": 4912 + }, + { + "epoch": 0.5, + "grad_norm": 1.863032025828687, + "learning_rate": 1.0493967203654604e-05, + "loss": 0.682, + "step": 4913 + }, + { + "epoch": 0.5, + "grad_norm": 1.383540654360992, + "learning_rate": 1.0490676743274181e-05, + "loss": 0.7154, + "step": 4914 + }, + { + "epoch": 0.5, + "grad_norm": 1.5303593280040118, + "learning_rate": 1.0487386229638473e-05, + "loss": 0.7482, + "step": 4915 + }, + { + "epoch": 0.5, + "grad_norm": 1.4621120752135677, + "learning_rate": 1.0484095663104608e-05, + "loss": 0.6405, + "step": 4916 + }, + { + "epoch": 0.5, + "grad_norm": 1.604206772487237, + "learning_rate": 1.048080504402973e-05, + "loss": 0.6942, + "step": 4917 + }, + { + "epoch": 0.5, + "grad_norm": 1.6209202519114927, + "learning_rate": 1.0477514372770983e-05, + "loss": 0.7303, + "step": 4918 + }, + { + "epoch": 0.5, + "grad_norm": 1.5004633451450609, + "learning_rate": 1.0474223649685517e-05, + "loss": 0.67, + "step": 4919 + }, + { + "epoch": 0.5, + "grad_norm": 1.6091999763739429, + "learning_rate": 1.0470932875130493e-05, + "loss": 0.6157, + "step": 4920 + }, + { + "epoch": 0.5, + "grad_norm": 1.6686612577304816, + "learning_rate": 1.0467642049463068e-05, + "loss": 0.7565, + "step": 4921 + }, + { + "epoch": 0.5, + "grad_norm": 1.6312257099474892, + "learning_rate": 1.0464351173040412e-05, + "loss": 0.7045, + "step": 4922 + }, + { + "epoch": 0.5, + "grad_norm": 1.7194543341860458, + "learning_rate": 1.0461060246219699e-05, + "loss": 0.7014, + "step": 4923 + }, + { + "epoch": 0.5, + "grad_norm": 1.477713446193529, + "learning_rate": 1.0457769269358104e-05, + "loss": 0.661, + "step": 4924 + }, + { + "epoch": 0.5, + "grad_norm": 1.5176551779973013, + "learning_rate": 1.0454478242812814e-05, + "loss": 0.7093, + "step": 4925 + }, + { + "epoch": 0.5, + "grad_norm": 1.6337582845395682, + "learning_rate": 1.0451187166941019e-05, + "loss": 0.7892, + "step": 4926 + }, + { + "epoch": 0.5, + "grad_norm": 1.5818579907899044, + "learning_rate": 1.0447896042099914e-05, + "loss": 0.7644, + "step": 4927 + }, + { + "epoch": 0.5, + "grad_norm": 1.5478429707052999, + "learning_rate": 1.0444604868646695e-05, + "loss": 0.8387, + "step": 4928 + }, + { + "epoch": 0.5, + "grad_norm": 1.5116126932431422, + "learning_rate": 1.0441313646938571e-05, + "loss": 0.7394, + "step": 4929 + }, + { + "epoch": 0.5, + "grad_norm": 1.3437106808936985, + "learning_rate": 1.0438022377332755e-05, + "loss": 0.5898, + "step": 4930 + }, + { + "epoch": 0.5, + "grad_norm": 1.5724323804816993, + "learning_rate": 1.0434731060186456e-05, + "loss": 0.7116, + "step": 4931 + }, + { + "epoch": 0.5, + "grad_norm": 1.4391075832989195, + "learning_rate": 1.0431439695856902e-05, + "loss": 0.7365, + "step": 4932 + }, + { + "epoch": 0.5, + "grad_norm": 1.7101197878193377, + "learning_rate": 1.0428148284701313e-05, + "loss": 0.7036, + "step": 4933 + }, + { + "epoch": 0.5, + "grad_norm": 1.5331147672575676, + "learning_rate": 1.0424856827076926e-05, + "loss": 0.7613, + "step": 4934 + }, + { + "epoch": 0.5, + "grad_norm": 1.5142965480147195, + "learning_rate": 1.0421565323340971e-05, + "loss": 0.7013, + "step": 4935 + }, + { + "epoch": 0.5, + "grad_norm": 1.5435928907819016, + "learning_rate": 1.0418273773850696e-05, + "loss": 0.7334, + "step": 4936 + }, + { + "epoch": 0.5, + "grad_norm": 1.603917057655441, + "learning_rate": 1.0414982178963344e-05, + "loss": 0.6197, + "step": 4937 + }, + { + "epoch": 0.5, + "grad_norm": 1.6710661824730904, + "learning_rate": 1.0411690539036165e-05, + "loss": 0.7766, + "step": 4938 + }, + { + "epoch": 0.5, + "grad_norm": 1.796042940499753, + "learning_rate": 1.0408398854426413e-05, + "loss": 0.779, + "step": 4939 + }, + { + "epoch": 0.5, + "grad_norm": 1.5307692075330168, + "learning_rate": 1.0405107125491359e-05, + "loss": 0.7377, + "step": 4940 + }, + { + "epoch": 0.5, + "grad_norm": 1.7343403838596432, + "learning_rate": 1.0401815352588256e-05, + "loss": 0.6619, + "step": 4941 + }, + { + "epoch": 0.5, + "grad_norm": 1.6659604279911036, + "learning_rate": 1.0398523536074383e-05, + "loss": 0.7201, + "step": 4942 + }, + { + "epoch": 0.5, + "grad_norm": 1.4924325225794244, + "learning_rate": 1.0395231676307012e-05, + "loss": 0.7449, + "step": 4943 + }, + { + "epoch": 0.5, + "grad_norm": 1.6215899504993245, + "learning_rate": 1.039193977364342e-05, + "loss": 0.6166, + "step": 4944 + }, + { + "epoch": 0.5, + "grad_norm": 1.5413755103496067, + "learning_rate": 1.0388647828440899e-05, + "loss": 0.6889, + "step": 4945 + }, + { + "epoch": 0.5, + "grad_norm": 1.8571969034985918, + "learning_rate": 1.0385355841056732e-05, + "loss": 0.7965, + "step": 4946 + }, + { + "epoch": 0.5, + "grad_norm": 1.505675486970467, + "learning_rate": 1.0382063811848216e-05, + "loss": 0.7368, + "step": 4947 + }, + { + "epoch": 0.5, + "grad_norm": 1.6573830089678232, + "learning_rate": 1.0378771741172647e-05, + "loss": 0.728, + "step": 4948 + }, + { + "epoch": 0.5, + "grad_norm": 1.5151009935557034, + "learning_rate": 1.0375479629387328e-05, + "loss": 0.6786, + "step": 4949 + }, + { + "epoch": 0.5, + "grad_norm": 1.6888252027377988, + "learning_rate": 1.037218747684957e-05, + "loss": 0.785, + "step": 4950 + }, + { + "epoch": 0.5, + "grad_norm": 1.6060069581627165, + "learning_rate": 1.0368895283916678e-05, + "loss": 0.7782, + "step": 4951 + }, + { + "epoch": 0.5, + "grad_norm": 1.5784964492645637, + "learning_rate": 1.0365603050945977e-05, + "loss": 0.6826, + "step": 4952 + }, + { + "epoch": 0.5, + "grad_norm": 1.5425911773394898, + "learning_rate": 1.0362310778294775e-05, + "loss": 0.7056, + "step": 4953 + }, + { + "epoch": 0.5, + "grad_norm": 1.599694320939845, + "learning_rate": 1.0359018466320407e-05, + "loss": 0.7094, + "step": 4954 + }, + { + "epoch": 0.5, + "grad_norm": 1.6737011916314462, + "learning_rate": 1.0355726115380203e-05, + "loss": 0.7664, + "step": 4955 + }, + { + "epoch": 0.5, + "grad_norm": 1.530616756386857, + "learning_rate": 1.0352433725831487e-05, + "loss": 0.7243, + "step": 4956 + }, + { + "epoch": 0.5, + "grad_norm": 1.5627773628239507, + "learning_rate": 1.0349141298031601e-05, + "loss": 0.6623, + "step": 4957 + }, + { + "epoch": 0.5, + "grad_norm": 1.5693580207455513, + "learning_rate": 1.0345848832337891e-05, + "loss": 0.7476, + "step": 4958 + }, + { + "epoch": 0.5, + "grad_norm": 1.4476294616961733, + "learning_rate": 1.0342556329107699e-05, + "loss": 0.7097, + "step": 4959 + }, + { + "epoch": 0.5, + "grad_norm": 1.4493711917271666, + "learning_rate": 1.0339263788698373e-05, + "loss": 0.607, + "step": 4960 + }, + { + "epoch": 0.5, + "grad_norm": 1.7351171237436391, + "learning_rate": 1.0335971211467265e-05, + "loss": 0.8264, + "step": 4961 + }, + { + "epoch": 0.5, + "grad_norm": 1.4803412656979784, + "learning_rate": 1.0332678597771739e-05, + "loss": 0.7487, + "step": 4962 + }, + { + "epoch": 0.5, + "grad_norm": 1.4887948440376384, + "learning_rate": 1.0329385947969154e-05, + "loss": 0.6569, + "step": 4963 + }, + { + "epoch": 0.5, + "grad_norm": 1.8106392631604589, + "learning_rate": 1.0326093262416874e-05, + "loss": 0.8789, + "step": 4964 + }, + { + "epoch": 0.51, + "grad_norm": 1.6668317622828839, + "learning_rate": 1.0322800541472273e-05, + "loss": 0.8512, + "step": 4965 + }, + { + "epoch": 0.51, + "grad_norm": 1.4499170830275672, + "learning_rate": 1.0319507785492718e-05, + "loss": 0.7089, + "step": 4966 + }, + { + "epoch": 0.51, + "grad_norm": 1.4472378715645056, + "learning_rate": 1.031621499483559e-05, + "loss": 0.6131, + "step": 4967 + }, + { + "epoch": 0.51, + "grad_norm": 1.6582927017623401, + "learning_rate": 1.031292216985827e-05, + "loss": 0.8072, + "step": 4968 + }, + { + "epoch": 0.51, + "grad_norm": 1.4679495456025033, + "learning_rate": 1.030962931091814e-05, + "loss": 0.7026, + "step": 4969 + }, + { + "epoch": 0.51, + "grad_norm": 1.6686120921027308, + "learning_rate": 1.0306336418372595e-05, + "loss": 0.6743, + "step": 4970 + }, + { + "epoch": 0.51, + "grad_norm": 1.6297080468438931, + "learning_rate": 1.0303043492579017e-05, + "loss": 0.7175, + "step": 4971 + }, + { + "epoch": 0.51, + "grad_norm": 1.4740594819956203, + "learning_rate": 1.029975053389481e-05, + "loss": 0.6553, + "step": 4972 + }, + { + "epoch": 0.51, + "grad_norm": 1.6604044930548936, + "learning_rate": 1.0296457542677371e-05, + "loss": 0.7285, + "step": 4973 + }, + { + "epoch": 0.51, + "grad_norm": 1.5331123784276655, + "learning_rate": 1.02931645192841e-05, + "loss": 0.6788, + "step": 4974 + }, + { + "epoch": 0.51, + "grad_norm": 1.5496946285391504, + "learning_rate": 1.0289871464072405e-05, + "loss": 0.7427, + "step": 4975 + }, + { + "epoch": 0.51, + "grad_norm": 1.5626346544494052, + "learning_rate": 1.0286578377399696e-05, + "loss": 0.5882, + "step": 4976 + }, + { + "epoch": 0.51, + "grad_norm": 1.7064394550189277, + "learning_rate": 1.0283285259623385e-05, + "loss": 0.7806, + "step": 4977 + }, + { + "epoch": 0.51, + "grad_norm": 1.74754784601924, + "learning_rate": 1.0279992111100891e-05, + "loss": 0.8419, + "step": 4978 + }, + { + "epoch": 0.51, + "grad_norm": 1.6513705967731058, + "learning_rate": 1.027669893218963e-05, + "loss": 0.7918, + "step": 4979 + }, + { + "epoch": 0.51, + "grad_norm": 1.5863692267049108, + "learning_rate": 1.0273405723247028e-05, + "loss": 0.7655, + "step": 4980 + }, + { + "epoch": 0.51, + "grad_norm": 1.5455744916739862, + "learning_rate": 1.027011248463051e-05, + "loss": 0.662, + "step": 4981 + }, + { + "epoch": 0.51, + "grad_norm": 1.6189715089164287, + "learning_rate": 1.0266819216697506e-05, + "loss": 0.6878, + "step": 4982 + }, + { + "epoch": 0.51, + "grad_norm": 1.5728852805180555, + "learning_rate": 1.0263525919805448e-05, + "loss": 0.7392, + "step": 4983 + }, + { + "epoch": 0.51, + "grad_norm": 1.6485182074086644, + "learning_rate": 1.0260232594311775e-05, + "loss": 0.6494, + "step": 4984 + }, + { + "epoch": 0.51, + "grad_norm": 1.6860934646556733, + "learning_rate": 1.025693924057392e-05, + "loss": 0.8304, + "step": 4985 + }, + { + "epoch": 0.51, + "grad_norm": 1.5444623763479748, + "learning_rate": 1.025364585894933e-05, + "loss": 0.721, + "step": 4986 + }, + { + "epoch": 0.51, + "grad_norm": 1.6114327837659634, + "learning_rate": 1.0250352449795449e-05, + "loss": 0.6927, + "step": 4987 + }, + { + "epoch": 0.51, + "grad_norm": 1.6844313667842148, + "learning_rate": 1.0247059013469725e-05, + "loss": 0.8153, + "step": 4988 + }, + { + "epoch": 0.51, + "grad_norm": 1.3912046210481805, + "learning_rate": 1.0243765550329606e-05, + "loss": 0.7573, + "step": 4989 + }, + { + "epoch": 0.51, + "grad_norm": 1.7719163050138464, + "learning_rate": 1.0240472060732552e-05, + "loss": 0.7606, + "step": 4990 + }, + { + "epoch": 0.51, + "grad_norm": 1.5907944102676055, + "learning_rate": 1.0237178545036015e-05, + "loss": 0.6463, + "step": 4991 + }, + { + "epoch": 0.51, + "grad_norm": 1.4653612312225397, + "learning_rate": 1.0233885003597457e-05, + "loss": 0.6643, + "step": 4992 + }, + { + "epoch": 0.51, + "grad_norm": 1.6555274215570521, + "learning_rate": 1.0230591436774337e-05, + "loss": 0.7665, + "step": 4993 + }, + { + "epoch": 0.51, + "grad_norm": 1.480369019040231, + "learning_rate": 1.0227297844924123e-05, + "loss": 0.7743, + "step": 4994 + }, + { + "epoch": 0.51, + "grad_norm": 1.4834908923443815, + "learning_rate": 1.0224004228404286e-05, + "loss": 0.6652, + "step": 4995 + }, + { + "epoch": 0.51, + "grad_norm": 1.7027958245467154, + "learning_rate": 1.0220710587572289e-05, + "loss": 0.6794, + "step": 4996 + }, + { + "epoch": 0.51, + "grad_norm": 1.5892728457192657, + "learning_rate": 1.0217416922785613e-05, + "loss": 0.8259, + "step": 4997 + }, + { + "epoch": 0.51, + "grad_norm": 1.417842304139637, + "learning_rate": 1.0214123234401725e-05, + "loss": 0.6653, + "step": 4998 + }, + { + "epoch": 0.51, + "grad_norm": 1.4897514214464702, + "learning_rate": 1.0210829522778111e-05, + "loss": 0.6722, + "step": 4999 + }, + { + "epoch": 0.51, + "grad_norm": 1.4753853811493134, + "learning_rate": 1.020753578827225e-05, + "loss": 0.6329, + "step": 5000 + }, + { + "epoch": 0.51, + "grad_norm": 1.698061227933607, + "learning_rate": 1.0204242031241624e-05, + "loss": 0.6545, + "step": 5001 + }, + { + "epoch": 0.51, + "grad_norm": 1.5685023415147041, + "learning_rate": 1.020094825204372e-05, + "loss": 0.7103, + "step": 5002 + }, + { + "epoch": 0.51, + "grad_norm": 1.757520070304479, + "learning_rate": 1.0197654451036025e-05, + "loss": 0.8271, + "step": 5003 + }, + { + "epoch": 0.51, + "grad_norm": 1.496912420552621, + "learning_rate": 1.0194360628576032e-05, + "loss": 0.646, + "step": 5004 + }, + { + "epoch": 0.51, + "grad_norm": 1.6368808812363533, + "learning_rate": 1.0191066785021232e-05, + "loss": 0.6652, + "step": 5005 + }, + { + "epoch": 0.51, + "grad_norm": 1.7515757375629755, + "learning_rate": 1.0187772920729118e-05, + "loss": 0.7075, + "step": 5006 + }, + { + "epoch": 0.51, + "grad_norm": 1.532702151487452, + "learning_rate": 1.0184479036057191e-05, + "loss": 0.7008, + "step": 5007 + }, + { + "epoch": 0.51, + "grad_norm": 1.8067628392350867, + "learning_rate": 1.0181185131362949e-05, + "loss": 0.8778, + "step": 5008 + }, + { + "epoch": 0.51, + "grad_norm": 1.5930483783286502, + "learning_rate": 1.0177891207003897e-05, + "loss": 0.6907, + "step": 5009 + }, + { + "epoch": 0.51, + "grad_norm": 1.5893821059072393, + "learning_rate": 1.0174597263337533e-05, + "loss": 0.6614, + "step": 5010 + }, + { + "epoch": 0.51, + "grad_norm": 1.6078386505119349, + "learning_rate": 1.0171303300721369e-05, + "loss": 0.8173, + "step": 5011 + }, + { + "epoch": 0.51, + "grad_norm": 1.4627833807251032, + "learning_rate": 1.0168009319512908e-05, + "loss": 0.735, + "step": 5012 + }, + { + "epoch": 0.51, + "grad_norm": 1.5924423835822419, + "learning_rate": 1.0164715320069667e-05, + "loss": 0.6657, + "step": 5013 + }, + { + "epoch": 0.51, + "grad_norm": 1.3847413718790185, + "learning_rate": 1.016142130274915e-05, + "loss": 0.7492, + "step": 5014 + }, + { + "epoch": 0.51, + "grad_norm": 1.4717912087381975, + "learning_rate": 1.0158127267908877e-05, + "loss": 0.6354, + "step": 5015 + }, + { + "epoch": 0.51, + "grad_norm": 1.5804650771140725, + "learning_rate": 1.0154833215906359e-05, + "loss": 0.6814, + "step": 5016 + }, + { + "epoch": 0.51, + "grad_norm": 1.802943339801398, + "learning_rate": 1.015153914709912e-05, + "loss": 0.9026, + "step": 5017 + }, + { + "epoch": 0.51, + "grad_norm": 1.5091332096941117, + "learning_rate": 1.0148245061844678e-05, + "loss": 0.8205, + "step": 5018 + }, + { + "epoch": 0.51, + "grad_norm": 1.6077826529932098, + "learning_rate": 1.0144950960500548e-05, + "loss": 0.6848, + "step": 5019 + }, + { + "epoch": 0.51, + "grad_norm": 1.498282901072857, + "learning_rate": 1.0141656843424261e-05, + "loss": 0.7553, + "step": 5020 + }, + { + "epoch": 0.51, + "grad_norm": 1.5345091773948873, + "learning_rate": 1.0138362710973337e-05, + "loss": 0.7252, + "step": 5021 + }, + { + "epoch": 0.51, + "grad_norm": 1.5877742089090119, + "learning_rate": 1.0135068563505305e-05, + "loss": 0.6948, + "step": 5022 + }, + { + "epoch": 0.51, + "grad_norm": 1.9732284318068651, + "learning_rate": 1.0131774401377694e-05, + "loss": 0.6836, + "step": 5023 + }, + { + "epoch": 0.51, + "grad_norm": 1.4967048574718913, + "learning_rate": 1.0128480224948032e-05, + "loss": 0.653, + "step": 5024 + }, + { + "epoch": 0.51, + "grad_norm": 1.472468435443647, + "learning_rate": 1.0125186034573848e-05, + "loss": 0.7266, + "step": 5025 + }, + { + "epoch": 0.51, + "grad_norm": 1.8378646799585223, + "learning_rate": 1.0121891830612682e-05, + "loss": 0.7543, + "step": 5026 + }, + { + "epoch": 0.51, + "grad_norm": 1.7292167157582072, + "learning_rate": 1.0118597613422064e-05, + "loss": 0.777, + "step": 5027 + }, + { + "epoch": 0.51, + "grad_norm": 1.747577986929217, + "learning_rate": 1.0115303383359527e-05, + "loss": 0.6887, + "step": 5028 + }, + { + "epoch": 0.51, + "grad_norm": 1.5050282607663057, + "learning_rate": 1.011200914078261e-05, + "loss": 0.7344, + "step": 5029 + }, + { + "epoch": 0.51, + "grad_norm": 1.5261534192520534, + "learning_rate": 1.0108714886048854e-05, + "loss": 0.7532, + "step": 5030 + }, + { + "epoch": 0.51, + "grad_norm": 1.4756222381550796, + "learning_rate": 1.0105420619515798e-05, + "loss": 0.7665, + "step": 5031 + }, + { + "epoch": 0.51, + "grad_norm": 1.4293226846812288, + "learning_rate": 1.0102126341540981e-05, + "loss": 0.7232, + "step": 5032 + }, + { + "epoch": 0.51, + "grad_norm": 1.8230140564560462, + "learning_rate": 1.0098832052481946e-05, + "loss": 0.7146, + "step": 5033 + }, + { + "epoch": 0.51, + "grad_norm": 1.6393459817543277, + "learning_rate": 1.0095537752696238e-05, + "loss": 0.7186, + "step": 5034 + }, + { + "epoch": 0.51, + "grad_norm": 1.540375836450423, + "learning_rate": 1.0092243442541404e-05, + "loss": 0.6179, + "step": 5035 + }, + { + "epoch": 0.51, + "grad_norm": 1.6002854156707615, + "learning_rate": 1.0088949122374982e-05, + "loss": 0.7583, + "step": 5036 + }, + { + "epoch": 0.51, + "grad_norm": 1.791900632212417, + "learning_rate": 1.0085654792554527e-05, + "loss": 0.6443, + "step": 5037 + }, + { + "epoch": 0.51, + "grad_norm": 1.8762373825564007, + "learning_rate": 1.0082360453437583e-05, + "loss": 0.7761, + "step": 5038 + }, + { + "epoch": 0.51, + "grad_norm": 1.540714618809593, + "learning_rate": 1.00790661053817e-05, + "loss": 0.6828, + "step": 5039 + }, + { + "epoch": 0.51, + "grad_norm": 1.584376129187976, + "learning_rate": 1.0075771748744425e-05, + "loss": 0.739, + "step": 5040 + }, + { + "epoch": 0.51, + "grad_norm": 1.8348523122891849, + "learning_rate": 1.0072477383883315e-05, + "loss": 0.6564, + "step": 5041 + }, + { + "epoch": 0.51, + "grad_norm": 1.7931328635932917, + "learning_rate": 1.0069183011155919e-05, + "loss": 0.7407, + "step": 5042 + }, + { + "epoch": 0.51, + "grad_norm": 1.6265720031446385, + "learning_rate": 1.0065888630919784e-05, + "loss": 0.8066, + "step": 5043 + }, + { + "epoch": 0.51, + "grad_norm": 1.6535312579766697, + "learning_rate": 1.0062594243532473e-05, + "loss": 0.7392, + "step": 5044 + }, + { + "epoch": 0.51, + "grad_norm": 1.6066458750468946, + "learning_rate": 1.0059299849351537e-05, + "loss": 0.772, + "step": 5045 + }, + { + "epoch": 0.51, + "grad_norm": 1.490332422068347, + "learning_rate": 1.0056005448734527e-05, + "loss": 0.7085, + "step": 5046 + }, + { + "epoch": 0.51, + "grad_norm": 1.5434677098664784, + "learning_rate": 1.0052711042039e-05, + "loss": 0.6175, + "step": 5047 + }, + { + "epoch": 0.51, + "grad_norm": 1.7218251257628168, + "learning_rate": 1.0049416629622515e-05, + "loss": 0.7077, + "step": 5048 + }, + { + "epoch": 0.51, + "grad_norm": 1.5429660052228062, + "learning_rate": 1.0046122211842629e-05, + "loss": 0.6454, + "step": 5049 + }, + { + "epoch": 0.51, + "grad_norm": 1.7276557227894023, + "learning_rate": 1.0042827789056897e-05, + "loss": 0.8497, + "step": 5050 + }, + { + "epoch": 0.51, + "grad_norm": 1.5107847951141293, + "learning_rate": 1.003953336162288e-05, + "loss": 0.675, + "step": 5051 + }, + { + "epoch": 0.51, + "grad_norm": 1.5534066484669686, + "learning_rate": 1.0036238929898133e-05, + "loss": 0.6476, + "step": 5052 + }, + { + "epoch": 0.51, + "grad_norm": 1.7502090023426649, + "learning_rate": 1.0032944494240215e-05, + "loss": 0.7414, + "step": 5053 + }, + { + "epoch": 0.51, + "grad_norm": 1.61180441500941, + "learning_rate": 1.002965005500669e-05, + "loss": 0.7667, + "step": 5054 + }, + { + "epoch": 0.51, + "grad_norm": 1.5907909366442183, + "learning_rate": 1.0026355612555116e-05, + "loss": 0.8143, + "step": 5055 + }, + { + "epoch": 0.51, + "grad_norm": 1.6927249074587167, + "learning_rate": 1.0023061167243051e-05, + "loss": 0.8298, + "step": 5056 + }, + { + "epoch": 0.51, + "grad_norm": 1.558245964430676, + "learning_rate": 1.0019766719428056e-05, + "loss": 0.6458, + "step": 5057 + }, + { + "epoch": 0.51, + "grad_norm": 1.5989346457818998, + "learning_rate": 1.0016472269467694e-05, + "loss": 0.7566, + "step": 5058 + }, + { + "epoch": 0.51, + "grad_norm": 1.5501716886854584, + "learning_rate": 1.0013177817719525e-05, + "loss": 0.6956, + "step": 5059 + }, + { + "epoch": 0.51, + "grad_norm": 1.5298661053568097, + "learning_rate": 1.000988336454111e-05, + "loss": 0.7485, + "step": 5060 + }, + { + "epoch": 0.51, + "grad_norm": 1.534800077294969, + "learning_rate": 1.0006588910290009e-05, + "loss": 0.72, + "step": 5061 + }, + { + "epoch": 0.51, + "grad_norm": 1.4039336604523878, + "learning_rate": 1.0003294455323787e-05, + "loss": 0.6622, + "step": 5062 + }, + { + "epoch": 0.52, + "grad_norm": 1.4625189066192077, + "learning_rate": 1e-05, + "loss": 0.7393, + "step": 5063 + }, + { + "epoch": 0.52, + "grad_norm": 1.6009023241100437, + "learning_rate": 9.996705544676214e-06, + "loss": 0.7206, + "step": 5064 + }, + { + "epoch": 0.52, + "grad_norm": 1.6802185162627696, + "learning_rate": 9.993411089709995e-06, + "loss": 0.703, + "step": 5065 + }, + { + "epoch": 0.52, + "grad_norm": 1.4998292988494204, + "learning_rate": 9.990116635458893e-06, + "loss": 0.6667, + "step": 5066 + }, + { + "epoch": 0.52, + "grad_norm": 1.5951409889363042, + "learning_rate": 9.98682218228048e-06, + "loss": 0.7688, + "step": 5067 + }, + { + "epoch": 0.52, + "grad_norm": 1.4420550450229865, + "learning_rate": 9.98352773053231e-06, + "loss": 0.833, + "step": 5068 + }, + { + "epoch": 0.52, + "grad_norm": 1.3540576285928896, + "learning_rate": 9.980233280571946e-06, + "loss": 0.6754, + "step": 5069 + }, + { + "epoch": 0.52, + "grad_norm": 1.58857712960787, + "learning_rate": 9.976938832756952e-06, + "loss": 0.7208, + "step": 5070 + }, + { + "epoch": 0.52, + "grad_norm": 1.6888765345374839, + "learning_rate": 9.973644387444887e-06, + "loss": 0.8324, + "step": 5071 + }, + { + "epoch": 0.52, + "grad_norm": 1.6628467015311343, + "learning_rate": 9.970349944993315e-06, + "loss": 0.7039, + "step": 5072 + }, + { + "epoch": 0.52, + "grad_norm": 1.4986284311832823, + "learning_rate": 9.967055505759787e-06, + "loss": 0.6795, + "step": 5073 + }, + { + "epoch": 0.52, + "grad_norm": 1.8380637247061966, + "learning_rate": 9.963761070101869e-06, + "loss": 0.8186, + "step": 5074 + }, + { + "epoch": 0.52, + "grad_norm": 1.6236843675335488, + "learning_rate": 9.960466638377125e-06, + "loss": 0.7437, + "step": 5075 + }, + { + "epoch": 0.52, + "grad_norm": 1.7043518992354776, + "learning_rate": 9.957172210943105e-06, + "loss": 0.6886, + "step": 5076 + }, + { + "epoch": 0.52, + "grad_norm": 1.5516039920891835, + "learning_rate": 9.953877788157373e-06, + "loss": 0.7047, + "step": 5077 + }, + { + "epoch": 0.52, + "grad_norm": 1.4788193377628533, + "learning_rate": 9.950583370377488e-06, + "loss": 0.7271, + "step": 5078 + }, + { + "epoch": 0.52, + "grad_norm": 1.560881796187408, + "learning_rate": 9.947288957961001e-06, + "loss": 0.6769, + "step": 5079 + }, + { + "epoch": 0.52, + "grad_norm": 1.5307649077697028, + "learning_rate": 9.943994551265478e-06, + "loss": 0.6648, + "step": 5080 + }, + { + "epoch": 0.52, + "grad_norm": 1.565759677818178, + "learning_rate": 9.940700150648467e-06, + "loss": 0.7498, + "step": 5081 + }, + { + "epoch": 0.52, + "grad_norm": 1.5830488858116634, + "learning_rate": 9.937405756467527e-06, + "loss": 0.7328, + "step": 5082 + }, + { + "epoch": 0.52, + "grad_norm": 1.4985569654884294, + "learning_rate": 9.934111369080218e-06, + "loss": 0.6863, + "step": 5083 + }, + { + "epoch": 0.52, + "grad_norm": 1.6931785992505062, + "learning_rate": 9.930816988844084e-06, + "loss": 0.6846, + "step": 5084 + }, + { + "epoch": 0.52, + "grad_norm": 1.5202715907906006, + "learning_rate": 9.927522616116687e-06, + "loss": 0.6753, + "step": 5085 + }, + { + "epoch": 0.52, + "grad_norm": 1.6362017411817606, + "learning_rate": 9.924228251255577e-06, + "loss": 0.8604, + "step": 5086 + }, + { + "epoch": 0.52, + "grad_norm": 1.5759133874543885, + "learning_rate": 9.920933894618303e-06, + "loss": 0.696, + "step": 5087 + }, + { + "epoch": 0.52, + "grad_norm": 1.7292168383139979, + "learning_rate": 9.91763954656242e-06, + "loss": 0.8085, + "step": 5088 + }, + { + "epoch": 0.52, + "grad_norm": 1.5071322373168559, + "learning_rate": 9.914345207445476e-06, + "loss": 0.7611, + "step": 5089 + }, + { + "epoch": 0.52, + "grad_norm": 1.6228730452289775, + "learning_rate": 9.911050877625022e-06, + "loss": 0.6774, + "step": 5090 + }, + { + "epoch": 0.52, + "grad_norm": 1.608877474252974, + "learning_rate": 9.9077565574586e-06, + "loss": 0.7135, + "step": 5091 + }, + { + "epoch": 0.52, + "grad_norm": 1.5956742547959841, + "learning_rate": 9.904462247303764e-06, + "loss": 0.8134, + "step": 5092 + }, + { + "epoch": 0.52, + "grad_norm": 1.4093253731337068, + "learning_rate": 9.901167947518057e-06, + "loss": 0.6145, + "step": 5093 + }, + { + "epoch": 0.52, + "grad_norm": 1.6676622348956422, + "learning_rate": 9.89787365845902e-06, + "loss": 0.7878, + "step": 5094 + }, + { + "epoch": 0.52, + "grad_norm": 1.485830024047951, + "learning_rate": 9.894579380484206e-06, + "loss": 0.6392, + "step": 5095 + }, + { + "epoch": 0.52, + "grad_norm": 1.8186613390608457, + "learning_rate": 9.89128511395115e-06, + "loss": 0.6955, + "step": 5096 + }, + { + "epoch": 0.52, + "grad_norm": 1.5786168135399719, + "learning_rate": 9.887990859217391e-06, + "loss": 0.7995, + "step": 5097 + }, + { + "epoch": 0.52, + "grad_norm": 1.7458285958219886, + "learning_rate": 9.88469661664048e-06, + "loss": 0.6816, + "step": 5098 + }, + { + "epoch": 0.52, + "grad_norm": 1.511674397329433, + "learning_rate": 9.881402386577942e-06, + "loss": 0.6325, + "step": 5099 + }, + { + "epoch": 0.52, + "grad_norm": 1.5292384328734583, + "learning_rate": 9.87810816938732e-06, + "loss": 0.6494, + "step": 5100 + }, + { + "epoch": 0.52, + "grad_norm": 1.4321640908065174, + "learning_rate": 9.874813965426154e-06, + "loss": 0.6423, + "step": 5101 + }, + { + "epoch": 0.52, + "grad_norm": 1.504524200960757, + "learning_rate": 9.871519775051971e-06, + "loss": 0.6557, + "step": 5102 + }, + { + "epoch": 0.52, + "grad_norm": 1.4892683522137387, + "learning_rate": 9.86822559862231e-06, + "loss": 0.7091, + "step": 5103 + }, + { + "epoch": 0.52, + "grad_norm": 1.6672876374415153, + "learning_rate": 9.864931436494696e-06, + "loss": 0.7504, + "step": 5104 + }, + { + "epoch": 0.52, + "grad_norm": 1.5283640218290506, + "learning_rate": 9.861637289026664e-06, + "loss": 0.7191, + "step": 5105 + }, + { + "epoch": 0.52, + "grad_norm": 1.476362253308102, + "learning_rate": 9.858343156575744e-06, + "loss": 0.6484, + "step": 5106 + }, + { + "epoch": 0.52, + "grad_norm": 1.6467227908333926, + "learning_rate": 9.855049039499454e-06, + "loss": 0.7415, + "step": 5107 + }, + { + "epoch": 0.52, + "grad_norm": 1.5866704705759278, + "learning_rate": 9.851754938155329e-06, + "loss": 0.722, + "step": 5108 + }, + { + "epoch": 0.52, + "grad_norm": 1.51916605439026, + "learning_rate": 9.848460852900883e-06, + "loss": 0.6658, + "step": 5109 + }, + { + "epoch": 0.52, + "grad_norm": 1.4627539145682227, + "learning_rate": 9.845166784093641e-06, + "loss": 0.675, + "step": 5110 + }, + { + "epoch": 0.52, + "grad_norm": 1.681030853676064, + "learning_rate": 9.841872732091128e-06, + "loss": 0.7634, + "step": 5111 + }, + { + "epoch": 0.52, + "grad_norm": 1.7011084668318175, + "learning_rate": 9.838578697250852e-06, + "loss": 0.7477, + "step": 5112 + }, + { + "epoch": 0.52, + "grad_norm": 1.6294344584210423, + "learning_rate": 9.835284679930335e-06, + "loss": 0.7684, + "step": 5113 + }, + { + "epoch": 0.52, + "grad_norm": 1.6873612611256998, + "learning_rate": 9.831990680487094e-06, + "loss": 0.7013, + "step": 5114 + }, + { + "epoch": 0.52, + "grad_norm": 1.5622192387852167, + "learning_rate": 9.828696699278633e-06, + "loss": 0.749, + "step": 5115 + }, + { + "epoch": 0.52, + "grad_norm": 1.4396246048138015, + "learning_rate": 9.82540273666247e-06, + "loss": 0.6634, + "step": 5116 + }, + { + "epoch": 0.52, + "grad_norm": 1.59187032207698, + "learning_rate": 9.822108792996107e-06, + "loss": 0.7317, + "step": 5117 + }, + { + "epoch": 0.52, + "grad_norm": 1.573390113679101, + "learning_rate": 9.818814868637051e-06, + "loss": 0.7894, + "step": 5118 + }, + { + "epoch": 0.52, + "grad_norm": 1.6061202138652004, + "learning_rate": 9.815520963942812e-06, + "loss": 0.6493, + "step": 5119 + }, + { + "epoch": 0.52, + "grad_norm": 1.5454123702799032, + "learning_rate": 9.812227079270884e-06, + "loss": 0.7948, + "step": 5120 + }, + { + "epoch": 0.52, + "grad_norm": 1.597879788112768, + "learning_rate": 9.808933214978773e-06, + "loss": 0.7693, + "step": 5121 + }, + { + "epoch": 0.52, + "grad_norm": 1.690313982093511, + "learning_rate": 9.805639371423971e-06, + "loss": 0.6432, + "step": 5122 + }, + { + "epoch": 0.52, + "grad_norm": 1.677017710340723, + "learning_rate": 9.802345548963977e-06, + "loss": 0.7294, + "step": 5123 + }, + { + "epoch": 0.52, + "grad_norm": 1.6484158834398428, + "learning_rate": 9.799051747956284e-06, + "loss": 0.718, + "step": 5124 + }, + { + "epoch": 0.52, + "grad_norm": 1.7048099351617905, + "learning_rate": 9.795757968758378e-06, + "loss": 0.7263, + "step": 5125 + }, + { + "epoch": 0.52, + "grad_norm": 1.6467261635402397, + "learning_rate": 9.792464211727755e-06, + "loss": 0.6859, + "step": 5126 + }, + { + "epoch": 0.52, + "grad_norm": 1.3529923691945207, + "learning_rate": 9.789170477221892e-06, + "loss": 0.6789, + "step": 5127 + }, + { + "epoch": 0.52, + "grad_norm": 1.608164579433592, + "learning_rate": 9.785876765598276e-06, + "loss": 0.754, + "step": 5128 + }, + { + "epoch": 0.52, + "grad_norm": 1.6396310055821297, + "learning_rate": 9.782583077214394e-06, + "loss": 0.7712, + "step": 5129 + }, + { + "epoch": 0.52, + "grad_norm": 1.6564971675287619, + "learning_rate": 9.779289412427714e-06, + "loss": 0.7607, + "step": 5130 + }, + { + "epoch": 0.52, + "grad_norm": 1.6173430588878457, + "learning_rate": 9.775995771595716e-06, + "loss": 0.7981, + "step": 5131 + }, + { + "epoch": 0.52, + "grad_norm": 1.6017258418261529, + "learning_rate": 9.772702155075879e-06, + "loss": 0.7253, + "step": 5132 + }, + { + "epoch": 0.52, + "grad_norm": 1.7709026391696094, + "learning_rate": 9.769408563225665e-06, + "loss": 0.7332, + "step": 5133 + }, + { + "epoch": 0.52, + "grad_norm": 1.519283381436825, + "learning_rate": 9.76611499640255e-06, + "loss": 0.6772, + "step": 5134 + }, + { + "epoch": 0.52, + "grad_norm": 1.609935659876278, + "learning_rate": 9.76282145496399e-06, + "loss": 0.7068, + "step": 5135 + }, + { + "epoch": 0.52, + "grad_norm": 1.531593684480193, + "learning_rate": 9.75952793926745e-06, + "loss": 0.646, + "step": 5136 + }, + { + "epoch": 0.52, + "grad_norm": 1.5790104519401773, + "learning_rate": 9.756234449670396e-06, + "loss": 0.6888, + "step": 5137 + }, + { + "epoch": 0.52, + "grad_norm": 1.446800963186642, + "learning_rate": 9.752940986530279e-06, + "loss": 0.6697, + "step": 5138 + }, + { + "epoch": 0.52, + "grad_norm": 1.6523518022610042, + "learning_rate": 9.749647550204555e-06, + "loss": 0.7867, + "step": 5139 + }, + { + "epoch": 0.52, + "grad_norm": 1.4971052938564537, + "learning_rate": 9.746354141050674e-06, + "loss": 0.7876, + "step": 5140 + }, + { + "epoch": 0.52, + "grad_norm": 1.570377547663415, + "learning_rate": 9.743060759426082e-06, + "loss": 0.7654, + "step": 5141 + }, + { + "epoch": 0.52, + "grad_norm": 1.6064579733110664, + "learning_rate": 9.739767405688228e-06, + "loss": 0.737, + "step": 5142 + }, + { + "epoch": 0.52, + "grad_norm": 1.5455426190649941, + "learning_rate": 9.736474080194555e-06, + "loss": 0.6627, + "step": 5143 + }, + { + "epoch": 0.52, + "grad_norm": 1.7875043911265842, + "learning_rate": 9.733180783302499e-06, + "loss": 0.8372, + "step": 5144 + }, + { + "epoch": 0.52, + "grad_norm": 1.4778229053041372, + "learning_rate": 9.729887515369491e-06, + "loss": 0.6284, + "step": 5145 + }, + { + "epoch": 0.52, + "grad_norm": 1.6191155801176957, + "learning_rate": 9.726594276752975e-06, + "loss": 0.663, + "step": 5146 + }, + { + "epoch": 0.52, + "grad_norm": 1.8209617158366576, + "learning_rate": 9.723301067810373e-06, + "loss": 0.8653, + "step": 5147 + }, + { + "epoch": 0.52, + "grad_norm": 1.6978790968854747, + "learning_rate": 9.720007888899112e-06, + "loss": 0.7375, + "step": 5148 + }, + { + "epoch": 0.52, + "grad_norm": 1.5651878518597222, + "learning_rate": 9.716714740376616e-06, + "loss": 0.7489, + "step": 5149 + }, + { + "epoch": 0.52, + "grad_norm": 1.2843081750148266, + "learning_rate": 9.713421622600307e-06, + "loss": 0.5879, + "step": 5150 + }, + { + "epoch": 0.52, + "grad_norm": 1.553289187108251, + "learning_rate": 9.710128535927597e-06, + "loss": 0.7157, + "step": 5151 + }, + { + "epoch": 0.52, + "grad_norm": 1.6699906188531186, + "learning_rate": 9.706835480715905e-06, + "loss": 0.8308, + "step": 5152 + }, + { + "epoch": 0.52, + "grad_norm": 1.512597982088698, + "learning_rate": 9.703542457322632e-06, + "loss": 0.7099, + "step": 5153 + }, + { + "epoch": 0.52, + "grad_norm": 1.416985900975218, + "learning_rate": 9.700249466105193e-06, + "loss": 0.6356, + "step": 5154 + }, + { + "epoch": 0.52, + "grad_norm": 1.6035167743253373, + "learning_rate": 9.696956507420984e-06, + "loss": 0.6669, + "step": 5155 + }, + { + "epoch": 0.52, + "grad_norm": 1.534607483177831, + "learning_rate": 9.693663581627408e-06, + "loss": 0.6586, + "step": 5156 + }, + { + "epoch": 0.52, + "grad_norm": 1.5302169037608306, + "learning_rate": 9.690370689081863e-06, + "loss": 0.674, + "step": 5157 + }, + { + "epoch": 0.52, + "grad_norm": 1.6115188616385638, + "learning_rate": 9.687077830141734e-06, + "loss": 0.6778, + "step": 5158 + }, + { + "epoch": 0.52, + "grad_norm": 1.618571618506667, + "learning_rate": 9.683785005164412e-06, + "loss": 0.7339, + "step": 5159 + }, + { + "epoch": 0.52, + "grad_norm": 1.4804908116816398, + "learning_rate": 9.680492214507287e-06, + "loss": 0.6456, + "step": 5160 + }, + { + "epoch": 0.52, + "grad_norm": 1.494140203548696, + "learning_rate": 9.67719945852773e-06, + "loss": 0.717, + "step": 5161 + }, + { + "epoch": 0.53, + "grad_norm": 1.6607958643733662, + "learning_rate": 9.67390673758313e-06, + "loss": 0.7738, + "step": 5162 + }, + { + "epoch": 0.53, + "grad_norm": 1.4974459021451776, + "learning_rate": 9.670614052030849e-06, + "loss": 0.8225, + "step": 5163 + }, + { + "epoch": 0.53, + "grad_norm": 1.7055403350977216, + "learning_rate": 9.667321402228261e-06, + "loss": 0.7338, + "step": 5164 + }, + { + "epoch": 0.53, + "grad_norm": 1.5909879761244057, + "learning_rate": 9.664028788532738e-06, + "loss": 0.7313, + "step": 5165 + }, + { + "epoch": 0.53, + "grad_norm": 1.5359287895843046, + "learning_rate": 9.66073621130163e-06, + "loss": 0.8177, + "step": 5166 + }, + { + "epoch": 0.53, + "grad_norm": 1.503526554781987, + "learning_rate": 9.657443670892303e-06, + "loss": 0.6308, + "step": 5167 + }, + { + "epoch": 0.53, + "grad_norm": 1.6412671598656263, + "learning_rate": 9.65415116766211e-06, + "loss": 0.6373, + "step": 5168 + }, + { + "epoch": 0.53, + "grad_norm": 1.5027991930993563, + "learning_rate": 9.650858701968399e-06, + "loss": 0.666, + "step": 5169 + }, + { + "epoch": 0.53, + "grad_norm": 1.5524957943245377, + "learning_rate": 9.647566274168516e-06, + "loss": 0.681, + "step": 5170 + }, + { + "epoch": 0.53, + "grad_norm": 1.4380181512787078, + "learning_rate": 9.644273884619802e-06, + "loss": 0.7568, + "step": 5171 + }, + { + "epoch": 0.53, + "grad_norm": 1.6762040745449318, + "learning_rate": 9.640981533679593e-06, + "loss": 0.6954, + "step": 5172 + }, + { + "epoch": 0.53, + "grad_norm": 1.592284969495283, + "learning_rate": 9.637689221705227e-06, + "loss": 0.6853, + "step": 5173 + }, + { + "epoch": 0.53, + "grad_norm": 1.4948404765759724, + "learning_rate": 9.634396949054028e-06, + "loss": 0.6828, + "step": 5174 + }, + { + "epoch": 0.53, + "grad_norm": 1.5866449703821197, + "learning_rate": 9.631104716083327e-06, + "loss": 0.6721, + "step": 5175 + }, + { + "epoch": 0.53, + "grad_norm": 1.4904129492677478, + "learning_rate": 9.627812523150434e-06, + "loss": 0.6681, + "step": 5176 + }, + { + "epoch": 0.53, + "grad_norm": 1.6687105576410526, + "learning_rate": 9.624520370612672e-06, + "loss": 0.7614, + "step": 5177 + }, + { + "epoch": 0.53, + "grad_norm": 1.662810354791064, + "learning_rate": 9.621228258827358e-06, + "loss": 0.6917, + "step": 5178 + }, + { + "epoch": 0.53, + "grad_norm": 1.597033737761668, + "learning_rate": 9.617936188151786e-06, + "loss": 0.6599, + "step": 5179 + }, + { + "epoch": 0.53, + "grad_norm": 1.59732571462882, + "learning_rate": 9.614644158943273e-06, + "loss": 0.7304, + "step": 5180 + }, + { + "epoch": 0.53, + "grad_norm": 1.4910868084236737, + "learning_rate": 9.611352171559105e-06, + "loss": 0.6728, + "step": 5181 + }, + { + "epoch": 0.53, + "grad_norm": 1.608341256608166, + "learning_rate": 9.60806022635658e-06, + "loss": 0.7524, + "step": 5182 + }, + { + "epoch": 0.53, + "grad_norm": 1.550887296530815, + "learning_rate": 9.604768323692993e-06, + "loss": 0.6347, + "step": 5183 + }, + { + "epoch": 0.53, + "grad_norm": 1.7808332484032412, + "learning_rate": 9.601476463925619e-06, + "loss": 0.8607, + "step": 5184 + }, + { + "epoch": 0.53, + "grad_norm": 1.556241721735327, + "learning_rate": 9.598184647411744e-06, + "loss": 0.6297, + "step": 5185 + }, + { + "epoch": 0.53, + "grad_norm": 1.585860373624063, + "learning_rate": 9.594892874508646e-06, + "loss": 0.6625, + "step": 5186 + }, + { + "epoch": 0.53, + "grad_norm": 1.6386386267965907, + "learning_rate": 9.591601145573585e-06, + "loss": 0.7331, + "step": 5187 + }, + { + "epoch": 0.53, + "grad_norm": 1.4186691498174373, + "learning_rate": 9.58830946096384e-06, + "loss": 0.6774, + "step": 5188 + }, + { + "epoch": 0.53, + "grad_norm": 1.5007796466653354, + "learning_rate": 9.58501782103666e-06, + "loss": 0.789, + "step": 5189 + }, + { + "epoch": 0.53, + "grad_norm": 1.6658588544515636, + "learning_rate": 9.581726226149304e-06, + "loss": 0.6567, + "step": 5190 + }, + { + "epoch": 0.53, + "grad_norm": 1.4548246999305345, + "learning_rate": 9.57843467665903e-06, + "loss": 0.7615, + "step": 5191 + }, + { + "epoch": 0.53, + "grad_norm": 1.487561384764103, + "learning_rate": 9.575143172923076e-06, + "loss": 0.6926, + "step": 5192 + }, + { + "epoch": 0.53, + "grad_norm": 1.583406569115701, + "learning_rate": 9.571851715298688e-06, + "loss": 0.6854, + "step": 5193 + }, + { + "epoch": 0.53, + "grad_norm": 1.7058952396836893, + "learning_rate": 9.568560304143101e-06, + "loss": 0.7685, + "step": 5194 + }, + { + "epoch": 0.53, + "grad_norm": 1.4557818774250706, + "learning_rate": 9.565268939813545e-06, + "loss": 0.6032, + "step": 5195 + }, + { + "epoch": 0.53, + "grad_norm": 1.5812853823646884, + "learning_rate": 9.561977622667248e-06, + "loss": 0.6898, + "step": 5196 + }, + { + "epoch": 0.53, + "grad_norm": 1.5803499566349357, + "learning_rate": 9.55868635306143e-06, + "loss": 0.7041, + "step": 5197 + }, + { + "epoch": 0.53, + "grad_norm": 1.5179266574469583, + "learning_rate": 9.555395131353307e-06, + "loss": 0.6686, + "step": 5198 + }, + { + "epoch": 0.53, + "grad_norm": 1.583336365752639, + "learning_rate": 9.55210395790009e-06, + "loss": 0.7714, + "step": 5199 + }, + { + "epoch": 0.53, + "grad_norm": 1.7268133639594228, + "learning_rate": 9.548812833058984e-06, + "loss": 0.7094, + "step": 5200 + }, + { + "epoch": 0.53, + "grad_norm": 1.5994523133476826, + "learning_rate": 9.545521757187188e-06, + "loss": 0.6845, + "step": 5201 + }, + { + "epoch": 0.53, + "grad_norm": 1.5297856828233163, + "learning_rate": 9.542230730641898e-06, + "loss": 0.689, + "step": 5202 + }, + { + "epoch": 0.53, + "grad_norm": 1.6519608888054835, + "learning_rate": 9.538939753780304e-06, + "loss": 0.7541, + "step": 5203 + }, + { + "epoch": 0.53, + "grad_norm": 1.5538051540501847, + "learning_rate": 9.535648826959591e-06, + "loss": 0.7378, + "step": 5204 + }, + { + "epoch": 0.53, + "grad_norm": 1.5560502994532663, + "learning_rate": 9.532357950536934e-06, + "loss": 0.7586, + "step": 5205 + }, + { + "epoch": 0.53, + "grad_norm": 1.5562973234346362, + "learning_rate": 9.529067124869512e-06, + "loss": 0.7769, + "step": 5206 + }, + { + "epoch": 0.53, + "grad_norm": 1.4718150268643992, + "learning_rate": 9.525776350314486e-06, + "loss": 0.6668, + "step": 5207 + }, + { + "epoch": 0.53, + "grad_norm": 1.5758318544958667, + "learning_rate": 9.522485627229022e-06, + "loss": 0.7326, + "step": 5208 + }, + { + "epoch": 0.53, + "grad_norm": 1.6178121744079512, + "learning_rate": 9.519194955970275e-06, + "loss": 0.7513, + "step": 5209 + }, + { + "epoch": 0.53, + "grad_norm": 1.5169507449367376, + "learning_rate": 9.515904336895395e-06, + "loss": 0.7525, + "step": 5210 + }, + { + "epoch": 0.53, + "grad_norm": 1.4632947803615415, + "learning_rate": 9.512613770361532e-06, + "loss": 0.6774, + "step": 5211 + }, + { + "epoch": 0.53, + "grad_norm": 1.5564286603593986, + "learning_rate": 9.50932325672582e-06, + "loss": 0.7545, + "step": 5212 + }, + { + "epoch": 0.53, + "grad_norm": 1.4108790343983597, + "learning_rate": 9.506032796345394e-06, + "loss": 0.6761, + "step": 5213 + }, + { + "epoch": 0.53, + "grad_norm": 1.4547297896795017, + "learning_rate": 9.50274238957739e-06, + "loss": 0.7189, + "step": 5214 + }, + { + "epoch": 0.53, + "grad_norm": 1.6395134721925821, + "learning_rate": 9.499452036778917e-06, + "loss": 0.6663, + "step": 5215 + }, + { + "epoch": 0.53, + "grad_norm": 1.4346935198396273, + "learning_rate": 9.4961617383071e-06, + "loss": 0.5815, + "step": 5216 + }, + { + "epoch": 0.53, + "grad_norm": 1.5006255850588064, + "learning_rate": 9.492871494519047e-06, + "loss": 0.7279, + "step": 5217 + }, + { + "epoch": 0.53, + "grad_norm": 1.4022924331040245, + "learning_rate": 9.48958130577186e-06, + "loss": 0.6135, + "step": 5218 + }, + { + "epoch": 0.53, + "grad_norm": 1.7138990478244631, + "learning_rate": 9.486291172422646e-06, + "loss": 0.8144, + "step": 5219 + }, + { + "epoch": 0.53, + "grad_norm": 1.6216852067769545, + "learning_rate": 9.483001094828486e-06, + "loss": 0.7125, + "step": 5220 + }, + { + "epoch": 0.53, + "grad_norm": 1.5811436084819726, + "learning_rate": 9.479711073346477e-06, + "loss": 0.605, + "step": 5221 + }, + { + "epoch": 0.53, + "grad_norm": 1.5977073093399896, + "learning_rate": 9.476421108333692e-06, + "loss": 0.6632, + "step": 5222 + }, + { + "epoch": 0.53, + "grad_norm": 1.6516495394418902, + "learning_rate": 9.473131200147205e-06, + "loss": 0.6504, + "step": 5223 + }, + { + "epoch": 0.53, + "grad_norm": 1.4696334684470354, + "learning_rate": 9.469841349144092e-06, + "loss": 0.6536, + "step": 5224 + }, + { + "epoch": 0.53, + "grad_norm": 1.398855086381696, + "learning_rate": 9.466551555681406e-06, + "loss": 0.7457, + "step": 5225 + }, + { + "epoch": 0.53, + "grad_norm": 1.7310787645704016, + "learning_rate": 9.463261820116207e-06, + "loss": 0.7888, + "step": 5226 + }, + { + "epoch": 0.53, + "grad_norm": 1.4237314465626654, + "learning_rate": 9.459972142805546e-06, + "loss": 0.8367, + "step": 5227 + }, + { + "epoch": 0.53, + "grad_norm": 1.7246104531629485, + "learning_rate": 9.456682524106462e-06, + "loss": 0.7903, + "step": 5228 + }, + { + "epoch": 0.53, + "grad_norm": 1.5646826279558628, + "learning_rate": 9.453392964375997e-06, + "loss": 0.7519, + "step": 5229 + }, + { + "epoch": 0.53, + "grad_norm": 1.5724704042349023, + "learning_rate": 9.450103463971173e-06, + "loss": 0.6894, + "step": 5230 + }, + { + "epoch": 0.53, + "grad_norm": 1.563277249429908, + "learning_rate": 9.446814023249017e-06, + "loss": 0.7203, + "step": 5231 + }, + { + "epoch": 0.53, + "grad_norm": 1.6000393194900262, + "learning_rate": 9.443524642566555e-06, + "loss": 0.704, + "step": 5232 + }, + { + "epoch": 0.53, + "grad_norm": 1.5404602459582803, + "learning_rate": 9.440235322280787e-06, + "loss": 0.6584, + "step": 5233 + }, + { + "epoch": 0.53, + "grad_norm": 1.731041243050856, + "learning_rate": 9.436946062748723e-06, + "loss": 0.6915, + "step": 5234 + }, + { + "epoch": 0.53, + "grad_norm": 1.4755425913249085, + "learning_rate": 9.433656864327355e-06, + "loss": 0.7044, + "step": 5235 + }, + { + "epoch": 0.53, + "grad_norm": 1.630038704091692, + "learning_rate": 9.430367727373677e-06, + "loss": 0.6958, + "step": 5236 + }, + { + "epoch": 0.53, + "grad_norm": 1.5067240174520171, + "learning_rate": 9.42707865224468e-06, + "loss": 0.6853, + "step": 5237 + }, + { + "epoch": 0.53, + "grad_norm": 1.4613411878535234, + "learning_rate": 9.42378963929733e-06, + "loss": 0.7166, + "step": 5238 + }, + { + "epoch": 0.53, + "grad_norm": 1.4578198906412034, + "learning_rate": 9.42050068888861e-06, + "loss": 0.6512, + "step": 5239 + }, + { + "epoch": 0.53, + "grad_norm": 1.7000464098217933, + "learning_rate": 9.417211801375473e-06, + "loss": 0.8523, + "step": 5240 + }, + { + "epoch": 0.53, + "grad_norm": 1.4453794950143881, + "learning_rate": 9.41392297711488e-06, + "loss": 0.7007, + "step": 5241 + }, + { + "epoch": 0.53, + "grad_norm": 1.7221610086703045, + "learning_rate": 9.410634216463788e-06, + "loss": 0.7637, + "step": 5242 + }, + { + "epoch": 0.53, + "grad_norm": 1.523370062055753, + "learning_rate": 9.407345519779129e-06, + "loss": 0.6814, + "step": 5243 + }, + { + "epoch": 0.53, + "grad_norm": 1.608673973036898, + "learning_rate": 9.404056887417847e-06, + "loss": 0.65, + "step": 5244 + }, + { + "epoch": 0.53, + "grad_norm": 1.6451242028947322, + "learning_rate": 9.400768319736874e-06, + "loss": 0.694, + "step": 5245 + }, + { + "epoch": 0.53, + "grad_norm": 1.6038945321260987, + "learning_rate": 9.397479817093126e-06, + "loss": 0.6588, + "step": 5246 + }, + { + "epoch": 0.53, + "grad_norm": 1.6994527008054192, + "learning_rate": 9.394191379843524e-06, + "loss": 0.8255, + "step": 5247 + }, + { + "epoch": 0.53, + "grad_norm": 1.681999020364834, + "learning_rate": 9.390903008344972e-06, + "loss": 0.6967, + "step": 5248 + }, + { + "epoch": 0.53, + "grad_norm": 1.5577270594188073, + "learning_rate": 9.387614702954372e-06, + "loss": 0.6311, + "step": 5249 + }, + { + "epoch": 0.53, + "grad_norm": 1.6413677367230959, + "learning_rate": 9.384326464028622e-06, + "loss": 0.7477, + "step": 5250 + }, + { + "epoch": 0.53, + "grad_norm": 1.5642001057958752, + "learning_rate": 9.381038291924607e-06, + "loss": 0.733, + "step": 5251 + }, + { + "epoch": 0.53, + "grad_norm": 1.505431451576894, + "learning_rate": 9.377750186999203e-06, + "loss": 0.6696, + "step": 5252 + }, + { + "epoch": 0.53, + "grad_norm": 1.5609368395988965, + "learning_rate": 9.374462149609286e-06, + "loss": 0.7326, + "step": 5253 + }, + { + "epoch": 0.53, + "grad_norm": 1.4406046445318315, + "learning_rate": 9.371174180111722e-06, + "loss": 0.6684, + "step": 5254 + }, + { + "epoch": 0.53, + "grad_norm": 1.6830633570580327, + "learning_rate": 9.367886278863366e-06, + "loss": 0.7631, + "step": 5255 + }, + { + "epoch": 0.53, + "grad_norm": 1.4375200749846495, + "learning_rate": 9.364598446221068e-06, + "loss": 0.7355, + "step": 5256 + }, + { + "epoch": 0.53, + "grad_norm": 1.5440091833277783, + "learning_rate": 9.361310682541675e-06, + "loss": 0.7658, + "step": 5257 + }, + { + "epoch": 0.53, + "grad_norm": 1.352238801113515, + "learning_rate": 9.358022988182017e-06, + "loss": 0.6368, + "step": 5258 + }, + { + "epoch": 0.53, + "grad_norm": 1.5949685287350601, + "learning_rate": 9.354735363498925e-06, + "loss": 0.709, + "step": 5259 + }, + { + "epoch": 0.54, + "grad_norm": 1.4948494174337732, + "learning_rate": 9.351447808849219e-06, + "loss": 0.6661, + "step": 5260 + }, + { + "epoch": 0.54, + "grad_norm": 1.624657505673658, + "learning_rate": 9.348160324589709e-06, + "loss": 0.7397, + "step": 5261 + }, + { + "epoch": 0.54, + "grad_norm": 1.5507950151661274, + "learning_rate": 9.344872911077206e-06, + "loss": 0.7714, + "step": 5262 + }, + { + "epoch": 0.54, + "grad_norm": 1.5574701634407244, + "learning_rate": 9.341585568668501e-06, + "loss": 0.7168, + "step": 5263 + }, + { + "epoch": 0.54, + "grad_norm": 1.4886619135598378, + "learning_rate": 9.338298297720385e-06, + "loss": 0.7643, + "step": 5264 + }, + { + "epoch": 0.54, + "grad_norm": 1.48937824246935, + "learning_rate": 9.335011098589644e-06, + "loss": 0.7108, + "step": 5265 + }, + { + "epoch": 0.54, + "grad_norm": 1.5537164166555901, + "learning_rate": 9.331723971633046e-06, + "loss": 0.6894, + "step": 5266 + }, + { + "epoch": 0.54, + "grad_norm": 1.4409876623283242, + "learning_rate": 9.328436917207359e-06, + "loss": 0.6557, + "step": 5267 + }, + { + "epoch": 0.54, + "grad_norm": 1.3906556675122819, + "learning_rate": 9.325149935669347e-06, + "loss": 0.6577, + "step": 5268 + }, + { + "epoch": 0.54, + "grad_norm": 1.4896382181245778, + "learning_rate": 9.321863027375753e-06, + "loss": 0.6772, + "step": 5269 + }, + { + "epoch": 0.54, + "grad_norm": 1.5065184254475665, + "learning_rate": 9.318576192683326e-06, + "loss": 0.6624, + "step": 5270 + }, + { + "epoch": 0.54, + "grad_norm": 1.6122106573181099, + "learning_rate": 9.315289431948794e-06, + "loss": 0.7075, + "step": 5271 + }, + { + "epoch": 0.54, + "grad_norm": 1.6082453926127243, + "learning_rate": 9.312002745528885e-06, + "loss": 0.7192, + "step": 5272 + }, + { + "epoch": 0.54, + "grad_norm": 1.5519173002255817, + "learning_rate": 9.308716133780323e-06, + "loss": 0.7058, + "step": 5273 + }, + { + "epoch": 0.54, + "grad_norm": 1.5511865568301473, + "learning_rate": 9.30542959705981e-06, + "loss": 0.6688, + "step": 5274 + }, + { + "epoch": 0.54, + "grad_norm": 1.6629909624506398, + "learning_rate": 9.302143135724058e-06, + "loss": 0.722, + "step": 5275 + }, + { + "epoch": 0.54, + "grad_norm": 1.554510834086878, + "learning_rate": 9.29885675012975e-06, + "loss": 0.7647, + "step": 5276 + }, + { + "epoch": 0.54, + "grad_norm": 1.4276555624226124, + "learning_rate": 9.295570440633577e-06, + "loss": 0.6158, + "step": 5277 + }, + { + "epoch": 0.54, + "grad_norm": 1.741157099895649, + "learning_rate": 9.29228420759222e-06, + "loss": 0.6786, + "step": 5278 + }, + { + "epoch": 0.54, + "grad_norm": 1.619832980822339, + "learning_rate": 9.288998051362343e-06, + "loss": 0.7473, + "step": 5279 + }, + { + "epoch": 0.54, + "grad_norm": 1.3846762393519525, + "learning_rate": 9.285711972300607e-06, + "loss": 0.5589, + "step": 5280 + }, + { + "epoch": 0.54, + "grad_norm": 1.494543803188233, + "learning_rate": 9.28242597076367e-06, + "loss": 0.6714, + "step": 5281 + }, + { + "epoch": 0.54, + "grad_norm": 1.54481372971172, + "learning_rate": 9.27914004710817e-06, + "loss": 0.716, + "step": 5282 + }, + { + "epoch": 0.54, + "grad_norm": 1.5701962361404953, + "learning_rate": 9.275854201690748e-06, + "loss": 0.7484, + "step": 5283 + }, + { + "epoch": 0.54, + "grad_norm": 1.564642596041506, + "learning_rate": 9.272568434868024e-06, + "loss": 0.6589, + "step": 5284 + }, + { + "epoch": 0.54, + "grad_norm": 1.5127112390691917, + "learning_rate": 9.26928274699662e-06, + "loss": 0.692, + "step": 5285 + }, + { + "epoch": 0.54, + "grad_norm": 1.6438483898007703, + "learning_rate": 9.265997138433152e-06, + "loss": 0.7135, + "step": 5286 + }, + { + "epoch": 0.54, + "grad_norm": 1.418691263372242, + "learning_rate": 9.262711609534211e-06, + "loss": 0.6108, + "step": 5287 + }, + { + "epoch": 0.54, + "grad_norm": 1.4353502362260733, + "learning_rate": 9.2594261606564e-06, + "loss": 0.6181, + "step": 5288 + }, + { + "epoch": 0.54, + "grad_norm": 1.5773405361749253, + "learning_rate": 9.256140792156292e-06, + "loss": 0.6366, + "step": 5289 + }, + { + "epoch": 0.54, + "grad_norm": 1.740700222032807, + "learning_rate": 9.25285550439047e-06, + "loss": 0.713, + "step": 5290 + }, + { + "epoch": 0.54, + "grad_norm": 1.5815623965705798, + "learning_rate": 9.249570297715503e-06, + "loss": 0.6745, + "step": 5291 + }, + { + "epoch": 0.54, + "grad_norm": 1.7360353904593255, + "learning_rate": 9.24628517248794e-06, + "loss": 0.7386, + "step": 5292 + }, + { + "epoch": 0.54, + "grad_norm": 1.600154153236615, + "learning_rate": 9.243000129064339e-06, + "loss": 0.7301, + "step": 5293 + }, + { + "epoch": 0.54, + "grad_norm": 1.6322114591649122, + "learning_rate": 9.239715167801232e-06, + "loss": 0.6683, + "step": 5294 + }, + { + "epoch": 0.54, + "grad_norm": 1.5597922373861357, + "learning_rate": 9.236430289055154e-06, + "loss": 0.7467, + "step": 5295 + }, + { + "epoch": 0.54, + "grad_norm": 1.5373445772916388, + "learning_rate": 9.23314549318263e-06, + "loss": 0.7699, + "step": 5296 + }, + { + "epoch": 0.54, + "grad_norm": 1.480216056117163, + "learning_rate": 9.229860780540168e-06, + "loss": 0.7339, + "step": 5297 + }, + { + "epoch": 0.54, + "grad_norm": 1.6150409063359652, + "learning_rate": 9.226576151484274e-06, + "loss": 0.7056, + "step": 5298 + }, + { + "epoch": 0.54, + "grad_norm": 1.5014317259723393, + "learning_rate": 9.223291606371448e-06, + "loss": 0.7125, + "step": 5299 + }, + { + "epoch": 0.54, + "grad_norm": 1.4923615606329292, + "learning_rate": 9.220007145558169e-06, + "loss": 0.6393, + "step": 5300 + }, + { + "epoch": 0.54, + "grad_norm": 1.5687143048914496, + "learning_rate": 9.216722769400917e-06, + "loss": 0.6065, + "step": 5301 + }, + { + "epoch": 0.54, + "grad_norm": 1.5118993852348217, + "learning_rate": 9.213438478256159e-06, + "loss": 0.6034, + "step": 5302 + }, + { + "epoch": 0.54, + "grad_norm": 1.4038497103453222, + "learning_rate": 9.210154272480353e-06, + "loss": 0.7255, + "step": 5303 + }, + { + "epoch": 0.54, + "grad_norm": 1.7505356088247321, + "learning_rate": 9.20687015242995e-06, + "loss": 0.6883, + "step": 5304 + }, + { + "epoch": 0.54, + "grad_norm": 1.552647846292909, + "learning_rate": 9.203586118461389e-06, + "loss": 0.6346, + "step": 5305 + }, + { + "epoch": 0.54, + "grad_norm": 1.7016995825452594, + "learning_rate": 9.2003021709311e-06, + "loss": 0.6711, + "step": 5306 + }, + { + "epoch": 0.54, + "grad_norm": 1.7047703184824687, + "learning_rate": 9.197018310195503e-06, + "loss": 0.7431, + "step": 5307 + }, + { + "epoch": 0.54, + "grad_norm": 1.6009774451727634, + "learning_rate": 9.193734536611013e-06, + "loss": 0.753, + "step": 5308 + }, + { + "epoch": 0.54, + "grad_norm": 1.5727984936944126, + "learning_rate": 9.19045085053403e-06, + "loss": 0.6955, + "step": 5309 + }, + { + "epoch": 0.54, + "grad_norm": 1.5171645830514315, + "learning_rate": 9.18716725232095e-06, + "loss": 0.6256, + "step": 5310 + }, + { + "epoch": 0.54, + "grad_norm": 1.5013686068549332, + "learning_rate": 9.18388374232815e-06, + "loss": 0.7419, + "step": 5311 + }, + { + "epoch": 0.54, + "grad_norm": 1.5452514446664825, + "learning_rate": 9.180600320912006e-06, + "loss": 0.8011, + "step": 5312 + }, + { + "epoch": 0.54, + "grad_norm": 1.6779607379171213, + "learning_rate": 9.177316988428885e-06, + "loss": 0.7006, + "step": 5313 + }, + { + "epoch": 0.54, + "grad_norm": 1.5460751326931226, + "learning_rate": 9.174033745235141e-06, + "loss": 0.726, + "step": 5314 + }, + { + "epoch": 0.54, + "grad_norm": 1.4406243469055928, + "learning_rate": 9.170750591687114e-06, + "loss": 0.7465, + "step": 5315 + }, + { + "epoch": 0.54, + "grad_norm": 1.6337337025110672, + "learning_rate": 9.167467528141144e-06, + "loss": 0.7255, + "step": 5316 + }, + { + "epoch": 0.54, + "grad_norm": 1.5467283757292964, + "learning_rate": 9.164184554953557e-06, + "loss": 0.7321, + "step": 5317 + }, + { + "epoch": 0.54, + "grad_norm": 1.682046371729968, + "learning_rate": 9.160901672480662e-06, + "loss": 0.6444, + "step": 5318 + }, + { + "epoch": 0.54, + "grad_norm": 1.488118046274293, + "learning_rate": 9.157618881078772e-06, + "loss": 0.713, + "step": 5319 + }, + { + "epoch": 0.54, + "grad_norm": 1.7079121163396311, + "learning_rate": 9.154336181104175e-06, + "loss": 0.6949, + "step": 5320 + }, + { + "epoch": 0.54, + "grad_norm": 1.5068370655989956, + "learning_rate": 9.151053572913163e-06, + "loss": 0.7095, + "step": 5321 + }, + { + "epoch": 0.54, + "grad_norm": 1.7707562315922247, + "learning_rate": 9.147771056862011e-06, + "loss": 0.7966, + "step": 5322 + }, + { + "epoch": 0.54, + "grad_norm": 1.6133590922465229, + "learning_rate": 9.144488633306981e-06, + "loss": 0.6659, + "step": 5323 + }, + { + "epoch": 0.54, + "grad_norm": 1.4545510638629, + "learning_rate": 9.141206302604336e-06, + "loss": 0.7483, + "step": 5324 + }, + { + "epoch": 0.54, + "grad_norm": 1.655522474567456, + "learning_rate": 9.13792406511031e-06, + "loss": 0.76, + "step": 5325 + }, + { + "epoch": 0.54, + "grad_norm": 1.3576656692724869, + "learning_rate": 9.134641921181147e-06, + "loss": 0.6448, + "step": 5326 + }, + { + "epoch": 0.54, + "grad_norm": 1.7787458227500046, + "learning_rate": 9.131359871173074e-06, + "loss": 0.7244, + "step": 5327 + }, + { + "epoch": 0.54, + "grad_norm": 1.6178083657835598, + "learning_rate": 9.1280779154423e-06, + "loss": 0.6684, + "step": 5328 + }, + { + "epoch": 0.54, + "grad_norm": 1.7010524693762994, + "learning_rate": 9.124796054345036e-06, + "loss": 0.7417, + "step": 5329 + }, + { + "epoch": 0.54, + "grad_norm": 1.5360912007131207, + "learning_rate": 9.12151428823747e-06, + "loss": 0.7682, + "step": 5330 + }, + { + "epoch": 0.54, + "grad_norm": 1.7481181530838306, + "learning_rate": 9.118232617475791e-06, + "loss": 0.7718, + "step": 5331 + }, + { + "epoch": 0.54, + "grad_norm": 1.5760176359497586, + "learning_rate": 9.114951042416175e-06, + "loss": 0.6897, + "step": 5332 + }, + { + "epoch": 0.54, + "grad_norm": 1.4532079356845804, + "learning_rate": 9.11166956341478e-06, + "loss": 0.7062, + "step": 5333 + }, + { + "epoch": 0.54, + "grad_norm": 1.5295195035357474, + "learning_rate": 9.10838818082776e-06, + "loss": 0.6727, + "step": 5334 + }, + { + "epoch": 0.54, + "grad_norm": 1.502411837379029, + "learning_rate": 9.105106895011263e-06, + "loss": 0.6246, + "step": 5335 + }, + { + "epoch": 0.54, + "grad_norm": 1.6815569575078204, + "learning_rate": 9.101825706321417e-06, + "loss": 0.6468, + "step": 5336 + }, + { + "epoch": 0.54, + "grad_norm": 1.6571536250194325, + "learning_rate": 9.098544615114347e-06, + "loss": 0.7936, + "step": 5337 + }, + { + "epoch": 0.54, + "grad_norm": 1.5541916532325835, + "learning_rate": 9.095263621746159e-06, + "loss": 0.7494, + "step": 5338 + }, + { + "epoch": 0.54, + "grad_norm": 1.5773470344567357, + "learning_rate": 9.091982726572956e-06, + "loss": 0.7626, + "step": 5339 + }, + { + "epoch": 0.54, + "grad_norm": 1.7006352160254827, + "learning_rate": 9.088701929950831e-06, + "loss": 0.6884, + "step": 5340 + }, + { + "epoch": 0.54, + "grad_norm": 1.5057538412071996, + "learning_rate": 9.085421232235857e-06, + "loss": 0.5913, + "step": 5341 + }, + { + "epoch": 0.54, + "grad_norm": 1.681665813385068, + "learning_rate": 9.08214063378411e-06, + "loss": 0.8246, + "step": 5342 + }, + { + "epoch": 0.54, + "grad_norm": 1.616750287402683, + "learning_rate": 9.07886013495164e-06, + "loss": 0.6381, + "step": 5343 + }, + { + "epoch": 0.54, + "grad_norm": 1.5931263201795647, + "learning_rate": 9.075579736094497e-06, + "loss": 0.6813, + "step": 5344 + }, + { + "epoch": 0.54, + "grad_norm": 1.5711422910918862, + "learning_rate": 9.072299437568722e-06, + "loss": 0.7396, + "step": 5345 + }, + { + "epoch": 0.54, + "grad_norm": 1.4722021442670385, + "learning_rate": 9.069019239730329e-06, + "loss": 0.7187, + "step": 5346 + }, + { + "epoch": 0.54, + "grad_norm": 1.684898911516207, + "learning_rate": 9.065739142935345e-06, + "loss": 0.784, + "step": 5347 + }, + { + "epoch": 0.54, + "grad_norm": 1.5649833582235908, + "learning_rate": 9.062459147539761e-06, + "loss": 0.7162, + "step": 5348 + }, + { + "epoch": 0.54, + "grad_norm": 1.70568458464764, + "learning_rate": 9.059179253899577e-06, + "loss": 0.6717, + "step": 5349 + }, + { + "epoch": 0.54, + "grad_norm": 1.516662532354538, + "learning_rate": 9.055899462370776e-06, + "loss": 0.6827, + "step": 5350 + }, + { + "epoch": 0.54, + "grad_norm": 1.527811050777723, + "learning_rate": 9.052619773309318e-06, + "loss": 0.715, + "step": 5351 + }, + { + "epoch": 0.54, + "grad_norm": 1.394315245520379, + "learning_rate": 9.04934018707117e-06, + "loss": 0.6611, + "step": 5352 + }, + { + "epoch": 0.54, + "grad_norm": 1.669341562664938, + "learning_rate": 9.046060704012281e-06, + "loss": 0.7238, + "step": 5353 + }, + { + "epoch": 0.54, + "grad_norm": 1.4150418929513942, + "learning_rate": 9.042781324488582e-06, + "loss": 0.6455, + "step": 5354 + }, + { + "epoch": 0.54, + "grad_norm": 1.7234502937842522, + "learning_rate": 9.039502048856002e-06, + "loss": 0.7208, + "step": 5355 + }, + { + "epoch": 0.54, + "grad_norm": 1.4593406939103721, + "learning_rate": 9.036222877470455e-06, + "loss": 0.693, + "step": 5356 + }, + { + "epoch": 0.54, + "grad_norm": 1.7215012614015754, + "learning_rate": 9.032943810687841e-06, + "loss": 0.792, + "step": 5357 + }, + { + "epoch": 0.55, + "grad_norm": 1.6688124560243105, + "learning_rate": 9.029664848864055e-06, + "loss": 0.6758, + "step": 5358 + }, + { + "epoch": 0.55, + "grad_norm": 1.8236526409515186, + "learning_rate": 9.026385992354974e-06, + "loss": 0.7684, + "step": 5359 + }, + { + "epoch": 0.55, + "grad_norm": 1.633138913316259, + "learning_rate": 9.023107241516469e-06, + "loss": 0.7151, + "step": 5360 + }, + { + "epoch": 0.55, + "grad_norm": 1.5803702477774828, + "learning_rate": 9.019828596704394e-06, + "loss": 0.8013, + "step": 5361 + }, + { + "epoch": 0.55, + "grad_norm": 1.669277721960774, + "learning_rate": 9.016550058274599e-06, + "loss": 0.7613, + "step": 5362 + }, + { + "epoch": 0.55, + "grad_norm": 1.5690020797998012, + "learning_rate": 9.013271626582915e-06, + "loss": 0.7337, + "step": 5363 + }, + { + "epoch": 0.55, + "grad_norm": 1.41272139408782, + "learning_rate": 9.009993301985167e-06, + "loss": 0.6609, + "step": 5364 + }, + { + "epoch": 0.55, + "grad_norm": 1.468996080511503, + "learning_rate": 9.006715084837162e-06, + "loss": 0.5811, + "step": 5365 + }, + { + "epoch": 0.55, + "grad_norm": 1.5838321250978402, + "learning_rate": 9.003436975494699e-06, + "loss": 0.6922, + "step": 5366 + }, + { + "epoch": 0.55, + "grad_norm": 1.4850631249125565, + "learning_rate": 9.00015897431357e-06, + "loss": 0.6375, + "step": 5367 + }, + { + "epoch": 0.55, + "grad_norm": 1.529836227289035, + "learning_rate": 8.996881081649551e-06, + "loss": 0.629, + "step": 5368 + }, + { + "epoch": 0.55, + "grad_norm": 1.5638270705319934, + "learning_rate": 8.9936032978584e-06, + "loss": 0.636, + "step": 5369 + }, + { + "epoch": 0.55, + "grad_norm": 1.4665776471013017, + "learning_rate": 8.990325623295877e-06, + "loss": 0.7205, + "step": 5370 + }, + { + "epoch": 0.55, + "grad_norm": 1.651673457610251, + "learning_rate": 8.987048058317714e-06, + "loss": 0.7849, + "step": 5371 + }, + { + "epoch": 0.55, + "grad_norm": 1.7829197596581683, + "learning_rate": 8.983770603279642e-06, + "loss": 0.751, + "step": 5372 + }, + { + "epoch": 0.55, + "grad_norm": 1.8456608183651604, + "learning_rate": 8.980493258537384e-06, + "loss": 0.7839, + "step": 5373 + }, + { + "epoch": 0.55, + "grad_norm": 1.703926959579352, + "learning_rate": 8.977216024446636e-06, + "loss": 0.8251, + "step": 5374 + }, + { + "epoch": 0.55, + "grad_norm": 1.530470230657618, + "learning_rate": 8.973938901363094e-06, + "loss": 0.6577, + "step": 5375 + }, + { + "epoch": 0.55, + "grad_norm": 1.5809578736817425, + "learning_rate": 8.970661889642442e-06, + "loss": 0.7201, + "step": 5376 + }, + { + "epoch": 0.55, + "grad_norm": 1.4271102652547045, + "learning_rate": 8.96738498964034e-06, + "loss": 0.6792, + "step": 5377 + }, + { + "epoch": 0.55, + "grad_norm": 1.5426396797757143, + "learning_rate": 8.964108201712455e-06, + "loss": 0.6958, + "step": 5378 + }, + { + "epoch": 0.55, + "grad_norm": 1.8103751233562824, + "learning_rate": 8.96083152621442e-06, + "loss": 0.8033, + "step": 5379 + }, + { + "epoch": 0.55, + "grad_norm": 1.5761100242991692, + "learning_rate": 8.957554963501873e-06, + "loss": 0.7647, + "step": 5380 + }, + { + "epoch": 0.55, + "grad_norm": 1.5905942042313983, + "learning_rate": 8.954278513930434e-06, + "loss": 0.7884, + "step": 5381 + }, + { + "epoch": 0.55, + "grad_norm": 1.5539806558481204, + "learning_rate": 8.951002177855708e-06, + "loss": 0.7533, + "step": 5382 + }, + { + "epoch": 0.55, + "grad_norm": 1.6703501138543935, + "learning_rate": 8.947725955633294e-06, + "loss": 0.749, + "step": 5383 + }, + { + "epoch": 0.55, + "grad_norm": 1.5119616474184723, + "learning_rate": 8.944449847618768e-06, + "loss": 0.6985, + "step": 5384 + }, + { + "epoch": 0.55, + "grad_norm": 1.6205786635233608, + "learning_rate": 8.941173854167703e-06, + "loss": 0.7124, + "step": 5385 + }, + { + "epoch": 0.55, + "grad_norm": 1.5975066009074679, + "learning_rate": 8.937897975635664e-06, + "loss": 0.6692, + "step": 5386 + }, + { + "epoch": 0.55, + "grad_norm": 1.636521080970513, + "learning_rate": 8.934622212378185e-06, + "loss": 0.6629, + "step": 5387 + }, + { + "epoch": 0.55, + "grad_norm": 1.5190343789945384, + "learning_rate": 8.931346564750808e-06, + "loss": 0.651, + "step": 5388 + }, + { + "epoch": 0.55, + "grad_norm": 1.5957467566016765, + "learning_rate": 8.928071033109047e-06, + "loss": 0.7199, + "step": 5389 + }, + { + "epoch": 0.55, + "grad_norm": 1.4624387519553008, + "learning_rate": 8.924795617808411e-06, + "loss": 0.6886, + "step": 5390 + }, + { + "epoch": 0.55, + "grad_norm": 1.4511607366742325, + "learning_rate": 8.921520319204399e-06, + "loss": 0.733, + "step": 5391 + }, + { + "epoch": 0.55, + "grad_norm": 1.4985675491539114, + "learning_rate": 8.91824513765249e-06, + "loss": 0.7086, + "step": 5392 + }, + { + "epoch": 0.55, + "grad_norm": 1.5089126612633614, + "learning_rate": 8.914970073508152e-06, + "loss": 0.7444, + "step": 5393 + }, + { + "epoch": 0.55, + "grad_norm": 1.628177603423058, + "learning_rate": 8.91169512712685e-06, + "loss": 0.7051, + "step": 5394 + }, + { + "epoch": 0.55, + "grad_norm": 1.5772919443785387, + "learning_rate": 8.908420298864018e-06, + "loss": 0.7317, + "step": 5395 + }, + { + "epoch": 0.55, + "grad_norm": 1.4723112256257591, + "learning_rate": 8.905145589075098e-06, + "loss": 0.7227, + "step": 5396 + }, + { + "epoch": 0.55, + "grad_norm": 1.5801448338705688, + "learning_rate": 8.901870998115495e-06, + "loss": 0.6773, + "step": 5397 + }, + { + "epoch": 0.55, + "grad_norm": 1.5860986082201733, + "learning_rate": 8.898596526340624e-06, + "loss": 0.7343, + "step": 5398 + }, + { + "epoch": 0.55, + "grad_norm": 1.6039832133572458, + "learning_rate": 8.895322174105882e-06, + "loss": 0.805, + "step": 5399 + }, + { + "epoch": 0.55, + "grad_norm": 1.6155249124855937, + "learning_rate": 8.892047941766636e-06, + "loss": 0.7111, + "step": 5400 + }, + { + "epoch": 0.55, + "grad_norm": 1.6067471312551125, + "learning_rate": 8.888773829678265e-06, + "loss": 0.7244, + "step": 5401 + }, + { + "epoch": 0.55, + "grad_norm": 1.619332799189476, + "learning_rate": 8.885499838196114e-06, + "loss": 0.765, + "step": 5402 + }, + { + "epoch": 0.55, + "grad_norm": 1.6699781838918835, + "learning_rate": 8.882225967675526e-06, + "loss": 0.7806, + "step": 5403 + }, + { + "epoch": 0.55, + "grad_norm": 1.6180503220313573, + "learning_rate": 8.878952218471833e-06, + "loss": 0.7772, + "step": 5404 + }, + { + "epoch": 0.55, + "grad_norm": 1.7606649996624608, + "learning_rate": 8.87567859094034e-06, + "loss": 0.7342, + "step": 5405 + }, + { + "epoch": 0.55, + "grad_norm": 1.6189646431863196, + "learning_rate": 8.872405085436356e-06, + "loss": 0.6012, + "step": 5406 + }, + { + "epoch": 0.55, + "grad_norm": 1.6365844250183907, + "learning_rate": 8.869131702315169e-06, + "loss": 0.7184, + "step": 5407 + }, + { + "epoch": 0.55, + "grad_norm": 1.5492476290084674, + "learning_rate": 8.865858441932047e-06, + "loss": 0.7093, + "step": 5408 + }, + { + "epoch": 0.55, + "grad_norm": 1.5731924035028098, + "learning_rate": 8.862585304642257e-06, + "loss": 0.7088, + "step": 5409 + }, + { + "epoch": 0.55, + "grad_norm": 1.5326160778305695, + "learning_rate": 8.859312290801044e-06, + "loss": 0.6615, + "step": 5410 + }, + { + "epoch": 0.55, + "grad_norm": 1.685641814822192, + "learning_rate": 8.856039400763644e-06, + "loss": 0.6843, + "step": 5411 + }, + { + "epoch": 0.55, + "grad_norm": 1.6687141835760049, + "learning_rate": 8.852766634885277e-06, + "loss": 0.7037, + "step": 5412 + }, + { + "epoch": 0.55, + "grad_norm": 1.5741605632580782, + "learning_rate": 8.849493993521153e-06, + "loss": 0.6725, + "step": 5413 + }, + { + "epoch": 0.55, + "grad_norm": 1.5848257648940565, + "learning_rate": 8.846221477026463e-06, + "loss": 0.7393, + "step": 5414 + }, + { + "epoch": 0.55, + "grad_norm": 1.4946988393052152, + "learning_rate": 8.842949085756389e-06, + "loss": 0.7276, + "step": 5415 + }, + { + "epoch": 0.55, + "grad_norm": 1.633398789443463, + "learning_rate": 8.839676820066095e-06, + "loss": 0.7075, + "step": 5416 + }, + { + "epoch": 0.55, + "grad_norm": 1.5064528057849553, + "learning_rate": 8.836404680310739e-06, + "loss": 0.7695, + "step": 5417 + }, + { + "epoch": 0.55, + "grad_norm": 1.6169451224176257, + "learning_rate": 8.833132666845459e-06, + "loss": 0.7448, + "step": 5418 + }, + { + "epoch": 0.55, + "grad_norm": 1.5249218319990228, + "learning_rate": 8.82986078002538e-06, + "loss": 0.6317, + "step": 5419 + }, + { + "epoch": 0.55, + "grad_norm": 1.6626818643466355, + "learning_rate": 8.826589020205611e-06, + "loss": 0.6781, + "step": 5420 + }, + { + "epoch": 0.55, + "grad_norm": 1.5977691718018296, + "learning_rate": 8.823317387741258e-06, + "loss": 0.7576, + "step": 5421 + }, + { + "epoch": 0.55, + "grad_norm": 1.5462585411188319, + "learning_rate": 8.8200458829874e-06, + "loss": 0.8659, + "step": 5422 + }, + { + "epoch": 0.55, + "grad_norm": 1.5190555713203076, + "learning_rate": 8.816774506299106e-06, + "loss": 0.6913, + "step": 5423 + }, + { + "epoch": 0.55, + "grad_norm": 1.6269163363674695, + "learning_rate": 8.813503258031443e-06, + "loss": 0.7462, + "step": 5424 + }, + { + "epoch": 0.55, + "grad_norm": 1.882660965495172, + "learning_rate": 8.81023213853944e-06, + "loss": 0.8512, + "step": 5425 + }, + { + "epoch": 0.55, + "grad_norm": 1.5943514236393186, + "learning_rate": 8.806961148178133e-06, + "loss": 0.678, + "step": 5426 + }, + { + "epoch": 0.55, + "grad_norm": 1.5669447477330756, + "learning_rate": 8.803690287302542e-06, + "loss": 0.668, + "step": 5427 + }, + { + "epoch": 0.55, + "grad_norm": 1.6535602382835375, + "learning_rate": 8.800419556267655e-06, + "loss": 0.7596, + "step": 5428 + }, + { + "epoch": 0.55, + "grad_norm": 1.6308465952597497, + "learning_rate": 8.797148955428467e-06, + "loss": 0.7934, + "step": 5429 + }, + { + "epoch": 0.55, + "grad_norm": 1.408309632362028, + "learning_rate": 8.793878485139954e-06, + "loss": 0.6747, + "step": 5430 + }, + { + "epoch": 0.55, + "grad_norm": 1.6085099512031618, + "learning_rate": 8.790608145757066e-06, + "loss": 0.5726, + "step": 5431 + }, + { + "epoch": 0.55, + "grad_norm": 1.5522606137544397, + "learning_rate": 8.787337937634755e-06, + "loss": 0.7439, + "step": 5432 + }, + { + "epoch": 0.55, + "grad_norm": 1.5890313434061458, + "learning_rate": 8.784067861127942e-06, + "loss": 0.7395, + "step": 5433 + }, + { + "epoch": 0.55, + "grad_norm": 1.540243937101572, + "learning_rate": 8.780797916591548e-06, + "loss": 0.708, + "step": 5434 + }, + { + "epoch": 0.55, + "grad_norm": 1.6690409088302076, + "learning_rate": 8.777528104380478e-06, + "loss": 0.7222, + "step": 5435 + }, + { + "epoch": 0.55, + "grad_norm": 1.5706857558719922, + "learning_rate": 8.77425842484961e-06, + "loss": 0.786, + "step": 5436 + }, + { + "epoch": 0.55, + "grad_norm": 1.4738961397397348, + "learning_rate": 8.770988878353827e-06, + "loss": 0.6863, + "step": 5437 + }, + { + "epoch": 0.55, + "grad_norm": 1.49909854563047, + "learning_rate": 8.767719465247979e-06, + "loss": 0.6873, + "step": 5438 + }, + { + "epoch": 0.55, + "grad_norm": 1.7050766612344421, + "learning_rate": 8.76445018588691e-06, + "loss": 0.7244, + "step": 5439 + }, + { + "epoch": 0.55, + "grad_norm": 1.5630758956722233, + "learning_rate": 8.761181040625457e-06, + "loss": 0.6779, + "step": 5440 + }, + { + "epoch": 0.55, + "grad_norm": 1.5859976116499224, + "learning_rate": 8.757912029818424e-06, + "loss": 0.7304, + "step": 5441 + }, + { + "epoch": 0.55, + "grad_norm": 1.6690118530119127, + "learning_rate": 8.75464315382062e-06, + "loss": 0.6904, + "step": 5442 + }, + { + "epoch": 0.55, + "grad_norm": 1.576408966950522, + "learning_rate": 8.751374412986822e-06, + "loss": 0.8189, + "step": 5443 + }, + { + "epoch": 0.55, + "grad_norm": 1.6019837359423332, + "learning_rate": 8.748105807671806e-06, + "loss": 0.7219, + "step": 5444 + }, + { + "epoch": 0.55, + "grad_norm": 1.8202596783529816, + "learning_rate": 8.744837338230331e-06, + "loss": 0.8334, + "step": 5445 + }, + { + "epoch": 0.55, + "grad_norm": 1.5460218928791967, + "learning_rate": 8.74156900501713e-06, + "loss": 0.8658, + "step": 5446 + }, + { + "epoch": 0.55, + "grad_norm": 1.6092606350579812, + "learning_rate": 8.738300808386934e-06, + "loss": 0.7317, + "step": 5447 + }, + { + "epoch": 0.55, + "grad_norm": 1.6053711156944777, + "learning_rate": 8.73503274869446e-06, + "loss": 0.6881, + "step": 5448 + }, + { + "epoch": 0.55, + "grad_norm": 1.6755703755220746, + "learning_rate": 8.731764826294394e-06, + "loss": 0.7608, + "step": 5449 + }, + { + "epoch": 0.55, + "grad_norm": 1.61294871243576, + "learning_rate": 8.728497041541426e-06, + "loss": 0.6088, + "step": 5450 + }, + { + "epoch": 0.55, + "grad_norm": 1.4898392329846786, + "learning_rate": 8.725229394790217e-06, + "loss": 0.6994, + "step": 5451 + }, + { + "epoch": 0.55, + "grad_norm": 1.5406794279721598, + "learning_rate": 8.721961886395423e-06, + "loss": 0.6649, + "step": 5452 + }, + { + "epoch": 0.55, + "grad_norm": 1.5276423066659235, + "learning_rate": 8.718694516711684e-06, + "loss": 0.7596, + "step": 5453 + }, + { + "epoch": 0.55, + "grad_norm": 1.555067050858641, + "learning_rate": 8.715427286093611e-06, + "loss": 0.6365, + "step": 5454 + }, + { + "epoch": 0.55, + "grad_norm": 1.4073917884142084, + "learning_rate": 8.712160194895825e-06, + "loss": 0.5868, + "step": 5455 + }, + { + "epoch": 0.55, + "grad_norm": 1.5977461991934643, + "learning_rate": 8.708893243472905e-06, + "loss": 0.6963, + "step": 5456 + }, + { + "epoch": 0.56, + "grad_norm": 1.6408899699791568, + "learning_rate": 8.705626432179432e-06, + "loss": 0.6912, + "step": 5457 + }, + { + "epoch": 0.56, + "grad_norm": 1.5462815026322818, + "learning_rate": 8.702359761369972e-06, + "loss": 0.7436, + "step": 5458 + }, + { + "epoch": 0.56, + "grad_norm": 1.5104521935408541, + "learning_rate": 8.699093231399064e-06, + "loss": 0.726, + "step": 5459 + }, + { + "epoch": 0.56, + "grad_norm": 1.5145792617441518, + "learning_rate": 8.695826842621244e-06, + "loss": 0.6668, + "step": 5460 + }, + { + "epoch": 0.56, + "grad_norm": 1.5567167872130345, + "learning_rate": 8.692560595391023e-06, + "loss": 0.6974, + "step": 5461 + }, + { + "epoch": 0.56, + "grad_norm": 1.535631369442896, + "learning_rate": 8.689294490062906e-06, + "loss": 0.7519, + "step": 5462 + }, + { + "epoch": 0.56, + "grad_norm": 1.525000280601043, + "learning_rate": 8.686028526991373e-06, + "loss": 0.6805, + "step": 5463 + }, + { + "epoch": 0.56, + "grad_norm": 1.3676512965755294, + "learning_rate": 8.682762706530897e-06, + "loss": 0.6684, + "step": 5464 + }, + { + "epoch": 0.56, + "grad_norm": 1.4824849114578038, + "learning_rate": 8.679497029035926e-06, + "loss": 0.7436, + "step": 5465 + }, + { + "epoch": 0.56, + "grad_norm": 1.6201369875081661, + "learning_rate": 8.676231494860907e-06, + "loss": 0.8019, + "step": 5466 + }, + { + "epoch": 0.56, + "grad_norm": 1.5794700859428037, + "learning_rate": 8.672966104360255e-06, + "loss": 0.6173, + "step": 5467 + }, + { + "epoch": 0.56, + "grad_norm": 1.588515557807471, + "learning_rate": 8.669700857888382e-06, + "loss": 0.7549, + "step": 5468 + }, + { + "epoch": 0.56, + "grad_norm": 1.4362244278634058, + "learning_rate": 8.666435755799675e-06, + "loss": 0.6177, + "step": 5469 + }, + { + "epoch": 0.56, + "grad_norm": 1.4553152841062662, + "learning_rate": 8.663170798448511e-06, + "loss": 0.886, + "step": 5470 + }, + { + "epoch": 0.56, + "grad_norm": 1.5276721104428754, + "learning_rate": 8.659905986189254e-06, + "loss": 0.6391, + "step": 5471 + }, + { + "epoch": 0.56, + "grad_norm": 1.3626636058873272, + "learning_rate": 8.656641319376244e-06, + "loss": 0.6458, + "step": 5472 + }, + { + "epoch": 0.56, + "grad_norm": 1.7623374037769408, + "learning_rate": 8.653376798363813e-06, + "loss": 0.7789, + "step": 5473 + }, + { + "epoch": 0.56, + "grad_norm": 1.358068460640926, + "learning_rate": 8.650112423506268e-06, + "loss": 0.6103, + "step": 5474 + }, + { + "epoch": 0.56, + "grad_norm": 1.6080385386359963, + "learning_rate": 8.646848195157914e-06, + "loss": 0.7079, + "step": 5475 + }, + { + "epoch": 0.56, + "grad_norm": 1.4797588538693942, + "learning_rate": 8.643584113673025e-06, + "loss": 0.5676, + "step": 5476 + }, + { + "epoch": 0.56, + "grad_norm": 1.5675290842063057, + "learning_rate": 8.640320179405867e-06, + "loss": 0.6131, + "step": 5477 + }, + { + "epoch": 0.56, + "grad_norm": 1.5496226897761918, + "learning_rate": 8.637056392710695e-06, + "loss": 0.5674, + "step": 5478 + }, + { + "epoch": 0.56, + "grad_norm": 1.5906722506085222, + "learning_rate": 8.633792753941733e-06, + "loss": 0.7608, + "step": 5479 + }, + { + "epoch": 0.56, + "grad_norm": 1.5845995315266523, + "learning_rate": 8.630529263453202e-06, + "loss": 0.8818, + "step": 5480 + }, + { + "epoch": 0.56, + "grad_norm": 1.5647801024812311, + "learning_rate": 8.62726592159931e-06, + "loss": 0.7297, + "step": 5481 + }, + { + "epoch": 0.56, + "grad_norm": 1.5349042911362298, + "learning_rate": 8.624002728734228e-06, + "loss": 0.6783, + "step": 5482 + }, + { + "epoch": 0.56, + "grad_norm": 1.5358799651119135, + "learning_rate": 8.620739685212133e-06, + "loss": 0.6789, + "step": 5483 + }, + { + "epoch": 0.56, + "grad_norm": 1.6279261350508096, + "learning_rate": 8.61747679138718e-06, + "loss": 0.8028, + "step": 5484 + }, + { + "epoch": 0.56, + "grad_norm": 1.3911315916545854, + "learning_rate": 8.614214047613497e-06, + "loss": 0.6441, + "step": 5485 + }, + { + "epoch": 0.56, + "grad_norm": 1.638391124770247, + "learning_rate": 8.610951454245211e-06, + "loss": 0.715, + "step": 5486 + }, + { + "epoch": 0.56, + "grad_norm": 1.4050757193555106, + "learning_rate": 8.60768901163642e-06, + "loss": 0.6294, + "step": 5487 + }, + { + "epoch": 0.56, + "grad_norm": 1.4292014571964684, + "learning_rate": 8.604426720141211e-06, + "loss": 0.6052, + "step": 5488 + }, + { + "epoch": 0.56, + "grad_norm": 1.5718555849430869, + "learning_rate": 8.601164580113663e-06, + "loss": 0.6714, + "step": 5489 + }, + { + "epoch": 0.56, + "grad_norm": 1.6533540016966977, + "learning_rate": 8.597902591907821e-06, + "loss": 0.7301, + "step": 5490 + }, + { + "epoch": 0.56, + "grad_norm": 1.7720517033439591, + "learning_rate": 8.594640755877728e-06, + "loss": 0.7721, + "step": 5491 + }, + { + "epoch": 0.56, + "grad_norm": 1.5961720840389233, + "learning_rate": 8.591379072377401e-06, + "loss": 0.6951, + "step": 5492 + }, + { + "epoch": 0.56, + "grad_norm": 1.8415406374214263, + "learning_rate": 8.588117541760848e-06, + "loss": 0.8032, + "step": 5493 + }, + { + "epoch": 0.56, + "grad_norm": 1.3633959194012253, + "learning_rate": 8.58485616438206e-06, + "loss": 0.6743, + "step": 5494 + }, + { + "epoch": 0.56, + "grad_norm": 1.512968423653738, + "learning_rate": 8.581594940595002e-06, + "loss": 0.7512, + "step": 5495 + }, + { + "epoch": 0.56, + "grad_norm": 1.4542667920517232, + "learning_rate": 8.578333870753635e-06, + "loss": 0.7837, + "step": 5496 + }, + { + "epoch": 0.56, + "grad_norm": 1.6243586978939764, + "learning_rate": 8.575072955211888e-06, + "loss": 0.6827, + "step": 5497 + }, + { + "epoch": 0.56, + "grad_norm": 1.5343117176073966, + "learning_rate": 8.57181219432369e-06, + "loss": 0.6556, + "step": 5498 + }, + { + "epoch": 0.56, + "grad_norm": 1.5582736131421835, + "learning_rate": 8.568551588442949e-06, + "loss": 0.7319, + "step": 5499 + }, + { + "epoch": 0.56, + "grad_norm": 1.5461179064783763, + "learning_rate": 8.565291137923542e-06, + "loss": 0.7341, + "step": 5500 + }, + { + "epoch": 0.56, + "grad_norm": 1.7157349604771854, + "learning_rate": 8.562030843119346e-06, + "loss": 0.7225, + "step": 5501 + }, + { + "epoch": 0.56, + "grad_norm": 1.7871742129365662, + "learning_rate": 8.55877070438422e-06, + "loss": 0.7229, + "step": 5502 + }, + { + "epoch": 0.56, + "grad_norm": 1.9183352542015495, + "learning_rate": 8.55551072207199e-06, + "loss": 0.7484, + "step": 5503 + }, + { + "epoch": 0.56, + "grad_norm": 1.464447201539831, + "learning_rate": 8.552250896536488e-06, + "loss": 0.7543, + "step": 5504 + }, + { + "epoch": 0.56, + "grad_norm": 1.8243480915817782, + "learning_rate": 8.548991228131503e-06, + "loss": 0.8253, + "step": 5505 + }, + { + "epoch": 0.56, + "grad_norm": 1.601641158363978, + "learning_rate": 8.545731717210832e-06, + "loss": 0.6864, + "step": 5506 + }, + { + "epoch": 0.56, + "grad_norm": 1.6520512376146232, + "learning_rate": 8.542472364128244e-06, + "loss": 0.6393, + "step": 5507 + }, + { + "epoch": 0.56, + "grad_norm": 1.5117404534779368, + "learning_rate": 8.539213169237483e-06, + "loss": 0.7381, + "step": 5508 + }, + { + "epoch": 0.56, + "grad_norm": 1.5655502787624929, + "learning_rate": 8.535954132892295e-06, + "loss": 0.7102, + "step": 5509 + }, + { + "epoch": 0.56, + "grad_norm": 1.7211773204730685, + "learning_rate": 8.532695255446384e-06, + "loss": 0.7406, + "step": 5510 + }, + { + "epoch": 0.56, + "grad_norm": 1.6313306762180304, + "learning_rate": 8.529436537253458e-06, + "loss": 0.7661, + "step": 5511 + }, + { + "epoch": 0.56, + "grad_norm": 1.6058096645199098, + "learning_rate": 8.526177978667202e-06, + "loss": 0.7611, + "step": 5512 + }, + { + "epoch": 0.56, + "grad_norm": 1.4762936540269178, + "learning_rate": 8.522919580041276e-06, + "loss": 0.8195, + "step": 5513 + }, + { + "epoch": 0.56, + "grad_norm": 1.751430534818543, + "learning_rate": 8.519661341729333e-06, + "loss": 0.6863, + "step": 5514 + }, + { + "epoch": 0.56, + "grad_norm": 1.462329357830292, + "learning_rate": 8.516403264084998e-06, + "loss": 0.681, + "step": 5515 + }, + { + "epoch": 0.56, + "grad_norm": 1.4524260115250442, + "learning_rate": 8.51314534746189e-06, + "loss": 0.6933, + "step": 5516 + }, + { + "epoch": 0.56, + "grad_norm": 1.5756065194086915, + "learning_rate": 8.509887592213604e-06, + "loss": 0.6209, + "step": 5517 + }, + { + "epoch": 0.56, + "grad_norm": 1.7025368438615078, + "learning_rate": 8.506629998693714e-06, + "loss": 0.7609, + "step": 5518 + }, + { + "epoch": 0.56, + "grad_norm": 1.7032354266244643, + "learning_rate": 8.503372567255787e-06, + "loss": 0.7251, + "step": 5519 + }, + { + "epoch": 0.56, + "grad_norm": 1.6639787231587528, + "learning_rate": 8.500115298253361e-06, + "loss": 0.6176, + "step": 5520 + }, + { + "epoch": 0.56, + "grad_norm": 1.4583955236227215, + "learning_rate": 8.496858192039962e-06, + "loss": 0.615, + "step": 5521 + }, + { + "epoch": 0.56, + "grad_norm": 1.6337365754120245, + "learning_rate": 8.493601248969103e-06, + "loss": 0.8058, + "step": 5522 + }, + { + "epoch": 0.56, + "grad_norm": 1.7051305347035186, + "learning_rate": 8.490344469394271e-06, + "loss": 0.7225, + "step": 5523 + }, + { + "epoch": 0.56, + "grad_norm": 1.5575367248237868, + "learning_rate": 8.487087853668935e-06, + "loss": 0.63, + "step": 5524 + }, + { + "epoch": 0.56, + "grad_norm": 1.6287292527008408, + "learning_rate": 8.483831402146559e-06, + "loss": 0.6938, + "step": 5525 + }, + { + "epoch": 0.56, + "grad_norm": 1.50483887604819, + "learning_rate": 8.48057511518057e-06, + "loss": 0.6772, + "step": 5526 + }, + { + "epoch": 0.56, + "grad_norm": 1.7309490692074987, + "learning_rate": 8.477318993124393e-06, + "loss": 0.8651, + "step": 5527 + }, + { + "epoch": 0.56, + "grad_norm": 1.4125409131381073, + "learning_rate": 8.474063036331425e-06, + "loss": 0.712, + "step": 5528 + }, + { + "epoch": 0.56, + "grad_norm": 1.5606438706805952, + "learning_rate": 8.470807245155053e-06, + "loss": 0.6616, + "step": 5529 + }, + { + "epoch": 0.56, + "grad_norm": 1.4651607308021533, + "learning_rate": 8.46755161994864e-06, + "loss": 0.722, + "step": 5530 + }, + { + "epoch": 0.56, + "grad_norm": 1.5445416441462756, + "learning_rate": 8.464296161065533e-06, + "loss": 0.7322, + "step": 5531 + }, + { + "epoch": 0.56, + "grad_norm": 1.5176082048874708, + "learning_rate": 8.461040868859065e-06, + "loss": 0.628, + "step": 5532 + }, + { + "epoch": 0.56, + "grad_norm": 1.5574105642679101, + "learning_rate": 8.45778574368254e-06, + "loss": 0.7133, + "step": 5533 + }, + { + "epoch": 0.56, + "grad_norm": 1.5858107938600134, + "learning_rate": 8.454530785889256e-06, + "loss": 0.762, + "step": 5534 + }, + { + "epoch": 0.56, + "grad_norm": 1.6696786532923031, + "learning_rate": 8.45127599583249e-06, + "loss": 0.7083, + "step": 5535 + }, + { + "epoch": 0.56, + "grad_norm": 1.8353554641884566, + "learning_rate": 8.448021373865493e-06, + "loss": 0.7876, + "step": 5536 + }, + { + "epoch": 0.56, + "grad_norm": 1.5643365391993624, + "learning_rate": 8.444766920341509e-06, + "loss": 0.7114, + "step": 5537 + }, + { + "epoch": 0.56, + "grad_norm": 1.6023590781916328, + "learning_rate": 8.441512635613749e-06, + "loss": 0.6952, + "step": 5538 + }, + { + "epoch": 0.56, + "grad_norm": 1.731932561372792, + "learning_rate": 8.438258520035421e-06, + "loss": 0.7214, + "step": 5539 + }, + { + "epoch": 0.56, + "grad_norm": 1.5965376353785534, + "learning_rate": 8.435004573959714e-06, + "loss": 0.6458, + "step": 5540 + }, + { + "epoch": 0.56, + "grad_norm": 1.8887697894262163, + "learning_rate": 8.43175079773978e-06, + "loss": 0.7957, + "step": 5541 + }, + { + "epoch": 0.56, + "grad_norm": 1.5778138226547658, + "learning_rate": 8.428497191728773e-06, + "loss": 0.6641, + "step": 5542 + }, + { + "epoch": 0.56, + "grad_norm": 1.5779590248921962, + "learning_rate": 8.425243756279824e-06, + "loss": 0.7809, + "step": 5543 + }, + { + "epoch": 0.56, + "grad_norm": 1.5315990931760999, + "learning_rate": 8.421990491746036e-06, + "loss": 0.6848, + "step": 5544 + }, + { + "epoch": 0.56, + "grad_norm": 1.5889701751512135, + "learning_rate": 8.418737398480505e-06, + "loss": 0.6625, + "step": 5545 + }, + { + "epoch": 0.56, + "grad_norm": 1.6874965203363166, + "learning_rate": 8.415484476836299e-06, + "loss": 0.7285, + "step": 5546 + }, + { + "epoch": 0.56, + "grad_norm": 1.540099537700872, + "learning_rate": 8.412231727166473e-06, + "loss": 0.7185, + "step": 5547 + }, + { + "epoch": 0.56, + "grad_norm": 1.5670593760585358, + "learning_rate": 8.408979149824066e-06, + "loss": 0.749, + "step": 5548 + }, + { + "epoch": 0.56, + "grad_norm": 1.5576932693594387, + "learning_rate": 8.40572674516209e-06, + "loss": 0.5929, + "step": 5549 + }, + { + "epoch": 0.56, + "grad_norm": 1.4934320775822312, + "learning_rate": 8.402474513533547e-06, + "loss": 0.7322, + "step": 5550 + }, + { + "epoch": 0.56, + "grad_norm": 1.6562092106422215, + "learning_rate": 8.39922245529141e-06, + "loss": 0.6973, + "step": 5551 + }, + { + "epoch": 0.56, + "grad_norm": 1.6473263063260772, + "learning_rate": 8.395970570788639e-06, + "loss": 0.6905, + "step": 5552 + }, + { + "epoch": 0.56, + "grad_norm": 1.585602680375613, + "learning_rate": 8.392718860378184e-06, + "loss": 0.7053, + "step": 5553 + }, + { + "epoch": 0.56, + "grad_norm": 1.6454460458345783, + "learning_rate": 8.38946732441296e-06, + "loss": 0.7143, + "step": 5554 + }, + { + "epoch": 0.57, + "grad_norm": 1.4922249663072742, + "learning_rate": 8.386215963245873e-06, + "loss": 0.6574, + "step": 5555 + }, + { + "epoch": 0.57, + "grad_norm": 1.5801772788488986, + "learning_rate": 8.382964777229805e-06, + "loss": 0.7476, + "step": 5556 + }, + { + "epoch": 0.57, + "grad_norm": 1.5681557434886617, + "learning_rate": 8.379713766717621e-06, + "loss": 0.6844, + "step": 5557 + }, + { + "epoch": 0.57, + "grad_norm": 1.6810740575671264, + "learning_rate": 8.376462932062173e-06, + "loss": 0.6406, + "step": 5558 + }, + { + "epoch": 0.57, + "grad_norm": 1.5934027970623843, + "learning_rate": 8.373212273616281e-06, + "loss": 0.7524, + "step": 5559 + }, + { + "epoch": 0.57, + "grad_norm": 1.6566698688257904, + "learning_rate": 8.369961791732758e-06, + "loss": 0.6743, + "step": 5560 + }, + { + "epoch": 0.57, + "grad_norm": 1.6857676438146336, + "learning_rate": 8.366711486764392e-06, + "loss": 0.6712, + "step": 5561 + }, + { + "epoch": 0.57, + "grad_norm": 1.705561999111611, + "learning_rate": 8.363461359063952e-06, + "loss": 0.6882, + "step": 5562 + }, + { + "epoch": 0.57, + "grad_norm": 1.374891944612468, + "learning_rate": 8.360211408984192e-06, + "loss": 0.5937, + "step": 5563 + }, + { + "epoch": 0.57, + "grad_norm": 1.574393404851258, + "learning_rate": 8.356961636877834e-06, + "loss": 0.7417, + "step": 5564 + }, + { + "epoch": 0.57, + "grad_norm": 1.7885837197042456, + "learning_rate": 8.353712043097598e-06, + "loss": 0.7578, + "step": 5565 + }, + { + "epoch": 0.57, + "grad_norm": 1.6466916135457996, + "learning_rate": 8.350462627996177e-06, + "loss": 0.6979, + "step": 5566 + }, + { + "epoch": 0.57, + "grad_norm": 1.655448490479182, + "learning_rate": 8.34721339192624e-06, + "loss": 0.7016, + "step": 5567 + }, + { + "epoch": 0.57, + "grad_norm": 1.6885302646033078, + "learning_rate": 8.343964335240443e-06, + "loss": 0.6853, + "step": 5568 + }, + { + "epoch": 0.57, + "grad_norm": 1.6656985984382529, + "learning_rate": 8.340715458291422e-06, + "loss": 0.6549, + "step": 5569 + }, + { + "epoch": 0.57, + "grad_norm": 1.5822177752381685, + "learning_rate": 8.337466761431785e-06, + "loss": 0.7849, + "step": 5570 + }, + { + "epoch": 0.57, + "grad_norm": 1.584122032281881, + "learning_rate": 8.334218245014138e-06, + "loss": 0.772, + "step": 5571 + }, + { + "epoch": 0.57, + "grad_norm": 1.655308408893074, + "learning_rate": 8.330969909391046e-06, + "loss": 0.7676, + "step": 5572 + }, + { + "epoch": 0.57, + "grad_norm": 1.5107759446204707, + "learning_rate": 8.32772175491507e-06, + "loss": 0.7032, + "step": 5573 + }, + { + "epoch": 0.57, + "grad_norm": 1.5337381993316719, + "learning_rate": 8.324473781938749e-06, + "loss": 0.743, + "step": 5574 + }, + { + "epoch": 0.57, + "grad_norm": 1.6892896481283168, + "learning_rate": 8.321225990814592e-06, + "loss": 0.6367, + "step": 5575 + }, + { + "epoch": 0.57, + "grad_norm": 1.5491307737483462, + "learning_rate": 8.317978381895105e-06, + "loss": 0.7983, + "step": 5576 + }, + { + "epoch": 0.57, + "grad_norm": 1.6842601260701693, + "learning_rate": 8.314730955532757e-06, + "loss": 0.7657, + "step": 5577 + }, + { + "epoch": 0.57, + "grad_norm": 1.4904895744526254, + "learning_rate": 8.311483712080011e-06, + "loss": 0.6465, + "step": 5578 + }, + { + "epoch": 0.57, + "grad_norm": 1.5253575724269708, + "learning_rate": 8.308236651889304e-06, + "loss": 0.5721, + "step": 5579 + }, + { + "epoch": 0.57, + "grad_norm": 1.6042099863240251, + "learning_rate": 8.30498977531305e-06, + "loss": 0.7276, + "step": 5580 + }, + { + "epoch": 0.57, + "grad_norm": 1.497215582210722, + "learning_rate": 8.301743082703651e-06, + "loss": 0.6767, + "step": 5581 + }, + { + "epoch": 0.57, + "grad_norm": 1.5552740736267856, + "learning_rate": 8.29849657441348e-06, + "loss": 0.7603, + "step": 5582 + }, + { + "epoch": 0.57, + "grad_norm": 1.5397565487267744, + "learning_rate": 8.2952502507949e-06, + "loss": 0.6586, + "step": 5583 + }, + { + "epoch": 0.57, + "grad_norm": 1.6627728598845204, + "learning_rate": 8.292004112200245e-06, + "loss": 0.6835, + "step": 5584 + }, + { + "epoch": 0.57, + "grad_norm": 1.6729356422110861, + "learning_rate": 8.28875815898183e-06, + "loss": 0.7202, + "step": 5585 + }, + { + "epoch": 0.57, + "grad_norm": 1.5953255361916052, + "learning_rate": 8.285512391491964e-06, + "loss": 0.7248, + "step": 5586 + }, + { + "epoch": 0.57, + "grad_norm": 1.5983069282455369, + "learning_rate": 8.28226681008291e-06, + "loss": 0.6544, + "step": 5587 + }, + { + "epoch": 0.57, + "grad_norm": 1.3969342118400652, + "learning_rate": 8.27902141510693e-06, + "loss": 0.5857, + "step": 5588 + }, + { + "epoch": 0.57, + "grad_norm": 1.6125249307841638, + "learning_rate": 8.275776206916268e-06, + "loss": 0.6612, + "step": 5589 + }, + { + "epoch": 0.57, + "grad_norm": 1.6681447234561344, + "learning_rate": 8.27253118586313e-06, + "loss": 0.7215, + "step": 5590 + }, + { + "epoch": 0.57, + "grad_norm": 1.5914349153850604, + "learning_rate": 8.269286352299723e-06, + "loss": 0.6635, + "step": 5591 + }, + { + "epoch": 0.57, + "grad_norm": 1.7033247822836768, + "learning_rate": 8.266041706578212e-06, + "loss": 0.6536, + "step": 5592 + }, + { + "epoch": 0.57, + "grad_norm": 1.6737519134798131, + "learning_rate": 8.262797249050758e-06, + "loss": 0.7259, + "step": 5593 + }, + { + "epoch": 0.57, + "grad_norm": 1.6019239391867346, + "learning_rate": 8.259552980069499e-06, + "loss": 0.7038, + "step": 5594 + }, + { + "epoch": 0.57, + "grad_norm": 1.7608015334026599, + "learning_rate": 8.256308899986543e-06, + "loss": 0.5839, + "step": 5595 + }, + { + "epoch": 0.57, + "grad_norm": 1.6677107873015877, + "learning_rate": 8.253065009153988e-06, + "loss": 0.6422, + "step": 5596 + }, + { + "epoch": 0.57, + "grad_norm": 1.5685334631977799, + "learning_rate": 8.249821307923911e-06, + "loss": 0.7272, + "step": 5597 + }, + { + "epoch": 0.57, + "grad_norm": 1.4941559661351542, + "learning_rate": 8.246577796648357e-06, + "loss": 0.7019, + "step": 5598 + }, + { + "epoch": 0.57, + "grad_norm": 1.6341561834848046, + "learning_rate": 8.243334475679367e-06, + "loss": 0.7615, + "step": 5599 + }, + { + "epoch": 0.57, + "grad_norm": 1.7031917858396979, + "learning_rate": 8.240091345368944e-06, + "loss": 0.8472, + "step": 5600 + }, + { + "epoch": 0.57, + "grad_norm": 1.4470246442660457, + "learning_rate": 8.236848406069084e-06, + "loss": 0.6536, + "step": 5601 + }, + { + "epoch": 0.57, + "grad_norm": 1.7029488235864563, + "learning_rate": 8.23360565813176e-06, + "loss": 0.7819, + "step": 5602 + }, + { + "epoch": 0.57, + "grad_norm": 1.7033324038098736, + "learning_rate": 8.230363101908916e-06, + "loss": 0.7275, + "step": 5603 + }, + { + "epoch": 0.57, + "grad_norm": 1.485534242872215, + "learning_rate": 8.227120737752486e-06, + "loss": 0.6535, + "step": 5604 + }, + { + "epoch": 0.57, + "grad_norm": 1.6491114795083066, + "learning_rate": 8.223878566014372e-06, + "loss": 0.7522, + "step": 5605 + }, + { + "epoch": 0.57, + "grad_norm": 1.688704750751506, + "learning_rate": 8.220636587046464e-06, + "loss": 0.763, + "step": 5606 + }, + { + "epoch": 0.57, + "grad_norm": 1.3452689780904759, + "learning_rate": 8.217394801200632e-06, + "loss": 0.6435, + "step": 5607 + }, + { + "epoch": 0.57, + "grad_norm": 1.6312682950472936, + "learning_rate": 8.214153208828714e-06, + "loss": 0.7451, + "step": 5608 + }, + { + "epoch": 0.57, + "grad_norm": 1.5168308517665703, + "learning_rate": 8.210911810282542e-06, + "loss": 0.6365, + "step": 5609 + }, + { + "epoch": 0.57, + "grad_norm": 1.6131248031193193, + "learning_rate": 8.20767060591391e-06, + "loss": 0.6881, + "step": 5610 + }, + { + "epoch": 0.57, + "grad_norm": 1.5979577897434138, + "learning_rate": 8.204429596074605e-06, + "loss": 0.7088, + "step": 5611 + }, + { + "epoch": 0.57, + "grad_norm": 1.4937526190944859, + "learning_rate": 8.201188781116392e-06, + "loss": 0.6796, + "step": 5612 + }, + { + "epoch": 0.57, + "grad_norm": 1.45679874555144, + "learning_rate": 8.197948161391003e-06, + "loss": 0.7388, + "step": 5613 + }, + { + "epoch": 0.57, + "grad_norm": 1.5562185392172332, + "learning_rate": 8.194707737250158e-06, + "loss": 0.7071, + "step": 5614 + }, + { + "epoch": 0.57, + "grad_norm": 1.7053520652444238, + "learning_rate": 8.191467509045564e-06, + "loss": 0.7217, + "step": 5615 + }, + { + "epoch": 0.57, + "grad_norm": 1.5433041988831695, + "learning_rate": 8.188227477128883e-06, + "loss": 0.7122, + "step": 5616 + }, + { + "epoch": 0.57, + "grad_norm": 1.5797508292402456, + "learning_rate": 8.18498764185178e-06, + "loss": 0.6357, + "step": 5617 + }, + { + "epoch": 0.57, + "grad_norm": 1.47683890933061, + "learning_rate": 8.181748003565883e-06, + "loss": 0.6484, + "step": 5618 + }, + { + "epoch": 0.57, + "grad_norm": 1.5348046552396966, + "learning_rate": 8.178508562622804e-06, + "loss": 0.7089, + "step": 5619 + }, + { + "epoch": 0.57, + "grad_norm": 1.414162890397765, + "learning_rate": 8.175269319374142e-06, + "loss": 0.7159, + "step": 5620 + }, + { + "epoch": 0.57, + "grad_norm": 1.7791038023349213, + "learning_rate": 8.172030274171452e-06, + "loss": 0.7431, + "step": 5621 + }, + { + "epoch": 0.57, + "grad_norm": 1.4784290773633204, + "learning_rate": 8.168791427366297e-06, + "loss": 0.7103, + "step": 5622 + }, + { + "epoch": 0.57, + "grad_norm": 1.635880216665212, + "learning_rate": 8.16555277931019e-06, + "loss": 0.7444, + "step": 5623 + }, + { + "epoch": 0.57, + "grad_norm": 1.7363639324473743, + "learning_rate": 8.162314330354642e-06, + "loss": 0.733, + "step": 5624 + }, + { + "epoch": 0.57, + "grad_norm": 1.7253204538491358, + "learning_rate": 8.159076080851139e-06, + "loss": 0.7552, + "step": 5625 + }, + { + "epoch": 0.57, + "grad_norm": 1.5681074189686877, + "learning_rate": 8.155838031151134e-06, + "loss": 0.7665, + "step": 5626 + }, + { + "epoch": 0.57, + "grad_norm": 1.5958043823114054, + "learning_rate": 8.152600181606073e-06, + "loss": 0.6868, + "step": 5627 + }, + { + "epoch": 0.57, + "grad_norm": 1.5497556316778451, + "learning_rate": 8.149362532567374e-06, + "loss": 0.6495, + "step": 5628 + }, + { + "epoch": 0.57, + "grad_norm": 1.5626805277983316, + "learning_rate": 8.146125084386428e-06, + "loss": 0.6569, + "step": 5629 + }, + { + "epoch": 0.57, + "grad_norm": 1.6780030139091988, + "learning_rate": 8.142887837414614e-06, + "loss": 0.8537, + "step": 5630 + }, + { + "epoch": 0.57, + "grad_norm": 1.5189408676389409, + "learning_rate": 8.139650792003286e-06, + "loss": 0.8312, + "step": 5631 + }, + { + "epoch": 0.57, + "grad_norm": 1.4776069199360429, + "learning_rate": 8.136413948503767e-06, + "loss": 0.6339, + "step": 5632 + }, + { + "epoch": 0.57, + "grad_norm": 1.5138991319169117, + "learning_rate": 8.133177307267376e-06, + "loss": 0.7134, + "step": 5633 + }, + { + "epoch": 0.57, + "grad_norm": 1.5731729926117684, + "learning_rate": 8.12994086864539e-06, + "loss": 0.7459, + "step": 5634 + }, + { + "epoch": 0.57, + "grad_norm": 1.5803257301451343, + "learning_rate": 8.126704632989082e-06, + "loss": 0.7218, + "step": 5635 + }, + { + "epoch": 0.57, + "grad_norm": 1.6209290479538236, + "learning_rate": 8.123468600649686e-06, + "loss": 0.7538, + "step": 5636 + }, + { + "epoch": 0.57, + "grad_norm": 1.4808690714988408, + "learning_rate": 8.120232771978432e-06, + "loss": 0.689, + "step": 5637 + }, + { + "epoch": 0.57, + "grad_norm": 1.676758702387847, + "learning_rate": 8.116997147326515e-06, + "loss": 0.7705, + "step": 5638 + }, + { + "epoch": 0.57, + "grad_norm": 1.5484496099134843, + "learning_rate": 8.113761727045106e-06, + "loss": 0.737, + "step": 5639 + }, + { + "epoch": 0.57, + "grad_norm": 1.7164143999551997, + "learning_rate": 8.110526511485371e-06, + "loss": 0.7558, + "step": 5640 + }, + { + "epoch": 0.57, + "grad_norm": 1.5754148116572044, + "learning_rate": 8.10729150099843e-06, + "loss": 0.6998, + "step": 5641 + }, + { + "epoch": 0.57, + "grad_norm": 1.7140088046757154, + "learning_rate": 8.104056695935396e-06, + "loss": 0.7623, + "step": 5642 + }, + { + "epoch": 0.57, + "grad_norm": 1.7869442676574774, + "learning_rate": 8.100822096647365e-06, + "loss": 0.7454, + "step": 5643 + }, + { + "epoch": 0.57, + "grad_norm": 1.4764597743231278, + "learning_rate": 8.09758770348539e-06, + "loss": 0.6513, + "step": 5644 + }, + { + "epoch": 0.57, + "grad_norm": 1.6279896509723633, + "learning_rate": 8.094353516800524e-06, + "loss": 0.6936, + "step": 5645 + }, + { + "epoch": 0.57, + "grad_norm": 1.5207184246056769, + "learning_rate": 8.091119536943779e-06, + "loss": 0.7996, + "step": 5646 + }, + { + "epoch": 0.57, + "grad_norm": 2.0178330689034416, + "learning_rate": 8.087885764266156e-06, + "loss": 0.7302, + "step": 5647 + }, + { + "epoch": 0.57, + "grad_norm": 1.5985413437981397, + "learning_rate": 8.084652199118636e-06, + "loss": 0.7385, + "step": 5648 + }, + { + "epoch": 0.57, + "grad_norm": 1.6947296632777942, + "learning_rate": 8.081418841852163e-06, + "loss": 0.7599, + "step": 5649 + }, + { + "epoch": 0.57, + "grad_norm": 1.485467514936313, + "learning_rate": 8.07818569281767e-06, + "loss": 0.7569, + "step": 5650 + }, + { + "epoch": 0.57, + "grad_norm": 1.6350537395684603, + "learning_rate": 8.074952752366072e-06, + "loss": 0.6527, + "step": 5651 + }, + { + "epoch": 0.57, + "grad_norm": 1.4772057181632665, + "learning_rate": 8.071720020848245e-06, + "loss": 0.677, + "step": 5652 + }, + { + "epoch": 0.58, + "grad_norm": 1.5288422750275847, + "learning_rate": 8.068487498615057e-06, + "loss": 0.6706, + "step": 5653 + }, + { + "epoch": 0.58, + "grad_norm": 1.5773263458771583, + "learning_rate": 8.065255186017342e-06, + "loss": 0.7145, + "step": 5654 + }, + { + "epoch": 0.58, + "grad_norm": 1.4698615244133397, + "learning_rate": 8.06202308340592e-06, + "loss": 0.6992, + "step": 5655 + }, + { + "epoch": 0.58, + "grad_norm": 2.0458570117068953, + "learning_rate": 8.05879119113159e-06, + "loss": 0.7137, + "step": 5656 + }, + { + "epoch": 0.58, + "grad_norm": 1.5528151714159797, + "learning_rate": 8.055559509545115e-06, + "loss": 0.5657, + "step": 5657 + }, + { + "epoch": 0.58, + "grad_norm": 1.5618631597788561, + "learning_rate": 8.052328038997252e-06, + "loss": 0.7858, + "step": 5658 + }, + { + "epoch": 0.58, + "grad_norm": 1.6302595225217231, + "learning_rate": 8.04909677983872e-06, + "loss": 0.6427, + "step": 5659 + }, + { + "epoch": 0.58, + "grad_norm": 1.8190481089776078, + "learning_rate": 8.04586573242022e-06, + "loss": 0.803, + "step": 5660 + }, + { + "epoch": 0.58, + "grad_norm": 1.6958660962032366, + "learning_rate": 8.042634897092443e-06, + "loss": 0.7755, + "step": 5661 + }, + { + "epoch": 0.58, + "grad_norm": 1.742053888634955, + "learning_rate": 8.039404274206031e-06, + "loss": 0.6488, + "step": 5662 + }, + { + "epoch": 0.58, + "grad_norm": 1.5210478365225468, + "learning_rate": 8.03617386411163e-06, + "loss": 0.6826, + "step": 5663 + }, + { + "epoch": 0.58, + "grad_norm": 1.4114991514657393, + "learning_rate": 8.032943667159843e-06, + "loss": 0.6926, + "step": 5664 + }, + { + "epoch": 0.58, + "grad_norm": 1.4134598644524605, + "learning_rate": 8.029713683701259e-06, + "loss": 0.6353, + "step": 5665 + }, + { + "epoch": 0.58, + "grad_norm": 1.5362933596085961, + "learning_rate": 8.026483914086446e-06, + "loss": 0.7266, + "step": 5666 + }, + { + "epoch": 0.58, + "grad_norm": 1.4561356510584063, + "learning_rate": 8.023254358665938e-06, + "loss": 0.7336, + "step": 5667 + }, + { + "epoch": 0.58, + "grad_norm": 1.5192674346006763, + "learning_rate": 8.020025017790261e-06, + "loss": 0.7731, + "step": 5668 + }, + { + "epoch": 0.58, + "grad_norm": 1.5640800467790366, + "learning_rate": 8.0167958918099e-06, + "loss": 0.716, + "step": 5669 + }, + { + "epoch": 0.58, + "grad_norm": 1.456511074126383, + "learning_rate": 8.013566981075331e-06, + "loss": 0.6562, + "step": 5670 + }, + { + "epoch": 0.58, + "grad_norm": 1.6136566932546004, + "learning_rate": 8.010338285937006e-06, + "loss": 0.7981, + "step": 5671 + }, + { + "epoch": 0.58, + "grad_norm": 1.669060168683187, + "learning_rate": 8.007109806745341e-06, + "loss": 0.7027, + "step": 5672 + }, + { + "epoch": 0.58, + "grad_norm": 1.4881521535751443, + "learning_rate": 8.003881543850742e-06, + "loss": 0.7885, + "step": 5673 + }, + { + "epoch": 0.58, + "grad_norm": 1.5634498701023354, + "learning_rate": 8.00065349760359e-06, + "loss": 0.6158, + "step": 5674 + }, + { + "epoch": 0.58, + "grad_norm": 1.6285482943204677, + "learning_rate": 7.99742566835423e-06, + "loss": 0.7625, + "step": 5675 + }, + { + "epoch": 0.58, + "grad_norm": 1.5147804177704658, + "learning_rate": 7.994198056453001e-06, + "loss": 0.7938, + "step": 5676 + }, + { + "epoch": 0.58, + "grad_norm": 1.7054944824732357, + "learning_rate": 7.9909706622502e-06, + "loss": 0.8145, + "step": 5677 + }, + { + "epoch": 0.58, + "grad_norm": 1.4443496050342668, + "learning_rate": 7.987743486096119e-06, + "loss": 0.7224, + "step": 5678 + }, + { + "epoch": 0.58, + "grad_norm": 1.7038128500780676, + "learning_rate": 7.984516528341016e-06, + "loss": 0.6913, + "step": 5679 + }, + { + "epoch": 0.58, + "grad_norm": 1.582191849396049, + "learning_rate": 7.981289789335122e-06, + "loss": 0.7606, + "step": 5680 + }, + { + "epoch": 0.58, + "grad_norm": 1.7291647907979886, + "learning_rate": 7.978063269428656e-06, + "loss": 0.7354, + "step": 5681 + }, + { + "epoch": 0.58, + "grad_norm": 1.5886724144698394, + "learning_rate": 7.974836968971799e-06, + "loss": 0.7044, + "step": 5682 + }, + { + "epoch": 0.58, + "grad_norm": 1.5544304168529097, + "learning_rate": 7.97161088831472e-06, + "loss": 0.7389, + "step": 5683 + }, + { + "epoch": 0.58, + "grad_norm": 1.6719397258805486, + "learning_rate": 7.968385027807558e-06, + "loss": 0.7663, + "step": 5684 + }, + { + "epoch": 0.58, + "grad_norm": 1.4880037468847649, + "learning_rate": 7.965159387800434e-06, + "loss": 0.6278, + "step": 5685 + }, + { + "epoch": 0.58, + "grad_norm": 1.602445909793838, + "learning_rate": 7.961933968643435e-06, + "loss": 0.8443, + "step": 5686 + }, + { + "epoch": 0.58, + "grad_norm": 1.7395603370748483, + "learning_rate": 7.95870877068663e-06, + "loss": 0.7678, + "step": 5687 + }, + { + "epoch": 0.58, + "grad_norm": 1.8056856860517443, + "learning_rate": 7.955483794280068e-06, + "loss": 0.6395, + "step": 5688 + }, + { + "epoch": 0.58, + "grad_norm": 1.55847808067052, + "learning_rate": 7.952259039773767e-06, + "loss": 0.6431, + "step": 5689 + }, + { + "epoch": 0.58, + "grad_norm": 1.687208438007986, + "learning_rate": 7.949034507517721e-06, + "loss": 0.8264, + "step": 5690 + }, + { + "epoch": 0.58, + "grad_norm": 1.835421354444477, + "learning_rate": 7.94581019786191e-06, + "loss": 0.7163, + "step": 5691 + }, + { + "epoch": 0.58, + "grad_norm": 1.7973409383727332, + "learning_rate": 7.942586111156277e-06, + "loss": 0.7155, + "step": 5692 + }, + { + "epoch": 0.58, + "grad_norm": 1.7173157692995349, + "learning_rate": 7.939362247750745e-06, + "loss": 0.7497, + "step": 5693 + }, + { + "epoch": 0.58, + "grad_norm": 1.675781627264294, + "learning_rate": 7.93613860799522e-06, + "loss": 0.7794, + "step": 5694 + }, + { + "epoch": 0.58, + "grad_norm": 1.5541808483981965, + "learning_rate": 7.932915192239571e-06, + "loss": 0.6026, + "step": 5695 + }, + { + "epoch": 0.58, + "grad_norm": 1.5952182515805586, + "learning_rate": 7.929692000833653e-06, + "loss": 0.6893, + "step": 5696 + }, + { + "epoch": 0.58, + "grad_norm": 1.6961654663081456, + "learning_rate": 7.926469034127292e-06, + "loss": 0.7462, + "step": 5697 + }, + { + "epoch": 0.58, + "grad_norm": 1.732500632150684, + "learning_rate": 7.923246292470292e-06, + "loss": 0.7884, + "step": 5698 + }, + { + "epoch": 0.58, + "grad_norm": 1.6452456021606285, + "learning_rate": 7.920023776212433e-06, + "loss": 0.7367, + "step": 5699 + }, + { + "epoch": 0.58, + "grad_norm": 1.52846216501172, + "learning_rate": 7.91680148570346e-06, + "loss": 0.6121, + "step": 5700 + }, + { + "epoch": 0.58, + "grad_norm": 1.6674960297503236, + "learning_rate": 7.91357942129311e-06, + "loss": 0.7909, + "step": 5701 + }, + { + "epoch": 0.58, + "grad_norm": 1.5036535214427527, + "learning_rate": 7.910357583331088e-06, + "loss": 0.7316, + "step": 5702 + }, + { + "epoch": 0.58, + "grad_norm": 1.652839330197833, + "learning_rate": 7.90713597216707e-06, + "loss": 0.7686, + "step": 5703 + }, + { + "epoch": 0.58, + "grad_norm": 1.4637652515969, + "learning_rate": 7.903914588150716e-06, + "loss": 0.739, + "step": 5704 + }, + { + "epoch": 0.58, + "grad_norm": 1.6957518798495284, + "learning_rate": 7.90069343163165e-06, + "loss": 0.7086, + "step": 5705 + }, + { + "epoch": 0.58, + "grad_norm": 1.6581052593969878, + "learning_rate": 7.897472502959484e-06, + "loss": 0.6988, + "step": 5706 + }, + { + "epoch": 0.58, + "grad_norm": 1.716293328997258, + "learning_rate": 7.894251802483803e-06, + "loss": 0.7448, + "step": 5707 + }, + { + "epoch": 0.58, + "grad_norm": 1.608428356318047, + "learning_rate": 7.891031330554151e-06, + "loss": 0.7065, + "step": 5708 + }, + { + "epoch": 0.58, + "grad_norm": 1.4342559198109504, + "learning_rate": 7.88781108752007e-06, + "loss": 0.7052, + "step": 5709 + }, + { + "epoch": 0.58, + "grad_norm": 1.8107927017895886, + "learning_rate": 7.884591073731068e-06, + "loss": 0.7711, + "step": 5710 + }, + { + "epoch": 0.58, + "grad_norm": 1.5861194643001888, + "learning_rate": 7.88137128953662e-06, + "loss": 0.7277, + "step": 5711 + }, + { + "epoch": 0.58, + "grad_norm": 1.609787288473546, + "learning_rate": 7.87815173528619e-06, + "loss": 0.6833, + "step": 5712 + }, + { + "epoch": 0.58, + "grad_norm": 1.648058051240888, + "learning_rate": 7.874932411329203e-06, + "loss": 0.6429, + "step": 5713 + }, + { + "epoch": 0.58, + "grad_norm": 1.7289025911897014, + "learning_rate": 7.871713318015071e-06, + "loss": 0.6381, + "step": 5714 + }, + { + "epoch": 0.58, + "grad_norm": 1.7201473743909554, + "learning_rate": 7.868494455693181e-06, + "loss": 0.7717, + "step": 5715 + }, + { + "epoch": 0.58, + "grad_norm": 1.5865041465364405, + "learning_rate": 7.86527582471288e-06, + "loss": 0.653, + "step": 5716 + }, + { + "epoch": 0.58, + "grad_norm": 1.577293936850371, + "learning_rate": 7.862057425423509e-06, + "loss": 0.753, + "step": 5717 + }, + { + "epoch": 0.58, + "grad_norm": 1.414651766000381, + "learning_rate": 7.858839258174365e-06, + "loss": 0.6993, + "step": 5718 + }, + { + "epoch": 0.58, + "grad_norm": 1.6155072297570676, + "learning_rate": 7.855621323314736e-06, + "loss": 0.7568, + "step": 5719 + }, + { + "epoch": 0.58, + "grad_norm": 1.575719982898642, + "learning_rate": 7.852403621193883e-06, + "loss": 0.7807, + "step": 5720 + }, + { + "epoch": 0.58, + "grad_norm": 1.6118080727936386, + "learning_rate": 7.849186152161028e-06, + "loss": 0.7623, + "step": 5721 + }, + { + "epoch": 0.58, + "grad_norm": 1.6776561690628125, + "learning_rate": 7.845968916565385e-06, + "loss": 0.6105, + "step": 5722 + }, + { + "epoch": 0.58, + "grad_norm": 1.6816273038027498, + "learning_rate": 7.842751914756128e-06, + "loss": 0.7343, + "step": 5723 + }, + { + "epoch": 0.58, + "grad_norm": 1.521521598263868, + "learning_rate": 7.839535147082414e-06, + "loss": 0.7389, + "step": 5724 + }, + { + "epoch": 0.58, + "grad_norm": 1.6662923031868875, + "learning_rate": 7.836318613893377e-06, + "loss": 0.7188, + "step": 5725 + }, + { + "epoch": 0.58, + "grad_norm": 1.5968447870884663, + "learning_rate": 7.833102315538116e-06, + "loss": 0.6968, + "step": 5726 + }, + { + "epoch": 0.58, + "grad_norm": 1.620655229813537, + "learning_rate": 7.829886252365712e-06, + "loss": 0.7694, + "step": 5727 + }, + { + "epoch": 0.58, + "grad_norm": 1.5244934459907622, + "learning_rate": 7.82667042472522e-06, + "loss": 0.658, + "step": 5728 + }, + { + "epoch": 0.58, + "grad_norm": 1.6190112998576944, + "learning_rate": 7.823454832965666e-06, + "loss": 0.706, + "step": 5729 + }, + { + "epoch": 0.58, + "grad_norm": 1.7098127138691492, + "learning_rate": 7.820239477436055e-06, + "loss": 0.7253, + "step": 5730 + }, + { + "epoch": 0.58, + "grad_norm": 1.6048506977406953, + "learning_rate": 7.817024358485357e-06, + "loss": 0.7531, + "step": 5731 + }, + { + "epoch": 0.58, + "grad_norm": 1.5235882405052927, + "learning_rate": 7.813809476462525e-06, + "loss": 0.6348, + "step": 5732 + }, + { + "epoch": 0.58, + "grad_norm": 1.4798292434357883, + "learning_rate": 7.810594831716494e-06, + "loss": 0.5956, + "step": 5733 + }, + { + "epoch": 0.58, + "grad_norm": 1.5262332112480077, + "learning_rate": 7.807380424596149e-06, + "loss": 0.7542, + "step": 5734 + }, + { + "epoch": 0.58, + "grad_norm": 1.442225125841356, + "learning_rate": 7.804166255450372e-06, + "loss": 0.7161, + "step": 5735 + }, + { + "epoch": 0.58, + "grad_norm": 1.641892390278661, + "learning_rate": 7.80095232462801e-06, + "loss": 0.851, + "step": 5736 + }, + { + "epoch": 0.58, + "grad_norm": 1.620114405413643, + "learning_rate": 7.797738632477881e-06, + "loss": 0.7402, + "step": 5737 + }, + { + "epoch": 0.58, + "grad_norm": 1.4693871991300977, + "learning_rate": 7.794525179348786e-06, + "loss": 0.6569, + "step": 5738 + }, + { + "epoch": 0.58, + "grad_norm": 1.5528409369558045, + "learning_rate": 7.791311965589494e-06, + "loss": 0.6966, + "step": 5739 + }, + { + "epoch": 0.58, + "grad_norm": 1.5940979057868363, + "learning_rate": 7.788098991548749e-06, + "loss": 0.7308, + "step": 5740 + }, + { + "epoch": 0.58, + "grad_norm": 1.5244751204342553, + "learning_rate": 7.784886257575264e-06, + "loss": 0.6187, + "step": 5741 + }, + { + "epoch": 0.58, + "grad_norm": 1.6675693980486168, + "learning_rate": 7.781673764017738e-06, + "loss": 0.6508, + "step": 5742 + }, + { + "epoch": 0.58, + "grad_norm": 1.624044453668377, + "learning_rate": 7.778461511224835e-06, + "loss": 0.6836, + "step": 5743 + }, + { + "epoch": 0.58, + "grad_norm": 1.8273438127319237, + "learning_rate": 7.775249499545189e-06, + "loss": 0.6414, + "step": 5744 + }, + { + "epoch": 0.58, + "grad_norm": 1.8859707954802163, + "learning_rate": 7.772037729327423e-06, + "loss": 0.7952, + "step": 5745 + }, + { + "epoch": 0.58, + "grad_norm": 1.6982793251008128, + "learning_rate": 7.76882620092012e-06, + "loss": 0.8613, + "step": 5746 + }, + { + "epoch": 0.58, + "grad_norm": 1.410673546463811, + "learning_rate": 7.76561491467184e-06, + "loss": 0.7405, + "step": 5747 + }, + { + "epoch": 0.58, + "grad_norm": 1.4415440808988367, + "learning_rate": 7.762403870931123e-06, + "loss": 0.6729, + "step": 5748 + }, + { + "epoch": 0.58, + "grad_norm": 1.5811451596234793, + "learning_rate": 7.75919307004647e-06, + "loss": 0.7217, + "step": 5749 + }, + { + "epoch": 0.58, + "grad_norm": 1.6540154907977642, + "learning_rate": 7.755982512366366e-06, + "loss": 0.6948, + "step": 5750 + }, + { + "epoch": 0.58, + "grad_norm": 1.6137901831730095, + "learning_rate": 7.75277219823927e-06, + "loss": 0.6641, + "step": 5751 + }, + { + "epoch": 0.59, + "grad_norm": 1.6472949529054923, + "learning_rate": 7.749562128013606e-06, + "loss": 0.6697, + "step": 5752 + }, + { + "epoch": 0.59, + "grad_norm": 1.6150286746139912, + "learning_rate": 7.746352302037786e-06, + "loss": 0.6422, + "step": 5753 + }, + { + "epoch": 0.59, + "grad_norm": 1.801711018578529, + "learning_rate": 7.743142720660175e-06, + "loss": 0.6853, + "step": 5754 + }, + { + "epoch": 0.59, + "grad_norm": 1.4361518027545168, + "learning_rate": 7.73993338422913e-06, + "loss": 0.6891, + "step": 5755 + }, + { + "epoch": 0.59, + "grad_norm": 1.7345840777511297, + "learning_rate": 7.736724293092975e-06, + "loss": 0.7067, + "step": 5756 + }, + { + "epoch": 0.59, + "grad_norm": 1.481089883684841, + "learning_rate": 7.733515447600001e-06, + "loss": 0.6426, + "step": 5757 + }, + { + "epoch": 0.59, + "grad_norm": 1.4978401850787875, + "learning_rate": 7.730306848098485e-06, + "loss": 0.7194, + "step": 5758 + }, + { + "epoch": 0.59, + "grad_norm": 1.5186636663429582, + "learning_rate": 7.727098494936663e-06, + "loss": 0.7443, + "step": 5759 + }, + { + "epoch": 0.59, + "grad_norm": 1.60008866516873, + "learning_rate": 7.723890388462755e-06, + "loss": 0.7472, + "step": 5760 + }, + { + "epoch": 0.59, + "grad_norm": 1.4684241101960502, + "learning_rate": 7.720682529024954e-06, + "loss": 0.6613, + "step": 5761 + }, + { + "epoch": 0.59, + "grad_norm": 1.3717717677756704, + "learning_rate": 7.717474916971417e-06, + "loss": 0.5526, + "step": 5762 + }, + { + "epoch": 0.59, + "grad_norm": 1.4948051573294627, + "learning_rate": 7.714267552650283e-06, + "loss": 0.6751, + "step": 5763 + }, + { + "epoch": 0.59, + "grad_norm": 1.6615283937090968, + "learning_rate": 7.711060436409666e-06, + "loss": 0.8354, + "step": 5764 + }, + { + "epoch": 0.59, + "grad_norm": 1.504730171070354, + "learning_rate": 7.707853568597638e-06, + "loss": 0.7561, + "step": 5765 + }, + { + "epoch": 0.59, + "grad_norm": 1.7639588172269918, + "learning_rate": 7.704646949562264e-06, + "loss": 0.7053, + "step": 5766 + }, + { + "epoch": 0.59, + "grad_norm": 1.6477621135946416, + "learning_rate": 7.701440579651566e-06, + "loss": 0.7342, + "step": 5767 + }, + { + "epoch": 0.59, + "grad_norm": 1.4672356032530092, + "learning_rate": 7.698234459213545e-06, + "loss": 0.6801, + "step": 5768 + }, + { + "epoch": 0.59, + "grad_norm": 1.6686897139751138, + "learning_rate": 7.695028588596185e-06, + "loss": 0.6892, + "step": 5769 + }, + { + "epoch": 0.59, + "grad_norm": 1.5214058904728014, + "learning_rate": 7.69182296814742e-06, + "loss": 0.6733, + "step": 5770 + }, + { + "epoch": 0.59, + "grad_norm": 1.7885394268906438, + "learning_rate": 7.688617598215182e-06, + "loss": 0.7831, + "step": 5771 + }, + { + "epoch": 0.59, + "grad_norm": 1.6048047448421283, + "learning_rate": 7.685412479147352e-06, + "loss": 0.6657, + "step": 5772 + }, + { + "epoch": 0.59, + "grad_norm": 1.5551971775290863, + "learning_rate": 7.6822076112918e-06, + "loss": 0.6063, + "step": 5773 + }, + { + "epoch": 0.59, + "grad_norm": 1.6143685528810567, + "learning_rate": 7.679002994996372e-06, + "loss": 0.7048, + "step": 5774 + }, + { + "epoch": 0.59, + "grad_norm": 1.657918462223705, + "learning_rate": 7.675798630608867e-06, + "loss": 0.771, + "step": 5775 + }, + { + "epoch": 0.59, + "grad_norm": 1.528457546476097, + "learning_rate": 7.672594518477078e-06, + "loss": 0.7309, + "step": 5776 + }, + { + "epoch": 0.59, + "grad_norm": 1.5279410934335003, + "learning_rate": 7.669390658948755e-06, + "loss": 0.7658, + "step": 5777 + }, + { + "epoch": 0.59, + "grad_norm": 1.6382782909336935, + "learning_rate": 7.66618705237163e-06, + "loss": 0.7689, + "step": 5778 + }, + { + "epoch": 0.59, + "grad_norm": 1.5531108260546986, + "learning_rate": 7.662983699093406e-06, + "loss": 0.7474, + "step": 5779 + }, + { + "epoch": 0.59, + "grad_norm": 1.5019858046942969, + "learning_rate": 7.65978059946175e-06, + "loss": 0.581, + "step": 5780 + }, + { + "epoch": 0.59, + "grad_norm": 1.6333470851583256, + "learning_rate": 7.656577753824314e-06, + "loss": 0.7437, + "step": 5781 + }, + { + "epoch": 0.59, + "grad_norm": 1.6317538425852873, + "learning_rate": 7.65337516252872e-06, + "loss": 0.6981, + "step": 5782 + }, + { + "epoch": 0.59, + "grad_norm": 1.65717576290963, + "learning_rate": 7.65017282592255e-06, + "loss": 0.6735, + "step": 5783 + }, + { + "epoch": 0.59, + "grad_norm": 1.6307719724360934, + "learning_rate": 7.64697074435338e-06, + "loss": 0.6793, + "step": 5784 + }, + { + "epoch": 0.59, + "grad_norm": 1.5764880566930657, + "learning_rate": 7.64376891816873e-06, + "loss": 0.707, + "step": 5785 + }, + { + "epoch": 0.59, + "grad_norm": 1.5477253630397978, + "learning_rate": 7.64056734771612e-06, + "loss": 0.6924, + "step": 5786 + }, + { + "epoch": 0.59, + "grad_norm": 1.5882482658079655, + "learning_rate": 7.637366033343028e-06, + "loss": 0.6662, + "step": 5787 + }, + { + "epoch": 0.59, + "grad_norm": 1.4886743464528789, + "learning_rate": 7.634164975396903e-06, + "loss": 0.6774, + "step": 5788 + }, + { + "epoch": 0.59, + "grad_norm": 1.5966823968369421, + "learning_rate": 7.630964174225175e-06, + "loss": 0.732, + "step": 5789 + }, + { + "epoch": 0.59, + "grad_norm": 1.47748735264666, + "learning_rate": 7.627763630175236e-06, + "loss": 0.7289, + "step": 5790 + }, + { + "epoch": 0.59, + "grad_norm": 1.5900714815590495, + "learning_rate": 7.624563343594457e-06, + "loss": 0.6183, + "step": 5791 + }, + { + "epoch": 0.59, + "grad_norm": 1.4729519261324666, + "learning_rate": 7.621363314830182e-06, + "loss": 0.6789, + "step": 5792 + }, + { + "epoch": 0.59, + "grad_norm": 1.5556688222110906, + "learning_rate": 7.618163544229722e-06, + "loss": 0.6508, + "step": 5793 + }, + { + "epoch": 0.59, + "grad_norm": 1.642197194926469, + "learning_rate": 7.614964032140359e-06, + "loss": 0.6123, + "step": 5794 + }, + { + "epoch": 0.59, + "grad_norm": 1.5312811606471664, + "learning_rate": 7.611764778909352e-06, + "loss": 0.7497, + "step": 5795 + }, + { + "epoch": 0.59, + "grad_norm": 1.612451680261775, + "learning_rate": 7.608565784883932e-06, + "loss": 0.7227, + "step": 5796 + }, + { + "epoch": 0.59, + "grad_norm": 1.619756588082997, + "learning_rate": 7.6053670504112995e-06, + "loss": 0.7505, + "step": 5797 + }, + { + "epoch": 0.59, + "grad_norm": 1.5693701631566446, + "learning_rate": 7.602168575838622e-06, + "loss": 0.7009, + "step": 5798 + }, + { + "epoch": 0.59, + "grad_norm": 1.856737086352245, + "learning_rate": 7.598970361513052e-06, + "loss": 0.7856, + "step": 5799 + }, + { + "epoch": 0.59, + "grad_norm": 1.5410044281344544, + "learning_rate": 7.5957724077816985e-06, + "loss": 0.7093, + "step": 5800 + }, + { + "epoch": 0.59, + "grad_norm": 1.7184048220446808, + "learning_rate": 7.5925747149916515e-06, + "loss": 0.6581, + "step": 5801 + }, + { + "epoch": 0.59, + "grad_norm": 1.6669867702953898, + "learning_rate": 7.589377283489976e-06, + "loss": 0.7804, + "step": 5802 + }, + { + "epoch": 0.59, + "grad_norm": 1.6790229508054892, + "learning_rate": 7.586180113623694e-06, + "loss": 0.769, + "step": 5803 + }, + { + "epoch": 0.59, + "grad_norm": 1.4762824768291218, + "learning_rate": 7.582983205739815e-06, + "loss": 0.6036, + "step": 5804 + }, + { + "epoch": 0.59, + "grad_norm": 1.3934053469588565, + "learning_rate": 7.579786560185311e-06, + "loss": 0.6886, + "step": 5805 + }, + { + "epoch": 0.59, + "grad_norm": 1.6255197560643635, + "learning_rate": 7.576590177307125e-06, + "loss": 0.6945, + "step": 5806 + }, + { + "epoch": 0.59, + "grad_norm": 1.6613780786906946, + "learning_rate": 7.573394057452181e-06, + "loss": 0.7167, + "step": 5807 + }, + { + "epoch": 0.59, + "grad_norm": 1.570481880582874, + "learning_rate": 7.570198200967363e-06, + "loss": 0.7472, + "step": 5808 + }, + { + "epoch": 0.59, + "grad_norm": 1.617755903243908, + "learning_rate": 7.5670026081995295e-06, + "loss": 0.7497, + "step": 5809 + }, + { + "epoch": 0.59, + "grad_norm": 1.7559979574300808, + "learning_rate": 7.563807279495521e-06, + "loss": 0.7635, + "step": 5810 + }, + { + "epoch": 0.59, + "grad_norm": 1.7041260538063296, + "learning_rate": 7.560612215202129e-06, + "loss": 0.7424, + "step": 5811 + }, + { + "epoch": 0.59, + "grad_norm": 1.5673151751986434, + "learning_rate": 7.557417415666138e-06, + "loss": 0.7054, + "step": 5812 + }, + { + "epoch": 0.59, + "grad_norm": 1.642617769709063, + "learning_rate": 7.554222881234284e-06, + "loss": 0.6868, + "step": 5813 + }, + { + "epoch": 0.59, + "grad_norm": 1.4400619885328076, + "learning_rate": 7.55102861225329e-06, + "loss": 0.7142, + "step": 5814 + }, + { + "epoch": 0.59, + "grad_norm": 1.52967714882465, + "learning_rate": 7.547834609069846e-06, + "loss": 0.7085, + "step": 5815 + }, + { + "epoch": 0.59, + "grad_norm": 1.4760264857506311, + "learning_rate": 7.544640872030604e-06, + "loss": 0.6837, + "step": 5816 + }, + { + "epoch": 0.59, + "grad_norm": 1.6786991582733812, + "learning_rate": 7.5414474014822e-06, + "loss": 0.8335, + "step": 5817 + }, + { + "epoch": 0.59, + "grad_norm": 1.8209395656572913, + "learning_rate": 7.538254197771231e-06, + "loss": 0.7985, + "step": 5818 + }, + { + "epoch": 0.59, + "grad_norm": 1.6453101136824253, + "learning_rate": 7.535061261244271e-06, + "loss": 0.7504, + "step": 5819 + }, + { + "epoch": 0.59, + "grad_norm": 1.5088877840540806, + "learning_rate": 7.5318685922478675e-06, + "loss": 0.6652, + "step": 5820 + }, + { + "epoch": 0.59, + "grad_norm": 1.3748999206010954, + "learning_rate": 7.528676191128528e-06, + "loss": 0.7178, + "step": 5821 + }, + { + "epoch": 0.59, + "grad_norm": 1.60091876485254, + "learning_rate": 7.525484058232739e-06, + "loss": 0.8099, + "step": 5822 + }, + { + "epoch": 0.59, + "grad_norm": 1.4644531630938904, + "learning_rate": 7.522292193906964e-06, + "loss": 0.6834, + "step": 5823 + }, + { + "epoch": 0.59, + "grad_norm": 1.487917947088439, + "learning_rate": 7.5191005984976196e-06, + "loss": 0.6411, + "step": 5824 + }, + { + "epoch": 0.59, + "grad_norm": 1.6010186255910743, + "learning_rate": 7.515909272351112e-06, + "loss": 0.7479, + "step": 5825 + }, + { + "epoch": 0.59, + "grad_norm": 1.6386631361679176, + "learning_rate": 7.512718215813802e-06, + "loss": 0.7278, + "step": 5826 + }, + { + "epoch": 0.59, + "grad_norm": 1.7203597276680547, + "learning_rate": 7.509527429232033e-06, + "loss": 0.7027, + "step": 5827 + }, + { + "epoch": 0.59, + "grad_norm": 1.7403021291942407, + "learning_rate": 7.506336912952121e-06, + "loss": 0.6961, + "step": 5828 + }, + { + "epoch": 0.59, + "grad_norm": 1.649231460060806, + "learning_rate": 7.503146667320334e-06, + "loss": 0.6828, + "step": 5829 + }, + { + "epoch": 0.59, + "grad_norm": 1.548994166079608, + "learning_rate": 7.499956692682935e-06, + "loss": 0.684, + "step": 5830 + }, + { + "epoch": 0.59, + "grad_norm": 1.5992775479851562, + "learning_rate": 7.4967669893861364e-06, + "loss": 0.6688, + "step": 5831 + }, + { + "epoch": 0.59, + "grad_norm": 1.5744237446744, + "learning_rate": 7.493577557776135e-06, + "loss": 0.7714, + "step": 5832 + }, + { + "epoch": 0.59, + "grad_norm": 1.5185226876303048, + "learning_rate": 7.490388398199098e-06, + "loss": 0.7452, + "step": 5833 + }, + { + "epoch": 0.59, + "grad_norm": 1.8450325476321954, + "learning_rate": 7.487199511001148e-06, + "loss": 0.7273, + "step": 5834 + }, + { + "epoch": 0.59, + "grad_norm": 1.6248177114263815, + "learning_rate": 7.4840108965284016e-06, + "loss": 0.7375, + "step": 5835 + }, + { + "epoch": 0.59, + "grad_norm": 1.7510103642438084, + "learning_rate": 7.480822555126923e-06, + "loss": 0.8054, + "step": 5836 + }, + { + "epoch": 0.59, + "grad_norm": 1.4450252781350619, + "learning_rate": 7.477634487142759e-06, + "loss": 0.7438, + "step": 5837 + }, + { + "epoch": 0.59, + "grad_norm": 1.5620357709252137, + "learning_rate": 7.474446692921931e-06, + "loss": 0.8239, + "step": 5838 + }, + { + "epoch": 0.59, + "grad_norm": 1.7434351076894188, + "learning_rate": 7.471259172810417e-06, + "loss": 0.7382, + "step": 5839 + }, + { + "epoch": 0.59, + "grad_norm": 1.7191352077450146, + "learning_rate": 7.468071927154173e-06, + "loss": 0.7162, + "step": 5840 + }, + { + "epoch": 0.59, + "grad_norm": 1.6492411606609065, + "learning_rate": 7.46488495629913e-06, + "loss": 0.7453, + "step": 5841 + }, + { + "epoch": 0.59, + "grad_norm": 1.4196509139536047, + "learning_rate": 7.461698260591175e-06, + "loss": 0.668, + "step": 5842 + }, + { + "epoch": 0.59, + "grad_norm": 1.5833384209273522, + "learning_rate": 7.458511840376184e-06, + "loss": 0.767, + "step": 5843 + }, + { + "epoch": 0.59, + "grad_norm": 1.6025918147510856, + "learning_rate": 7.455325695999986e-06, + "loss": 0.7642, + "step": 5844 + }, + { + "epoch": 0.59, + "grad_norm": 1.6468470813377398, + "learning_rate": 7.452139827808389e-06, + "loss": 0.6987, + "step": 5845 + }, + { + "epoch": 0.59, + "grad_norm": 1.8830976071957954, + "learning_rate": 7.44895423614717e-06, + "loss": 0.8303, + "step": 5846 + }, + { + "epoch": 0.59, + "grad_norm": 1.7994734743793335, + "learning_rate": 7.445768921362076e-06, + "loss": 0.7118, + "step": 5847 + }, + { + "epoch": 0.59, + "grad_norm": 1.7340069598124714, + "learning_rate": 7.442583883798822e-06, + "loss": 0.801, + "step": 5848 + }, + { + "epoch": 0.59, + "grad_norm": 1.7089345395030544, + "learning_rate": 7.439399123803091e-06, + "loss": 0.8203, + "step": 5849 + }, + { + "epoch": 0.6, + "grad_norm": 1.4880693573774777, + "learning_rate": 7.436214641720545e-06, + "loss": 0.6633, + "step": 5850 + }, + { + "epoch": 0.6, + "grad_norm": 1.559121371519373, + "learning_rate": 7.433030437896806e-06, + "loss": 0.6573, + "step": 5851 + }, + { + "epoch": 0.6, + "grad_norm": 1.5023829504747404, + "learning_rate": 7.429846512677468e-06, + "loss": 0.7174, + "step": 5852 + }, + { + "epoch": 0.6, + "grad_norm": 1.5674878085450052, + "learning_rate": 7.426662866408103e-06, + "loss": 0.6898, + "step": 5853 + }, + { + "epoch": 0.6, + "grad_norm": 1.619686042807621, + "learning_rate": 7.423479499434236e-06, + "loss": 0.656, + "step": 5854 + }, + { + "epoch": 0.6, + "grad_norm": 1.634263367846682, + "learning_rate": 7.42029641210138e-06, + "loss": 0.7641, + "step": 5855 + }, + { + "epoch": 0.6, + "grad_norm": 1.6343792631487037, + "learning_rate": 7.4171136047550065e-06, + "loss": 0.6877, + "step": 5856 + }, + { + "epoch": 0.6, + "grad_norm": 1.42843883095044, + "learning_rate": 7.413931077740557e-06, + "loss": 0.604, + "step": 5857 + }, + { + "epoch": 0.6, + "grad_norm": 1.4921527424437566, + "learning_rate": 7.410748831403449e-06, + "loss": 0.6833, + "step": 5858 + }, + { + "epoch": 0.6, + "grad_norm": 1.6232802400330084, + "learning_rate": 7.4075668660890646e-06, + "loss": 0.744, + "step": 5859 + }, + { + "epoch": 0.6, + "grad_norm": 1.7416011582072546, + "learning_rate": 7.404385182142753e-06, + "loss": 0.7026, + "step": 5860 + }, + { + "epoch": 0.6, + "grad_norm": 1.6063603146265764, + "learning_rate": 7.4012037799098445e-06, + "loss": 0.6517, + "step": 5861 + }, + { + "epoch": 0.6, + "grad_norm": 1.8099053783680814, + "learning_rate": 7.39802265973562e-06, + "loss": 0.7787, + "step": 5862 + }, + { + "epoch": 0.6, + "grad_norm": 1.6263527004572946, + "learning_rate": 7.394841821965345e-06, + "loss": 0.7453, + "step": 5863 + }, + { + "epoch": 0.6, + "grad_norm": 1.4791392099068637, + "learning_rate": 7.391661266944254e-06, + "loss": 0.7141, + "step": 5864 + }, + { + "epoch": 0.6, + "grad_norm": 1.5802351106753338, + "learning_rate": 7.38848099501754e-06, + "loss": 0.7015, + "step": 5865 + }, + { + "epoch": 0.6, + "grad_norm": 1.4871552338594307, + "learning_rate": 7.385301006530378e-06, + "loss": 0.7324, + "step": 5866 + }, + { + "epoch": 0.6, + "grad_norm": 1.6085992046388986, + "learning_rate": 7.382121301827898e-06, + "loss": 0.743, + "step": 5867 + }, + { + "epoch": 0.6, + "grad_norm": 1.7153826304474897, + "learning_rate": 7.378941881255211e-06, + "loss": 0.7159, + "step": 5868 + }, + { + "epoch": 0.6, + "grad_norm": 1.8728107832178462, + "learning_rate": 7.3757627451573995e-06, + "loss": 0.5738, + "step": 5869 + }, + { + "epoch": 0.6, + "grad_norm": 1.5159368666910624, + "learning_rate": 7.372583893879499e-06, + "loss": 0.6209, + "step": 5870 + }, + { + "epoch": 0.6, + "grad_norm": 1.449899387393714, + "learning_rate": 7.369405327766532e-06, + "loss": 0.6736, + "step": 5871 + }, + { + "epoch": 0.6, + "grad_norm": 1.6303463734462023, + "learning_rate": 7.366227047163476e-06, + "loss": 0.6618, + "step": 5872 + }, + { + "epoch": 0.6, + "grad_norm": 1.6863818917790458, + "learning_rate": 7.363049052415285e-06, + "loss": 0.836, + "step": 5873 + }, + { + "epoch": 0.6, + "grad_norm": 1.820253340943577, + "learning_rate": 7.359871343866887e-06, + "loss": 0.741, + "step": 5874 + }, + { + "epoch": 0.6, + "grad_norm": 1.5931785045286797, + "learning_rate": 7.356693921863163e-06, + "loss": 0.6547, + "step": 5875 + }, + { + "epoch": 0.6, + "grad_norm": 1.6440388959898296, + "learning_rate": 7.3535167867489775e-06, + "loss": 0.7592, + "step": 5876 + }, + { + "epoch": 0.6, + "grad_norm": 1.5939645437591048, + "learning_rate": 7.350339938869162e-06, + "loss": 0.6622, + "step": 5877 + }, + { + "epoch": 0.6, + "grad_norm": 1.5597381779921151, + "learning_rate": 7.347163378568507e-06, + "loss": 0.8154, + "step": 5878 + }, + { + "epoch": 0.6, + "grad_norm": 1.4589897039416655, + "learning_rate": 7.343987106191786e-06, + "loss": 0.5976, + "step": 5879 + }, + { + "epoch": 0.6, + "grad_norm": 1.5984733750682367, + "learning_rate": 7.340811122083723e-06, + "loss": 0.6821, + "step": 5880 + }, + { + "epoch": 0.6, + "grad_norm": 1.4764646748866384, + "learning_rate": 7.3376354265890295e-06, + "loss": 0.7543, + "step": 5881 + }, + { + "epoch": 0.6, + "grad_norm": 1.5223731010955242, + "learning_rate": 7.334460020052379e-06, + "loss": 0.6846, + "step": 5882 + }, + { + "epoch": 0.6, + "grad_norm": 1.4660607880193885, + "learning_rate": 7.331284902818405e-06, + "loss": 0.6711, + "step": 5883 + }, + { + "epoch": 0.6, + "grad_norm": 1.446061417819951, + "learning_rate": 7.328110075231725e-06, + "loss": 0.7085, + "step": 5884 + }, + { + "epoch": 0.6, + "grad_norm": 1.5524107193494978, + "learning_rate": 7.324935537636908e-06, + "loss": 0.697, + "step": 5885 + }, + { + "epoch": 0.6, + "grad_norm": 1.6555445285652144, + "learning_rate": 7.321761290378505e-06, + "loss": 0.7275, + "step": 5886 + }, + { + "epoch": 0.6, + "grad_norm": 1.4430592773775197, + "learning_rate": 7.318587333801036e-06, + "loss": 0.5604, + "step": 5887 + }, + { + "epoch": 0.6, + "grad_norm": 1.4908733198747264, + "learning_rate": 7.315413668248974e-06, + "loss": 0.7183, + "step": 5888 + }, + { + "epoch": 0.6, + "grad_norm": 1.5245766996522263, + "learning_rate": 7.312240294066782e-06, + "loss": 0.6907, + "step": 5889 + }, + { + "epoch": 0.6, + "grad_norm": 1.6246441521241848, + "learning_rate": 7.309067211598868e-06, + "loss": 0.7805, + "step": 5890 + }, + { + "epoch": 0.6, + "grad_norm": 1.529185478581184, + "learning_rate": 7.305894421189628e-06, + "loss": 0.7304, + "step": 5891 + }, + { + "epoch": 0.6, + "grad_norm": 1.4448886607686946, + "learning_rate": 7.302721923183421e-06, + "loss": 0.6054, + "step": 5892 + }, + { + "epoch": 0.6, + "grad_norm": 1.5751107153435748, + "learning_rate": 7.299549717924565e-06, + "loss": 0.6843, + "step": 5893 + }, + { + "epoch": 0.6, + "grad_norm": 1.718706567927009, + "learning_rate": 7.296377805757357e-06, + "loss": 0.6116, + "step": 5894 + }, + { + "epoch": 0.6, + "grad_norm": 1.5177722397037496, + "learning_rate": 7.29320618702606e-06, + "loss": 0.6166, + "step": 5895 + }, + { + "epoch": 0.6, + "grad_norm": 1.5444918903217328, + "learning_rate": 7.2900348620749016e-06, + "loss": 0.722, + "step": 5896 + }, + { + "epoch": 0.6, + "grad_norm": 1.537921306975767, + "learning_rate": 7.286863831248078e-06, + "loss": 0.7269, + "step": 5897 + }, + { + "epoch": 0.6, + "grad_norm": 1.7285406302564792, + "learning_rate": 7.28369309488976e-06, + "loss": 0.7455, + "step": 5898 + }, + { + "epoch": 0.6, + "grad_norm": 1.5245798267270128, + "learning_rate": 7.280522653344076e-06, + "loss": 0.7289, + "step": 5899 + }, + { + "epoch": 0.6, + "grad_norm": 1.4914728486125917, + "learning_rate": 7.27735250695513e-06, + "loss": 0.6049, + "step": 5900 + }, + { + "epoch": 0.6, + "grad_norm": 1.466868236257867, + "learning_rate": 7.274182656066992e-06, + "loss": 0.6538, + "step": 5901 + }, + { + "epoch": 0.6, + "grad_norm": 1.7183062480640914, + "learning_rate": 7.271013101023702e-06, + "loss": 0.7655, + "step": 5902 + }, + { + "epoch": 0.6, + "grad_norm": 1.8887092239674945, + "learning_rate": 7.26784384216926e-06, + "loss": 0.8537, + "step": 5903 + }, + { + "epoch": 0.6, + "grad_norm": 1.7400780102568585, + "learning_rate": 7.264674879847644e-06, + "loss": 0.7871, + "step": 5904 + }, + { + "epoch": 0.6, + "grad_norm": 1.7134581127166453, + "learning_rate": 7.261506214402796e-06, + "loss": 0.7119, + "step": 5905 + }, + { + "epoch": 0.6, + "grad_norm": 1.5867941278621458, + "learning_rate": 7.258337846178621e-06, + "loss": 0.6933, + "step": 5906 + }, + { + "epoch": 0.6, + "grad_norm": 1.5424375073206111, + "learning_rate": 7.255169775519e-06, + "loss": 0.8146, + "step": 5907 + }, + { + "epoch": 0.6, + "grad_norm": 1.3814753377772704, + "learning_rate": 7.2520020027677715e-06, + "loss": 0.6826, + "step": 5908 + }, + { + "epoch": 0.6, + "grad_norm": 1.6083435545153657, + "learning_rate": 7.248834528268756e-06, + "loss": 0.7188, + "step": 5909 + }, + { + "epoch": 0.6, + "grad_norm": 1.6591768526517174, + "learning_rate": 7.245667352365727e-06, + "loss": 0.7414, + "step": 5910 + }, + { + "epoch": 0.6, + "grad_norm": 1.4171103717364297, + "learning_rate": 7.242500475402433e-06, + "loss": 0.6912, + "step": 5911 + }, + { + "epoch": 0.6, + "grad_norm": 1.5330338232654563, + "learning_rate": 7.239333897722591e-06, + "loss": 0.7653, + "step": 5912 + }, + { + "epoch": 0.6, + "grad_norm": 1.5562892061873899, + "learning_rate": 7.2361676196698834e-06, + "loss": 0.6745, + "step": 5913 + }, + { + "epoch": 0.6, + "grad_norm": 1.6240138314562722, + "learning_rate": 7.233001641587958e-06, + "loss": 0.5879, + "step": 5914 + }, + { + "epoch": 0.6, + "grad_norm": 1.617474529996535, + "learning_rate": 7.229835963820435e-06, + "loss": 0.8026, + "step": 5915 + }, + { + "epoch": 0.6, + "grad_norm": 1.6839525778259303, + "learning_rate": 7.226670586710896e-06, + "loss": 0.7293, + "step": 5916 + }, + { + "epoch": 0.6, + "grad_norm": 1.5312147309286948, + "learning_rate": 7.223505510602893e-06, + "loss": 0.6628, + "step": 5917 + }, + { + "epoch": 0.6, + "grad_norm": 1.7960645410763412, + "learning_rate": 7.220340735839953e-06, + "loss": 0.6784, + "step": 5918 + }, + { + "epoch": 0.6, + "grad_norm": 1.700530965726789, + "learning_rate": 7.217176262765551e-06, + "loss": 0.746, + "step": 5919 + }, + { + "epoch": 0.6, + "grad_norm": 1.77136543867775, + "learning_rate": 7.2140120917231525e-06, + "loss": 0.728, + "step": 5920 + }, + { + "epoch": 0.6, + "grad_norm": 1.6255467952292564, + "learning_rate": 7.210848223056169e-06, + "loss": 0.6546, + "step": 5921 + }, + { + "epoch": 0.6, + "grad_norm": 1.5471405337399904, + "learning_rate": 7.207684657107994e-06, + "loss": 0.7178, + "step": 5922 + }, + { + "epoch": 0.6, + "grad_norm": 1.4928474424652634, + "learning_rate": 7.204521394221986e-06, + "loss": 0.7027, + "step": 5923 + }, + { + "epoch": 0.6, + "grad_norm": 1.5710364609704472, + "learning_rate": 7.201358434741461e-06, + "loss": 0.7739, + "step": 5924 + }, + { + "epoch": 0.6, + "grad_norm": 1.5515633009029757, + "learning_rate": 7.1981957790097155e-06, + "loss": 0.6327, + "step": 5925 + }, + { + "epoch": 0.6, + "grad_norm": 1.589661024787032, + "learning_rate": 7.195033427369998e-06, + "loss": 0.8063, + "step": 5926 + }, + { + "epoch": 0.6, + "grad_norm": 1.7548994446950992, + "learning_rate": 7.191871380165538e-06, + "loss": 0.7202, + "step": 5927 + }, + { + "epoch": 0.6, + "grad_norm": 1.5781662873618456, + "learning_rate": 7.1887096377395305e-06, + "loss": 0.6457, + "step": 5928 + }, + { + "epoch": 0.6, + "grad_norm": 1.6275642291493222, + "learning_rate": 7.185548200435123e-06, + "loss": 0.7623, + "step": 5929 + }, + { + "epoch": 0.6, + "grad_norm": 1.8226216363463947, + "learning_rate": 7.182387068595445e-06, + "loss": 0.6989, + "step": 5930 + }, + { + "epoch": 0.6, + "grad_norm": 1.6337489744342286, + "learning_rate": 7.179226242563593e-06, + "loss": 0.8565, + "step": 5931 + }, + { + "epoch": 0.6, + "grad_norm": 1.7040405735331745, + "learning_rate": 7.176065722682616e-06, + "loss": 0.6508, + "step": 5932 + }, + { + "epoch": 0.6, + "grad_norm": 1.7053161007144007, + "learning_rate": 7.172905509295547e-06, + "loss": 0.6041, + "step": 5933 + }, + { + "epoch": 0.6, + "grad_norm": 1.5160666240079106, + "learning_rate": 7.1697456027453705e-06, + "loss": 0.7378, + "step": 5934 + }, + { + "epoch": 0.6, + "grad_norm": 1.5134763013492174, + "learning_rate": 7.166586003375049e-06, + "loss": 0.6613, + "step": 5935 + }, + { + "epoch": 0.6, + "grad_norm": 1.5604089079300418, + "learning_rate": 7.16342671152751e-06, + "loss": 0.7043, + "step": 5936 + }, + { + "epoch": 0.6, + "grad_norm": 1.705865897955065, + "learning_rate": 7.16026772754564e-06, + "loss": 0.7221, + "step": 5937 + }, + { + "epoch": 0.6, + "grad_norm": 1.5183849133745817, + "learning_rate": 7.157109051772304e-06, + "loss": 0.6544, + "step": 5938 + }, + { + "epoch": 0.6, + "grad_norm": 1.6388424865238325, + "learning_rate": 7.153950684550317e-06, + "loss": 0.6588, + "step": 5939 + }, + { + "epoch": 0.6, + "grad_norm": 1.735175336027495, + "learning_rate": 7.150792626222476e-06, + "loss": 0.8374, + "step": 5940 + }, + { + "epoch": 0.6, + "grad_norm": 1.4780545701966636, + "learning_rate": 7.147634877131544e-06, + "loss": 0.7295, + "step": 5941 + }, + { + "epoch": 0.6, + "grad_norm": 1.635128129316677, + "learning_rate": 7.144477437620235e-06, + "loss": 0.7532, + "step": 5942 + }, + { + "epoch": 0.6, + "grad_norm": 1.531452447180397, + "learning_rate": 7.14132030803125e-06, + "loss": 0.7193, + "step": 5943 + }, + { + "epoch": 0.6, + "grad_norm": 1.6133299287122889, + "learning_rate": 7.138163488707235e-06, + "loss": 0.7288, + "step": 5944 + }, + { + "epoch": 0.6, + "grad_norm": 1.615628410179108, + "learning_rate": 7.1350069799908205e-06, + "loss": 0.6419, + "step": 5945 + }, + { + "epoch": 0.6, + "grad_norm": 1.4970890926910432, + "learning_rate": 7.131850782224598e-06, + "loss": 0.7031, + "step": 5946 + }, + { + "epoch": 0.6, + "grad_norm": 1.479217347173918, + "learning_rate": 7.128694895751118e-06, + "loss": 0.709, + "step": 5947 + }, + { + "epoch": 0.61, + "grad_norm": 1.4703328917956562, + "learning_rate": 7.1255393209129034e-06, + "loss": 0.6481, + "step": 5948 + }, + { + "epoch": 0.61, + "grad_norm": 1.5584138647975212, + "learning_rate": 7.1223840580524485e-06, + "loss": 0.6695, + "step": 5949 + }, + { + "epoch": 0.61, + "grad_norm": 1.5948021840095914, + "learning_rate": 7.1192291075121985e-06, + "loss": 0.7755, + "step": 5950 + }, + { + "epoch": 0.61, + "grad_norm": 1.589248305768824, + "learning_rate": 7.116074469634582e-06, + "loss": 0.8458, + "step": 5951 + }, + { + "epoch": 0.61, + "grad_norm": 1.685088414455333, + "learning_rate": 7.112920144761981e-06, + "loss": 0.7741, + "step": 5952 + }, + { + "epoch": 0.61, + "grad_norm": 1.6519851958314915, + "learning_rate": 7.109766133236747e-06, + "loss": 0.7756, + "step": 5953 + }, + { + "epoch": 0.61, + "grad_norm": 1.603902906380497, + "learning_rate": 7.106612435401204e-06, + "loss": 0.7936, + "step": 5954 + }, + { + "epoch": 0.61, + "grad_norm": 1.5540901955171227, + "learning_rate": 7.103459051597634e-06, + "loss": 0.6245, + "step": 5955 + }, + { + "epoch": 0.61, + "grad_norm": 1.4938465866104673, + "learning_rate": 7.100305982168287e-06, + "loss": 0.7651, + "step": 5956 + }, + { + "epoch": 0.61, + "grad_norm": 1.4482093578275226, + "learning_rate": 7.097153227455379e-06, + "loss": 0.7077, + "step": 5957 + }, + { + "epoch": 0.61, + "grad_norm": 1.458455597539955, + "learning_rate": 7.09400078780109e-06, + "loss": 0.6886, + "step": 5958 + }, + { + "epoch": 0.61, + "grad_norm": 1.6426093964650683, + "learning_rate": 7.090848663547574e-06, + "loss": 0.7427, + "step": 5959 + }, + { + "epoch": 0.61, + "grad_norm": 1.5489288651190687, + "learning_rate": 7.087696855036941e-06, + "loss": 0.7297, + "step": 5960 + }, + { + "epoch": 0.61, + "grad_norm": 1.4600247920573843, + "learning_rate": 7.084545362611271e-06, + "loss": 0.6347, + "step": 5961 + }, + { + "epoch": 0.61, + "grad_norm": 1.45701917668946, + "learning_rate": 7.081394186612607e-06, + "loss": 0.7495, + "step": 5962 + }, + { + "epoch": 0.61, + "grad_norm": 1.7218090265097128, + "learning_rate": 7.078243327382965e-06, + "loss": 0.7223, + "step": 5963 + }, + { + "epoch": 0.61, + "grad_norm": 1.667199490568043, + "learning_rate": 7.075092785264318e-06, + "loss": 0.8159, + "step": 5964 + }, + { + "epoch": 0.61, + "grad_norm": 1.6435345957610008, + "learning_rate": 7.071942560598607e-06, + "loss": 0.6628, + "step": 5965 + }, + { + "epoch": 0.61, + "grad_norm": 1.512973675003832, + "learning_rate": 7.068792653727745e-06, + "loss": 0.6767, + "step": 5966 + }, + { + "epoch": 0.61, + "grad_norm": 1.715175169843844, + "learning_rate": 7.065643064993598e-06, + "loss": 0.7698, + "step": 5967 + }, + { + "epoch": 0.61, + "grad_norm": 1.695624356046342, + "learning_rate": 7.062493794738008e-06, + "loss": 0.7545, + "step": 5968 + }, + { + "epoch": 0.61, + "grad_norm": 1.6865151048997264, + "learning_rate": 7.059344843302783e-06, + "loss": 0.7827, + "step": 5969 + }, + { + "epoch": 0.61, + "grad_norm": 1.7368175081011257, + "learning_rate": 7.0561962110296845e-06, + "loss": 0.7576, + "step": 5970 + }, + { + "epoch": 0.61, + "grad_norm": 1.615677836368483, + "learning_rate": 7.0530478982604524e-06, + "loss": 0.657, + "step": 5971 + }, + { + "epoch": 0.61, + "grad_norm": 1.693141974058053, + "learning_rate": 7.049899905336788e-06, + "loss": 0.7445, + "step": 5972 + }, + { + "epoch": 0.61, + "grad_norm": 1.5262126051228404, + "learning_rate": 7.046752232600351e-06, + "loss": 0.6724, + "step": 5973 + }, + { + "epoch": 0.61, + "grad_norm": 1.7065540618543513, + "learning_rate": 7.043604880392781e-06, + "loss": 0.7284, + "step": 5974 + }, + { + "epoch": 0.61, + "grad_norm": 1.5338232721417555, + "learning_rate": 7.0404578490556616e-06, + "loss": 0.6668, + "step": 5975 + }, + { + "epoch": 0.61, + "grad_norm": 1.5935989717722543, + "learning_rate": 7.037311138930563e-06, + "loss": 0.6522, + "step": 5976 + }, + { + "epoch": 0.61, + "grad_norm": 1.6575930100117833, + "learning_rate": 7.034164750359011e-06, + "loss": 0.627, + "step": 5977 + }, + { + "epoch": 0.61, + "grad_norm": 1.7356854343165955, + "learning_rate": 7.031018683682494e-06, + "loss": 0.7598, + "step": 5978 + }, + { + "epoch": 0.61, + "grad_norm": 1.6779200410679829, + "learning_rate": 7.02787293924247e-06, + "loss": 0.7907, + "step": 5979 + }, + { + "epoch": 0.61, + "grad_norm": 1.501651087009643, + "learning_rate": 7.024727517380356e-06, + "loss": 0.6912, + "step": 5980 + }, + { + "epoch": 0.61, + "grad_norm": 1.5649323303119, + "learning_rate": 7.021582418437543e-06, + "loss": 0.6899, + "step": 5981 + }, + { + "epoch": 0.61, + "grad_norm": 1.6378941911076537, + "learning_rate": 7.018437642755384e-06, + "loss": 0.6915, + "step": 5982 + }, + { + "epoch": 0.61, + "grad_norm": 1.5636475281530569, + "learning_rate": 7.015293190675191e-06, + "loss": 0.6637, + "step": 5983 + }, + { + "epoch": 0.61, + "grad_norm": 1.561748989419727, + "learning_rate": 7.012149062538249e-06, + "loss": 0.7508, + "step": 5984 + }, + { + "epoch": 0.61, + "grad_norm": 1.75746553351738, + "learning_rate": 7.009005258685797e-06, + "loss": 0.6662, + "step": 5985 + }, + { + "epoch": 0.61, + "grad_norm": 1.7962183540031025, + "learning_rate": 7.005861779459051e-06, + "loss": 0.674, + "step": 5986 + }, + { + "epoch": 0.61, + "grad_norm": 1.5312431132527884, + "learning_rate": 7.00271862519919e-06, + "loss": 0.6533, + "step": 5987 + }, + { + "epoch": 0.61, + "grad_norm": 1.8068991522290494, + "learning_rate": 6.999575796247344e-06, + "loss": 0.7689, + "step": 5988 + }, + { + "epoch": 0.61, + "grad_norm": 1.4364228321091383, + "learning_rate": 6.996433292944627e-06, + "loss": 0.759, + "step": 5989 + }, + { + "epoch": 0.61, + "grad_norm": 1.5059840142036114, + "learning_rate": 6.993291115632108e-06, + "loss": 0.7712, + "step": 5990 + }, + { + "epoch": 0.61, + "grad_norm": 1.879677897781722, + "learning_rate": 6.990149264650814e-06, + "loss": 0.735, + "step": 5991 + }, + { + "epoch": 0.61, + "grad_norm": 1.5384761249987013, + "learning_rate": 6.987007740341754e-06, + "loss": 0.6578, + "step": 5992 + }, + { + "epoch": 0.61, + "grad_norm": 1.442254383864038, + "learning_rate": 6.983866543045881e-06, + "loss": 0.6452, + "step": 5993 + }, + { + "epoch": 0.61, + "grad_norm": 1.5261177107975168, + "learning_rate": 6.980725673104128e-06, + "loss": 0.708, + "step": 5994 + }, + { + "epoch": 0.61, + "grad_norm": 1.5038665795578166, + "learning_rate": 6.97758513085739e-06, + "loss": 0.7081, + "step": 5995 + }, + { + "epoch": 0.61, + "grad_norm": 1.846541150792894, + "learning_rate": 6.974444916646517e-06, + "loss": 0.7867, + "step": 5996 + }, + { + "epoch": 0.61, + "grad_norm": 1.690342301741687, + "learning_rate": 6.971305030812339e-06, + "loss": 0.7515, + "step": 5997 + }, + { + "epoch": 0.61, + "grad_norm": 1.5635055628794958, + "learning_rate": 6.968165473695632e-06, + "loss": 0.6293, + "step": 5998 + }, + { + "epoch": 0.61, + "grad_norm": 1.4455948190504697, + "learning_rate": 6.96502624563715e-06, + "loss": 0.6306, + "step": 5999 + }, + { + "epoch": 0.61, + "grad_norm": 1.585497684641971, + "learning_rate": 6.961887346977611e-06, + "loss": 0.6823, + "step": 6000 + }, + { + "epoch": 0.61, + "grad_norm": 1.4864011580113403, + "learning_rate": 6.958748778057687e-06, + "loss": 0.7411, + "step": 6001 + }, + { + "epoch": 0.61, + "grad_norm": 1.7449596125569689, + "learning_rate": 6.955610539218023e-06, + "loss": 0.6064, + "step": 6002 + }, + { + "epoch": 0.61, + "grad_norm": 1.6330422724486302, + "learning_rate": 6.952472630799227e-06, + "loss": 0.7924, + "step": 6003 + }, + { + "epoch": 0.61, + "grad_norm": 1.561676435086924, + "learning_rate": 6.949335053141868e-06, + "loss": 0.6949, + "step": 6004 + }, + { + "epoch": 0.61, + "grad_norm": 1.5681058178598866, + "learning_rate": 6.9461978065864835e-06, + "loss": 0.8036, + "step": 6005 + }, + { + "epoch": 0.61, + "grad_norm": 1.5518107210158107, + "learning_rate": 6.943060891473572e-06, + "loss": 0.7636, + "step": 6006 + }, + { + "epoch": 0.61, + "grad_norm": 1.7040411881612014, + "learning_rate": 6.939924308143591e-06, + "loss": 0.7985, + "step": 6007 + }, + { + "epoch": 0.61, + "grad_norm": 1.673280397750217, + "learning_rate": 6.936788056936976e-06, + "loss": 0.7416, + "step": 6008 + }, + { + "epoch": 0.61, + "grad_norm": 1.674118049602933, + "learning_rate": 6.933652138194114e-06, + "loss": 0.6395, + "step": 6009 + }, + { + "epoch": 0.61, + "grad_norm": 1.5361770186226622, + "learning_rate": 6.93051655225536e-06, + "loss": 0.6385, + "step": 6010 + }, + { + "epoch": 0.61, + "grad_norm": 1.5032438867630116, + "learning_rate": 6.9273812994610315e-06, + "loss": 0.6813, + "step": 6011 + }, + { + "epoch": 0.61, + "grad_norm": 1.7101399111295492, + "learning_rate": 6.924246380151411e-06, + "loss": 0.8143, + "step": 6012 + }, + { + "epoch": 0.61, + "grad_norm": 1.5441825955179973, + "learning_rate": 6.92111179466675e-06, + "loss": 0.6683, + "step": 6013 + }, + { + "epoch": 0.61, + "grad_norm": 1.7242217401584956, + "learning_rate": 6.917977543347254e-06, + "loss": 0.7674, + "step": 6014 + }, + { + "epoch": 0.61, + "grad_norm": 1.7117024400753804, + "learning_rate": 6.914843626533099e-06, + "loss": 0.7421, + "step": 6015 + }, + { + "epoch": 0.61, + "grad_norm": 1.644187079438976, + "learning_rate": 6.911710044564419e-06, + "loss": 0.769, + "step": 6016 + }, + { + "epoch": 0.61, + "grad_norm": 1.4185821366042153, + "learning_rate": 6.908576797781321e-06, + "loss": 0.7162, + "step": 6017 + }, + { + "epoch": 0.61, + "grad_norm": 1.5664785236770578, + "learning_rate": 6.905443886523868e-06, + "loss": 0.6154, + "step": 6018 + }, + { + "epoch": 0.61, + "grad_norm": 1.472890070195125, + "learning_rate": 6.902311311132084e-06, + "loss": 0.6228, + "step": 6019 + }, + { + "epoch": 0.61, + "grad_norm": 1.6096312989951134, + "learning_rate": 6.89917907194597e-06, + "loss": 0.6971, + "step": 6020 + }, + { + "epoch": 0.61, + "grad_norm": 1.7037258038186427, + "learning_rate": 6.896047169305471e-06, + "loss": 0.661, + "step": 6021 + }, + { + "epoch": 0.61, + "grad_norm": 1.5649396853725739, + "learning_rate": 6.892915603550512e-06, + "loss": 0.634, + "step": 6022 + }, + { + "epoch": 0.61, + "grad_norm": 1.6722121537895382, + "learning_rate": 6.8897843750209796e-06, + "loss": 0.7228, + "step": 6023 + }, + { + "epoch": 0.61, + "grad_norm": 1.5956272085005472, + "learning_rate": 6.8866534840567104e-06, + "loss": 0.7118, + "step": 6024 + }, + { + "epoch": 0.61, + "grad_norm": 1.599425073147109, + "learning_rate": 6.883522930997517e-06, + "loss": 0.7629, + "step": 6025 + }, + { + "epoch": 0.61, + "grad_norm": 1.6284369870991078, + "learning_rate": 6.88039271618318e-06, + "loss": 0.7095, + "step": 6026 + }, + { + "epoch": 0.61, + "grad_norm": 1.4076269843619167, + "learning_rate": 6.877262839953422e-06, + "loss": 0.5916, + "step": 6027 + }, + { + "epoch": 0.61, + "grad_norm": 1.5691541798606565, + "learning_rate": 6.874133302647953e-06, + "loss": 0.6988, + "step": 6028 + }, + { + "epoch": 0.61, + "grad_norm": 1.5347110368511394, + "learning_rate": 6.871004104606427e-06, + "loss": 0.6454, + "step": 6029 + }, + { + "epoch": 0.61, + "grad_norm": 1.4321167031271145, + "learning_rate": 6.867875246168474e-06, + "loss": 0.6535, + "step": 6030 + }, + { + "epoch": 0.61, + "grad_norm": 1.6411822049037168, + "learning_rate": 6.864746727673685e-06, + "loss": 0.6945, + "step": 6031 + }, + { + "epoch": 0.61, + "grad_norm": 1.633086470213813, + "learning_rate": 6.861618549461606e-06, + "loss": 0.624, + "step": 6032 + }, + { + "epoch": 0.61, + "grad_norm": 1.6858246877293717, + "learning_rate": 6.858490711871759e-06, + "loss": 0.686, + "step": 6033 + }, + { + "epoch": 0.61, + "grad_norm": 1.5239869152303707, + "learning_rate": 6.855363215243612e-06, + "loss": 0.8371, + "step": 6034 + }, + { + "epoch": 0.61, + "grad_norm": 1.445563473951057, + "learning_rate": 6.852236059916612e-06, + "loss": 0.6643, + "step": 6035 + }, + { + "epoch": 0.61, + "grad_norm": 1.6714240174258932, + "learning_rate": 6.849109246230167e-06, + "loss": 0.7424, + "step": 6036 + }, + { + "epoch": 0.61, + "grad_norm": 1.683585445655326, + "learning_rate": 6.845982774523634e-06, + "loss": 0.6813, + "step": 6037 + }, + { + "epoch": 0.61, + "grad_norm": 1.6850963323083905, + "learning_rate": 6.842856645136351e-06, + "loss": 0.744, + "step": 6038 + }, + { + "epoch": 0.61, + "grad_norm": 1.6178748682318473, + "learning_rate": 6.839730858407604e-06, + "loss": 0.6569, + "step": 6039 + }, + { + "epoch": 0.61, + "grad_norm": 1.5032196567724914, + "learning_rate": 6.836605414676652e-06, + "loss": 0.6416, + "step": 6040 + }, + { + "epoch": 0.61, + "grad_norm": 1.5419909852187603, + "learning_rate": 6.8334803142827144e-06, + "loss": 0.7778, + "step": 6041 + }, + { + "epoch": 0.61, + "grad_norm": 1.6887785419890449, + "learning_rate": 6.830355557564966e-06, + "loss": 0.8648, + "step": 6042 + }, + { + "epoch": 0.61, + "grad_norm": 1.4451232947180794, + "learning_rate": 6.827231144862555e-06, + "loss": 0.7358, + "step": 6043 + }, + { + "epoch": 0.61, + "grad_norm": 1.660669903928941, + "learning_rate": 6.82410707651459e-06, + "loss": 0.7228, + "step": 6044 + }, + { + "epoch": 0.61, + "grad_norm": 1.6440758805537505, + "learning_rate": 6.820983352860133e-06, + "loss": 0.7577, + "step": 6045 + }, + { + "epoch": 0.61, + "grad_norm": 1.6257757580594896, + "learning_rate": 6.817859974238223e-06, + "loss": 0.6243, + "step": 6046 + }, + { + "epoch": 0.62, + "grad_norm": 1.6584220789207265, + "learning_rate": 6.814736940987845e-06, + "loss": 0.6615, + "step": 6047 + }, + { + "epoch": 0.62, + "grad_norm": 1.5600593702346273, + "learning_rate": 6.81161425344796e-06, + "loss": 0.6646, + "step": 6048 + }, + { + "epoch": 0.62, + "grad_norm": 1.7382755089642217, + "learning_rate": 6.808491911957492e-06, + "loss": 0.8191, + "step": 6049 + }, + { + "epoch": 0.62, + "grad_norm": 1.6370998175010671, + "learning_rate": 6.805369916855313e-06, + "loss": 0.8585, + "step": 6050 + }, + { + "epoch": 0.62, + "grad_norm": 1.5579772194685, + "learning_rate": 6.8022482684802745e-06, + "loss": 0.6841, + "step": 6051 + }, + { + "epoch": 0.62, + "grad_norm": 1.5781599509351145, + "learning_rate": 6.799126967171177e-06, + "loss": 0.7058, + "step": 6052 + }, + { + "epoch": 0.62, + "grad_norm": 1.6306166871228853, + "learning_rate": 6.7960060132667895e-06, + "loss": 0.7381, + "step": 6053 + }, + { + "epoch": 0.62, + "grad_norm": 1.8356173142033885, + "learning_rate": 6.792885407105848e-06, + "loss": 0.7518, + "step": 6054 + }, + { + "epoch": 0.62, + "grad_norm": 1.6522651945471996, + "learning_rate": 6.789765149027039e-06, + "loss": 0.6215, + "step": 6055 + }, + { + "epoch": 0.62, + "grad_norm": 1.53831203818576, + "learning_rate": 6.786645239369022e-06, + "loss": 0.7443, + "step": 6056 + }, + { + "epoch": 0.62, + "grad_norm": 1.9292567739728215, + "learning_rate": 6.783525678470413e-06, + "loss": 0.7387, + "step": 6057 + }, + { + "epoch": 0.62, + "grad_norm": 1.3837803608137365, + "learning_rate": 6.78040646666979e-06, + "loss": 0.7023, + "step": 6058 + }, + { + "epoch": 0.62, + "grad_norm": 1.6306535740518562, + "learning_rate": 6.777287604305698e-06, + "loss": 0.7212, + "step": 6059 + }, + { + "epoch": 0.62, + "grad_norm": 1.6551843866151499, + "learning_rate": 6.774169091716638e-06, + "loss": 0.796, + "step": 6060 + }, + { + "epoch": 0.62, + "grad_norm": 1.7677581368413209, + "learning_rate": 6.771050929241076e-06, + "loss": 0.8101, + "step": 6061 + }, + { + "epoch": 0.62, + "grad_norm": 1.5152515724530302, + "learning_rate": 6.767933117217444e-06, + "loss": 0.7826, + "step": 6062 + }, + { + "epoch": 0.62, + "grad_norm": 1.7830840121583593, + "learning_rate": 6.764815655984125e-06, + "loss": 0.7526, + "step": 6063 + }, + { + "epoch": 0.62, + "grad_norm": 1.7562929056380636, + "learning_rate": 6.761698545879476e-06, + "loss": 0.7104, + "step": 6064 + }, + { + "epoch": 0.62, + "grad_norm": 1.5567240302854402, + "learning_rate": 6.758581787241807e-06, + "loss": 0.5608, + "step": 6065 + }, + { + "epoch": 0.62, + "grad_norm": 1.723317681957902, + "learning_rate": 6.755465380409394e-06, + "loss": 0.688, + "step": 6066 + }, + { + "epoch": 0.62, + "grad_norm": 1.5952815129509346, + "learning_rate": 6.7523493257204776e-06, + "loss": 0.7464, + "step": 6067 + }, + { + "epoch": 0.62, + "grad_norm": 1.4090609686958033, + "learning_rate": 6.749233623513254e-06, + "loss": 0.7506, + "step": 6068 + }, + { + "epoch": 0.62, + "grad_norm": 1.5473101411992871, + "learning_rate": 6.746118274125883e-06, + "loss": 0.6807, + "step": 6069 + }, + { + "epoch": 0.62, + "grad_norm": 1.6216043388355958, + "learning_rate": 6.743003277896487e-06, + "loss": 0.6917, + "step": 6070 + }, + { + "epoch": 0.62, + "grad_norm": 1.916644081908343, + "learning_rate": 6.739888635163155e-06, + "loss": 0.7651, + "step": 6071 + }, + { + "epoch": 0.62, + "grad_norm": 1.5161974751902205, + "learning_rate": 6.7367743462639265e-06, + "loss": 0.6257, + "step": 6072 + }, + { + "epoch": 0.62, + "grad_norm": 1.300149205632678, + "learning_rate": 6.733660411536811e-06, + "loss": 0.6334, + "step": 6073 + }, + { + "epoch": 0.62, + "grad_norm": 1.553990093872253, + "learning_rate": 6.7305468313197815e-06, + "loss": 0.7501, + "step": 6074 + }, + { + "epoch": 0.62, + "grad_norm": 1.5132042880834147, + "learning_rate": 6.7274336059507614e-06, + "loss": 0.7175, + "step": 6075 + }, + { + "epoch": 0.62, + "grad_norm": 1.5129442394667727, + "learning_rate": 6.724320735767646e-06, + "loss": 0.708, + "step": 6076 + }, + { + "epoch": 0.62, + "grad_norm": 1.51569068130344, + "learning_rate": 6.721208221108293e-06, + "loss": 0.7555, + "step": 6077 + }, + { + "epoch": 0.62, + "grad_norm": 1.6666093513562708, + "learning_rate": 6.71809606231051e-06, + "loss": 0.7786, + "step": 6078 + }, + { + "epoch": 0.62, + "grad_norm": 1.3740089472688364, + "learning_rate": 6.714984259712074e-06, + "loss": 0.6608, + "step": 6079 + }, + { + "epoch": 0.62, + "grad_norm": 1.6533318917815236, + "learning_rate": 6.71187281365073e-06, + "loss": 0.7125, + "step": 6080 + }, + { + "epoch": 0.62, + "grad_norm": 1.5660729332736263, + "learning_rate": 6.708761724464168e-06, + "loss": 0.6867, + "step": 6081 + }, + { + "epoch": 0.62, + "grad_norm": 1.5812173362275896, + "learning_rate": 6.705650992490054e-06, + "loss": 0.6205, + "step": 6082 + }, + { + "epoch": 0.62, + "grad_norm": 1.4441682341203965, + "learning_rate": 6.7025406180660046e-06, + "loss": 0.6858, + "step": 6083 + }, + { + "epoch": 0.62, + "grad_norm": 1.4743755511325085, + "learning_rate": 6.699430601529604e-06, + "loss": 0.6628, + "step": 6084 + }, + { + "epoch": 0.62, + "grad_norm": 1.6450244119277522, + "learning_rate": 6.696320943218401e-06, + "loss": 0.6891, + "step": 6085 + }, + { + "epoch": 0.62, + "grad_norm": 1.5033342990472907, + "learning_rate": 6.69321164346989e-06, + "loss": 0.7987, + "step": 6086 + }, + { + "epoch": 0.62, + "grad_norm": 1.6742692228987246, + "learning_rate": 6.690102702621548e-06, + "loss": 0.7311, + "step": 6087 + }, + { + "epoch": 0.62, + "grad_norm": 1.6586419641837007, + "learning_rate": 6.686994121010794e-06, + "loss": 0.6917, + "step": 6088 + }, + { + "epoch": 0.62, + "grad_norm": 1.5909589091117324, + "learning_rate": 6.683885898975016e-06, + "loss": 0.6332, + "step": 6089 + }, + { + "epoch": 0.62, + "grad_norm": 1.450643132802545, + "learning_rate": 6.680778036851572e-06, + "loss": 0.6433, + "step": 6090 + }, + { + "epoch": 0.62, + "grad_norm": 1.551984481934261, + "learning_rate": 6.677670534977759e-06, + "loss": 0.7499, + "step": 6091 + }, + { + "epoch": 0.62, + "grad_norm": 1.5859172831013613, + "learning_rate": 6.674563393690858e-06, + "loss": 0.7056, + "step": 6092 + }, + { + "epoch": 0.62, + "grad_norm": 1.6366553804795465, + "learning_rate": 6.6714566133280944e-06, + "loss": 0.6751, + "step": 6093 + }, + { + "epoch": 0.62, + "grad_norm": 1.6655711827187523, + "learning_rate": 6.668350194226662e-06, + "loss": 0.7645, + "step": 6094 + }, + { + "epoch": 0.62, + "grad_norm": 1.5485819950508424, + "learning_rate": 6.665244136723719e-06, + "loss": 0.727, + "step": 6095 + }, + { + "epoch": 0.62, + "grad_norm": 1.4127253810983589, + "learning_rate": 6.662138441156371e-06, + "loss": 0.6536, + "step": 6096 + }, + { + "epoch": 0.62, + "grad_norm": 1.6329594626580382, + "learning_rate": 6.659033107861697e-06, + "loss": 0.7044, + "step": 6097 + }, + { + "epoch": 0.62, + "grad_norm": 1.7760481582902228, + "learning_rate": 6.655928137176735e-06, + "loss": 0.7399, + "step": 6098 + }, + { + "epoch": 0.62, + "grad_norm": 1.6032009667240732, + "learning_rate": 6.652823529438476e-06, + "loss": 0.6502, + "step": 6099 + }, + { + "epoch": 0.62, + "grad_norm": 1.4651867018769336, + "learning_rate": 6.649719284983882e-06, + "loss": 0.7245, + "step": 6100 + }, + { + "epoch": 0.62, + "grad_norm": 1.5920615165314944, + "learning_rate": 6.646615404149863e-06, + "loss": 0.7663, + "step": 6101 + }, + { + "epoch": 0.62, + "grad_norm": 1.6675682691281073, + "learning_rate": 6.6435118872733016e-06, + "loss": 0.7442, + "step": 6102 + }, + { + "epoch": 0.62, + "grad_norm": 1.6036547915026613, + "learning_rate": 6.640408734691039e-06, + "loss": 0.7039, + "step": 6103 + }, + { + "epoch": 0.62, + "grad_norm": 1.702638743603976, + "learning_rate": 6.637305946739865e-06, + "loss": 0.7123, + "step": 6104 + }, + { + "epoch": 0.62, + "grad_norm": 1.6000778248442582, + "learning_rate": 6.63420352375655e-06, + "loss": 0.6588, + "step": 6105 + }, + { + "epoch": 0.62, + "grad_norm": 1.5714725149545097, + "learning_rate": 6.631101466077801e-06, + "loss": 0.677, + "step": 6106 + }, + { + "epoch": 0.62, + "grad_norm": 1.6820828931724316, + "learning_rate": 6.627999774040305e-06, + "loss": 0.7083, + "step": 6107 + }, + { + "epoch": 0.62, + "grad_norm": 1.4645956316659832, + "learning_rate": 6.624898447980706e-06, + "loss": 0.6704, + "step": 6108 + }, + { + "epoch": 0.62, + "grad_norm": 1.6712984172262535, + "learning_rate": 6.6217974882355955e-06, + "loss": 0.6464, + "step": 6109 + }, + { + "epoch": 0.62, + "grad_norm": 1.570087609165966, + "learning_rate": 6.618696895141541e-06, + "loss": 0.7499, + "step": 6110 + }, + { + "epoch": 0.62, + "grad_norm": 1.7519649038605543, + "learning_rate": 6.615596669035059e-06, + "loss": 0.6598, + "step": 6111 + }, + { + "epoch": 0.62, + "grad_norm": 1.5637128365097759, + "learning_rate": 6.6124968102526325e-06, + "loss": 0.8167, + "step": 6112 + }, + { + "epoch": 0.62, + "grad_norm": 1.8123296526525825, + "learning_rate": 6.6093973191307055e-06, + "loss": 0.6705, + "step": 6113 + }, + { + "epoch": 0.62, + "grad_norm": 1.5613541223195324, + "learning_rate": 6.606298196005673e-06, + "loss": 0.7487, + "step": 6114 + }, + { + "epoch": 0.62, + "grad_norm": 1.7326002131841742, + "learning_rate": 6.603199441213901e-06, + "loss": 0.7714, + "step": 6115 + }, + { + "epoch": 0.62, + "grad_norm": 1.7424750174350074, + "learning_rate": 6.60010105509171e-06, + "loss": 0.7891, + "step": 6116 + }, + { + "epoch": 0.62, + "grad_norm": 1.5365332328220989, + "learning_rate": 6.597003037975379e-06, + "loss": 0.7173, + "step": 6117 + }, + { + "epoch": 0.62, + "grad_norm": 1.5147365728214297, + "learning_rate": 6.593905390201154e-06, + "loss": 0.6687, + "step": 6118 + }, + { + "epoch": 0.62, + "grad_norm": 1.6877996988645476, + "learning_rate": 6.590808112105232e-06, + "loss": 0.8397, + "step": 6119 + }, + { + "epoch": 0.62, + "grad_norm": 1.7092597152986675, + "learning_rate": 6.5877112040237746e-06, + "loss": 0.7818, + "step": 6120 + }, + { + "epoch": 0.62, + "grad_norm": 1.613334053324412, + "learning_rate": 6.584614666292906e-06, + "loss": 0.717, + "step": 6121 + }, + { + "epoch": 0.62, + "grad_norm": 1.5180147856705175, + "learning_rate": 6.581518499248705e-06, + "loss": 0.7353, + "step": 6122 + }, + { + "epoch": 0.62, + "grad_norm": 1.5256391495854664, + "learning_rate": 6.578422703227211e-06, + "loss": 0.7384, + "step": 6123 + }, + { + "epoch": 0.62, + "grad_norm": 1.6215104435427263, + "learning_rate": 6.5753272785644225e-06, + "loss": 0.5765, + "step": 6124 + }, + { + "epoch": 0.62, + "grad_norm": 1.4798270566748801, + "learning_rate": 6.572232225596306e-06, + "loss": 0.636, + "step": 6125 + }, + { + "epoch": 0.62, + "grad_norm": 1.6553762148182485, + "learning_rate": 6.569137544658775e-06, + "loss": 0.8014, + "step": 6126 + }, + { + "epoch": 0.62, + "grad_norm": 1.621778896682407, + "learning_rate": 6.56604323608771e-06, + "loss": 0.6951, + "step": 6127 + }, + { + "epoch": 0.62, + "grad_norm": 1.5478542582256125, + "learning_rate": 6.562949300218955e-06, + "loss": 0.6891, + "step": 6128 + }, + { + "epoch": 0.62, + "grad_norm": 1.751566216790655, + "learning_rate": 6.559855737388299e-06, + "loss": 0.7081, + "step": 6129 + }, + { + "epoch": 0.62, + "grad_norm": 1.6892966354547168, + "learning_rate": 6.556762547931505e-06, + "loss": 0.7547, + "step": 6130 + }, + { + "epoch": 0.62, + "grad_norm": 1.4676227029041156, + "learning_rate": 6.553669732184296e-06, + "loss": 0.6531, + "step": 6131 + }, + { + "epoch": 0.62, + "grad_norm": 1.5368957217679278, + "learning_rate": 6.550577290482336e-06, + "loss": 0.6174, + "step": 6132 + }, + { + "epoch": 0.62, + "grad_norm": 1.8966492736331908, + "learning_rate": 6.547485223161274e-06, + "loss": 0.8064, + "step": 6133 + }, + { + "epoch": 0.62, + "grad_norm": 1.6919571246987175, + "learning_rate": 6.544393530556697e-06, + "loss": 0.7939, + "step": 6134 + }, + { + "epoch": 0.62, + "grad_norm": 1.5433280990541385, + "learning_rate": 6.54130221300416e-06, + "loss": 0.6668, + "step": 6135 + }, + { + "epoch": 0.62, + "grad_norm": 1.4384780138474969, + "learning_rate": 6.538211270839185e-06, + "loss": 0.6493, + "step": 6136 + }, + { + "epoch": 0.62, + "grad_norm": 1.483118398389075, + "learning_rate": 6.535120704397236e-06, + "loss": 0.6313, + "step": 6137 + }, + { + "epoch": 0.62, + "grad_norm": 1.5126790622807804, + "learning_rate": 6.53203051401375e-06, + "loss": 0.6973, + "step": 6138 + }, + { + "epoch": 0.62, + "grad_norm": 1.7201848995408757, + "learning_rate": 6.528940700024122e-06, + "loss": 0.7339, + "step": 6139 + }, + { + "epoch": 0.62, + "grad_norm": 1.7302480571234962, + "learning_rate": 6.525851262763696e-06, + "loss": 0.666, + "step": 6140 + }, + { + "epoch": 0.62, + "grad_norm": 1.7036931968958287, + "learning_rate": 6.522762202567792e-06, + "loss": 0.7611, + "step": 6141 + }, + { + "epoch": 0.62, + "grad_norm": 1.609127038666171, + "learning_rate": 6.519673519771666e-06, + "loss": 0.6689, + "step": 6142 + }, + { + "epoch": 0.62, + "grad_norm": 1.4781757504167778, + "learning_rate": 6.516585214710554e-06, + "loss": 0.6643, + "step": 6143 + }, + { + "epoch": 0.62, + "grad_norm": 1.5196067810040579, + "learning_rate": 6.513497287719648e-06, + "loss": 0.5777, + "step": 6144 + }, + { + "epoch": 0.63, + "grad_norm": 1.5785347540072658, + "learning_rate": 6.510409739134082e-06, + "loss": 0.7492, + "step": 6145 + }, + { + "epoch": 0.63, + "grad_norm": 1.666016850977178, + "learning_rate": 6.507322569288974e-06, + "loss": 0.7188, + "step": 6146 + }, + { + "epoch": 0.63, + "grad_norm": 1.6450134972059112, + "learning_rate": 6.504235778519376e-06, + "loss": 0.6529, + "step": 6147 + }, + { + "epoch": 0.63, + "grad_norm": 1.502261606984428, + "learning_rate": 6.501149367160319e-06, + "loss": 0.6806, + "step": 6148 + }, + { + "epoch": 0.63, + "grad_norm": 1.5076422897426254, + "learning_rate": 6.4980633355467845e-06, + "loss": 0.7812, + "step": 6149 + }, + { + "epoch": 0.63, + "grad_norm": 1.4866092473823802, + "learning_rate": 6.494977684013708e-06, + "loss": 0.5794, + "step": 6150 + }, + { + "epoch": 0.63, + "grad_norm": 1.5008093258540893, + "learning_rate": 6.4918924128959945e-06, + "loss": 0.6753, + "step": 6151 + }, + { + "epoch": 0.63, + "grad_norm": 1.7265260554302626, + "learning_rate": 6.488807522528496e-06, + "loss": 0.7154, + "step": 6152 + }, + { + "epoch": 0.63, + "grad_norm": 1.5887570645077855, + "learning_rate": 6.48572301324603e-06, + "loss": 0.7191, + "step": 6153 + }, + { + "epoch": 0.63, + "grad_norm": 1.5810627427814312, + "learning_rate": 6.482638885383379e-06, + "loss": 0.6966, + "step": 6154 + }, + { + "epoch": 0.63, + "grad_norm": 1.6789536547200257, + "learning_rate": 6.479555139275267e-06, + "loss": 0.7727, + "step": 6155 + }, + { + "epoch": 0.63, + "grad_norm": 1.6242882408342163, + "learning_rate": 6.476471775256391e-06, + "loss": 0.6108, + "step": 6156 + }, + { + "epoch": 0.63, + "grad_norm": 1.6112020057308443, + "learning_rate": 6.473388793661406e-06, + "loss": 0.6523, + "step": 6157 + }, + { + "epoch": 0.63, + "grad_norm": 2.0051813216712024, + "learning_rate": 6.470306194824914e-06, + "loss": 0.7441, + "step": 6158 + }, + { + "epoch": 0.63, + "grad_norm": 1.6806981253531708, + "learning_rate": 6.467223979081491e-06, + "loss": 0.7096, + "step": 6159 + }, + { + "epoch": 0.63, + "grad_norm": 1.702782121668881, + "learning_rate": 6.464142146765653e-06, + "loss": 0.7192, + "step": 6160 + }, + { + "epoch": 0.63, + "grad_norm": 1.6169778774789403, + "learning_rate": 6.4610606982118914e-06, + "loss": 0.7052, + "step": 6161 + }, + { + "epoch": 0.63, + "grad_norm": 1.6807203061396876, + "learning_rate": 6.457979633754652e-06, + "loss": 0.7438, + "step": 6162 + }, + { + "epoch": 0.63, + "grad_norm": 1.6349504107469093, + "learning_rate": 6.454898953728328e-06, + "loss": 0.7091, + "step": 6163 + }, + { + "epoch": 0.63, + "grad_norm": 1.562430195567715, + "learning_rate": 6.4518186584672856e-06, + "loss": 0.7275, + "step": 6164 + }, + { + "epoch": 0.63, + "grad_norm": 1.6464753506977141, + "learning_rate": 6.448738748305841e-06, + "loss": 0.6985, + "step": 6165 + }, + { + "epoch": 0.63, + "grad_norm": 1.6695608842141376, + "learning_rate": 6.445659223578267e-06, + "loss": 0.6286, + "step": 6166 + }, + { + "epoch": 0.63, + "grad_norm": 1.5903742512985368, + "learning_rate": 6.442580084618806e-06, + "loss": 0.7277, + "step": 6167 + }, + { + "epoch": 0.63, + "grad_norm": 1.6137454812957188, + "learning_rate": 6.439501331761639e-06, + "loss": 0.6366, + "step": 6168 + }, + { + "epoch": 0.63, + "grad_norm": 1.4440989137214413, + "learning_rate": 6.436422965340925e-06, + "loss": 0.6553, + "step": 6169 + }, + { + "epoch": 0.63, + "grad_norm": 1.6574414923263707, + "learning_rate": 6.4333449856907705e-06, + "loss": 0.7672, + "step": 6170 + }, + { + "epoch": 0.63, + "grad_norm": 1.458801356848339, + "learning_rate": 6.43026739314524e-06, + "loss": 0.6427, + "step": 6171 + }, + { + "epoch": 0.63, + "grad_norm": 1.5653358507579374, + "learning_rate": 6.427190188038362e-06, + "loss": 0.7527, + "step": 6172 + }, + { + "epoch": 0.63, + "grad_norm": 1.650511862021473, + "learning_rate": 6.424113370704115e-06, + "loss": 0.7734, + "step": 6173 + }, + { + "epoch": 0.63, + "grad_norm": 1.5755765541554974, + "learning_rate": 6.421036941476439e-06, + "loss": 0.6524, + "step": 6174 + }, + { + "epoch": 0.63, + "grad_norm": 1.5420151974503422, + "learning_rate": 6.417960900689238e-06, + "loss": 0.6917, + "step": 6175 + }, + { + "epoch": 0.63, + "grad_norm": 1.7208568860923064, + "learning_rate": 6.414885248676361e-06, + "loss": 0.7065, + "step": 6176 + }, + { + "epoch": 0.63, + "grad_norm": 1.6013565217810959, + "learning_rate": 6.411809985771629e-06, + "loss": 0.73, + "step": 6177 + }, + { + "epoch": 0.63, + "grad_norm": 1.5697716167814815, + "learning_rate": 6.408735112308806e-06, + "loss": 0.6028, + "step": 6178 + }, + { + "epoch": 0.63, + "grad_norm": 1.571569770741846, + "learning_rate": 6.405660628621628e-06, + "loss": 0.7557, + "step": 6179 + }, + { + "epoch": 0.63, + "grad_norm": 1.40999018231601, + "learning_rate": 6.40258653504378e-06, + "loss": 0.6783, + "step": 6180 + }, + { + "epoch": 0.63, + "grad_norm": 1.3555682140849614, + "learning_rate": 6.3995128319089036e-06, + "loss": 0.6635, + "step": 6181 + }, + { + "epoch": 0.63, + "grad_norm": 1.7179220579329704, + "learning_rate": 6.396439519550609e-06, + "loss": 0.821, + "step": 6182 + }, + { + "epoch": 0.63, + "grad_norm": 1.556213439686506, + "learning_rate": 6.3933665983024465e-06, + "loss": 0.7582, + "step": 6183 + }, + { + "epoch": 0.63, + "grad_norm": 1.76363791750226, + "learning_rate": 6.390294068497937e-06, + "loss": 0.7535, + "step": 6184 + }, + { + "epoch": 0.63, + "grad_norm": 1.7217112711922193, + "learning_rate": 6.387221930470564e-06, + "loss": 0.6494, + "step": 6185 + }, + { + "epoch": 0.63, + "grad_norm": 1.639820890694774, + "learning_rate": 6.3841501845537464e-06, + "loss": 0.7747, + "step": 6186 + }, + { + "epoch": 0.63, + "grad_norm": 1.8380696332977846, + "learning_rate": 6.3810788310808855e-06, + "loss": 0.7267, + "step": 6187 + }, + { + "epoch": 0.63, + "grad_norm": 1.6150044913304795, + "learning_rate": 6.378007870385321e-06, + "loss": 0.7976, + "step": 6188 + }, + { + "epoch": 0.63, + "grad_norm": 1.6080626517186347, + "learning_rate": 6.3749373028003595e-06, + "loss": 0.645, + "step": 6189 + }, + { + "epoch": 0.63, + "grad_norm": 1.590231884761789, + "learning_rate": 6.371867128659267e-06, + "loss": 0.6826, + "step": 6190 + }, + { + "epoch": 0.63, + "grad_norm": 1.6438217494955731, + "learning_rate": 6.368797348295257e-06, + "loss": 0.7977, + "step": 6191 + }, + { + "epoch": 0.63, + "grad_norm": 1.7206786971877635, + "learning_rate": 6.36572796204151e-06, + "loss": 0.7241, + "step": 6192 + }, + { + "epoch": 0.63, + "grad_norm": 1.6000650003086683, + "learning_rate": 6.362658970231161e-06, + "loss": 0.7469, + "step": 6193 + }, + { + "epoch": 0.63, + "grad_norm": 1.543427534178332, + "learning_rate": 6.3595903731972975e-06, + "loss": 0.7506, + "step": 6194 + }, + { + "epoch": 0.63, + "grad_norm": 1.6441500222599714, + "learning_rate": 6.356522171272972e-06, + "loss": 0.7373, + "step": 6195 + }, + { + "epoch": 0.63, + "grad_norm": 1.5579041192143457, + "learning_rate": 6.353454364791184e-06, + "loss": 0.7233, + "step": 6196 + }, + { + "epoch": 0.63, + "grad_norm": 1.6165157378519235, + "learning_rate": 6.350386954084898e-06, + "loss": 0.7153, + "step": 6197 + }, + { + "epoch": 0.63, + "grad_norm": 1.6553996838473866, + "learning_rate": 6.34731993948704e-06, + "loss": 0.6837, + "step": 6198 + }, + { + "epoch": 0.63, + "grad_norm": 1.6509896130097286, + "learning_rate": 6.344253321330476e-06, + "loss": 0.7204, + "step": 6199 + }, + { + "epoch": 0.63, + "grad_norm": 1.5788859467893934, + "learning_rate": 6.341187099948049e-06, + "loss": 0.7383, + "step": 6200 + }, + { + "epoch": 0.63, + "grad_norm": 1.740660603581767, + "learning_rate": 6.33812127567254e-06, + "loss": 0.7318, + "step": 6201 + }, + { + "epoch": 0.63, + "grad_norm": 1.6619288434644135, + "learning_rate": 6.335055848836702e-06, + "loss": 0.6514, + "step": 6202 + }, + { + "epoch": 0.63, + "grad_norm": 1.740794983691795, + "learning_rate": 6.3319908197732415e-06, + "loss": 0.6587, + "step": 6203 + }, + { + "epoch": 0.63, + "grad_norm": 1.6781697662136037, + "learning_rate": 6.328926188814814e-06, + "loss": 0.8202, + "step": 6204 + }, + { + "epoch": 0.63, + "grad_norm": 1.5546369098985757, + "learning_rate": 6.325861956294042e-06, + "loss": 0.6052, + "step": 6205 + }, + { + "epoch": 0.63, + "grad_norm": 1.6583434658522769, + "learning_rate": 6.322798122543494e-06, + "loss": 0.6109, + "step": 6206 + }, + { + "epoch": 0.63, + "grad_norm": 1.7530808501779143, + "learning_rate": 6.319734687895704e-06, + "loss": 0.6413, + "step": 6207 + }, + { + "epoch": 0.63, + "grad_norm": 1.611177983099102, + "learning_rate": 6.316671652683166e-06, + "loss": 0.7458, + "step": 6208 + }, + { + "epoch": 0.63, + "grad_norm": 1.4791293149751368, + "learning_rate": 6.3136090172383136e-06, + "loss": 0.7304, + "step": 6209 + }, + { + "epoch": 0.63, + "grad_norm": 1.7167625865519724, + "learning_rate": 6.310546781893556e-06, + "loss": 0.7791, + "step": 6210 + }, + { + "epoch": 0.63, + "grad_norm": 1.6833743431663846, + "learning_rate": 6.307484946981251e-06, + "loss": 0.7106, + "step": 6211 + }, + { + "epoch": 0.63, + "grad_norm": 1.7007098813718895, + "learning_rate": 6.3044235128337065e-06, + "loss": 0.7332, + "step": 6212 + }, + { + "epoch": 0.63, + "grad_norm": 1.4874192642800728, + "learning_rate": 6.301362479783202e-06, + "loss": 0.648, + "step": 6213 + }, + { + "epoch": 0.63, + "grad_norm": 1.6123542295788824, + "learning_rate": 6.298301848161956e-06, + "loss": 0.6245, + "step": 6214 + }, + { + "epoch": 0.63, + "grad_norm": 1.4299700958218342, + "learning_rate": 6.295241618302156e-06, + "loss": 0.7062, + "step": 6215 + }, + { + "epoch": 0.63, + "grad_norm": 1.6539641925588098, + "learning_rate": 6.292181790535947e-06, + "loss": 0.7223, + "step": 6216 + }, + { + "epoch": 0.63, + "grad_norm": 1.778675414817209, + "learning_rate": 6.289122365195416e-06, + "loss": 0.8371, + "step": 6217 + }, + { + "epoch": 0.63, + "grad_norm": 1.576626746519206, + "learning_rate": 6.286063342612625e-06, + "loss": 0.6435, + "step": 6218 + }, + { + "epoch": 0.63, + "grad_norm": 1.514287004334272, + "learning_rate": 6.283004723119575e-06, + "loss": 0.6376, + "step": 6219 + }, + { + "epoch": 0.63, + "grad_norm": 1.5667375882028205, + "learning_rate": 6.2799465070482335e-06, + "loss": 0.7439, + "step": 6220 + }, + { + "epoch": 0.63, + "grad_norm": 1.5966702971852884, + "learning_rate": 6.276888694730529e-06, + "loss": 0.674, + "step": 6221 + }, + { + "epoch": 0.63, + "grad_norm": 1.7417020646634092, + "learning_rate": 6.27383128649833e-06, + "loss": 0.7694, + "step": 6222 + }, + { + "epoch": 0.63, + "grad_norm": 1.582188756240723, + "learning_rate": 6.270774282683476e-06, + "loss": 0.7764, + "step": 6223 + }, + { + "epoch": 0.63, + "grad_norm": 1.6387750113625243, + "learning_rate": 6.267717683617753e-06, + "loss": 0.7126, + "step": 6224 + }, + { + "epoch": 0.63, + "grad_norm": 1.7036047789699225, + "learning_rate": 6.26466148963291e-06, + "loss": 0.6611, + "step": 6225 + }, + { + "epoch": 0.63, + "grad_norm": 1.4623154809803915, + "learning_rate": 6.261605701060649e-06, + "loss": 0.6599, + "step": 6226 + }, + { + "epoch": 0.63, + "grad_norm": 1.6301884041651713, + "learning_rate": 6.258550318232626e-06, + "loss": 0.6825, + "step": 6227 + }, + { + "epoch": 0.63, + "grad_norm": 1.500453691932618, + "learning_rate": 6.255495341480455e-06, + "loss": 0.8056, + "step": 6228 + }, + { + "epoch": 0.63, + "grad_norm": 1.861004993798108, + "learning_rate": 6.25244077113571e-06, + "loss": 0.6886, + "step": 6229 + }, + { + "epoch": 0.63, + "grad_norm": 1.4682486308246931, + "learning_rate": 6.249386607529914e-06, + "loss": 0.6634, + "step": 6230 + }, + { + "epoch": 0.63, + "grad_norm": 1.5279737405454734, + "learning_rate": 6.246332850994547e-06, + "loss": 0.7362, + "step": 6231 + }, + { + "epoch": 0.63, + "grad_norm": 1.5951840003051774, + "learning_rate": 6.243279501861048e-06, + "loss": 0.6437, + "step": 6232 + }, + { + "epoch": 0.63, + "grad_norm": 1.5273387782005643, + "learning_rate": 6.240226560460811e-06, + "loss": 0.7581, + "step": 6233 + }, + { + "epoch": 0.63, + "grad_norm": 1.5694446493717522, + "learning_rate": 6.237174027125186e-06, + "loss": 0.7142, + "step": 6234 + }, + { + "epoch": 0.63, + "grad_norm": 1.7587644931185076, + "learning_rate": 6.234121902185475e-06, + "loss": 0.7101, + "step": 6235 + }, + { + "epoch": 0.63, + "grad_norm": 1.538172724798086, + "learning_rate": 6.231070185972943e-06, + "loss": 0.588, + "step": 6236 + }, + { + "epoch": 0.63, + "grad_norm": 1.732688285618746, + "learning_rate": 6.2280188788187975e-06, + "loss": 0.5983, + "step": 6237 + }, + { + "epoch": 0.63, + "grad_norm": 1.681520609984493, + "learning_rate": 6.224967981054216e-06, + "loss": 0.7516, + "step": 6238 + }, + { + "epoch": 0.63, + "grad_norm": 1.6385101464147946, + "learning_rate": 6.22191749301033e-06, + "loss": 0.7589, + "step": 6239 + }, + { + "epoch": 0.63, + "grad_norm": 1.7230564965309907, + "learning_rate": 6.218867415018213e-06, + "loss": 0.7187, + "step": 6240 + }, + { + "epoch": 0.63, + "grad_norm": 1.6237010455135803, + "learning_rate": 6.215817747408912e-06, + "loss": 0.7512, + "step": 6241 + }, + { + "epoch": 0.63, + "grad_norm": 1.7284927765460951, + "learning_rate": 6.212768490513412e-06, + "loss": 0.7624, + "step": 6242 + }, + { + "epoch": 0.64, + "grad_norm": 1.4408795649716721, + "learning_rate": 6.209719644662668e-06, + "loss": 0.6562, + "step": 6243 + }, + { + "epoch": 0.64, + "grad_norm": 1.5648988173321112, + "learning_rate": 6.206671210187587e-06, + "loss": 0.6206, + "step": 6244 + }, + { + "epoch": 0.64, + "grad_norm": 1.5329244582770034, + "learning_rate": 6.203623187419021e-06, + "loss": 0.7085, + "step": 6245 + }, + { + "epoch": 0.64, + "grad_norm": 1.689201951258469, + "learning_rate": 6.200575576687788e-06, + "loss": 0.6889, + "step": 6246 + }, + { + "epoch": 0.64, + "grad_norm": 1.5068712420456922, + "learning_rate": 6.197528378324664e-06, + "loss": 0.6117, + "step": 6247 + }, + { + "epoch": 0.64, + "grad_norm": 1.7289125382594037, + "learning_rate": 6.194481592660369e-06, + "loss": 0.7321, + "step": 6248 + }, + { + "epoch": 0.64, + "grad_norm": 1.6075378730320198, + "learning_rate": 6.191435220025586e-06, + "loss": 0.7114, + "step": 6249 + }, + { + "epoch": 0.64, + "grad_norm": 1.6979383319765728, + "learning_rate": 6.188389260750948e-06, + "loss": 0.6504, + "step": 6250 + }, + { + "epoch": 0.64, + "grad_norm": 1.741041947752599, + "learning_rate": 6.18534371516705e-06, + "loss": 0.7787, + "step": 6251 + }, + { + "epoch": 0.64, + "grad_norm": 1.6692017023266297, + "learning_rate": 6.18229858360444e-06, + "loss": 0.7007, + "step": 6252 + }, + { + "epoch": 0.64, + "grad_norm": 1.5863510304471515, + "learning_rate": 6.179253866393613e-06, + "loss": 0.7042, + "step": 6253 + }, + { + "epoch": 0.64, + "grad_norm": 1.5770757934096424, + "learning_rate": 6.1762095638650336e-06, + "loss": 0.7247, + "step": 6254 + }, + { + "epoch": 0.64, + "grad_norm": 1.5956129238205312, + "learning_rate": 6.173165676349103e-06, + "loss": 0.7633, + "step": 6255 + }, + { + "epoch": 0.64, + "grad_norm": 1.6212786954144751, + "learning_rate": 6.170122204176194e-06, + "loss": 0.7317, + "step": 6256 + }, + { + "epoch": 0.64, + "grad_norm": 1.6222304674140229, + "learning_rate": 6.167079147676632e-06, + "loss": 0.6724, + "step": 6257 + }, + { + "epoch": 0.64, + "grad_norm": 1.5572506586899673, + "learning_rate": 6.164036507180684e-06, + "loss": 0.682, + "step": 6258 + }, + { + "epoch": 0.64, + "grad_norm": 1.6638911977836073, + "learning_rate": 6.160994283018589e-06, + "loss": 0.7388, + "step": 6259 + }, + { + "epoch": 0.64, + "grad_norm": 1.6200290901079435, + "learning_rate": 6.157952475520525e-06, + "loss": 0.7179, + "step": 6260 + }, + { + "epoch": 0.64, + "grad_norm": 1.775490359235819, + "learning_rate": 6.154911085016637e-06, + "loss": 0.6368, + "step": 6261 + }, + { + "epoch": 0.64, + "grad_norm": 1.5992580931554452, + "learning_rate": 6.151870111837024e-06, + "loss": 0.8094, + "step": 6262 + }, + { + "epoch": 0.64, + "grad_norm": 1.7821024117668556, + "learning_rate": 6.148829556311728e-06, + "loss": 0.8342, + "step": 6263 + }, + { + "epoch": 0.64, + "grad_norm": 1.5027920068721097, + "learning_rate": 6.1457894187707644e-06, + "loss": 0.6714, + "step": 6264 + }, + { + "epoch": 0.64, + "grad_norm": 1.7657574807299323, + "learning_rate": 6.142749699544079e-06, + "loss": 0.7252, + "step": 6265 + }, + { + "epoch": 0.64, + "grad_norm": 1.5835231242874228, + "learning_rate": 6.1397103989615955e-06, + "loss": 0.6411, + "step": 6266 + }, + { + "epoch": 0.64, + "grad_norm": 1.5892404817482912, + "learning_rate": 6.136671517353183e-06, + "loss": 0.7354, + "step": 6267 + }, + { + "epoch": 0.64, + "grad_norm": 1.3784828776341291, + "learning_rate": 6.133633055048658e-06, + "loss": 0.5934, + "step": 6268 + }, + { + "epoch": 0.64, + "grad_norm": 1.7207808030825036, + "learning_rate": 6.1305950123778024e-06, + "loss": 0.7213, + "step": 6269 + }, + { + "epoch": 0.64, + "grad_norm": 1.5378430526726719, + "learning_rate": 6.127557389670351e-06, + "loss": 0.6262, + "step": 6270 + }, + { + "epoch": 0.64, + "grad_norm": 1.5920669806506544, + "learning_rate": 6.124520187255983e-06, + "loss": 0.7416, + "step": 6271 + }, + { + "epoch": 0.64, + "grad_norm": 1.406596466187717, + "learning_rate": 6.121483405464346e-06, + "loss": 0.6638, + "step": 6272 + }, + { + "epoch": 0.64, + "grad_norm": 1.7939125165954555, + "learning_rate": 6.118447044625027e-06, + "loss": 0.7147, + "step": 6273 + }, + { + "epoch": 0.64, + "grad_norm": 1.54819035552784, + "learning_rate": 6.115411105067582e-06, + "loss": 0.5916, + "step": 6274 + }, + { + "epoch": 0.64, + "grad_norm": 1.7152824570643843, + "learning_rate": 6.112375587121518e-06, + "loss": 0.7127, + "step": 6275 + }, + { + "epoch": 0.64, + "grad_norm": 1.7193281644819463, + "learning_rate": 6.109340491116284e-06, + "loss": 0.762, + "step": 6276 + }, + { + "epoch": 0.64, + "grad_norm": 1.6424410423978122, + "learning_rate": 6.1063058173812975e-06, + "loss": 0.7889, + "step": 6277 + }, + { + "epoch": 0.64, + "grad_norm": 1.6949947318613092, + "learning_rate": 6.103271566245925e-06, + "loss": 0.7154, + "step": 6278 + }, + { + "epoch": 0.64, + "grad_norm": 1.514080879501498, + "learning_rate": 6.1002377380394835e-06, + "loss": 0.7785, + "step": 6279 + }, + { + "epoch": 0.64, + "grad_norm": 1.506629183284391, + "learning_rate": 6.097204333091254e-06, + "loss": 0.7608, + "step": 6280 + }, + { + "epoch": 0.64, + "grad_norm": 1.5244336613717824, + "learning_rate": 6.094171351730458e-06, + "loss": 0.7205, + "step": 6281 + }, + { + "epoch": 0.64, + "grad_norm": 1.66852571528665, + "learning_rate": 6.0911387942862835e-06, + "loss": 0.7075, + "step": 6282 + }, + { + "epoch": 0.64, + "grad_norm": 1.6264908140490997, + "learning_rate": 6.0881066610878624e-06, + "loss": 0.7442, + "step": 6283 + }, + { + "epoch": 0.64, + "grad_norm": 1.4881575242339578, + "learning_rate": 6.08507495246429e-06, + "loss": 0.8112, + "step": 6284 + }, + { + "epoch": 0.64, + "grad_norm": 1.810367992210238, + "learning_rate": 6.08204366874461e-06, + "loss": 0.8227, + "step": 6285 + }, + { + "epoch": 0.64, + "grad_norm": 1.6755010174301157, + "learning_rate": 6.079012810257816e-06, + "loss": 0.7535, + "step": 6286 + }, + { + "epoch": 0.64, + "grad_norm": 1.5551086318305345, + "learning_rate": 6.075982377332868e-06, + "loss": 0.6764, + "step": 6287 + }, + { + "epoch": 0.64, + "grad_norm": 1.5306851594326032, + "learning_rate": 6.072952370298667e-06, + "loss": 0.6971, + "step": 6288 + }, + { + "epoch": 0.64, + "grad_norm": 1.7111329561521385, + "learning_rate": 6.069922789484073e-06, + "loss": 0.6651, + "step": 6289 + }, + { + "epoch": 0.64, + "grad_norm": 1.6290435054738361, + "learning_rate": 6.066893635217903e-06, + "loss": 0.6927, + "step": 6290 + }, + { + "epoch": 0.64, + "grad_norm": 1.792078224753121, + "learning_rate": 6.0638649078289195e-06, + "loss": 0.7057, + "step": 6291 + }, + { + "epoch": 0.64, + "grad_norm": 1.5726534950494397, + "learning_rate": 6.060836607645845e-06, + "loss": 0.7077, + "step": 6292 + }, + { + "epoch": 0.64, + "grad_norm": 1.414599565633372, + "learning_rate": 6.057808734997359e-06, + "loss": 0.6678, + "step": 6293 + }, + { + "epoch": 0.64, + "grad_norm": 1.5506814668719011, + "learning_rate": 6.054781290212083e-06, + "loss": 0.617, + "step": 6294 + }, + { + "epoch": 0.64, + "grad_norm": 1.5262725814759375, + "learning_rate": 6.051754273618605e-06, + "loss": 0.7637, + "step": 6295 + }, + { + "epoch": 0.64, + "grad_norm": 1.5878570034248665, + "learning_rate": 6.0487276855454525e-06, + "loss": 0.7669, + "step": 6296 + }, + { + "epoch": 0.64, + "grad_norm": 1.5719283837210194, + "learning_rate": 6.04570152632112e-06, + "loss": 0.7243, + "step": 6297 + }, + { + "epoch": 0.64, + "grad_norm": 1.5875716436302951, + "learning_rate": 6.042675796274051e-06, + "loss": 0.6854, + "step": 6298 + }, + { + "epoch": 0.64, + "grad_norm": 1.5012491845451224, + "learning_rate": 6.0396504957326365e-06, + "loss": 0.7472, + "step": 6299 + }, + { + "epoch": 0.64, + "grad_norm": 1.5980477111010662, + "learning_rate": 6.03662562502523e-06, + "loss": 0.7216, + "step": 6300 + }, + { + "epoch": 0.64, + "grad_norm": 1.5034426911917258, + "learning_rate": 6.033601184480129e-06, + "loss": 0.619, + "step": 6301 + }, + { + "epoch": 0.64, + "grad_norm": 1.4686870305170632, + "learning_rate": 6.03057717442559e-06, + "loss": 0.638, + "step": 6302 + }, + { + "epoch": 0.64, + "grad_norm": 1.5049175281810077, + "learning_rate": 6.02755359518983e-06, + "loss": 0.667, + "step": 6303 + }, + { + "epoch": 0.64, + "grad_norm": 1.6873684104287792, + "learning_rate": 6.024530447101e-06, + "loss": 0.7237, + "step": 6304 + }, + { + "epoch": 0.64, + "grad_norm": 1.7609432099220983, + "learning_rate": 6.021507730487221e-06, + "loss": 0.7489, + "step": 6305 + }, + { + "epoch": 0.64, + "grad_norm": 1.4623537810787113, + "learning_rate": 6.018485445676565e-06, + "loss": 0.6506, + "step": 6306 + }, + { + "epoch": 0.64, + "grad_norm": 1.888703526424575, + "learning_rate": 6.015463592997046e-06, + "loss": 0.7954, + "step": 6307 + }, + { + "epoch": 0.64, + "grad_norm": 1.6726747559744533, + "learning_rate": 6.0124421727766465e-06, + "loss": 0.7867, + "step": 6308 + }, + { + "epoch": 0.64, + "grad_norm": 1.6863782626305706, + "learning_rate": 6.009421185343287e-06, + "loss": 0.7393, + "step": 6309 + }, + { + "epoch": 0.64, + "grad_norm": 1.6499492821668567, + "learning_rate": 6.006400631024851e-06, + "loss": 0.8357, + "step": 6310 + }, + { + "epoch": 0.64, + "grad_norm": 1.7095743812571649, + "learning_rate": 6.003380510149179e-06, + "loss": 0.6429, + "step": 6311 + }, + { + "epoch": 0.64, + "grad_norm": 1.6484443288049906, + "learning_rate": 6.000360823044049e-06, + "loss": 0.7172, + "step": 6312 + }, + { + "epoch": 0.64, + "grad_norm": 1.4579244853163298, + "learning_rate": 5.997341570037208e-06, + "loss": 0.729, + "step": 6313 + }, + { + "epoch": 0.64, + "grad_norm": 1.4871202444789973, + "learning_rate": 5.994322751456339e-06, + "loss": 0.6165, + "step": 6314 + }, + { + "epoch": 0.64, + "grad_norm": 1.5291037181344551, + "learning_rate": 5.9913043676290964e-06, + "loss": 0.718, + "step": 6315 + }, + { + "epoch": 0.64, + "grad_norm": 1.5223742645046954, + "learning_rate": 5.988286418883078e-06, + "loss": 0.7463, + "step": 6316 + }, + { + "epoch": 0.64, + "grad_norm": 1.3940083711431022, + "learning_rate": 5.985268905545829e-06, + "loss": 0.5375, + "step": 6317 + }, + { + "epoch": 0.64, + "grad_norm": 1.6237783872688747, + "learning_rate": 5.98225182794486e-06, + "loss": 0.812, + "step": 6318 + }, + { + "epoch": 0.64, + "grad_norm": 1.6176773287997215, + "learning_rate": 5.979235186407622e-06, + "loss": 0.8133, + "step": 6319 + }, + { + "epoch": 0.64, + "grad_norm": 1.9042342799536476, + "learning_rate": 5.9762189812615254e-06, + "loss": 0.7468, + "step": 6320 + }, + { + "epoch": 0.64, + "grad_norm": 1.7788211346001306, + "learning_rate": 5.973203212833939e-06, + "loss": 0.7365, + "step": 6321 + }, + { + "epoch": 0.64, + "grad_norm": 1.8176366507210457, + "learning_rate": 5.970187881452168e-06, + "loss": 0.6977, + "step": 6322 + }, + { + "epoch": 0.64, + "grad_norm": 1.4221286727959068, + "learning_rate": 5.967172987443482e-06, + "loss": 0.5771, + "step": 6323 + }, + { + "epoch": 0.64, + "grad_norm": 1.4065211568246696, + "learning_rate": 5.964158531135106e-06, + "loss": 0.6418, + "step": 6324 + }, + { + "epoch": 0.64, + "grad_norm": 1.650739756578554, + "learning_rate": 5.961144512854205e-06, + "loss": 0.6653, + "step": 6325 + }, + { + "epoch": 0.64, + "grad_norm": 1.6301746789006268, + "learning_rate": 5.958130932927908e-06, + "loss": 0.7417, + "step": 6326 + }, + { + "epoch": 0.64, + "grad_norm": 1.4479738696607576, + "learning_rate": 5.955117791683289e-06, + "loss": 0.6565, + "step": 6327 + }, + { + "epoch": 0.64, + "grad_norm": 1.5210492576094117, + "learning_rate": 5.952105089447378e-06, + "loss": 0.6512, + "step": 6328 + }, + { + "epoch": 0.64, + "grad_norm": 1.656886978316453, + "learning_rate": 5.94909282654716e-06, + "loss": 0.6448, + "step": 6329 + }, + { + "epoch": 0.64, + "grad_norm": 1.6081903057337392, + "learning_rate": 5.946081003309565e-06, + "loss": 0.5581, + "step": 6330 + }, + { + "epoch": 0.64, + "grad_norm": 1.753778483406041, + "learning_rate": 5.943069620061481e-06, + "loss": 0.7395, + "step": 6331 + }, + { + "epoch": 0.64, + "grad_norm": 1.6802658251468272, + "learning_rate": 5.940058677129748e-06, + "loss": 0.6784, + "step": 6332 + }, + { + "epoch": 0.64, + "grad_norm": 1.6891268164739461, + "learning_rate": 5.937048174841153e-06, + "loss": 0.7862, + "step": 6333 + }, + { + "epoch": 0.64, + "grad_norm": 1.918030920602363, + "learning_rate": 5.934038113522442e-06, + "loss": 0.7078, + "step": 6334 + }, + { + "epoch": 0.64, + "grad_norm": 1.7695806628166773, + "learning_rate": 5.9310284935003106e-06, + "loss": 0.7207, + "step": 6335 + }, + { + "epoch": 0.64, + "grad_norm": 1.5286937983901505, + "learning_rate": 5.928019315101403e-06, + "loss": 0.639, + "step": 6336 + }, + { + "epoch": 0.64, + "grad_norm": 1.5447517041351149, + "learning_rate": 5.92501057865232e-06, + "loss": 0.6319, + "step": 6337 + }, + { + "epoch": 0.64, + "grad_norm": 1.5609798103338413, + "learning_rate": 5.922002284479614e-06, + "loss": 0.642, + "step": 6338 + }, + { + "epoch": 0.64, + "grad_norm": 1.5687889938638144, + "learning_rate": 5.9189944329097885e-06, + "loss": 0.7272, + "step": 6339 + }, + { + "epoch": 0.64, + "grad_norm": 1.6224286950572846, + "learning_rate": 5.915987024269294e-06, + "loss": 0.7058, + "step": 6340 + }, + { + "epoch": 0.65, + "grad_norm": 1.5027567146457037, + "learning_rate": 5.912980058884543e-06, + "loss": 0.7581, + "step": 6341 + }, + { + "epoch": 0.65, + "grad_norm": 1.7082554644942154, + "learning_rate": 5.909973537081893e-06, + "loss": 0.7154, + "step": 6342 + }, + { + "epoch": 0.65, + "grad_norm": 1.691328540293944, + "learning_rate": 5.9069674591876535e-06, + "loss": 0.7977, + "step": 6343 + }, + { + "epoch": 0.65, + "grad_norm": 1.6256752787523274, + "learning_rate": 5.903961825528092e-06, + "loss": 0.6784, + "step": 6344 + }, + { + "epoch": 0.65, + "grad_norm": 1.556542188553163, + "learning_rate": 5.900956636429416e-06, + "loss": 0.6217, + "step": 6345 + }, + { + "epoch": 0.65, + "grad_norm": 1.7067246350845486, + "learning_rate": 5.8979518922177956e-06, + "loss": 0.7909, + "step": 6346 + }, + { + "epoch": 0.65, + "grad_norm": 1.5820138550985727, + "learning_rate": 5.8949475932193505e-06, + "loss": 0.6841, + "step": 6347 + }, + { + "epoch": 0.65, + "grad_norm": 1.540678213252206, + "learning_rate": 5.891943739760144e-06, + "loss": 0.6931, + "step": 6348 + }, + { + "epoch": 0.65, + "grad_norm": 1.6771935590518536, + "learning_rate": 5.888940332166209e-06, + "loss": 0.6764, + "step": 6349 + }, + { + "epoch": 0.65, + "grad_norm": 1.7257977664771067, + "learning_rate": 5.885937370763503e-06, + "loss": 0.6693, + "step": 6350 + }, + { + "epoch": 0.65, + "grad_norm": 1.7398417630894818, + "learning_rate": 5.882934855877962e-06, + "loss": 0.7415, + "step": 6351 + }, + { + "epoch": 0.65, + "grad_norm": 1.8211802165336226, + "learning_rate": 5.8799327878354615e-06, + "loss": 0.7329, + "step": 6352 + }, + { + "epoch": 0.65, + "grad_norm": 1.6342677687069151, + "learning_rate": 5.876931166961823e-06, + "loss": 0.758, + "step": 6353 + }, + { + "epoch": 0.65, + "grad_norm": 1.5043393902621687, + "learning_rate": 5.873929993582832e-06, + "loss": 0.6999, + "step": 6354 + }, + { + "epoch": 0.65, + "grad_norm": 1.6347981025495912, + "learning_rate": 5.8709292680242114e-06, + "loss": 0.6204, + "step": 6355 + }, + { + "epoch": 0.65, + "grad_norm": 1.5008672609314513, + "learning_rate": 5.867928990611647e-06, + "loss": 0.5959, + "step": 6356 + }, + { + "epoch": 0.65, + "grad_norm": 1.6378971953945671, + "learning_rate": 5.864929161670778e-06, + "loss": 0.7179, + "step": 6357 + }, + { + "epoch": 0.65, + "grad_norm": 1.5754498848685459, + "learning_rate": 5.861929781527178e-06, + "loss": 0.698, + "step": 6358 + }, + { + "epoch": 0.65, + "grad_norm": 1.553793198749774, + "learning_rate": 5.858930850506388e-06, + "loss": 0.6156, + "step": 6359 + }, + { + "epoch": 0.65, + "grad_norm": 1.458805236794188, + "learning_rate": 5.8559323689338985e-06, + "loss": 0.7464, + "step": 6360 + }, + { + "epoch": 0.65, + "grad_norm": 1.5183497123812193, + "learning_rate": 5.852934337135142e-06, + "loss": 0.6604, + "step": 6361 + }, + { + "epoch": 0.65, + "grad_norm": 1.7357794415167893, + "learning_rate": 5.849936755435513e-06, + "loss": 0.75, + "step": 6362 + }, + { + "epoch": 0.65, + "grad_norm": 1.4167584344055437, + "learning_rate": 5.846939624160346e-06, + "loss": 0.6739, + "step": 6363 + }, + { + "epoch": 0.65, + "grad_norm": 1.5571878965670656, + "learning_rate": 5.843942943634937e-06, + "loss": 0.7643, + "step": 6364 + }, + { + "epoch": 0.65, + "grad_norm": 1.674640396827517, + "learning_rate": 5.840946714184531e-06, + "loss": 0.8063, + "step": 6365 + }, + { + "epoch": 0.65, + "grad_norm": 1.6494268241629362, + "learning_rate": 5.8379509361343154e-06, + "loss": 0.6854, + "step": 6366 + }, + { + "epoch": 0.65, + "grad_norm": 1.5840646906041485, + "learning_rate": 5.834955609809443e-06, + "loss": 0.6196, + "step": 6367 + }, + { + "epoch": 0.65, + "grad_norm": 1.6550653534082749, + "learning_rate": 5.831960735534999e-06, + "loss": 0.7491, + "step": 6368 + }, + { + "epoch": 0.65, + "grad_norm": 1.7931312062298828, + "learning_rate": 5.8289663136360376e-06, + "loss": 0.7552, + "step": 6369 + }, + { + "epoch": 0.65, + "grad_norm": 1.6495599365387563, + "learning_rate": 5.8259723444375605e-06, + "loss": 0.805, + "step": 6370 + }, + { + "epoch": 0.65, + "grad_norm": 1.6737151146352796, + "learning_rate": 5.822978828264505e-06, + "loss": 0.7276, + "step": 6371 + }, + { + "epoch": 0.65, + "grad_norm": 1.6837966851701676, + "learning_rate": 5.819985765441781e-06, + "loss": 0.7014, + "step": 6372 + }, + { + "epoch": 0.65, + "grad_norm": 1.5072843482812206, + "learning_rate": 5.81699315629423e-06, + "loss": 0.6911, + "step": 6373 + }, + { + "epoch": 0.65, + "grad_norm": 1.7206761150182308, + "learning_rate": 5.814001001146657e-06, + "loss": 0.6909, + "step": 6374 + }, + { + "epoch": 0.65, + "grad_norm": 1.5046183305177292, + "learning_rate": 5.8110093003238175e-06, + "loss": 0.627, + "step": 6375 + }, + { + "epoch": 0.65, + "grad_norm": 1.4910705578526564, + "learning_rate": 5.808018054150406e-06, + "loss": 0.6718, + "step": 6376 + }, + { + "epoch": 0.65, + "grad_norm": 1.5485078588939911, + "learning_rate": 5.805027262951079e-06, + "loss": 0.7327, + "step": 6377 + }, + { + "epoch": 0.65, + "grad_norm": 1.6506032689070136, + "learning_rate": 5.802036927050447e-06, + "loss": 0.6427, + "step": 6378 + }, + { + "epoch": 0.65, + "grad_norm": 1.6698136293154233, + "learning_rate": 5.799047046773052e-06, + "loss": 0.8641, + "step": 6379 + }, + { + "epoch": 0.65, + "grad_norm": 1.5663409493127316, + "learning_rate": 5.7960576224434074e-06, + "loss": 0.6532, + "step": 6380 + }, + { + "epoch": 0.65, + "grad_norm": 1.485427935130718, + "learning_rate": 5.793068654385963e-06, + "loss": 0.6535, + "step": 6381 + }, + { + "epoch": 0.65, + "grad_norm": 1.6252387841757945, + "learning_rate": 5.790080142925128e-06, + "loss": 0.7337, + "step": 6382 + }, + { + "epoch": 0.65, + "grad_norm": 1.6319477125343451, + "learning_rate": 5.7870920883852595e-06, + "loss": 0.6668, + "step": 6383 + }, + { + "epoch": 0.65, + "grad_norm": 1.8594213711478256, + "learning_rate": 5.7841044910906585e-06, + "loss": 0.7431, + "step": 6384 + }, + { + "epoch": 0.65, + "grad_norm": 1.6592215105240347, + "learning_rate": 5.781117351365592e-06, + "loss": 0.7135, + "step": 6385 + }, + { + "epoch": 0.65, + "grad_norm": 1.517474499115124, + "learning_rate": 5.778130669534254e-06, + "loss": 0.6853, + "step": 6386 + }, + { + "epoch": 0.65, + "grad_norm": 1.4892996147591693, + "learning_rate": 5.775144445920811e-06, + "loss": 0.7471, + "step": 6387 + }, + { + "epoch": 0.65, + "grad_norm": 1.5256135918516867, + "learning_rate": 5.772158680849374e-06, + "loss": 0.7412, + "step": 6388 + }, + { + "epoch": 0.65, + "grad_norm": 1.5529282088524847, + "learning_rate": 5.769173374643991e-06, + "loss": 0.6539, + "step": 6389 + }, + { + "epoch": 0.65, + "grad_norm": 1.6485522629940237, + "learning_rate": 5.766188527628679e-06, + "loss": 0.7233, + "step": 6390 + }, + { + "epoch": 0.65, + "grad_norm": 1.6621024165012506, + "learning_rate": 5.76320414012739e-06, + "loss": 0.6734, + "step": 6391 + }, + { + "epoch": 0.65, + "grad_norm": 1.7848678387530656, + "learning_rate": 5.760220212464034e-06, + "loss": 0.7261, + "step": 6392 + }, + { + "epoch": 0.65, + "grad_norm": 1.4887429010798594, + "learning_rate": 5.757236744962476e-06, + "loss": 0.6584, + "step": 6393 + }, + { + "epoch": 0.65, + "grad_norm": 1.67450143515445, + "learning_rate": 5.754253737946516e-06, + "loss": 0.6852, + "step": 6394 + }, + { + "epoch": 0.65, + "grad_norm": 1.5312323954574496, + "learning_rate": 5.751271191739917e-06, + "loss": 0.7058, + "step": 6395 + }, + { + "epoch": 0.65, + "grad_norm": 1.6236234571249322, + "learning_rate": 5.748289106666392e-06, + "loss": 0.7407, + "step": 6396 + }, + { + "epoch": 0.65, + "grad_norm": 1.6408794958238797, + "learning_rate": 5.7453074830495896e-06, + "loss": 0.7075, + "step": 6397 + }, + { + "epoch": 0.65, + "grad_norm": 1.9122164979695884, + "learning_rate": 5.742326321213127e-06, + "loss": 0.6851, + "step": 6398 + }, + { + "epoch": 0.65, + "grad_norm": 1.5172361668955892, + "learning_rate": 5.739345621480559e-06, + "loss": 0.6739, + "step": 6399 + }, + { + "epoch": 0.65, + "grad_norm": 1.5217073707337485, + "learning_rate": 5.736365384175393e-06, + "loss": 0.5661, + "step": 6400 + }, + { + "epoch": 0.65, + "grad_norm": 1.681497755371146, + "learning_rate": 5.733385609621092e-06, + "loss": 0.7032, + "step": 6401 + }, + { + "epoch": 0.65, + "grad_norm": 1.4642727363740708, + "learning_rate": 5.730406298141058e-06, + "loss": 0.6612, + "step": 6402 + }, + { + "epoch": 0.65, + "grad_norm": 1.6701836113853448, + "learning_rate": 5.7274274500586535e-06, + "loss": 0.6383, + "step": 6403 + }, + { + "epoch": 0.65, + "grad_norm": 1.5146756310866385, + "learning_rate": 5.724449065697182e-06, + "loss": 0.6371, + "step": 6404 + }, + { + "epoch": 0.65, + "grad_norm": 1.5455245536838689, + "learning_rate": 5.721471145379901e-06, + "loss": 0.6639, + "step": 6405 + }, + { + "epoch": 0.65, + "grad_norm": 1.705302002513825, + "learning_rate": 5.718493689430022e-06, + "loss": 0.807, + "step": 6406 + }, + { + "epoch": 0.65, + "grad_norm": 1.7491318011537786, + "learning_rate": 5.715516698170696e-06, + "loss": 0.8477, + "step": 6407 + }, + { + "epoch": 0.65, + "grad_norm": 1.666722548545367, + "learning_rate": 5.712540171925029e-06, + "loss": 0.7674, + "step": 6408 + }, + { + "epoch": 0.65, + "grad_norm": 1.7357789038610587, + "learning_rate": 5.709564111016081e-06, + "loss": 0.6924, + "step": 6409 + }, + { + "epoch": 0.65, + "grad_norm": 1.6110366786415917, + "learning_rate": 5.706588515766851e-06, + "loss": 0.6787, + "step": 6410 + }, + { + "epoch": 0.65, + "grad_norm": 1.6350931099155135, + "learning_rate": 5.7036133865003e-06, + "loss": 0.698, + "step": 6411 + }, + { + "epoch": 0.65, + "grad_norm": 1.7228823458815403, + "learning_rate": 5.700638723539325e-06, + "loss": 0.7233, + "step": 6412 + }, + { + "epoch": 0.65, + "grad_norm": 1.6021431264605037, + "learning_rate": 5.69766452720678e-06, + "loss": 0.6322, + "step": 6413 + }, + { + "epoch": 0.65, + "grad_norm": 1.6334750931777735, + "learning_rate": 5.694690797825475e-06, + "loss": 0.6427, + "step": 6414 + }, + { + "epoch": 0.65, + "grad_norm": 1.5833645717447067, + "learning_rate": 5.691717535718151e-06, + "loss": 0.7319, + "step": 6415 + }, + { + "epoch": 0.65, + "grad_norm": 1.7498137753102825, + "learning_rate": 5.688744741207516e-06, + "loss": 0.6878, + "step": 6416 + }, + { + "epoch": 0.65, + "grad_norm": 1.5676021784469838, + "learning_rate": 5.6857724146162215e-06, + "loss": 0.6379, + "step": 6417 + }, + { + "epoch": 0.65, + "grad_norm": 1.5217944573670767, + "learning_rate": 5.682800556266862e-06, + "loss": 0.6389, + "step": 6418 + }, + { + "epoch": 0.65, + "grad_norm": 1.3689128575405203, + "learning_rate": 5.6798291664819875e-06, + "loss": 0.5964, + "step": 6419 + }, + { + "epoch": 0.65, + "grad_norm": 1.5680466041697287, + "learning_rate": 5.676858245584103e-06, + "loss": 0.6202, + "step": 6420 + }, + { + "epoch": 0.65, + "grad_norm": 1.755950732711214, + "learning_rate": 5.6738877938956426e-06, + "loss": 0.7522, + "step": 6421 + }, + { + "epoch": 0.65, + "grad_norm": 1.7249538817905252, + "learning_rate": 5.6709178117390105e-06, + "loss": 0.7686, + "step": 6422 + }, + { + "epoch": 0.65, + "grad_norm": 1.69730402680779, + "learning_rate": 5.667948299436555e-06, + "loss": 0.682, + "step": 6423 + }, + { + "epoch": 0.65, + "grad_norm": 1.6589657016064634, + "learning_rate": 5.6649792573105625e-06, + "loss": 0.6969, + "step": 6424 + }, + { + "epoch": 0.65, + "grad_norm": 1.695822779042538, + "learning_rate": 5.662010685683279e-06, + "loss": 0.6876, + "step": 6425 + }, + { + "epoch": 0.65, + "grad_norm": 1.7421322612105266, + "learning_rate": 5.6590425848769e-06, + "loss": 0.6974, + "step": 6426 + }, + { + "epoch": 0.65, + "grad_norm": 1.8334794155879066, + "learning_rate": 5.6560749552135605e-06, + "loss": 0.8303, + "step": 6427 + }, + { + "epoch": 0.65, + "grad_norm": 1.6470435016775014, + "learning_rate": 5.653107797015354e-06, + "loss": 0.6621, + "step": 6428 + }, + { + "epoch": 0.65, + "grad_norm": 1.627521191505139, + "learning_rate": 5.6501411106043205e-06, + "loss": 0.6695, + "step": 6429 + }, + { + "epoch": 0.65, + "grad_norm": 1.5213801668421556, + "learning_rate": 5.647174896302442e-06, + "loss": 0.6911, + "step": 6430 + }, + { + "epoch": 0.65, + "grad_norm": 1.5363389440683215, + "learning_rate": 5.644209154431662e-06, + "loss": 0.6973, + "step": 6431 + }, + { + "epoch": 0.65, + "grad_norm": 1.4381990884890545, + "learning_rate": 5.641243885313856e-06, + "loss": 0.697, + "step": 6432 + }, + { + "epoch": 0.65, + "grad_norm": 1.7122163825991126, + "learning_rate": 5.6382790892708665e-06, + "loss": 0.6942, + "step": 6433 + }, + { + "epoch": 0.65, + "grad_norm": 1.5466831181701948, + "learning_rate": 5.635314766624474e-06, + "loss": 0.6505, + "step": 6434 + }, + { + "epoch": 0.65, + "grad_norm": 1.543566320891912, + "learning_rate": 5.632350917696402e-06, + "loss": 0.6747, + "step": 6435 + }, + { + "epoch": 0.65, + "grad_norm": 1.6487087786876302, + "learning_rate": 5.629387542808338e-06, + "loss": 0.6439, + "step": 6436 + }, + { + "epoch": 0.65, + "grad_norm": 1.5662196712883276, + "learning_rate": 5.626424642281909e-06, + "loss": 0.6167, + "step": 6437 + }, + { + "epoch": 0.65, + "grad_norm": 1.6125241640825956, + "learning_rate": 5.623462216438689e-06, + "loss": 0.6974, + "step": 6438 + }, + { + "epoch": 0.65, + "grad_norm": 1.4991815498617118, + "learning_rate": 5.620500265600206e-06, + "loss": 0.7292, + "step": 6439 + }, + { + "epoch": 0.66, + "grad_norm": 1.6384111399872088, + "learning_rate": 5.617538790087927e-06, + "loss": 0.653, + "step": 6440 + }, + { + "epoch": 0.66, + "grad_norm": 1.3135639297560717, + "learning_rate": 5.614577790223279e-06, + "loss": 0.61, + "step": 6441 + }, + { + "epoch": 0.66, + "grad_norm": 1.5333413114465992, + "learning_rate": 5.611617266327636e-06, + "loss": 0.6731, + "step": 6442 + }, + { + "epoch": 0.66, + "grad_norm": 1.6331806993463807, + "learning_rate": 5.608657218722309e-06, + "loss": 0.7274, + "step": 6443 + }, + { + "epoch": 0.66, + "grad_norm": 1.6925118285778062, + "learning_rate": 5.6056976477285695e-06, + "loss": 0.884, + "step": 6444 + }, + { + "epoch": 0.66, + "grad_norm": 1.4276036664358116, + "learning_rate": 5.602738553667629e-06, + "loss": 0.6985, + "step": 6445 + }, + { + "epoch": 0.66, + "grad_norm": 1.5209224336151763, + "learning_rate": 5.599779936860652e-06, + "loss": 0.6954, + "step": 6446 + }, + { + "epoch": 0.66, + "grad_norm": 1.611923903199948, + "learning_rate": 5.5968217976287565e-06, + "loss": 0.6656, + "step": 6447 + }, + { + "epoch": 0.66, + "grad_norm": 1.8579107068263057, + "learning_rate": 5.593864136292992e-06, + "loss": 0.707, + "step": 6448 + }, + { + "epoch": 0.66, + "grad_norm": 1.5452524877049152, + "learning_rate": 5.5909069531743755e-06, + "loss": 0.6808, + "step": 6449 + }, + { + "epoch": 0.66, + "grad_norm": 1.698242540548482, + "learning_rate": 5.5879502485938544e-06, + "loss": 0.7089, + "step": 6450 + }, + { + "epoch": 0.66, + "grad_norm": 1.8162676945644611, + "learning_rate": 5.584994022872337e-06, + "loss": 0.7118, + "step": 6451 + }, + { + "epoch": 0.66, + "grad_norm": 1.538992476391323, + "learning_rate": 5.582038276330679e-06, + "loss": 0.7687, + "step": 6452 + }, + { + "epoch": 0.66, + "grad_norm": 1.7325920782664779, + "learning_rate": 5.5790830092896744e-06, + "loss": 0.6782, + "step": 6453 + }, + { + "epoch": 0.66, + "grad_norm": 1.7047326937943428, + "learning_rate": 5.576128222070072e-06, + "loss": 0.7279, + "step": 6454 + }, + { + "epoch": 0.66, + "grad_norm": 1.5326744108994126, + "learning_rate": 5.573173914992575e-06, + "loss": 0.7136, + "step": 6455 + }, + { + "epoch": 0.66, + "grad_norm": 1.5658639914644978, + "learning_rate": 5.570220088377817e-06, + "loss": 0.6877, + "step": 6456 + }, + { + "epoch": 0.66, + "grad_norm": 1.5407873449624567, + "learning_rate": 5.567266742546398e-06, + "loss": 0.7392, + "step": 6457 + }, + { + "epoch": 0.66, + "grad_norm": 1.7165329737166082, + "learning_rate": 5.56431387781885e-06, + "loss": 0.6999, + "step": 6458 + }, + { + "epoch": 0.66, + "grad_norm": 1.5792158050361675, + "learning_rate": 5.5613614945156644e-06, + "loss": 0.8257, + "step": 6459 + }, + { + "epoch": 0.66, + "grad_norm": 1.6861064784060478, + "learning_rate": 5.558409592957281e-06, + "loss": 0.6937, + "step": 6460 + }, + { + "epoch": 0.66, + "grad_norm": 1.6622203408620764, + "learning_rate": 5.555458173464073e-06, + "loss": 0.7079, + "step": 6461 + }, + { + "epoch": 0.66, + "grad_norm": 1.6179041108888554, + "learning_rate": 5.552507236356378e-06, + "loss": 0.6671, + "step": 6462 + }, + { + "epoch": 0.66, + "grad_norm": 1.5561973558428366, + "learning_rate": 5.549556781954468e-06, + "loss": 0.688, + "step": 6463 + }, + { + "epoch": 0.66, + "grad_norm": 1.8770461530985756, + "learning_rate": 5.546606810578574e-06, + "loss": 0.7056, + "step": 6464 + }, + { + "epoch": 0.66, + "grad_norm": 1.6639773368960784, + "learning_rate": 5.54365732254887e-06, + "loss": 0.8691, + "step": 6465 + }, + { + "epoch": 0.66, + "grad_norm": 1.766616000861856, + "learning_rate": 5.540708318185471e-06, + "loss": 0.6345, + "step": 6466 + }, + { + "epoch": 0.66, + "grad_norm": 1.7447860975157186, + "learning_rate": 5.537759797808451e-06, + "loss": 0.8052, + "step": 6467 + }, + { + "epoch": 0.66, + "grad_norm": 1.6314521492158323, + "learning_rate": 5.5348117617378205e-06, + "loss": 0.7425, + "step": 6468 + }, + { + "epoch": 0.66, + "grad_norm": 1.6169731780187688, + "learning_rate": 5.531864210293545e-06, + "loss": 0.6888, + "step": 6469 + }, + { + "epoch": 0.66, + "grad_norm": 1.8868048761897305, + "learning_rate": 5.52891714379554e-06, + "loss": 0.6861, + "step": 6470 + }, + { + "epoch": 0.66, + "grad_norm": 1.563665870685886, + "learning_rate": 5.525970562563656e-06, + "loss": 0.7059, + "step": 6471 + }, + { + "epoch": 0.66, + "grad_norm": 1.66926400990454, + "learning_rate": 5.5230244669177e-06, + "loss": 0.7607, + "step": 6472 + }, + { + "epoch": 0.66, + "grad_norm": 1.6671494780823763, + "learning_rate": 5.520078857177431e-06, + "loss": 0.7621, + "step": 6473 + }, + { + "epoch": 0.66, + "grad_norm": 1.574392066987864, + "learning_rate": 5.5171337336625385e-06, + "loss": 0.718, + "step": 6474 + }, + { + "epoch": 0.66, + "grad_norm": 1.77239094210856, + "learning_rate": 5.5141890966926814e-06, + "loss": 0.7875, + "step": 6475 + }, + { + "epoch": 0.66, + "grad_norm": 1.423741447571245, + "learning_rate": 5.511244946587442e-06, + "loss": 0.7009, + "step": 6476 + }, + { + "epoch": 0.66, + "grad_norm": 1.555072089538958, + "learning_rate": 5.5083012836663675e-06, + "loss": 0.6933, + "step": 6477 + }, + { + "epoch": 0.66, + "grad_norm": 1.685968478521124, + "learning_rate": 5.5053581082489505e-06, + "loss": 0.6945, + "step": 6478 + }, + { + "epoch": 0.66, + "grad_norm": 1.4061025474606934, + "learning_rate": 5.502415420654619e-06, + "loss": 0.5818, + "step": 6479 + }, + { + "epoch": 0.66, + "grad_norm": 1.6721597452098804, + "learning_rate": 5.4994732212027625e-06, + "loss": 0.7269, + "step": 6480 + }, + { + "epoch": 0.66, + "grad_norm": 1.6885123796718169, + "learning_rate": 5.4965315102127035e-06, + "loss": 0.7111, + "step": 6481 + }, + { + "epoch": 0.66, + "grad_norm": 1.587768016336439, + "learning_rate": 5.4935902880037224e-06, + "loss": 0.7552, + "step": 6482 + }, + { + "epoch": 0.66, + "grad_norm": 1.6544396399034218, + "learning_rate": 5.490649554895047e-06, + "loss": 0.7939, + "step": 6483 + }, + { + "epoch": 0.66, + "grad_norm": 1.903730177094068, + "learning_rate": 5.487709311205842e-06, + "loss": 0.703, + "step": 6484 + }, + { + "epoch": 0.66, + "grad_norm": 1.5464431344845322, + "learning_rate": 5.48476955725523e-06, + "loss": 0.6926, + "step": 6485 + }, + { + "epoch": 0.66, + "grad_norm": 1.6445360027395115, + "learning_rate": 5.481830293362267e-06, + "loss": 0.72, + "step": 6486 + }, + { + "epoch": 0.66, + "grad_norm": 1.4844153552483355, + "learning_rate": 5.47889151984597e-06, + "loss": 0.6338, + "step": 6487 + }, + { + "epoch": 0.66, + "grad_norm": 1.7527504474138509, + "learning_rate": 5.475953237025301e-06, + "loss": 0.7561, + "step": 6488 + }, + { + "epoch": 0.66, + "grad_norm": 1.5552778353388261, + "learning_rate": 5.473015445219155e-06, + "loss": 0.821, + "step": 6489 + }, + { + "epoch": 0.66, + "grad_norm": 1.6307048291861013, + "learning_rate": 5.470078144746389e-06, + "loss": 0.7274, + "step": 6490 + }, + { + "epoch": 0.66, + "grad_norm": 1.7001117892238, + "learning_rate": 5.467141335925804e-06, + "loss": 0.7728, + "step": 6491 + }, + { + "epoch": 0.66, + "grad_norm": 1.5869091692069035, + "learning_rate": 5.464205019076138e-06, + "loss": 0.6786, + "step": 6492 + }, + { + "epoch": 0.66, + "grad_norm": 1.6329469565910237, + "learning_rate": 5.461269194516088e-06, + "loss": 0.7075, + "step": 6493 + }, + { + "epoch": 0.66, + "grad_norm": 1.8988642438158536, + "learning_rate": 5.458333862564286e-06, + "loss": 0.7586, + "step": 6494 + }, + { + "epoch": 0.66, + "grad_norm": 1.6633595330570712, + "learning_rate": 5.4553990235393184e-06, + "loss": 0.6541, + "step": 6495 + }, + { + "epoch": 0.66, + "grad_norm": 1.5781666882673357, + "learning_rate": 5.452464677759721e-06, + "loss": 0.6325, + "step": 6496 + }, + { + "epoch": 0.66, + "grad_norm": 1.5661497583477781, + "learning_rate": 5.449530825543965e-06, + "loss": 0.7228, + "step": 6497 + }, + { + "epoch": 0.66, + "grad_norm": 1.5866430847198567, + "learning_rate": 5.446597467210479e-06, + "loss": 0.5912, + "step": 6498 + }, + { + "epoch": 0.66, + "grad_norm": 1.6481833752296238, + "learning_rate": 5.443664603077628e-06, + "loss": 0.8418, + "step": 6499 + }, + { + "epoch": 0.66, + "grad_norm": 1.695607932716036, + "learning_rate": 5.44073223346373e-06, + "loss": 0.7262, + "step": 6500 + }, + { + "epoch": 0.66, + "grad_norm": 1.5218003829304017, + "learning_rate": 5.437800358687054e-06, + "loss": 0.7119, + "step": 6501 + }, + { + "epoch": 0.66, + "grad_norm": 1.5239834220771615, + "learning_rate": 5.4348689790658006e-06, + "loss": 0.6818, + "step": 6502 + }, + { + "epoch": 0.66, + "grad_norm": 1.5960102475372329, + "learning_rate": 5.431938094918132e-06, + "loss": 0.7411, + "step": 6503 + }, + { + "epoch": 0.66, + "grad_norm": 1.657548516507031, + "learning_rate": 5.4290077065621426e-06, + "loss": 0.7023, + "step": 6504 + }, + { + "epoch": 0.66, + "grad_norm": 1.6262940709200162, + "learning_rate": 5.426077814315885e-06, + "loss": 0.7244, + "step": 6505 + }, + { + "epoch": 0.66, + "grad_norm": 1.6390654376483609, + "learning_rate": 5.423148418497356e-06, + "loss": 0.5825, + "step": 6506 + }, + { + "epoch": 0.66, + "grad_norm": 1.6031873520521178, + "learning_rate": 5.4202195194244876e-06, + "loss": 0.7294, + "step": 6507 + }, + { + "epoch": 0.66, + "grad_norm": 1.7798635128022935, + "learning_rate": 5.417291117415172e-06, + "loss": 0.6766, + "step": 6508 + }, + { + "epoch": 0.66, + "grad_norm": 1.6385895753692539, + "learning_rate": 5.414363212787244e-06, + "loss": 0.6456, + "step": 6509 + }, + { + "epoch": 0.66, + "grad_norm": 1.586186792105555, + "learning_rate": 5.411435805858473e-06, + "loss": 0.7585, + "step": 6510 + }, + { + "epoch": 0.66, + "grad_norm": 1.6013166183073786, + "learning_rate": 5.4085088969465916e-06, + "loss": 0.703, + "step": 6511 + }, + { + "epoch": 0.66, + "grad_norm": 1.5405890246283256, + "learning_rate": 5.405582486369264e-06, + "loss": 0.6945, + "step": 6512 + }, + { + "epoch": 0.66, + "grad_norm": 1.4703347207965698, + "learning_rate": 5.402656574444108e-06, + "loss": 0.709, + "step": 6513 + }, + { + "epoch": 0.66, + "grad_norm": 1.7235942090586773, + "learning_rate": 5.399731161488691e-06, + "loss": 0.7877, + "step": 6514 + }, + { + "epoch": 0.66, + "grad_norm": 1.576014597382119, + "learning_rate": 5.3968062478205115e-06, + "loss": 0.705, + "step": 6515 + }, + { + "epoch": 0.66, + "grad_norm": 1.4943026192192757, + "learning_rate": 5.393881833757033e-06, + "loss": 0.6441, + "step": 6516 + }, + { + "epoch": 0.66, + "grad_norm": 1.4372588715266486, + "learning_rate": 5.390957919615645e-06, + "loss": 0.6064, + "step": 6517 + }, + { + "epoch": 0.66, + "grad_norm": 1.5746568492901751, + "learning_rate": 5.388034505713697e-06, + "loss": 0.7424, + "step": 6518 + }, + { + "epoch": 0.66, + "grad_norm": 1.6185110839386305, + "learning_rate": 5.385111592368486e-06, + "loss": 0.6514, + "step": 6519 + }, + { + "epoch": 0.66, + "grad_norm": 1.795299732229972, + "learning_rate": 5.382189179897237e-06, + "loss": 0.7573, + "step": 6520 + }, + { + "epoch": 0.66, + "grad_norm": 1.6681414848347083, + "learning_rate": 5.379267268617139e-06, + "loss": 0.731, + "step": 6521 + }, + { + "epoch": 0.66, + "grad_norm": 1.8258298354662874, + "learning_rate": 5.376345858845322e-06, + "loss": 0.7484, + "step": 6522 + }, + { + "epoch": 0.66, + "grad_norm": 1.4220644902625972, + "learning_rate": 5.373424950898854e-06, + "loss": 0.5895, + "step": 6523 + }, + { + "epoch": 0.66, + "grad_norm": 1.5616930856307956, + "learning_rate": 5.3705045450947546e-06, + "loss": 0.7315, + "step": 6524 + }, + { + "epoch": 0.66, + "grad_norm": 1.650050715075898, + "learning_rate": 5.367584641749994e-06, + "loss": 0.7211, + "step": 6525 + }, + { + "epoch": 0.66, + "grad_norm": 1.5675250874675495, + "learning_rate": 5.3646652411814735e-06, + "loss": 0.767, + "step": 6526 + }, + { + "epoch": 0.66, + "grad_norm": 1.7880041175999453, + "learning_rate": 5.361746343706053e-06, + "loss": 0.7265, + "step": 6527 + }, + { + "epoch": 0.66, + "grad_norm": 1.4506290455380577, + "learning_rate": 5.358827949640536e-06, + "loss": 0.7461, + "step": 6528 + }, + { + "epoch": 0.66, + "grad_norm": 1.9009951006619712, + "learning_rate": 5.3559100593016615e-06, + "loss": 0.6162, + "step": 6529 + }, + { + "epoch": 0.66, + "grad_norm": 1.6192604915096651, + "learning_rate": 5.352992673006125e-06, + "loss": 0.6748, + "step": 6530 + }, + { + "epoch": 0.66, + "grad_norm": 1.6344829669114818, + "learning_rate": 5.350075791070566e-06, + "loss": 0.7157, + "step": 6531 + }, + { + "epoch": 0.66, + "grad_norm": 1.7282190505058468, + "learning_rate": 5.34715941381156e-06, + "loss": 0.7053, + "step": 6532 + }, + { + "epoch": 0.66, + "grad_norm": 1.5616856047012901, + "learning_rate": 5.344243541545637e-06, + "loss": 0.7304, + "step": 6533 + }, + { + "epoch": 0.66, + "grad_norm": 1.488895832298304, + "learning_rate": 5.341328174589273e-06, + "loss": 0.7114, + "step": 6534 + }, + { + "epoch": 0.66, + "grad_norm": 1.7697051118099179, + "learning_rate": 5.338413313258879e-06, + "loss": 0.7133, + "step": 6535 + }, + { + "epoch": 0.66, + "grad_norm": 1.6997375561691037, + "learning_rate": 5.335498957870821e-06, + "loss": 0.7103, + "step": 6536 + }, + { + "epoch": 0.66, + "grad_norm": 1.737073289454585, + "learning_rate": 5.332585108741411e-06, + "loss": 0.7116, + "step": 6537 + }, + { + "epoch": 0.67, + "grad_norm": 1.69707998251365, + "learning_rate": 5.329671766186892e-06, + "loss": 0.6982, + "step": 6538 + }, + { + "epoch": 0.67, + "grad_norm": 1.5938705443800656, + "learning_rate": 5.32675893052347e-06, + "loss": 0.7617, + "step": 6539 + }, + { + "epoch": 0.67, + "grad_norm": 1.6785160562909558, + "learning_rate": 5.323846602067283e-06, + "loss": 0.7133, + "step": 6540 + }, + { + "epoch": 0.67, + "grad_norm": 1.7254541858211765, + "learning_rate": 5.320934781134419e-06, + "loss": 0.7006, + "step": 6541 + }, + { + "epoch": 0.67, + "grad_norm": 1.4717849939657723, + "learning_rate": 5.318023468040918e-06, + "loss": 0.6648, + "step": 6542 + }, + { + "epoch": 0.67, + "grad_norm": 1.542910712759783, + "learning_rate": 5.315112663102746e-06, + "loss": 0.7322, + "step": 6543 + }, + { + "epoch": 0.67, + "grad_norm": 1.737205889574892, + "learning_rate": 5.312202366635831e-06, + "loss": 0.7091, + "step": 6544 + }, + { + "epoch": 0.67, + "grad_norm": 1.5309583554809463, + "learning_rate": 5.3092925789560445e-06, + "loss": 0.6099, + "step": 6545 + }, + { + "epoch": 0.67, + "grad_norm": 1.480662378918754, + "learning_rate": 5.30638330037919e-06, + "loss": 0.749, + "step": 6546 + }, + { + "epoch": 0.67, + "grad_norm": 1.461291497402869, + "learning_rate": 5.303474531221033e-06, + "loss": 0.7021, + "step": 6547 + }, + { + "epoch": 0.67, + "grad_norm": 1.65880915293672, + "learning_rate": 5.300566271797265e-06, + "loss": 0.6811, + "step": 6548 + }, + { + "epoch": 0.67, + "grad_norm": 1.3833613813366754, + "learning_rate": 5.297658522423539e-06, + "loss": 0.65, + "step": 6549 + }, + { + "epoch": 0.67, + "grad_norm": 1.587303853860589, + "learning_rate": 5.294751283415446e-06, + "loss": 0.76, + "step": 6550 + }, + { + "epoch": 0.67, + "grad_norm": 1.563782664562314, + "learning_rate": 5.2918445550885165e-06, + "loss": 0.6299, + "step": 6551 + }, + { + "epoch": 0.67, + "grad_norm": 1.4450028756994537, + "learning_rate": 5.288938337758236e-06, + "loss": 0.5837, + "step": 6552 + }, + { + "epoch": 0.67, + "grad_norm": 1.663183818495271, + "learning_rate": 5.286032631740023e-06, + "loss": 0.7403, + "step": 6553 + }, + { + "epoch": 0.67, + "grad_norm": 1.491077128099564, + "learning_rate": 5.28312743734925e-06, + "loss": 0.6107, + "step": 6554 + }, + { + "epoch": 0.67, + "grad_norm": 1.7013609398124023, + "learning_rate": 5.280222754901234e-06, + "loss": 0.7547, + "step": 6555 + }, + { + "epoch": 0.67, + "grad_norm": 1.488129397639651, + "learning_rate": 5.277318584711224e-06, + "loss": 0.5939, + "step": 6556 + }, + { + "epoch": 0.67, + "grad_norm": 1.6660977197771891, + "learning_rate": 5.274414927094433e-06, + "loss": 0.718, + "step": 6557 + }, + { + "epoch": 0.67, + "grad_norm": 1.596142506255152, + "learning_rate": 5.271511782365997e-06, + "loss": 0.5755, + "step": 6558 + }, + { + "epoch": 0.67, + "grad_norm": 1.570684630382438, + "learning_rate": 5.268609150841011e-06, + "loss": 0.7117, + "step": 6559 + }, + { + "epoch": 0.67, + "grad_norm": 1.3862051356848353, + "learning_rate": 5.265707032834515e-06, + "loss": 0.8393, + "step": 6560 + }, + { + "epoch": 0.67, + "grad_norm": 1.676536115535365, + "learning_rate": 5.262805428661481e-06, + "loss": 0.8902, + "step": 6561 + }, + { + "epoch": 0.67, + "grad_norm": 1.6601852418073246, + "learning_rate": 5.259904338636839e-06, + "loss": 0.6665, + "step": 6562 + }, + { + "epoch": 0.67, + "grad_norm": 1.7040187006990393, + "learning_rate": 5.257003763075452e-06, + "loss": 0.7277, + "step": 6563 + }, + { + "epoch": 0.67, + "grad_norm": 1.683236229240097, + "learning_rate": 5.2541037022921335e-06, + "loss": 0.7488, + "step": 6564 + }, + { + "epoch": 0.67, + "grad_norm": 1.4747881576506794, + "learning_rate": 5.251204156601645e-06, + "loss": 0.6613, + "step": 6565 + }, + { + "epoch": 0.67, + "grad_norm": 1.6054185824390816, + "learning_rate": 5.248305126318678e-06, + "loss": 0.7666, + "step": 6566 + }, + { + "epoch": 0.67, + "grad_norm": 1.4386913476814391, + "learning_rate": 5.245406611757882e-06, + "loss": 0.7446, + "step": 6567 + }, + { + "epoch": 0.67, + "grad_norm": 1.5303464616012228, + "learning_rate": 5.242508613233848e-06, + "loss": 0.6642, + "step": 6568 + }, + { + "epoch": 0.67, + "grad_norm": 1.6563424630765726, + "learning_rate": 5.239611131061102e-06, + "loss": 0.6124, + "step": 6569 + }, + { + "epoch": 0.67, + "grad_norm": 1.4331120110368254, + "learning_rate": 5.236714165554126e-06, + "loss": 0.6598, + "step": 6570 + }, + { + "epoch": 0.67, + "grad_norm": 1.298885302331489, + "learning_rate": 5.233817717027337e-06, + "loss": 0.6008, + "step": 6571 + }, + { + "epoch": 0.67, + "grad_norm": 1.603739287059268, + "learning_rate": 5.230921785795097e-06, + "loss": 0.6126, + "step": 6572 + }, + { + "epoch": 0.67, + "grad_norm": 1.5185383591183974, + "learning_rate": 5.228026372171722e-06, + "loss": 0.6423, + "step": 6573 + }, + { + "epoch": 0.67, + "grad_norm": 1.408739492333563, + "learning_rate": 5.225131476471455e-06, + "loss": 0.6603, + "step": 6574 + }, + { + "epoch": 0.67, + "grad_norm": 1.5087200124354232, + "learning_rate": 5.2222370990085006e-06, + "loss": 0.5578, + "step": 6575 + }, + { + "epoch": 0.67, + "grad_norm": 1.6882350770849142, + "learning_rate": 5.219343240096988e-06, + "loss": 0.7846, + "step": 6576 + }, + { + "epoch": 0.67, + "grad_norm": 1.4165427103087591, + "learning_rate": 5.216449900051006e-06, + "loss": 0.6297, + "step": 6577 + }, + { + "epoch": 0.67, + "grad_norm": 1.6031125511277677, + "learning_rate": 5.213557079184584e-06, + "loss": 0.6257, + "step": 6578 + }, + { + "epoch": 0.67, + "grad_norm": 1.6330463089764724, + "learning_rate": 5.210664777811687e-06, + "loss": 0.7085, + "step": 6579 + }, + { + "epoch": 0.67, + "grad_norm": 1.55459047365538, + "learning_rate": 5.207772996246234e-06, + "loss": 0.718, + "step": 6580 + }, + { + "epoch": 0.67, + "grad_norm": 1.8562593153317937, + "learning_rate": 5.204881734802078e-06, + "loss": 0.7194, + "step": 6581 + }, + { + "epoch": 0.67, + "grad_norm": 1.6494287308768552, + "learning_rate": 5.201990993793022e-06, + "loss": 0.7813, + "step": 6582 + }, + { + "epoch": 0.67, + "grad_norm": 1.6253578733828447, + "learning_rate": 5.199100773532815e-06, + "loss": 0.7396, + "step": 6583 + }, + { + "epoch": 0.67, + "grad_norm": 1.7042981931473753, + "learning_rate": 5.196211074335136e-06, + "loss": 0.7305, + "step": 6584 + }, + { + "epoch": 0.67, + "grad_norm": 1.4128951522222983, + "learning_rate": 5.1933218965136235e-06, + "loss": 0.7753, + "step": 6585 + }, + { + "epoch": 0.67, + "grad_norm": 1.6599676572854145, + "learning_rate": 5.190433240381855e-06, + "loss": 0.7574, + "step": 6586 + }, + { + "epoch": 0.67, + "grad_norm": 1.6950322153257795, + "learning_rate": 5.18754510625334e-06, + "loss": 0.6192, + "step": 6587 + }, + { + "epoch": 0.67, + "grad_norm": 1.5876474049189262, + "learning_rate": 5.1846574944415505e-06, + "loss": 0.6778, + "step": 6588 + }, + { + "epoch": 0.67, + "grad_norm": 1.7067641494328831, + "learning_rate": 5.181770405259883e-06, + "loss": 0.6738, + "step": 6589 + }, + { + "epoch": 0.67, + "grad_norm": 1.560063028944311, + "learning_rate": 5.1788838390216874e-06, + "loss": 0.604, + "step": 6590 + }, + { + "epoch": 0.67, + "grad_norm": 1.6402935150003717, + "learning_rate": 5.175997796040262e-06, + "loss": 0.7236, + "step": 6591 + }, + { + "epoch": 0.67, + "grad_norm": 1.4941234108482795, + "learning_rate": 5.173112276628834e-06, + "loss": 0.8053, + "step": 6592 + }, + { + "epoch": 0.67, + "grad_norm": 1.8240564355995594, + "learning_rate": 5.170227281100587e-06, + "loss": 0.6713, + "step": 6593 + }, + { + "epoch": 0.67, + "grad_norm": 1.5962204639297288, + "learning_rate": 5.167342809768636e-06, + "loss": 0.7, + "step": 6594 + }, + { + "epoch": 0.67, + "grad_norm": 1.6994359482698425, + "learning_rate": 5.164458862946049e-06, + "loss": 0.6477, + "step": 6595 + }, + { + "epoch": 0.67, + "grad_norm": 1.5502079375142361, + "learning_rate": 5.1615754409458365e-06, + "loss": 0.628, + "step": 6596 + }, + { + "epoch": 0.67, + "grad_norm": 1.7346230320590885, + "learning_rate": 5.158692544080941e-06, + "loss": 0.7447, + "step": 6597 + }, + { + "epoch": 0.67, + "grad_norm": 1.4492600740682662, + "learning_rate": 5.155810172664264e-06, + "loss": 0.6451, + "step": 6598 + }, + { + "epoch": 0.67, + "grad_norm": 1.6226120765657546, + "learning_rate": 5.152928327008635e-06, + "loss": 0.5691, + "step": 6599 + }, + { + "epoch": 0.67, + "grad_norm": 1.6380828656883044, + "learning_rate": 5.1500470074268375e-06, + "loss": 0.6748, + "step": 6600 + }, + { + "epoch": 0.67, + "grad_norm": 1.5436601441971176, + "learning_rate": 5.147166214231595e-06, + "loss": 0.6615, + "step": 6601 + }, + { + "epoch": 0.67, + "grad_norm": 1.4598286166313794, + "learning_rate": 5.144285947735566e-06, + "loss": 0.6504, + "step": 6602 + }, + { + "epoch": 0.67, + "grad_norm": 1.691712192890952, + "learning_rate": 5.141406208251363e-06, + "loss": 0.6155, + "step": 6603 + }, + { + "epoch": 0.67, + "grad_norm": 1.4712765944949058, + "learning_rate": 5.13852699609154e-06, + "loss": 0.6557, + "step": 6604 + }, + { + "epoch": 0.67, + "grad_norm": 1.6182224948885766, + "learning_rate": 5.135648311568583e-06, + "loss": 0.7785, + "step": 6605 + }, + { + "epoch": 0.67, + "grad_norm": 1.7863311346040405, + "learning_rate": 5.132770154994935e-06, + "loss": 0.7662, + "step": 6606 + }, + { + "epoch": 0.67, + "grad_norm": 1.6875108524698368, + "learning_rate": 5.1298925266829694e-06, + "loss": 0.681, + "step": 6607 + }, + { + "epoch": 0.67, + "grad_norm": 1.6087828341371864, + "learning_rate": 5.127015426945008e-06, + "loss": 0.6428, + "step": 6608 + }, + { + "epoch": 0.67, + "grad_norm": 1.4344497230237643, + "learning_rate": 5.124138856093321e-06, + "loss": 0.6741, + "step": 6609 + }, + { + "epoch": 0.67, + "grad_norm": 1.630878458485287, + "learning_rate": 5.1212628144401086e-06, + "loss": 0.7074, + "step": 6610 + }, + { + "epoch": 0.67, + "grad_norm": 1.5157406026220226, + "learning_rate": 5.118387302297527e-06, + "loss": 0.674, + "step": 6611 + }, + { + "epoch": 0.67, + "grad_norm": 1.7480585578015724, + "learning_rate": 5.115512319977659e-06, + "loss": 0.6263, + "step": 6612 + }, + { + "epoch": 0.67, + "grad_norm": 1.8044357935825248, + "learning_rate": 5.112637867792544e-06, + "loss": 0.7768, + "step": 6613 + }, + { + "epoch": 0.67, + "grad_norm": 1.5207172511477907, + "learning_rate": 5.109763946054163e-06, + "loss": 0.7245, + "step": 6614 + }, + { + "epoch": 0.67, + "grad_norm": 1.6838757695281426, + "learning_rate": 5.106890555074426e-06, + "loss": 0.6495, + "step": 6615 + }, + { + "epoch": 0.67, + "grad_norm": 1.5251784247030207, + "learning_rate": 5.1040176951652046e-06, + "loss": 0.668, + "step": 6616 + }, + { + "epoch": 0.67, + "grad_norm": 1.965638309649969, + "learning_rate": 5.101145366638294e-06, + "loss": 0.6732, + "step": 6617 + }, + { + "epoch": 0.67, + "grad_norm": 1.6207157830214798, + "learning_rate": 5.098273569805443e-06, + "loss": 0.6844, + "step": 6618 + }, + { + "epoch": 0.67, + "grad_norm": 1.4730282022868064, + "learning_rate": 5.095402304978346e-06, + "loss": 0.6946, + "step": 6619 + }, + { + "epoch": 0.67, + "grad_norm": 1.5416613162094852, + "learning_rate": 5.092531572468627e-06, + "loss": 0.6273, + "step": 6620 + }, + { + "epoch": 0.67, + "grad_norm": 1.591078859736904, + "learning_rate": 5.089661372587859e-06, + "loss": 0.7806, + "step": 6621 + }, + { + "epoch": 0.67, + "grad_norm": 1.6373143476009928, + "learning_rate": 5.086791705647566e-06, + "loss": 0.7163, + "step": 6622 + }, + { + "epoch": 0.67, + "grad_norm": 1.5145191628770813, + "learning_rate": 5.083922571959194e-06, + "loss": 0.5919, + "step": 6623 + }, + { + "epoch": 0.67, + "grad_norm": 1.4928294951142096, + "learning_rate": 5.081053971834153e-06, + "loss": 0.6832, + "step": 6624 + }, + { + "epoch": 0.67, + "grad_norm": 1.551921705364535, + "learning_rate": 5.078185905583774e-06, + "loss": 0.6539, + "step": 6625 + }, + { + "epoch": 0.67, + "grad_norm": 1.6046975733459894, + "learning_rate": 5.075318373519349e-06, + "loss": 0.7742, + "step": 6626 + }, + { + "epoch": 0.67, + "grad_norm": 1.6771471734669585, + "learning_rate": 5.072451375952104e-06, + "loss": 0.7703, + "step": 6627 + }, + { + "epoch": 0.67, + "grad_norm": 1.7927702142712978, + "learning_rate": 5.069584913193199e-06, + "loss": 0.812, + "step": 6628 + }, + { + "epoch": 0.67, + "grad_norm": 1.6327378040139973, + "learning_rate": 5.066718985553749e-06, + "loss": 0.6578, + "step": 6629 + }, + { + "epoch": 0.67, + "grad_norm": 1.5301086870655425, + "learning_rate": 5.06385359334481e-06, + "loss": 0.6644, + "step": 6630 + }, + { + "epoch": 0.67, + "grad_norm": 1.553619736705293, + "learning_rate": 5.060988736877366e-06, + "loss": 0.727, + "step": 6631 + }, + { + "epoch": 0.67, + "grad_norm": 1.6295412264746585, + "learning_rate": 5.0581244164623576e-06, + "loss": 0.687, + "step": 6632 + }, + { + "epoch": 0.67, + "grad_norm": 1.4718060176301735, + "learning_rate": 5.055260632410665e-06, + "loss": 0.6943, + "step": 6633 + }, + { + "epoch": 0.67, + "grad_norm": 1.5675917131961066, + "learning_rate": 5.0523973850331e-06, + "loss": 0.7724, + "step": 6634 + }, + { + "epoch": 0.67, + "grad_norm": 1.678519567246987, + "learning_rate": 5.049534674640426e-06, + "loss": 0.6734, + "step": 6635 + }, + { + "epoch": 0.68, + "grad_norm": 1.6314176517551688, + "learning_rate": 5.0466725015433526e-06, + "loss": 0.6701, + "step": 6636 + }, + { + "epoch": 0.68, + "grad_norm": 1.6684931385240929, + "learning_rate": 5.043810866052512e-06, + "loss": 0.7685, + "step": 6637 + }, + { + "epoch": 0.68, + "grad_norm": 1.559866500388052, + "learning_rate": 5.040949768478496e-06, + "loss": 0.7064, + "step": 6638 + }, + { + "epoch": 0.68, + "grad_norm": 1.6826584219645397, + "learning_rate": 5.038089209131837e-06, + "loss": 0.7978, + "step": 6639 + }, + { + "epoch": 0.68, + "grad_norm": 1.8855229766745862, + "learning_rate": 5.035229188322994e-06, + "loss": 0.7099, + "step": 6640 + }, + { + "epoch": 0.68, + "grad_norm": 1.6390837167916354, + "learning_rate": 5.032369706362381e-06, + "loss": 0.7395, + "step": 6641 + }, + { + "epoch": 0.68, + "grad_norm": 1.501803420191579, + "learning_rate": 5.029510763560357e-06, + "loss": 0.7012, + "step": 6642 + }, + { + "epoch": 0.68, + "grad_norm": 1.4287509327965948, + "learning_rate": 5.026652360227205e-06, + "loss": 0.6711, + "step": 6643 + }, + { + "epoch": 0.68, + "grad_norm": 1.6337553722301748, + "learning_rate": 5.023794496673169e-06, + "loss": 0.597, + "step": 6644 + }, + { + "epoch": 0.68, + "grad_norm": 1.6900452701037583, + "learning_rate": 5.020937173208418e-06, + "loss": 0.7713, + "step": 6645 + }, + { + "epoch": 0.68, + "grad_norm": 1.630566533375451, + "learning_rate": 5.018080390143072e-06, + "loss": 0.7175, + "step": 6646 + }, + { + "epoch": 0.68, + "grad_norm": 1.6482122448531207, + "learning_rate": 5.0152241477871955e-06, + "loss": 0.665, + "step": 6647 + }, + { + "epoch": 0.68, + "grad_norm": 1.5223059886944974, + "learning_rate": 5.0123684464507805e-06, + "loss": 0.6861, + "step": 6648 + }, + { + "epoch": 0.68, + "grad_norm": 1.5625226235889387, + "learning_rate": 5.009513286443773e-06, + "loss": 0.7124, + "step": 6649 + }, + { + "epoch": 0.68, + "grad_norm": 1.6641033534342602, + "learning_rate": 5.00665866807606e-06, + "loss": 0.6798, + "step": 6650 + }, + { + "epoch": 0.68, + "grad_norm": 1.814997332535629, + "learning_rate": 5.003804591657457e-06, + "loss": 0.6784, + "step": 6651 + }, + { + "epoch": 0.68, + "grad_norm": 1.7418241486270254, + "learning_rate": 5.0009510574977385e-06, + "loss": 0.684, + "step": 6652 + }, + { + "epoch": 0.68, + "grad_norm": 1.8990670470019306, + "learning_rate": 4.998098065906601e-06, + "loss": 0.7724, + "step": 6653 + }, + { + "epoch": 0.68, + "grad_norm": 1.5522110614566593, + "learning_rate": 4.9952456171936985e-06, + "loss": 0.6414, + "step": 6654 + }, + { + "epoch": 0.68, + "grad_norm": 1.6383848530820382, + "learning_rate": 4.992393711668622e-06, + "loss": 0.8211, + "step": 6655 + }, + { + "epoch": 0.68, + "grad_norm": 1.7466103343291657, + "learning_rate": 4.989542349640894e-06, + "loss": 0.7061, + "step": 6656 + }, + { + "epoch": 0.68, + "grad_norm": 1.569027050350729, + "learning_rate": 4.986691531419989e-06, + "loss": 0.5821, + "step": 6657 + }, + { + "epoch": 0.68, + "grad_norm": 1.6108341079235893, + "learning_rate": 4.983841257315324e-06, + "loss": 0.7087, + "step": 6658 + }, + { + "epoch": 0.68, + "grad_norm": 1.786948163093711, + "learning_rate": 4.980991527636242e-06, + "loss": 0.6429, + "step": 6659 + }, + { + "epoch": 0.68, + "grad_norm": 1.698101564669535, + "learning_rate": 4.978142342692045e-06, + "loss": 0.7126, + "step": 6660 + }, + { + "epoch": 0.68, + "grad_norm": 1.6742980171896125, + "learning_rate": 4.975293702791959e-06, + "loss": 0.7271, + "step": 6661 + }, + { + "epoch": 0.68, + "grad_norm": 1.8063655791321431, + "learning_rate": 4.9724456082451655e-06, + "loss": 0.6859, + "step": 6662 + }, + { + "epoch": 0.68, + "grad_norm": 1.692416378690283, + "learning_rate": 4.9695980593607815e-06, + "loss": 0.7298, + "step": 6663 + }, + { + "epoch": 0.68, + "grad_norm": 1.610922610672971, + "learning_rate": 4.9667510564478584e-06, + "loss": 0.7034, + "step": 6664 + }, + { + "epoch": 0.68, + "grad_norm": 1.6859325226267035, + "learning_rate": 4.9639045998154e-06, + "loss": 0.7184, + "step": 6665 + }, + { + "epoch": 0.68, + "grad_norm": 1.5747449742645898, + "learning_rate": 4.961058689772339e-06, + "loss": 0.6004, + "step": 6666 + }, + { + "epoch": 0.68, + "grad_norm": 1.6240207523487893, + "learning_rate": 4.958213326627555e-06, + "loss": 0.6998, + "step": 6667 + }, + { + "epoch": 0.68, + "grad_norm": 1.5305125840708707, + "learning_rate": 4.9553685106898765e-06, + "loss": 0.6738, + "step": 6668 + }, + { + "epoch": 0.68, + "grad_norm": 1.4862581273327402, + "learning_rate": 4.9525242422680504e-06, + "loss": 0.6822, + "step": 6669 + }, + { + "epoch": 0.68, + "grad_norm": 1.6232042430434463, + "learning_rate": 4.9496805216707885e-06, + "loss": 0.682, + "step": 6670 + }, + { + "epoch": 0.68, + "grad_norm": 1.579863545696646, + "learning_rate": 4.946837349206726e-06, + "loss": 0.6512, + "step": 6671 + }, + { + "epoch": 0.68, + "grad_norm": 1.6551829456643596, + "learning_rate": 4.943994725184445e-06, + "loss": 0.759, + "step": 6672 + }, + { + "epoch": 0.68, + "grad_norm": 1.8637227571573647, + "learning_rate": 4.941152649912473e-06, + "loss": 0.7387, + "step": 6673 + }, + { + "epoch": 0.68, + "grad_norm": 1.6064385826752752, + "learning_rate": 4.938311123699265e-06, + "loss": 0.6942, + "step": 6674 + }, + { + "epoch": 0.68, + "grad_norm": 1.514599431310791, + "learning_rate": 4.935470146853229e-06, + "loss": 0.6767, + "step": 6675 + }, + { + "epoch": 0.68, + "grad_norm": 1.6687265203998711, + "learning_rate": 4.9326297196827136e-06, + "loss": 0.6838, + "step": 6676 + }, + { + "epoch": 0.68, + "grad_norm": 1.3655127070699202, + "learning_rate": 4.929789842495992e-06, + "loss": 0.6231, + "step": 6677 + }, + { + "epoch": 0.68, + "grad_norm": 1.674511965233358, + "learning_rate": 4.9269505156012974e-06, + "loss": 0.7355, + "step": 6678 + }, + { + "epoch": 0.68, + "grad_norm": 1.6532554229523138, + "learning_rate": 4.924111739306788e-06, + "loss": 0.732, + "step": 6679 + }, + { + "epoch": 0.68, + "grad_norm": 1.6648534043225724, + "learning_rate": 4.92127351392057e-06, + "loss": 0.7031, + "step": 6680 + }, + { + "epoch": 0.68, + "grad_norm": 1.6532155219495979, + "learning_rate": 4.918435839750695e-06, + "loss": 0.6672, + "step": 6681 + }, + { + "epoch": 0.68, + "grad_norm": 1.5060944261708848, + "learning_rate": 4.9155987171051374e-06, + "loss": 0.5954, + "step": 6682 + }, + { + "epoch": 0.68, + "grad_norm": 1.4914021316838773, + "learning_rate": 4.912762146291833e-06, + "loss": 0.7485, + "step": 6683 + }, + { + "epoch": 0.68, + "grad_norm": 1.708142153364821, + "learning_rate": 4.909926127618638e-06, + "loss": 0.8124, + "step": 6684 + }, + { + "epoch": 0.68, + "grad_norm": 1.6513017550261346, + "learning_rate": 4.907090661393362e-06, + "loss": 0.6309, + "step": 6685 + }, + { + "epoch": 0.68, + "grad_norm": 1.7913658290602579, + "learning_rate": 4.904255747923754e-06, + "loss": 0.7755, + "step": 6686 + }, + { + "epoch": 0.68, + "grad_norm": 1.6348298452794083, + "learning_rate": 4.901421387517492e-06, + "loss": 0.6811, + "step": 6687 + }, + { + "epoch": 0.68, + "grad_norm": 1.582267175662689, + "learning_rate": 4.89858758048221e-06, + "loss": 0.6593, + "step": 6688 + }, + { + "epoch": 0.68, + "grad_norm": 1.5243067426073116, + "learning_rate": 4.895754327125465e-06, + "loss": 0.7288, + "step": 6689 + }, + { + "epoch": 0.68, + "grad_norm": 1.7220747486755543, + "learning_rate": 4.8929216277547665e-06, + "loss": 0.6891, + "step": 6690 + }, + { + "epoch": 0.68, + "grad_norm": 1.6523749858452668, + "learning_rate": 4.8900894826775626e-06, + "loss": 0.6876, + "step": 6691 + }, + { + "epoch": 0.68, + "grad_norm": 1.6800362457047207, + "learning_rate": 4.887257892201231e-06, + "loss": 0.7344, + "step": 6692 + }, + { + "epoch": 0.68, + "grad_norm": 1.6366830126364655, + "learning_rate": 4.884426856633102e-06, + "loss": 0.7273, + "step": 6693 + }, + { + "epoch": 0.68, + "grad_norm": 1.625082643217557, + "learning_rate": 4.881596376280443e-06, + "loss": 0.7569, + "step": 6694 + }, + { + "epoch": 0.68, + "grad_norm": 1.601983835016552, + "learning_rate": 4.878766451450451e-06, + "loss": 0.6197, + "step": 6695 + }, + { + "epoch": 0.68, + "grad_norm": 1.631133494077483, + "learning_rate": 4.875937082450276e-06, + "loss": 0.7626, + "step": 6696 + }, + { + "epoch": 0.68, + "grad_norm": 1.8279187393898182, + "learning_rate": 4.873108269586997e-06, + "loss": 0.7777, + "step": 6697 + }, + { + "epoch": 0.68, + "grad_norm": 1.499195022793931, + "learning_rate": 4.870280013167639e-06, + "loss": 0.6354, + "step": 6698 + }, + { + "epoch": 0.68, + "grad_norm": 1.5436191227668006, + "learning_rate": 4.867452313499169e-06, + "loss": 0.7183, + "step": 6699 + }, + { + "epoch": 0.68, + "grad_norm": 1.9507237044500054, + "learning_rate": 4.864625170888483e-06, + "loss": 0.7534, + "step": 6700 + }, + { + "epoch": 0.68, + "grad_norm": 1.5253344116402054, + "learning_rate": 4.86179858564243e-06, + "loss": 0.6205, + "step": 6701 + }, + { + "epoch": 0.68, + "grad_norm": 1.5286591265294258, + "learning_rate": 4.858972558067784e-06, + "loss": 0.5347, + "step": 6702 + }, + { + "epoch": 0.68, + "grad_norm": 1.673034454696896, + "learning_rate": 4.856147088471271e-06, + "loss": 0.6869, + "step": 6703 + }, + { + "epoch": 0.68, + "grad_norm": 1.7823550632594058, + "learning_rate": 4.853322177159554e-06, + "loss": 0.7187, + "step": 6704 + }, + { + "epoch": 0.68, + "grad_norm": 1.7547562334998144, + "learning_rate": 4.850497824439226e-06, + "loss": 0.7486, + "step": 6705 + }, + { + "epoch": 0.68, + "grad_norm": 1.5239934070506733, + "learning_rate": 4.847674030616833e-06, + "loss": 0.678, + "step": 6706 + }, + { + "epoch": 0.68, + "grad_norm": 1.6717769248144259, + "learning_rate": 4.844850795998848e-06, + "loss": 0.8623, + "step": 6707 + }, + { + "epoch": 0.68, + "grad_norm": 1.5968692392461008, + "learning_rate": 4.842028120891691e-06, + "loss": 0.6768, + "step": 6708 + }, + { + "epoch": 0.68, + "grad_norm": 1.619490677510862, + "learning_rate": 4.839206005601724e-06, + "loss": 0.6379, + "step": 6709 + }, + { + "epoch": 0.68, + "grad_norm": 1.419424593862716, + "learning_rate": 4.8363844504352344e-06, + "loss": 0.6016, + "step": 6710 + }, + { + "epoch": 0.68, + "grad_norm": 1.558674176987791, + "learning_rate": 4.8335634556984675e-06, + "loss": 0.672, + "step": 6711 + }, + { + "epoch": 0.68, + "grad_norm": 1.719086987642181, + "learning_rate": 4.8307430216975895e-06, + "loss": 0.7243, + "step": 6712 + }, + { + "epoch": 0.68, + "grad_norm": 1.6798971927393056, + "learning_rate": 4.827923148738718e-06, + "loss": 0.7477, + "step": 6713 + }, + { + "epoch": 0.68, + "grad_norm": 1.715498035259579, + "learning_rate": 4.825103837127911e-06, + "loss": 0.7564, + "step": 6714 + }, + { + "epoch": 0.68, + "grad_norm": 1.7866680476785195, + "learning_rate": 4.8222850871711525e-06, + "loss": 0.7131, + "step": 6715 + }, + { + "epoch": 0.68, + "grad_norm": 1.6589252405439832, + "learning_rate": 4.819466899174377e-06, + "loss": 0.6056, + "step": 6716 + }, + { + "epoch": 0.68, + "grad_norm": 1.7596553684310678, + "learning_rate": 4.816649273443457e-06, + "loss": 0.6624, + "step": 6717 + }, + { + "epoch": 0.68, + "grad_norm": 1.533186019876807, + "learning_rate": 4.813832210284198e-06, + "loss": 0.6477, + "step": 6718 + }, + { + "epoch": 0.68, + "grad_norm": 1.782257739414974, + "learning_rate": 4.8110157100023545e-06, + "loss": 0.6781, + "step": 6719 + }, + { + "epoch": 0.68, + "grad_norm": 1.5627791612424267, + "learning_rate": 4.808199772903604e-06, + "loss": 0.6295, + "step": 6720 + }, + { + "epoch": 0.68, + "grad_norm": 1.538510864281181, + "learning_rate": 4.805384399293578e-06, + "loss": 0.6813, + "step": 6721 + }, + { + "epoch": 0.68, + "grad_norm": 1.6580403085905862, + "learning_rate": 4.802569589477844e-06, + "loss": 0.6827, + "step": 6722 + }, + { + "epoch": 0.68, + "grad_norm": 1.7106126858256103, + "learning_rate": 4.799755343761899e-06, + "loss": 0.7553, + "step": 6723 + }, + { + "epoch": 0.68, + "grad_norm": 1.9278062978061696, + "learning_rate": 4.796941662451192e-06, + "loss": 0.7088, + "step": 6724 + }, + { + "epoch": 0.68, + "grad_norm": 1.6350428421712366, + "learning_rate": 4.794128545851098e-06, + "loss": 0.63, + "step": 6725 + }, + { + "epoch": 0.68, + "grad_norm": 1.6086575592589927, + "learning_rate": 4.791315994266939e-06, + "loss": 0.6599, + "step": 6726 + }, + { + "epoch": 0.68, + "grad_norm": 1.4378733825249688, + "learning_rate": 4.788504008003978e-06, + "loss": 0.6634, + "step": 6727 + }, + { + "epoch": 0.68, + "grad_norm": 1.6964357179844956, + "learning_rate": 4.785692587367404e-06, + "loss": 0.7541, + "step": 6728 + }, + { + "epoch": 0.68, + "grad_norm": 1.5927379582068117, + "learning_rate": 4.782881732662361e-06, + "loss": 0.6434, + "step": 6729 + }, + { + "epoch": 0.68, + "grad_norm": 1.5532184256900274, + "learning_rate": 4.780071444193917e-06, + "loss": 0.5576, + "step": 6730 + }, + { + "epoch": 0.68, + "grad_norm": 1.4686464144373634, + "learning_rate": 4.7772617222670846e-06, + "loss": 0.8035, + "step": 6731 + }, + { + "epoch": 0.68, + "grad_norm": 1.7957799314682596, + "learning_rate": 4.7744525671868235e-06, + "loss": 0.6975, + "step": 6732 + }, + { + "epoch": 0.68, + "grad_norm": 1.8435799373752362, + "learning_rate": 4.771643979258013e-06, + "loss": 0.7369, + "step": 6733 + }, + { + "epoch": 0.68, + "grad_norm": 1.501883749908581, + "learning_rate": 4.768835958785487e-06, + "loss": 0.6216, + "step": 6734 + }, + { + "epoch": 0.69, + "grad_norm": 1.3488585383477605, + "learning_rate": 4.766028506074014e-06, + "loss": 0.6155, + "step": 6735 + }, + { + "epoch": 0.69, + "grad_norm": 2.0300629350024613, + "learning_rate": 4.763221621428294e-06, + "loss": 0.7919, + "step": 6736 + }, + { + "epoch": 0.69, + "grad_norm": 1.6568561202507486, + "learning_rate": 4.760415305152973e-06, + "loss": 0.7961, + "step": 6737 + }, + { + "epoch": 0.69, + "grad_norm": 1.5372026401250878, + "learning_rate": 4.757609557552635e-06, + "loss": 0.6851, + "step": 6738 + }, + { + "epoch": 0.69, + "grad_norm": 1.675958938449828, + "learning_rate": 4.754804378931794e-06, + "loss": 0.7546, + "step": 6739 + }, + { + "epoch": 0.69, + "grad_norm": 1.5616244437905444, + "learning_rate": 4.7519997695949135e-06, + "loss": 0.6774, + "step": 6740 + }, + { + "epoch": 0.69, + "grad_norm": 1.6035797884092577, + "learning_rate": 4.749195729846392e-06, + "loss": 0.7169, + "step": 6741 + }, + { + "epoch": 0.69, + "grad_norm": 1.589931570461331, + "learning_rate": 4.746392259990557e-06, + "loss": 0.6673, + "step": 6742 + }, + { + "epoch": 0.69, + "grad_norm": 1.4731801910893658, + "learning_rate": 4.743589360331683e-06, + "loss": 0.5595, + "step": 6743 + }, + { + "epoch": 0.69, + "grad_norm": 1.599173395270854, + "learning_rate": 4.740787031173989e-06, + "loss": 0.7061, + "step": 6744 + }, + { + "epoch": 0.69, + "grad_norm": 1.634886642584961, + "learning_rate": 4.737985272821613e-06, + "loss": 0.6353, + "step": 6745 + }, + { + "epoch": 0.69, + "grad_norm": 1.457103767397003, + "learning_rate": 4.735184085578647e-06, + "loss": 0.6493, + "step": 6746 + }, + { + "epoch": 0.69, + "grad_norm": 1.6446556562271464, + "learning_rate": 4.732383469749119e-06, + "loss": 0.7001, + "step": 6747 + }, + { + "epoch": 0.69, + "grad_norm": 1.505602973831173, + "learning_rate": 4.7295834256369865e-06, + "loss": 0.7058, + "step": 6748 + }, + { + "epoch": 0.69, + "grad_norm": 1.7229690110012572, + "learning_rate": 4.726783953546156e-06, + "loss": 0.7497, + "step": 6749 + }, + { + "epoch": 0.69, + "grad_norm": 1.7983347955264688, + "learning_rate": 4.72398505378046e-06, + "loss": 0.7001, + "step": 6750 + }, + { + "epoch": 0.69, + "grad_norm": 1.5291712364707872, + "learning_rate": 4.721186726643679e-06, + "loss": 0.7008, + "step": 6751 + }, + { + "epoch": 0.69, + "grad_norm": 1.5772590436598817, + "learning_rate": 4.718388972439531e-06, + "loss": 0.6702, + "step": 6752 + }, + { + "epoch": 0.69, + "grad_norm": 1.6705254133139626, + "learning_rate": 4.715591791471661e-06, + "loss": 0.7293, + "step": 6753 + }, + { + "epoch": 0.69, + "grad_norm": 1.4528902800979537, + "learning_rate": 4.712795184043664e-06, + "loss": 0.7014, + "step": 6754 + }, + { + "epoch": 0.69, + "grad_norm": 1.8495839131221563, + "learning_rate": 4.709999150459069e-06, + "loss": 0.7272, + "step": 6755 + }, + { + "epoch": 0.69, + "grad_norm": 1.8308693369169993, + "learning_rate": 4.707203691021338e-06, + "loss": 0.8219, + "step": 6756 + }, + { + "epoch": 0.69, + "grad_norm": 1.8251269818080302, + "learning_rate": 4.704408806033876e-06, + "loss": 0.7729, + "step": 6757 + }, + { + "epoch": 0.69, + "grad_norm": 1.5691962502332488, + "learning_rate": 4.7016144958000275e-06, + "loss": 0.6621, + "step": 6758 + }, + { + "epoch": 0.69, + "grad_norm": 1.5036672090092813, + "learning_rate": 4.698820760623064e-06, + "loss": 0.759, + "step": 6759 + }, + { + "epoch": 0.69, + "grad_norm": 1.6126843045237915, + "learning_rate": 4.69602760080621e-06, + "loss": 0.7757, + "step": 6760 + }, + { + "epoch": 0.69, + "grad_norm": 1.5749756295000583, + "learning_rate": 4.693235016652611e-06, + "loss": 0.7203, + "step": 6761 + }, + { + "epoch": 0.69, + "grad_norm": 1.5723201831816331, + "learning_rate": 4.690443008465363e-06, + "loss": 0.7017, + "step": 6762 + }, + { + "epoch": 0.69, + "grad_norm": 1.6678943205347223, + "learning_rate": 4.687651576547497e-06, + "loss": 0.7812, + "step": 6763 + }, + { + "epoch": 0.69, + "grad_norm": 1.6563925187150188, + "learning_rate": 4.684860721201973e-06, + "loss": 0.737, + "step": 6764 + }, + { + "epoch": 0.69, + "grad_norm": 1.6299027094882985, + "learning_rate": 4.682070442731703e-06, + "loss": 0.6962, + "step": 6765 + }, + { + "epoch": 0.69, + "grad_norm": 1.6136354271562663, + "learning_rate": 4.679280741439519e-06, + "loss": 0.7279, + "step": 6766 + }, + { + "epoch": 0.69, + "grad_norm": 1.5619144216443062, + "learning_rate": 4.676491617628204e-06, + "loss": 0.7066, + "step": 6767 + }, + { + "epoch": 0.69, + "grad_norm": 1.691344837835973, + "learning_rate": 4.673703071600477e-06, + "loss": 0.7679, + "step": 6768 + }, + { + "epoch": 0.69, + "grad_norm": 1.7127039489976044, + "learning_rate": 4.670915103658984e-06, + "loss": 0.7076, + "step": 6769 + }, + { + "epoch": 0.69, + "grad_norm": 1.5145736321312864, + "learning_rate": 4.6681277141063185e-06, + "loss": 0.7129, + "step": 6770 + }, + { + "epoch": 0.69, + "grad_norm": 1.6572377542918708, + "learning_rate": 4.665340903245013e-06, + "loss": 0.7299, + "step": 6771 + }, + { + "epoch": 0.69, + "grad_norm": 1.7087069822360468, + "learning_rate": 4.662554671377523e-06, + "loss": 0.6861, + "step": 6772 + }, + { + "epoch": 0.69, + "grad_norm": 1.5534019045426446, + "learning_rate": 4.659769018806259e-06, + "loss": 0.6448, + "step": 6773 + }, + { + "epoch": 0.69, + "grad_norm": 1.697895456294768, + "learning_rate": 4.656983945833553e-06, + "loss": 0.7074, + "step": 6774 + }, + { + "epoch": 0.69, + "grad_norm": 1.5580354877752964, + "learning_rate": 4.654199452761684e-06, + "loss": 0.6663, + "step": 6775 + }, + { + "epoch": 0.69, + "grad_norm": 1.7294341968255658, + "learning_rate": 4.65141553989287e-06, + "loss": 0.6761, + "step": 6776 + }, + { + "epoch": 0.69, + "grad_norm": 1.4565405990269753, + "learning_rate": 4.648632207529251e-06, + "loss": 0.6189, + "step": 6777 + }, + { + "epoch": 0.69, + "grad_norm": 1.567351511841771, + "learning_rate": 4.645849455972925e-06, + "loss": 0.7025, + "step": 6778 + }, + { + "epoch": 0.69, + "grad_norm": 1.6730705756743636, + "learning_rate": 4.643067285525907e-06, + "loss": 0.7288, + "step": 6779 + }, + { + "epoch": 0.69, + "grad_norm": 2.4232719505119826, + "learning_rate": 4.640285696490162e-06, + "loss": 0.7396, + "step": 6780 + }, + { + "epoch": 0.69, + "grad_norm": 1.7484179890091003, + "learning_rate": 4.637504689167593e-06, + "loss": 0.772, + "step": 6781 + }, + { + "epoch": 0.69, + "grad_norm": 1.4096556326775183, + "learning_rate": 4.634724263860024e-06, + "loss": 0.6992, + "step": 6782 + }, + { + "epoch": 0.69, + "grad_norm": 1.431220240140526, + "learning_rate": 4.631944420869237e-06, + "loss": 0.6121, + "step": 6783 + }, + { + "epoch": 0.69, + "grad_norm": 1.5569615212600543, + "learning_rate": 4.629165160496933e-06, + "loss": 0.7122, + "step": 6784 + }, + { + "epoch": 0.69, + "grad_norm": 1.61376524661055, + "learning_rate": 4.62638648304476e-06, + "loss": 0.7074, + "step": 6785 + }, + { + "epoch": 0.69, + "grad_norm": 1.758923600513649, + "learning_rate": 4.623608388814305e-06, + "loss": 0.7616, + "step": 6786 + }, + { + "epoch": 0.69, + "grad_norm": 1.8161912002507001, + "learning_rate": 4.620830878107076e-06, + "loss": 0.6772, + "step": 6787 + }, + { + "epoch": 0.69, + "grad_norm": 1.5930783730088638, + "learning_rate": 4.618053951224537e-06, + "loss": 0.5818, + "step": 6788 + }, + { + "epoch": 0.69, + "grad_norm": 1.6296840520642266, + "learning_rate": 4.6152776084680795e-06, + "loss": 0.6261, + "step": 6789 + }, + { + "epoch": 0.69, + "grad_norm": 1.637986537306099, + "learning_rate": 4.612501850139027e-06, + "loss": 0.6713, + "step": 6790 + }, + { + "epoch": 0.69, + "grad_norm": 1.7392169691741854, + "learning_rate": 4.609726676538652e-06, + "loss": 0.669, + "step": 6791 + }, + { + "epoch": 0.69, + "grad_norm": 1.6813433964446511, + "learning_rate": 4.6069520879681464e-06, + "loss": 0.6574, + "step": 6792 + }, + { + "epoch": 0.69, + "grad_norm": 1.566228995523543, + "learning_rate": 4.604178084728655e-06, + "loss": 0.7067, + "step": 6793 + }, + { + "epoch": 0.69, + "grad_norm": 1.6410680316688766, + "learning_rate": 4.601404667121254e-06, + "loss": 0.7132, + "step": 6794 + }, + { + "epoch": 0.69, + "grad_norm": 1.6200882326123738, + "learning_rate": 4.5986318354469495e-06, + "loss": 0.6418, + "step": 6795 + }, + { + "epoch": 0.69, + "grad_norm": 1.846150123331626, + "learning_rate": 4.595859590006694e-06, + "loss": 0.8386, + "step": 6796 + }, + { + "epoch": 0.69, + "grad_norm": 1.4435592575044285, + "learning_rate": 4.593087931101366e-06, + "loss": 0.6287, + "step": 6797 + }, + { + "epoch": 0.69, + "grad_norm": 1.5756239357254955, + "learning_rate": 4.590316859031787e-06, + "loss": 0.606, + "step": 6798 + }, + { + "epoch": 0.69, + "grad_norm": 1.5562145930337141, + "learning_rate": 4.5875463740987196e-06, + "loss": 0.6404, + "step": 6799 + }, + { + "epoch": 0.69, + "grad_norm": 1.7451565402362117, + "learning_rate": 4.584776476602848e-06, + "loss": 0.7641, + "step": 6800 + }, + { + "epoch": 0.69, + "grad_norm": 1.7561251878769575, + "learning_rate": 4.582007166844808e-06, + "loss": 0.7515, + "step": 6801 + }, + { + "epoch": 0.69, + "grad_norm": 1.6443111262582564, + "learning_rate": 4.579238445125159e-06, + "loss": 0.7593, + "step": 6802 + }, + { + "epoch": 0.69, + "grad_norm": 1.8102861573409768, + "learning_rate": 4.576470311744406e-06, + "loss": 0.7399, + "step": 6803 + }, + { + "epoch": 0.69, + "grad_norm": 1.6888475048886598, + "learning_rate": 4.573702767002991e-06, + "loss": 0.6528, + "step": 6804 + }, + { + "epoch": 0.69, + "grad_norm": 1.6537094560055399, + "learning_rate": 4.570935811201277e-06, + "loss": 0.743, + "step": 6805 + }, + { + "epoch": 0.69, + "grad_norm": 1.609274630536314, + "learning_rate": 4.5681694446395805e-06, + "loss": 0.7941, + "step": 6806 + }, + { + "epoch": 0.69, + "grad_norm": 1.6042699551754307, + "learning_rate": 4.5654036676181505e-06, + "loss": 0.7543, + "step": 6807 + }, + { + "epoch": 0.69, + "grad_norm": 1.6824929983387507, + "learning_rate": 4.5626384804371605e-06, + "loss": 0.8842, + "step": 6808 + }, + { + "epoch": 0.69, + "grad_norm": 1.48386149237531, + "learning_rate": 4.559873883396737e-06, + "loss": 0.5557, + "step": 6809 + }, + { + "epoch": 0.69, + "grad_norm": 1.541725899034693, + "learning_rate": 4.557109876796926e-06, + "loss": 0.6702, + "step": 6810 + }, + { + "epoch": 0.69, + "grad_norm": 1.71511462621031, + "learning_rate": 4.554346460937719e-06, + "loss": 0.6853, + "step": 6811 + }, + { + "epoch": 0.69, + "grad_norm": 1.5697263264236596, + "learning_rate": 4.551583636119048e-06, + "loss": 0.6495, + "step": 6812 + }, + { + "epoch": 0.69, + "grad_norm": 1.593495035401068, + "learning_rate": 4.548821402640765e-06, + "loss": 0.8006, + "step": 6813 + }, + { + "epoch": 0.69, + "grad_norm": 1.5161862699596542, + "learning_rate": 4.5460597608026765e-06, + "loss": 0.7516, + "step": 6814 + }, + { + "epoch": 0.69, + "grad_norm": 1.7836780242640413, + "learning_rate": 4.543298710904507e-06, + "loss": 0.6723, + "step": 6815 + }, + { + "epoch": 0.69, + "grad_norm": 1.6477562008663253, + "learning_rate": 4.54053825324593e-06, + "loss": 0.6872, + "step": 6816 + }, + { + "epoch": 0.69, + "grad_norm": 1.6047023712842892, + "learning_rate": 4.537778388126552e-06, + "loss": 0.6548, + "step": 6817 + }, + { + "epoch": 0.69, + "grad_norm": 1.59282147631226, + "learning_rate": 4.535019115845905e-06, + "loss": 0.6552, + "step": 6818 + }, + { + "epoch": 0.69, + "grad_norm": 1.5241484978282966, + "learning_rate": 4.532260436703476e-06, + "loss": 0.6947, + "step": 6819 + }, + { + "epoch": 0.69, + "grad_norm": 1.629690488840868, + "learning_rate": 4.529502350998667e-06, + "loss": 0.6956, + "step": 6820 + }, + { + "epoch": 0.69, + "grad_norm": 1.7973943291335965, + "learning_rate": 4.526744859030828e-06, + "loss": 0.7769, + "step": 6821 + }, + { + "epoch": 0.69, + "grad_norm": 1.8716176708220122, + "learning_rate": 4.523987961099247e-06, + "loss": 0.8152, + "step": 6822 + }, + { + "epoch": 0.69, + "grad_norm": 1.7628031001157096, + "learning_rate": 4.5212316575031325e-06, + "loss": 0.7304, + "step": 6823 + }, + { + "epoch": 0.69, + "grad_norm": 1.6365695307885848, + "learning_rate": 4.518475948541644e-06, + "loss": 0.626, + "step": 6824 + }, + { + "epoch": 0.69, + "grad_norm": 1.8518163270282286, + "learning_rate": 4.5157208345138735e-06, + "loss": 0.7279, + "step": 6825 + }, + { + "epoch": 0.69, + "grad_norm": 1.503081942774786, + "learning_rate": 4.5129663157188385e-06, + "loss": 0.6428, + "step": 6826 + }, + { + "epoch": 0.69, + "grad_norm": 1.6458945015573683, + "learning_rate": 4.510212392455506e-06, + "loss": 0.6701, + "step": 6827 + }, + { + "epoch": 0.69, + "grad_norm": 1.528661732099273, + "learning_rate": 4.507459065022764e-06, + "loss": 0.6397, + "step": 6828 + }, + { + "epoch": 0.69, + "grad_norm": 1.7525925171036087, + "learning_rate": 4.504706333719446e-06, + "loss": 0.7344, + "step": 6829 + }, + { + "epoch": 0.69, + "grad_norm": 1.6132996914626434, + "learning_rate": 4.501954198844323e-06, + "loss": 0.6716, + "step": 6830 + }, + { + "epoch": 0.69, + "grad_norm": 1.728012838361854, + "learning_rate": 4.499202660696088e-06, + "loss": 0.6334, + "step": 6831 + }, + { + "epoch": 0.69, + "grad_norm": 1.6940877753151904, + "learning_rate": 4.4964517195733845e-06, + "loss": 0.7678, + "step": 6832 + }, + { + "epoch": 0.7, + "grad_norm": 1.4309951646007812, + "learning_rate": 4.493701375774778e-06, + "loss": 0.6187, + "step": 6833 + }, + { + "epoch": 0.7, + "grad_norm": 1.6994002275015478, + "learning_rate": 4.490951629598779e-06, + "loss": 0.6717, + "step": 6834 + }, + { + "epoch": 0.7, + "grad_norm": 1.5225830335907207, + "learning_rate": 4.488202481343831e-06, + "loss": 0.6211, + "step": 6835 + }, + { + "epoch": 0.7, + "grad_norm": 1.7419741594304876, + "learning_rate": 4.485453931308306e-06, + "loss": 0.7343, + "step": 6836 + }, + { + "epoch": 0.7, + "grad_norm": 1.7353467457564924, + "learning_rate": 4.482705979790522e-06, + "loss": 0.623, + "step": 6837 + }, + { + "epoch": 0.7, + "grad_norm": 1.6379948590288491, + "learning_rate": 4.479958627088719e-06, + "loss": 0.626, + "step": 6838 + }, + { + "epoch": 0.7, + "grad_norm": 1.5886343067209587, + "learning_rate": 4.477211873501085e-06, + "loss": 0.6164, + "step": 6839 + }, + { + "epoch": 0.7, + "grad_norm": 1.6088601476355608, + "learning_rate": 4.474465719325738e-06, + "loss": 0.5527, + "step": 6840 + }, + { + "epoch": 0.7, + "grad_norm": 1.4754554021635713, + "learning_rate": 4.471720164860723e-06, + "loss": 0.6268, + "step": 6841 + }, + { + "epoch": 0.7, + "grad_norm": 1.6109931340884776, + "learning_rate": 4.468975210404032e-06, + "loss": 0.7196, + "step": 6842 + }, + { + "epoch": 0.7, + "grad_norm": 1.4207320523598963, + "learning_rate": 4.4662308562535905e-06, + "loss": 0.6549, + "step": 6843 + }, + { + "epoch": 0.7, + "grad_norm": 1.5971967413292498, + "learning_rate": 4.463487102707247e-06, + "loss": 0.7004, + "step": 6844 + }, + { + "epoch": 0.7, + "grad_norm": 1.5846312944544336, + "learning_rate": 4.460743950062797e-06, + "loss": 0.6683, + "step": 6845 + }, + { + "epoch": 0.7, + "grad_norm": 1.5170102406603916, + "learning_rate": 4.458001398617971e-06, + "loss": 0.6649, + "step": 6846 + }, + { + "epoch": 0.7, + "grad_norm": 1.6523674664305048, + "learning_rate": 4.455259448670423e-06, + "loss": 0.7747, + "step": 6847 + }, + { + "epoch": 0.7, + "grad_norm": 1.5808458259236084, + "learning_rate": 4.452518100517751e-06, + "loss": 0.7314, + "step": 6848 + }, + { + "epoch": 0.7, + "grad_norm": 1.5507415500961412, + "learning_rate": 4.449777354457489e-06, + "loss": 0.6792, + "step": 6849 + }, + { + "epoch": 0.7, + "grad_norm": 1.6687667637042354, + "learning_rate": 4.447037210787098e-06, + "loss": 0.6677, + "step": 6850 + }, + { + "epoch": 0.7, + "grad_norm": 1.6465995920302823, + "learning_rate": 4.444297669803981e-06, + "loss": 0.688, + "step": 6851 + }, + { + "epoch": 0.7, + "grad_norm": 1.3837375906710079, + "learning_rate": 4.441558731805467e-06, + "loss": 0.6833, + "step": 6852 + }, + { + "epoch": 0.7, + "grad_norm": 1.6111410414667975, + "learning_rate": 4.438820397088829e-06, + "loss": 0.7622, + "step": 6853 + }, + { + "epoch": 0.7, + "grad_norm": 1.7558194753522183, + "learning_rate": 4.436082665951272e-06, + "loss": 0.7165, + "step": 6854 + }, + { + "epoch": 0.7, + "grad_norm": 1.7187189582051745, + "learning_rate": 4.43334553868993e-06, + "loss": 0.7752, + "step": 6855 + }, + { + "epoch": 0.7, + "grad_norm": 1.5985573828842572, + "learning_rate": 4.430609015601875e-06, + "loss": 0.7959, + "step": 6856 + }, + { + "epoch": 0.7, + "grad_norm": 1.6240805305240187, + "learning_rate": 4.427873096984122e-06, + "loss": 0.7092, + "step": 6857 + }, + { + "epoch": 0.7, + "grad_norm": 1.6149474725862267, + "learning_rate": 4.4251377831336005e-06, + "loss": 0.711, + "step": 6858 + }, + { + "epoch": 0.7, + "grad_norm": 1.5581486292014552, + "learning_rate": 4.422403074347192e-06, + "loss": 0.7262, + "step": 6859 + }, + { + "epoch": 0.7, + "grad_norm": 1.6611056685471053, + "learning_rate": 4.41966897092171e-06, + "loss": 0.7394, + "step": 6860 + }, + { + "epoch": 0.7, + "grad_norm": 1.6201934588493567, + "learning_rate": 4.416935473153891e-06, + "loss": 0.7084, + "step": 6861 + }, + { + "epoch": 0.7, + "grad_norm": 1.698794288828691, + "learning_rate": 4.414202581340416e-06, + "loss": 0.7496, + "step": 6862 + }, + { + "epoch": 0.7, + "grad_norm": 1.5843943183950784, + "learning_rate": 4.411470295777904e-06, + "loss": 0.7264, + "step": 6863 + }, + { + "epoch": 0.7, + "grad_norm": 1.7047690072131172, + "learning_rate": 4.4087386167628905e-06, + "loss": 0.6368, + "step": 6864 + }, + { + "epoch": 0.7, + "grad_norm": 1.5051173591130502, + "learning_rate": 4.4060075445918635e-06, + "loss": 0.672, + "step": 6865 + }, + { + "epoch": 0.7, + "grad_norm": 1.6290516020944361, + "learning_rate": 4.4032770795612426e-06, + "loss": 0.6965, + "step": 6866 + }, + { + "epoch": 0.7, + "grad_norm": 1.4918612948316938, + "learning_rate": 4.400547221967366e-06, + "loss": 0.7075, + "step": 6867 + }, + { + "epoch": 0.7, + "grad_norm": 1.5594186028274144, + "learning_rate": 4.397817972106527e-06, + "loss": 0.6735, + "step": 6868 + }, + { + "epoch": 0.7, + "grad_norm": 1.4450516338889967, + "learning_rate": 4.395089330274936e-06, + "loss": 0.5918, + "step": 6869 + }, + { + "epoch": 0.7, + "grad_norm": 1.5175043132909714, + "learning_rate": 4.392361296768747e-06, + "loss": 0.7221, + "step": 6870 + }, + { + "epoch": 0.7, + "grad_norm": 1.6293215499348013, + "learning_rate": 4.389633871884048e-06, + "loss": 0.7563, + "step": 6871 + }, + { + "epoch": 0.7, + "grad_norm": 1.4893946119439887, + "learning_rate": 4.386907055916854e-06, + "loss": 0.5862, + "step": 6872 + }, + { + "epoch": 0.7, + "grad_norm": 1.5866286954728583, + "learning_rate": 4.384180849163122e-06, + "loss": 0.6307, + "step": 6873 + }, + { + "epoch": 0.7, + "grad_norm": 1.533145190738263, + "learning_rate": 4.381455251918735e-06, + "loss": 0.6756, + "step": 6874 + }, + { + "epoch": 0.7, + "grad_norm": 1.6868162377836897, + "learning_rate": 4.378730264479516e-06, + "loss": 0.6691, + "step": 6875 + }, + { + "epoch": 0.7, + "grad_norm": 1.6819700258658672, + "learning_rate": 4.376005887141223e-06, + "loss": 0.6826, + "step": 6876 + }, + { + "epoch": 0.7, + "grad_norm": 1.7399392860899976, + "learning_rate": 4.373282120199538e-06, + "loss": 0.6818, + "step": 6877 + }, + { + "epoch": 0.7, + "grad_norm": 1.7575382587651571, + "learning_rate": 4.370558963950091e-06, + "loss": 0.7339, + "step": 6878 + }, + { + "epoch": 0.7, + "grad_norm": 1.790098215420934, + "learning_rate": 4.36783641868843e-06, + "loss": 0.7084, + "step": 6879 + }, + { + "epoch": 0.7, + "grad_norm": 1.7916971074702635, + "learning_rate": 4.365114484710048e-06, + "loss": 0.7472, + "step": 6880 + }, + { + "epoch": 0.7, + "grad_norm": 1.634292686325334, + "learning_rate": 4.3623931623103735e-06, + "loss": 0.611, + "step": 6881 + }, + { + "epoch": 0.7, + "grad_norm": 1.5793655961743505, + "learning_rate": 4.359672451784754e-06, + "loss": 0.624, + "step": 6882 + }, + { + "epoch": 0.7, + "grad_norm": 1.750260127143254, + "learning_rate": 4.356952353428486e-06, + "loss": 0.7962, + "step": 6883 + }, + { + "epoch": 0.7, + "grad_norm": 1.7649362401639273, + "learning_rate": 4.354232867536797e-06, + "loss": 0.7676, + "step": 6884 + }, + { + "epoch": 0.7, + "grad_norm": 1.752447193640601, + "learning_rate": 4.351513994404835e-06, + "loss": 0.7463, + "step": 6885 + }, + { + "epoch": 0.7, + "grad_norm": 1.5594556890072002, + "learning_rate": 4.348795734327701e-06, + "loss": 0.6703, + "step": 6886 + }, + { + "epoch": 0.7, + "grad_norm": 1.648662065675375, + "learning_rate": 4.346078087600411e-06, + "loss": 0.7742, + "step": 6887 + }, + { + "epoch": 0.7, + "grad_norm": 1.699704656113283, + "learning_rate": 4.343361054517927e-06, + "loss": 0.7956, + "step": 6888 + }, + { + "epoch": 0.7, + "grad_norm": 1.6594430543250784, + "learning_rate": 4.340644635375146e-06, + "loss": 0.7169, + "step": 6889 + }, + { + "epoch": 0.7, + "grad_norm": 1.7104654398985026, + "learning_rate": 4.337928830466882e-06, + "loss": 0.7365, + "step": 6890 + }, + { + "epoch": 0.7, + "grad_norm": 1.755419660001335, + "learning_rate": 4.335213640087902e-06, + "loss": 0.7746, + "step": 6891 + }, + { + "epoch": 0.7, + "grad_norm": 1.6038165272888565, + "learning_rate": 4.332499064532891e-06, + "loss": 0.6323, + "step": 6892 + }, + { + "epoch": 0.7, + "grad_norm": 1.7456593937639815, + "learning_rate": 4.329785104096477e-06, + "loss": 0.7454, + "step": 6893 + }, + { + "epoch": 0.7, + "grad_norm": 1.6068745247970297, + "learning_rate": 4.327071759073221e-06, + "loss": 0.6262, + "step": 6894 + }, + { + "epoch": 0.7, + "grad_norm": 1.6818545930186335, + "learning_rate": 4.324359029757607e-06, + "loss": 0.6403, + "step": 6895 + }, + { + "epoch": 0.7, + "grad_norm": 1.4850855188004741, + "learning_rate": 4.321646916444068e-06, + "loss": 0.6217, + "step": 6896 + }, + { + "epoch": 0.7, + "grad_norm": 1.7718084166975085, + "learning_rate": 4.318935419426951e-06, + "loss": 0.7067, + "step": 6897 + }, + { + "epoch": 0.7, + "grad_norm": 1.5457835040849344, + "learning_rate": 4.316224539000554e-06, + "loss": 0.6807, + "step": 6898 + }, + { + "epoch": 0.7, + "grad_norm": 1.7754546506945497, + "learning_rate": 4.313514275459103e-06, + "loss": 0.8154, + "step": 6899 + }, + { + "epoch": 0.7, + "grad_norm": 1.6481037379539556, + "learning_rate": 4.310804629096746e-06, + "loss": 0.7368, + "step": 6900 + }, + { + "epoch": 0.7, + "grad_norm": 1.6254104503335625, + "learning_rate": 4.308095600207578e-06, + "loss": 0.6914, + "step": 6901 + }, + { + "epoch": 0.7, + "grad_norm": 1.6361449332363762, + "learning_rate": 4.3053871890856235e-06, + "loss": 0.6178, + "step": 6902 + }, + { + "epoch": 0.7, + "grad_norm": 1.4364947519767834, + "learning_rate": 4.302679396024834e-06, + "loss": 0.6167, + "step": 6903 + }, + { + "epoch": 0.7, + "grad_norm": 1.6631765011083663, + "learning_rate": 4.299972221319103e-06, + "loss": 0.6483, + "step": 6904 + }, + { + "epoch": 0.7, + "grad_norm": 1.6516281319651802, + "learning_rate": 4.2972656652622446e-06, + "loss": 0.769, + "step": 6905 + }, + { + "epoch": 0.7, + "grad_norm": 1.5056024308236098, + "learning_rate": 4.294559728148018e-06, + "loss": 0.6928, + "step": 6906 + }, + { + "epoch": 0.7, + "grad_norm": 1.6055309450592068, + "learning_rate": 4.291854410270113e-06, + "loss": 0.7404, + "step": 6907 + }, + { + "epoch": 0.7, + "grad_norm": 1.671936816083641, + "learning_rate": 4.289149711922143e-06, + "loss": 0.746, + "step": 6908 + }, + { + "epoch": 0.7, + "grad_norm": 1.4401133849964285, + "learning_rate": 4.2864456333976676e-06, + "loss": 0.6935, + "step": 6909 + }, + { + "epoch": 0.7, + "grad_norm": 1.663931022188339, + "learning_rate": 4.2837421749901655e-06, + "loss": 0.7025, + "step": 6910 + }, + { + "epoch": 0.7, + "grad_norm": 1.6514456944306992, + "learning_rate": 4.281039336993058e-06, + "loss": 0.7564, + "step": 6911 + }, + { + "epoch": 0.7, + "grad_norm": 1.590988972850426, + "learning_rate": 4.278337119699699e-06, + "loss": 0.6574, + "step": 6912 + }, + { + "epoch": 0.7, + "grad_norm": 1.7075607813218283, + "learning_rate": 4.275635523403367e-06, + "loss": 0.6821, + "step": 6913 + }, + { + "epoch": 0.7, + "grad_norm": 1.6835764812133978, + "learning_rate": 4.27293454839728e-06, + "loss": 0.7864, + "step": 6914 + }, + { + "epoch": 0.7, + "grad_norm": 1.6808712795164975, + "learning_rate": 4.270234194974586e-06, + "loss": 0.7632, + "step": 6915 + }, + { + "epoch": 0.7, + "grad_norm": 1.6984519613192404, + "learning_rate": 4.267534463428365e-06, + "loss": 0.7588, + "step": 6916 + }, + { + "epoch": 0.7, + "grad_norm": 1.7880445241677712, + "learning_rate": 4.264835354051636e-06, + "loss": 0.7459, + "step": 6917 + }, + { + "epoch": 0.7, + "grad_norm": 1.524997395722238, + "learning_rate": 4.262136867137339e-06, + "loss": 0.7045, + "step": 6918 + }, + { + "epoch": 0.7, + "grad_norm": 1.8045010726243293, + "learning_rate": 4.259439002978354e-06, + "loss": 0.7398, + "step": 6919 + }, + { + "epoch": 0.7, + "grad_norm": 1.5151094218387287, + "learning_rate": 4.256741761867497e-06, + "loss": 0.6549, + "step": 6920 + }, + { + "epoch": 0.7, + "grad_norm": 1.6018412694118114, + "learning_rate": 4.2540451440975025e-06, + "loss": 0.7484, + "step": 6921 + }, + { + "epoch": 0.7, + "grad_norm": 1.5854447630426047, + "learning_rate": 4.251349149961056e-06, + "loss": 0.7279, + "step": 6922 + }, + { + "epoch": 0.7, + "grad_norm": 1.4938982874778577, + "learning_rate": 4.248653779750756e-06, + "loss": 0.6409, + "step": 6923 + }, + { + "epoch": 0.7, + "grad_norm": 1.5468644761910109, + "learning_rate": 4.245959033759147e-06, + "loss": 0.6326, + "step": 6924 + }, + { + "epoch": 0.7, + "grad_norm": 1.666205435921289, + "learning_rate": 4.243264912278706e-06, + "loss": 0.804, + "step": 6925 + }, + { + "epoch": 0.7, + "grad_norm": 1.6496617351717349, + "learning_rate": 4.2405714156018295e-06, + "loss": 0.7252, + "step": 6926 + }, + { + "epoch": 0.7, + "grad_norm": 1.5762382787063267, + "learning_rate": 4.237878544020861e-06, + "loss": 0.6196, + "step": 6927 + }, + { + "epoch": 0.7, + "grad_norm": 1.7349560074224841, + "learning_rate": 4.235186297828066e-06, + "loss": 0.682, + "step": 6928 + }, + { + "epoch": 0.7, + "grad_norm": 1.6572899000817862, + "learning_rate": 4.232494677315645e-06, + "loss": 0.6835, + "step": 6929 + }, + { + "epoch": 0.7, + "grad_norm": 1.70827240724343, + "learning_rate": 4.2298036827757375e-06, + "loss": 0.7192, + "step": 6930 + }, + { + "epoch": 0.71, + "grad_norm": 1.9011840639795596, + "learning_rate": 4.2271133145003995e-06, + "loss": 0.7168, + "step": 6931 + }, + { + "epoch": 0.71, + "grad_norm": 1.5761513167033099, + "learning_rate": 4.224423572781637e-06, + "loss": 0.768, + "step": 6932 + }, + { + "epoch": 0.71, + "grad_norm": 1.5980729940148364, + "learning_rate": 4.221734457911374e-06, + "loss": 0.6542, + "step": 6933 + }, + { + "epoch": 0.71, + "grad_norm": 1.577183650117114, + "learning_rate": 4.219045970181472e-06, + "loss": 0.6186, + "step": 6934 + }, + { + "epoch": 0.71, + "grad_norm": 1.5433345907486062, + "learning_rate": 4.21635810988373e-06, + "loss": 0.6485, + "step": 6935 + }, + { + "epoch": 0.71, + "grad_norm": 1.6069760167637266, + "learning_rate": 4.213670877309867e-06, + "loss": 0.6646, + "step": 6936 + }, + { + "epoch": 0.71, + "grad_norm": 1.7019206675257987, + "learning_rate": 4.210984272751541e-06, + "loss": 0.6954, + "step": 6937 + }, + { + "epoch": 0.71, + "grad_norm": 1.7657777698386452, + "learning_rate": 4.208298296500345e-06, + "loss": 0.7059, + "step": 6938 + }, + { + "epoch": 0.71, + "grad_norm": 1.5774180458446942, + "learning_rate": 4.2056129488477936e-06, + "loss": 0.593, + "step": 6939 + }, + { + "epoch": 0.71, + "grad_norm": 1.6366090495196506, + "learning_rate": 4.2029282300853455e-06, + "loss": 0.6479, + "step": 6940 + }, + { + "epoch": 0.71, + "grad_norm": 1.6983953653821355, + "learning_rate": 4.200244140504379e-06, + "loss": 0.7052, + "step": 6941 + }, + { + "epoch": 0.71, + "grad_norm": 1.6921720513226657, + "learning_rate": 4.197560680396214e-06, + "loss": 0.7774, + "step": 6942 + }, + { + "epoch": 0.71, + "grad_norm": 1.5070654563971895, + "learning_rate": 4.1948778500521e-06, + "loss": 0.6766, + "step": 6943 + }, + { + "epoch": 0.71, + "grad_norm": 1.518188850551967, + "learning_rate": 4.19219564976321e-06, + "loss": 0.7744, + "step": 6944 + }, + { + "epoch": 0.71, + "grad_norm": 1.479936391862102, + "learning_rate": 4.189514079820662e-06, + "loss": 0.6599, + "step": 6945 + }, + { + "epoch": 0.71, + "grad_norm": 1.6671871455146776, + "learning_rate": 4.1868331405154905e-06, + "loss": 0.6885, + "step": 6946 + }, + { + "epoch": 0.71, + "grad_norm": 1.7031726324212713, + "learning_rate": 4.184152832138673e-06, + "loss": 0.7379, + "step": 6947 + }, + { + "epoch": 0.71, + "grad_norm": 1.5421456573559247, + "learning_rate": 4.181473154981122e-06, + "loss": 0.5938, + "step": 6948 + }, + { + "epoch": 0.71, + "grad_norm": 1.7023553958191253, + "learning_rate": 4.1787941093336635e-06, + "loss": 0.7094, + "step": 6949 + }, + { + "epoch": 0.71, + "grad_norm": 1.4727549625930916, + "learning_rate": 4.176115695487071e-06, + "loss": 0.6625, + "step": 6950 + }, + { + "epoch": 0.71, + "grad_norm": 1.7742791080305533, + "learning_rate": 4.173437913732048e-06, + "loss": 0.7806, + "step": 6951 + }, + { + "epoch": 0.71, + "grad_norm": 1.6522047579037553, + "learning_rate": 4.1707607643592185e-06, + "loss": 0.6883, + "step": 6952 + }, + { + "epoch": 0.71, + "grad_norm": 1.5643275729858779, + "learning_rate": 4.1680842476591484e-06, + "loss": 0.7131, + "step": 6953 + }, + { + "epoch": 0.71, + "grad_norm": 1.6108363196318192, + "learning_rate": 4.165408363922337e-06, + "loss": 0.7833, + "step": 6954 + }, + { + "epoch": 0.71, + "grad_norm": 1.7994867885909294, + "learning_rate": 4.1627331134392e-06, + "loss": 0.6775, + "step": 6955 + }, + { + "epoch": 0.71, + "grad_norm": 1.7217488003033425, + "learning_rate": 4.160058496500103e-06, + "loss": 0.6403, + "step": 6956 + }, + { + "epoch": 0.71, + "grad_norm": 1.6425976302194905, + "learning_rate": 4.1573845133953275e-06, + "loss": 0.7085, + "step": 6957 + }, + { + "epoch": 0.71, + "grad_norm": 1.497892765596392, + "learning_rate": 4.154711164415094e-06, + "loss": 0.6671, + "step": 6958 + }, + { + "epoch": 0.71, + "grad_norm": 1.8054907860900105, + "learning_rate": 4.152038449849556e-06, + "loss": 0.8228, + "step": 6959 + }, + { + "epoch": 0.71, + "grad_norm": 1.562254776905117, + "learning_rate": 4.14936636998879e-06, + "loss": 0.6295, + "step": 6960 + }, + { + "epoch": 0.71, + "grad_norm": 1.7181351357516357, + "learning_rate": 4.14669492512281e-06, + "loss": 0.7395, + "step": 6961 + }, + { + "epoch": 0.71, + "grad_norm": 1.6343855996101109, + "learning_rate": 4.144024115541565e-06, + "loss": 0.7496, + "step": 6962 + }, + { + "epoch": 0.71, + "grad_norm": 1.6993064244870406, + "learning_rate": 4.14135394153492e-06, + "loss": 0.6616, + "step": 6963 + }, + { + "epoch": 0.71, + "grad_norm": 1.543490876092734, + "learning_rate": 4.138684403392688e-06, + "loss": 0.7636, + "step": 6964 + }, + { + "epoch": 0.71, + "grad_norm": 1.5772250311731004, + "learning_rate": 4.136015501404604e-06, + "loss": 0.6949, + "step": 6965 + }, + { + "epoch": 0.71, + "grad_norm": 2.0741991444368217, + "learning_rate": 4.133347235860333e-06, + "loss": 0.863, + "step": 6966 + }, + { + "epoch": 0.71, + "grad_norm": 1.647681598984127, + "learning_rate": 4.1306796070494755e-06, + "loss": 0.7291, + "step": 6967 + }, + { + "epoch": 0.71, + "grad_norm": 1.5520196505035508, + "learning_rate": 4.1280126152615644e-06, + "loss": 0.6729, + "step": 6968 + }, + { + "epoch": 0.71, + "grad_norm": 1.5718487268044565, + "learning_rate": 4.1253462607860515e-06, + "loss": 0.6882, + "step": 6969 + }, + { + "epoch": 0.71, + "grad_norm": 1.6006646987325581, + "learning_rate": 4.122680543912334e-06, + "loss": 0.6832, + "step": 6970 + }, + { + "epoch": 0.71, + "grad_norm": 1.5881086125681252, + "learning_rate": 4.120015464929735e-06, + "loss": 0.6927, + "step": 6971 + }, + { + "epoch": 0.71, + "grad_norm": 1.6140338625443162, + "learning_rate": 4.117351024127501e-06, + "loss": 0.7615, + "step": 6972 + }, + { + "epoch": 0.71, + "grad_norm": 1.7039718584416361, + "learning_rate": 4.11468722179482e-06, + "loss": 0.8646, + "step": 6973 + }, + { + "epoch": 0.71, + "grad_norm": 1.5917611835614012, + "learning_rate": 4.112024058220808e-06, + "loss": 0.6972, + "step": 6974 + }, + { + "epoch": 0.71, + "grad_norm": 1.740603652542512, + "learning_rate": 4.109361533694504e-06, + "loss": 0.7257, + "step": 6975 + }, + { + "epoch": 0.71, + "grad_norm": 1.7885656752367534, + "learning_rate": 4.10669964850489e-06, + "loss": 0.7949, + "step": 6976 + }, + { + "epoch": 0.71, + "grad_norm": 1.7294983851635601, + "learning_rate": 4.104038402940863e-06, + "loss": 0.7238, + "step": 6977 + }, + { + "epoch": 0.71, + "grad_norm": 1.6325619934130196, + "learning_rate": 4.101377797291265e-06, + "loss": 0.6057, + "step": 6978 + }, + { + "epoch": 0.71, + "grad_norm": 1.5344764459127072, + "learning_rate": 4.098717831844867e-06, + "loss": 0.6152, + "step": 6979 + }, + { + "epoch": 0.71, + "grad_norm": 1.6435855427908748, + "learning_rate": 4.09605850689036e-06, + "loss": 0.6682, + "step": 6980 + }, + { + "epoch": 0.71, + "grad_norm": 1.9189032336719265, + "learning_rate": 4.093399822716377e-06, + "loss": 0.6975, + "step": 6981 + }, + { + "epoch": 0.71, + "grad_norm": 1.6852666215395304, + "learning_rate": 4.090741779611472e-06, + "loss": 0.7259, + "step": 6982 + }, + { + "epoch": 0.71, + "grad_norm": 1.710324596376418, + "learning_rate": 4.088084377864136e-06, + "loss": 0.767, + "step": 6983 + }, + { + "epoch": 0.71, + "grad_norm": 1.6380525152065986, + "learning_rate": 4.085427617762792e-06, + "loss": 0.6968, + "step": 6984 + }, + { + "epoch": 0.71, + "grad_norm": 1.5433413494513588, + "learning_rate": 4.082771499595782e-06, + "loss": 0.6676, + "step": 6985 + }, + { + "epoch": 0.71, + "grad_norm": 1.779847131311914, + "learning_rate": 4.080116023651396e-06, + "loss": 0.6974, + "step": 6986 + }, + { + "epoch": 0.71, + "grad_norm": 1.5818748331761248, + "learning_rate": 4.077461190217835e-06, + "loss": 0.7561, + "step": 6987 + }, + { + "epoch": 0.71, + "grad_norm": 1.7205646792957754, + "learning_rate": 4.074806999583243e-06, + "loss": 0.7579, + "step": 6988 + }, + { + "epoch": 0.71, + "grad_norm": 1.503529710542259, + "learning_rate": 4.0721534520356955e-06, + "loss": 0.6626, + "step": 6989 + }, + { + "epoch": 0.71, + "grad_norm": 1.7433595830714752, + "learning_rate": 4.069500547863185e-06, + "loss": 0.7432, + "step": 6990 + }, + { + "epoch": 0.71, + "grad_norm": 1.5407275190849465, + "learning_rate": 4.06684828735365e-06, + "loss": 0.6634, + "step": 6991 + }, + { + "epoch": 0.71, + "grad_norm": 1.879585303039927, + "learning_rate": 4.0641966707949506e-06, + "loss": 0.7283, + "step": 6992 + }, + { + "epoch": 0.71, + "grad_norm": 1.6665885895090278, + "learning_rate": 4.061545698474875e-06, + "loss": 0.7186, + "step": 6993 + }, + { + "epoch": 0.71, + "grad_norm": 1.6482473801952147, + "learning_rate": 4.0588953706811485e-06, + "loss": 0.6727, + "step": 6994 + }, + { + "epoch": 0.71, + "grad_norm": 1.6151987819235434, + "learning_rate": 4.056245687701419e-06, + "loss": 0.6786, + "step": 6995 + }, + { + "epoch": 0.71, + "grad_norm": 1.723556396094163, + "learning_rate": 4.053596649823269e-06, + "loss": 0.6509, + "step": 6996 + }, + { + "epoch": 0.71, + "grad_norm": 1.7011356810748508, + "learning_rate": 4.050948257334215e-06, + "loss": 0.8039, + "step": 6997 + }, + { + "epoch": 0.71, + "grad_norm": 1.6895154305021753, + "learning_rate": 4.048300510521692e-06, + "loss": 0.6526, + "step": 6998 + }, + { + "epoch": 0.71, + "grad_norm": 1.9452574818201525, + "learning_rate": 4.045653409673079e-06, + "loss": 0.8085, + "step": 6999 + }, + { + "epoch": 0.71, + "grad_norm": 1.7609192721254838, + "learning_rate": 4.043006955075667e-06, + "loss": 0.7762, + "step": 7000 + }, + { + "epoch": 0.71, + "grad_norm": 1.7793784128833332, + "learning_rate": 4.040361147016693e-06, + "loss": 0.7615, + "step": 7001 + }, + { + "epoch": 0.71, + "grad_norm": 1.7238975070132547, + "learning_rate": 4.0377159857833235e-06, + "loss": 0.6991, + "step": 7002 + }, + { + "epoch": 0.71, + "grad_norm": 1.6611134034266273, + "learning_rate": 4.035071471662641e-06, + "loss": 0.7009, + "step": 7003 + }, + { + "epoch": 0.71, + "grad_norm": 1.7967481323766012, + "learning_rate": 4.032427604941671e-06, + "loss": 0.6879, + "step": 7004 + }, + { + "epoch": 0.71, + "grad_norm": 1.6103088335675344, + "learning_rate": 4.029784385907359e-06, + "loss": 0.6998, + "step": 7005 + }, + { + "epoch": 0.71, + "grad_norm": 1.4755229277285078, + "learning_rate": 4.027141814846588e-06, + "loss": 0.6892, + "step": 7006 + }, + { + "epoch": 0.71, + "grad_norm": 1.7663439626959418, + "learning_rate": 4.024499892046172e-06, + "loss": 0.6752, + "step": 7007 + }, + { + "epoch": 0.71, + "grad_norm": 1.6168490704977603, + "learning_rate": 4.021858617792843e-06, + "loss": 0.6625, + "step": 7008 + }, + { + "epoch": 0.71, + "grad_norm": 1.7391448458975851, + "learning_rate": 4.019217992373277e-06, + "loss": 0.7316, + "step": 7009 + }, + { + "epoch": 0.71, + "grad_norm": 1.7057666540441399, + "learning_rate": 4.016578016074064e-06, + "loss": 0.7041, + "step": 7010 + }, + { + "epoch": 0.71, + "grad_norm": 1.6059359526383257, + "learning_rate": 4.0139386891817376e-06, + "loss": 0.6557, + "step": 7011 + }, + { + "epoch": 0.71, + "grad_norm": 1.773911345778964, + "learning_rate": 4.011300011982759e-06, + "loss": 0.7215, + "step": 7012 + }, + { + "epoch": 0.71, + "grad_norm": 1.4984640233051176, + "learning_rate": 4.0086619847635075e-06, + "loss": 0.6175, + "step": 7013 + }, + { + "epoch": 0.71, + "grad_norm": 1.7100583414353592, + "learning_rate": 4.006024607810304e-06, + "loss": 0.7669, + "step": 7014 + }, + { + "epoch": 0.71, + "grad_norm": 1.4668525303618518, + "learning_rate": 4.003387881409397e-06, + "loss": 0.5917, + "step": 7015 + }, + { + "epoch": 0.71, + "grad_norm": 1.4201488465349763, + "learning_rate": 4.000751805846956e-06, + "loss": 0.6535, + "step": 7016 + }, + { + "epoch": 0.71, + "grad_norm": 1.7988283699217553, + "learning_rate": 3.998116381409091e-06, + "loss": 0.7743, + "step": 7017 + }, + { + "epoch": 0.71, + "grad_norm": 1.761206047677621, + "learning_rate": 3.995481608381833e-06, + "loss": 0.7344, + "step": 7018 + }, + { + "epoch": 0.71, + "grad_norm": 1.6422469485706646, + "learning_rate": 3.992847487051144e-06, + "loss": 0.7581, + "step": 7019 + }, + { + "epoch": 0.71, + "grad_norm": 1.625424783824051, + "learning_rate": 3.990214017702923e-06, + "loss": 0.707, + "step": 7020 + }, + { + "epoch": 0.71, + "grad_norm": 1.456008490060363, + "learning_rate": 3.987581200622984e-06, + "loss": 0.6372, + "step": 7021 + }, + { + "epoch": 0.71, + "grad_norm": 1.7653640607260257, + "learning_rate": 3.984949036097085e-06, + "loss": 0.7238, + "step": 7022 + }, + { + "epoch": 0.71, + "grad_norm": 1.70968000846081, + "learning_rate": 3.982317524410901e-06, + "loss": 0.6212, + "step": 7023 + }, + { + "epoch": 0.71, + "grad_norm": 1.7276547862614866, + "learning_rate": 3.979686665850043e-06, + "loss": 0.7209, + "step": 7024 + }, + { + "epoch": 0.71, + "grad_norm": 1.4841760329433793, + "learning_rate": 3.977056460700054e-06, + "loss": 0.8146, + "step": 7025 + }, + { + "epoch": 0.71, + "grad_norm": 1.7281365965194087, + "learning_rate": 3.974426909246395e-06, + "loss": 0.6069, + "step": 7026 + }, + { + "epoch": 0.71, + "grad_norm": 1.7366179797813863, + "learning_rate": 3.971798011774468e-06, + "loss": 0.681, + "step": 7027 + }, + { + "epoch": 0.71, + "grad_norm": 1.7665892061525008, + "learning_rate": 3.969169768569593e-06, + "loss": 0.6869, + "step": 7028 + }, + { + "epoch": 0.71, + "grad_norm": 1.6786218317981734, + "learning_rate": 3.966542179917028e-06, + "loss": 0.6813, + "step": 7029 + }, + { + "epoch": 0.72, + "grad_norm": 1.4711116283729369, + "learning_rate": 3.963915246101961e-06, + "loss": 0.5982, + "step": 7030 + }, + { + "epoch": 0.72, + "grad_norm": 1.6434577385828204, + "learning_rate": 3.9612889674094955e-06, + "loss": 0.7025, + "step": 7031 + }, + { + "epoch": 0.72, + "grad_norm": 1.5554711330154485, + "learning_rate": 3.958663344124678e-06, + "loss": 0.6731, + "step": 7032 + }, + { + "epoch": 0.72, + "grad_norm": 1.6676558574806524, + "learning_rate": 3.956038376532482e-06, + "loss": 0.6567, + "step": 7033 + }, + { + "epoch": 0.72, + "grad_norm": 1.7124209469173053, + "learning_rate": 3.9534140649177995e-06, + "loss": 0.7216, + "step": 7034 + }, + { + "epoch": 0.72, + "grad_norm": 1.7075703905836943, + "learning_rate": 3.950790409565467e-06, + "loss": 0.7102, + "step": 7035 + }, + { + "epoch": 0.72, + "grad_norm": 1.5417142310579948, + "learning_rate": 3.948167410760232e-06, + "loss": 0.664, + "step": 7036 + }, + { + "epoch": 0.72, + "grad_norm": 1.461830993842134, + "learning_rate": 3.9455450687867845e-06, + "loss": 0.6277, + "step": 7037 + }, + { + "epoch": 0.72, + "grad_norm": 1.8940398474818356, + "learning_rate": 3.942923383929743e-06, + "loss": 0.7572, + "step": 7038 + }, + { + "epoch": 0.72, + "grad_norm": 1.9355738539011664, + "learning_rate": 3.940302356473642e-06, + "loss": 0.7879, + "step": 7039 + }, + { + "epoch": 0.72, + "grad_norm": 1.687447276435726, + "learning_rate": 3.93768198670296e-06, + "loss": 0.6419, + "step": 7040 + }, + { + "epoch": 0.72, + "grad_norm": 1.5842593327172416, + "learning_rate": 3.935062274902092e-06, + "loss": 0.7642, + "step": 7041 + }, + { + "epoch": 0.72, + "grad_norm": 1.5795026594057051, + "learning_rate": 3.932443221355369e-06, + "loss": 0.6418, + "step": 7042 + }, + { + "epoch": 0.72, + "grad_norm": 1.5554013080977451, + "learning_rate": 3.929824826347051e-06, + "loss": 0.6573, + "step": 7043 + }, + { + "epoch": 0.72, + "grad_norm": 1.5958131006786735, + "learning_rate": 3.927207090161318e-06, + "loss": 0.648, + "step": 7044 + }, + { + "epoch": 0.72, + "grad_norm": 1.4619791680331131, + "learning_rate": 3.9245900130822915e-06, + "loss": 0.6172, + "step": 7045 + }, + { + "epoch": 0.72, + "grad_norm": 1.751601619237473, + "learning_rate": 3.921973595394005e-06, + "loss": 0.711, + "step": 7046 + }, + { + "epoch": 0.72, + "grad_norm": 1.7066907412184105, + "learning_rate": 3.919357837380437e-06, + "loss": 0.7086, + "step": 7047 + }, + { + "epoch": 0.72, + "grad_norm": 1.573108845705456, + "learning_rate": 3.916742739325488e-06, + "loss": 0.7856, + "step": 7048 + }, + { + "epoch": 0.72, + "grad_norm": 1.6505398116251775, + "learning_rate": 3.91412830151298e-06, + "loss": 0.6768, + "step": 7049 + }, + { + "epoch": 0.72, + "grad_norm": 1.565935148019075, + "learning_rate": 3.9115145242266714e-06, + "loss": 0.7227, + "step": 7050 + }, + { + "epoch": 0.72, + "grad_norm": 1.7356480172771545, + "learning_rate": 3.9089014077502515e-06, + "loss": 0.7685, + "step": 7051 + }, + { + "epoch": 0.72, + "grad_norm": 1.6549946772587585, + "learning_rate": 3.906288952367328e-06, + "loss": 0.7455, + "step": 7052 + }, + { + "epoch": 0.72, + "grad_norm": 1.6271433182157338, + "learning_rate": 3.903677158361445e-06, + "loss": 0.6842, + "step": 7053 + }, + { + "epoch": 0.72, + "grad_norm": 1.5938354446085299, + "learning_rate": 3.901066026016068e-06, + "loss": 0.5795, + "step": 7054 + }, + { + "epoch": 0.72, + "grad_norm": 1.612843481164936, + "learning_rate": 3.898455555614598e-06, + "loss": 0.7869, + "step": 7055 + }, + { + "epoch": 0.72, + "grad_norm": 1.514332800714335, + "learning_rate": 3.895845747440363e-06, + "loss": 0.6233, + "step": 7056 + }, + { + "epoch": 0.72, + "grad_norm": 1.7666438214701161, + "learning_rate": 3.89323660177661e-06, + "loss": 0.7276, + "step": 7057 + }, + { + "epoch": 0.72, + "grad_norm": 1.7673801040896533, + "learning_rate": 3.890628118906525e-06, + "loss": 0.6719, + "step": 7058 + }, + { + "epoch": 0.72, + "grad_norm": 1.618196212982791, + "learning_rate": 3.888020299113221e-06, + "loss": 0.6988, + "step": 7059 + }, + { + "epoch": 0.72, + "grad_norm": 1.6641889014520415, + "learning_rate": 3.885413142679729e-06, + "loss": 0.6884, + "step": 7060 + }, + { + "epoch": 0.72, + "grad_norm": 1.5790124585813679, + "learning_rate": 3.882806649889024e-06, + "loss": 0.7119, + "step": 7061 + }, + { + "epoch": 0.72, + "grad_norm": 1.5846068340561161, + "learning_rate": 3.8802008210239906e-06, + "loss": 0.6524, + "step": 7062 + }, + { + "epoch": 0.72, + "grad_norm": 1.519531275732951, + "learning_rate": 3.877595656367454e-06, + "loss": 0.638, + "step": 7063 + }, + { + "epoch": 0.72, + "grad_norm": 1.5115523763776504, + "learning_rate": 3.8749911562021705e-06, + "loss": 0.6822, + "step": 7064 + }, + { + "epoch": 0.72, + "grad_norm": 1.764434489462425, + "learning_rate": 3.872387320810807e-06, + "loss": 0.7642, + "step": 7065 + }, + { + "epoch": 0.72, + "grad_norm": 1.5793773972876426, + "learning_rate": 3.869784150475977e-06, + "loss": 0.6887, + "step": 7066 + }, + { + "epoch": 0.72, + "grad_norm": 1.7045250995664494, + "learning_rate": 3.867181645480214e-06, + "loss": 0.676, + "step": 7067 + }, + { + "epoch": 0.72, + "grad_norm": 1.5399538449783368, + "learning_rate": 3.864579806105974e-06, + "loss": 0.6566, + "step": 7068 + }, + { + "epoch": 0.72, + "grad_norm": 1.6457773798036648, + "learning_rate": 3.861978632635648e-06, + "loss": 0.6843, + "step": 7069 + }, + { + "epoch": 0.72, + "grad_norm": 1.5939853478787616, + "learning_rate": 3.859378125351556e-06, + "loss": 0.7579, + "step": 7070 + }, + { + "epoch": 0.72, + "grad_norm": 1.8740643030417445, + "learning_rate": 3.856778284535938e-06, + "loss": 0.7391, + "step": 7071 + }, + { + "epoch": 0.72, + "grad_norm": 1.6024680692373516, + "learning_rate": 3.854179110470968e-06, + "loss": 0.7591, + "step": 7072 + }, + { + "epoch": 0.72, + "grad_norm": 1.8009607749236867, + "learning_rate": 3.851580603438748e-06, + "loss": 0.8274, + "step": 7073 + }, + { + "epoch": 0.72, + "grad_norm": 1.6075153854514934, + "learning_rate": 3.8489827637213e-06, + "loss": 0.7427, + "step": 7074 + }, + { + "epoch": 0.72, + "grad_norm": 1.764959273544147, + "learning_rate": 3.846385591600582e-06, + "loss": 0.7193, + "step": 7075 + }, + { + "epoch": 0.72, + "grad_norm": 1.5309895025303242, + "learning_rate": 3.843789087358481e-06, + "loss": 0.6079, + "step": 7076 + }, + { + "epoch": 0.72, + "grad_norm": 1.7067092529866092, + "learning_rate": 3.841193251276797e-06, + "loss": 0.6646, + "step": 7077 + }, + { + "epoch": 0.72, + "grad_norm": 1.54944657886839, + "learning_rate": 3.838598083637273e-06, + "loss": 0.598, + "step": 7078 + }, + { + "epoch": 0.72, + "grad_norm": 1.7348919530284566, + "learning_rate": 3.836003584721577e-06, + "loss": 0.7491, + "step": 7079 + }, + { + "epoch": 0.72, + "grad_norm": 1.404024633603411, + "learning_rate": 3.833409754811295e-06, + "loss": 0.6814, + "step": 7080 + }, + { + "epoch": 0.72, + "grad_norm": 1.555957462060979, + "learning_rate": 3.830816594187951e-06, + "loss": 0.748, + "step": 7081 + }, + { + "epoch": 0.72, + "grad_norm": 1.7083144056582373, + "learning_rate": 3.828224103132989e-06, + "loss": 0.6971, + "step": 7082 + }, + { + "epoch": 0.72, + "grad_norm": 1.4838984338133978, + "learning_rate": 3.825632281927784e-06, + "loss": 0.6188, + "step": 7083 + }, + { + "epoch": 0.72, + "grad_norm": 1.6210945937368182, + "learning_rate": 3.823041130853642e-06, + "loss": 0.7154, + "step": 7084 + }, + { + "epoch": 0.72, + "grad_norm": 1.6502752504544569, + "learning_rate": 3.820450650191785e-06, + "loss": 0.609, + "step": 7085 + }, + { + "epoch": 0.72, + "grad_norm": 1.8119556281486937, + "learning_rate": 3.817860840223373e-06, + "loss": 0.6798, + "step": 7086 + }, + { + "epoch": 0.72, + "grad_norm": 1.6083606169328408, + "learning_rate": 3.815271701229491e-06, + "loss": 0.6415, + "step": 7087 + }, + { + "epoch": 0.72, + "grad_norm": 1.7234006915737945, + "learning_rate": 3.8126832334911457e-06, + "loss": 0.7479, + "step": 7088 + }, + { + "epoch": 0.72, + "grad_norm": 1.7823470873162353, + "learning_rate": 3.8100954372892796e-06, + "loss": 0.7111, + "step": 7089 + }, + { + "epoch": 0.72, + "grad_norm": 1.54738129409935, + "learning_rate": 3.8075083129047507e-06, + "loss": 0.6821, + "step": 7090 + }, + { + "epoch": 0.72, + "grad_norm": 1.6814801668504775, + "learning_rate": 3.8049218606183547e-06, + "loss": 0.616, + "step": 7091 + }, + { + "epoch": 0.72, + "grad_norm": 1.8021188952408784, + "learning_rate": 3.8023360807108135e-06, + "loss": 0.7924, + "step": 7092 + }, + { + "epoch": 0.72, + "grad_norm": 1.743377592740335, + "learning_rate": 3.799750973462768e-06, + "loss": 0.7971, + "step": 7093 + }, + { + "epoch": 0.72, + "grad_norm": 1.6937420380740584, + "learning_rate": 3.797166539154795e-06, + "loss": 0.6509, + "step": 7094 + }, + { + "epoch": 0.72, + "grad_norm": 1.6321361424032252, + "learning_rate": 3.7945827780673904e-06, + "loss": 0.6793, + "step": 7095 + }, + { + "epoch": 0.72, + "grad_norm": 1.603854582495709, + "learning_rate": 3.791999690480984e-06, + "loss": 0.6254, + "step": 7096 + }, + { + "epoch": 0.72, + "grad_norm": 1.593723128503487, + "learning_rate": 3.7894172766759307e-06, + "loss": 0.6594, + "step": 7097 + }, + { + "epoch": 0.72, + "grad_norm": 1.5564792340475972, + "learning_rate": 3.786835536932507e-06, + "loss": 0.6598, + "step": 7098 + }, + { + "epoch": 0.72, + "grad_norm": 1.5831785482522764, + "learning_rate": 3.784254471530926e-06, + "loss": 0.7586, + "step": 7099 + }, + { + "epoch": 0.72, + "grad_norm": 1.7443724076857514, + "learning_rate": 3.7816740807513154e-06, + "loss": 0.6868, + "step": 7100 + }, + { + "epoch": 0.72, + "grad_norm": 1.509927809623996, + "learning_rate": 3.7790943648737397e-06, + "loss": 0.6671, + "step": 7101 + }, + { + "epoch": 0.72, + "grad_norm": 1.725056179937763, + "learning_rate": 3.7765153241781893e-06, + "loss": 0.7384, + "step": 7102 + }, + { + "epoch": 0.72, + "grad_norm": 1.6151974285444082, + "learning_rate": 3.773936958944574e-06, + "loss": 0.649, + "step": 7103 + }, + { + "epoch": 0.72, + "grad_norm": 1.6428686899132872, + "learning_rate": 3.7713592694527357e-06, + "loss": 0.6798, + "step": 7104 + }, + { + "epoch": 0.72, + "grad_norm": 1.5815525541499031, + "learning_rate": 3.768782255982447e-06, + "loss": 0.6567, + "step": 7105 + }, + { + "epoch": 0.72, + "grad_norm": 1.693255150657801, + "learning_rate": 3.766205918813397e-06, + "loss": 0.6504, + "step": 7106 + }, + { + "epoch": 0.72, + "grad_norm": 1.7690595597513055, + "learning_rate": 3.7636302582252114e-06, + "loss": 0.7456, + "step": 7107 + }, + { + "epoch": 0.72, + "grad_norm": 1.584515931711388, + "learning_rate": 3.761055274497433e-06, + "loss": 0.6839, + "step": 7108 + }, + { + "epoch": 0.72, + "grad_norm": 1.5816360333856814, + "learning_rate": 3.758480967909537e-06, + "loss": 0.6714, + "step": 7109 + }, + { + "epoch": 0.72, + "grad_norm": 1.566909665599923, + "learning_rate": 3.7559073387409286e-06, + "loss": 0.7095, + "step": 7110 + }, + { + "epoch": 0.72, + "grad_norm": 1.4996558972408467, + "learning_rate": 3.7533343872709295e-06, + "loss": 0.7148, + "step": 7111 + }, + { + "epoch": 0.72, + "grad_norm": 1.7782962742990462, + "learning_rate": 3.7507621137787988e-06, + "loss": 0.7736, + "step": 7112 + }, + { + "epoch": 0.72, + "grad_norm": 1.5960500136687246, + "learning_rate": 3.7481905185437105e-06, + "loss": 0.7787, + "step": 7113 + }, + { + "epoch": 0.72, + "grad_norm": 1.606927467079132, + "learning_rate": 3.745619601844773e-06, + "loss": 0.6923, + "step": 7114 + }, + { + "epoch": 0.72, + "grad_norm": 1.455797703919836, + "learning_rate": 3.7430493639610245e-06, + "loss": 0.6307, + "step": 7115 + }, + { + "epoch": 0.72, + "grad_norm": 1.5626622983287617, + "learning_rate": 3.7404798051714165e-06, + "loss": 0.695, + "step": 7116 + }, + { + "epoch": 0.72, + "grad_norm": 1.6079084286137084, + "learning_rate": 3.7379109257548406e-06, + "loss": 0.7181, + "step": 7117 + }, + { + "epoch": 0.72, + "grad_norm": 1.4823136615497665, + "learning_rate": 3.7353427259901022e-06, + "loss": 0.7078, + "step": 7118 + }, + { + "epoch": 0.72, + "grad_norm": 1.6497796267623495, + "learning_rate": 3.732775206155943e-06, + "loss": 0.7823, + "step": 7119 + }, + { + "epoch": 0.72, + "grad_norm": 1.5963666093680742, + "learning_rate": 3.7302083665310295e-06, + "loss": 0.5863, + "step": 7120 + }, + { + "epoch": 0.72, + "grad_norm": 1.7135779255987058, + "learning_rate": 3.7276422073939466e-06, + "loss": 0.7041, + "step": 7121 + }, + { + "epoch": 0.72, + "grad_norm": 1.729275030874236, + "learning_rate": 3.7250767290232124e-06, + "loss": 0.7154, + "step": 7122 + }, + { + "epoch": 0.72, + "grad_norm": 1.5755392311483456, + "learning_rate": 3.7225119316972747e-06, + "loss": 0.6911, + "step": 7123 + }, + { + "epoch": 0.72, + "grad_norm": 1.663429556378475, + "learning_rate": 3.7199478156944947e-06, + "loss": 0.6331, + "step": 7124 + }, + { + "epoch": 0.72, + "grad_norm": 1.711823333115215, + "learning_rate": 3.717384381293173e-06, + "loss": 0.7237, + "step": 7125 + }, + { + "epoch": 0.72, + "grad_norm": 1.6672270773065978, + "learning_rate": 3.7148216287715243e-06, + "loss": 0.7352, + "step": 7126 + }, + { + "epoch": 0.72, + "grad_norm": 1.8010218071305932, + "learning_rate": 3.712259558407698e-06, + "loss": 0.638, + "step": 7127 + }, + { + "epoch": 0.73, + "grad_norm": 1.7201988979672762, + "learning_rate": 3.709698170479772e-06, + "loss": 0.6107, + "step": 7128 + }, + { + "epoch": 0.73, + "grad_norm": 1.7275487216334133, + "learning_rate": 3.7071374652657366e-06, + "loss": 0.8083, + "step": 7129 + }, + { + "epoch": 0.73, + "grad_norm": 1.6241632422864194, + "learning_rate": 3.704577443043523e-06, + "loss": 0.6765, + "step": 7130 + }, + { + "epoch": 0.73, + "grad_norm": 1.5499977286232118, + "learning_rate": 3.7020181040909755e-06, + "loss": 0.6343, + "step": 7131 + }, + { + "epoch": 0.73, + "grad_norm": 1.6018603460961796, + "learning_rate": 3.6994594486858724e-06, + "loss": 0.6779, + "step": 7132 + }, + { + "epoch": 0.73, + "grad_norm": 1.6908374645330553, + "learning_rate": 3.696901477105921e-06, + "loss": 0.6236, + "step": 7133 + }, + { + "epoch": 0.73, + "grad_norm": 1.521919889461445, + "learning_rate": 3.694344189628741e-06, + "loss": 0.6738, + "step": 7134 + }, + { + "epoch": 0.73, + "grad_norm": 1.7491997799433427, + "learning_rate": 3.6917875865318944e-06, + "loss": 0.6773, + "step": 7135 + }, + { + "epoch": 0.73, + "grad_norm": 1.6868041845930088, + "learning_rate": 3.6892316680928508e-06, + "loss": 0.6554, + "step": 7136 + }, + { + "epoch": 0.73, + "grad_norm": 1.556084069113492, + "learning_rate": 3.6866764345890217e-06, + "loss": 0.6641, + "step": 7137 + }, + { + "epoch": 0.73, + "grad_norm": 1.4191396117041615, + "learning_rate": 3.684121886297739e-06, + "loss": 0.6768, + "step": 7138 + }, + { + "epoch": 0.73, + "grad_norm": 1.6062587399945396, + "learning_rate": 3.6815680234962527e-06, + "loss": 0.5832, + "step": 7139 + }, + { + "epoch": 0.73, + "grad_norm": 1.7773274881370058, + "learning_rate": 3.6790148464617483e-06, + "loss": 0.7483, + "step": 7140 + }, + { + "epoch": 0.73, + "grad_norm": 1.7298207064959892, + "learning_rate": 3.6764623554713364e-06, + "loss": 0.7185, + "step": 7141 + }, + { + "epoch": 0.73, + "grad_norm": 1.802729381434695, + "learning_rate": 3.6739105508020433e-06, + "loss": 0.7059, + "step": 7142 + }, + { + "epoch": 0.73, + "grad_norm": 1.55814796727082, + "learning_rate": 3.6713594327308343e-06, + "loss": 0.6882, + "step": 7143 + }, + { + "epoch": 0.73, + "grad_norm": 1.9841458785651938, + "learning_rate": 3.6688090015345878e-06, + "loss": 0.7185, + "step": 7144 + }, + { + "epoch": 0.73, + "grad_norm": 1.5236445095995443, + "learning_rate": 3.6662592574901147e-06, + "loss": 0.763, + "step": 7145 + }, + { + "epoch": 0.73, + "grad_norm": 1.667091898622156, + "learning_rate": 3.663710200874153e-06, + "loss": 0.6871, + "step": 7146 + }, + { + "epoch": 0.73, + "grad_norm": 1.640283370489685, + "learning_rate": 3.661161831963358e-06, + "loss": 0.6242, + "step": 7147 + }, + { + "epoch": 0.73, + "grad_norm": 1.5959959058927706, + "learning_rate": 3.658614151034321e-06, + "loss": 0.7617, + "step": 7148 + }, + { + "epoch": 0.73, + "grad_norm": 1.7141601341216794, + "learning_rate": 3.6560671583635467e-06, + "loss": 0.6534, + "step": 7149 + }, + { + "epoch": 0.73, + "grad_norm": 1.7300259568139287, + "learning_rate": 3.653520854227475e-06, + "loss": 0.7433, + "step": 7150 + }, + { + "epoch": 0.73, + "grad_norm": 1.5532886686274163, + "learning_rate": 3.6509752389024687e-06, + "loss": 0.579, + "step": 7151 + }, + { + "epoch": 0.73, + "grad_norm": 1.6833266687520791, + "learning_rate": 3.648430312664809e-06, + "loss": 0.6755, + "step": 7152 + }, + { + "epoch": 0.73, + "grad_norm": 1.5564738321146763, + "learning_rate": 3.6458860757907155e-06, + "loss": 0.6367, + "step": 7153 + }, + { + "epoch": 0.73, + "grad_norm": 1.7105058056686722, + "learning_rate": 3.6433425285563186e-06, + "loss": 0.6857, + "step": 7154 + }, + { + "epoch": 0.73, + "grad_norm": 1.6487205971867094, + "learning_rate": 3.6407996712376826e-06, + "loss": 0.6234, + "step": 7155 + }, + { + "epoch": 0.73, + "grad_norm": 1.8336973044214955, + "learning_rate": 3.6382575041107982e-06, + "loss": 0.735, + "step": 7156 + }, + { + "epoch": 0.73, + "grad_norm": 1.6426974325555639, + "learning_rate": 3.6357160274515723e-06, + "loss": 0.6363, + "step": 7157 + }, + { + "epoch": 0.73, + "grad_norm": 1.5300249782218753, + "learning_rate": 3.633175241535849e-06, + "loss": 0.6511, + "step": 7158 + }, + { + "epoch": 0.73, + "grad_norm": 1.6712026210826763, + "learning_rate": 3.630635146639384e-06, + "loss": 0.7023, + "step": 7159 + }, + { + "epoch": 0.73, + "grad_norm": 1.5512216505261869, + "learning_rate": 3.6280957430378683e-06, + "loss": 0.6601, + "step": 7160 + }, + { + "epoch": 0.73, + "grad_norm": 1.6199132742303428, + "learning_rate": 3.6255570310069166e-06, + "loss": 0.6928, + "step": 7161 + }, + { + "epoch": 0.73, + "grad_norm": 1.7001832013414149, + "learning_rate": 3.6230190108220607e-06, + "loss": 0.7222, + "step": 7162 + }, + { + "epoch": 0.73, + "grad_norm": 1.5664502892848486, + "learning_rate": 3.6204816827587676e-06, + "loss": 0.6602, + "step": 7163 + }, + { + "epoch": 0.73, + "grad_norm": 1.7106532424063383, + "learning_rate": 3.617945047092426e-06, + "loss": 0.6976, + "step": 7164 + }, + { + "epoch": 0.73, + "grad_norm": 1.5093380311405362, + "learning_rate": 3.615409104098342e-06, + "loss": 0.5972, + "step": 7165 + }, + { + "epoch": 0.73, + "grad_norm": 1.6008320342553601, + "learning_rate": 3.612873854051759e-06, + "loss": 0.6195, + "step": 7166 + }, + { + "epoch": 0.73, + "grad_norm": 1.65846303644904, + "learning_rate": 3.610339297227833e-06, + "loss": 0.7601, + "step": 7167 + }, + { + "epoch": 0.73, + "grad_norm": 1.6312980744454015, + "learning_rate": 3.6078054339016523e-06, + "loss": 0.7859, + "step": 7168 + }, + { + "epoch": 0.73, + "grad_norm": 1.6668920120325388, + "learning_rate": 3.605272264348233e-06, + "loss": 0.697, + "step": 7169 + }, + { + "epoch": 0.73, + "grad_norm": 1.581682435271611, + "learning_rate": 3.602739788842503e-06, + "loss": 0.6864, + "step": 7170 + }, + { + "epoch": 0.73, + "grad_norm": 1.622319941173912, + "learning_rate": 3.6002080076593273e-06, + "loss": 0.732, + "step": 7171 + }, + { + "epoch": 0.73, + "grad_norm": 1.6851021641033066, + "learning_rate": 3.597676921073494e-06, + "loss": 0.7444, + "step": 7172 + }, + { + "epoch": 0.73, + "grad_norm": 1.7772998980522972, + "learning_rate": 3.5951465293597067e-06, + "loss": 0.7153, + "step": 7173 + }, + { + "epoch": 0.73, + "grad_norm": 1.7090383637910858, + "learning_rate": 3.5926168327926037e-06, + "loss": 0.7717, + "step": 7174 + }, + { + "epoch": 0.73, + "grad_norm": 1.5283316067335495, + "learning_rate": 3.590087831646746e-06, + "loss": 0.5902, + "step": 7175 + }, + { + "epoch": 0.73, + "grad_norm": 1.70593382715811, + "learning_rate": 3.58755952619661e-06, + "loss": 0.7558, + "step": 7176 + }, + { + "epoch": 0.73, + "grad_norm": 1.4769021537695464, + "learning_rate": 3.585031916716609e-06, + "loss": 0.5626, + "step": 7177 + }, + { + "epoch": 0.73, + "grad_norm": 1.5193009552710477, + "learning_rate": 3.5825050034810783e-06, + "loss": 0.6463, + "step": 7178 + }, + { + "epoch": 0.73, + "grad_norm": 1.5464268469172968, + "learning_rate": 3.5799787867642668e-06, + "loss": 0.6902, + "step": 7179 + }, + { + "epoch": 0.73, + "grad_norm": 1.9070860927402575, + "learning_rate": 3.5774532668403606e-06, + "loss": 0.8036, + "step": 7180 + }, + { + "epoch": 0.73, + "grad_norm": 1.677251422282328, + "learning_rate": 3.5749284439834686e-06, + "loss": 0.7145, + "step": 7181 + }, + { + "epoch": 0.73, + "grad_norm": 1.5070874738175213, + "learning_rate": 3.5724043184676127e-06, + "loss": 0.6111, + "step": 7182 + }, + { + "epoch": 0.73, + "grad_norm": 1.3679124163375487, + "learning_rate": 3.5698808905667525e-06, + "loss": 0.6639, + "step": 7183 + }, + { + "epoch": 0.73, + "grad_norm": 1.6294192752826915, + "learning_rate": 3.567358160554769e-06, + "loss": 0.6482, + "step": 7184 + }, + { + "epoch": 0.73, + "grad_norm": 1.6115377083728697, + "learning_rate": 3.564836128705459e-06, + "loss": 0.6533, + "step": 7185 + }, + { + "epoch": 0.73, + "grad_norm": 1.9367670221869318, + "learning_rate": 3.562314795292552e-06, + "loss": 0.6958, + "step": 7186 + }, + { + "epoch": 0.73, + "grad_norm": 1.5439196900733712, + "learning_rate": 3.5597941605897037e-06, + "loss": 0.6759, + "step": 7187 + }, + { + "epoch": 0.73, + "grad_norm": 1.8954483496019165, + "learning_rate": 3.5572742248704817e-06, + "loss": 0.702, + "step": 7188 + }, + { + "epoch": 0.73, + "grad_norm": 1.5821038172561497, + "learning_rate": 3.554754988408393e-06, + "loss": 0.5861, + "step": 7189 + }, + { + "epoch": 0.73, + "grad_norm": 1.6938894940828406, + "learning_rate": 3.552236451476855e-06, + "loss": 0.7871, + "step": 7190 + }, + { + "epoch": 0.73, + "grad_norm": 1.614556870374976, + "learning_rate": 3.5497186143492178e-06, + "loss": 0.7146, + "step": 7191 + }, + { + "epoch": 0.73, + "grad_norm": 1.5828409470899927, + "learning_rate": 3.5472014772987573e-06, + "loss": 0.7527, + "step": 7192 + }, + { + "epoch": 0.73, + "grad_norm": 1.6435787132195512, + "learning_rate": 3.544685040598663e-06, + "loss": 0.6479, + "step": 7193 + }, + { + "epoch": 0.73, + "grad_norm": 1.7080262144710854, + "learning_rate": 3.5421693045220617e-06, + "loss": 0.6583, + "step": 7194 + }, + { + "epoch": 0.73, + "grad_norm": 1.6662616193281345, + "learning_rate": 3.539654269341989e-06, + "loss": 0.7191, + "step": 7195 + }, + { + "epoch": 0.73, + "grad_norm": 1.7599616953319472, + "learning_rate": 3.5371399353314174e-06, + "loss": 0.659, + "step": 7196 + }, + { + "epoch": 0.73, + "grad_norm": 1.5910727310443404, + "learning_rate": 3.534626302763241e-06, + "loss": 0.6978, + "step": 7197 + }, + { + "epoch": 0.73, + "grad_norm": 1.7541929338966649, + "learning_rate": 3.5321133719102685e-06, + "loss": 0.665, + "step": 7198 + }, + { + "epoch": 0.73, + "grad_norm": 1.563447514004173, + "learning_rate": 3.529601143045244e-06, + "loss": 0.5738, + "step": 7199 + }, + { + "epoch": 0.73, + "grad_norm": 1.6272858485757566, + "learning_rate": 3.527089616440832e-06, + "loss": 0.6693, + "step": 7200 + }, + { + "epoch": 0.73, + "grad_norm": 1.6767842031362263, + "learning_rate": 3.524578792369614e-06, + "loss": 0.6784, + "step": 7201 + }, + { + "epoch": 0.73, + "grad_norm": 1.741601124837262, + "learning_rate": 3.5220686711041085e-06, + "loss": 0.6576, + "step": 7202 + }, + { + "epoch": 0.73, + "grad_norm": 1.6554235824440988, + "learning_rate": 3.51955925291674e-06, + "loss": 0.7596, + "step": 7203 + }, + { + "epoch": 0.73, + "grad_norm": 1.8480889756347079, + "learning_rate": 3.5170505380798736e-06, + "loss": 0.7017, + "step": 7204 + }, + { + "epoch": 0.73, + "grad_norm": 1.7918181666049604, + "learning_rate": 3.5145425268657916e-06, + "loss": 0.7029, + "step": 7205 + }, + { + "epoch": 0.73, + "grad_norm": 1.790400059060796, + "learning_rate": 3.5120352195466932e-06, + "loss": 0.6969, + "step": 7206 + }, + { + "epoch": 0.73, + "grad_norm": 1.6471726588298956, + "learning_rate": 3.509528616394716e-06, + "loss": 0.6233, + "step": 7207 + }, + { + "epoch": 0.73, + "grad_norm": 1.4980354612253914, + "learning_rate": 3.5070227176819036e-06, + "loss": 0.7079, + "step": 7208 + }, + { + "epoch": 0.73, + "grad_norm": 1.570210483481177, + "learning_rate": 3.5045175236802363e-06, + "loss": 0.7148, + "step": 7209 + }, + { + "epoch": 0.73, + "grad_norm": 1.553566799177608, + "learning_rate": 3.502013034661619e-06, + "loss": 0.6731, + "step": 7210 + }, + { + "epoch": 0.73, + "grad_norm": 1.8000474948425484, + "learning_rate": 3.4995092508978647e-06, + "loss": 0.7375, + "step": 7211 + }, + { + "epoch": 0.73, + "grad_norm": 1.6913024060790467, + "learning_rate": 3.4970061726607273e-06, + "loss": 0.6878, + "step": 7212 + }, + { + "epoch": 0.73, + "grad_norm": 1.7068333067586372, + "learning_rate": 3.494503800221872e-06, + "loss": 0.71, + "step": 7213 + }, + { + "epoch": 0.73, + "grad_norm": 1.817682464669328, + "learning_rate": 3.4920021338528955e-06, + "loss": 0.672, + "step": 7214 + }, + { + "epoch": 0.73, + "grad_norm": 1.6311735296816936, + "learning_rate": 3.489501173825316e-06, + "loss": 0.7779, + "step": 7215 + }, + { + "epoch": 0.73, + "grad_norm": 1.50529393797726, + "learning_rate": 3.487000920410568e-06, + "loss": 0.6823, + "step": 7216 + }, + { + "epoch": 0.73, + "grad_norm": 1.6558426245962694, + "learning_rate": 3.4845013738800194e-06, + "loss": 0.61, + "step": 7217 + }, + { + "epoch": 0.73, + "grad_norm": 1.612431482351146, + "learning_rate": 3.482002534504958e-06, + "loss": 0.7768, + "step": 7218 + }, + { + "epoch": 0.73, + "grad_norm": 1.7138504683026008, + "learning_rate": 3.479504402556587e-06, + "loss": 0.6834, + "step": 7219 + }, + { + "epoch": 0.73, + "grad_norm": 1.5804252165412198, + "learning_rate": 3.477006978306049e-06, + "loss": 0.7372, + "step": 7220 + }, + { + "epoch": 0.73, + "grad_norm": 1.5959790428928604, + "learning_rate": 3.474510262024391e-06, + "loss": 0.6769, + "step": 7221 + }, + { + "epoch": 0.73, + "grad_norm": 1.6203807255279645, + "learning_rate": 3.4720142539825975e-06, + "loss": 0.691, + "step": 7222 + }, + { + "epoch": 0.73, + "grad_norm": 1.7315453441096231, + "learning_rate": 3.4695189544515727e-06, + "loss": 0.6951, + "step": 7223 + }, + { + "epoch": 0.73, + "grad_norm": 1.508462206764295, + "learning_rate": 3.467024363702138e-06, + "loss": 0.6509, + "step": 7224 + }, + { + "epoch": 0.73, + "grad_norm": 1.7963743245623451, + "learning_rate": 3.4645304820050473e-06, + "loss": 0.6806, + "step": 7225 + }, + { + "epoch": 0.74, + "grad_norm": 1.538399822037882, + "learning_rate": 3.4620373096309656e-06, + "loss": 0.6781, + "step": 7226 + }, + { + "epoch": 0.74, + "grad_norm": 1.7490947649773623, + "learning_rate": 3.459544846850492e-06, + "loss": 0.6844, + "step": 7227 + }, + { + "epoch": 0.74, + "grad_norm": 1.7330712725230233, + "learning_rate": 3.4570530939341472e-06, + "loss": 0.6949, + "step": 7228 + }, + { + "epoch": 0.74, + "grad_norm": 1.6639851511795398, + "learning_rate": 3.454562051152366e-06, + "loss": 0.7073, + "step": 7229 + }, + { + "epoch": 0.74, + "grad_norm": 1.667892739832757, + "learning_rate": 3.452071718775518e-06, + "loss": 0.7281, + "step": 7230 + }, + { + "epoch": 0.74, + "grad_norm": 1.5559555783691887, + "learning_rate": 3.4495820970738834e-06, + "loss": 0.6977, + "step": 7231 + }, + { + "epoch": 0.74, + "grad_norm": 1.8376998130377118, + "learning_rate": 3.4470931863176747e-06, + "loss": 0.6824, + "step": 7232 + }, + { + "epoch": 0.74, + "grad_norm": 1.624520742992479, + "learning_rate": 3.4446049867770283e-06, + "loss": 0.642, + "step": 7233 + }, + { + "epoch": 0.74, + "grad_norm": 1.665652967735254, + "learning_rate": 3.442117498721992e-06, + "loss": 0.6934, + "step": 7234 + }, + { + "epoch": 0.74, + "grad_norm": 1.495900862109376, + "learning_rate": 3.439630722422548e-06, + "loss": 0.6932, + "step": 7235 + }, + { + "epoch": 0.74, + "grad_norm": 1.6200504827919735, + "learning_rate": 3.437144658148599e-06, + "loss": 0.6874, + "step": 7236 + }, + { + "epoch": 0.74, + "grad_norm": 1.6871382210453179, + "learning_rate": 3.434659306169963e-06, + "loss": 0.7193, + "step": 7237 + }, + { + "epoch": 0.74, + "grad_norm": 1.6020384570556934, + "learning_rate": 3.4321746667563925e-06, + "loss": 0.6491, + "step": 7238 + }, + { + "epoch": 0.74, + "grad_norm": 1.4852594541553965, + "learning_rate": 3.4296907401775494e-06, + "loss": 0.612, + "step": 7239 + }, + { + "epoch": 0.74, + "grad_norm": 1.653249261782761, + "learning_rate": 3.4272075267030267e-06, + "loss": 0.6335, + "step": 7240 + }, + { + "epoch": 0.74, + "grad_norm": 1.5842397819032858, + "learning_rate": 3.4247250266023446e-06, + "loss": 0.6908, + "step": 7241 + }, + { + "epoch": 0.74, + "grad_norm": 1.5817026113440948, + "learning_rate": 3.4222432401449313e-06, + "loss": 0.7081, + "step": 7242 + }, + { + "epoch": 0.74, + "grad_norm": 1.6773022328073637, + "learning_rate": 3.4197621676001537e-06, + "loss": 0.7217, + "step": 7243 + }, + { + "epoch": 0.74, + "grad_norm": 1.582428050031458, + "learning_rate": 3.417281809237285e-06, + "loss": 0.6992, + "step": 7244 + }, + { + "epoch": 0.74, + "grad_norm": 1.6711688415393713, + "learning_rate": 3.4148021653255347e-06, + "loss": 0.754, + "step": 7245 + }, + { + "epoch": 0.74, + "grad_norm": 1.8417283173332568, + "learning_rate": 3.412323236134032e-06, + "loss": 0.6941, + "step": 7246 + }, + { + "epoch": 0.74, + "grad_norm": 1.8817162021377392, + "learning_rate": 3.409845021931818e-06, + "loss": 0.695, + "step": 7247 + }, + { + "epoch": 0.74, + "grad_norm": 1.5693407212436574, + "learning_rate": 3.4073675229878724e-06, + "loss": 0.6799, + "step": 7248 + }, + { + "epoch": 0.74, + "grad_norm": 1.602686936513114, + "learning_rate": 3.404890739571081e-06, + "loss": 0.7407, + "step": 7249 + }, + { + "epoch": 0.74, + "grad_norm": 1.5627001624359027, + "learning_rate": 3.4024146719502636e-06, + "loss": 0.6668, + "step": 7250 + }, + { + "epoch": 0.74, + "grad_norm": 1.9331849062270565, + "learning_rate": 3.3999393203941623e-06, + "loss": 0.731, + "step": 7251 + }, + { + "epoch": 0.74, + "grad_norm": 1.5206783627275, + "learning_rate": 3.397464685171431e-06, + "loss": 0.6608, + "step": 7252 + }, + { + "epoch": 0.74, + "grad_norm": 1.7107663982664039, + "learning_rate": 3.3949907665506564e-06, + "loss": 0.73, + "step": 7253 + }, + { + "epoch": 0.74, + "grad_norm": 1.8654410443640381, + "learning_rate": 3.3925175648003473e-06, + "loss": 0.7268, + "step": 7254 + }, + { + "epoch": 0.74, + "grad_norm": 1.6870964608863164, + "learning_rate": 3.390045080188923e-06, + "loss": 0.6921, + "step": 7255 + }, + { + "epoch": 0.74, + "grad_norm": 1.6475194906845443, + "learning_rate": 3.38757331298474e-06, + "loss": 0.7748, + "step": 7256 + }, + { + "epoch": 0.74, + "grad_norm": 1.8213267285460604, + "learning_rate": 3.385102263456065e-06, + "loss": 0.5969, + "step": 7257 + }, + { + "epoch": 0.74, + "grad_norm": 1.6459408638945416, + "learning_rate": 3.3826319318710933e-06, + "loss": 0.697, + "step": 7258 + }, + { + "epoch": 0.74, + "grad_norm": 1.703788221164889, + "learning_rate": 3.3801623184979437e-06, + "loss": 0.6976, + "step": 7259 + }, + { + "epoch": 0.74, + "grad_norm": 1.7504852878801893, + "learning_rate": 3.377693423604649e-06, + "loss": 0.7039, + "step": 7260 + }, + { + "epoch": 0.74, + "grad_norm": 1.5253648258267434, + "learning_rate": 3.3752252474591762e-06, + "loss": 0.6294, + "step": 7261 + }, + { + "epoch": 0.74, + "grad_norm": 1.5040219167465418, + "learning_rate": 3.3727577903293985e-06, + "loss": 0.631, + "step": 7262 + }, + { + "epoch": 0.74, + "grad_norm": 1.7870140766431388, + "learning_rate": 3.3702910524831246e-06, + "loss": 0.7444, + "step": 7263 + }, + { + "epoch": 0.74, + "grad_norm": 1.607093992786678, + "learning_rate": 3.3678250341880825e-06, + "loss": 0.7117, + "step": 7264 + }, + { + "epoch": 0.74, + "grad_norm": 1.5595706608009965, + "learning_rate": 3.365359735711915e-06, + "loss": 0.6255, + "step": 7265 + }, + { + "epoch": 0.74, + "grad_norm": 1.7040629735909656, + "learning_rate": 3.3628951573221966e-06, + "loss": 0.7388, + "step": 7266 + }, + { + "epoch": 0.74, + "grad_norm": 1.7937033138858276, + "learning_rate": 3.360431299286413e-06, + "loss": 0.6807, + "step": 7267 + }, + { + "epoch": 0.74, + "grad_norm": 1.6216251520551597, + "learning_rate": 3.3579681618719805e-06, + "loss": 0.7217, + "step": 7268 + }, + { + "epoch": 0.74, + "grad_norm": 1.5615083234069798, + "learning_rate": 3.355505745346238e-06, + "loss": 0.6231, + "step": 7269 + }, + { + "epoch": 0.74, + "grad_norm": 1.532634334500432, + "learning_rate": 3.353044049976435e-06, + "loss": 0.645, + "step": 7270 + }, + { + "epoch": 0.74, + "grad_norm": 1.638134421357406, + "learning_rate": 3.3505830760297543e-06, + "loss": 0.6466, + "step": 7271 + }, + { + "epoch": 0.74, + "grad_norm": 1.653769742345302, + "learning_rate": 3.3481228237732975e-06, + "loss": 0.7258, + "step": 7272 + }, + { + "epoch": 0.74, + "grad_norm": 1.5837947311775986, + "learning_rate": 3.345663293474083e-06, + "loss": 0.6474, + "step": 7273 + }, + { + "epoch": 0.74, + "grad_norm": 1.64849332588951, + "learning_rate": 3.3432044853990576e-06, + "loss": 0.6081, + "step": 7274 + }, + { + "epoch": 0.74, + "grad_norm": 1.6257045316549825, + "learning_rate": 3.3407463998150815e-06, + "loss": 0.5885, + "step": 7275 + }, + { + "epoch": 0.74, + "grad_norm": 1.9181770773628148, + "learning_rate": 3.338289036988945e-06, + "loss": 0.6925, + "step": 7276 + }, + { + "epoch": 0.74, + "grad_norm": 1.6579765006359775, + "learning_rate": 3.3358323971873586e-06, + "loss": 0.7907, + "step": 7277 + }, + { + "epoch": 0.74, + "grad_norm": 1.6223788293680472, + "learning_rate": 3.3333764806769474e-06, + "loss": 0.7057, + "step": 7278 + }, + { + "epoch": 0.74, + "grad_norm": 1.5562895761921027, + "learning_rate": 3.3309212877242635e-06, + "loss": 0.6549, + "step": 7279 + }, + { + "epoch": 0.74, + "grad_norm": 1.8141933918317033, + "learning_rate": 3.3284668185957857e-06, + "loss": 0.7398, + "step": 7280 + }, + { + "epoch": 0.74, + "grad_norm": 1.54263284558655, + "learning_rate": 3.3260130735579e-06, + "loss": 0.6571, + "step": 7281 + }, + { + "epoch": 0.74, + "grad_norm": 1.619131530660715, + "learning_rate": 3.3235600528769253e-06, + "loss": 0.7058, + "step": 7282 + }, + { + "epoch": 0.74, + "grad_norm": 1.465021228430089, + "learning_rate": 3.321107756819103e-06, + "loss": 0.5369, + "step": 7283 + }, + { + "epoch": 0.74, + "grad_norm": 1.6381336452278406, + "learning_rate": 3.318656185650584e-06, + "loss": 0.6229, + "step": 7284 + }, + { + "epoch": 0.74, + "grad_norm": 1.5536313913242494, + "learning_rate": 3.3162053396374504e-06, + "loss": 0.6387, + "step": 7285 + }, + { + "epoch": 0.74, + "grad_norm": 1.6618093267828662, + "learning_rate": 3.313755219045709e-06, + "loss": 0.6368, + "step": 7286 + }, + { + "epoch": 0.74, + "grad_norm": 1.6517390858077126, + "learning_rate": 3.311305824141273e-06, + "loss": 0.6692, + "step": 7287 + }, + { + "epoch": 0.74, + "grad_norm": 1.4256680556420662, + "learning_rate": 3.308857155189992e-06, + "loss": 0.7384, + "step": 7288 + }, + { + "epoch": 0.74, + "grad_norm": 1.5064498910789945, + "learning_rate": 3.306409212457631e-06, + "loss": 0.6581, + "step": 7289 + }, + { + "epoch": 0.74, + "grad_norm": 1.7608860865876834, + "learning_rate": 3.30396199620987e-06, + "loss": 0.6862, + "step": 7290 + }, + { + "epoch": 0.74, + "grad_norm": 1.6037617675529898, + "learning_rate": 3.301515506712322e-06, + "loss": 0.7146, + "step": 7291 + }, + { + "epoch": 0.74, + "grad_norm": 1.7593652994874995, + "learning_rate": 3.2990697442305143e-06, + "loss": 0.6931, + "step": 7292 + }, + { + "epoch": 0.74, + "grad_norm": 1.5088961297220562, + "learning_rate": 3.296624709029893e-06, + "loss": 0.5407, + "step": 7293 + }, + { + "epoch": 0.74, + "grad_norm": 1.549978531019286, + "learning_rate": 3.2941804013758337e-06, + "loss": 0.7049, + "step": 7294 + }, + { + "epoch": 0.74, + "grad_norm": 1.6325363025182515, + "learning_rate": 3.291736821533621e-06, + "loss": 0.7165, + "step": 7295 + }, + { + "epoch": 0.74, + "grad_norm": 1.6735142078675114, + "learning_rate": 3.289293969768471e-06, + "loss": 0.6012, + "step": 7296 + }, + { + "epoch": 0.74, + "grad_norm": 1.7178665383223046, + "learning_rate": 3.2868518463455203e-06, + "loss": 0.6026, + "step": 7297 + }, + { + "epoch": 0.74, + "grad_norm": 1.6928985980299383, + "learning_rate": 3.284410451529816e-06, + "loss": 0.65, + "step": 7298 + }, + { + "epoch": 0.74, + "grad_norm": 1.7305481371698612, + "learning_rate": 3.2819697855863374e-06, + "loss": 0.7129, + "step": 7299 + }, + { + "epoch": 0.74, + "grad_norm": 1.7258542451488783, + "learning_rate": 3.279529848779983e-06, + "loss": 0.6139, + "step": 7300 + }, + { + "epoch": 0.74, + "grad_norm": 1.6523221761729956, + "learning_rate": 3.2770906413755644e-06, + "loss": 0.7421, + "step": 7301 + }, + { + "epoch": 0.74, + "grad_norm": 1.534840481965886, + "learning_rate": 3.274652163637826e-06, + "loss": 0.6207, + "step": 7302 + }, + { + "epoch": 0.74, + "grad_norm": 1.7502912095179775, + "learning_rate": 3.2722144158314184e-06, + "loss": 0.7288, + "step": 7303 + }, + { + "epoch": 0.74, + "grad_norm": 1.6496887127591713, + "learning_rate": 3.2697773982209257e-06, + "loss": 0.6785, + "step": 7304 + }, + { + "epoch": 0.74, + "grad_norm": 1.6368353847674915, + "learning_rate": 3.2673411110708507e-06, + "loss": 0.666, + "step": 7305 + }, + { + "epoch": 0.74, + "grad_norm": 1.5451756159890135, + "learning_rate": 3.2649055546456076e-06, + "loss": 0.7173, + "step": 7306 + }, + { + "epoch": 0.74, + "grad_norm": 1.7004419330703944, + "learning_rate": 3.2624707292095446e-06, + "loss": 0.7743, + "step": 7307 + }, + { + "epoch": 0.74, + "grad_norm": 1.595740560798837, + "learning_rate": 3.2600366350269176e-06, + "loss": 0.5945, + "step": 7308 + }, + { + "epoch": 0.74, + "grad_norm": 1.722661566376523, + "learning_rate": 3.257603272361912e-06, + "loss": 0.6278, + "step": 7309 + }, + { + "epoch": 0.74, + "grad_norm": 1.6681602626098242, + "learning_rate": 3.2551706414786366e-06, + "loss": 0.6901, + "step": 7310 + }, + { + "epoch": 0.74, + "grad_norm": 1.6689875835667756, + "learning_rate": 3.252738742641106e-06, + "loss": 0.7482, + "step": 7311 + }, + { + "epoch": 0.74, + "grad_norm": 1.6118580407253362, + "learning_rate": 3.2503075761132707e-06, + "loss": 0.6896, + "step": 7312 + }, + { + "epoch": 0.74, + "grad_norm": 1.6008888195431321, + "learning_rate": 3.247877142158997e-06, + "loss": 0.7138, + "step": 7313 + }, + { + "epoch": 0.74, + "grad_norm": 1.6096208333217168, + "learning_rate": 3.2454474410420657e-06, + "loss": 0.6315, + "step": 7314 + }, + { + "epoch": 0.74, + "grad_norm": 1.6230174119350946, + "learning_rate": 3.2430184730261883e-06, + "loss": 0.6744, + "step": 7315 + }, + { + "epoch": 0.74, + "grad_norm": 1.685018707256806, + "learning_rate": 3.2405902383749856e-06, + "loss": 0.684, + "step": 7316 + }, + { + "epoch": 0.74, + "grad_norm": 1.6803874750251426, + "learning_rate": 3.2381627373520066e-06, + "loss": 0.7465, + "step": 7317 + }, + { + "epoch": 0.74, + "grad_norm": 1.6429226662638055, + "learning_rate": 3.2357359702207216e-06, + "loss": 0.7916, + "step": 7318 + }, + { + "epoch": 0.74, + "grad_norm": 1.7563407976090484, + "learning_rate": 3.233309937244513e-06, + "loss": 0.736, + "step": 7319 + }, + { + "epoch": 0.74, + "grad_norm": 1.5215856115164546, + "learning_rate": 3.2308846386866954e-06, + "loss": 0.6469, + "step": 7320 + }, + { + "epoch": 0.74, + "grad_norm": 1.5886452708615701, + "learning_rate": 3.228460074810489e-06, + "loss": 0.7408, + "step": 7321 + }, + { + "epoch": 0.74, + "grad_norm": 1.6400376738359026, + "learning_rate": 3.226036245879047e-06, + "loss": 0.6373, + "step": 7322 + }, + { + "epoch": 0.74, + "grad_norm": 1.6690055408195899, + "learning_rate": 3.2236131521554405e-06, + "loss": 0.6703, + "step": 7323 + }, + { + "epoch": 0.74, + "grad_norm": 1.5373330223676502, + "learning_rate": 3.221190793902653e-06, + "loss": 0.5949, + "step": 7324 + }, + { + "epoch": 0.75, + "grad_norm": 1.6379524975379682, + "learning_rate": 3.218769171383599e-06, + "loss": 0.6989, + "step": 7325 + }, + { + "epoch": 0.75, + "grad_norm": 1.819067342413871, + "learning_rate": 3.2163482848611003e-06, + "loss": 0.6376, + "step": 7326 + }, + { + "epoch": 0.75, + "grad_norm": 1.7154236629745496, + "learning_rate": 3.213928134597912e-06, + "loss": 0.7656, + "step": 7327 + }, + { + "epoch": 0.75, + "grad_norm": 1.6693865924740787, + "learning_rate": 3.2115087208567064e-06, + "loss": 0.7179, + "step": 7328 + }, + { + "epoch": 0.75, + "grad_norm": 1.7508571742865737, + "learning_rate": 3.209090043900065e-06, + "loss": 0.696, + "step": 7329 + }, + { + "epoch": 0.75, + "grad_norm": 1.5057491696391703, + "learning_rate": 3.206672103990501e-06, + "loss": 0.596, + "step": 7330 + }, + { + "epoch": 0.75, + "grad_norm": 1.6512693921958572, + "learning_rate": 3.2042549013904476e-06, + "loss": 0.7441, + "step": 7331 + }, + { + "epoch": 0.75, + "grad_norm": 1.7628128435052854, + "learning_rate": 3.201838436362249e-06, + "loss": 0.6603, + "step": 7332 + }, + { + "epoch": 0.75, + "grad_norm": 1.6365190818788702, + "learning_rate": 3.1994227091681783e-06, + "loss": 0.6335, + "step": 7333 + }, + { + "epoch": 0.75, + "grad_norm": 1.5780363922604819, + "learning_rate": 3.197007720070422e-06, + "loss": 0.6621, + "step": 7334 + }, + { + "epoch": 0.75, + "grad_norm": 1.572747001069289, + "learning_rate": 3.1945934693310897e-06, + "loss": 0.6726, + "step": 7335 + }, + { + "epoch": 0.75, + "grad_norm": 1.5666495089845731, + "learning_rate": 3.192179957212215e-06, + "loss": 0.6427, + "step": 7336 + }, + { + "epoch": 0.75, + "grad_norm": 1.6216586270269608, + "learning_rate": 3.1897671839757405e-06, + "loss": 0.6585, + "step": 7337 + }, + { + "epoch": 0.75, + "grad_norm": 1.5232294696713657, + "learning_rate": 3.187355149883541e-06, + "loss": 0.6636, + "step": 7338 + }, + { + "epoch": 0.75, + "grad_norm": 1.5701600681094992, + "learning_rate": 3.1849438551973986e-06, + "loss": 0.6563, + "step": 7339 + }, + { + "epoch": 0.75, + "grad_norm": 1.7231236105797698, + "learning_rate": 3.182533300179026e-06, + "loss": 0.672, + "step": 7340 + }, + { + "epoch": 0.75, + "grad_norm": 1.660836767656615, + "learning_rate": 3.180123485090053e-06, + "loss": 0.6324, + "step": 7341 + }, + { + "epoch": 0.75, + "grad_norm": 1.6430898224518655, + "learning_rate": 3.177714410192022e-06, + "loss": 0.6109, + "step": 7342 + }, + { + "epoch": 0.75, + "grad_norm": 1.6269418239884736, + "learning_rate": 3.175306075746406e-06, + "loss": 0.7617, + "step": 7343 + }, + { + "epoch": 0.75, + "grad_norm": 1.6439427634892574, + "learning_rate": 3.172898482014586e-06, + "loss": 0.6748, + "step": 7344 + }, + { + "epoch": 0.75, + "grad_norm": 1.7691958231320482, + "learning_rate": 3.1704916292578715e-06, + "loss": 0.6396, + "step": 7345 + }, + { + "epoch": 0.75, + "grad_norm": 1.637905279936545, + "learning_rate": 3.1680855177374935e-06, + "loss": 0.6897, + "step": 7346 + }, + { + "epoch": 0.75, + "grad_norm": 1.6798196456772034, + "learning_rate": 3.1656801477145893e-06, + "loss": 0.7228, + "step": 7347 + }, + { + "epoch": 0.75, + "grad_norm": 1.7038195826667484, + "learning_rate": 3.1632755194502286e-06, + "loss": 0.6807, + "step": 7348 + }, + { + "epoch": 0.75, + "grad_norm": 1.6980391062663183, + "learning_rate": 3.160871633205398e-06, + "loss": 0.6299, + "step": 7349 + }, + { + "epoch": 0.75, + "grad_norm": 1.4863795099644443, + "learning_rate": 3.1584684892409977e-06, + "loss": 0.5528, + "step": 7350 + }, + { + "epoch": 0.75, + "grad_norm": 1.5142489059614048, + "learning_rate": 3.156066087817856e-06, + "loss": 0.6406, + "step": 7351 + }, + { + "epoch": 0.75, + "grad_norm": 1.7486622198807542, + "learning_rate": 3.15366442919671e-06, + "loss": 0.6327, + "step": 7352 + }, + { + "epoch": 0.75, + "grad_norm": 1.6055617758275391, + "learning_rate": 3.1512635136382252e-06, + "loss": 0.5865, + "step": 7353 + }, + { + "epoch": 0.75, + "grad_norm": 1.5219472276420638, + "learning_rate": 3.1488633414029877e-06, + "loss": 0.6461, + "step": 7354 + }, + { + "epoch": 0.75, + "grad_norm": 1.8176492653029488, + "learning_rate": 3.146463912751491e-06, + "loss": 0.7689, + "step": 7355 + }, + { + "epoch": 0.75, + "grad_norm": 1.6826159035851425, + "learning_rate": 3.144065227944163e-06, + "loss": 0.7403, + "step": 7356 + }, + { + "epoch": 0.75, + "grad_norm": 1.6247469531612129, + "learning_rate": 3.1416672872413357e-06, + "loss": 0.7106, + "step": 7357 + }, + { + "epoch": 0.75, + "grad_norm": 1.5779106715212203, + "learning_rate": 3.1392700909032724e-06, + "loss": 0.6871, + "step": 7358 + }, + { + "epoch": 0.75, + "grad_norm": 1.4920076107625604, + "learning_rate": 3.1368736391901544e-06, + "loss": 0.6714, + "step": 7359 + }, + { + "epoch": 0.75, + "grad_norm": 1.961808390580555, + "learning_rate": 3.134477932362072e-06, + "loss": 0.7265, + "step": 7360 + }, + { + "epoch": 0.75, + "grad_norm": 1.938920841767802, + "learning_rate": 3.132082970679049e-06, + "loss": 0.7427, + "step": 7361 + }, + { + "epoch": 0.75, + "grad_norm": 1.567752759558388, + "learning_rate": 3.1296887544010133e-06, + "loss": 0.6539, + "step": 7362 + }, + { + "epoch": 0.75, + "grad_norm": 1.5151465329107645, + "learning_rate": 3.1272952837878245e-06, + "loss": 0.6595, + "step": 7363 + }, + { + "epoch": 0.75, + "grad_norm": 1.705679499416008, + "learning_rate": 3.124902559099258e-06, + "loss": 0.6997, + "step": 7364 + }, + { + "epoch": 0.75, + "grad_norm": 1.8858540122131102, + "learning_rate": 3.1225105805950017e-06, + "loss": 0.7913, + "step": 7365 + }, + { + "epoch": 0.75, + "grad_norm": 1.7419499972808716, + "learning_rate": 3.120119348534669e-06, + "loss": 0.7856, + "step": 7366 + }, + { + "epoch": 0.75, + "grad_norm": 1.573465112824827, + "learning_rate": 3.1177288631777956e-06, + "loss": 0.6733, + "step": 7367 + }, + { + "epoch": 0.75, + "grad_norm": 1.6325035220696849, + "learning_rate": 3.115339124783825e-06, + "loss": 0.6511, + "step": 7368 + }, + { + "epoch": 0.75, + "grad_norm": 1.7307285220036097, + "learning_rate": 3.1129501336121303e-06, + "loss": 0.615, + "step": 7369 + }, + { + "epoch": 0.75, + "grad_norm": 1.9298849037539572, + "learning_rate": 3.1105618899219946e-06, + "loss": 0.6952, + "step": 7370 + }, + { + "epoch": 0.75, + "grad_norm": 1.8535262033904598, + "learning_rate": 3.1081743939726262e-06, + "loss": 0.7454, + "step": 7371 + }, + { + "epoch": 0.75, + "grad_norm": 1.6615890889245795, + "learning_rate": 3.105787646023155e-06, + "loss": 0.803, + "step": 7372 + }, + { + "epoch": 0.75, + "grad_norm": 1.7370173676111043, + "learning_rate": 3.103401646332619e-06, + "loss": 0.6846, + "step": 7373 + }, + { + "epoch": 0.75, + "grad_norm": 1.810281054365534, + "learning_rate": 3.101016395159986e-06, + "loss": 0.7291, + "step": 7374 + }, + { + "epoch": 0.75, + "grad_norm": 1.6837037222155278, + "learning_rate": 3.0986318927641312e-06, + "loss": 0.7795, + "step": 7375 + }, + { + "epoch": 0.75, + "grad_norm": 1.7021722264339456, + "learning_rate": 3.09624813940386e-06, + "loss": 0.6884, + "step": 7376 + }, + { + "epoch": 0.75, + "grad_norm": 1.6763761247294078, + "learning_rate": 3.0938651353378946e-06, + "loss": 0.7276, + "step": 7377 + }, + { + "epoch": 0.75, + "grad_norm": 1.5836681850071694, + "learning_rate": 3.091482880824864e-06, + "loss": 0.6714, + "step": 7378 + }, + { + "epoch": 0.75, + "grad_norm": 1.731965481202213, + "learning_rate": 3.0891013761233345e-06, + "loss": 0.6925, + "step": 7379 + }, + { + "epoch": 0.75, + "grad_norm": 1.7464462818615787, + "learning_rate": 3.0867206214917723e-06, + "loss": 0.7193, + "step": 7380 + }, + { + "epoch": 0.75, + "grad_norm": 1.849544297503174, + "learning_rate": 3.0843406171885746e-06, + "loss": 0.8064, + "step": 7381 + }, + { + "epoch": 0.75, + "grad_norm": 1.540043474640889, + "learning_rate": 3.081961363472058e-06, + "loss": 0.7355, + "step": 7382 + }, + { + "epoch": 0.75, + "grad_norm": 1.6909442617771338, + "learning_rate": 3.0795828606004453e-06, + "loss": 0.6626, + "step": 7383 + }, + { + "epoch": 0.75, + "grad_norm": 1.7735932876842673, + "learning_rate": 3.0772051088318912e-06, + "loss": 0.6367, + "step": 7384 + }, + { + "epoch": 0.75, + "grad_norm": 1.6463646066423563, + "learning_rate": 3.0748281084244635e-06, + "loss": 0.7147, + "step": 7385 + }, + { + "epoch": 0.75, + "grad_norm": 1.5406094829194776, + "learning_rate": 3.0724518596361452e-06, + "loss": 0.6248, + "step": 7386 + }, + { + "epoch": 0.75, + "grad_norm": 1.6502764101119622, + "learning_rate": 3.070076362724843e-06, + "loss": 0.5867, + "step": 7387 + }, + { + "epoch": 0.75, + "grad_norm": 1.4455671784961945, + "learning_rate": 3.0677016179483823e-06, + "loss": 0.7, + "step": 7388 + }, + { + "epoch": 0.75, + "grad_norm": 1.7474920723140612, + "learning_rate": 3.065327625564498e-06, + "loss": 0.6584, + "step": 7389 + }, + { + "epoch": 0.75, + "grad_norm": 1.6490934938705888, + "learning_rate": 3.0629543858308552e-06, + "loss": 0.6237, + "step": 7390 + }, + { + "epoch": 0.75, + "grad_norm": 1.620282671271724, + "learning_rate": 3.0605818990050327e-06, + "loss": 0.6551, + "step": 7391 + }, + { + "epoch": 0.75, + "grad_norm": 1.6635610737609734, + "learning_rate": 3.0582101653445227e-06, + "loss": 0.6701, + "step": 7392 + }, + { + "epoch": 0.75, + "grad_norm": 1.7532758162200444, + "learning_rate": 3.0558391851067416e-06, + "loss": 0.783, + "step": 7393 + }, + { + "epoch": 0.75, + "grad_norm": 1.6603614563648805, + "learning_rate": 3.053468958549025e-06, + "loss": 0.6462, + "step": 7394 + }, + { + "epoch": 0.75, + "grad_norm": 1.5306270774029103, + "learning_rate": 3.051099485928618e-06, + "loss": 0.6622, + "step": 7395 + }, + { + "epoch": 0.75, + "grad_norm": 1.500571714029283, + "learning_rate": 3.0487307675026967e-06, + "loss": 0.631, + "step": 7396 + }, + { + "epoch": 0.75, + "grad_norm": 1.3669527221428746, + "learning_rate": 3.0463628035283423e-06, + "loss": 0.6113, + "step": 7397 + }, + { + "epoch": 0.75, + "grad_norm": 1.6656957768204146, + "learning_rate": 3.043995594262562e-06, + "loss": 0.612, + "step": 7398 + }, + { + "epoch": 0.75, + "grad_norm": 1.7199610639093554, + "learning_rate": 3.0416291399622834e-06, + "loss": 0.7657, + "step": 7399 + }, + { + "epoch": 0.75, + "grad_norm": 1.5742384138792822, + "learning_rate": 3.039263440884341e-06, + "loss": 0.6016, + "step": 7400 + }, + { + "epoch": 0.75, + "grad_norm": 1.5696383649368324, + "learning_rate": 3.0368984972854997e-06, + "loss": 0.5948, + "step": 7401 + }, + { + "epoch": 0.75, + "grad_norm": 1.5431594277390508, + "learning_rate": 3.034534309422439e-06, + "loss": 0.7673, + "step": 7402 + }, + { + "epoch": 0.75, + "grad_norm": 1.7611851032651105, + "learning_rate": 3.0321708775517476e-06, + "loss": 0.7376, + "step": 7403 + }, + { + "epoch": 0.75, + "grad_norm": 1.6782638099138119, + "learning_rate": 3.0298082019299424e-06, + "loss": 0.6546, + "step": 7404 + }, + { + "epoch": 0.75, + "grad_norm": 1.655260128106923, + "learning_rate": 3.0274462828134587e-06, + "loss": 0.8029, + "step": 7405 + }, + { + "epoch": 0.75, + "grad_norm": 1.7300337613368677, + "learning_rate": 3.0250851204586394e-06, + "loss": 0.64, + "step": 7406 + }, + { + "epoch": 0.75, + "grad_norm": 1.6253057086422333, + "learning_rate": 3.0227247151217553e-06, + "loss": 0.6651, + "step": 7407 + }, + { + "epoch": 0.75, + "grad_norm": 1.7339750324433152, + "learning_rate": 3.0203650670589945e-06, + "loss": 0.672, + "step": 7408 + }, + { + "epoch": 0.75, + "grad_norm": 1.477359102111727, + "learning_rate": 3.0180061765264523e-06, + "loss": 0.658, + "step": 7409 + }, + { + "epoch": 0.75, + "grad_norm": 1.621876733888337, + "learning_rate": 3.015648043780156e-06, + "loss": 0.5895, + "step": 7410 + }, + { + "epoch": 0.75, + "grad_norm": 1.6668748997638514, + "learning_rate": 3.0132906690760398e-06, + "loss": 0.6339, + "step": 7411 + }, + { + "epoch": 0.75, + "grad_norm": 1.6796738882866564, + "learning_rate": 3.0109340526699604e-06, + "loss": 0.5918, + "step": 7412 + }, + { + "epoch": 0.75, + "grad_norm": 1.5084714791846123, + "learning_rate": 3.0085781948176974e-06, + "loss": 0.6234, + "step": 7413 + }, + { + "epoch": 0.75, + "grad_norm": 1.7944341554365648, + "learning_rate": 3.006223095774933e-06, + "loss": 0.7622, + "step": 7414 + }, + { + "epoch": 0.75, + "grad_norm": 1.7037480514842924, + "learning_rate": 3.003868755797286e-06, + "loss": 0.7089, + "step": 7415 + }, + { + "epoch": 0.75, + "grad_norm": 1.7032944351810269, + "learning_rate": 3.0015151751402725e-06, + "loss": 0.6763, + "step": 7416 + }, + { + "epoch": 0.75, + "grad_norm": 1.745469835993225, + "learning_rate": 2.9991623540593453e-06, + "loss": 0.71, + "step": 7417 + }, + { + "epoch": 0.75, + "grad_norm": 1.6436550601102142, + "learning_rate": 2.996810292809865e-06, + "loss": 0.8204, + "step": 7418 + }, + { + "epoch": 0.75, + "grad_norm": 1.6882237970692697, + "learning_rate": 2.994458991647108e-06, + "loss": 0.6516, + "step": 7419 + }, + { + "epoch": 0.75, + "grad_norm": 1.6759976105871366, + "learning_rate": 2.9921084508262722e-06, + "loss": 0.7869, + "step": 7420 + }, + { + "epoch": 0.75, + "grad_norm": 1.632224254599666, + "learning_rate": 2.9897586706024763e-06, + "loss": 0.5967, + "step": 7421 + }, + { + "epoch": 0.75, + "grad_norm": 1.7319331472604649, + "learning_rate": 2.987409651230746e-06, + "loss": 0.6773, + "step": 7422 + }, + { + "epoch": 0.76, + "grad_norm": 1.6792342451955644, + "learning_rate": 2.9850613929660366e-06, + "loss": 0.698, + "step": 7423 + }, + { + "epoch": 0.76, + "grad_norm": 1.7380245477625635, + "learning_rate": 2.9827138960632076e-06, + "loss": 0.6786, + "step": 7424 + }, + { + "epoch": 0.76, + "grad_norm": 1.9112873659596736, + "learning_rate": 2.980367160777048e-06, + "loss": 0.8075, + "step": 7425 + }, + { + "epoch": 0.76, + "grad_norm": 1.4329946044809587, + "learning_rate": 2.978021187362261e-06, + "loss": 0.5534, + "step": 7426 + }, + { + "epoch": 0.76, + "grad_norm": 1.4934816434283067, + "learning_rate": 2.97567597607346e-06, + "loss": 0.5699, + "step": 7427 + }, + { + "epoch": 0.76, + "grad_norm": 1.6117630516190384, + "learning_rate": 2.9733315271651863e-06, + "loss": 0.6782, + "step": 7428 + }, + { + "epoch": 0.76, + "grad_norm": 1.8126687606012002, + "learning_rate": 2.9709878408918867e-06, + "loss": 0.7555, + "step": 7429 + }, + { + "epoch": 0.76, + "grad_norm": 1.6912605643295366, + "learning_rate": 2.968644917507937e-06, + "loss": 0.746, + "step": 7430 + }, + { + "epoch": 0.76, + "grad_norm": 1.626182602352006, + "learning_rate": 2.966302757267625e-06, + "loss": 0.661, + "step": 7431 + }, + { + "epoch": 0.76, + "grad_norm": 1.5037473745802517, + "learning_rate": 2.963961360425153e-06, + "loss": 0.8026, + "step": 7432 + }, + { + "epoch": 0.76, + "grad_norm": 1.559354351532829, + "learning_rate": 2.961620727234645e-06, + "loss": 0.725, + "step": 7433 + }, + { + "epoch": 0.76, + "grad_norm": 1.9363196714314468, + "learning_rate": 2.9592808579501364e-06, + "loss": 0.7002, + "step": 7434 + }, + { + "epoch": 0.76, + "grad_norm": 1.5776840241249293, + "learning_rate": 2.956941752825587e-06, + "loss": 0.634, + "step": 7435 + }, + { + "epoch": 0.76, + "grad_norm": 1.7381703510621536, + "learning_rate": 2.9546034121148714e-06, + "loss": 0.7127, + "step": 7436 + }, + { + "epoch": 0.76, + "grad_norm": 1.7190856091505584, + "learning_rate": 2.952265836071776e-06, + "loss": 0.6978, + "step": 7437 + }, + { + "epoch": 0.76, + "grad_norm": 1.6735200753638035, + "learning_rate": 2.9499290249500078e-06, + "loss": 0.6708, + "step": 7438 + }, + { + "epoch": 0.76, + "grad_norm": 1.7328932677954882, + "learning_rate": 2.9475929790031975e-06, + "loss": 0.8166, + "step": 7439 + }, + { + "epoch": 0.76, + "grad_norm": 1.5408188030507148, + "learning_rate": 2.945257698484879e-06, + "loss": 0.7174, + "step": 7440 + }, + { + "epoch": 0.76, + "grad_norm": 1.5689412370963598, + "learning_rate": 2.9429231836485174e-06, + "loss": 0.6592, + "step": 7441 + }, + { + "epoch": 0.76, + "grad_norm": 1.6627008111717447, + "learning_rate": 2.9405894347474793e-06, + "loss": 0.6254, + "step": 7442 + }, + { + "epoch": 0.76, + "grad_norm": 1.837083781585254, + "learning_rate": 2.938256452035062e-06, + "loss": 0.6579, + "step": 7443 + }, + { + "epoch": 0.76, + "grad_norm": 1.7666715035355245, + "learning_rate": 2.9359242357644757e-06, + "loss": 0.7378, + "step": 7444 + }, + { + "epoch": 0.76, + "grad_norm": 1.6119800098653143, + "learning_rate": 2.9335927861888414e-06, + "loss": 0.6269, + "step": 7445 + }, + { + "epoch": 0.76, + "grad_norm": 1.7211340131128936, + "learning_rate": 2.9312621035612077e-06, + "loss": 0.8051, + "step": 7446 + }, + { + "epoch": 0.76, + "grad_norm": 1.6180189522856365, + "learning_rate": 2.9289321881345257e-06, + "loss": 0.7323, + "step": 7447 + }, + { + "epoch": 0.76, + "grad_norm": 1.7639254750773472, + "learning_rate": 2.9266030401616762e-06, + "loss": 0.6893, + "step": 7448 + }, + { + "epoch": 0.76, + "grad_norm": 1.7179222972882686, + "learning_rate": 2.924274659895454e-06, + "loss": 0.7707, + "step": 7449 + }, + { + "epoch": 0.76, + "grad_norm": 1.6182082420055828, + "learning_rate": 2.9219470475885626e-06, + "loss": 0.7779, + "step": 7450 + }, + { + "epoch": 0.76, + "grad_norm": 1.6851692872088693, + "learning_rate": 2.919620203493634e-06, + "loss": 0.6607, + "step": 7451 + }, + { + "epoch": 0.76, + "grad_norm": 1.640265505425813, + "learning_rate": 2.917294127863204e-06, + "loss": 0.7147, + "step": 7452 + }, + { + "epoch": 0.76, + "grad_norm": 1.8049523286991427, + "learning_rate": 2.9149688209497353e-06, + "loss": 0.7451, + "step": 7453 + }, + { + "epoch": 0.76, + "grad_norm": 1.7823267160412626, + "learning_rate": 2.9126442830056066e-06, + "loss": 0.6662, + "step": 7454 + }, + { + "epoch": 0.76, + "grad_norm": 1.5495471833684278, + "learning_rate": 2.910320514283104e-06, + "loss": 0.7109, + "step": 7455 + }, + { + "epoch": 0.76, + "grad_norm": 1.5447094856687085, + "learning_rate": 2.9079975150344433e-06, + "loss": 0.6821, + "step": 7456 + }, + { + "epoch": 0.76, + "grad_norm": 1.858363830339255, + "learning_rate": 2.905675285511742e-06, + "loss": 0.6572, + "step": 7457 + }, + { + "epoch": 0.76, + "grad_norm": 1.822158149306591, + "learning_rate": 2.9033538259670458e-06, + "loss": 0.7401, + "step": 7458 + }, + { + "epoch": 0.76, + "grad_norm": 1.5929794743645305, + "learning_rate": 2.901033136652316e-06, + "loss": 0.6964, + "step": 7459 + }, + { + "epoch": 0.76, + "grad_norm": 1.5170105854274156, + "learning_rate": 2.89871321781942e-06, + "loss": 0.6719, + "step": 7460 + }, + { + "epoch": 0.76, + "grad_norm": 1.6390930304050815, + "learning_rate": 2.8963940697201532e-06, + "loss": 0.7041, + "step": 7461 + }, + { + "epoch": 0.76, + "grad_norm": 1.7532602718217476, + "learning_rate": 2.894075692606225e-06, + "loss": 0.6553, + "step": 7462 + }, + { + "epoch": 0.76, + "grad_norm": 1.7868171723470738, + "learning_rate": 2.891758086729253e-06, + "loss": 0.8173, + "step": 7463 + }, + { + "epoch": 0.76, + "grad_norm": 1.828714452933475, + "learning_rate": 2.8894412523407833e-06, + "loss": 0.7114, + "step": 7464 + }, + { + "epoch": 0.76, + "grad_norm": 1.6684999336577175, + "learning_rate": 2.8871251896922657e-06, + "loss": 0.6909, + "step": 7465 + }, + { + "epoch": 0.76, + "grad_norm": 1.6634060136535094, + "learning_rate": 2.884809899035077e-06, + "loss": 0.7066, + "step": 7466 + }, + { + "epoch": 0.76, + "grad_norm": 1.5578753536708678, + "learning_rate": 2.882495380620507e-06, + "loss": 0.629, + "step": 7467 + }, + { + "epoch": 0.76, + "grad_norm": 1.856286344143219, + "learning_rate": 2.8801816346997547e-06, + "loss": 0.767, + "step": 7468 + }, + { + "epoch": 0.76, + "grad_norm": 1.7429771399973584, + "learning_rate": 2.877868661523947e-06, + "loss": 0.7595, + "step": 7469 + }, + { + "epoch": 0.76, + "grad_norm": 1.6852910997798851, + "learning_rate": 2.875556461344117e-06, + "loss": 0.6857, + "step": 7470 + }, + { + "epoch": 0.76, + "grad_norm": 1.7373081083150153, + "learning_rate": 2.873245034411217e-06, + "loss": 0.6821, + "step": 7471 + }, + { + "epoch": 0.76, + "grad_norm": 1.725776716881047, + "learning_rate": 2.870934380976124e-06, + "loss": 0.6353, + "step": 7472 + }, + { + "epoch": 0.76, + "grad_norm": 1.5202372783896243, + "learning_rate": 2.868624501289613e-06, + "loss": 0.6614, + "step": 7473 + }, + { + "epoch": 0.76, + "grad_norm": 1.7779656779653663, + "learning_rate": 2.8663153956023938e-06, + "loss": 0.7451, + "step": 7474 + }, + { + "epoch": 0.76, + "grad_norm": 1.755161174321142, + "learning_rate": 2.8640070641650776e-06, + "loss": 0.7465, + "step": 7475 + }, + { + "epoch": 0.76, + "grad_norm": 1.6513658016609747, + "learning_rate": 2.8616995072281996e-06, + "loss": 0.7849, + "step": 7476 + }, + { + "epoch": 0.76, + "grad_norm": 1.8120431805519797, + "learning_rate": 2.8593927250422126e-06, + "loss": 0.7905, + "step": 7477 + }, + { + "epoch": 0.76, + "grad_norm": 1.5569497819936433, + "learning_rate": 2.8570867178574757e-06, + "loss": 0.7119, + "step": 7478 + }, + { + "epoch": 0.76, + "grad_norm": 1.573670506635637, + "learning_rate": 2.854781485924273e-06, + "loss": 0.6166, + "step": 7479 + }, + { + "epoch": 0.76, + "grad_norm": 1.6970547434025136, + "learning_rate": 2.852477029492804e-06, + "loss": 0.8129, + "step": 7480 + }, + { + "epoch": 0.76, + "grad_norm": 1.6728848081611243, + "learning_rate": 2.8501733488131755e-06, + "loss": 0.6801, + "step": 7481 + }, + { + "epoch": 0.76, + "grad_norm": 1.786059631586086, + "learning_rate": 2.847870444135422e-06, + "loss": 0.7519, + "step": 7482 + }, + { + "epoch": 0.76, + "grad_norm": 1.6756514266528544, + "learning_rate": 2.8455683157094827e-06, + "loss": 0.7224, + "step": 7483 + }, + { + "epoch": 0.76, + "grad_norm": 1.5194335545928992, + "learning_rate": 2.8432669637852208e-06, + "loss": 0.5874, + "step": 7484 + }, + { + "epoch": 0.76, + "grad_norm": 1.6876242980806946, + "learning_rate": 2.840966388612413e-06, + "loss": 0.7177, + "step": 7485 + }, + { + "epoch": 0.76, + "grad_norm": 1.8113180493913745, + "learning_rate": 2.838666590440746e-06, + "loss": 0.7245, + "step": 7486 + }, + { + "epoch": 0.76, + "grad_norm": 1.6203584425862203, + "learning_rate": 2.8363675695198322e-06, + "loss": 0.6698, + "step": 7487 + }, + { + "epoch": 0.76, + "grad_norm": 1.4849723093207292, + "learning_rate": 2.83406932609919e-06, + "loss": 0.6125, + "step": 7488 + }, + { + "epoch": 0.76, + "grad_norm": 1.850867969596928, + "learning_rate": 2.8317718604282596e-06, + "loss": 0.6292, + "step": 7489 + }, + { + "epoch": 0.76, + "grad_norm": 1.5714885693578153, + "learning_rate": 2.8294751727563986e-06, + "loss": 0.6172, + "step": 7490 + }, + { + "epoch": 0.76, + "grad_norm": 1.609748364086573, + "learning_rate": 2.82717926333287e-06, + "loss": 0.5915, + "step": 7491 + }, + { + "epoch": 0.76, + "grad_norm": 1.7166256303963017, + "learning_rate": 2.824884132406862e-06, + "loss": 0.6597, + "step": 7492 + }, + { + "epoch": 0.76, + "grad_norm": 1.9578253922637714, + "learning_rate": 2.8225897802274784e-06, + "loss": 0.8443, + "step": 7493 + }, + { + "epoch": 0.76, + "grad_norm": 1.815363824497048, + "learning_rate": 2.820296207043728e-06, + "loss": 0.7102, + "step": 7494 + }, + { + "epoch": 0.76, + "grad_norm": 1.634902000615274, + "learning_rate": 2.8180034131045463e-06, + "loss": 0.7201, + "step": 7495 + }, + { + "epoch": 0.76, + "grad_norm": 1.5772494222413496, + "learning_rate": 2.8157113986587846e-06, + "loss": 0.65, + "step": 7496 + }, + { + "epoch": 0.76, + "grad_norm": 1.5834670448103767, + "learning_rate": 2.8134201639551974e-06, + "loss": 0.6062, + "step": 7497 + }, + { + "epoch": 0.76, + "grad_norm": 1.7440391226487488, + "learning_rate": 2.811129709242465e-06, + "loss": 0.6941, + "step": 7498 + }, + { + "epoch": 0.76, + "grad_norm": 1.7803969915245104, + "learning_rate": 2.8088400347691845e-06, + "loss": 0.6201, + "step": 7499 + }, + { + "epoch": 0.76, + "grad_norm": 1.646599239304442, + "learning_rate": 2.8065511407838584e-06, + "loss": 0.6177, + "step": 7500 + }, + { + "epoch": 0.76, + "grad_norm": 1.6726681217351695, + "learning_rate": 2.8042630275349168e-06, + "loss": 0.7302, + "step": 7501 + }, + { + "epoch": 0.76, + "grad_norm": 1.6567772383571295, + "learning_rate": 2.801975695270691e-06, + "loss": 0.6778, + "step": 7502 + }, + { + "epoch": 0.76, + "grad_norm": 1.7253150467123428, + "learning_rate": 2.7996891442394393e-06, + "loss": 0.7049, + "step": 7503 + }, + { + "epoch": 0.76, + "grad_norm": 1.6743500942376304, + "learning_rate": 2.7974033746893326e-06, + "loss": 0.702, + "step": 7504 + }, + { + "epoch": 0.76, + "grad_norm": 1.6545542816533727, + "learning_rate": 2.7951183868684516e-06, + "loss": 0.8051, + "step": 7505 + }, + { + "epoch": 0.76, + "grad_norm": 1.5076080445687705, + "learning_rate": 2.792834181024798e-06, + "loss": 0.6293, + "step": 7506 + }, + { + "epoch": 0.76, + "grad_norm": 1.7708757256910792, + "learning_rate": 2.7905507574062895e-06, + "loss": 0.7554, + "step": 7507 + }, + { + "epoch": 0.76, + "grad_norm": 1.7475955529744212, + "learning_rate": 2.7882681162607494e-06, + "loss": 0.7865, + "step": 7508 + }, + { + "epoch": 0.76, + "grad_norm": 1.472225771004639, + "learning_rate": 2.7859862578359276e-06, + "loss": 0.6569, + "step": 7509 + }, + { + "epoch": 0.76, + "grad_norm": 1.6686558241730758, + "learning_rate": 2.783705182379485e-06, + "loss": 0.7829, + "step": 7510 + }, + { + "epoch": 0.76, + "grad_norm": 1.695276979117581, + "learning_rate": 2.781424890138992e-06, + "loss": 0.7095, + "step": 7511 + }, + { + "epoch": 0.76, + "grad_norm": 1.646657766974499, + "learning_rate": 2.779145381361942e-06, + "loss": 0.6725, + "step": 7512 + }, + { + "epoch": 0.76, + "grad_norm": 1.783718666727429, + "learning_rate": 2.7768666562957413e-06, + "loss": 0.6186, + "step": 7513 + }, + { + "epoch": 0.76, + "grad_norm": 1.4484205856045251, + "learning_rate": 2.774588715187705e-06, + "loss": 0.5485, + "step": 7514 + }, + { + "epoch": 0.76, + "grad_norm": 1.5731511109970013, + "learning_rate": 2.7723115582850714e-06, + "loss": 0.7341, + "step": 7515 + }, + { + "epoch": 0.76, + "grad_norm": 1.691000533782961, + "learning_rate": 2.7700351858349917e-06, + "loss": 0.6658, + "step": 7516 + }, + { + "epoch": 0.76, + "grad_norm": 1.7362006623624997, + "learning_rate": 2.767759598084526e-06, + "loss": 0.729, + "step": 7517 + }, + { + "epoch": 0.76, + "grad_norm": 1.8996942080774435, + "learning_rate": 2.7654847952806597e-06, + "loss": 0.7411, + "step": 7518 + }, + { + "epoch": 0.76, + "grad_norm": 1.6757634471720677, + "learning_rate": 2.763210777670281e-06, + "loss": 0.7495, + "step": 7519 + }, + { + "epoch": 0.76, + "grad_norm": 1.6398472456760649, + "learning_rate": 2.7609375455002007e-06, + "loss": 0.6899, + "step": 7520 + }, + { + "epoch": 0.77, + "grad_norm": 1.6280316209407817, + "learning_rate": 2.758665099017147e-06, + "loss": 0.7565, + "step": 7521 + }, + { + "epoch": 0.77, + "grad_norm": 1.7555060068451787, + "learning_rate": 2.7563934384677525e-06, + "loss": 0.6801, + "step": 7522 + }, + { + "epoch": 0.77, + "grad_norm": 1.503827561698193, + "learning_rate": 2.7541225640985757e-06, + "loss": 0.6543, + "step": 7523 + }, + { + "epoch": 0.77, + "grad_norm": 1.3849744538887527, + "learning_rate": 2.7518524761560784e-06, + "loss": 0.6735, + "step": 7524 + }, + { + "epoch": 0.77, + "grad_norm": 1.4838992879970025, + "learning_rate": 2.749583174886646e-06, + "loss": 0.6396, + "step": 7525 + }, + { + "epoch": 0.77, + "grad_norm": 1.6575210361822101, + "learning_rate": 2.7473146605365807e-06, + "loss": 0.7453, + "step": 7526 + }, + { + "epoch": 0.77, + "grad_norm": 1.7375430861012504, + "learning_rate": 2.7450469333520856e-06, + "loss": 0.7819, + "step": 7527 + }, + { + "epoch": 0.77, + "grad_norm": 1.6032934711359028, + "learning_rate": 2.7427799935792943e-06, + "loss": 0.7154, + "step": 7528 + }, + { + "epoch": 0.77, + "grad_norm": 1.7372976790322099, + "learning_rate": 2.7405138414642405e-06, + "loss": 0.6819, + "step": 7529 + }, + { + "epoch": 0.77, + "grad_norm": 1.734259457654482, + "learning_rate": 2.738248477252885e-06, + "loss": 0.762, + "step": 7530 + }, + { + "epoch": 0.77, + "grad_norm": 1.5259134220595947, + "learning_rate": 2.7359839011910983e-06, + "loss": 0.6118, + "step": 7531 + }, + { + "epoch": 0.77, + "grad_norm": 1.6101814510917998, + "learning_rate": 2.7337201135246604e-06, + "loss": 0.7868, + "step": 7532 + }, + { + "epoch": 0.77, + "grad_norm": 1.536141528487024, + "learning_rate": 2.731457114499272e-06, + "loss": 0.6493, + "step": 7533 + }, + { + "epoch": 0.77, + "grad_norm": 1.8158612352878118, + "learning_rate": 2.729194904360549e-06, + "loss": 0.7682, + "step": 7534 + }, + { + "epoch": 0.77, + "grad_norm": 1.5484988134903386, + "learning_rate": 2.726933483354014e-06, + "loss": 0.7837, + "step": 7535 + }, + { + "epoch": 0.77, + "grad_norm": 1.6539243202922078, + "learning_rate": 2.7246728517251142e-06, + "loss": 0.6873, + "step": 7536 + }, + { + "epoch": 0.77, + "grad_norm": 1.6073679457280263, + "learning_rate": 2.722413009719199e-06, + "loss": 0.6061, + "step": 7537 + }, + { + "epoch": 0.77, + "grad_norm": 1.6858862021617782, + "learning_rate": 2.720153957581544e-06, + "loss": 0.6199, + "step": 7538 + }, + { + "epoch": 0.77, + "grad_norm": 1.6222117288119169, + "learning_rate": 2.7178956955573367e-06, + "loss": 0.7623, + "step": 7539 + }, + { + "epoch": 0.77, + "grad_norm": 1.733992369513456, + "learning_rate": 2.715638223891668e-06, + "loss": 0.7118, + "step": 7540 + }, + { + "epoch": 0.77, + "grad_norm": 1.4844849837150704, + "learning_rate": 2.7133815428295585e-06, + "loss": 0.6526, + "step": 7541 + }, + { + "epoch": 0.77, + "grad_norm": 1.6291670828286082, + "learning_rate": 2.711125652615929e-06, + "loss": 0.7378, + "step": 7542 + }, + { + "epoch": 0.77, + "grad_norm": 1.6348415045516584, + "learning_rate": 2.708870553495625e-06, + "loss": 0.7867, + "step": 7543 + }, + { + "epoch": 0.77, + "grad_norm": 1.6860056402117856, + "learning_rate": 2.706616245713406e-06, + "loss": 0.6777, + "step": 7544 + }, + { + "epoch": 0.77, + "grad_norm": 1.604620396060263, + "learning_rate": 2.704362729513934e-06, + "loss": 0.712, + "step": 7545 + }, + { + "epoch": 0.77, + "grad_norm": 1.4305280672456266, + "learning_rate": 2.7021100051417994e-06, + "loss": 0.7249, + "step": 7546 + }, + { + "epoch": 0.77, + "grad_norm": 1.7898274507982526, + "learning_rate": 2.6998580728414948e-06, + "loss": 0.7568, + "step": 7547 + }, + { + "epoch": 0.77, + "grad_norm": 1.6675353834219373, + "learning_rate": 2.697606932857435e-06, + "loss": 0.705, + "step": 7548 + }, + { + "epoch": 0.77, + "grad_norm": 1.6749144420557143, + "learning_rate": 2.695356585433949e-06, + "loss": 0.8273, + "step": 7549 + }, + { + "epoch": 0.77, + "grad_norm": 1.6316908542704445, + "learning_rate": 2.693107030815271e-06, + "loss": 0.6924, + "step": 7550 + }, + { + "epoch": 0.77, + "grad_norm": 1.7561594210981057, + "learning_rate": 2.6908582692455575e-06, + "loss": 0.7581, + "step": 7551 + }, + { + "epoch": 0.77, + "grad_norm": 1.6024697973330937, + "learning_rate": 2.688610300968879e-06, + "loss": 0.6427, + "step": 7552 + }, + { + "epoch": 0.77, + "grad_norm": 1.6147055907512202, + "learning_rate": 2.6863631262292136e-06, + "loss": 0.7513, + "step": 7553 + }, + { + "epoch": 0.77, + "grad_norm": 1.652596883620534, + "learning_rate": 2.6841167452704597e-06, + "loss": 0.6232, + "step": 7554 + }, + { + "epoch": 0.77, + "grad_norm": 1.783816422018661, + "learning_rate": 2.681871158336423e-06, + "loss": 0.7052, + "step": 7555 + }, + { + "epoch": 0.77, + "grad_norm": 1.7142749273013935, + "learning_rate": 2.6796263656708297e-06, + "loss": 0.7456, + "step": 7556 + }, + { + "epoch": 0.77, + "grad_norm": 1.6310600719430486, + "learning_rate": 2.6773823675173195e-06, + "loss": 0.6341, + "step": 7557 + }, + { + "epoch": 0.77, + "grad_norm": 1.7806935404941204, + "learning_rate": 2.6751391641194378e-06, + "loss": 0.7257, + "step": 7558 + }, + { + "epoch": 0.77, + "grad_norm": 1.6827389065638336, + "learning_rate": 2.6728967557206544e-06, + "loss": 0.6987, + "step": 7559 + }, + { + "epoch": 0.77, + "grad_norm": 1.6514931921887344, + "learning_rate": 2.670655142564341e-06, + "loss": 0.7026, + "step": 7560 + }, + { + "epoch": 0.77, + "grad_norm": 1.616645078206799, + "learning_rate": 2.6684143248937955e-06, + "loss": 0.7544, + "step": 7561 + }, + { + "epoch": 0.77, + "grad_norm": 1.7718859583686468, + "learning_rate": 2.6661743029522236e-06, + "loss": 0.6108, + "step": 7562 + }, + { + "epoch": 0.77, + "grad_norm": 1.68104639669912, + "learning_rate": 2.663935076982741e-06, + "loss": 0.8155, + "step": 7563 + }, + { + "epoch": 0.77, + "grad_norm": 1.8021958735996275, + "learning_rate": 2.6616966472283843e-06, + "loss": 0.7101, + "step": 7564 + }, + { + "epoch": 0.77, + "grad_norm": 1.5721868545560538, + "learning_rate": 2.6594590139320962e-06, + "loss": 0.6331, + "step": 7565 + }, + { + "epoch": 0.77, + "grad_norm": 1.777546054232048, + "learning_rate": 2.6572221773367392e-06, + "loss": 0.7279, + "step": 7566 + }, + { + "epoch": 0.77, + "grad_norm": 1.845961818905768, + "learning_rate": 2.6549861376850883e-06, + "loss": 0.7134, + "step": 7567 + }, + { + "epoch": 0.77, + "grad_norm": 1.6594238425370311, + "learning_rate": 2.6527508952198276e-06, + "loss": 0.7744, + "step": 7568 + }, + { + "epoch": 0.77, + "grad_norm": 1.5296321223944114, + "learning_rate": 2.650516450183558e-06, + "loss": 0.6848, + "step": 7569 + }, + { + "epoch": 0.77, + "grad_norm": 1.5650590985861443, + "learning_rate": 2.6482828028187977e-06, + "loss": 0.6022, + "step": 7570 + }, + { + "epoch": 0.77, + "grad_norm": 1.7019456186669477, + "learning_rate": 2.6460499533679685e-06, + "loss": 0.6716, + "step": 7571 + }, + { + "epoch": 0.77, + "grad_norm": 1.716828661162016, + "learning_rate": 2.6438179020734165e-06, + "loss": 0.6754, + "step": 7572 + }, + { + "epoch": 0.77, + "grad_norm": 1.5025504069938012, + "learning_rate": 2.641586649177391e-06, + "loss": 0.7557, + "step": 7573 + }, + { + "epoch": 0.77, + "grad_norm": 1.6821351398535194, + "learning_rate": 2.6393561949220625e-06, + "loss": 0.6906, + "step": 7574 + }, + { + "epoch": 0.77, + "grad_norm": 1.6593300718919421, + "learning_rate": 2.637126539549514e-06, + "loss": 0.7463, + "step": 7575 + }, + { + "epoch": 0.77, + "grad_norm": 2.0347185767831557, + "learning_rate": 2.6348976833017336e-06, + "loss": 0.7112, + "step": 7576 + }, + { + "epoch": 0.77, + "grad_norm": 1.7191074392544898, + "learning_rate": 2.6326696264206352e-06, + "loss": 0.8615, + "step": 7577 + }, + { + "epoch": 0.77, + "grad_norm": 1.8354634059261874, + "learning_rate": 2.630442369148034e-06, + "loss": 0.7765, + "step": 7578 + }, + { + "epoch": 0.77, + "grad_norm": 1.61215000399093, + "learning_rate": 2.628215911725668e-06, + "loss": 0.7777, + "step": 7579 + }, + { + "epoch": 0.77, + "grad_norm": 1.6966535322409748, + "learning_rate": 2.6259902543951844e-06, + "loss": 0.7105, + "step": 7580 + }, + { + "epoch": 0.77, + "grad_norm": 1.6420719462317774, + "learning_rate": 2.6237653973981393e-06, + "loss": 0.6423, + "step": 7581 + }, + { + "epoch": 0.77, + "grad_norm": 1.729672866162764, + "learning_rate": 2.6215413409760113e-06, + "loss": 0.755, + "step": 7582 + }, + { + "epoch": 0.77, + "grad_norm": 1.8109345442166387, + "learning_rate": 2.6193180853701825e-06, + "loss": 0.7652, + "step": 7583 + }, + { + "epoch": 0.77, + "grad_norm": 1.670833498715669, + "learning_rate": 2.617095630821955e-06, + "loss": 0.7259, + "step": 7584 + }, + { + "epoch": 0.77, + "grad_norm": 1.626731046412688, + "learning_rate": 2.6148739775725427e-06, + "loss": 0.6965, + "step": 7585 + }, + { + "epoch": 0.77, + "grad_norm": 1.7216672219356004, + "learning_rate": 2.6126531258630682e-06, + "loss": 0.5594, + "step": 7586 + }, + { + "epoch": 0.77, + "grad_norm": 1.6056511431005087, + "learning_rate": 2.6104330759345707e-06, + "loss": 0.7508, + "step": 7587 + }, + { + "epoch": 0.77, + "grad_norm": 1.727656277588795, + "learning_rate": 2.6082138280280068e-06, + "loss": 0.703, + "step": 7588 + }, + { + "epoch": 0.77, + "grad_norm": 1.5854088043978658, + "learning_rate": 2.6059953823842333e-06, + "loss": 0.6803, + "step": 7589 + }, + { + "epoch": 0.77, + "grad_norm": 1.599543277038824, + "learning_rate": 2.6037777392440346e-06, + "loss": 0.6838, + "step": 7590 + }, + { + "epoch": 0.77, + "grad_norm": 1.4815510408464663, + "learning_rate": 2.6015608988480956e-06, + "loss": 0.6471, + "step": 7591 + }, + { + "epoch": 0.77, + "grad_norm": 1.789488671669432, + "learning_rate": 2.599344861437023e-06, + "loss": 0.8288, + "step": 7592 + }, + { + "epoch": 0.77, + "grad_norm": 1.7897867030103445, + "learning_rate": 2.5971296272513347e-06, + "loss": 0.6958, + "step": 7593 + }, + { + "epoch": 0.77, + "grad_norm": 1.7961417034922662, + "learning_rate": 2.5949151965314547e-06, + "loss": 0.6808, + "step": 7594 + }, + { + "epoch": 0.77, + "grad_norm": 1.4180384475284087, + "learning_rate": 2.5927015695177305e-06, + "loss": 0.6841, + "step": 7595 + }, + { + "epoch": 0.77, + "grad_norm": 1.6465686467448446, + "learning_rate": 2.5904887464504115e-06, + "loss": 0.6897, + "step": 7596 + }, + { + "epoch": 0.77, + "grad_norm": 1.6561505477920748, + "learning_rate": 2.5882767275696663e-06, + "loss": 0.7551, + "step": 7597 + }, + { + "epoch": 0.77, + "grad_norm": 1.6116171921167224, + "learning_rate": 2.5860655131155788e-06, + "loss": 0.6677, + "step": 7598 + }, + { + "epoch": 0.77, + "grad_norm": 1.7015043649888153, + "learning_rate": 2.5838551033281366e-06, + "loss": 0.7358, + "step": 7599 + }, + { + "epoch": 0.77, + "grad_norm": 1.6427765139219823, + "learning_rate": 2.581645498447247e-06, + "loss": 0.7323, + "step": 7600 + }, + { + "epoch": 0.77, + "grad_norm": 1.6021299536685927, + "learning_rate": 2.579436698712732e-06, + "loss": 0.7018, + "step": 7601 + }, + { + "epoch": 0.77, + "grad_norm": 1.544168120396529, + "learning_rate": 2.577228704364314e-06, + "loss": 0.7018, + "step": 7602 + }, + { + "epoch": 0.77, + "grad_norm": 1.5393286785916402, + "learning_rate": 2.575021515641646e-06, + "loss": 0.6846, + "step": 7603 + }, + { + "epoch": 0.77, + "grad_norm": 1.5298286568053745, + "learning_rate": 2.572815132784274e-06, + "loss": 0.56, + "step": 7604 + }, + { + "epoch": 0.77, + "grad_norm": 1.846411894237571, + "learning_rate": 2.570609556031672e-06, + "loss": 0.7396, + "step": 7605 + }, + { + "epoch": 0.77, + "grad_norm": 1.7371574415157183, + "learning_rate": 2.568404785623224e-06, + "loss": 0.7063, + "step": 7606 + }, + { + "epoch": 0.77, + "grad_norm": 1.6308986377759827, + "learning_rate": 2.566200821798216e-06, + "loss": 0.6815, + "step": 7607 + }, + { + "epoch": 0.77, + "grad_norm": 1.6082901733996213, + "learning_rate": 2.5639976647958564e-06, + "loss": 0.6715, + "step": 7608 + }, + { + "epoch": 0.77, + "grad_norm": 1.7413064688558009, + "learning_rate": 2.5617953148552677e-06, + "loss": 0.7327, + "step": 7609 + }, + { + "epoch": 0.77, + "grad_norm": 1.5715591340768313, + "learning_rate": 2.559593772215474e-06, + "loss": 0.6851, + "step": 7610 + }, + { + "epoch": 0.77, + "grad_norm": 1.7550216930518252, + "learning_rate": 2.557393037115421e-06, + "loss": 0.7452, + "step": 7611 + }, + { + "epoch": 0.77, + "grad_norm": 1.4456545312409386, + "learning_rate": 2.5551931097939676e-06, + "loss": 0.5707, + "step": 7612 + }, + { + "epoch": 0.77, + "grad_norm": 1.5996138048373714, + "learning_rate": 2.552993990489876e-06, + "loss": 0.7159, + "step": 7613 + }, + { + "epoch": 0.77, + "grad_norm": 1.526924196735615, + "learning_rate": 2.5507956794418275e-06, + "loss": 0.6277, + "step": 7614 + }, + { + "epoch": 0.77, + "grad_norm": 1.6733090546228477, + "learning_rate": 2.548598176888419e-06, + "loss": 0.7264, + "step": 7615 + }, + { + "epoch": 0.77, + "grad_norm": 1.5680082178643975, + "learning_rate": 2.5464014830681496e-06, + "loss": 0.7707, + "step": 7616 + }, + { + "epoch": 0.77, + "grad_norm": 1.7232490509422664, + "learning_rate": 2.544205598219437e-06, + "loss": 0.7231, + "step": 7617 + }, + { + "epoch": 0.77, + "grad_norm": 1.7050713253171732, + "learning_rate": 2.5420105225806134e-06, + "loss": 0.6982, + "step": 7618 + }, + { + "epoch": 0.77, + "grad_norm": 1.6466518129271135, + "learning_rate": 2.5398162563899154e-06, + "loss": 0.7152, + "step": 7619 + }, + { + "epoch": 0.78, + "grad_norm": 1.640540070141524, + "learning_rate": 2.5376227998854987e-06, + "loss": 0.6974, + "step": 7620 + }, + { + "epoch": 0.78, + "grad_norm": 1.6753118861849827, + "learning_rate": 2.5354301533054315e-06, + "loss": 0.7223, + "step": 7621 + }, + { + "epoch": 0.78, + "grad_norm": 1.6313335673420037, + "learning_rate": 2.5332383168876853e-06, + "loss": 0.6825, + "step": 7622 + }, + { + "epoch": 0.78, + "grad_norm": 1.856265822522839, + "learning_rate": 2.5310472908701555e-06, + "loss": 0.7676, + "step": 7623 + }, + { + "epoch": 0.78, + "grad_norm": 1.5648429901605199, + "learning_rate": 2.5288570754906374e-06, + "loss": 0.7181, + "step": 7624 + }, + { + "epoch": 0.78, + "grad_norm": 1.5724235897268772, + "learning_rate": 2.5266676709868486e-06, + "loss": 0.697, + "step": 7625 + }, + { + "epoch": 0.78, + "grad_norm": 1.6165299290103299, + "learning_rate": 2.524479077596418e-06, + "loss": 0.7445, + "step": 7626 + }, + { + "epoch": 0.78, + "grad_norm": 1.7399987056517636, + "learning_rate": 2.5222912955568757e-06, + "loss": 0.7388, + "step": 7627 + }, + { + "epoch": 0.78, + "grad_norm": 1.5705171211115758, + "learning_rate": 2.5201043251056756e-06, + "loss": 0.7054, + "step": 7628 + }, + { + "epoch": 0.78, + "grad_norm": 1.7790089942542553, + "learning_rate": 2.517918166480181e-06, + "loss": 0.7001, + "step": 7629 + }, + { + "epoch": 0.78, + "grad_norm": 1.6494147915260875, + "learning_rate": 2.51573281991766e-06, + "loss": 0.7247, + "step": 7630 + }, + { + "epoch": 0.78, + "grad_norm": 1.5286186025147486, + "learning_rate": 2.5135482856553027e-06, + "loss": 0.6604, + "step": 7631 + }, + { + "epoch": 0.78, + "grad_norm": 1.7716342052578273, + "learning_rate": 2.511364563930202e-06, + "loss": 0.7095, + "step": 7632 + }, + { + "epoch": 0.78, + "grad_norm": 1.5227796689934394, + "learning_rate": 2.509181654979368e-06, + "loss": 0.6938, + "step": 7633 + }, + { + "epoch": 0.78, + "grad_norm": 1.7793672610158484, + "learning_rate": 2.5069995590397255e-06, + "loss": 0.7239, + "step": 7634 + }, + { + "epoch": 0.78, + "grad_norm": 1.7725492488200256, + "learning_rate": 2.504818276348101e-06, + "loss": 0.6739, + "step": 7635 + }, + { + "epoch": 0.78, + "grad_norm": 1.6668210564804644, + "learning_rate": 2.5026378071412428e-06, + "loss": 0.653, + "step": 7636 + }, + { + "epoch": 0.78, + "grad_norm": 1.543062332712173, + "learning_rate": 2.5004581516558033e-06, + "loss": 0.509, + "step": 7637 + }, + { + "epoch": 0.78, + "grad_norm": 1.767271637712074, + "learning_rate": 2.498279310128351e-06, + "loss": 0.6499, + "step": 7638 + }, + { + "epoch": 0.78, + "grad_norm": 1.6216565293780412, + "learning_rate": 2.496101282795369e-06, + "loss": 0.7518, + "step": 7639 + }, + { + "epoch": 0.78, + "grad_norm": 1.85997349227327, + "learning_rate": 2.493924069893241e-06, + "loss": 0.6976, + "step": 7640 + }, + { + "epoch": 0.78, + "grad_norm": 1.661397393733398, + "learning_rate": 2.4917476716582776e-06, + "loss": 0.7266, + "step": 7641 + }, + { + "epoch": 0.78, + "grad_norm": 1.5895203634704989, + "learning_rate": 2.4895720883266848e-06, + "loss": 0.6325, + "step": 7642 + }, + { + "epoch": 0.78, + "grad_norm": 1.680372868464412, + "learning_rate": 2.4873973201345924e-06, + "loss": 0.7073, + "step": 7643 + }, + { + "epoch": 0.78, + "grad_norm": 1.6119049547484752, + "learning_rate": 2.4852233673180394e-06, + "loss": 0.5843, + "step": 7644 + }, + { + "epoch": 0.78, + "grad_norm": 1.6759906839769962, + "learning_rate": 2.483050230112969e-06, + "loss": 0.7405, + "step": 7645 + }, + { + "epoch": 0.78, + "grad_norm": 1.8364949521475253, + "learning_rate": 2.480877908755245e-06, + "loss": 0.72, + "step": 7646 + }, + { + "epoch": 0.78, + "grad_norm": 1.6927374995881244, + "learning_rate": 2.4787064034806397e-06, + "loss": 0.779, + "step": 7647 + }, + { + "epoch": 0.78, + "grad_norm": 1.5775482181187785, + "learning_rate": 2.4765357145248327e-06, + "loss": 0.6775, + "step": 7648 + }, + { + "epoch": 0.78, + "grad_norm": 1.9862724855522034, + "learning_rate": 2.474365842123422e-06, + "loss": 0.7681, + "step": 7649 + }, + { + "epoch": 0.78, + "grad_norm": 1.5724179992615193, + "learning_rate": 2.472196786511909e-06, + "loss": 0.6553, + "step": 7650 + }, + { + "epoch": 0.78, + "grad_norm": 1.711717378165331, + "learning_rate": 2.4700285479257125e-06, + "loss": 0.6962, + "step": 7651 + }, + { + "epoch": 0.78, + "grad_norm": 1.6741527308645596, + "learning_rate": 2.467861126600165e-06, + "loss": 0.6617, + "step": 7652 + }, + { + "epoch": 0.78, + "grad_norm": 1.7115676529691866, + "learning_rate": 2.4656945227705e-06, + "loss": 0.7768, + "step": 7653 + }, + { + "epoch": 0.78, + "grad_norm": 1.575870000219261, + "learning_rate": 2.463528736671874e-06, + "loss": 0.6564, + "step": 7654 + }, + { + "epoch": 0.78, + "grad_norm": 1.6279772119924147, + "learning_rate": 2.4613637685393433e-06, + "loss": 0.704, + "step": 7655 + }, + { + "epoch": 0.78, + "grad_norm": 1.601742728230388, + "learning_rate": 2.459199618607885e-06, + "loss": 0.6312, + "step": 7656 + }, + { + "epoch": 0.78, + "grad_norm": 1.6462873763439139, + "learning_rate": 2.4570362871123856e-06, + "loss": 0.745, + "step": 7657 + }, + { + "epoch": 0.78, + "grad_norm": 1.7267682492023038, + "learning_rate": 2.454873774287635e-06, + "loss": 0.7738, + "step": 7658 + }, + { + "epoch": 0.78, + "grad_norm": 1.8839223215834573, + "learning_rate": 2.452712080368348e-06, + "loss": 0.7788, + "step": 7659 + }, + { + "epoch": 0.78, + "grad_norm": 1.6246102550786494, + "learning_rate": 2.4505512055891345e-06, + "loss": 0.7169, + "step": 7660 + }, + { + "epoch": 0.78, + "grad_norm": 1.586426427748253, + "learning_rate": 2.448391150184527e-06, + "loss": 0.7218, + "step": 7661 + }, + { + "epoch": 0.78, + "grad_norm": 1.6988982131523829, + "learning_rate": 2.44623191438897e-06, + "loss": 0.6827, + "step": 7662 + }, + { + "epoch": 0.78, + "grad_norm": 1.665934565217917, + "learning_rate": 2.4440734984368075e-06, + "loss": 0.596, + "step": 7663 + }, + { + "epoch": 0.78, + "grad_norm": 1.603377046407116, + "learning_rate": 2.441915902562305e-06, + "loss": 0.6256, + "step": 7664 + }, + { + "epoch": 0.78, + "grad_norm": 1.5480250415462409, + "learning_rate": 2.439759126999639e-06, + "loss": 0.6626, + "step": 7665 + }, + { + "epoch": 0.78, + "grad_norm": 1.5877128801469451, + "learning_rate": 2.4376031719828885e-06, + "loss": 0.5916, + "step": 7666 + }, + { + "epoch": 0.78, + "grad_norm": 1.631419448675794, + "learning_rate": 2.4354480377460544e-06, + "loss": 0.665, + "step": 7667 + }, + { + "epoch": 0.78, + "grad_norm": 1.5382330429891682, + "learning_rate": 2.433293724523036e-06, + "loss": 0.6861, + "step": 7668 + }, + { + "epoch": 0.78, + "grad_norm": 1.7312481669876751, + "learning_rate": 2.431140232547653e-06, + "loss": 0.7801, + "step": 7669 + }, + { + "epoch": 0.78, + "grad_norm": 1.9421959679511438, + "learning_rate": 2.4289875620536375e-06, + "loss": 0.7, + "step": 7670 + }, + { + "epoch": 0.78, + "grad_norm": 1.691814528916687, + "learning_rate": 2.426835713274622e-06, + "loss": 0.6839, + "step": 7671 + }, + { + "epoch": 0.78, + "grad_norm": 1.5587877855016785, + "learning_rate": 2.424684686444162e-06, + "loss": 0.6143, + "step": 7672 + }, + { + "epoch": 0.78, + "grad_norm": 1.6222870119900274, + "learning_rate": 2.422534481795711e-06, + "loss": 0.6179, + "step": 7673 + }, + { + "epoch": 0.78, + "grad_norm": 1.6791909756698284, + "learning_rate": 2.420385099562644e-06, + "loss": 0.6535, + "step": 7674 + }, + { + "epoch": 0.78, + "grad_norm": 1.6326619615601652, + "learning_rate": 2.4182365399782448e-06, + "loss": 0.7029, + "step": 7675 + }, + { + "epoch": 0.78, + "grad_norm": 1.5625829982073207, + "learning_rate": 2.4160888032757014e-06, + "loss": 0.7023, + "step": 7676 + }, + { + "epoch": 0.78, + "grad_norm": 1.5667168902829505, + "learning_rate": 2.413941889688123e-06, + "loss": 0.7171, + "step": 7677 + }, + { + "epoch": 0.78, + "grad_norm": 1.6014556397474373, + "learning_rate": 2.4117957994485154e-06, + "loss": 0.692, + "step": 7678 + }, + { + "epoch": 0.78, + "grad_norm": 1.8147424714482825, + "learning_rate": 2.409650532789808e-06, + "loss": 0.7188, + "step": 7679 + }, + { + "epoch": 0.78, + "grad_norm": 1.645615433366562, + "learning_rate": 2.4075060899448388e-06, + "loss": 0.6687, + "step": 7680 + }, + { + "epoch": 0.78, + "grad_norm": 1.6718486820675713, + "learning_rate": 2.4053624711463476e-06, + "loss": 0.6324, + "step": 7681 + }, + { + "epoch": 0.78, + "grad_norm": 1.8500146601648426, + "learning_rate": 2.4032196766269924e-06, + "loss": 0.7548, + "step": 7682 + }, + { + "epoch": 0.78, + "grad_norm": 1.580356225872168, + "learning_rate": 2.4010777066193437e-06, + "loss": 0.7034, + "step": 7683 + }, + { + "epoch": 0.78, + "grad_norm": 1.5901655451727288, + "learning_rate": 2.3989365613558745e-06, + "loss": 0.8001, + "step": 7684 + }, + { + "epoch": 0.78, + "grad_norm": 1.6629579059500412, + "learning_rate": 2.396796241068976e-06, + "loss": 0.696, + "step": 7685 + }, + { + "epoch": 0.78, + "grad_norm": 1.822392458462985, + "learning_rate": 2.394656745990942e-06, + "loss": 0.7652, + "step": 7686 + }, + { + "epoch": 0.78, + "grad_norm": 1.62098466809815, + "learning_rate": 2.3925180763539847e-06, + "loss": 0.6656, + "step": 7687 + }, + { + "epoch": 0.78, + "grad_norm": 1.4926140404062171, + "learning_rate": 2.3903802323902246e-06, + "loss": 0.6894, + "step": 7688 + }, + { + "epoch": 0.78, + "grad_norm": 1.5855805740664932, + "learning_rate": 2.388243214331686e-06, + "loss": 0.6984, + "step": 7689 + }, + { + "epoch": 0.78, + "grad_norm": 1.6375661414055525, + "learning_rate": 2.3861070224103156e-06, + "loss": 0.5445, + "step": 7690 + }, + { + "epoch": 0.78, + "grad_norm": 1.593248155857221, + "learning_rate": 2.3839716568579563e-06, + "loss": 0.6813, + "step": 7691 + }, + { + "epoch": 0.78, + "grad_norm": 1.6421393986690094, + "learning_rate": 2.3818371179063728e-06, + "loss": 0.6451, + "step": 7692 + }, + { + "epoch": 0.78, + "grad_norm": 1.7259233059031243, + "learning_rate": 2.379703405787238e-06, + "loss": 0.6793, + "step": 7693 + }, + { + "epoch": 0.78, + "grad_norm": 1.7278740003752728, + "learning_rate": 2.3775705207321275e-06, + "loss": 0.6303, + "step": 7694 + }, + { + "epoch": 0.78, + "grad_norm": 1.6994493656161245, + "learning_rate": 2.3754384629725392e-06, + "loss": 0.7465, + "step": 7695 + }, + { + "epoch": 0.78, + "grad_norm": 1.3677942954050732, + "learning_rate": 2.373307232739868e-06, + "loss": 0.6011, + "step": 7696 + }, + { + "epoch": 0.78, + "grad_norm": 1.5774428491429453, + "learning_rate": 2.371176830265427e-06, + "loss": 0.6296, + "step": 7697 + }, + { + "epoch": 0.78, + "grad_norm": 1.6453817635216235, + "learning_rate": 2.3690472557804436e-06, + "loss": 0.6988, + "step": 7698 + }, + { + "epoch": 0.78, + "grad_norm": 1.4742886891745193, + "learning_rate": 2.366918509516043e-06, + "loss": 0.676, + "step": 7699 + }, + { + "epoch": 0.78, + "grad_norm": 1.7364301584235993, + "learning_rate": 2.36479059170327e-06, + "loss": 0.6986, + "step": 7700 + }, + { + "epoch": 0.78, + "grad_norm": 1.6897643385036853, + "learning_rate": 2.3626635025730803e-06, + "loss": 0.7172, + "step": 7701 + }, + { + "epoch": 0.78, + "grad_norm": 1.7457534652300224, + "learning_rate": 2.360537242356329e-06, + "loss": 0.6111, + "step": 7702 + }, + { + "epoch": 0.78, + "grad_norm": 1.7300879418703208, + "learning_rate": 2.358411811283796e-06, + "loss": 0.8058, + "step": 7703 + }, + { + "epoch": 0.78, + "grad_norm": 1.6630483822750175, + "learning_rate": 2.356287209586157e-06, + "loss": 0.6579, + "step": 7704 + }, + { + "epoch": 0.78, + "grad_norm": 1.7146054640600066, + "learning_rate": 2.354163437494006e-06, + "loss": 0.6689, + "step": 7705 + }, + { + "epoch": 0.78, + "grad_norm": 1.7399670370764073, + "learning_rate": 2.3520404952378495e-06, + "loss": 0.7783, + "step": 7706 + }, + { + "epoch": 0.78, + "grad_norm": 1.82897046288453, + "learning_rate": 2.349918383048094e-06, + "loss": 0.6876, + "step": 7707 + }, + { + "epoch": 0.78, + "grad_norm": 1.55184455103855, + "learning_rate": 2.347797101155067e-06, + "loss": 0.635, + "step": 7708 + }, + { + "epoch": 0.78, + "grad_norm": 1.6004898009171504, + "learning_rate": 2.345676649788995e-06, + "loss": 0.6636, + "step": 7709 + }, + { + "epoch": 0.78, + "grad_norm": 1.6645234770267519, + "learning_rate": 2.3435570291800214e-06, + "loss": 0.6275, + "step": 7710 + }, + { + "epoch": 0.78, + "grad_norm": 1.6193108195328993, + "learning_rate": 2.3414382395582024e-06, + "loss": 0.6788, + "step": 7711 + }, + { + "epoch": 0.78, + "grad_norm": 1.6830045676431642, + "learning_rate": 2.339320281153494e-06, + "loss": 0.6558, + "step": 7712 + }, + { + "epoch": 0.78, + "grad_norm": 1.5169238635880982, + "learning_rate": 2.3372031541957684e-06, + "loss": 0.6622, + "step": 7713 + }, + { + "epoch": 0.78, + "grad_norm": 1.6380575150031764, + "learning_rate": 2.335086858914811e-06, + "loss": 0.7452, + "step": 7714 + }, + { + "epoch": 0.78, + "grad_norm": 1.834230091319366, + "learning_rate": 2.3329713955403067e-06, + "loss": 0.6683, + "step": 7715 + }, + { + "epoch": 0.78, + "grad_norm": 1.585914593940905, + "learning_rate": 2.3308567643018574e-06, + "loss": 0.6325, + "step": 7716 + }, + { + "epoch": 0.78, + "grad_norm": 1.5950944991975575, + "learning_rate": 2.3287429654289774e-06, + "loss": 0.7191, + "step": 7717 + }, + { + "epoch": 0.79, + "grad_norm": 1.7535720455581632, + "learning_rate": 2.326629999151081e-06, + "loss": 0.6727, + "step": 7718 + }, + { + "epoch": 0.79, + "grad_norm": 1.650372784385992, + "learning_rate": 2.324517865697501e-06, + "loss": 0.651, + "step": 7719 + }, + { + "epoch": 0.79, + "grad_norm": 1.6714604758276976, + "learning_rate": 2.322406565297477e-06, + "loss": 0.6351, + "step": 7720 + }, + { + "epoch": 0.79, + "grad_norm": 1.8328149534027336, + "learning_rate": 2.320296098180154e-06, + "loss": 0.6832, + "step": 7721 + }, + { + "epoch": 0.79, + "grad_norm": 1.6831044613699842, + "learning_rate": 2.318186464574592e-06, + "loss": 0.6869, + "step": 7722 + }, + { + "epoch": 0.79, + "grad_norm": 1.7370014886232752, + "learning_rate": 2.316077664709763e-06, + "loss": 0.7249, + "step": 7723 + }, + { + "epoch": 0.79, + "grad_norm": 1.5167056453499903, + "learning_rate": 2.3139696988145367e-06, + "loss": 0.6399, + "step": 7724 + }, + { + "epoch": 0.79, + "grad_norm": 1.7606882595223425, + "learning_rate": 2.311862567117703e-06, + "loss": 0.6867, + "step": 7725 + }, + { + "epoch": 0.79, + "grad_norm": 1.7430394028378702, + "learning_rate": 2.3097562698479625e-06, + "loss": 0.722, + "step": 7726 + }, + { + "epoch": 0.79, + "grad_norm": 1.8257985091538622, + "learning_rate": 2.307650807233913e-06, + "loss": 0.735, + "step": 7727 + }, + { + "epoch": 0.79, + "grad_norm": 1.370233037411586, + "learning_rate": 2.3055461795040745e-06, + "loss": 0.6408, + "step": 7728 + }, + { + "epoch": 0.79, + "grad_norm": 1.9713638047994393, + "learning_rate": 2.3034423868868728e-06, + "loss": 0.7296, + "step": 7729 + }, + { + "epoch": 0.79, + "grad_norm": 1.5408962439710896, + "learning_rate": 2.301339429610637e-06, + "loss": 0.6946, + "step": 7730 + }, + { + "epoch": 0.79, + "grad_norm": 1.7594833023077556, + "learning_rate": 2.2992373079036146e-06, + "loss": 0.7895, + "step": 7731 + }, + { + "epoch": 0.79, + "grad_norm": 1.8144959438221029, + "learning_rate": 2.2971360219939543e-06, + "loss": 0.7169, + "step": 7732 + }, + { + "epoch": 0.79, + "grad_norm": 1.6810027650585382, + "learning_rate": 2.295035572109718e-06, + "loss": 0.5644, + "step": 7733 + }, + { + "epoch": 0.79, + "grad_norm": 1.7103028604041395, + "learning_rate": 2.2929359584788814e-06, + "loss": 0.74, + "step": 7734 + }, + { + "epoch": 0.79, + "grad_norm": 1.66925295250059, + "learning_rate": 2.2908371813293195e-06, + "loss": 0.6676, + "step": 7735 + }, + { + "epoch": 0.79, + "grad_norm": 1.7461573353125468, + "learning_rate": 2.2887392408888233e-06, + "loss": 0.8088, + "step": 7736 + }, + { + "epoch": 0.79, + "grad_norm": 1.6376307701564305, + "learning_rate": 2.2866421373850943e-06, + "loss": 0.7629, + "step": 7737 + }, + { + "epoch": 0.79, + "grad_norm": 1.536706107384853, + "learning_rate": 2.284545871045736e-06, + "loss": 0.669, + "step": 7738 + }, + { + "epoch": 0.79, + "grad_norm": 1.5271382033522722, + "learning_rate": 2.2824504420982683e-06, + "loss": 0.5886, + "step": 7739 + }, + { + "epoch": 0.79, + "grad_norm": 1.7006083419959541, + "learning_rate": 2.280355850770115e-06, + "loss": 0.7803, + "step": 7740 + }, + { + "epoch": 0.79, + "grad_norm": 1.5722117495050347, + "learning_rate": 2.2782620972886116e-06, + "loss": 0.6991, + "step": 7741 + }, + { + "epoch": 0.79, + "grad_norm": 1.4054192798777325, + "learning_rate": 2.276169181881006e-06, + "loss": 0.6412, + "step": 7742 + }, + { + "epoch": 0.79, + "grad_norm": 1.6087736248439393, + "learning_rate": 2.274077104774446e-06, + "loss": 0.6813, + "step": 7743 + }, + { + "epoch": 0.79, + "grad_norm": 1.7157752209095043, + "learning_rate": 2.2719858661959994e-06, + "loss": 0.7376, + "step": 7744 + }, + { + "epoch": 0.79, + "grad_norm": 1.514585889933833, + "learning_rate": 2.26989546637263e-06, + "loss": 0.6464, + "step": 7745 + }, + { + "epoch": 0.79, + "grad_norm": 1.4337615493436142, + "learning_rate": 2.267805905531224e-06, + "loss": 0.6228, + "step": 7746 + }, + { + "epoch": 0.79, + "grad_norm": 1.6849361457896674, + "learning_rate": 2.2657171838985714e-06, + "loss": 0.722, + "step": 7747 + }, + { + "epoch": 0.79, + "grad_norm": 1.6944312615856458, + "learning_rate": 2.2636293017013646e-06, + "loss": 0.7142, + "step": 7748 + }, + { + "epoch": 0.79, + "grad_norm": 1.879071950573806, + "learning_rate": 2.2615422591662175e-06, + "loss": 0.7979, + "step": 7749 + }, + { + "epoch": 0.79, + "grad_norm": 1.614926120698719, + "learning_rate": 2.259456056519639e-06, + "loss": 0.6204, + "step": 7750 + }, + { + "epoch": 0.79, + "grad_norm": 1.8507881781454831, + "learning_rate": 2.257370693988056e-06, + "loss": 0.7752, + "step": 7751 + }, + { + "epoch": 0.79, + "grad_norm": 1.6303593430397691, + "learning_rate": 2.255286171797807e-06, + "loss": 0.6652, + "step": 7752 + }, + { + "epoch": 0.79, + "grad_norm": 1.7243678063678334, + "learning_rate": 2.2532024901751273e-06, + "loss": 0.7451, + "step": 7753 + }, + { + "epoch": 0.79, + "grad_norm": 1.6780366300873248, + "learning_rate": 2.251119649346173e-06, + "loss": 0.676, + "step": 7754 + }, + { + "epoch": 0.79, + "grad_norm": 1.7377231196349392, + "learning_rate": 2.249037649536999e-06, + "loss": 0.7033, + "step": 7755 + }, + { + "epoch": 0.79, + "grad_norm": 1.4944129500941656, + "learning_rate": 2.2469564909735773e-06, + "loss": 0.6111, + "step": 7756 + }, + { + "epoch": 0.79, + "grad_norm": 1.6804228090544804, + "learning_rate": 2.244876173881787e-06, + "loss": 0.7282, + "step": 7757 + }, + { + "epoch": 0.79, + "grad_norm": 1.5289001691878608, + "learning_rate": 2.2427966984874095e-06, + "loss": 0.669, + "step": 7758 + }, + { + "epoch": 0.79, + "grad_norm": 1.6648677967922552, + "learning_rate": 2.240718065016141e-06, + "loss": 0.7155, + "step": 7759 + }, + { + "epoch": 0.79, + "grad_norm": 1.5616347105803132, + "learning_rate": 2.2386402736935865e-06, + "loss": 0.6067, + "step": 7760 + }, + { + "epoch": 0.79, + "grad_norm": 1.6999639438907512, + "learning_rate": 2.2365633247452546e-06, + "loss": 0.645, + "step": 7761 + }, + { + "epoch": 0.79, + "grad_norm": 1.7686905266747723, + "learning_rate": 2.2344872183965694e-06, + "loss": 0.6266, + "step": 7762 + }, + { + "epoch": 0.79, + "grad_norm": 1.551309634097524, + "learning_rate": 2.232411954872855e-06, + "loss": 0.6743, + "step": 7763 + }, + { + "epoch": 0.79, + "grad_norm": 1.683891809634969, + "learning_rate": 2.2303375343993515e-06, + "loss": 0.7688, + "step": 7764 + }, + { + "epoch": 0.79, + "grad_norm": 1.585579032545016, + "learning_rate": 2.2282639572012075e-06, + "loss": 0.6695, + "step": 7765 + }, + { + "epoch": 0.79, + "grad_norm": 1.6017477879438695, + "learning_rate": 2.226191223503472e-06, + "loss": 0.718, + "step": 7766 + }, + { + "epoch": 0.79, + "grad_norm": 1.5631122799217767, + "learning_rate": 2.2241193335311127e-06, + "loss": 0.6414, + "step": 7767 + }, + { + "epoch": 0.79, + "grad_norm": 1.6802047965642206, + "learning_rate": 2.2220482875089965e-06, + "loss": 0.629, + "step": 7768 + }, + { + "epoch": 0.79, + "grad_norm": 1.7959499571883017, + "learning_rate": 2.2199780856619045e-06, + "loss": 0.7789, + "step": 7769 + }, + { + "epoch": 0.79, + "grad_norm": 1.700554129412106, + "learning_rate": 2.2179087282145283e-06, + "loss": 0.7761, + "step": 7770 + }, + { + "epoch": 0.79, + "grad_norm": 1.7018819641783725, + "learning_rate": 2.2158402153914583e-06, + "loss": 0.7092, + "step": 7771 + }, + { + "epoch": 0.79, + "grad_norm": 2.092281198826054, + "learning_rate": 2.2137725474172056e-06, + "loss": 0.7933, + "step": 7772 + }, + { + "epoch": 0.79, + "grad_norm": 1.6815014348058988, + "learning_rate": 2.2117057245161767e-06, + "loss": 0.669, + "step": 7773 + }, + { + "epoch": 0.79, + "grad_norm": 1.7578458090852305, + "learning_rate": 2.209639746912696e-06, + "loss": 0.7681, + "step": 7774 + }, + { + "epoch": 0.79, + "grad_norm": 1.6446787326998182, + "learning_rate": 2.2075746148309964e-06, + "loss": 0.6735, + "step": 7775 + }, + { + "epoch": 0.79, + "grad_norm": 1.7808940087861367, + "learning_rate": 2.2055103284952094e-06, + "loss": 0.7404, + "step": 7776 + }, + { + "epoch": 0.79, + "grad_norm": 1.5939379532221074, + "learning_rate": 2.2034468881293845e-06, + "loss": 0.6936, + "step": 7777 + }, + { + "epoch": 0.79, + "grad_norm": 1.6748246487282306, + "learning_rate": 2.2013842939574783e-06, + "loss": 0.6473, + "step": 7778 + }, + { + "epoch": 0.79, + "grad_norm": 1.6786313202255883, + "learning_rate": 2.1993225462033465e-06, + "loss": 0.6747, + "step": 7779 + }, + { + "epoch": 0.79, + "grad_norm": 1.865528066751501, + "learning_rate": 2.197261645090767e-06, + "loss": 0.7479, + "step": 7780 + }, + { + "epoch": 0.79, + "grad_norm": 1.5587845597683763, + "learning_rate": 2.195201590843412e-06, + "loss": 0.5715, + "step": 7781 + }, + { + "epoch": 0.79, + "grad_norm": 1.6196625061757, + "learning_rate": 2.1931423836848697e-06, + "loss": 0.6109, + "step": 7782 + }, + { + "epoch": 0.79, + "grad_norm": 1.7556785120047103, + "learning_rate": 2.19108402383864e-06, + "loss": 0.6765, + "step": 7783 + }, + { + "epoch": 0.79, + "grad_norm": 1.7597770933828298, + "learning_rate": 2.1890265115281185e-06, + "loss": 0.7867, + "step": 7784 + }, + { + "epoch": 0.79, + "grad_norm": 1.7962624226990016, + "learning_rate": 2.186969846976623e-06, + "loss": 0.6741, + "step": 7785 + }, + { + "epoch": 0.79, + "grad_norm": 1.5725184446544542, + "learning_rate": 2.1849140304073647e-06, + "loss": 0.7164, + "step": 7786 + }, + { + "epoch": 0.79, + "grad_norm": 1.6603822362173248, + "learning_rate": 2.1828590620434742e-06, + "loss": 0.7547, + "step": 7787 + }, + { + "epoch": 0.79, + "grad_norm": 1.8415056371952363, + "learning_rate": 2.180804942107989e-06, + "loss": 0.6835, + "step": 7788 + }, + { + "epoch": 0.79, + "grad_norm": 1.716076411318612, + "learning_rate": 2.1787516708238454e-06, + "loss": 0.7467, + "step": 7789 + }, + { + "epoch": 0.79, + "grad_norm": 1.6953951498862214, + "learning_rate": 2.1766992484139015e-06, + "loss": 0.778, + "step": 7790 + }, + { + "epoch": 0.79, + "grad_norm": 1.5108432572094461, + "learning_rate": 2.174647675100907e-06, + "loss": 0.7437, + "step": 7791 + }, + { + "epoch": 0.79, + "grad_norm": 1.659610127336146, + "learning_rate": 2.172596951107534e-06, + "loss": 0.6632, + "step": 7792 + }, + { + "epoch": 0.79, + "grad_norm": 1.6463493242617568, + "learning_rate": 2.1705470766563573e-06, + "loss": 0.7365, + "step": 7793 + }, + { + "epoch": 0.79, + "grad_norm": 1.6967353791802484, + "learning_rate": 2.168498051969854e-06, + "loss": 0.6863, + "step": 7794 + }, + { + "epoch": 0.79, + "grad_norm": 1.7089615810906633, + "learning_rate": 2.166449877270416e-06, + "loss": 0.6617, + "step": 7795 + }, + { + "epoch": 0.79, + "grad_norm": 1.6528730277901496, + "learning_rate": 2.1644025527803426e-06, + "loss": 0.5896, + "step": 7796 + }, + { + "epoch": 0.79, + "grad_norm": 1.7234934710601961, + "learning_rate": 2.1623560787218355e-06, + "loss": 0.6047, + "step": 7797 + }, + { + "epoch": 0.79, + "grad_norm": 1.7342875782041085, + "learning_rate": 2.1603104553170108e-06, + "loss": 0.6267, + "step": 7798 + }, + { + "epoch": 0.79, + "grad_norm": 1.806632183962203, + "learning_rate": 2.1582656827878846e-06, + "loss": 0.7972, + "step": 7799 + }, + { + "epoch": 0.79, + "grad_norm": 1.5230796084491267, + "learning_rate": 2.1562217613563885e-06, + "loss": 0.6782, + "step": 7800 + }, + { + "epoch": 0.79, + "grad_norm": 1.5813890671558295, + "learning_rate": 2.1541786912443596e-06, + "loss": 0.6945, + "step": 7801 + }, + { + "epoch": 0.79, + "grad_norm": 1.9117029170190478, + "learning_rate": 2.152136472673535e-06, + "loss": 0.656, + "step": 7802 + }, + { + "epoch": 0.79, + "grad_norm": 1.7405050503451025, + "learning_rate": 2.1500951058655727e-06, + "loss": 0.7311, + "step": 7803 + }, + { + "epoch": 0.79, + "grad_norm": 1.8246200827578696, + "learning_rate": 2.148054591042026e-06, + "loss": 0.7639, + "step": 7804 + }, + { + "epoch": 0.79, + "grad_norm": 1.5414344095577803, + "learning_rate": 2.1460149284243625e-06, + "loss": 0.6125, + "step": 7805 + }, + { + "epoch": 0.79, + "grad_norm": 1.7695724635486085, + "learning_rate": 2.143976118233958e-06, + "loss": 0.7938, + "step": 7806 + }, + { + "epoch": 0.79, + "grad_norm": 1.6712057945183567, + "learning_rate": 2.1419381606920887e-06, + "loss": 0.6595, + "step": 7807 + }, + { + "epoch": 0.79, + "grad_norm": 1.7053677147235984, + "learning_rate": 2.1399010560199494e-06, + "loss": 0.749, + "step": 7808 + }, + { + "epoch": 0.79, + "grad_norm": 1.588553082819756, + "learning_rate": 2.1378648044386285e-06, + "loss": 0.6854, + "step": 7809 + }, + { + "epoch": 0.79, + "grad_norm": 1.6490243239056168, + "learning_rate": 2.135829406169133e-06, + "loss": 0.7192, + "step": 7810 + }, + { + "epoch": 0.79, + "grad_norm": 1.829693884474, + "learning_rate": 2.133794861432378e-06, + "loss": 0.7406, + "step": 7811 + }, + { + "epoch": 0.79, + "grad_norm": 1.7410750894386053, + "learning_rate": 2.1317611704491727e-06, + "loss": 0.6912, + "step": 7812 + }, + { + "epoch": 0.79, + "grad_norm": 1.6367704761381865, + "learning_rate": 2.1297283334402474e-06, + "loss": 0.7254, + "step": 7813 + }, + { + "epoch": 0.79, + "grad_norm": 1.62040426586708, + "learning_rate": 2.127696350626236e-06, + "loss": 0.7247, + "step": 7814 + }, + { + "epoch": 0.79, + "grad_norm": 1.6298201574818223, + "learning_rate": 2.1256652222276753e-06, + "loss": 0.6358, + "step": 7815 + }, + { + "epoch": 0.8, + "grad_norm": 1.701264821377545, + "learning_rate": 2.1236349484650164e-06, + "loss": 0.7002, + "step": 7816 + }, + { + "epoch": 0.8, + "grad_norm": 1.687077714298927, + "learning_rate": 2.121605529558608e-06, + "loss": 0.6635, + "step": 7817 + }, + { + "epoch": 0.8, + "grad_norm": 1.522822106403037, + "learning_rate": 2.1195769657287145e-06, + "loss": 0.5933, + "step": 7818 + }, + { + "epoch": 0.8, + "grad_norm": 1.8707382783379618, + "learning_rate": 2.117549257195509e-06, + "loss": 0.7004, + "step": 7819 + }, + { + "epoch": 0.8, + "grad_norm": 1.7169068537145045, + "learning_rate": 2.1155224041790614e-06, + "loss": 0.6212, + "step": 7820 + }, + { + "epoch": 0.8, + "grad_norm": 1.7913526157090918, + "learning_rate": 2.1134964068993568e-06, + "loss": 0.6165, + "step": 7821 + }, + { + "epoch": 0.8, + "grad_norm": 1.68792325925831, + "learning_rate": 2.1114712655762893e-06, + "loss": 0.7724, + "step": 7822 + }, + { + "epoch": 0.8, + "grad_norm": 1.6619920318546355, + "learning_rate": 2.10944698042965e-06, + "loss": 0.7583, + "step": 7823 + }, + { + "epoch": 0.8, + "grad_norm": 1.5725040329920634, + "learning_rate": 2.1074235516791475e-06, + "loss": 0.6454, + "step": 7824 + }, + { + "epoch": 0.8, + "grad_norm": 1.776077694659041, + "learning_rate": 2.105400979544394e-06, + "loss": 0.6861, + "step": 7825 + }, + { + "epoch": 0.8, + "grad_norm": 1.6810506472695297, + "learning_rate": 2.1033792642449037e-06, + "loss": 0.7583, + "step": 7826 + }, + { + "epoch": 0.8, + "grad_norm": 1.6039372665750604, + "learning_rate": 2.101358406000106e-06, + "loss": 0.6526, + "step": 7827 + }, + { + "epoch": 0.8, + "grad_norm": 1.5976535524612527, + "learning_rate": 2.0993384050293343e-06, + "loss": 0.714, + "step": 7828 + }, + { + "epoch": 0.8, + "grad_norm": 1.8511945629783728, + "learning_rate": 2.0973192615518234e-06, + "loss": 0.6959, + "step": 7829 + }, + { + "epoch": 0.8, + "grad_norm": 1.709399661423479, + "learning_rate": 2.095300975786723e-06, + "loss": 0.6782, + "step": 7830 + }, + { + "epoch": 0.8, + "grad_norm": 1.7332709455559026, + "learning_rate": 2.093283547953088e-06, + "loss": 0.6648, + "step": 7831 + }, + { + "epoch": 0.8, + "grad_norm": 1.6192736018667757, + "learning_rate": 2.0912669782698737e-06, + "loss": 0.6277, + "step": 7832 + }, + { + "epoch": 0.8, + "grad_norm": 1.6947247635348386, + "learning_rate": 2.0892512669559505e-06, + "loss": 0.7146, + "step": 7833 + }, + { + "epoch": 0.8, + "grad_norm": 1.654417397999407, + "learning_rate": 2.087236414230096e-06, + "loss": 0.7693, + "step": 7834 + }, + { + "epoch": 0.8, + "grad_norm": 1.6487583791677305, + "learning_rate": 2.0852224203109826e-06, + "loss": 0.6617, + "step": 7835 + }, + { + "epoch": 0.8, + "grad_norm": 1.556761764582044, + "learning_rate": 2.083209285417206e-06, + "loss": 0.7113, + "step": 7836 + }, + { + "epoch": 0.8, + "grad_norm": 1.7912806691152143, + "learning_rate": 2.0811970097672527e-06, + "loss": 0.6975, + "step": 7837 + }, + { + "epoch": 0.8, + "grad_norm": 1.717649994233873, + "learning_rate": 2.079185593579529e-06, + "loss": 0.7063, + "step": 7838 + }, + { + "epoch": 0.8, + "grad_norm": 1.5437100663665961, + "learning_rate": 2.0771750370723444e-06, + "loss": 0.6612, + "step": 7839 + }, + { + "epoch": 0.8, + "grad_norm": 1.7449689894327503, + "learning_rate": 2.0751653404639073e-06, + "loss": 0.6719, + "step": 7840 + }, + { + "epoch": 0.8, + "grad_norm": 1.5018868498932967, + "learning_rate": 2.073156503972341e-06, + "loss": 0.6652, + "step": 7841 + }, + { + "epoch": 0.8, + "grad_norm": 1.636787781130712, + "learning_rate": 2.0711485278156774e-06, + "loss": 0.6727, + "step": 7842 + }, + { + "epoch": 0.8, + "grad_norm": 1.6536033634879708, + "learning_rate": 2.069141412211846e-06, + "loss": 0.7434, + "step": 7843 + }, + { + "epoch": 0.8, + "grad_norm": 1.6512144230931152, + "learning_rate": 2.067135157378691e-06, + "loss": 0.7674, + "step": 7844 + }, + { + "epoch": 0.8, + "grad_norm": 1.5484002894263678, + "learning_rate": 2.065129763533956e-06, + "loss": 0.7699, + "step": 7845 + }, + { + "epoch": 0.8, + "grad_norm": 1.5211767111624397, + "learning_rate": 2.0631252308952986e-06, + "loss": 0.6553, + "step": 7846 + }, + { + "epoch": 0.8, + "grad_norm": 1.6673412897745399, + "learning_rate": 2.06112155968028e-06, + "loss": 0.6335, + "step": 7847 + }, + { + "epoch": 0.8, + "grad_norm": 1.6113060699168706, + "learning_rate": 2.059118750106365e-06, + "loss": 0.6341, + "step": 7848 + }, + { + "epoch": 0.8, + "grad_norm": 1.6460704313269863, + "learning_rate": 2.0571168023909272e-06, + "loss": 0.6317, + "step": 7849 + }, + { + "epoch": 0.8, + "grad_norm": 1.7483784351965763, + "learning_rate": 2.0551157167512503e-06, + "loss": 0.7829, + "step": 7850 + }, + { + "epoch": 0.8, + "grad_norm": 1.8651438770398827, + "learning_rate": 2.053115493404515e-06, + "loss": 0.7116, + "step": 7851 + }, + { + "epoch": 0.8, + "grad_norm": 1.7193660661457268, + "learning_rate": 2.0511161325678206e-06, + "loss": 0.6865, + "step": 7852 + }, + { + "epoch": 0.8, + "grad_norm": 1.622650547540691, + "learning_rate": 2.0491176344581608e-06, + "loss": 0.6429, + "step": 7853 + }, + { + "epoch": 0.8, + "grad_norm": 1.524886595091858, + "learning_rate": 2.047119999292444e-06, + "loss": 0.6307, + "step": 7854 + }, + { + "epoch": 0.8, + "grad_norm": 1.6940262251806701, + "learning_rate": 2.0451232272874845e-06, + "loss": 0.6906, + "step": 7855 + }, + { + "epoch": 0.8, + "grad_norm": 1.5474554934700888, + "learning_rate": 2.0431273186599964e-06, + "loss": 0.5805, + "step": 7856 + }, + { + "epoch": 0.8, + "grad_norm": 1.5127942854920668, + "learning_rate": 2.041132273626608e-06, + "loss": 0.6684, + "step": 7857 + }, + { + "epoch": 0.8, + "grad_norm": 1.5997306154029722, + "learning_rate": 2.039138092403846e-06, + "loss": 0.6363, + "step": 7858 + }, + { + "epoch": 0.8, + "grad_norm": 1.875216592490199, + "learning_rate": 2.037144775208151e-06, + "loss": 0.5804, + "step": 7859 + }, + { + "epoch": 0.8, + "grad_norm": 1.6753429884560422, + "learning_rate": 2.035152322255868e-06, + "loss": 0.6385, + "step": 7860 + }, + { + "epoch": 0.8, + "grad_norm": 1.7548658352691162, + "learning_rate": 2.0331607337632407e-06, + "loss": 0.6925, + "step": 7861 + }, + { + "epoch": 0.8, + "grad_norm": 1.5398454208128212, + "learning_rate": 2.0311700099464315e-06, + "loss": 0.6483, + "step": 7862 + }, + { + "epoch": 0.8, + "grad_norm": 1.6031866344789765, + "learning_rate": 2.029180151021496e-06, + "loss": 0.6919, + "step": 7863 + }, + { + "epoch": 0.8, + "grad_norm": 1.5614753142650077, + "learning_rate": 2.027191157204406e-06, + "loss": 0.7527, + "step": 7864 + }, + { + "epoch": 0.8, + "grad_norm": 1.835885954488514, + "learning_rate": 2.025203028711038e-06, + "loss": 0.7981, + "step": 7865 + }, + { + "epoch": 0.8, + "grad_norm": 1.6885653035275787, + "learning_rate": 2.023215765757166e-06, + "loss": 0.7251, + "step": 7866 + }, + { + "epoch": 0.8, + "grad_norm": 1.8891035652301391, + "learning_rate": 2.0212293685584794e-06, + "loss": 0.7153, + "step": 7867 + }, + { + "epoch": 0.8, + "grad_norm": 1.974342714359956, + "learning_rate": 2.0192438373305747e-06, + "loss": 0.7538, + "step": 7868 + }, + { + "epoch": 0.8, + "grad_norm": 1.7252816726702502, + "learning_rate": 2.0172591722889423e-06, + "loss": 0.7309, + "step": 7869 + }, + { + "epoch": 0.8, + "grad_norm": 1.791791144501412, + "learning_rate": 2.015275373648994e-06, + "loss": 0.7317, + "step": 7870 + }, + { + "epoch": 0.8, + "grad_norm": 1.853562934767157, + "learning_rate": 2.0132924416260347e-06, + "loss": 0.8221, + "step": 7871 + }, + { + "epoch": 0.8, + "grad_norm": 1.7272393998078934, + "learning_rate": 2.0113103764352814e-06, + "loss": 0.7894, + "step": 7872 + }, + { + "epoch": 0.8, + "grad_norm": 1.6633087363346242, + "learning_rate": 2.009329178291861e-06, + "loss": 0.768, + "step": 7873 + }, + { + "epoch": 0.8, + "grad_norm": 1.9128303825762558, + "learning_rate": 2.0073488474107962e-06, + "loss": 0.6392, + "step": 7874 + }, + { + "epoch": 0.8, + "grad_norm": 1.7616167689658218, + "learning_rate": 2.0053693840070242e-06, + "loss": 0.6602, + "step": 7875 + }, + { + "epoch": 0.8, + "grad_norm": 1.699303749633239, + "learning_rate": 2.0033907882953818e-06, + "loss": 0.7439, + "step": 7876 + }, + { + "epoch": 0.8, + "grad_norm": 1.6070346725208509, + "learning_rate": 2.0014130604906167e-06, + "loss": 0.7094, + "step": 7877 + }, + { + "epoch": 0.8, + "grad_norm": 1.741448073560214, + "learning_rate": 1.999436200807382e-06, + "loss": 0.7499, + "step": 7878 + }, + { + "epoch": 0.8, + "grad_norm": 1.6944355963588895, + "learning_rate": 1.99746020946023e-06, + "loss": 0.7529, + "step": 7879 + }, + { + "epoch": 0.8, + "grad_norm": 1.8895579187524152, + "learning_rate": 1.99548508666363e-06, + "loss": 0.7048, + "step": 7880 + }, + { + "epoch": 0.8, + "grad_norm": 1.9920721222550972, + "learning_rate": 1.993510832631944e-06, + "loss": 0.7369, + "step": 7881 + }, + { + "epoch": 0.8, + "grad_norm": 1.5922170424114257, + "learning_rate": 1.9915374475794492e-06, + "loss": 0.6501, + "step": 7882 + }, + { + "epoch": 0.8, + "grad_norm": 1.5634638693807896, + "learning_rate": 1.989564931720329e-06, + "loss": 0.6196, + "step": 7883 + }, + { + "epoch": 0.8, + "grad_norm": 1.5635383047081672, + "learning_rate": 1.987593285268664e-06, + "loss": 0.7479, + "step": 7884 + }, + { + "epoch": 0.8, + "grad_norm": 1.4883323785564222, + "learning_rate": 1.9856225084384484e-06, + "loss": 0.7076, + "step": 7885 + }, + { + "epoch": 0.8, + "grad_norm": 1.6282107550188445, + "learning_rate": 1.983652601443581e-06, + "loss": 0.6514, + "step": 7886 + }, + { + "epoch": 0.8, + "grad_norm": 1.527130392726828, + "learning_rate": 1.981683564497858e-06, + "loss": 0.683, + "step": 7887 + }, + { + "epoch": 0.8, + "grad_norm": 1.5838502653009983, + "learning_rate": 1.979715397814996e-06, + "loss": 0.737, + "step": 7888 + }, + { + "epoch": 0.8, + "grad_norm": 1.6495537665659166, + "learning_rate": 1.977748101608601e-06, + "loss": 0.6973, + "step": 7889 + }, + { + "epoch": 0.8, + "grad_norm": 1.7194286106142869, + "learning_rate": 1.9757816760921955e-06, + "loss": 0.7559, + "step": 7890 + }, + { + "epoch": 0.8, + "grad_norm": 1.5641719812122743, + "learning_rate": 1.973816121479207e-06, + "loss": 0.6447, + "step": 7891 + }, + { + "epoch": 0.8, + "grad_norm": 1.5147137650724953, + "learning_rate": 1.971851437982961e-06, + "loss": 0.5577, + "step": 7892 + }, + { + "epoch": 0.8, + "grad_norm": 1.7297562578736958, + "learning_rate": 1.9698876258166968e-06, + "loss": 0.698, + "step": 7893 + }, + { + "epoch": 0.8, + "grad_norm": 1.872273742313337, + "learning_rate": 1.967924685193552e-06, + "loss": 0.7417, + "step": 7894 + }, + { + "epoch": 0.8, + "grad_norm": 1.5730148472491021, + "learning_rate": 1.965962616326574e-06, + "loss": 0.6882, + "step": 7895 + }, + { + "epoch": 0.8, + "grad_norm": 1.7367694504986644, + "learning_rate": 1.9640014194287206e-06, + "loss": 0.7052, + "step": 7896 + }, + { + "epoch": 0.8, + "grad_norm": 1.794095966158344, + "learning_rate": 1.96204109471284e-06, + "loss": 0.6673, + "step": 7897 + }, + { + "epoch": 0.8, + "grad_norm": 1.6628464531180407, + "learning_rate": 1.9600816423917013e-06, + "loss": 0.7936, + "step": 7898 + }, + { + "epoch": 0.8, + "grad_norm": 1.7052997907461107, + "learning_rate": 1.958123062677968e-06, + "loss": 0.7681, + "step": 7899 + }, + { + "epoch": 0.8, + "grad_norm": 1.6754284096878078, + "learning_rate": 1.9561653557842142e-06, + "loss": 0.7549, + "step": 7900 + }, + { + "epoch": 0.8, + "grad_norm": 1.5443046736290091, + "learning_rate": 1.9542085219229235e-06, + "loss": 0.682, + "step": 7901 + }, + { + "epoch": 0.8, + "grad_norm": 1.7456115388925508, + "learning_rate": 1.9522525613064723e-06, + "loss": 0.6539, + "step": 7902 + }, + { + "epoch": 0.8, + "grad_norm": 1.7079663646484742, + "learning_rate": 1.950297474147156e-06, + "loss": 0.6013, + "step": 7903 + }, + { + "epoch": 0.8, + "grad_norm": 1.6242157401336126, + "learning_rate": 1.9483432606571627e-06, + "loss": 0.6876, + "step": 7904 + }, + { + "epoch": 0.8, + "grad_norm": 1.6509617781874775, + "learning_rate": 1.946389921048595e-06, + "loss": 0.6657, + "step": 7905 + }, + { + "epoch": 0.8, + "grad_norm": 1.5558128791049708, + "learning_rate": 1.944437455533459e-06, + "loss": 0.6951, + "step": 7906 + }, + { + "epoch": 0.8, + "grad_norm": 1.5442432225603513, + "learning_rate": 1.9424858643236598e-06, + "loss": 0.6857, + "step": 7907 + }, + { + "epoch": 0.8, + "grad_norm": 1.8953492852649612, + "learning_rate": 1.9405351476310154e-06, + "loss": 0.6654, + "step": 7908 + }, + { + "epoch": 0.8, + "grad_norm": 1.6694936693614468, + "learning_rate": 1.9385853056672467e-06, + "loss": 0.6512, + "step": 7909 + }, + { + "epoch": 0.8, + "grad_norm": 1.5791933585290685, + "learning_rate": 1.9366363386439747e-06, + "loss": 0.7247, + "step": 7910 + }, + { + "epoch": 0.8, + "grad_norm": 1.7048568831881048, + "learning_rate": 1.9346882467727323e-06, + "loss": 0.7314, + "step": 7911 + }, + { + "epoch": 0.8, + "grad_norm": 1.6997667202698497, + "learning_rate": 1.932741030264952e-06, + "loss": 0.7043, + "step": 7912 + }, + { + "epoch": 0.8, + "grad_norm": 1.6307421070504355, + "learning_rate": 1.930794689331975e-06, + "loss": 0.6322, + "step": 7913 + }, + { + "epoch": 0.81, + "grad_norm": 2.0600411136366907, + "learning_rate": 1.9288492241850486e-06, + "loss": 0.8343, + "step": 7914 + }, + { + "epoch": 0.81, + "grad_norm": 1.6263978112754838, + "learning_rate": 1.9269046350353184e-06, + "loss": 0.5917, + "step": 7915 + }, + { + "epoch": 0.81, + "grad_norm": 1.7212214231886043, + "learning_rate": 1.9249609220938425e-06, + "loss": 0.6988, + "step": 7916 + }, + { + "epoch": 0.81, + "grad_norm": 1.7923502452489541, + "learning_rate": 1.9230180855715765e-06, + "loss": 0.7721, + "step": 7917 + }, + { + "epoch": 0.81, + "grad_norm": 1.6510583005199575, + "learning_rate": 1.9210761256793876e-06, + "loss": 0.6751, + "step": 7918 + }, + { + "epoch": 0.81, + "grad_norm": 1.6965142454396533, + "learning_rate": 1.9191350426280476e-06, + "loss": 0.7389, + "step": 7919 + }, + { + "epoch": 0.81, + "grad_norm": 1.5883857332917075, + "learning_rate": 1.9171948366282256e-06, + "loss": 0.6797, + "step": 7920 + }, + { + "epoch": 0.81, + "grad_norm": 1.7740325510562096, + "learning_rate": 1.9152555078905054e-06, + "loss": 0.6812, + "step": 7921 + }, + { + "epoch": 0.81, + "grad_norm": 1.7902049614453566, + "learning_rate": 1.9133170566253665e-06, + "loss": 0.6932, + "step": 7922 + }, + { + "epoch": 0.81, + "grad_norm": 1.7610525327182251, + "learning_rate": 1.9113794830431998e-06, + "loss": 0.7173, + "step": 7923 + }, + { + "epoch": 0.81, + "grad_norm": 1.737127894919951, + "learning_rate": 1.9094427873542997e-06, + "loss": 0.7971, + "step": 7924 + }, + { + "epoch": 0.81, + "grad_norm": 1.842918131941055, + "learning_rate": 1.907506969768862e-06, + "loss": 0.6717, + "step": 7925 + }, + { + "epoch": 0.81, + "grad_norm": 1.8822392409548165, + "learning_rate": 1.9055720304969894e-06, + "loss": 0.757, + "step": 7926 + }, + { + "epoch": 0.81, + "grad_norm": 1.72327856763112, + "learning_rate": 1.903637969748693e-06, + "loss": 0.5784, + "step": 7927 + }, + { + "epoch": 0.81, + "grad_norm": 1.5513018101818548, + "learning_rate": 1.901704787733879e-06, + "loss": 0.6345, + "step": 7928 + }, + { + "epoch": 0.81, + "grad_norm": 1.6058853335363181, + "learning_rate": 1.8997724846623666e-06, + "loss": 0.6129, + "step": 7929 + }, + { + "epoch": 0.81, + "grad_norm": 1.732733524266622, + "learning_rate": 1.8978410607438812e-06, + "loss": 0.6901, + "step": 7930 + }, + { + "epoch": 0.81, + "grad_norm": 1.7278955082537923, + "learning_rate": 1.895910516188042e-06, + "loss": 0.7151, + "step": 7931 + }, + { + "epoch": 0.81, + "grad_norm": 1.7211882101735263, + "learning_rate": 1.8939808512043822e-06, + "loss": 0.7134, + "step": 7932 + }, + { + "epoch": 0.81, + "grad_norm": 1.6376433863918067, + "learning_rate": 1.8920520660023412e-06, + "loss": 0.6377, + "step": 7933 + }, + { + "epoch": 0.81, + "grad_norm": 1.9063083868432706, + "learning_rate": 1.8901241607912502e-06, + "loss": 0.6743, + "step": 7934 + }, + { + "epoch": 0.81, + "grad_norm": 1.6773780116605972, + "learning_rate": 1.8881971357803575e-06, + "loss": 0.6674, + "step": 7935 + }, + { + "epoch": 0.81, + "grad_norm": 1.756703253587125, + "learning_rate": 1.8862709911788145e-06, + "loss": 0.7528, + "step": 7936 + }, + { + "epoch": 0.81, + "grad_norm": 1.8278159313895643, + "learning_rate": 1.8843457271956679e-06, + "loss": 0.6927, + "step": 7937 + }, + { + "epoch": 0.81, + "grad_norm": 1.5222474436371374, + "learning_rate": 1.8824213440398776e-06, + "loss": 0.6128, + "step": 7938 + }, + { + "epoch": 0.81, + "grad_norm": 1.5055087749630518, + "learning_rate": 1.880497841920308e-06, + "loss": 0.635, + "step": 7939 + }, + { + "epoch": 0.81, + "grad_norm": 1.817582363757226, + "learning_rate": 1.8785752210457208e-06, + "loss": 0.6813, + "step": 7940 + }, + { + "epoch": 0.81, + "grad_norm": 1.76457845246533, + "learning_rate": 1.8766534816247917e-06, + "loss": 0.6767, + "step": 7941 + }, + { + "epoch": 0.81, + "grad_norm": 1.5273211143254652, + "learning_rate": 1.874732623866089e-06, + "loss": 0.6553, + "step": 7942 + }, + { + "epoch": 0.81, + "grad_norm": 1.7697396621253008, + "learning_rate": 1.8728126479780951e-06, + "loss": 0.7514, + "step": 7943 + }, + { + "epoch": 0.81, + "grad_norm": 1.8317315916672514, + "learning_rate": 1.8708935541691964e-06, + "loss": 0.7215, + "step": 7944 + }, + { + "epoch": 0.81, + "grad_norm": 1.7793918921800997, + "learning_rate": 1.868975342647673e-06, + "loss": 0.6822, + "step": 7945 + }, + { + "epoch": 0.81, + "grad_norm": 1.813369886642259, + "learning_rate": 1.8670580136217232e-06, + "loss": 0.7838, + "step": 7946 + }, + { + "epoch": 0.81, + "grad_norm": 1.6914376056588731, + "learning_rate": 1.865141567299442e-06, + "loss": 0.637, + "step": 7947 + }, + { + "epoch": 0.81, + "grad_norm": 1.8448253000887582, + "learning_rate": 1.8632260038888272e-06, + "loss": 0.7215, + "step": 7948 + }, + { + "epoch": 0.81, + "grad_norm": 1.6885452785875725, + "learning_rate": 1.8613113235977854e-06, + "loss": 0.7254, + "step": 7949 + }, + { + "epoch": 0.81, + "grad_norm": 1.786429205383274, + "learning_rate": 1.8593975266341258e-06, + "loss": 0.7589, + "step": 7950 + }, + { + "epoch": 0.81, + "grad_norm": 1.6108942657093994, + "learning_rate": 1.857484613205558e-06, + "loss": 0.7062, + "step": 7951 + }, + { + "epoch": 0.81, + "grad_norm": 1.7621078214450279, + "learning_rate": 1.8555725835197024e-06, + "loss": 0.6255, + "step": 7952 + }, + { + "epoch": 0.81, + "grad_norm": 1.7650961049535976, + "learning_rate": 1.8536614377840767e-06, + "loss": 0.6909, + "step": 7953 + }, + { + "epoch": 0.81, + "grad_norm": 1.6656129367567623, + "learning_rate": 1.851751176206107e-06, + "loss": 0.6863, + "step": 7954 + }, + { + "epoch": 0.81, + "grad_norm": 1.745463279141544, + "learning_rate": 1.849841798993126e-06, + "loss": 0.8086, + "step": 7955 + }, + { + "epoch": 0.81, + "grad_norm": 1.6093159687854266, + "learning_rate": 1.8479333063523596e-06, + "loss": 0.666, + "step": 7956 + }, + { + "epoch": 0.81, + "grad_norm": 1.6844585878472074, + "learning_rate": 1.846025698490952e-06, + "loss": 0.7696, + "step": 7957 + }, + { + "epoch": 0.81, + "grad_norm": 1.6175676462127535, + "learning_rate": 1.8441189756159384e-06, + "loss": 0.7112, + "step": 7958 + }, + { + "epoch": 0.81, + "grad_norm": 1.6762261447013207, + "learning_rate": 1.8422131379342668e-06, + "loss": 0.5827, + "step": 7959 + }, + { + "epoch": 0.81, + "grad_norm": 1.6781795279750065, + "learning_rate": 1.840308185652787e-06, + "loss": 0.6993, + "step": 7960 + }, + { + "epoch": 0.81, + "grad_norm": 1.5292222257030226, + "learning_rate": 1.8384041189782487e-06, + "loss": 0.5964, + "step": 7961 + }, + { + "epoch": 0.81, + "grad_norm": 1.781420757861504, + "learning_rate": 1.8365009381173104e-06, + "loss": 0.7575, + "step": 7962 + }, + { + "epoch": 0.81, + "grad_norm": 1.6092231268001482, + "learning_rate": 1.8345986432765338e-06, + "loss": 0.6708, + "step": 7963 + }, + { + "epoch": 0.81, + "grad_norm": 1.569521817337266, + "learning_rate": 1.8326972346623806e-06, + "loss": 0.6602, + "step": 7964 + }, + { + "epoch": 0.81, + "grad_norm": 1.606489507221082, + "learning_rate": 1.8307967124812221e-06, + "loss": 0.7262, + "step": 7965 + }, + { + "epoch": 0.81, + "grad_norm": 1.6885580857972373, + "learning_rate": 1.8288970769393267e-06, + "loss": 0.7442, + "step": 7966 + }, + { + "epoch": 0.81, + "grad_norm": 1.730201721005992, + "learning_rate": 1.8269983282428705e-06, + "loss": 0.7575, + "step": 7967 + }, + { + "epoch": 0.81, + "grad_norm": 1.6215040305929973, + "learning_rate": 1.8251004665979378e-06, + "loss": 0.6663, + "step": 7968 + }, + { + "epoch": 0.81, + "grad_norm": 1.5726907533294119, + "learning_rate": 1.8232034922105058e-06, + "loss": 0.6834, + "step": 7969 + }, + { + "epoch": 0.81, + "grad_norm": 1.6229999795058794, + "learning_rate": 1.8213074052864654e-06, + "loss": 0.76, + "step": 7970 + }, + { + "epoch": 0.81, + "grad_norm": 1.5485299111513613, + "learning_rate": 1.8194122060316044e-06, + "loss": 0.6972, + "step": 7971 + }, + { + "epoch": 0.81, + "grad_norm": 1.5521491925286681, + "learning_rate": 1.817517894651617e-06, + "loss": 0.593, + "step": 7972 + }, + { + "epoch": 0.81, + "grad_norm": 1.7300321739270723, + "learning_rate": 1.8156244713521065e-06, + "loss": 0.7811, + "step": 7973 + }, + { + "epoch": 0.81, + "grad_norm": 1.829781856253929, + "learning_rate": 1.8137319363385664e-06, + "loss": 0.6688, + "step": 7974 + }, + { + "epoch": 0.81, + "grad_norm": 1.6007145369222389, + "learning_rate": 1.811840289816409e-06, + "loss": 0.7405, + "step": 7975 + }, + { + "epoch": 0.81, + "grad_norm": 1.597049195511601, + "learning_rate": 1.8099495319909377e-06, + "loss": 0.6841, + "step": 7976 + }, + { + "epoch": 0.81, + "grad_norm": 1.465891069369129, + "learning_rate": 1.8080596630673652e-06, + "loss": 0.6568, + "step": 7977 + }, + { + "epoch": 0.81, + "grad_norm": 1.6293286430781235, + "learning_rate": 1.806170683250813e-06, + "loss": 0.6951, + "step": 7978 + }, + { + "epoch": 0.81, + "grad_norm": 1.6922012635236878, + "learning_rate": 1.8042825927462915e-06, + "loss": 0.7233, + "step": 7979 + }, + { + "epoch": 0.81, + "grad_norm": 1.5849598458552254, + "learning_rate": 1.802395391758729e-06, + "loss": 0.6481, + "step": 7980 + }, + { + "epoch": 0.81, + "grad_norm": 1.8039497724725537, + "learning_rate": 1.8005090804929525e-06, + "loss": 0.6401, + "step": 7981 + }, + { + "epoch": 0.81, + "grad_norm": 1.4792034062672803, + "learning_rate": 1.7986236591536875e-06, + "loss": 0.7558, + "step": 7982 + }, + { + "epoch": 0.81, + "grad_norm": 1.6653008329867707, + "learning_rate": 1.7967391279455715e-06, + "loss": 0.6519, + "step": 7983 + }, + { + "epoch": 0.81, + "grad_norm": 1.6590694279898175, + "learning_rate": 1.794855487073136e-06, + "loss": 0.7157, + "step": 7984 + }, + { + "epoch": 0.81, + "grad_norm": 1.6776772981968426, + "learning_rate": 1.7929727367408233e-06, + "loss": 0.7889, + "step": 7985 + }, + { + "epoch": 0.81, + "grad_norm": 1.6534317199844832, + "learning_rate": 1.79109087715298e-06, + "loss": 0.6789, + "step": 7986 + }, + { + "epoch": 0.81, + "grad_norm": 1.6922990817285495, + "learning_rate": 1.7892099085138448e-06, + "loss": 0.6641, + "step": 7987 + }, + { + "epoch": 0.81, + "grad_norm": 1.6048970808366634, + "learning_rate": 1.7873298310275755e-06, + "loss": 0.6557, + "step": 7988 + }, + { + "epoch": 0.81, + "grad_norm": 1.690427387402649, + "learning_rate": 1.7854506448982179e-06, + "loss": 0.7262, + "step": 7989 + }, + { + "epoch": 0.81, + "grad_norm": 1.691584509499384, + "learning_rate": 1.783572350329732e-06, + "loss": 0.736, + "step": 7990 + }, + { + "epoch": 0.81, + "grad_norm": 1.7445444143992106, + "learning_rate": 1.7816949475259793e-06, + "loss": 0.7128, + "step": 7991 + }, + { + "epoch": 0.81, + "grad_norm": 1.821762724528318, + "learning_rate": 1.7798184366907167e-06, + "loss": 0.7088, + "step": 7992 + }, + { + "epoch": 0.81, + "grad_norm": 1.6726024812181441, + "learning_rate": 1.777942818027617e-06, + "loss": 0.7523, + "step": 7993 + }, + { + "epoch": 0.81, + "grad_norm": 1.6375638362892346, + "learning_rate": 1.776068091740244e-06, + "loss": 0.703, + "step": 7994 + }, + { + "epoch": 0.81, + "grad_norm": 1.7839059170132194, + "learning_rate": 1.7741942580320704e-06, + "loss": 0.7539, + "step": 7995 + }, + { + "epoch": 0.81, + "grad_norm": 1.8828237716172438, + "learning_rate": 1.7723213171064757e-06, + "loss": 0.6589, + "step": 7996 + }, + { + "epoch": 0.81, + "grad_norm": 1.5749266185283466, + "learning_rate": 1.770449269166733e-06, + "loss": 0.6152, + "step": 7997 + }, + { + "epoch": 0.81, + "grad_norm": 1.637547348638589, + "learning_rate": 1.7685781144160276e-06, + "loss": 0.7638, + "step": 7998 + }, + { + "epoch": 0.81, + "grad_norm": 1.6348983075135142, + "learning_rate": 1.7667078530574432e-06, + "loss": 0.6158, + "step": 7999 + }, + { + "epoch": 0.81, + "grad_norm": 1.7768978652343665, + "learning_rate": 1.764838485293966e-06, + "loss": 0.6817, + "step": 8000 + }, + { + "epoch": 0.81, + "grad_norm": 1.9138605814810459, + "learning_rate": 1.7629700113284898e-06, + "loss": 0.6924, + "step": 8001 + }, + { + "epoch": 0.81, + "grad_norm": 1.8006453653947696, + "learning_rate": 1.7611024313638036e-06, + "loss": 0.685, + "step": 8002 + }, + { + "epoch": 0.81, + "grad_norm": 1.621591850869959, + "learning_rate": 1.7592357456026065e-06, + "loss": 0.668, + "step": 8003 + }, + { + "epoch": 0.81, + "grad_norm": 1.5252090986604172, + "learning_rate": 1.7573699542475009e-06, + "loss": 0.6939, + "step": 8004 + }, + { + "epoch": 0.81, + "grad_norm": 1.594839187279194, + "learning_rate": 1.7555050575009836e-06, + "loss": 0.6475, + "step": 8005 + }, + { + "epoch": 0.81, + "grad_norm": 1.7306387118407995, + "learning_rate": 1.7536410555654658e-06, + "loss": 0.6955, + "step": 8006 + }, + { + "epoch": 0.81, + "grad_norm": 1.7925998469644877, + "learning_rate": 1.7517779486432495e-06, + "loss": 0.691, + "step": 8007 + }, + { + "epoch": 0.81, + "grad_norm": 1.8125246527297612, + "learning_rate": 1.7499157369365504e-06, + "loss": 0.6745, + "step": 8008 + }, + { + "epoch": 0.81, + "grad_norm": 1.6051500991727927, + "learning_rate": 1.7480544206474824e-06, + "loss": 0.6287, + "step": 8009 + }, + { + "epoch": 0.81, + "grad_norm": 1.721327369042911, + "learning_rate": 1.7461939999780586e-06, + "loss": 0.685, + "step": 8010 + }, + { + "epoch": 0.81, + "grad_norm": 1.8154002512357872, + "learning_rate": 1.7443344751302048e-06, + "loss": 0.7326, + "step": 8011 + }, + { + "epoch": 0.81, + "grad_norm": 1.734009665808617, + "learning_rate": 1.7424758463057356e-06, + "loss": 0.574, + "step": 8012 + }, + { + "epoch": 0.82, + "grad_norm": 1.5821619651938132, + "learning_rate": 1.7406181137063804e-06, + "loss": 0.5944, + "step": 8013 + }, + { + "epoch": 0.82, + "grad_norm": 1.6127780706422297, + "learning_rate": 1.7387612775337703e-06, + "loss": 0.6382, + "step": 8014 + }, + { + "epoch": 0.82, + "grad_norm": 1.7682106393097323, + "learning_rate": 1.7369053379894285e-06, + "loss": 0.7225, + "step": 8015 + }, + { + "epoch": 0.82, + "grad_norm": 1.6834214431659507, + "learning_rate": 1.7350502952747916e-06, + "loss": 0.7673, + "step": 8016 + }, + { + "epoch": 0.82, + "grad_norm": 1.7819707141660162, + "learning_rate": 1.7331961495911997e-06, + "loss": 0.6721, + "step": 8017 + }, + { + "epoch": 0.82, + "grad_norm": 1.6772630357351819, + "learning_rate": 1.731342901139884e-06, + "loss": 0.6176, + "step": 8018 + }, + { + "epoch": 0.82, + "grad_norm": 1.6709942471951538, + "learning_rate": 1.7294905501219915e-06, + "loss": 0.6154, + "step": 8019 + }, + { + "epoch": 0.82, + "grad_norm": 1.7757882299108645, + "learning_rate": 1.7276390967385614e-06, + "loss": 0.758, + "step": 8020 + }, + { + "epoch": 0.82, + "grad_norm": 1.7015110760484848, + "learning_rate": 1.7257885411905416e-06, + "loss": 0.7283, + "step": 8021 + }, + { + "epoch": 0.82, + "grad_norm": 1.7701017096659257, + "learning_rate": 1.723938883678784e-06, + "loss": 0.7977, + "step": 8022 + }, + { + "epoch": 0.82, + "grad_norm": 1.93781008303018, + "learning_rate": 1.7220901244040355e-06, + "loss": 0.6425, + "step": 8023 + }, + { + "epoch": 0.82, + "grad_norm": 1.545177983002849, + "learning_rate": 1.7202422635669536e-06, + "loss": 0.7073, + "step": 8024 + }, + { + "epoch": 0.82, + "grad_norm": 1.8987430084382586, + "learning_rate": 1.718395301368091e-06, + "loss": 0.764, + "step": 8025 + }, + { + "epoch": 0.82, + "grad_norm": 1.4753951907006009, + "learning_rate": 1.7165492380079084e-06, + "loss": 0.682, + "step": 8026 + }, + { + "epoch": 0.82, + "grad_norm": 1.8881432214443394, + "learning_rate": 1.7147040736867704e-06, + "loss": 0.7082, + "step": 8027 + }, + { + "epoch": 0.82, + "grad_norm": 1.7149537130909085, + "learning_rate": 1.7128598086049353e-06, + "loss": 0.6842, + "step": 8028 + }, + { + "epoch": 0.82, + "grad_norm": 1.612966254277709, + "learning_rate": 1.711016442962573e-06, + "loss": 0.6647, + "step": 8029 + }, + { + "epoch": 0.82, + "grad_norm": 1.7496388997343648, + "learning_rate": 1.7091739769597492e-06, + "loss": 0.7958, + "step": 8030 + }, + { + "epoch": 0.82, + "grad_norm": 1.6416089663102067, + "learning_rate": 1.7073324107964363e-06, + "loss": 0.6912, + "step": 8031 + }, + { + "epoch": 0.82, + "grad_norm": 1.7959977773068057, + "learning_rate": 1.7054917446725083e-06, + "loss": 0.7446, + "step": 8032 + }, + { + "epoch": 0.82, + "grad_norm": 1.758838901126341, + "learning_rate": 1.7036519787877393e-06, + "loss": 0.7721, + "step": 8033 + }, + { + "epoch": 0.82, + "grad_norm": 1.6413026632094811, + "learning_rate": 1.701813113341806e-06, + "loss": 0.6721, + "step": 8034 + }, + { + "epoch": 0.82, + "grad_norm": 1.6692477027547694, + "learning_rate": 1.699975148534293e-06, + "loss": 0.5955, + "step": 8035 + }, + { + "epoch": 0.82, + "grad_norm": 1.641655359956405, + "learning_rate": 1.6981380845646779e-06, + "loss": 0.6715, + "step": 8036 + }, + { + "epoch": 0.82, + "grad_norm": 1.6286894104599299, + "learning_rate": 1.6963019216323472e-06, + "loss": 0.6614, + "step": 8037 + }, + { + "epoch": 0.82, + "grad_norm": 1.5692556277032406, + "learning_rate": 1.69446665993659e-06, + "loss": 0.7314, + "step": 8038 + }, + { + "epoch": 0.82, + "grad_norm": 1.7951697399440905, + "learning_rate": 1.6926322996765899e-06, + "loss": 0.6347, + "step": 8039 + }, + { + "epoch": 0.82, + "grad_norm": 1.76987488315246, + "learning_rate": 1.6907988410514408e-06, + "loss": 0.7911, + "step": 8040 + }, + { + "epoch": 0.82, + "grad_norm": 1.6345706578407215, + "learning_rate": 1.6889662842601384e-06, + "loss": 0.6243, + "step": 8041 + }, + { + "epoch": 0.82, + "grad_norm": 1.7310740912828402, + "learning_rate": 1.6871346295015744e-06, + "loss": 0.6846, + "step": 8042 + }, + { + "epoch": 0.82, + "grad_norm": 1.6632516322796669, + "learning_rate": 1.6853038769745466e-06, + "loss": 0.6219, + "step": 8043 + }, + { + "epoch": 0.82, + "grad_norm": 1.4446934371691147, + "learning_rate": 1.6834740268777594e-06, + "loss": 0.6495, + "step": 8044 + }, + { + "epoch": 0.82, + "grad_norm": 1.7943768358241974, + "learning_rate": 1.681645079409807e-06, + "loss": 0.7088, + "step": 8045 + }, + { + "epoch": 0.82, + "grad_norm": 1.619771756196256, + "learning_rate": 1.6798170347692e-06, + "loss": 0.7114, + "step": 8046 + }, + { + "epoch": 0.82, + "grad_norm": 1.4099576457270464, + "learning_rate": 1.6779898931543382e-06, + "loss": 0.5875, + "step": 8047 + }, + { + "epoch": 0.82, + "grad_norm": 1.7769641316748075, + "learning_rate": 1.6761636547635308e-06, + "loss": 0.6824, + "step": 8048 + }, + { + "epoch": 0.82, + "grad_norm": 1.6713721303852005, + "learning_rate": 1.6743383197949925e-06, + "loss": 0.7363, + "step": 8049 + }, + { + "epoch": 0.82, + "grad_norm": 1.5786940687358804, + "learning_rate": 1.6725138884468273e-06, + "loss": 0.6727, + "step": 8050 + }, + { + "epoch": 0.82, + "grad_norm": 1.78872539059089, + "learning_rate": 1.6706903609170522e-06, + "loss": 0.7712, + "step": 8051 + }, + { + "epoch": 0.82, + "grad_norm": 1.5147655598325824, + "learning_rate": 1.6688677374035856e-06, + "loss": 0.6125, + "step": 8052 + }, + { + "epoch": 0.82, + "grad_norm": 1.6544674731977558, + "learning_rate": 1.6670460181042381e-06, + "loss": 0.7031, + "step": 8053 + }, + { + "epoch": 0.82, + "grad_norm": 1.709302577018215, + "learning_rate": 1.6652252032167337e-06, + "loss": 0.6829, + "step": 8054 + }, + { + "epoch": 0.82, + "grad_norm": 1.6225215150122227, + "learning_rate": 1.6634052929386946e-06, + "loss": 0.7626, + "step": 8055 + }, + { + "epoch": 0.82, + "grad_norm": 1.5709590449037962, + "learning_rate": 1.661586287467638e-06, + "loss": 0.6229, + "step": 8056 + }, + { + "epoch": 0.82, + "grad_norm": 1.8451733707947704, + "learning_rate": 1.6597681870009917e-06, + "loss": 0.7429, + "step": 8057 + }, + { + "epoch": 0.82, + "grad_norm": 1.620867770846572, + "learning_rate": 1.6579509917360859e-06, + "loss": 0.6697, + "step": 8058 + }, + { + "epoch": 0.82, + "grad_norm": 1.6393845757334204, + "learning_rate": 1.6561347018701412e-06, + "loss": 0.5999, + "step": 8059 + }, + { + "epoch": 0.82, + "grad_norm": 1.7245146300654126, + "learning_rate": 1.6543193176002936e-06, + "loss": 0.6413, + "step": 8060 + }, + { + "epoch": 0.82, + "grad_norm": 1.6501570293526682, + "learning_rate": 1.6525048391235698e-06, + "loss": 0.6118, + "step": 8061 + }, + { + "epoch": 0.82, + "grad_norm": 1.5671003694062557, + "learning_rate": 1.6506912666369067e-06, + "loss": 0.7136, + "step": 8062 + }, + { + "epoch": 0.82, + "grad_norm": 1.7856317394038808, + "learning_rate": 1.6488786003371393e-06, + "loss": 0.7216, + "step": 8063 + }, + { + "epoch": 0.82, + "grad_norm": 1.673438416970208, + "learning_rate": 1.647066840421001e-06, + "loss": 0.6913, + "step": 8064 + }, + { + "epoch": 0.82, + "grad_norm": 1.6861994386869004, + "learning_rate": 1.6452559870851336e-06, + "loss": 0.7163, + "step": 8065 + }, + { + "epoch": 0.82, + "grad_norm": 1.6480156694754329, + "learning_rate": 1.6434460405260733e-06, + "loss": 0.7316, + "step": 8066 + }, + { + "epoch": 0.82, + "grad_norm": 1.465375917084765, + "learning_rate": 1.6416370009402627e-06, + "loss": 0.6073, + "step": 8067 + }, + { + "epoch": 0.82, + "grad_norm": 1.6110628327794576, + "learning_rate": 1.6398288685240494e-06, + "loss": 0.7929, + "step": 8068 + }, + { + "epoch": 0.82, + "grad_norm": 1.6048075073218324, + "learning_rate": 1.6380216434736706e-06, + "loss": 0.6459, + "step": 8069 + }, + { + "epoch": 0.82, + "grad_norm": 1.643815248003964, + "learning_rate": 1.6362153259852775e-06, + "loss": 0.6194, + "step": 8070 + }, + { + "epoch": 0.82, + "grad_norm": 1.6452411850619555, + "learning_rate": 1.6344099162549143e-06, + "loss": 0.7063, + "step": 8071 + }, + { + "epoch": 0.82, + "grad_norm": 1.7036325914469437, + "learning_rate": 1.6326054144785319e-06, + "loss": 0.6956, + "step": 8072 + }, + { + "epoch": 0.82, + "grad_norm": 1.7277501315511394, + "learning_rate": 1.6308018208519815e-06, + "loss": 0.6394, + "step": 8073 + }, + { + "epoch": 0.82, + "grad_norm": 1.7138251157363322, + "learning_rate": 1.6289991355710121e-06, + "loss": 0.7295, + "step": 8074 + }, + { + "epoch": 0.82, + "grad_norm": 1.6265331835084222, + "learning_rate": 1.6271973588312784e-06, + "loss": 0.5779, + "step": 8075 + }, + { + "epoch": 0.82, + "grad_norm": 1.6653808390691958, + "learning_rate": 1.6253964908283382e-06, + "loss": 0.7051, + "step": 8076 + }, + { + "epoch": 0.82, + "grad_norm": 1.6804402910950127, + "learning_rate": 1.6235965317576418e-06, + "loss": 0.7356, + "step": 8077 + }, + { + "epoch": 0.82, + "grad_norm": 1.7370222042757812, + "learning_rate": 1.6217974818145532e-06, + "loss": 0.7099, + "step": 8078 + }, + { + "epoch": 0.82, + "grad_norm": 1.4747458546716434, + "learning_rate": 1.6199993411943238e-06, + "loss": 0.5782, + "step": 8079 + }, + { + "epoch": 0.82, + "grad_norm": 1.6859470291193008, + "learning_rate": 1.6182021100921185e-06, + "loss": 0.7637, + "step": 8080 + }, + { + "epoch": 0.82, + "grad_norm": 1.7702922525523404, + "learning_rate": 1.6164057887029994e-06, + "loss": 0.5905, + "step": 8081 + }, + { + "epoch": 0.82, + "grad_norm": 1.6709702750784812, + "learning_rate": 1.6146103772219246e-06, + "loss": 0.6451, + "step": 8082 + }, + { + "epoch": 0.82, + "grad_norm": 1.6401880834646672, + "learning_rate": 1.612815875843763e-06, + "loss": 0.5818, + "step": 8083 + }, + { + "epoch": 0.82, + "grad_norm": 1.8810582744200497, + "learning_rate": 1.611022284763274e-06, + "loss": 0.7115, + "step": 8084 + }, + { + "epoch": 0.82, + "grad_norm": 1.7683313979746031, + "learning_rate": 1.6092296041751277e-06, + "loss": 0.7298, + "step": 8085 + }, + { + "epoch": 0.82, + "grad_norm": 1.7469266290104861, + "learning_rate": 1.607437834273894e-06, + "loss": 0.7389, + "step": 8086 + }, + { + "epoch": 0.82, + "grad_norm": 1.5873409497942523, + "learning_rate": 1.6056469752540349e-06, + "loss": 0.7042, + "step": 8087 + }, + { + "epoch": 0.82, + "grad_norm": 1.683437781534904, + "learning_rate": 1.6038570273099273e-06, + "loss": 0.6705, + "step": 8088 + }, + { + "epoch": 0.82, + "grad_norm": 1.847510008683372, + "learning_rate": 1.602067990635835e-06, + "loss": 0.6703, + "step": 8089 + }, + { + "epoch": 0.82, + "grad_norm": 1.5865058761843016, + "learning_rate": 1.6002798654259333e-06, + "loss": 0.6765, + "step": 8090 + }, + { + "epoch": 0.82, + "grad_norm": 1.4981225938372302, + "learning_rate": 1.5984926518742982e-06, + "loss": 0.5621, + "step": 8091 + }, + { + "epoch": 0.82, + "grad_norm": 1.6472692118142727, + "learning_rate": 1.596706350174898e-06, + "loss": 0.6508, + "step": 8092 + }, + { + "epoch": 0.82, + "grad_norm": 1.645000625397688, + "learning_rate": 1.594920960521611e-06, + "loss": 0.756, + "step": 8093 + }, + { + "epoch": 0.82, + "grad_norm": 1.628324434163129, + "learning_rate": 1.5931364831082152e-06, + "loss": 0.7143, + "step": 8094 + }, + { + "epoch": 0.82, + "grad_norm": 1.8753708051209703, + "learning_rate": 1.5913529181283837e-06, + "loss": 0.7856, + "step": 8095 + }, + { + "epoch": 0.82, + "grad_norm": 1.6221303737097068, + "learning_rate": 1.5895702657756984e-06, + "loss": 0.7017, + "step": 8096 + }, + { + "epoch": 0.82, + "grad_norm": 1.601246385274789, + "learning_rate": 1.5877885262436333e-06, + "loss": 0.6951, + "step": 8097 + }, + { + "epoch": 0.82, + "grad_norm": 1.5418810995917551, + "learning_rate": 1.5860076997255725e-06, + "loss": 0.6298, + "step": 8098 + }, + { + "epoch": 0.82, + "grad_norm": 1.7361284550203793, + "learning_rate": 1.5842277864147971e-06, + "loss": 0.7036, + "step": 8099 + }, + { + "epoch": 0.82, + "grad_norm": 1.5257967081619923, + "learning_rate": 1.5824487865044857e-06, + "loss": 0.5985, + "step": 8100 + }, + { + "epoch": 0.82, + "grad_norm": 1.83644377022114, + "learning_rate": 1.5806707001877253e-06, + "loss": 0.7191, + "step": 8101 + }, + { + "epoch": 0.82, + "grad_norm": 1.6539965105696637, + "learning_rate": 1.5788935276574947e-06, + "loss": 0.7151, + "step": 8102 + }, + { + "epoch": 0.82, + "grad_norm": 1.7310246798846824, + "learning_rate": 1.5771172691066793e-06, + "loss": 0.6899, + "step": 8103 + }, + { + "epoch": 0.82, + "grad_norm": 1.723225438860287, + "learning_rate": 1.5753419247280676e-06, + "loss": 0.7134, + "step": 8104 + }, + { + "epoch": 0.82, + "grad_norm": 1.5959154250822916, + "learning_rate": 1.573567494714342e-06, + "loss": 0.677, + "step": 8105 + }, + { + "epoch": 0.82, + "grad_norm": 1.742594180725846, + "learning_rate": 1.5717939792580916e-06, + "loss": 0.6746, + "step": 8106 + }, + { + "epoch": 0.82, + "grad_norm": 1.709142755507124, + "learning_rate": 1.5700213785518003e-06, + "loss": 0.7439, + "step": 8107 + }, + { + "epoch": 0.82, + "grad_norm": 1.6564530461284355, + "learning_rate": 1.5682496927878577e-06, + "loss": 0.6564, + "step": 8108 + }, + { + "epoch": 0.82, + "grad_norm": 1.949082537417003, + "learning_rate": 1.5664789221585552e-06, + "loss": 0.6938, + "step": 8109 + }, + { + "epoch": 0.82, + "grad_norm": 1.7131596161923532, + "learning_rate": 1.5647090668560794e-06, + "loss": 0.6444, + "step": 8110 + }, + { + "epoch": 0.83, + "grad_norm": 1.6780301599326606, + "learning_rate": 1.5629401270725197e-06, + "loss": 0.6874, + "step": 8111 + }, + { + "epoch": 0.83, + "grad_norm": 1.4744301947942198, + "learning_rate": 1.5611721029998716e-06, + "loss": 0.7268, + "step": 8112 + }, + { + "epoch": 0.83, + "grad_norm": 1.6870236838857529, + "learning_rate": 1.5594049948300205e-06, + "loss": 0.6121, + "step": 8113 + }, + { + "epoch": 0.83, + "grad_norm": 1.4872314938621514, + "learning_rate": 1.557638802754763e-06, + "loss": 0.6208, + "step": 8114 + }, + { + "epoch": 0.83, + "grad_norm": 1.7013662140599177, + "learning_rate": 1.5558735269657877e-06, + "loss": 0.6237, + "step": 8115 + }, + { + "epoch": 0.83, + "grad_norm": 1.7271219737991148, + "learning_rate": 1.5541091676546904e-06, + "loss": 0.6577, + "step": 8116 + }, + { + "epoch": 0.83, + "grad_norm": 1.5893707443428777, + "learning_rate": 1.5523457250129648e-06, + "loss": 0.6867, + "step": 8117 + }, + { + "epoch": 0.83, + "grad_norm": 1.9039068155013918, + "learning_rate": 1.5505831992320031e-06, + "loss": 0.665, + "step": 8118 + }, + { + "epoch": 0.83, + "grad_norm": 1.7217487973886862, + "learning_rate": 1.5488215905031033e-06, + "loss": 0.7883, + "step": 8119 + }, + { + "epoch": 0.83, + "grad_norm": 1.6230993010708583, + "learning_rate": 1.5470608990174551e-06, + "loss": 0.724, + "step": 8120 + }, + { + "epoch": 0.83, + "grad_norm": 1.5198966086936623, + "learning_rate": 1.5453011249661577e-06, + "loss": 0.6361, + "step": 8121 + }, + { + "epoch": 0.83, + "grad_norm": 1.7769794565911328, + "learning_rate": 1.543542268540209e-06, + "loss": 0.7086, + "step": 8122 + }, + { + "epoch": 0.83, + "grad_norm": 1.7323610176209234, + "learning_rate": 1.5417843299305002e-06, + "loss": 0.6249, + "step": 8123 + }, + { + "epoch": 0.83, + "grad_norm": 1.6130541426278338, + "learning_rate": 1.5400273093278328e-06, + "loss": 0.5961, + "step": 8124 + }, + { + "epoch": 0.83, + "grad_norm": 1.8589668993459887, + "learning_rate": 1.5382712069228999e-06, + "loss": 0.6291, + "step": 8125 + }, + { + "epoch": 0.83, + "grad_norm": 1.588078222019581, + "learning_rate": 1.5365160229063004e-06, + "loss": 0.6874, + "step": 8126 + }, + { + "epoch": 0.83, + "grad_norm": 1.7482045277109108, + "learning_rate": 1.5347617574685358e-06, + "loss": 0.5672, + "step": 8127 + }, + { + "epoch": 0.83, + "grad_norm": 1.6977910833089311, + "learning_rate": 1.5330084107999976e-06, + "loss": 0.6242, + "step": 8128 + }, + { + "epoch": 0.83, + "grad_norm": 1.8183946031637237, + "learning_rate": 1.5312559830909878e-06, + "loss": 0.714, + "step": 8129 + }, + { + "epoch": 0.83, + "grad_norm": 1.5263848763255998, + "learning_rate": 1.5295044745317068e-06, + "loss": 0.747, + "step": 8130 + }, + { + "epoch": 0.83, + "grad_norm": 1.8710990805940892, + "learning_rate": 1.5277538853122497e-06, + "loss": 0.731, + "step": 8131 + }, + { + "epoch": 0.83, + "grad_norm": 1.6603387288595752, + "learning_rate": 1.5260042156226184e-06, + "loss": 0.6077, + "step": 8132 + }, + { + "epoch": 0.83, + "grad_norm": 1.671883523275829, + "learning_rate": 1.5242554656527097e-06, + "loss": 0.8259, + "step": 8133 + }, + { + "epoch": 0.83, + "grad_norm": 1.840611579339051, + "learning_rate": 1.522507635592324e-06, + "loss": 0.609, + "step": 8134 + }, + { + "epoch": 0.83, + "grad_norm": 1.851169885764712, + "learning_rate": 1.5207607256311641e-06, + "loss": 0.7148, + "step": 8135 + }, + { + "epoch": 0.83, + "grad_norm": 1.619183101913956, + "learning_rate": 1.5190147359588237e-06, + "loss": 0.6515, + "step": 8136 + }, + { + "epoch": 0.83, + "grad_norm": 1.7002117308407043, + "learning_rate": 1.517269666764809e-06, + "loss": 0.6969, + "step": 8137 + }, + { + "epoch": 0.83, + "grad_norm": 1.7142362259110455, + "learning_rate": 1.5155255182385143e-06, + "loss": 0.6511, + "step": 8138 + }, + { + "epoch": 0.83, + "grad_norm": 1.601914764339988, + "learning_rate": 1.5137822905692423e-06, + "loss": 0.6475, + "step": 8139 + }, + { + "epoch": 0.83, + "grad_norm": 1.7644138479405698, + "learning_rate": 1.5120399839461953e-06, + "loss": 0.6861, + "step": 8140 + }, + { + "epoch": 0.83, + "grad_norm": 1.7064974699464002, + "learning_rate": 1.5102985985584695e-06, + "loss": 0.7408, + "step": 8141 + }, + { + "epoch": 0.83, + "grad_norm": 1.6246618466450624, + "learning_rate": 1.508558134595066e-06, + "loss": 0.6684, + "step": 8142 + }, + { + "epoch": 0.83, + "grad_norm": 1.6562685064954157, + "learning_rate": 1.5068185922448887e-06, + "loss": 0.6657, + "step": 8143 + }, + { + "epoch": 0.83, + "grad_norm": 1.9585946166025932, + "learning_rate": 1.5050799716967313e-06, + "loss": 0.6331, + "step": 8144 + }, + { + "epoch": 0.83, + "grad_norm": 1.671683570356811, + "learning_rate": 1.5033422731392977e-06, + "loss": 0.7035, + "step": 8145 + }, + { + "epoch": 0.83, + "grad_norm": 1.550323061271016, + "learning_rate": 1.5016054967611904e-06, + "loss": 0.6498, + "step": 8146 + }, + { + "epoch": 0.83, + "grad_norm": 1.7797696570553185, + "learning_rate": 1.499869642750904e-06, + "loss": 0.747, + "step": 8147 + }, + { + "epoch": 0.83, + "grad_norm": 1.7232419058262327, + "learning_rate": 1.4981347112968426e-06, + "loss": 0.7048, + "step": 8148 + }, + { + "epoch": 0.83, + "grad_norm": 1.8706299110360487, + "learning_rate": 1.4964007025873017e-06, + "loss": 0.6893, + "step": 8149 + }, + { + "epoch": 0.83, + "grad_norm": 1.604912756002985, + "learning_rate": 1.4946676168104834e-06, + "loss": 0.6486, + "step": 8150 + }, + { + "epoch": 0.83, + "grad_norm": 1.7382472735836743, + "learning_rate": 1.4929354541544882e-06, + "loss": 0.7424, + "step": 8151 + }, + { + "epoch": 0.83, + "grad_norm": 1.9145589612518605, + "learning_rate": 1.4912042148073124e-06, + "loss": 0.7238, + "step": 8152 + }, + { + "epoch": 0.83, + "grad_norm": 1.5925140534896371, + "learning_rate": 1.489473898956857e-06, + "loss": 0.5963, + "step": 8153 + }, + { + "epoch": 0.83, + "grad_norm": 1.7374229282832934, + "learning_rate": 1.4877445067909213e-06, + "loss": 0.7043, + "step": 8154 + }, + { + "epoch": 0.83, + "grad_norm": 1.8373494978688998, + "learning_rate": 1.4860160384972e-06, + "loss": 0.6932, + "step": 8155 + }, + { + "epoch": 0.83, + "grad_norm": 1.7972090772416303, + "learning_rate": 1.4842884942632952e-06, + "loss": 0.7046, + "step": 8156 + }, + { + "epoch": 0.83, + "grad_norm": 1.9947400366471757, + "learning_rate": 1.4825618742767045e-06, + "loss": 0.7492, + "step": 8157 + }, + { + "epoch": 0.83, + "grad_norm": 1.5968741451412138, + "learning_rate": 1.4808361787248237e-06, + "loss": 0.6735, + "step": 8158 + }, + { + "epoch": 0.83, + "grad_norm": 1.7897791644317969, + "learning_rate": 1.4791114077949497e-06, + "loss": 0.6743, + "step": 8159 + }, + { + "epoch": 0.83, + "grad_norm": 1.525728328072972, + "learning_rate": 1.4773875616742828e-06, + "loss": 0.6981, + "step": 8160 + }, + { + "epoch": 0.83, + "grad_norm": 1.8020126122351128, + "learning_rate": 1.4756646405499164e-06, + "loss": 0.7265, + "step": 8161 + }, + { + "epoch": 0.83, + "grad_norm": 1.7665628087081688, + "learning_rate": 1.4739426446088467e-06, + "loss": 0.7165, + "step": 8162 + }, + { + "epoch": 0.83, + "grad_norm": 1.7674743044682641, + "learning_rate": 1.472221574037972e-06, + "loss": 0.6207, + "step": 8163 + }, + { + "epoch": 0.83, + "grad_norm": 1.6395601025438578, + "learning_rate": 1.4705014290240838e-06, + "loss": 0.6381, + "step": 8164 + }, + { + "epoch": 0.83, + "grad_norm": 1.5560174701688523, + "learning_rate": 1.4687822097538796e-06, + "loss": 0.5278, + "step": 8165 + }, + { + "epoch": 0.83, + "grad_norm": 1.5288415132268074, + "learning_rate": 1.4670639164139555e-06, + "loss": 0.6048, + "step": 8166 + }, + { + "epoch": 0.83, + "grad_norm": 1.805040181247626, + "learning_rate": 1.4653465491908003e-06, + "loss": 0.6458, + "step": 8167 + }, + { + "epoch": 0.83, + "grad_norm": 1.552499177462516, + "learning_rate": 1.4636301082708127e-06, + "loss": 0.6218, + "step": 8168 + }, + { + "epoch": 0.83, + "grad_norm": 1.6076231371219833, + "learning_rate": 1.4619145938402813e-06, + "loss": 0.6282, + "step": 8169 + }, + { + "epoch": 0.83, + "grad_norm": 1.7702824958542995, + "learning_rate": 1.4602000060853994e-06, + "loss": 0.7116, + "step": 8170 + }, + { + "epoch": 0.83, + "grad_norm": 1.8301214132381038, + "learning_rate": 1.4584863451922615e-06, + "loss": 0.7077, + "step": 8171 + }, + { + "epoch": 0.83, + "grad_norm": 1.8228686877917728, + "learning_rate": 1.4567736113468545e-06, + "loss": 0.7512, + "step": 8172 + }, + { + "epoch": 0.83, + "grad_norm": 1.7104293452984662, + "learning_rate": 1.4550618047350728e-06, + "loss": 0.7394, + "step": 8173 + }, + { + "epoch": 0.83, + "grad_norm": 1.8864743707993548, + "learning_rate": 1.4533509255427013e-06, + "loss": 0.7376, + "step": 8174 + }, + { + "epoch": 0.83, + "grad_norm": 1.7191844775183622, + "learning_rate": 1.4516409739554338e-06, + "loss": 0.6737, + "step": 8175 + }, + { + "epoch": 0.83, + "grad_norm": 1.7633488087541436, + "learning_rate": 1.4499319501588582e-06, + "loss": 0.7298, + "step": 8176 + }, + { + "epoch": 0.83, + "grad_norm": 1.7316191985407565, + "learning_rate": 1.4482238543384596e-06, + "loss": 0.6715, + "step": 8177 + }, + { + "epoch": 0.83, + "grad_norm": 1.5948138141341184, + "learning_rate": 1.4465166866796298e-06, + "loss": 0.6769, + "step": 8178 + }, + { + "epoch": 0.83, + "grad_norm": 1.6954440427739106, + "learning_rate": 1.4448104473676483e-06, + "loss": 0.7358, + "step": 8179 + }, + { + "epoch": 0.83, + "grad_norm": 1.8078562613994282, + "learning_rate": 1.4431051365877058e-06, + "loss": 0.7675, + "step": 8180 + }, + { + "epoch": 0.83, + "grad_norm": 1.6272987902402685, + "learning_rate": 1.4414007545248875e-06, + "loss": 0.6328, + "step": 8181 + }, + { + "epoch": 0.83, + "grad_norm": 1.6629866223470973, + "learning_rate": 1.4396973013641736e-06, + "loss": 0.7112, + "step": 8182 + }, + { + "epoch": 0.83, + "grad_norm": 1.7560931270102156, + "learning_rate": 1.4379947772904502e-06, + "loss": 0.6473, + "step": 8183 + }, + { + "epoch": 0.83, + "grad_norm": 1.6533183558470694, + "learning_rate": 1.4362931824884995e-06, + "loss": 0.7396, + "step": 8184 + }, + { + "epoch": 0.83, + "grad_norm": 1.7162203646747494, + "learning_rate": 1.4345925171430019e-06, + "loss": 0.6405, + "step": 8185 + }, + { + "epoch": 0.83, + "grad_norm": 1.7371762319524415, + "learning_rate": 1.4328927814385397e-06, + "loss": 0.6946, + "step": 8186 + }, + { + "epoch": 0.83, + "grad_norm": 1.6011820265500123, + "learning_rate": 1.4311939755595904e-06, + "loss": 0.6324, + "step": 8187 + }, + { + "epoch": 0.83, + "grad_norm": 1.718557478458414, + "learning_rate": 1.4294960996905328e-06, + "loss": 0.7573, + "step": 8188 + }, + { + "epoch": 0.83, + "grad_norm": 1.8507287971729904, + "learning_rate": 1.4277991540156487e-06, + "loss": 0.6725, + "step": 8189 + }, + { + "epoch": 0.83, + "grad_norm": 1.6812252065295867, + "learning_rate": 1.4261031387191093e-06, + "loss": 0.6939, + "step": 8190 + }, + { + "epoch": 0.83, + "grad_norm": 1.5990936705216816, + "learning_rate": 1.424408053984997e-06, + "loss": 0.6721, + "step": 8191 + }, + { + "epoch": 0.83, + "grad_norm": 1.644941347595725, + "learning_rate": 1.4227138999972801e-06, + "loss": 0.6082, + "step": 8192 + }, + { + "epoch": 0.83, + "grad_norm": 1.6488723534423682, + "learning_rate": 1.421020676939835e-06, + "loss": 0.6637, + "step": 8193 + }, + { + "epoch": 0.83, + "grad_norm": 1.641270793531587, + "learning_rate": 1.4193283849964379e-06, + "loss": 0.7056, + "step": 8194 + }, + { + "epoch": 0.83, + "grad_norm": 1.6060072990891627, + "learning_rate": 1.4176370243507564e-06, + "loss": 0.6986, + "step": 8195 + }, + { + "epoch": 0.83, + "grad_norm": 1.7266934811155912, + "learning_rate": 1.4159465951863638e-06, + "loss": 0.7088, + "step": 8196 + }, + { + "epoch": 0.83, + "grad_norm": 1.5718313566278994, + "learning_rate": 1.4142570976867276e-06, + "loss": 0.7139, + "step": 8197 + }, + { + "epoch": 0.83, + "grad_norm": 1.7411643009476758, + "learning_rate": 1.4125685320352167e-06, + "loss": 0.6195, + "step": 8198 + }, + { + "epoch": 0.83, + "grad_norm": 1.8999500432719307, + "learning_rate": 1.4108808984151023e-06, + "loss": 0.7139, + "step": 8199 + }, + { + "epoch": 0.83, + "grad_norm": 1.493831320737935, + "learning_rate": 1.4091941970095446e-06, + "loss": 0.6281, + "step": 8200 + }, + { + "epoch": 0.83, + "grad_norm": 1.7084876257684942, + "learning_rate": 1.4075084280016138e-06, + "loss": 0.7867, + "step": 8201 + }, + { + "epoch": 0.83, + "grad_norm": 1.7936129382568762, + "learning_rate": 1.4058235915742702e-06, + "loss": 0.7075, + "step": 8202 + }, + { + "epoch": 0.83, + "grad_norm": 1.6964101364789455, + "learning_rate": 1.4041396879103763e-06, + "loss": 0.6834, + "step": 8203 + }, + { + "epoch": 0.83, + "grad_norm": 1.738053483920836, + "learning_rate": 1.4024567171926984e-06, + "loss": 0.7705, + "step": 8204 + }, + { + "epoch": 0.83, + "grad_norm": 1.8014855759132964, + "learning_rate": 1.4007746796038912e-06, + "loss": 0.6916, + "step": 8205 + }, + { + "epoch": 0.83, + "grad_norm": 1.7326603410539312, + "learning_rate": 1.3990935753265155e-06, + "loss": 0.816, + "step": 8206 + }, + { + "epoch": 0.83, + "grad_norm": 1.7264773657111878, + "learning_rate": 1.397413404543031e-06, + "loss": 0.7231, + "step": 8207 + }, + { + "epoch": 0.83, + "grad_norm": 1.5624102154971813, + "learning_rate": 1.3957341674357904e-06, + "loss": 0.7574, + "step": 8208 + }, + { + "epoch": 0.84, + "grad_norm": 1.5545206389114075, + "learning_rate": 1.3940558641870517e-06, + "loss": 0.6499, + "step": 8209 + }, + { + "epoch": 0.84, + "grad_norm": 1.6880268153759315, + "learning_rate": 1.3923784949789654e-06, + "loss": 0.7573, + "step": 8210 + }, + { + "epoch": 0.84, + "grad_norm": 1.720853432426547, + "learning_rate": 1.390702059993585e-06, + "loss": 0.7196, + "step": 8211 + }, + { + "epoch": 0.84, + "grad_norm": 1.6481869374763833, + "learning_rate": 1.3890265594128638e-06, + "loss": 0.6756, + "step": 8212 + }, + { + "epoch": 0.84, + "grad_norm": 1.6094648483301166, + "learning_rate": 1.3873519934186474e-06, + "loss": 0.7078, + "step": 8213 + }, + { + "epoch": 0.84, + "grad_norm": 1.618792032977977, + "learning_rate": 1.3856783621926873e-06, + "loss": 0.682, + "step": 8214 + }, + { + "epoch": 0.84, + "grad_norm": 1.5878214192541364, + "learning_rate": 1.3840056659166257e-06, + "loss": 0.7473, + "step": 8215 + }, + { + "epoch": 0.84, + "grad_norm": 1.5842895543466722, + "learning_rate": 1.3823339047720096e-06, + "loss": 0.6335, + "step": 8216 + }, + { + "epoch": 0.84, + "grad_norm": 1.7782310880484486, + "learning_rate": 1.3806630789402852e-06, + "loss": 0.7653, + "step": 8217 + }, + { + "epoch": 0.84, + "grad_norm": 1.6956498406697338, + "learning_rate": 1.3789931886027907e-06, + "loss": 0.5894, + "step": 8218 + }, + { + "epoch": 0.84, + "grad_norm": 1.7475029519745575, + "learning_rate": 1.37732423394077e-06, + "loss": 0.7174, + "step": 8219 + }, + { + "epoch": 0.84, + "grad_norm": 1.6154412892748924, + "learning_rate": 1.375656215135358e-06, + "loss": 0.6459, + "step": 8220 + }, + { + "epoch": 0.84, + "grad_norm": 1.9434901904421764, + "learning_rate": 1.3739891323675935e-06, + "loss": 0.6698, + "step": 8221 + }, + { + "epoch": 0.84, + "grad_norm": 1.6567625078813784, + "learning_rate": 1.3723229858184162e-06, + "loss": 0.6578, + "step": 8222 + }, + { + "epoch": 0.84, + "grad_norm": 1.776408853955685, + "learning_rate": 1.3706577756686545e-06, + "loss": 0.7233, + "step": 8223 + }, + { + "epoch": 0.84, + "grad_norm": 1.8514879954569197, + "learning_rate": 1.368993502099043e-06, + "loss": 0.7251, + "step": 8224 + }, + { + "epoch": 0.84, + "grad_norm": 1.5768850107459567, + "learning_rate": 1.367330165290215e-06, + "loss": 0.6744, + "step": 8225 + }, + { + "epoch": 0.84, + "grad_norm": 1.4427110331792115, + "learning_rate": 1.3656677654226957e-06, + "loss": 0.5715, + "step": 8226 + }, + { + "epoch": 0.84, + "grad_norm": 1.7838598756278934, + "learning_rate": 1.364006302676918e-06, + "loss": 0.7783, + "step": 8227 + }, + { + "epoch": 0.84, + "grad_norm": 1.6732822161714365, + "learning_rate": 1.3623457772332005e-06, + "loss": 0.6944, + "step": 8228 + }, + { + "epoch": 0.84, + "grad_norm": 1.7225981120830656, + "learning_rate": 1.3606861892717715e-06, + "loss": 0.5786, + "step": 8229 + }, + { + "epoch": 0.84, + "grad_norm": 1.7587114484153354, + "learning_rate": 1.359027538972756e-06, + "loss": 0.7033, + "step": 8230 + }, + { + "epoch": 0.84, + "grad_norm": 1.850649807807029, + "learning_rate": 1.3573698265161683e-06, + "loss": 0.6353, + "step": 8231 + }, + { + "epoch": 0.84, + "grad_norm": 1.6641610341874145, + "learning_rate": 1.3557130520819328e-06, + "loss": 0.7124, + "step": 8232 + }, + { + "epoch": 0.84, + "grad_norm": 1.9149203507188295, + "learning_rate": 1.3540572158498622e-06, + "loss": 0.7167, + "step": 8233 + }, + { + "epoch": 0.84, + "grad_norm": 1.7525317054907879, + "learning_rate": 1.3524023179996725e-06, + "loss": 0.767, + "step": 8234 + }, + { + "epoch": 0.84, + "grad_norm": 1.7889664661142302, + "learning_rate": 1.3507483587109805e-06, + "loss": 0.7336, + "step": 8235 + }, + { + "epoch": 0.84, + "grad_norm": 1.5378168604185913, + "learning_rate": 1.3490953381632933e-06, + "loss": 0.6962, + "step": 8236 + }, + { + "epoch": 0.84, + "grad_norm": 1.8147784384972894, + "learning_rate": 1.347443256536024e-06, + "loss": 0.6153, + "step": 8237 + }, + { + "epoch": 0.84, + "grad_norm": 1.7302525034904064, + "learning_rate": 1.3457921140084761e-06, + "loss": 0.7129, + "step": 8238 + }, + { + "epoch": 0.84, + "grad_norm": 1.5835440184825371, + "learning_rate": 1.3441419107598575e-06, + "loss": 0.7461, + "step": 8239 + }, + { + "epoch": 0.84, + "grad_norm": 1.9160856857123725, + "learning_rate": 1.3424926469692734e-06, + "loss": 0.7012, + "step": 8240 + }, + { + "epoch": 0.84, + "grad_norm": 1.7432989014314013, + "learning_rate": 1.340844322815723e-06, + "loss": 0.698, + "step": 8241 + }, + { + "epoch": 0.84, + "grad_norm": 1.826200832807161, + "learning_rate": 1.3391969384781066e-06, + "loss": 0.768, + "step": 8242 + }, + { + "epoch": 0.84, + "grad_norm": 1.6947304695110976, + "learning_rate": 1.3375504941352257e-06, + "loss": 0.666, + "step": 8243 + }, + { + "epoch": 0.84, + "grad_norm": 1.6355678464956755, + "learning_rate": 1.3359049899657706e-06, + "loss": 0.6915, + "step": 8244 + }, + { + "epoch": 0.84, + "grad_norm": 1.7745025367531957, + "learning_rate": 1.3342604261483406e-06, + "loss": 0.7305, + "step": 8245 + }, + { + "epoch": 0.84, + "grad_norm": 1.7418465569279045, + "learning_rate": 1.3326168028614206e-06, + "loss": 0.6738, + "step": 8246 + }, + { + "epoch": 0.84, + "grad_norm": 1.7872437432117918, + "learning_rate": 1.3309741202834047e-06, + "loss": 0.6452, + "step": 8247 + }, + { + "epoch": 0.84, + "grad_norm": 1.7614011552562099, + "learning_rate": 1.3293323785925816e-06, + "loss": 0.6973, + "step": 8248 + }, + { + "epoch": 0.84, + "grad_norm": 1.8171301925702645, + "learning_rate": 1.327691577967133e-06, + "loss": 0.6592, + "step": 8249 + }, + { + "epoch": 0.84, + "grad_norm": 1.7057564407848962, + "learning_rate": 1.326051718585144e-06, + "loss": 0.7125, + "step": 8250 + }, + { + "epoch": 0.84, + "grad_norm": 1.9809910038871592, + "learning_rate": 1.324412800624597e-06, + "loss": 0.6787, + "step": 8251 + }, + { + "epoch": 0.84, + "grad_norm": 1.6486996929916975, + "learning_rate": 1.3227748242633675e-06, + "loss": 0.6453, + "step": 8252 + }, + { + "epoch": 0.84, + "grad_norm": 1.612309839141793, + "learning_rate": 1.3211377896792365e-06, + "loss": 0.6477, + "step": 8253 + }, + { + "epoch": 0.84, + "grad_norm": 1.7284065002363935, + "learning_rate": 1.319501697049874e-06, + "loss": 0.6668, + "step": 8254 + }, + { + "epoch": 0.84, + "grad_norm": 1.7727740544108945, + "learning_rate": 1.3178665465528551e-06, + "loss": 0.6749, + "step": 8255 + }, + { + "epoch": 0.84, + "grad_norm": 1.7506034964250732, + "learning_rate": 1.3162323383656506e-06, + "loss": 0.5859, + "step": 8256 + }, + { + "epoch": 0.84, + "grad_norm": 1.4653029512735753, + "learning_rate": 1.3145990726656244e-06, + "loss": 0.6423, + "step": 8257 + }, + { + "epoch": 0.84, + "grad_norm": 1.7090647923838687, + "learning_rate": 1.3129667496300446e-06, + "loss": 0.6786, + "step": 8258 + }, + { + "epoch": 0.84, + "grad_norm": 1.7074457410507926, + "learning_rate": 1.3113353694360764e-06, + "loss": 0.6967, + "step": 8259 + }, + { + "epoch": 0.84, + "grad_norm": 1.469735361171108, + "learning_rate": 1.309704932260777e-06, + "loss": 0.6104, + "step": 8260 + }, + { + "epoch": 0.84, + "grad_norm": 1.7563334951394989, + "learning_rate": 1.3080754382811055e-06, + "loss": 0.7081, + "step": 8261 + }, + { + "epoch": 0.84, + "grad_norm": 1.9075491493123156, + "learning_rate": 1.30644688767392e-06, + "loss": 0.769, + "step": 8262 + }, + { + "epoch": 0.84, + "grad_norm": 1.911303763578774, + "learning_rate": 1.3048192806159721e-06, + "loss": 0.598, + "step": 8263 + }, + { + "epoch": 0.84, + "grad_norm": 1.7526459129348975, + "learning_rate": 1.3031926172839126e-06, + "loss": 0.7159, + "step": 8264 + }, + { + "epoch": 0.84, + "grad_norm": 1.6665938973477412, + "learning_rate": 1.301566897854295e-06, + "loss": 0.6275, + "step": 8265 + }, + { + "epoch": 0.84, + "grad_norm": 1.5753078101727518, + "learning_rate": 1.2999421225035602e-06, + "loss": 0.634, + "step": 8266 + }, + { + "epoch": 0.84, + "grad_norm": 1.537542156790524, + "learning_rate": 1.298318291408054e-06, + "loss": 0.602, + "step": 8267 + }, + { + "epoch": 0.84, + "grad_norm": 1.8364242230592978, + "learning_rate": 1.2966954047440194e-06, + "loss": 0.6858, + "step": 8268 + }, + { + "epoch": 0.84, + "grad_norm": 1.5620594579846787, + "learning_rate": 1.2950734626875927e-06, + "loss": 0.6455, + "step": 8269 + }, + { + "epoch": 0.84, + "grad_norm": 1.643541532600793, + "learning_rate": 1.2934524654148118e-06, + "loss": 0.6662, + "step": 8270 + }, + { + "epoch": 0.84, + "grad_norm": 1.7780861869537536, + "learning_rate": 1.2918324131016135e-06, + "loss": 0.6691, + "step": 8271 + }, + { + "epoch": 0.84, + "grad_norm": 1.477593720782914, + "learning_rate": 1.290213305923823e-06, + "loss": 0.6359, + "step": 8272 + }, + { + "epoch": 0.84, + "grad_norm": 1.873980085787594, + "learning_rate": 1.2885951440571754e-06, + "loss": 0.6693, + "step": 8273 + }, + { + "epoch": 0.84, + "grad_norm": 1.6247128817864538, + "learning_rate": 1.286977927677291e-06, + "loss": 0.6256, + "step": 8274 + }, + { + "epoch": 0.84, + "grad_norm": 1.723425124139416, + "learning_rate": 1.285361656959696e-06, + "loss": 0.7455, + "step": 8275 + }, + { + "epoch": 0.84, + "grad_norm": 1.5730964779506738, + "learning_rate": 1.2837463320798138e-06, + "loss": 0.7081, + "step": 8276 + }, + { + "epoch": 0.84, + "grad_norm": 1.566190339493104, + "learning_rate": 1.2821319532129584e-06, + "loss": 0.6472, + "step": 8277 + }, + { + "epoch": 0.84, + "grad_norm": 1.870457612150168, + "learning_rate": 1.2805185205343462e-06, + "loss": 0.6416, + "step": 8278 + }, + { + "epoch": 0.84, + "grad_norm": 1.63866119536756, + "learning_rate": 1.278906034219094e-06, + "loss": 0.7834, + "step": 8279 + }, + { + "epoch": 0.84, + "grad_norm": 1.8807338285351183, + "learning_rate": 1.2772944944422073e-06, + "loss": 0.721, + "step": 8280 + }, + { + "epoch": 0.84, + "grad_norm": 1.8934271455408902, + "learning_rate": 1.2756839013785971e-06, + "loss": 0.7553, + "step": 8281 + }, + { + "epoch": 0.84, + "grad_norm": 1.9750340584837798, + "learning_rate": 1.274074255203065e-06, + "loss": 0.7215, + "step": 8282 + }, + { + "epoch": 0.84, + "grad_norm": 1.605310988428569, + "learning_rate": 1.2724655560903133e-06, + "loss": 0.7834, + "step": 8283 + }, + { + "epoch": 0.84, + "grad_norm": 1.695518140987458, + "learning_rate": 1.2708578042149444e-06, + "loss": 0.7367, + "step": 8284 + }, + { + "epoch": 0.84, + "grad_norm": 1.668408369362041, + "learning_rate": 1.2692509997514513e-06, + "loss": 0.5848, + "step": 8285 + }, + { + "epoch": 0.84, + "grad_norm": 1.7085874099620402, + "learning_rate": 1.2676451428742297e-06, + "loss": 0.744, + "step": 8286 + }, + { + "epoch": 0.84, + "grad_norm": 1.8688038523063668, + "learning_rate": 1.2660402337575672e-06, + "loss": 0.681, + "step": 8287 + }, + { + "epoch": 0.84, + "grad_norm": 1.969306422104363, + "learning_rate": 1.2644362725756531e-06, + "loss": 0.7181, + "step": 8288 + }, + { + "epoch": 0.84, + "grad_norm": 1.6681407177929888, + "learning_rate": 1.262833259502575e-06, + "loss": 0.7593, + "step": 8289 + }, + { + "epoch": 0.84, + "grad_norm": 1.661293988860556, + "learning_rate": 1.2612311947123102e-06, + "loss": 0.7058, + "step": 8290 + }, + { + "epoch": 0.84, + "grad_norm": 1.840624486989448, + "learning_rate": 1.259630078378743e-06, + "loss": 0.6597, + "step": 8291 + }, + { + "epoch": 0.84, + "grad_norm": 1.7114529523092323, + "learning_rate": 1.2580299106756444e-06, + "loss": 0.6687, + "step": 8292 + }, + { + "epoch": 0.84, + "grad_norm": 1.579609069103059, + "learning_rate": 1.2564306917766888e-06, + "loss": 0.6612, + "step": 8293 + }, + { + "epoch": 0.84, + "grad_norm": 1.6687574529398108, + "learning_rate": 1.2548324218554508e-06, + "loss": 0.6822, + "step": 8294 + }, + { + "epoch": 0.84, + "grad_norm": 1.6662587476244746, + "learning_rate": 1.2532351010853916e-06, + "loss": 0.701, + "step": 8295 + }, + { + "epoch": 0.84, + "grad_norm": 1.780952607420748, + "learning_rate": 1.251638729639878e-06, + "loss": 0.6609, + "step": 8296 + }, + { + "epoch": 0.84, + "grad_norm": 1.6701796692628006, + "learning_rate": 1.2500433076921737e-06, + "loss": 0.7236, + "step": 8297 + }, + { + "epoch": 0.84, + "grad_norm": 1.6958271656639399, + "learning_rate": 1.2484488354154322e-06, + "loss": 0.7342, + "step": 8298 + }, + { + "epoch": 0.84, + "grad_norm": 1.5776014698998455, + "learning_rate": 1.2468553129827132e-06, + "loss": 0.65, + "step": 8299 + }, + { + "epoch": 0.84, + "grad_norm": 1.7931854484696272, + "learning_rate": 1.2452627405669637e-06, + "loss": 0.8245, + "step": 8300 + }, + { + "epoch": 0.84, + "grad_norm": 1.7502794056969053, + "learning_rate": 1.2436711183410344e-06, + "loss": 0.5966, + "step": 8301 + }, + { + "epoch": 0.84, + "grad_norm": 1.6829815801088328, + "learning_rate": 1.2420804464776736e-06, + "loss": 0.6106, + "step": 8302 + }, + { + "epoch": 0.84, + "grad_norm": 1.519296170317051, + "learning_rate": 1.24049072514952e-06, + "loss": 0.6843, + "step": 8303 + }, + { + "epoch": 0.84, + "grad_norm": 1.6324457551020213, + "learning_rate": 1.238901954529117e-06, + "loss": 0.6252, + "step": 8304 + }, + { + "epoch": 0.84, + "grad_norm": 1.5249173181106488, + "learning_rate": 1.2373141347888973e-06, + "loss": 0.6963, + "step": 8305 + }, + { + "epoch": 0.84, + "grad_norm": 2.622005851140321, + "learning_rate": 1.2357272661011943e-06, + "loss": 0.7062, + "step": 8306 + }, + { + "epoch": 0.84, + "grad_norm": 1.8225506321546634, + "learning_rate": 1.2341413486382404e-06, + "loss": 0.6429, + "step": 8307 + }, + { + "epoch": 0.85, + "grad_norm": 1.4583705955133515, + "learning_rate": 1.2325563825721587e-06, + "loss": 0.6684, + "step": 8308 + }, + { + "epoch": 0.85, + "grad_norm": 1.7633612538154486, + "learning_rate": 1.2309723680749763e-06, + "loss": 0.6642, + "step": 8309 + }, + { + "epoch": 0.85, + "grad_norm": 1.8176392806658217, + "learning_rate": 1.229389305318609e-06, + "loss": 0.7658, + "step": 8310 + }, + { + "epoch": 0.85, + "grad_norm": 1.7129799594557809, + "learning_rate": 1.2278071944748748e-06, + "loss": 0.7848, + "step": 8311 + }, + { + "epoch": 0.85, + "grad_norm": 1.6597211231973432, + "learning_rate": 1.226226035715491e-06, + "loss": 0.7077, + "step": 8312 + }, + { + "epoch": 0.85, + "grad_norm": 1.509184909464347, + "learning_rate": 1.2246458292120621e-06, + "loss": 0.5519, + "step": 8313 + }, + { + "epoch": 0.85, + "grad_norm": 1.7605713217212096, + "learning_rate": 1.2230665751360983e-06, + "loss": 0.691, + "step": 8314 + }, + { + "epoch": 0.85, + "grad_norm": 1.6014344717266955, + "learning_rate": 1.2214882736590029e-06, + "loss": 0.6665, + "step": 8315 + }, + { + "epoch": 0.85, + "grad_norm": 1.7419877348732173, + "learning_rate": 1.2199109249520724e-06, + "loss": 0.6899, + "step": 8316 + }, + { + "epoch": 0.85, + "grad_norm": 1.726432291563157, + "learning_rate": 1.2183345291865089e-06, + "loss": 0.6821, + "step": 8317 + }, + { + "epoch": 0.85, + "grad_norm": 1.8575890894392022, + "learning_rate": 1.2167590865333988e-06, + "loss": 0.7333, + "step": 8318 + }, + { + "epoch": 0.85, + "grad_norm": 1.5533278448200125, + "learning_rate": 1.2151845971637365e-06, + "loss": 0.6554, + "step": 8319 + }, + { + "epoch": 0.85, + "grad_norm": 1.7866467208283314, + "learning_rate": 1.2136110612484086e-06, + "loss": 0.7351, + "step": 8320 + }, + { + "epoch": 0.85, + "grad_norm": 1.8386786003634514, + "learning_rate": 1.2120384789581953e-06, + "loss": 0.7588, + "step": 8321 + }, + { + "epoch": 0.85, + "grad_norm": 1.5810040818908622, + "learning_rate": 1.2104668504637773e-06, + "loss": 0.6545, + "step": 8322 + }, + { + "epoch": 0.85, + "grad_norm": 1.562322712822364, + "learning_rate": 1.2088961759357287e-06, + "loss": 0.6201, + "step": 8323 + }, + { + "epoch": 0.85, + "grad_norm": 1.4897774268415849, + "learning_rate": 1.207326455544522e-06, + "loss": 0.6774, + "step": 8324 + }, + { + "epoch": 0.85, + "grad_norm": 1.723186570192101, + "learning_rate": 1.2057576894605294e-06, + "loss": 0.7934, + "step": 8325 + }, + { + "epoch": 0.85, + "grad_norm": 1.6427088516879875, + "learning_rate": 1.2041898778540096e-06, + "loss": 0.6514, + "step": 8326 + }, + { + "epoch": 0.85, + "grad_norm": 1.6999594989840934, + "learning_rate": 1.2026230208951307e-06, + "loss": 0.7262, + "step": 8327 + }, + { + "epoch": 0.85, + "grad_norm": 1.5443055445751726, + "learning_rate": 1.2010571187539454e-06, + "loss": 0.8429, + "step": 8328 + }, + { + "epoch": 0.85, + "grad_norm": 1.7848686000066742, + "learning_rate": 1.1994921716004093e-06, + "loss": 0.6515, + "step": 8329 + }, + { + "epoch": 0.85, + "grad_norm": 1.7576592002961013, + "learning_rate": 1.1979281796043752e-06, + "loss": 0.7112, + "step": 8330 + }, + { + "epoch": 0.85, + "grad_norm": 1.7057954207529185, + "learning_rate": 1.1963651429355871e-06, + "loss": 0.6285, + "step": 8331 + }, + { + "epoch": 0.85, + "grad_norm": 1.8488761820970288, + "learning_rate": 1.1948030617636885e-06, + "loss": 0.6229, + "step": 8332 + }, + { + "epoch": 0.85, + "grad_norm": 1.8905900773570874, + "learning_rate": 1.1932419362582215e-06, + "loss": 0.8246, + "step": 8333 + }, + { + "epoch": 0.85, + "grad_norm": 1.8718436174150623, + "learning_rate": 1.1916817665886183e-06, + "loss": 0.6485, + "step": 8334 + }, + { + "epoch": 0.85, + "grad_norm": 1.912184871813602, + "learning_rate": 1.1901225529242145e-06, + "loss": 0.7346, + "step": 8335 + }, + { + "epoch": 0.85, + "grad_norm": 1.7569317956831032, + "learning_rate": 1.1885642954342347e-06, + "loss": 0.8115, + "step": 8336 + }, + { + "epoch": 0.85, + "grad_norm": 1.5771053460843896, + "learning_rate": 1.187006994287806e-06, + "loss": 0.6744, + "step": 8337 + }, + { + "epoch": 0.85, + "grad_norm": 1.8762386730077556, + "learning_rate": 1.1854506496539485e-06, + "loss": 0.6746, + "step": 8338 + }, + { + "epoch": 0.85, + "grad_norm": 1.664766180889851, + "learning_rate": 1.1838952617015786e-06, + "loss": 0.7055, + "step": 8339 + }, + { + "epoch": 0.85, + "grad_norm": 1.8189649427529284, + "learning_rate": 1.182340830599511e-06, + "loss": 0.7021, + "step": 8340 + }, + { + "epoch": 0.85, + "grad_norm": 1.5837208446038582, + "learning_rate": 1.1807873565164507e-06, + "loss": 0.5835, + "step": 8341 + }, + { + "epoch": 0.85, + "grad_norm": 1.592398960952057, + "learning_rate": 1.1792348396210064e-06, + "loss": 0.6298, + "step": 8342 + }, + { + "epoch": 0.85, + "grad_norm": 1.7031410402159544, + "learning_rate": 1.1776832800816807e-06, + "loss": 0.6705, + "step": 8343 + }, + { + "epoch": 0.85, + "grad_norm": 1.39023837442245, + "learning_rate": 1.176132678066868e-06, + "loss": 0.5857, + "step": 8344 + }, + { + "epoch": 0.85, + "grad_norm": 1.5811862674391624, + "learning_rate": 1.174583033744865e-06, + "loss": 0.6163, + "step": 8345 + }, + { + "epoch": 0.85, + "grad_norm": 1.7060951684814682, + "learning_rate": 1.1730343472838568e-06, + "loss": 0.7131, + "step": 8346 + }, + { + "epoch": 0.85, + "grad_norm": 1.672908454020415, + "learning_rate": 1.1714866188519325e-06, + "loss": 0.6297, + "step": 8347 + }, + { + "epoch": 0.85, + "grad_norm": 1.6320128241024243, + "learning_rate": 1.1699398486170755e-06, + "loss": 0.5627, + "step": 8348 + }, + { + "epoch": 0.85, + "grad_norm": 1.6833927256365844, + "learning_rate": 1.1683940367471592e-06, + "loss": 0.6213, + "step": 8349 + }, + { + "epoch": 0.85, + "grad_norm": 1.722046538171397, + "learning_rate": 1.1668491834099606e-06, + "loss": 0.7279, + "step": 8350 + }, + { + "epoch": 0.85, + "grad_norm": 1.6351716824061726, + "learning_rate": 1.1653052887731465e-06, + "loss": 0.5468, + "step": 8351 + }, + { + "epoch": 0.85, + "grad_norm": 1.9516902571195383, + "learning_rate": 1.1637623530042842e-06, + "loss": 0.6836, + "step": 8352 + }, + { + "epoch": 0.85, + "grad_norm": 1.5973794419071494, + "learning_rate": 1.1622203762708374e-06, + "loss": 0.6042, + "step": 8353 + }, + { + "epoch": 0.85, + "grad_norm": 1.7871573535980738, + "learning_rate": 1.1606793587401588e-06, + "loss": 0.7547, + "step": 8354 + }, + { + "epoch": 0.85, + "grad_norm": 1.593337729041243, + "learning_rate": 1.1591393005795049e-06, + "loss": 0.6016, + "step": 8355 + }, + { + "epoch": 0.85, + "grad_norm": 1.6813138223461972, + "learning_rate": 1.1576002019560261e-06, + "loss": 0.6707, + "step": 8356 + }, + { + "epoch": 0.85, + "grad_norm": 1.782124641021241, + "learning_rate": 1.1560620630367635e-06, + "loss": 0.6561, + "step": 8357 + }, + { + "epoch": 0.85, + "grad_norm": 1.6552514721499414, + "learning_rate": 1.1545248839886624e-06, + "loss": 0.6258, + "step": 8358 + }, + { + "epoch": 0.85, + "grad_norm": 1.7510283040904677, + "learning_rate": 1.152988664978556e-06, + "loss": 0.6952, + "step": 8359 + }, + { + "epoch": 0.85, + "grad_norm": 1.6683898421711663, + "learning_rate": 1.1514534061731774e-06, + "loss": 0.6231, + "step": 8360 + }, + { + "epoch": 0.85, + "grad_norm": 1.7258501235344073, + "learning_rate": 1.149919107739158e-06, + "loss": 0.6234, + "step": 8361 + }, + { + "epoch": 0.85, + "grad_norm": 1.5408850111580221, + "learning_rate": 1.1483857698430178e-06, + "loss": 0.6398, + "step": 8362 + }, + { + "epoch": 0.85, + "grad_norm": 1.6650248318563607, + "learning_rate": 1.1468533926511783e-06, + "loss": 0.7315, + "step": 8363 + }, + { + "epoch": 0.85, + "grad_norm": 1.6331762688693068, + "learning_rate": 1.1453219763299572e-06, + "loss": 0.7218, + "step": 8364 + }, + { + "epoch": 0.85, + "grad_norm": 1.6692634858355666, + "learning_rate": 1.143791521045562e-06, + "loss": 0.662, + "step": 8365 + }, + { + "epoch": 0.85, + "grad_norm": 1.7281442159501406, + "learning_rate": 1.1422620269641015e-06, + "loss": 0.6767, + "step": 8366 + }, + { + "epoch": 0.85, + "grad_norm": 1.8944519361838665, + "learning_rate": 1.1407334942515801e-06, + "loss": 0.7541, + "step": 8367 + }, + { + "epoch": 0.85, + "grad_norm": 1.72952315228512, + "learning_rate": 1.139205923073894e-06, + "loss": 0.7566, + "step": 8368 + }, + { + "epoch": 0.85, + "grad_norm": 1.615345607547421, + "learning_rate": 1.1376793135968357e-06, + "loss": 0.694, + "step": 8369 + }, + { + "epoch": 0.85, + "grad_norm": 1.5608929827737215, + "learning_rate": 1.1361536659861005e-06, + "loss": 0.7242, + "step": 8370 + }, + { + "epoch": 0.85, + "grad_norm": 1.814474961764568, + "learning_rate": 1.1346289804072664e-06, + "loss": 0.7095, + "step": 8371 + }, + { + "epoch": 0.85, + "grad_norm": 1.6189912166838378, + "learning_rate": 1.1331052570258183e-06, + "loss": 0.7125, + "step": 8372 + }, + { + "epoch": 0.85, + "grad_norm": 1.7259453310980772, + "learning_rate": 1.131582496007133e-06, + "loss": 0.7864, + "step": 8373 + }, + { + "epoch": 0.85, + "grad_norm": 1.5270498544236337, + "learning_rate": 1.1300606975164807e-06, + "loss": 0.6883, + "step": 8374 + }, + { + "epoch": 0.85, + "grad_norm": 1.9169050697498091, + "learning_rate": 1.128539861719028e-06, + "loss": 0.8026, + "step": 8375 + }, + { + "epoch": 0.85, + "grad_norm": 1.7362936142752212, + "learning_rate": 1.1270199887798417e-06, + "loss": 0.6875, + "step": 8376 + }, + { + "epoch": 0.85, + "grad_norm": 1.6440481486462275, + "learning_rate": 1.1255010788638753e-06, + "loss": 0.6966, + "step": 8377 + }, + { + "epoch": 0.85, + "grad_norm": 1.646675454268616, + "learning_rate": 1.1239831321359862e-06, + "loss": 0.6603, + "step": 8378 + }, + { + "epoch": 0.85, + "grad_norm": 1.725336742038386, + "learning_rate": 1.1224661487609234e-06, + "loss": 0.7253, + "step": 8379 + }, + { + "epoch": 0.85, + "grad_norm": 1.995017255507201, + "learning_rate": 1.1209501289033298e-06, + "loss": 0.732, + "step": 8380 + }, + { + "epoch": 0.85, + "grad_norm": 1.7151332610577539, + "learning_rate": 1.1194350727277493e-06, + "loss": 0.6279, + "step": 8381 + }, + { + "epoch": 0.85, + "grad_norm": 1.488836247487096, + "learning_rate": 1.1179209803986124e-06, + "loss": 0.5818, + "step": 8382 + }, + { + "epoch": 0.85, + "grad_norm": 1.7011228341231284, + "learning_rate": 1.1164078520802535e-06, + "loss": 0.7556, + "step": 8383 + }, + { + "epoch": 0.85, + "grad_norm": 1.5593983291258506, + "learning_rate": 1.1148956879369e-06, + "loss": 0.6098, + "step": 8384 + }, + { + "epoch": 0.85, + "grad_norm": 1.5765603798333052, + "learning_rate": 1.1133844881326706e-06, + "loss": 0.6282, + "step": 8385 + }, + { + "epoch": 0.85, + "grad_norm": 1.480114990831892, + "learning_rate": 1.111874252831585e-06, + "loss": 0.4523, + "step": 8386 + }, + { + "epoch": 0.85, + "grad_norm": 1.6096880954934176, + "learning_rate": 1.1103649821975527e-06, + "loss": 0.6659, + "step": 8387 + }, + { + "epoch": 0.85, + "grad_norm": 1.7778688660947681, + "learning_rate": 1.1088566763943843e-06, + "loss": 0.7247, + "step": 8388 + }, + { + "epoch": 0.85, + "grad_norm": 1.803483749854071, + "learning_rate": 1.1073493355857823e-06, + "loss": 0.6551, + "step": 8389 + }, + { + "epoch": 0.85, + "grad_norm": 1.6709014820841264, + "learning_rate": 1.1058429599353426e-06, + "loss": 0.6616, + "step": 8390 + }, + { + "epoch": 0.85, + "grad_norm": 1.9903915640329826, + "learning_rate": 1.104337549606561e-06, + "loss": 0.8037, + "step": 8391 + }, + { + "epoch": 0.85, + "grad_norm": 1.5789512190681958, + "learning_rate": 1.1028331047628282e-06, + "loss": 0.7054, + "step": 8392 + }, + { + "epoch": 0.85, + "grad_norm": 1.7115052153277104, + "learning_rate": 1.1013296255674233e-06, + "loss": 0.7137, + "step": 8393 + }, + { + "epoch": 0.85, + "grad_norm": 1.5073715688022733, + "learning_rate": 1.0998271121835302e-06, + "loss": 0.595, + "step": 8394 + }, + { + "epoch": 0.85, + "grad_norm": 1.887979213898425, + "learning_rate": 1.0983255647742185e-06, + "loss": 0.707, + "step": 8395 + }, + { + "epoch": 0.85, + "grad_norm": 1.701860423331792, + "learning_rate": 1.0968249835024603e-06, + "loss": 0.7961, + "step": 8396 + }, + { + "epoch": 0.85, + "grad_norm": 1.619199574800476, + "learning_rate": 1.0953253685311227e-06, + "loss": 0.5768, + "step": 8397 + }, + { + "epoch": 0.85, + "grad_norm": 1.987229741846533, + "learning_rate": 1.0938267200229603e-06, + "loss": 0.6983, + "step": 8398 + }, + { + "epoch": 0.85, + "grad_norm": 1.8108338151674153, + "learning_rate": 1.0923290381406316e-06, + "loss": 0.7066, + "step": 8399 + }, + { + "epoch": 0.85, + "grad_norm": 1.7244286465155514, + "learning_rate": 1.0908323230466845e-06, + "loss": 0.6901, + "step": 8400 + }, + { + "epoch": 0.85, + "grad_norm": 1.6688581986336752, + "learning_rate": 1.0893365749035646e-06, + "loss": 0.6577, + "step": 8401 + }, + { + "epoch": 0.85, + "grad_norm": 1.601690834802701, + "learning_rate": 1.0878417938736142e-06, + "loss": 0.6563, + "step": 8402 + }, + { + "epoch": 0.85, + "grad_norm": 1.653747397467601, + "learning_rate": 1.0863479801190645e-06, + "loss": 0.7102, + "step": 8403 + }, + { + "epoch": 0.85, + "grad_norm": 1.6179264464657146, + "learning_rate": 1.0848551338020497e-06, + "loss": 0.702, + "step": 8404 + }, + { + "epoch": 0.85, + "grad_norm": 1.6147101757367943, + "learning_rate": 1.0833632550845907e-06, + "loss": 0.6662, + "step": 8405 + }, + { + "epoch": 0.86, + "grad_norm": 1.8821238740303694, + "learning_rate": 1.0818723441286105e-06, + "loss": 0.7495, + "step": 8406 + }, + { + "epoch": 0.86, + "grad_norm": 1.6447027933493943, + "learning_rate": 1.0803824010959253e-06, + "loss": 0.7016, + "step": 8407 + }, + { + "epoch": 0.86, + "grad_norm": 1.7410971803330708, + "learning_rate": 1.078893426148241e-06, + "loss": 0.6797, + "step": 8408 + }, + { + "epoch": 0.86, + "grad_norm": 1.609515505448147, + "learning_rate": 1.0774054194471638e-06, + "loss": 0.6954, + "step": 8409 + }, + { + "epoch": 0.86, + "grad_norm": 1.8647808273584001, + "learning_rate": 1.0759183811541973e-06, + "loss": 0.7015, + "step": 8410 + }, + { + "epoch": 0.86, + "grad_norm": 1.656207416925414, + "learning_rate": 1.074432311430732e-06, + "loss": 0.7844, + "step": 8411 + }, + { + "epoch": 0.86, + "grad_norm": 1.5818580586379434, + "learning_rate": 1.0729472104380601e-06, + "loss": 0.6853, + "step": 8412 + }, + { + "epoch": 0.86, + "grad_norm": 1.6628763789830707, + "learning_rate": 1.0714630783373636e-06, + "loss": 0.7416, + "step": 8413 + }, + { + "epoch": 0.86, + "grad_norm": 1.8792780484068097, + "learning_rate": 1.0699799152897238e-06, + "loss": 0.8003, + "step": 8414 + }, + { + "epoch": 0.86, + "grad_norm": 1.6385320775186256, + "learning_rate": 1.0684977214561155e-06, + "loss": 0.6372, + "step": 8415 + }, + { + "epoch": 0.86, + "grad_norm": 1.3279034371879015, + "learning_rate": 1.067016496997404e-06, + "loss": 0.5391, + "step": 8416 + }, + { + "epoch": 0.86, + "grad_norm": 1.7867343700510006, + "learning_rate": 1.0655362420743587e-06, + "loss": 0.7462, + "step": 8417 + }, + { + "epoch": 0.86, + "grad_norm": 1.5461375040632466, + "learning_rate": 1.0640569568476323e-06, + "loss": 0.5684, + "step": 8418 + }, + { + "epoch": 0.86, + "grad_norm": 1.6583070609877475, + "learning_rate": 1.0625786414777805e-06, + "loss": 0.6747, + "step": 8419 + }, + { + "epoch": 0.86, + "grad_norm": 1.703374935104682, + "learning_rate": 1.0611012961252543e-06, + "loss": 0.755, + "step": 8420 + }, + { + "epoch": 0.86, + "grad_norm": 1.66739442848443, + "learning_rate": 1.0596249209503906e-06, + "loss": 0.8473, + "step": 8421 + }, + { + "epoch": 0.86, + "grad_norm": 1.8210961467259086, + "learning_rate": 1.0581495161134325e-06, + "loss": 0.6847, + "step": 8422 + }, + { + "epoch": 0.86, + "grad_norm": 1.5643904053537698, + "learning_rate": 1.0566750817745076e-06, + "loss": 0.6154, + "step": 8423 + }, + { + "epoch": 0.86, + "grad_norm": 1.6738856312856583, + "learning_rate": 1.0552016180936442e-06, + "loss": 0.648, + "step": 8424 + }, + { + "epoch": 0.86, + "grad_norm": 1.7984497698344761, + "learning_rate": 1.0537291252307668e-06, + "loss": 0.7255, + "step": 8425 + }, + { + "epoch": 0.86, + "grad_norm": 1.611425019120535, + "learning_rate": 1.0522576033456865e-06, + "loss": 0.5955, + "step": 8426 + }, + { + "epoch": 0.86, + "grad_norm": 1.5985930161310276, + "learning_rate": 1.0507870525981166e-06, + "loss": 0.647, + "step": 8427 + }, + { + "epoch": 0.86, + "grad_norm": 1.602373516978655, + "learning_rate": 1.0493174731476641e-06, + "loss": 0.6878, + "step": 8428 + }, + { + "epoch": 0.86, + "grad_norm": 1.684892750947452, + "learning_rate": 1.0478488651538244e-06, + "loss": 0.6671, + "step": 8429 + }, + { + "epoch": 0.86, + "grad_norm": 1.751413082663372, + "learning_rate": 1.0463812287759967e-06, + "loss": 0.8063, + "step": 8430 + }, + { + "epoch": 0.86, + "grad_norm": 1.6046268645591264, + "learning_rate": 1.0449145641734648e-06, + "loss": 0.5918, + "step": 8431 + }, + { + "epoch": 0.86, + "grad_norm": 1.7159864744314377, + "learning_rate": 1.0434488715054158e-06, + "loss": 0.6877, + "step": 8432 + }, + { + "epoch": 0.86, + "grad_norm": 1.631233184520412, + "learning_rate": 1.0419841509309293e-06, + "loss": 0.7765, + "step": 8433 + }, + { + "epoch": 0.86, + "grad_norm": 1.532792792300549, + "learning_rate": 1.0405204026089732e-06, + "loss": 0.5547, + "step": 8434 + }, + { + "epoch": 0.86, + "grad_norm": 1.6352150853995833, + "learning_rate": 1.0390576266984186e-06, + "loss": 0.785, + "step": 8435 + }, + { + "epoch": 0.86, + "grad_norm": 1.621107971810868, + "learning_rate": 1.0375958233580241e-06, + "loss": 0.6549, + "step": 8436 + }, + { + "epoch": 0.86, + "grad_norm": 1.7423638741067684, + "learning_rate": 1.0361349927464459e-06, + "loss": 0.671, + "step": 8437 + }, + { + "epoch": 0.86, + "grad_norm": 1.6553449685180674, + "learning_rate": 1.0346751350222373e-06, + "loss": 0.7222, + "step": 8438 + }, + { + "epoch": 0.86, + "grad_norm": 1.69168484776965, + "learning_rate": 1.0332162503438382e-06, + "loss": 0.7429, + "step": 8439 + }, + { + "epoch": 0.86, + "grad_norm": 1.7137645740993, + "learning_rate": 1.031758338869593e-06, + "loss": 0.7141, + "step": 8440 + }, + { + "epoch": 0.86, + "grad_norm": 1.7547540988136974, + "learning_rate": 1.0303014007577306e-06, + "loss": 0.6966, + "step": 8441 + }, + { + "epoch": 0.86, + "grad_norm": 1.6525465199540916, + "learning_rate": 1.0288454361663802e-06, + "loss": 0.6479, + "step": 8442 + }, + { + "epoch": 0.86, + "grad_norm": 1.5072249642946518, + "learning_rate": 1.0273904452535666e-06, + "loss": 0.6614, + "step": 8443 + }, + { + "epoch": 0.86, + "grad_norm": 1.5417080513159942, + "learning_rate": 1.0259364281772023e-06, + "loss": 0.6281, + "step": 8444 + }, + { + "epoch": 0.86, + "grad_norm": 1.586239143977994, + "learning_rate": 1.024483385095101e-06, + "loss": 0.739, + "step": 8445 + }, + { + "epoch": 0.86, + "grad_norm": 1.6299264084653204, + "learning_rate": 1.0230313161649674e-06, + "loss": 0.71, + "step": 8446 + }, + { + "epoch": 0.86, + "grad_norm": 1.652423537669246, + "learning_rate": 1.0215802215443993e-06, + "loss": 0.6331, + "step": 8447 + }, + { + "epoch": 0.86, + "grad_norm": 1.486458058211073, + "learning_rate": 1.0201301013908926e-06, + "loss": 0.6625, + "step": 8448 + }, + { + "epoch": 0.86, + "grad_norm": 1.7202949780514047, + "learning_rate": 1.0186809558618327e-06, + "loss": 0.7799, + "step": 8449 + }, + { + "epoch": 0.86, + "grad_norm": 1.851452722138668, + "learning_rate": 1.0172327851145026e-06, + "loss": 0.713, + "step": 8450 + }, + { + "epoch": 0.86, + "grad_norm": 1.7589655012461458, + "learning_rate": 1.0157855893060809e-06, + "loss": 0.706, + "step": 8451 + }, + { + "epoch": 0.86, + "grad_norm": 1.7341879348494615, + "learning_rate": 1.0143393685936342e-06, + "loss": 0.7593, + "step": 8452 + }, + { + "epoch": 0.86, + "grad_norm": 1.9346470924362966, + "learning_rate": 1.0128941231341305e-06, + "loss": 0.7522, + "step": 8453 + }, + { + "epoch": 0.86, + "grad_norm": 1.6773163911161024, + "learning_rate": 1.0114498530844253e-06, + "loss": 0.732, + "step": 8454 + }, + { + "epoch": 0.86, + "grad_norm": 1.6882931865366573, + "learning_rate": 1.010006558601274e-06, + "loss": 0.6606, + "step": 8455 + }, + { + "epoch": 0.86, + "grad_norm": 1.6757261676598967, + "learning_rate": 1.0085642398413243e-06, + "loss": 0.5838, + "step": 8456 + }, + { + "epoch": 0.86, + "grad_norm": 1.8452775158196029, + "learning_rate": 1.0071228969611146e-06, + "loss": 0.7233, + "step": 8457 + }, + { + "epoch": 0.86, + "grad_norm": 1.5740820266658542, + "learning_rate": 1.0056825301170836e-06, + "loss": 0.5603, + "step": 8458 + }, + { + "epoch": 0.86, + "grad_norm": 1.6426284893961631, + "learning_rate": 1.0042431394655562e-06, + "loss": 0.671, + "step": 8459 + }, + { + "epoch": 0.86, + "grad_norm": 1.6699412335146264, + "learning_rate": 1.0028047251627583e-06, + "loss": 0.6582, + "step": 8460 + }, + { + "epoch": 0.86, + "grad_norm": 1.6824431706589318, + "learning_rate": 1.0013672873648083e-06, + "loss": 0.7196, + "step": 8461 + }, + { + "epoch": 0.86, + "grad_norm": 1.963283484565238, + "learning_rate": 9.999308262277152e-07, + "loss": 0.7811, + "step": 8462 + }, + { + "epoch": 0.86, + "grad_norm": 1.5592969312288327, + "learning_rate": 9.98495341907385e-07, + "loss": 0.5841, + "step": 8463 + }, + { + "epoch": 0.86, + "grad_norm": 1.578809012565993, + "learning_rate": 9.9706083455962e-07, + "loss": 0.6566, + "step": 8464 + }, + { + "epoch": 0.86, + "grad_norm": 1.7662396613126716, + "learning_rate": 9.9562730434011e-07, + "loss": 0.7146, + "step": 8465 + }, + { + "epoch": 0.86, + "grad_norm": 1.564066354381204, + "learning_rate": 9.941947514044437e-07, + "loss": 0.6503, + "step": 8466 + }, + { + "epoch": 0.86, + "grad_norm": 1.74140264632033, + "learning_rate": 9.927631759081013e-07, + "loss": 0.7223, + "step": 8467 + }, + { + "epoch": 0.86, + "grad_norm": 1.6125573691640296, + "learning_rate": 9.913325780064586e-07, + "loss": 0.7539, + "step": 8468 + }, + { + "epoch": 0.86, + "grad_norm": 1.7445756450530683, + "learning_rate": 9.89902957854787e-07, + "loss": 0.5746, + "step": 8469 + }, + { + "epoch": 0.86, + "grad_norm": 1.6232908198351832, + "learning_rate": 9.884743156082445e-07, + "loss": 0.6638, + "step": 8470 + }, + { + "epoch": 0.86, + "grad_norm": 1.6749833434917487, + "learning_rate": 9.870466514218912e-07, + "loss": 0.6728, + "step": 8471 + }, + { + "epoch": 0.86, + "grad_norm": 1.7949419456953852, + "learning_rate": 9.856199654506781e-07, + "loss": 0.6282, + "step": 8472 + }, + { + "epoch": 0.86, + "grad_norm": 1.5343271623648493, + "learning_rate": 9.84194257849448e-07, + "loss": 0.6372, + "step": 8473 + }, + { + "epoch": 0.86, + "grad_norm": 1.715316359108728, + "learning_rate": 9.827695287729389e-07, + "loss": 0.5719, + "step": 8474 + }, + { + "epoch": 0.86, + "grad_norm": 1.6786352216456508, + "learning_rate": 9.813457783757852e-07, + "loss": 0.6733, + "step": 8475 + }, + { + "epoch": 0.86, + "grad_norm": 1.6605747453901634, + "learning_rate": 9.7992300681251e-07, + "loss": 0.7578, + "step": 8476 + }, + { + "epoch": 0.86, + "grad_norm": 1.675631402409404, + "learning_rate": 9.78501214237535e-07, + "loss": 0.694, + "step": 8477 + }, + { + "epoch": 0.86, + "grad_norm": 1.5661955438528077, + "learning_rate": 9.770804008051738e-07, + "loss": 0.6373, + "step": 8478 + }, + { + "epoch": 0.86, + "grad_norm": 1.583340621840412, + "learning_rate": 9.756605666696305e-07, + "loss": 0.7857, + "step": 8479 + }, + { + "epoch": 0.86, + "grad_norm": 1.639875934388625, + "learning_rate": 9.742417119850078e-07, + "loss": 0.6977, + "step": 8480 + }, + { + "epoch": 0.86, + "grad_norm": 1.572932766498734, + "learning_rate": 9.72823836905301e-07, + "loss": 0.643, + "step": 8481 + }, + { + "epoch": 0.86, + "grad_norm": 1.7348617566859073, + "learning_rate": 9.714069415843975e-07, + "loss": 0.6105, + "step": 8482 + }, + { + "epoch": 0.86, + "grad_norm": 1.5129019769957652, + "learning_rate": 9.699910261760781e-07, + "loss": 0.6587, + "step": 8483 + }, + { + "epoch": 0.86, + "grad_norm": 1.7286293367869179, + "learning_rate": 9.685760908340215e-07, + "loss": 0.6924, + "step": 8484 + }, + { + "epoch": 0.86, + "grad_norm": 1.6286863062115362, + "learning_rate": 9.671621357117923e-07, + "loss": 0.6642, + "step": 8485 + }, + { + "epoch": 0.86, + "grad_norm": 1.7886869240578565, + "learning_rate": 9.657491609628577e-07, + "loss": 0.6126, + "step": 8486 + }, + { + "epoch": 0.86, + "grad_norm": 1.6579069465573368, + "learning_rate": 9.6433716674057e-07, + "loss": 0.6803, + "step": 8487 + }, + { + "epoch": 0.86, + "grad_norm": 1.6616465793411628, + "learning_rate": 9.629261531981803e-07, + "loss": 0.5699, + "step": 8488 + }, + { + "epoch": 0.86, + "grad_norm": 1.6520740195369406, + "learning_rate": 9.615161204888346e-07, + "loss": 0.6986, + "step": 8489 + }, + { + "epoch": 0.86, + "grad_norm": 1.7609516169236654, + "learning_rate": 9.601070687655667e-07, + "loss": 0.6555, + "step": 8490 + }, + { + "epoch": 0.86, + "grad_norm": 1.7617317793956033, + "learning_rate": 9.586989981813078e-07, + "loss": 0.6843, + "step": 8491 + }, + { + "epoch": 0.86, + "grad_norm": 1.6235398129125809, + "learning_rate": 9.572919088888844e-07, + "loss": 0.7243, + "step": 8492 + }, + { + "epoch": 0.86, + "grad_norm": 1.618641132828586, + "learning_rate": 9.55885801041011e-07, + "loss": 0.675, + "step": 8493 + }, + { + "epoch": 0.86, + "grad_norm": 1.7389103985815473, + "learning_rate": 9.544806747903013e-07, + "loss": 0.7018, + "step": 8494 + }, + { + "epoch": 0.86, + "grad_norm": 1.6473771952026806, + "learning_rate": 9.530765302892553e-07, + "loss": 0.586, + "step": 8495 + }, + { + "epoch": 0.86, + "grad_norm": 1.5652406055054269, + "learning_rate": 9.516733676902756e-07, + "loss": 0.6362, + "step": 8496 + }, + { + "epoch": 0.86, + "grad_norm": 1.5478809662387278, + "learning_rate": 9.502711871456527e-07, + "loss": 0.5799, + "step": 8497 + }, + { + "epoch": 0.86, + "grad_norm": 1.5968218731230266, + "learning_rate": 9.488699888075681e-07, + "loss": 0.695, + "step": 8498 + }, + { + "epoch": 0.86, + "grad_norm": 1.9413116133308845, + "learning_rate": 9.474697728281058e-07, + "loss": 0.6953, + "step": 8499 + }, + { + "epoch": 0.86, + "grad_norm": 1.7387764741853589, + "learning_rate": 9.460705393592307e-07, + "loss": 0.5861, + "step": 8500 + }, + { + "epoch": 0.86, + "grad_norm": 1.6879635537094295, + "learning_rate": 9.446722885528115e-07, + "loss": 0.7975, + "step": 8501 + }, + { + "epoch": 0.86, + "grad_norm": 1.7870935928368428, + "learning_rate": 9.432750205606079e-07, + "loss": 0.6651, + "step": 8502 + }, + { + "epoch": 0.86, + "grad_norm": 1.7000098571909354, + "learning_rate": 9.418787355342674e-07, + "loss": 0.6625, + "step": 8503 + }, + { + "epoch": 0.87, + "grad_norm": 1.6898540784275544, + "learning_rate": 9.404834336253366e-07, + "loss": 0.6732, + "step": 8504 + }, + { + "epoch": 0.87, + "grad_norm": 1.5420769396071596, + "learning_rate": 9.390891149852566e-07, + "loss": 0.5964, + "step": 8505 + }, + { + "epoch": 0.87, + "grad_norm": 1.8262952647153974, + "learning_rate": 9.376957797653541e-07, + "loss": 0.6322, + "step": 8506 + }, + { + "epoch": 0.87, + "grad_norm": 1.6713233513580645, + "learning_rate": 9.363034281168581e-07, + "loss": 0.7156, + "step": 8507 + }, + { + "epoch": 0.87, + "grad_norm": 1.558887253750619, + "learning_rate": 9.349120601908834e-07, + "loss": 0.6006, + "step": 8508 + }, + { + "epoch": 0.87, + "grad_norm": 1.5578407002530088, + "learning_rate": 9.335216761384414e-07, + "loss": 0.6743, + "step": 8509 + }, + { + "epoch": 0.87, + "grad_norm": 1.4893998244873128, + "learning_rate": 9.321322761104401e-07, + "loss": 0.5171, + "step": 8510 + }, + { + "epoch": 0.87, + "grad_norm": 1.8732182263707253, + "learning_rate": 9.307438602576724e-07, + "loss": 0.7097, + "step": 8511 + }, + { + "epoch": 0.87, + "grad_norm": 1.6579487951787695, + "learning_rate": 9.293564287308332e-07, + "loss": 0.6302, + "step": 8512 + }, + { + "epoch": 0.87, + "grad_norm": 1.5898531234427316, + "learning_rate": 9.279699816805032e-07, + "loss": 0.7759, + "step": 8513 + }, + { + "epoch": 0.87, + "grad_norm": 1.5373161200871173, + "learning_rate": 9.26584519257161e-07, + "loss": 0.5888, + "step": 8514 + }, + { + "epoch": 0.87, + "grad_norm": 1.5786124799035222, + "learning_rate": 9.252000416111784e-07, + "loss": 0.6454, + "step": 8515 + }, + { + "epoch": 0.87, + "grad_norm": 1.5839456176787687, + "learning_rate": 9.238165488928152e-07, + "loss": 0.6728, + "step": 8516 + }, + { + "epoch": 0.87, + "grad_norm": 1.7281204784134618, + "learning_rate": 9.224340412522325e-07, + "loss": 0.6542, + "step": 8517 + }, + { + "epoch": 0.87, + "grad_norm": 1.507704352537572, + "learning_rate": 9.210525188394747e-07, + "loss": 0.6282, + "step": 8518 + }, + { + "epoch": 0.87, + "grad_norm": 1.6918760455004862, + "learning_rate": 9.196719818044886e-07, + "loss": 0.6208, + "step": 8519 + }, + { + "epoch": 0.87, + "grad_norm": 1.7589913626033948, + "learning_rate": 9.182924302971086e-07, + "loss": 0.6981, + "step": 8520 + }, + { + "epoch": 0.87, + "grad_norm": 1.6422584710703088, + "learning_rate": 9.169138644670627e-07, + "loss": 0.6739, + "step": 8521 + }, + { + "epoch": 0.87, + "grad_norm": 1.767490173430962, + "learning_rate": 9.155362844639725e-07, + "loss": 0.6822, + "step": 8522 + }, + { + "epoch": 0.87, + "grad_norm": 1.8181788312046294, + "learning_rate": 9.14159690437355e-07, + "loss": 0.7665, + "step": 8523 + }, + { + "epoch": 0.87, + "grad_norm": 1.7480484857115235, + "learning_rate": 9.127840825366152e-07, + "loss": 0.7395, + "step": 8524 + }, + { + "epoch": 0.87, + "grad_norm": 1.8396355344859228, + "learning_rate": 9.114094609110569e-07, + "loss": 0.582, + "step": 8525 + }, + { + "epoch": 0.87, + "grad_norm": 1.6794017948340934, + "learning_rate": 9.100358257098707e-07, + "loss": 0.8449, + "step": 8526 + }, + { + "epoch": 0.87, + "grad_norm": 1.7382601173910843, + "learning_rate": 9.086631770821431e-07, + "loss": 0.7845, + "step": 8527 + }, + { + "epoch": 0.87, + "grad_norm": 1.6555141456200846, + "learning_rate": 9.07291515176858e-07, + "loss": 0.7532, + "step": 8528 + }, + { + "epoch": 0.87, + "grad_norm": 1.4531404575168607, + "learning_rate": 9.05920840142882e-07, + "loss": 0.6653, + "step": 8529 + }, + { + "epoch": 0.87, + "grad_norm": 1.9042978793688385, + "learning_rate": 9.045511521289862e-07, + "loss": 0.6411, + "step": 8530 + }, + { + "epoch": 0.87, + "grad_norm": 1.6141730691986127, + "learning_rate": 9.031824512838239e-07, + "loss": 0.6548, + "step": 8531 + }, + { + "epoch": 0.87, + "grad_norm": 1.7912378321838687, + "learning_rate": 9.018147377559483e-07, + "loss": 0.6434, + "step": 8532 + }, + { + "epoch": 0.87, + "grad_norm": 1.6969882014294067, + "learning_rate": 9.004480116938041e-07, + "loss": 0.6282, + "step": 8533 + }, + { + "epoch": 0.87, + "grad_norm": 1.7507718131371228, + "learning_rate": 8.99082273245726e-07, + "loss": 0.65, + "step": 8534 + }, + { + "epoch": 0.87, + "grad_norm": 1.7382108953727158, + "learning_rate": 8.977175225599466e-07, + "loss": 0.7068, + "step": 8535 + }, + { + "epoch": 0.87, + "grad_norm": 1.8785085612766732, + "learning_rate": 8.963537597845839e-07, + "loss": 0.6955, + "step": 8536 + }, + { + "epoch": 0.87, + "grad_norm": 1.9876005931416723, + "learning_rate": 8.949909850676564e-07, + "loss": 0.796, + "step": 8537 + }, + { + "epoch": 0.87, + "grad_norm": 1.8335946031497297, + "learning_rate": 8.936291985570722e-07, + "loss": 0.5972, + "step": 8538 + }, + { + "epoch": 0.87, + "grad_norm": 1.6787740376123934, + "learning_rate": 8.922684004006299e-07, + "loss": 0.7053, + "step": 8539 + }, + { + "epoch": 0.87, + "grad_norm": 1.7260937405636285, + "learning_rate": 8.909085907460224e-07, + "loss": 0.7055, + "step": 8540 + }, + { + "epoch": 0.87, + "grad_norm": 1.8156357333218092, + "learning_rate": 8.895497697408395e-07, + "loss": 0.7014, + "step": 8541 + }, + { + "epoch": 0.87, + "grad_norm": 1.8502892872932764, + "learning_rate": 8.881919375325565e-07, + "loss": 0.7066, + "step": 8542 + }, + { + "epoch": 0.87, + "grad_norm": 1.7520094569059954, + "learning_rate": 8.868350942685467e-07, + "loss": 0.7422, + "step": 8543 + }, + { + "epoch": 0.87, + "grad_norm": 1.6762712922095546, + "learning_rate": 8.854792400960721e-07, + "loss": 0.6672, + "step": 8544 + }, + { + "epoch": 0.87, + "grad_norm": 1.7009066805680277, + "learning_rate": 8.841243751622908e-07, + "loss": 0.7102, + "step": 8545 + }, + { + "epoch": 0.87, + "grad_norm": 1.7598393690465033, + "learning_rate": 8.827704996142539e-07, + "loss": 0.6883, + "step": 8546 + }, + { + "epoch": 0.87, + "grad_norm": 1.6953232913217238, + "learning_rate": 8.814176135989006e-07, + "loss": 0.6376, + "step": 8547 + }, + { + "epoch": 0.87, + "grad_norm": 1.7558576220517113, + "learning_rate": 8.800657172630678e-07, + "loss": 0.7404, + "step": 8548 + }, + { + "epoch": 0.87, + "grad_norm": 1.8020573268309257, + "learning_rate": 8.787148107534793e-07, + "loss": 0.6439, + "step": 8549 + }, + { + "epoch": 0.87, + "grad_norm": 1.7623781627660229, + "learning_rate": 8.773648942167578e-07, + "loss": 0.7491, + "step": 8550 + }, + { + "epoch": 0.87, + "grad_norm": 1.5599459156928088, + "learning_rate": 8.760159677994174e-07, + "loss": 0.6676, + "step": 8551 + }, + { + "epoch": 0.87, + "grad_norm": 1.7111379677657992, + "learning_rate": 8.746680316478573e-07, + "loss": 0.7308, + "step": 8552 + }, + { + "epoch": 0.87, + "grad_norm": 1.7050018071892619, + "learning_rate": 8.733210859083807e-07, + "loss": 0.7254, + "step": 8553 + }, + { + "epoch": 0.87, + "grad_norm": 1.6601874396647305, + "learning_rate": 8.719751307271739e-07, + "loss": 0.6913, + "step": 8554 + }, + { + "epoch": 0.87, + "grad_norm": 1.7903128696098336, + "learning_rate": 8.706301662503191e-07, + "loss": 0.7505, + "step": 8555 + }, + { + "epoch": 0.87, + "grad_norm": 1.6630933552740055, + "learning_rate": 8.69286192623795e-07, + "loss": 0.6685, + "step": 8556 + }, + { + "epoch": 0.87, + "grad_norm": 1.8305512837202105, + "learning_rate": 8.679432099934637e-07, + "loss": 0.7627, + "step": 8557 + }, + { + "epoch": 0.87, + "grad_norm": 1.644433408519988, + "learning_rate": 8.666012185050876e-07, + "loss": 0.7553, + "step": 8558 + }, + { + "epoch": 0.87, + "grad_norm": 1.7837745416315152, + "learning_rate": 8.652602183043213e-07, + "loss": 0.7348, + "step": 8559 + }, + { + "epoch": 0.87, + "grad_norm": 1.6394931714912497, + "learning_rate": 8.63920209536705e-07, + "loss": 0.7273, + "step": 8560 + }, + { + "epoch": 0.87, + "grad_norm": 1.759179568705848, + "learning_rate": 8.62581192347679e-07, + "loss": 0.8114, + "step": 8561 + }, + { + "epoch": 0.87, + "grad_norm": 1.9408825514066173, + "learning_rate": 8.612431668825705e-07, + "loss": 0.7462, + "step": 8562 + }, + { + "epoch": 0.87, + "grad_norm": 1.6359865231405288, + "learning_rate": 8.59906133286601e-07, + "loss": 0.6535, + "step": 8563 + }, + { + "epoch": 0.87, + "grad_norm": 1.7203917526452943, + "learning_rate": 8.585700917048879e-07, + "loss": 0.6331, + "step": 8564 + }, + { + "epoch": 0.87, + "grad_norm": 1.715284772326796, + "learning_rate": 8.572350422824338e-07, + "loss": 0.7302, + "step": 8565 + }, + { + "epoch": 0.87, + "grad_norm": 1.9761566130879007, + "learning_rate": 8.559009851641397e-07, + "loss": 0.7234, + "step": 8566 + }, + { + "epoch": 0.87, + "grad_norm": 1.60364359740448, + "learning_rate": 8.545679204947954e-07, + "loss": 0.6343, + "step": 8567 + }, + { + "epoch": 0.87, + "grad_norm": 1.7097803626771462, + "learning_rate": 8.532358484190851e-07, + "loss": 0.6143, + "step": 8568 + }, + { + "epoch": 0.87, + "grad_norm": 1.6386662138382595, + "learning_rate": 8.519047690815841e-07, + "loss": 0.7445, + "step": 8569 + }, + { + "epoch": 0.87, + "grad_norm": 1.692997496401604, + "learning_rate": 8.505746826267602e-07, + "loss": 0.7129, + "step": 8570 + }, + { + "epoch": 0.87, + "grad_norm": 1.7807662897180458, + "learning_rate": 8.492455891989737e-07, + "loss": 0.7059, + "step": 8571 + }, + { + "epoch": 0.87, + "grad_norm": 1.6263058459356212, + "learning_rate": 8.479174889424758e-07, + "loss": 0.7066, + "step": 8572 + }, + { + "epoch": 0.87, + "grad_norm": 1.7883197031359765, + "learning_rate": 8.465903820014121e-07, + "loss": 0.7816, + "step": 8573 + }, + { + "epoch": 0.87, + "grad_norm": 1.6248322732286755, + "learning_rate": 8.452642685198209e-07, + "loss": 0.8102, + "step": 8574 + }, + { + "epoch": 0.87, + "grad_norm": 1.6366081161488895, + "learning_rate": 8.43939148641627e-07, + "loss": 0.7074, + "step": 8575 + }, + { + "epoch": 0.87, + "grad_norm": 1.6256476717105108, + "learning_rate": 8.426150225106544e-07, + "loss": 0.7085, + "step": 8576 + }, + { + "epoch": 0.87, + "grad_norm": 1.6823461597388514, + "learning_rate": 8.412918902706169e-07, + "loss": 0.8242, + "step": 8577 + }, + { + "epoch": 0.87, + "grad_norm": 1.599186246228301, + "learning_rate": 8.399697520651163e-07, + "loss": 0.6813, + "step": 8578 + }, + { + "epoch": 0.87, + "grad_norm": 2.0396075459147673, + "learning_rate": 8.386486080376532e-07, + "loss": 0.736, + "step": 8579 + }, + { + "epoch": 0.87, + "grad_norm": 1.8394322681952746, + "learning_rate": 8.373284583316176e-07, + "loss": 0.6502, + "step": 8580 + }, + { + "epoch": 0.87, + "grad_norm": 1.682872999505341, + "learning_rate": 8.360093030902871e-07, + "loss": 0.6431, + "step": 8581 + }, + { + "epoch": 0.87, + "grad_norm": 1.7358414224590166, + "learning_rate": 8.346911424568394e-07, + "loss": 0.5806, + "step": 8582 + }, + { + "epoch": 0.87, + "grad_norm": 1.7439800889886863, + "learning_rate": 8.333739765743399e-07, + "loss": 0.7109, + "step": 8583 + }, + { + "epoch": 0.87, + "grad_norm": 1.699991984763425, + "learning_rate": 8.320578055857432e-07, + "loss": 0.7801, + "step": 8584 + }, + { + "epoch": 0.87, + "grad_norm": 1.6099028565865006, + "learning_rate": 8.307426296339017e-07, + "loss": 0.6203, + "step": 8585 + }, + { + "epoch": 0.87, + "grad_norm": 1.6382126778019257, + "learning_rate": 8.294284488615578e-07, + "loss": 0.6541, + "step": 8586 + }, + { + "epoch": 0.87, + "grad_norm": 1.74805337997645, + "learning_rate": 8.281152634113432e-07, + "loss": 0.7318, + "step": 8587 + }, + { + "epoch": 0.87, + "grad_norm": 1.6270872487229284, + "learning_rate": 8.268030734257848e-07, + "loss": 0.6896, + "step": 8588 + }, + { + "epoch": 0.87, + "grad_norm": 1.5915860465129363, + "learning_rate": 8.254918790472999e-07, + "loss": 0.6669, + "step": 8589 + }, + { + "epoch": 0.87, + "grad_norm": 1.5851282974358303, + "learning_rate": 8.241816804181968e-07, + "loss": 0.5469, + "step": 8590 + }, + { + "epoch": 0.87, + "grad_norm": 1.6259615558647373, + "learning_rate": 8.228724776806818e-07, + "loss": 0.666, + "step": 8591 + }, + { + "epoch": 0.87, + "grad_norm": 1.501919932252582, + "learning_rate": 8.215642709768423e-07, + "loss": 0.7237, + "step": 8592 + }, + { + "epoch": 0.87, + "grad_norm": 1.5567954460938576, + "learning_rate": 8.20257060448667e-07, + "loss": 0.6374, + "step": 8593 + }, + { + "epoch": 0.87, + "grad_norm": 1.837719264984136, + "learning_rate": 8.189508462380335e-07, + "loss": 0.7212, + "step": 8594 + }, + { + "epoch": 0.87, + "grad_norm": 1.61006922062226, + "learning_rate": 8.176456284867096e-07, + "loss": 0.7692, + "step": 8595 + }, + { + "epoch": 0.87, + "grad_norm": 1.5625071262859511, + "learning_rate": 8.163414073363562e-07, + "loss": 0.7378, + "step": 8596 + }, + { + "epoch": 0.87, + "grad_norm": 1.731467292703643, + "learning_rate": 8.150381829285282e-07, + "loss": 0.6837, + "step": 8597 + }, + { + "epoch": 0.87, + "grad_norm": 1.7203878203226242, + "learning_rate": 8.137359554046676e-07, + "loss": 0.6837, + "step": 8598 + }, + { + "epoch": 0.87, + "grad_norm": 1.7322407343324207, + "learning_rate": 8.124347249061115e-07, + "loss": 0.5974, + "step": 8599 + }, + { + "epoch": 0.87, + "grad_norm": 1.708328769120278, + "learning_rate": 8.111344915740893e-07, + "loss": 0.7015, + "step": 8600 + }, + { + "epoch": 0.87, + "grad_norm": 1.492761598293974, + "learning_rate": 8.098352555497202e-07, + "loss": 0.6527, + "step": 8601 + }, + { + "epoch": 0.87, + "grad_norm": 1.4512688790266, + "learning_rate": 8.085370169740169e-07, + "loss": 0.602, + "step": 8602 + }, + { + "epoch": 0.88, + "grad_norm": 1.5763027777487575, + "learning_rate": 8.072397759878803e-07, + "loss": 0.6192, + "step": 8603 + }, + { + "epoch": 0.88, + "grad_norm": 1.7378737574428034, + "learning_rate": 8.059435327321074e-07, + "loss": 0.7466, + "step": 8604 + }, + { + "epoch": 0.88, + "grad_norm": 1.7063685731013865, + "learning_rate": 8.046482873473871e-07, + "loss": 0.6301, + "step": 8605 + }, + { + "epoch": 0.88, + "grad_norm": 1.7740020706333472, + "learning_rate": 8.033540399742945e-07, + "loss": 0.7094, + "step": 8606 + }, + { + "epoch": 0.88, + "grad_norm": 1.5270984069291418, + "learning_rate": 8.020607907533017e-07, + "loss": 0.6618, + "step": 8607 + }, + { + "epoch": 0.88, + "grad_norm": 1.597389958525877, + "learning_rate": 8.0076853982477e-07, + "loss": 0.699, + "step": 8608 + }, + { + "epoch": 0.88, + "grad_norm": 1.5897788497630603, + "learning_rate": 7.994772873289536e-07, + "loss": 0.76, + "step": 8609 + }, + { + "epoch": 0.88, + "grad_norm": 1.6962932147366152, + "learning_rate": 7.981870334059983e-07, + "loss": 0.6271, + "step": 8610 + }, + { + "epoch": 0.88, + "grad_norm": 1.667922649893392, + "learning_rate": 7.968977781959387e-07, + "loss": 0.6334, + "step": 8611 + }, + { + "epoch": 0.88, + "grad_norm": 1.5968713655185358, + "learning_rate": 7.95609521838705e-07, + "loss": 0.6992, + "step": 8612 + }, + { + "epoch": 0.88, + "grad_norm": 1.5791815893923733, + "learning_rate": 7.943222644741189e-07, + "loss": 0.6811, + "step": 8613 + }, + { + "epoch": 0.88, + "grad_norm": 1.7572013151258459, + "learning_rate": 7.930360062418896e-07, + "loss": 0.6329, + "step": 8614 + }, + { + "epoch": 0.88, + "grad_norm": 1.6938162903103777, + "learning_rate": 7.91750747281621e-07, + "loss": 0.7315, + "step": 8615 + }, + { + "epoch": 0.88, + "grad_norm": 1.6832955244066146, + "learning_rate": 7.904664877328072e-07, + "loss": 0.6837, + "step": 8616 + }, + { + "epoch": 0.88, + "grad_norm": 1.8860479223720243, + "learning_rate": 7.891832277348344e-07, + "loss": 0.7744, + "step": 8617 + }, + { + "epoch": 0.88, + "grad_norm": 1.7369431625286054, + "learning_rate": 7.879009674269845e-07, + "loss": 0.7475, + "step": 8618 + }, + { + "epoch": 0.88, + "grad_norm": 1.5136845094116347, + "learning_rate": 7.866197069484205e-07, + "loss": 0.6582, + "step": 8619 + }, + { + "epoch": 0.88, + "grad_norm": 1.7750629918797527, + "learning_rate": 7.85339446438208e-07, + "loss": 0.7294, + "step": 8620 + }, + { + "epoch": 0.88, + "grad_norm": 1.587406446054958, + "learning_rate": 7.840601860352947e-07, + "loss": 0.6335, + "step": 8621 + }, + { + "epoch": 0.88, + "grad_norm": 1.6404575019637098, + "learning_rate": 7.827819258785285e-07, + "loss": 0.7823, + "step": 8622 + }, + { + "epoch": 0.88, + "grad_norm": 1.5917314155579665, + "learning_rate": 7.815046661066439e-07, + "loss": 0.6375, + "step": 8623 + }, + { + "epoch": 0.88, + "grad_norm": 1.8137847228022808, + "learning_rate": 7.802284068582655e-07, + "loss": 0.707, + "step": 8624 + }, + { + "epoch": 0.88, + "grad_norm": 1.6504081878106207, + "learning_rate": 7.789531482719148e-07, + "loss": 0.6542, + "step": 8625 + }, + { + "epoch": 0.88, + "grad_norm": 1.623803955785065, + "learning_rate": 7.776788904859956e-07, + "loss": 0.6638, + "step": 8626 + }, + { + "epoch": 0.88, + "grad_norm": 1.6243223053097606, + "learning_rate": 7.764056336388127e-07, + "loss": 0.6989, + "step": 8627 + }, + { + "epoch": 0.88, + "grad_norm": 1.798107487378668, + "learning_rate": 7.751333778685588e-07, + "loss": 0.814, + "step": 8628 + }, + { + "epoch": 0.88, + "grad_norm": 1.6166916424175415, + "learning_rate": 7.738621233133148e-07, + "loss": 0.6075, + "step": 8629 + }, + { + "epoch": 0.88, + "grad_norm": 1.4663527224002506, + "learning_rate": 7.725918701110557e-07, + "loss": 0.5568, + "step": 8630 + }, + { + "epoch": 0.88, + "grad_norm": 1.78406648879962, + "learning_rate": 7.713226183996513e-07, + "loss": 0.7568, + "step": 8631 + }, + { + "epoch": 0.88, + "grad_norm": 1.718185309054279, + "learning_rate": 7.700543683168537e-07, + "loss": 0.6125, + "step": 8632 + }, + { + "epoch": 0.88, + "grad_norm": 1.6783471847093223, + "learning_rate": 7.687871200003172e-07, + "loss": 0.5707, + "step": 8633 + }, + { + "epoch": 0.88, + "grad_norm": 1.8218426363881475, + "learning_rate": 7.675208735875761e-07, + "loss": 0.6897, + "step": 8634 + }, + { + "epoch": 0.88, + "grad_norm": 1.7452221894583109, + "learning_rate": 7.662556292160639e-07, + "loss": 0.7302, + "step": 8635 + }, + { + "epoch": 0.88, + "grad_norm": 1.8217020412534808, + "learning_rate": 7.649913870231063e-07, + "loss": 0.8073, + "step": 8636 + }, + { + "epoch": 0.88, + "grad_norm": 1.9268201950458284, + "learning_rate": 7.637281471459124e-07, + "loss": 0.6536, + "step": 8637 + }, + { + "epoch": 0.88, + "grad_norm": 1.7537251047143585, + "learning_rate": 7.624659097215903e-07, + "loss": 0.7079, + "step": 8638 + }, + { + "epoch": 0.88, + "grad_norm": 1.7704002735144673, + "learning_rate": 7.612046748871327e-07, + "loss": 0.6428, + "step": 8639 + }, + { + "epoch": 0.88, + "grad_norm": 1.7637132366632193, + "learning_rate": 7.599444427794301e-07, + "loss": 0.6163, + "step": 8640 + }, + { + "epoch": 0.88, + "grad_norm": 1.6060984338985305, + "learning_rate": 7.586852135352607e-07, + "loss": 0.6644, + "step": 8641 + }, + { + "epoch": 0.88, + "grad_norm": 1.557104348665908, + "learning_rate": 7.574269872912921e-07, + "loss": 0.6215, + "step": 8642 + }, + { + "epoch": 0.88, + "grad_norm": 1.5334753071785745, + "learning_rate": 7.561697641840882e-07, + "loss": 0.7568, + "step": 8643 + }, + { + "epoch": 0.88, + "grad_norm": 1.7956769060024518, + "learning_rate": 7.549135443500966e-07, + "loss": 0.653, + "step": 8644 + }, + { + "epoch": 0.88, + "grad_norm": 1.6257566675549924, + "learning_rate": 7.536583279256638e-07, + "loss": 0.712, + "step": 8645 + }, + { + "epoch": 0.88, + "grad_norm": 1.7807779192366155, + "learning_rate": 7.524041150470241e-07, + "loss": 0.7125, + "step": 8646 + }, + { + "epoch": 0.88, + "grad_norm": 1.595006510159101, + "learning_rate": 7.511509058502997e-07, + "loss": 0.6043, + "step": 8647 + }, + { + "epoch": 0.88, + "grad_norm": 1.7246952203941068, + "learning_rate": 7.498987004715108e-07, + "loss": 0.67, + "step": 8648 + }, + { + "epoch": 0.88, + "grad_norm": 1.7245351538870213, + "learning_rate": 7.486474990465598e-07, + "loss": 0.6304, + "step": 8649 + }, + { + "epoch": 0.88, + "grad_norm": 1.6393893794984247, + "learning_rate": 7.47397301711249e-07, + "loss": 0.5836, + "step": 8650 + }, + { + "epoch": 0.88, + "grad_norm": 1.6812533149537447, + "learning_rate": 7.461481086012679e-07, + "loss": 0.796, + "step": 8651 + }, + { + "epoch": 0.88, + "grad_norm": 1.6652425440486422, + "learning_rate": 7.448999198521934e-07, + "loss": 0.5835, + "step": 8652 + }, + { + "epoch": 0.88, + "grad_norm": 1.7745124037983675, + "learning_rate": 7.436527355994994e-07, + "loss": 0.7556, + "step": 8653 + }, + { + "epoch": 0.88, + "grad_norm": 1.6955759539400153, + "learning_rate": 7.424065559785498e-07, + "loss": 0.758, + "step": 8654 + }, + { + "epoch": 0.88, + "grad_norm": 1.7262393580932531, + "learning_rate": 7.411613811245944e-07, + "loss": 0.696, + "step": 8655 + }, + { + "epoch": 0.88, + "grad_norm": 1.7838213988096059, + "learning_rate": 7.399172111727804e-07, + "loss": 0.6724, + "step": 8656 + }, + { + "epoch": 0.88, + "grad_norm": 1.7588142980376191, + "learning_rate": 7.386740462581399e-07, + "loss": 0.7651, + "step": 8657 + }, + { + "epoch": 0.88, + "grad_norm": 1.5901706489580727, + "learning_rate": 7.374318865156005e-07, + "loss": 0.6707, + "step": 8658 + }, + { + "epoch": 0.88, + "grad_norm": 1.7232413404704006, + "learning_rate": 7.361907320799811e-07, + "loss": 0.6283, + "step": 8659 + }, + { + "epoch": 0.88, + "grad_norm": 1.6051860022865811, + "learning_rate": 7.349505830859871e-07, + "loss": 0.5946, + "step": 8660 + }, + { + "epoch": 0.88, + "grad_norm": 1.5686212020624424, + "learning_rate": 7.337114396682187e-07, + "loss": 0.666, + "step": 8661 + }, + { + "epoch": 0.88, + "grad_norm": 1.7657274710165263, + "learning_rate": 7.324733019611641e-07, + "loss": 0.6962, + "step": 8662 + }, + { + "epoch": 0.88, + "grad_norm": 1.8283228142017602, + "learning_rate": 7.312361700992043e-07, + "loss": 0.7268, + "step": 8663 + }, + { + "epoch": 0.88, + "grad_norm": 1.545472626591203, + "learning_rate": 7.300000442166133e-07, + "loss": 0.7287, + "step": 8664 + }, + { + "epoch": 0.88, + "grad_norm": 1.5451732294601264, + "learning_rate": 7.287649244475481e-07, + "loss": 0.6112, + "step": 8665 + }, + { + "epoch": 0.88, + "grad_norm": 1.6576688661857788, + "learning_rate": 7.27530810926067e-07, + "loss": 0.6033, + "step": 8666 + }, + { + "epoch": 0.88, + "grad_norm": 1.8983431471455487, + "learning_rate": 7.262977037861096e-07, + "loss": 0.6632, + "step": 8667 + }, + { + "epoch": 0.88, + "grad_norm": 1.539195718199749, + "learning_rate": 7.250656031615111e-07, + "loss": 0.6502, + "step": 8668 + }, + { + "epoch": 0.88, + "grad_norm": 1.6100641613515339, + "learning_rate": 7.23834509186e-07, + "loss": 0.6516, + "step": 8669 + }, + { + "epoch": 0.88, + "grad_norm": 1.712817735434763, + "learning_rate": 7.226044219931883e-07, + "loss": 0.6382, + "step": 8670 + }, + { + "epoch": 0.88, + "grad_norm": 1.75885847360641, + "learning_rate": 7.213753417165836e-07, + "loss": 0.7896, + "step": 8671 + }, + { + "epoch": 0.88, + "grad_norm": 1.7738662827225833, + "learning_rate": 7.201472684895872e-07, + "loss": 0.7514, + "step": 8672 + }, + { + "epoch": 0.88, + "grad_norm": 1.5742849695052534, + "learning_rate": 7.189202024454811e-07, + "loss": 0.6635, + "step": 8673 + }, + { + "epoch": 0.88, + "grad_norm": 1.6258377366677337, + "learning_rate": 7.176941437174489e-07, + "loss": 0.6817, + "step": 8674 + }, + { + "epoch": 0.88, + "grad_norm": 1.6906760253031172, + "learning_rate": 7.164690924385576e-07, + "loss": 0.7801, + "step": 8675 + }, + { + "epoch": 0.88, + "grad_norm": 1.5194349350543774, + "learning_rate": 7.152450487417673e-07, + "loss": 0.6269, + "step": 8676 + }, + { + "epoch": 0.88, + "grad_norm": 1.6111332506431482, + "learning_rate": 7.140220127599318e-07, + "loss": 0.5915, + "step": 8677 + }, + { + "epoch": 0.88, + "grad_norm": 1.6512927420232688, + "learning_rate": 7.127999846257893e-07, + "loss": 0.6012, + "step": 8678 + }, + { + "epoch": 0.88, + "grad_norm": 1.5815162841761377, + "learning_rate": 7.115789644719728e-07, + "loss": 0.6751, + "step": 8679 + }, + { + "epoch": 0.88, + "grad_norm": 1.5351316643915993, + "learning_rate": 7.103589524310051e-07, + "loss": 0.608, + "step": 8680 + }, + { + "epoch": 0.88, + "grad_norm": 1.7310893799644735, + "learning_rate": 7.09139948635299e-07, + "loss": 0.7092, + "step": 8681 + }, + { + "epoch": 0.88, + "grad_norm": 1.744228287786194, + "learning_rate": 7.079219532171599e-07, + "loss": 0.722, + "step": 8682 + }, + { + "epoch": 0.88, + "grad_norm": 1.6453074599655961, + "learning_rate": 7.067049663087788e-07, + "loss": 0.7212, + "step": 8683 + }, + { + "epoch": 0.88, + "grad_norm": 1.721431416393383, + "learning_rate": 7.054889880422433e-07, + "loss": 0.6608, + "step": 8684 + }, + { + "epoch": 0.88, + "grad_norm": 1.603757868663997, + "learning_rate": 7.0427401854953e-07, + "loss": 0.717, + "step": 8685 + }, + { + "epoch": 0.88, + "grad_norm": 1.9247365371113263, + "learning_rate": 7.030600579625014e-07, + "loss": 0.6379, + "step": 8686 + }, + { + "epoch": 0.88, + "grad_norm": 1.8475447505522269, + "learning_rate": 7.018471064129162e-07, + "loss": 0.7608, + "step": 8687 + }, + { + "epoch": 0.88, + "grad_norm": 1.6599123438996048, + "learning_rate": 7.006351640324215e-07, + "loss": 0.7324, + "step": 8688 + }, + { + "epoch": 0.88, + "grad_norm": 1.5674875844829181, + "learning_rate": 6.994242309525523e-07, + "loss": 0.7213, + "step": 8689 + }, + { + "epoch": 0.88, + "grad_norm": 1.9027885332859753, + "learning_rate": 6.982143073047387e-07, + "loss": 0.7656, + "step": 8690 + }, + { + "epoch": 0.88, + "grad_norm": 1.8030207958156752, + "learning_rate": 6.970053932202991e-07, + "loss": 0.686, + "step": 8691 + }, + { + "epoch": 0.88, + "grad_norm": 1.7440038038627508, + "learning_rate": 6.957974888304408e-07, + "loss": 0.7075, + "step": 8692 + }, + { + "epoch": 0.88, + "grad_norm": 1.9613385767698652, + "learning_rate": 6.945905942662646e-07, + "loss": 0.7465, + "step": 8693 + }, + { + "epoch": 0.88, + "grad_norm": 1.828627449456084, + "learning_rate": 6.933847096587575e-07, + "loss": 0.7298, + "step": 8694 + }, + { + "epoch": 0.88, + "grad_norm": 1.552784833683063, + "learning_rate": 6.92179835138802e-07, + "loss": 0.6045, + "step": 8695 + }, + { + "epoch": 0.88, + "grad_norm": 1.7410167562173768, + "learning_rate": 6.909759708371678e-07, + "loss": 0.6272, + "step": 8696 + }, + { + "epoch": 0.88, + "grad_norm": 1.5793295105967549, + "learning_rate": 6.897731168845145e-07, + "loss": 0.6606, + "step": 8697 + }, + { + "epoch": 0.88, + "grad_norm": 1.6323565095980503, + "learning_rate": 6.885712734113925e-07, + "loss": 0.6883, + "step": 8698 + }, + { + "epoch": 0.88, + "grad_norm": 1.460791374584712, + "learning_rate": 6.873704405482473e-07, + "loss": 0.6915, + "step": 8699 + }, + { + "epoch": 0.88, + "grad_norm": 1.5657541778249906, + "learning_rate": 6.861706184254046e-07, + "loss": 0.6578, + "step": 8700 + }, + { + "epoch": 0.89, + "grad_norm": 1.754491594995093, + "learning_rate": 6.849718071730905e-07, + "loss": 0.6067, + "step": 8701 + }, + { + "epoch": 0.89, + "grad_norm": 1.697738919853421, + "learning_rate": 6.837740069214161e-07, + "loss": 0.6951, + "step": 8702 + }, + { + "epoch": 0.89, + "grad_norm": 1.6302173531920074, + "learning_rate": 6.825772178003831e-07, + "loss": 0.7389, + "step": 8703 + }, + { + "epoch": 0.89, + "grad_norm": 1.9243450934911965, + "learning_rate": 6.813814399398855e-07, + "loss": 0.745, + "step": 8704 + }, + { + "epoch": 0.89, + "grad_norm": 1.7761708127088411, + "learning_rate": 6.80186673469706e-07, + "loss": 0.7524, + "step": 8705 + }, + { + "epoch": 0.89, + "grad_norm": 1.614586091489057, + "learning_rate": 6.789929185195166e-07, + "loss": 0.7115, + "step": 8706 + }, + { + "epoch": 0.89, + "grad_norm": 1.676059358685063, + "learning_rate": 6.778001752188823e-07, + "loss": 0.6826, + "step": 8707 + }, + { + "epoch": 0.89, + "grad_norm": 1.8001808657182368, + "learning_rate": 6.766084436972564e-07, + "loss": 0.6911, + "step": 8708 + }, + { + "epoch": 0.89, + "grad_norm": 1.7749660708941688, + "learning_rate": 6.75417724083981e-07, + "loss": 0.6824, + "step": 8709 + }, + { + "epoch": 0.89, + "grad_norm": 1.716846862284347, + "learning_rate": 6.742280165082937e-07, + "loss": 0.6694, + "step": 8710 + }, + { + "epoch": 0.89, + "grad_norm": 1.6762582029546438, + "learning_rate": 6.730393210993147e-07, + "loss": 0.6982, + "step": 8711 + }, + { + "epoch": 0.89, + "grad_norm": 1.7126512371411937, + "learning_rate": 6.718516379860595e-07, + "loss": 0.6423, + "step": 8712 + }, + { + "epoch": 0.89, + "grad_norm": 1.6213435185517746, + "learning_rate": 6.706649672974353e-07, + "loss": 0.6024, + "step": 8713 + }, + { + "epoch": 0.89, + "grad_norm": 1.7864519428301608, + "learning_rate": 6.694793091622331e-07, + "loss": 0.7844, + "step": 8714 + }, + { + "epoch": 0.89, + "grad_norm": 1.778521136186733, + "learning_rate": 6.682946637091404e-07, + "loss": 0.6831, + "step": 8715 + }, + { + "epoch": 0.89, + "grad_norm": 1.54754015717152, + "learning_rate": 6.671110310667283e-07, + "loss": 0.7726, + "step": 8716 + }, + { + "epoch": 0.89, + "grad_norm": 1.6000257477234023, + "learning_rate": 6.659284113634634e-07, + "loss": 0.6161, + "step": 8717 + }, + { + "epoch": 0.89, + "grad_norm": 1.5372503227345617, + "learning_rate": 6.647468047277029e-07, + "loss": 0.5754, + "step": 8718 + }, + { + "epoch": 0.89, + "grad_norm": 1.704517683947603, + "learning_rate": 6.635662112876884e-07, + "loss": 0.6452, + "step": 8719 + }, + { + "epoch": 0.89, + "grad_norm": 1.6588122662892621, + "learning_rate": 6.623866311715576e-07, + "loss": 0.7267, + "step": 8720 + }, + { + "epoch": 0.89, + "grad_norm": 1.459853858961382, + "learning_rate": 6.612080645073316e-07, + "loss": 0.5757, + "step": 8721 + }, + { + "epoch": 0.89, + "grad_norm": 1.5936972799654623, + "learning_rate": 6.600305114229288e-07, + "loss": 0.6597, + "step": 8722 + }, + { + "epoch": 0.89, + "grad_norm": 1.8774277055499191, + "learning_rate": 6.588539720461551e-07, + "loss": 0.7892, + "step": 8723 + }, + { + "epoch": 0.89, + "grad_norm": 1.7669374648720841, + "learning_rate": 6.576784465047014e-07, + "loss": 0.7177, + "step": 8724 + }, + { + "epoch": 0.89, + "grad_norm": 1.560008410701888, + "learning_rate": 6.565039349261548e-07, + "loss": 0.6919, + "step": 8725 + }, + { + "epoch": 0.89, + "grad_norm": 1.7644269109338624, + "learning_rate": 6.55330437437991e-07, + "loss": 0.6835, + "step": 8726 + }, + { + "epoch": 0.89, + "grad_norm": 1.5372222677673302, + "learning_rate": 6.541579541675736e-07, + "loss": 0.5947, + "step": 8727 + }, + { + "epoch": 0.89, + "grad_norm": 1.8259938415601342, + "learning_rate": 6.529864852421586e-07, + "loss": 0.7568, + "step": 8728 + }, + { + "epoch": 0.89, + "grad_norm": 1.4769277739964026, + "learning_rate": 6.518160307888877e-07, + "loss": 0.5646, + "step": 8729 + }, + { + "epoch": 0.89, + "grad_norm": 1.7987995664627618, + "learning_rate": 6.506465909347981e-07, + "loss": 0.6142, + "step": 8730 + }, + { + "epoch": 0.89, + "grad_norm": 1.477685080019239, + "learning_rate": 6.49478165806815e-07, + "loss": 0.5199, + "step": 8731 + }, + { + "epoch": 0.89, + "grad_norm": 1.7147832264792515, + "learning_rate": 6.483107555317491e-07, + "loss": 0.6818, + "step": 8732 + }, + { + "epoch": 0.89, + "grad_norm": 1.7458082121307719, + "learning_rate": 6.471443602363093e-07, + "loss": 0.7152, + "step": 8733 + }, + { + "epoch": 0.89, + "grad_norm": 1.745055777773359, + "learning_rate": 6.45978980047085e-07, + "loss": 0.5758, + "step": 8734 + }, + { + "epoch": 0.89, + "grad_norm": 1.6966452521157125, + "learning_rate": 6.44814615090561e-07, + "loss": 0.6355, + "step": 8735 + }, + { + "epoch": 0.89, + "grad_norm": 1.633730183689762, + "learning_rate": 6.436512654931138e-07, + "loss": 0.6368, + "step": 8736 + }, + { + "epoch": 0.89, + "grad_norm": 1.6072410354623625, + "learning_rate": 6.424889313810034e-07, + "loss": 0.6961, + "step": 8737 + }, + { + "epoch": 0.89, + "grad_norm": 1.8507298257170395, + "learning_rate": 6.413276128803858e-07, + "loss": 0.6825, + "step": 8738 + }, + { + "epoch": 0.89, + "grad_norm": 1.746313217113024, + "learning_rate": 6.401673101173012e-07, + "loss": 0.7445, + "step": 8739 + }, + { + "epoch": 0.89, + "grad_norm": 1.6390561996209958, + "learning_rate": 6.390080232176832e-07, + "loss": 0.6135, + "step": 8740 + }, + { + "epoch": 0.89, + "grad_norm": 1.5743849846487263, + "learning_rate": 6.378497523073568e-07, + "loss": 0.6436, + "step": 8741 + }, + { + "epoch": 0.89, + "grad_norm": 1.9545625466793428, + "learning_rate": 6.366924975120303e-07, + "loss": 0.7543, + "step": 8742 + }, + { + "epoch": 0.89, + "grad_norm": 1.6648504342011303, + "learning_rate": 6.355362589573078e-07, + "loss": 0.6485, + "step": 8743 + }, + { + "epoch": 0.89, + "grad_norm": 1.716423230872207, + "learning_rate": 6.34381036768682e-07, + "loss": 0.6911, + "step": 8744 + }, + { + "epoch": 0.89, + "grad_norm": 1.65445994435496, + "learning_rate": 6.332268310715306e-07, + "loss": 0.7274, + "step": 8745 + }, + { + "epoch": 0.89, + "grad_norm": 1.6244381998771817, + "learning_rate": 6.320736419911289e-07, + "loss": 0.5986, + "step": 8746 + }, + { + "epoch": 0.89, + "grad_norm": 1.893230528199649, + "learning_rate": 6.309214696526345e-07, + "loss": 0.7471, + "step": 8747 + }, + { + "epoch": 0.89, + "grad_norm": 1.6760771995237522, + "learning_rate": 6.297703141810973e-07, + "loss": 0.7281, + "step": 8748 + }, + { + "epoch": 0.89, + "grad_norm": 1.758727507648121, + "learning_rate": 6.286201757014609e-07, + "loss": 0.673, + "step": 8749 + }, + { + "epoch": 0.89, + "grad_norm": 1.6252006539545014, + "learning_rate": 6.274710543385498e-07, + "loss": 0.6644, + "step": 8750 + }, + { + "epoch": 0.89, + "grad_norm": 1.711993065300969, + "learning_rate": 6.263229502170887e-07, + "loss": 0.72, + "step": 8751 + }, + { + "epoch": 0.89, + "grad_norm": 1.598204332420291, + "learning_rate": 6.251758634616811e-07, + "loss": 0.6851, + "step": 8752 + }, + { + "epoch": 0.89, + "grad_norm": 1.7099373984607578, + "learning_rate": 6.240297941968276e-07, + "loss": 0.6626, + "step": 8753 + }, + { + "epoch": 0.89, + "grad_norm": 2.067578511288023, + "learning_rate": 6.228847425469176e-07, + "loss": 0.7337, + "step": 8754 + }, + { + "epoch": 0.89, + "grad_norm": 1.6329116330234184, + "learning_rate": 6.217407086362259e-07, + "loss": 0.5908, + "step": 8755 + }, + { + "epoch": 0.89, + "grad_norm": 1.8695458114514938, + "learning_rate": 6.205976925889223e-07, + "loss": 0.6287, + "step": 8756 + }, + { + "epoch": 0.89, + "grad_norm": 1.6009515502837723, + "learning_rate": 6.194556945290597e-07, + "loss": 0.5986, + "step": 8757 + }, + { + "epoch": 0.89, + "grad_norm": 1.7133996021572895, + "learning_rate": 6.183147145805868e-07, + "loss": 0.6671, + "step": 8758 + }, + { + "epoch": 0.89, + "grad_norm": 1.547406379390001, + "learning_rate": 6.171747528673399e-07, + "loss": 0.5963, + "step": 8759 + }, + { + "epoch": 0.89, + "grad_norm": 1.8441570176178421, + "learning_rate": 6.160358095130414e-07, + "loss": 0.7358, + "step": 8760 + }, + { + "epoch": 0.89, + "grad_norm": 1.5426579135934184, + "learning_rate": 6.148978846413067e-07, + "loss": 0.6821, + "step": 8761 + }, + { + "epoch": 0.89, + "grad_norm": 1.5955286906448458, + "learning_rate": 6.137609783756415e-07, + "loss": 0.703, + "step": 8762 + }, + { + "epoch": 0.89, + "grad_norm": 1.9207348627068204, + "learning_rate": 6.12625090839436e-07, + "loss": 0.761, + "step": 8763 + }, + { + "epoch": 0.89, + "grad_norm": 1.6145845753468058, + "learning_rate": 6.11490222155976e-07, + "loss": 0.7207, + "step": 8764 + }, + { + "epoch": 0.89, + "grad_norm": 1.8111292561758374, + "learning_rate": 6.103563724484318e-07, + "loss": 0.5999, + "step": 8765 + }, + { + "epoch": 0.89, + "grad_norm": 1.6732627046727127, + "learning_rate": 6.092235418398651e-07, + "loss": 0.6734, + "step": 8766 + }, + { + "epoch": 0.89, + "grad_norm": 1.8411285377702618, + "learning_rate": 6.080917304532297e-07, + "loss": 0.6682, + "step": 8767 + }, + { + "epoch": 0.89, + "grad_norm": 1.6387647062242647, + "learning_rate": 6.069609384113617e-07, + "loss": 0.629, + "step": 8768 + }, + { + "epoch": 0.89, + "grad_norm": 1.6815980212566355, + "learning_rate": 6.058311658369954e-07, + "loss": 0.6013, + "step": 8769 + }, + { + "epoch": 0.89, + "grad_norm": 1.706867933567596, + "learning_rate": 6.047024128527456e-07, + "loss": 0.7289, + "step": 8770 + }, + { + "epoch": 0.89, + "grad_norm": 1.716827005321816, + "learning_rate": 6.035746795811226e-07, + "loss": 0.6986, + "step": 8771 + }, + { + "epoch": 0.89, + "grad_norm": 1.5820734195546344, + "learning_rate": 6.024479661445271e-07, + "loss": 0.6539, + "step": 8772 + }, + { + "epoch": 0.89, + "grad_norm": 1.6433261887082913, + "learning_rate": 6.013222726652412e-07, + "loss": 0.5254, + "step": 8773 + }, + { + "epoch": 0.89, + "grad_norm": 1.6064092532647984, + "learning_rate": 6.001975992654452e-07, + "loss": 0.5876, + "step": 8774 + }, + { + "epoch": 0.89, + "grad_norm": 1.6371089009253044, + "learning_rate": 5.990739460672024e-07, + "loss": 0.6335, + "step": 8775 + }, + { + "epoch": 0.89, + "grad_norm": 1.9318912881798678, + "learning_rate": 5.979513131924686e-07, + "loss": 0.7516, + "step": 8776 + }, + { + "epoch": 0.89, + "grad_norm": 1.696039269204613, + "learning_rate": 5.968297007630897e-07, + "loss": 0.7741, + "step": 8777 + }, + { + "epoch": 0.89, + "grad_norm": 1.704697818598336, + "learning_rate": 5.95709108900796e-07, + "loss": 0.6288, + "step": 8778 + }, + { + "epoch": 0.89, + "grad_norm": 1.6282649929871835, + "learning_rate": 5.945895377272115e-07, + "loss": 0.6602, + "step": 8779 + }, + { + "epoch": 0.89, + "grad_norm": 1.7299319787649439, + "learning_rate": 5.9347098736385e-07, + "loss": 0.6479, + "step": 8780 + }, + { + "epoch": 0.89, + "grad_norm": 1.5855476532183754, + "learning_rate": 5.9235345793211e-07, + "loss": 0.7551, + "step": 8781 + }, + { + "epoch": 0.89, + "grad_norm": 1.9476339944739738, + "learning_rate": 5.912369495532844e-07, + "loss": 0.8062, + "step": 8782 + }, + { + "epoch": 0.89, + "grad_norm": 1.8287390115049857, + "learning_rate": 5.901214623485507e-07, + "loss": 0.7113, + "step": 8783 + }, + { + "epoch": 0.89, + "grad_norm": 1.7171646753440621, + "learning_rate": 5.890069964389766e-07, + "loss": 0.8326, + "step": 8784 + }, + { + "epoch": 0.89, + "grad_norm": 1.6001861328056501, + "learning_rate": 5.878935519455243e-07, + "loss": 0.6403, + "step": 8785 + }, + { + "epoch": 0.89, + "grad_norm": 1.8358706633985116, + "learning_rate": 5.867811289890357e-07, + "loss": 0.6297, + "step": 8786 + }, + { + "epoch": 0.89, + "grad_norm": 1.8183021473531173, + "learning_rate": 5.856697276902512e-07, + "loss": 0.7184, + "step": 8787 + }, + { + "epoch": 0.89, + "grad_norm": 1.7328689608493777, + "learning_rate": 5.845593481697931e-07, + "loss": 0.6518, + "step": 8788 + }, + { + "epoch": 0.89, + "grad_norm": 1.7449931725599808, + "learning_rate": 5.834499905481761e-07, + "loss": 0.6504, + "step": 8789 + }, + { + "epoch": 0.89, + "grad_norm": 1.7985757893626173, + "learning_rate": 5.823416549458061e-07, + "loss": 0.768, + "step": 8790 + }, + { + "epoch": 0.89, + "grad_norm": 1.7058853899108344, + "learning_rate": 5.812343414829725e-07, + "loss": 0.6564, + "step": 8791 + }, + { + "epoch": 0.89, + "grad_norm": 1.5406487918403375, + "learning_rate": 5.801280502798578e-07, + "loss": 0.6524, + "step": 8792 + }, + { + "epoch": 0.89, + "grad_norm": 1.7917898832172519, + "learning_rate": 5.790227814565342e-07, + "loss": 0.6613, + "step": 8793 + }, + { + "epoch": 0.89, + "grad_norm": 1.6094745785554516, + "learning_rate": 5.779185351329586e-07, + "loss": 0.677, + "step": 8794 + }, + { + "epoch": 0.89, + "grad_norm": 1.6648861762187483, + "learning_rate": 5.76815311428982e-07, + "loss": 0.7195, + "step": 8795 + }, + { + "epoch": 0.89, + "grad_norm": 1.7563619296436717, + "learning_rate": 5.75713110464341e-07, + "loss": 0.7305, + "step": 8796 + }, + { + "epoch": 0.89, + "grad_norm": 1.7070913640267633, + "learning_rate": 5.746119323586619e-07, + "loss": 0.5819, + "step": 8797 + }, + { + "epoch": 0.89, + "grad_norm": 1.791324973188892, + "learning_rate": 5.735117772314625e-07, + "loss": 0.6723, + "step": 8798 + }, + { + "epoch": 0.9, + "grad_norm": 1.5459587913165356, + "learning_rate": 5.724126452021439e-07, + "loss": 0.6511, + "step": 8799 + }, + { + "epoch": 0.9, + "grad_norm": 1.7139177229387026, + "learning_rate": 5.713145363900019e-07, + "loss": 0.6557, + "step": 8800 + }, + { + "epoch": 0.9, + "grad_norm": 1.6248645438784817, + "learning_rate": 5.702174509142211e-07, + "loss": 0.6548, + "step": 8801 + }, + { + "epoch": 0.9, + "grad_norm": 1.5587540887616815, + "learning_rate": 5.691213888938684e-07, + "loss": 0.7533, + "step": 8802 + }, + { + "epoch": 0.9, + "grad_norm": 1.8162267140969848, + "learning_rate": 5.680263504479067e-07, + "loss": 0.691, + "step": 8803 + }, + { + "epoch": 0.9, + "grad_norm": 1.9258082565468186, + "learning_rate": 5.669323356951873e-07, + "loss": 0.7071, + "step": 8804 + }, + { + "epoch": 0.9, + "grad_norm": 1.8151940950777297, + "learning_rate": 5.658393447544441e-07, + "loss": 0.7587, + "step": 8805 + }, + { + "epoch": 0.9, + "grad_norm": 1.5083937395603395, + "learning_rate": 5.647473777443057e-07, + "loss": 0.6167, + "step": 8806 + }, + { + "epoch": 0.9, + "grad_norm": 1.6689479666754305, + "learning_rate": 5.636564347832907e-07, + "loss": 0.645, + "step": 8807 + }, + { + "epoch": 0.9, + "grad_norm": 1.6817830151070023, + "learning_rate": 5.625665159897997e-07, + "loss": 0.6561, + "step": 8808 + }, + { + "epoch": 0.9, + "grad_norm": 1.5860333611721988, + "learning_rate": 5.614776214821293e-07, + "loss": 0.6598, + "step": 8809 + }, + { + "epoch": 0.9, + "grad_norm": 1.5677057151523508, + "learning_rate": 5.603897513784628e-07, + "loss": 0.6577, + "step": 8810 + }, + { + "epoch": 0.9, + "grad_norm": 1.7430558459350933, + "learning_rate": 5.593029057968668e-07, + "loss": 0.7343, + "step": 8811 + }, + { + "epoch": 0.9, + "grad_norm": 1.8275938824697127, + "learning_rate": 5.582170848553059e-07, + "loss": 0.7346, + "step": 8812 + }, + { + "epoch": 0.9, + "grad_norm": 1.6236573604172517, + "learning_rate": 5.571322886716279e-07, + "loss": 0.6967, + "step": 8813 + }, + { + "epoch": 0.9, + "grad_norm": 1.8136108917306695, + "learning_rate": 5.560485173635699e-07, + "loss": 0.6979, + "step": 8814 + }, + { + "epoch": 0.9, + "grad_norm": 1.5996349472778744, + "learning_rate": 5.549657710487588e-07, + "loss": 0.6484, + "step": 8815 + }, + { + "epoch": 0.9, + "grad_norm": 1.9571357824539959, + "learning_rate": 5.538840498447084e-07, + "loss": 0.7165, + "step": 8816 + }, + { + "epoch": 0.9, + "grad_norm": 1.7955011318416891, + "learning_rate": 5.528033538688227e-07, + "loss": 0.723, + "step": 8817 + }, + { + "epoch": 0.9, + "grad_norm": 1.6455737570632334, + "learning_rate": 5.517236832383976e-07, + "loss": 0.6117, + "step": 8818 + }, + { + "epoch": 0.9, + "grad_norm": 1.5258600663929245, + "learning_rate": 5.506450380706108e-07, + "loss": 0.5365, + "step": 8819 + }, + { + "epoch": 0.9, + "grad_norm": 1.5290508706361885, + "learning_rate": 5.495674184825339e-07, + "loss": 0.6308, + "step": 8820 + }, + { + "epoch": 0.9, + "grad_norm": 1.4450450071362761, + "learning_rate": 5.484908245911269e-07, + "loss": 0.5544, + "step": 8821 + }, + { + "epoch": 0.9, + "grad_norm": 1.861407949886194, + "learning_rate": 5.47415256513234e-07, + "loss": 0.7452, + "step": 8822 + }, + { + "epoch": 0.9, + "grad_norm": 1.6397086283679179, + "learning_rate": 5.463407143655941e-07, + "loss": 0.6864, + "step": 8823 + }, + { + "epoch": 0.9, + "grad_norm": 1.6945030783026827, + "learning_rate": 5.452671982648306e-07, + "loss": 0.5986, + "step": 8824 + }, + { + "epoch": 0.9, + "grad_norm": 2.1115877118850843, + "learning_rate": 5.441947083274568e-07, + "loss": 0.6233, + "step": 8825 + }, + { + "epoch": 0.9, + "grad_norm": 1.6672585918040428, + "learning_rate": 5.431232446698764e-07, + "loss": 0.68, + "step": 8826 + }, + { + "epoch": 0.9, + "grad_norm": 1.7129379266769043, + "learning_rate": 5.420528074083775e-07, + "loss": 0.7318, + "step": 8827 + }, + { + "epoch": 0.9, + "grad_norm": 1.7850490370440566, + "learning_rate": 5.409833966591416e-07, + "loss": 0.7114, + "step": 8828 + }, + { + "epoch": 0.9, + "grad_norm": 1.6887809896724186, + "learning_rate": 5.399150125382346e-07, + "loss": 0.619, + "step": 8829 + }, + { + "epoch": 0.9, + "grad_norm": 1.7080850987518374, + "learning_rate": 5.38847655161614e-07, + "loss": 0.7035, + "step": 8830 + }, + { + "epoch": 0.9, + "grad_norm": 1.5918728979806458, + "learning_rate": 5.377813246451258e-07, + "loss": 0.6376, + "step": 8831 + }, + { + "epoch": 0.9, + "grad_norm": 1.6662792606891546, + "learning_rate": 5.367160211044997e-07, + "loss": 0.5893, + "step": 8832 + }, + { + "epoch": 0.9, + "grad_norm": 2.052849741595967, + "learning_rate": 5.356517446553632e-07, + "loss": 0.6952, + "step": 8833 + }, + { + "epoch": 0.9, + "grad_norm": 1.8600307295170615, + "learning_rate": 5.345884954132219e-07, + "loss": 0.8211, + "step": 8834 + }, + { + "epoch": 0.9, + "grad_norm": 1.7060491542482932, + "learning_rate": 5.335262734934777e-07, + "loss": 0.7927, + "step": 8835 + }, + { + "epoch": 0.9, + "grad_norm": 1.6548610817413254, + "learning_rate": 5.324650790114183e-07, + "loss": 0.6175, + "step": 8836 + }, + { + "epoch": 0.9, + "grad_norm": 1.5534172736814402, + "learning_rate": 5.314049120822173e-07, + "loss": 0.6578, + "step": 8837 + }, + { + "epoch": 0.9, + "grad_norm": 1.6066570013867365, + "learning_rate": 5.303457728209405e-07, + "loss": 0.6216, + "step": 8838 + }, + { + "epoch": 0.9, + "grad_norm": 1.7314963661007452, + "learning_rate": 5.292876613425435e-07, + "loss": 0.5863, + "step": 8839 + }, + { + "epoch": 0.9, + "grad_norm": 1.8081504341128891, + "learning_rate": 5.282305777618635e-07, + "loss": 0.6855, + "step": 8840 + }, + { + "epoch": 0.9, + "grad_norm": 1.6112349432475361, + "learning_rate": 5.271745221936342e-07, + "loss": 0.7426, + "step": 8841 + }, + { + "epoch": 0.9, + "grad_norm": 1.8055474774673523, + "learning_rate": 5.261194947524706e-07, + "loss": 0.7428, + "step": 8842 + }, + { + "epoch": 0.9, + "grad_norm": 1.5931208961495853, + "learning_rate": 5.2506549555288e-07, + "loss": 0.6494, + "step": 8843 + }, + { + "epoch": 0.9, + "grad_norm": 1.640245601508315, + "learning_rate": 5.240125247092609e-07, + "loss": 0.6547, + "step": 8844 + }, + { + "epoch": 0.9, + "grad_norm": 1.6570113616081257, + "learning_rate": 5.22960582335893e-07, + "loss": 0.6057, + "step": 8845 + }, + { + "epoch": 0.9, + "grad_norm": 1.7830829445931253, + "learning_rate": 5.219096685469505e-07, + "loss": 0.6648, + "step": 8846 + }, + { + "epoch": 0.9, + "grad_norm": 1.5182028049341616, + "learning_rate": 5.20859783456491e-07, + "loss": 0.6532, + "step": 8847 + }, + { + "epoch": 0.9, + "grad_norm": 1.6825405089912515, + "learning_rate": 5.198109271784657e-07, + "loss": 0.7685, + "step": 8848 + }, + { + "epoch": 0.9, + "grad_norm": 1.5108201298654471, + "learning_rate": 5.187630998267112e-07, + "loss": 0.6859, + "step": 8849 + }, + { + "epoch": 0.9, + "grad_norm": 1.7284137246283864, + "learning_rate": 5.177163015149522e-07, + "loss": 0.7895, + "step": 8850 + }, + { + "epoch": 0.9, + "grad_norm": 1.6011817453650048, + "learning_rate": 5.166705323568022e-07, + "loss": 0.7195, + "step": 8851 + }, + { + "epoch": 0.9, + "grad_norm": 1.6879633198914614, + "learning_rate": 5.156257924657626e-07, + "loss": 0.6305, + "step": 8852 + }, + { + "epoch": 0.9, + "grad_norm": 1.5509576011661719, + "learning_rate": 5.145820819552239e-07, + "loss": 0.5572, + "step": 8853 + }, + { + "epoch": 0.9, + "grad_norm": 1.6281892283913746, + "learning_rate": 5.135394009384665e-07, + "loss": 0.7006, + "step": 8854 + }, + { + "epoch": 0.9, + "grad_norm": 1.5648948142189403, + "learning_rate": 5.124977495286543e-07, + "loss": 0.634, + "step": 8855 + }, + { + "epoch": 0.9, + "grad_norm": 1.6985997006874525, + "learning_rate": 5.114571278388436e-07, + "loss": 0.662, + "step": 8856 + }, + { + "epoch": 0.9, + "grad_norm": 1.7954802063819606, + "learning_rate": 5.104175359819785e-07, + "loss": 0.7451, + "step": 8857 + }, + { + "epoch": 0.9, + "grad_norm": 1.6763239305455557, + "learning_rate": 5.093789740708877e-07, + "loss": 0.6913, + "step": 8858 + }, + { + "epoch": 0.9, + "grad_norm": 1.7869358727317133, + "learning_rate": 5.083414422182942e-07, + "loss": 0.7111, + "step": 8859 + }, + { + "epoch": 0.9, + "grad_norm": 1.7085254944971797, + "learning_rate": 5.073049405368025e-07, + "loss": 0.8868, + "step": 8860 + }, + { + "epoch": 0.9, + "grad_norm": 1.7339717725846475, + "learning_rate": 5.062694691389114e-07, + "loss": 0.7482, + "step": 8861 + }, + { + "epoch": 0.9, + "grad_norm": 1.5826376971671454, + "learning_rate": 5.052350281370045e-07, + "loss": 0.7272, + "step": 8862 + }, + { + "epoch": 0.9, + "grad_norm": 1.7025735469207472, + "learning_rate": 5.042016176433529e-07, + "loss": 0.7245, + "step": 8863 + }, + { + "epoch": 0.9, + "grad_norm": 1.7507929274931837, + "learning_rate": 5.031692377701192e-07, + "loss": 0.7461, + "step": 8864 + }, + { + "epoch": 0.9, + "grad_norm": 1.7509772240735764, + "learning_rate": 5.021378886293493e-07, + "loss": 0.7327, + "step": 8865 + }, + { + "epoch": 0.9, + "grad_norm": 1.5796503675823719, + "learning_rate": 5.011075703329816e-07, + "loss": 0.6357, + "step": 8866 + }, + { + "epoch": 0.9, + "grad_norm": 1.6355729150637843, + "learning_rate": 5.00078282992843e-07, + "loss": 0.6922, + "step": 8867 + }, + { + "epoch": 0.9, + "grad_norm": 1.9593713687066243, + "learning_rate": 4.990500267206422e-07, + "loss": 0.6745, + "step": 8868 + }, + { + "epoch": 0.9, + "grad_norm": 1.8007290348212777, + "learning_rate": 4.980228016279853e-07, + "loss": 0.6228, + "step": 8869 + }, + { + "epoch": 0.9, + "grad_norm": 1.5854190455395145, + "learning_rate": 4.969966078263578e-07, + "loss": 0.6157, + "step": 8870 + }, + { + "epoch": 0.9, + "grad_norm": 1.7029363611122843, + "learning_rate": 4.95971445427137e-07, + "loss": 0.6451, + "step": 8871 + }, + { + "epoch": 0.9, + "grad_norm": 1.7041474249843223, + "learning_rate": 4.949473145415917e-07, + "loss": 0.6162, + "step": 8872 + }, + { + "epoch": 0.9, + "grad_norm": 1.9193299614643426, + "learning_rate": 4.939242152808709e-07, + "loss": 0.6908, + "step": 8873 + }, + { + "epoch": 0.9, + "grad_norm": 1.6604203951924335, + "learning_rate": 4.929021477560181e-07, + "loss": 0.6437, + "step": 8874 + }, + { + "epoch": 0.9, + "grad_norm": 1.7807366861294014, + "learning_rate": 4.918811120779655e-07, + "loss": 0.7067, + "step": 8875 + }, + { + "epoch": 0.9, + "grad_norm": 1.8913639048565587, + "learning_rate": 4.908611083575254e-07, + "loss": 0.6881, + "step": 8876 + }, + { + "epoch": 0.9, + "grad_norm": 1.605533038984582, + "learning_rate": 4.898421367054063e-07, + "loss": 0.6292, + "step": 8877 + }, + { + "epoch": 0.9, + "grad_norm": 1.7706508305498252, + "learning_rate": 4.888241972322005e-07, + "loss": 0.7253, + "step": 8878 + }, + { + "epoch": 0.9, + "grad_norm": 1.7098943647143408, + "learning_rate": 4.878072900483899e-07, + "loss": 0.6648, + "step": 8879 + }, + { + "epoch": 0.9, + "grad_norm": 1.7194533016750646, + "learning_rate": 4.86791415264346e-07, + "loss": 0.7055, + "step": 8880 + }, + { + "epoch": 0.9, + "grad_norm": 1.6214507345058031, + "learning_rate": 4.857765729903219e-07, + "loss": 0.6689, + "step": 8881 + }, + { + "epoch": 0.9, + "grad_norm": 1.5346747157802176, + "learning_rate": 4.84762763336466e-07, + "loss": 0.641, + "step": 8882 + }, + { + "epoch": 0.9, + "grad_norm": 1.5087485363749233, + "learning_rate": 4.837499864128104e-07, + "loss": 0.573, + "step": 8883 + }, + { + "epoch": 0.9, + "grad_norm": 1.7344990112796763, + "learning_rate": 4.827382423292748e-07, + "loss": 0.723, + "step": 8884 + }, + { + "epoch": 0.9, + "grad_norm": 1.7580447093681373, + "learning_rate": 4.817275311956715e-07, + "loss": 0.6838, + "step": 8885 + }, + { + "epoch": 0.9, + "grad_norm": 1.7700545438137756, + "learning_rate": 4.80717853121695e-07, + "loss": 0.8107, + "step": 8886 + }, + { + "epoch": 0.9, + "grad_norm": 1.608306797865548, + "learning_rate": 4.797092082169308e-07, + "loss": 0.6479, + "step": 8887 + }, + { + "epoch": 0.9, + "grad_norm": 1.8590731878633593, + "learning_rate": 4.787015965908504e-07, + "loss": 0.7478, + "step": 8888 + }, + { + "epoch": 0.9, + "grad_norm": 1.9107457527445748, + "learning_rate": 4.776950183528161e-07, + "loss": 0.6514, + "step": 8889 + }, + { + "epoch": 0.9, + "grad_norm": 1.7168042445704972, + "learning_rate": 4.766894736120753e-07, + "loss": 0.6907, + "step": 8890 + }, + { + "epoch": 0.9, + "grad_norm": 1.8278719101055414, + "learning_rate": 4.7568496247776373e-07, + "loss": 0.7917, + "step": 8891 + }, + { + "epoch": 0.9, + "grad_norm": 1.530010529849326, + "learning_rate": 4.746814850589054e-07, + "loss": 0.6842, + "step": 8892 + }, + { + "epoch": 0.9, + "grad_norm": 1.488161536929116, + "learning_rate": 4.7367904146441435e-07, + "loss": 0.6651, + "step": 8893 + }, + { + "epoch": 0.9, + "grad_norm": 1.8409654711792904, + "learning_rate": 4.7267763180308676e-07, + "loss": 0.719, + "step": 8894 + }, + { + "epoch": 0.9, + "grad_norm": 1.883523084424903, + "learning_rate": 4.716772561836136e-07, + "loss": 0.7707, + "step": 8895 + }, + { + "epoch": 0.9, + "grad_norm": 1.771972446027783, + "learning_rate": 4.706779147145657e-07, + "loss": 0.6725, + "step": 8896 + }, + { + "epoch": 0.9, + "grad_norm": 1.6792519467501912, + "learning_rate": 4.6967960750440855e-07, + "loss": 0.6237, + "step": 8897 + }, + { + "epoch": 0.91, + "grad_norm": 1.7675836119266148, + "learning_rate": 4.6868233466149327e-07, + "loss": 0.7123, + "step": 8898 + }, + { + "epoch": 0.91, + "grad_norm": 1.6269745798391906, + "learning_rate": 4.676860962940577e-07, + "loss": 0.6876, + "step": 8899 + }, + { + "epoch": 0.91, + "grad_norm": 1.7769618813041788, + "learning_rate": 4.6669089251022757e-07, + "loss": 0.6378, + "step": 8900 + }, + { + "epoch": 0.91, + "grad_norm": 1.5590363379953227, + "learning_rate": 4.6569672341801544e-07, + "loss": 0.7152, + "step": 8901 + }, + { + "epoch": 0.91, + "grad_norm": 1.8800667696896778, + "learning_rate": 4.647035891253249e-07, + "loss": 0.68, + "step": 8902 + }, + { + "epoch": 0.91, + "grad_norm": 1.7554247388764033, + "learning_rate": 4.637114897399453e-07, + "loss": 0.7968, + "step": 8903 + }, + { + "epoch": 0.91, + "grad_norm": 1.5907387453462725, + "learning_rate": 4.6272042536955164e-07, + "loss": 0.7354, + "step": 8904 + }, + { + "epoch": 0.91, + "grad_norm": 1.718431267879274, + "learning_rate": 4.61730396121709e-07, + "loss": 0.6396, + "step": 8905 + }, + { + "epoch": 0.91, + "grad_norm": 1.7295315815770955, + "learning_rate": 4.607414021038714e-07, + "loss": 0.695, + "step": 8906 + }, + { + "epoch": 0.91, + "grad_norm": 1.66657712292234, + "learning_rate": 4.5975344342337525e-07, + "loss": 0.646, + "step": 8907 + }, + { + "epoch": 0.91, + "grad_norm": 1.620845024767798, + "learning_rate": 4.5876652018745136e-07, + "loss": 0.6812, + "step": 8908 + }, + { + "epoch": 0.91, + "grad_norm": 1.5818359888527045, + "learning_rate": 4.5778063250321415e-07, + "loss": 0.6347, + "step": 8909 + }, + { + "epoch": 0.91, + "grad_norm": 1.575993128204435, + "learning_rate": 4.567957804776646e-07, + "loss": 0.6201, + "step": 8910 + }, + { + "epoch": 0.91, + "grad_norm": 1.71544634015368, + "learning_rate": 4.558119642176939e-07, + "loss": 0.6866, + "step": 8911 + }, + { + "epoch": 0.91, + "grad_norm": 1.8006115963235978, + "learning_rate": 4.548291838300811e-07, + "loss": 0.6697, + "step": 8912 + }, + { + "epoch": 0.91, + "grad_norm": 1.6565075655767947, + "learning_rate": 4.5384743942148977e-07, + "loss": 0.7009, + "step": 8913 + }, + { + "epoch": 0.91, + "grad_norm": 1.7720099961704778, + "learning_rate": 4.5286673109847357e-07, + "loss": 0.6587, + "step": 8914 + }, + { + "epoch": 0.91, + "grad_norm": 1.7665613437823307, + "learning_rate": 4.518870589674751e-07, + "loss": 0.6843, + "step": 8915 + }, + { + "epoch": 0.91, + "grad_norm": 1.6085851494760013, + "learning_rate": 4.509084231348182e-07, + "loss": 0.6389, + "step": 8916 + }, + { + "epoch": 0.91, + "grad_norm": 1.5926002358110811, + "learning_rate": 4.499308237067213e-07, + "loss": 0.7338, + "step": 8917 + }, + { + "epoch": 0.91, + "grad_norm": 1.6111636451724387, + "learning_rate": 4.489542607892894e-07, + "loss": 0.6652, + "step": 8918 + }, + { + "epoch": 0.91, + "grad_norm": 1.7833895836947011, + "learning_rate": 4.479787344885078e-07, + "loss": 0.8026, + "step": 8919 + }, + { + "epoch": 0.91, + "grad_norm": 1.6879517925619654, + "learning_rate": 4.470042449102596e-07, + "loss": 0.6402, + "step": 8920 + }, + { + "epoch": 0.91, + "grad_norm": 1.8778262970244906, + "learning_rate": 4.4603079216030797e-07, + "loss": 0.7691, + "step": 8921 + }, + { + "epoch": 0.91, + "grad_norm": 1.7471342930414981, + "learning_rate": 4.4505837634430616e-07, + "loss": 0.7868, + "step": 8922 + }, + { + "epoch": 0.91, + "grad_norm": 1.4882165634147697, + "learning_rate": 4.440869975677964e-07, + "loss": 0.6551, + "step": 8923 + }, + { + "epoch": 0.91, + "grad_norm": 1.7413866953660853, + "learning_rate": 4.4311665593620323e-07, + "loss": 0.6163, + "step": 8924 + }, + { + "epoch": 0.91, + "grad_norm": 1.5929686827582712, + "learning_rate": 4.421473515548447e-07, + "loss": 0.6359, + "step": 8925 + }, + { + "epoch": 0.91, + "grad_norm": 1.7485565665955185, + "learning_rate": 4.411790845289243e-07, + "loss": 0.6752, + "step": 8926 + }, + { + "epoch": 0.91, + "grad_norm": 1.5022991327522344, + "learning_rate": 4.4021185496353036e-07, + "loss": 0.537, + "step": 8927 + }, + { + "epoch": 0.91, + "grad_norm": 1.7332070844888663, + "learning_rate": 4.392456629636399e-07, + "loss": 0.6884, + "step": 8928 + }, + { + "epoch": 0.91, + "grad_norm": 1.6941139000307985, + "learning_rate": 4.382805086341213e-07, + "loss": 0.6324, + "step": 8929 + }, + { + "epoch": 0.91, + "grad_norm": 1.8277559933369407, + "learning_rate": 4.373163920797241e-07, + "loss": 0.6778, + "step": 8930 + }, + { + "epoch": 0.91, + "grad_norm": 1.8101863523173403, + "learning_rate": 4.36353313405089e-07, + "loss": 0.7158, + "step": 8931 + }, + { + "epoch": 0.91, + "grad_norm": 1.6692541425579466, + "learning_rate": 4.353912727147425e-07, + "loss": 0.7753, + "step": 8932 + }, + { + "epoch": 0.91, + "grad_norm": 1.718180299453628, + "learning_rate": 4.344302701130998e-07, + "loss": 0.6736, + "step": 8933 + }, + { + "epoch": 0.91, + "grad_norm": 1.7976225939029056, + "learning_rate": 4.3347030570446314e-07, + "loss": 0.6929, + "step": 8934 + }, + { + "epoch": 0.91, + "grad_norm": 2.0948448179186894, + "learning_rate": 4.3251137959302023e-07, + "loss": 0.7218, + "step": 8935 + }, + { + "epoch": 0.91, + "grad_norm": 1.6384943101690133, + "learning_rate": 4.3155349188284903e-07, + "loss": 0.7612, + "step": 8936 + }, + { + "epoch": 0.91, + "grad_norm": 1.7067810498067775, + "learning_rate": 4.305966426779118e-07, + "loss": 0.6568, + "step": 8937 + }, + { + "epoch": 0.91, + "grad_norm": 1.7004994744578605, + "learning_rate": 4.2964083208206e-07, + "loss": 0.5903, + "step": 8938 + }, + { + "epoch": 0.91, + "grad_norm": 1.7022825552040735, + "learning_rate": 4.286860601990328e-07, + "loss": 0.7522, + "step": 8939 + }, + { + "epoch": 0.91, + "grad_norm": 1.6869400366291563, + "learning_rate": 4.2773232713245515e-07, + "loss": 0.7511, + "step": 8940 + }, + { + "epoch": 0.91, + "grad_norm": 1.6267627208860855, + "learning_rate": 4.267796329858398e-07, + "loss": 0.6479, + "step": 8941 + }, + { + "epoch": 0.91, + "grad_norm": 1.5098530524214537, + "learning_rate": 4.2582797786258735e-07, + "loss": 0.5881, + "step": 8942 + }, + { + "epoch": 0.91, + "grad_norm": 1.678415811161424, + "learning_rate": 4.2487736186598297e-07, + "loss": 0.7102, + "step": 8943 + }, + { + "epoch": 0.91, + "grad_norm": 1.811580603149009, + "learning_rate": 4.2392778509920516e-07, + "loss": 0.6838, + "step": 8944 + }, + { + "epoch": 0.91, + "grad_norm": 1.9324127880015471, + "learning_rate": 4.229792476653116e-07, + "loss": 0.8057, + "step": 8945 + }, + { + "epoch": 0.91, + "grad_norm": 1.8912088902745035, + "learning_rate": 4.2203174966725324e-07, + "loss": 0.7762, + "step": 8946 + }, + { + "epoch": 0.91, + "grad_norm": 1.607190207534778, + "learning_rate": 4.210852912078656e-07, + "loss": 0.643, + "step": 8947 + }, + { + "epoch": 0.91, + "grad_norm": 1.7417460029699203, + "learning_rate": 4.2013987238987197e-07, + "loss": 0.7045, + "step": 8948 + }, + { + "epoch": 0.91, + "grad_norm": 1.7337059018397725, + "learning_rate": 4.1919549331588374e-07, + "loss": 0.599, + "step": 8949 + }, + { + "epoch": 0.91, + "grad_norm": 1.7286472884952369, + "learning_rate": 4.182521540883966e-07, + "loss": 0.7168, + "step": 8950 + }, + { + "epoch": 0.91, + "grad_norm": 1.6656075871975395, + "learning_rate": 4.1730985480979645e-07, + "loss": 0.6924, + "step": 8951 + }, + { + "epoch": 0.91, + "grad_norm": 1.6302672025258802, + "learning_rate": 4.1636859558235707e-07, + "loss": 0.7471, + "step": 8952 + }, + { + "epoch": 0.91, + "grad_norm": 1.6921406858909696, + "learning_rate": 4.154283765082334e-07, + "loss": 0.7204, + "step": 8953 + }, + { + "epoch": 0.91, + "grad_norm": 1.7806522406686651, + "learning_rate": 4.144891976894749e-07, + "loss": 0.6436, + "step": 8954 + }, + { + "epoch": 0.91, + "grad_norm": 1.872644394026765, + "learning_rate": 4.135510592280112e-07, + "loss": 0.7537, + "step": 8955 + }, + { + "epoch": 0.91, + "grad_norm": 1.6822983087884402, + "learning_rate": 4.126139612256663e-07, + "loss": 0.632, + "step": 8956 + }, + { + "epoch": 0.91, + "grad_norm": 1.6924687648307566, + "learning_rate": 4.1167790378414674e-07, + "loss": 0.601, + "step": 8957 + }, + { + "epoch": 0.91, + "grad_norm": 1.6927001381944409, + "learning_rate": 4.107428870050445e-07, + "loss": 0.7585, + "step": 8958 + }, + { + "epoch": 0.91, + "grad_norm": 1.6511589387598713, + "learning_rate": 4.09808910989844e-07, + "loss": 0.6367, + "step": 8959 + }, + { + "epoch": 0.91, + "grad_norm": 1.7125663757115257, + "learning_rate": 4.088759758399108e-07, + "loss": 0.681, + "step": 8960 + }, + { + "epoch": 0.91, + "grad_norm": 1.8104751149832377, + "learning_rate": 4.079440816565028e-07, + "loss": 0.6205, + "step": 8961 + }, + { + "epoch": 0.91, + "grad_norm": 1.6243580652214884, + "learning_rate": 4.070132285407624e-07, + "loss": 0.7469, + "step": 8962 + }, + { + "epoch": 0.91, + "grad_norm": 1.8825980210289652, + "learning_rate": 4.0608341659371774e-07, + "loss": 0.7058, + "step": 8963 + }, + { + "epoch": 0.91, + "grad_norm": 1.6623960622591962, + "learning_rate": 4.0515464591628696e-07, + "loss": 0.6874, + "step": 8964 + }, + { + "epoch": 0.91, + "grad_norm": 1.5681784036517072, + "learning_rate": 4.042269166092716e-07, + "loss": 0.8016, + "step": 8965 + }, + { + "epoch": 0.91, + "grad_norm": 1.7266043370859399, + "learning_rate": 4.0330022877336337e-07, + "loss": 0.6505, + "step": 8966 + }, + { + "epoch": 0.91, + "grad_norm": 1.781958601565994, + "learning_rate": 4.023745825091407e-07, + "loss": 0.628, + "step": 8967 + }, + { + "epoch": 0.91, + "grad_norm": 1.7222770015039668, + "learning_rate": 4.0144997791706664e-07, + "loss": 0.6065, + "step": 8968 + }, + { + "epoch": 0.91, + "grad_norm": 1.7610167368399874, + "learning_rate": 4.005264150974919e-07, + "loss": 0.7912, + "step": 8969 + }, + { + "epoch": 0.91, + "grad_norm": 1.6324699798367595, + "learning_rate": 3.9960389415065746e-07, + "loss": 0.6925, + "step": 8970 + }, + { + "epoch": 0.91, + "grad_norm": 2.0978893687631777, + "learning_rate": 3.986824151766855e-07, + "loss": 0.708, + "step": 8971 + }, + { + "epoch": 0.91, + "grad_norm": 1.5194927759176355, + "learning_rate": 3.977619782755915e-07, + "loss": 0.5378, + "step": 8972 + }, + { + "epoch": 0.91, + "grad_norm": 1.7378334005633167, + "learning_rate": 3.968425835472711e-07, + "loss": 0.6148, + "step": 8973 + }, + { + "epoch": 0.91, + "grad_norm": 1.8144345071554742, + "learning_rate": 3.959242310915112e-07, + "loss": 0.7466, + "step": 8974 + }, + { + "epoch": 0.91, + "grad_norm": 1.8078036734219984, + "learning_rate": 3.9500692100798656e-07, + "loss": 0.7714, + "step": 8975 + }, + { + "epoch": 0.91, + "grad_norm": 1.617085545757556, + "learning_rate": 3.9409065339625407e-07, + "loss": 0.6702, + "step": 8976 + }, + { + "epoch": 0.91, + "grad_norm": 1.7290694019525905, + "learning_rate": 3.9317542835576317e-07, + "loss": 0.707, + "step": 8977 + }, + { + "epoch": 0.91, + "grad_norm": 1.6922559342798456, + "learning_rate": 3.9226124598584323e-07, + "loss": 0.6726, + "step": 8978 + }, + { + "epoch": 0.91, + "grad_norm": 1.9071775216199967, + "learning_rate": 3.913481063857183e-07, + "loss": 0.6511, + "step": 8979 + }, + { + "epoch": 0.91, + "grad_norm": 1.5998712830431994, + "learning_rate": 3.904360096544935e-07, + "loss": 0.7038, + "step": 8980 + }, + { + "epoch": 0.91, + "grad_norm": 1.6427375601725964, + "learning_rate": 3.895249558911629e-07, + "loss": 0.672, + "step": 8981 + }, + { + "epoch": 0.91, + "grad_norm": 1.6755330097592833, + "learning_rate": 3.8861494519460865e-07, + "loss": 0.6397, + "step": 8982 + }, + { + "epoch": 0.91, + "grad_norm": 1.6342444584998612, + "learning_rate": 3.87705977663595e-07, + "loss": 0.6957, + "step": 8983 + }, + { + "epoch": 0.91, + "grad_norm": 1.619564578301826, + "learning_rate": 3.867980533967786e-07, + "loss": 0.6582, + "step": 8984 + }, + { + "epoch": 0.91, + "grad_norm": 1.5857180219289881, + "learning_rate": 3.858911724927006e-07, + "loss": 0.6964, + "step": 8985 + }, + { + "epoch": 0.91, + "grad_norm": 1.6850104647723942, + "learning_rate": 3.849853350497879e-07, + "loss": 0.7092, + "step": 8986 + }, + { + "epoch": 0.91, + "grad_norm": 1.6564782792292456, + "learning_rate": 3.84080541166354e-07, + "loss": 0.7242, + "step": 8987 + }, + { + "epoch": 0.91, + "grad_norm": 1.7271970757460506, + "learning_rate": 3.831767909406026e-07, + "loss": 0.6899, + "step": 8988 + }, + { + "epoch": 0.91, + "grad_norm": 1.8320539782625036, + "learning_rate": 3.8227408447061853e-07, + "loss": 0.774, + "step": 8989 + }, + { + "epoch": 0.91, + "grad_norm": 1.8335983391878803, + "learning_rate": 3.813724218543802e-07, + "loss": 0.7207, + "step": 8990 + }, + { + "epoch": 0.91, + "grad_norm": 1.6308164696229996, + "learning_rate": 3.8047180318974474e-07, + "loss": 0.7297, + "step": 8991 + }, + { + "epoch": 0.91, + "grad_norm": 1.3703166795086241, + "learning_rate": 3.7957222857446297e-07, + "loss": 0.6287, + "step": 8992 + }, + { + "epoch": 0.91, + "grad_norm": 1.7678726313798627, + "learning_rate": 3.7867369810617114e-07, + "loss": 0.7225, + "step": 8993 + }, + { + "epoch": 0.91, + "grad_norm": 1.561412053470494, + "learning_rate": 3.777762118823869e-07, + "loss": 0.6146, + "step": 8994 + }, + { + "epoch": 0.91, + "grad_norm": 1.7741039057431165, + "learning_rate": 3.768797700005211e-07, + "loss": 0.7123, + "step": 8995 + }, + { + "epoch": 0.92, + "grad_norm": 1.809479864639746, + "learning_rate": 3.75984372557866e-07, + "loss": 0.7771, + "step": 8996 + }, + { + "epoch": 0.92, + "grad_norm": 2.029375389380463, + "learning_rate": 3.7509001965160494e-07, + "loss": 0.6924, + "step": 8997 + }, + { + "epoch": 0.92, + "grad_norm": 1.7141203015363913, + "learning_rate": 3.74196711378807e-07, + "loss": 0.6498, + "step": 8998 + }, + { + "epoch": 0.92, + "grad_norm": 1.5722400717490779, + "learning_rate": 3.733044478364234e-07, + "loss": 0.6627, + "step": 8999 + }, + { + "epoch": 0.92, + "grad_norm": 1.7598986440797793, + "learning_rate": 3.72413229121299e-07, + "loss": 0.6553, + "step": 9000 + }, + { + "epoch": 0.92, + "grad_norm": 1.7653352445711585, + "learning_rate": 3.715230553301585e-07, + "loss": 0.689, + "step": 9001 + }, + { + "epoch": 0.92, + "grad_norm": 1.887705746303693, + "learning_rate": 3.706339265596182e-07, + "loss": 0.8099, + "step": 9002 + }, + { + "epoch": 0.92, + "grad_norm": 1.5972384367430816, + "learning_rate": 3.6974584290617955e-07, + "loss": 0.6685, + "step": 9003 + }, + { + "epoch": 0.92, + "grad_norm": 1.7070696333151447, + "learning_rate": 3.688588044662289e-07, + "loss": 0.6961, + "step": 9004 + }, + { + "epoch": 0.92, + "grad_norm": 1.7616232130572633, + "learning_rate": 3.6797281133603926e-07, + "loss": 0.6333, + "step": 9005 + }, + { + "epoch": 0.92, + "grad_norm": 1.7008064665959346, + "learning_rate": 3.6708786361177586e-07, + "loss": 0.725, + "step": 9006 + }, + { + "epoch": 0.92, + "grad_norm": 1.6544849763586182, + "learning_rate": 3.662039613894808e-07, + "loss": 0.6224, + "step": 9007 + }, + { + "epoch": 0.92, + "grad_norm": 1.6644316689410115, + "learning_rate": 3.6532110476509064e-07, + "loss": 0.6801, + "step": 9008 + }, + { + "epoch": 0.92, + "grad_norm": 1.7592494890729364, + "learning_rate": 3.644392938344254e-07, + "loss": 0.7281, + "step": 9009 + }, + { + "epoch": 0.92, + "grad_norm": 1.7409770219731049, + "learning_rate": 3.6355852869318976e-07, + "loss": 0.6187, + "step": 9010 + }, + { + "epoch": 0.92, + "grad_norm": 1.7447307456725696, + "learning_rate": 3.6267880943698153e-07, + "loss": 0.6459, + "step": 9011 + }, + { + "epoch": 0.92, + "grad_norm": 1.76726525973141, + "learning_rate": 3.6180013616127554e-07, + "loss": 0.7181, + "step": 9012 + }, + { + "epoch": 0.92, + "grad_norm": 1.74741714153782, + "learning_rate": 3.6092250896143986e-07, + "loss": 0.6133, + "step": 9013 + }, + { + "epoch": 0.92, + "grad_norm": 1.8751732333083337, + "learning_rate": 3.6004592793272954e-07, + "loss": 0.7615, + "step": 9014 + }, + { + "epoch": 0.92, + "grad_norm": 1.6830093484207342, + "learning_rate": 3.5917039317028057e-07, + "loss": 0.7213, + "step": 9015 + }, + { + "epoch": 0.92, + "grad_norm": 1.7216518180084097, + "learning_rate": 3.5829590476911925e-07, + "loss": 0.8031, + "step": 9016 + }, + { + "epoch": 0.92, + "grad_norm": 1.6066940758025394, + "learning_rate": 3.574224628241596e-07, + "loss": 0.6596, + "step": 9017 + }, + { + "epoch": 0.92, + "grad_norm": 1.741718527853461, + "learning_rate": 3.5655006743019695e-07, + "loss": 0.6479, + "step": 9018 + }, + { + "epoch": 0.92, + "grad_norm": 1.956259263388941, + "learning_rate": 3.556787186819177e-07, + "loss": 0.7758, + "step": 9019 + }, + { + "epoch": 0.92, + "grad_norm": 1.664427510137072, + "learning_rate": 3.548084166738952e-07, + "loss": 0.682, + "step": 9020 + }, + { + "epoch": 0.92, + "grad_norm": 1.393762831262044, + "learning_rate": 3.5393916150058274e-07, + "loss": 0.5206, + "step": 9021 + }, + { + "epoch": 0.92, + "grad_norm": 1.6615508318160366, + "learning_rate": 3.5307095325632814e-07, + "loss": 0.7007, + "step": 9022 + }, + { + "epoch": 0.92, + "grad_norm": 1.8205337937904904, + "learning_rate": 3.522037920353605e-07, + "loss": 0.669, + "step": 9023 + }, + { + "epoch": 0.92, + "grad_norm": 1.8779304425650445, + "learning_rate": 3.5133767793179676e-07, + "loss": 0.7525, + "step": 9024 + }, + { + "epoch": 0.92, + "grad_norm": 1.7115456730628904, + "learning_rate": 3.5047261103963837e-07, + "loss": 0.6273, + "step": 9025 + }, + { + "epoch": 0.92, + "grad_norm": 1.7026022015340625, + "learning_rate": 3.49608591452778e-07, + "loss": 0.7278, + "step": 9026 + }, + { + "epoch": 0.92, + "grad_norm": 1.5533481880995001, + "learning_rate": 3.4874561926498964e-07, + "loss": 0.52, + "step": 9027 + }, + { + "epoch": 0.92, + "grad_norm": 1.6861445365830685, + "learning_rate": 3.478836945699349e-07, + "loss": 0.706, + "step": 9028 + }, + { + "epoch": 0.92, + "grad_norm": 1.7246204553515043, + "learning_rate": 3.470228174611634e-07, + "loss": 0.6582, + "step": 9029 + }, + { + "epoch": 0.92, + "grad_norm": 1.7259912400155208, + "learning_rate": 3.461629880321082e-07, + "loss": 0.6964, + "step": 9030 + }, + { + "epoch": 0.92, + "grad_norm": 1.5258391955713864, + "learning_rate": 3.4530420637609365e-07, + "loss": 0.6223, + "step": 9031 + }, + { + "epoch": 0.92, + "grad_norm": 1.9501077686189958, + "learning_rate": 3.444464725863228e-07, + "loss": 0.7221, + "step": 9032 + }, + { + "epoch": 0.92, + "grad_norm": 1.519065257951722, + "learning_rate": 3.435897867558924e-07, + "loss": 0.7602, + "step": 9033 + }, + { + "epoch": 0.92, + "grad_norm": 1.6559610091291315, + "learning_rate": 3.4273414897778133e-07, + "loss": 0.6338, + "step": 9034 + }, + { + "epoch": 0.92, + "grad_norm": 1.8860713373495857, + "learning_rate": 3.418795593448554e-07, + "loss": 0.6843, + "step": 9035 + }, + { + "epoch": 0.92, + "grad_norm": 1.7613812992796087, + "learning_rate": 3.4102601794986813e-07, + "loss": 0.6712, + "step": 9036 + }, + { + "epoch": 0.92, + "grad_norm": 1.6267258861120946, + "learning_rate": 3.401735248854554e-07, + "loss": 0.6966, + "step": 9037 + }, + { + "epoch": 0.92, + "grad_norm": 1.7168945312821067, + "learning_rate": 3.3932208024414435e-07, + "loss": 0.8096, + "step": 9038 + }, + { + "epoch": 0.92, + "grad_norm": 1.8824721722015318, + "learning_rate": 3.3847168411834666e-07, + "loss": 0.7043, + "step": 9039 + }, + { + "epoch": 0.92, + "grad_norm": 1.644816582797017, + "learning_rate": 3.3762233660035724e-07, + "loss": 0.7229, + "step": 9040 + }, + { + "epoch": 0.92, + "grad_norm": 1.6122426967789936, + "learning_rate": 3.367740377823603e-07, + "loss": 0.6559, + "step": 9041 + }, + { + "epoch": 0.92, + "grad_norm": 1.7948017004198267, + "learning_rate": 3.3592678775642653e-07, + "loss": 0.7649, + "step": 9042 + }, + { + "epoch": 0.92, + "grad_norm": 1.6036517898197797, + "learning_rate": 3.35080586614509e-07, + "loss": 0.7244, + "step": 9043 + }, + { + "epoch": 0.92, + "grad_norm": 1.5588599713340119, + "learning_rate": 3.342354344484533e-07, + "loss": 0.6241, + "step": 9044 + }, + { + "epoch": 0.92, + "grad_norm": 1.7085589108728771, + "learning_rate": 3.333913313499848e-07, + "loss": 0.642, + "step": 9045 + }, + { + "epoch": 0.92, + "grad_norm": 1.756713623597841, + "learning_rate": 3.3254827741071806e-07, + "loss": 0.6681, + "step": 9046 + }, + { + "epoch": 0.92, + "grad_norm": 1.4899810166224117, + "learning_rate": 3.3170627272215427e-07, + "loss": 0.6863, + "step": 9047 + }, + { + "epoch": 0.92, + "grad_norm": 1.531364479212317, + "learning_rate": 3.308653173756793e-07, + "loss": 0.755, + "step": 9048 + }, + { + "epoch": 0.92, + "grad_norm": 1.5168149460058475, + "learning_rate": 3.300254114625656e-07, + "loss": 0.7084, + "step": 9049 + }, + { + "epoch": 0.92, + "grad_norm": 1.7459672256196828, + "learning_rate": 3.2918655507397144e-07, + "loss": 0.7533, + "step": 9050 + }, + { + "epoch": 0.92, + "grad_norm": 1.7158933208599039, + "learning_rate": 3.283487483009429e-07, + "loss": 0.6339, + "step": 9051 + }, + { + "epoch": 0.92, + "grad_norm": 1.7430776876204692, + "learning_rate": 3.2751199123441046e-07, + "loss": 0.6434, + "step": 9052 + }, + { + "epoch": 0.92, + "grad_norm": 1.5704904697330333, + "learning_rate": 3.2667628396518936e-07, + "loss": 0.5842, + "step": 9053 + }, + { + "epoch": 0.92, + "grad_norm": 1.7800244667694278, + "learning_rate": 3.258416265839848e-07, + "loss": 0.7669, + "step": 9054 + }, + { + "epoch": 0.92, + "grad_norm": 1.7638354518588866, + "learning_rate": 3.2500801918138425e-07, + "loss": 0.5612, + "step": 9055 + }, + { + "epoch": 0.92, + "grad_norm": 1.8344341984251336, + "learning_rate": 3.241754618478632e-07, + "loss": 0.647, + "step": 9056 + }, + { + "epoch": 0.92, + "grad_norm": 1.7039427641608778, + "learning_rate": 3.2334395467378266e-07, + "loss": 0.6021, + "step": 9057 + }, + { + "epoch": 0.92, + "grad_norm": 1.7322214223361194, + "learning_rate": 3.2251349774939046e-07, + "loss": 0.6456, + "step": 9058 + }, + { + "epoch": 0.92, + "grad_norm": 1.7517206466744195, + "learning_rate": 3.216840911648178e-07, + "loss": 0.7309, + "step": 9059 + }, + { + "epoch": 0.92, + "grad_norm": 1.6778998604568132, + "learning_rate": 3.2085573501008717e-07, + "loss": 0.6758, + "step": 9060 + }, + { + "epoch": 0.92, + "grad_norm": 1.8758051183890794, + "learning_rate": 3.200284293750999e-07, + "loss": 0.7864, + "step": 9061 + }, + { + "epoch": 0.92, + "grad_norm": 1.5772792146727868, + "learning_rate": 3.1920217434964985e-07, + "loss": 0.6732, + "step": 9062 + }, + { + "epoch": 0.92, + "grad_norm": 1.8914308924735, + "learning_rate": 3.1837697002341293e-07, + "loss": 0.6585, + "step": 9063 + }, + { + "epoch": 0.92, + "grad_norm": 1.7453450043856464, + "learning_rate": 3.1755281648595093e-07, + "loss": 0.7913, + "step": 9064 + }, + { + "epoch": 0.92, + "grad_norm": 1.3858743411899503, + "learning_rate": 3.1672971382671556e-07, + "loss": 0.6196, + "step": 9065 + }, + { + "epoch": 0.92, + "grad_norm": 1.8338183072323526, + "learning_rate": 3.159076621350399e-07, + "loss": 0.7064, + "step": 9066 + }, + { + "epoch": 0.92, + "grad_norm": 1.685855771910675, + "learning_rate": 3.1508666150014575e-07, + "loss": 0.6349, + "step": 9067 + }, + { + "epoch": 0.92, + "grad_norm": 1.6489397211084076, + "learning_rate": 3.142667120111387e-07, + "loss": 0.6519, + "step": 9068 + }, + { + "epoch": 0.92, + "grad_norm": 1.49196056325252, + "learning_rate": 3.134478137570118e-07, + "loss": 0.6841, + "step": 9069 + }, + { + "epoch": 0.92, + "grad_norm": 1.6392600812772116, + "learning_rate": 3.126299668266453e-07, + "loss": 0.6858, + "step": 9070 + }, + { + "epoch": 0.92, + "grad_norm": 1.6652442805073417, + "learning_rate": 3.1181317130880127e-07, + "loss": 0.6811, + "step": 9071 + }, + { + "epoch": 0.92, + "grad_norm": 1.708651031293616, + "learning_rate": 3.1099742729213235e-07, + "loss": 0.71, + "step": 9072 + }, + { + "epoch": 0.92, + "grad_norm": 1.754886721993718, + "learning_rate": 3.101827348651731e-07, + "loss": 0.6408, + "step": 9073 + }, + { + "epoch": 0.92, + "grad_norm": 1.6962840997056352, + "learning_rate": 3.093690941163452e-07, + "loss": 0.6917, + "step": 9074 + }, + { + "epoch": 0.92, + "grad_norm": 1.5700005927580967, + "learning_rate": 3.08556505133959e-07, + "loss": 0.6271, + "step": 9075 + }, + { + "epoch": 0.92, + "grad_norm": 1.581948249429852, + "learning_rate": 3.077449680062061e-07, + "loss": 0.6254, + "step": 9076 + }, + { + "epoch": 0.92, + "grad_norm": 1.51273905759815, + "learning_rate": 3.069344828211662e-07, + "loss": 0.6443, + "step": 9077 + }, + { + "epoch": 0.92, + "grad_norm": 1.7028985474900407, + "learning_rate": 3.061250496668078e-07, + "loss": 0.749, + "step": 9078 + }, + { + "epoch": 0.92, + "grad_norm": 1.6677452775571646, + "learning_rate": 3.053166686309783e-07, + "loss": 0.6592, + "step": 9079 + }, + { + "epoch": 0.92, + "grad_norm": 1.6062980603348134, + "learning_rate": 3.0450933980141763e-07, + "loss": 0.6491, + "step": 9080 + }, + { + "epoch": 0.92, + "grad_norm": 1.5340881022151536, + "learning_rate": 3.0370306326574673e-07, + "loss": 0.6637, + "step": 9081 + }, + { + "epoch": 0.92, + "grad_norm": 1.6672173189261768, + "learning_rate": 3.028978391114745e-07, + "loss": 0.7864, + "step": 9082 + }, + { + "epoch": 0.92, + "grad_norm": 1.736912952197448, + "learning_rate": 3.020936674259989e-07, + "loss": 0.6965, + "step": 9083 + }, + { + "epoch": 0.92, + "grad_norm": 1.688420218707243, + "learning_rate": 3.012905482965944e-07, + "loss": 0.6519, + "step": 9084 + }, + { + "epoch": 0.92, + "grad_norm": 1.735607676551278, + "learning_rate": 3.0048848181043256e-07, + "loss": 0.673, + "step": 9085 + }, + { + "epoch": 0.92, + "grad_norm": 1.7040912601415914, + "learning_rate": 2.996874680545603e-07, + "loss": 0.7363, + "step": 9086 + }, + { + "epoch": 0.92, + "grad_norm": 1.6373307067637413, + "learning_rate": 2.9888750711591805e-07, + "loss": 0.6322, + "step": 9087 + }, + { + "epoch": 0.92, + "grad_norm": 1.8062957317799482, + "learning_rate": 2.980885990813298e-07, + "loss": 0.6328, + "step": 9088 + }, + { + "epoch": 0.92, + "grad_norm": 1.5429622083207397, + "learning_rate": 2.972907440375017e-07, + "loss": 0.6703, + "step": 9089 + }, + { + "epoch": 0.92, + "grad_norm": 1.7175453590035683, + "learning_rate": 2.964939420710311e-07, + "loss": 0.7612, + "step": 9090 + }, + { + "epoch": 0.92, + "grad_norm": 1.6746756006078176, + "learning_rate": 2.956981932683967e-07, + "loss": 0.697, + "step": 9091 + }, + { + "epoch": 0.92, + "grad_norm": 1.7654789376597413, + "learning_rate": 2.949034977159648e-07, + "loss": 0.6529, + "step": 9092 + }, + { + "epoch": 0.92, + "grad_norm": 1.5521819010013271, + "learning_rate": 2.941098554999877e-07, + "loss": 0.6326, + "step": 9093 + }, + { + "epoch": 0.93, + "grad_norm": 1.7965320659002997, + "learning_rate": 2.933172667066031e-07, + "loss": 0.6972, + "step": 9094 + }, + { + "epoch": 0.93, + "grad_norm": 1.5800734066412943, + "learning_rate": 2.9252573142183327e-07, + "loss": 0.6148, + "step": 9095 + }, + { + "epoch": 0.93, + "grad_norm": 1.8429652754631338, + "learning_rate": 2.917352497315873e-07, + "loss": 0.6747, + "step": 9096 + }, + { + "epoch": 0.93, + "grad_norm": 1.6399270165705966, + "learning_rate": 2.9094582172165876e-07, + "loss": 0.6913, + "step": 9097 + }, + { + "epoch": 0.93, + "grad_norm": 1.6281374340831094, + "learning_rate": 2.9015744747773024e-07, + "loss": 0.7202, + "step": 9098 + }, + { + "epoch": 0.93, + "grad_norm": 1.552728970873856, + "learning_rate": 2.893701270853655e-07, + "loss": 0.752, + "step": 9099 + }, + { + "epoch": 0.93, + "grad_norm": 1.65165872718086, + "learning_rate": 2.885838606300151e-07, + "loss": 0.6869, + "step": 9100 + }, + { + "epoch": 0.93, + "grad_norm": 1.7091754810620192, + "learning_rate": 2.8779864819701853e-07, + "loss": 0.6345, + "step": 9101 + }, + { + "epoch": 0.93, + "grad_norm": 1.6686871905612022, + "learning_rate": 2.8701448987159654e-07, + "loss": 0.5509, + "step": 9102 + }, + { + "epoch": 0.93, + "grad_norm": 1.8173373699523554, + "learning_rate": 2.8623138573885767e-07, + "loss": 0.7899, + "step": 9103 + }, + { + "epoch": 0.93, + "grad_norm": 1.7692576744780697, + "learning_rate": 2.854493358837951e-07, + "loss": 0.6359, + "step": 9104 + }, + { + "epoch": 0.93, + "grad_norm": 1.8505447600940077, + "learning_rate": 2.8466834039128754e-07, + "loss": 0.6465, + "step": 9105 + }, + { + "epoch": 0.93, + "grad_norm": 1.7751451897404646, + "learning_rate": 2.838883993461028e-07, + "loss": 0.7206, + "step": 9106 + }, + { + "epoch": 0.93, + "grad_norm": 1.8723784794316785, + "learning_rate": 2.831095128328876e-07, + "loss": 0.6643, + "step": 9107 + }, + { + "epoch": 0.93, + "grad_norm": 1.6200263587394934, + "learning_rate": 2.823316809361809e-07, + "loss": 0.5683, + "step": 9108 + }, + { + "epoch": 0.93, + "grad_norm": 1.702563720865403, + "learning_rate": 2.8155490374040196e-07, + "loss": 0.7154, + "step": 9109 + }, + { + "epoch": 0.93, + "grad_norm": 1.7451961706558843, + "learning_rate": 2.807791813298588e-07, + "loss": 0.7522, + "step": 9110 + }, + { + "epoch": 0.93, + "grad_norm": 1.6920928919156772, + "learning_rate": 2.8000451378874525e-07, + "loss": 0.6407, + "step": 9111 + }, + { + "epoch": 0.93, + "grad_norm": 1.7605926944594885, + "learning_rate": 2.7923090120113626e-07, + "loss": 0.6672, + "step": 9112 + }, + { + "epoch": 0.93, + "grad_norm": 1.5573327560649162, + "learning_rate": 2.7845834365099913e-07, + "loss": 0.6749, + "step": 9113 + }, + { + "epoch": 0.93, + "grad_norm": 1.616019278808511, + "learning_rate": 2.7768684122217893e-07, + "loss": 0.6522, + "step": 9114 + }, + { + "epoch": 0.93, + "grad_norm": 1.7863348392332858, + "learning_rate": 2.769163939984121e-07, + "loss": 0.6702, + "step": 9115 + }, + { + "epoch": 0.93, + "grad_norm": 1.9669771441858406, + "learning_rate": 2.7614700206332056e-07, + "loss": 0.7218, + "step": 9116 + }, + { + "epoch": 0.93, + "grad_norm": 1.6853909847735868, + "learning_rate": 2.7537866550040647e-07, + "loss": 0.5787, + "step": 9117 + }, + { + "epoch": 0.93, + "grad_norm": 1.7771673876183924, + "learning_rate": 2.74611384393062e-07, + "loss": 0.698, + "step": 9118 + }, + { + "epoch": 0.93, + "grad_norm": 1.5863480135223418, + "learning_rate": 2.7384515882456386e-07, + "loss": 0.685, + "step": 9119 + }, + { + "epoch": 0.93, + "grad_norm": 1.6153180520052017, + "learning_rate": 2.730799888780744e-07, + "loss": 0.6658, + "step": 9120 + }, + { + "epoch": 0.93, + "grad_norm": 1.5779497539387755, + "learning_rate": 2.7231587463663945e-07, + "loss": 0.627, + "step": 9121 + }, + { + "epoch": 0.93, + "grad_norm": 1.8041035551184352, + "learning_rate": 2.7155281618319265e-07, + "loss": 0.7443, + "step": 9122 + }, + { + "epoch": 0.93, + "grad_norm": 1.8765336984772065, + "learning_rate": 2.7079081360055106e-07, + "loss": 0.6708, + "step": 9123 + }, + { + "epoch": 0.93, + "grad_norm": 1.5160141983328415, + "learning_rate": 2.7002986697141966e-07, + "loss": 0.5983, + "step": 9124 + }, + { + "epoch": 0.93, + "grad_norm": 1.854303661617946, + "learning_rate": 2.692699763783868e-07, + "loss": 0.7064, + "step": 9125 + }, + { + "epoch": 0.93, + "grad_norm": 1.5622552568194452, + "learning_rate": 2.685111419039255e-07, + "loss": 0.6858, + "step": 9126 + }, + { + "epoch": 0.93, + "grad_norm": 1.5987403896005212, + "learning_rate": 2.677533636303964e-07, + "loss": 0.6825, + "step": 9127 + }, + { + "epoch": 0.93, + "grad_norm": 1.6486100461672395, + "learning_rate": 2.669966416400449e-07, + "loss": 0.7009, + "step": 9128 + }, + { + "epoch": 0.93, + "grad_norm": 1.8947303431333133, + "learning_rate": 2.6624097601499976e-07, + "loss": 0.7161, + "step": 9129 + }, + { + "epoch": 0.93, + "grad_norm": 1.6099671906456852, + "learning_rate": 2.654863668372787e-07, + "loss": 0.5869, + "step": 9130 + }, + { + "epoch": 0.93, + "grad_norm": 1.8616912431349213, + "learning_rate": 2.6473281418878173e-07, + "loss": 0.6781, + "step": 9131 + }, + { + "epoch": 0.93, + "grad_norm": 1.87333267111817, + "learning_rate": 2.6398031815129454e-07, + "loss": 0.8606, + "step": 9132 + }, + { + "epoch": 0.93, + "grad_norm": 1.7026026627353354, + "learning_rate": 2.6322887880649074e-07, + "loss": 0.5823, + "step": 9133 + }, + { + "epoch": 0.93, + "grad_norm": 1.663095785623857, + "learning_rate": 2.6247849623592504e-07, + "loss": 0.613, + "step": 9134 + }, + { + "epoch": 0.93, + "grad_norm": 1.7113272331280165, + "learning_rate": 2.617291705210412e-07, + "loss": 0.691, + "step": 9135 + }, + { + "epoch": 0.93, + "grad_norm": 1.8262977928445028, + "learning_rate": 2.6098090174316636e-07, + "loss": 0.6946, + "step": 9136 + }, + { + "epoch": 0.93, + "grad_norm": 1.7582436591373753, + "learning_rate": 2.602336899835134e-07, + "loss": 0.6886, + "step": 9137 + }, + { + "epoch": 0.93, + "grad_norm": 1.8419965747676532, + "learning_rate": 2.594875353231796e-07, + "loss": 0.7765, + "step": 9138 + }, + { + "epoch": 0.93, + "grad_norm": 1.622396277431157, + "learning_rate": 2.5874243784315025e-07, + "loss": 0.5949, + "step": 9139 + }, + { + "epoch": 0.93, + "grad_norm": 1.635845730948259, + "learning_rate": 2.579983976242928e-07, + "loss": 0.6032, + "step": 9140 + }, + { + "epoch": 0.93, + "grad_norm": 1.9015091477785202, + "learning_rate": 2.572554147473616e-07, + "loss": 0.7317, + "step": 9141 + }, + { + "epoch": 0.93, + "grad_norm": 1.6575673802244377, + "learning_rate": 2.565134892929955e-07, + "loss": 0.6806, + "step": 9142 + }, + { + "epoch": 0.93, + "grad_norm": 1.5589129509589466, + "learning_rate": 2.5577262134171887e-07, + "loss": 0.666, + "step": 9143 + }, + { + "epoch": 0.93, + "grad_norm": 1.7090507291266002, + "learning_rate": 2.5503281097394194e-07, + "loss": 0.6619, + "step": 9144 + }, + { + "epoch": 0.93, + "grad_norm": 1.7464178687329157, + "learning_rate": 2.5429405826995934e-07, + "loss": 0.688, + "step": 9145 + }, + { + "epoch": 0.93, + "grad_norm": 1.585521708580421, + "learning_rate": 2.5355636330994915e-07, + "loss": 0.6253, + "step": 9146 + }, + { + "epoch": 0.93, + "grad_norm": 1.6957372162455264, + "learning_rate": 2.5281972617398064e-07, + "loss": 0.7472, + "step": 9147 + }, + { + "epoch": 0.93, + "grad_norm": 1.7136949583543848, + "learning_rate": 2.52084146942001e-07, + "loss": 0.6604, + "step": 9148 + }, + { + "epoch": 0.93, + "grad_norm": 1.5735246095230109, + "learning_rate": 2.5134962569384746e-07, + "loss": 0.7197, + "step": 9149 + }, + { + "epoch": 0.93, + "grad_norm": 1.4968907551306718, + "learning_rate": 2.506161625092396e-07, + "loss": 0.5578, + "step": 9150 + }, + { + "epoch": 0.93, + "grad_norm": 1.9557448471366423, + "learning_rate": 2.498837574677837e-07, + "loss": 0.7901, + "step": 9151 + }, + { + "epoch": 0.93, + "grad_norm": 1.6133938766958, + "learning_rate": 2.491524106489718e-07, + "loss": 0.792, + "step": 9152 + }, + { + "epoch": 0.93, + "grad_norm": 1.734088575672457, + "learning_rate": 2.484221221321792e-07, + "loss": 0.7127, + "step": 9153 + }, + { + "epoch": 0.93, + "grad_norm": 1.7922240857189475, + "learning_rate": 2.47692891996667e-07, + "loss": 0.6896, + "step": 9154 + }, + { + "epoch": 0.93, + "grad_norm": 1.5268419657742744, + "learning_rate": 2.4696472032158305e-07, + "loss": 0.5813, + "step": 9155 + }, + { + "epoch": 0.93, + "grad_norm": 1.7239929918768928, + "learning_rate": 2.462376071859585e-07, + "loss": 0.7521, + "step": 9156 + }, + { + "epoch": 0.93, + "grad_norm": 1.6846588000318006, + "learning_rate": 2.4551155266871017e-07, + "loss": 0.6519, + "step": 9157 + }, + { + "epoch": 0.93, + "grad_norm": 1.8150620464602065, + "learning_rate": 2.447865568486385e-07, + "loss": 0.6218, + "step": 9158 + }, + { + "epoch": 0.93, + "grad_norm": 1.7133394743606516, + "learning_rate": 2.440626198044327e-07, + "loss": 0.6594, + "step": 9159 + }, + { + "epoch": 0.93, + "grad_norm": 1.9240325579831237, + "learning_rate": 2.4333974161466324e-07, + "loss": 0.7677, + "step": 9160 + }, + { + "epoch": 0.93, + "grad_norm": 1.8672205907680561, + "learning_rate": 2.4261792235778737e-07, + "loss": 0.6923, + "step": 9161 + }, + { + "epoch": 0.93, + "grad_norm": 1.5890577955916911, + "learning_rate": 2.418971621121491e-07, + "loss": 0.6073, + "step": 9162 + }, + { + "epoch": 0.93, + "grad_norm": 1.6869253072456738, + "learning_rate": 2.411774609559725e-07, + "loss": 0.6317, + "step": 9163 + }, + { + "epoch": 0.93, + "grad_norm": 1.6117386715023836, + "learning_rate": 2.404588189673718e-07, + "loss": 0.5953, + "step": 9164 + }, + { + "epoch": 0.93, + "grad_norm": 1.695397700532162, + "learning_rate": 2.3974123622434566e-07, + "loss": 0.6871, + "step": 9165 + }, + { + "epoch": 0.93, + "grad_norm": 1.6048962661185335, + "learning_rate": 2.390247128047729e-07, + "loss": 0.6849, + "step": 9166 + }, + { + "epoch": 0.93, + "grad_norm": 1.4696291515897013, + "learning_rate": 2.383092487864247e-07, + "loss": 0.6443, + "step": 9167 + }, + { + "epoch": 0.93, + "grad_norm": 1.4987936697717255, + "learning_rate": 2.3759484424695113e-07, + "loss": 0.6706, + "step": 9168 + }, + { + "epoch": 0.93, + "grad_norm": 1.7643556140864138, + "learning_rate": 2.3688149926389015e-07, + "loss": 0.6497, + "step": 9169 + }, + { + "epoch": 0.93, + "grad_norm": 1.6598835699626207, + "learning_rate": 2.361692139146643e-07, + "loss": 0.6873, + "step": 9170 + }, + { + "epoch": 0.93, + "grad_norm": 1.6054352567438275, + "learning_rate": 2.354579882765806e-07, + "loss": 0.6001, + "step": 9171 + }, + { + "epoch": 0.93, + "grad_norm": 1.8758251422175432, + "learning_rate": 2.3474782242683292e-07, + "loss": 0.7149, + "step": 9172 + }, + { + "epoch": 0.93, + "grad_norm": 2.1173561572795223, + "learning_rate": 2.3403871644249731e-07, + "loss": 0.6189, + "step": 9173 + }, + { + "epoch": 0.93, + "grad_norm": 1.6994143128015073, + "learning_rate": 2.3333067040053558e-07, + "loss": 0.6513, + "step": 9174 + }, + { + "epoch": 0.93, + "grad_norm": 1.6467310548387322, + "learning_rate": 2.3262368437779736e-07, + "loss": 0.7023, + "step": 9175 + }, + { + "epoch": 0.93, + "grad_norm": 1.601730453794561, + "learning_rate": 2.3191775845101238e-07, + "loss": 0.6781, + "step": 9176 + }, + { + "epoch": 0.93, + "grad_norm": 1.6270212271280904, + "learning_rate": 2.3121289269679937e-07, + "loss": 0.5984, + "step": 9177 + }, + { + "epoch": 0.93, + "grad_norm": 1.6570884566322033, + "learning_rate": 2.3050908719166155e-07, + "loss": 0.6897, + "step": 9178 + }, + { + "epoch": 0.93, + "grad_norm": 1.6162197492295876, + "learning_rate": 2.2980634201198227e-07, + "loss": 0.6794, + "step": 9179 + }, + { + "epoch": 0.93, + "grad_norm": 1.8218868665569699, + "learning_rate": 2.2910465723403717e-07, + "loss": 0.7722, + "step": 9180 + }, + { + "epoch": 0.93, + "grad_norm": 1.6672965841302891, + "learning_rate": 2.2840403293398095e-07, + "loss": 0.7197, + "step": 9181 + }, + { + "epoch": 0.93, + "grad_norm": 1.7340015665776833, + "learning_rate": 2.2770446918785716e-07, + "loss": 0.7679, + "step": 9182 + }, + { + "epoch": 0.93, + "grad_norm": 1.6038653974037864, + "learning_rate": 2.2700596607159175e-07, + "loss": 0.6896, + "step": 9183 + }, + { + "epoch": 0.93, + "grad_norm": 1.7365201637290837, + "learning_rate": 2.263085236609952e-07, + "loss": 0.6372, + "step": 9184 + }, + { + "epoch": 0.93, + "grad_norm": 1.7596204753893094, + "learning_rate": 2.2561214203176583e-07, + "loss": 0.7103, + "step": 9185 + }, + { + "epoch": 0.93, + "grad_norm": 1.7182996150070722, + "learning_rate": 2.249168212594832e-07, + "loss": 0.6525, + "step": 9186 + }, + { + "epoch": 0.93, + "grad_norm": 1.8044017196554798, + "learning_rate": 2.2422256141961473e-07, + "loss": 0.7075, + "step": 9187 + }, + { + "epoch": 0.93, + "grad_norm": 1.6296108621493695, + "learning_rate": 2.2352936258751235e-07, + "loss": 0.5933, + "step": 9188 + }, + { + "epoch": 0.93, + "grad_norm": 1.587229110833575, + "learning_rate": 2.228372248384092e-07, + "loss": 0.6692, + "step": 9189 + }, + { + "epoch": 0.93, + "grad_norm": 1.4784135454097325, + "learning_rate": 2.2214614824742853e-07, + "loss": 0.6358, + "step": 9190 + }, + { + "epoch": 0.93, + "grad_norm": 1.7090710335488055, + "learning_rate": 2.214561328895748e-07, + "loss": 0.6261, + "step": 9191 + }, + { + "epoch": 0.94, + "grad_norm": 1.842598291770892, + "learning_rate": 2.2076717883973808e-07, + "loss": 0.6602, + "step": 9192 + }, + { + "epoch": 0.94, + "grad_norm": 1.7598264214121744, + "learning_rate": 2.2007928617269414e-07, + "loss": 0.6436, + "step": 9193 + }, + { + "epoch": 0.94, + "grad_norm": 1.8035073777880144, + "learning_rate": 2.1939245496310324e-07, + "loss": 0.7111, + "step": 9194 + }, + { + "epoch": 0.94, + "grad_norm": 1.6420986225961434, + "learning_rate": 2.187066852855091e-07, + "loss": 0.6878, + "step": 9195 + }, + { + "epoch": 0.94, + "grad_norm": 1.6271321324358616, + "learning_rate": 2.1802197721434215e-07, + "loss": 0.6896, + "step": 9196 + }, + { + "epoch": 0.94, + "grad_norm": 1.7339393507898984, + "learning_rate": 2.1733833082391632e-07, + "loss": 0.7423, + "step": 9197 + }, + { + "epoch": 0.94, + "grad_norm": 1.8347943762378955, + "learning_rate": 2.166557461884322e-07, + "loss": 0.7663, + "step": 9198 + }, + { + "epoch": 0.94, + "grad_norm": 1.6972038780825494, + "learning_rate": 2.1597422338197172e-07, + "loss": 0.7218, + "step": 9199 + }, + { + "epoch": 0.94, + "grad_norm": 2.034996875430456, + "learning_rate": 2.1529376247850342e-07, + "loss": 0.6533, + "step": 9200 + }, + { + "epoch": 0.94, + "grad_norm": 1.603699345889999, + "learning_rate": 2.146143635518827e-07, + "loss": 0.7444, + "step": 9201 + }, + { + "epoch": 0.94, + "grad_norm": 1.763026468203593, + "learning_rate": 2.139360266758461e-07, + "loss": 0.6964, + "step": 9202 + }, + { + "epoch": 0.94, + "grad_norm": 1.5337279377060986, + "learning_rate": 2.13258751924017e-07, + "loss": 0.7364, + "step": 9203 + }, + { + "epoch": 0.94, + "grad_norm": 1.6065746501014624, + "learning_rate": 2.1258253936990213e-07, + "loss": 0.7114, + "step": 9204 + }, + { + "epoch": 0.94, + "grad_norm": 1.6610368388773546, + "learning_rate": 2.1190738908689502e-07, + "loss": 0.63, + "step": 9205 + }, + { + "epoch": 0.94, + "grad_norm": 1.6533510918146666, + "learning_rate": 2.1123330114827256e-07, + "loss": 0.6356, + "step": 9206 + }, + { + "epoch": 0.94, + "grad_norm": 1.7435137130120424, + "learning_rate": 2.1056027562719517e-07, + "loss": 0.7129, + "step": 9207 + }, + { + "epoch": 0.94, + "grad_norm": 1.5918102940992866, + "learning_rate": 2.0988831259670994e-07, + "loss": 0.5952, + "step": 9208 + }, + { + "epoch": 0.94, + "grad_norm": 1.7703162917156607, + "learning_rate": 2.0921741212974967e-07, + "loss": 0.6956, + "step": 9209 + }, + { + "epoch": 0.94, + "grad_norm": 1.725491826595931, + "learning_rate": 2.085475742991272e-07, + "loss": 0.6831, + "step": 9210 + }, + { + "epoch": 0.94, + "grad_norm": 1.522459509380362, + "learning_rate": 2.078787991775455e-07, + "loss": 0.5566, + "step": 9211 + }, + { + "epoch": 0.94, + "grad_norm": 1.5164115294750733, + "learning_rate": 2.072110868375876e-07, + "loss": 0.629, + "step": 9212 + }, + { + "epoch": 0.94, + "grad_norm": 1.6799430968173943, + "learning_rate": 2.0654443735172447e-07, + "loss": 0.6195, + "step": 9213 + }, + { + "epoch": 0.94, + "grad_norm": 1.7529038428627008, + "learning_rate": 2.058788507923104e-07, + "loss": 0.6232, + "step": 9214 + }, + { + "epoch": 0.94, + "grad_norm": 1.6082086073174993, + "learning_rate": 2.052143272315843e-07, + "loss": 0.6703, + "step": 9215 + }, + { + "epoch": 0.94, + "grad_norm": 1.6880699939080532, + "learning_rate": 2.0455086674166957e-07, + "loss": 0.7222, + "step": 9216 + }, + { + "epoch": 0.94, + "grad_norm": 1.6500289904255732, + "learning_rate": 2.038884693945742e-07, + "loss": 0.6659, + "step": 9217 + }, + { + "epoch": 0.94, + "grad_norm": 1.5552756948520066, + "learning_rate": 2.0322713526219062e-07, + "loss": 0.6413, + "step": 9218 + }, + { + "epoch": 0.94, + "grad_norm": 1.6993894982632298, + "learning_rate": 2.0256686441629814e-07, + "loss": 0.694, + "step": 9219 + }, + { + "epoch": 0.94, + "grad_norm": 1.7077427956672064, + "learning_rate": 2.019076569285583e-07, + "loss": 0.6796, + "step": 9220 + }, + { + "epoch": 0.94, + "grad_norm": 1.700456720601353, + "learning_rate": 2.0124951287051718e-07, + "loss": 0.6839, + "step": 9221 + }, + { + "epoch": 0.94, + "grad_norm": 1.7073823170278, + "learning_rate": 2.0059243231360437e-07, + "loss": 0.63, + "step": 9222 + }, + { + "epoch": 0.94, + "grad_norm": 1.7442798621212803, + "learning_rate": 1.999364153291383e-07, + "loss": 0.6994, + "step": 9223 + }, + { + "epoch": 0.94, + "grad_norm": 2.003575890582916, + "learning_rate": 1.9928146198831987e-07, + "loss": 0.7964, + "step": 9224 + }, + { + "epoch": 0.94, + "grad_norm": 1.7730842780328422, + "learning_rate": 1.9862757236223108e-07, + "loss": 0.7864, + "step": 9225 + }, + { + "epoch": 0.94, + "grad_norm": 1.5536572444139543, + "learning_rate": 1.979747465218429e-07, + "loss": 0.6897, + "step": 9226 + }, + { + "epoch": 0.94, + "grad_norm": 1.7481819975287163, + "learning_rate": 1.9732298453801092e-07, + "loss": 0.6845, + "step": 9227 + }, + { + "epoch": 0.94, + "grad_norm": 1.744509227646625, + "learning_rate": 1.9667228648147074e-07, + "loss": 0.6998, + "step": 9228 + }, + { + "epoch": 0.94, + "grad_norm": 1.7519756322022366, + "learning_rate": 1.9602265242284813e-07, + "loss": 0.7248, + "step": 9229 + }, + { + "epoch": 0.94, + "grad_norm": 1.6632754313037552, + "learning_rate": 1.9537408243264888e-07, + "loss": 0.673, + "step": 9230 + }, + { + "epoch": 0.94, + "grad_norm": 1.8540429133686656, + "learning_rate": 1.947265765812656e-07, + "loss": 0.7019, + "step": 9231 + }, + { + "epoch": 0.94, + "grad_norm": 1.7358934404834045, + "learning_rate": 1.9408013493897537e-07, + "loss": 0.7499, + "step": 9232 + }, + { + "epoch": 0.94, + "grad_norm": 1.579191613274508, + "learning_rate": 1.9343475757593987e-07, + "loss": 0.6206, + "step": 9233 + }, + { + "epoch": 0.94, + "grad_norm": 2.1684428344582893, + "learning_rate": 1.9279044456220307e-07, + "loss": 0.6809, + "step": 9234 + }, + { + "epoch": 0.94, + "grad_norm": 1.7471847525721256, + "learning_rate": 1.921471959676957e-07, + "loss": 0.6492, + "step": 9235 + }, + { + "epoch": 0.94, + "grad_norm": 1.679965055034842, + "learning_rate": 1.91505011862233e-07, + "loss": 0.6013, + "step": 9236 + }, + { + "epoch": 0.94, + "grad_norm": 1.6388236682172188, + "learning_rate": 1.908638923155126e-07, + "loss": 0.6785, + "step": 9237 + }, + { + "epoch": 0.94, + "grad_norm": 1.5508458601759716, + "learning_rate": 1.9022383739712104e-07, + "loss": 0.7022, + "step": 9238 + }, + { + "epoch": 0.94, + "grad_norm": 1.7548631919890791, + "learning_rate": 1.895848471765227e-07, + "loss": 0.733, + "step": 9239 + }, + { + "epoch": 0.94, + "grad_norm": 1.663254244472624, + "learning_rate": 1.8894692172307106e-07, + "loss": 0.6702, + "step": 9240 + }, + { + "epoch": 0.94, + "grad_norm": 1.6462632236988877, + "learning_rate": 1.8831006110600404e-07, + "loss": 0.8247, + "step": 9241 + }, + { + "epoch": 0.94, + "grad_norm": 1.6436328578247896, + "learning_rate": 1.8767426539444188e-07, + "loss": 0.6637, + "step": 9242 + }, + { + "epoch": 0.94, + "grad_norm": 1.692688762963928, + "learning_rate": 1.8703953465739055e-07, + "loss": 0.735, + "step": 9243 + }, + { + "epoch": 0.94, + "grad_norm": 1.6500218020690514, + "learning_rate": 1.8640586896374157e-07, + "loss": 0.5977, + "step": 9244 + }, + { + "epoch": 0.94, + "grad_norm": 1.7399479592499723, + "learning_rate": 1.857732683822666e-07, + "loss": 0.6792, + "step": 9245 + }, + { + "epoch": 0.94, + "grad_norm": 1.8599038601472067, + "learning_rate": 1.851417329816263e-07, + "loss": 0.6901, + "step": 9246 + }, + { + "epoch": 0.94, + "grad_norm": 1.572795620263803, + "learning_rate": 1.8451126283036358e-07, + "loss": 0.5979, + "step": 9247 + }, + { + "epoch": 0.94, + "grad_norm": 1.7503420322332646, + "learning_rate": 1.8388185799690705e-07, + "loss": 0.7422, + "step": 9248 + }, + { + "epoch": 0.94, + "grad_norm": 1.8567826988069571, + "learning_rate": 1.8325351854956652e-07, + "loss": 0.7061, + "step": 9249 + }, + { + "epoch": 0.94, + "grad_norm": 1.7429096529080754, + "learning_rate": 1.8262624455654077e-07, + "loss": 0.65, + "step": 9250 + }, + { + "epoch": 0.94, + "grad_norm": 1.6255268881969265, + "learning_rate": 1.8200003608590977e-07, + "loss": 0.6084, + "step": 9251 + }, + { + "epoch": 0.94, + "grad_norm": 1.602696923514729, + "learning_rate": 1.8137489320563806e-07, + "loss": 0.6819, + "step": 9252 + }, + { + "epoch": 0.94, + "grad_norm": 1.8004520193464986, + "learning_rate": 1.807508159835758e-07, + "loss": 0.6275, + "step": 9253 + }, + { + "epoch": 0.94, + "grad_norm": 1.8567473176405347, + "learning_rate": 1.8012780448745548e-07, + "loss": 0.7466, + "step": 9254 + }, + { + "epoch": 0.94, + "grad_norm": 1.7660728132743209, + "learning_rate": 1.7950585878489856e-07, + "loss": 0.6696, + "step": 9255 + }, + { + "epoch": 0.94, + "grad_norm": 1.6680895285956447, + "learning_rate": 1.7888497894340328e-07, + "loss": 0.6764, + "step": 9256 + }, + { + "epoch": 0.94, + "grad_norm": 1.8890462461888413, + "learning_rate": 1.7826516503036018e-07, + "loss": 0.6705, + "step": 9257 + }, + { + "epoch": 0.94, + "grad_norm": 1.6092121708191458, + "learning_rate": 1.7764641711303764e-07, + "loss": 0.6481, + "step": 9258 + }, + { + "epoch": 0.94, + "grad_norm": 1.853872643159592, + "learning_rate": 1.77028735258592e-07, + "loss": 0.6965, + "step": 9259 + }, + { + "epoch": 0.94, + "grad_norm": 1.6994009901835503, + "learning_rate": 1.764121195340629e-07, + "loss": 0.657, + "step": 9260 + }, + { + "epoch": 0.94, + "grad_norm": 1.6002015651261523, + "learning_rate": 1.7579657000637464e-07, + "loss": 0.6356, + "step": 9261 + }, + { + "epoch": 0.94, + "grad_norm": 1.7882959951825168, + "learning_rate": 1.7518208674233595e-07, + "loss": 0.8028, + "step": 9262 + }, + { + "epoch": 0.94, + "grad_norm": 1.5715626462526937, + "learning_rate": 1.7456866980863795e-07, + "loss": 0.5792, + "step": 9263 + }, + { + "epoch": 0.94, + "grad_norm": 1.606745758826103, + "learning_rate": 1.7395631927185853e-07, + "loss": 0.6333, + "step": 9264 + }, + { + "epoch": 0.94, + "grad_norm": 1.588132832958181, + "learning_rate": 1.7334503519846002e-07, + "loss": 0.7629, + "step": 9265 + }, + { + "epoch": 0.94, + "grad_norm": 1.5870608189041842, + "learning_rate": 1.7273481765478383e-07, + "loss": 0.6958, + "step": 9266 + }, + { + "epoch": 0.94, + "grad_norm": 1.7650485962558329, + "learning_rate": 1.7212566670706366e-07, + "loss": 0.7763, + "step": 9267 + }, + { + "epoch": 0.94, + "grad_norm": 1.6304775041889885, + "learning_rate": 1.7151758242141102e-07, + "loss": 0.8069, + "step": 9268 + }, + { + "epoch": 0.94, + "grad_norm": 1.6757597922240886, + "learning_rate": 1.7091056486382428e-07, + "loss": 0.7548, + "step": 9269 + }, + { + "epoch": 0.94, + "grad_norm": 1.6025496608955452, + "learning_rate": 1.7030461410018628e-07, + "loss": 0.6505, + "step": 9270 + }, + { + "epoch": 0.94, + "grad_norm": 1.609210244897395, + "learning_rate": 1.696997301962633e-07, + "loss": 0.6027, + "step": 9271 + }, + { + "epoch": 0.94, + "grad_norm": 1.56754740181272, + "learning_rate": 1.69095913217705e-07, + "loss": 0.6646, + "step": 9272 + }, + { + "epoch": 0.94, + "grad_norm": 1.5314525225926743, + "learning_rate": 1.6849316323004794e-07, + "loss": 0.5952, + "step": 9273 + }, + { + "epoch": 0.94, + "grad_norm": 1.704004564930101, + "learning_rate": 1.6789148029871083e-07, + "loss": 0.7793, + "step": 9274 + }, + { + "epoch": 0.94, + "grad_norm": 1.5067426360899583, + "learning_rate": 1.672908644889959e-07, + "loss": 0.5874, + "step": 9275 + }, + { + "epoch": 0.94, + "grad_norm": 1.7121136462489004, + "learning_rate": 1.666913158660921e-07, + "loss": 0.6971, + "step": 9276 + }, + { + "epoch": 0.94, + "grad_norm": 1.5541399878664655, + "learning_rate": 1.6609283449506853e-07, + "loss": 0.6459, + "step": 9277 + }, + { + "epoch": 0.94, + "grad_norm": 1.651047465860301, + "learning_rate": 1.654954204408843e-07, + "loss": 0.7469, + "step": 9278 + }, + { + "epoch": 0.94, + "grad_norm": 1.7380456494653596, + "learning_rate": 1.6489907376837644e-07, + "loss": 0.6762, + "step": 9279 + }, + { + "epoch": 0.94, + "grad_norm": 1.8908809620929607, + "learning_rate": 1.643037945422721e-07, + "loss": 0.7793, + "step": 9280 + }, + { + "epoch": 0.94, + "grad_norm": 1.5530664329085104, + "learning_rate": 1.6370958282717619e-07, + "loss": 0.716, + "step": 9281 + }, + { + "epoch": 0.94, + "grad_norm": 1.9011341123308512, + "learning_rate": 1.6311643868758274e-07, + "loss": 0.6896, + "step": 9282 + }, + { + "epoch": 0.94, + "grad_norm": 1.7537647607023992, + "learning_rate": 1.6252436218786915e-07, + "loss": 0.712, + "step": 9283 + }, + { + "epoch": 0.94, + "grad_norm": 1.756264117673827, + "learning_rate": 1.619333533922951e-07, + "loss": 0.7068, + "step": 9284 + }, + { + "epoch": 0.94, + "grad_norm": 1.7453694164411129, + "learning_rate": 1.6134341236500373e-07, + "loss": 0.625, + "step": 9285 + }, + { + "epoch": 0.94, + "grad_norm": 1.612092023051016, + "learning_rate": 1.6075453917002827e-07, + "loss": 0.68, + "step": 9286 + }, + { + "epoch": 0.94, + "grad_norm": 1.4284971486950722, + "learning_rate": 1.6016673387127645e-07, + "loss": 0.6504, + "step": 9287 + }, + { + "epoch": 0.94, + "grad_norm": 1.8336605844552831, + "learning_rate": 1.595799965325495e-07, + "loss": 0.7225, + "step": 9288 + }, + { + "epoch": 0.94, + "grad_norm": 1.7925815124293898, + "learning_rate": 1.589943272175265e-07, + "loss": 0.6753, + "step": 9289 + }, + { + "epoch": 0.94, + "grad_norm": 1.833749887623125, + "learning_rate": 1.5840972598977212e-07, + "loss": 0.6691, + "step": 9290 + }, + { + "epoch": 0.95, + "grad_norm": 1.7636615851990618, + "learning_rate": 1.5782619291273894e-07, + "loss": 0.6865, + "step": 9291 + }, + { + "epoch": 0.95, + "grad_norm": 1.7029992810691348, + "learning_rate": 1.572437280497563e-07, + "loss": 0.7755, + "step": 9292 + }, + { + "epoch": 0.95, + "grad_norm": 1.6787600559848652, + "learning_rate": 1.5666233146404474e-07, + "loss": 0.5814, + "step": 9293 + }, + { + "epoch": 0.95, + "grad_norm": 1.6399835606911737, + "learning_rate": 1.5608200321870382e-07, + "loss": 0.5949, + "step": 9294 + }, + { + "epoch": 0.95, + "grad_norm": 1.9466123957984622, + "learning_rate": 1.5550274337671868e-07, + "loss": 0.6788, + "step": 9295 + }, + { + "epoch": 0.95, + "grad_norm": 1.7304152725692181, + "learning_rate": 1.5492455200096234e-07, + "loss": 0.7364, + "step": 9296 + }, + { + "epoch": 0.95, + "grad_norm": 1.8101918901000436, + "learning_rate": 1.5434742915418466e-07, + "loss": 0.7217, + "step": 9297 + }, + { + "epoch": 0.95, + "grad_norm": 1.6226630353045093, + "learning_rate": 1.5377137489902548e-07, + "loss": 0.6549, + "step": 9298 + }, + { + "epoch": 0.95, + "grad_norm": 1.8351805596438682, + "learning_rate": 1.5319638929800485e-07, + "loss": 0.7632, + "step": 9299 + }, + { + "epoch": 0.95, + "grad_norm": 1.8081645554575871, + "learning_rate": 1.5262247241352945e-07, + "loss": 0.6583, + "step": 9300 + }, + { + "epoch": 0.95, + "grad_norm": 1.6134327541969915, + "learning_rate": 1.520496243078895e-07, + "loss": 0.6681, + "step": 9301 + }, + { + "epoch": 0.95, + "grad_norm": 1.5380435585216454, + "learning_rate": 1.5147784504325746e-07, + "loss": 0.6873, + "step": 9302 + }, + { + "epoch": 0.95, + "grad_norm": 1.7978338298694088, + "learning_rate": 1.509071346816926e-07, + "loss": 0.7485, + "step": 9303 + }, + { + "epoch": 0.95, + "grad_norm": 1.5582241308265918, + "learning_rate": 1.5033749328513537e-07, + "loss": 0.7597, + "step": 9304 + }, + { + "epoch": 0.95, + "grad_norm": 1.5338763086526532, + "learning_rate": 1.4976892091541185e-07, + "loss": 0.6083, + "step": 9305 + }, + { + "epoch": 0.95, + "grad_norm": 1.8149298068655273, + "learning_rate": 1.4920141763423158e-07, + "loss": 0.6472, + "step": 9306 + }, + { + "epoch": 0.95, + "grad_norm": 1.7713512451374713, + "learning_rate": 1.486349835031875e-07, + "loss": 0.7545, + "step": 9307 + }, + { + "epoch": 0.95, + "grad_norm": 1.6077642775620882, + "learning_rate": 1.4806961858375824e-07, + "loss": 0.6135, + "step": 9308 + }, + { + "epoch": 0.95, + "grad_norm": 1.695935839933717, + "learning_rate": 1.475053229373047e-07, + "loss": 0.6744, + "step": 9309 + }, + { + "epoch": 0.95, + "grad_norm": 1.7030861479239043, + "learning_rate": 1.469420966250723e-07, + "loss": 0.7265, + "step": 9310 + }, + { + "epoch": 0.95, + "grad_norm": 1.8379397844754426, + "learning_rate": 1.4637993970819219e-07, + "loss": 0.6841, + "step": 9311 + }, + { + "epoch": 0.95, + "grad_norm": 1.58928195504432, + "learning_rate": 1.4581885224767557e-07, + "loss": 0.6871, + "step": 9312 + }, + { + "epoch": 0.95, + "grad_norm": 1.6598694335921578, + "learning_rate": 1.4525883430441922e-07, + "loss": 0.7469, + "step": 9313 + }, + { + "epoch": 0.95, + "grad_norm": 1.847375042284769, + "learning_rate": 1.446998859392068e-07, + "loss": 0.7513, + "step": 9314 + }, + { + "epoch": 0.95, + "grad_norm": 1.6196942469957114, + "learning_rate": 1.4414200721270198e-07, + "loss": 0.7, + "step": 9315 + }, + { + "epoch": 0.95, + "grad_norm": 1.651722372283378, + "learning_rate": 1.4358519818545302e-07, + "loss": 0.6817, + "step": 9316 + }, + { + "epoch": 0.95, + "grad_norm": 1.4620185736229712, + "learning_rate": 1.4302945891789487e-07, + "loss": 0.6987, + "step": 9317 + }, + { + "epoch": 0.95, + "grad_norm": 1.5969563156865756, + "learning_rate": 1.4247478947034155e-07, + "loss": 0.6432, + "step": 9318 + }, + { + "epoch": 0.95, + "grad_norm": 1.7740376885253586, + "learning_rate": 1.419211899029971e-07, + "loss": 0.7709, + "step": 9319 + }, + { + "epoch": 0.95, + "grad_norm": 1.5724361492200893, + "learning_rate": 1.4136866027594341e-07, + "loss": 0.7302, + "step": 9320 + }, + { + "epoch": 0.95, + "grad_norm": 1.762529061812301, + "learning_rate": 1.4081720064915037e-07, + "loss": 0.6829, + "step": 9321 + }, + { + "epoch": 0.95, + "grad_norm": 1.732177495747799, + "learning_rate": 1.402668110824701e-07, + "loss": 0.7528, + "step": 9322 + }, + { + "epoch": 0.95, + "grad_norm": 1.5117135004060325, + "learning_rate": 1.3971749163563696e-07, + "loss": 0.7106, + "step": 9323 + }, + { + "epoch": 0.95, + "grad_norm": 1.936486960015918, + "learning_rate": 1.3916924236827444e-07, + "loss": 0.7927, + "step": 9324 + }, + { + "epoch": 0.95, + "grad_norm": 1.6315407141874998, + "learning_rate": 1.3862206333988383e-07, + "loss": 0.5763, + "step": 9325 + }, + { + "epoch": 0.95, + "grad_norm": 1.6832826870046844, + "learning_rate": 1.3807595460985312e-07, + "loss": 0.6027, + "step": 9326 + }, + { + "epoch": 0.95, + "grad_norm": 1.633657285099894, + "learning_rate": 1.3753091623745497e-07, + "loss": 0.7159, + "step": 9327 + }, + { + "epoch": 0.95, + "grad_norm": 1.5657399469617896, + "learning_rate": 1.3698694828184312e-07, + "loss": 0.5689, + "step": 9328 + }, + { + "epoch": 0.95, + "grad_norm": 1.5576252990477029, + "learning_rate": 1.3644405080205925e-07, + "loss": 0.6211, + "step": 9329 + }, + { + "epoch": 0.95, + "grad_norm": 1.6803219682719743, + "learning_rate": 1.359022238570229e-07, + "loss": 0.7426, + "step": 9330 + }, + { + "epoch": 0.95, + "grad_norm": 1.7206962427488917, + "learning_rate": 1.353614675055437e-07, + "loss": 0.7504, + "step": 9331 + }, + { + "epoch": 0.95, + "grad_norm": 1.695239140095105, + "learning_rate": 1.3482178180631243e-07, + "loss": 0.7181, + "step": 9332 + }, + { + "epoch": 0.95, + "grad_norm": 1.5461146588452048, + "learning_rate": 1.3428316681790232e-07, + "loss": 0.6312, + "step": 9333 + }, + { + "epoch": 0.95, + "grad_norm": 1.634849581054608, + "learning_rate": 1.33745622598771e-07, + "loss": 0.7263, + "step": 9334 + }, + { + "epoch": 0.95, + "grad_norm": 1.7191373351509227, + "learning_rate": 1.3320914920726292e-07, + "loss": 0.7151, + "step": 9335 + }, + { + "epoch": 0.95, + "grad_norm": 1.806927458293626, + "learning_rate": 1.3267374670160037e-07, + "loss": 0.7173, + "step": 9336 + }, + { + "epoch": 0.95, + "grad_norm": 1.847358132459803, + "learning_rate": 1.3213941513989582e-07, + "loss": 0.6338, + "step": 9337 + }, + { + "epoch": 0.95, + "grad_norm": 1.6792896756264375, + "learning_rate": 1.316061545801417e-07, + "loss": 0.6552, + "step": 9338 + }, + { + "epoch": 0.95, + "grad_norm": 1.7975803650935682, + "learning_rate": 1.3107396508021508e-07, + "loss": 0.649, + "step": 9339 + }, + { + "epoch": 0.95, + "grad_norm": 1.7320047443084134, + "learning_rate": 1.3054284669787754e-07, + "loss": 0.757, + "step": 9340 + }, + { + "epoch": 0.95, + "grad_norm": 1.7064991477573936, + "learning_rate": 1.3001279949077184e-07, + "loss": 0.7516, + "step": 9341 + }, + { + "epoch": 0.95, + "grad_norm": 1.8049030315444434, + "learning_rate": 1.294838235164275e-07, + "loss": 0.7043, + "step": 9342 + }, + { + "epoch": 0.95, + "grad_norm": 1.783648924170492, + "learning_rate": 1.2895591883225754e-07, + "loss": 0.7772, + "step": 9343 + }, + { + "epoch": 0.95, + "grad_norm": 1.661130797786391, + "learning_rate": 1.284290854955561e-07, + "loss": 0.6693, + "step": 9344 + }, + { + "epoch": 0.95, + "grad_norm": 1.682829079794363, + "learning_rate": 1.27903323563503e-07, + "loss": 0.6384, + "step": 9345 + }, + { + "epoch": 0.95, + "grad_norm": 1.7792700905516068, + "learning_rate": 1.2737863309316257e-07, + "loss": 0.6837, + "step": 9346 + }, + { + "epoch": 0.95, + "grad_norm": 1.8187133886049904, + "learning_rate": 1.268550141414804e-07, + "loss": 0.753, + "step": 9347 + }, + { + "epoch": 0.95, + "grad_norm": 1.758836973325861, + "learning_rate": 1.2633246676528764e-07, + "loss": 0.6515, + "step": 9348 + }, + { + "epoch": 0.95, + "grad_norm": 1.7328795167778492, + "learning_rate": 1.2581099102129902e-07, + "loss": 0.7636, + "step": 9349 + }, + { + "epoch": 0.95, + "grad_norm": 1.8121824781438312, + "learning_rate": 1.252905869661114e-07, + "loss": 0.6873, + "step": 9350 + }, + { + "epoch": 0.95, + "grad_norm": 1.741948485860588, + "learning_rate": 1.2477125465620854e-07, + "loss": 0.6576, + "step": 9351 + }, + { + "epoch": 0.95, + "grad_norm": 1.6665539985254938, + "learning_rate": 1.242529941479542e-07, + "loss": 0.6524, + "step": 9352 + }, + { + "epoch": 0.95, + "grad_norm": 1.6435808873911482, + "learning_rate": 1.237358054975979e-07, + "loss": 0.7197, + "step": 9353 + }, + { + "epoch": 0.95, + "grad_norm": 1.6404318516170804, + "learning_rate": 1.2321968876127244e-07, + "loss": 0.723, + "step": 9354 + }, + { + "epoch": 0.95, + "grad_norm": 1.831279473076429, + "learning_rate": 1.2270464399499416e-07, + "loss": 0.8039, + "step": 9355 + }, + { + "epoch": 0.95, + "grad_norm": 1.6247816181543355, + "learning_rate": 1.2219067125466388e-07, + "loss": 0.6802, + "step": 9356 + }, + { + "epoch": 0.95, + "grad_norm": 1.7632533356131936, + "learning_rate": 1.2167777059606367e-07, + "loss": 0.715, + "step": 9357 + }, + { + "epoch": 0.95, + "grad_norm": 1.656251792043068, + "learning_rate": 1.2116594207486232e-07, + "loss": 0.5863, + "step": 9358 + }, + { + "epoch": 0.95, + "grad_norm": 1.6262676141905716, + "learning_rate": 1.2065518574660983e-07, + "loss": 0.653, + "step": 9359 + }, + { + "epoch": 0.95, + "grad_norm": 1.8073241511420874, + "learning_rate": 1.201455016667419e-07, + "loss": 0.6898, + "step": 9360 + }, + { + "epoch": 0.95, + "grad_norm": 1.6183084246280248, + "learning_rate": 1.196368898905753e-07, + "loss": 0.655, + "step": 9361 + }, + { + "epoch": 0.95, + "grad_norm": 1.616972179558239, + "learning_rate": 1.1912935047331265e-07, + "loss": 0.6692, + "step": 9362 + }, + { + "epoch": 0.95, + "grad_norm": 1.7388663060507172, + "learning_rate": 1.1862288347004091e-07, + "loss": 0.6522, + "step": 9363 + }, + { + "epoch": 0.95, + "grad_norm": 1.5250952476420325, + "learning_rate": 1.1811748893572616e-07, + "loss": 0.686, + "step": 9364 + }, + { + "epoch": 0.95, + "grad_norm": 1.6662472338063785, + "learning_rate": 1.176131669252234e-07, + "loss": 0.6984, + "step": 9365 + }, + { + "epoch": 0.95, + "grad_norm": 1.5549764713843681, + "learning_rate": 1.1710991749326772e-07, + "loss": 0.6081, + "step": 9366 + }, + { + "epoch": 0.95, + "grad_norm": 1.5991869130619443, + "learning_rate": 1.1660774069447877e-07, + "loss": 0.6039, + "step": 9367 + }, + { + "epoch": 0.95, + "grad_norm": 1.7864587149948645, + "learning_rate": 1.1610663658336186e-07, + "loss": 0.7464, + "step": 9368 + }, + { + "epoch": 0.95, + "grad_norm": 1.9867487428760133, + "learning_rate": 1.1560660521430233e-07, + "loss": 0.6608, + "step": 9369 + }, + { + "epoch": 0.95, + "grad_norm": 1.6219424473624813, + "learning_rate": 1.1510764664157126e-07, + "loss": 0.7251, + "step": 9370 + }, + { + "epoch": 0.95, + "grad_norm": 1.6957119355691972, + "learning_rate": 1.1460976091932307e-07, + "loss": 0.6999, + "step": 9371 + }, + { + "epoch": 0.95, + "grad_norm": 1.6944825952036535, + "learning_rate": 1.1411294810159457e-07, + "loss": 0.6964, + "step": 9372 + }, + { + "epoch": 0.95, + "grad_norm": 1.7535197596326562, + "learning_rate": 1.1361720824230704e-07, + "loss": 0.6444, + "step": 9373 + }, + { + "epoch": 0.95, + "grad_norm": 1.8297612829240806, + "learning_rate": 1.1312254139526635e-07, + "loss": 0.7812, + "step": 9374 + }, + { + "epoch": 0.95, + "grad_norm": 1.8623045745422762, + "learning_rate": 1.1262894761416066e-07, + "loss": 0.797, + "step": 9375 + }, + { + "epoch": 0.95, + "grad_norm": 1.801772300675876, + "learning_rate": 1.1213642695256156e-07, + "loss": 0.7934, + "step": 9376 + }, + { + "epoch": 0.95, + "grad_norm": 1.6568338935636044, + "learning_rate": 1.1164497946392406e-07, + "loss": 0.6191, + "step": 9377 + }, + { + "epoch": 0.95, + "grad_norm": 1.629710990885931, + "learning_rate": 1.1115460520158772e-07, + "loss": 0.5813, + "step": 9378 + }, + { + "epoch": 0.95, + "grad_norm": 1.6004914877405867, + "learning_rate": 1.1066530421877442e-07, + "loss": 0.5771, + "step": 9379 + }, + { + "epoch": 0.95, + "grad_norm": 1.7226413707376684, + "learning_rate": 1.1017707656859055e-07, + "loss": 0.6676, + "step": 9380 + }, + { + "epoch": 0.95, + "grad_norm": 1.8615447304953088, + "learning_rate": 1.0968992230402598e-07, + "loss": 0.7985, + "step": 9381 + }, + { + "epoch": 0.95, + "grad_norm": 1.6921151869689077, + "learning_rate": 1.0920384147795172e-07, + "loss": 0.7771, + "step": 9382 + }, + { + "epoch": 0.95, + "grad_norm": 1.6221705984273835, + "learning_rate": 1.0871883414312778e-07, + "loss": 0.6666, + "step": 9383 + }, + { + "epoch": 0.95, + "grad_norm": 1.5705962467314438, + "learning_rate": 1.0823490035218986e-07, + "loss": 0.6474, + "step": 9384 + }, + { + "epoch": 0.95, + "grad_norm": 1.7104006167842383, + "learning_rate": 1.0775204015766483e-07, + "loss": 0.8605, + "step": 9385 + }, + { + "epoch": 0.95, + "grad_norm": 1.6798442346632074, + "learning_rate": 1.0727025361195853e-07, + "loss": 0.7488, + "step": 9386 + }, + { + "epoch": 0.95, + "grad_norm": 1.8652450432846577, + "learning_rate": 1.0678954076736136e-07, + "loss": 0.7213, + "step": 9387 + }, + { + "epoch": 0.95, + "grad_norm": 1.7415937054078674, + "learning_rate": 1.0630990167604716e-07, + "loss": 0.5931, + "step": 9388 + }, + { + "epoch": 0.96, + "grad_norm": 1.8481711044022067, + "learning_rate": 1.0583133639007203e-07, + "loss": 0.706, + "step": 9389 + }, + { + "epoch": 0.96, + "grad_norm": 1.7516193585064288, + "learning_rate": 1.053538449613778e-07, + "loss": 0.7088, + "step": 9390 + }, + { + "epoch": 0.96, + "grad_norm": 1.620252240786182, + "learning_rate": 1.0487742744178964e-07, + "loss": 0.6785, + "step": 9391 + }, + { + "epoch": 0.96, + "grad_norm": 1.7365674839200946, + "learning_rate": 1.0440208388301399e-07, + "loss": 0.732, + "step": 9392 + }, + { + "epoch": 0.96, + "grad_norm": 1.637589589718617, + "learning_rate": 1.039278143366429e-07, + "loss": 0.6338, + "step": 9393 + }, + { + "epoch": 0.96, + "grad_norm": 1.8376359016842168, + "learning_rate": 1.0345461885414965e-07, + "loss": 0.6336, + "step": 9394 + }, + { + "epoch": 0.96, + "grad_norm": 1.7308196052245322, + "learning_rate": 1.0298249748689204e-07, + "loss": 0.7295, + "step": 9395 + }, + { + "epoch": 0.96, + "grad_norm": 1.6808380277400239, + "learning_rate": 1.0251145028611464e-07, + "loss": 0.6251, + "step": 9396 + }, + { + "epoch": 0.96, + "grad_norm": 1.6294942373354988, + "learning_rate": 1.0204147730293767e-07, + "loss": 0.6, + "step": 9397 + }, + { + "epoch": 0.96, + "grad_norm": 1.7041566127035381, + "learning_rate": 1.0157257858837255e-07, + "loss": 0.7238, + "step": 9398 + }, + { + "epoch": 0.96, + "grad_norm": 1.6222572131657285, + "learning_rate": 1.0110475419330967e-07, + "loss": 0.7622, + "step": 9399 + }, + { + "epoch": 0.96, + "grad_norm": 2.0364432927850973, + "learning_rate": 1.0063800416852399e-07, + "loss": 0.768, + "step": 9400 + }, + { + "epoch": 0.96, + "grad_norm": 1.7975191300854436, + "learning_rate": 1.0017232856467495e-07, + "loss": 0.7176, + "step": 9401 + }, + { + "epoch": 0.96, + "grad_norm": 1.5093214742141057, + "learning_rate": 9.970772743230329e-08, + "loss": 0.649, + "step": 9402 + }, + { + "epoch": 0.96, + "grad_norm": 1.4343705311458053, + "learning_rate": 9.924420082183416e-08, + "loss": 0.6965, + "step": 9403 + }, + { + "epoch": 0.96, + "grad_norm": 1.6913536743725621, + "learning_rate": 9.878174878357738e-08, + "loss": 0.6269, + "step": 9404 + }, + { + "epoch": 0.96, + "grad_norm": 1.7215768900124486, + "learning_rate": 9.832037136772387e-08, + "loss": 0.6705, + "step": 9405 + }, + { + "epoch": 0.96, + "grad_norm": 1.565162152804664, + "learning_rate": 9.786006862434916e-08, + "loss": 0.6139, + "step": 9406 + }, + { + "epoch": 0.96, + "grad_norm": 1.6399417139822021, + "learning_rate": 9.740084060341104e-08, + "loss": 0.6996, + "step": 9407 + }, + { + "epoch": 0.96, + "grad_norm": 1.671760353975966, + "learning_rate": 9.694268735475299e-08, + "loss": 0.6743, + "step": 9408 + }, + { + "epoch": 0.96, + "grad_norm": 1.6602976217731225, + "learning_rate": 9.648560892809967e-08, + "loss": 0.7442, + "step": 9409 + }, + { + "epoch": 0.96, + "grad_norm": 1.7598413490910223, + "learning_rate": 9.602960537306027e-08, + "loss": 0.7322, + "step": 9410 + }, + { + "epoch": 0.96, + "grad_norm": 1.7576825593121441, + "learning_rate": 9.557467673912635e-08, + "loss": 0.6904, + "step": 9411 + }, + { + "epoch": 0.96, + "grad_norm": 1.6407909634932698, + "learning_rate": 9.512082307567283e-08, + "loss": 0.6866, + "step": 9412 + }, + { + "epoch": 0.96, + "grad_norm": 1.8693682646759682, + "learning_rate": 9.46680444319581e-08, + "loss": 0.7631, + "step": 9413 + }, + { + "epoch": 0.96, + "grad_norm": 1.78065338467431, + "learning_rate": 9.421634085712728e-08, + "loss": 0.6334, + "step": 9414 + }, + { + "epoch": 0.96, + "grad_norm": 1.6792038849549156, + "learning_rate": 9.376571240020227e-08, + "loss": 0.6723, + "step": 9415 + }, + { + "epoch": 0.96, + "grad_norm": 1.74326436533906, + "learning_rate": 9.331615911009284e-08, + "loss": 0.682, + "step": 9416 + }, + { + "epoch": 0.96, + "grad_norm": 1.7774904025160487, + "learning_rate": 9.286768103559107e-08, + "loss": 0.8439, + "step": 9417 + }, + { + "epoch": 0.96, + "grad_norm": 1.6803450951448946, + "learning_rate": 9.242027822537247e-08, + "loss": 0.6999, + "step": 9418 + }, + { + "epoch": 0.96, + "grad_norm": 1.626514736365701, + "learning_rate": 9.197395072799597e-08, + "loss": 0.7507, + "step": 9419 + }, + { + "epoch": 0.96, + "grad_norm": 1.9147094916880236, + "learning_rate": 9.152869859190283e-08, + "loss": 0.5663, + "step": 9420 + }, + { + "epoch": 0.96, + "grad_norm": 1.645184270408986, + "learning_rate": 9.108452186541771e-08, + "loss": 0.7222, + "step": 9421 + }, + { + "epoch": 0.96, + "grad_norm": 1.745259445087553, + "learning_rate": 9.064142059674985e-08, + "loss": 0.7458, + "step": 9422 + }, + { + "epoch": 0.96, + "grad_norm": 1.6908506915448078, + "learning_rate": 9.019939483399076e-08, + "loss": 0.6719, + "step": 9423 + }, + { + "epoch": 0.96, + "grad_norm": 1.8034257047030355, + "learning_rate": 8.975844462511652e-08, + "loss": 0.7185, + "step": 9424 + }, + { + "epoch": 0.96, + "grad_norm": 1.7462225913889697, + "learning_rate": 8.931857001798216e-08, + "loss": 0.6562, + "step": 9425 + }, + { + "epoch": 0.96, + "grad_norm": 1.700078979326214, + "learning_rate": 8.887977106033285e-08, + "loss": 0.6466, + "step": 9426 + }, + { + "epoch": 0.96, + "grad_norm": 1.8522994307817322, + "learning_rate": 8.84420477997916e-08, + "loss": 0.7504, + "step": 9427 + }, + { + "epoch": 0.96, + "grad_norm": 1.6080686398108495, + "learning_rate": 8.800540028386595e-08, + "loss": 0.6406, + "step": 9428 + }, + { + "epoch": 0.96, + "grad_norm": 1.7004782602843376, + "learning_rate": 8.756982855994911e-08, + "loss": 0.6996, + "step": 9429 + }, + { + "epoch": 0.96, + "grad_norm": 1.8207728151070466, + "learning_rate": 8.713533267531326e-08, + "loss": 0.7679, + "step": 9430 + }, + { + "epoch": 0.96, + "grad_norm": 1.7478520752621098, + "learning_rate": 8.670191267711736e-08, + "loss": 0.715, + "step": 9431 + }, + { + "epoch": 0.96, + "grad_norm": 1.66706241086095, + "learning_rate": 8.626956861240265e-08, + "loss": 0.6582, + "step": 9432 + }, + { + "epoch": 0.96, + "grad_norm": 1.6439981567690312, + "learning_rate": 8.58383005280916e-08, + "loss": 0.5828, + "step": 9433 + }, + { + "epoch": 0.96, + "grad_norm": 1.7405589455947643, + "learning_rate": 8.540810847099345e-08, + "loss": 0.7158, + "step": 9434 + }, + { + "epoch": 0.96, + "grad_norm": 1.7367774924168071, + "learning_rate": 8.497899248779862e-08, + "loss": 0.757, + "step": 9435 + }, + { + "epoch": 0.96, + "grad_norm": 1.6737570175181034, + "learning_rate": 8.455095262508095e-08, + "loss": 0.7011, + "step": 9436 + }, + { + "epoch": 0.96, + "grad_norm": 1.6452133588641074, + "learning_rate": 8.412398892929663e-08, + "loss": 0.6645, + "step": 9437 + }, + { + "epoch": 0.96, + "grad_norm": 1.681366338145333, + "learning_rate": 8.369810144678636e-08, + "loss": 0.77, + "step": 9438 + }, + { + "epoch": 0.96, + "grad_norm": 1.550502000121852, + "learning_rate": 8.327329022377317e-08, + "loss": 0.6442, + "step": 9439 + }, + { + "epoch": 0.96, + "grad_norm": 1.6022409815852374, + "learning_rate": 8.284955530636462e-08, + "loss": 0.6408, + "step": 9440 + }, + { + "epoch": 0.96, + "grad_norm": 1.5452686011312082, + "learning_rate": 8.242689674054949e-08, + "loss": 0.6169, + "step": 9441 + }, + { + "epoch": 0.96, + "grad_norm": 1.8167821762833238, + "learning_rate": 8.200531457220218e-08, + "loss": 0.7093, + "step": 9442 + }, + { + "epoch": 0.96, + "grad_norm": 1.7396942787558083, + "learning_rate": 8.15848088470772e-08, + "loss": 0.7135, + "step": 9443 + }, + { + "epoch": 0.96, + "grad_norm": 1.9776231560210884, + "learning_rate": 8.116537961081473e-08, + "loss": 0.6921, + "step": 9444 + }, + { + "epoch": 0.96, + "grad_norm": 1.75401788492991, + "learning_rate": 8.074702690893722e-08, + "loss": 0.6245, + "step": 9445 + }, + { + "epoch": 0.96, + "grad_norm": 1.638899199915196, + "learning_rate": 8.032975078684945e-08, + "loss": 0.6686, + "step": 9446 + }, + { + "epoch": 0.96, + "grad_norm": 1.6823860072883687, + "learning_rate": 7.99135512898408e-08, + "loss": 0.7626, + "step": 9447 + }, + { + "epoch": 0.96, + "grad_norm": 1.5854351031807767, + "learning_rate": 7.949842846308398e-08, + "loss": 0.6313, + "step": 9448 + }, + { + "epoch": 0.96, + "grad_norm": 1.5310605391572794, + "learning_rate": 7.908438235163407e-08, + "loss": 0.5886, + "step": 9449 + }, + { + "epoch": 0.96, + "grad_norm": 1.8708623272585712, + "learning_rate": 7.867141300042736e-08, + "loss": 0.7019, + "step": 9450 + }, + { + "epoch": 0.96, + "grad_norm": 1.3911077288709224, + "learning_rate": 7.825952045428797e-08, + "loss": 0.5764, + "step": 9451 + }, + { + "epoch": 0.96, + "grad_norm": 1.6768022829466143, + "learning_rate": 7.784870475791794e-08, + "loss": 0.6353, + "step": 9452 + }, + { + "epoch": 0.96, + "grad_norm": 1.5242529784224401, + "learning_rate": 7.743896595590605e-08, + "loss": 0.6805, + "step": 9453 + }, + { + "epoch": 0.96, + "grad_norm": 1.8249475019113455, + "learning_rate": 7.703030409272339e-08, + "loss": 0.744, + "step": 9454 + }, + { + "epoch": 0.96, + "grad_norm": 1.7586110111232993, + "learning_rate": 7.662271921272224e-08, + "loss": 0.6442, + "step": 9455 + }, + { + "epoch": 0.96, + "grad_norm": 1.7143857110798468, + "learning_rate": 7.621621136014168e-08, + "loss": 0.5942, + "step": 9456 + }, + { + "epoch": 0.96, + "grad_norm": 1.6898773689992317, + "learning_rate": 7.581078057909974e-08, + "loss": 0.6309, + "step": 9457 + }, + { + "epoch": 0.96, + "grad_norm": 1.634801653449755, + "learning_rate": 7.540642691360123e-08, + "loss": 0.6671, + "step": 9458 + }, + { + "epoch": 0.96, + "grad_norm": 1.8680966984373397, + "learning_rate": 7.500315040753214e-08, + "loss": 0.6873, + "step": 9459 + }, + { + "epoch": 0.96, + "grad_norm": 1.6719597635551595, + "learning_rate": 7.460095110466192e-08, + "loss": 0.6266, + "step": 9460 + }, + { + "epoch": 0.96, + "grad_norm": 1.6156246547794542, + "learning_rate": 7.41998290486412e-08, + "loss": 0.6576, + "step": 9461 + }, + { + "epoch": 0.96, + "grad_norm": 1.4988781002438796, + "learning_rate": 7.379978428300738e-08, + "loss": 0.5995, + "step": 9462 + }, + { + "epoch": 0.96, + "grad_norm": 1.6309987376080932, + "learning_rate": 7.340081685117906e-08, + "loss": 0.6821, + "step": 9463 + }, + { + "epoch": 0.96, + "grad_norm": 1.8204765391454656, + "learning_rate": 7.300292679645716e-08, + "loss": 0.715, + "step": 9464 + }, + { + "epoch": 0.96, + "grad_norm": 1.781509439351711, + "learning_rate": 7.260611416202712e-08, + "loss": 0.8175, + "step": 9465 + }, + { + "epoch": 0.96, + "grad_norm": 1.7769442386139096, + "learning_rate": 7.221037899095561e-08, + "loss": 0.6977, + "step": 9466 + }, + { + "epoch": 0.96, + "grad_norm": 1.884030339551365, + "learning_rate": 7.181572132619385e-08, + "loss": 0.645, + "step": 9467 + }, + { + "epoch": 0.96, + "grad_norm": 1.5657382868794574, + "learning_rate": 7.142214121057755e-08, + "loss": 0.6959, + "step": 9468 + }, + { + "epoch": 0.96, + "grad_norm": 1.7086982932887396, + "learning_rate": 7.102963868682034e-08, + "loss": 0.5867, + "step": 9469 + }, + { + "epoch": 0.96, + "grad_norm": 1.4962641401982082, + "learning_rate": 7.063821379752589e-08, + "loss": 0.5343, + "step": 9470 + }, + { + "epoch": 0.96, + "grad_norm": 1.7827013511340437, + "learning_rate": 7.024786658517468e-08, + "loss": 0.7294, + "step": 9471 + }, + { + "epoch": 0.96, + "grad_norm": 1.7029572387275005, + "learning_rate": 6.985859709213283e-08, + "loss": 0.6746, + "step": 9472 + }, + { + "epoch": 0.96, + "grad_norm": 1.528253388269664, + "learning_rate": 6.947040536065208e-08, + "loss": 0.6944, + "step": 9473 + }, + { + "epoch": 0.96, + "grad_norm": 1.866562625793677, + "learning_rate": 6.908329143286096e-08, + "loss": 0.6758, + "step": 9474 + }, + { + "epoch": 0.96, + "grad_norm": 1.6755032350387424, + "learning_rate": 6.869725535077698e-08, + "loss": 0.7056, + "step": 9475 + }, + { + "epoch": 0.96, + "grad_norm": 1.7746080634624017, + "learning_rate": 6.831229715629884e-08, + "loss": 0.6918, + "step": 9476 + }, + { + "epoch": 0.96, + "grad_norm": 1.710479335270481, + "learning_rate": 6.792841689120533e-08, + "loss": 0.8261, + "step": 9477 + }, + { + "epoch": 0.96, + "grad_norm": 1.633912267250495, + "learning_rate": 6.754561459716202e-08, + "loss": 0.7372, + "step": 9478 + }, + { + "epoch": 0.96, + "grad_norm": 1.8059470626753045, + "learning_rate": 6.716389031571568e-08, + "loss": 0.6856, + "step": 9479 + }, + { + "epoch": 0.96, + "grad_norm": 1.7177100237104101, + "learning_rate": 6.678324408829762e-08, + "loss": 0.6838, + "step": 9480 + }, + { + "epoch": 0.96, + "grad_norm": 1.6373046738415207, + "learning_rate": 6.640367595622033e-08, + "loss": 0.7585, + "step": 9481 + }, + { + "epoch": 0.96, + "grad_norm": 1.8719583939143258, + "learning_rate": 6.602518596067975e-08, + "loss": 0.6701, + "step": 9482 + }, + { + "epoch": 0.96, + "grad_norm": 1.7507052228301732, + "learning_rate": 6.564777414275525e-08, + "loss": 0.7428, + "step": 9483 + }, + { + "epoch": 0.96, + "grad_norm": 1.6062017308145797, + "learning_rate": 6.52714405434085e-08, + "loss": 0.65, + "step": 9484 + }, + { + "epoch": 0.96, + "grad_norm": 1.8043004451790021, + "learning_rate": 6.489618520348573e-08, + "loss": 0.7727, + "step": 9485 + }, + { + "epoch": 0.96, + "grad_norm": 1.6345253892914255, + "learning_rate": 6.452200816371435e-08, + "loss": 0.7133, + "step": 9486 + }, + { + "epoch": 0.97, + "grad_norm": 1.6192129197947587, + "learning_rate": 6.41489094647052e-08, + "loss": 0.705, + "step": 9487 + }, + { + "epoch": 0.97, + "grad_norm": 1.7888330879688503, + "learning_rate": 6.377688914695256e-08, + "loss": 0.7332, + "step": 9488 + }, + { + "epoch": 0.97, + "grad_norm": 1.5641196925584768, + "learning_rate": 6.340594725083415e-08, + "loss": 0.734, + "step": 9489 + }, + { + "epoch": 0.97, + "grad_norm": 1.9912579431641058, + "learning_rate": 6.303608381660887e-08, + "loss": 0.7217, + "step": 9490 + }, + { + "epoch": 0.97, + "grad_norm": 1.5776840514556638, + "learning_rate": 6.266729888442013e-08, + "loss": 0.6663, + "step": 9491 + }, + { + "epoch": 0.97, + "grad_norm": 1.7249885914365968, + "learning_rate": 6.229959249429263e-08, + "loss": 0.6978, + "step": 9492 + }, + { + "epoch": 0.97, + "grad_norm": 1.8158158270080882, + "learning_rate": 6.193296468613663e-08, + "loss": 0.6448, + "step": 9493 + }, + { + "epoch": 0.97, + "grad_norm": 1.6464321238990238, + "learning_rate": 6.156741549974365e-08, + "loss": 0.6866, + "step": 9494 + }, + { + "epoch": 0.97, + "grad_norm": 1.6656953848178833, + "learning_rate": 6.120294497478752e-08, + "loss": 0.6183, + "step": 9495 + }, + { + "epoch": 0.97, + "grad_norm": 1.9088722193655174, + "learning_rate": 6.083955315082657e-08, + "loss": 0.6723, + "step": 9496 + }, + { + "epoch": 0.97, + "grad_norm": 1.5716830550858858, + "learning_rate": 6.04772400673015e-08, + "loss": 0.6995, + "step": 9497 + }, + { + "epoch": 0.97, + "grad_norm": 1.792143791691787, + "learning_rate": 6.011600576353416e-08, + "loss": 0.7165, + "step": 9498 + }, + { + "epoch": 0.97, + "grad_norm": 1.6182212968308853, + "learning_rate": 5.97558502787332e-08, + "loss": 0.7434, + "step": 9499 + }, + { + "epoch": 0.97, + "grad_norm": 1.7037892359301718, + "learning_rate": 5.939677365198626e-08, + "loss": 0.6117, + "step": 9500 + }, + { + "epoch": 0.97, + "grad_norm": 1.707700336540344, + "learning_rate": 5.903877592226548e-08, + "loss": 0.7474, + "step": 9501 + }, + { + "epoch": 0.97, + "grad_norm": 1.6940925472575223, + "learning_rate": 5.868185712842645e-08, + "loss": 0.6836, + "step": 9502 + }, + { + "epoch": 0.97, + "grad_norm": 1.6486427451514054, + "learning_rate": 5.832601730920706e-08, + "loss": 0.7138, + "step": 9503 + }, + { + "epoch": 0.97, + "grad_norm": 1.5679311999282515, + "learning_rate": 5.797125650322866e-08, + "loss": 0.5144, + "step": 9504 + }, + { + "epoch": 0.97, + "grad_norm": 1.6840151836203492, + "learning_rate": 5.7617574748993764e-08, + "loss": 0.6978, + "step": 9505 + }, + { + "epoch": 0.97, + "grad_norm": 1.7141209646309552, + "learning_rate": 5.726497208488946e-08, + "loss": 0.6544, + "step": 9506 + }, + { + "epoch": 0.97, + "grad_norm": 1.7961651134883347, + "learning_rate": 5.691344854918623e-08, + "loss": 0.8223, + "step": 9507 + }, + { + "epoch": 0.97, + "grad_norm": 1.721568167980704, + "learning_rate": 5.6563004180034685e-08, + "loss": 0.7074, + "step": 9508 + }, + { + "epoch": 0.97, + "grad_norm": 1.7756903237314543, + "learning_rate": 5.6213639015472166e-08, + "loss": 0.729, + "step": 9509 + }, + { + "epoch": 0.97, + "grad_norm": 1.7530493444827209, + "learning_rate": 5.586535309341501e-08, + "loss": 0.6859, + "step": 9510 + }, + { + "epoch": 0.97, + "grad_norm": 1.8363965552736352, + "learning_rate": 5.55181464516652e-08, + "loss": 0.7653, + "step": 9511 + }, + { + "epoch": 0.97, + "grad_norm": 1.632990922923886, + "learning_rate": 5.517201912790593e-08, + "loss": 0.6096, + "step": 9512 + }, + { + "epoch": 0.97, + "grad_norm": 1.6297590626113077, + "learning_rate": 5.4826971159704925e-08, + "loss": 0.6823, + "step": 9513 + }, + { + "epoch": 0.97, + "grad_norm": 2.001130699233612, + "learning_rate": 5.448300258451111e-08, + "loss": 0.7525, + "step": 9514 + }, + { + "epoch": 0.97, + "grad_norm": 1.5364424660076623, + "learning_rate": 5.4140113439655753e-08, + "loss": 0.6215, + "step": 9515 + }, + { + "epoch": 0.97, + "grad_norm": 1.7204574771548173, + "learning_rate": 5.379830376235573e-08, + "loss": 0.6571, + "step": 9516 + }, + { + "epoch": 0.97, + "grad_norm": 1.6017778895293966, + "learning_rate": 5.3457573589709156e-08, + "loss": 0.682, + "step": 9517 + }, + { + "epoch": 0.97, + "grad_norm": 1.5577355583757446, + "learning_rate": 5.311792295869644e-08, + "loss": 0.63, + "step": 9518 + }, + { + "epoch": 0.97, + "grad_norm": 1.682049891016354, + "learning_rate": 5.2779351906181445e-08, + "loss": 0.6591, + "step": 9519 + }, + { + "epoch": 0.97, + "grad_norm": 1.5620593916862504, + "learning_rate": 5.24418604689092e-08, + "loss": 0.5631, + "step": 9520 + }, + { + "epoch": 0.97, + "grad_norm": 1.681310877232562, + "learning_rate": 5.210544868351153e-08, + "loss": 0.5983, + "step": 9521 + }, + { + "epoch": 0.97, + "grad_norm": 1.8030213916103963, + "learning_rate": 5.177011658650033e-08, + "loss": 0.6789, + "step": 9522 + }, + { + "epoch": 0.97, + "grad_norm": 1.8096040261391229, + "learning_rate": 5.143586421426982e-08, + "loss": 0.6906, + "step": 9523 + }, + { + "epoch": 0.97, + "grad_norm": 1.836173510001181, + "learning_rate": 5.1102691603097664e-08, + "loss": 0.7915, + "step": 9524 + }, + { + "epoch": 0.97, + "grad_norm": 1.6777462206542368, + "learning_rate": 5.077059878914492e-08, + "loss": 0.7249, + "step": 9525 + }, + { + "epoch": 0.97, + "grad_norm": 1.7752150890688336, + "learning_rate": 5.043958580845498e-08, + "loss": 0.6774, + "step": 9526 + }, + { + "epoch": 0.97, + "grad_norm": 1.935721956313621, + "learning_rate": 5.010965269695578e-08, + "loss": 0.7067, + "step": 9527 + }, + { + "epoch": 0.97, + "grad_norm": 1.6102260340258132, + "learning_rate": 4.978079949045311e-08, + "loss": 0.7253, + "step": 9528 + }, + { + "epoch": 0.97, + "grad_norm": 1.659590054401639, + "learning_rate": 4.945302622464177e-08, + "loss": 0.619, + "step": 9529 + }, + { + "epoch": 0.97, + "grad_norm": 1.5797987844636674, + "learning_rate": 4.912633293509439e-08, + "loss": 0.7114, + "step": 9530 + }, + { + "epoch": 0.97, + "grad_norm": 1.651232052055731, + "learning_rate": 4.8800719657270404e-08, + "loss": 0.6429, + "step": 9531 + }, + { + "epoch": 0.97, + "grad_norm": 1.7290046370594587, + "learning_rate": 4.84761864265082e-08, + "loss": 0.734, + "step": 9532 + }, + { + "epoch": 0.97, + "grad_norm": 1.7180351129100842, + "learning_rate": 4.815273327803183e-08, + "loss": 0.6383, + "step": 9533 + }, + { + "epoch": 0.97, + "grad_norm": 1.5391683968772378, + "learning_rate": 4.783036024694543e-08, + "loss": 0.6444, + "step": 9534 + }, + { + "epoch": 0.97, + "grad_norm": 1.7809726189745998, + "learning_rate": 4.750906736824101e-08, + "loss": 0.6574, + "step": 9535 + }, + { + "epoch": 0.97, + "grad_norm": 1.6544166039332864, + "learning_rate": 4.7188854676786246e-08, + "loss": 0.7301, + "step": 9536 + }, + { + "epoch": 0.97, + "grad_norm": 1.5079155368243176, + "learning_rate": 4.6869722207337763e-08, + "loss": 0.6591, + "step": 9537 + }, + { + "epoch": 0.97, + "grad_norm": 1.6571193553186434, + "learning_rate": 4.6551669994531204e-08, + "loss": 0.6755, + "step": 9538 + }, + { + "epoch": 0.97, + "grad_norm": 1.7825879578730972, + "learning_rate": 4.623469807288561e-08, + "loss": 0.7396, + "step": 9539 + }, + { + "epoch": 0.97, + "grad_norm": 1.7840446572194073, + "learning_rate": 4.591880647680458e-08, + "loss": 0.657, + "step": 9540 + }, + { + "epoch": 0.97, + "grad_norm": 1.9065872718206953, + "learning_rate": 4.5603995240572906e-08, + "loss": 0.7672, + "step": 9541 + }, + { + "epoch": 0.97, + "grad_norm": 1.696836293573375, + "learning_rate": 4.529026439835771e-08, + "loss": 0.72, + "step": 9542 + }, + { + "epoch": 0.97, + "grad_norm": 1.739553332302951, + "learning_rate": 4.4977613984210634e-08, + "loss": 0.6855, + "step": 9543 + }, + { + "epoch": 0.97, + "grad_norm": 1.6854862459503848, + "learning_rate": 4.4666044032063425e-08, + "loss": 0.7009, + "step": 9544 + }, + { + "epoch": 0.97, + "grad_norm": 1.6963922444075805, + "learning_rate": 4.4355554575734594e-08, + "loss": 0.6945, + "step": 9545 + }, + { + "epoch": 0.97, + "grad_norm": 1.5800756558416964, + "learning_rate": 4.404614564892051e-08, + "loss": 0.588, + "step": 9546 + }, + { + "epoch": 0.97, + "grad_norm": 1.6478454303775683, + "learning_rate": 4.373781728520321e-08, + "loss": 0.6437, + "step": 9547 + }, + { + "epoch": 0.97, + "grad_norm": 1.5023842441634008, + "learning_rate": 4.3430569518048135e-08, + "loss": 0.6218, + "step": 9548 + }, + { + "epoch": 0.97, + "grad_norm": 1.5924937602995637, + "learning_rate": 4.3124402380800846e-08, + "loss": 0.6583, + "step": 9549 + }, + { + "epoch": 0.97, + "grad_norm": 1.6325314575288126, + "learning_rate": 4.281931590669253e-08, + "loss": 0.5673, + "step": 9550 + }, + { + "epoch": 0.97, + "grad_norm": 1.5829462492317814, + "learning_rate": 4.251531012883337e-08, + "loss": 0.6106, + "step": 9551 + }, + { + "epoch": 0.97, + "grad_norm": 1.5521949626144838, + "learning_rate": 4.2212385080220295e-08, + "loss": 0.6161, + "step": 9552 + }, + { + "epoch": 0.97, + "grad_norm": 1.5820619083433027, + "learning_rate": 4.191054079373036e-08, + "loss": 0.7234, + "step": 9553 + }, + { + "epoch": 0.97, + "grad_norm": 1.6808104992758746, + "learning_rate": 4.160977730212401e-08, + "loss": 0.6616, + "step": 9554 + }, + { + "epoch": 0.97, + "grad_norm": 1.924873928536549, + "learning_rate": 4.131009463804403e-08, + "loss": 0.8662, + "step": 9555 + }, + { + "epoch": 0.97, + "grad_norm": 1.4615127567216981, + "learning_rate": 4.101149283401773e-08, + "loss": 0.7099, + "step": 9556 + }, + { + "epoch": 0.97, + "grad_norm": 1.615602544070035, + "learning_rate": 4.071397192245252e-08, + "loss": 0.6222, + "step": 9557 + }, + { + "epoch": 0.97, + "grad_norm": 1.954739249642568, + "learning_rate": 4.041753193563924e-08, + "loss": 0.7783, + "step": 9558 + }, + { + "epoch": 0.97, + "grad_norm": 1.6038174186370082, + "learning_rate": 4.0122172905753264e-08, + "loss": 0.7212, + "step": 9559 + }, + { + "epoch": 0.97, + "grad_norm": 1.5473590138448612, + "learning_rate": 3.982789486485006e-08, + "loss": 0.5879, + "step": 9560 + }, + { + "epoch": 0.97, + "grad_norm": 1.5495334233350668, + "learning_rate": 3.953469784486852e-08, + "loss": 0.6746, + "step": 9561 + }, + { + "epoch": 0.97, + "grad_norm": 1.5247979141054615, + "learning_rate": 3.924258187763208e-08, + "loss": 0.6481, + "step": 9562 + }, + { + "epoch": 0.97, + "grad_norm": 1.923834001273207, + "learning_rate": 3.895154699484427e-08, + "loss": 0.7229, + "step": 9563 + }, + { + "epoch": 0.97, + "grad_norm": 1.9727940095964407, + "learning_rate": 3.866159322809315e-08, + "loss": 0.7823, + "step": 9564 + }, + { + "epoch": 0.97, + "grad_norm": 1.6465939082770047, + "learning_rate": 3.8372720608848e-08, + "loss": 0.7266, + "step": 9565 + }, + { + "epoch": 0.97, + "grad_norm": 1.6971371645450848, + "learning_rate": 3.808492916846041e-08, + "loss": 0.7288, + "step": 9566 + }, + { + "epoch": 0.97, + "grad_norm": 1.7647054382027547, + "learning_rate": 3.77982189381676e-08, + "loss": 0.6905, + "step": 9567 + }, + { + "epoch": 0.97, + "grad_norm": 1.7337785020115393, + "learning_rate": 3.751258994908691e-08, + "loss": 0.7073, + "step": 9568 + }, + { + "epoch": 0.97, + "grad_norm": 1.9913478597915608, + "learning_rate": 3.72280422322191e-08, + "loss": 0.7697, + "step": 9569 + }, + { + "epoch": 0.97, + "grad_norm": 1.7756431875942662, + "learning_rate": 3.6944575818446126e-08, + "loss": 0.69, + "step": 9570 + }, + { + "epoch": 0.97, + "grad_norm": 1.6794395528746324, + "learning_rate": 3.6662190738535606e-08, + "loss": 0.7857, + "step": 9571 + }, + { + "epoch": 0.97, + "grad_norm": 1.7678436916808111, + "learning_rate": 3.638088702313414e-08, + "loss": 0.646, + "step": 9572 + }, + { + "epoch": 0.97, + "grad_norm": 1.7993167160166388, + "learning_rate": 3.610066470277507e-08, + "loss": 0.676, + "step": 9573 + }, + { + "epoch": 0.97, + "grad_norm": 1.5841152046243077, + "learning_rate": 3.582152380786963e-08, + "loss": 0.5893, + "step": 9574 + }, + { + "epoch": 0.97, + "grad_norm": 1.6703393696365203, + "learning_rate": 3.554346436871581e-08, + "loss": 0.7292, + "step": 9575 + }, + { + "epoch": 0.97, + "grad_norm": 1.6449459827254167, + "learning_rate": 3.52664864154928e-08, + "loss": 0.6117, + "step": 9576 + }, + { + "epoch": 0.97, + "grad_norm": 1.9219088939810345, + "learning_rate": 3.499058997826099e-08, + "loss": 0.7298, + "step": 9577 + }, + { + "epoch": 0.97, + "grad_norm": 1.5935011107809127, + "learning_rate": 3.471577508696533e-08, + "loss": 0.5968, + "step": 9578 + }, + { + "epoch": 0.97, + "grad_norm": 1.5925932403139895, + "learning_rate": 3.444204177143307e-08, + "loss": 0.623, + "step": 9579 + }, + { + "epoch": 0.97, + "grad_norm": 1.782416756084071, + "learning_rate": 3.416939006137265e-08, + "loss": 0.7689, + "step": 9580 + }, + { + "epoch": 0.97, + "grad_norm": 1.7447214103114526, + "learning_rate": 3.389781998637709e-08, + "loss": 0.6998, + "step": 9581 + }, + { + "epoch": 0.97, + "grad_norm": 1.6536370107942204, + "learning_rate": 3.3627331575921686e-08, + "loss": 0.6672, + "step": 9582 + }, + { + "epoch": 0.97, + "grad_norm": 1.7187325303429446, + "learning_rate": 3.3357924859361845e-08, + "loss": 0.7012, + "step": 9583 + }, + { + "epoch": 0.97, + "grad_norm": 1.8955684515759144, + "learning_rate": 3.3089599865938625e-08, + "loss": 0.6829, + "step": 9584 + }, + { + "epoch": 0.97, + "grad_norm": 1.7153222583829413, + "learning_rate": 3.282235662477429e-08, + "loss": 0.7687, + "step": 9585 + }, + { + "epoch": 0.98, + "grad_norm": 1.770308421957577, + "learning_rate": 3.2556195164873405e-08, + "loss": 0.6477, + "step": 9586 + }, + { + "epoch": 0.98, + "grad_norm": 1.629486287642064, + "learning_rate": 3.2291115515125093e-08, + "loss": 0.6571, + "step": 9587 + }, + { + "epoch": 0.98, + "grad_norm": 1.7828578010716696, + "learning_rate": 3.202711770429745e-08, + "loss": 0.7029, + "step": 9588 + }, + { + "epoch": 0.98, + "grad_norm": 1.783499374620443, + "learning_rate": 3.176420176104533e-08, + "loss": 0.6968, + "step": 9589 + }, + { + "epoch": 0.98, + "grad_norm": 1.7423644115712555, + "learning_rate": 3.1502367713903695e-08, + "loss": 0.7301, + "step": 9590 + }, + { + "epoch": 0.98, + "grad_norm": 1.7655218329922557, + "learning_rate": 3.12416155912898e-08, + "loss": 0.6931, + "step": 9591 + }, + { + "epoch": 0.98, + "grad_norm": 1.7484148997191309, + "learning_rate": 3.0981945421504345e-08, + "loss": 0.742, + "step": 9592 + }, + { + "epoch": 0.98, + "grad_norm": 1.6889592247363676, + "learning_rate": 3.0723357232731455e-08, + "loss": 0.6322, + "step": 9593 + }, + { + "epoch": 0.98, + "grad_norm": 1.6742068227188056, + "learning_rate": 3.0465851053035345e-08, + "loss": 0.5973, + "step": 9594 + }, + { + "epoch": 0.98, + "grad_norm": 1.658300994042665, + "learning_rate": 3.0209426910364766e-08, + "loss": 0.7638, + "step": 9595 + }, + { + "epoch": 0.98, + "grad_norm": 1.561736898588684, + "learning_rate": 2.995408483255191e-08, + "loss": 0.6049, + "step": 9596 + }, + { + "epoch": 0.98, + "grad_norm": 1.7656308985229812, + "learning_rate": 2.9699824847307933e-08, + "loss": 0.7248, + "step": 9597 + }, + { + "epoch": 0.98, + "grad_norm": 1.5919797364329935, + "learning_rate": 2.9446646982230763e-08, + "loss": 0.6782, + "step": 9598 + }, + { + "epoch": 0.98, + "grad_norm": 1.7407948021472628, + "learning_rate": 2.919455126479731e-08, + "loss": 0.7177, + "step": 9599 + }, + { + "epoch": 0.98, + "grad_norm": 1.6195766093351134, + "learning_rate": 2.894353772237013e-08, + "loss": 0.567, + "step": 9600 + }, + { + "epoch": 0.98, + "grad_norm": 1.8401774040526315, + "learning_rate": 2.8693606382191876e-08, + "loss": 0.8156, + "step": 9601 + }, + { + "epoch": 0.98, + "grad_norm": 1.586921656800293, + "learning_rate": 2.8444757271388625e-08, + "loss": 0.7336, + "step": 9602 + }, + { + "epoch": 0.98, + "grad_norm": 1.7564114273771738, + "learning_rate": 2.819699041696877e-08, + "loss": 0.7562, + "step": 9603 + }, + { + "epoch": 0.98, + "grad_norm": 1.7466936053595925, + "learning_rate": 2.7950305845825254e-08, + "loss": 0.6833, + "step": 9604 + }, + { + "epoch": 0.98, + "grad_norm": 1.7033376017472277, + "learning_rate": 2.7704703584729985e-08, + "loss": 0.7689, + "step": 9605 + }, + { + "epoch": 0.98, + "grad_norm": 1.5863377258152949, + "learning_rate": 2.7460183660339422e-08, + "loss": 0.6984, + "step": 9606 + }, + { + "epoch": 0.98, + "grad_norm": 2.0272524378790626, + "learning_rate": 2.7216746099193448e-08, + "loss": 0.6807, + "step": 9607 + }, + { + "epoch": 0.98, + "grad_norm": 1.5975543790019915, + "learning_rate": 2.6974390927712034e-08, + "loss": 0.6843, + "step": 9608 + }, + { + "epoch": 0.98, + "grad_norm": 1.6388291697059447, + "learning_rate": 2.6733118172200812e-08, + "loss": 0.6857, + "step": 9609 + }, + { + "epoch": 0.98, + "grad_norm": 1.5592279432842686, + "learning_rate": 2.6492927858844386e-08, + "loss": 0.6555, + "step": 9610 + }, + { + "epoch": 0.98, + "grad_norm": 1.6034240035241638, + "learning_rate": 2.6253820013713017e-08, + "loss": 0.6381, + "step": 9611 + }, + { + "epoch": 0.98, + "grad_norm": 1.6946540536432775, + "learning_rate": 2.6015794662757055e-08, + "loss": 0.6846, + "step": 9612 + }, + { + "epoch": 0.98, + "grad_norm": 1.7225744072643763, + "learning_rate": 2.5778851831811392e-08, + "loss": 0.7251, + "step": 9613 + }, + { + "epoch": 0.98, + "grad_norm": 1.6038628126765848, + "learning_rate": 2.554299154659212e-08, + "loss": 0.6683, + "step": 9614 + }, + { + "epoch": 0.98, + "grad_norm": 1.673729472783224, + "learning_rate": 2.5308213832697658e-08, + "loss": 0.7261, + "step": 9615 + }, + { + "epoch": 0.98, + "grad_norm": 1.6990401901929215, + "learning_rate": 2.5074518715609843e-08, + "loss": 0.7117, + "step": 9616 + }, + { + "epoch": 0.98, + "grad_norm": 1.6665138770587684, + "learning_rate": 2.4841906220692825e-08, + "loss": 0.6896, + "step": 9617 + }, + { + "epoch": 0.98, + "grad_norm": 1.7577331276681882, + "learning_rate": 2.461037637319308e-08, + "loss": 0.6992, + "step": 9618 + }, + { + "epoch": 0.98, + "grad_norm": 1.832379629273432, + "learning_rate": 2.4379929198238285e-08, + "loss": 0.7282, + "step": 9619 + }, + { + "epoch": 0.98, + "grad_norm": 1.752051759087949, + "learning_rate": 2.4150564720841763e-08, + "loss": 0.748, + "step": 9620 + }, + { + "epoch": 0.98, + "grad_norm": 1.7443054920924745, + "learning_rate": 2.3922282965896936e-08, + "loss": 0.6948, + "step": 9621 + }, + { + "epoch": 0.98, + "grad_norm": 1.563872679925648, + "learning_rate": 2.3695083958179543e-08, + "loss": 0.654, + "step": 9622 + }, + { + "epoch": 0.98, + "grad_norm": 1.789764500311137, + "learning_rate": 2.3468967722347635e-08, + "loss": 0.7633, + "step": 9623 + }, + { + "epoch": 0.98, + "grad_norm": 1.7614250422498046, + "learning_rate": 2.3243934282944912e-08, + "loss": 0.6923, + "step": 9624 + }, + { + "epoch": 0.98, + "grad_norm": 1.7086680323811136, + "learning_rate": 2.3019983664394064e-08, + "loss": 0.7576, + "step": 9625 + }, + { + "epoch": 0.98, + "grad_norm": 1.8243468464401964, + "learning_rate": 2.2797115891002308e-08, + "loss": 0.7506, + "step": 9626 + }, + { + "epoch": 0.98, + "grad_norm": 1.8382359181164998, + "learning_rate": 2.2575330986956968e-08, + "loss": 0.6899, + "step": 9627 + }, + { + "epoch": 0.98, + "grad_norm": 1.711964555016779, + "learning_rate": 2.2354628976328786e-08, + "loss": 0.6424, + "step": 9628 + }, + { + "epoch": 0.98, + "grad_norm": 1.6574125129336976, + "learning_rate": 2.2135009883074155e-08, + "loss": 0.5457, + "step": 9629 + }, + { + "epoch": 0.98, + "grad_norm": 1.470929810445323, + "learning_rate": 2.1916473731027343e-08, + "loss": 0.6801, + "step": 9630 + }, + { + "epoch": 0.98, + "grad_norm": 1.7670042059843676, + "learning_rate": 2.1699020543907157e-08, + "loss": 0.6014, + "step": 9631 + }, + { + "epoch": 0.98, + "grad_norm": 1.8821606786857161, + "learning_rate": 2.1482650345315826e-08, + "loss": 0.665, + "step": 9632 + }, + { + "epoch": 0.98, + "grad_norm": 1.6664184114715632, + "learning_rate": 2.1267363158735676e-08, + "loss": 0.7083, + "step": 9633 + }, + { + "epoch": 0.98, + "grad_norm": 1.705911715860228, + "learning_rate": 2.1053159007533575e-08, + "loss": 0.6824, + "step": 9634 + }, + { + "epoch": 0.98, + "grad_norm": 1.7456083344015636, + "learning_rate": 2.0840037914958698e-08, + "loss": 0.6966, + "step": 9635 + }, + { + "epoch": 0.98, + "grad_norm": 1.6545046167332191, + "learning_rate": 2.0627999904139218e-08, + "loss": 0.6038, + "step": 9636 + }, + { + "epoch": 0.98, + "grad_norm": 1.8839922766007815, + "learning_rate": 2.041704499809227e-08, + "loss": 0.7633, + "step": 9637 + }, + { + "epoch": 0.98, + "grad_norm": 1.6442871165201445, + "learning_rate": 2.0207173219710664e-08, + "loss": 0.7207, + "step": 9638 + }, + { + "epoch": 0.98, + "grad_norm": 1.8057249181482182, + "learning_rate": 1.9998384591773945e-08, + "loss": 0.7916, + "step": 9639 + }, + { + "epoch": 0.98, + "grad_norm": 1.6379867021567172, + "learning_rate": 1.979067913694399e-08, + "loss": 0.714, + "step": 9640 + }, + { + "epoch": 0.98, + "grad_norm": 1.7978173995445093, + "learning_rate": 1.9584056877761658e-08, + "loss": 0.6308, + "step": 9641 + }, + { + "epoch": 0.98, + "grad_norm": 1.8522532972135437, + "learning_rate": 1.9378517836653454e-08, + "loss": 0.6179, + "step": 9642 + }, + { + "epoch": 0.98, + "grad_norm": 1.6906803348048336, + "learning_rate": 1.91740620359282e-08, + "loss": 0.6019, + "step": 9643 + }, + { + "epoch": 0.98, + "grad_norm": 1.7531966996092696, + "learning_rate": 1.8970689497775917e-08, + "loss": 0.675, + "step": 9644 + }, + { + "epoch": 0.98, + "grad_norm": 1.728651057249247, + "learning_rate": 1.876840024427007e-08, + "loss": 0.7092, + "step": 9645 + }, + { + "epoch": 0.98, + "grad_norm": 1.5379281297714842, + "learning_rate": 1.85671942973642e-08, + "loss": 0.6308, + "step": 9646 + }, + { + "epoch": 0.98, + "grad_norm": 1.7675827172247032, + "learning_rate": 1.8367071678897507e-08, + "loss": 0.6974, + "step": 9647 + }, + { + "epoch": 0.98, + "grad_norm": 1.7158119619808374, + "learning_rate": 1.8168032410590398e-08, + "loss": 0.6727, + "step": 9648 + }, + { + "epoch": 0.98, + "grad_norm": 1.643652158228756, + "learning_rate": 1.7970076514044476e-08, + "loss": 0.8226, + "step": 9649 + }, + { + "epoch": 0.98, + "grad_norm": 1.877227745255658, + "learning_rate": 1.7773204010745892e-08, + "loss": 0.6955, + "step": 9650 + }, + { + "epoch": 0.98, + "grad_norm": 1.8893676527357877, + "learning_rate": 1.7577414922060888e-08, + "loss": 0.7242, + "step": 9651 + }, + { + "epoch": 0.98, + "grad_norm": 1.627296937520523, + "learning_rate": 1.738270926924024e-08, + "loss": 0.6402, + "step": 9652 + }, + { + "epoch": 0.98, + "grad_norm": 1.616663575398968, + "learning_rate": 1.7189087073415933e-08, + "loss": 0.7571, + "step": 9653 + }, + { + "epoch": 0.98, + "grad_norm": 1.6853571802209921, + "learning_rate": 1.699654835560116e-08, + "loss": 0.6763, + "step": 9654 + }, + { + "epoch": 0.98, + "grad_norm": 1.854656056250111, + "learning_rate": 1.680509313669587e-08, + "loss": 0.6867, + "step": 9655 + }, + { + "epoch": 0.98, + "grad_norm": 1.4804921632252286, + "learning_rate": 1.6614721437477887e-08, + "loss": 0.6332, + "step": 9656 + }, + { + "epoch": 0.98, + "grad_norm": 1.6700954955835952, + "learning_rate": 1.642543327860846e-08, + "loss": 0.6996, + "step": 9657 + }, + { + "epoch": 0.98, + "grad_norm": 1.592121528151136, + "learning_rate": 1.6237228680633376e-08, + "loss": 0.6434, + "step": 9658 + }, + { + "epoch": 0.98, + "grad_norm": 1.6533858632363638, + "learning_rate": 1.605010766397741e-08, + "loss": 0.8153, + "step": 9659 + }, + { + "epoch": 0.98, + "grad_norm": 1.804843545317245, + "learning_rate": 1.586407024895209e-08, + "loss": 0.6773, + "step": 9660 + }, + { + "epoch": 0.98, + "grad_norm": 1.4865888180732818, + "learning_rate": 1.5679116455746823e-08, + "loss": 0.6879, + "step": 9661 + }, + { + "epoch": 0.98, + "grad_norm": 1.7862992328554177, + "learning_rate": 1.5495246304435552e-08, + "loss": 0.7552, + "step": 9662 + }, + { + "epoch": 0.98, + "grad_norm": 1.8059227397518534, + "learning_rate": 1.5312459814975644e-08, + "loss": 0.6882, + "step": 9663 + }, + { + "epoch": 0.98, + "grad_norm": 1.5367676202214664, + "learning_rate": 1.5130757007205676e-08, + "loss": 0.5667, + "step": 9664 + }, + { + "epoch": 0.98, + "grad_norm": 1.836586511875315, + "learning_rate": 1.495013790084654e-08, + "loss": 0.7366, + "step": 9665 + }, + { + "epoch": 0.98, + "grad_norm": 1.7310378350968094, + "learning_rate": 1.4770602515500332e-08, + "loss": 0.6892, + "step": 9666 + }, + { + "epoch": 0.98, + "grad_norm": 1.7893982283088452, + "learning_rate": 1.4592150870653688e-08, + "loss": 0.7379, + "step": 9667 + }, + { + "epoch": 0.98, + "grad_norm": 1.6391847665314565, + "learning_rate": 1.4414782985674447e-08, + "loss": 0.6567, + "step": 9668 + }, + { + "epoch": 0.98, + "grad_norm": 1.648479239098497, + "learning_rate": 1.4238498879813878e-08, + "loss": 0.6808, + "step": 9669 + }, + { + "epoch": 0.98, + "grad_norm": 1.6315940355015648, + "learning_rate": 1.4063298572204454e-08, + "loss": 0.7107, + "step": 9670 + }, + { + "epoch": 0.98, + "grad_norm": 1.652497062929891, + "learning_rate": 1.3889182081860963e-08, + "loss": 0.7429, + "step": 9671 + }, + { + "epoch": 0.98, + "grad_norm": 1.646812479137008, + "learning_rate": 1.3716149427682734e-08, + "loss": 0.6522, + "step": 9672 + }, + { + "epoch": 0.98, + "grad_norm": 1.6886811009695795, + "learning_rate": 1.3544200628446968e-08, + "loss": 0.6619, + "step": 9673 + }, + { + "epoch": 0.98, + "grad_norm": 1.7001777649396592, + "learning_rate": 1.3373335702818735e-08, + "loss": 0.7377, + "step": 9674 + }, + { + "epoch": 0.98, + "grad_norm": 1.5931986521862718, + "learning_rate": 1.3203554669339868e-08, + "loss": 0.6784, + "step": 9675 + }, + { + "epoch": 0.98, + "grad_norm": 1.5438624429645642, + "learning_rate": 1.3034857546441182e-08, + "loss": 0.5341, + "step": 9676 + }, + { + "epoch": 0.98, + "grad_norm": 1.4887640945970237, + "learning_rate": 1.2867244352428033e-08, + "loss": 0.6961, + "step": 9677 + }, + { + "epoch": 0.98, + "grad_norm": 1.7453192466708156, + "learning_rate": 1.2700715105495865e-08, + "loss": 0.6643, + "step": 9678 + }, + { + "epoch": 0.98, + "grad_norm": 1.9154223364917178, + "learning_rate": 1.2535269823716889e-08, + "loss": 0.7022, + "step": 9679 + }, + { + "epoch": 0.98, + "grad_norm": 1.6207314273673628, + "learning_rate": 1.2370908525046744e-08, + "loss": 0.7324, + "step": 9680 + }, + { + "epoch": 0.98, + "grad_norm": 1.781701331962845, + "learning_rate": 1.2207631227326711e-08, + "loss": 0.6468, + "step": 9681 + }, + { + "epoch": 0.98, + "grad_norm": 1.766644934566483, + "learning_rate": 1.2045437948275952e-08, + "loss": 0.7227, + "step": 9682 + }, + { + "epoch": 0.98, + "grad_norm": 1.7699821522867707, + "learning_rate": 1.188432870549927e-08, + "loss": 0.7304, + "step": 9683 + }, + { + "epoch": 0.99, + "grad_norm": 1.7396705355516582, + "learning_rate": 1.1724303516481572e-08, + "loss": 0.6502, + "step": 9684 + }, + { + "epoch": 0.99, + "grad_norm": 1.6905569226968575, + "learning_rate": 1.1565362398592295e-08, + "loss": 0.8001, + "step": 9685 + }, + { + "epoch": 0.99, + "grad_norm": 1.6747194976240847, + "learning_rate": 1.1407505369080973e-08, + "loss": 0.6568, + "step": 9686 + }, + { + "epoch": 0.99, + "grad_norm": 1.642429861783825, + "learning_rate": 1.1250732445080569e-08, + "loss": 0.6395, + "step": 9687 + }, + { + "epoch": 0.99, + "grad_norm": 1.6875176860247043, + "learning_rate": 1.1095043643606363e-08, + "loss": 0.7886, + "step": 9688 + }, + { + "epoch": 0.99, + "grad_norm": 1.8228693514058845, + "learning_rate": 1.0940438981555945e-08, + "loss": 0.7219, + "step": 9689 + }, + { + "epoch": 0.99, + "grad_norm": 1.7675485509518638, + "learning_rate": 1.0786918475710339e-08, + "loss": 0.6434, + "step": 9690 + }, + { + "epoch": 0.99, + "grad_norm": 1.647818234835014, + "learning_rate": 1.0634482142730662e-08, + "loss": 0.7144, + "step": 9691 + }, + { + "epoch": 0.99, + "grad_norm": 1.7718676572281018, + "learning_rate": 1.0483129999161456e-08, + "loss": 0.6408, + "step": 9692 + }, + { + "epoch": 0.99, + "grad_norm": 1.6318067602617774, + "learning_rate": 1.0332862061429583e-08, + "loss": 0.6004, + "step": 9693 + }, + { + "epoch": 0.99, + "grad_norm": 1.6351165397625582, + "learning_rate": 1.0183678345845328e-08, + "loss": 0.7552, + "step": 9694 + }, + { + "epoch": 0.99, + "grad_norm": 1.5644902946973864, + "learning_rate": 1.0035578868600182e-08, + "loss": 0.6444, + "step": 9695 + }, + { + "epoch": 0.99, + "grad_norm": 1.6428946667146398, + "learning_rate": 9.888563645765736e-09, + "loss": 0.5811, + "step": 9696 + }, + { + "epoch": 0.99, + "grad_norm": 1.5742527303659848, + "learning_rate": 9.742632693301445e-09, + "loss": 0.692, + "step": 9697 + }, + { + "epoch": 0.99, + "grad_norm": 1.7736699263772455, + "learning_rate": 9.597786027042422e-09, + "loss": 0.7327, + "step": 9698 + }, + { + "epoch": 0.99, + "grad_norm": 1.9491332009997895, + "learning_rate": 9.454023662712752e-09, + "loss": 0.6928, + "step": 9699 + }, + { + "epoch": 0.99, + "grad_norm": 1.7185475542191888, + "learning_rate": 9.311345615913291e-09, + "loss": 0.6323, + "step": 9700 + }, + { + "epoch": 0.99, + "grad_norm": 1.5105438695817894, + "learning_rate": 9.169751902131652e-09, + "loss": 0.6499, + "step": 9701 + }, + { + "epoch": 0.99, + "grad_norm": 1.514827773029279, + "learning_rate": 9.029242536733318e-09, + "loss": 0.6234, + "step": 9702 + }, + { + "epoch": 0.99, + "grad_norm": 1.6461197241040941, + "learning_rate": 8.889817534969425e-09, + "loss": 0.6747, + "step": 9703 + }, + { + "epoch": 0.99, + "grad_norm": 1.7957496588047202, + "learning_rate": 8.751476911972313e-09, + "loss": 0.7837, + "step": 9704 + }, + { + "epoch": 0.99, + "grad_norm": 1.6629104613611898, + "learning_rate": 8.614220682756635e-09, + "loss": 0.6097, + "step": 9705 + }, + { + "epoch": 0.99, + "grad_norm": 1.6919638021174255, + "learning_rate": 8.478048862219368e-09, + "loss": 0.7037, + "step": 9706 + }, + { + "epoch": 0.99, + "grad_norm": 1.5271956567781249, + "learning_rate": 8.342961465140908e-09, + "loss": 0.5894, + "step": 9707 + }, + { + "epoch": 0.99, + "grad_norm": 1.6611920184684048, + "learning_rate": 8.208958506181752e-09, + "loss": 0.7251, + "step": 9708 + }, + { + "epoch": 0.99, + "grad_norm": 1.7030516189999174, + "learning_rate": 8.076039999885821e-09, + "loss": 0.7094, + "step": 9709 + }, + { + "epoch": 0.99, + "grad_norm": 1.5774737119575863, + "learning_rate": 7.944205960678242e-09, + "loss": 0.6555, + "step": 9710 + }, + { + "epoch": 0.99, + "grad_norm": 1.5424960825862553, + "learning_rate": 7.813456402870901e-09, + "loss": 0.7548, + "step": 9711 + }, + { + "epoch": 0.99, + "grad_norm": 1.9115060508268176, + "learning_rate": 7.683791340651337e-09, + "loss": 0.6898, + "step": 9712 + }, + { + "epoch": 0.99, + "grad_norm": 1.5520360612107897, + "learning_rate": 7.555210788093847e-09, + "loss": 0.7203, + "step": 9713 + }, + { + "epoch": 0.99, + "grad_norm": 1.73846643821148, + "learning_rate": 7.427714759153937e-09, + "loss": 0.7294, + "step": 9714 + }, + { + "epoch": 0.99, + "grad_norm": 1.7606361370390762, + "learning_rate": 7.301303267669424e-09, + "loss": 0.6608, + "step": 9715 + }, + { + "epoch": 0.99, + "grad_norm": 1.8115631849090241, + "learning_rate": 7.1759763273604454e-09, + "loss": 0.814, + "step": 9716 + }, + { + "epoch": 0.99, + "grad_norm": 1.6783185631212085, + "learning_rate": 7.051733951828343e-09, + "loss": 0.6538, + "step": 9717 + }, + { + "epoch": 0.99, + "grad_norm": 1.618824346005183, + "learning_rate": 6.928576154558997e-09, + "loss": 0.5846, + "step": 9718 + }, + { + "epoch": 0.99, + "grad_norm": 1.8248486875496646, + "learning_rate": 6.806502948918381e-09, + "loss": 0.7861, + "step": 9719 + }, + { + "epoch": 0.99, + "grad_norm": 1.6413599961753038, + "learning_rate": 6.685514348154787e-09, + "loss": 0.6407, + "step": 9720 + }, + { + "epoch": 0.99, + "grad_norm": 1.8528942467367489, + "learning_rate": 6.565610365402153e-09, + "loss": 0.738, + "step": 9721 + }, + { + "epoch": 0.99, + "grad_norm": 1.818498691949131, + "learning_rate": 6.446791013671183e-09, + "loss": 0.7306, + "step": 9722 + }, + { + "epoch": 0.99, + "grad_norm": 1.7186762257651678, + "learning_rate": 6.329056305860448e-09, + "loss": 0.6696, + "step": 9723 + }, + { + "epoch": 0.99, + "grad_norm": 1.757030635077287, + "learning_rate": 6.212406254746395e-09, + "loss": 0.7701, + "step": 9724 + }, + { + "epoch": 0.99, + "grad_norm": 1.5623043797000618, + "learning_rate": 6.096840872991117e-09, + "loss": 0.662, + "step": 9725 + }, + { + "epoch": 0.99, + "grad_norm": 1.5841618328268379, + "learning_rate": 5.9823601731356925e-09, + "loss": 0.748, + "step": 9726 + }, + { + "epoch": 0.99, + "grad_norm": 1.6043392083126362, + "learning_rate": 5.86896416760685e-09, + "loss": 0.579, + "step": 9727 + }, + { + "epoch": 0.99, + "grad_norm": 1.9265682270468332, + "learning_rate": 5.7566528687114095e-09, + "loss": 0.7247, + "step": 9728 + }, + { + "epoch": 0.99, + "grad_norm": 1.691440943069805, + "learning_rate": 5.645426288638511e-09, + "loss": 0.7415, + "step": 9729 + }, + { + "epoch": 0.99, + "grad_norm": 1.6450176668324605, + "learning_rate": 5.53528443945961e-09, + "loss": 0.6279, + "step": 9730 + }, + { + "epoch": 0.99, + "grad_norm": 1.7378168961948732, + "learning_rate": 5.426227333130696e-09, + "loss": 0.697, + "step": 9731 + }, + { + "epoch": 0.99, + "grad_norm": 1.7252926169952139, + "learning_rate": 5.318254981486748e-09, + "loss": 0.6545, + "step": 9732 + }, + { + "epoch": 0.99, + "grad_norm": 1.7953360616753136, + "learning_rate": 5.21136739624617e-09, + "loss": 0.7161, + "step": 9733 + }, + { + "epoch": 0.99, + "grad_norm": 1.6194759118926911, + "learning_rate": 5.105564589011902e-09, + "loss": 0.677, + "step": 9734 + }, + { + "epoch": 0.99, + "grad_norm": 1.7126316233812235, + "learning_rate": 5.000846571264762e-09, + "loss": 0.6515, + "step": 9735 + }, + { + "epoch": 0.99, + "grad_norm": 1.9346810637478329, + "learning_rate": 4.897213354372321e-09, + "loss": 0.6876, + "step": 9736 + }, + { + "epoch": 0.99, + "grad_norm": 2.209066745028407, + "learning_rate": 4.7946649495811405e-09, + "loss": 0.7986, + "step": 9737 + }, + { + "epoch": 0.99, + "grad_norm": 1.8773927683013005, + "learning_rate": 4.693201368021205e-09, + "loss": 0.7163, + "step": 9738 + }, + { + "epoch": 0.99, + "grad_norm": 1.8109093998592674, + "learning_rate": 4.592822620705928e-09, + "loss": 0.7481, + "step": 9739 + }, + { + "epoch": 0.99, + "grad_norm": 1.5717984737809685, + "learning_rate": 4.493528718528817e-09, + "loss": 0.6516, + "step": 9740 + }, + { + "epoch": 0.99, + "grad_norm": 1.6172161663400688, + "learning_rate": 4.395319672266807e-09, + "loss": 0.6364, + "step": 9741 + }, + { + "epoch": 0.99, + "grad_norm": 1.6682760972502344, + "learning_rate": 4.2981954925780385e-09, + "loss": 0.7554, + "step": 9742 + }, + { + "epoch": 0.99, + "grad_norm": 1.868953172176466, + "learning_rate": 4.202156190006301e-09, + "loss": 0.7247, + "step": 9743 + }, + { + "epoch": 0.99, + "grad_norm": 1.7716773681131424, + "learning_rate": 4.1072017749732574e-09, + "loss": 0.6867, + "step": 9744 + }, + { + "epoch": 0.99, + "grad_norm": 1.5398947349645191, + "learning_rate": 4.013332257785107e-09, + "loss": 0.5854, + "step": 9745 + }, + { + "epoch": 0.99, + "grad_norm": 1.52256409465704, + "learning_rate": 3.920547648630368e-09, + "loss": 0.6266, + "step": 9746 + }, + { + "epoch": 0.99, + "grad_norm": 1.7581089180332012, + "learning_rate": 3.828847957577653e-09, + "loss": 0.6372, + "step": 9747 + }, + { + "epoch": 0.99, + "grad_norm": 1.596383318497569, + "learning_rate": 3.73823319458233e-09, + "loss": 0.6177, + "step": 9748 + }, + { + "epoch": 0.99, + "grad_norm": 1.6520114837204025, + "learning_rate": 3.6487033694776466e-09, + "loss": 0.6509, + "step": 9749 + }, + { + "epoch": 0.99, + "grad_norm": 1.7583325520760777, + "learning_rate": 3.560258491980273e-09, + "loss": 0.7194, + "step": 9750 + }, + { + "epoch": 0.99, + "grad_norm": 1.596475826131414, + "learning_rate": 3.4728985716903083e-09, + "loss": 0.7752, + "step": 9751 + }, + { + "epoch": 0.99, + "grad_norm": 1.436844800858631, + "learning_rate": 3.3866236180879476e-09, + "loss": 0.6588, + "step": 9752 + }, + { + "epoch": 0.99, + "grad_norm": 1.8173776459488546, + "learning_rate": 3.3014336405390313e-09, + "loss": 0.7334, + "step": 9753 + }, + { + "epoch": 0.99, + "grad_norm": 1.7236920470298163, + "learning_rate": 3.2173286482883868e-09, + "loss": 0.7585, + "step": 9754 + }, + { + "epoch": 0.99, + "grad_norm": 1.6707557679087048, + "learning_rate": 3.1343086504653785e-09, + "loss": 0.6727, + "step": 9755 + }, + { + "epoch": 0.99, + "grad_norm": 1.559160997871232, + "learning_rate": 3.0523736560783558e-09, + "loss": 0.6343, + "step": 9756 + }, + { + "epoch": 0.99, + "grad_norm": 1.6592056636731305, + "learning_rate": 2.971523674022425e-09, + "loss": 0.6621, + "step": 9757 + }, + { + "epoch": 0.99, + "grad_norm": 1.7959185329193494, + "learning_rate": 2.8917587130705695e-09, + "loss": 0.7023, + "step": 9758 + }, + { + "epoch": 0.99, + "grad_norm": 1.5316612752649197, + "learning_rate": 2.8130787818814177e-09, + "loss": 0.7049, + "step": 9759 + }, + { + "epoch": 0.99, + "grad_norm": 1.5013465330785167, + "learning_rate": 2.7354838889948055e-09, + "loss": 0.676, + "step": 9760 + }, + { + "epoch": 0.99, + "grad_norm": 1.6680553354247996, + "learning_rate": 2.6589740428306644e-09, + "loss": 0.7261, + "step": 9761 + }, + { + "epoch": 0.99, + "grad_norm": 1.5739749500286022, + "learning_rate": 2.5835492516945725e-09, + "loss": 0.5215, + "step": 9762 + }, + { + "epoch": 0.99, + "grad_norm": 1.80329783598832, + "learning_rate": 2.5092095237722048e-09, + "loss": 0.7339, + "step": 9763 + }, + { + "epoch": 0.99, + "grad_norm": 1.7697361499388937, + "learning_rate": 2.4359548671315515e-09, + "loss": 0.783, + "step": 9764 + }, + { + "epoch": 0.99, + "grad_norm": 1.8060799010027864, + "learning_rate": 2.36378528972292e-09, + "loss": 0.6659, + "step": 9765 + }, + { + "epoch": 0.99, + "grad_norm": 1.5320767192304863, + "learning_rate": 2.2927007993811535e-09, + "loss": 0.6163, + "step": 9766 + }, + { + "epoch": 0.99, + "grad_norm": 1.8341164287247476, + "learning_rate": 2.222701403818972e-09, + "loss": 0.7484, + "step": 9767 + }, + { + "epoch": 0.99, + "grad_norm": 1.6898445181700266, + "learning_rate": 2.153787110634742e-09, + "loss": 0.7118, + "step": 9768 + }, + { + "epoch": 0.99, + "grad_norm": 1.73467212577717, + "learning_rate": 2.0859579273091457e-09, + "loss": 0.6583, + "step": 9769 + }, + { + "epoch": 0.99, + "grad_norm": 1.8342421843135626, + "learning_rate": 2.019213861201852e-09, + "loss": 0.8027, + "step": 9770 + }, + { + "epoch": 0.99, + "grad_norm": 1.7248601263057854, + "learning_rate": 1.953554919559286e-09, + "loss": 0.7182, + "step": 9771 + }, + { + "epoch": 0.99, + "grad_norm": 1.7852646128943321, + "learning_rate": 1.8889811095046396e-09, + "loss": 0.6439, + "step": 9772 + }, + { + "epoch": 0.99, + "grad_norm": 1.797497617117918, + "learning_rate": 1.8254924380489702e-09, + "loss": 0.7096, + "step": 9773 + }, + { + "epoch": 0.99, + "grad_norm": 1.6025539122176873, + "learning_rate": 1.7630889120823224e-09, + "loss": 0.6443, + "step": 9774 + }, + { + "epoch": 0.99, + "grad_norm": 1.5688322495059948, + "learning_rate": 1.7017705383781668e-09, + "loss": 0.7314, + "step": 9775 + }, + { + "epoch": 0.99, + "grad_norm": 1.7617953713834467, + "learning_rate": 1.64153732359007e-09, + "loss": 0.6941, + "step": 9776 + }, + { + "epoch": 0.99, + "grad_norm": 1.8437779372800862, + "learning_rate": 1.5823892742561352e-09, + "loss": 0.6198, + "step": 9777 + }, + { + "epoch": 0.99, + "grad_norm": 1.5327724689221702, + "learning_rate": 1.5243263967956722e-09, + "loss": 0.7511, + "step": 9778 + }, + { + "epoch": 0.99, + "grad_norm": 1.632644536650647, + "learning_rate": 1.467348697511417e-09, + "loss": 0.7554, + "step": 9779 + }, + { + "epoch": 0.99, + "grad_norm": 1.8949552888377248, + "learning_rate": 1.411456182587312e-09, + "loss": 0.7169, + "step": 9780 + }, + { + "epoch": 0.99, + "grad_norm": 1.6767940261577814, + "learning_rate": 1.356648858088505e-09, + "loss": 0.6894, + "step": 9781 + }, + { + "epoch": 1.0, + "grad_norm": 1.7349273531608402, + "learning_rate": 1.302926729964682e-09, + "loss": 0.6867, + "step": 9782 + }, + { + "epoch": 1.0, + "grad_norm": 1.6491775044986772, + "learning_rate": 1.2502898040456235e-09, + "loss": 0.6245, + "step": 9783 + }, + { + "epoch": 1.0, + "grad_norm": 1.5181692314067778, + "learning_rate": 1.1987380860456477e-09, + "loss": 0.6693, + "step": 9784 + }, + { + "epoch": 1.0, + "grad_norm": 1.5327161046836033, + "learning_rate": 1.148271581558058e-09, + "loss": 0.6834, + "step": 9785 + }, + { + "epoch": 1.0, + "grad_norm": 1.6414243607298673, + "learning_rate": 1.098890296060695e-09, + "loss": 0.6154, + "step": 9786 + }, + { + "epoch": 1.0, + "grad_norm": 1.844885336324718, + "learning_rate": 1.0505942349137155e-09, + "loss": 0.8566, + "step": 9787 + }, + { + "epoch": 1.0, + "grad_norm": 1.5735819481588773, + "learning_rate": 1.003383403358482e-09, + "loss": 0.7284, + "step": 9788 + }, + { + "epoch": 1.0, + "grad_norm": 1.7929415551134282, + "learning_rate": 9.572578065197846e-10, + "loss": 0.6794, + "step": 9789 + }, + { + "epoch": 1.0, + "grad_norm": 1.8147303747186208, + "learning_rate": 9.122174494025082e-10, + "loss": 0.6762, + "step": 9790 + }, + { + "epoch": 1.0, + "grad_norm": 1.8998667504295066, + "learning_rate": 8.682623368971854e-10, + "loss": 0.7023, + "step": 9791 + }, + { + "epoch": 1.0, + "grad_norm": 1.5999912018017683, + "learning_rate": 8.253924737711139e-10, + "loss": 0.6024, + "step": 9792 + }, + { + "epoch": 1.0, + "grad_norm": 1.6802611122367843, + "learning_rate": 7.83607864680569e-10, + "loss": 0.677, + "step": 9793 + }, + { + "epoch": 1.0, + "grad_norm": 1.9299673966168387, + "learning_rate": 7.429085141585912e-10, + "loss": 0.6575, + "step": 9794 + }, + { + "epoch": 1.0, + "grad_norm": 1.8079801054976152, + "learning_rate": 7.032944266227582e-10, + "loss": 0.7263, + "step": 9795 + }, + { + "epoch": 1.0, + "grad_norm": 1.7053584530993402, + "learning_rate": 6.647656063729635e-10, + "loss": 0.7107, + "step": 9796 + }, + { + "epoch": 1.0, + "grad_norm": 1.752559768160014, + "learning_rate": 6.273220575914174e-10, + "loss": 0.6197, + "step": 9797 + }, + { + "epoch": 1.0, + "grad_norm": 1.6022948579476681, + "learning_rate": 5.909637843404259e-10, + "loss": 0.7039, + "step": 9798 + }, + { + "epoch": 1.0, + "grad_norm": 1.6826875055361312, + "learning_rate": 5.556907905679421e-10, + "loss": 0.6946, + "step": 9799 + }, + { + "epoch": 1.0, + "grad_norm": 1.6338947651630946, + "learning_rate": 5.215030801009046e-10, + "loss": 0.7023, + "step": 9800 + }, + { + "epoch": 1.0, + "grad_norm": 1.669612829015242, + "learning_rate": 4.884006566496791e-10, + "loss": 0.7131, + "step": 9801 + }, + { + "epoch": 1.0, + "grad_norm": 1.595339314335866, + "learning_rate": 4.5638352380805716e-10, + "loss": 0.5837, + "step": 9802 + }, + { + "epoch": 1.0, + "grad_norm": 1.7544681428451936, + "learning_rate": 4.2545168505103707e-10, + "loss": 0.6499, + "step": 9803 + }, + { + "epoch": 1.0, + "grad_norm": 1.6948058791459002, + "learning_rate": 3.9560514373593317e-10, + "loss": 0.6961, + "step": 9804 + }, + { + "epoch": 1.0, + "grad_norm": 1.7957334177664486, + "learning_rate": 3.6684390310015584e-10, + "loss": 0.7444, + "step": 9805 + }, + { + "epoch": 1.0, + "grad_norm": 1.5597201209989984, + "learning_rate": 3.391679662678726e-10, + "loss": 0.6784, + "step": 9806 + }, + { + "epoch": 1.0, + "grad_norm": 1.8096052129609148, + "learning_rate": 3.1257733624112665e-10, + "loss": 0.6936, + "step": 9807 + }, + { + "epoch": 1.0, + "grad_norm": 1.6849798366893498, + "learning_rate": 2.8707201590649767e-10, + "loss": 0.6742, + "step": 9808 + }, + { + "epoch": 1.0, + "grad_norm": 1.838312020306633, + "learning_rate": 2.6265200803177184e-10, + "loss": 0.7065, + "step": 9809 + }, + { + "epoch": 1.0, + "grad_norm": 1.767900214133964, + "learning_rate": 2.3931731526927183e-10, + "loss": 0.8026, + "step": 9810 + }, + { + "epoch": 1.0, + "grad_norm": 1.5904778245566455, + "learning_rate": 2.1706794014919596e-10, + "loss": 0.7111, + "step": 9811 + }, + { + "epoch": 1.0, + "grad_norm": 1.5468126215436857, + "learning_rate": 1.9590388508627934e-10, + "loss": 0.7569, + "step": 9812 + }, + { + "epoch": 1.0, + "grad_norm": 1.5375661558070106, + "learning_rate": 1.758251523797938e-10, + "loss": 0.611, + "step": 9813 + }, + { + "epoch": 1.0, + "grad_norm": 1.8357286117999407, + "learning_rate": 1.5683174420799696e-10, + "loss": 0.7772, + "step": 9814 + }, + { + "epoch": 1.0, + "grad_norm": 2.1584454747868387, + "learning_rate": 1.3892366263146273e-10, + "loss": 0.77, + "step": 9815 + }, + { + "epoch": 1.0, + "grad_norm": 1.674939245192305, + "learning_rate": 1.2210090959419162e-10, + "loss": 0.609, + "step": 9816 + }, + { + "epoch": 1.0, + "grad_norm": 1.5174189719877236, + "learning_rate": 1.0636348692250054e-10, + "loss": 0.585, + "step": 9817 + }, + { + "epoch": 1.0, + "grad_norm": 1.6701519955880866, + "learning_rate": 9.171139632391245e-11, + "loss": 0.748, + "step": 9818 + }, + { + "epoch": 1.0, + "grad_norm": 1.5132318407758787, + "learning_rate": 7.8144639389377e-11, + "loss": 0.5916, + "step": 9819 + }, + { + "epoch": 1.0, + "grad_norm": 1.6711317998681496, + "learning_rate": 6.566321759104987e-11, + "loss": 0.7035, + "step": 9820 + }, + { + "epoch": 1.0, + "grad_norm": 1.7089305325553747, + "learning_rate": 5.426713228340319e-11, + "loss": 0.6716, + "step": 9821 + }, + { + "epoch": 1.0, + "grad_norm": 1.7533657894024617, + "learning_rate": 4.3956384703225384e-11, + "loss": 0.7033, + "step": 9822 + }, + { + "epoch": 1.0, + "grad_norm": 1.7866771307941718, + "learning_rate": 3.4730975969621275e-11, + "loss": 0.7179, + "step": 9823 + }, + { + "epoch": 1.0, + "grad_norm": 1.7858168287448422, + "learning_rate": 2.659090708401202e-11, + "loss": 0.6644, + "step": 9824 + }, + { + "epoch": 1.0, + "grad_norm": 1.849749275071706, + "learning_rate": 1.9536178930135152e-11, + "loss": 0.586, + "step": 9825 + }, + { + "epoch": 1.0, + "grad_norm": 1.779817554106832, + "learning_rate": 1.356679227404456e-11, + "loss": 0.6623, + "step": 9826 + }, + { + "epoch": 1.0, + "grad_norm": 1.7365065100153105, + "learning_rate": 8.682747761890042e-12, + "loss": 0.6541, + "step": 9827 + }, + { + "epoch": 1.0, + "grad_norm": 1.7159852243362201, + "learning_rate": 4.884045925468428e-12, + "loss": 0.6903, + "step": 9828 + }, + { + "epoch": 1.0, + "grad_norm": 1.9049278490404138, + "learning_rate": 2.170687175562236e-12, + "loss": 0.5288, + "step": 9829 + }, + { + "epoch": 1.0, + "grad_norm": 1.6298596172428135, + "learning_rate": 5.426718086010141e-13, + "loss": 0.7138, + "step": 9830 + }, + { + "epoch": 1.0, + "grad_norm": 1.6170033821053835, + "learning_rate": 0.0, + "loss": 0.7636, + "step": 9831 + }, + { + "epoch": 1.0, + "step": 9831, + "total_flos": 1617537550008320.0, + "train_loss": 0.7271600768035261, + "train_runtime": 125510.4499, + "train_samples_per_second": 10.025, + "train_steps_per_second": 0.078 + } + ], + "logging_steps": 1.0, + "max_steps": 9831, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "total_flos": 1617537550008320.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}