diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9537 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9966777408637872, + "eval_steps": 500, + "global_step": 1353, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0022148394241417496, + "grad_norm": 0.0721800684354911, + "learning_rate": 1.4705882352941177e-06, + "loss": 0.1063, + "step": 1 + }, + { + "epoch": 0.004429678848283499, + "grad_norm": 0.09335411871176735, + "learning_rate": 2.9411764705882355e-06, + "loss": 0.0969, + "step": 2 + }, + { + "epoch": 0.006644518272425249, + "grad_norm": 0.08944846268192445, + "learning_rate": 4.411764705882353e-06, + "loss": 0.1073, + "step": 3 + }, + { + "epoch": 0.008859357696566999, + "grad_norm": 0.08447513716451245, + "learning_rate": 5.882352941176471e-06, + "loss": 0.1187, + "step": 4 + }, + { + "epoch": 0.01107419712070875, + "grad_norm": 0.08080094521695665, + "learning_rate": 7.3529411764705884e-06, + "loss": 0.1025, + "step": 5 + }, + { + "epoch": 0.013289036544850499, + "grad_norm": 0.08868925145042288, + "learning_rate": 8.823529411764707e-06, + "loss": 0.1075, + "step": 6 + }, + { + "epoch": 0.015503875968992248, + "grad_norm": 0.08322582584273044, + "learning_rate": 1.0294117647058824e-05, + "loss": 0.0954, + "step": 7 + }, + { + "epoch": 0.017718715393133997, + "grad_norm": 0.08819058542617031, + "learning_rate": 1.1764705882352942e-05, + "loss": 0.0809, + "step": 8 + }, + { + "epoch": 0.019933554817275746, + "grad_norm": 0.05551525577131559, + "learning_rate": 1.323529411764706e-05, + "loss": 0.0657, + "step": 9 + }, + { + "epoch": 0.0221483942414175, + "grad_norm": 0.09253577721153054, + "learning_rate": 1.4705882352941177e-05, + "loss": 0.0957, + "step": 10 + }, + { + "epoch": 0.024363233665559248, + "grad_norm": 0.08114857926487916, + "learning_rate": 1.6176470588235296e-05, + "loss": 0.0917, + "step": 11 + }, + { + "epoch": 0.026578073089700997, + "grad_norm": 0.07093807368894202, + "learning_rate": 1.7647058823529414e-05, + "loss": 0.0925, + "step": 12 + }, + { + "epoch": 0.028792912513842746, + "grad_norm": 0.09067082819232906, + "learning_rate": 1.9117647058823528e-05, + "loss": 0.0944, + "step": 13 + }, + { + "epoch": 0.031007751937984496, + "grad_norm": 0.07363679440680124, + "learning_rate": 2.058823529411765e-05, + "loss": 0.0951, + "step": 14 + }, + { + "epoch": 0.03322259136212625, + "grad_norm": 0.09993764096075278, + "learning_rate": 2.2058823529411766e-05, + "loss": 0.1116, + "step": 15 + }, + { + "epoch": 0.035437430786267994, + "grad_norm": 0.08973567841771882, + "learning_rate": 2.3529411764705884e-05, + "loss": 0.0995, + "step": 16 + }, + { + "epoch": 0.03765227021040975, + "grad_norm": 0.08175280854883217, + "learning_rate": 2.5e-05, + "loss": 0.1042, + "step": 17 + }, + { + "epoch": 0.03986710963455149, + "grad_norm": 0.11174231540618397, + "learning_rate": 2.647058823529412e-05, + "loss": 0.1329, + "step": 18 + }, + { + "epoch": 0.042081949058693245, + "grad_norm": 0.06765970158794697, + "learning_rate": 2.7941176470588236e-05, + "loss": 0.0756, + "step": 19 + }, + { + "epoch": 0.044296788482835, + "grad_norm": 0.0753049509856608, + "learning_rate": 2.9411764705882354e-05, + "loss": 0.0744, + "step": 20 + }, + { + "epoch": 0.046511627906976744, + "grad_norm": 0.09294248803001243, + "learning_rate": 3.0882352941176475e-05, + "loss": 0.0992, + "step": 21 + }, + { + "epoch": 0.048726467331118496, + "grad_norm": 0.10828214535943693, + "learning_rate": 3.235294117647059e-05, + "loss": 0.096, + "step": 22 + }, + { + "epoch": 0.05094130675526024, + "grad_norm": 0.07672015717096983, + "learning_rate": 3.382352941176471e-05, + "loss": 0.1019, + "step": 23 + }, + { + "epoch": 0.053156146179401995, + "grad_norm": 0.09732478051667368, + "learning_rate": 3.529411764705883e-05, + "loss": 0.0635, + "step": 24 + }, + { + "epoch": 0.05537098560354374, + "grad_norm": 0.08156040106060804, + "learning_rate": 3.6764705882352945e-05, + "loss": 0.0888, + "step": 25 + }, + { + "epoch": 0.05758582502768549, + "grad_norm": 0.0998107490555503, + "learning_rate": 3.8235294117647055e-05, + "loss": 0.1172, + "step": 26 + }, + { + "epoch": 0.059800664451827246, + "grad_norm": 0.061133911726606024, + "learning_rate": 3.970588235294117e-05, + "loss": 0.0722, + "step": 27 + }, + { + "epoch": 0.06201550387596899, + "grad_norm": 0.05451604637668177, + "learning_rate": 4.11764705882353e-05, + "loss": 0.0725, + "step": 28 + }, + { + "epoch": 0.06423034330011074, + "grad_norm": 0.04383723850602952, + "learning_rate": 4.2647058823529415e-05, + "loss": 0.0436, + "step": 29 + }, + { + "epoch": 0.0664451827242525, + "grad_norm": 0.07034481919491281, + "learning_rate": 4.411764705882353e-05, + "loss": 0.0763, + "step": 30 + }, + { + "epoch": 0.06866002214839424, + "grad_norm": 0.06857733165021808, + "learning_rate": 4.558823529411765e-05, + "loss": 0.0704, + "step": 31 + }, + { + "epoch": 0.07087486157253599, + "grad_norm": 0.0708593740280568, + "learning_rate": 4.705882352941177e-05, + "loss": 0.0893, + "step": 32 + }, + { + "epoch": 0.07308970099667775, + "grad_norm": 0.07609266030018137, + "learning_rate": 4.8529411764705885e-05, + "loss": 0.1023, + "step": 33 + }, + { + "epoch": 0.0753045404208195, + "grad_norm": 0.05295223273699035, + "learning_rate": 5e-05, + "loss": 0.0658, + "step": 34 + }, + { + "epoch": 0.07751937984496124, + "grad_norm": 0.07272261323535405, + "learning_rate": 5.147058823529411e-05, + "loss": 0.0799, + "step": 35 + }, + { + "epoch": 0.07973421926910298, + "grad_norm": 0.11111655511512655, + "learning_rate": 5.294117647058824e-05, + "loss": 0.0951, + "step": 36 + }, + { + "epoch": 0.08194905869324474, + "grad_norm": 0.07717851034954376, + "learning_rate": 5.441176470588235e-05, + "loss": 0.0862, + "step": 37 + }, + { + "epoch": 0.08416389811738649, + "grad_norm": 0.07569917690422996, + "learning_rate": 5.588235294117647e-05, + "loss": 0.066, + "step": 38 + }, + { + "epoch": 0.08637873754152824, + "grad_norm": 0.053723301619356886, + "learning_rate": 5.735294117647059e-05, + "loss": 0.0677, + "step": 39 + }, + { + "epoch": 0.08859357696567, + "grad_norm": 0.07477930606031036, + "learning_rate": 5.882352941176471e-05, + "loss": 0.0838, + "step": 40 + }, + { + "epoch": 0.09080841638981174, + "grad_norm": 0.054173331706399416, + "learning_rate": 6.0294117647058825e-05, + "loss": 0.0523, + "step": 41 + }, + { + "epoch": 0.09302325581395349, + "grad_norm": 0.06421029656321692, + "learning_rate": 6.176470588235295e-05, + "loss": 0.0798, + "step": 42 + }, + { + "epoch": 0.09523809523809523, + "grad_norm": 0.07126597992646283, + "learning_rate": 6.323529411764705e-05, + "loss": 0.0911, + "step": 43 + }, + { + "epoch": 0.09745293466223699, + "grad_norm": 0.05630581498426186, + "learning_rate": 6.470588235294118e-05, + "loss": 0.0659, + "step": 44 + }, + { + "epoch": 0.09966777408637874, + "grad_norm": 0.06313097359974353, + "learning_rate": 6.61764705882353e-05, + "loss": 0.0579, + "step": 45 + }, + { + "epoch": 0.10188261351052048, + "grad_norm": 0.058144331383788835, + "learning_rate": 6.764705882352942e-05, + "loss": 0.0744, + "step": 46 + }, + { + "epoch": 0.10409745293466224, + "grad_norm": 0.05635024103886109, + "learning_rate": 6.911764705882354e-05, + "loss": 0.0644, + "step": 47 + }, + { + "epoch": 0.10631229235880399, + "grad_norm": 0.08167685835201441, + "learning_rate": 7.058823529411765e-05, + "loss": 0.091, + "step": 48 + }, + { + "epoch": 0.10852713178294573, + "grad_norm": 0.06155997017836087, + "learning_rate": 7.205882352941177e-05, + "loss": 0.0744, + "step": 49 + }, + { + "epoch": 0.11074197120708748, + "grad_norm": 0.07944895273053709, + "learning_rate": 7.352941176470589e-05, + "loss": 0.0947, + "step": 50 + }, + { + "epoch": 0.11295681063122924, + "grad_norm": 0.06637321712246882, + "learning_rate": 7.500000000000001e-05, + "loss": 0.0654, + "step": 51 + }, + { + "epoch": 0.11517165005537099, + "grad_norm": 0.0811038928050886, + "learning_rate": 7.647058823529411e-05, + "loss": 0.0587, + "step": 52 + }, + { + "epoch": 0.11738648947951273, + "grad_norm": 0.07696872444397503, + "learning_rate": 7.794117647058824e-05, + "loss": 0.0902, + "step": 53 + }, + { + "epoch": 0.11960132890365449, + "grad_norm": 0.05851000003351001, + "learning_rate": 7.941176470588235e-05, + "loss": 0.0631, + "step": 54 + }, + { + "epoch": 0.12181616832779624, + "grad_norm": 0.06344963883210299, + "learning_rate": 8.088235294117648e-05, + "loss": 0.0608, + "step": 55 + }, + { + "epoch": 0.12403100775193798, + "grad_norm": 0.07967929760104514, + "learning_rate": 8.23529411764706e-05, + "loss": 0.0794, + "step": 56 + }, + { + "epoch": 0.12624584717607973, + "grad_norm": 0.07150297442968816, + "learning_rate": 8.382352941176471e-05, + "loss": 0.0731, + "step": 57 + }, + { + "epoch": 0.12846068660022147, + "grad_norm": 0.08065203722782961, + "learning_rate": 8.529411764705883e-05, + "loss": 0.0969, + "step": 58 + }, + { + "epoch": 0.13067552602436322, + "grad_norm": 0.08223262036591997, + "learning_rate": 8.676470588235295e-05, + "loss": 0.0993, + "step": 59 + }, + { + "epoch": 0.132890365448505, + "grad_norm": 0.09371811887202453, + "learning_rate": 8.823529411764706e-05, + "loss": 0.097, + "step": 60 + }, + { + "epoch": 0.13510520487264674, + "grad_norm": 0.08558960946584512, + "learning_rate": 8.970588235294118e-05, + "loss": 0.0714, + "step": 61 + }, + { + "epoch": 0.13732004429678848, + "grad_norm": 0.08865706482105408, + "learning_rate": 9.11764705882353e-05, + "loss": 0.0757, + "step": 62 + }, + { + "epoch": 0.13953488372093023, + "grad_norm": 0.08603666340297354, + "learning_rate": 9.264705882352942e-05, + "loss": 0.0802, + "step": 63 + }, + { + "epoch": 0.14174972314507198, + "grad_norm": 0.06164939808586461, + "learning_rate": 9.411764705882353e-05, + "loss": 0.0498, + "step": 64 + }, + { + "epoch": 0.14396456256921372, + "grad_norm": 0.05127760199331982, + "learning_rate": 9.558823529411765e-05, + "loss": 0.0538, + "step": 65 + }, + { + "epoch": 0.1461794019933555, + "grad_norm": 0.07993706290834104, + "learning_rate": 9.705882352941177e-05, + "loss": 0.0702, + "step": 66 + }, + { + "epoch": 0.14839424141749724, + "grad_norm": 0.06259515775005525, + "learning_rate": 9.852941176470589e-05, + "loss": 0.0661, + "step": 67 + }, + { + "epoch": 0.150609080841639, + "grad_norm": 0.09321541135902557, + "learning_rate": 0.0001, + "loss": 0.0829, + "step": 68 + }, + { + "epoch": 0.15282392026578073, + "grad_norm": 0.09454858874232157, + "learning_rate": 0.00010147058823529412, + "loss": 0.0762, + "step": 69 + }, + { + "epoch": 0.15503875968992248, + "grad_norm": 0.08859148041550181, + "learning_rate": 0.00010294117647058823, + "loss": 0.0757, + "step": 70 + }, + { + "epoch": 0.15725359911406422, + "grad_norm": 0.07279891561572177, + "learning_rate": 0.00010441176470588237, + "loss": 0.0566, + "step": 71 + }, + { + "epoch": 0.15946843853820597, + "grad_norm": 0.06524772297334883, + "learning_rate": 0.00010588235294117647, + "loss": 0.0683, + "step": 72 + }, + { + "epoch": 0.16168327796234774, + "grad_norm": 0.08865682678955566, + "learning_rate": 0.00010735294117647059, + "loss": 0.0777, + "step": 73 + }, + { + "epoch": 0.1638981173864895, + "grad_norm": 0.0640637851114597, + "learning_rate": 0.0001088235294117647, + "loss": 0.0515, + "step": 74 + }, + { + "epoch": 0.16611295681063123, + "grad_norm": 0.08262595339457462, + "learning_rate": 0.00011029411764705884, + "loss": 0.0562, + "step": 75 + }, + { + "epoch": 0.16832779623477298, + "grad_norm": 0.0913232033308046, + "learning_rate": 0.00011176470588235294, + "loss": 0.0695, + "step": 76 + }, + { + "epoch": 0.17054263565891473, + "grad_norm": 0.10296920226425402, + "learning_rate": 0.00011323529411764706, + "loss": 0.062, + "step": 77 + }, + { + "epoch": 0.17275747508305647, + "grad_norm": 0.07695785523453476, + "learning_rate": 0.00011470588235294118, + "loss": 0.0733, + "step": 78 + }, + { + "epoch": 0.17497231450719822, + "grad_norm": 0.07153366315779568, + "learning_rate": 0.00011617647058823531, + "loss": 0.0587, + "step": 79 + }, + { + "epoch": 0.17718715393134, + "grad_norm": 0.10045138767519943, + "learning_rate": 0.00011764705882352942, + "loss": 0.0711, + "step": 80 + }, + { + "epoch": 0.17940199335548174, + "grad_norm": 0.09606465721718942, + "learning_rate": 0.00011911764705882353, + "loss": 0.083, + "step": 81 + }, + { + "epoch": 0.18161683277962348, + "grad_norm": 0.0984765804481482, + "learning_rate": 0.00012058823529411765, + "loss": 0.0691, + "step": 82 + }, + { + "epoch": 0.18383167220376523, + "grad_norm": 0.07766466063081534, + "learning_rate": 0.00012205882352941178, + "loss": 0.0603, + "step": 83 + }, + { + "epoch": 0.18604651162790697, + "grad_norm": 0.11099451733161966, + "learning_rate": 0.0001235294117647059, + "loss": 0.0813, + "step": 84 + }, + { + "epoch": 0.18826135105204872, + "grad_norm": 0.0699217720181689, + "learning_rate": 0.000125, + "loss": 0.0638, + "step": 85 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 0.07956726575404767, + "learning_rate": 0.0001264705882352941, + "loss": 0.0589, + "step": 86 + }, + { + "epoch": 0.19269102990033224, + "grad_norm": 0.07852117908055688, + "learning_rate": 0.00012794117647058824, + "loss": 0.0676, + "step": 87 + }, + { + "epoch": 0.19490586932447398, + "grad_norm": 0.07012197755728411, + "learning_rate": 0.00012941176470588237, + "loss": 0.0539, + "step": 88 + }, + { + "epoch": 0.19712070874861573, + "grad_norm": 0.07855812515268695, + "learning_rate": 0.00013088235294117647, + "loss": 0.0509, + "step": 89 + }, + { + "epoch": 0.19933554817275748, + "grad_norm": 0.09323359884606518, + "learning_rate": 0.0001323529411764706, + "loss": 0.0677, + "step": 90 + }, + { + "epoch": 0.20155038759689922, + "grad_norm": 0.07139105561883911, + "learning_rate": 0.0001338235294117647, + "loss": 0.0618, + "step": 91 + }, + { + "epoch": 0.20376522702104097, + "grad_norm": 0.09337863391042964, + "learning_rate": 0.00013529411764705884, + "loss": 0.0582, + "step": 92 + }, + { + "epoch": 0.2059800664451827, + "grad_norm": 0.09343341520240167, + "learning_rate": 0.00013676470588235294, + "loss": 0.0659, + "step": 93 + }, + { + "epoch": 0.2081949058693245, + "grad_norm": 0.07463075916586881, + "learning_rate": 0.00013823529411764707, + "loss": 0.0601, + "step": 94 + }, + { + "epoch": 0.21040974529346623, + "grad_norm": 0.08238137453126172, + "learning_rate": 0.00013970588235294118, + "loss": 0.0672, + "step": 95 + }, + { + "epoch": 0.21262458471760798, + "grad_norm": 0.07808933185873881, + "learning_rate": 0.0001411764705882353, + "loss": 0.0624, + "step": 96 + }, + { + "epoch": 0.21483942414174972, + "grad_norm": 0.07536676229474035, + "learning_rate": 0.0001426470588235294, + "loss": 0.0604, + "step": 97 + }, + { + "epoch": 0.21705426356589147, + "grad_norm": 0.08078951308230173, + "learning_rate": 0.00014411764705882354, + "loss": 0.0674, + "step": 98 + }, + { + "epoch": 0.21926910299003322, + "grad_norm": 0.10032908883698782, + "learning_rate": 0.00014558823529411765, + "loss": 0.0617, + "step": 99 + }, + { + "epoch": 0.22148394241417496, + "grad_norm": 0.08541378751279194, + "learning_rate": 0.00014705882352941178, + "loss": 0.0507, + "step": 100 + }, + { + "epoch": 0.22369878183831673, + "grad_norm": 0.08465554736544875, + "learning_rate": 0.00014852941176470588, + "loss": 0.0649, + "step": 101 + }, + { + "epoch": 0.22591362126245848, + "grad_norm": 0.09872533529622371, + "learning_rate": 0.00015000000000000001, + "loss": 0.0474, + "step": 102 + }, + { + "epoch": 0.22812846068660023, + "grad_norm": 0.0698536136283845, + "learning_rate": 0.00015147058823529412, + "loss": 0.0487, + "step": 103 + }, + { + "epoch": 0.23034330011074197, + "grad_norm": 0.06928754855010523, + "learning_rate": 0.00015294117647058822, + "loss": 0.0538, + "step": 104 + }, + { + "epoch": 0.23255813953488372, + "grad_norm": 0.06768939233984893, + "learning_rate": 0.00015441176470588238, + "loss": 0.0437, + "step": 105 + }, + { + "epoch": 0.23477297895902546, + "grad_norm": 0.13160941197259213, + "learning_rate": 0.00015588235294117648, + "loss": 0.0716, + "step": 106 + }, + { + "epoch": 0.2369878183831672, + "grad_norm": 0.1411985741065893, + "learning_rate": 0.0001573529411764706, + "loss": 0.069, + "step": 107 + }, + { + "epoch": 0.23920265780730898, + "grad_norm": 0.07804745756062613, + "learning_rate": 0.0001588235294117647, + "loss": 0.0486, + "step": 108 + }, + { + "epoch": 0.24141749723145073, + "grad_norm": 0.09143988169924121, + "learning_rate": 0.00016029411764705885, + "loss": 0.0648, + "step": 109 + }, + { + "epoch": 0.24363233665559247, + "grad_norm": 0.10699646041144474, + "learning_rate": 0.00016176470588235295, + "loss": 0.0531, + "step": 110 + }, + { + "epoch": 0.24584717607973422, + "grad_norm": 0.08722516716640076, + "learning_rate": 0.00016323529411764706, + "loss": 0.0728, + "step": 111 + }, + { + "epoch": 0.24806201550387597, + "grad_norm": 0.10522098521986949, + "learning_rate": 0.0001647058823529412, + "loss": 0.0638, + "step": 112 + }, + { + "epoch": 0.2502768549280177, + "grad_norm": 0.07242707245936236, + "learning_rate": 0.00016617647058823532, + "loss": 0.0556, + "step": 113 + }, + { + "epoch": 0.25249169435215946, + "grad_norm": 0.09732229107005017, + "learning_rate": 0.00016764705882352942, + "loss": 0.0647, + "step": 114 + }, + { + "epoch": 0.2547065337763012, + "grad_norm": 0.08172420390570745, + "learning_rate": 0.00016911764705882353, + "loss": 0.0663, + "step": 115 + }, + { + "epoch": 0.25692137320044295, + "grad_norm": 0.07885842753110897, + "learning_rate": 0.00017058823529411766, + "loss": 0.0607, + "step": 116 + }, + { + "epoch": 0.2591362126245847, + "grad_norm": 0.08723096721311976, + "learning_rate": 0.0001720588235294118, + "loss": 0.0605, + "step": 117 + }, + { + "epoch": 0.26135105204872644, + "grad_norm": 0.09849496633147106, + "learning_rate": 0.0001735294117647059, + "loss": 0.0668, + "step": 118 + }, + { + "epoch": 0.26356589147286824, + "grad_norm": 0.06525145262857274, + "learning_rate": 0.000175, + "loss": 0.047, + "step": 119 + }, + { + "epoch": 0.26578073089701, + "grad_norm": 0.07928107928786651, + "learning_rate": 0.00017647058823529413, + "loss": 0.0461, + "step": 120 + }, + { + "epoch": 0.26799557032115173, + "grad_norm": 0.11766523130310784, + "learning_rate": 0.00017794117647058823, + "loss": 0.0718, + "step": 121 + }, + { + "epoch": 0.2702104097452935, + "grad_norm": 0.11886529271181655, + "learning_rate": 0.00017941176470588236, + "loss": 0.0605, + "step": 122 + }, + { + "epoch": 0.2724252491694352, + "grad_norm": 0.10904621334787483, + "learning_rate": 0.00018088235294117647, + "loss": 0.0611, + "step": 123 + }, + { + "epoch": 0.27464008859357697, + "grad_norm": 0.07869869695183741, + "learning_rate": 0.0001823529411764706, + "loss": 0.0483, + "step": 124 + }, + { + "epoch": 0.2768549280177187, + "grad_norm": 0.09996401526350816, + "learning_rate": 0.0001838235294117647, + "loss": 0.0599, + "step": 125 + }, + { + "epoch": 0.27906976744186046, + "grad_norm": 0.06915194910918881, + "learning_rate": 0.00018529411764705883, + "loss": 0.0425, + "step": 126 + }, + { + "epoch": 0.2812846068660022, + "grad_norm": 0.06954026359253448, + "learning_rate": 0.00018676470588235297, + "loss": 0.0451, + "step": 127 + }, + { + "epoch": 0.28349944629014395, + "grad_norm": 0.07937939864183062, + "learning_rate": 0.00018823529411764707, + "loss": 0.0663, + "step": 128 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.10543052857994123, + "learning_rate": 0.00018970588235294117, + "loss": 0.0537, + "step": 129 + }, + { + "epoch": 0.28792912513842744, + "grad_norm": 0.09853680673905837, + "learning_rate": 0.0001911764705882353, + "loss": 0.0703, + "step": 130 + }, + { + "epoch": 0.2901439645625692, + "grad_norm": 0.09940547848219704, + "learning_rate": 0.00019264705882352944, + "loss": 0.0533, + "step": 131 + }, + { + "epoch": 0.292358803986711, + "grad_norm": 0.0649609876102772, + "learning_rate": 0.00019411764705882354, + "loss": 0.0428, + "step": 132 + }, + { + "epoch": 0.29457364341085274, + "grad_norm": 0.11578690281463908, + "learning_rate": 0.00019558823529411764, + "loss": 0.0703, + "step": 133 + }, + { + "epoch": 0.2967884828349945, + "grad_norm": 0.10216525271329696, + "learning_rate": 0.00019705882352941177, + "loss": 0.0696, + "step": 134 + }, + { + "epoch": 0.29900332225913623, + "grad_norm": 0.07891009386914495, + "learning_rate": 0.0001985294117647059, + "loss": 0.0522, + "step": 135 + }, + { + "epoch": 0.301218161683278, + "grad_norm": 0.1023889834603996, + "learning_rate": 0.0002, + "loss": 0.0585, + "step": 136 + }, + { + "epoch": 0.3034330011074197, + "grad_norm": 0.06274518197481414, + "learning_rate": 0.00019999966681276683, + "loss": 0.0335, + "step": 137 + }, + { + "epoch": 0.30564784053156147, + "grad_norm": 0.12763937597821692, + "learning_rate": 0.0001999986672532875, + "loss": 0.0545, + "step": 138 + }, + { + "epoch": 0.3078626799557032, + "grad_norm": 0.08810229760057865, + "learning_rate": 0.00019999700132822295, + "loss": 0.051, + "step": 139 + }, + { + "epoch": 0.31007751937984496, + "grad_norm": 0.08970541234311992, + "learning_rate": 0.00019999466904867434, + "loss": 0.0455, + "step": 140 + }, + { + "epoch": 0.3122923588039867, + "grad_norm": 0.0842435604812686, + "learning_rate": 0.00019999167043018346, + "loss": 0.0316, + "step": 141 + }, + { + "epoch": 0.31450719822812845, + "grad_norm": 0.11181170790991606, + "learning_rate": 0.00019998800549273234, + "loss": 0.0653, + "step": 142 + }, + { + "epoch": 0.3167220376522702, + "grad_norm": 0.11407583287598154, + "learning_rate": 0.00019998367426074317, + "loss": 0.0608, + "step": 143 + }, + { + "epoch": 0.31893687707641194, + "grad_norm": 0.1009463329697484, + "learning_rate": 0.00019997867676307816, + "loss": 0.0486, + "step": 144 + }, + { + "epoch": 0.3211517165005537, + "grad_norm": 0.08925939364084035, + "learning_rate": 0.00019997301303303937, + "loss": 0.0485, + "step": 145 + }, + { + "epoch": 0.3233665559246955, + "grad_norm": 0.08043557199210229, + "learning_rate": 0.0001999666831083685, + "loss": 0.0523, + "step": 146 + }, + { + "epoch": 0.32558139534883723, + "grad_norm": 0.08659445389168131, + "learning_rate": 0.00019995968703124647, + "loss": 0.0418, + "step": 147 + }, + { + "epoch": 0.327796234772979, + "grad_norm": 0.08448412585699264, + "learning_rate": 0.00019995202484829338, + "loss": 0.0492, + "step": 148 + }, + { + "epoch": 0.3300110741971207, + "grad_norm": 0.08968105406670221, + "learning_rate": 0.00019994369661056812, + "loss": 0.0379, + "step": 149 + }, + { + "epoch": 0.33222591362126247, + "grad_norm": 0.10271245513115, + "learning_rate": 0.00019993470237356784, + "loss": 0.0521, + "step": 150 + }, + { + "epoch": 0.3344407530454042, + "grad_norm": 0.10681878957195191, + "learning_rate": 0.00019992504219722788, + "loss": 0.0527, + "step": 151 + }, + { + "epoch": 0.33665559246954596, + "grad_norm": 0.1134768604369574, + "learning_rate": 0.00019991471614592125, + "loss": 0.0554, + "step": 152 + }, + { + "epoch": 0.3388704318936877, + "grad_norm": 0.06992847089868476, + "learning_rate": 0.00019990372428845804, + "loss": 0.0426, + "step": 153 + }, + { + "epoch": 0.34108527131782945, + "grad_norm": 0.07926412080399618, + "learning_rate": 0.00019989206669808522, + "loss": 0.0364, + "step": 154 + }, + { + "epoch": 0.3433001107419712, + "grad_norm": 0.09270249041916763, + "learning_rate": 0.000199879743452486, + "loss": 0.0448, + "step": 155 + }, + { + "epoch": 0.34551495016611294, + "grad_norm": 0.10278310243863024, + "learning_rate": 0.00019986675463377925, + "loss": 0.0484, + "step": 156 + }, + { + "epoch": 0.3477297895902547, + "grad_norm": 0.0879496359499489, + "learning_rate": 0.00019985310032851928, + "loss": 0.0403, + "step": 157 + }, + { + "epoch": 0.34994462901439644, + "grad_norm": 0.1467148063249797, + "learning_rate": 0.00019983878062769484, + "loss": 0.0553, + "step": 158 + }, + { + "epoch": 0.3521594684385382, + "grad_norm": 0.07957535055595316, + "learning_rate": 0.00019982379562672874, + "loss": 0.0398, + "step": 159 + }, + { + "epoch": 0.35437430786268, + "grad_norm": 0.09006933610163498, + "learning_rate": 0.00019980814542547716, + "loss": 0.0518, + "step": 160 + }, + { + "epoch": 0.35658914728682173, + "grad_norm": 0.12561584585895666, + "learning_rate": 0.00019979183012822915, + "loss": 0.0627, + "step": 161 + }, + { + "epoch": 0.3588039867109635, + "grad_norm": 0.05963150900977816, + "learning_rate": 0.00019977484984370564, + "loss": 0.0334, + "step": 162 + }, + { + "epoch": 0.3610188261351052, + "grad_norm": 0.13355710142043947, + "learning_rate": 0.00019975720468505887, + "loss": 0.0752, + "step": 163 + }, + { + "epoch": 0.36323366555924697, + "grad_norm": 0.0830658954785744, + "learning_rate": 0.0001997388947698717, + "loss": 0.0471, + "step": 164 + }, + { + "epoch": 0.3654485049833887, + "grad_norm": 0.07742951620474012, + "learning_rate": 0.00019971992022015675, + "loss": 0.0438, + "step": 165 + }, + { + "epoch": 0.36766334440753046, + "grad_norm": 0.09440804411675371, + "learning_rate": 0.00019970028116235555, + "loss": 0.0462, + "step": 166 + }, + { + "epoch": 0.3698781838316722, + "grad_norm": 0.1384582918996046, + "learning_rate": 0.00019967997772733778, + "loss": 0.0618, + "step": 167 + }, + { + "epoch": 0.37209302325581395, + "grad_norm": 0.08873540228353521, + "learning_rate": 0.00019965901005040033, + "loss": 0.0644, + "step": 168 + }, + { + "epoch": 0.3743078626799557, + "grad_norm": 0.11767744661478874, + "learning_rate": 0.00019963737827126648, + "loss": 0.0727, + "step": 169 + }, + { + "epoch": 0.37652270210409744, + "grad_norm": 0.06422623349647326, + "learning_rate": 0.00019961508253408484, + "loss": 0.0383, + "step": 170 + }, + { + "epoch": 0.3787375415282392, + "grad_norm": 0.09942498388213745, + "learning_rate": 0.00019959212298742852, + "loss": 0.057, + "step": 171 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 0.08537791312980779, + "learning_rate": 0.0001995684997842941, + "loss": 0.0559, + "step": 172 + }, + { + "epoch": 0.3831672203765227, + "grad_norm": 0.07657686129784251, + "learning_rate": 0.00019954421308210053, + "loss": 0.041, + "step": 173 + }, + { + "epoch": 0.3853820598006645, + "grad_norm": 0.0937349409846834, + "learning_rate": 0.00019951926304268827, + "loss": 0.0553, + "step": 174 + }, + { + "epoch": 0.3875968992248062, + "grad_norm": 0.08252605272371324, + "learning_rate": 0.00019949364983231794, + "loss": 0.053, + "step": 175 + }, + { + "epoch": 0.38981173864894797, + "grad_norm": 0.09512484692226836, + "learning_rate": 0.00019946737362166946, + "loss": 0.0698, + "step": 176 + }, + { + "epoch": 0.3920265780730897, + "grad_norm": 0.0690803741674879, + "learning_rate": 0.00019944043458584076, + "loss": 0.0386, + "step": 177 + }, + { + "epoch": 0.39424141749723146, + "grad_norm": 0.09459419781457633, + "learning_rate": 0.00019941283290434675, + "loss": 0.0464, + "step": 178 + }, + { + "epoch": 0.3964562569213732, + "grad_norm": 0.13954292245288233, + "learning_rate": 0.00019938456876111794, + "loss": 0.0593, + "step": 179 + }, + { + "epoch": 0.39867109634551495, + "grad_norm": 0.0868841482932631, + "learning_rate": 0.00019935564234449941, + "loss": 0.0416, + "step": 180 + }, + { + "epoch": 0.4008859357696567, + "grad_norm": 0.11000808065978614, + "learning_rate": 0.00019932605384724938, + "loss": 0.0415, + "step": 181 + }, + { + "epoch": 0.40310077519379844, + "grad_norm": 0.12347106573132961, + "learning_rate": 0.00019929580346653803, + "loss": 0.0536, + "step": 182 + }, + { + "epoch": 0.4053156146179402, + "grad_norm": 0.08076464550681232, + "learning_rate": 0.0001992648914039462, + "loss": 0.0428, + "step": 183 + }, + { + "epoch": 0.40753045404208194, + "grad_norm": 0.10488563310793295, + "learning_rate": 0.000199233317865464, + "loss": 0.0477, + "step": 184 + }, + { + "epoch": 0.4097452934662237, + "grad_norm": 0.10940572403774432, + "learning_rate": 0.00019920108306148936, + "loss": 0.046, + "step": 185 + }, + { + "epoch": 0.4119601328903654, + "grad_norm": 0.10403333087055258, + "learning_rate": 0.00019916818720682686, + "loss": 0.0451, + "step": 186 + }, + { + "epoch": 0.4141749723145072, + "grad_norm": 0.1286795069547404, + "learning_rate": 0.00019913463052068602, + "loss": 0.0498, + "step": 187 + }, + { + "epoch": 0.416389811738649, + "grad_norm": 0.08083982360850366, + "learning_rate": 0.00019910041322668005, + "loss": 0.0418, + "step": 188 + }, + { + "epoch": 0.4186046511627907, + "grad_norm": 0.11902926053951901, + "learning_rate": 0.00019906553555282426, + "loss": 0.0546, + "step": 189 + }, + { + "epoch": 0.42081949058693247, + "grad_norm": 0.07034210954321099, + "learning_rate": 0.00019902999773153455, + "loss": 0.041, + "step": 190 + }, + { + "epoch": 0.4230343300110742, + "grad_norm": 0.09296222520161809, + "learning_rate": 0.0001989937999996259, + "loss": 0.0444, + "step": 191 + }, + { + "epoch": 0.42524916943521596, + "grad_norm": 0.09547378845746599, + "learning_rate": 0.0001989569425983108, + "loss": 0.0352, + "step": 192 + }, + { + "epoch": 0.4274640088593577, + "grad_norm": 0.06503688811776193, + "learning_rate": 0.00019891942577319747, + "loss": 0.0228, + "step": 193 + }, + { + "epoch": 0.42967884828349945, + "grad_norm": 0.09597594980531651, + "learning_rate": 0.0001988812497742885, + "loss": 0.0369, + "step": 194 + }, + { + "epoch": 0.4318936877076412, + "grad_norm": 0.09702362031270324, + "learning_rate": 0.00019884241485597901, + "loss": 0.0537, + "step": 195 + }, + { + "epoch": 0.43410852713178294, + "grad_norm": 0.11935221291675921, + "learning_rate": 0.00019880292127705495, + "loss": 0.0602, + "step": 196 + }, + { + "epoch": 0.4363233665559247, + "grad_norm": 0.08115284987765316, + "learning_rate": 0.0001987627693006915, + "loss": 0.0438, + "step": 197 + }, + { + "epoch": 0.43853820598006643, + "grad_norm": 0.09339727060503207, + "learning_rate": 0.0001987219591944511, + "loss": 0.0426, + "step": 198 + }, + { + "epoch": 0.4407530454042082, + "grad_norm": 0.09794013907852947, + "learning_rate": 0.0001986804912302819, + "loss": 0.0537, + "step": 199 + }, + { + "epoch": 0.4429678848283499, + "grad_norm": 0.07578939437823465, + "learning_rate": 0.00019863836568451588, + "loss": 0.0382, + "step": 200 + }, + { + "epoch": 0.44518272425249167, + "grad_norm": 0.10046649665717154, + "learning_rate": 0.00019859558283786687, + "loss": 0.0577, + "step": 201 + }, + { + "epoch": 0.44739756367663347, + "grad_norm": 0.07373912019653707, + "learning_rate": 0.00019855214297542887, + "loss": 0.0392, + "step": 202 + }, + { + "epoch": 0.4496124031007752, + "grad_norm": 0.11153186771891217, + "learning_rate": 0.00019850804638667398, + "loss": 0.0328, + "step": 203 + }, + { + "epoch": 0.45182724252491696, + "grad_norm": 0.08140770406754567, + "learning_rate": 0.00019846329336545063, + "loss": 0.0402, + "step": 204 + }, + { + "epoch": 0.4540420819490587, + "grad_norm": 0.11360955150196242, + "learning_rate": 0.00019841788420998154, + "loss": 0.0468, + "step": 205 + }, + { + "epoch": 0.45625692137320045, + "grad_norm": 0.08442053278090955, + "learning_rate": 0.00019837181922286175, + "loss": 0.036, + "step": 206 + }, + { + "epoch": 0.4584717607973422, + "grad_norm": 0.06555034471353549, + "learning_rate": 0.00019832509871105654, + "loss": 0.0362, + "step": 207 + }, + { + "epoch": 0.46068660022148394, + "grad_norm": 0.13441604338054675, + "learning_rate": 0.00019827772298589946, + "loss": 0.0453, + "step": 208 + }, + { + "epoch": 0.4629014396456257, + "grad_norm": 0.08695092923995318, + "learning_rate": 0.00019822969236309027, + "loss": 0.0363, + "step": 209 + }, + { + "epoch": 0.46511627906976744, + "grad_norm": 0.0928730928344598, + "learning_rate": 0.00019818100716269274, + "loss": 0.0402, + "step": 210 + }, + { + "epoch": 0.4673311184939092, + "grad_norm": 0.10197010495519095, + "learning_rate": 0.00019813166770913269, + "loss": 0.0481, + "step": 211 + }, + { + "epoch": 0.4695459579180509, + "grad_norm": 0.10147141642769242, + "learning_rate": 0.00019808167433119555, + "loss": 0.0495, + "step": 212 + }, + { + "epoch": 0.4717607973421927, + "grad_norm": 0.08825000149068297, + "learning_rate": 0.0001980310273620245, + "loss": 0.0402, + "step": 213 + }, + { + "epoch": 0.4739756367663344, + "grad_norm": 0.09743477186089754, + "learning_rate": 0.00019797972713911794, + "loss": 0.0491, + "step": 214 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 0.07732850124602289, + "learning_rate": 0.00019792777400432754, + "loss": 0.0371, + "step": 215 + }, + { + "epoch": 0.47840531561461797, + "grad_norm": 0.09747676625770421, + "learning_rate": 0.00019787516830385566, + "loss": 0.0371, + "step": 216 + }, + { + "epoch": 0.4806201550387597, + "grad_norm": 0.14477425070641553, + "learning_rate": 0.00019782191038825327, + "loss": 0.0481, + "step": 217 + }, + { + "epoch": 0.48283499446290146, + "grad_norm": 0.09675038085867696, + "learning_rate": 0.00019776800061241753, + "loss": 0.043, + "step": 218 + }, + { + "epoch": 0.4850498338870432, + "grad_norm": 0.09997259339275087, + "learning_rate": 0.00019771343933558942, + "loss": 0.0469, + "step": 219 + }, + { + "epoch": 0.48726467331118495, + "grad_norm": 0.14444553788289177, + "learning_rate": 0.00019765822692135136, + "loss": 0.0465, + "step": 220 + }, + { + "epoch": 0.4894795127353267, + "grad_norm": 0.0726824153185731, + "learning_rate": 0.00019760236373762477, + "loss": 0.0334, + "step": 221 + }, + { + "epoch": 0.49169435215946844, + "grad_norm": 0.10720305073931173, + "learning_rate": 0.00019754585015666765, + "loss": 0.0542, + "step": 222 + }, + { + "epoch": 0.4939091915836102, + "grad_norm": 0.10043045050695712, + "learning_rate": 0.00019748868655507207, + "loss": 0.0531, + "step": 223 + }, + { + "epoch": 0.49612403100775193, + "grad_norm": 0.08189666504330173, + "learning_rate": 0.00019743087331376168, + "loss": 0.0482, + "step": 224 + }, + { + "epoch": 0.4983388704318937, + "grad_norm": 0.09690635869644071, + "learning_rate": 0.00019737241081798916, + "loss": 0.0331, + "step": 225 + }, + { + "epoch": 0.5005537098560354, + "grad_norm": 0.075983833129413, + "learning_rate": 0.00019731329945733364, + "loss": 0.0339, + "step": 226 + }, + { + "epoch": 0.5027685492801772, + "grad_norm": 0.11471299298092144, + "learning_rate": 0.00019725353962569815, + "loss": 0.047, + "step": 227 + }, + { + "epoch": 0.5049833887043189, + "grad_norm": 0.0864124378774015, + "learning_rate": 0.00019719313172130692, + "loss": 0.0356, + "step": 228 + }, + { + "epoch": 0.5071982281284607, + "grad_norm": 0.09986588038053905, + "learning_rate": 0.00019713207614670286, + "loss": 0.0523, + "step": 229 + }, + { + "epoch": 0.5094130675526024, + "grad_norm": 0.09400213012153942, + "learning_rate": 0.00019707037330874468, + "loss": 0.0407, + "step": 230 + }, + { + "epoch": 0.5116279069767442, + "grad_norm": 0.09849451555234276, + "learning_rate": 0.00019700802361860434, + "loss": 0.0397, + "step": 231 + }, + { + "epoch": 0.5138427464008859, + "grad_norm": 0.08828130067398302, + "learning_rate": 0.00019694502749176426, + "loss": 0.0452, + "step": 232 + }, + { + "epoch": 0.5160575858250277, + "grad_norm": 0.12186110373046875, + "learning_rate": 0.00019688138534801458, + "loss": 0.0591, + "step": 233 + }, + { + "epoch": 0.5182724252491694, + "grad_norm": 0.11902282198825141, + "learning_rate": 0.0001968170976114502, + "loss": 0.0462, + "step": 234 + }, + { + "epoch": 0.5204872646733112, + "grad_norm": 0.10577415178149568, + "learning_rate": 0.00019675216471046832, + "loss": 0.0492, + "step": 235 + }, + { + "epoch": 0.5227021040974529, + "grad_norm": 0.07340521195460822, + "learning_rate": 0.00019668658707776507, + "loss": 0.034, + "step": 236 + }, + { + "epoch": 0.5249169435215947, + "grad_norm": 0.10920621994168217, + "learning_rate": 0.0001966203651503332, + "loss": 0.0442, + "step": 237 + }, + { + "epoch": 0.5271317829457365, + "grad_norm": 0.08151650994957185, + "learning_rate": 0.00019655349936945857, + "loss": 0.0381, + "step": 238 + }, + { + "epoch": 0.5293466223698782, + "grad_norm": 0.09873257318359331, + "learning_rate": 0.0001964859901807178, + "loss": 0.0393, + "step": 239 + }, + { + "epoch": 0.53156146179402, + "grad_norm": 0.10919240772292028, + "learning_rate": 0.0001964178380339748, + "loss": 0.0519, + "step": 240 + }, + { + "epoch": 0.5337763012181617, + "grad_norm": 0.08522019689431862, + "learning_rate": 0.00019634904338337812, + "loss": 0.0348, + "step": 241 + }, + { + "epoch": 0.5359911406423035, + "grad_norm": 0.06837137119377763, + "learning_rate": 0.0001962796066873577, + "loss": 0.0305, + "step": 242 + }, + { + "epoch": 0.5382059800664452, + "grad_norm": 0.1046335187781712, + "learning_rate": 0.000196209528408622, + "loss": 0.0489, + "step": 243 + }, + { + "epoch": 0.540420819490587, + "grad_norm": 0.0742243376013052, + "learning_rate": 0.00019613880901415477, + "loss": 0.0428, + "step": 244 + }, + { + "epoch": 0.5426356589147286, + "grad_norm": 0.08743964293820417, + "learning_rate": 0.00019606744897521198, + "loss": 0.0398, + "step": 245 + }, + { + "epoch": 0.5448504983388704, + "grad_norm": 0.10380747521647961, + "learning_rate": 0.0001959954487673187, + "loss": 0.0423, + "step": 246 + }, + { + "epoch": 0.5470653377630121, + "grad_norm": 0.06425822920600957, + "learning_rate": 0.000195922808870266, + "loss": 0.0344, + "step": 247 + }, + { + "epoch": 0.5492801771871539, + "grad_norm": 0.10878784438634576, + "learning_rate": 0.0001958495297681075, + "loss": 0.0394, + "step": 248 + }, + { + "epoch": 0.5514950166112956, + "grad_norm": 0.13814612759171893, + "learning_rate": 0.0001957756119491565, + "loss": 0.0387, + "step": 249 + }, + { + "epoch": 0.5537098560354374, + "grad_norm": 0.09388160414967169, + "learning_rate": 0.00019570105590598246, + "loss": 0.0407, + "step": 250 + }, + { + "epoch": 0.5559246954595792, + "grad_norm": 0.08049049378580198, + "learning_rate": 0.00019562586213540777, + "loss": 0.0336, + "step": 251 + }, + { + "epoch": 0.5581395348837209, + "grad_norm": 0.09070946349404929, + "learning_rate": 0.00019555003113850457, + "loss": 0.0319, + "step": 252 + }, + { + "epoch": 0.5603543743078627, + "grad_norm": 0.0898837114231121, + "learning_rate": 0.00019547356342059126, + "loss": 0.0452, + "step": 253 + }, + { + "epoch": 0.5625692137320044, + "grad_norm": 0.09426554558216904, + "learning_rate": 0.00019539645949122916, + "loss": 0.0292, + "step": 254 + }, + { + "epoch": 0.5647840531561462, + "grad_norm": 0.08582145921197389, + "learning_rate": 0.0001953187198642192, + "loss": 0.0408, + "step": 255 + }, + { + "epoch": 0.5669988925802879, + "grad_norm": 0.10266315515262193, + "learning_rate": 0.00019524034505759833, + "loss": 0.0377, + "step": 256 + }, + { + "epoch": 0.5692137320044297, + "grad_norm": 0.14033488314381135, + "learning_rate": 0.00019516133559363633, + "loss": 0.0513, + "step": 257 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.12387444878829866, + "learning_rate": 0.0001950816919988321, + "loss": 0.049, + "step": 258 + }, + { + "epoch": 0.5736434108527132, + "grad_norm": 0.14087057870998987, + "learning_rate": 0.00019500141480391015, + "loss": 0.0408, + "step": 259 + }, + { + "epoch": 0.5758582502768549, + "grad_norm": 0.18033194536584427, + "learning_rate": 0.00019492050454381725, + "loss": 0.0384, + "step": 260 + }, + { + "epoch": 0.5780730897009967, + "grad_norm": 0.08693874261062992, + "learning_rate": 0.0001948389617577187, + "loss": 0.0413, + "step": 261 + }, + { + "epoch": 0.5802879291251384, + "grad_norm": 0.12446006932734319, + "learning_rate": 0.00019475678698899484, + "loss": 0.0462, + "step": 262 + }, + { + "epoch": 0.5825027685492802, + "grad_norm": 0.1330104226262581, + "learning_rate": 0.0001946739807852373, + "loss": 0.0651, + "step": 263 + }, + { + "epoch": 0.584717607973422, + "grad_norm": 0.08937121127881706, + "learning_rate": 0.00019459054369824554, + "loss": 0.0412, + "step": 264 + }, + { + "epoch": 0.5869324473975637, + "grad_norm": 0.09976041295374459, + "learning_rate": 0.00019450647628402293, + "loss": 0.052, + "step": 265 + }, + { + "epoch": 0.5891472868217055, + "grad_norm": 0.07197881779803929, + "learning_rate": 0.00019442177910277328, + "loss": 0.0324, + "step": 266 + }, + { + "epoch": 0.5913621262458472, + "grad_norm": 0.11739702487011985, + "learning_rate": 0.00019433645271889702, + "loss": 0.0398, + "step": 267 + }, + { + "epoch": 0.593576965669989, + "grad_norm": 0.1206713803383517, + "learning_rate": 0.00019425049770098733, + "loss": 0.0476, + "step": 268 + }, + { + "epoch": 0.5957918050941307, + "grad_norm": 0.07703600393424821, + "learning_rate": 0.00019416391462182654, + "loss": 0.0292, + "step": 269 + }, + { + "epoch": 0.5980066445182725, + "grad_norm": 0.10620102535603748, + "learning_rate": 0.00019407670405838215, + "loss": 0.0455, + "step": 270 + }, + { + "epoch": 0.6002214839424141, + "grad_norm": 0.11305113533634766, + "learning_rate": 0.0001939888665918031, + "loss": 0.0372, + "step": 271 + }, + { + "epoch": 0.602436323366556, + "grad_norm": 0.085637212143354, + "learning_rate": 0.00019390040280741584, + "loss": 0.0385, + "step": 272 + }, + { + "epoch": 0.6046511627906976, + "grad_norm": 0.12159687024412826, + "learning_rate": 0.00019381131329472044, + "loss": 0.0337, + "step": 273 + }, + { + "epoch": 0.6068660022148394, + "grad_norm": 0.13005848019141178, + "learning_rate": 0.00019372159864738668, + "loss": 0.0533, + "step": 274 + }, + { + "epoch": 0.6090808416389811, + "grad_norm": 0.09576862560100748, + "learning_rate": 0.00019363125946325006, + "loss": 0.036, + "step": 275 + }, + { + "epoch": 0.6112956810631229, + "grad_norm": 0.09591012868627956, + "learning_rate": 0.0001935402963443078, + "loss": 0.0334, + "step": 276 + }, + { + "epoch": 0.6135105204872646, + "grad_norm": 0.10102805477224623, + "learning_rate": 0.00019344870989671496, + "loss": 0.0414, + "step": 277 + }, + { + "epoch": 0.6157253599114064, + "grad_norm": 0.12750188718287817, + "learning_rate": 0.00019335650073078016, + "loss": 0.0368, + "step": 278 + }, + { + "epoch": 0.6179401993355482, + "grad_norm": 0.08646526804467838, + "learning_rate": 0.0001932636694609618, + "loss": 0.0345, + "step": 279 + }, + { + "epoch": 0.6201550387596899, + "grad_norm": 0.0945113559054438, + "learning_rate": 0.00019317021670586375, + "loss": 0.0381, + "step": 280 + }, + { + "epoch": 0.6223698781838317, + "grad_norm": 0.09408797958463033, + "learning_rate": 0.0001930761430882313, + "loss": 0.0476, + "step": 281 + }, + { + "epoch": 0.6245847176079734, + "grad_norm": 0.10461874088495726, + "learning_rate": 0.000192981449234947, + "loss": 0.0394, + "step": 282 + }, + { + "epoch": 0.6267995570321152, + "grad_norm": 0.06403593434186135, + "learning_rate": 0.00019288613577702655, + "loss": 0.0241, + "step": 283 + }, + { + "epoch": 0.6290143964562569, + "grad_norm": 0.08584957142924246, + "learning_rate": 0.00019279020334961447, + "loss": 0.0349, + "step": 284 + }, + { + "epoch": 0.6312292358803987, + "grad_norm": 0.12814911437273463, + "learning_rate": 0.00019269365259198, + "loss": 0.0322, + "step": 285 + }, + { + "epoch": 0.6334440753045404, + "grad_norm": 0.10729513052880055, + "learning_rate": 0.00019259648414751265, + "loss": 0.0311, + "step": 286 + }, + { + "epoch": 0.6356589147286822, + "grad_norm": 0.13945058067914204, + "learning_rate": 0.00019249869866371817, + "loss": 0.0387, + "step": 287 + }, + { + "epoch": 0.6378737541528239, + "grad_norm": 0.12452228233741477, + "learning_rate": 0.00019240029679221408, + "loss": 0.041, + "step": 288 + }, + { + "epoch": 0.6400885935769657, + "grad_norm": 0.10128208783039065, + "learning_rate": 0.0001923012791887253, + "loss": 0.038, + "step": 289 + }, + { + "epoch": 0.6423034330011074, + "grad_norm": 0.0952112092775196, + "learning_rate": 0.00019220164651307986, + "loss": 0.0352, + "step": 290 + }, + { + "epoch": 0.6445182724252492, + "grad_norm": 0.08729229257992316, + "learning_rate": 0.0001921013994292045, + "loss": 0.0392, + "step": 291 + }, + { + "epoch": 0.646733111849391, + "grad_norm": 0.08673195518621, + "learning_rate": 0.00019200053860512014, + "loss": 0.0336, + "step": 292 + }, + { + "epoch": 0.6489479512735327, + "grad_norm": 0.082676007696342, + "learning_rate": 0.0001918990647129376, + "loss": 0.0329, + "step": 293 + }, + { + "epoch": 0.6511627906976745, + "grad_norm": 0.11975255574742016, + "learning_rate": 0.00019179697842885293, + "loss": 0.0577, + "step": 294 + }, + { + "epoch": 0.6533776301218162, + "grad_norm": 0.12605350967572224, + "learning_rate": 0.00019169428043314314, + "loss": 0.0459, + "step": 295 + }, + { + "epoch": 0.655592469545958, + "grad_norm": 0.09267286728603327, + "learning_rate": 0.0001915909714101614, + "loss": 0.0422, + "step": 296 + }, + { + "epoch": 0.6578073089700996, + "grad_norm": 0.07810676091207971, + "learning_rate": 0.0001914870520483327, + "loss": 0.0356, + "step": 297 + }, + { + "epoch": 0.6600221483942414, + "grad_norm": 0.08386595535870753, + "learning_rate": 0.00019138252304014907, + "loss": 0.0294, + "step": 298 + }, + { + "epoch": 0.6622369878183831, + "grad_norm": 0.07107438580994326, + "learning_rate": 0.00019127738508216516, + "loss": 0.0287, + "step": 299 + }, + { + "epoch": 0.6644518272425249, + "grad_norm": 0.09677977059125471, + "learning_rate": 0.0001911716388749935, + "loss": 0.0434, + "step": 300 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.0782248974530897, + "learning_rate": 0.00019106528512329978, + "loss": 0.0365, + "step": 301 + }, + { + "epoch": 0.6688815060908084, + "grad_norm": 0.08217385203509002, + "learning_rate": 0.0001909583245357983, + "loss": 0.0364, + "step": 302 + }, + { + "epoch": 0.6710963455149501, + "grad_norm": 0.08985375337138271, + "learning_rate": 0.00019085075782524703, + "loss": 0.0357, + "step": 303 + }, + { + "epoch": 0.6733111849390919, + "grad_norm": 0.0879234312213684, + "learning_rate": 0.0001907425857084431, + "loss": 0.036, + "step": 304 + }, + { + "epoch": 0.6755260243632336, + "grad_norm": 0.08161863659846057, + "learning_rate": 0.0001906338089062179, + "loss": 0.0276, + "step": 305 + }, + { + "epoch": 0.6777408637873754, + "grad_norm": 0.10008426882895241, + "learning_rate": 0.0001905244281434322, + "loss": 0.041, + "step": 306 + }, + { + "epoch": 0.6799557032115172, + "grad_norm": 0.08735262875500331, + "learning_rate": 0.00019041444414897153, + "loss": 0.0265, + "step": 307 + }, + { + "epoch": 0.6821705426356589, + "grad_norm": 0.08136440936955826, + "learning_rate": 0.00019030385765574114, + "loss": 0.0302, + "step": 308 + }, + { + "epoch": 0.6843853820598007, + "grad_norm": 0.08153890467854036, + "learning_rate": 0.00019019266940066117, + "loss": 0.0273, + "step": 309 + }, + { + "epoch": 0.6866002214839424, + "grad_norm": 0.11888547002257514, + "learning_rate": 0.00019008088012466179, + "loss": 0.0384, + "step": 310 + }, + { + "epoch": 0.6888150609080842, + "grad_norm": 0.11225940048007611, + "learning_rate": 0.00018996849057267815, + "loss": 0.0451, + "step": 311 + }, + { + "epoch": 0.6910299003322259, + "grad_norm": 0.06682410752653684, + "learning_rate": 0.00018985550149364552, + "loss": 0.0283, + "step": 312 + }, + { + "epoch": 0.6932447397563677, + "grad_norm": 0.09081805371601429, + "learning_rate": 0.00018974191364049434, + "loss": 0.04, + "step": 313 + }, + { + "epoch": 0.6954595791805094, + "grad_norm": 0.11632860648515458, + "learning_rate": 0.000189627727770145, + "loss": 0.038, + "step": 314 + }, + { + "epoch": 0.6976744186046512, + "grad_norm": 0.07395361856728037, + "learning_rate": 0.000189512944643503, + "loss": 0.0293, + "step": 315 + }, + { + "epoch": 0.6998892580287929, + "grad_norm": 0.07851700832675246, + "learning_rate": 0.0001893975650254538, + "loss": 0.0286, + "step": 316 + }, + { + "epoch": 0.7021040974529347, + "grad_norm": 0.1247561874264949, + "learning_rate": 0.00018928158968485769, + "loss": 0.0359, + "step": 317 + }, + { + "epoch": 0.7043189368770764, + "grad_norm": 0.09377197961991464, + "learning_rate": 0.00018916501939454476, + "loss": 0.0371, + "step": 318 + }, + { + "epoch": 0.7065337763012182, + "grad_norm": 0.1166870124760382, + "learning_rate": 0.00018904785493130963, + "loss": 0.0413, + "step": 319 + }, + { + "epoch": 0.70874861572536, + "grad_norm": 0.0743257119116502, + "learning_rate": 0.00018893009707590636, + "loss": 0.0297, + "step": 320 + }, + { + "epoch": 0.7109634551495017, + "grad_norm": 0.10630785726183187, + "learning_rate": 0.00018881174661304327, + "loss": 0.0397, + "step": 321 + }, + { + "epoch": 0.7131782945736435, + "grad_norm": 0.12152039320780698, + "learning_rate": 0.00018869280433137759, + "loss": 0.0332, + "step": 322 + }, + { + "epoch": 0.7153931339977851, + "grad_norm": 0.070266874554528, + "learning_rate": 0.00018857327102351034, + "loss": 0.0274, + "step": 323 + }, + { + "epoch": 0.717607973421927, + "grad_norm": 0.1438673249607899, + "learning_rate": 0.00018845314748598094, + "loss": 0.0388, + "step": 324 + }, + { + "epoch": 0.7198228128460686, + "grad_norm": 0.06592242459297634, + "learning_rate": 0.000188332434519262, + "loss": 0.0236, + "step": 325 + }, + { + "epoch": 0.7220376522702104, + "grad_norm": 0.0867647658297763, + "learning_rate": 0.00018821113292775388, + "loss": 0.0305, + "step": 326 + }, + { + "epoch": 0.7242524916943521, + "grad_norm": 0.08485016030356768, + "learning_rate": 0.00018808924351977944, + "loss": 0.0397, + "step": 327 + }, + { + "epoch": 0.7264673311184939, + "grad_norm": 0.08190673762367395, + "learning_rate": 0.00018796676710757854, + "loss": 0.0335, + "step": 328 + }, + { + "epoch": 0.7286821705426356, + "grad_norm": 0.06365640734164844, + "learning_rate": 0.00018784370450730274, + "loss": 0.0241, + "step": 329 + }, + { + "epoch": 0.7308970099667774, + "grad_norm": 0.06147269741246861, + "learning_rate": 0.00018772005653900977, + "loss": 0.0234, + "step": 330 + }, + { + "epoch": 0.7331118493909191, + "grad_norm": 0.08145153589705575, + "learning_rate": 0.00018759582402665814, + "loss": 0.03, + "step": 331 + }, + { + "epoch": 0.7353266888150609, + "grad_norm": 0.10941707589483284, + "learning_rate": 0.00018747100779810158, + "loss": 0.0457, + "step": 332 + }, + { + "epoch": 0.7375415282392026, + "grad_norm": 0.1033347272411939, + "learning_rate": 0.00018734560868508354, + "loss": 0.0378, + "step": 333 + }, + { + "epoch": 0.7397563676633444, + "grad_norm": 0.11310972391188698, + "learning_rate": 0.00018721962752323175, + "loss": 0.0441, + "step": 334 + }, + { + "epoch": 0.7419712070874862, + "grad_norm": 0.11359563528175586, + "learning_rate": 0.00018709306515205247, + "loss": 0.0303, + "step": 335 + }, + { + "epoch": 0.7441860465116279, + "grad_norm": 0.1361630180679817, + "learning_rate": 0.00018696592241492502, + "loss": 0.0307, + "step": 336 + }, + { + "epoch": 0.7464008859357697, + "grad_norm": 0.13944521632261525, + "learning_rate": 0.00018683820015909615, + "loss": 0.0415, + "step": 337 + }, + { + "epoch": 0.7486157253599114, + "grad_norm": 0.13351660299606588, + "learning_rate": 0.00018670989923567436, + "loss": 0.0452, + "step": 338 + }, + { + "epoch": 0.7508305647840532, + "grad_norm": 0.1265600885613397, + "learning_rate": 0.00018658102049962422, + "loss": 0.045, + "step": 339 + }, + { + "epoch": 0.7530454042081949, + "grad_norm": 0.06771287602714439, + "learning_rate": 0.00018645156480976075, + "loss": 0.0286, + "step": 340 + }, + { + "epoch": 0.7552602436323367, + "grad_norm": 0.10345346764570142, + "learning_rate": 0.0001863215330287436, + "loss": 0.0434, + "step": 341 + }, + { + "epoch": 0.7574750830564784, + "grad_norm": 0.06845528495188237, + "learning_rate": 0.00018619092602307135, + "loss": 0.0239, + "step": 342 + }, + { + "epoch": 0.7596899224806202, + "grad_norm": 0.08800876267813315, + "learning_rate": 0.00018605974466307575, + "loss": 0.0258, + "step": 343 + }, + { + "epoch": 0.7619047619047619, + "grad_norm": 0.10970503559749166, + "learning_rate": 0.00018592798982291592, + "loss": 0.0346, + "step": 344 + }, + { + "epoch": 0.7641196013289037, + "grad_norm": 0.08425071929232138, + "learning_rate": 0.00018579566238057237, + "loss": 0.0254, + "step": 345 + }, + { + "epoch": 0.7663344407530454, + "grad_norm": 0.09517954539253917, + "learning_rate": 0.00018566276321784148, + "loss": 0.0315, + "step": 346 + }, + { + "epoch": 0.7685492801771872, + "grad_norm": 0.12807684343094766, + "learning_rate": 0.00018552929322032932, + "loss": 0.0382, + "step": 347 + }, + { + "epoch": 0.770764119601329, + "grad_norm": 0.09501407453475527, + "learning_rate": 0.00018539525327744585, + "loss": 0.0335, + "step": 348 + }, + { + "epoch": 0.7729789590254706, + "grad_norm": 0.10105571490013905, + "learning_rate": 0.00018526064428239908, + "loss": 0.0337, + "step": 349 + }, + { + "epoch": 0.7751937984496124, + "grad_norm": 0.13897774661812348, + "learning_rate": 0.00018512546713218887, + "loss": 0.0389, + "step": 350 + }, + { + "epoch": 0.7774086378737541, + "grad_norm": 0.07854696396963474, + "learning_rate": 0.00018498972272760136, + "loss": 0.0297, + "step": 351 + }, + { + "epoch": 0.7796234772978959, + "grad_norm": 0.10442397796757871, + "learning_rate": 0.00018485341197320253, + "loss": 0.0494, + "step": 352 + }, + { + "epoch": 0.7818383167220376, + "grad_norm": 0.10643606838272895, + "learning_rate": 0.00018471653577733244, + "loss": 0.031, + "step": 353 + }, + { + "epoch": 0.7840531561461794, + "grad_norm": 0.0687841261051955, + "learning_rate": 0.00018457909505209915, + "loss": 0.03, + "step": 354 + }, + { + "epoch": 0.7862679955703211, + "grad_norm": 0.06076364250480939, + "learning_rate": 0.0001844410907133725, + "loss": 0.0223, + "step": 355 + }, + { + "epoch": 0.7884828349944629, + "grad_norm": 0.08435264402256122, + "learning_rate": 0.00018430252368077822, + "loss": 0.038, + "step": 356 + }, + { + "epoch": 0.7906976744186046, + "grad_norm": 0.08659564578586366, + "learning_rate": 0.00018416339487769165, + "loss": 0.0261, + "step": 357 + }, + { + "epoch": 0.7929125138427464, + "grad_norm": 0.12155159327339758, + "learning_rate": 0.00018402370523123155, + "loss": 0.0291, + "step": 358 + }, + { + "epoch": 0.7951273532668881, + "grad_norm": 0.08977674997033257, + "learning_rate": 0.00018388345567225408, + "loss": 0.0337, + "step": 359 + }, + { + "epoch": 0.7973421926910299, + "grad_norm": 0.08983122760185414, + "learning_rate": 0.0001837426471353465, + "loss": 0.0292, + "step": 360 + }, + { + "epoch": 0.7995570321151716, + "grad_norm": 0.11746036150364302, + "learning_rate": 0.00018360128055882092, + "loss": 0.0387, + "step": 361 + }, + { + "epoch": 0.8017718715393134, + "grad_norm": 0.0911247387627866, + "learning_rate": 0.00018345935688470814, + "loss": 0.0298, + "step": 362 + }, + { + "epoch": 0.8039867109634552, + "grad_norm": 0.07907116100660053, + "learning_rate": 0.00018331687705875127, + "loss": 0.0377, + "step": 363 + }, + { + "epoch": 0.8062015503875969, + "grad_norm": 0.08815156050043559, + "learning_rate": 0.00018317384203039952, + "loss": 0.0271, + "step": 364 + }, + { + "epoch": 0.8084163898117387, + "grad_norm": 0.08533766782440835, + "learning_rate": 0.00018303025275280175, + "loss": 0.0297, + "step": 365 + }, + { + "epoch": 0.8106312292358804, + "grad_norm": 0.08618049180672532, + "learning_rate": 0.00018288611018280024, + "loss": 0.0355, + "step": 366 + }, + { + "epoch": 0.8128460686600222, + "grad_norm": 0.07332967126987296, + "learning_rate": 0.00018274141528092433, + "loss": 0.0287, + "step": 367 + }, + { + "epoch": 0.8150609080841639, + "grad_norm": 0.1187966739384006, + "learning_rate": 0.00018259616901138387, + "loss": 0.036, + "step": 368 + }, + { + "epoch": 0.8172757475083057, + "grad_norm": 0.07271417362690233, + "learning_rate": 0.00018245037234206288, + "loss": 0.0248, + "step": 369 + }, + { + "epoch": 0.8194905869324474, + "grad_norm": 0.10645228844355217, + "learning_rate": 0.0001823040262445132, + "loss": 0.039, + "step": 370 + }, + { + "epoch": 0.8217054263565892, + "grad_norm": 0.06234573772794465, + "learning_rate": 0.0001821571316939478, + "loss": 0.022, + "step": 371 + }, + { + "epoch": 0.8239202657807309, + "grad_norm": 0.08186906237808426, + "learning_rate": 0.0001820096896692345, + "loss": 0.0233, + "step": 372 + }, + { + "epoch": 0.8261351052048727, + "grad_norm": 0.06712249706669125, + "learning_rate": 0.0001818617011528893, + "loss": 0.0179, + "step": 373 + }, + { + "epoch": 0.8283499446290143, + "grad_norm": 0.09368356786567972, + "learning_rate": 0.0001817131671310698, + "loss": 0.031, + "step": 374 + }, + { + "epoch": 0.8305647840531561, + "grad_norm": 0.08960440432431373, + "learning_rate": 0.0001815640885935689, + "loss": 0.0404, + "step": 375 + }, + { + "epoch": 0.832779623477298, + "grad_norm": 0.1124807536927241, + "learning_rate": 0.00018141446653380792, + "loss": 0.0299, + "step": 376 + }, + { + "epoch": 0.8349944629014396, + "grad_norm": 0.08188137955909824, + "learning_rate": 0.00018126430194882994, + "loss": 0.022, + "step": 377 + }, + { + "epoch": 0.8372093023255814, + "grad_norm": 0.07590675023323204, + "learning_rate": 0.00018111359583929354, + "loss": 0.028, + "step": 378 + }, + { + "epoch": 0.8394241417497231, + "grad_norm": 0.09311017454886383, + "learning_rate": 0.00018096234920946574, + "loss": 0.0245, + "step": 379 + }, + { + "epoch": 0.8416389811738649, + "grad_norm": 0.1365927622215841, + "learning_rate": 0.00018081056306721536, + "loss": 0.0318, + "step": 380 + }, + { + "epoch": 0.8438538205980066, + "grad_norm": 0.13428051271744462, + "learning_rate": 0.0001806582384240066, + "loss": 0.038, + "step": 381 + }, + { + "epoch": 0.8460686600221484, + "grad_norm": 0.13061857200630747, + "learning_rate": 0.00018050537629489191, + "loss": 0.0295, + "step": 382 + }, + { + "epoch": 0.8482834994462901, + "grad_norm": 0.08628924195911908, + "learning_rate": 0.00018035197769850555, + "loss": 0.0253, + "step": 383 + }, + { + "epoch": 0.8504983388704319, + "grad_norm": 0.07292250172041051, + "learning_rate": 0.00018019804365705658, + "loss": 0.0209, + "step": 384 + }, + { + "epoch": 0.8527131782945736, + "grad_norm": 0.07198738334864144, + "learning_rate": 0.00018004357519632217, + "loss": 0.0168, + "step": 385 + }, + { + "epoch": 0.8549280177187154, + "grad_norm": 0.08204200719636238, + "learning_rate": 0.00017988857334564064, + "loss": 0.0226, + "step": 386 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.09461328323004375, + "learning_rate": 0.00017973303913790479, + "loss": 0.0287, + "step": 387 + }, + { + "epoch": 0.8593576965669989, + "grad_norm": 0.09333642851587562, + "learning_rate": 0.00017957697360955485, + "loss": 0.0382, + "step": 388 + }, + { + "epoch": 0.8615725359911407, + "grad_norm": 0.07873607343881187, + "learning_rate": 0.00017942037780057163, + "loss": 0.0323, + "step": 389 + }, + { + "epoch": 0.8637873754152824, + "grad_norm": 0.09585102156567583, + "learning_rate": 0.00017926325275446968, + "loss": 0.0282, + "step": 390 + }, + { + "epoch": 0.8660022148394242, + "grad_norm": 0.10941060321055031, + "learning_rate": 0.00017910559951829017, + "loss": 0.0385, + "step": 391 + }, + { + "epoch": 0.8682170542635659, + "grad_norm": 0.11980210118412196, + "learning_rate": 0.00017894741914259394, + "loss": 0.0294, + "step": 392 + }, + { + "epoch": 0.8704318936877077, + "grad_norm": 0.09805974626327063, + "learning_rate": 0.0001787887126814547, + "loss": 0.0351, + "step": 393 + }, + { + "epoch": 0.8726467331118494, + "grad_norm": 0.0964746682711025, + "learning_rate": 0.00017862948119245177, + "loss": 0.0374, + "step": 394 + }, + { + "epoch": 0.8748615725359912, + "grad_norm": 0.1261563970365322, + "learning_rate": 0.00017846972573666312, + "loss": 0.0318, + "step": 395 + }, + { + "epoch": 0.8770764119601329, + "grad_norm": 0.06322327005685795, + "learning_rate": 0.00017830944737865832, + "loss": 0.021, + "step": 396 + }, + { + "epoch": 0.8792912513842747, + "grad_norm": 0.1232663358011499, + "learning_rate": 0.0001781486471864914, + "loss": 0.0271, + "step": 397 + }, + { + "epoch": 0.8815060908084164, + "grad_norm": 0.08535574173826038, + "learning_rate": 0.00017798732623169384, + "loss": 0.0364, + "step": 398 + }, + { + "epoch": 0.8837209302325582, + "grad_norm": 0.11441427321173675, + "learning_rate": 0.00017782548558926723, + "loss": 0.0276, + "step": 399 + }, + { + "epoch": 0.8859357696566998, + "grad_norm": 0.07585446309248646, + "learning_rate": 0.00017766312633767635, + "loss": 0.0248, + "step": 400 + }, + { + "epoch": 0.8881506090808416, + "grad_norm": 0.07164771346087506, + "learning_rate": 0.00017750024955884175, + "loss": 0.021, + "step": 401 + }, + { + "epoch": 0.8903654485049833, + "grad_norm": 0.08743844732682912, + "learning_rate": 0.0001773368563381327, + "loss": 0.0263, + "step": 402 + }, + { + "epoch": 0.8925802879291251, + "grad_norm": 0.08891368050663236, + "learning_rate": 0.0001771729477643599, + "loss": 0.0276, + "step": 403 + }, + { + "epoch": 0.8947951273532669, + "grad_norm": 0.1112219207143199, + "learning_rate": 0.00017700852492976828, + "loss": 0.0383, + "step": 404 + }, + { + "epoch": 0.8970099667774086, + "grad_norm": 0.10862512677116702, + "learning_rate": 0.00017684358893002957, + "loss": 0.0395, + "step": 405 + }, + { + "epoch": 0.8992248062015504, + "grad_norm": 0.08306491846730917, + "learning_rate": 0.0001766781408642352, + "loss": 0.03, + "step": 406 + }, + { + "epoch": 0.9014396456256921, + "grad_norm": 0.09574907602559253, + "learning_rate": 0.0001765121818348888, + "loss": 0.0262, + "step": 407 + }, + { + "epoch": 0.9036544850498339, + "grad_norm": 0.0742833442012322, + "learning_rate": 0.00017634571294789897, + "loss": 0.0275, + "step": 408 + }, + { + "epoch": 0.9058693244739756, + "grad_norm": 0.07251667706314688, + "learning_rate": 0.00017617873531257188, + "loss": 0.0237, + "step": 409 + }, + { + "epoch": 0.9080841638981174, + "grad_norm": 0.11192817565413549, + "learning_rate": 0.00017601125004160386, + "loss": 0.0367, + "step": 410 + }, + { + "epoch": 0.9102990033222591, + "grad_norm": 0.11308273534579694, + "learning_rate": 0.00017584325825107397, + "loss": 0.0366, + "step": 411 + }, + { + "epoch": 0.9125138427464009, + "grad_norm": 0.0769744308164503, + "learning_rate": 0.00017567476106043666, + "loss": 0.0167, + "step": 412 + }, + { + "epoch": 0.9147286821705426, + "grad_norm": 0.08911895912177897, + "learning_rate": 0.0001755057595925141, + "loss": 0.0252, + "step": 413 + }, + { + "epoch": 0.9169435215946844, + "grad_norm": 0.07932586091312234, + "learning_rate": 0.00017533625497348902, + "loss": 0.0304, + "step": 414 + }, + { + "epoch": 0.9191583610188261, + "grad_norm": 0.10081637483948423, + "learning_rate": 0.00017516624833289684, + "loss": 0.0233, + "step": 415 + }, + { + "epoch": 0.9213732004429679, + "grad_norm": 0.09617657367669462, + "learning_rate": 0.00017499574080361845, + "loss": 0.034, + "step": 416 + }, + { + "epoch": 0.9235880398671097, + "grad_norm": 0.08234975475953288, + "learning_rate": 0.00017482473352187247, + "loss": 0.0252, + "step": 417 + }, + { + "epoch": 0.9258028792912514, + "grad_norm": 0.06815247182755384, + "learning_rate": 0.00017465322762720776, + "loss": 0.0187, + "step": 418 + }, + { + "epoch": 0.9280177187153932, + "grad_norm": 0.08137652858841106, + "learning_rate": 0.0001744812242624958, + "loss": 0.0182, + "step": 419 + }, + { + "epoch": 0.9302325581395349, + "grad_norm": 0.07497620553387589, + "learning_rate": 0.00017430872457392308, + "loss": 0.0243, + "step": 420 + }, + { + "epoch": 0.9324473975636767, + "grad_norm": 0.09680827227546589, + "learning_rate": 0.00017413572971098357, + "loss": 0.0249, + "step": 421 + }, + { + "epoch": 0.9346622369878184, + "grad_norm": 0.12645224866463453, + "learning_rate": 0.00017396224082647077, + "loss": 0.0349, + "step": 422 + }, + { + "epoch": 0.9368770764119602, + "grad_norm": 0.15497842702651096, + "learning_rate": 0.00017378825907647034, + "loss": 0.0303, + "step": 423 + }, + { + "epoch": 0.9390919158361019, + "grad_norm": 0.09914314439082061, + "learning_rate": 0.00017361378562035225, + "loss": 0.0318, + "step": 424 + }, + { + "epoch": 0.9413067552602437, + "grad_norm": 0.08728565332022578, + "learning_rate": 0.000173438821620763, + "loss": 0.0252, + "step": 425 + }, + { + "epoch": 0.9435215946843853, + "grad_norm": 0.11668996416790389, + "learning_rate": 0.00017326336824361813, + "loss": 0.0417, + "step": 426 + }, + { + "epoch": 0.9457364341085271, + "grad_norm": 0.08039976502316808, + "learning_rate": 0.00017308742665809402, + "loss": 0.0236, + "step": 427 + }, + { + "epoch": 0.9479512735326688, + "grad_norm": 0.1054313108744657, + "learning_rate": 0.00017291099803662055, + "loss": 0.0281, + "step": 428 + }, + { + "epoch": 0.9501661129568106, + "grad_norm": 0.0992848542428949, + "learning_rate": 0.00017273408355487297, + "loss": 0.0326, + "step": 429 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.11352559565641726, + "learning_rate": 0.00017255668439176421, + "loss": 0.0293, + "step": 430 + }, + { + "epoch": 0.9545957918050941, + "grad_norm": 0.07287653989923466, + "learning_rate": 0.00017237880172943707, + "loss": 0.022, + "step": 431 + }, + { + "epoch": 0.9568106312292359, + "grad_norm": 0.08523354170499019, + "learning_rate": 0.00017220043675325608, + "loss": 0.0251, + "step": 432 + }, + { + "epoch": 0.9590254706533776, + "grad_norm": 0.0898284342654929, + "learning_rate": 0.00017202159065179994, + "loss": 0.0286, + "step": 433 + }, + { + "epoch": 0.9612403100775194, + "grad_norm": 0.08577936792872161, + "learning_rate": 0.00017184226461685345, + "loss": 0.0387, + "step": 434 + }, + { + "epoch": 0.9634551495016611, + "grad_norm": 0.06799708106028148, + "learning_rate": 0.0001716624598433995, + "loss": 0.0196, + "step": 435 + }, + { + "epoch": 0.9656699889258029, + "grad_norm": 0.08173172532410382, + "learning_rate": 0.00017148217752961114, + "loss": 0.0255, + "step": 436 + }, + { + "epoch": 0.9678848283499446, + "grad_norm": 0.08723069666870088, + "learning_rate": 0.0001713014188768437, + "loss": 0.0254, + "step": 437 + }, + { + "epoch": 0.9700996677740864, + "grad_norm": 0.09364013319247487, + "learning_rate": 0.0001711201850896267, + "loss": 0.0185, + "step": 438 + }, + { + "epoch": 0.9723145071982281, + "grad_norm": 0.08747526554258868, + "learning_rate": 0.00017093847737565586, + "loss": 0.0265, + "step": 439 + }, + { + "epoch": 0.9745293466223699, + "grad_norm": 0.09528472492312309, + "learning_rate": 0.0001707562969457849, + "loss": 0.0311, + "step": 440 + }, + { + "epoch": 0.9767441860465116, + "grad_norm": 0.09460091156103437, + "learning_rate": 0.00017057364501401776, + "loss": 0.0243, + "step": 441 + }, + { + "epoch": 0.9789590254706534, + "grad_norm": 0.06457437279768916, + "learning_rate": 0.00017039052279750028, + "loss": 0.0202, + "step": 442 + }, + { + "epoch": 0.9811738648947951, + "grad_norm": 0.08322019836911437, + "learning_rate": 0.00017020693151651207, + "loss": 0.022, + "step": 443 + }, + { + "epoch": 0.9833887043189369, + "grad_norm": 0.08712278052026377, + "learning_rate": 0.00017002287239445863, + "loss": 0.0293, + "step": 444 + }, + { + "epoch": 0.9856035437430787, + "grad_norm": 0.08866826799495027, + "learning_rate": 0.00016983834665786293, + "loss": 0.0324, + "step": 445 + }, + { + "epoch": 0.9878183831672204, + "grad_norm": 0.08221966010575803, + "learning_rate": 0.00016965335553635735, + "loss": 0.0227, + "step": 446 + }, + { + "epoch": 0.9900332225913622, + "grad_norm": 0.07823372473994855, + "learning_rate": 0.0001694679002626755, + "loss": 0.0262, + "step": 447 + }, + { + "epoch": 0.9922480620155039, + "grad_norm": 0.1682039836448326, + "learning_rate": 0.00016928198207264399, + "loss": 0.0242, + "step": 448 + }, + { + "epoch": 0.9944629014396457, + "grad_norm": 0.13597478095997192, + "learning_rate": 0.00016909560220517412, + "loss": 0.0351, + "step": 449 + }, + { + "epoch": 0.9966777408637874, + "grad_norm": 0.10411354364469891, + "learning_rate": 0.00016890876190225375, + "loss": 0.0304, + "step": 450 + }, + { + "epoch": 0.9988925802879292, + "grad_norm": 0.06788033707715571, + "learning_rate": 0.000168721462408939, + "loss": 0.0168, + "step": 451 + }, + { + "epoch": 0.9988925802879292, + "eval_loss": 0.02549322322010994, + "eval_runtime": 162.3482, + "eval_samples_per_second": 9.369, + "eval_steps_per_second": 0.296, + "step": 451 + }, + { + "epoch": 1.0011074197120708, + "grad_norm": 0.08589649213920914, + "learning_rate": 0.0001685337049733458, + "loss": 0.023, + "step": 452 + }, + { + "epoch": 1.0033222591362125, + "grad_norm": 0.13532432367872516, + "learning_rate": 0.00016834549084664182, + "loss": 0.0238, + "step": 453 + }, + { + "epoch": 1.0055370985603544, + "grad_norm": 0.07953664058371505, + "learning_rate": 0.00016815682128303792, + "loss": 0.0175, + "step": 454 + }, + { + "epoch": 1.0077519379844961, + "grad_norm": 0.08055093704390931, + "learning_rate": 0.00016796769753977988, + "loss": 0.0198, + "step": 455 + }, + { + "epoch": 1.0099667774086378, + "grad_norm": 0.17124424815528758, + "learning_rate": 0.00016777812087714002, + "loss": 0.0271, + "step": 456 + }, + { + "epoch": 1.0121816168327795, + "grad_norm": 0.07626228751323345, + "learning_rate": 0.0001675880925584089, + "loss": 0.0116, + "step": 457 + }, + { + "epoch": 1.0143964562569214, + "grad_norm": 0.07303368251394754, + "learning_rate": 0.0001673976138498866, + "loss": 0.0229, + "step": 458 + }, + { + "epoch": 1.0166112956810631, + "grad_norm": 0.11078018459147443, + "learning_rate": 0.0001672066860208747, + "loss": 0.026, + "step": 459 + }, + { + "epoch": 1.0188261351052048, + "grad_norm": 0.07994846862742215, + "learning_rate": 0.00016701531034366744, + "loss": 0.0219, + "step": 460 + }, + { + "epoch": 1.0210409745293467, + "grad_norm": 0.1020734679742894, + "learning_rate": 0.00016682348809354347, + "loss": 0.0268, + "step": 461 + }, + { + "epoch": 1.0232558139534884, + "grad_norm": 0.08127568455810623, + "learning_rate": 0.00016663122054875727, + "loss": 0.0166, + "step": 462 + }, + { + "epoch": 1.02547065337763, + "grad_norm": 0.07430051588089749, + "learning_rate": 0.00016643850899053075, + "loss": 0.0159, + "step": 463 + }, + { + "epoch": 1.0276854928017718, + "grad_norm": 0.07715248918476807, + "learning_rate": 0.00016624535470304447, + "loss": 0.0129, + "step": 464 + }, + { + "epoch": 1.0299003322259137, + "grad_norm": 0.07347174610141043, + "learning_rate": 0.00016605175897342926, + "loss": 0.0202, + "step": 465 + }, + { + "epoch": 1.0321151716500554, + "grad_norm": 0.07229716608472317, + "learning_rate": 0.0001658577230917577, + "loss": 0.0116, + "step": 466 + }, + { + "epoch": 1.034330011074197, + "grad_norm": 0.07417939197872284, + "learning_rate": 0.00016566324835103527, + "loss": 0.0183, + "step": 467 + }, + { + "epoch": 1.0365448504983388, + "grad_norm": 0.06656534558818392, + "learning_rate": 0.00016546833604719202, + "loss": 0.0164, + "step": 468 + }, + { + "epoch": 1.0387596899224807, + "grad_norm": 0.06936779083577965, + "learning_rate": 0.00016527298747907385, + "loss": 0.0169, + "step": 469 + }, + { + "epoch": 1.0409745293466224, + "grad_norm": 0.10152766178760593, + "learning_rate": 0.00016507720394843363, + "loss": 0.0223, + "step": 470 + }, + { + "epoch": 1.043189368770764, + "grad_norm": 0.12217930497625859, + "learning_rate": 0.00016488098675992286, + "loss": 0.0136, + "step": 471 + }, + { + "epoch": 1.0454042081949058, + "grad_norm": 0.0766251038059969, + "learning_rate": 0.0001646843372210828, + "loss": 0.0165, + "step": 472 + }, + { + "epoch": 1.0476190476190477, + "grad_norm": 0.10466469374372958, + "learning_rate": 0.00016448725664233575, + "loss": 0.0206, + "step": 473 + }, + { + "epoch": 1.0498338870431894, + "grad_norm": 0.09952268715140701, + "learning_rate": 0.00016428974633697635, + "loss": 0.0156, + "step": 474 + }, + { + "epoch": 1.052048726467331, + "grad_norm": 0.10761324867912393, + "learning_rate": 0.00016409180762116287, + "loss": 0.0266, + "step": 475 + }, + { + "epoch": 1.054263565891473, + "grad_norm": 0.09563396934567533, + "learning_rate": 0.00016389344181390837, + "loss": 0.0205, + "step": 476 + }, + { + "epoch": 1.0564784053156147, + "grad_norm": 0.11864213983151412, + "learning_rate": 0.00016369465023707193, + "loss": 0.024, + "step": 477 + }, + { + "epoch": 1.0586932447397563, + "grad_norm": 0.09807832567261117, + "learning_rate": 0.00016349543421534984, + "loss": 0.0139, + "step": 478 + }, + { + "epoch": 1.060908084163898, + "grad_norm": 0.11305006379484095, + "learning_rate": 0.00016329579507626686, + "loss": 0.0289, + "step": 479 + }, + { + "epoch": 1.06312292358804, + "grad_norm": 0.09302174510975784, + "learning_rate": 0.00016309573415016715, + "loss": 0.0135, + "step": 480 + }, + { + "epoch": 1.0653377630121816, + "grad_norm": 0.08756846494305612, + "learning_rate": 0.00016289525277020573, + "loss": 0.0171, + "step": 481 + }, + { + "epoch": 1.0675526024363233, + "grad_norm": 0.05689917857821868, + "learning_rate": 0.00016269435227233927, + "loss": 0.0148, + "step": 482 + }, + { + "epoch": 1.069767441860465, + "grad_norm": 0.07265656240482526, + "learning_rate": 0.0001624930339953174, + "loss": 0.0199, + "step": 483 + }, + { + "epoch": 1.071982281284607, + "grad_norm": 0.09653771960696875, + "learning_rate": 0.00016229129928067374, + "loss": 0.0263, + "step": 484 + }, + { + "epoch": 1.0741971207087486, + "grad_norm": 0.0680134898630167, + "learning_rate": 0.00016208914947271686, + "loss": 0.0169, + "step": 485 + }, + { + "epoch": 1.0764119601328903, + "grad_norm": 0.052701699772681125, + "learning_rate": 0.0001618865859185215, + "loss": 0.0133, + "step": 486 + }, + { + "epoch": 1.078626799557032, + "grad_norm": 0.07006329174735149, + "learning_rate": 0.0001616836099679195, + "loss": 0.0186, + "step": 487 + }, + { + "epoch": 1.080841638981174, + "grad_norm": 0.09243756075297438, + "learning_rate": 0.00016148022297349067, + "loss": 0.0162, + "step": 488 + }, + { + "epoch": 1.0830564784053156, + "grad_norm": 0.06802348226339236, + "learning_rate": 0.00016127642629055412, + "loss": 0.02, + "step": 489 + }, + { + "epoch": 1.0852713178294573, + "grad_norm": 0.09149553777713718, + "learning_rate": 0.00016107222127715887, + "loss": 0.0223, + "step": 490 + }, + { + "epoch": 1.0874861572535992, + "grad_norm": 0.10666620026192312, + "learning_rate": 0.00016086760929407497, + "loss": 0.0195, + "step": 491 + }, + { + "epoch": 1.089700996677741, + "grad_norm": 0.07350745160703971, + "learning_rate": 0.00016066259170478443, + "loss": 0.0192, + "step": 492 + }, + { + "epoch": 1.0919158361018826, + "grad_norm": 0.07649755758370294, + "learning_rate": 0.0001604571698754721, + "loss": 0.0184, + "step": 493 + }, + { + "epoch": 1.0941306755260243, + "grad_norm": 0.07693450016879375, + "learning_rate": 0.00016025134517501664, + "loss": 0.0187, + "step": 494 + }, + { + "epoch": 1.0963455149501662, + "grad_norm": 0.07424284058512753, + "learning_rate": 0.00016004511897498128, + "loss": 0.0166, + "step": 495 + }, + { + "epoch": 1.0985603543743079, + "grad_norm": 0.06091477206840724, + "learning_rate": 0.00015983849264960477, + "loss": 0.0174, + "step": 496 + }, + { + "epoch": 1.1007751937984496, + "grad_norm": 0.07586220724504236, + "learning_rate": 0.00015963146757579216, + "loss": 0.014, + "step": 497 + }, + { + "epoch": 1.1029900332225913, + "grad_norm": 0.08243131404611125, + "learning_rate": 0.00015942404513310575, + "loss": 0.0169, + "step": 498 + }, + { + "epoch": 1.1052048726467332, + "grad_norm": 0.07745939546672638, + "learning_rate": 0.00015921622670375562, + "loss": 0.0137, + "step": 499 + }, + { + "epoch": 1.1074197120708749, + "grad_norm": 0.06507681670689026, + "learning_rate": 0.00015900801367259082, + "loss": 0.0086, + "step": 500 + }, + { + "epoch": 1.1096345514950166, + "grad_norm": 0.08636654044843717, + "learning_rate": 0.00015879940742708976, + "loss": 0.0124, + "step": 501 + }, + { + "epoch": 1.1118493909191585, + "grad_norm": 0.09336839614417057, + "learning_rate": 0.00015859040935735122, + "loss": 0.0199, + "step": 502 + }, + { + "epoch": 1.1140642303433002, + "grad_norm": 0.08361044738696155, + "learning_rate": 0.000158381020856085, + "loss": 0.0149, + "step": 503 + }, + { + "epoch": 1.1162790697674418, + "grad_norm": 0.09189649942964037, + "learning_rate": 0.00015817124331860257, + "loss": 0.0162, + "step": 504 + }, + { + "epoch": 1.1184939091915835, + "grad_norm": 0.05803566347391752, + "learning_rate": 0.0001579610781428079, + "loss": 0.0093, + "step": 505 + }, + { + "epoch": 1.1207087486157254, + "grad_norm": 0.0685604093344935, + "learning_rate": 0.00015775052672918803, + "loss": 0.0169, + "step": 506 + }, + { + "epoch": 1.1229235880398671, + "grad_norm": 0.06647423245351942, + "learning_rate": 0.00015753959048080387, + "loss": 0.0138, + "step": 507 + }, + { + "epoch": 1.1251384274640088, + "grad_norm": 0.07574981472625895, + "learning_rate": 0.00015732827080328066, + "loss": 0.0158, + "step": 508 + }, + { + "epoch": 1.1273532668881505, + "grad_norm": 0.09942610916086221, + "learning_rate": 0.0001571165691047988, + "loss": 0.0262, + "step": 509 + }, + { + "epoch": 1.1295681063122924, + "grad_norm": 0.09830858194361332, + "learning_rate": 0.00015690448679608435, + "loss": 0.0232, + "step": 510 + }, + { + "epoch": 1.1317829457364341, + "grad_norm": 0.07161117904964832, + "learning_rate": 0.00015669202529039965, + "loss": 0.014, + "step": 511 + }, + { + "epoch": 1.1339977851605758, + "grad_norm": 0.09194652822526989, + "learning_rate": 0.00015647918600353395, + "loss": 0.0292, + "step": 512 + }, + { + "epoch": 1.1362126245847177, + "grad_norm": 0.08837675652236006, + "learning_rate": 0.00015626597035379392, + "loss": 0.019, + "step": 513 + }, + { + "epoch": 1.1384274640088594, + "grad_norm": 0.07171847899234002, + "learning_rate": 0.0001560523797619942, + "loss": 0.016, + "step": 514 + }, + { + "epoch": 1.140642303433001, + "grad_norm": 0.08285004555855917, + "learning_rate": 0.00015583841565144793, + "loss": 0.0243, + "step": 515 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.05178154733435809, + "learning_rate": 0.0001556240794479573, + "loss": 0.0078, + "step": 516 + }, + { + "epoch": 1.1450719822812845, + "grad_norm": 0.0960196972431174, + "learning_rate": 0.00015540937257980413, + "loss": 0.016, + "step": 517 + }, + { + "epoch": 1.1472868217054264, + "grad_norm": 0.08730986367875605, + "learning_rate": 0.00015519429647774006, + "loss": 0.0185, + "step": 518 + }, + { + "epoch": 1.149501661129568, + "grad_norm": 0.08590566440974896, + "learning_rate": 0.00015497885257497734, + "loss": 0.0234, + "step": 519 + }, + { + "epoch": 1.1517165005537098, + "grad_norm": 0.07604779939650068, + "learning_rate": 0.00015476304230717923, + "loss": 0.0109, + "step": 520 + }, + { + "epoch": 1.1539313399778517, + "grad_norm": 0.04630925444784343, + "learning_rate": 0.0001545468671124501, + "loss": 0.01, + "step": 521 + }, + { + "epoch": 1.1561461794019934, + "grad_norm": 0.0649376527000263, + "learning_rate": 0.00015433032843132635, + "loss": 0.0121, + "step": 522 + }, + { + "epoch": 1.158361018826135, + "grad_norm": 0.06292009291504237, + "learning_rate": 0.00015411342770676645, + "loss": 0.0154, + "step": 523 + }, + { + "epoch": 1.1605758582502768, + "grad_norm": 0.08055266586650985, + "learning_rate": 0.0001538961663841414, + "loss": 0.0157, + "step": 524 + }, + { + "epoch": 1.1627906976744187, + "grad_norm": 0.07511335829066276, + "learning_rate": 0.00015367854591122522, + "loss": 0.0151, + "step": 525 + }, + { + "epoch": 1.1650055370985604, + "grad_norm": 0.07089788363462246, + "learning_rate": 0.00015346056773818517, + "loss": 0.0157, + "step": 526 + }, + { + "epoch": 1.167220376522702, + "grad_norm": 0.11521158492283862, + "learning_rate": 0.00015324223331757213, + "loss": 0.0177, + "step": 527 + }, + { + "epoch": 1.169435215946844, + "grad_norm": 0.05973246782251088, + "learning_rate": 0.0001530235441043109, + "loss": 0.0146, + "step": 528 + }, + { + "epoch": 1.1716500553709857, + "grad_norm": 0.06154426543783861, + "learning_rate": 0.00015280450155569063, + "loss": 0.0188, + "step": 529 + }, + { + "epoch": 1.1738648947951273, + "grad_norm": 0.058458329224942734, + "learning_rate": 0.00015258510713135487, + "loss": 0.0122, + "step": 530 + }, + { + "epoch": 1.176079734219269, + "grad_norm": 0.07787987891702267, + "learning_rate": 0.00015236536229329208, + "loss": 0.0158, + "step": 531 + }, + { + "epoch": 1.178294573643411, + "grad_norm": 0.09829421068026872, + "learning_rate": 0.00015214526850582572, + "loss": 0.019, + "step": 532 + }, + { + "epoch": 1.1805094130675526, + "grad_norm": 0.09308250012352962, + "learning_rate": 0.00015192482723560462, + "loss": 0.0194, + "step": 533 + }, + { + "epoch": 1.1827242524916943, + "grad_norm": 0.09253867986960174, + "learning_rate": 0.0001517040399515931, + "loss": 0.028, + "step": 534 + }, + { + "epoch": 1.184939091915836, + "grad_norm": 0.12061147161004993, + "learning_rate": 0.00015148290812506127, + "loss": 0.0224, + "step": 535 + }, + { + "epoch": 1.187153931339978, + "grad_norm": 0.07936414157309797, + "learning_rate": 0.00015126143322957515, + "loss": 0.0126, + "step": 536 + }, + { + "epoch": 1.1893687707641196, + "grad_norm": 0.08573551577862286, + "learning_rate": 0.00015103961674098687, + "loss": 0.0153, + "step": 537 + }, + { + "epoch": 1.1915836101882613, + "grad_norm": 0.09090244171449426, + "learning_rate": 0.0001508174601374249, + "loss": 0.018, + "step": 538 + }, + { + "epoch": 1.193798449612403, + "grad_norm": 0.06059684589591143, + "learning_rate": 0.00015059496489928413, + "loss": 0.0113, + "step": 539 + }, + { + "epoch": 1.196013289036545, + "grad_norm": 0.06042455116320195, + "learning_rate": 0.00015037213250921596, + "loss": 0.0148, + "step": 540 + }, + { + "epoch": 1.1982281284606866, + "grad_norm": 0.09425381306893758, + "learning_rate": 0.0001501489644521186, + "loss": 0.0256, + "step": 541 + }, + { + "epoch": 1.2004429678848283, + "grad_norm": 0.08496965301125604, + "learning_rate": 0.00014992546221512698, + "loss": 0.017, + "step": 542 + }, + { + "epoch": 1.2026578073089702, + "grad_norm": 0.07725019852419338, + "learning_rate": 0.00014970162728760293, + "loss": 0.0172, + "step": 543 + }, + { + "epoch": 1.204872646733112, + "grad_norm": 0.06569370131436317, + "learning_rate": 0.00014947746116112523, + "loss": 0.0111, + "step": 544 + }, + { + "epoch": 1.2070874861572536, + "grad_norm": 0.09936034314078612, + "learning_rate": 0.00014925296532947976, + "loss": 0.0174, + "step": 545 + }, + { + "epoch": 1.2093023255813953, + "grad_norm": 0.08702985013579236, + "learning_rate": 0.00014902814128864942, + "loss": 0.0253, + "step": 546 + }, + { + "epoch": 1.2115171650055372, + "grad_norm": 0.0764467514732507, + "learning_rate": 0.00014880299053680418, + "loss": 0.0197, + "step": 547 + }, + { + "epoch": 1.2137320044296789, + "grad_norm": 0.057459905258067566, + "learning_rate": 0.00014857751457429116, + "loss": 0.0112, + "step": 548 + }, + { + "epoch": 1.2159468438538206, + "grad_norm": 0.08155328051110594, + "learning_rate": 0.0001483517149036246, + "loss": 0.0215, + "step": 549 + }, + { + "epoch": 1.2181616832779623, + "grad_norm": 0.06512603734340787, + "learning_rate": 0.0001481255930294759, + "loss": 0.0111, + "step": 550 + }, + { + "epoch": 1.2203765227021042, + "grad_norm": 0.0823852866211762, + "learning_rate": 0.00014789915045866346, + "loss": 0.0187, + "step": 551 + }, + { + "epoch": 1.2225913621262459, + "grad_norm": 0.14188552071403435, + "learning_rate": 0.00014767238870014272, + "loss": 0.0254, + "step": 552 + }, + { + "epoch": 1.2248062015503876, + "grad_norm": 0.07095528535148037, + "learning_rate": 0.0001474453092649962, + "loss": 0.0169, + "step": 553 + }, + { + "epoch": 1.2270210409745292, + "grad_norm": 0.0620273402779489, + "learning_rate": 0.00014721791366642322, + "loss": 0.0156, + "step": 554 + }, + { + "epoch": 1.2292358803986712, + "grad_norm": 0.07242995826021505, + "learning_rate": 0.00014699020341973, + "loss": 0.0213, + "step": 555 + }, + { + "epoch": 1.2314507198228128, + "grad_norm": 0.08775618038885223, + "learning_rate": 0.00014676218004231953, + "loss": 0.0143, + "step": 556 + }, + { + "epoch": 1.2336655592469545, + "grad_norm": 0.10107224792431188, + "learning_rate": 0.0001465338450536813, + "loss": 0.0215, + "step": 557 + }, + { + "epoch": 1.2358803986710964, + "grad_norm": 0.07481578394385288, + "learning_rate": 0.0001463051999753814, + "loss": 0.0201, + "step": 558 + }, + { + "epoch": 1.2380952380952381, + "grad_norm": 0.055578370712652134, + "learning_rate": 0.0001460762463310523, + "loss": 0.0111, + "step": 559 + }, + { + "epoch": 1.2403100775193798, + "grad_norm": 0.06920554393495616, + "learning_rate": 0.00014584698564638253, + "loss": 0.0132, + "step": 560 + }, + { + "epoch": 1.2425249169435215, + "grad_norm": 0.06974396904159569, + "learning_rate": 0.00014561741944910683, + "loss": 0.017, + "step": 561 + }, + { + "epoch": 1.2447397563676634, + "grad_norm": 0.06281256493900224, + "learning_rate": 0.0001453875492689957, + "loss": 0.0128, + "step": 562 + }, + { + "epoch": 1.2469545957918051, + "grad_norm": 0.06884909068671637, + "learning_rate": 0.00014515737663784535, + "loss": 0.0101, + "step": 563 + }, + { + "epoch": 1.2491694352159468, + "grad_norm": 0.05627820381571176, + "learning_rate": 0.00014492690308946736, + "loss": 0.0196, + "step": 564 + }, + { + "epoch": 1.2513842746400887, + "grad_norm": 0.0698964300385918, + "learning_rate": 0.00014469613015967867, + "loss": 0.0188, + "step": 565 + }, + { + "epoch": 1.2535991140642304, + "grad_norm": 0.08043716033946414, + "learning_rate": 0.00014446505938629114, + "loss": 0.02, + "step": 566 + }, + { + "epoch": 1.255813953488372, + "grad_norm": 0.07337220327813347, + "learning_rate": 0.00014423369230910136, + "loss": 0.0194, + "step": 567 + }, + { + "epoch": 1.2580287929125138, + "grad_norm": 0.09840321104930393, + "learning_rate": 0.00014400203046988053, + "loss": 0.022, + "step": 568 + }, + { + "epoch": 1.2602436323366555, + "grad_norm": 0.08731377429046894, + "learning_rate": 0.00014377007541236395, + "loss": 0.016, + "step": 569 + }, + { + "epoch": 1.2624584717607974, + "grad_norm": 0.06801941956571358, + "learning_rate": 0.00014353782868224092, + "loss": 0.0128, + "step": 570 + }, + { + "epoch": 1.264673311184939, + "grad_norm": 0.07308867871744269, + "learning_rate": 0.0001433052918271443, + "loss": 0.0177, + "step": 571 + }, + { + "epoch": 1.2668881506090808, + "grad_norm": 0.07681844444023452, + "learning_rate": 0.00014307246639664039, + "loss": 0.0136, + "step": 572 + }, + { + "epoch": 1.2691029900332227, + "grad_norm": 0.07567605013534005, + "learning_rate": 0.00014283935394221838, + "loss": 0.0106, + "step": 573 + }, + { + "epoch": 1.2713178294573644, + "grad_norm": 0.09446575449376596, + "learning_rate": 0.0001426059560172801, + "loss": 0.0197, + "step": 574 + }, + { + "epoch": 1.273532668881506, + "grad_norm": 0.13276733596265716, + "learning_rate": 0.00014237227417712974, + "loss": 0.0305, + "step": 575 + }, + { + "epoch": 1.2757475083056478, + "grad_norm": 0.10084028943306088, + "learning_rate": 0.0001421383099789635, + "loss": 0.0131, + "step": 576 + }, + { + "epoch": 1.2779623477297897, + "grad_norm": 0.08587833926101628, + "learning_rate": 0.00014190406498185897, + "loss": 0.0116, + "step": 577 + }, + { + "epoch": 1.2801771871539314, + "grad_norm": 0.0688277391779551, + "learning_rate": 0.00014166954074676502, + "loss": 0.0191, + "step": 578 + }, + { + "epoch": 1.282392026578073, + "grad_norm": 0.08053314238079391, + "learning_rate": 0.00014143473883649127, + "loss": 0.0236, + "step": 579 + }, + { + "epoch": 1.284606866002215, + "grad_norm": 0.0912991017952056, + "learning_rate": 0.0001411996608156977, + "loss": 0.0175, + "step": 580 + }, + { + "epoch": 1.2868217054263567, + "grad_norm": 0.07935442204696004, + "learning_rate": 0.00014096430825088425, + "loss": 0.0117, + "step": 581 + }, + { + "epoch": 1.2890365448504983, + "grad_norm": 0.09144227296861142, + "learning_rate": 0.00014072868271038027, + "loss": 0.0247, + "step": 582 + }, + { + "epoch": 1.29125138427464, + "grad_norm": 0.06954241132745707, + "learning_rate": 0.0001404927857643342, + "loss": 0.0155, + "step": 583 + }, + { + "epoch": 1.2934662236987817, + "grad_norm": 0.10372696956700662, + "learning_rate": 0.00014025661898470307, + "loss": 0.024, + "step": 584 + }, + { + "epoch": 1.2956810631229236, + "grad_norm": 0.08664483584899454, + "learning_rate": 0.00014002018394524205, + "loss": 0.0173, + "step": 585 + }, + { + "epoch": 1.2978959025470653, + "grad_norm": 0.08515541008249282, + "learning_rate": 0.00013978348222149376, + "loss": 0.0193, + "step": 586 + }, + { + "epoch": 1.300110741971207, + "grad_norm": 0.09712854025334372, + "learning_rate": 0.00013954651539077812, + "loss": 0.0159, + "step": 587 + }, + { + "epoch": 1.302325581395349, + "grad_norm": 0.07335540120405447, + "learning_rate": 0.00013930928503218162, + "loss": 0.0134, + "step": 588 + }, + { + "epoch": 1.3045404208194906, + "grad_norm": 0.08866517234131753, + "learning_rate": 0.0001390717927265467, + "loss": 0.022, + "step": 589 + }, + { + "epoch": 1.3067552602436323, + "grad_norm": 0.10738477710476918, + "learning_rate": 0.0001388340400564615, + "loss": 0.0159, + "step": 590 + }, + { + "epoch": 1.308970099667774, + "grad_norm": 0.06111375761634437, + "learning_rate": 0.0001385960286062491, + "loss": 0.0124, + "step": 591 + }, + { + "epoch": 1.311184939091916, + "grad_norm": 0.08604939673491503, + "learning_rate": 0.00013835775996195703, + "loss": 0.0149, + "step": 592 + }, + { + "epoch": 1.3133997785160576, + "grad_norm": 0.05266562425816654, + "learning_rate": 0.00013811923571134668, + "loss": 0.0082, + "step": 593 + }, + { + "epoch": 1.3156146179401993, + "grad_norm": 0.07718581252295655, + "learning_rate": 0.0001378804574438828, + "loss": 0.0168, + "step": 594 + }, + { + "epoch": 1.3178294573643412, + "grad_norm": 0.0818380848852053, + "learning_rate": 0.00013764142675072276, + "loss": 0.0182, + "step": 595 + }, + { + "epoch": 1.320044296788483, + "grad_norm": 0.10531475363228426, + "learning_rate": 0.00013740214522470608, + "loss": 0.0291, + "step": 596 + }, + { + "epoch": 1.3222591362126246, + "grad_norm": 0.0748727596524778, + "learning_rate": 0.00013716261446034374, + "loss": 0.0154, + "step": 597 + }, + { + "epoch": 1.3244739756367663, + "grad_norm": 0.11422790711287975, + "learning_rate": 0.00013692283605380758, + "loss": 0.0252, + "step": 598 + }, + { + "epoch": 1.326688815060908, + "grad_norm": 0.08319891849750775, + "learning_rate": 0.0001366828116029197, + "loss": 0.0175, + "step": 599 + }, + { + "epoch": 1.3289036544850499, + "grad_norm": 0.06445130473523782, + "learning_rate": 0.00013644254270714175, + "loss": 0.0143, + "step": 600 + }, + { + "epoch": 1.3311184939091916, + "grad_norm": 0.11956544993691816, + "learning_rate": 0.0001362020309675643, + "loss": 0.0133, + "step": 601 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.09628818578581089, + "learning_rate": 0.00013596127798689618, + "loss": 0.0168, + "step": 602 + }, + { + "epoch": 1.3355481727574752, + "grad_norm": 0.08084805156025456, + "learning_rate": 0.00013572028536945377, + "loss": 0.0179, + "step": 603 + }, + { + "epoch": 1.3377630121816169, + "grad_norm": 0.10643065749367996, + "learning_rate": 0.00013547905472115034, + "loss": 0.0235, + "step": 604 + }, + { + "epoch": 1.3399778516057586, + "grad_norm": 0.07129438942522039, + "learning_rate": 0.0001352375876494853, + "loss": 0.0132, + "step": 605 + }, + { + "epoch": 1.3421926910299002, + "grad_norm": 0.08272031065767634, + "learning_rate": 0.00013499588576353362, + "loss": 0.0144, + "step": 606 + }, + { + "epoch": 1.3444075304540422, + "grad_norm": 0.07239815281686438, + "learning_rate": 0.00013475395067393493, + "loss": 0.0241, + "step": 607 + }, + { + "epoch": 1.3466223698781838, + "grad_norm": 0.11216577265956344, + "learning_rate": 0.0001345117839928829, + "loss": 0.0151, + "step": 608 + }, + { + "epoch": 1.3488372093023255, + "grad_norm": 0.08171192735544974, + "learning_rate": 0.00013426938733411442, + "loss": 0.0257, + "step": 609 + }, + { + "epoch": 1.3510520487264674, + "grad_norm": 0.10109896498688715, + "learning_rate": 0.00013402676231289898, + "loss": 0.022, + "step": 610 + }, + { + "epoch": 1.3532668881506091, + "grad_norm": 0.09892180652067042, + "learning_rate": 0.00013378391054602774, + "loss": 0.0218, + "step": 611 + }, + { + "epoch": 1.3554817275747508, + "grad_norm": 0.08234248398344776, + "learning_rate": 0.00013354083365180292, + "loss": 0.0263, + "step": 612 + }, + { + "epoch": 1.3576965669988925, + "grad_norm": 0.07876103809322832, + "learning_rate": 0.00013329753325002678, + "loss": 0.0114, + "step": 613 + }, + { + "epoch": 1.3599114064230342, + "grad_norm": 0.07529999797653103, + "learning_rate": 0.00013305401096199115, + "loss": 0.0135, + "step": 614 + }, + { + "epoch": 1.3621262458471761, + "grad_norm": 0.07760703842357898, + "learning_rate": 0.00013281026841046635, + "loss": 0.0185, + "step": 615 + }, + { + "epoch": 1.3643410852713178, + "grad_norm": 0.0722738945310531, + "learning_rate": 0.00013256630721969053, + "loss": 0.0102, + "step": 616 + }, + { + "epoch": 1.3665559246954595, + "grad_norm": 0.09389858685946781, + "learning_rate": 0.0001323221290153587, + "loss": 0.0184, + "step": 617 + }, + { + "epoch": 1.3687707641196014, + "grad_norm": 0.11647747491112101, + "learning_rate": 0.00013207773542461216, + "loss": 0.0186, + "step": 618 + }, + { + "epoch": 1.370985603543743, + "grad_norm": 0.2841990785658319, + "learning_rate": 0.00013183312807602736, + "loss": 0.0163, + "step": 619 + }, + { + "epoch": 1.3732004429678848, + "grad_norm": 0.0915400161333746, + "learning_rate": 0.0001315883085996052, + "loss": 0.0169, + "step": 620 + }, + { + "epoch": 1.3754152823920265, + "grad_norm": 0.08125481915136389, + "learning_rate": 0.0001313432786267602, + "loss": 0.0244, + "step": 621 + }, + { + "epoch": 1.3776301218161684, + "grad_norm": 0.11376049923626905, + "learning_rate": 0.00013109803979030949, + "loss": 0.0167, + "step": 622 + }, + { + "epoch": 1.37984496124031, + "grad_norm": 0.08037352540538885, + "learning_rate": 0.00013085259372446205, + "loss": 0.0154, + "step": 623 + }, + { + "epoch": 1.3820598006644518, + "grad_norm": 0.07323329657916658, + "learning_rate": 0.00013060694206480783, + "loss": 0.0173, + "step": 624 + }, + { + "epoch": 1.3842746400885937, + "grad_norm": 0.07523500522763453, + "learning_rate": 0.00013036108644830675, + "loss": 0.0174, + "step": 625 + }, + { + "epoch": 1.3864894795127354, + "grad_norm": 0.06822659173399938, + "learning_rate": 0.00013011502851327785, + "loss": 0.0198, + "step": 626 + }, + { + "epoch": 1.388704318936877, + "grad_norm": 0.057677417110861426, + "learning_rate": 0.00012986876989938843, + "loss": 0.0104, + "step": 627 + }, + { + "epoch": 1.3909191583610188, + "grad_norm": 0.07771669998026971, + "learning_rate": 0.00012962231224764294, + "loss": 0.0142, + "step": 628 + }, + { + "epoch": 1.3931339977851604, + "grad_norm": 0.10227536149267259, + "learning_rate": 0.0001293756572003723, + "loss": 0.0127, + "step": 629 + }, + { + "epoch": 1.3953488372093024, + "grad_norm": 0.07241742143747995, + "learning_rate": 0.00012912880640122273, + "loss": 0.0183, + "step": 630 + }, + { + "epoch": 1.397563676633444, + "grad_norm": 0.0681408656319668, + "learning_rate": 0.00012888176149514492, + "loss": 0.0187, + "step": 631 + }, + { + "epoch": 1.3997785160575857, + "grad_norm": 0.10432312383054601, + "learning_rate": 0.0001286345241283831, + "loss": 0.0226, + "step": 632 + }, + { + "epoch": 1.4019933554817277, + "grad_norm": 0.08984438753543621, + "learning_rate": 0.00012838709594846395, + "loss": 0.0187, + "step": 633 + }, + { + "epoch": 1.4042081949058693, + "grad_norm": 0.06365776498120232, + "learning_rate": 0.0001281394786041856, + "loss": 0.0175, + "step": 634 + }, + { + "epoch": 1.406423034330011, + "grad_norm": 0.0626560367287427, + "learning_rate": 0.00012789167374560687, + "loss": 0.0161, + "step": 635 + }, + { + "epoch": 1.4086378737541527, + "grad_norm": 0.0607345289434909, + "learning_rate": 0.00012764368302403608, + "loss": 0.0139, + "step": 636 + }, + { + "epoch": 1.4108527131782946, + "grad_norm": 0.06758012392712551, + "learning_rate": 0.00012739550809202002, + "loss": 0.0171, + "step": 637 + }, + { + "epoch": 1.4130675526024363, + "grad_norm": 0.15729852946011233, + "learning_rate": 0.0001271471506033331, + "loss": 0.0242, + "step": 638 + }, + { + "epoch": 1.415282392026578, + "grad_norm": 0.07414120877352418, + "learning_rate": 0.0001268986122129662, + "loss": 0.0177, + "step": 639 + }, + { + "epoch": 1.41749723145072, + "grad_norm": 0.098876092881507, + "learning_rate": 0.00012664989457711573, + "loss": 0.0194, + "step": 640 + }, + { + "epoch": 1.4197120708748616, + "grad_norm": 0.057053706868021914, + "learning_rate": 0.00012640099935317246, + "loss": 0.009, + "step": 641 + }, + { + "epoch": 1.4219269102990033, + "grad_norm": 0.05873713765660875, + "learning_rate": 0.00012615192819971063, + "loss": 0.013, + "step": 642 + }, + { + "epoch": 1.424141749723145, + "grad_norm": 0.07925766765560027, + "learning_rate": 0.00012590268277647682, + "loss": 0.0153, + "step": 643 + }, + { + "epoch": 1.4263565891472867, + "grad_norm": 0.08281768648250706, + "learning_rate": 0.00012565326474437888, + "loss": 0.0155, + "step": 644 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.0847978082997192, + "learning_rate": 0.00012540367576547491, + "loss": 0.0203, + "step": 645 + }, + { + "epoch": 1.4307862679955703, + "grad_norm": 0.06572599858080001, + "learning_rate": 0.00012515391750296211, + "loss": 0.0168, + "step": 646 + }, + { + "epoch": 1.4330011074197122, + "grad_norm": 0.16336875686348648, + "learning_rate": 0.00012490399162116576, + "loss": 0.0148, + "step": 647 + }, + { + "epoch": 1.435215946843854, + "grad_norm": 0.07133782241835177, + "learning_rate": 0.00012465389978552814, + "loss": 0.0147, + "step": 648 + }, + { + "epoch": 1.4374307862679956, + "grad_norm": 0.06731764823856598, + "learning_rate": 0.00012440364366259738, + "loss": 0.0104, + "step": 649 + }, + { + "epoch": 1.4396456256921373, + "grad_norm": 0.07373237101310139, + "learning_rate": 0.00012415322492001637, + "loss": 0.0144, + "step": 650 + }, + { + "epoch": 1.441860465116279, + "grad_norm": 0.07151126396092782, + "learning_rate": 0.00012390264522651174, + "loss": 0.0175, + "step": 651 + }, + { + "epoch": 1.4440753045404209, + "grad_norm": 0.058847347145106345, + "learning_rate": 0.00012365190625188247, + "loss": 0.0128, + "step": 652 + }, + { + "epoch": 1.4462901439645626, + "grad_norm": 0.08488483942365929, + "learning_rate": 0.00012340100966698917, + "loss": 0.0179, + "step": 653 + }, + { + "epoch": 1.4485049833887043, + "grad_norm": 0.07937408228080482, + "learning_rate": 0.00012314995714374254, + "loss": 0.0168, + "step": 654 + }, + { + "epoch": 1.4507198228128462, + "grad_norm": 0.08991854273082502, + "learning_rate": 0.00012289875035509254, + "loss": 0.0189, + "step": 655 + }, + { + "epoch": 1.4529346622369879, + "grad_norm": 0.0746518012056686, + "learning_rate": 0.00012264739097501702, + "loss": 0.0094, + "step": 656 + }, + { + "epoch": 1.4551495016611296, + "grad_norm": 0.09916963899610554, + "learning_rate": 0.00012239588067851083, + "loss": 0.0157, + "step": 657 + }, + { + "epoch": 1.4573643410852712, + "grad_norm": 0.07587209230552407, + "learning_rate": 0.0001221442211415742, + "loss": 0.0168, + "step": 658 + }, + { + "epoch": 1.459579180509413, + "grad_norm": 0.0750081853549884, + "learning_rate": 0.00012189241404120211, + "loss": 0.0128, + "step": 659 + }, + { + "epoch": 1.4617940199335548, + "grad_norm": 0.0667706053074553, + "learning_rate": 0.00012164046105537282, + "loss": 0.0127, + "step": 660 + }, + { + "epoch": 1.4640088593576965, + "grad_norm": 0.10605617466922856, + "learning_rate": 0.00012138836386303661, + "loss": 0.014, + "step": 661 + }, + { + "epoch": 1.4662236987818384, + "grad_norm": 0.09870296764326834, + "learning_rate": 0.00012113612414410484, + "loss": 0.0264, + "step": 662 + }, + { + "epoch": 1.4684385382059801, + "grad_norm": 0.08840210666612355, + "learning_rate": 0.00012088374357943861, + "loss": 0.0179, + "step": 663 + }, + { + "epoch": 1.4706533776301218, + "grad_norm": 0.07630614338155196, + "learning_rate": 0.00012063122385083754, + "loss": 0.0119, + "step": 664 + }, + { + "epoch": 1.4728682170542635, + "grad_norm": 0.05804298194824501, + "learning_rate": 0.0001203785666410286, + "loss": 0.0116, + "step": 665 + }, + { + "epoch": 1.4750830564784052, + "grad_norm": 0.10515183508320385, + "learning_rate": 0.00012012577363365499, + "loss": 0.0175, + "step": 666 + }, + { + "epoch": 1.4772978959025471, + "grad_norm": 0.12219238587848143, + "learning_rate": 0.0001198728465132647, + "loss": 0.0182, + "step": 667 + }, + { + "epoch": 1.4795127353266888, + "grad_norm": 0.07279296225238836, + "learning_rate": 0.00011961978696529948, + "loss": 0.017, + "step": 668 + }, + { + "epoch": 1.4817275747508305, + "grad_norm": 0.06431351893366116, + "learning_rate": 0.0001193665966760836, + "loss": 0.0114, + "step": 669 + }, + { + "epoch": 1.4839424141749724, + "grad_norm": 0.0731333034049168, + "learning_rate": 0.00011911327733281246, + "loss": 0.0127, + "step": 670 + }, + { + "epoch": 1.486157253599114, + "grad_norm": 0.058128526624084444, + "learning_rate": 0.00011885983062354147, + "loss": 0.0163, + "step": 671 + }, + { + "epoch": 1.4883720930232558, + "grad_norm": 0.1168469989442433, + "learning_rate": 0.00011860625823717484, + "loss": 0.0221, + "step": 672 + }, + { + "epoch": 1.4905869324473975, + "grad_norm": 0.07935583708035619, + "learning_rate": 0.00011835256186345411, + "loss": 0.0214, + "step": 673 + }, + { + "epoch": 1.4928017718715394, + "grad_norm": 0.07077782522055447, + "learning_rate": 0.00011809874319294723, + "loss": 0.0131, + "step": 674 + }, + { + "epoch": 1.495016611295681, + "grad_norm": 0.09310271786266665, + "learning_rate": 0.00011784480391703699, + "loss": 0.0162, + "step": 675 + }, + { + "epoch": 1.4972314507198228, + "grad_norm": 0.06376149811747334, + "learning_rate": 0.00011759074572790985, + "loss": 0.01, + "step": 676 + }, + { + "epoch": 1.4994462901439647, + "grad_norm": 0.06877588774913715, + "learning_rate": 0.00011733657031854473, + "loss": 0.0123, + "step": 677 + }, + { + "epoch": 1.5016611295681064, + "grad_norm": 0.10194226886703607, + "learning_rate": 0.00011708227938270167, + "loss": 0.0229, + "step": 678 + }, + { + "epoch": 1.503875968992248, + "grad_norm": 0.06439752368095918, + "learning_rate": 0.00011682787461491051, + "loss": 0.0113, + "step": 679 + }, + { + "epoch": 1.5060908084163898, + "grad_norm": 0.0493715096623908, + "learning_rate": 0.00011657335771045969, + "loss": 0.0121, + "step": 680 + }, + { + "epoch": 1.5083056478405314, + "grad_norm": 0.0616547714471614, + "learning_rate": 0.0001163187303653849, + "loss": 0.0132, + "step": 681 + }, + { + "epoch": 1.5105204872646734, + "grad_norm": 0.06267121096163195, + "learning_rate": 0.00011606399427645767, + "loss": 0.0119, + "step": 682 + }, + { + "epoch": 1.512735326688815, + "grad_norm": 0.07100280416666961, + "learning_rate": 0.00011580915114117433, + "loss": 0.013, + "step": 683 + }, + { + "epoch": 1.514950166112957, + "grad_norm": 0.05942987447678964, + "learning_rate": 0.00011555420265774444, + "loss": 0.0179, + "step": 684 + }, + { + "epoch": 1.5171650055370987, + "grad_norm": 0.1002258378082055, + "learning_rate": 0.00011529915052507955, + "loss": 0.0161, + "step": 685 + }, + { + "epoch": 1.5193798449612403, + "grad_norm": 0.12951868454563897, + "learning_rate": 0.00011504399644278201, + "loss": 0.0148, + "step": 686 + }, + { + "epoch": 1.521594684385382, + "grad_norm": 0.07656984640616299, + "learning_rate": 0.00011478874211113346, + "loss": 0.0142, + "step": 687 + }, + { + "epoch": 1.5238095238095237, + "grad_norm": 0.06918903309672884, + "learning_rate": 0.00011453338923108358, + "loss": 0.0153, + "step": 688 + }, + { + "epoch": 1.5260243632336654, + "grad_norm": 0.10162961800737266, + "learning_rate": 0.00011427793950423875, + "loss": 0.0189, + "step": 689 + }, + { + "epoch": 1.5282392026578073, + "grad_norm": 0.07469534409725367, + "learning_rate": 0.00011402239463285074, + "loss": 0.0156, + "step": 690 + }, + { + "epoch": 1.530454042081949, + "grad_norm": 0.08835759873469173, + "learning_rate": 0.00011376675631980534, + "loss": 0.0124, + "step": 691 + }, + { + "epoch": 1.532668881506091, + "grad_norm": 0.07435487873381333, + "learning_rate": 0.00011351102626861095, + "loss": 0.0144, + "step": 692 + }, + { + "epoch": 1.5348837209302326, + "grad_norm": 0.11354535343261787, + "learning_rate": 0.00011325520618338738, + "loss": 0.0203, + "step": 693 + }, + { + "epoch": 1.5370985603543743, + "grad_norm": 0.08105249954421245, + "learning_rate": 0.00011299929776885434, + "loss": 0.0144, + "step": 694 + }, + { + "epoch": 1.539313399778516, + "grad_norm": 0.0830549503592733, + "learning_rate": 0.00011274330273032014, + "loss": 0.0114, + "step": 695 + }, + { + "epoch": 1.5415282392026577, + "grad_norm": 0.07566810742588137, + "learning_rate": 0.00011248722277367041, + "loss": 0.0107, + "step": 696 + }, + { + "epoch": 1.5437430786267996, + "grad_norm": 0.08394606998737049, + "learning_rate": 0.00011223105960535654, + "loss": 0.0166, + "step": 697 + }, + { + "epoch": 1.5459579180509413, + "grad_norm": 0.06044159208235659, + "learning_rate": 0.00011197481493238447, + "loss": 0.0141, + "step": 698 + }, + { + "epoch": 1.5481727574750832, + "grad_norm": 0.05638646868789201, + "learning_rate": 0.00011171849046230332, + "loss": 0.011, + "step": 699 + }, + { + "epoch": 1.550387596899225, + "grad_norm": 0.109489873955899, + "learning_rate": 0.00011146208790319386, + "loss": 0.0242, + "step": 700 + }, + { + "epoch": 1.5526024363233666, + "grad_norm": 0.072867793979609, + "learning_rate": 0.00011120560896365731, + "loss": 0.0158, + "step": 701 + }, + { + "epoch": 1.5548172757475083, + "grad_norm": 0.07337597250655943, + "learning_rate": 0.00011094905535280383, + "loss": 0.0138, + "step": 702 + }, + { + "epoch": 1.55703211517165, + "grad_norm": 0.05655486779647267, + "learning_rate": 0.00011069242878024115, + "loss": 0.0108, + "step": 703 + }, + { + "epoch": 1.5592469545957917, + "grad_norm": 0.058791597598116946, + "learning_rate": 0.00011043573095606324, + "loss": 0.0116, + "step": 704 + }, + { + "epoch": 1.5614617940199336, + "grad_norm": 0.0801023675221324, + "learning_rate": 0.0001101789635908389, + "loss": 0.0129, + "step": 705 + }, + { + "epoch": 1.5636766334440753, + "grad_norm": 0.10792858347354636, + "learning_rate": 0.0001099221283956002, + "loss": 0.0165, + "step": 706 + }, + { + "epoch": 1.5658914728682172, + "grad_norm": 0.07221398705828098, + "learning_rate": 0.00010966522708183138, + "loss": 0.0122, + "step": 707 + }, + { + "epoch": 1.5681063122923589, + "grad_norm": 0.06347371201895738, + "learning_rate": 0.00010940826136145717, + "loss": 0.0109, + "step": 708 + }, + { + "epoch": 1.5703211517165006, + "grad_norm": 0.056108551777438516, + "learning_rate": 0.00010915123294683151, + "loss": 0.0151, + "step": 709 + }, + { + "epoch": 1.5725359911406422, + "grad_norm": 0.09736049733883133, + "learning_rate": 0.0001088941435507261, + "loss": 0.0122, + "step": 710 + }, + { + "epoch": 1.574750830564784, + "grad_norm": 0.09168512071901921, + "learning_rate": 0.00010863699488631913, + "loss": 0.0184, + "step": 711 + }, + { + "epoch": 1.5769656699889258, + "grad_norm": 0.08015304272515747, + "learning_rate": 0.00010837978866718355, + "loss": 0.0158, + "step": 712 + }, + { + "epoch": 1.5791805094130675, + "grad_norm": 0.10393894843189654, + "learning_rate": 0.00010812252660727593, + "loss": 0.0177, + "step": 713 + }, + { + "epoch": 1.5813953488372094, + "grad_norm": 0.07389618639021898, + "learning_rate": 0.00010786521042092499, + "loss": 0.0148, + "step": 714 + }, + { + "epoch": 1.5836101882613511, + "grad_norm": 0.06557992321278563, + "learning_rate": 0.00010760784182282007, + "loss": 0.0119, + "step": 715 + }, + { + "epoch": 1.5858250276854928, + "grad_norm": 0.08013545130721708, + "learning_rate": 0.0001073504225279998, + "loss": 0.0106, + "step": 716 + }, + { + "epoch": 1.5880398671096345, + "grad_norm": 0.0674260510277781, + "learning_rate": 0.00010709295425184064, + "loss": 0.0173, + "step": 717 + }, + { + "epoch": 1.5902547065337762, + "grad_norm": 0.0838290641715308, + "learning_rate": 0.00010683543871004544, + "loss": 0.0132, + "step": 718 + }, + { + "epoch": 1.592469545957918, + "grad_norm": 0.08447391610089702, + "learning_rate": 0.000106577877618632, + "loss": 0.0153, + "step": 719 + }, + { + "epoch": 1.5946843853820598, + "grad_norm": 0.07987851991607445, + "learning_rate": 0.00010632027269392171, + "loss": 0.0185, + "step": 720 + }, + { + "epoch": 1.5968992248062015, + "grad_norm": 0.07793786099276806, + "learning_rate": 0.00010606262565252796, + "loss": 0.0148, + "step": 721 + }, + { + "epoch": 1.5991140642303434, + "grad_norm": 0.08596720294833989, + "learning_rate": 0.00010580493821134486, + "loss": 0.0205, + "step": 722 + }, + { + "epoch": 1.601328903654485, + "grad_norm": 0.10845236921063846, + "learning_rate": 0.00010554721208753575, + "loss": 0.013, + "step": 723 + }, + { + "epoch": 1.6035437430786268, + "grad_norm": 0.061316567982659886, + "learning_rate": 0.00010528944899852167, + "loss": 0.0103, + "step": 724 + }, + { + "epoch": 1.6057585825027685, + "grad_norm": 0.11445273689115372, + "learning_rate": 0.00010503165066197005, + "loss": 0.0214, + "step": 725 + }, + { + "epoch": 1.6079734219269102, + "grad_norm": 0.07379164703144378, + "learning_rate": 0.00010477381879578319, + "loss": 0.0164, + "step": 726 + }, + { + "epoch": 1.610188261351052, + "grad_norm": 0.06538451798026236, + "learning_rate": 0.0001045159551180868, + "loss": 0.015, + "step": 727 + }, + { + "epoch": 1.6124031007751938, + "grad_norm": 0.1072580176704437, + "learning_rate": 0.00010425806134721858, + "loss": 0.0147, + "step": 728 + }, + { + "epoch": 1.6146179401993357, + "grad_norm": 0.1106424811966664, + "learning_rate": 0.00010400013920171682, + "loss": 0.0191, + "step": 729 + }, + { + "epoch": 1.6168327796234774, + "grad_norm": 0.08838943843981076, + "learning_rate": 0.00010374219040030877, + "loss": 0.0196, + "step": 730 + }, + { + "epoch": 1.619047619047619, + "grad_norm": 0.08602791824514236, + "learning_rate": 0.0001034842166618994, + "loss": 0.0157, + "step": 731 + }, + { + "epoch": 1.6212624584717608, + "grad_norm": 0.05919682885808458, + "learning_rate": 0.00010322621970555987, + "loss": 0.0111, + "step": 732 + }, + { + "epoch": 1.6234772978959024, + "grad_norm": 0.06711125021569317, + "learning_rate": 0.00010296820125051599, + "loss": 0.0098, + "step": 733 + }, + { + "epoch": 1.6256921373200441, + "grad_norm": 0.05122210501483838, + "learning_rate": 0.00010271016301613686, + "loss": 0.0071, + "step": 734 + }, + { + "epoch": 1.627906976744186, + "grad_norm": 0.06414283351247622, + "learning_rate": 0.00010245210672192337, + "loss": 0.0103, + "step": 735 + }, + { + "epoch": 1.6301218161683277, + "grad_norm": 0.071564407349583, + "learning_rate": 0.00010219403408749681, + "loss": 0.0127, + "step": 736 + }, + { + "epoch": 1.6323366555924697, + "grad_norm": 0.09976174258163252, + "learning_rate": 0.00010193594683258732, + "loss": 0.0115, + "step": 737 + }, + { + "epoch": 1.6345514950166113, + "grad_norm": 0.08302512077738045, + "learning_rate": 0.00010167784667702245, + "loss": 0.0091, + "step": 738 + }, + { + "epoch": 1.636766334440753, + "grad_norm": 0.06244774756187711, + "learning_rate": 0.00010141973534071571, + "loss": 0.0106, + "step": 739 + }, + { + "epoch": 1.6389811738648947, + "grad_norm": 0.06873696824482452, + "learning_rate": 0.0001011616145436552, + "loss": 0.0132, + "step": 740 + }, + { + "epoch": 1.6411960132890364, + "grad_norm": 0.10146506702609724, + "learning_rate": 0.00010090348600589199, + "loss": 0.0132, + "step": 741 + }, + { + "epoch": 1.6434108527131783, + "grad_norm": 0.08797052906443875, + "learning_rate": 0.0001006453514475287, + "loss": 0.0079, + "step": 742 + }, + { + "epoch": 1.64562569213732, + "grad_norm": 0.06900180180841788, + "learning_rate": 0.00010038721258870814, + "loss": 0.0156, + "step": 743 + }, + { + "epoch": 1.647840531561462, + "grad_norm": 0.06710019770711002, + "learning_rate": 0.0001001290711496018, + "loss": 0.015, + "step": 744 + }, + { + "epoch": 1.6500553709856036, + "grad_norm": 0.06223303632411454, + "learning_rate": 9.987092885039823e-05, + "loss": 0.0102, + "step": 745 + }, + { + "epoch": 1.6522702104097453, + "grad_norm": 0.10330863387082126, + "learning_rate": 9.961278741129186e-05, + "loss": 0.015, + "step": 746 + }, + { + "epoch": 1.654485049833887, + "grad_norm": 0.06553510035702018, + "learning_rate": 9.935464855247131e-05, + "loss": 0.0117, + "step": 747 + }, + { + "epoch": 1.6566998892580287, + "grad_norm": 0.07100721167714431, + "learning_rate": 9.909651399410806e-05, + "loss": 0.0108, + "step": 748 + }, + { + "epoch": 1.6589147286821704, + "grad_norm": 0.061035376801468824, + "learning_rate": 9.883838545634481e-05, + "loss": 0.0076, + "step": 749 + }, + { + "epoch": 1.6611295681063123, + "grad_norm": 0.06543152780022146, + "learning_rate": 9.858026465928431e-05, + "loss": 0.0096, + "step": 750 + }, + { + "epoch": 1.6633444075304542, + "grad_norm": 0.09797304739350932, + "learning_rate": 9.832215332297759e-05, + "loss": 0.019, + "step": 751 + }, + { + "epoch": 1.665559246954596, + "grad_norm": 0.07417866651789688, + "learning_rate": 9.806405316741268e-05, + "loss": 0.0135, + "step": 752 + }, + { + "epoch": 1.6677740863787376, + "grad_norm": 0.05537264270781063, + "learning_rate": 9.780596591250317e-05, + "loss": 0.0074, + "step": 753 + }, + { + "epoch": 1.6699889258028793, + "grad_norm": 0.04566497904521487, + "learning_rate": 9.754789327807666e-05, + "loss": 0.0076, + "step": 754 + }, + { + "epoch": 1.672203765227021, + "grad_norm": 0.07577208883092387, + "learning_rate": 9.728983698386318e-05, + "loss": 0.0129, + "step": 755 + }, + { + "epoch": 1.6744186046511627, + "grad_norm": 0.0918829068299407, + "learning_rate": 9.703179874948403e-05, + "loss": 0.0125, + "step": 756 + }, + { + "epoch": 1.6766334440753046, + "grad_norm": 0.05958000098522351, + "learning_rate": 9.677378029444014e-05, + "loss": 0.0109, + "step": 757 + }, + { + "epoch": 1.6788482834994463, + "grad_norm": 0.08407304030712652, + "learning_rate": 9.65157833381006e-05, + "loss": 0.0147, + "step": 758 + }, + { + "epoch": 1.6810631229235882, + "grad_norm": 0.07157597297314851, + "learning_rate": 9.625780959969123e-05, + "loss": 0.0157, + "step": 759 + }, + { + "epoch": 1.6832779623477299, + "grad_norm": 0.1076293703691059, + "learning_rate": 9.599986079828323e-05, + "loss": 0.0199, + "step": 760 + }, + { + "epoch": 1.6854928017718716, + "grad_norm": 0.07120097005147624, + "learning_rate": 9.574193865278144e-05, + "loss": 0.0144, + "step": 761 + }, + { + "epoch": 1.6877076411960132, + "grad_norm": 0.055986836575224304, + "learning_rate": 9.548404488191323e-05, + "loss": 0.0092, + "step": 762 + }, + { + "epoch": 1.689922480620155, + "grad_norm": 0.05722164549748576, + "learning_rate": 9.522618120421683e-05, + "loss": 0.0093, + "step": 763 + }, + { + "epoch": 1.6921373200442966, + "grad_norm": 0.057248415759553244, + "learning_rate": 9.496834933802997e-05, + "loss": 0.0101, + "step": 764 + }, + { + "epoch": 1.6943521594684385, + "grad_norm": 0.05135625139795403, + "learning_rate": 9.471055100147835e-05, + "loss": 0.0057, + "step": 765 + }, + { + "epoch": 1.6965669988925804, + "grad_norm": 0.08755396234508316, + "learning_rate": 9.44527879124643e-05, + "loss": 0.012, + "step": 766 + }, + { + "epoch": 1.6987818383167221, + "grad_norm": 0.047333162727746826, + "learning_rate": 9.419506178865517e-05, + "loss": 0.0066, + "step": 767 + }, + { + "epoch": 1.7009966777408638, + "grad_norm": 0.06069565134623067, + "learning_rate": 9.393737434747208e-05, + "loss": 0.0154, + "step": 768 + }, + { + "epoch": 1.7032115171650055, + "grad_norm": 0.0817491173129062, + "learning_rate": 9.367972730607831e-05, + "loss": 0.0147, + "step": 769 + }, + { + "epoch": 1.7054263565891472, + "grad_norm": 0.09016237297073272, + "learning_rate": 9.3422122381368e-05, + "loss": 0.0123, + "step": 770 + }, + { + "epoch": 1.707641196013289, + "grad_norm": 0.1021074663110952, + "learning_rate": 9.316456128995456e-05, + "loss": 0.0185, + "step": 771 + }, + { + "epoch": 1.7098560354374308, + "grad_norm": 0.08909795176642595, + "learning_rate": 9.290704574815938e-05, + "loss": 0.0104, + "step": 772 + }, + { + "epoch": 1.7120708748615725, + "grad_norm": 0.06395839734236355, + "learning_rate": 9.264957747200023e-05, + "loss": 0.0113, + "step": 773 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.06347138135918967, + "learning_rate": 9.239215817717995e-05, + "loss": 0.0077, + "step": 774 + }, + { + "epoch": 1.716500553709856, + "grad_norm": 0.08784732937519078, + "learning_rate": 9.213478957907504e-05, + "loss": 0.0144, + "step": 775 + }, + { + "epoch": 1.7187153931339978, + "grad_norm": 0.08119158201310175, + "learning_rate": 9.187747339272408e-05, + "loss": 0.013, + "step": 776 + }, + { + "epoch": 1.7209302325581395, + "grad_norm": 0.09864534041188773, + "learning_rate": 9.162021133281649e-05, + "loss": 0.0193, + "step": 777 + }, + { + "epoch": 1.7231450719822812, + "grad_norm": 0.07192910227915265, + "learning_rate": 9.136300511368091e-05, + "loss": 0.0119, + "step": 778 + }, + { + "epoch": 1.725359911406423, + "grad_norm": 0.07108335494772763, + "learning_rate": 9.11058564492739e-05, + "loss": 0.017, + "step": 779 + }, + { + "epoch": 1.7275747508305648, + "grad_norm": 0.09107857107351884, + "learning_rate": 9.084876705316852e-05, + "loss": 0.0144, + "step": 780 + }, + { + "epoch": 1.7297895902547067, + "grad_norm": 0.10206728118448652, + "learning_rate": 9.059173863854284e-05, + "loss": 0.0202, + "step": 781 + }, + { + "epoch": 1.7320044296788484, + "grad_norm": 0.06821740773466613, + "learning_rate": 9.033477291816862e-05, + "loss": 0.0108, + "step": 782 + }, + { + "epoch": 1.73421926910299, + "grad_norm": 0.10509772446039986, + "learning_rate": 9.007787160439979e-05, + "loss": 0.017, + "step": 783 + }, + { + "epoch": 1.7364341085271318, + "grad_norm": 0.06772173784916484, + "learning_rate": 8.982103640916115e-05, + "loss": 0.0133, + "step": 784 + }, + { + "epoch": 1.7386489479512734, + "grad_norm": 0.07671107963155675, + "learning_rate": 8.956426904393677e-05, + "loss": 0.0126, + "step": 785 + }, + { + "epoch": 1.7408637873754151, + "grad_norm": 0.0759494778160756, + "learning_rate": 8.930757121975888e-05, + "loss": 0.0205, + "step": 786 + }, + { + "epoch": 1.743078626799557, + "grad_norm": 0.08471414441848152, + "learning_rate": 8.905094464719621e-05, + "loss": 0.0167, + "step": 787 + }, + { + "epoch": 1.7452934662236987, + "grad_norm": 0.06916111296552842, + "learning_rate": 8.879439103634271e-05, + "loss": 0.0125, + "step": 788 + }, + { + "epoch": 1.7475083056478407, + "grad_norm": 0.06369276176600251, + "learning_rate": 8.853791209680615e-05, + "loss": 0.0119, + "step": 789 + }, + { + "epoch": 1.7497231450719823, + "grad_norm": 0.06594563820252691, + "learning_rate": 8.828150953769672e-05, + "loss": 0.0147, + "step": 790 + }, + { + "epoch": 1.751937984496124, + "grad_norm": 0.05547856745320826, + "learning_rate": 8.802518506761555e-05, + "loss": 0.0104, + "step": 791 + }, + { + "epoch": 1.7541528239202657, + "grad_norm": 0.03371330134124906, + "learning_rate": 8.77689403946435e-05, + "loss": 0.0068, + "step": 792 + }, + { + "epoch": 1.7563676633444074, + "grad_norm": 0.05714344230134165, + "learning_rate": 8.751277722632961e-05, + "loss": 0.0093, + "step": 793 + }, + { + "epoch": 1.7585825027685493, + "grad_norm": 0.053092049757634485, + "learning_rate": 8.725669726967984e-05, + "loss": 0.0085, + "step": 794 + }, + { + "epoch": 1.760797342192691, + "grad_norm": 0.04901859413773457, + "learning_rate": 8.700070223114566e-05, + "loss": 0.0098, + "step": 795 + }, + { + "epoch": 1.763012181616833, + "grad_norm": 0.050878579484852055, + "learning_rate": 8.674479381661264e-05, + "loss": 0.0068, + "step": 796 + }, + { + "epoch": 1.7652270210409746, + "grad_norm": 0.05934677678735517, + "learning_rate": 8.648897373138906e-05, + "loss": 0.0109, + "step": 797 + }, + { + "epoch": 1.7674418604651163, + "grad_norm": 0.06506054201053145, + "learning_rate": 8.623324368019469e-05, + "loss": 0.0114, + "step": 798 + }, + { + "epoch": 1.769656699889258, + "grad_norm": 0.05658299336986872, + "learning_rate": 8.597760536714927e-05, + "loss": 0.0092, + "step": 799 + }, + { + "epoch": 1.7718715393133997, + "grad_norm": 0.05064253349007289, + "learning_rate": 8.572206049576126e-05, + "loss": 0.0088, + "step": 800 + }, + { + "epoch": 1.7740863787375414, + "grad_norm": 0.05239135675919535, + "learning_rate": 8.546661076891644e-05, + "loss": 0.0093, + "step": 801 + }, + { + "epoch": 1.7763012181616833, + "grad_norm": 0.06000690369152087, + "learning_rate": 8.521125788886657e-05, + "loss": 0.0133, + "step": 802 + }, + { + "epoch": 1.778516057585825, + "grad_norm": 0.05802824265797107, + "learning_rate": 8.495600355721801e-05, + "loss": 0.0125, + "step": 803 + }, + { + "epoch": 1.780730897009967, + "grad_norm": 0.06289906406312389, + "learning_rate": 8.470084947492046e-05, + "loss": 0.0107, + "step": 804 + }, + { + "epoch": 1.7829457364341086, + "grad_norm": 0.06257567248407582, + "learning_rate": 8.44457973422556e-05, + "loss": 0.0103, + "step": 805 + }, + { + "epoch": 1.7851605758582503, + "grad_norm": 0.06264962667704355, + "learning_rate": 8.419084885882568e-05, + "loss": 0.009, + "step": 806 + }, + { + "epoch": 1.787375415282392, + "grad_norm": 0.07283970218666304, + "learning_rate": 8.393600572354232e-05, + "loss": 0.0084, + "step": 807 + }, + { + "epoch": 1.7895902547065337, + "grad_norm": 0.06074390671958057, + "learning_rate": 8.368126963461516e-05, + "loss": 0.0127, + "step": 808 + }, + { + "epoch": 1.7918050941306756, + "grad_norm": 0.07596491639376073, + "learning_rate": 8.342664228954033e-05, + "loss": 0.0146, + "step": 809 + }, + { + "epoch": 1.7940199335548173, + "grad_norm": 0.07800959631012076, + "learning_rate": 8.317212538508951e-05, + "loss": 0.0094, + "step": 810 + }, + { + "epoch": 1.7962347729789592, + "grad_norm": 0.05692983802157629, + "learning_rate": 8.291772061729837e-05, + "loss": 0.0109, + "step": 811 + }, + { + "epoch": 1.7984496124031009, + "grad_norm": 0.06216402308958548, + "learning_rate": 8.26634296814553e-05, + "loss": 0.0141, + "step": 812 + }, + { + "epoch": 1.8006644518272426, + "grad_norm": 0.0896523418839609, + "learning_rate": 8.240925427209018e-05, + "loss": 0.0138, + "step": 813 + }, + { + "epoch": 1.8028792912513842, + "grad_norm": 0.05908579608673005, + "learning_rate": 8.215519608296305e-05, + "loss": 0.0109, + "step": 814 + }, + { + "epoch": 1.805094130675526, + "grad_norm": 0.04808182895150866, + "learning_rate": 8.190125680705278e-05, + "loss": 0.005, + "step": 815 + }, + { + "epoch": 1.8073089700996676, + "grad_norm": 0.06634808905318802, + "learning_rate": 8.16474381365459e-05, + "loss": 0.0102, + "step": 816 + }, + { + "epoch": 1.8095238095238095, + "grad_norm": 0.0595112733358694, + "learning_rate": 8.139374176282519e-05, + "loss": 0.0094, + "step": 817 + }, + { + "epoch": 1.8117386489479512, + "grad_norm": 0.0787009176364453, + "learning_rate": 8.114016937645854e-05, + "loss": 0.0124, + "step": 818 + }, + { + "epoch": 1.8139534883720931, + "grad_norm": 0.06139537582067777, + "learning_rate": 8.088672266718754e-05, + "loss": 0.0095, + "step": 819 + }, + { + "epoch": 1.8161683277962348, + "grad_norm": 0.07413784628879724, + "learning_rate": 8.063340332391643e-05, + "loss": 0.0103, + "step": 820 + }, + { + "epoch": 1.8183831672203765, + "grad_norm": 0.10438699186334588, + "learning_rate": 8.038021303470053e-05, + "loss": 0.0114, + "step": 821 + }, + { + "epoch": 1.8205980066445182, + "grad_norm": 0.0811946125024105, + "learning_rate": 8.012715348673534e-05, + "loss": 0.0079, + "step": 822 + }, + { + "epoch": 1.82281284606866, + "grad_norm": 0.05443916851317431, + "learning_rate": 7.987422636634505e-05, + "loss": 0.0086, + "step": 823 + }, + { + "epoch": 1.8250276854928018, + "grad_norm": 0.08090225273526187, + "learning_rate": 7.96214333589714e-05, + "loss": 0.0162, + "step": 824 + }, + { + "epoch": 1.8272425249169435, + "grad_norm": 0.09352980164671724, + "learning_rate": 7.936877614916248e-05, + "loss": 0.0147, + "step": 825 + }, + { + "epoch": 1.8294573643410854, + "grad_norm": 0.05073426450797381, + "learning_rate": 7.911625642056141e-05, + "loss": 0.0096, + "step": 826 + }, + { + "epoch": 1.831672203765227, + "grad_norm": 0.04955994569670843, + "learning_rate": 7.886387585589518e-05, + "loss": 0.0057, + "step": 827 + }, + { + "epoch": 1.8338870431893688, + "grad_norm": 0.07183419894057542, + "learning_rate": 7.861163613696341e-05, + "loss": 0.0101, + "step": 828 + }, + { + "epoch": 1.8361018826135105, + "grad_norm": 0.07897210058166848, + "learning_rate": 7.835953894462721e-05, + "loss": 0.0089, + "step": 829 + }, + { + "epoch": 1.8383167220376522, + "grad_norm": 0.06289618380548383, + "learning_rate": 7.810758595879788e-05, + "loss": 0.01, + "step": 830 + }, + { + "epoch": 1.8405315614617939, + "grad_norm": 0.07035011337497359, + "learning_rate": 7.785577885842581e-05, + "loss": 0.011, + "step": 831 + }, + { + "epoch": 1.8427464008859358, + "grad_norm": 0.07292372943903819, + "learning_rate": 7.760411932148924e-05, + "loss": 0.0127, + "step": 832 + }, + { + "epoch": 1.8449612403100775, + "grad_norm": 0.04820274192595067, + "learning_rate": 7.735260902498298e-05, + "loss": 0.0071, + "step": 833 + }, + { + "epoch": 1.8471760797342194, + "grad_norm": 0.06454602963108605, + "learning_rate": 7.71012496449075e-05, + "loss": 0.0113, + "step": 834 + }, + { + "epoch": 1.849390919158361, + "grad_norm": 0.09609865526647371, + "learning_rate": 7.685004285625748e-05, + "loss": 0.0135, + "step": 835 + }, + { + "epoch": 1.8516057585825028, + "grad_norm": 0.06216290570398106, + "learning_rate": 7.659899033301086e-05, + "loss": 0.009, + "step": 836 + }, + { + "epoch": 1.8538205980066444, + "grad_norm": 0.06784044665312368, + "learning_rate": 7.634809374811755e-05, + "loss": 0.01, + "step": 837 + }, + { + "epoch": 1.8560354374307861, + "grad_norm": 0.06316392809085221, + "learning_rate": 7.609735477348831e-05, + "loss": 0.0116, + "step": 838 + }, + { + "epoch": 1.858250276854928, + "grad_norm": 0.06124037788010124, + "learning_rate": 7.584677507998364e-05, + "loss": 0.007, + "step": 839 + }, + { + "epoch": 1.8604651162790697, + "grad_norm": 0.07289146559600851, + "learning_rate": 7.559635633740265e-05, + "loss": 0.0114, + "step": 840 + }, + { + "epoch": 1.8626799557032117, + "grad_norm": 0.07776962384831466, + "learning_rate": 7.534610021447189e-05, + "loss": 0.0117, + "step": 841 + }, + { + "epoch": 1.8648947951273533, + "grad_norm": 0.07393954690280202, + "learning_rate": 7.509600837883426e-05, + "loss": 0.0141, + "step": 842 + }, + { + "epoch": 1.867109634551495, + "grad_norm": 0.0805275740400507, + "learning_rate": 7.484608249703795e-05, + "loss": 0.0109, + "step": 843 + }, + { + "epoch": 1.8693244739756367, + "grad_norm": 0.11865959256343682, + "learning_rate": 7.459632423452512e-05, + "loss": 0.0141, + "step": 844 + }, + { + "epoch": 1.8715393133997784, + "grad_norm": 0.04766887390362376, + "learning_rate": 7.434673525562113e-05, + "loss": 0.0075, + "step": 845 + }, + { + "epoch": 1.87375415282392, + "grad_norm": 0.054994704511965954, + "learning_rate": 7.409731722352318e-05, + "loss": 0.0065, + "step": 846 + }, + { + "epoch": 1.875968992248062, + "grad_norm": 0.06592395069447003, + "learning_rate": 7.384807180028938e-05, + "loss": 0.0084, + "step": 847 + }, + { + "epoch": 1.878183831672204, + "grad_norm": 0.10733378027170917, + "learning_rate": 7.359900064682755e-05, + "loss": 0.0135, + "step": 848 + }, + { + "epoch": 1.8803986710963456, + "grad_norm": 0.06367660110343659, + "learning_rate": 7.33501054228843e-05, + "loss": 0.0063, + "step": 849 + }, + { + "epoch": 1.8826135105204873, + "grad_norm": 0.0743119227521803, + "learning_rate": 7.310138778703381e-05, + "loss": 0.0107, + "step": 850 + }, + { + "epoch": 1.884828349944629, + "grad_norm": 0.06258299284847652, + "learning_rate": 7.285284939666692e-05, + "loss": 0.0054, + "step": 851 + }, + { + "epoch": 1.8870431893687707, + "grad_norm": 0.06057404770012533, + "learning_rate": 7.260449190798e-05, + "loss": 0.0083, + "step": 852 + }, + { + "epoch": 1.8892580287929124, + "grad_norm": 0.07260179044289394, + "learning_rate": 7.235631697596396e-05, + "loss": 0.0103, + "step": 853 + }, + { + "epoch": 1.8914728682170543, + "grad_norm": 0.08305683373526285, + "learning_rate": 7.210832625439314e-05, + "loss": 0.013, + "step": 854 + }, + { + "epoch": 1.893687707641196, + "grad_norm": 0.07818551900684265, + "learning_rate": 7.186052139581446e-05, + "loss": 0.0131, + "step": 855 + }, + { + "epoch": 1.895902547065338, + "grad_norm": 0.06403511158854484, + "learning_rate": 7.161290405153611e-05, + "loss": 0.0078, + "step": 856 + }, + { + "epoch": 1.8981173864894796, + "grad_norm": 0.10578221348736559, + "learning_rate": 7.136547587161691e-05, + "loss": 0.0108, + "step": 857 + }, + { + "epoch": 1.9003322259136213, + "grad_norm": 0.07384545558078978, + "learning_rate": 7.111823850485509e-05, + "loss": 0.0106, + "step": 858 + }, + { + "epoch": 1.902547065337763, + "grad_norm": 0.08283093681262026, + "learning_rate": 7.08711935987773e-05, + "loss": 0.008, + "step": 859 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 0.06802366434099587, + "learning_rate": 7.062434279962773e-05, + "loss": 0.0134, + "step": 860 + }, + { + "epoch": 1.9069767441860463, + "grad_norm": 0.06437274367147072, + "learning_rate": 7.037768775235709e-05, + "loss": 0.0147, + "step": 861 + }, + { + "epoch": 1.9091915836101883, + "grad_norm": 0.054346508850977415, + "learning_rate": 7.013123010061162e-05, + "loss": 0.0076, + "step": 862 + }, + { + "epoch": 1.9114064230343302, + "grad_norm": 0.052183737296247486, + "learning_rate": 6.988497148672217e-05, + "loss": 0.0097, + "step": 863 + }, + { + "epoch": 1.9136212624584719, + "grad_norm": 0.07843652717493657, + "learning_rate": 6.963891355169328e-05, + "loss": 0.0149, + "step": 864 + }, + { + "epoch": 1.9158361018826136, + "grad_norm": 0.06121425847634844, + "learning_rate": 6.939305793519218e-05, + "loss": 0.0094, + "step": 865 + }, + { + "epoch": 1.9180509413067552, + "grad_norm": 0.07107948470222449, + "learning_rate": 6.914740627553795e-05, + "loss": 0.0136, + "step": 866 + }, + { + "epoch": 1.920265780730897, + "grad_norm": 0.0593738750211957, + "learning_rate": 6.890196020969056e-05, + "loss": 0.0102, + "step": 867 + }, + { + "epoch": 1.9224806201550386, + "grad_norm": 0.05333139202708231, + "learning_rate": 6.865672137323984e-05, + "loss": 0.008, + "step": 868 + }, + { + "epoch": 1.9246954595791805, + "grad_norm": 0.07437040356938678, + "learning_rate": 6.84116914003948e-05, + "loss": 0.0147, + "step": 869 + }, + { + "epoch": 1.9269102990033222, + "grad_norm": 0.07762333780210634, + "learning_rate": 6.816687192397265e-05, + "loss": 0.0115, + "step": 870 + }, + { + "epoch": 1.9291251384274641, + "grad_norm": 0.055802575232535345, + "learning_rate": 6.792226457538783e-05, + "loss": 0.011, + "step": 871 + }, + { + "epoch": 1.9313399778516058, + "grad_norm": 0.06393953862157768, + "learning_rate": 6.767787098464129e-05, + "loss": 0.0102, + "step": 872 + }, + { + "epoch": 1.9335548172757475, + "grad_norm": 0.08222438836908598, + "learning_rate": 6.743369278030952e-05, + "loss": 0.0119, + "step": 873 + }, + { + "epoch": 1.9357696566998892, + "grad_norm": 0.07063459739990509, + "learning_rate": 6.718973158953367e-05, + "loss": 0.0091, + "step": 874 + }, + { + "epoch": 1.937984496124031, + "grad_norm": 0.06954888262970908, + "learning_rate": 6.694598903800887e-05, + "loss": 0.0154, + "step": 875 + }, + { + "epoch": 1.9401993355481728, + "grad_norm": 0.05016295515280723, + "learning_rate": 6.670246674997324e-05, + "loss": 0.0078, + "step": 876 + }, + { + "epoch": 1.9424141749723145, + "grad_norm": 0.05724271615525153, + "learning_rate": 6.645916634819711e-05, + "loss": 0.0085, + "step": 877 + }, + { + "epoch": 1.9446290143964564, + "grad_norm": 0.07047141526875268, + "learning_rate": 6.621608945397224e-05, + "loss": 0.0099, + "step": 878 + }, + { + "epoch": 1.946843853820598, + "grad_norm": 0.07779820197341124, + "learning_rate": 6.597323768710105e-05, + "loss": 0.0118, + "step": 879 + }, + { + "epoch": 1.9490586932447398, + "grad_norm": 0.07200670931935592, + "learning_rate": 6.57306126658856e-05, + "loss": 0.0071, + "step": 880 + }, + { + "epoch": 1.9512735326688815, + "grad_norm": 0.10373813802234542, + "learning_rate": 6.548821600711715e-05, + "loss": 0.0167, + "step": 881 + }, + { + "epoch": 1.9534883720930232, + "grad_norm": 0.03865487832539871, + "learning_rate": 6.52460493260651e-05, + "loss": 0.0045, + "step": 882 + }, + { + "epoch": 1.9557032115171649, + "grad_norm": 0.06396655994697571, + "learning_rate": 6.50041142364664e-05, + "loss": 0.0112, + "step": 883 + }, + { + "epoch": 1.9579180509413068, + "grad_norm": 0.08269568584886126, + "learning_rate": 6.476241235051472e-05, + "loss": 0.0214, + "step": 884 + }, + { + "epoch": 1.9601328903654485, + "grad_norm": 0.08069128240560851, + "learning_rate": 6.452094527884971e-05, + "loss": 0.0111, + "step": 885 + }, + { + "epoch": 1.9623477297895904, + "grad_norm": 0.09136573630426274, + "learning_rate": 6.427971463054627e-05, + "loss": 0.0147, + "step": 886 + }, + { + "epoch": 1.964562569213732, + "grad_norm": 0.06828578377189488, + "learning_rate": 6.403872201310386e-05, + "loss": 0.0102, + "step": 887 + }, + { + "epoch": 1.9667774086378738, + "grad_norm": 0.0611670781531708, + "learning_rate": 6.379796903243571e-05, + "loss": 0.0075, + "step": 888 + }, + { + "epoch": 1.9689922480620154, + "grad_norm": 0.06778558968844421, + "learning_rate": 6.355745729285827e-05, + "loss": 0.0089, + "step": 889 + }, + { + "epoch": 1.9712070874861571, + "grad_norm": 0.12829032052877584, + "learning_rate": 6.33171883970803e-05, + "loss": 0.0142, + "step": 890 + }, + { + "epoch": 1.973421926910299, + "grad_norm": 0.06934000403876096, + "learning_rate": 6.307716394619247e-05, + "loss": 0.0096, + "step": 891 + }, + { + "epoch": 1.9756367663344407, + "grad_norm": 0.058585147765721726, + "learning_rate": 6.28373855396563e-05, + "loss": 0.0103, + "step": 892 + }, + { + "epoch": 1.9778516057585827, + "grad_norm": 0.06725621257138602, + "learning_rate": 6.259785477529396e-05, + "loss": 0.0097, + "step": 893 + }, + { + "epoch": 1.9800664451827243, + "grad_norm": 0.05459200473316461, + "learning_rate": 6.235857324927726e-05, + "loss": 0.0056, + "step": 894 + }, + { + "epoch": 1.982281284606866, + "grad_norm": 0.033616837263766276, + "learning_rate": 6.21195425561172e-05, + "loss": 0.006, + "step": 895 + }, + { + "epoch": 1.9844961240310077, + "grad_norm": 0.049264995249036415, + "learning_rate": 6.188076428865333e-05, + "loss": 0.0076, + "step": 896 + }, + { + "epoch": 1.9867109634551494, + "grad_norm": 0.054918123752961776, + "learning_rate": 6.164224003804301e-05, + "loss": 0.0079, + "step": 897 + }, + { + "epoch": 1.988925802879291, + "grad_norm": 0.05289320847412081, + "learning_rate": 6.140397139375094e-05, + "loss": 0.0084, + "step": 898 + }, + { + "epoch": 1.991140642303433, + "grad_norm": 0.060250223871030545, + "learning_rate": 6.116595994353853e-05, + "loss": 0.0085, + "step": 899 + }, + { + "epoch": 1.9933554817275747, + "grad_norm": 0.05588946340257141, + "learning_rate": 6.092820727345333e-05, + "loss": 0.0101, + "step": 900 + }, + { + "epoch": 1.9955703211517166, + "grad_norm": 0.0667986807130779, + "learning_rate": 6.069071496781842e-05, + "loss": 0.0107, + "step": 901 + }, + { + "epoch": 1.9977851605758583, + "grad_norm": 0.06898814536433204, + "learning_rate": 6.0453484609221864e-05, + "loss": 0.0133, + "step": 902 + }, + { + "epoch": 2.0, + "grad_norm": 0.05523118358668668, + "learning_rate": 6.0216517778506276e-05, + "loss": 0.0092, + "step": 903 + }, + { + "epoch": 2.0, + "eval_loss": 0.013259684666991234, + "eval_runtime": 159.0524, + "eval_samples_per_second": 9.563, + "eval_steps_per_second": 0.302, + "step": 903 + }, + { + "epoch": 2.0022148394241417, + "grad_norm": 0.047884109014966016, + "learning_rate": 5.9979816054758e-05, + "loss": 0.0046, + "step": 904 + }, + { + "epoch": 2.0044296788482834, + "grad_norm": 0.03502084964554704, + "learning_rate": 5.974338101529693e-05, + "loss": 0.0051, + "step": 905 + }, + { + "epoch": 2.006644518272425, + "grad_norm": 0.037793920549444415, + "learning_rate": 5.9507214235665806e-05, + "loss": 0.0068, + "step": 906 + }, + { + "epoch": 2.008859357696567, + "grad_norm": 0.03566061659800092, + "learning_rate": 5.9271317289619744e-05, + "loss": 0.0047, + "step": 907 + }, + { + "epoch": 2.011074197120709, + "grad_norm": 0.05255293240869575, + "learning_rate": 5.9035691749115766e-05, + "loss": 0.0065, + "step": 908 + }, + { + "epoch": 2.0132890365448506, + "grad_norm": 0.044952561738534354, + "learning_rate": 5.880033918430231e-05, + "loss": 0.0048, + "step": 909 + }, + { + "epoch": 2.0155038759689923, + "grad_norm": 0.03612788866923461, + "learning_rate": 5.8565261163508746e-05, + "loss": 0.0066, + "step": 910 + }, + { + "epoch": 2.017718715393134, + "grad_norm": 0.03156772782862181, + "learning_rate": 5.8330459253235e-05, + "loss": 0.0033, + "step": 911 + }, + { + "epoch": 2.0199335548172757, + "grad_norm": 0.04263337878418704, + "learning_rate": 5.809593501814106e-05, + "loss": 0.0043, + "step": 912 + }, + { + "epoch": 2.0221483942414173, + "grad_norm": 0.055809588386506126, + "learning_rate": 5.786169002103652e-05, + "loss": 0.0033, + "step": 913 + }, + { + "epoch": 2.024363233665559, + "grad_norm": 0.08864447935904467, + "learning_rate": 5.762772582287025e-05, + "loss": 0.0052, + "step": 914 + }, + { + "epoch": 2.026578073089701, + "grad_norm": 0.06005330146857931, + "learning_rate": 5.739404398271996e-05, + "loss": 0.0046, + "step": 915 + }, + { + "epoch": 2.028792912513843, + "grad_norm": 0.07693742564000806, + "learning_rate": 5.716064605778168e-05, + "loss": 0.0052, + "step": 916 + }, + { + "epoch": 2.0310077519379846, + "grad_norm": 0.046994059374160874, + "learning_rate": 5.692753360335965e-05, + "loss": 0.0049, + "step": 917 + }, + { + "epoch": 2.0332225913621262, + "grad_norm": 0.0696799967396884, + "learning_rate": 5.6694708172855714e-05, + "loss": 0.0041, + "step": 918 + }, + { + "epoch": 2.035437430786268, + "grad_norm": 0.037905053987891846, + "learning_rate": 5.646217131775911e-05, + "loss": 0.0015, + "step": 919 + }, + { + "epoch": 2.0376522702104096, + "grad_norm": 0.05773372509241241, + "learning_rate": 5.622992458763606e-05, + "loss": 0.005, + "step": 920 + }, + { + "epoch": 2.0398671096345513, + "grad_norm": 0.06284923784755923, + "learning_rate": 5.599796953011951e-05, + "loss": 0.0068, + "step": 921 + }, + { + "epoch": 2.0420819490586934, + "grad_norm": 0.039801286121346216, + "learning_rate": 5.576630769089868e-05, + "loss": 0.0048, + "step": 922 + }, + { + "epoch": 2.044296788482835, + "grad_norm": 0.07521213392100597, + "learning_rate": 5.553494061370892e-05, + "loss": 0.006, + "step": 923 + }, + { + "epoch": 2.046511627906977, + "grad_norm": 0.08017313020853548, + "learning_rate": 5.530386984032136e-05, + "loss": 0.0035, + "step": 924 + }, + { + "epoch": 2.0487264673311185, + "grad_norm": 0.04008443625655525, + "learning_rate": 5.5073096910532665e-05, + "loss": 0.0026, + "step": 925 + }, + { + "epoch": 2.05094130675526, + "grad_norm": 0.09226015468300519, + "learning_rate": 5.484262336215469e-05, + "loss": 0.0115, + "step": 926 + }, + { + "epoch": 2.053156146179402, + "grad_norm": 0.11466709538033887, + "learning_rate": 5.46124507310043e-05, + "loss": 0.0053, + "step": 927 + }, + { + "epoch": 2.0553709856035436, + "grad_norm": 0.06941428940970279, + "learning_rate": 5.438258055089318e-05, + "loss": 0.0039, + "step": 928 + }, + { + "epoch": 2.0575858250276853, + "grad_norm": 0.06671921330100518, + "learning_rate": 5.415301435361747e-05, + "loss": 0.005, + "step": 929 + }, + { + "epoch": 2.0598006644518274, + "grad_norm": 0.08866744228605263, + "learning_rate": 5.392375366894773e-05, + "loss": 0.0042, + "step": 930 + }, + { + "epoch": 2.062015503875969, + "grad_norm": 0.13705785355932507, + "learning_rate": 5.36948000246186e-05, + "loss": 0.0047, + "step": 931 + }, + { + "epoch": 2.064230343300111, + "grad_norm": 0.05311815822707833, + "learning_rate": 5.34661549463187e-05, + "loss": 0.0042, + "step": 932 + }, + { + "epoch": 2.0664451827242525, + "grad_norm": 0.05251806783370328, + "learning_rate": 5.3237819957680515e-05, + "loss": 0.0025, + "step": 933 + }, + { + "epoch": 2.068660022148394, + "grad_norm": 0.05144880618718077, + "learning_rate": 5.300979658027001e-05, + "loss": 0.0052, + "step": 934 + }, + { + "epoch": 2.070874861572536, + "grad_norm": 0.06363334755584713, + "learning_rate": 5.27820863335768e-05, + "loss": 0.0032, + "step": 935 + }, + { + "epoch": 2.0730897009966776, + "grad_norm": 0.07549240758346294, + "learning_rate": 5.255469073500382e-05, + "loss": 0.0022, + "step": 936 + }, + { + "epoch": 2.0753045404208197, + "grad_norm": 0.05796750061416822, + "learning_rate": 5.232761129985728e-05, + "loss": 0.0061, + "step": 937 + }, + { + "epoch": 2.0775193798449614, + "grad_norm": 0.05344943012572635, + "learning_rate": 5.2100849541336536e-05, + "loss": 0.0062, + "step": 938 + }, + { + "epoch": 2.079734219269103, + "grad_norm": 0.06010210319895411, + "learning_rate": 5.1874406970524126e-05, + "loss": 0.0057, + "step": 939 + }, + { + "epoch": 2.0819490586932448, + "grad_norm": 0.04167375667706087, + "learning_rate": 5.1648285096375406e-05, + "loss": 0.0087, + "step": 940 + }, + { + "epoch": 2.0841638981173864, + "grad_norm": 0.1123544103070454, + "learning_rate": 5.142248542570887e-05, + "loss": 0.0044, + "step": 941 + }, + { + "epoch": 2.086378737541528, + "grad_norm": 0.05474059875763921, + "learning_rate": 5.119700946319584e-05, + "loss": 0.003, + "step": 942 + }, + { + "epoch": 2.08859357696567, + "grad_norm": 0.04853202001297147, + "learning_rate": 5.097185871135059e-05, + "loss": 0.0042, + "step": 943 + }, + { + "epoch": 2.0908084163898115, + "grad_norm": 0.06177766005574886, + "learning_rate": 5.074703467052022e-05, + "loss": 0.0056, + "step": 944 + }, + { + "epoch": 2.0930232558139537, + "grad_norm": 0.08850523698772857, + "learning_rate": 5.05225388388748e-05, + "loss": 0.0034, + "step": 945 + }, + { + "epoch": 2.0952380952380953, + "grad_norm": 0.057384614999526525, + "learning_rate": 5.0298372712397124e-05, + "loss": 0.0041, + "step": 946 + }, + { + "epoch": 2.097452934662237, + "grad_norm": 0.08522313351909958, + "learning_rate": 5.007453778487305e-05, + "loss": 0.0032, + "step": 947 + }, + { + "epoch": 2.0996677740863787, + "grad_norm": 0.06777774002072079, + "learning_rate": 4.985103554788142e-05, + "loss": 0.0056, + "step": 948 + }, + { + "epoch": 2.1018826135105204, + "grad_norm": 0.0533027077955871, + "learning_rate": 4.962786749078405e-05, + "loss": 0.0045, + "step": 949 + }, + { + "epoch": 2.104097452934662, + "grad_norm": 0.04607504770391638, + "learning_rate": 4.940503510071589e-05, + "loss": 0.0042, + "step": 950 + }, + { + "epoch": 2.106312292358804, + "grad_norm": 0.042898866808248365, + "learning_rate": 4.918253986257509e-05, + "loss": 0.0035, + "step": 951 + }, + { + "epoch": 2.108527131782946, + "grad_norm": 0.08533847923250186, + "learning_rate": 4.896038325901312e-05, + "loss": 0.0067, + "step": 952 + }, + { + "epoch": 2.1107419712070876, + "grad_norm": 0.06403319041058955, + "learning_rate": 4.8738566770424856e-05, + "loss": 0.0053, + "step": 953 + }, + { + "epoch": 2.1129568106312293, + "grad_norm": 0.05317853196828708, + "learning_rate": 4.851709187493871e-05, + "loss": 0.0029, + "step": 954 + }, + { + "epoch": 2.115171650055371, + "grad_norm": 0.10133219290435107, + "learning_rate": 4.829596004840687e-05, + "loss": 0.0068, + "step": 955 + }, + { + "epoch": 2.1173864894795127, + "grad_norm": 0.05049377703026546, + "learning_rate": 4.807517276439537e-05, + "loss": 0.0046, + "step": 956 + }, + { + "epoch": 2.1196013289036544, + "grad_norm": 0.10127030455279595, + "learning_rate": 4.785473149417431e-05, + "loss": 0.0057, + "step": 957 + }, + { + "epoch": 2.121816168327796, + "grad_norm": 0.09786269704798252, + "learning_rate": 4.7634637706707954e-05, + "loss": 0.0074, + "step": 958 + }, + { + "epoch": 2.124031007751938, + "grad_norm": 0.038022182392734735, + "learning_rate": 4.7414892868645146e-05, + "loss": 0.0032, + "step": 959 + }, + { + "epoch": 2.12624584717608, + "grad_norm": 0.0858309831854262, + "learning_rate": 4.719549844430939e-05, + "loss": 0.0098, + "step": 960 + }, + { + "epoch": 2.1284606866002216, + "grad_norm": 0.06612090321212465, + "learning_rate": 4.69764558956891e-05, + "loss": 0.0087, + "step": 961 + }, + { + "epoch": 2.1306755260243633, + "grad_norm": 0.05537391540435296, + "learning_rate": 4.6757766682427884e-05, + "loss": 0.0102, + "step": 962 + }, + { + "epoch": 2.132890365448505, + "grad_norm": 0.07961340571785278, + "learning_rate": 4.653943226181487e-05, + "loss": 0.0047, + "step": 963 + }, + { + "epoch": 2.1351052048726467, + "grad_norm": 0.0583666267202301, + "learning_rate": 4.632145408877481e-05, + "loss": 0.0069, + "step": 964 + }, + { + "epoch": 2.1373200442967883, + "grad_norm": 0.06860628388497503, + "learning_rate": 4.610383361585863e-05, + "loss": 0.0064, + "step": 965 + }, + { + "epoch": 2.13953488372093, + "grad_norm": 0.05207658584310602, + "learning_rate": 4.5886572293233576e-05, + "loss": 0.0047, + "step": 966 + }, + { + "epoch": 2.141749723145072, + "grad_norm": 0.03917269026146777, + "learning_rate": 4.5669671568673657e-05, + "loss": 0.0038, + "step": 967 + }, + { + "epoch": 2.143964562569214, + "grad_norm": 0.08999050784675415, + "learning_rate": 4.5453132887549895e-05, + "loss": 0.0094, + "step": 968 + }, + { + "epoch": 2.1461794019933556, + "grad_norm": 0.061746398507855904, + "learning_rate": 4.523695769282084e-05, + "loss": 0.0049, + "step": 969 + }, + { + "epoch": 2.1483942414174972, + "grad_norm": 0.04247040358662243, + "learning_rate": 4.502114742502267e-05, + "loss": 0.0045, + "step": 970 + }, + { + "epoch": 2.150609080841639, + "grad_norm": 0.08742263482725642, + "learning_rate": 4.480570352225998e-05, + "loss": 0.0049, + "step": 971 + }, + { + "epoch": 2.1528239202657806, + "grad_norm": 0.06889044805207148, + "learning_rate": 4.459062742019592e-05, + "loss": 0.0032, + "step": 972 + }, + { + "epoch": 2.1550387596899223, + "grad_norm": 0.0565177614866143, + "learning_rate": 4.4375920552042717e-05, + "loss": 0.0028, + "step": 973 + }, + { + "epoch": 2.157253599114064, + "grad_norm": 0.04886580967210398, + "learning_rate": 4.4161584348552096e-05, + "loss": 0.007, + "step": 974 + }, + { + "epoch": 2.159468438538206, + "grad_norm": 0.07606169245631265, + "learning_rate": 4.394762023800582e-05, + "loss": 0.0059, + "step": 975 + }, + { + "epoch": 2.161683277962348, + "grad_norm": 0.043703391888669785, + "learning_rate": 4.373402964620609e-05, + "loss": 0.0045, + "step": 976 + }, + { + "epoch": 2.1638981173864895, + "grad_norm": 0.04738907001004885, + "learning_rate": 4.352081399646604e-05, + "loss": 0.0038, + "step": 977 + }, + { + "epoch": 2.166112956810631, + "grad_norm": 0.05706482748526665, + "learning_rate": 4.330797470960034e-05, + "loss": 0.0035, + "step": 978 + }, + { + "epoch": 2.168327796234773, + "grad_norm": 0.08432655984267098, + "learning_rate": 4.3095513203915664e-05, + "loss": 0.0062, + "step": 979 + }, + { + "epoch": 2.1705426356589146, + "grad_norm": 0.06740944609235186, + "learning_rate": 4.28834308952012e-05, + "loss": 0.0053, + "step": 980 + }, + { + "epoch": 2.1727574750830563, + "grad_norm": 0.04033513421836458, + "learning_rate": 4.2671729196719376e-05, + "loss": 0.0026, + "step": 981 + }, + { + "epoch": 2.1749723145071984, + "grad_norm": 0.051361694747267046, + "learning_rate": 4.2460409519196166e-05, + "loss": 0.0031, + "step": 982 + }, + { + "epoch": 2.17718715393134, + "grad_norm": 0.04836271625283562, + "learning_rate": 4.2249473270811967e-05, + "loss": 0.0021, + "step": 983 + }, + { + "epoch": 2.179401993355482, + "grad_norm": 0.12115102972231681, + "learning_rate": 4.2038921857192104e-05, + "loss": 0.0068, + "step": 984 + }, + { + "epoch": 2.1816168327796235, + "grad_norm": 0.05971801810811427, + "learning_rate": 4.182875668139742e-05, + "loss": 0.0041, + "step": 985 + }, + { + "epoch": 2.183831672203765, + "grad_norm": 0.06013297139989527, + "learning_rate": 4.161897914391498e-05, + "loss": 0.0034, + "step": 986 + }, + { + "epoch": 2.186046511627907, + "grad_norm": 0.07410089053960943, + "learning_rate": 4.1409590642648786e-05, + "loss": 0.0077, + "step": 987 + }, + { + "epoch": 2.1882613510520486, + "grad_norm": 0.030655768959630257, + "learning_rate": 4.1200592572910254e-05, + "loss": 0.0018, + "step": 988 + }, + { + "epoch": 2.1904761904761907, + "grad_norm": 0.045267569300847944, + "learning_rate": 4.099198632740921e-05, + "loss": 0.0046, + "step": 989 + }, + { + "epoch": 2.1926910299003324, + "grad_norm": 0.05530518977807187, + "learning_rate": 4.0783773296244386e-05, + "loss": 0.0041, + "step": 990 + }, + { + "epoch": 2.194905869324474, + "grad_norm": 0.07299981911329738, + "learning_rate": 4.057595486689427e-05, + "loss": 0.0043, + "step": 991 + }, + { + "epoch": 2.1971207087486158, + "grad_norm": 0.04860579973892205, + "learning_rate": 4.036853242420783e-05, + "loss": 0.0029, + "step": 992 + }, + { + "epoch": 2.1993355481727574, + "grad_norm": 0.027782214098304368, + "learning_rate": 4.0161507350395254e-05, + "loss": 0.0012, + "step": 993 + }, + { + "epoch": 2.201550387596899, + "grad_norm": 0.04655909105984263, + "learning_rate": 3.9954881025018745e-05, + "loss": 0.003, + "step": 994 + }, + { + "epoch": 2.203765227021041, + "grad_norm": 0.05629442662807932, + "learning_rate": 3.974865482498339e-05, + "loss": 0.0052, + "step": 995 + }, + { + "epoch": 2.2059800664451825, + "grad_norm": 0.0947086461952899, + "learning_rate": 3.954283012452793e-05, + "loss": 0.0086, + "step": 996 + }, + { + "epoch": 2.2081949058693247, + "grad_norm": 0.07496514780022882, + "learning_rate": 3.93374082952156e-05, + "loss": 0.0069, + "step": 997 + }, + { + "epoch": 2.2104097452934663, + "grad_norm": 0.06770091415882433, + "learning_rate": 3.913239070592506e-05, + "loss": 0.0036, + "step": 998 + }, + { + "epoch": 2.212624584717608, + "grad_norm": 0.05312181673392825, + "learning_rate": 3.892777872284115e-05, + "loss": 0.0041, + "step": 999 + }, + { + "epoch": 2.2148394241417497, + "grad_norm": 0.05836831849978664, + "learning_rate": 3.872357370944587e-05, + "loss": 0.0064, + "step": 1000 + }, + { + "epoch": 2.2170542635658914, + "grad_norm": 0.059085019043960865, + "learning_rate": 3.8519777026509306e-05, + "loss": 0.0036, + "step": 1001 + }, + { + "epoch": 2.219269102990033, + "grad_norm": 0.04608671502692635, + "learning_rate": 3.8316390032080516e-05, + "loss": 0.0021, + "step": 1002 + }, + { + "epoch": 2.221483942414175, + "grad_norm": 0.04998638239522754, + "learning_rate": 3.8113414081478496e-05, + "loss": 0.0036, + "step": 1003 + }, + { + "epoch": 2.223698781838317, + "grad_norm": 0.046261872665798044, + "learning_rate": 3.791085052728314e-05, + "loss": 0.0026, + "step": 1004 + }, + { + "epoch": 2.2259136212624586, + "grad_norm": 0.07817310831496511, + "learning_rate": 3.7708700719326305e-05, + "loss": 0.0078, + "step": 1005 + }, + { + "epoch": 2.2281284606866003, + "grad_norm": 0.04655290446305916, + "learning_rate": 3.750696600468262e-05, + "loss": 0.0037, + "step": 1006 + }, + { + "epoch": 2.230343300110742, + "grad_norm": 0.07017522607192478, + "learning_rate": 3.730564772766074e-05, + "loss": 0.0106, + "step": 1007 + }, + { + "epoch": 2.2325581395348837, + "grad_norm": 0.04999107439569362, + "learning_rate": 3.710474722979428e-05, + "loss": 0.0034, + "step": 1008 + }, + { + "epoch": 2.2347729789590254, + "grad_norm": 0.07419140308161772, + "learning_rate": 3.690426584983283e-05, + "loss": 0.0032, + "step": 1009 + }, + { + "epoch": 2.236987818383167, + "grad_norm": 0.05702399631662921, + "learning_rate": 3.6704204923733174e-05, + "loss": 0.0041, + "step": 1010 + }, + { + "epoch": 2.2392026578073088, + "grad_norm": 0.03877181250489353, + "learning_rate": 3.650456578465017e-05, + "loss": 0.0018, + "step": 1011 + }, + { + "epoch": 2.241417497231451, + "grad_norm": 0.037878375815880265, + "learning_rate": 3.6305349762928085e-05, + "loss": 0.002, + "step": 1012 + }, + { + "epoch": 2.2436323366555926, + "grad_norm": 0.07344479789980497, + "learning_rate": 3.6106558186091644e-05, + "loss": 0.0092, + "step": 1013 + }, + { + "epoch": 2.2458471760797343, + "grad_norm": 0.044472735868496885, + "learning_rate": 3.5908192378837127e-05, + "loss": 0.0023, + "step": 1014 + }, + { + "epoch": 2.248062015503876, + "grad_norm": 0.05068388114554409, + "learning_rate": 3.571025366302365e-05, + "loss": 0.0031, + "step": 1015 + }, + { + "epoch": 2.2502768549280177, + "grad_norm": 0.07403082632078949, + "learning_rate": 3.551274335766429e-05, + "loss": 0.0085, + "step": 1016 + }, + { + "epoch": 2.2524916943521593, + "grad_norm": 0.07477790058361249, + "learning_rate": 3.5315662778917235e-05, + "loss": 0.0099, + "step": 1017 + }, + { + "epoch": 2.254706533776301, + "grad_norm": 0.04792530654858614, + "learning_rate": 3.5119013240077167e-05, + "loss": 0.0042, + "step": 1018 + }, + { + "epoch": 2.256921373200443, + "grad_norm": 0.06818664299361259, + "learning_rate": 3.492279605156641e-05, + "loss": 0.0065, + "step": 1019 + }, + { + "epoch": 2.259136212624585, + "grad_norm": 0.08877435755440466, + "learning_rate": 3.472701252092619e-05, + "loss": 0.0065, + "step": 1020 + }, + { + "epoch": 2.2613510520487266, + "grad_norm": 0.04369119515124063, + "learning_rate": 3.453166395280798e-05, + "loss": 0.0036, + "step": 1021 + }, + { + "epoch": 2.2635658914728682, + "grad_norm": 0.034788647173788365, + "learning_rate": 3.433675164896476e-05, + "loss": 0.0035, + "step": 1022 + }, + { + "epoch": 2.26578073089701, + "grad_norm": 0.11295048624384015, + "learning_rate": 3.4142276908242334e-05, + "loss": 0.0042, + "step": 1023 + }, + { + "epoch": 2.2679955703211516, + "grad_norm": 0.06592150426375552, + "learning_rate": 3.394824102657074e-05, + "loss": 0.0031, + "step": 1024 + }, + { + "epoch": 2.2702104097452933, + "grad_norm": 0.044722326120880715, + "learning_rate": 3.3754645296955535e-05, + "loss": 0.0024, + "step": 1025 + }, + { + "epoch": 2.2724252491694354, + "grad_norm": 0.036464218152527636, + "learning_rate": 3.3561491009469235e-05, + "loss": 0.0022, + "step": 1026 + }, + { + "epoch": 2.274640088593577, + "grad_norm": 0.08366701413861895, + "learning_rate": 3.3368779451242695e-05, + "loss": 0.0067, + "step": 1027 + }, + { + "epoch": 2.276854928017719, + "grad_norm": 0.060563618098847555, + "learning_rate": 3.3176511906456564e-05, + "loss": 0.0042, + "step": 1028 + }, + { + "epoch": 2.2790697674418605, + "grad_norm": 0.03526167884086348, + "learning_rate": 3.29846896563326e-05, + "loss": 0.0029, + "step": 1029 + }, + { + "epoch": 2.281284606866002, + "grad_norm": 0.0469097658149562, + "learning_rate": 3.279331397912533e-05, + "loss": 0.0062, + "step": 1030 + }, + { + "epoch": 2.283499446290144, + "grad_norm": 0.06559919515497366, + "learning_rate": 3.260238615011341e-05, + "loss": 0.0047, + "step": 1031 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.0856998342555102, + "learning_rate": 3.2411907441591126e-05, + "loss": 0.0058, + "step": 1032 + }, + { + "epoch": 2.2879291251384273, + "grad_norm": 0.05691729508156917, + "learning_rate": 3.222187912285997e-05, + "loss": 0.002, + "step": 1033 + }, + { + "epoch": 2.290143964562569, + "grad_norm": 0.0416576974376162, + "learning_rate": 3.203230246022017e-05, + "loss": 0.0022, + "step": 1034 + }, + { + "epoch": 2.292358803986711, + "grad_norm": 0.03927972723568208, + "learning_rate": 3.184317871696212e-05, + "loss": 0.0037, + "step": 1035 + }, + { + "epoch": 2.294573643410853, + "grad_norm": 0.04150988377603192, + "learning_rate": 3.16545091533582e-05, + "loss": 0.0037, + "step": 1036 + }, + { + "epoch": 2.2967884828349945, + "grad_norm": 0.08088639939536255, + "learning_rate": 3.14662950266542e-05, + "loss": 0.0072, + "step": 1037 + }, + { + "epoch": 2.299003322259136, + "grad_norm": 0.08284900279689722, + "learning_rate": 3.127853759106102e-05, + "loss": 0.0048, + "step": 1038 + }, + { + "epoch": 2.301218161683278, + "grad_norm": 0.02240476123683619, + "learning_rate": 3.109123809774624e-05, + "loss": 0.0017, + "step": 1039 + }, + { + "epoch": 2.3034330011074196, + "grad_norm": 0.04854147385820792, + "learning_rate": 3.090439779482592e-05, + "loss": 0.0025, + "step": 1040 + }, + { + "epoch": 2.3056478405315612, + "grad_norm": 0.04541404994410462, + "learning_rate": 3.071801792735606e-05, + "loss": 0.0041, + "step": 1041 + }, + { + "epoch": 2.3078626799557034, + "grad_norm": 0.056589585529701274, + "learning_rate": 3.0532099737324524e-05, + "loss": 0.0052, + "step": 1042 + }, + { + "epoch": 2.310077519379845, + "grad_norm": 0.02509548853436473, + "learning_rate": 3.0346644463642683e-05, + "loss": 0.0018, + "step": 1043 + }, + { + "epoch": 2.3122923588039868, + "grad_norm": 0.13246308771068788, + "learning_rate": 3.0161653342137096e-05, + "loss": 0.0028, + "step": 1044 + }, + { + "epoch": 2.3145071982281284, + "grad_norm": 0.0840821834583412, + "learning_rate": 2.997712760554139e-05, + "loss": 0.0077, + "step": 1045 + }, + { + "epoch": 2.31672203765227, + "grad_norm": 0.03781310287436413, + "learning_rate": 2.9793068483487975e-05, + "loss": 0.0017, + "step": 1046 + }, + { + "epoch": 2.318936877076412, + "grad_norm": 0.038121699521746995, + "learning_rate": 2.9609477202499793e-05, + "loss": 0.0025, + "step": 1047 + }, + { + "epoch": 2.3211517165005535, + "grad_norm": 0.03867278828390827, + "learning_rate": 2.9426354985982272e-05, + "loss": 0.0028, + "step": 1048 + }, + { + "epoch": 2.3233665559246957, + "grad_norm": 0.053904044305935886, + "learning_rate": 2.92437030542151e-05, + "loss": 0.0032, + "step": 1049 + }, + { + "epoch": 2.3255813953488373, + "grad_norm": 0.04092878647546068, + "learning_rate": 2.906152262434415e-05, + "loss": 0.0019, + "step": 1050 + }, + { + "epoch": 2.327796234772979, + "grad_norm": 0.0631848895477812, + "learning_rate": 2.887981491037328e-05, + "loss": 0.0069, + "step": 1051 + }, + { + "epoch": 2.3300110741971207, + "grad_norm": 0.029202617591128084, + "learning_rate": 2.8698581123156322e-05, + "loss": 0.002, + "step": 1052 + }, + { + "epoch": 2.3322259136212624, + "grad_norm": 0.07232682804216416, + "learning_rate": 2.851782247038889e-05, + "loss": 0.0056, + "step": 1053 + }, + { + "epoch": 2.334440753045404, + "grad_norm": 0.027810617660917313, + "learning_rate": 2.833754015660053e-05, + "loss": 0.0019, + "step": 1054 + }, + { + "epoch": 2.336655592469546, + "grad_norm": 0.033315691685293164, + "learning_rate": 2.8157735383146543e-05, + "loss": 0.0026, + "step": 1055 + }, + { + "epoch": 2.338870431893688, + "grad_norm": 0.049984056387949054, + "learning_rate": 2.797840934820004e-05, + "loss": 0.0018, + "step": 1056 + }, + { + "epoch": 2.3410852713178296, + "grad_norm": 0.07278395837415463, + "learning_rate": 2.779956324674392e-05, + "loss": 0.006, + "step": 1057 + }, + { + "epoch": 2.3433001107419713, + "grad_norm": 0.0672537326424706, + "learning_rate": 2.7621198270562974e-05, + "loss": 0.0029, + "step": 1058 + }, + { + "epoch": 2.345514950166113, + "grad_norm": 0.03659137644548778, + "learning_rate": 2.7443315608235785e-05, + "loss": 0.0043, + "step": 1059 + }, + { + "epoch": 2.3477297895902547, + "grad_norm": 0.04806081265907534, + "learning_rate": 2.726591644512704e-05, + "loss": 0.0064, + "step": 1060 + }, + { + "epoch": 2.3499446290143964, + "grad_norm": 0.04744142657756985, + "learning_rate": 2.708900196337947e-05, + "loss": 0.0041, + "step": 1061 + }, + { + "epoch": 2.352159468438538, + "grad_norm": 0.06875397010952838, + "learning_rate": 2.6912573341905988e-05, + "loss": 0.0048, + "step": 1062 + }, + { + "epoch": 2.35437430786268, + "grad_norm": 0.07838536628193149, + "learning_rate": 2.673663175638189e-05, + "loss": 0.0049, + "step": 1063 + }, + { + "epoch": 2.356589147286822, + "grad_norm": 0.1205254619475679, + "learning_rate": 2.6561178379237005e-05, + "loss": 0.005, + "step": 1064 + }, + { + "epoch": 2.3588039867109636, + "grad_norm": 0.08299947877755977, + "learning_rate": 2.63862143796478e-05, + "loss": 0.0089, + "step": 1065 + }, + { + "epoch": 2.3610188261351053, + "grad_norm": 0.06528993890691498, + "learning_rate": 2.6211740923529693e-05, + "loss": 0.0035, + "step": 1066 + }, + { + "epoch": 2.363233665559247, + "grad_norm": 0.04612627220250573, + "learning_rate": 2.6037759173529252e-05, + "loss": 0.0029, + "step": 1067 + }, + { + "epoch": 2.3654485049833887, + "grad_norm": 0.03893809361617652, + "learning_rate": 2.5864270289016458e-05, + "loss": 0.0021, + "step": 1068 + }, + { + "epoch": 2.3676633444075303, + "grad_norm": 0.06303212825477, + "learning_rate": 2.569127542607691e-05, + "loss": 0.0042, + "step": 1069 + }, + { + "epoch": 2.369878183831672, + "grad_norm": 0.07255918066704788, + "learning_rate": 2.5518775737504254e-05, + "loss": 0.004, + "step": 1070 + }, + { + "epoch": 2.3720930232558137, + "grad_norm": 0.04569504988775164, + "learning_rate": 2.5346772372792295e-05, + "loss": 0.0029, + "step": 1071 + }, + { + "epoch": 2.374307862679956, + "grad_norm": 0.05049370320527537, + "learning_rate": 2.5175266478127567e-05, + "loss": 0.0061, + "step": 1072 + }, + { + "epoch": 2.3765227021040976, + "grad_norm": 0.05202184791230954, + "learning_rate": 2.5004259196381585e-05, + "loss": 0.0036, + "step": 1073 + }, + { + "epoch": 2.3787375415282392, + "grad_norm": 0.06975319613378474, + "learning_rate": 2.4833751667103178e-05, + "loss": 0.0077, + "step": 1074 + }, + { + "epoch": 2.380952380952381, + "grad_norm": 0.05579102249808788, + "learning_rate": 2.4663745026511e-05, + "loss": 0.006, + "step": 1075 + }, + { + "epoch": 2.3831672203765226, + "grad_norm": 0.04259419497849438, + "learning_rate": 2.449424040748589e-05, + "loss": 0.0033, + "step": 1076 + }, + { + "epoch": 2.3853820598006643, + "grad_norm": 0.06625128244074155, + "learning_rate": 2.4325238939563365e-05, + "loss": 0.0037, + "step": 1077 + }, + { + "epoch": 2.387596899224806, + "grad_norm": 0.05546038853875481, + "learning_rate": 2.4156741748926028e-05, + "loss": 0.0052, + "step": 1078 + }, + { + "epoch": 2.389811738648948, + "grad_norm": 0.11509659232658719, + "learning_rate": 2.398874995839615e-05, + "loss": 0.0043, + "step": 1079 + }, + { + "epoch": 2.39202657807309, + "grad_norm": 0.06303938134361765, + "learning_rate": 2.3821264687428135e-05, + "loss": 0.0064, + "step": 1080 + }, + { + "epoch": 2.3942414174972315, + "grad_norm": 0.07498719780679106, + "learning_rate": 2.3654287052101043e-05, + "loss": 0.0042, + "step": 1081 + }, + { + "epoch": 2.396456256921373, + "grad_norm": 0.07212196011318288, + "learning_rate": 2.3487818165111253e-05, + "loss": 0.0031, + "step": 1082 + }, + { + "epoch": 2.398671096345515, + "grad_norm": 0.03791453812653929, + "learning_rate": 2.3321859135764845e-05, + "loss": 0.0016, + "step": 1083 + }, + { + "epoch": 2.4008859357696566, + "grad_norm": 0.051775210461122975, + "learning_rate": 2.3156411069970452e-05, + "loss": 0.0062, + "step": 1084 + }, + { + "epoch": 2.4031007751937983, + "grad_norm": 0.06832782973908892, + "learning_rate": 2.2991475070231737e-05, + "loss": 0.005, + "step": 1085 + }, + { + "epoch": 2.4053156146179404, + "grad_norm": 0.060605447582523654, + "learning_rate": 2.28270522356401e-05, + "loss": 0.0071, + "step": 1086 + }, + { + "epoch": 2.407530454042082, + "grad_norm": 0.04511825802066576, + "learning_rate": 2.2663143661867314e-05, + "loss": 0.0039, + "step": 1087 + }, + { + "epoch": 2.409745293466224, + "grad_norm": 0.03672982405322077, + "learning_rate": 2.2499750441158284e-05, + "loss": 0.0013, + "step": 1088 + }, + { + "epoch": 2.4119601328903655, + "grad_norm": 0.06989083022418129, + "learning_rate": 2.2336873662323677e-05, + "loss": 0.0061, + "step": 1089 + }, + { + "epoch": 2.414174972314507, + "grad_norm": 0.045799259145791256, + "learning_rate": 2.2174514410732782e-05, + "loss": 0.0035, + "step": 1090 + }, + { + "epoch": 2.416389811738649, + "grad_norm": 0.03677541788881642, + "learning_rate": 2.201267376830619e-05, + "loss": 0.0016, + "step": 1091 + }, + { + "epoch": 2.4186046511627906, + "grad_norm": 0.052959357680014, + "learning_rate": 2.1851352813508607e-05, + "loss": 0.0029, + "step": 1092 + }, + { + "epoch": 2.4208194905869327, + "grad_norm": 0.040471313052103886, + "learning_rate": 2.169055262134171e-05, + "loss": 0.003, + "step": 1093 + }, + { + "epoch": 2.4230343300110744, + "grad_norm": 0.03869707277081365, + "learning_rate": 2.1530274263336934e-05, + "loss": 0.0024, + "step": 1094 + }, + { + "epoch": 2.425249169435216, + "grad_norm": 0.09205278409490118, + "learning_rate": 2.137051880754828e-05, + "loss": 0.0062, + "step": 1095 + }, + { + "epoch": 2.4274640088593578, + "grad_norm": 0.03340009868584942, + "learning_rate": 2.1211287318545336e-05, + "loss": 0.0025, + "step": 1096 + }, + { + "epoch": 2.4296788482834994, + "grad_norm": 0.04333661687079504, + "learning_rate": 2.105258085740609e-05, + "loss": 0.003, + "step": 1097 + }, + { + "epoch": 2.431893687707641, + "grad_norm": 0.026275048288069335, + "learning_rate": 2.0894400481709874e-05, + "loss": 0.0017, + "step": 1098 + }, + { + "epoch": 2.434108527131783, + "grad_norm": 0.12733011586967735, + "learning_rate": 2.073674724553034e-05, + "loss": 0.0083, + "step": 1099 + }, + { + "epoch": 2.4363233665559245, + "grad_norm": 0.03188298894134563, + "learning_rate": 2.0579622199428373e-05, + "loss": 0.0033, + "step": 1100 + }, + { + "epoch": 2.438538205980066, + "grad_norm": 0.04843016554773395, + "learning_rate": 2.0423026390445175e-05, + "loss": 0.0048, + "step": 1101 + }, + { + "epoch": 2.4407530454042083, + "grad_norm": 0.04780218356918381, + "learning_rate": 2.026696086209523e-05, + "loss": 0.0033, + "step": 1102 + }, + { + "epoch": 2.44296788482835, + "grad_norm": 0.08671543416475448, + "learning_rate": 2.0111426654359365e-05, + "loss": 0.0057, + "step": 1103 + }, + { + "epoch": 2.4451827242524917, + "grad_norm": 0.043706957772874915, + "learning_rate": 1.9956424803677844e-05, + "loss": 0.0041, + "step": 1104 + }, + { + "epoch": 2.4473975636766334, + "grad_norm": 0.07809983795948948, + "learning_rate": 1.9801956342943394e-05, + "loss": 0.0056, + "step": 1105 + }, + { + "epoch": 2.449612403100775, + "grad_norm": 0.07448137039440934, + "learning_rate": 1.964802230149445e-05, + "loss": 0.0088, + "step": 1106 + }, + { + "epoch": 2.451827242524917, + "grad_norm": 0.05286628530812354, + "learning_rate": 1.9494623705108085e-05, + "loss": 0.003, + "step": 1107 + }, + { + "epoch": 2.4540420819490585, + "grad_norm": 0.04609031328832757, + "learning_rate": 1.934176157599342e-05, + "loss": 0.0017, + "step": 1108 + }, + { + "epoch": 2.4562569213732006, + "grad_norm": 0.03153225169402225, + "learning_rate": 1.9189436932784643e-05, + "loss": 0.0029, + "step": 1109 + }, + { + "epoch": 2.4584717607973423, + "grad_norm": 0.08625122070039164, + "learning_rate": 1.903765079053429e-05, + "loss": 0.0044, + "step": 1110 + }, + { + "epoch": 2.460686600221484, + "grad_norm": 0.10915526368044594, + "learning_rate": 1.888640416070644e-05, + "loss": 0.0114, + "step": 1111 + }, + { + "epoch": 2.4629014396456257, + "grad_norm": 0.03625935277051994, + "learning_rate": 1.873569805117006e-05, + "loss": 0.0032, + "step": 1112 + }, + { + "epoch": 2.4651162790697674, + "grad_norm": 0.0493471108819577, + "learning_rate": 1.858553346619213e-05, + "loss": 0.0042, + "step": 1113 + }, + { + "epoch": 2.467331118493909, + "grad_norm": 0.05477757740551976, + "learning_rate": 1.8435911406431105e-05, + "loss": 0.0059, + "step": 1114 + }, + { + "epoch": 2.4695459579180508, + "grad_norm": 0.03151474780151791, + "learning_rate": 1.8286832868930216e-05, + "loss": 0.0055, + "step": 1115 + }, + { + "epoch": 2.471760797342193, + "grad_norm": 0.024289443510767332, + "learning_rate": 1.8138298847110756e-05, + "loss": 0.0022, + "step": 1116 + }, + { + "epoch": 2.4739756367663346, + "grad_norm": 0.044142432763603676, + "learning_rate": 1.7990310330765526e-05, + "loss": 0.0034, + "step": 1117 + }, + { + "epoch": 2.4761904761904763, + "grad_norm": 0.045614549588255615, + "learning_rate": 1.784286830605224e-05, + "loss": 0.0034, + "step": 1118 + }, + { + "epoch": 2.478405315614618, + "grad_norm": 0.032992861324518424, + "learning_rate": 1.769597375548684e-05, + "loss": 0.0018, + "step": 1119 + }, + { + "epoch": 2.4806201550387597, + "grad_norm": 0.0801516561987975, + "learning_rate": 1.754962765793714e-05, + "loss": 0.0062, + "step": 1120 + }, + { + "epoch": 2.4828349944629013, + "grad_norm": 0.032233750943622824, + "learning_rate": 1.7403830988616154e-05, + "loss": 0.0019, + "step": 1121 + }, + { + "epoch": 2.485049833887043, + "grad_norm": 0.059448613295860944, + "learning_rate": 1.725858471907569e-05, + "loss": 0.0051, + "step": 1122 + }, + { + "epoch": 2.487264673311185, + "grad_norm": 0.07478337348545341, + "learning_rate": 1.7113889817199758e-05, + "loss": 0.0093, + "step": 1123 + }, + { + "epoch": 2.489479512735327, + "grad_norm": 0.05968733604564609, + "learning_rate": 1.6969747247198275e-05, + "loss": 0.0052, + "step": 1124 + }, + { + "epoch": 2.4916943521594686, + "grad_norm": 0.03445176006900081, + "learning_rate": 1.68261579696005e-05, + "loss": 0.003, + "step": 1125 + }, + { + "epoch": 2.4939091915836102, + "grad_norm": 0.032367022282991625, + "learning_rate": 1.668312294124873e-05, + "loss": 0.003, + "step": 1126 + }, + { + "epoch": 2.496124031007752, + "grad_norm": 0.06373661855652572, + "learning_rate": 1.6540643115291867e-05, + "loss": 0.0052, + "step": 1127 + }, + { + "epoch": 2.4983388704318936, + "grad_norm": 0.15072419236351065, + "learning_rate": 1.6398719441179077e-05, + "loss": 0.0054, + "step": 1128 + }, + { + "epoch": 2.5005537098560353, + "grad_norm": 0.07098501739134702, + "learning_rate": 1.625735286465351e-05, + "loss": 0.0059, + "step": 1129 + }, + { + "epoch": 2.5027685492801774, + "grad_norm": 0.0653383588389451, + "learning_rate": 1.6116544327745942e-05, + "loss": 0.0036, + "step": 1130 + }, + { + "epoch": 2.5049833887043187, + "grad_norm": 0.03195565024716648, + "learning_rate": 1.5976294768768474e-05, + "loss": 0.0023, + "step": 1131 + }, + { + "epoch": 2.507198228128461, + "grad_norm": 0.07209415995727858, + "learning_rate": 1.5836605122308366e-05, + "loss": 0.0049, + "step": 1132 + }, + { + "epoch": 2.5094130675526025, + "grad_norm": 0.051671716472344656, + "learning_rate": 1.5697476319221772e-05, + "loss": 0.003, + "step": 1133 + }, + { + "epoch": 2.511627906976744, + "grad_norm": 0.03269369900633084, + "learning_rate": 1.55589092866275e-05, + "loss": 0.0027, + "step": 1134 + }, + { + "epoch": 2.513842746400886, + "grad_norm": 0.05412115953066398, + "learning_rate": 1.542090494790087e-05, + "loss": 0.0028, + "step": 1135 + }, + { + "epoch": 2.5160575858250276, + "grad_norm": 0.035515195517889564, + "learning_rate": 1.5283464222667586e-05, + "loss": 0.0019, + "step": 1136 + }, + { + "epoch": 2.5182724252491693, + "grad_norm": 0.01629417105646653, + "learning_rate": 1.5146588026797514e-05, + "loss": 0.0007, + "step": 1137 + }, + { + "epoch": 2.520487264673311, + "grad_norm": 0.08819780423900109, + "learning_rate": 1.5010277272398677e-05, + "loss": 0.0081, + "step": 1138 + }, + { + "epoch": 2.522702104097453, + "grad_norm": 0.05218587039850283, + "learning_rate": 1.4874532867811142e-05, + "loss": 0.0036, + "step": 1139 + }, + { + "epoch": 2.524916943521595, + "grad_norm": 0.055648031683223735, + "learning_rate": 1.4739355717600956e-05, + "loss": 0.0024, + "step": 1140 + }, + { + "epoch": 2.5271317829457365, + "grad_norm": 0.04543909674761197, + "learning_rate": 1.4604746722554153e-05, + "loss": 0.0041, + "step": 1141 + }, + { + "epoch": 2.529346622369878, + "grad_norm": 0.06720942467054845, + "learning_rate": 1.4470706779670706e-05, + "loss": 0.0053, + "step": 1142 + }, + { + "epoch": 2.53156146179402, + "grad_norm": 0.03127434068490074, + "learning_rate": 1.4337236782158537e-05, + "loss": 0.0013, + "step": 1143 + }, + { + "epoch": 2.5337763012181616, + "grad_norm": 0.05765233201439245, + "learning_rate": 1.4204337619427654e-05, + "loss": 0.0047, + "step": 1144 + }, + { + "epoch": 2.5359911406423032, + "grad_norm": 0.05363199719201621, + "learning_rate": 1.4072010177084127e-05, + "loss": 0.0034, + "step": 1145 + }, + { + "epoch": 2.5382059800664454, + "grad_norm": 0.07415067343779756, + "learning_rate": 1.3940255336924258e-05, + "loss": 0.0052, + "step": 1146 + }, + { + "epoch": 2.540420819490587, + "grad_norm": 0.04640547827184461, + "learning_rate": 1.3809073976928654e-05, + "loss": 0.005, + "step": 1147 + }, + { + "epoch": 2.5426356589147288, + "grad_norm": 0.045716707962102955, + "learning_rate": 1.3678466971256409e-05, + "loss": 0.0034, + "step": 1148 + }, + { + "epoch": 2.5448504983388704, + "grad_norm": 0.07117095473276315, + "learning_rate": 1.3548435190239261e-05, + "loss": 0.0027, + "step": 1149 + }, + { + "epoch": 2.547065337763012, + "grad_norm": 0.046675081150535605, + "learning_rate": 1.3418979500375783e-05, + "loss": 0.0032, + "step": 1150 + }, + { + "epoch": 2.549280177187154, + "grad_norm": 0.04173612922690014, + "learning_rate": 1.3290100764325652e-05, + "loss": 0.0023, + "step": 1151 + }, + { + "epoch": 2.5514950166112955, + "grad_norm": 0.05836259144392446, + "learning_rate": 1.3161799840903855e-05, + "loss": 0.0019, + "step": 1152 + }, + { + "epoch": 2.5537098560354377, + "grad_norm": 0.07715198795996421, + "learning_rate": 1.3034077585074977e-05, + "loss": 0.0059, + "step": 1153 + }, + { + "epoch": 2.5559246954595793, + "grad_norm": 0.036830377528245514, + "learning_rate": 1.290693484794755e-05, + "loss": 0.0023, + "step": 1154 + }, + { + "epoch": 2.558139534883721, + "grad_norm": 0.07042330978783341, + "learning_rate": 1.2780372476768255e-05, + "loss": 0.0037, + "step": 1155 + }, + { + "epoch": 2.5603543743078627, + "grad_norm": 0.06039391086346949, + "learning_rate": 1.2654391314916459e-05, + "loss": 0.0075, + "step": 1156 + }, + { + "epoch": 2.5625692137320044, + "grad_norm": 0.057758569745767714, + "learning_rate": 1.2528992201898437e-05, + "loss": 0.0056, + "step": 1157 + }, + { + "epoch": 2.564784053156146, + "grad_norm": 0.04218854183585762, + "learning_rate": 1.2404175973341869e-05, + "loss": 0.0019, + "step": 1158 + }, + { + "epoch": 2.566998892580288, + "grad_norm": 0.07347747844360733, + "learning_rate": 1.227994346099023e-05, + "loss": 0.0052, + "step": 1159 + }, + { + "epoch": 2.56921373200443, + "grad_norm": 0.06532286141627236, + "learning_rate": 1.2156295492697289e-05, + "loss": 0.0061, + "step": 1160 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.07305784426907876, + "learning_rate": 1.2033232892421487e-05, + "loss": 0.002, + "step": 1161 + }, + { + "epoch": 2.5736434108527133, + "grad_norm": 0.07136633452162083, + "learning_rate": 1.1910756480220587e-05, + "loss": 0.0048, + "step": 1162 + }, + { + "epoch": 2.575858250276855, + "grad_norm": 0.06329072025868075, + "learning_rate": 1.1788867072246123e-05, + "loss": 0.0033, + "step": 1163 + }, + { + "epoch": 2.5780730897009967, + "grad_norm": 0.07167146337881969, + "learning_rate": 1.1667565480738007e-05, + "loss": 0.0054, + "step": 1164 + }, + { + "epoch": 2.5802879291251384, + "grad_norm": 0.04157935039860682, + "learning_rate": 1.1546852514019057e-05, + "loss": 0.0024, + "step": 1165 + }, + { + "epoch": 2.58250276854928, + "grad_norm": 0.06706107517631839, + "learning_rate": 1.1426728976489675e-05, + "loss": 0.0031, + "step": 1166 + }, + { + "epoch": 2.584717607973422, + "grad_norm": 0.038877592489387255, + "learning_rate": 1.1307195668622428e-05, + "loss": 0.0036, + "step": 1167 + }, + { + "epoch": 2.5869324473975635, + "grad_norm": 0.06447061870059081, + "learning_rate": 1.1188253386956748e-05, + "loss": 0.0035, + "step": 1168 + }, + { + "epoch": 2.5891472868217056, + "grad_norm": 0.04276241578746162, + "learning_rate": 1.1069902924093656e-05, + "loss": 0.0019, + "step": 1169 + }, + { + "epoch": 2.5913621262458473, + "grad_norm": 0.09217322820358438, + "learning_rate": 1.09521450686904e-05, + "loss": 0.0077, + "step": 1170 + }, + { + "epoch": 2.593576965669989, + "grad_norm": 0.08369288583547181, + "learning_rate": 1.0834980605455258e-05, + "loss": 0.005, + "step": 1171 + }, + { + "epoch": 2.5957918050941307, + "grad_norm": 0.040291088078231244, + "learning_rate": 1.0718410315142313e-05, + "loss": 0.0031, + "step": 1172 + }, + { + "epoch": 2.5980066445182723, + "grad_norm": 0.04295959735064777, + "learning_rate": 1.0602434974546215e-05, + "loss": 0.0025, + "step": 1173 + }, + { + "epoch": 2.600221483942414, + "grad_norm": 0.04247333746985145, + "learning_rate": 1.0487055356497e-05, + "loss": 0.0032, + "step": 1174 + }, + { + "epoch": 2.6024363233665557, + "grad_norm": 0.06519822067088907, + "learning_rate": 1.0372272229855007e-05, + "loss": 0.0053, + "step": 1175 + }, + { + "epoch": 2.604651162790698, + "grad_norm": 0.05080580369759981, + "learning_rate": 1.025808635950566e-05, + "loss": 0.0036, + "step": 1176 + }, + { + "epoch": 2.6068660022148396, + "grad_norm": 0.055212016676863596, + "learning_rate": 1.014449850635446e-05, + "loss": 0.0061, + "step": 1177 + }, + { + "epoch": 2.6090808416389812, + "grad_norm": 0.05184179515841359, + "learning_rate": 1.0031509427321873e-05, + "loss": 0.0031, + "step": 1178 + }, + { + "epoch": 2.611295681063123, + "grad_norm": 0.06798603657196776, + "learning_rate": 9.919119875338223e-06, + "loss": 0.0036, + "step": 1179 + }, + { + "epoch": 2.6135105204872646, + "grad_norm": 0.08217190483705764, + "learning_rate": 9.807330599338827e-06, + "loss": 0.005, + "step": 1180 + }, + { + "epoch": 2.6157253599114063, + "grad_norm": 0.060230972789850554, + "learning_rate": 9.696142344258862e-06, + "loss": 0.0057, + "step": 1181 + }, + { + "epoch": 2.617940199335548, + "grad_norm": 0.0495388689666682, + "learning_rate": 9.585555851028472e-06, + "loss": 0.0021, + "step": 1182 + }, + { + "epoch": 2.62015503875969, + "grad_norm": 0.10099260265582757, + "learning_rate": 9.475571856567821e-06, + "loss": 0.0055, + "step": 1183 + }, + { + "epoch": 2.622369878183832, + "grad_norm": 0.062105089087487984, + "learning_rate": 9.36619109378214e-06, + "loss": 0.006, + "step": 1184 + }, + { + "epoch": 2.6245847176079735, + "grad_norm": 0.0497603259370771, + "learning_rate": 9.25741429155692e-06, + "loss": 0.0031, + "step": 1185 + }, + { + "epoch": 2.626799557032115, + "grad_norm": 0.03939021066525151, + "learning_rate": 9.149242174753002e-06, + "loss": 0.0037, + "step": 1186 + }, + { + "epoch": 2.629014396456257, + "grad_norm": 0.0301356264180313, + "learning_rate": 9.041675464201738e-06, + "loss": 0.0011, + "step": 1187 + }, + { + "epoch": 2.6312292358803986, + "grad_norm": 0.0400328088566037, + "learning_rate": 8.934714876700223e-06, + "loss": 0.0025, + "step": 1188 + }, + { + "epoch": 2.6334440753045403, + "grad_norm": 0.056593365728021934, + "learning_rate": 8.828361125006535e-06, + "loss": 0.0032, + "step": 1189 + }, + { + "epoch": 2.6356589147286824, + "grad_norm": 0.06203744054742439, + "learning_rate": 8.722614917834871e-06, + "loss": 0.005, + "step": 1190 + }, + { + "epoch": 2.6378737541528237, + "grad_norm": 0.04186192017082604, + "learning_rate": 8.617476959850967e-06, + "loss": 0.0034, + "step": 1191 + }, + { + "epoch": 2.640088593576966, + "grad_norm": 0.03849423454002746, + "learning_rate": 8.512947951667349e-06, + "loss": 0.0028, + "step": 1192 + }, + { + "epoch": 2.6423034330011075, + "grad_norm": 0.06597772449210257, + "learning_rate": 8.409028589838618e-06, + "loss": 0.0061, + "step": 1193 + }, + { + "epoch": 2.644518272425249, + "grad_norm": 0.0444514170833848, + "learning_rate": 8.305719566856874e-06, + "loss": 0.0041, + "step": 1194 + }, + { + "epoch": 2.646733111849391, + "grad_norm": 0.02321170410933663, + "learning_rate": 8.203021571147074e-06, + "loss": 0.0013, + "step": 1195 + }, + { + "epoch": 2.6489479512735326, + "grad_norm": 0.03380793204583571, + "learning_rate": 8.100935287062428e-06, + "loss": 0.0013, + "step": 1196 + }, + { + "epoch": 2.6511627906976747, + "grad_norm": 0.055514516269436355, + "learning_rate": 7.99946139487988e-06, + "loss": 0.0011, + "step": 1197 + }, + { + "epoch": 2.653377630121816, + "grad_norm": 0.018246526293291383, + "learning_rate": 7.898600570795523e-06, + "loss": 0.001, + "step": 1198 + }, + { + "epoch": 2.655592469545958, + "grad_norm": 0.0445194308031563, + "learning_rate": 7.79835348692014e-06, + "loss": 0.0029, + "step": 1199 + }, + { + "epoch": 2.6578073089700998, + "grad_norm": 0.08197623404555328, + "learning_rate": 7.698720811274707e-06, + "loss": 0.0054, + "step": 1200 + }, + { + "epoch": 2.6600221483942414, + "grad_norm": 0.051503102116270134, + "learning_rate": 7.599703207785946e-06, + "loss": 0.0027, + "step": 1201 + }, + { + "epoch": 2.662236987818383, + "grad_norm": 0.07167575086231812, + "learning_rate": 7.501301336281852e-06, + "loss": 0.003, + "step": 1202 + }, + { + "epoch": 2.664451827242525, + "grad_norm": 0.05620964370419043, + "learning_rate": 7.403515852487386e-06, + "loss": 0.0031, + "step": 1203 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.03540737856539731, + "learning_rate": 7.306347408020042e-06, + "loss": 0.0023, + "step": 1204 + }, + { + "epoch": 2.668881506090808, + "grad_norm": 0.04909071113146246, + "learning_rate": 7.209796650385537e-06, + "loss": 0.0054, + "step": 1205 + }, + { + "epoch": 2.6710963455149503, + "grad_norm": 0.06032833943712418, + "learning_rate": 7.113864222973454e-06, + "loss": 0.0038, + "step": 1206 + }, + { + "epoch": 2.673311184939092, + "grad_norm": 0.11358371853713839, + "learning_rate": 7.018550765053012e-06, + "loss": 0.007, + "step": 1207 + }, + { + "epoch": 2.6755260243632337, + "grad_norm": 0.05373294169234185, + "learning_rate": 6.923856911768722e-06, + "loss": 0.0048, + "step": 1208 + }, + { + "epoch": 2.6777408637873754, + "grad_norm": 0.06203424468344484, + "learning_rate": 6.8297832941362605e-06, + "loss": 0.0066, + "step": 1209 + }, + { + "epoch": 2.679955703211517, + "grad_norm": 0.06251516656598335, + "learning_rate": 6.736330539038205e-06, + "loss": 0.0058, + "step": 1210 + }, + { + "epoch": 2.682170542635659, + "grad_norm": 0.03894919395582229, + "learning_rate": 6.643499269219855e-06, + "loss": 0.0033, + "step": 1211 + }, + { + "epoch": 2.6843853820598005, + "grad_norm": 0.04718973623456703, + "learning_rate": 6.5512901032850686e-06, + "loss": 0.0039, + "step": 1212 + }, + { + "epoch": 2.6866002214839426, + "grad_norm": 0.050336106104471076, + "learning_rate": 6.459703655692218e-06, + "loss": 0.0059, + "step": 1213 + }, + { + "epoch": 2.6888150609080843, + "grad_norm": 0.03287033266061476, + "learning_rate": 6.368740536749973e-06, + "loss": 0.0028, + "step": 1214 + }, + { + "epoch": 2.691029900332226, + "grad_norm": 0.05691353907465886, + "learning_rate": 6.278401352613339e-06, + "loss": 0.0064, + "step": 1215 + }, + { + "epoch": 2.6932447397563677, + "grad_norm": 0.03675519955392844, + "learning_rate": 6.1886867052795675e-06, + "loss": 0.0027, + "step": 1216 + }, + { + "epoch": 2.6954595791805094, + "grad_norm": 0.049912876694311645, + "learning_rate": 6.099597192584172e-06, + "loss": 0.0046, + "step": 1217 + }, + { + "epoch": 2.697674418604651, + "grad_norm": 0.05067364612266751, + "learning_rate": 6.0111334081969224e-06, + "loss": 0.0035, + "step": 1218 + }, + { + "epoch": 2.6998892580287928, + "grad_norm": 0.048881184367960184, + "learning_rate": 5.923295941617868e-06, + "loss": 0.0056, + "step": 1219 + }, + { + "epoch": 2.702104097452935, + "grad_norm": 0.07484403324025948, + "learning_rate": 5.836085378173483e-06, + "loss": 0.0039, + "step": 1220 + }, + { + "epoch": 2.704318936877076, + "grad_norm": 0.03429323870901106, + "learning_rate": 5.7495022990126764e-06, + "loss": 0.0018, + "step": 1221 + }, + { + "epoch": 2.7065337763012183, + "grad_norm": 0.054206188019052914, + "learning_rate": 5.663547281102987e-06, + "loss": 0.0032, + "step": 1222 + }, + { + "epoch": 2.70874861572536, + "grad_norm": 0.06391155873844175, + "learning_rate": 5.578220897226705e-06, + "loss": 0.0027, + "step": 1223 + }, + { + "epoch": 2.7109634551495017, + "grad_norm": 0.038110440736920256, + "learning_rate": 5.493523715977067e-06, + "loss": 0.0033, + "step": 1224 + }, + { + "epoch": 2.7131782945736433, + "grad_norm": 0.02661777821996162, + "learning_rate": 5.409456301754479e-06, + "loss": 0.002, + "step": 1225 + }, + { + "epoch": 2.715393133997785, + "grad_norm": 0.06238407276819188, + "learning_rate": 5.32601921476269e-06, + "loss": 0.0029, + "step": 1226 + }, + { + "epoch": 2.717607973421927, + "grad_norm": 0.08458747602346918, + "learning_rate": 5.243213011005166e-06, + "loss": 0.0094, + "step": 1227 + }, + { + "epoch": 2.7198228128460684, + "grad_norm": 0.07927762083469432, + "learning_rate": 5.161038242281291e-06, + "loss": 0.0068, + "step": 1228 + }, + { + "epoch": 2.7220376522702106, + "grad_norm": 0.0727316392142195, + "learning_rate": 5.079495456182759e-06, + "loss": 0.0031, + "step": 1229 + }, + { + "epoch": 2.7242524916943522, + "grad_norm": 0.040409856717198385, + "learning_rate": 4.998585196089856e-06, + "loss": 0.0026, + "step": 1230 + }, + { + "epoch": 2.726467331118494, + "grad_norm": 0.05438770755555333, + "learning_rate": 4.9183080011679375e-06, + "loss": 0.0022, + "step": 1231 + }, + { + "epoch": 2.7286821705426356, + "grad_norm": 0.03924236649928951, + "learning_rate": 4.838664406363669e-06, + "loss": 0.003, + "step": 1232 + }, + { + "epoch": 2.7308970099667773, + "grad_norm": 0.05092529375372127, + "learning_rate": 4.759654942401681e-06, + "loss": 0.0042, + "step": 1233 + }, + { + "epoch": 2.733111849390919, + "grad_norm": 0.05129893855364365, + "learning_rate": 4.681280135780841e-06, + "loss": 0.0031, + "step": 1234 + }, + { + "epoch": 2.7353266888150607, + "grad_norm": 0.0589613171986061, + "learning_rate": 4.603540508770854e-06, + "loss": 0.0043, + "step": 1235 + }, + { + "epoch": 2.737541528239203, + "grad_norm": 0.056121390536117165, + "learning_rate": 4.526436579408755e-06, + "loss": 0.0038, + "step": 1236 + }, + { + "epoch": 2.7397563676633445, + "grad_norm": 0.08234205526340418, + "learning_rate": 4.449968861495446e-06, + "loss": 0.0059, + "step": 1237 + }, + { + "epoch": 2.741971207087486, + "grad_norm": 0.062047969929767736, + "learning_rate": 4.374137864592242e-06, + "loss": 0.007, + "step": 1238 + }, + { + "epoch": 2.744186046511628, + "grad_norm": 0.07474393187501505, + "learning_rate": 4.2989440940175765e-06, + "loss": 0.004, + "step": 1239 + }, + { + "epoch": 2.7464008859357696, + "grad_norm": 0.025545166796079176, + "learning_rate": 4.22438805084352e-06, + "loss": 0.0009, + "step": 1240 + }, + { + "epoch": 2.7486157253599113, + "grad_norm": 0.044611854953026206, + "learning_rate": 4.1504702318925114e-06, + "loss": 0.0032, + "step": 1241 + }, + { + "epoch": 2.750830564784053, + "grad_norm": 0.03423011415103255, + "learning_rate": 4.077191129734026e-06, + "loss": 0.0019, + "step": 1242 + }, + { + "epoch": 2.753045404208195, + "grad_norm": 0.037430729219510676, + "learning_rate": 4.004551232681286e-06, + "loss": 0.0016, + "step": 1243 + }, + { + "epoch": 2.755260243632337, + "grad_norm": 0.042153321165730354, + "learning_rate": 3.932551024788022e-06, + "loss": 0.0019, + "step": 1244 + }, + { + "epoch": 2.7574750830564785, + "grad_norm": 0.05724078678847528, + "learning_rate": 3.8611909858452315e-06, + "loss": 0.0016, + "step": 1245 + }, + { + "epoch": 2.75968992248062, + "grad_norm": 0.06346094327996875, + "learning_rate": 3.7904715913779977e-06, + "loss": 0.0061, + "step": 1246 + }, + { + "epoch": 2.761904761904762, + "grad_norm": 0.031390902457118376, + "learning_rate": 3.7203933126423007e-06, + "loss": 0.0034, + "step": 1247 + }, + { + "epoch": 2.7641196013289036, + "grad_norm": 0.03899318321952262, + "learning_rate": 3.6509566166219035e-06, + "loss": 0.0017, + "step": 1248 + }, + { + "epoch": 2.7663344407530452, + "grad_norm": 0.049651592220709154, + "learning_rate": 3.5821619660252213e-06, + "loss": 0.0028, + "step": 1249 + }, + { + "epoch": 2.7685492801771874, + "grad_norm": 0.03925642119527002, + "learning_rate": 3.514009819282227e-06, + "loss": 0.0028, + "step": 1250 + }, + { + "epoch": 2.770764119601329, + "grad_norm": 0.09156439631470767, + "learning_rate": 3.446500630541427e-06, + "loss": 0.0066, + "step": 1251 + }, + { + "epoch": 2.7729789590254708, + "grad_norm": 0.03457735750882461, + "learning_rate": 3.379634849666824e-06, + "loss": 0.0021, + "step": 1252 + }, + { + "epoch": 2.7751937984496124, + "grad_norm": 0.04557520341573689, + "learning_rate": 3.3134129222349153e-06, + "loss": 0.0049, + "step": 1253 + }, + { + "epoch": 2.777408637873754, + "grad_norm": 0.03669866353902438, + "learning_rate": 3.247835289531698e-06, + "loss": 0.0023, + "step": 1254 + }, + { + "epoch": 2.779623477297896, + "grad_norm": 0.039882300358933254, + "learning_rate": 3.182902388549791e-06, + "loss": 0.0026, + "step": 1255 + }, + { + "epoch": 2.7818383167220375, + "grad_norm": 0.0419423898807071, + "learning_rate": 3.118614651985452e-06, + "loss": 0.0039, + "step": 1256 + }, + { + "epoch": 2.7840531561461797, + "grad_norm": 0.05497559878801078, + "learning_rate": 3.054972508235754e-06, + "loss": 0.0022, + "step": 1257 + }, + { + "epoch": 2.786267995570321, + "grad_norm": 0.06088785212758285, + "learning_rate": 2.991976381395678e-06, + "loss": 0.0037, + "step": 1258 + }, + { + "epoch": 2.788482834994463, + "grad_norm": 0.040647147337742724, + "learning_rate": 2.9296266912553383e-06, + "loss": 0.003, + "step": 1259 + }, + { + "epoch": 2.7906976744186047, + "grad_norm": 0.0501158292751489, + "learning_rate": 2.867923853297161e-06, + "loss": 0.0031, + "step": 1260 + }, + { + "epoch": 2.7929125138427464, + "grad_norm": 0.04392147508207625, + "learning_rate": 2.8068682786930865e-06, + "loss": 0.0018, + "step": 1261 + }, + { + "epoch": 2.795127353266888, + "grad_norm": 0.035664419929105344, + "learning_rate": 2.7464603743018736e-06, + "loss": 0.0036, + "step": 1262 + }, + { + "epoch": 2.79734219269103, + "grad_norm": 0.04953807775886955, + "learning_rate": 2.686700542666376e-06, + "loss": 0.0058, + "step": 1263 + }, + { + "epoch": 2.7995570321151715, + "grad_norm": 0.06013933169977137, + "learning_rate": 2.627589182010859e-06, + "loss": 0.0024, + "step": 1264 + }, + { + "epoch": 2.801771871539313, + "grad_norm": 0.052997289529565776, + "learning_rate": 2.5691266862383323e-06, + "loss": 0.0028, + "step": 1265 + }, + { + "epoch": 2.8039867109634553, + "grad_norm": 0.0870124028167329, + "learning_rate": 2.511313444927943e-06, + "loss": 0.0078, + "step": 1266 + }, + { + "epoch": 2.806201550387597, + "grad_norm": 0.05224663061094042, + "learning_rate": 2.454149843332354e-06, + "loss": 0.0026, + "step": 1267 + }, + { + "epoch": 2.8084163898117387, + "grad_norm": 0.04011390571974365, + "learning_rate": 2.3976362623752358e-06, + "loss": 0.0021, + "step": 1268 + }, + { + "epoch": 2.8106312292358804, + "grad_norm": 0.07203250786863093, + "learning_rate": 2.3417730786486457e-06, + "loss": 0.0049, + "step": 1269 + }, + { + "epoch": 2.812846068660022, + "grad_norm": 0.07888223383407164, + "learning_rate": 2.2865606644105754e-06, + "loss": 0.0073, + "step": 1270 + }, + { + "epoch": 2.8150609080841638, + "grad_norm": 0.04791017546755696, + "learning_rate": 2.2319993875824728e-06, + "loss": 0.0024, + "step": 1271 + }, + { + "epoch": 2.8172757475083055, + "grad_norm": 0.08144541430143945, + "learning_rate": 2.1780896117467253e-06, + "loss": 0.0049, + "step": 1272 + }, + { + "epoch": 2.8194905869324476, + "grad_norm": 0.04225816135468771, + "learning_rate": 2.1248316961443583e-06, + "loss": 0.0029, + "step": 1273 + }, + { + "epoch": 2.8217054263565893, + "grad_norm": 0.07603972968797719, + "learning_rate": 2.072225995672472e-06, + "loss": 0.0106, + "step": 1274 + }, + { + "epoch": 2.823920265780731, + "grad_norm": 0.03336053348521161, + "learning_rate": 2.0202728608820554e-06, + "loss": 0.0027, + "step": 1275 + }, + { + "epoch": 2.8261351052048727, + "grad_norm": 0.0556692444138038, + "learning_rate": 1.9689726379755192e-06, + "loss": 0.0022, + "step": 1276 + }, + { + "epoch": 2.8283499446290143, + "grad_norm": 0.22148672760331933, + "learning_rate": 1.918325668804455e-06, + "loss": 0.0043, + "step": 1277 + }, + { + "epoch": 2.830564784053156, + "grad_norm": 0.026740051760086807, + "learning_rate": 1.8683322908673361e-06, + "loss": 0.0015, + "step": 1278 + }, + { + "epoch": 2.8327796234772977, + "grad_norm": 0.05078835496352603, + "learning_rate": 1.818992837307265e-06, + "loss": 0.0049, + "step": 1279 + }, + { + "epoch": 2.83499446290144, + "grad_norm": 0.034969379295299585, + "learning_rate": 1.7703076369097627e-06, + "loss": 0.0027, + "step": 1280 + }, + { + "epoch": 2.8372093023255816, + "grad_norm": 0.05938141242462468, + "learning_rate": 1.7222770141005596e-06, + "loss": 0.0032, + "step": 1281 + }, + { + "epoch": 2.8394241417497232, + "grad_norm": 0.08145988126753562, + "learning_rate": 1.6749012889434868e-06, + "loss": 0.0055, + "step": 1282 + }, + { + "epoch": 2.841638981173865, + "grad_norm": 0.09209301893519133, + "learning_rate": 1.6281807771382662e-06, + "loss": 0.0061, + "step": 1283 + }, + { + "epoch": 2.8438538205980066, + "grad_norm": 0.060545212241547006, + "learning_rate": 1.5821157900184568e-06, + "loss": 0.0058, + "step": 1284 + }, + { + "epoch": 2.8460686600221483, + "grad_norm": 0.10776481345323365, + "learning_rate": 1.5367066345493898e-06, + "loss": 0.0083, + "step": 1285 + }, + { + "epoch": 2.84828349944629, + "grad_norm": 0.05616631163865477, + "learning_rate": 1.4919536133260582e-06, + "loss": 0.0026, + "step": 1286 + }, + { + "epoch": 2.850498338870432, + "grad_norm": 0.0358131204148807, + "learning_rate": 1.4478570245711754e-06, + "loss": 0.0046, + "step": 1287 + }, + { + "epoch": 2.8527131782945734, + "grad_norm": 0.04183438448108713, + "learning_rate": 1.4044171621331536e-06, + "loss": 0.0025, + "step": 1288 + }, + { + "epoch": 2.8549280177187155, + "grad_norm": 0.04870098758616501, + "learning_rate": 1.3616343154841393e-06, + "loss": 0.0063, + "step": 1289 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.05143889781117799, + "learning_rate": 1.319508769718103e-06, + "loss": 0.0036, + "step": 1290 + }, + { + "epoch": 2.859357696566999, + "grad_norm": 0.05421082616478394, + "learning_rate": 1.27804080554893e-06, + "loss": 0.0062, + "step": 1291 + }, + { + "epoch": 2.8615725359911406, + "grad_norm": 0.025193891744949962, + "learning_rate": 1.2372306993085337e-06, + "loss": 0.0009, + "step": 1292 + }, + { + "epoch": 2.8637873754152823, + "grad_norm": 0.032114163834799425, + "learning_rate": 1.1970787229450552e-06, + "loss": 0.0017, + "step": 1293 + }, + { + "epoch": 2.8660022148394244, + "grad_norm": 0.07832500225149808, + "learning_rate": 1.1575851440210005e-06, + "loss": 0.006, + "step": 1294 + }, + { + "epoch": 2.8682170542635657, + "grad_norm": 0.050058193159415745, + "learning_rate": 1.1187502257115068e-06, + "loss": 0.0024, + "step": 1295 + }, + { + "epoch": 2.870431893687708, + "grad_norm": 0.05199923764907305, + "learning_rate": 1.0805742268025332e-06, + "loss": 0.006, + "step": 1296 + }, + { + "epoch": 2.8726467331118495, + "grad_norm": 0.10130857608106641, + "learning_rate": 1.043057401689218e-06, + "loss": 0.0129, + "step": 1297 + }, + { + "epoch": 2.874861572535991, + "grad_norm": 0.029845325117723354, + "learning_rate": 1.0062000003740913e-06, + "loss": 0.0018, + "step": 1298 + }, + { + "epoch": 2.877076411960133, + "grad_norm": 0.03504222120188151, + "learning_rate": 9.70002268465453e-07, + "loss": 0.0017, + "step": 1299 + }, + { + "epoch": 2.8792912513842746, + "grad_norm": 0.04753898075919357, + "learning_rate": 9.344644471757535e-07, + "loss": 0.0044, + "step": 1300 + }, + { + "epoch": 2.8815060908084162, + "grad_norm": 0.0828340726141958, + "learning_rate": 8.995867733199604e-07, + "loss": 0.0045, + "step": 1301 + }, + { + "epoch": 2.883720930232558, + "grad_norm": 0.04133120811317434, + "learning_rate": 8.653694793139932e-07, + "loss": 0.0029, + "step": 1302 + }, + { + "epoch": 2.8859357696567, + "grad_norm": 0.07049669057805709, + "learning_rate": 8.31812793173159e-07, + "loss": 0.0063, + "step": 1303 + }, + { + "epoch": 2.8881506090808418, + "grad_norm": 0.043198641239296384, + "learning_rate": 7.989169385106409e-07, + "loss": 0.0034, + "step": 1304 + }, + { + "epoch": 2.8903654485049834, + "grad_norm": 0.03602962986230159, + "learning_rate": 7.666821345360231e-07, + "loss": 0.0034, + "step": 1305 + }, + { + "epoch": 2.892580287929125, + "grad_norm": 0.041600254268213385, + "learning_rate": 7.351085960538017e-07, + "loss": 0.0028, + "step": 1306 + }, + { + "epoch": 2.894795127353267, + "grad_norm": 0.046313750457007516, + "learning_rate": 7.041965334619649e-07, + "loss": 0.003, + "step": 1307 + }, + { + "epoch": 2.8970099667774085, + "grad_norm": 0.04598822036500629, + "learning_rate": 6.739461527506263e-07, + "loss": 0.0035, + "step": 1308 + }, + { + "epoch": 2.89922480620155, + "grad_norm": 0.03712301626427292, + "learning_rate": 6.443576555005937e-07, + "loss": 0.0039, + "step": 1309 + }, + { + "epoch": 2.9014396456256923, + "grad_norm": 0.03680799313235935, + "learning_rate": 6.154312388820472e-07, + "loss": 0.0023, + "step": 1310 + }, + { + "epoch": 2.903654485049834, + "grad_norm": 0.05967486627507132, + "learning_rate": 5.871670956532515e-07, + "loss": 0.0072, + "step": 1311 + }, + { + "epoch": 2.9058693244739757, + "grad_norm": 0.024474526794078823, + "learning_rate": 5.595654141592354e-07, + "loss": 0.0021, + "step": 1312 + }, + { + "epoch": 2.9080841638981174, + "grad_norm": 0.07270464893609062, + "learning_rate": 5.326263783305585e-07, + "loss": 0.0051, + "step": 1313 + }, + { + "epoch": 2.910299003322259, + "grad_norm": 0.0365461174721488, + "learning_rate": 5.063501676820681e-07, + "loss": 0.0028, + "step": 1314 + }, + { + "epoch": 2.912513842746401, + "grad_norm": 0.07255673764619744, + "learning_rate": 4.807369573117449e-07, + "loss": 0.0074, + "step": 1315 + }, + { + "epoch": 2.9147286821705425, + "grad_norm": 0.04497722737740748, + "learning_rate": 4.557869178994589e-07, + "loss": 0.0039, + "step": 1316 + }, + { + "epoch": 2.9169435215946846, + "grad_norm": 0.04390103123719084, + "learning_rate": 4.3150021570591557e-07, + "loss": 0.0023, + "step": 1317 + }, + { + "epoch": 2.919158361018826, + "grad_norm": 0.05062372752258898, + "learning_rate": 4.0787701257148925e-07, + "loss": 0.0061, + "step": 1318 + }, + { + "epoch": 2.921373200442968, + "grad_norm": 0.055115234936654066, + "learning_rate": 3.8491746591518e-07, + "loss": 0.0027, + "step": 1319 + }, + { + "epoch": 2.9235880398671097, + "grad_norm": 0.08930392675345035, + "learning_rate": 3.626217287335365e-07, + "loss": 0.0082, + "step": 1320 + }, + { + "epoch": 2.9258028792912514, + "grad_norm": 0.07615580906234574, + "learning_rate": 3.4098994959967933e-07, + "loss": 0.0034, + "step": 1321 + }, + { + "epoch": 2.928017718715393, + "grad_norm": 0.05600100546105138, + "learning_rate": 3.2002227266223483e-07, + "loss": 0.0035, + "step": 1322 + }, + { + "epoch": 2.9302325581395348, + "grad_norm": 0.05293270398582902, + "learning_rate": 2.997188376444582e-07, + "loss": 0.0038, + "step": 1323 + }, + { + "epoch": 2.932447397563677, + "grad_norm": 0.1022191976274968, + "learning_rate": 2.800797798432564e-07, + "loss": 0.0066, + "step": 1324 + }, + { + "epoch": 2.934662236987818, + "grad_norm": 0.0673364898220236, + "learning_rate": 2.6110523012831125e-07, + "loss": 0.0034, + "step": 1325 + }, + { + "epoch": 2.9368770764119603, + "grad_norm": 0.04009044983837874, + "learning_rate": 2.427953149411466e-07, + "loss": 0.0027, + "step": 1326 + }, + { + "epoch": 2.939091915836102, + "grad_norm": 0.0927591664248732, + "learning_rate": 2.2515015629438475e-07, + "loss": 0.0086, + "step": 1327 + }, + { + "epoch": 2.9413067552602437, + "grad_norm": 0.052463632969209244, + "learning_rate": 2.08169871770858e-07, + "loss": 0.0043, + "step": 1328 + }, + { + "epoch": 2.9435215946843853, + "grad_norm": 0.038388397966482864, + "learning_rate": 1.9185457452283173e-07, + "loss": 0.0022, + "step": 1329 + }, + { + "epoch": 2.945736434108527, + "grad_norm": 0.054488448006327975, + "learning_rate": 1.7620437327129368e-07, + "loss": 0.0041, + "step": 1330 + }, + { + "epoch": 2.9479512735326687, + "grad_norm": 0.0767964924507602, + "learning_rate": 1.6121937230517693e-07, + "loss": 0.0047, + "step": 1331 + }, + { + "epoch": 2.9501661129568104, + "grad_norm": 0.031239979428894997, + "learning_rate": 1.468996714807158e-07, + "loss": 0.0014, + "step": 1332 + }, + { + "epoch": 2.9523809523809526, + "grad_norm": 0.07582526666842028, + "learning_rate": 1.3324536622073557e-07, + "loss": 0.0044, + "step": 1333 + }, + { + "epoch": 2.9545957918050942, + "grad_norm": 0.05857489697149861, + "learning_rate": 1.2025654751404157e-07, + "loss": 0.0049, + "step": 1334 + }, + { + "epoch": 2.956810631229236, + "grad_norm": 0.05995448549188797, + "learning_rate": 1.0793330191478657e-07, + "loss": 0.0052, + "step": 1335 + }, + { + "epoch": 2.9590254706533776, + "grad_norm": 0.05540393557021224, + "learning_rate": 9.627571154195991e-08, + "loss": 0.0025, + "step": 1336 + }, + { + "epoch": 2.9612403100775193, + "grad_norm": 0.03452934988914927, + "learning_rate": 8.528385407875483e-08, + "loss": 0.0024, + "step": 1337 + }, + { + "epoch": 2.963455149501661, + "grad_norm": 0.04931088029003263, + "learning_rate": 7.495780277210207e-08, + "loss": 0.0018, + "step": 1338 + }, + { + "epoch": 2.9656699889258027, + "grad_norm": 0.053590781928846094, + "learning_rate": 6.52976264321703e-08, + "loss": 0.0032, + "step": 1339 + }, + { + "epoch": 2.967884828349945, + "grad_norm": 0.03534520119036059, + "learning_rate": 5.6303389431899836e-08, + "loss": 0.001, + "step": 1340 + }, + { + "epoch": 2.9700996677740865, + "grad_norm": 0.06210524643031671, + "learning_rate": 4.797515170661404e-08, + "loss": 0.0053, + "step": 1341 + }, + { + "epoch": 2.972314507198228, + "grad_norm": 0.05948572992537153, + "learning_rate": 4.031296875354196e-08, + "loss": 0.0043, + "step": 1342 + }, + { + "epoch": 2.97452934662237, + "grad_norm": 0.04729419227891204, + "learning_rate": 3.331689163151852e-08, + "loss": 0.0034, + "step": 1343 + }, + { + "epoch": 2.9767441860465116, + "grad_norm": 0.05773974253801171, + "learning_rate": 2.69869669606293e-08, + "loss": 0.0051, + "step": 1344 + }, + { + "epoch": 2.9789590254706533, + "grad_norm": 0.04050609665876146, + "learning_rate": 2.1323236921855226e-08, + "loss": 0.0041, + "step": 1345 + }, + { + "epoch": 2.981173864894795, + "grad_norm": 0.08149930875775906, + "learning_rate": 1.6325739256850548e-08, + "loss": 0.0083, + "step": 1346 + }, + { + "epoch": 2.983388704318937, + "grad_norm": 0.06091240471532231, + "learning_rate": 1.199450726767637e-08, + "loss": 0.0048, + "step": 1347 + }, + { + "epoch": 2.985603543743079, + "grad_norm": 0.04667482524397884, + "learning_rate": 8.329569816545312e-09, + "loss": 0.0023, + "step": 1348 + }, + { + "epoch": 2.9878183831672205, + "grad_norm": 0.03514878010226889, + "learning_rate": 5.33095132567718e-09, + "loss": 0.0028, + "step": 1349 + }, + { + "epoch": 2.990033222591362, + "grad_norm": 0.05193403445905056, + "learning_rate": 2.9986717770769112e-09, + "loss": 0.0027, + "step": 1350 + }, + { + "epoch": 2.992248062015504, + "grad_norm": 0.032304677547805596, + "learning_rate": 1.3327467124901739e-09, + "loss": 0.0026, + "step": 1351 + }, + { + "epoch": 2.9944629014396456, + "grad_norm": 0.06521411143607302, + "learning_rate": 3.331872331924224e-10, + "loss": 0.0054, + "step": 1352 + }, + { + "epoch": 2.9966777408637872, + "grad_norm": 0.03573150950698123, + "learning_rate": 0.0, + "loss": 0.0019, + "step": 1353 + }, + { + "epoch": 2.9966777408637872, + "eval_loss": 0.012778724543750286, + "eval_runtime": 157.089, + "eval_samples_per_second": 9.682, + "eval_steps_per_second": 0.306, + "step": 1353 + }, + { + "epoch": 2.9966777408637872, + "step": 1353, + "total_flos": 3.497196041273344e+17, + "train_loss": 0.022314726969768228, + "train_runtime": 28510.8763, + "train_samples_per_second": 3.04, + "train_steps_per_second": 0.047 + } + ], + "logging_steps": 1, + "max_steps": 1353, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.497196041273344e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}