diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,129073 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.7495898583146907, + "eval_steps": 500, + "global_step": 18436, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 24.499163826921794, + "learning_rate": 1.9880715705765407e-09, + "loss": 0.7246, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 13.844882231508665, + "learning_rate": 3.976143141153081e-09, + "loss": 0.6927, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 24.869892765553143, + "learning_rate": 5.9642147117296215e-09, + "loss": 0.7552, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 26.385384547798115, + "learning_rate": 7.952286282306163e-09, + "loss": 0.7389, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 29.218001303560865, + "learning_rate": 9.940357852882704e-09, + "loss": 0.7572, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 24.35391251439786, + "learning_rate": 1.1928429423459243e-08, + "loss": 0.7227, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 9.310519781810637, + "learning_rate": 1.3916500994035786e-08, + "loss": 0.6953, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 18.6447604220992, + "learning_rate": 1.5904572564612325e-08, + "loss": 0.7064, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 9.480838387500022, + "learning_rate": 1.7892644135188866e-08, + "loss": 0.7038, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 20.876805425738677, + "learning_rate": 1.9880715705765407e-08, + "loss": 0.7513, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 12.660078722475282, + "learning_rate": 2.186878727634195e-08, + "loss": 0.7044, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 25.21963344106092, + "learning_rate": 2.3856858846918486e-08, + "loss": 0.7383, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 13.019857314615871, + "learning_rate": 2.584493041749503e-08, + "loss": 0.7292, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 16.496130485277952, + "learning_rate": 2.783300198807157e-08, + "loss": 0.7233, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 16.09147161519466, + "learning_rate": 2.9821073558648106e-08, + "loss": 0.7233, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 18.387203937810977, + "learning_rate": 3.180914512922465e-08, + "loss": 0.737, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 14.288880135026632, + "learning_rate": 3.3797216699801195e-08, + "loss": 0.6986, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 12.269914776207411, + "learning_rate": 3.578528827037773e-08, + "loss": 0.7129, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 18.059142453553292, + "learning_rate": 3.777335984095427e-08, + "loss": 0.7259, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 24.776974026045124, + "learning_rate": 3.9761431411530815e-08, + "loss": 0.7305, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 10.135453576211118, + "learning_rate": 4.174950298210735e-08, + "loss": 0.6771, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 16.404769665161314, + "learning_rate": 4.37375745526839e-08, + "loss": 0.722, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 12.725641784414934, + "learning_rate": 4.5725646123260435e-08, + "loss": 0.6986, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 17.805350818812755, + "learning_rate": 4.771371769383697e-08, + "loss": 0.722, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 30.579340790709406, + "learning_rate": 4.970178926441352e-08, + "loss": 0.7578, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 21.353250087231306, + "learning_rate": 5.168986083499006e-08, + "loss": 0.7467, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 11.737559850019615, + "learning_rate": 5.36779324055666e-08, + "loss": 0.6979, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 32.51774692766562, + "learning_rate": 5.566600397614314e-08, + "loss": 0.7663, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 26.340207312680363, + "learning_rate": 5.765407554671968e-08, + "loss": 0.7441, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 19.611359067464182, + "learning_rate": 5.964214711729621e-08, + "loss": 0.7305, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 19.911174163776828, + "learning_rate": 6.163021868787277e-08, + "loss": 0.7201, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 18.443041480710598, + "learning_rate": 6.36182902584493e-08, + "loss": 0.7331, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 22.750800834579348, + "learning_rate": 6.560636182902585e-08, + "loss": 0.7454, + "step": 33 + }, + { + "epoch": 0.01, + "grad_norm": 12.389633183706188, + "learning_rate": 6.759443339960239e-08, + "loss": 0.7155, + "step": 34 + }, + { + "epoch": 0.01, + "grad_norm": 8.614846715291455, + "learning_rate": 6.958250497017892e-08, + "loss": 0.694, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 10.303020098843145, + "learning_rate": 7.157057654075547e-08, + "loss": 0.7129, + "step": 36 + }, + { + "epoch": 0.01, + "grad_norm": 16.976729238305676, + "learning_rate": 7.3558648111332e-08, + "loss": 0.7142, + "step": 37 + }, + { + "epoch": 0.01, + "grad_norm": 14.281141553521149, + "learning_rate": 7.554671968190854e-08, + "loss": 0.6921, + "step": 38 + }, + { + "epoch": 0.01, + "grad_norm": 9.97590389010619, + "learning_rate": 7.753479125248509e-08, + "loss": 0.696, + "step": 39 + }, + { + "epoch": 0.01, + "grad_norm": 19.404236684870334, + "learning_rate": 7.952286282306163e-08, + "loss": 0.694, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 29.67511430769177, + "learning_rate": 8.151093439363816e-08, + "loss": 0.7435, + "step": 41 + }, + { + "epoch": 0.01, + "grad_norm": 11.447412412274621, + "learning_rate": 8.34990059642147e-08, + "loss": 0.7064, + "step": 42 + }, + { + "epoch": 0.01, + "grad_norm": 21.79141707379374, + "learning_rate": 8.548707753479125e-08, + "loss": 0.7168, + "step": 43 + }, + { + "epoch": 0.01, + "grad_norm": 24.826111898578436, + "learning_rate": 8.74751491053678e-08, + "loss": 0.7292, + "step": 44 + }, + { + "epoch": 0.01, + "grad_norm": 13.118190427804656, + "learning_rate": 8.946322067594434e-08, + "loss": 0.7122, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 7.489061860847059, + "learning_rate": 9.145129224652087e-08, + "loss": 0.6999, + "step": 46 + }, + { + "epoch": 0.01, + "grad_norm": 17.417653960941234, + "learning_rate": 9.343936381709741e-08, + "loss": 0.7064, + "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 14.234906080918769, + "learning_rate": 9.542743538767394e-08, + "loss": 0.7318, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 12.724395544227617, + "learning_rate": 9.741550695825049e-08, + "loss": 0.709, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 21.291154194949023, + "learning_rate": 9.940357852882703e-08, + "loss": 0.7324, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 8.472890011552881, + "learning_rate": 1.0139165009940358e-07, + "loss": 0.7201, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 12.989606563656245, + "learning_rate": 1.0337972166998012e-07, + "loss": 0.7246, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 9.761329531813221, + "learning_rate": 1.0536779324055665e-07, + "loss": 0.7031, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 6.826023943398433, + "learning_rate": 1.073558648111332e-07, + "loss": 0.6934, + "step": 54 + }, + { + "epoch": 0.01, + "grad_norm": 10.198638910120003, + "learning_rate": 1.0934393638170973e-07, + "loss": 0.7253, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 16.560391281868366, + "learning_rate": 1.1133200795228629e-07, + "loss": 0.7103, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 9.02576284572131, + "learning_rate": 1.1332007952286282e-07, + "loss": 0.6901, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 10.007662960941557, + "learning_rate": 1.1530815109343936e-07, + "loss": 0.7057, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 11.265760313566703, + "learning_rate": 1.1729622266401589e-07, + "loss": 0.6803, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 11.566401304297036, + "learning_rate": 1.1928429423459242e-07, + "loss": 0.7064, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 5.9881631788582945, + "learning_rate": 1.2127236580516898e-07, + "loss": 0.6986, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 10.540851415471602, + "learning_rate": 1.2326043737574554e-07, + "loss": 0.7266, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 9.859211696523065, + "learning_rate": 1.2524850894632204e-07, + "loss": 0.7012, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 8.238447945620525, + "learning_rate": 1.272365805168986e-07, + "loss": 0.6934, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 11.331280405705904, + "learning_rate": 1.2922465208747516e-07, + "loss": 0.6999, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 8.806504875394937, + "learning_rate": 1.312127236580517e-07, + "loss": 0.7018, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 7.690019869078491, + "learning_rate": 1.3320079522862822e-07, + "loss": 0.7109, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 17.859994989916984, + "learning_rate": 1.3518886679920478e-07, + "loss": 0.7214, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 8.361773269896933, + "learning_rate": 1.371769383697813e-07, + "loss": 0.7057, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 9.291579054779309, + "learning_rate": 1.3916500994035784e-07, + "loss": 0.7018, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 10.534546399712637, + "learning_rate": 1.4115308151093437e-07, + "loss": 0.7116, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 8.802269215676397, + "learning_rate": 1.4314115308151093e-07, + "loss": 0.6829, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 10.129126282797703, + "learning_rate": 1.4512922465208746e-07, + "loss": 0.6992, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 11.058949419865149, + "learning_rate": 1.47117296222664e-07, + "loss": 0.7201, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 10.45112455235193, + "learning_rate": 1.4910536779324055e-07, + "loss": 0.7214, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 15.567052664136712, + "learning_rate": 1.5109343936381708e-07, + "loss": 0.7025, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 6.281020543727984, + "learning_rate": 1.5308151093439364e-07, + "loss": 0.6914, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 8.666454420555736, + "learning_rate": 1.5506958250497017e-07, + "loss": 0.6979, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 9.382466428751778, + "learning_rate": 1.5705765407554673e-07, + "loss": 0.7038, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 18.091048282639214, + "learning_rate": 1.5904572564612326e-07, + "loss": 0.7135, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 11.150491817630403, + "learning_rate": 1.610337972166998e-07, + "loss": 0.7096, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 13.64761922867085, + "learning_rate": 1.6302186878727632e-07, + "loss": 0.6934, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 17.816630042662357, + "learning_rate": 1.6500994035785288e-07, + "loss": 0.7051, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 6.643628404497148, + "learning_rate": 1.669980119284294e-07, + "loss": 0.6901, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 8.097322687284572, + "learning_rate": 1.6898608349900594e-07, + "loss": 0.6901, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 19.096391842239505, + "learning_rate": 1.709741550695825e-07, + "loss": 0.7012, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 11.35751914390632, + "learning_rate": 1.7296222664015903e-07, + "loss": 0.7038, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 11.967649705514523, + "learning_rate": 1.749502982107356e-07, + "loss": 0.7174, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 13.215951849720813, + "learning_rate": 1.7693836978131212e-07, + "loss": 0.7083, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 18.627669209565727, + "learning_rate": 1.7892644135188868e-07, + "loss": 0.6895, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 8.945488033487356, + "learning_rate": 1.809145129224652e-07, + "loss": 0.6882, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 11.859445740889933, + "learning_rate": 1.8290258449304174e-07, + "loss": 0.6986, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 9.626364205274266, + "learning_rate": 1.848906560636183e-07, + "loss": 0.6999, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 10.642856374584046, + "learning_rate": 1.8687872763419483e-07, + "loss": 0.6908, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 13.940388225305853, + "learning_rate": 1.8886679920477136e-07, + "loss": 0.7155, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 14.77869579158993, + "learning_rate": 1.908548707753479e-07, + "loss": 0.6986, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 8.495662336550748, + "learning_rate": 1.9284294234592445e-07, + "loss": 0.7109, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 7.969066596698855, + "learning_rate": 1.9483101391650098e-07, + "loss": 0.7103, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 15.151406641235983, + "learning_rate": 1.968190854870775e-07, + "loss": 0.6966, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 9.054713987899888, + "learning_rate": 1.9880715705765407e-07, + "loss": 0.7161, + "step": 100 + }, + { + "epoch": 0.02, + "grad_norm": 14.431771945316525, + "learning_rate": 2.0079522862823062e-07, + "loss": 0.7077, + "step": 101 + }, + { + "epoch": 0.02, + "grad_norm": 10.932734251906004, + "learning_rate": 2.0278330019880716e-07, + "loss": 0.7031, + "step": 102 + }, + { + "epoch": 0.02, + "grad_norm": 20.314518051799478, + "learning_rate": 2.047713717693837e-07, + "loss": 0.6966, + "step": 103 + }, + { + "epoch": 0.02, + "grad_norm": 11.439046498761027, + "learning_rate": 2.0675944333996024e-07, + "loss": 0.7142, + "step": 104 + }, + { + "epoch": 0.02, + "grad_norm": 14.900652834713968, + "learning_rate": 2.0874751491053678e-07, + "loss": 0.696, + "step": 105 + }, + { + "epoch": 0.02, + "grad_norm": 7.161657127109603, + "learning_rate": 2.107355864811133e-07, + "loss": 0.6999, + "step": 106 + }, + { + "epoch": 0.02, + "grad_norm": 9.950955066276629, + "learning_rate": 2.1272365805168984e-07, + "loss": 0.6953, + "step": 107 + }, + { + "epoch": 0.02, + "grad_norm": 12.067386347840333, + "learning_rate": 2.147117296222664e-07, + "loss": 0.6725, + "step": 108 + }, + { + "epoch": 0.02, + "grad_norm": 11.543152464772712, + "learning_rate": 2.1669980119284293e-07, + "loss": 0.6849, + "step": 109 + }, + { + "epoch": 0.02, + "grad_norm": 12.604300046963612, + "learning_rate": 2.1868787276341946e-07, + "loss": 0.6973, + "step": 110 + }, + { + "epoch": 0.02, + "grad_norm": 17.78293640711407, + "learning_rate": 2.20675944333996e-07, + "loss": 0.6992, + "step": 111 + }, + { + "epoch": 0.02, + "grad_norm": 15.804083128828628, + "learning_rate": 2.2266401590457257e-07, + "loss": 0.7077, + "step": 112 + }, + { + "epoch": 0.02, + "grad_norm": 8.354349578729526, + "learning_rate": 2.246520874751491e-07, + "loss": 0.6908, + "step": 113 + }, + { + "epoch": 0.02, + "grad_norm": 18.33395134404084, + "learning_rate": 2.2664015904572564e-07, + "loss": 0.7103, + "step": 114 + }, + { + "epoch": 0.02, + "grad_norm": 14.319865071781924, + "learning_rate": 2.286282306163022e-07, + "loss": 0.709, + "step": 115 + }, + { + "epoch": 0.02, + "grad_norm": 16.226714659637416, + "learning_rate": 2.3061630218687872e-07, + "loss": 0.7025, + "step": 116 + }, + { + "epoch": 0.02, + "grad_norm": 14.380441824199682, + "learning_rate": 2.3260437375745526e-07, + "loss": 0.6888, + "step": 117 + }, + { + "epoch": 0.02, + "grad_norm": 7.710454499920294, + "learning_rate": 2.3459244532803179e-07, + "loss": 0.6973, + "step": 118 + }, + { + "epoch": 0.02, + "grad_norm": 8.613694452575448, + "learning_rate": 2.3658051689860834e-07, + "loss": 0.6895, + "step": 119 + }, + { + "epoch": 0.02, + "grad_norm": 7.234806556900114, + "learning_rate": 2.3856858846918485e-07, + "loss": 0.7031, + "step": 120 + }, + { + "epoch": 0.02, + "grad_norm": 10.809416112059246, + "learning_rate": 2.405566600397614e-07, + "loss": 0.6973, + "step": 121 + }, + { + "epoch": 0.02, + "grad_norm": 11.702103063908027, + "learning_rate": 2.4254473161033796e-07, + "loss": 0.6934, + "step": 122 + }, + { + "epoch": 0.02, + "grad_norm": 13.56965141757914, + "learning_rate": 2.445328031809145e-07, + "loss": 0.7077, + "step": 123 + }, + { + "epoch": 0.02, + "grad_norm": 8.110497199174853, + "learning_rate": 2.465208747514911e-07, + "loss": 0.6934, + "step": 124 + }, + { + "epoch": 0.02, + "grad_norm": 9.988481809796136, + "learning_rate": 2.485089463220676e-07, + "loss": 0.7012, + "step": 125 + }, + { + "epoch": 0.02, + "grad_norm": 7.2477238778832875, + "learning_rate": 2.504970178926441e-07, + "loss": 0.6927, + "step": 126 + }, + { + "epoch": 0.02, + "grad_norm": 11.723928197530425, + "learning_rate": 2.5248508946322065e-07, + "loss": 0.7005, + "step": 127 + }, + { + "epoch": 0.02, + "grad_norm": 9.613792941341233, + "learning_rate": 2.544731610337972e-07, + "loss": 0.6895, + "step": 128 + }, + { + "epoch": 0.02, + "grad_norm": 7.624417831100566, + "learning_rate": 2.5646123260437376e-07, + "loss": 0.694, + "step": 129 + }, + { + "epoch": 0.02, + "grad_norm": 10.313441311786733, + "learning_rate": 2.584493041749503e-07, + "loss": 0.7005, + "step": 130 + }, + { + "epoch": 0.02, + "grad_norm": 7.1267435033805935, + "learning_rate": 2.604373757455268e-07, + "loss": 0.6829, + "step": 131 + }, + { + "epoch": 0.02, + "grad_norm": 5.403280107465562, + "learning_rate": 2.624254473161034e-07, + "loss": 0.6947, + "step": 132 + }, + { + "epoch": 0.02, + "grad_norm": 10.340734408965702, + "learning_rate": 2.644135188866799e-07, + "loss": 0.7096, + "step": 133 + }, + { + "epoch": 0.02, + "grad_norm": 13.588529061202205, + "learning_rate": 2.6640159045725644e-07, + "loss": 0.694, + "step": 134 + }, + { + "epoch": 0.02, + "grad_norm": 9.383856981558846, + "learning_rate": 2.68389662027833e-07, + "loss": 0.6836, + "step": 135 + }, + { + "epoch": 0.02, + "grad_norm": 8.93930587340735, + "learning_rate": 2.7037773359840956e-07, + "loss": 0.6979, + "step": 136 + }, + { + "epoch": 0.02, + "grad_norm": 8.353156560726323, + "learning_rate": 2.7236580516898606e-07, + "loss": 0.6882, + "step": 137 + }, + { + "epoch": 0.02, + "grad_norm": 8.407364884013392, + "learning_rate": 2.743538767395626e-07, + "loss": 0.6836, + "step": 138 + }, + { + "epoch": 0.02, + "grad_norm": 13.067924299107743, + "learning_rate": 2.763419483101391e-07, + "loss": 0.7064, + "step": 139 + }, + { + "epoch": 0.02, + "grad_norm": 15.248695636900457, + "learning_rate": 2.783300198807157e-07, + "loss": 0.6908, + "step": 140 + }, + { + "epoch": 0.02, + "grad_norm": 15.017377545311174, + "learning_rate": 2.8031809145129224e-07, + "loss": 0.7122, + "step": 141 + }, + { + "epoch": 0.02, + "grad_norm": 10.357652234362792, + "learning_rate": 2.8230616302186875e-07, + "loss": 0.6947, + "step": 142 + }, + { + "epoch": 0.02, + "grad_norm": 9.57942372839956, + "learning_rate": 2.8429423459244536e-07, + "loss": 0.7083, + "step": 143 + }, + { + "epoch": 0.02, + "grad_norm": 12.922456186225475, + "learning_rate": 2.8628230616302186e-07, + "loss": 0.7194, + "step": 144 + }, + { + "epoch": 0.02, + "grad_norm": 14.473115631800136, + "learning_rate": 2.882703777335984e-07, + "loss": 0.7148, + "step": 145 + }, + { + "epoch": 0.02, + "grad_norm": 11.331664942924284, + "learning_rate": 2.902584493041749e-07, + "loss": 0.7109, + "step": 146 + }, + { + "epoch": 0.02, + "grad_norm": 12.673539635049096, + "learning_rate": 2.922465208747515e-07, + "loss": 0.7038, + "step": 147 + }, + { + "epoch": 0.02, + "grad_norm": 9.04107337352101, + "learning_rate": 2.94234592445328e-07, + "loss": 0.6901, + "step": 148 + }, + { + "epoch": 0.02, + "grad_norm": 11.255638228404129, + "learning_rate": 2.9622266401590454e-07, + "loss": 0.6973, + "step": 149 + }, + { + "epoch": 0.02, + "grad_norm": 10.758485345818503, + "learning_rate": 2.982107355864811e-07, + "loss": 0.7148, + "step": 150 + }, + { + "epoch": 0.02, + "grad_norm": 12.832774466364654, + "learning_rate": 3.0019880715705766e-07, + "loss": 0.7018, + "step": 151 + }, + { + "epoch": 0.02, + "grad_norm": 7.936029466771296, + "learning_rate": 3.0218687872763416e-07, + "loss": 0.7051, + "step": 152 + }, + { + "epoch": 0.02, + "grad_norm": 16.11735546998571, + "learning_rate": 3.041749502982107e-07, + "loss": 0.7129, + "step": 153 + }, + { + "epoch": 0.02, + "grad_norm": 9.934445035623654, + "learning_rate": 3.061630218687873e-07, + "loss": 0.6927, + "step": 154 + }, + { + "epoch": 0.02, + "grad_norm": 9.860628341537415, + "learning_rate": 3.081510934393638e-07, + "loss": 0.709, + "step": 155 + }, + { + "epoch": 0.02, + "grad_norm": 20.776979428644395, + "learning_rate": 3.1013916500994034e-07, + "loss": 0.6973, + "step": 156 + }, + { + "epoch": 0.02, + "grad_norm": 6.060839235696866, + "learning_rate": 3.121272365805169e-07, + "loss": 0.6875, + "step": 157 + }, + { + "epoch": 0.02, + "grad_norm": 9.016636156484555, + "learning_rate": 3.1411530815109346e-07, + "loss": 0.7005, + "step": 158 + }, + { + "epoch": 0.02, + "grad_norm": 10.664366811602976, + "learning_rate": 3.1610337972166996e-07, + "loss": 0.6953, + "step": 159 + }, + { + "epoch": 0.02, + "grad_norm": 9.97073369438387, + "learning_rate": 3.180914512922465e-07, + "loss": 0.6986, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 9.626382143390192, + "learning_rate": 3.20079522862823e-07, + "loss": 0.7038, + "step": 161 + }, + { + "epoch": 0.02, + "grad_norm": 10.069823255980422, + "learning_rate": 3.220675944333996e-07, + "loss": 0.6973, + "step": 162 + }, + { + "epoch": 0.02, + "grad_norm": 13.293485849087814, + "learning_rate": 3.240556660039761e-07, + "loss": 0.7051, + "step": 163 + }, + { + "epoch": 0.02, + "grad_norm": 11.194341501675366, + "learning_rate": 3.2604373757455264e-07, + "loss": 0.709, + "step": 164 + }, + { + "epoch": 0.02, + "grad_norm": 13.760642254352529, + "learning_rate": 3.2803180914512925e-07, + "loss": 0.6908, + "step": 165 + }, + { + "epoch": 0.02, + "grad_norm": 6.718925011211521, + "learning_rate": 3.3001988071570576e-07, + "loss": 0.6888, + "step": 166 + }, + { + "epoch": 0.02, + "grad_norm": 6.51965340046931, + "learning_rate": 3.320079522862823e-07, + "loss": 0.709, + "step": 167 + }, + { + "epoch": 0.03, + "grad_norm": 10.370216470811773, + "learning_rate": 3.339960238568588e-07, + "loss": 0.6999, + "step": 168 + }, + { + "epoch": 0.03, + "grad_norm": 9.641459829903065, + "learning_rate": 3.359840954274354e-07, + "loss": 0.7005, + "step": 169 + }, + { + "epoch": 0.03, + "grad_norm": 12.227145148657694, + "learning_rate": 3.379721669980119e-07, + "loss": 0.6986, + "step": 170 + }, + { + "epoch": 0.03, + "grad_norm": 10.030945147939798, + "learning_rate": 3.3996023856858844e-07, + "loss": 0.7083, + "step": 171 + }, + { + "epoch": 0.03, + "grad_norm": 6.754347731302461, + "learning_rate": 3.41948310139165e-07, + "loss": 0.6973, + "step": 172 + }, + { + "epoch": 0.03, + "grad_norm": 8.306275307575275, + "learning_rate": 3.4393638170974155e-07, + "loss": 0.7148, + "step": 173 + }, + { + "epoch": 0.03, + "grad_norm": 8.938887954973122, + "learning_rate": 3.4592445328031806e-07, + "loss": 0.694, + "step": 174 + }, + { + "epoch": 0.03, + "grad_norm": 11.033173801266212, + "learning_rate": 3.479125248508946e-07, + "loss": 0.6921, + "step": 175 + }, + { + "epoch": 0.03, + "grad_norm": 10.15575759084492, + "learning_rate": 3.499005964214712e-07, + "loss": 0.7038, + "step": 176 + }, + { + "epoch": 0.03, + "grad_norm": 8.081121940232887, + "learning_rate": 3.518886679920477e-07, + "loss": 0.7012, + "step": 177 + }, + { + "epoch": 0.03, + "grad_norm": 6.144905665693727, + "learning_rate": 3.5387673956262424e-07, + "loss": 0.6895, + "step": 178 + }, + { + "epoch": 0.03, + "grad_norm": 9.840880249842803, + "learning_rate": 3.558648111332008e-07, + "loss": 0.7038, + "step": 179 + }, + { + "epoch": 0.03, + "grad_norm": 12.656604381349382, + "learning_rate": 3.5785288270377735e-07, + "loss": 0.7057, + "step": 180 + }, + { + "epoch": 0.03, + "grad_norm": 9.173394848571524, + "learning_rate": 3.5984095427435386e-07, + "loss": 0.6979, + "step": 181 + }, + { + "epoch": 0.03, + "grad_norm": 8.665419354119585, + "learning_rate": 3.618290258449304e-07, + "loss": 0.6953, + "step": 182 + }, + { + "epoch": 0.03, + "grad_norm": 8.25288637593796, + "learning_rate": 3.638170974155069e-07, + "loss": 0.6855, + "step": 183 + }, + { + "epoch": 0.03, + "grad_norm": 19.802545830999897, + "learning_rate": 3.658051689860835e-07, + "loss": 0.7044, + "step": 184 + }, + { + "epoch": 0.03, + "grad_norm": 12.949341678898739, + "learning_rate": 3.6779324055666e-07, + "loss": 0.6927, + "step": 185 + }, + { + "epoch": 0.03, + "grad_norm": 8.233998150714905, + "learning_rate": 3.697813121272366e-07, + "loss": 0.6979, + "step": 186 + }, + { + "epoch": 0.03, + "grad_norm": 7.981150297023035, + "learning_rate": 3.717693836978131e-07, + "loss": 0.6927, + "step": 187 + }, + { + "epoch": 0.03, + "grad_norm": 8.258410302466865, + "learning_rate": 3.7375745526838965e-07, + "loss": 0.6986, + "step": 188 + }, + { + "epoch": 0.03, + "grad_norm": 5.97096519852186, + "learning_rate": 3.757455268389662e-07, + "loss": 0.6986, + "step": 189 + }, + { + "epoch": 0.03, + "grad_norm": 9.662015794529262, + "learning_rate": 3.777335984095427e-07, + "loss": 0.6966, + "step": 190 + }, + { + "epoch": 0.03, + "grad_norm": 6.101682221907611, + "learning_rate": 3.797216699801193e-07, + "loss": 0.7005, + "step": 191 + }, + { + "epoch": 0.03, + "grad_norm": 13.063709314217139, + "learning_rate": 3.817097415506958e-07, + "loss": 0.7018, + "step": 192 + }, + { + "epoch": 0.03, + "grad_norm": 11.267474946684223, + "learning_rate": 3.8369781312127234e-07, + "loss": 0.6973, + "step": 193 + }, + { + "epoch": 0.03, + "grad_norm": 11.724586513910879, + "learning_rate": 3.856858846918489e-07, + "loss": 0.7064, + "step": 194 + }, + { + "epoch": 0.03, + "grad_norm": 7.577509707164589, + "learning_rate": 3.8767395626242545e-07, + "loss": 0.7005, + "step": 195 + }, + { + "epoch": 0.03, + "grad_norm": 13.262078697888994, + "learning_rate": 3.8966202783300196e-07, + "loss": 0.6979, + "step": 196 + }, + { + "epoch": 0.03, + "grad_norm": 9.432761043712745, + "learning_rate": 3.916500994035785e-07, + "loss": 0.6921, + "step": 197 + }, + { + "epoch": 0.03, + "grad_norm": 8.170122449542886, + "learning_rate": 3.93638170974155e-07, + "loss": 0.6901, + "step": 198 + }, + { + "epoch": 0.03, + "grad_norm": 11.92221347737594, + "learning_rate": 3.956262425447316e-07, + "loss": 0.696, + "step": 199 + }, + { + "epoch": 0.03, + "grad_norm": 8.47046784375388, + "learning_rate": 3.9761431411530813e-07, + "loss": 0.6888, + "step": 200 + }, + { + "epoch": 0.03, + "grad_norm": 11.16719653926047, + "learning_rate": 3.996023856858847e-07, + "loss": 0.6784, + "step": 201 + }, + { + "epoch": 0.03, + "grad_norm": 10.225514652092729, + "learning_rate": 4.0159045725646125e-07, + "loss": 0.6901, + "step": 202 + }, + { + "epoch": 0.03, + "grad_norm": 6.837374911018087, + "learning_rate": 4.0357852882703775e-07, + "loss": 0.6751, + "step": 203 + }, + { + "epoch": 0.03, + "grad_norm": 15.297190274257355, + "learning_rate": 4.055666003976143e-07, + "loss": 0.6849, + "step": 204 + }, + { + "epoch": 0.03, + "grad_norm": 15.255966378500446, + "learning_rate": 4.075546719681908e-07, + "loss": 0.7025, + "step": 205 + }, + { + "epoch": 0.03, + "grad_norm": 6.611746354866194, + "learning_rate": 4.095427435387674e-07, + "loss": 0.6953, + "step": 206 + }, + { + "epoch": 0.03, + "grad_norm": 6.829940443601568, + "learning_rate": 4.115308151093439e-07, + "loss": 0.696, + "step": 207 + }, + { + "epoch": 0.03, + "grad_norm": 18.840135382636408, + "learning_rate": 4.135188866799205e-07, + "loss": 0.707, + "step": 208 + }, + { + "epoch": 0.03, + "grad_norm": 12.499216410916286, + "learning_rate": 4.15506958250497e-07, + "loss": 0.6986, + "step": 209 + }, + { + "epoch": 0.03, + "grad_norm": 14.801073041781994, + "learning_rate": 4.1749502982107355e-07, + "loss": 0.7103, + "step": 210 + }, + { + "epoch": 0.03, + "grad_norm": 8.988704484186014, + "learning_rate": 4.194831013916501e-07, + "loss": 0.707, + "step": 211 + }, + { + "epoch": 0.03, + "grad_norm": 5.907977393176701, + "learning_rate": 4.214711729622266e-07, + "loss": 0.6829, + "step": 212 + }, + { + "epoch": 0.03, + "grad_norm": 8.375759500490327, + "learning_rate": 4.2345924453280317e-07, + "loss": 0.7031, + "step": 213 + }, + { + "epoch": 0.03, + "grad_norm": 7.946511169671699, + "learning_rate": 4.254473161033797e-07, + "loss": 0.7044, + "step": 214 + }, + { + "epoch": 0.03, + "grad_norm": 12.998716399542566, + "learning_rate": 4.274353876739563e-07, + "loss": 0.6947, + "step": 215 + }, + { + "epoch": 0.03, + "grad_norm": 6.959329216082167, + "learning_rate": 4.294234592445328e-07, + "loss": 0.6979, + "step": 216 + }, + { + "epoch": 0.03, + "grad_norm": 12.042285846305406, + "learning_rate": 4.3141153081510935e-07, + "loss": 0.7161, + "step": 217 + }, + { + "epoch": 0.03, + "grad_norm": 5.662373378121598, + "learning_rate": 4.3339960238568585e-07, + "loss": 0.6973, + "step": 218 + }, + { + "epoch": 0.03, + "grad_norm": 15.55518016185341, + "learning_rate": 4.353876739562624e-07, + "loss": 0.6875, + "step": 219 + }, + { + "epoch": 0.03, + "grad_norm": 8.234713264660966, + "learning_rate": 4.373757455268389e-07, + "loss": 0.7142, + "step": 220 + }, + { + "epoch": 0.03, + "grad_norm": 16.953434684821925, + "learning_rate": 4.3936381709741547e-07, + "loss": 0.7122, + "step": 221 + }, + { + "epoch": 0.03, + "grad_norm": 9.956100526957465, + "learning_rate": 4.41351888667992e-07, + "loss": 0.7018, + "step": 222 + }, + { + "epoch": 0.03, + "grad_norm": 6.7557209006064145, + "learning_rate": 4.433399602385686e-07, + "loss": 0.6953, + "step": 223 + }, + { + "epoch": 0.03, + "grad_norm": 11.816441376599446, + "learning_rate": 4.4532803180914515e-07, + "loss": 0.6992, + "step": 224 + }, + { + "epoch": 0.03, + "grad_norm": 15.419234313521548, + "learning_rate": 4.4731610337972165e-07, + "loss": 0.7031, + "step": 225 + }, + { + "epoch": 0.03, + "grad_norm": 17.944268927972523, + "learning_rate": 4.493041749502982e-07, + "loss": 0.6914, + "step": 226 + }, + { + "epoch": 0.03, + "grad_norm": 11.803096837220538, + "learning_rate": 4.512922465208747e-07, + "loss": 0.6999, + "step": 227 + }, + { + "epoch": 0.03, + "grad_norm": 5.815172575766862, + "learning_rate": 4.5328031809145127e-07, + "loss": 0.7038, + "step": 228 + }, + { + "epoch": 0.03, + "grad_norm": 14.891398203553509, + "learning_rate": 4.552683896620278e-07, + "loss": 0.696, + "step": 229 + }, + { + "epoch": 0.03, + "grad_norm": 8.826592139031577, + "learning_rate": 4.572564612326044e-07, + "loss": 0.6966, + "step": 230 + }, + { + "epoch": 0.03, + "grad_norm": 13.151296064025711, + "learning_rate": 4.592445328031809e-07, + "loss": 0.6992, + "step": 231 + }, + { + "epoch": 0.03, + "grad_norm": 5.599015253621187, + "learning_rate": 4.6123260437375745e-07, + "loss": 0.6979, + "step": 232 + }, + { + "epoch": 0.03, + "grad_norm": 12.293974973231238, + "learning_rate": 4.6322067594433395e-07, + "loss": 0.6921, + "step": 233 + }, + { + "epoch": 0.03, + "grad_norm": 6.225342863274202, + "learning_rate": 4.652087475149105e-07, + "loss": 0.6914, + "step": 234 + }, + { + "epoch": 0.04, + "grad_norm": 9.056641346363579, + "learning_rate": 4.6719681908548707e-07, + "loss": 0.7012, + "step": 235 + }, + { + "epoch": 0.04, + "grad_norm": 9.536569729443992, + "learning_rate": 4.6918489065606357e-07, + "loss": 0.6999, + "step": 236 + }, + { + "epoch": 0.04, + "grad_norm": 11.35370236589335, + "learning_rate": 4.711729622266402e-07, + "loss": 0.6979, + "step": 237 + }, + { + "epoch": 0.04, + "grad_norm": 11.52220772969608, + "learning_rate": 4.731610337972167e-07, + "loss": 0.6914, + "step": 238 + }, + { + "epoch": 0.04, + "grad_norm": 8.076343622547835, + "learning_rate": 4.7514910536779325e-07, + "loss": 0.6914, + "step": 239 + }, + { + "epoch": 0.04, + "grad_norm": 8.554060849744815, + "learning_rate": 4.771371769383697e-07, + "loss": 0.6979, + "step": 240 + }, + { + "epoch": 0.04, + "grad_norm": 11.480350807203205, + "learning_rate": 4.791252485089463e-07, + "loss": 0.6927, + "step": 241 + }, + { + "epoch": 0.04, + "grad_norm": 15.342242299633007, + "learning_rate": 4.811133200795228e-07, + "loss": 0.7012, + "step": 242 + }, + { + "epoch": 0.04, + "grad_norm": 9.664015945809206, + "learning_rate": 4.831013916500994e-07, + "loss": 0.6882, + "step": 243 + }, + { + "epoch": 0.04, + "grad_norm": 7.958828991977214, + "learning_rate": 4.850894632206759e-07, + "loss": 0.7012, + "step": 244 + }, + { + "epoch": 0.04, + "grad_norm": 12.77870046224619, + "learning_rate": 4.870775347912524e-07, + "loss": 0.6947, + "step": 245 + }, + { + "epoch": 0.04, + "grad_norm": 18.493526176559502, + "learning_rate": 4.89065606361829e-07, + "loss": 0.7051, + "step": 246 + }, + { + "epoch": 0.04, + "grad_norm": 9.502814758093153, + "learning_rate": 4.910536779324055e-07, + "loss": 0.6777, + "step": 247 + }, + { + "epoch": 0.04, + "grad_norm": 6.431062177489339, + "learning_rate": 4.930417495029822e-07, + "loss": 0.7214, + "step": 248 + }, + { + "epoch": 0.04, + "grad_norm": 12.080436747086564, + "learning_rate": 4.950298210735587e-07, + "loss": 0.696, + "step": 249 + }, + { + "epoch": 0.04, + "grad_norm": 13.34637214528923, + "learning_rate": 4.970178926441352e-07, + "loss": 0.6849, + "step": 250 + }, + { + "epoch": 0.04, + "grad_norm": 8.074846397954222, + "learning_rate": 4.990059642147117e-07, + "loss": 0.7025, + "step": 251 + }, + { + "epoch": 0.04, + "grad_norm": 9.661703632519153, + "learning_rate": 5.009940357852882e-07, + "loss": 0.6908, + "step": 252 + }, + { + "epoch": 0.04, + "grad_norm": 11.772694233410913, + "learning_rate": 5.029821073558648e-07, + "loss": 0.707, + "step": 253 + }, + { + "epoch": 0.04, + "grad_norm": 6.5479659506665975, + "learning_rate": 5.049701789264413e-07, + "loss": 0.6855, + "step": 254 + }, + { + "epoch": 0.04, + "grad_norm": 10.003122075923246, + "learning_rate": 5.069582504970179e-07, + "loss": 0.694, + "step": 255 + }, + { + "epoch": 0.04, + "grad_norm": 13.28882510341652, + "learning_rate": 5.089463220675944e-07, + "loss": 0.6901, + "step": 256 + }, + { + "epoch": 0.04, + "grad_norm": 11.135345888142215, + "learning_rate": 5.109343936381709e-07, + "loss": 0.7025, + "step": 257 + }, + { + "epoch": 0.04, + "grad_norm": 8.775941597901639, + "learning_rate": 5.129224652087475e-07, + "loss": 0.696, + "step": 258 + }, + { + "epoch": 0.04, + "grad_norm": 6.338576180857166, + "learning_rate": 5.14910536779324e-07, + "loss": 0.7038, + "step": 259 + }, + { + "epoch": 0.04, + "grad_norm": 7.42640118414473, + "learning_rate": 5.168986083499006e-07, + "loss": 0.7018, + "step": 260 + }, + { + "epoch": 0.04, + "grad_norm": 6.873366258012261, + "learning_rate": 5.18886679920477e-07, + "loss": 0.6979, + "step": 261 + }, + { + "epoch": 0.04, + "grad_norm": 15.039274197150805, + "learning_rate": 5.208747514910536e-07, + "loss": 0.7025, + "step": 262 + }, + { + "epoch": 0.04, + "grad_norm": 4.518861953464397, + "learning_rate": 5.228628230616303e-07, + "loss": 0.6855, + "step": 263 + }, + { + "epoch": 0.04, + "grad_norm": 10.259611086979236, + "learning_rate": 5.248508946322068e-07, + "loss": 0.7096, + "step": 264 + }, + { + "epoch": 0.04, + "grad_norm": 10.92115540095605, + "learning_rate": 5.268389662027833e-07, + "loss": 0.6803, + "step": 265 + }, + { + "epoch": 0.04, + "grad_norm": 8.794876894489468, + "learning_rate": 5.288270377733598e-07, + "loss": 0.6947, + "step": 266 + }, + { + "epoch": 0.04, + "grad_norm": 5.474805137418248, + "learning_rate": 5.308151093439364e-07, + "loss": 0.6914, + "step": 267 + }, + { + "epoch": 0.04, + "grad_norm": 12.691991274029654, + "learning_rate": 5.328031809145129e-07, + "loss": 0.7018, + "step": 268 + }, + { + "epoch": 0.04, + "grad_norm": 12.681309089813006, + "learning_rate": 5.347912524850894e-07, + "loss": 0.6914, + "step": 269 + }, + { + "epoch": 0.04, + "grad_norm": 7.488518547907686, + "learning_rate": 5.36779324055666e-07, + "loss": 0.7116, + "step": 270 + }, + { + "epoch": 0.04, + "grad_norm": 11.375410364517245, + "learning_rate": 5.387673956262425e-07, + "loss": 0.6914, + "step": 271 + }, + { + "epoch": 0.04, + "grad_norm": 10.790093212642846, + "learning_rate": 5.407554671968191e-07, + "loss": 0.7005, + "step": 272 + }, + { + "epoch": 0.04, + "grad_norm": 8.5063460069214, + "learning_rate": 5.427435387673956e-07, + "loss": 0.681, + "step": 273 + }, + { + "epoch": 0.04, + "grad_norm": 6.451475537748225, + "learning_rate": 5.447316103379721e-07, + "loss": 0.7025, + "step": 274 + }, + { + "epoch": 0.04, + "grad_norm": 9.859379857762386, + "learning_rate": 5.467196819085486e-07, + "loss": 0.7077, + "step": 275 + }, + { + "epoch": 0.04, + "grad_norm": 7.711859151555616, + "learning_rate": 5.487077534791252e-07, + "loss": 0.694, + "step": 276 + }, + { + "epoch": 0.04, + "grad_norm": 10.964127175962684, + "learning_rate": 5.506958250497019e-07, + "loss": 0.7064, + "step": 277 + }, + { + "epoch": 0.04, + "grad_norm": 9.224217021485792, + "learning_rate": 5.526838966202783e-07, + "loss": 0.6934, + "step": 278 + }, + { + "epoch": 0.04, + "grad_norm": 8.560728741945537, + "learning_rate": 5.546719681908549e-07, + "loss": 0.6836, + "step": 279 + }, + { + "epoch": 0.04, + "grad_norm": 4.353656930563279, + "learning_rate": 5.566600397614314e-07, + "loss": 0.6947, + "step": 280 + }, + { + "epoch": 0.04, + "grad_norm": 7.661472247020613, + "learning_rate": 5.58648111332008e-07, + "loss": 0.7083, + "step": 281 + }, + { + "epoch": 0.04, + "grad_norm": 7.149483876298925, + "learning_rate": 5.606361829025845e-07, + "loss": 0.6921, + "step": 282 + }, + { + "epoch": 0.04, + "grad_norm": 12.139296603661725, + "learning_rate": 5.62624254473161e-07, + "loss": 0.6842, + "step": 283 + }, + { + "epoch": 0.04, + "grad_norm": 12.739375060295059, + "learning_rate": 5.646123260437375e-07, + "loss": 0.6901, + "step": 284 + }, + { + "epoch": 0.04, + "grad_norm": 3.895838255805702, + "learning_rate": 5.666003976143141e-07, + "loss": 0.6934, + "step": 285 + }, + { + "epoch": 0.04, + "grad_norm": 4.899371361015414, + "learning_rate": 5.685884691848907e-07, + "loss": 0.6921, + "step": 286 + }, + { + "epoch": 0.04, + "grad_norm": 8.847583962669692, + "learning_rate": 5.705765407554671e-07, + "loss": 0.6829, + "step": 287 + }, + { + "epoch": 0.04, + "grad_norm": 6.451751245363089, + "learning_rate": 5.725646123260437e-07, + "loss": 0.6914, + "step": 288 + }, + { + "epoch": 0.04, + "grad_norm": 7.221409177869833, + "learning_rate": 5.745526838966202e-07, + "loss": 0.7064, + "step": 289 + }, + { + "epoch": 0.04, + "grad_norm": 12.123138139895522, + "learning_rate": 5.765407554671968e-07, + "loss": 0.7077, + "step": 290 + }, + { + "epoch": 0.04, + "grad_norm": 7.3840821201685936, + "learning_rate": 5.785288270377732e-07, + "loss": 0.7012, + "step": 291 + }, + { + "epoch": 0.04, + "grad_norm": 19.41883436126114, + "learning_rate": 5.805168986083498e-07, + "loss": 0.7207, + "step": 292 + }, + { + "epoch": 0.04, + "grad_norm": 5.93971694207146, + "learning_rate": 5.825049701789265e-07, + "loss": 0.7051, + "step": 293 + }, + { + "epoch": 0.04, + "grad_norm": 7.307009090413998, + "learning_rate": 5.84493041749503e-07, + "loss": 0.7005, + "step": 294 + }, + { + "epoch": 0.04, + "grad_norm": 21.045378058309787, + "learning_rate": 5.864811133200796e-07, + "loss": 0.7012, + "step": 295 + }, + { + "epoch": 0.04, + "grad_norm": 7.290509368371086, + "learning_rate": 5.88469184890656e-07, + "loss": 0.7005, + "step": 296 + }, + { + "epoch": 0.04, + "grad_norm": 8.937779129744793, + "learning_rate": 5.904572564612326e-07, + "loss": 0.6927, + "step": 297 + }, + { + "epoch": 0.04, + "grad_norm": 4.502500035460147, + "learning_rate": 5.924453280318091e-07, + "loss": 0.6914, + "step": 298 + }, + { + "epoch": 0.04, + "grad_norm": 4.403420813579434, + "learning_rate": 5.944333996023857e-07, + "loss": 0.7018, + "step": 299 + }, + { + "epoch": 0.04, + "grad_norm": 4.23040202536099, + "learning_rate": 5.964214711729622e-07, + "loss": 0.6927, + "step": 300 + }, + { + "epoch": 0.04, + "grad_norm": 9.527493376871403, + "learning_rate": 5.984095427435387e-07, + "loss": 0.6836, + "step": 301 + }, + { + "epoch": 0.05, + "grad_norm": 8.040441324881188, + "learning_rate": 6.003976143141153e-07, + "loss": 0.6914, + "step": 302 + }, + { + "epoch": 0.05, + "grad_norm": 9.559476574674447, + "learning_rate": 6.023856858846918e-07, + "loss": 0.6888, + "step": 303 + }, + { + "epoch": 0.05, + "grad_norm": 19.497114163905213, + "learning_rate": 6.043737574552683e-07, + "loss": 0.6953, + "step": 304 + }, + { + "epoch": 0.05, + "grad_norm": 16.788947607189833, + "learning_rate": 6.063618290258448e-07, + "loss": 0.6803, + "step": 305 + }, + { + "epoch": 0.05, + "grad_norm": 12.970019694739763, + "learning_rate": 6.083499005964214e-07, + "loss": 0.6797, + "step": 306 + }, + { + "epoch": 0.05, + "grad_norm": 4.031853208461742, + "learning_rate": 6.10337972166998e-07, + "loss": 0.7012, + "step": 307 + }, + { + "epoch": 0.05, + "grad_norm": 19.668592005887024, + "learning_rate": 6.123260437375746e-07, + "loss": 0.7194, + "step": 308 + }, + { + "epoch": 0.05, + "grad_norm": 10.242629683158277, + "learning_rate": 6.143141153081511e-07, + "loss": 0.681, + "step": 309 + }, + { + "epoch": 0.05, + "grad_norm": 8.958835029968174, + "learning_rate": 6.163021868787276e-07, + "loss": 0.6999, + "step": 310 + }, + { + "epoch": 0.05, + "grad_norm": 8.15745856648311, + "learning_rate": 6.182902584493042e-07, + "loss": 0.7012, + "step": 311 + }, + { + "epoch": 0.05, + "grad_norm": 6.7483854137757, + "learning_rate": 6.202783300198807e-07, + "loss": 0.7064, + "step": 312 + }, + { + "epoch": 0.05, + "grad_norm": 6.3213437620023, + "learning_rate": 6.222664015904572e-07, + "loss": 0.6797, + "step": 313 + }, + { + "epoch": 0.05, + "grad_norm": 6.89129801222546, + "learning_rate": 6.242544731610338e-07, + "loss": 0.7161, + "step": 314 + }, + { + "epoch": 0.05, + "grad_norm": 11.487854901671309, + "learning_rate": 6.262425447316103e-07, + "loss": 0.681, + "step": 315 + }, + { + "epoch": 0.05, + "grad_norm": 11.631634579184208, + "learning_rate": 6.282306163021869e-07, + "loss": 0.6829, + "step": 316 + }, + { + "epoch": 0.05, + "grad_norm": 9.479589784559503, + "learning_rate": 6.302186878727634e-07, + "loss": 0.7103, + "step": 317 + }, + { + "epoch": 0.05, + "grad_norm": 23.88680818714399, + "learning_rate": 6.322067594433399e-07, + "loss": 0.7331, + "step": 318 + }, + { + "epoch": 0.05, + "grad_norm": 10.16012988872503, + "learning_rate": 6.341948310139164e-07, + "loss": 0.6986, + "step": 319 + }, + { + "epoch": 0.05, + "grad_norm": 6.506304447319892, + "learning_rate": 6.36182902584493e-07, + "loss": 0.6953, + "step": 320 + }, + { + "epoch": 0.05, + "grad_norm": 13.117948308429597, + "learning_rate": 6.381709741550696e-07, + "loss": 0.6882, + "step": 321 + }, + { + "epoch": 0.05, + "grad_norm": 11.869821176900725, + "learning_rate": 6.40159045725646e-07, + "loss": 0.6862, + "step": 322 + }, + { + "epoch": 0.05, + "grad_norm": 9.270755309687003, + "learning_rate": 6.421471172962227e-07, + "loss": 0.6973, + "step": 323 + }, + { + "epoch": 0.05, + "grad_norm": 23.256454017634205, + "learning_rate": 6.441351888667992e-07, + "loss": 0.7109, + "step": 324 + }, + { + "epoch": 0.05, + "grad_norm": 8.2785209851603, + "learning_rate": 6.461232604373758e-07, + "loss": 0.6882, + "step": 325 + }, + { + "epoch": 0.05, + "grad_norm": 13.652188222633303, + "learning_rate": 6.481113320079522e-07, + "loss": 0.7168, + "step": 326 + }, + { + "epoch": 0.05, + "grad_norm": 19.2935449198507, + "learning_rate": 6.500994035785288e-07, + "loss": 0.6999, + "step": 327 + }, + { + "epoch": 0.05, + "grad_norm": 12.584956941540586, + "learning_rate": 6.520874751491053e-07, + "loss": 0.7031, + "step": 328 + }, + { + "epoch": 0.05, + "grad_norm": 8.32122696980023, + "learning_rate": 6.540755467196819e-07, + "loss": 0.7109, + "step": 329 + }, + { + "epoch": 0.05, + "grad_norm": 12.228668761961366, + "learning_rate": 6.560636182902585e-07, + "loss": 0.6855, + "step": 330 + }, + { + "epoch": 0.05, + "grad_norm": 6.269165490326464, + "learning_rate": 6.580516898608349e-07, + "loss": 0.6979, + "step": 331 + }, + { + "epoch": 0.05, + "grad_norm": 8.469514141986535, + "learning_rate": 6.600397614314115e-07, + "loss": 0.6908, + "step": 332 + }, + { + "epoch": 0.05, + "grad_norm": 15.648791090569956, + "learning_rate": 6.62027833001988e-07, + "loss": 0.7129, + "step": 333 + }, + { + "epoch": 0.05, + "grad_norm": 11.435671443202395, + "learning_rate": 6.640159045725646e-07, + "loss": 0.7077, + "step": 334 + }, + { + "epoch": 0.05, + "grad_norm": 12.02730451898599, + "learning_rate": 6.66003976143141e-07, + "loss": 0.7057, + "step": 335 + }, + { + "epoch": 0.05, + "grad_norm": 6.411932022858572, + "learning_rate": 6.679920477137176e-07, + "loss": 0.6966, + "step": 336 + }, + { + "epoch": 0.05, + "grad_norm": 11.456042587914371, + "learning_rate": 6.699801192842943e-07, + "loss": 0.7012, + "step": 337 + }, + { + "epoch": 0.05, + "grad_norm": 27.09701742298638, + "learning_rate": 6.719681908548708e-07, + "loss": 0.7279, + "step": 338 + }, + { + "epoch": 0.05, + "grad_norm": 12.255417844602752, + "learning_rate": 6.739562624254473e-07, + "loss": 0.6992, + "step": 339 + }, + { + "epoch": 0.05, + "grad_norm": 7.715787721672236, + "learning_rate": 6.759443339960238e-07, + "loss": 0.7005, + "step": 340 + }, + { + "epoch": 0.05, + "grad_norm": 10.663202418344564, + "learning_rate": 6.779324055666004e-07, + "loss": 0.679, + "step": 341 + }, + { + "epoch": 0.05, + "grad_norm": 6.518507768649536, + "learning_rate": 6.799204771371769e-07, + "loss": 0.7018, + "step": 342 + }, + { + "epoch": 0.05, + "grad_norm": 15.246755644395869, + "learning_rate": 6.819085487077535e-07, + "loss": 0.7148, + "step": 343 + }, + { + "epoch": 0.05, + "grad_norm": 12.423848041665416, + "learning_rate": 6.8389662027833e-07, + "loss": 0.7031, + "step": 344 + }, + { + "epoch": 0.05, + "grad_norm": 6.81795434462883, + "learning_rate": 6.858846918489065e-07, + "loss": 0.694, + "step": 345 + }, + { + "epoch": 0.05, + "grad_norm": 19.803204587071182, + "learning_rate": 6.878727634194831e-07, + "loss": 0.7174, + "step": 346 + }, + { + "epoch": 0.05, + "grad_norm": 13.044685463189259, + "learning_rate": 6.898608349900596e-07, + "loss": 0.707, + "step": 347 + }, + { + "epoch": 0.05, + "grad_norm": 19.31576570191332, + "learning_rate": 6.918489065606361e-07, + "loss": 0.7357, + "step": 348 + }, + { + "epoch": 0.05, + "grad_norm": 4.5244057852237125, + "learning_rate": 6.938369781312126e-07, + "loss": 0.6842, + "step": 349 + }, + { + "epoch": 0.05, + "grad_norm": 9.672189949041798, + "learning_rate": 6.958250497017892e-07, + "loss": 0.7018, + "step": 350 + }, + { + "epoch": 0.05, + "grad_norm": 3.708731563152467, + "learning_rate": 6.978131212723658e-07, + "loss": 0.6888, + "step": 351 + }, + { + "epoch": 0.05, + "grad_norm": 9.346615596017632, + "learning_rate": 6.998011928429423e-07, + "loss": 0.6882, + "step": 352 + }, + { + "epoch": 0.05, + "grad_norm": 8.217379093401352, + "learning_rate": 7.017892644135189e-07, + "loss": 0.7044, + "step": 353 + }, + { + "epoch": 0.05, + "grad_norm": 12.22298505042355, + "learning_rate": 7.037773359840954e-07, + "loss": 0.7096, + "step": 354 + }, + { + "epoch": 0.05, + "grad_norm": 5.965405242738737, + "learning_rate": 7.05765407554672e-07, + "loss": 0.6862, + "step": 355 + }, + { + "epoch": 0.05, + "grad_norm": 8.07423428462319, + "learning_rate": 7.077534791252485e-07, + "loss": 0.6934, + "step": 356 + }, + { + "epoch": 0.05, + "grad_norm": 5.538680568175445, + "learning_rate": 7.09741550695825e-07, + "loss": 0.6934, + "step": 357 + }, + { + "epoch": 0.05, + "grad_norm": 8.447196378650604, + "learning_rate": 7.117296222664016e-07, + "loss": 0.6882, + "step": 358 + }, + { + "epoch": 0.05, + "grad_norm": 9.595457251445302, + "learning_rate": 7.137176938369781e-07, + "loss": 0.7116, + "step": 359 + }, + { + "epoch": 0.05, + "grad_norm": 12.270679139037185, + "learning_rate": 7.157057654075547e-07, + "loss": 0.6992, + "step": 360 + }, + { + "epoch": 0.05, + "grad_norm": 4.95894194849116, + "learning_rate": 7.176938369781311e-07, + "loss": 0.6934, + "step": 361 + }, + { + "epoch": 0.05, + "grad_norm": 4.580946630106774, + "learning_rate": 7.196819085487077e-07, + "loss": 0.6979, + "step": 362 + }, + { + "epoch": 0.05, + "grad_norm": 5.934452903396781, + "learning_rate": 7.216699801192842e-07, + "loss": 0.6895, + "step": 363 + }, + { + "epoch": 0.05, + "grad_norm": 9.242218230831424, + "learning_rate": 7.236580516898608e-07, + "loss": 0.6882, + "step": 364 + }, + { + "epoch": 0.05, + "grad_norm": 16.582871328306762, + "learning_rate": 7.256461232604374e-07, + "loss": 0.7057, + "step": 365 + }, + { + "epoch": 0.05, + "grad_norm": 7.420252218903687, + "learning_rate": 7.276341948310138e-07, + "loss": 0.6992, + "step": 366 + }, + { + "epoch": 0.05, + "grad_norm": 4.218207045599389, + "learning_rate": 7.296222664015904e-07, + "loss": 0.6914, + "step": 367 + }, + { + "epoch": 0.05, + "grad_norm": 11.992630320930392, + "learning_rate": 7.31610337972167e-07, + "loss": 0.6888, + "step": 368 + }, + { + "epoch": 0.06, + "grad_norm": 6.260562920680358, + "learning_rate": 7.335984095427436e-07, + "loss": 0.7064, + "step": 369 + }, + { + "epoch": 0.06, + "grad_norm": 4.62249577164887, + "learning_rate": 7.3558648111332e-07, + "loss": 0.6927, + "step": 370 + }, + { + "epoch": 0.06, + "grad_norm": 8.023191653089489, + "learning_rate": 7.375745526838966e-07, + "loss": 0.6966, + "step": 371 + }, + { + "epoch": 0.06, + "grad_norm": 4.715700792629364, + "learning_rate": 7.395626242544732e-07, + "loss": 0.6947, + "step": 372 + }, + { + "epoch": 0.06, + "grad_norm": 17.709381222170816, + "learning_rate": 7.415506958250497e-07, + "loss": 0.7096, + "step": 373 + }, + { + "epoch": 0.06, + "grad_norm": 4.9668223725696095, + "learning_rate": 7.435387673956262e-07, + "loss": 0.6914, + "step": 374 + }, + { + "epoch": 0.06, + "grad_norm": 5.429722063057382, + "learning_rate": 7.455268389662027e-07, + "loss": 0.6992, + "step": 375 + }, + { + "epoch": 0.06, + "grad_norm": 9.765781287024563, + "learning_rate": 7.475149105367793e-07, + "loss": 0.6992, + "step": 376 + }, + { + "epoch": 0.06, + "grad_norm": 16.237920315094957, + "learning_rate": 7.495029821073558e-07, + "loss": 0.6868, + "step": 377 + }, + { + "epoch": 0.06, + "grad_norm": 4.753510666237331, + "learning_rate": 7.514910536779324e-07, + "loss": 0.6953, + "step": 378 + }, + { + "epoch": 0.06, + "grad_norm": 4.819031307744193, + "learning_rate": 7.534791252485088e-07, + "loss": 0.6979, + "step": 379 + }, + { + "epoch": 0.06, + "grad_norm": 9.359119021154847, + "learning_rate": 7.554671968190854e-07, + "loss": 0.6882, + "step": 380 + }, + { + "epoch": 0.06, + "grad_norm": 4.975239810305357, + "learning_rate": 7.57455268389662e-07, + "loss": 0.6986, + "step": 381 + }, + { + "epoch": 0.06, + "grad_norm": 8.391374484385123, + "learning_rate": 7.594433399602385e-07, + "loss": 0.6882, + "step": 382 + }, + { + "epoch": 0.06, + "grad_norm": 10.18481705077234, + "learning_rate": 7.614314115308151e-07, + "loss": 0.6973, + "step": 383 + }, + { + "epoch": 0.06, + "grad_norm": 3.7108158854706517, + "learning_rate": 7.634194831013916e-07, + "loss": 0.6947, + "step": 384 + }, + { + "epoch": 0.06, + "grad_norm": 7.969229283378244, + "learning_rate": 7.654075546719682e-07, + "loss": 0.6966, + "step": 385 + }, + { + "epoch": 0.06, + "grad_norm": 5.498605957072399, + "learning_rate": 7.673956262425447e-07, + "loss": 0.6888, + "step": 386 + }, + { + "epoch": 0.06, + "grad_norm": 7.167664576577339, + "learning_rate": 7.693836978131213e-07, + "loss": 0.6875, + "step": 387 + }, + { + "epoch": 0.06, + "grad_norm": 5.648576794861765, + "learning_rate": 7.713717693836978e-07, + "loss": 0.6992, + "step": 388 + }, + { + "epoch": 0.06, + "grad_norm": 12.69657815357591, + "learning_rate": 7.733598409542743e-07, + "loss": 0.6953, + "step": 389 + }, + { + "epoch": 0.06, + "grad_norm": 6.759916539951178, + "learning_rate": 7.753479125248509e-07, + "loss": 0.7018, + "step": 390 + }, + { + "epoch": 0.06, + "grad_norm": 5.002824055520336, + "learning_rate": 7.773359840954274e-07, + "loss": 0.6921, + "step": 391 + }, + { + "epoch": 0.06, + "grad_norm": 9.27988438047741, + "learning_rate": 7.793240556660039e-07, + "loss": 0.6882, + "step": 392 + }, + { + "epoch": 0.06, + "grad_norm": 5.899708947821971, + "learning_rate": 7.813121272365804e-07, + "loss": 0.6914, + "step": 393 + }, + { + "epoch": 0.06, + "grad_norm": 10.310142559440736, + "learning_rate": 7.83300198807157e-07, + "loss": 0.6868, + "step": 394 + }, + { + "epoch": 0.06, + "grad_norm": 5.348290669805046, + "learning_rate": 7.852882703777336e-07, + "loss": 0.6914, + "step": 395 + }, + { + "epoch": 0.06, + "grad_norm": 6.074712235233003, + "learning_rate": 7.8727634194831e-07, + "loss": 0.6973, + "step": 396 + }, + { + "epoch": 0.06, + "grad_norm": 9.755669635860118, + "learning_rate": 7.892644135188866e-07, + "loss": 0.6764, + "step": 397 + }, + { + "epoch": 0.06, + "grad_norm": 22.060253342539383, + "learning_rate": 7.912524850894632e-07, + "loss": 0.7233, + "step": 398 + }, + { + "epoch": 0.06, + "grad_norm": 6.622080041868642, + "learning_rate": 7.932405566600398e-07, + "loss": 0.6908, + "step": 399 + }, + { + "epoch": 0.06, + "grad_norm": 9.01335992502021, + "learning_rate": 7.952286282306163e-07, + "loss": 0.6706, + "step": 400 + }, + { + "epoch": 0.06, + "grad_norm": 13.430039412991032, + "learning_rate": 7.972166998011928e-07, + "loss": 0.7155, + "step": 401 + }, + { + "epoch": 0.06, + "grad_norm": 5.5076534878656505, + "learning_rate": 7.992047713717694e-07, + "loss": 0.6999, + "step": 402 + }, + { + "epoch": 0.06, + "grad_norm": 4.408574469276335, + "learning_rate": 8.011928429423459e-07, + "loss": 0.6823, + "step": 403 + }, + { + "epoch": 0.06, + "grad_norm": 6.422820808627556, + "learning_rate": 8.031809145129225e-07, + "loss": 0.6921, + "step": 404 + }, + { + "epoch": 0.06, + "grad_norm": 15.449839818787293, + "learning_rate": 8.051689860834989e-07, + "loss": 0.7116, + "step": 405 + }, + { + "epoch": 0.06, + "grad_norm": 13.761614260195895, + "learning_rate": 8.071570576540755e-07, + "loss": 0.7096, + "step": 406 + }, + { + "epoch": 0.06, + "grad_norm": 7.6536118745375745, + "learning_rate": 8.09145129224652e-07, + "loss": 0.7012, + "step": 407 + }, + { + "epoch": 0.06, + "grad_norm": 19.80124304011683, + "learning_rate": 8.111332007952286e-07, + "loss": 0.6934, + "step": 408 + }, + { + "epoch": 0.06, + "grad_norm": 8.076430830711598, + "learning_rate": 8.131212723658051e-07, + "loss": 0.6921, + "step": 409 + }, + { + "epoch": 0.06, + "grad_norm": 3.5804951230610906, + "learning_rate": 8.151093439363816e-07, + "loss": 0.7018, + "step": 410 + }, + { + "epoch": 0.06, + "grad_norm": 8.663045435912005, + "learning_rate": 8.170974155069582e-07, + "loss": 0.6986, + "step": 411 + }, + { + "epoch": 0.06, + "grad_norm": 12.617753884722543, + "learning_rate": 8.190854870775347e-07, + "loss": 0.6927, + "step": 412 + }, + { + "epoch": 0.06, + "grad_norm": 5.263784393923575, + "learning_rate": 8.210735586481114e-07, + "loss": 0.694, + "step": 413 + }, + { + "epoch": 0.06, + "grad_norm": 5.087364771630666, + "learning_rate": 8.230616302186878e-07, + "loss": 0.6868, + "step": 414 + }, + { + "epoch": 0.06, + "grad_norm": 6.845926581637131, + "learning_rate": 8.250497017892644e-07, + "loss": 0.6868, + "step": 415 + }, + { + "epoch": 0.06, + "grad_norm": 5.35737299848163, + "learning_rate": 8.27037773359841e-07, + "loss": 0.6953, + "step": 416 + }, + { + "epoch": 0.06, + "grad_norm": 4.9204887768672, + "learning_rate": 8.290258449304175e-07, + "loss": 0.6888, + "step": 417 + }, + { + "epoch": 0.06, + "grad_norm": 12.01726074190478, + "learning_rate": 8.31013916500994e-07, + "loss": 0.6986, + "step": 418 + }, + { + "epoch": 0.06, + "grad_norm": 8.910697122066416, + "learning_rate": 8.330019880715705e-07, + "loss": 0.7103, + "step": 419 + }, + { + "epoch": 0.06, + "grad_norm": 5.00278808441115, + "learning_rate": 8.349900596421471e-07, + "loss": 0.6927, + "step": 420 + }, + { + "epoch": 0.06, + "grad_norm": 10.85072521664762, + "learning_rate": 8.369781312127236e-07, + "loss": 0.7044, + "step": 421 + }, + { + "epoch": 0.06, + "grad_norm": 9.291031958074681, + "learning_rate": 8.389662027833002e-07, + "loss": 0.6829, + "step": 422 + }, + { + "epoch": 0.06, + "grad_norm": 12.345945126420645, + "learning_rate": 8.409542743538766e-07, + "loss": 0.7057, + "step": 423 + }, + { + "epoch": 0.06, + "grad_norm": 4.496387549038152, + "learning_rate": 8.429423459244532e-07, + "loss": 0.6784, + "step": 424 + }, + { + "epoch": 0.06, + "grad_norm": 6.203790711474915, + "learning_rate": 8.449304174950298e-07, + "loss": 0.7103, + "step": 425 + }, + { + "epoch": 0.06, + "grad_norm": 6.173600079443536, + "learning_rate": 8.469184890656063e-07, + "loss": 0.7064, + "step": 426 + }, + { + "epoch": 0.06, + "grad_norm": 7.3966878786481844, + "learning_rate": 8.489065606361828e-07, + "loss": 0.6986, + "step": 427 + }, + { + "epoch": 0.06, + "grad_norm": 8.41491011466304, + "learning_rate": 8.508946322067594e-07, + "loss": 0.7031, + "step": 428 + }, + { + "epoch": 0.06, + "grad_norm": 25.221646466088906, + "learning_rate": 8.52882703777336e-07, + "loss": 0.7031, + "step": 429 + }, + { + "epoch": 0.06, + "grad_norm": 6.440835669617919, + "learning_rate": 8.548707753479126e-07, + "loss": 0.6882, + "step": 430 + }, + { + "epoch": 0.06, + "grad_norm": 6.410414979566638, + "learning_rate": 8.56858846918489e-07, + "loss": 0.6829, + "step": 431 + }, + { + "epoch": 0.06, + "grad_norm": 3.6541470471282707, + "learning_rate": 8.588469184890656e-07, + "loss": 0.694, + "step": 432 + }, + { + "epoch": 0.06, + "grad_norm": 6.825399853341736, + "learning_rate": 8.608349900596421e-07, + "loss": 0.7012, + "step": 433 + }, + { + "epoch": 0.06, + "grad_norm": 9.241159712497785, + "learning_rate": 8.628230616302187e-07, + "loss": 0.6895, + "step": 434 + }, + { + "epoch": 0.06, + "grad_norm": 17.64648569340244, + "learning_rate": 8.648111332007952e-07, + "loss": 0.7109, + "step": 435 + }, + { + "epoch": 0.07, + "grad_norm": 7.618834453677141, + "learning_rate": 8.667992047713717e-07, + "loss": 0.6882, + "step": 436 + }, + { + "epoch": 0.07, + "grad_norm": 3.961434646159333, + "learning_rate": 8.687872763419482e-07, + "loss": 0.6882, + "step": 437 + }, + { + "epoch": 0.07, + "grad_norm": 5.412230465469577, + "learning_rate": 8.707753479125248e-07, + "loss": 0.694, + "step": 438 + }, + { + "epoch": 0.07, + "grad_norm": 15.271707813629812, + "learning_rate": 8.727634194831014e-07, + "loss": 0.7025, + "step": 439 + }, + { + "epoch": 0.07, + "grad_norm": 3.9123434932165413, + "learning_rate": 8.747514910536778e-07, + "loss": 0.694, + "step": 440 + }, + { + "epoch": 0.07, + "grad_norm": 5.727255073248538, + "learning_rate": 8.767395626242544e-07, + "loss": 0.6829, + "step": 441 + }, + { + "epoch": 0.07, + "grad_norm": 4.663170765197002, + "learning_rate": 8.787276341948309e-07, + "loss": 0.6986, + "step": 442 + }, + { + "epoch": 0.07, + "grad_norm": 8.58580329212734, + "learning_rate": 8.807157057654076e-07, + "loss": 0.6986, + "step": 443 + }, + { + "epoch": 0.07, + "grad_norm": 5.382640569657567, + "learning_rate": 8.82703777335984e-07, + "loss": 0.7005, + "step": 444 + }, + { + "epoch": 0.07, + "grad_norm": 5.998459850477437, + "learning_rate": 8.846918489065606e-07, + "loss": 0.6966, + "step": 445 + }, + { + "epoch": 0.07, + "grad_norm": 6.196969531762864, + "learning_rate": 8.866799204771372e-07, + "loss": 0.694, + "step": 446 + }, + { + "epoch": 0.07, + "grad_norm": 7.110700316840523, + "learning_rate": 8.886679920477137e-07, + "loss": 0.6908, + "step": 447 + }, + { + "epoch": 0.07, + "grad_norm": 4.872377673505329, + "learning_rate": 8.906560636182903e-07, + "loss": 0.6979, + "step": 448 + }, + { + "epoch": 0.07, + "grad_norm": 5.792518513803613, + "learning_rate": 8.926441351888667e-07, + "loss": 0.7038, + "step": 449 + }, + { + "epoch": 0.07, + "grad_norm": 9.936113753412021, + "learning_rate": 8.946322067594433e-07, + "loss": 0.696, + "step": 450 + }, + { + "epoch": 0.07, + "grad_norm": 11.402359219039022, + "learning_rate": 8.966202783300198e-07, + "loss": 0.7018, + "step": 451 + }, + { + "epoch": 0.07, + "grad_norm": 4.250859017357887, + "learning_rate": 8.986083499005964e-07, + "loss": 0.6908, + "step": 452 + }, + { + "epoch": 0.07, + "grad_norm": 4.647497302263608, + "learning_rate": 9.005964214711729e-07, + "loss": 0.6986, + "step": 453 + }, + { + "epoch": 0.07, + "grad_norm": 14.239188208441256, + "learning_rate": 9.025844930417494e-07, + "loss": 0.7129, + "step": 454 + }, + { + "epoch": 0.07, + "grad_norm": 7.01944299677375, + "learning_rate": 9.04572564612326e-07, + "loss": 0.7025, + "step": 455 + }, + { + "epoch": 0.07, + "grad_norm": 11.47478599367114, + "learning_rate": 9.065606361829025e-07, + "loss": 0.6973, + "step": 456 + }, + { + "epoch": 0.07, + "grad_norm": 10.07670611041669, + "learning_rate": 9.085487077534792e-07, + "loss": 0.6966, + "step": 457 + }, + { + "epoch": 0.07, + "grad_norm": 8.938007873172984, + "learning_rate": 9.105367793240556e-07, + "loss": 0.6953, + "step": 458 + }, + { + "epoch": 0.07, + "grad_norm": 6.398989247042089, + "learning_rate": 9.125248508946322e-07, + "loss": 0.6882, + "step": 459 + }, + { + "epoch": 0.07, + "grad_norm": 11.234039483570864, + "learning_rate": 9.145129224652088e-07, + "loss": 0.6953, + "step": 460 + }, + { + "epoch": 0.07, + "grad_norm": 19.256819380324774, + "learning_rate": 9.165009940357853e-07, + "loss": 0.7077, + "step": 461 + }, + { + "epoch": 0.07, + "grad_norm": 5.085103493094819, + "learning_rate": 9.184890656063618e-07, + "loss": 0.6992, + "step": 462 + }, + { + "epoch": 0.07, + "grad_norm": 4.179958248418888, + "learning_rate": 9.204771371769383e-07, + "loss": 0.6934, + "step": 463 + }, + { + "epoch": 0.07, + "grad_norm": 6.99224820934023, + "learning_rate": 9.224652087475149e-07, + "loss": 0.6927, + "step": 464 + }, + { + "epoch": 0.07, + "grad_norm": 11.601916318311844, + "learning_rate": 9.244532803180914e-07, + "loss": 0.6868, + "step": 465 + }, + { + "epoch": 0.07, + "grad_norm": 10.122625753962767, + "learning_rate": 9.264413518886679e-07, + "loss": 0.6797, + "step": 466 + }, + { + "epoch": 0.07, + "grad_norm": 12.061398905431677, + "learning_rate": 9.284294234592445e-07, + "loss": 0.7057, + "step": 467 + }, + { + "epoch": 0.07, + "grad_norm": 10.285879006891568, + "learning_rate": 9.30417495029821e-07, + "loss": 0.7038, + "step": 468 + }, + { + "epoch": 0.07, + "grad_norm": 7.9173601426388425, + "learning_rate": 9.324055666003976e-07, + "loss": 0.707, + "step": 469 + }, + { + "epoch": 0.07, + "grad_norm": 5.356312970302222, + "learning_rate": 9.343936381709741e-07, + "loss": 0.6986, + "step": 470 + }, + { + "epoch": 0.07, + "grad_norm": 4.220522688834469, + "learning_rate": 9.363817097415506e-07, + "loss": 0.6947, + "step": 471 + }, + { + "epoch": 0.07, + "grad_norm": 4.275996947554272, + "learning_rate": 9.383697813121271e-07, + "loss": 0.6849, + "step": 472 + }, + { + "epoch": 0.07, + "grad_norm": 4.3495858663739595, + "learning_rate": 9.403578528827038e-07, + "loss": 0.6888, + "step": 473 + }, + { + "epoch": 0.07, + "grad_norm": 9.369505746521481, + "learning_rate": 9.423459244532804e-07, + "loss": 0.7057, + "step": 474 + }, + { + "epoch": 0.07, + "grad_norm": 6.391852754326754, + "learning_rate": 9.443339960238568e-07, + "loss": 0.6966, + "step": 475 + }, + { + "epoch": 0.07, + "grad_norm": 4.950069493750081, + "learning_rate": 9.463220675944334e-07, + "loss": 0.6934, + "step": 476 + }, + { + "epoch": 0.07, + "grad_norm": 8.024658626942301, + "learning_rate": 9.483101391650099e-07, + "loss": 0.6953, + "step": 477 + }, + { + "epoch": 0.07, + "grad_norm": 7.691549091054149, + "learning_rate": 9.502982107355865e-07, + "loss": 0.6829, + "step": 478 + }, + { + "epoch": 0.07, + "grad_norm": 6.775700605840227, + "learning_rate": 9.522862823061629e-07, + "loss": 0.7096, + "step": 479 + }, + { + "epoch": 0.07, + "grad_norm": 5.0041927243758595, + "learning_rate": 9.542743538767394e-07, + "loss": 0.6862, + "step": 480 + }, + { + "epoch": 0.07, + "grad_norm": 9.533423179187292, + "learning_rate": 9.56262425447316e-07, + "loss": 0.7018, + "step": 481 + }, + { + "epoch": 0.07, + "grad_norm": 17.884333810444446, + "learning_rate": 9.582504970178926e-07, + "loss": 0.7142, + "step": 482 + }, + { + "epoch": 0.07, + "grad_norm": 5.359430917658416, + "learning_rate": 9.602385685884692e-07, + "loss": 0.6901, + "step": 483 + }, + { + "epoch": 0.07, + "grad_norm": 12.146621502821807, + "learning_rate": 9.622266401590456e-07, + "loss": 0.7005, + "step": 484 + }, + { + "epoch": 0.07, + "grad_norm": 5.080225126192796, + "learning_rate": 9.642147117296222e-07, + "loss": 0.6934, + "step": 485 + }, + { + "epoch": 0.07, + "grad_norm": 8.267687070628451, + "learning_rate": 9.662027833001988e-07, + "loss": 0.6999, + "step": 486 + }, + { + "epoch": 0.07, + "grad_norm": 11.032027219328823, + "learning_rate": 9.681908548707752e-07, + "loss": 0.6901, + "step": 487 + }, + { + "epoch": 0.07, + "grad_norm": 6.583521019047392, + "learning_rate": 9.701789264413519e-07, + "loss": 0.6829, + "step": 488 + }, + { + "epoch": 0.07, + "grad_norm": 7.84528949586536, + "learning_rate": 9.721669980119285e-07, + "loss": 0.6934, + "step": 489 + }, + { + "epoch": 0.07, + "grad_norm": 3.9676123568870856, + "learning_rate": 9.741550695825049e-07, + "loss": 0.6914, + "step": 490 + }, + { + "epoch": 0.07, + "grad_norm": 6.0339132379314995, + "learning_rate": 9.761431411530815e-07, + "loss": 0.7018, + "step": 491 + }, + { + "epoch": 0.07, + "grad_norm": 11.324983497481007, + "learning_rate": 9.78131212723658e-07, + "loss": 0.7031, + "step": 492 + }, + { + "epoch": 0.07, + "grad_norm": 6.743191483733425, + "learning_rate": 9.801192842942345e-07, + "loss": 0.6934, + "step": 493 + }, + { + "epoch": 0.07, + "grad_norm": 8.35293849240906, + "learning_rate": 9.82107355864811e-07, + "loss": 0.6927, + "step": 494 + }, + { + "epoch": 0.07, + "grad_norm": 6.315914290259441, + "learning_rate": 9.840954274353877e-07, + "loss": 0.6751, + "step": 495 + }, + { + "epoch": 0.07, + "grad_norm": 4.522622491525206, + "learning_rate": 9.860834990059643e-07, + "loss": 0.6979, + "step": 496 + }, + { + "epoch": 0.07, + "grad_norm": 6.682024879932865, + "learning_rate": 9.880715705765407e-07, + "loss": 0.7025, + "step": 497 + }, + { + "epoch": 0.07, + "grad_norm": 4.530071851446565, + "learning_rate": 9.900596421471173e-07, + "loss": 0.6979, + "step": 498 + }, + { + "epoch": 0.07, + "grad_norm": 8.31542231209298, + "learning_rate": 9.920477137176937e-07, + "loss": 0.7038, + "step": 499 + }, + { + "epoch": 0.07, + "grad_norm": 6.090412387018624, + "learning_rate": 9.940357852882703e-07, + "loss": 0.6979, + "step": 500 + }, + { + "epoch": 0.07, + "grad_norm": 7.262156457250189, + "learning_rate": 9.960238568588467e-07, + "loss": 0.6953, + "step": 501 + }, + { + "epoch": 0.07, + "grad_norm": 4.183620395435941, + "learning_rate": 9.980119284294233e-07, + "loss": 0.6947, + "step": 502 + }, + { + "epoch": 0.08, + "grad_norm": 5.163453231005009, + "learning_rate": 1e-06, + "loss": 0.6842, + "step": 503 + }, + { + "epoch": 0.08, + "grad_norm": 5.418602916514263, + "learning_rate": 1.0019880715705764e-06, + "loss": 0.6979, + "step": 504 + }, + { + "epoch": 0.08, + "grad_norm": 20.072188656649313, + "learning_rate": 1.0039761431411532e-06, + "loss": 0.7129, + "step": 505 + }, + { + "epoch": 0.08, + "grad_norm": 6.619144111725088, + "learning_rate": 1.0059642147117296e-06, + "loss": 0.6888, + "step": 506 + }, + { + "epoch": 0.08, + "grad_norm": 4.226641974353168, + "learning_rate": 1.0079522862823062e-06, + "loss": 0.6914, + "step": 507 + }, + { + "epoch": 0.08, + "grad_norm": 6.677779879467535, + "learning_rate": 1.0099403578528826e-06, + "loss": 0.6986, + "step": 508 + }, + { + "epoch": 0.08, + "grad_norm": 13.02624753693883, + "learning_rate": 1.0119284294234592e-06, + "loss": 0.7122, + "step": 509 + }, + { + "epoch": 0.08, + "grad_norm": 2.7104013240754985, + "learning_rate": 1.0139165009940358e-06, + "loss": 0.6849, + "step": 510 + }, + { + "epoch": 0.08, + "grad_norm": 9.260916557391647, + "learning_rate": 1.0159045725646122e-06, + "loss": 0.7012, + "step": 511 + }, + { + "epoch": 0.08, + "grad_norm": 6.901448748100435, + "learning_rate": 1.0178926441351888e-06, + "loss": 0.7005, + "step": 512 + }, + { + "epoch": 0.08, + "grad_norm": 15.43048172735375, + "learning_rate": 1.0198807157057654e-06, + "loss": 0.709, + "step": 513 + }, + { + "epoch": 0.08, + "grad_norm": 17.304640476317108, + "learning_rate": 1.0218687872763418e-06, + "loss": 0.724, + "step": 514 + }, + { + "epoch": 0.08, + "grad_norm": 5.8421785817269685, + "learning_rate": 1.0238568588469184e-06, + "loss": 0.6973, + "step": 515 + }, + { + "epoch": 0.08, + "grad_norm": 12.06734693129196, + "learning_rate": 1.025844930417495e-06, + "loss": 0.6966, + "step": 516 + }, + { + "epoch": 0.08, + "grad_norm": 3.9452917410078827, + "learning_rate": 1.0278330019880714e-06, + "loss": 0.7005, + "step": 517 + }, + { + "epoch": 0.08, + "grad_norm": 8.430703518510104, + "learning_rate": 1.029821073558648e-06, + "loss": 0.7083, + "step": 518 + }, + { + "epoch": 0.08, + "grad_norm": 9.255136346579855, + "learning_rate": 1.0318091451292247e-06, + "loss": 0.707, + "step": 519 + }, + { + "epoch": 0.08, + "grad_norm": 4.970970427974498, + "learning_rate": 1.0337972166998013e-06, + "loss": 0.694, + "step": 520 + }, + { + "epoch": 0.08, + "grad_norm": 19.13403930888799, + "learning_rate": 1.0357852882703777e-06, + "loss": 0.7331, + "step": 521 + }, + { + "epoch": 0.08, + "grad_norm": 13.859881512946767, + "learning_rate": 1.037773359840954e-06, + "loss": 0.7103, + "step": 522 + }, + { + "epoch": 0.08, + "grad_norm": 4.879063950265113, + "learning_rate": 1.039761431411531e-06, + "loss": 0.6927, + "step": 523 + }, + { + "epoch": 0.08, + "grad_norm": 4.6663259397210695, + "learning_rate": 1.0417495029821073e-06, + "loss": 0.6882, + "step": 524 + }, + { + "epoch": 0.08, + "grad_norm": 3.9339650397260515, + "learning_rate": 1.0437375745526837e-06, + "loss": 0.6979, + "step": 525 + }, + { + "epoch": 0.08, + "grad_norm": 14.1883454948212, + "learning_rate": 1.0457256461232605e-06, + "loss": 0.7012, + "step": 526 + }, + { + "epoch": 0.08, + "grad_norm": 6.077547783119257, + "learning_rate": 1.047713717693837e-06, + "loss": 0.6953, + "step": 527 + }, + { + "epoch": 0.08, + "grad_norm": 8.547127559078849, + "learning_rate": 1.0497017892644135e-06, + "loss": 0.6979, + "step": 528 + }, + { + "epoch": 0.08, + "grad_norm": 10.28098370004434, + "learning_rate": 1.05168986083499e-06, + "loss": 0.7005, + "step": 529 + }, + { + "epoch": 0.08, + "grad_norm": 8.516615459205955, + "learning_rate": 1.0536779324055665e-06, + "loss": 0.7096, + "step": 530 + }, + { + "epoch": 0.08, + "grad_norm": 14.497336664777578, + "learning_rate": 1.0556660039761431e-06, + "loss": 0.7038, + "step": 531 + }, + { + "epoch": 0.08, + "grad_norm": 7.322412516111854, + "learning_rate": 1.0576540755467195e-06, + "loss": 0.6966, + "step": 532 + }, + { + "epoch": 0.08, + "grad_norm": 4.006924577776056, + "learning_rate": 1.0596421471172964e-06, + "loss": 0.6901, + "step": 533 + }, + { + "epoch": 0.08, + "grad_norm": 6.304927135750833, + "learning_rate": 1.0616302186878728e-06, + "loss": 0.7025, + "step": 534 + }, + { + "epoch": 0.08, + "grad_norm": 3.094680724742609, + "learning_rate": 1.0636182902584492e-06, + "loss": 0.6979, + "step": 535 + }, + { + "epoch": 0.08, + "grad_norm": 3.163511060546819, + "learning_rate": 1.0656063618290258e-06, + "loss": 0.6979, + "step": 536 + }, + { + "epoch": 0.08, + "grad_norm": 11.444300291700802, + "learning_rate": 1.0675944333996024e-06, + "loss": 0.6908, + "step": 537 + }, + { + "epoch": 0.08, + "grad_norm": 6.181437040736761, + "learning_rate": 1.0695825049701788e-06, + "loss": 0.6895, + "step": 538 + }, + { + "epoch": 0.08, + "grad_norm": 12.391461439272584, + "learning_rate": 1.0715705765407554e-06, + "loss": 0.7031, + "step": 539 + }, + { + "epoch": 0.08, + "grad_norm": 9.12804606707508, + "learning_rate": 1.073558648111332e-06, + "loss": 0.7031, + "step": 540 + }, + { + "epoch": 0.08, + "grad_norm": 6.478678982006108, + "learning_rate": 1.0755467196819086e-06, + "loss": 0.6947, + "step": 541 + }, + { + "epoch": 0.08, + "grad_norm": 3.6733800384955217, + "learning_rate": 1.077534791252485e-06, + "loss": 0.6803, + "step": 542 + }, + { + "epoch": 0.08, + "grad_norm": 5.894014155780146, + "learning_rate": 1.0795228628230614e-06, + "loss": 0.6992, + "step": 543 + }, + { + "epoch": 0.08, + "grad_norm": 7.988728839969133, + "learning_rate": 1.0815109343936382e-06, + "loss": 0.696, + "step": 544 + }, + { + "epoch": 0.08, + "grad_norm": 5.632688240609994, + "learning_rate": 1.0834990059642146e-06, + "loss": 0.6836, + "step": 545 + }, + { + "epoch": 0.08, + "grad_norm": 10.636605857739047, + "learning_rate": 1.0854870775347912e-06, + "loss": 0.7005, + "step": 546 + }, + { + "epoch": 0.08, + "grad_norm": 8.832613026014412, + "learning_rate": 1.0874751491053679e-06, + "loss": 0.7031, + "step": 547 + }, + { + "epoch": 0.08, + "grad_norm": 3.36679552263619, + "learning_rate": 1.0894632206759443e-06, + "loss": 0.6966, + "step": 548 + }, + { + "epoch": 0.08, + "grad_norm": 5.247422152422811, + "learning_rate": 1.0914512922465209e-06, + "loss": 0.6823, + "step": 549 + }, + { + "epoch": 0.08, + "grad_norm": 8.65899564673693, + "learning_rate": 1.0934393638170973e-06, + "loss": 0.7038, + "step": 550 + }, + { + "epoch": 0.08, + "grad_norm": 5.834442825999414, + "learning_rate": 1.095427435387674e-06, + "loss": 0.6934, + "step": 551 + }, + { + "epoch": 0.08, + "grad_norm": 5.1319823971564915, + "learning_rate": 1.0974155069582505e-06, + "loss": 0.6953, + "step": 552 + }, + { + "epoch": 0.08, + "grad_norm": 12.43470451247899, + "learning_rate": 1.0994035785288269e-06, + "loss": 0.7064, + "step": 553 + }, + { + "epoch": 0.08, + "grad_norm": 5.433089481322773, + "learning_rate": 1.1013916500994037e-06, + "loss": 0.7109, + "step": 554 + }, + { + "epoch": 0.08, + "grad_norm": 7.434571010222418, + "learning_rate": 1.10337972166998e-06, + "loss": 0.6868, + "step": 555 + }, + { + "epoch": 0.08, + "grad_norm": 10.29523521974586, + "learning_rate": 1.1053677932405565e-06, + "loss": 0.7135, + "step": 556 + }, + { + "epoch": 0.08, + "grad_norm": 5.319717067813744, + "learning_rate": 1.1073558648111331e-06, + "loss": 0.6947, + "step": 557 + }, + { + "epoch": 0.08, + "grad_norm": 5.115030841778958, + "learning_rate": 1.1093439363817097e-06, + "loss": 0.6992, + "step": 558 + }, + { + "epoch": 0.08, + "grad_norm": 4.382198520422729, + "learning_rate": 1.1113320079522863e-06, + "loss": 0.696, + "step": 559 + }, + { + "epoch": 0.08, + "grad_norm": 10.330662841633563, + "learning_rate": 1.1133200795228627e-06, + "loss": 0.6992, + "step": 560 + }, + { + "epoch": 0.08, + "grad_norm": 8.414789693749533, + "learning_rate": 1.1153081510934391e-06, + "loss": 0.6901, + "step": 561 + }, + { + "epoch": 0.08, + "grad_norm": 3.7086701740571186, + "learning_rate": 1.117296222664016e-06, + "loss": 0.6868, + "step": 562 + }, + { + "epoch": 0.08, + "grad_norm": 5.97436901558898, + "learning_rate": 1.1192842942345924e-06, + "loss": 0.694, + "step": 563 + }, + { + "epoch": 0.08, + "grad_norm": 6.606245042088812, + "learning_rate": 1.121272365805169e-06, + "loss": 0.7012, + "step": 564 + }, + { + "epoch": 0.08, + "grad_norm": 3.252655827873364, + "learning_rate": 1.1232604373757456e-06, + "loss": 0.6875, + "step": 565 + }, + { + "epoch": 0.08, + "grad_norm": 4.511350212359119, + "learning_rate": 1.125248508946322e-06, + "loss": 0.7064, + "step": 566 + }, + { + "epoch": 0.08, + "grad_norm": 11.9828458133611, + "learning_rate": 1.1272365805168986e-06, + "loss": 0.7083, + "step": 567 + }, + { + "epoch": 0.08, + "grad_norm": 7.6594079055623485, + "learning_rate": 1.129224652087475e-06, + "loss": 0.6986, + "step": 568 + }, + { + "epoch": 0.08, + "grad_norm": 4.686213666272609, + "learning_rate": 1.1312127236580516e-06, + "loss": 0.6992, + "step": 569 + }, + { + "epoch": 0.09, + "grad_norm": 4.080922020949418, + "learning_rate": 1.1332007952286282e-06, + "loss": 0.6914, + "step": 570 + }, + { + "epoch": 0.09, + "grad_norm": 9.643354733008756, + "learning_rate": 1.1351888667992046e-06, + "loss": 0.6855, + "step": 571 + }, + { + "epoch": 0.09, + "grad_norm": 4.363525015418019, + "learning_rate": 1.1371769383697814e-06, + "loss": 0.6895, + "step": 572 + }, + { + "epoch": 0.09, + "grad_norm": 3.229756977715118, + "learning_rate": 1.1391650099403578e-06, + "loss": 0.6953, + "step": 573 + }, + { + "epoch": 0.09, + "grad_norm": 4.780942098933473, + "learning_rate": 1.1411530815109342e-06, + "loss": 0.6934, + "step": 574 + }, + { + "epoch": 0.09, + "grad_norm": 3.4199084015895833, + "learning_rate": 1.1431411530815108e-06, + "loss": 0.6882, + "step": 575 + }, + { + "epoch": 0.09, + "grad_norm": 7.2390547986045055, + "learning_rate": 1.1451292246520874e-06, + "loss": 0.6875, + "step": 576 + }, + { + "epoch": 0.09, + "grad_norm": 5.11115449556599, + "learning_rate": 1.147117296222664e-06, + "loss": 0.7031, + "step": 577 + }, + { + "epoch": 0.09, + "grad_norm": 2.316263138657083, + "learning_rate": 1.1491053677932405e-06, + "loss": 0.6966, + "step": 578 + }, + { + "epoch": 0.09, + "grad_norm": 4.197562937472902, + "learning_rate": 1.151093439363817e-06, + "loss": 0.6953, + "step": 579 + }, + { + "epoch": 0.09, + "grad_norm": 3.496578413211144, + "learning_rate": 1.1530815109343937e-06, + "loss": 0.696, + "step": 580 + }, + { + "epoch": 0.09, + "grad_norm": 7.223130693210218, + "learning_rate": 1.15506958250497e-06, + "loss": 0.6921, + "step": 581 + }, + { + "epoch": 0.09, + "grad_norm": 7.211105917304277, + "learning_rate": 1.1570576540755465e-06, + "loss": 0.6901, + "step": 582 + }, + { + "epoch": 0.09, + "grad_norm": 4.98675792202501, + "learning_rate": 1.1590457256461233e-06, + "loss": 0.7038, + "step": 583 + }, + { + "epoch": 0.09, + "grad_norm": 3.324975606750778, + "learning_rate": 1.1610337972166997e-06, + "loss": 0.6875, + "step": 584 + }, + { + "epoch": 0.09, + "grad_norm": 10.823120912532936, + "learning_rate": 1.1630218687872763e-06, + "loss": 0.6921, + "step": 585 + }, + { + "epoch": 0.09, + "grad_norm": 5.758654226409222, + "learning_rate": 1.165009940357853e-06, + "loss": 0.6868, + "step": 586 + }, + { + "epoch": 0.09, + "grad_norm": 13.81769607925928, + "learning_rate": 1.1669980119284293e-06, + "loss": 0.7012, + "step": 587 + }, + { + "epoch": 0.09, + "grad_norm": 7.097128201123073, + "learning_rate": 1.168986083499006e-06, + "loss": 0.6868, + "step": 588 + }, + { + "epoch": 0.09, + "grad_norm": 7.067156859268307, + "learning_rate": 1.1709741550695823e-06, + "loss": 0.6868, + "step": 589 + }, + { + "epoch": 0.09, + "grad_norm": 2.208893985365875, + "learning_rate": 1.1729622266401591e-06, + "loss": 0.6966, + "step": 590 + }, + { + "epoch": 0.09, + "grad_norm": 3.9518001746524343, + "learning_rate": 1.1749502982107355e-06, + "loss": 0.7012, + "step": 591 + }, + { + "epoch": 0.09, + "grad_norm": 14.509828392003515, + "learning_rate": 1.176938369781312e-06, + "loss": 0.6947, + "step": 592 + }, + { + "epoch": 0.09, + "grad_norm": 8.599005545430293, + "learning_rate": 1.1789264413518888e-06, + "loss": 0.6914, + "step": 593 + }, + { + "epoch": 0.09, + "grad_norm": 2.685852035882882, + "learning_rate": 1.1809145129224652e-06, + "loss": 0.6973, + "step": 594 + }, + { + "epoch": 0.09, + "grad_norm": 5.609262652777283, + "learning_rate": 1.1829025844930416e-06, + "loss": 0.6914, + "step": 595 + }, + { + "epoch": 0.09, + "grad_norm": 10.897498417845508, + "learning_rate": 1.1848906560636182e-06, + "loss": 0.6862, + "step": 596 + }, + { + "epoch": 0.09, + "grad_norm": 3.8035611272751213, + "learning_rate": 1.1868787276341948e-06, + "loss": 0.6947, + "step": 597 + }, + { + "epoch": 0.09, + "grad_norm": 5.500835531422666, + "learning_rate": 1.1888667992047714e-06, + "loss": 0.6934, + "step": 598 + }, + { + "epoch": 0.09, + "grad_norm": 9.710218068328336, + "learning_rate": 1.1908548707753478e-06, + "loss": 0.7005, + "step": 599 + }, + { + "epoch": 0.09, + "grad_norm": 10.14820936720081, + "learning_rate": 1.1928429423459244e-06, + "loss": 0.7057, + "step": 600 + }, + { + "epoch": 0.09, + "grad_norm": 2.442736038552533, + "learning_rate": 1.194831013916501e-06, + "loss": 0.6868, + "step": 601 + }, + { + "epoch": 0.09, + "grad_norm": 5.036574265775883, + "learning_rate": 1.1968190854870774e-06, + "loss": 0.6966, + "step": 602 + }, + { + "epoch": 0.09, + "grad_norm": 9.271070586493611, + "learning_rate": 1.198807157057654e-06, + "loss": 0.6921, + "step": 603 + }, + { + "epoch": 0.09, + "grad_norm": 15.252597700257553, + "learning_rate": 1.2007952286282306e-06, + "loss": 0.6777, + "step": 604 + }, + { + "epoch": 0.09, + "grad_norm": 5.323094064669315, + "learning_rate": 1.202783300198807e-06, + "loss": 0.6888, + "step": 605 + }, + { + "epoch": 0.09, + "grad_norm": 7.6766081651647715, + "learning_rate": 1.2047713717693836e-06, + "loss": 0.7135, + "step": 606 + }, + { + "epoch": 0.09, + "grad_norm": 5.174125467267188, + "learning_rate": 1.2067594433399603e-06, + "loss": 0.6901, + "step": 607 + }, + { + "epoch": 0.09, + "grad_norm": 5.643546738305966, + "learning_rate": 1.2087475149105367e-06, + "loss": 0.6999, + "step": 608 + }, + { + "epoch": 0.09, + "grad_norm": 11.72836797884339, + "learning_rate": 1.2107355864811133e-06, + "loss": 0.6882, + "step": 609 + }, + { + "epoch": 0.09, + "grad_norm": 7.597207955075841, + "learning_rate": 1.2127236580516897e-06, + "loss": 0.707, + "step": 610 + }, + { + "epoch": 0.09, + "grad_norm": 9.280970898747162, + "learning_rate": 1.2147117296222665e-06, + "loss": 0.709, + "step": 611 + }, + { + "epoch": 0.09, + "grad_norm": 11.068098018157594, + "learning_rate": 1.2166998011928429e-06, + "loss": 0.7122, + "step": 612 + }, + { + "epoch": 0.09, + "grad_norm": 1.8420286689733354, + "learning_rate": 1.2186878727634193e-06, + "loss": 0.6816, + "step": 613 + }, + { + "epoch": 0.09, + "grad_norm": 5.688499458491182, + "learning_rate": 1.220675944333996e-06, + "loss": 0.6986, + "step": 614 + }, + { + "epoch": 0.09, + "grad_norm": 4.298814618960796, + "learning_rate": 1.2226640159045725e-06, + "loss": 0.679, + "step": 615 + }, + { + "epoch": 0.09, + "grad_norm": 7.694461714325365, + "learning_rate": 1.2246520874751491e-06, + "loss": 0.696, + "step": 616 + }, + { + "epoch": 0.09, + "grad_norm": 7.704332122573015, + "learning_rate": 1.2266401590457255e-06, + "loss": 0.6862, + "step": 617 + }, + { + "epoch": 0.09, + "grad_norm": 3.0239484795059814, + "learning_rate": 1.2286282306163021e-06, + "loss": 0.696, + "step": 618 + }, + { + "epoch": 0.09, + "grad_norm": 14.196575200588363, + "learning_rate": 1.2306163021868787e-06, + "loss": 0.7168, + "step": 619 + }, + { + "epoch": 0.09, + "grad_norm": 8.506721977824297, + "learning_rate": 1.2326043737574551e-06, + "loss": 0.6986, + "step": 620 + }, + { + "epoch": 0.09, + "grad_norm": 8.727090407606426, + "learning_rate": 1.234592445328032e-06, + "loss": 0.7018, + "step": 621 + }, + { + "epoch": 0.09, + "grad_norm": 8.121996299907073, + "learning_rate": 1.2365805168986084e-06, + "loss": 0.7044, + "step": 622 + }, + { + "epoch": 0.09, + "grad_norm": 10.736573134637975, + "learning_rate": 1.2385685884691848e-06, + "loss": 0.7064, + "step": 623 + }, + { + "epoch": 0.09, + "grad_norm": 11.653981756090145, + "learning_rate": 1.2405566600397614e-06, + "loss": 0.7031, + "step": 624 + }, + { + "epoch": 0.09, + "grad_norm": 3.375611345406801, + "learning_rate": 1.242544731610338e-06, + "loss": 0.6895, + "step": 625 + }, + { + "epoch": 0.09, + "grad_norm": 3.6994060349320694, + "learning_rate": 1.2445328031809144e-06, + "loss": 0.6914, + "step": 626 + }, + { + "epoch": 0.09, + "grad_norm": 16.095606249831906, + "learning_rate": 1.246520874751491e-06, + "loss": 0.7044, + "step": 627 + }, + { + "epoch": 0.09, + "grad_norm": 13.505045665206723, + "learning_rate": 1.2485089463220676e-06, + "loss": 0.6953, + "step": 628 + }, + { + "epoch": 0.09, + "grad_norm": 9.382530463728617, + "learning_rate": 1.2504970178926442e-06, + "loss": 0.7038, + "step": 629 + }, + { + "epoch": 0.09, + "grad_norm": 4.35557147372081, + "learning_rate": 1.2524850894632206e-06, + "loss": 0.6875, + "step": 630 + }, + { + "epoch": 0.09, + "grad_norm": 4.857828621907085, + "learning_rate": 1.254473161033797e-06, + "loss": 0.6855, + "step": 631 + }, + { + "epoch": 0.09, + "grad_norm": 9.186696923835534, + "learning_rate": 1.2564612326043738e-06, + "loss": 0.6992, + "step": 632 + }, + { + "epoch": 0.09, + "grad_norm": 9.261246141549304, + "learning_rate": 1.2584493041749502e-06, + "loss": 0.7077, + "step": 633 + }, + { + "epoch": 0.09, + "grad_norm": 10.832393800920693, + "learning_rate": 1.2604373757455268e-06, + "loss": 0.709, + "step": 634 + }, + { + "epoch": 0.09, + "grad_norm": 12.02198232140092, + "learning_rate": 1.2624254473161034e-06, + "loss": 0.7096, + "step": 635 + }, + { + "epoch": 0.09, + "grad_norm": 11.15939200119468, + "learning_rate": 1.2644135188866798e-06, + "loss": 0.6901, + "step": 636 + }, + { + "epoch": 0.1, + "grad_norm": 2.7424973055227113, + "learning_rate": 1.2664015904572565e-06, + "loss": 0.6947, + "step": 637 + }, + { + "epoch": 0.1, + "grad_norm": 7.630347838780393, + "learning_rate": 1.2683896620278328e-06, + "loss": 0.6888, + "step": 638 + }, + { + "epoch": 0.1, + "grad_norm": 2.4869373451962167, + "learning_rate": 1.2703777335984095e-06, + "loss": 0.696, + "step": 639 + }, + { + "epoch": 0.1, + "grad_norm": 11.625763686153396, + "learning_rate": 1.272365805168986e-06, + "loss": 0.7012, + "step": 640 + }, + { + "epoch": 0.1, + "grad_norm": 6.7216312323956116, + "learning_rate": 1.2743538767395625e-06, + "loss": 0.6979, + "step": 641 + }, + { + "epoch": 0.1, + "grad_norm": 4.9474608779915785, + "learning_rate": 1.2763419483101393e-06, + "loss": 0.6934, + "step": 642 + }, + { + "epoch": 0.1, + "grad_norm": 6.24545569342793, + "learning_rate": 1.2783300198807157e-06, + "loss": 0.6979, + "step": 643 + }, + { + "epoch": 0.1, + "grad_norm": 4.746720068903385, + "learning_rate": 1.280318091451292e-06, + "loss": 0.6875, + "step": 644 + }, + { + "epoch": 0.1, + "grad_norm": 3.4487058475976426, + "learning_rate": 1.2823061630218687e-06, + "loss": 0.6927, + "step": 645 + }, + { + "epoch": 0.1, + "grad_norm": 5.794807657098796, + "learning_rate": 1.2842942345924453e-06, + "loss": 0.6842, + "step": 646 + }, + { + "epoch": 0.1, + "grad_norm": 17.54106958413656, + "learning_rate": 1.286282306163022e-06, + "loss": 0.709, + "step": 647 + }, + { + "epoch": 0.1, + "grad_norm": 14.058476441368475, + "learning_rate": 1.2882703777335983e-06, + "loss": 0.6901, + "step": 648 + }, + { + "epoch": 0.1, + "grad_norm": 3.849345462002358, + "learning_rate": 1.2902584493041747e-06, + "loss": 0.6999, + "step": 649 + }, + { + "epoch": 0.1, + "grad_norm": 12.417058739590987, + "learning_rate": 1.2922465208747515e-06, + "loss": 0.6999, + "step": 650 + }, + { + "epoch": 0.1, + "grad_norm": 5.072228597645178, + "learning_rate": 1.294234592445328e-06, + "loss": 0.6842, + "step": 651 + }, + { + "epoch": 0.1, + "grad_norm": 5.185718715847009, + "learning_rate": 1.2962226640159043e-06, + "loss": 0.696, + "step": 652 + }, + { + "epoch": 0.1, + "grad_norm": 4.0774946461725925, + "learning_rate": 1.2982107355864812e-06, + "loss": 0.6882, + "step": 653 + }, + { + "epoch": 0.1, + "grad_norm": 10.588609709820584, + "learning_rate": 1.3001988071570576e-06, + "loss": 0.7077, + "step": 654 + }, + { + "epoch": 0.1, + "grad_norm": 5.2333722844678, + "learning_rate": 1.3021868787276342e-06, + "loss": 0.6868, + "step": 655 + }, + { + "epoch": 0.1, + "grad_norm": 2.8894288895334497, + "learning_rate": 1.3041749502982106e-06, + "loss": 0.6934, + "step": 656 + }, + { + "epoch": 0.1, + "grad_norm": 10.06667322926377, + "learning_rate": 1.3061630218687872e-06, + "loss": 0.6875, + "step": 657 + }, + { + "epoch": 0.1, + "grad_norm": 5.0186944117810866, + "learning_rate": 1.3081510934393638e-06, + "loss": 0.6966, + "step": 658 + }, + { + "epoch": 0.1, + "grad_norm": 10.784197743621213, + "learning_rate": 1.3101391650099402e-06, + "loss": 0.694, + "step": 659 + }, + { + "epoch": 0.1, + "grad_norm": 5.900353426606522, + "learning_rate": 1.312127236580517e-06, + "loss": 0.7057, + "step": 660 + }, + { + "epoch": 0.1, + "grad_norm": 3.0124914382529964, + "learning_rate": 1.3141153081510934e-06, + "loss": 0.6855, + "step": 661 + }, + { + "epoch": 0.1, + "grad_norm": 2.8834033368839913, + "learning_rate": 1.3161033797216698e-06, + "loss": 0.6901, + "step": 662 + }, + { + "epoch": 0.1, + "grad_norm": 4.567263050881383, + "learning_rate": 1.3180914512922466e-06, + "loss": 0.7038, + "step": 663 + }, + { + "epoch": 0.1, + "grad_norm": 3.242099647498014, + "learning_rate": 1.320079522862823e-06, + "loss": 0.7012, + "step": 664 + }, + { + "epoch": 0.1, + "grad_norm": 1.9723518901399488, + "learning_rate": 1.3220675944333994e-06, + "loss": 0.6882, + "step": 665 + }, + { + "epoch": 0.1, + "grad_norm": 2.461923132465631, + "learning_rate": 1.324055666003976e-06, + "loss": 0.694, + "step": 666 + }, + { + "epoch": 0.1, + "grad_norm": 8.58953646806992, + "learning_rate": 1.3260437375745526e-06, + "loss": 0.6908, + "step": 667 + }, + { + "epoch": 0.1, + "grad_norm": 5.47409607666015, + "learning_rate": 1.3280318091451293e-06, + "loss": 0.7129, + "step": 668 + }, + { + "epoch": 0.1, + "grad_norm": 7.213808292299258, + "learning_rate": 1.3300198807157057e-06, + "loss": 0.6849, + "step": 669 + }, + { + "epoch": 0.1, + "grad_norm": 3.4717154935488086, + "learning_rate": 1.332007952286282e-06, + "loss": 0.6921, + "step": 670 + }, + { + "epoch": 0.1, + "grad_norm": 11.070376911700755, + "learning_rate": 1.3339960238568589e-06, + "loss": 0.7077, + "step": 671 + }, + { + "epoch": 0.1, + "grad_norm": 12.420341158899666, + "learning_rate": 1.3359840954274353e-06, + "loss": 0.7031, + "step": 672 + }, + { + "epoch": 0.1, + "grad_norm": 10.613933634344162, + "learning_rate": 1.3379721669980119e-06, + "loss": 0.7031, + "step": 673 + }, + { + "epoch": 0.1, + "grad_norm": 2.5405311902519383, + "learning_rate": 1.3399602385685885e-06, + "loss": 0.6927, + "step": 674 + }, + { + "epoch": 0.1, + "grad_norm": 4.613747829709012, + "learning_rate": 1.341948310139165e-06, + "loss": 0.6953, + "step": 675 + }, + { + "epoch": 0.1, + "grad_norm": 2.9741554156520436, + "learning_rate": 1.3439363817097415e-06, + "loss": 0.6895, + "step": 676 + }, + { + "epoch": 0.1, + "grad_norm": 9.267987670972875, + "learning_rate": 1.345924453280318e-06, + "loss": 0.694, + "step": 677 + }, + { + "epoch": 0.1, + "grad_norm": 5.124753952560912, + "learning_rate": 1.3479125248508945e-06, + "loss": 0.6921, + "step": 678 + }, + { + "epoch": 0.1, + "grad_norm": 2.5428385266758267, + "learning_rate": 1.3499005964214711e-06, + "loss": 0.6986, + "step": 679 + }, + { + "epoch": 0.1, + "grad_norm": 10.004663899475316, + "learning_rate": 1.3518886679920475e-06, + "loss": 0.696, + "step": 680 + }, + { + "epoch": 0.1, + "grad_norm": 9.92167149725243, + "learning_rate": 1.3538767395626244e-06, + "loss": 0.709, + "step": 681 + }, + { + "epoch": 0.1, + "grad_norm": 8.67542665245066, + "learning_rate": 1.3558648111332007e-06, + "loss": 0.6953, + "step": 682 + }, + { + "epoch": 0.1, + "grad_norm": 4.674141716070041, + "learning_rate": 1.3578528827037771e-06, + "loss": 0.694, + "step": 683 + }, + { + "epoch": 0.1, + "grad_norm": 3.8650857575912183, + "learning_rate": 1.3598409542743538e-06, + "loss": 0.6934, + "step": 684 + }, + { + "epoch": 0.1, + "grad_norm": 3.575851291054732, + "learning_rate": 1.3618290258449304e-06, + "loss": 0.696, + "step": 685 + }, + { + "epoch": 0.1, + "grad_norm": 3.381687272808106, + "learning_rate": 1.363817097415507e-06, + "loss": 0.7012, + "step": 686 + }, + { + "epoch": 0.1, + "grad_norm": 13.274017349782966, + "learning_rate": 1.3658051689860834e-06, + "loss": 0.7031, + "step": 687 + }, + { + "epoch": 0.1, + "grad_norm": 11.28274353590572, + "learning_rate": 1.36779324055666e-06, + "loss": 0.7044, + "step": 688 + }, + { + "epoch": 0.1, + "grad_norm": 11.126059850363927, + "learning_rate": 1.3697813121272366e-06, + "loss": 0.7109, + "step": 689 + }, + { + "epoch": 0.1, + "grad_norm": 3.7320886731214613, + "learning_rate": 1.371769383697813e-06, + "loss": 0.6921, + "step": 690 + }, + { + "epoch": 0.1, + "grad_norm": 2.970846101608013, + "learning_rate": 1.3737574552683896e-06, + "loss": 0.6947, + "step": 691 + }, + { + "epoch": 0.1, + "grad_norm": 12.976906344046878, + "learning_rate": 1.3757455268389662e-06, + "loss": 0.6986, + "step": 692 + }, + { + "epoch": 0.1, + "grad_norm": 4.318895732265828, + "learning_rate": 1.3777335984095426e-06, + "loss": 0.6947, + "step": 693 + }, + { + "epoch": 0.1, + "grad_norm": 7.016659482680183, + "learning_rate": 1.3797216699801192e-06, + "loss": 0.6914, + "step": 694 + }, + { + "epoch": 0.1, + "grad_norm": 16.06260670235136, + "learning_rate": 1.3817097415506958e-06, + "loss": 0.6842, + "step": 695 + }, + { + "epoch": 0.1, + "grad_norm": 6.509494425910525, + "learning_rate": 1.3836978131212722e-06, + "loss": 0.6921, + "step": 696 + }, + { + "epoch": 0.1, + "grad_norm": 7.904654335595832, + "learning_rate": 1.3856858846918488e-06, + "loss": 0.6973, + "step": 697 + }, + { + "epoch": 0.1, + "grad_norm": 10.15615686953745, + "learning_rate": 1.3876739562624252e-06, + "loss": 0.7018, + "step": 698 + }, + { + "epoch": 0.1, + "grad_norm": 14.611280419490965, + "learning_rate": 1.389662027833002e-06, + "loss": 0.7051, + "step": 699 + }, + { + "epoch": 0.1, + "grad_norm": 7.482498495024733, + "learning_rate": 1.3916500994035785e-06, + "loss": 0.6986, + "step": 700 + }, + { + "epoch": 0.1, + "grad_norm": 3.406271857962913, + "learning_rate": 1.3936381709741549e-06, + "loss": 0.6868, + "step": 701 + }, + { + "epoch": 0.1, + "grad_norm": 3.1281539354218375, + "learning_rate": 1.3956262425447317e-06, + "loss": 0.6927, + "step": 702 + }, + { + "epoch": 0.1, + "grad_norm": 3.1731932316236207, + "learning_rate": 1.397614314115308e-06, + "loss": 0.6901, + "step": 703 + }, + { + "epoch": 0.1, + "grad_norm": 11.518857666658953, + "learning_rate": 1.3996023856858847e-06, + "loss": 0.7018, + "step": 704 + }, + { + "epoch": 0.11, + "grad_norm": 3.517245983851073, + "learning_rate": 1.401590457256461e-06, + "loss": 0.6888, + "step": 705 + }, + { + "epoch": 0.11, + "grad_norm": 3.0719156233168006, + "learning_rate": 1.4035785288270377e-06, + "loss": 0.6953, + "step": 706 + }, + { + "epoch": 0.11, + "grad_norm": 7.0395656791189225, + "learning_rate": 1.4055666003976143e-06, + "loss": 0.6934, + "step": 707 + }, + { + "epoch": 0.11, + "grad_norm": 12.06152076356384, + "learning_rate": 1.4075546719681907e-06, + "loss": 0.7038, + "step": 708 + }, + { + "epoch": 0.11, + "grad_norm": 3.418956003729204, + "learning_rate": 1.4095427435387673e-06, + "loss": 0.6868, + "step": 709 + }, + { + "epoch": 0.11, + "grad_norm": 4.079939706110253, + "learning_rate": 1.411530815109344e-06, + "loss": 0.6849, + "step": 710 + }, + { + "epoch": 0.11, + "grad_norm": 5.737939421479942, + "learning_rate": 1.4135188866799203e-06, + "loss": 0.6901, + "step": 711 + }, + { + "epoch": 0.11, + "grad_norm": 5.920454004241152, + "learning_rate": 1.415506958250497e-06, + "loss": 0.707, + "step": 712 + }, + { + "epoch": 0.11, + "grad_norm": 3.5733712631304413, + "learning_rate": 1.4174950298210736e-06, + "loss": 0.7005, + "step": 713 + }, + { + "epoch": 0.11, + "grad_norm": 5.633089096184563, + "learning_rate": 1.41948310139165e-06, + "loss": 0.7012, + "step": 714 + }, + { + "epoch": 0.11, + "grad_norm": 2.96071105235213, + "learning_rate": 1.4214711729622266e-06, + "loss": 0.6914, + "step": 715 + }, + { + "epoch": 0.11, + "grad_norm": 10.886094691804784, + "learning_rate": 1.4234592445328032e-06, + "loss": 0.7044, + "step": 716 + }, + { + "epoch": 0.11, + "grad_norm": 8.209582020112334, + "learning_rate": 1.4254473161033798e-06, + "loss": 0.7077, + "step": 717 + }, + { + "epoch": 0.11, + "grad_norm": 2.73087379011374, + "learning_rate": 1.4274353876739562e-06, + "loss": 0.6921, + "step": 718 + }, + { + "epoch": 0.11, + "grad_norm": 6.795742744308835, + "learning_rate": 1.4294234592445326e-06, + "loss": 0.7174, + "step": 719 + }, + { + "epoch": 0.11, + "grad_norm": 7.978140228777336, + "learning_rate": 1.4314115308151094e-06, + "loss": 0.6921, + "step": 720 + }, + { + "epoch": 0.11, + "grad_norm": 9.170140702662188, + "learning_rate": 1.4333996023856858e-06, + "loss": 0.6934, + "step": 721 + }, + { + "epoch": 0.11, + "grad_norm": 5.927656232446777, + "learning_rate": 1.4353876739562622e-06, + "loss": 0.7025, + "step": 722 + }, + { + "epoch": 0.11, + "grad_norm": 6.243543086265904, + "learning_rate": 1.437375745526839e-06, + "loss": 0.7012, + "step": 723 + }, + { + "epoch": 0.11, + "grad_norm": 7.802369803367568, + "learning_rate": 1.4393638170974154e-06, + "loss": 0.6868, + "step": 724 + }, + { + "epoch": 0.11, + "grad_norm": 2.29732395639185, + "learning_rate": 1.441351888667992e-06, + "loss": 0.6947, + "step": 725 + }, + { + "epoch": 0.11, + "grad_norm": 8.255932921480646, + "learning_rate": 1.4433399602385684e-06, + "loss": 0.7057, + "step": 726 + }, + { + "epoch": 0.11, + "grad_norm": 7.188879582778989, + "learning_rate": 1.445328031809145e-06, + "loss": 0.6901, + "step": 727 + }, + { + "epoch": 0.11, + "grad_norm": 2.9189520420944173, + "learning_rate": 1.4473161033797217e-06, + "loss": 0.6973, + "step": 728 + }, + { + "epoch": 0.11, + "grad_norm": 3.8099476699365287, + "learning_rate": 1.449304174950298e-06, + "loss": 0.6882, + "step": 729 + }, + { + "epoch": 0.11, + "grad_norm": 5.38115792724125, + "learning_rate": 1.4512922465208749e-06, + "loss": 0.707, + "step": 730 + }, + { + "epoch": 0.11, + "grad_norm": 7.85124453896501, + "learning_rate": 1.4532803180914513e-06, + "loss": 0.6934, + "step": 731 + }, + { + "epoch": 0.11, + "grad_norm": 4.03414713169438, + "learning_rate": 1.4552683896620277e-06, + "loss": 0.7005, + "step": 732 + }, + { + "epoch": 0.11, + "grad_norm": 5.0744709454407895, + "learning_rate": 1.4572564612326043e-06, + "loss": 0.7077, + "step": 733 + }, + { + "epoch": 0.11, + "grad_norm": 3.7442063877091454, + "learning_rate": 1.459244532803181e-06, + "loss": 0.6966, + "step": 734 + }, + { + "epoch": 0.11, + "grad_norm": 4.571013589241142, + "learning_rate": 1.4612326043737573e-06, + "loss": 0.7025, + "step": 735 + }, + { + "epoch": 0.11, + "grad_norm": 1.8455058392229227, + "learning_rate": 1.463220675944334e-06, + "loss": 0.6842, + "step": 736 + }, + { + "epoch": 0.11, + "grad_norm": 7.8481905873117634, + "learning_rate": 1.4652087475149105e-06, + "loss": 0.6934, + "step": 737 + }, + { + "epoch": 0.11, + "grad_norm": 4.86089570029459, + "learning_rate": 1.4671968190854871e-06, + "loss": 0.7038, + "step": 738 + }, + { + "epoch": 0.11, + "grad_norm": 6.011539143721105, + "learning_rate": 1.4691848906560635e-06, + "loss": 0.7005, + "step": 739 + }, + { + "epoch": 0.11, + "grad_norm": 6.5612696023568065, + "learning_rate": 1.47117296222664e-06, + "loss": 0.7005, + "step": 740 + }, + { + "epoch": 0.11, + "grad_norm": 3.3346354350585243, + "learning_rate": 1.4731610337972167e-06, + "loss": 0.6992, + "step": 741 + }, + { + "epoch": 0.11, + "grad_norm": 1.9369823820493375, + "learning_rate": 1.4751491053677931e-06, + "loss": 0.6895, + "step": 742 + }, + { + "epoch": 0.11, + "grad_norm": 7.186277853291797, + "learning_rate": 1.4771371769383698e-06, + "loss": 0.694, + "step": 743 + }, + { + "epoch": 0.11, + "grad_norm": 2.5293727387739136, + "learning_rate": 1.4791252485089464e-06, + "loss": 0.6901, + "step": 744 + }, + { + "epoch": 0.11, + "grad_norm": 2.4279978995442795, + "learning_rate": 1.4811133200795228e-06, + "loss": 0.6901, + "step": 745 + }, + { + "epoch": 0.11, + "grad_norm": 7.676656842267215, + "learning_rate": 1.4831013916500994e-06, + "loss": 0.6953, + "step": 746 + }, + { + "epoch": 0.11, + "grad_norm": 3.2076244796453994, + "learning_rate": 1.4850894632206758e-06, + "loss": 0.6966, + "step": 747 + }, + { + "epoch": 0.11, + "grad_norm": 2.807778496006594, + "learning_rate": 1.4870775347912524e-06, + "loss": 0.6953, + "step": 748 + }, + { + "epoch": 0.11, + "grad_norm": 2.854190312274918, + "learning_rate": 1.489065606361829e-06, + "loss": 0.6868, + "step": 749 + }, + { + "epoch": 0.11, + "grad_norm": 1.2288436256685733, + "learning_rate": 1.4910536779324054e-06, + "loss": 0.6855, + "step": 750 + }, + { + "epoch": 0.11, + "grad_norm": 4.546913073582353, + "learning_rate": 1.4930417495029822e-06, + "loss": 0.6979, + "step": 751 + }, + { + "epoch": 0.11, + "grad_norm": 5.476842305239768, + "learning_rate": 1.4950298210735586e-06, + "loss": 0.6816, + "step": 752 + }, + { + "epoch": 0.11, + "grad_norm": 10.860560601971525, + "learning_rate": 1.497017892644135e-06, + "loss": 0.6927, + "step": 753 + }, + { + "epoch": 0.11, + "grad_norm": 5.09201614461111, + "learning_rate": 1.4990059642147116e-06, + "loss": 0.6862, + "step": 754 + }, + { + "epoch": 0.11, + "grad_norm": 5.9646642375804175, + "learning_rate": 1.5009940357852882e-06, + "loss": 0.6875, + "step": 755 + }, + { + "epoch": 0.11, + "grad_norm": 8.303105821147822, + "learning_rate": 1.5029821073558648e-06, + "loss": 0.7083, + "step": 756 + }, + { + "epoch": 0.11, + "grad_norm": 3.57978633517658, + "learning_rate": 1.5049701789264412e-06, + "loss": 0.6862, + "step": 757 + }, + { + "epoch": 0.11, + "grad_norm": 5.113271185174702, + "learning_rate": 1.5069582504970176e-06, + "loss": 0.7025, + "step": 758 + }, + { + "epoch": 0.11, + "grad_norm": 5.531494791935649, + "learning_rate": 1.5089463220675945e-06, + "loss": 0.7005, + "step": 759 + }, + { + "epoch": 0.11, + "grad_norm": 6.279278305900604, + "learning_rate": 1.5109343936381709e-06, + "loss": 0.6842, + "step": 760 + }, + { + "epoch": 0.11, + "grad_norm": 5.239824252608611, + "learning_rate": 1.5129224652087475e-06, + "loss": 0.6999, + "step": 761 + }, + { + "epoch": 0.11, + "grad_norm": 5.133398688881819, + "learning_rate": 1.514910536779324e-06, + "loss": 0.696, + "step": 762 + }, + { + "epoch": 0.11, + "grad_norm": 4.092101666798966, + "learning_rate": 1.5168986083499005e-06, + "loss": 0.681, + "step": 763 + }, + { + "epoch": 0.11, + "grad_norm": 3.6324887664400154, + "learning_rate": 1.518886679920477e-06, + "loss": 0.6927, + "step": 764 + }, + { + "epoch": 0.11, + "grad_norm": 3.0698263954982004, + "learning_rate": 1.5208747514910535e-06, + "loss": 0.6908, + "step": 765 + }, + { + "epoch": 0.11, + "grad_norm": 7.44348397070806, + "learning_rate": 1.5228628230616301e-06, + "loss": 0.7012, + "step": 766 + }, + { + "epoch": 0.11, + "grad_norm": 2.001789144022712, + "learning_rate": 1.5248508946322067e-06, + "loss": 0.6953, + "step": 767 + }, + { + "epoch": 0.11, + "grad_norm": 3.205625044195502, + "learning_rate": 1.5268389662027831e-06, + "loss": 0.694, + "step": 768 + }, + { + "epoch": 0.11, + "grad_norm": 6.257676359377128, + "learning_rate": 1.52882703777336e-06, + "loss": 0.6934, + "step": 769 + }, + { + "epoch": 0.11, + "grad_norm": 4.066924888978332, + "learning_rate": 1.5308151093439363e-06, + "loss": 0.6992, + "step": 770 + }, + { + "epoch": 0.11, + "grad_norm": 7.626236107943773, + "learning_rate": 1.5328031809145127e-06, + "loss": 0.7031, + "step": 771 + }, + { + "epoch": 0.12, + "grad_norm": 3.1571463453615696, + "learning_rate": 1.5347912524850893e-06, + "loss": 0.694, + "step": 772 + }, + { + "epoch": 0.12, + "grad_norm": 4.325181065740585, + "learning_rate": 1.536779324055666e-06, + "loss": 0.6862, + "step": 773 + }, + { + "epoch": 0.12, + "grad_norm": 7.748178990754408, + "learning_rate": 1.5387673956262426e-06, + "loss": 0.6979, + "step": 774 + }, + { + "epoch": 0.12, + "grad_norm": 5.957090298163642, + "learning_rate": 1.540755467196819e-06, + "loss": 0.694, + "step": 775 + }, + { + "epoch": 0.12, + "grad_norm": 4.260243368872378, + "learning_rate": 1.5427435387673956e-06, + "loss": 0.7018, + "step": 776 + }, + { + "epoch": 0.12, + "grad_norm": 3.121314697846971, + "learning_rate": 1.5447316103379722e-06, + "loss": 0.6999, + "step": 777 + }, + { + "epoch": 0.12, + "grad_norm": 7.621044847287063, + "learning_rate": 1.5467196819085486e-06, + "loss": 0.6868, + "step": 778 + }, + { + "epoch": 0.12, + "grad_norm": 2.633354935330933, + "learning_rate": 1.548707753479125e-06, + "loss": 0.6979, + "step": 779 + }, + { + "epoch": 0.12, + "grad_norm": 10.258156757916286, + "learning_rate": 1.5506958250497018e-06, + "loss": 0.7018, + "step": 780 + }, + { + "epoch": 0.12, + "grad_norm": 8.718622393824843, + "learning_rate": 1.5526838966202782e-06, + "loss": 0.6986, + "step": 781 + }, + { + "epoch": 0.12, + "grad_norm": 14.883887156184672, + "learning_rate": 1.5546719681908548e-06, + "loss": 0.7051, + "step": 782 + }, + { + "epoch": 0.12, + "grad_norm": 4.091463540000469, + "learning_rate": 1.5566600397614314e-06, + "loss": 0.6914, + "step": 783 + }, + { + "epoch": 0.12, + "grad_norm": 4.98827483212532, + "learning_rate": 1.5586481113320078e-06, + "loss": 0.6947, + "step": 784 + }, + { + "epoch": 0.12, + "grad_norm": 3.8967075696144398, + "learning_rate": 1.5606361829025844e-06, + "loss": 0.6862, + "step": 785 + }, + { + "epoch": 0.12, + "grad_norm": 3.874866217761622, + "learning_rate": 1.5626242544731608e-06, + "loss": 0.6888, + "step": 786 + }, + { + "epoch": 0.12, + "grad_norm": 1.790886456253155, + "learning_rate": 1.5646123260437377e-06, + "loss": 0.6979, + "step": 787 + }, + { + "epoch": 0.12, + "grad_norm": 7.311910489759373, + "learning_rate": 1.566600397614314e-06, + "loss": 0.6836, + "step": 788 + }, + { + "epoch": 0.12, + "grad_norm": 9.851523073087906, + "learning_rate": 1.5685884691848905e-06, + "loss": 0.6719, + "step": 789 + }, + { + "epoch": 0.12, + "grad_norm": 8.248698390718802, + "learning_rate": 1.5705765407554673e-06, + "loss": 0.7018, + "step": 790 + }, + { + "epoch": 0.12, + "grad_norm": 3.5911043318535123, + "learning_rate": 1.5725646123260437e-06, + "loss": 0.6979, + "step": 791 + }, + { + "epoch": 0.12, + "grad_norm": 2.9172251752473777, + "learning_rate": 1.57455268389662e-06, + "loss": 0.6947, + "step": 792 + }, + { + "epoch": 0.12, + "grad_norm": 13.658620566041053, + "learning_rate": 1.5765407554671967e-06, + "loss": 0.7155, + "step": 793 + }, + { + "epoch": 0.12, + "grad_norm": 9.51799950136088, + "learning_rate": 1.5785288270377733e-06, + "loss": 0.7077, + "step": 794 + }, + { + "epoch": 0.12, + "grad_norm": 9.1275633695255, + "learning_rate": 1.58051689860835e-06, + "loss": 0.7031, + "step": 795 + }, + { + "epoch": 0.12, + "grad_norm": 3.0903253384747473, + "learning_rate": 1.5825049701789263e-06, + "loss": 0.6901, + "step": 796 + }, + { + "epoch": 0.12, + "grad_norm": 2.1670889986735915, + "learning_rate": 1.584493041749503e-06, + "loss": 0.696, + "step": 797 + }, + { + "epoch": 0.12, + "grad_norm": 6.243664722168212, + "learning_rate": 1.5864811133200795e-06, + "loss": 0.6966, + "step": 798 + }, + { + "epoch": 0.12, + "grad_norm": 3.8149891677560364, + "learning_rate": 1.588469184890656e-06, + "loss": 0.6868, + "step": 799 + }, + { + "epoch": 0.12, + "grad_norm": 13.361473306979857, + "learning_rate": 1.5904572564612325e-06, + "loss": 0.7109, + "step": 800 + }, + { + "epoch": 0.12, + "grad_norm": 4.7540764958430755, + "learning_rate": 1.5924453280318091e-06, + "loss": 0.7064, + "step": 801 + }, + { + "epoch": 0.12, + "grad_norm": 5.040970476980475, + "learning_rate": 1.5944333996023855e-06, + "loss": 0.6842, + "step": 802 + }, + { + "epoch": 0.12, + "grad_norm": 5.697334296091878, + "learning_rate": 1.5964214711729622e-06, + "loss": 0.6914, + "step": 803 + }, + { + "epoch": 0.12, + "grad_norm": 7.822933270695639, + "learning_rate": 1.5984095427435388e-06, + "loss": 0.6953, + "step": 804 + }, + { + "epoch": 0.12, + "grad_norm": 3.12098867089369, + "learning_rate": 1.6003976143141152e-06, + "loss": 0.6849, + "step": 805 + }, + { + "epoch": 0.12, + "grad_norm": 3.04958074298262, + "learning_rate": 1.6023856858846918e-06, + "loss": 0.694, + "step": 806 + }, + { + "epoch": 0.12, + "grad_norm": 9.512802393091038, + "learning_rate": 1.6043737574552682e-06, + "loss": 0.6973, + "step": 807 + }, + { + "epoch": 0.12, + "grad_norm": 9.244398525200364, + "learning_rate": 1.606361829025845e-06, + "loss": 0.7057, + "step": 808 + }, + { + "epoch": 0.12, + "grad_norm": 2.1680227943799606, + "learning_rate": 1.6083499005964214e-06, + "loss": 0.6908, + "step": 809 + }, + { + "epoch": 0.12, + "grad_norm": 9.571008011877753, + "learning_rate": 1.6103379721669978e-06, + "loss": 0.6979, + "step": 810 + }, + { + "epoch": 0.12, + "grad_norm": 6.487897550527417, + "learning_rate": 1.6123260437375746e-06, + "loss": 0.696, + "step": 811 + }, + { + "epoch": 0.12, + "grad_norm": 2.687701256891149, + "learning_rate": 1.614314115308151e-06, + "loss": 0.6927, + "step": 812 + }, + { + "epoch": 0.12, + "grad_norm": 4.203078280589674, + "learning_rate": 1.6163021868787276e-06, + "loss": 0.6953, + "step": 813 + }, + { + "epoch": 0.12, + "grad_norm": 6.111125466210192, + "learning_rate": 1.618290258449304e-06, + "loss": 0.6921, + "step": 814 + }, + { + "epoch": 0.12, + "grad_norm": 6.0845965452657556, + "learning_rate": 1.6202783300198806e-06, + "loss": 0.694, + "step": 815 + }, + { + "epoch": 0.12, + "grad_norm": 3.789169743675614, + "learning_rate": 1.6222664015904572e-06, + "loss": 0.6914, + "step": 816 + }, + { + "epoch": 0.12, + "grad_norm": 10.168326435341886, + "learning_rate": 1.6242544731610336e-06, + "loss": 0.6992, + "step": 817 + }, + { + "epoch": 0.12, + "grad_norm": 11.416312026385379, + "learning_rate": 1.6262425447316103e-06, + "loss": 0.6973, + "step": 818 + }, + { + "epoch": 0.12, + "grad_norm": 7.2006073183154085, + "learning_rate": 1.6282306163021869e-06, + "loss": 0.696, + "step": 819 + }, + { + "epoch": 0.12, + "grad_norm": 3.694974248897912, + "learning_rate": 1.6302186878727633e-06, + "loss": 0.6862, + "step": 820 + }, + { + "epoch": 0.12, + "grad_norm": 2.2342980818642535, + "learning_rate": 1.6322067594433399e-06, + "loss": 0.6895, + "step": 821 + }, + { + "epoch": 0.12, + "grad_norm": 2.016232699283681, + "learning_rate": 1.6341948310139165e-06, + "loss": 0.6934, + "step": 822 + }, + { + "epoch": 0.12, + "grad_norm": 12.480752754489313, + "learning_rate": 1.6361829025844929e-06, + "loss": 0.707, + "step": 823 + }, + { + "epoch": 0.12, + "grad_norm": 4.888109401924429, + "learning_rate": 1.6381709741550695e-06, + "loss": 0.6855, + "step": 824 + }, + { + "epoch": 0.12, + "grad_norm": 5.102245392615632, + "learning_rate": 1.640159045725646e-06, + "loss": 0.6921, + "step": 825 + }, + { + "epoch": 0.12, + "grad_norm": 1.815973491929639, + "learning_rate": 1.6421471172962227e-06, + "loss": 0.6947, + "step": 826 + }, + { + "epoch": 0.12, + "grad_norm": 7.241995965827673, + "learning_rate": 1.6441351888667991e-06, + "loss": 0.6947, + "step": 827 + }, + { + "epoch": 0.12, + "grad_norm": 9.905531560431864, + "learning_rate": 1.6461232604373755e-06, + "loss": 0.6921, + "step": 828 + }, + { + "epoch": 0.12, + "grad_norm": 7.477817967263282, + "learning_rate": 1.6481113320079523e-06, + "loss": 0.6973, + "step": 829 + }, + { + "epoch": 0.12, + "grad_norm": 3.4678022331061817, + "learning_rate": 1.6500994035785287e-06, + "loss": 0.6947, + "step": 830 + }, + { + "epoch": 0.12, + "grad_norm": 2.42406347542297, + "learning_rate": 1.6520874751491053e-06, + "loss": 0.6908, + "step": 831 + }, + { + "epoch": 0.12, + "grad_norm": 4.268178733701501, + "learning_rate": 1.654075546719682e-06, + "loss": 0.6875, + "step": 832 + }, + { + "epoch": 0.12, + "grad_norm": 2.6479181928502165, + "learning_rate": 1.6560636182902584e-06, + "loss": 0.696, + "step": 833 + }, + { + "epoch": 0.12, + "grad_norm": 5.95319217918017, + "learning_rate": 1.658051689860835e-06, + "loss": 0.6868, + "step": 834 + }, + { + "epoch": 0.12, + "grad_norm": 2.7550869964533624, + "learning_rate": 1.6600397614314114e-06, + "loss": 0.6979, + "step": 835 + }, + { + "epoch": 0.12, + "grad_norm": 6.201782050412735, + "learning_rate": 1.662027833001988e-06, + "loss": 0.6986, + "step": 836 + }, + { + "epoch": 0.12, + "grad_norm": 14.098151704629643, + "learning_rate": 1.6640159045725646e-06, + "loss": 0.7129, + "step": 837 + }, + { + "epoch": 0.12, + "grad_norm": 7.8408204257313665, + "learning_rate": 1.666003976143141e-06, + "loss": 0.6992, + "step": 838 + }, + { + "epoch": 0.13, + "grad_norm": 4.288241059253356, + "learning_rate": 1.6679920477137178e-06, + "loss": 0.6823, + "step": 839 + }, + { + "epoch": 0.13, + "grad_norm": 2.2146650115410673, + "learning_rate": 1.6699801192842942e-06, + "loss": 0.6908, + "step": 840 + }, + { + "epoch": 0.13, + "grad_norm": 7.166939611881107, + "learning_rate": 1.6719681908548706e-06, + "loss": 0.6888, + "step": 841 + }, + { + "epoch": 0.13, + "grad_norm": 4.183287797176236, + "learning_rate": 1.6739562624254472e-06, + "loss": 0.6921, + "step": 842 + }, + { + "epoch": 0.13, + "grad_norm": 7.112974310600647, + "learning_rate": 1.6759443339960238e-06, + "loss": 0.6953, + "step": 843 + }, + { + "epoch": 0.13, + "grad_norm": 3.7483603402552927, + "learning_rate": 1.6779324055666004e-06, + "loss": 0.694, + "step": 844 + }, + { + "epoch": 0.13, + "grad_norm": 4.364137008373639, + "learning_rate": 1.6799204771371768e-06, + "loss": 0.6992, + "step": 845 + }, + { + "epoch": 0.13, + "grad_norm": 6.796934809294173, + "learning_rate": 1.6819085487077532e-06, + "loss": 0.6914, + "step": 846 + }, + { + "epoch": 0.13, + "grad_norm": 7.044946171310735, + "learning_rate": 1.68389662027833e-06, + "loss": 0.6953, + "step": 847 + }, + { + "epoch": 0.13, + "grad_norm": 3.4287484493844365, + "learning_rate": 1.6858846918489065e-06, + "loss": 0.6901, + "step": 848 + }, + { + "epoch": 0.13, + "grad_norm": 5.8512671533024125, + "learning_rate": 1.6878727634194829e-06, + "loss": 0.6973, + "step": 849 + }, + { + "epoch": 0.13, + "grad_norm": 3.555857703472488, + "learning_rate": 1.6898608349900597e-06, + "loss": 0.6868, + "step": 850 + }, + { + "epoch": 0.13, + "grad_norm": 4.636012271866163, + "learning_rate": 1.691848906560636e-06, + "loss": 0.6882, + "step": 851 + }, + { + "epoch": 0.13, + "grad_norm": 11.14776603769053, + "learning_rate": 1.6938369781312127e-06, + "loss": 0.694, + "step": 852 + }, + { + "epoch": 0.13, + "grad_norm": 6.350572689704162, + "learning_rate": 1.6958250497017893e-06, + "loss": 0.6947, + "step": 853 + }, + { + "epoch": 0.13, + "grad_norm": 12.555264810453602, + "learning_rate": 1.6978131212723657e-06, + "loss": 0.694, + "step": 854 + }, + { + "epoch": 0.13, + "grad_norm": 5.376179555087542, + "learning_rate": 1.6998011928429423e-06, + "loss": 0.6973, + "step": 855 + }, + { + "epoch": 0.13, + "grad_norm": 1.4504591297740483, + "learning_rate": 1.7017892644135187e-06, + "loss": 0.6927, + "step": 856 + }, + { + "epoch": 0.13, + "grad_norm": 1.6490423543718726, + "learning_rate": 1.7037773359840955e-06, + "loss": 0.6914, + "step": 857 + }, + { + "epoch": 0.13, + "grad_norm": 9.958504608183791, + "learning_rate": 1.705765407554672e-06, + "loss": 0.7038, + "step": 858 + }, + { + "epoch": 0.13, + "grad_norm": 2.4465968932213764, + "learning_rate": 1.7077534791252483e-06, + "loss": 0.6875, + "step": 859 + }, + { + "epoch": 0.13, + "grad_norm": 5.923110782232334, + "learning_rate": 1.7097415506958251e-06, + "loss": 0.6901, + "step": 860 + }, + { + "epoch": 0.13, + "grad_norm": 7.2555047403152715, + "learning_rate": 1.7117296222664015e-06, + "loss": 0.7103, + "step": 861 + }, + { + "epoch": 0.13, + "grad_norm": 3.344862543399948, + "learning_rate": 1.713717693836978e-06, + "loss": 0.6953, + "step": 862 + }, + { + "epoch": 0.13, + "grad_norm": 2.240120278013498, + "learning_rate": 1.7157057654075546e-06, + "loss": 0.6921, + "step": 863 + }, + { + "epoch": 0.13, + "grad_norm": 6.323403567178968, + "learning_rate": 1.7176938369781312e-06, + "loss": 0.6934, + "step": 864 + }, + { + "epoch": 0.13, + "grad_norm": 10.052378316269431, + "learning_rate": 1.7196819085487078e-06, + "loss": 0.6882, + "step": 865 + }, + { + "epoch": 0.13, + "grad_norm": 2.992688593094907, + "learning_rate": 1.7216699801192842e-06, + "loss": 0.6934, + "step": 866 + }, + { + "epoch": 0.13, + "grad_norm": 4.004078207652702, + "learning_rate": 1.7236580516898606e-06, + "loss": 0.6953, + "step": 867 + }, + { + "epoch": 0.13, + "grad_norm": 2.401299343905316, + "learning_rate": 1.7256461232604374e-06, + "loss": 0.6966, + "step": 868 + }, + { + "epoch": 0.13, + "grad_norm": 2.7520273499640306, + "learning_rate": 1.7276341948310138e-06, + "loss": 0.6914, + "step": 869 + }, + { + "epoch": 0.13, + "grad_norm": 4.036626063259436, + "learning_rate": 1.7296222664015904e-06, + "loss": 0.6947, + "step": 870 + }, + { + "epoch": 0.13, + "grad_norm": 1.7441984781279734, + "learning_rate": 1.731610337972167e-06, + "loss": 0.6979, + "step": 871 + }, + { + "epoch": 0.13, + "grad_norm": 10.30973965034451, + "learning_rate": 1.7335984095427434e-06, + "loss": 0.7077, + "step": 872 + }, + { + "epoch": 0.13, + "grad_norm": 2.486208730388135, + "learning_rate": 1.73558648111332e-06, + "loss": 0.6855, + "step": 873 + }, + { + "epoch": 0.13, + "grad_norm": 4.921968898442971, + "learning_rate": 1.7375745526838964e-06, + "loss": 0.6986, + "step": 874 + }, + { + "epoch": 0.13, + "grad_norm": 4.451367604930032, + "learning_rate": 1.739562624254473e-06, + "loss": 0.6908, + "step": 875 + }, + { + "epoch": 0.13, + "grad_norm": 12.27837154708852, + "learning_rate": 1.7415506958250496e-06, + "loss": 0.6888, + "step": 876 + }, + { + "epoch": 0.13, + "grad_norm": 17.84877394856048, + "learning_rate": 1.743538767395626e-06, + "loss": 0.7148, + "step": 877 + }, + { + "epoch": 0.13, + "grad_norm": 5.2508256863175795, + "learning_rate": 1.7455268389662029e-06, + "loss": 0.7025, + "step": 878 + }, + { + "epoch": 0.13, + "grad_norm": 1.933498956363293, + "learning_rate": 1.7475149105367793e-06, + "loss": 0.6953, + "step": 879 + }, + { + "epoch": 0.13, + "grad_norm": 5.585763802745018, + "learning_rate": 1.7495029821073557e-06, + "loss": 0.6888, + "step": 880 + }, + { + "epoch": 0.13, + "grad_norm": 10.650566717731353, + "learning_rate": 1.7514910536779323e-06, + "loss": 0.6992, + "step": 881 + }, + { + "epoch": 0.13, + "grad_norm": 4.547887536753185, + "learning_rate": 1.7534791252485089e-06, + "loss": 0.6882, + "step": 882 + }, + { + "epoch": 0.13, + "grad_norm": 1.1463896645877167, + "learning_rate": 1.7554671968190855e-06, + "loss": 0.6966, + "step": 883 + }, + { + "epoch": 0.13, + "grad_norm": 6.145493115407018, + "learning_rate": 1.7574552683896619e-06, + "loss": 0.6927, + "step": 884 + }, + { + "epoch": 0.13, + "grad_norm": 11.704999594535566, + "learning_rate": 1.7594433399602385e-06, + "loss": 0.7025, + "step": 885 + }, + { + "epoch": 0.13, + "grad_norm": 4.443099532638297, + "learning_rate": 1.7614314115308151e-06, + "loss": 0.6953, + "step": 886 + }, + { + "epoch": 0.13, + "grad_norm": 2.801918790388745, + "learning_rate": 1.7634194831013915e-06, + "loss": 0.7005, + "step": 887 + }, + { + "epoch": 0.13, + "grad_norm": 2.7597522323247743, + "learning_rate": 1.765407554671968e-06, + "loss": 0.6914, + "step": 888 + }, + { + "epoch": 0.13, + "grad_norm": 1.7854203661462174, + "learning_rate": 1.7673956262425447e-06, + "loss": 0.6986, + "step": 889 + }, + { + "epoch": 0.13, + "grad_norm": 5.740268298448499, + "learning_rate": 1.7693836978131211e-06, + "loss": 0.6888, + "step": 890 + }, + { + "epoch": 0.13, + "grad_norm": 10.142980007679268, + "learning_rate": 1.7713717693836977e-06, + "loss": 0.6999, + "step": 891 + }, + { + "epoch": 0.13, + "grad_norm": 1.3643598292335815, + "learning_rate": 1.7733598409542744e-06, + "loss": 0.6927, + "step": 892 + }, + { + "epoch": 0.13, + "grad_norm": 6.759985147997545, + "learning_rate": 1.7753479125248508e-06, + "loss": 0.7031, + "step": 893 + }, + { + "epoch": 0.13, + "grad_norm": 4.368191892480012, + "learning_rate": 1.7773359840954274e-06, + "loss": 0.6855, + "step": 894 + }, + { + "epoch": 0.13, + "grad_norm": 15.134420868032946, + "learning_rate": 1.7793240556660038e-06, + "loss": 0.6836, + "step": 895 + }, + { + "epoch": 0.13, + "grad_norm": 5.419566128256565, + "learning_rate": 1.7813121272365806e-06, + "loss": 0.6953, + "step": 896 + }, + { + "epoch": 0.13, + "grad_norm": 9.023914620215725, + "learning_rate": 1.783300198807157e-06, + "loss": 0.707, + "step": 897 + }, + { + "epoch": 0.13, + "grad_norm": 1.4512502150893682, + "learning_rate": 1.7852882703777334e-06, + "loss": 0.6836, + "step": 898 + }, + { + "epoch": 0.13, + "grad_norm": 19.233359757096487, + "learning_rate": 1.7872763419483102e-06, + "loss": 0.7181, + "step": 899 + }, + { + "epoch": 0.13, + "grad_norm": 10.692481405932554, + "learning_rate": 1.7892644135188866e-06, + "loss": 0.7103, + "step": 900 + }, + { + "epoch": 0.13, + "grad_norm": 7.263017577929976, + "learning_rate": 1.7912524850894632e-06, + "loss": 0.6927, + "step": 901 + }, + { + "epoch": 0.13, + "grad_norm": 6.595389245834784, + "learning_rate": 1.7932405566600396e-06, + "loss": 0.6979, + "step": 902 + }, + { + "epoch": 0.13, + "grad_norm": 4.317935169854547, + "learning_rate": 1.7952286282306162e-06, + "loss": 0.6934, + "step": 903 + }, + { + "epoch": 0.13, + "grad_norm": 6.7791324390880465, + "learning_rate": 1.7972166998011928e-06, + "loss": 0.6986, + "step": 904 + }, + { + "epoch": 0.13, + "grad_norm": 13.888575927155742, + "learning_rate": 1.7992047713717692e-06, + "loss": 0.7096, + "step": 905 + }, + { + "epoch": 0.14, + "grad_norm": 10.685215470306302, + "learning_rate": 1.8011928429423458e-06, + "loss": 0.7031, + "step": 906 + }, + { + "epoch": 0.14, + "grad_norm": 3.113720243186466, + "learning_rate": 1.8031809145129225e-06, + "loss": 0.6901, + "step": 907 + }, + { + "epoch": 0.14, + "grad_norm": 3.377371280473697, + "learning_rate": 1.8051689860834989e-06, + "loss": 0.6966, + "step": 908 + }, + { + "epoch": 0.14, + "grad_norm": 6.664667620433667, + "learning_rate": 1.8071570576540755e-06, + "loss": 0.6966, + "step": 909 + }, + { + "epoch": 0.14, + "grad_norm": 1.7331593395876153, + "learning_rate": 1.809145129224652e-06, + "loss": 0.6927, + "step": 910 + }, + { + "epoch": 0.14, + "grad_norm": 6.53706180788123, + "learning_rate": 1.8111332007952285e-06, + "loss": 0.7038, + "step": 911 + }, + { + "epoch": 0.14, + "grad_norm": 1.9243559893004265, + "learning_rate": 1.813121272365805e-06, + "loss": 0.6921, + "step": 912 + }, + { + "epoch": 0.14, + "grad_norm": 2.5873251473390653, + "learning_rate": 1.8151093439363817e-06, + "loss": 0.694, + "step": 913 + }, + { + "epoch": 0.14, + "grad_norm": 4.688911650515929, + "learning_rate": 1.8170974155069583e-06, + "loss": 0.6979, + "step": 914 + }, + { + "epoch": 0.14, + "grad_norm": 5.297208287957993, + "learning_rate": 1.8190854870775347e-06, + "loss": 0.6901, + "step": 915 + }, + { + "epoch": 0.14, + "grad_norm": 15.505636881497116, + "learning_rate": 1.821073558648111e-06, + "loss": 0.7038, + "step": 916 + }, + { + "epoch": 0.14, + "grad_norm": 2.378347719476801, + "learning_rate": 1.823061630218688e-06, + "loss": 0.6934, + "step": 917 + }, + { + "epoch": 0.14, + "grad_norm": 6.937263683087334, + "learning_rate": 1.8250497017892643e-06, + "loss": 0.6953, + "step": 918 + }, + { + "epoch": 0.14, + "grad_norm": 2.3795973449589685, + "learning_rate": 1.8270377733598407e-06, + "loss": 0.6888, + "step": 919 + }, + { + "epoch": 0.14, + "grad_norm": 2.9109049499024344, + "learning_rate": 1.8290258449304175e-06, + "loss": 0.6953, + "step": 920 + }, + { + "epoch": 0.14, + "grad_norm": 5.387039622736997, + "learning_rate": 1.831013916500994e-06, + "loss": 0.6914, + "step": 921 + }, + { + "epoch": 0.14, + "grad_norm": 2.7285302974104395, + "learning_rate": 1.8330019880715706e-06, + "loss": 0.7005, + "step": 922 + }, + { + "epoch": 0.14, + "grad_norm": 2.9697974011415194, + "learning_rate": 1.834990059642147e-06, + "loss": 0.6934, + "step": 923 + }, + { + "epoch": 0.14, + "grad_norm": 6.209383703613921, + "learning_rate": 1.8369781312127236e-06, + "loss": 0.6934, + "step": 924 + }, + { + "epoch": 0.14, + "grad_norm": 10.480493816172714, + "learning_rate": 1.8389662027833002e-06, + "loss": 0.7005, + "step": 925 + }, + { + "epoch": 0.14, + "grad_norm": 3.56786775133432, + "learning_rate": 1.8409542743538766e-06, + "loss": 0.696, + "step": 926 + }, + { + "epoch": 0.14, + "grad_norm": 5.446553145302807, + "learning_rate": 1.8429423459244534e-06, + "loss": 0.6934, + "step": 927 + }, + { + "epoch": 0.14, + "grad_norm": 5.191238875811201, + "learning_rate": 1.8449304174950298e-06, + "loss": 0.6947, + "step": 928 + }, + { + "epoch": 0.14, + "grad_norm": 11.729379240651248, + "learning_rate": 1.8469184890656062e-06, + "loss": 0.6973, + "step": 929 + }, + { + "epoch": 0.14, + "grad_norm": 9.338064752072578, + "learning_rate": 1.8489065606361828e-06, + "loss": 0.6921, + "step": 930 + }, + { + "epoch": 0.14, + "grad_norm": 4.823985415617739, + "learning_rate": 1.8508946322067594e-06, + "loss": 0.6901, + "step": 931 + }, + { + "epoch": 0.14, + "grad_norm": 3.0129590782576803, + "learning_rate": 1.8528827037773358e-06, + "loss": 0.6914, + "step": 932 + }, + { + "epoch": 0.14, + "grad_norm": 5.815092061487661, + "learning_rate": 1.8548707753479124e-06, + "loss": 0.6914, + "step": 933 + }, + { + "epoch": 0.14, + "grad_norm": 6.107460013451897, + "learning_rate": 1.856858846918489e-06, + "loss": 0.6947, + "step": 934 + }, + { + "epoch": 0.14, + "grad_norm": 11.324410589756834, + "learning_rate": 1.8588469184890656e-06, + "loss": 0.7083, + "step": 935 + }, + { + "epoch": 0.14, + "grad_norm": 3.907670997978956, + "learning_rate": 1.860834990059642e-06, + "loss": 0.6999, + "step": 936 + }, + { + "epoch": 0.14, + "grad_norm": 6.107926851275585, + "learning_rate": 1.8628230616302184e-06, + "loss": 0.696, + "step": 937 + }, + { + "epoch": 0.14, + "grad_norm": 7.211790977100659, + "learning_rate": 1.8648111332007953e-06, + "loss": 0.7044, + "step": 938 + }, + { + "epoch": 0.14, + "grad_norm": 9.212009696072005, + "learning_rate": 1.8667992047713717e-06, + "loss": 0.6966, + "step": 939 + }, + { + "epoch": 0.14, + "grad_norm": 4.108176027758403, + "learning_rate": 1.8687872763419483e-06, + "loss": 0.6895, + "step": 940 + }, + { + "epoch": 0.14, + "grad_norm": 9.452140350553258, + "learning_rate": 1.8707753479125249e-06, + "loss": 0.6836, + "step": 941 + }, + { + "epoch": 0.14, + "grad_norm": 3.1429591141635713, + "learning_rate": 1.8727634194831013e-06, + "loss": 0.6855, + "step": 942 + }, + { + "epoch": 0.14, + "grad_norm": 26.96665132527698, + "learning_rate": 1.8747514910536779e-06, + "loss": 0.7324, + "step": 943 + }, + { + "epoch": 0.14, + "grad_norm": 1.7487982317642337, + "learning_rate": 1.8767395626242543e-06, + "loss": 0.6836, + "step": 944 + }, + { + "epoch": 0.14, + "grad_norm": 5.673817536790449, + "learning_rate": 1.878727634194831e-06, + "loss": 0.6973, + "step": 945 + }, + { + "epoch": 0.14, + "grad_norm": 11.138354063842728, + "learning_rate": 1.8807157057654075e-06, + "loss": 0.6999, + "step": 946 + }, + { + "epoch": 0.14, + "grad_norm": 4.400058315331106, + "learning_rate": 1.882703777335984e-06, + "loss": 0.694, + "step": 947 + }, + { + "epoch": 0.14, + "grad_norm": 3.939466441615692, + "learning_rate": 1.8846918489065607e-06, + "loss": 0.6979, + "step": 948 + }, + { + "epoch": 0.14, + "grad_norm": 4.454498584064968, + "learning_rate": 1.8866799204771371e-06, + "loss": 0.696, + "step": 949 + }, + { + "epoch": 0.14, + "grad_norm": 8.0852354280009, + "learning_rate": 1.8886679920477135e-06, + "loss": 0.7031, + "step": 950 + }, + { + "epoch": 0.14, + "grad_norm": 2.316087498116654, + "learning_rate": 1.8906560636182901e-06, + "loss": 0.6868, + "step": 951 + }, + { + "epoch": 0.14, + "grad_norm": 2.4427317234696493, + "learning_rate": 1.8926441351888668e-06, + "loss": 0.6803, + "step": 952 + }, + { + "epoch": 0.14, + "grad_norm": 3.32645785235573, + "learning_rate": 1.8946322067594434e-06, + "loss": 0.6908, + "step": 953 + }, + { + "epoch": 0.14, + "grad_norm": 14.28341041332783, + "learning_rate": 1.8966202783300198e-06, + "loss": 0.7129, + "step": 954 + }, + { + "epoch": 0.14, + "grad_norm": 7.053775899630727, + "learning_rate": 1.8986083499005962e-06, + "loss": 0.7018, + "step": 955 + }, + { + "epoch": 0.14, + "grad_norm": 7.654887654306456, + "learning_rate": 1.900596421471173e-06, + "loss": 0.6986, + "step": 956 + }, + { + "epoch": 0.14, + "grad_norm": 13.78805953649366, + "learning_rate": 1.9025844930417494e-06, + "loss": 0.7057, + "step": 957 + }, + { + "epoch": 0.14, + "grad_norm": 6.101018212950043, + "learning_rate": 1.9045725646123258e-06, + "loss": 0.696, + "step": 958 + }, + { + "epoch": 0.14, + "grad_norm": 4.510518061211453, + "learning_rate": 1.9065606361829026e-06, + "loss": 0.6875, + "step": 959 + }, + { + "epoch": 0.14, + "grad_norm": 1.5667045304070333, + "learning_rate": 1.908548707753479e-06, + "loss": 0.6934, + "step": 960 + }, + { + "epoch": 0.14, + "grad_norm": 10.668021883283828, + "learning_rate": 1.9105367793240556e-06, + "loss": 0.7096, + "step": 961 + }, + { + "epoch": 0.14, + "grad_norm": 4.995754625538203, + "learning_rate": 1.912524850894632e-06, + "loss": 0.7005, + "step": 962 + }, + { + "epoch": 0.14, + "grad_norm": 3.505631905080863, + "learning_rate": 1.9145129224652084e-06, + "loss": 0.694, + "step": 963 + }, + { + "epoch": 0.14, + "grad_norm": 13.484901594098195, + "learning_rate": 1.9165009940357852e-06, + "loss": 0.7233, + "step": 964 + }, + { + "epoch": 0.14, + "grad_norm": 8.698380885789234, + "learning_rate": 1.9184890656063616e-06, + "loss": 0.6999, + "step": 965 + }, + { + "epoch": 0.14, + "grad_norm": 6.866185858593128, + "learning_rate": 1.9204771371769385e-06, + "loss": 0.6999, + "step": 966 + }, + { + "epoch": 0.14, + "grad_norm": 1.8943103297600585, + "learning_rate": 1.922465208747515e-06, + "loss": 0.6927, + "step": 967 + }, + { + "epoch": 0.14, + "grad_norm": 2.359312904663794, + "learning_rate": 1.9244532803180912e-06, + "loss": 0.6934, + "step": 968 + }, + { + "epoch": 0.14, + "grad_norm": 5.780516317145175, + "learning_rate": 1.926441351888668e-06, + "loss": 0.6855, + "step": 969 + }, + { + "epoch": 0.14, + "grad_norm": 4.458153069959146, + "learning_rate": 1.9284294234592445e-06, + "loss": 0.6888, + "step": 970 + }, + { + "epoch": 0.14, + "grad_norm": 2.989755080349235, + "learning_rate": 1.9304174950298213e-06, + "loss": 0.6849, + "step": 971 + }, + { + "epoch": 0.14, + "grad_norm": 11.66016755641419, + "learning_rate": 1.9324055666003977e-06, + "loss": 0.7174, + "step": 972 + }, + { + "epoch": 0.15, + "grad_norm": 26.67117899392022, + "learning_rate": 1.934393638170974e-06, + "loss": 0.7493, + "step": 973 + }, + { + "epoch": 0.15, + "grad_norm": 4.489191144365526, + "learning_rate": 1.9363817097415505e-06, + "loss": 0.6875, + "step": 974 + }, + { + "epoch": 0.15, + "grad_norm": 4.807026818030657, + "learning_rate": 1.9383697813121273e-06, + "loss": 0.6953, + "step": 975 + }, + { + "epoch": 0.15, + "grad_norm": 13.632523198503545, + "learning_rate": 1.9403578528827037e-06, + "loss": 0.7109, + "step": 976 + }, + { + "epoch": 0.15, + "grad_norm": 4.548432584874475, + "learning_rate": 1.94234592445328e-06, + "loss": 0.6921, + "step": 977 + }, + { + "epoch": 0.15, + "grad_norm": 3.2076861200367723, + "learning_rate": 1.944333996023857e-06, + "loss": 0.6829, + "step": 978 + }, + { + "epoch": 0.15, + "grad_norm": 6.859934419987951, + "learning_rate": 1.9463220675944333e-06, + "loss": 0.696, + "step": 979 + }, + { + "epoch": 0.15, + "grad_norm": 9.023774474510033, + "learning_rate": 1.9483101391650097e-06, + "loss": 0.7051, + "step": 980 + }, + { + "epoch": 0.15, + "grad_norm": 7.610599491642174, + "learning_rate": 1.950298210735586e-06, + "loss": 0.7005, + "step": 981 + }, + { + "epoch": 0.15, + "grad_norm": 3.895048692446782, + "learning_rate": 1.952286282306163e-06, + "loss": 0.6882, + "step": 982 + }, + { + "epoch": 0.15, + "grad_norm": 7.454060064388314, + "learning_rate": 1.9542743538767393e-06, + "loss": 0.7031, + "step": 983 + }, + { + "epoch": 0.15, + "grad_norm": 8.82195238407596, + "learning_rate": 1.956262425447316e-06, + "loss": 0.7012, + "step": 984 + }, + { + "epoch": 0.15, + "grad_norm": 12.519786887660453, + "learning_rate": 1.9582504970178926e-06, + "loss": 0.7188, + "step": 985 + }, + { + "epoch": 0.15, + "grad_norm": 5.123442382374296, + "learning_rate": 1.960238568588469e-06, + "loss": 0.7012, + "step": 986 + }, + { + "epoch": 0.15, + "grad_norm": 5.867602290505366, + "learning_rate": 1.962226640159046e-06, + "loss": 0.6934, + "step": 987 + }, + { + "epoch": 0.15, + "grad_norm": 4.752153268937873, + "learning_rate": 1.964214711729622e-06, + "loss": 0.6816, + "step": 988 + }, + { + "epoch": 0.15, + "grad_norm": 9.484104747690235, + "learning_rate": 1.9662027833001986e-06, + "loss": 0.7005, + "step": 989 + }, + { + "epoch": 0.15, + "grad_norm": 2.0951532231199543, + "learning_rate": 1.9681908548707754e-06, + "loss": 0.6914, + "step": 990 + }, + { + "epoch": 0.15, + "grad_norm": 11.084750556866899, + "learning_rate": 1.970178926441352e-06, + "loss": 0.7148, + "step": 991 + }, + { + "epoch": 0.15, + "grad_norm": 5.394004964666189, + "learning_rate": 1.9721669980119286e-06, + "loss": 0.6966, + "step": 992 + }, + { + "epoch": 0.15, + "grad_norm": 11.25705084846513, + "learning_rate": 1.974155069582505e-06, + "loss": 0.7077, + "step": 993 + }, + { + "epoch": 0.15, + "grad_norm": 4.208039831303938, + "learning_rate": 1.9761431411530814e-06, + "loss": 0.6927, + "step": 994 + }, + { + "epoch": 0.15, + "grad_norm": 5.647153453978154, + "learning_rate": 1.978131212723658e-06, + "loss": 0.6953, + "step": 995 + }, + { + "epoch": 0.15, + "grad_norm": 4.4066691627509265, + "learning_rate": 1.9801192842942347e-06, + "loss": 0.6986, + "step": 996 + }, + { + "epoch": 0.15, + "grad_norm": 1.7467144898241598, + "learning_rate": 1.982107355864811e-06, + "loss": 0.6934, + "step": 997 + }, + { + "epoch": 0.15, + "grad_norm": 12.465321296554912, + "learning_rate": 1.9840954274353874e-06, + "loss": 0.7025, + "step": 998 + }, + { + "epoch": 0.15, + "grad_norm": 2.9380069431693756, + "learning_rate": 1.986083499005964e-06, + "loss": 0.6927, + "step": 999 + }, + { + "epoch": 0.15, + "grad_norm": 4.964899707486991, + "learning_rate": 1.9880715705765407e-06, + "loss": 0.6966, + "step": 1000 + }, + { + "epoch": 0.15, + "grad_norm": 11.702601292751394, + "learning_rate": 1.990059642147117e-06, + "loss": 0.7044, + "step": 1001 + }, + { + "epoch": 0.15, + "grad_norm": 7.033127393068792, + "learning_rate": 1.9920477137176935e-06, + "loss": 0.7005, + "step": 1002 + }, + { + "epoch": 0.15, + "grad_norm": 3.3626985941966243, + "learning_rate": 1.9940357852882703e-06, + "loss": 0.6934, + "step": 1003 + }, + { + "epoch": 0.15, + "grad_norm": 3.186095863539595, + "learning_rate": 1.9960238568588467e-06, + "loss": 0.6895, + "step": 1004 + }, + { + "epoch": 0.15, + "grad_norm": 5.225176223828071, + "learning_rate": 1.9980119284294235e-06, + "loss": 0.6934, + "step": 1005 + }, + { + "epoch": 0.15, + "grad_norm": 1.8292797066799271, + "learning_rate": 2e-06, + "loss": 0.6934, + "step": 1006 + }, + { + "epoch": 0.15, + "grad_norm": 4.06997557449564, + "learning_rate": 1.9999999953334557e-06, + "loss": 0.6999, + "step": 1007 + }, + { + "epoch": 0.15, + "grad_norm": 9.801547040411194, + "learning_rate": 1.999999981333823e-06, + "loss": 0.7012, + "step": 1008 + }, + { + "epoch": 0.15, + "grad_norm": 3.5371437373558954, + "learning_rate": 1.9999999580011024e-06, + "loss": 0.6914, + "step": 1009 + }, + { + "epoch": 0.15, + "grad_norm": 6.366782017762354, + "learning_rate": 1.9999999253352933e-06, + "loss": 0.696, + "step": 1010 + }, + { + "epoch": 0.15, + "grad_norm": 8.421677210542779, + "learning_rate": 1.9999998833363966e-06, + "loss": 0.6979, + "step": 1011 + }, + { + "epoch": 0.15, + "grad_norm": 5.800402821991273, + "learning_rate": 1.9999998320044128e-06, + "loss": 0.6875, + "step": 1012 + }, + { + "epoch": 0.15, + "grad_norm": 4.209937443209005, + "learning_rate": 1.9999997713393418e-06, + "loss": 0.6829, + "step": 1013 + }, + { + "epoch": 0.15, + "grad_norm": 9.431312669822333, + "learning_rate": 1.9999997013411844e-06, + "loss": 0.7083, + "step": 1014 + }, + { + "epoch": 0.15, + "grad_norm": 6.3487598605755435, + "learning_rate": 1.9999996220099416e-06, + "loss": 0.6992, + "step": 1015 + }, + { + "epoch": 0.15, + "grad_norm": 9.040252050926318, + "learning_rate": 1.9999995333456142e-06, + "loss": 0.7083, + "step": 1016 + }, + { + "epoch": 0.15, + "grad_norm": 13.569140077976554, + "learning_rate": 1.999999435348202e-06, + "loss": 0.7142, + "step": 1017 + }, + { + "epoch": 0.15, + "grad_norm": 5.4251909418188236, + "learning_rate": 1.9999993280177072e-06, + "loss": 0.6947, + "step": 1018 + }, + { + "epoch": 0.15, + "grad_norm": 0.986405164623672, + "learning_rate": 1.99999921135413e-06, + "loss": 0.6908, + "step": 1019 + }, + { + "epoch": 0.15, + "grad_norm": 8.982672591336462, + "learning_rate": 1.999999085357472e-06, + "loss": 0.6882, + "step": 1020 + }, + { + "epoch": 0.15, + "grad_norm": 2.741972701781011, + "learning_rate": 1.999998950027734e-06, + "loss": 0.6908, + "step": 1021 + }, + { + "epoch": 0.15, + "grad_norm": 2.683610942511333, + "learning_rate": 1.999998805364917e-06, + "loss": 0.6908, + "step": 1022 + }, + { + "epoch": 0.15, + "grad_norm": 2.089404390858266, + "learning_rate": 1.9999986513690232e-06, + "loss": 0.6953, + "step": 1023 + }, + { + "epoch": 0.15, + "grad_norm": 2.312837301763771, + "learning_rate": 1.999998488040053e-06, + "loss": 0.6875, + "step": 1024 + }, + { + "epoch": 0.15, + "grad_norm": 8.3967329862305, + "learning_rate": 1.999998315378009e-06, + "loss": 0.6921, + "step": 1025 + }, + { + "epoch": 0.15, + "grad_norm": 7.6675648220760895, + "learning_rate": 1.9999981333828917e-06, + "loss": 0.6934, + "step": 1026 + }, + { + "epoch": 0.15, + "grad_norm": 7.572276601879375, + "learning_rate": 1.999997942054704e-06, + "loss": 0.6973, + "step": 1027 + }, + { + "epoch": 0.15, + "grad_norm": 13.021227842198956, + "learning_rate": 1.999997741393447e-06, + "loss": 0.7031, + "step": 1028 + }, + { + "epoch": 0.15, + "grad_norm": 2.7073995141668146, + "learning_rate": 1.9999975313991223e-06, + "loss": 0.6855, + "step": 1029 + }, + { + "epoch": 0.15, + "grad_norm": 3.7707845282187926, + "learning_rate": 1.9999973120717326e-06, + "loss": 0.6992, + "step": 1030 + }, + { + "epoch": 0.15, + "grad_norm": 7.712513347887992, + "learning_rate": 1.999997083411279e-06, + "loss": 0.7077, + "step": 1031 + }, + { + "epoch": 0.15, + "grad_norm": 2.6898046425554734, + "learning_rate": 1.999996845417765e-06, + "loss": 0.6947, + "step": 1032 + }, + { + "epoch": 0.15, + "grad_norm": 4.472729570775615, + "learning_rate": 1.9999965980911916e-06, + "loss": 0.6953, + "step": 1033 + }, + { + "epoch": 0.15, + "grad_norm": 9.119011581176359, + "learning_rate": 1.9999963414315613e-06, + "loss": 0.7018, + "step": 1034 + }, + { + "epoch": 0.15, + "grad_norm": 7.342953698051504, + "learning_rate": 1.9999960754388767e-06, + "loss": 0.6986, + "step": 1035 + }, + { + "epoch": 0.15, + "grad_norm": 4.373106531205004, + "learning_rate": 1.9999958001131407e-06, + "loss": 0.6908, + "step": 1036 + }, + { + "epoch": 0.15, + "grad_norm": 5.95118280892305, + "learning_rate": 1.999995515454355e-06, + "loss": 0.6855, + "step": 1037 + }, + { + "epoch": 0.15, + "grad_norm": 5.828028171833577, + "learning_rate": 1.999995221462523e-06, + "loss": 0.6797, + "step": 1038 + }, + { + "epoch": 0.15, + "grad_norm": 1.8056218497854815, + "learning_rate": 1.9999949181376474e-06, + "loss": 0.6901, + "step": 1039 + }, + { + "epoch": 0.16, + "grad_norm": 7.676499712737745, + "learning_rate": 1.9999946054797304e-06, + "loss": 0.6999, + "step": 1040 + }, + { + "epoch": 0.16, + "grad_norm": 3.1378962927328473, + "learning_rate": 1.999994283488775e-06, + "loss": 0.6979, + "step": 1041 + }, + { + "epoch": 0.16, + "grad_norm": 7.486372586674187, + "learning_rate": 1.999993952164785e-06, + "loss": 0.6908, + "step": 1042 + }, + { + "epoch": 0.16, + "grad_norm": 5.550477579060876, + "learning_rate": 1.999993611507763e-06, + "loss": 0.709, + "step": 1043 + }, + { + "epoch": 0.16, + "grad_norm": 2.115908029392567, + "learning_rate": 1.9999932615177124e-06, + "loss": 0.696, + "step": 1044 + }, + { + "epoch": 0.16, + "grad_norm": 11.49215764946554, + "learning_rate": 1.9999929021946357e-06, + "loss": 0.6966, + "step": 1045 + }, + { + "epoch": 0.16, + "grad_norm": 1.7341376696346615, + "learning_rate": 1.999992533538537e-06, + "loss": 0.6855, + "step": 1046 + }, + { + "epoch": 0.16, + "grad_norm": 9.702693218827038, + "learning_rate": 1.99999215554942e-06, + "loss": 0.7025, + "step": 1047 + }, + { + "epoch": 0.16, + "grad_norm": 2.1422197661104967, + "learning_rate": 1.999991768227287e-06, + "loss": 0.6979, + "step": 1048 + }, + { + "epoch": 0.16, + "grad_norm": 11.599565008256906, + "learning_rate": 1.999991371572143e-06, + "loss": 0.6849, + "step": 1049 + }, + { + "epoch": 0.16, + "grad_norm": 3.131311747800264, + "learning_rate": 1.9999909655839907e-06, + "loss": 0.6973, + "step": 1050 + }, + { + "epoch": 0.16, + "grad_norm": 8.460783430822108, + "learning_rate": 1.999990550262835e-06, + "loss": 0.6875, + "step": 1051 + }, + { + "epoch": 0.16, + "grad_norm": 2.1547616199794746, + "learning_rate": 1.9999901256086783e-06, + "loss": 0.7044, + "step": 1052 + }, + { + "epoch": 0.16, + "grad_norm": 3.0733281893798976, + "learning_rate": 1.9999896916215256e-06, + "loss": 0.6947, + "step": 1053 + }, + { + "epoch": 0.16, + "grad_norm": 1.0330888037459063, + "learning_rate": 1.9999892483013805e-06, + "loss": 0.6849, + "step": 1054 + }, + { + "epoch": 0.16, + "grad_norm": 1.207153915036544, + "learning_rate": 1.9999887956482475e-06, + "loss": 0.6921, + "step": 1055 + }, + { + "epoch": 0.16, + "grad_norm": 9.936306034891713, + "learning_rate": 1.9999883336621306e-06, + "loss": 0.7057, + "step": 1056 + }, + { + "epoch": 0.16, + "grad_norm": 3.3574005347310862, + "learning_rate": 1.999987862343034e-06, + "loss": 0.6914, + "step": 1057 + }, + { + "epoch": 0.16, + "grad_norm": 1.6197703033041064, + "learning_rate": 1.9999873816909622e-06, + "loss": 0.6921, + "step": 1058 + }, + { + "epoch": 0.16, + "grad_norm": 6.946024696556024, + "learning_rate": 1.99998689170592e-06, + "loss": 0.6973, + "step": 1059 + }, + { + "epoch": 0.16, + "grad_norm": 5.125268425717447, + "learning_rate": 1.9999863923879115e-06, + "loss": 0.6829, + "step": 1060 + }, + { + "epoch": 0.16, + "grad_norm": 10.599275111520482, + "learning_rate": 1.999985883736942e-06, + "loss": 0.6849, + "step": 1061 + }, + { + "epoch": 0.16, + "grad_norm": 3.340330018109908, + "learning_rate": 1.9999853657530153e-06, + "loss": 0.6862, + "step": 1062 + }, + { + "epoch": 0.16, + "grad_norm": 3.190436644698012, + "learning_rate": 1.999984838436137e-06, + "loss": 0.6973, + "step": 1063 + }, + { + "epoch": 0.16, + "grad_norm": 2.3648404494239506, + "learning_rate": 1.9999843017863117e-06, + "loss": 0.6868, + "step": 1064 + }, + { + "epoch": 0.16, + "grad_norm": 1.8906334931854407, + "learning_rate": 1.9999837558035445e-06, + "loss": 0.6914, + "step": 1065 + }, + { + "epoch": 0.16, + "grad_norm": 5.960138270427374, + "learning_rate": 1.9999832004878405e-06, + "loss": 0.6771, + "step": 1066 + }, + { + "epoch": 0.16, + "grad_norm": 1.6950856043915379, + "learning_rate": 1.999982635839205e-06, + "loss": 0.6914, + "step": 1067 + }, + { + "epoch": 0.16, + "grad_norm": 2.4288420811769913, + "learning_rate": 1.999982061857643e-06, + "loss": 0.6914, + "step": 1068 + }, + { + "epoch": 0.16, + "grad_norm": 10.3267676124202, + "learning_rate": 1.9999814785431597e-06, + "loss": 0.7057, + "step": 1069 + }, + { + "epoch": 0.16, + "grad_norm": 9.997122500887164, + "learning_rate": 1.999980885895761e-06, + "loss": 0.7077, + "step": 1070 + }, + { + "epoch": 0.16, + "grad_norm": 7.355467509040908, + "learning_rate": 1.999980283915452e-06, + "loss": 0.6953, + "step": 1071 + }, + { + "epoch": 0.16, + "grad_norm": 10.468938521766761, + "learning_rate": 1.9999796726022394e-06, + "loss": 0.7135, + "step": 1072 + }, + { + "epoch": 0.16, + "grad_norm": 5.092185102579719, + "learning_rate": 1.9999790519561277e-06, + "loss": 0.7057, + "step": 1073 + }, + { + "epoch": 0.16, + "grad_norm": 9.65412007035965, + "learning_rate": 1.9999784219771227e-06, + "loss": 0.6953, + "step": 1074 + }, + { + "epoch": 0.16, + "grad_norm": 4.355596414639488, + "learning_rate": 1.999977782665231e-06, + "loss": 0.6921, + "step": 1075 + }, + { + "epoch": 0.16, + "grad_norm": 21.166281625965155, + "learning_rate": 1.999977134020458e-06, + "loss": 0.7298, + "step": 1076 + }, + { + "epoch": 0.16, + "grad_norm": 2.461406970761206, + "learning_rate": 1.99997647604281e-06, + "loss": 0.6908, + "step": 1077 + }, + { + "epoch": 0.16, + "grad_norm": 12.713121176123474, + "learning_rate": 1.9999758087322933e-06, + "loss": 0.7214, + "step": 1078 + }, + { + "epoch": 0.16, + "grad_norm": 13.33504300089685, + "learning_rate": 1.999975132088914e-06, + "loss": 0.7181, + "step": 1079 + }, + { + "epoch": 0.16, + "grad_norm": 6.85947937926659, + "learning_rate": 1.999974446112678e-06, + "loss": 0.6979, + "step": 1080 + }, + { + "epoch": 0.16, + "grad_norm": 2.244352018271897, + "learning_rate": 1.999973750803592e-06, + "loss": 0.6868, + "step": 1081 + }, + { + "epoch": 0.16, + "grad_norm": 9.763459526534081, + "learning_rate": 1.999973046161663e-06, + "loss": 0.7038, + "step": 1082 + }, + { + "epoch": 0.16, + "grad_norm": 15.485122607292187, + "learning_rate": 1.999972332186897e-06, + "loss": 0.7051, + "step": 1083 + }, + { + "epoch": 0.16, + "grad_norm": 3.207732589726451, + "learning_rate": 1.999971608879301e-06, + "loss": 0.6875, + "step": 1084 + }, + { + "epoch": 0.16, + "grad_norm": 13.554304353470423, + "learning_rate": 1.9999708762388812e-06, + "loss": 0.7031, + "step": 1085 + }, + { + "epoch": 0.16, + "grad_norm": 3.2008074637578723, + "learning_rate": 1.9999701342656446e-06, + "loss": 0.6992, + "step": 1086 + }, + { + "epoch": 0.16, + "grad_norm": 3.565145568102359, + "learning_rate": 1.9999693829595986e-06, + "loss": 0.6953, + "step": 1087 + }, + { + "epoch": 0.16, + "grad_norm": 2.929480238599367, + "learning_rate": 1.99996862232075e-06, + "loss": 0.6947, + "step": 1088 + }, + { + "epoch": 0.16, + "grad_norm": 2.4340590630972843, + "learning_rate": 1.9999678523491057e-06, + "loss": 0.6908, + "step": 1089 + }, + { + "epoch": 0.16, + "grad_norm": 1.8052291658461228, + "learning_rate": 1.999967073044673e-06, + "loss": 0.681, + "step": 1090 + }, + { + "epoch": 0.16, + "grad_norm": 4.176593627051166, + "learning_rate": 1.9999662844074593e-06, + "loss": 0.6888, + "step": 1091 + }, + { + "epoch": 0.16, + "grad_norm": 1.3341003354068368, + "learning_rate": 1.9999654864374716e-06, + "loss": 0.6934, + "step": 1092 + }, + { + "epoch": 0.16, + "grad_norm": 4.153145981076235, + "learning_rate": 1.9999646791347174e-06, + "loss": 0.6947, + "step": 1093 + }, + { + "epoch": 0.16, + "grad_norm": 3.3625530243765125, + "learning_rate": 1.999963862499205e-06, + "loss": 0.6947, + "step": 1094 + }, + { + "epoch": 0.16, + "grad_norm": 3.237755900420671, + "learning_rate": 1.999963036530941e-06, + "loss": 0.6849, + "step": 1095 + }, + { + "epoch": 0.16, + "grad_norm": 6.879942023361139, + "learning_rate": 1.9999622012299338e-06, + "loss": 0.694, + "step": 1096 + }, + { + "epoch": 0.16, + "grad_norm": 1.9494018016163188, + "learning_rate": 1.999961356596191e-06, + "loss": 0.6921, + "step": 1097 + }, + { + "epoch": 0.16, + "grad_norm": 7.105014001298896, + "learning_rate": 1.9999605026297203e-06, + "loss": 0.6888, + "step": 1098 + }, + { + "epoch": 0.16, + "grad_norm": 3.0583132346450617, + "learning_rate": 1.9999596393305298e-06, + "loss": 0.6992, + "step": 1099 + }, + { + "epoch": 0.16, + "grad_norm": 3.1082907500851253, + "learning_rate": 1.9999587666986273e-06, + "loss": 0.6921, + "step": 1100 + }, + { + "epoch": 0.16, + "grad_norm": 2.592443649518697, + "learning_rate": 1.9999578847340214e-06, + "loss": 0.6966, + "step": 1101 + }, + { + "epoch": 0.16, + "grad_norm": 10.293147857952295, + "learning_rate": 1.99995699343672e-06, + "loss": 0.6953, + "step": 1102 + }, + { + "epoch": 0.16, + "grad_norm": 3.073635242636787, + "learning_rate": 1.999956092806732e-06, + "loss": 0.6947, + "step": 1103 + }, + { + "epoch": 0.16, + "grad_norm": 4.273002994714191, + "learning_rate": 1.999955182844065e-06, + "loss": 0.6927, + "step": 1104 + }, + { + "epoch": 0.16, + "grad_norm": 6.120205948984377, + "learning_rate": 1.999954263548728e-06, + "loss": 0.6882, + "step": 1105 + }, + { + "epoch": 0.16, + "grad_norm": 2.2828269188515153, + "learning_rate": 1.9999533349207293e-06, + "loss": 0.6986, + "step": 1106 + }, + { + "epoch": 0.17, + "grad_norm": 2.6669140192811063, + "learning_rate": 1.9999523969600776e-06, + "loss": 0.694, + "step": 1107 + }, + { + "epoch": 0.17, + "grad_norm": 3.405711319125375, + "learning_rate": 1.9999514496667817e-06, + "loss": 0.6862, + "step": 1108 + }, + { + "epoch": 0.17, + "grad_norm": 8.881777949832793, + "learning_rate": 1.999950493040851e-06, + "loss": 0.6868, + "step": 1109 + }, + { + "epoch": 0.17, + "grad_norm": 5.41491955023962, + "learning_rate": 1.9999495270822934e-06, + "loss": 0.6908, + "step": 1110 + }, + { + "epoch": 0.17, + "grad_norm": 5.891205228384799, + "learning_rate": 1.9999485517911183e-06, + "loss": 0.6947, + "step": 1111 + }, + { + "epoch": 0.17, + "grad_norm": 3.03462738798572, + "learning_rate": 1.9999475671673356e-06, + "loss": 0.6927, + "step": 1112 + }, + { + "epoch": 0.17, + "grad_norm": 1.113472105657358, + "learning_rate": 1.999946573210953e-06, + "loss": 0.6895, + "step": 1113 + }, + { + "epoch": 0.17, + "grad_norm": 6.240797708231301, + "learning_rate": 1.9999455699219815e-06, + "loss": 0.6895, + "step": 1114 + }, + { + "epoch": 0.17, + "grad_norm": 0.769018044821202, + "learning_rate": 1.999944557300429e-06, + "loss": 0.6908, + "step": 1115 + }, + { + "epoch": 0.17, + "grad_norm": 6.28548763203119, + "learning_rate": 1.9999435353463058e-06, + "loss": 0.6927, + "step": 1116 + }, + { + "epoch": 0.17, + "grad_norm": 13.5960387885094, + "learning_rate": 1.999942504059621e-06, + "loss": 0.7012, + "step": 1117 + }, + { + "epoch": 0.17, + "grad_norm": 2.9639005003594, + "learning_rate": 1.9999414634403846e-06, + "loss": 0.6927, + "step": 1118 + }, + { + "epoch": 0.17, + "grad_norm": 6.268709356194492, + "learning_rate": 1.9999404134886055e-06, + "loss": 0.6947, + "step": 1119 + }, + { + "epoch": 0.17, + "grad_norm": 4.442407388119941, + "learning_rate": 1.9999393542042945e-06, + "loss": 0.6868, + "step": 1120 + }, + { + "epoch": 0.17, + "grad_norm": 9.09281745482155, + "learning_rate": 1.999938285587461e-06, + "loss": 0.6927, + "step": 1121 + }, + { + "epoch": 0.17, + "grad_norm": 10.592562370312127, + "learning_rate": 1.9999372076381153e-06, + "loss": 0.7044, + "step": 1122 + }, + { + "epoch": 0.17, + "grad_norm": 6.292267745933659, + "learning_rate": 1.999936120356267e-06, + "loss": 0.6966, + "step": 1123 + }, + { + "epoch": 0.17, + "grad_norm": 2.9324335616958215, + "learning_rate": 1.9999350237419265e-06, + "loss": 0.7031, + "step": 1124 + }, + { + "epoch": 0.17, + "grad_norm": 3.9216260853603027, + "learning_rate": 1.9999339177951037e-06, + "loss": 0.7044, + "step": 1125 + }, + { + "epoch": 0.17, + "grad_norm": 5.065565419999506, + "learning_rate": 1.9999328025158095e-06, + "loss": 0.6921, + "step": 1126 + }, + { + "epoch": 0.17, + "grad_norm": 7.32093635421495, + "learning_rate": 1.999931677904054e-06, + "loss": 0.7077, + "step": 1127 + }, + { + "epoch": 0.17, + "grad_norm": 1.7266447103548401, + "learning_rate": 1.9999305439598476e-06, + "loss": 0.6868, + "step": 1128 + }, + { + "epoch": 0.17, + "grad_norm": 2.6201452676479646, + "learning_rate": 1.999929400683201e-06, + "loss": 0.6973, + "step": 1129 + }, + { + "epoch": 0.17, + "grad_norm": 5.693128303337555, + "learning_rate": 1.9999282480741252e-06, + "loss": 0.6953, + "step": 1130 + }, + { + "epoch": 0.17, + "grad_norm": 4.463308825226945, + "learning_rate": 1.9999270861326304e-06, + "loss": 0.6947, + "step": 1131 + }, + { + "epoch": 0.17, + "grad_norm": 5.636111441031145, + "learning_rate": 1.999925914858728e-06, + "loss": 0.6842, + "step": 1132 + }, + { + "epoch": 0.17, + "grad_norm": 8.622987484593875, + "learning_rate": 1.999924734252428e-06, + "loss": 0.6999, + "step": 1133 + }, + { + "epoch": 0.17, + "grad_norm": 7.373111946928275, + "learning_rate": 1.9999235443137422e-06, + "loss": 0.6986, + "step": 1134 + }, + { + "epoch": 0.17, + "grad_norm": 6.308617443775636, + "learning_rate": 1.9999223450426817e-06, + "loss": 0.6849, + "step": 1135 + }, + { + "epoch": 0.17, + "grad_norm": 3.409164647338666, + "learning_rate": 1.9999211364392574e-06, + "loss": 0.6973, + "step": 1136 + }, + { + "epoch": 0.17, + "grad_norm": 4.123339406931054, + "learning_rate": 1.9999199185034808e-06, + "loss": 0.694, + "step": 1137 + }, + { + "epoch": 0.17, + "grad_norm": 3.3103314446455516, + "learning_rate": 1.999918691235363e-06, + "loss": 0.6947, + "step": 1138 + }, + { + "epoch": 0.17, + "grad_norm": 3.7993947725072954, + "learning_rate": 1.999917454634916e-06, + "loss": 0.6921, + "step": 1139 + }, + { + "epoch": 0.17, + "grad_norm": 5.866112368596856, + "learning_rate": 1.999916208702151e-06, + "loss": 0.6921, + "step": 1140 + }, + { + "epoch": 0.17, + "grad_norm": 1.5800756497612052, + "learning_rate": 1.999914953437079e-06, + "loss": 0.6947, + "step": 1141 + }, + { + "epoch": 0.17, + "grad_norm": 1.2132383305480718, + "learning_rate": 1.999913688839713e-06, + "loss": 0.6908, + "step": 1142 + }, + { + "epoch": 0.17, + "grad_norm": 2.867811737760581, + "learning_rate": 1.9999124149100637e-06, + "loss": 0.6927, + "step": 1143 + }, + { + "epoch": 0.17, + "grad_norm": 4.224760080111763, + "learning_rate": 1.999911131648143e-06, + "loss": 0.6966, + "step": 1144 + }, + { + "epoch": 0.17, + "grad_norm": 1.4555319442809755, + "learning_rate": 1.9999098390539644e-06, + "loss": 0.6849, + "step": 1145 + }, + { + "epoch": 0.17, + "grad_norm": 2.8688398326297655, + "learning_rate": 1.9999085371275382e-06, + "loss": 0.6895, + "step": 1146 + }, + { + "epoch": 0.17, + "grad_norm": 8.692880023560281, + "learning_rate": 1.999907225868877e-06, + "loss": 0.6901, + "step": 1147 + }, + { + "epoch": 0.17, + "grad_norm": 8.984078387968019, + "learning_rate": 1.9999059052779935e-06, + "loss": 0.6973, + "step": 1148 + }, + { + "epoch": 0.17, + "grad_norm": 2.70640874704877, + "learning_rate": 1.9999045753549e-06, + "loss": 0.6973, + "step": 1149 + }, + { + "epoch": 0.17, + "grad_norm": 7.95764234206401, + "learning_rate": 1.999903236099608e-06, + "loss": 0.6855, + "step": 1150 + }, + { + "epoch": 0.17, + "grad_norm": 2.626211123211836, + "learning_rate": 1.9999018875121312e-06, + "loss": 0.6901, + "step": 1151 + }, + { + "epoch": 0.17, + "grad_norm": 5.132150016883938, + "learning_rate": 1.9999005295924816e-06, + "loss": 0.7025, + "step": 1152 + }, + { + "epoch": 0.17, + "grad_norm": 6.5792927725200485, + "learning_rate": 1.999899162340672e-06, + "loss": 0.696, + "step": 1153 + }, + { + "epoch": 0.17, + "grad_norm": 4.027206116333765, + "learning_rate": 1.9998977857567147e-06, + "loss": 0.6914, + "step": 1154 + }, + { + "epoch": 0.17, + "grad_norm": 3.1241899961615154, + "learning_rate": 1.9998963998406233e-06, + "loss": 0.7038, + "step": 1155 + }, + { + "epoch": 0.17, + "grad_norm": 13.070500594075202, + "learning_rate": 1.99989500459241e-06, + "loss": 0.6914, + "step": 1156 + }, + { + "epoch": 0.17, + "grad_norm": 2.405804580580018, + "learning_rate": 1.9998936000120886e-06, + "loss": 0.6849, + "step": 1157 + }, + { + "epoch": 0.17, + "grad_norm": 10.202190728814944, + "learning_rate": 1.9998921860996716e-06, + "loss": 0.7044, + "step": 1158 + }, + { + "epoch": 0.17, + "grad_norm": 7.111036619756035, + "learning_rate": 1.9998907628551724e-06, + "loss": 0.7051, + "step": 1159 + }, + { + "epoch": 0.17, + "grad_norm": 5.926859624173719, + "learning_rate": 1.9998893302786045e-06, + "loss": 0.7005, + "step": 1160 + }, + { + "epoch": 0.17, + "grad_norm": 2.8749291842652838, + "learning_rate": 1.999887888369981e-06, + "loss": 0.6895, + "step": 1161 + }, + { + "epoch": 0.17, + "grad_norm": 1.2675493119964891, + "learning_rate": 1.9998864371293148e-06, + "loss": 0.6823, + "step": 1162 + }, + { + "epoch": 0.17, + "grad_norm": 1.3377159721767071, + "learning_rate": 1.9998849765566207e-06, + "loss": 0.6882, + "step": 1163 + }, + { + "epoch": 0.17, + "grad_norm": 4.4495386689291045, + "learning_rate": 1.9998835066519115e-06, + "loss": 0.6823, + "step": 1164 + }, + { + "epoch": 0.17, + "grad_norm": 5.5099027131317975, + "learning_rate": 1.999882027415201e-06, + "loss": 0.694, + "step": 1165 + }, + { + "epoch": 0.17, + "grad_norm": 10.148281112362167, + "learning_rate": 1.9998805388465036e-06, + "loss": 0.7051, + "step": 1166 + }, + { + "epoch": 0.17, + "grad_norm": 6.148400739736749, + "learning_rate": 1.9998790409458325e-06, + "loss": 0.6947, + "step": 1167 + }, + { + "epoch": 0.17, + "grad_norm": 1.438837741029189, + "learning_rate": 1.9998775337132016e-06, + "loss": 0.6849, + "step": 1168 + }, + { + "epoch": 0.17, + "grad_norm": 2.1116378917383614, + "learning_rate": 1.9998760171486253e-06, + "loss": 0.6888, + "step": 1169 + }, + { + "epoch": 0.17, + "grad_norm": 7.100321719508706, + "learning_rate": 1.9998744912521177e-06, + "loss": 0.6966, + "step": 1170 + }, + { + "epoch": 0.17, + "grad_norm": 2.2602714523163256, + "learning_rate": 1.9998729560236935e-06, + "loss": 0.6797, + "step": 1171 + }, + { + "epoch": 0.17, + "grad_norm": 10.247934959260723, + "learning_rate": 1.9998714114633663e-06, + "loss": 0.7031, + "step": 1172 + }, + { + "epoch": 0.17, + "grad_norm": 5.6569328650555235, + "learning_rate": 1.9998698575711504e-06, + "loss": 0.6797, + "step": 1173 + }, + { + "epoch": 0.18, + "grad_norm": 2.6773676688478627, + "learning_rate": 1.9998682943470612e-06, + "loss": 0.6875, + "step": 1174 + }, + { + "epoch": 0.18, + "grad_norm": 8.532692883453713, + "learning_rate": 1.9998667217911127e-06, + "loss": 0.694, + "step": 1175 + }, + { + "epoch": 0.18, + "grad_norm": 1.4877735252505908, + "learning_rate": 1.999865139903319e-06, + "loss": 0.6829, + "step": 1176 + }, + { + "epoch": 0.18, + "grad_norm": 9.121436558830522, + "learning_rate": 1.9998635486836966e-06, + "loss": 0.696, + "step": 1177 + }, + { + "epoch": 0.18, + "grad_norm": 3.501992168468318, + "learning_rate": 1.9998619481322584e-06, + "loss": 0.6953, + "step": 1178 + }, + { + "epoch": 0.18, + "grad_norm": 8.12063662584594, + "learning_rate": 1.9998603382490205e-06, + "loss": 0.6927, + "step": 1179 + }, + { + "epoch": 0.18, + "grad_norm": 5.729865536161565, + "learning_rate": 1.9998587190339978e-06, + "loss": 0.6868, + "step": 1180 + }, + { + "epoch": 0.18, + "grad_norm": 1.2984276017212013, + "learning_rate": 1.9998570904872054e-06, + "loss": 0.6849, + "step": 1181 + }, + { + "epoch": 0.18, + "grad_norm": 1.8889990216551422, + "learning_rate": 1.9998554526086583e-06, + "loss": 0.6973, + "step": 1182 + }, + { + "epoch": 0.18, + "grad_norm": 3.5977493647859826, + "learning_rate": 1.999853805398371e-06, + "loss": 0.694, + "step": 1183 + }, + { + "epoch": 0.18, + "grad_norm": 5.713306461904431, + "learning_rate": 1.999852148856361e-06, + "loss": 0.696, + "step": 1184 + }, + { + "epoch": 0.18, + "grad_norm": 1.4208176163483734, + "learning_rate": 1.9998504829826415e-06, + "loss": 0.6816, + "step": 1185 + }, + { + "epoch": 0.18, + "grad_norm": 15.667434111258073, + "learning_rate": 1.9998488077772294e-06, + "loss": 0.6992, + "step": 1186 + }, + { + "epoch": 0.18, + "grad_norm": 3.387019295765688, + "learning_rate": 1.99984712324014e-06, + "loss": 0.6875, + "step": 1187 + }, + { + "epoch": 0.18, + "grad_norm": 8.222444778071212, + "learning_rate": 1.999845429371389e-06, + "loss": 0.6908, + "step": 1188 + }, + { + "epoch": 0.18, + "grad_norm": 7.552313858954987, + "learning_rate": 1.999843726170992e-06, + "loss": 0.6966, + "step": 1189 + }, + { + "epoch": 0.18, + "grad_norm": 2.9598457029092704, + "learning_rate": 1.9998420136389654e-06, + "loss": 0.6914, + "step": 1190 + }, + { + "epoch": 0.18, + "grad_norm": 3.8769262919545064, + "learning_rate": 1.9998402917753245e-06, + "loss": 0.6979, + "step": 1191 + }, + { + "epoch": 0.18, + "grad_norm": 13.458183403288928, + "learning_rate": 1.999838560580086e-06, + "loss": 0.709, + "step": 1192 + }, + { + "epoch": 0.18, + "grad_norm": 12.689946583403858, + "learning_rate": 1.9998368200532657e-06, + "loss": 0.7135, + "step": 1193 + }, + { + "epoch": 0.18, + "grad_norm": 1.3637289494686995, + "learning_rate": 1.9998350701948797e-06, + "loss": 0.6868, + "step": 1194 + }, + { + "epoch": 0.18, + "grad_norm": 13.772905123461479, + "learning_rate": 1.9998333110049447e-06, + "loss": 0.7122, + "step": 1195 + }, + { + "epoch": 0.18, + "grad_norm": 1.066918958811793, + "learning_rate": 1.9998315424834773e-06, + "loss": 0.6901, + "step": 1196 + }, + { + "epoch": 0.18, + "grad_norm": 2.6406991957509494, + "learning_rate": 1.9998297646304935e-06, + "loss": 0.7064, + "step": 1197 + }, + { + "epoch": 0.18, + "grad_norm": 5.494386596196189, + "learning_rate": 1.99982797744601e-06, + "loss": 0.7038, + "step": 1198 + }, + { + "epoch": 0.18, + "grad_norm": 6.616809410004511, + "learning_rate": 1.9998261809300434e-06, + "loss": 0.6927, + "step": 1199 + }, + { + "epoch": 0.18, + "grad_norm": 1.477145452779399, + "learning_rate": 1.999824375082611e-06, + "loss": 0.6816, + "step": 1200 + }, + { + "epoch": 0.18, + "grad_norm": 3.002883307869293, + "learning_rate": 1.999822559903729e-06, + "loss": 0.6986, + "step": 1201 + }, + { + "epoch": 0.18, + "grad_norm": 12.695323585949646, + "learning_rate": 1.999820735393415e-06, + "loss": 0.7161, + "step": 1202 + }, + { + "epoch": 0.18, + "grad_norm": 2.331742549770151, + "learning_rate": 1.999818901551685e-06, + "loss": 0.6908, + "step": 1203 + }, + { + "epoch": 0.18, + "grad_norm": 9.023467135499061, + "learning_rate": 1.9998170583785574e-06, + "loss": 0.7018, + "step": 1204 + }, + { + "epoch": 0.18, + "grad_norm": 1.1813961376035205, + "learning_rate": 1.9998152058740485e-06, + "loss": 0.6803, + "step": 1205 + }, + { + "epoch": 0.18, + "grad_norm": 7.813279127866117, + "learning_rate": 1.9998133440381762e-06, + "loss": 0.6979, + "step": 1206 + }, + { + "epoch": 0.18, + "grad_norm": 6.432925945731422, + "learning_rate": 1.9998114728709574e-06, + "loss": 0.6953, + "step": 1207 + }, + { + "epoch": 0.18, + "grad_norm": 1.3645925717720657, + "learning_rate": 1.999809592372409e-06, + "loss": 0.6953, + "step": 1208 + }, + { + "epoch": 0.18, + "grad_norm": 11.4150350968884, + "learning_rate": 1.99980770254255e-06, + "loss": 0.6999, + "step": 1209 + }, + { + "epoch": 0.18, + "grad_norm": 0.9788356973942921, + "learning_rate": 1.999805803381397e-06, + "loss": 0.6908, + "step": 1210 + }, + { + "epoch": 0.18, + "grad_norm": 4.23548359788962, + "learning_rate": 1.999803894888968e-06, + "loss": 0.7012, + "step": 1211 + }, + { + "epoch": 0.18, + "grad_norm": 5.48532804602444, + "learning_rate": 1.999801977065281e-06, + "loss": 0.7012, + "step": 1212 + }, + { + "epoch": 0.18, + "grad_norm": 9.636968838103016, + "learning_rate": 1.999800049910354e-06, + "loss": 0.7096, + "step": 1213 + }, + { + "epoch": 0.18, + "grad_norm": 4.154523824214468, + "learning_rate": 1.9997981134242044e-06, + "loss": 0.6914, + "step": 1214 + }, + { + "epoch": 0.18, + "grad_norm": 8.986611007766323, + "learning_rate": 1.9997961676068505e-06, + "loss": 0.7018, + "step": 1215 + }, + { + "epoch": 0.18, + "grad_norm": 7.466604494787924, + "learning_rate": 1.999794212458311e-06, + "loss": 0.6979, + "step": 1216 + }, + { + "epoch": 0.18, + "grad_norm": 2.2422484080020837, + "learning_rate": 1.9997922479786033e-06, + "loss": 0.6947, + "step": 1217 + }, + { + "epoch": 0.18, + "grad_norm": 4.744303436816323, + "learning_rate": 1.999790274167746e-06, + "loss": 0.6953, + "step": 1218 + }, + { + "epoch": 0.18, + "grad_norm": 8.835739474224289, + "learning_rate": 1.9997882910257577e-06, + "loss": 0.6992, + "step": 1219 + }, + { + "epoch": 0.18, + "grad_norm": 4.647085156674823, + "learning_rate": 1.999786298552657e-06, + "loss": 0.6901, + "step": 1220 + }, + { + "epoch": 0.18, + "grad_norm": 8.971176047140974, + "learning_rate": 1.9997842967484626e-06, + "loss": 0.6953, + "step": 1221 + }, + { + "epoch": 0.18, + "grad_norm": 10.223657024765664, + "learning_rate": 1.999782285613193e-06, + "loss": 0.696, + "step": 1222 + }, + { + "epoch": 0.18, + "grad_norm": 1.6736473897080892, + "learning_rate": 1.9997802651468664e-06, + "loss": 0.6947, + "step": 1223 + }, + { + "epoch": 0.18, + "grad_norm": 2.77338787885395, + "learning_rate": 1.999778235349502e-06, + "loss": 0.6914, + "step": 1224 + }, + { + "epoch": 0.18, + "grad_norm": 0.9687129239705199, + "learning_rate": 1.9997761962211196e-06, + "loss": 0.6953, + "step": 1225 + }, + { + "epoch": 0.18, + "grad_norm": 6.839872607290245, + "learning_rate": 1.999774147761737e-06, + "loss": 0.6986, + "step": 1226 + }, + { + "epoch": 0.18, + "grad_norm": 7.181880984220184, + "learning_rate": 1.999772089971374e-06, + "loss": 0.7077, + "step": 1227 + }, + { + "epoch": 0.18, + "grad_norm": 5.857169845698413, + "learning_rate": 1.99977002285005e-06, + "loss": 0.6953, + "step": 1228 + }, + { + "epoch": 0.18, + "grad_norm": 1.5353980442142992, + "learning_rate": 1.999767946397784e-06, + "loss": 0.6862, + "step": 1229 + }, + { + "epoch": 0.18, + "grad_norm": 2.1415436240028636, + "learning_rate": 1.999765860614595e-06, + "loss": 0.7025, + "step": 1230 + }, + { + "epoch": 0.18, + "grad_norm": 5.1380485777421, + "learning_rate": 1.9997637655005025e-06, + "loss": 0.6986, + "step": 1231 + }, + { + "epoch": 0.18, + "grad_norm": 3.168018878858367, + "learning_rate": 1.999761661055527e-06, + "loss": 0.6914, + "step": 1232 + }, + { + "epoch": 0.18, + "grad_norm": 4.612162747731336, + "learning_rate": 1.999759547279687e-06, + "loss": 0.6992, + "step": 1233 + }, + { + "epoch": 0.18, + "grad_norm": 4.622766172899571, + "learning_rate": 1.999757424173003e-06, + "loss": 0.6914, + "step": 1234 + }, + { + "epoch": 0.18, + "grad_norm": 0.9770470780173028, + "learning_rate": 1.999755291735495e-06, + "loss": 0.6901, + "step": 1235 + }, + { + "epoch": 0.18, + "grad_norm": 1.049556702207654, + "learning_rate": 1.9997531499671817e-06, + "loss": 0.6934, + "step": 1236 + }, + { + "epoch": 0.18, + "grad_norm": 4.320300089583626, + "learning_rate": 1.9997509988680845e-06, + "loss": 0.696, + "step": 1237 + }, + { + "epoch": 0.18, + "grad_norm": 4.088392187966064, + "learning_rate": 1.9997488384382223e-06, + "loss": 0.6966, + "step": 1238 + }, + { + "epoch": 0.18, + "grad_norm": 1.2107489532104567, + "learning_rate": 1.9997466686776157e-06, + "loss": 0.6927, + "step": 1239 + }, + { + "epoch": 0.18, + "grad_norm": 8.670673205376453, + "learning_rate": 1.9997444895862853e-06, + "loss": 0.7005, + "step": 1240 + }, + { + "epoch": 0.19, + "grad_norm": 6.729533113636872, + "learning_rate": 1.9997423011642512e-06, + "loss": 0.7038, + "step": 1241 + }, + { + "epoch": 0.19, + "grad_norm": 4.789357449723623, + "learning_rate": 1.9997401034115335e-06, + "loss": 0.6927, + "step": 1242 + }, + { + "epoch": 0.19, + "grad_norm": 3.010746892037609, + "learning_rate": 1.999737896328153e-06, + "loss": 0.6934, + "step": 1243 + }, + { + "epoch": 0.19, + "grad_norm": 2.0664320122140882, + "learning_rate": 1.999735679914131e-06, + "loss": 0.6888, + "step": 1244 + }, + { + "epoch": 0.19, + "grad_norm": 4.579495803631911, + "learning_rate": 1.9997334541694867e-06, + "loss": 0.6888, + "step": 1245 + }, + { + "epoch": 0.19, + "grad_norm": 7.054798169429937, + "learning_rate": 1.999731219094242e-06, + "loss": 0.6986, + "step": 1246 + }, + { + "epoch": 0.19, + "grad_norm": 3.7930285160000805, + "learning_rate": 1.9997289746884175e-06, + "loss": 0.694, + "step": 1247 + }, + { + "epoch": 0.19, + "grad_norm": 0.8681443999272316, + "learning_rate": 1.9997267209520337e-06, + "loss": 0.6882, + "step": 1248 + }, + { + "epoch": 0.19, + "grad_norm": 5.472390037033551, + "learning_rate": 1.9997244578851124e-06, + "loss": 0.6979, + "step": 1249 + }, + { + "epoch": 0.19, + "grad_norm": 4.45181429090312, + "learning_rate": 1.999722185487674e-06, + "loss": 0.6921, + "step": 1250 + }, + { + "epoch": 0.19, + "grad_norm": 6.405767358155009, + "learning_rate": 1.9997199037597403e-06, + "loss": 0.6934, + "step": 1251 + }, + { + "epoch": 0.19, + "grad_norm": 1.1321796032638491, + "learning_rate": 1.9997176127013323e-06, + "loss": 0.6882, + "step": 1252 + }, + { + "epoch": 0.19, + "grad_norm": 5.232206107865191, + "learning_rate": 1.9997153123124715e-06, + "loss": 0.6908, + "step": 1253 + }, + { + "epoch": 0.19, + "grad_norm": 5.287180239466589, + "learning_rate": 1.999713002593179e-06, + "loss": 0.6953, + "step": 1254 + }, + { + "epoch": 0.19, + "grad_norm": 6.156212370002754, + "learning_rate": 1.9997106835434767e-06, + "loss": 0.694, + "step": 1255 + }, + { + "epoch": 0.19, + "grad_norm": 1.0741840937618974, + "learning_rate": 1.9997083551633863e-06, + "loss": 0.6927, + "step": 1256 + }, + { + "epoch": 0.19, + "grad_norm": 11.859232594702279, + "learning_rate": 1.999706017452929e-06, + "loss": 0.7018, + "step": 1257 + }, + { + "epoch": 0.19, + "grad_norm": 2.0352310722440654, + "learning_rate": 1.9997036704121273e-06, + "loss": 0.6862, + "step": 1258 + }, + { + "epoch": 0.19, + "grad_norm": 10.743472893236556, + "learning_rate": 1.999701314041003e-06, + "loss": 0.6999, + "step": 1259 + }, + { + "epoch": 0.19, + "grad_norm": 3.781929098705826, + "learning_rate": 1.9996989483395777e-06, + "loss": 0.694, + "step": 1260 + }, + { + "epoch": 0.19, + "grad_norm": 3.153136937510018, + "learning_rate": 1.9996965733078743e-06, + "loss": 0.6895, + "step": 1261 + }, + { + "epoch": 0.19, + "grad_norm": 0.8306797966532783, + "learning_rate": 1.9996941889459136e-06, + "loss": 0.6986, + "step": 1262 + }, + { + "epoch": 0.19, + "grad_norm": 1.90014173216667, + "learning_rate": 1.999691795253719e-06, + "loss": 0.6999, + "step": 1263 + }, + { + "epoch": 0.19, + "grad_norm": 1.9594625827507681, + "learning_rate": 1.9996893922313126e-06, + "loss": 0.6888, + "step": 1264 + }, + { + "epoch": 0.19, + "grad_norm": 8.14362682392756, + "learning_rate": 1.999686979878716e-06, + "loss": 0.7096, + "step": 1265 + }, + { + "epoch": 0.19, + "grad_norm": 1.6605323414023918, + "learning_rate": 1.9996845581959532e-06, + "loss": 0.6888, + "step": 1266 + }, + { + "epoch": 0.19, + "grad_norm": 14.139231519348176, + "learning_rate": 1.9996821271830457e-06, + "loss": 0.7142, + "step": 1267 + }, + { + "epoch": 0.19, + "grad_norm": 1.0843785039271072, + "learning_rate": 1.9996796868400164e-06, + "loss": 0.6875, + "step": 1268 + }, + { + "epoch": 0.19, + "grad_norm": 2.6018181722601916, + "learning_rate": 1.9996772371668884e-06, + "loss": 0.6855, + "step": 1269 + }, + { + "epoch": 0.19, + "grad_norm": 7.762104577095524, + "learning_rate": 1.9996747781636843e-06, + "loss": 0.6797, + "step": 1270 + }, + { + "epoch": 0.19, + "grad_norm": 5.476519944529524, + "learning_rate": 1.9996723098304268e-06, + "loss": 0.6947, + "step": 1271 + }, + { + "epoch": 0.19, + "grad_norm": 6.510140985254167, + "learning_rate": 1.99966983216714e-06, + "loss": 0.6992, + "step": 1272 + }, + { + "epoch": 0.19, + "grad_norm": 4.975974311486059, + "learning_rate": 1.9996673451738456e-06, + "loss": 0.6901, + "step": 1273 + }, + { + "epoch": 0.19, + "grad_norm": 8.396368395862599, + "learning_rate": 1.9996648488505673e-06, + "loss": 0.6914, + "step": 1274 + }, + { + "epoch": 0.19, + "grad_norm": 2.3106436102571353, + "learning_rate": 1.999662343197329e-06, + "loss": 0.6927, + "step": 1275 + }, + { + "epoch": 0.19, + "grad_norm": 1.6122974535648966, + "learning_rate": 1.9996598282141533e-06, + "loss": 0.6934, + "step": 1276 + }, + { + "epoch": 0.19, + "grad_norm": 6.257249902612633, + "learning_rate": 1.9996573039010642e-06, + "loss": 0.6921, + "step": 1277 + }, + { + "epoch": 0.19, + "grad_norm": 3.807640439896453, + "learning_rate": 1.999654770258085e-06, + "loss": 0.694, + "step": 1278 + }, + { + "epoch": 0.19, + "grad_norm": 9.575759846635215, + "learning_rate": 1.9996522272852397e-06, + "loss": 0.7051, + "step": 1279 + }, + { + "epoch": 0.19, + "grad_norm": 10.64196492575006, + "learning_rate": 1.9996496749825513e-06, + "loss": 0.7122, + "step": 1280 + }, + { + "epoch": 0.19, + "grad_norm": 7.127270697302432, + "learning_rate": 1.9996471133500444e-06, + "loss": 0.7005, + "step": 1281 + }, + { + "epoch": 0.19, + "grad_norm": 5.066129580383378, + "learning_rate": 1.9996445423877428e-06, + "loss": 0.7018, + "step": 1282 + }, + { + "epoch": 0.19, + "grad_norm": 8.049409493543406, + "learning_rate": 1.9996419620956697e-06, + "loss": 0.6888, + "step": 1283 + }, + { + "epoch": 0.19, + "grad_norm": 0.8655735024874135, + "learning_rate": 1.99963937247385e-06, + "loss": 0.6875, + "step": 1284 + }, + { + "epoch": 0.19, + "grad_norm": 1.947494002884089, + "learning_rate": 1.999636773522308e-06, + "loss": 0.6908, + "step": 1285 + }, + { + "epoch": 0.19, + "grad_norm": 7.0890236445937544, + "learning_rate": 1.999634165241067e-06, + "loss": 0.7038, + "step": 1286 + }, + { + "epoch": 0.19, + "grad_norm": 2.9869335573724656, + "learning_rate": 1.9996315476301524e-06, + "loss": 0.6927, + "step": 1287 + }, + { + "epoch": 0.19, + "grad_norm": 6.054760653732889, + "learning_rate": 1.999628920689588e-06, + "loss": 0.694, + "step": 1288 + }, + { + "epoch": 0.19, + "grad_norm": 7.654051156478234, + "learning_rate": 1.999626284419398e-06, + "loss": 0.7031, + "step": 1289 + }, + { + "epoch": 0.19, + "grad_norm": 3.581355788761137, + "learning_rate": 1.999623638819608e-06, + "loss": 0.6921, + "step": 1290 + }, + { + "epoch": 0.19, + "grad_norm": 6.3617304601292215, + "learning_rate": 1.999620983890242e-06, + "loss": 0.6914, + "step": 1291 + }, + { + "epoch": 0.19, + "grad_norm": 6.814202836529461, + "learning_rate": 1.9996183196313254e-06, + "loss": 0.696, + "step": 1292 + }, + { + "epoch": 0.19, + "grad_norm": 6.4843085017173125, + "learning_rate": 1.999615646042882e-06, + "loss": 0.6953, + "step": 1293 + }, + { + "epoch": 0.19, + "grad_norm": 5.113120086695793, + "learning_rate": 1.9996129631249378e-06, + "loss": 0.6855, + "step": 1294 + }, + { + "epoch": 0.19, + "grad_norm": 2.3797217480070145, + "learning_rate": 1.9996102708775173e-06, + "loss": 0.6875, + "step": 1295 + }, + { + "epoch": 0.19, + "grad_norm": 4.068520073642747, + "learning_rate": 1.9996075693006456e-06, + "loss": 0.6979, + "step": 1296 + }, + { + "epoch": 0.19, + "grad_norm": 7.049757070952307, + "learning_rate": 1.9996048583943484e-06, + "loss": 0.7005, + "step": 1297 + }, + { + "epoch": 0.19, + "grad_norm": 3.465870318288912, + "learning_rate": 1.9996021381586503e-06, + "loss": 0.6849, + "step": 1298 + }, + { + "epoch": 0.19, + "grad_norm": 12.724118437911253, + "learning_rate": 1.9995994085935772e-06, + "loss": 0.7148, + "step": 1299 + }, + { + "epoch": 0.19, + "grad_norm": 12.296287527054576, + "learning_rate": 1.9995966696991545e-06, + "loss": 0.7181, + "step": 1300 + }, + { + "epoch": 0.19, + "grad_norm": 15.538296506226393, + "learning_rate": 1.9995939214754076e-06, + "loss": 0.7201, + "step": 1301 + }, + { + "epoch": 0.19, + "grad_norm": 3.9520455308259126, + "learning_rate": 1.999591163922362e-06, + "loss": 0.6973, + "step": 1302 + }, + { + "epoch": 0.19, + "grad_norm": 3.4895262147234547, + "learning_rate": 1.999588397040044e-06, + "loss": 0.6986, + "step": 1303 + }, + { + "epoch": 0.19, + "grad_norm": 0.7404258374466436, + "learning_rate": 1.999585620828479e-06, + "loss": 0.6901, + "step": 1304 + }, + { + "epoch": 0.19, + "grad_norm": 13.402410375050735, + "learning_rate": 1.999582835287693e-06, + "loss": 0.7012, + "step": 1305 + }, + { + "epoch": 0.19, + "grad_norm": 8.296414451848538, + "learning_rate": 1.9995800404177117e-06, + "loss": 0.6973, + "step": 1306 + }, + { + "epoch": 0.19, + "grad_norm": 4.606303220903318, + "learning_rate": 1.999577236218562e-06, + "loss": 0.6816, + "step": 1307 + }, + { + "epoch": 0.2, + "grad_norm": 9.306373523552896, + "learning_rate": 1.9995744226902692e-06, + "loss": 0.7038, + "step": 1308 + }, + { + "epoch": 0.2, + "grad_norm": 6.787490040691799, + "learning_rate": 1.99957159983286e-06, + "loss": 0.6895, + "step": 1309 + }, + { + "epoch": 0.2, + "grad_norm": 2.021838302249624, + "learning_rate": 1.9995687676463604e-06, + "loss": 0.6901, + "step": 1310 + }, + { + "epoch": 0.2, + "grad_norm": 10.33482801334691, + "learning_rate": 1.9995659261307977e-06, + "loss": 0.6999, + "step": 1311 + }, + { + "epoch": 0.2, + "grad_norm": 3.1428061393689, + "learning_rate": 1.9995630752861975e-06, + "loss": 0.6882, + "step": 1312 + }, + { + "epoch": 0.2, + "grad_norm": 5.4353747236627665, + "learning_rate": 1.9995602151125864e-06, + "loss": 0.694, + "step": 1313 + }, + { + "epoch": 0.2, + "grad_norm": 1.142343908983896, + "learning_rate": 1.999557345609992e-06, + "loss": 0.6927, + "step": 1314 + }, + { + "epoch": 0.2, + "grad_norm": 5.015649135381721, + "learning_rate": 1.99955446677844e-06, + "loss": 0.6816, + "step": 1315 + }, + { + "epoch": 0.2, + "grad_norm": 0.9089167177234908, + "learning_rate": 1.999551578617958e-06, + "loss": 0.6947, + "step": 1316 + }, + { + "epoch": 0.2, + "grad_norm": 2.9916049157789493, + "learning_rate": 1.9995486811285728e-06, + "loss": 0.6738, + "step": 1317 + }, + { + "epoch": 0.2, + "grad_norm": 17.231007788519676, + "learning_rate": 1.9995457743103113e-06, + "loss": 0.7135, + "step": 1318 + }, + { + "epoch": 0.2, + "grad_norm": 3.333495271337855, + "learning_rate": 1.9995428581632007e-06, + "loss": 0.6914, + "step": 1319 + }, + { + "epoch": 0.2, + "grad_norm": 1.9931078727363534, + "learning_rate": 1.9995399326872682e-06, + "loss": 0.6895, + "step": 1320 + }, + { + "epoch": 0.2, + "grad_norm": 6.7279735217706165, + "learning_rate": 1.9995369978825413e-06, + "loss": 0.7012, + "step": 1321 + }, + { + "epoch": 0.2, + "grad_norm": 8.457980288386507, + "learning_rate": 1.9995340537490467e-06, + "loss": 0.6973, + "step": 1322 + }, + { + "epoch": 0.2, + "grad_norm": 2.1535462039995195, + "learning_rate": 1.999531100286813e-06, + "loss": 0.6882, + "step": 1323 + }, + { + "epoch": 0.2, + "grad_norm": 4.7209132251517225, + "learning_rate": 1.9995281374958666e-06, + "loss": 0.6947, + "step": 1324 + }, + { + "epoch": 0.2, + "grad_norm": 6.041346045421298, + "learning_rate": 1.999525165376236e-06, + "loss": 0.6921, + "step": 1325 + }, + { + "epoch": 0.2, + "grad_norm": 4.821284703522376, + "learning_rate": 1.9995221839279485e-06, + "loss": 0.6966, + "step": 1326 + }, + { + "epoch": 0.2, + "grad_norm": 1.3169094661821898, + "learning_rate": 1.999519193151032e-06, + "loss": 0.6953, + "step": 1327 + }, + { + "epoch": 0.2, + "grad_norm": 4.2988998825990015, + "learning_rate": 1.9995161930455147e-06, + "loss": 0.6908, + "step": 1328 + }, + { + "epoch": 0.2, + "grad_norm": 6.380739230361549, + "learning_rate": 1.999513183611424e-06, + "loss": 0.6947, + "step": 1329 + }, + { + "epoch": 0.2, + "grad_norm": 2.586516280188938, + "learning_rate": 1.999510164848789e-06, + "loss": 0.6888, + "step": 1330 + }, + { + "epoch": 0.2, + "grad_norm": 2.6288614215216497, + "learning_rate": 1.999507136757637e-06, + "loss": 0.6868, + "step": 1331 + }, + { + "epoch": 0.2, + "grad_norm": 1.0358191289497845, + "learning_rate": 1.9995040993379965e-06, + "loss": 0.6947, + "step": 1332 + }, + { + "epoch": 0.2, + "grad_norm": 2.908100699371478, + "learning_rate": 1.9995010525898954e-06, + "loss": 0.6986, + "step": 1333 + }, + { + "epoch": 0.2, + "grad_norm": 5.102991223645948, + "learning_rate": 1.999497996513363e-06, + "loss": 0.7005, + "step": 1334 + }, + { + "epoch": 0.2, + "grad_norm": 1.334604708842187, + "learning_rate": 1.9994949311084274e-06, + "loss": 0.6836, + "step": 1335 + }, + { + "epoch": 0.2, + "grad_norm": 5.083411795557312, + "learning_rate": 1.999491856375117e-06, + "loss": 0.6888, + "step": 1336 + }, + { + "epoch": 0.2, + "grad_norm": 5.097987227219584, + "learning_rate": 1.999488772313461e-06, + "loss": 0.6979, + "step": 1337 + }, + { + "epoch": 0.2, + "grad_norm": 3.1596776156825794, + "learning_rate": 1.999485678923488e-06, + "loss": 0.6934, + "step": 1338 + }, + { + "epoch": 0.2, + "grad_norm": 3.9323683299458003, + "learning_rate": 1.999482576205226e-06, + "loss": 0.6908, + "step": 1339 + }, + { + "epoch": 0.2, + "grad_norm": 2.760677986941379, + "learning_rate": 1.9994794641587057e-06, + "loss": 0.6888, + "step": 1340 + }, + { + "epoch": 0.2, + "grad_norm": 1.362544715245116, + "learning_rate": 1.9994763427839544e-06, + "loss": 0.6888, + "step": 1341 + }, + { + "epoch": 0.2, + "grad_norm": 6.656320930663419, + "learning_rate": 1.9994732120810025e-06, + "loss": 0.6947, + "step": 1342 + }, + { + "epoch": 0.2, + "grad_norm": 4.087475156553227, + "learning_rate": 1.9994700720498783e-06, + "loss": 0.6868, + "step": 1343 + }, + { + "epoch": 0.2, + "grad_norm": 3.33713385621432, + "learning_rate": 1.999466922690612e-06, + "loss": 0.6836, + "step": 1344 + }, + { + "epoch": 0.2, + "grad_norm": 2.288453691280976, + "learning_rate": 1.999463764003232e-06, + "loss": 0.6882, + "step": 1345 + }, + { + "epoch": 0.2, + "grad_norm": 1.381092242878674, + "learning_rate": 1.9994605959877687e-06, + "loss": 0.6947, + "step": 1346 + }, + { + "epoch": 0.2, + "grad_norm": 0.8482473093385436, + "learning_rate": 1.999457418644251e-06, + "loss": 0.6882, + "step": 1347 + }, + { + "epoch": 0.2, + "grad_norm": 8.647695332017683, + "learning_rate": 1.9994542319727095e-06, + "loss": 0.6908, + "step": 1348 + }, + { + "epoch": 0.2, + "grad_norm": 1.888796396380525, + "learning_rate": 1.9994510359731723e-06, + "loss": 0.6953, + "step": 1349 + }, + { + "epoch": 0.2, + "grad_norm": 0.9270326173885076, + "learning_rate": 1.9994478306456707e-06, + "loss": 0.6908, + "step": 1350 + }, + { + "epoch": 0.2, + "grad_norm": 11.243929726819895, + "learning_rate": 1.9994446159902345e-06, + "loss": 0.6947, + "step": 1351 + }, + { + "epoch": 0.2, + "grad_norm": 7.435124783954313, + "learning_rate": 1.999441392006893e-06, + "loss": 0.7057, + "step": 1352 + }, + { + "epoch": 0.2, + "grad_norm": 6.83963017526487, + "learning_rate": 1.999438158695676e-06, + "loss": 0.7005, + "step": 1353 + }, + { + "epoch": 0.2, + "grad_norm": 5.271352178902257, + "learning_rate": 1.9994349160566147e-06, + "loss": 0.6888, + "step": 1354 + }, + { + "epoch": 0.2, + "grad_norm": 2.1583591269810745, + "learning_rate": 1.9994316640897393e-06, + "loss": 0.6921, + "step": 1355 + }, + { + "epoch": 0.2, + "grad_norm": 6.380381840069722, + "learning_rate": 1.9994284027950795e-06, + "loss": 0.6979, + "step": 1356 + }, + { + "epoch": 0.2, + "grad_norm": 15.452136783738231, + "learning_rate": 1.999425132172666e-06, + "loss": 0.709, + "step": 1357 + }, + { + "epoch": 0.2, + "grad_norm": 6.46442592813191, + "learning_rate": 1.999421852222529e-06, + "loss": 0.6992, + "step": 1358 + }, + { + "epoch": 0.2, + "grad_norm": 3.5963977928197286, + "learning_rate": 1.9994185629447e-06, + "loss": 0.6868, + "step": 1359 + }, + { + "epoch": 0.2, + "grad_norm": 8.63879693467515, + "learning_rate": 1.9994152643392093e-06, + "loss": 0.6855, + "step": 1360 + }, + { + "epoch": 0.2, + "grad_norm": 0.9571382087868398, + "learning_rate": 1.999411956406087e-06, + "loss": 0.6914, + "step": 1361 + }, + { + "epoch": 0.2, + "grad_norm": 2.4779328412614885, + "learning_rate": 1.9994086391453652e-06, + "loss": 0.6908, + "step": 1362 + }, + { + "epoch": 0.2, + "grad_norm": 10.12266705249971, + "learning_rate": 1.9994053125570737e-06, + "loss": 0.6953, + "step": 1363 + }, + { + "epoch": 0.2, + "grad_norm": 3.1835715872091432, + "learning_rate": 1.999401976641244e-06, + "loss": 0.6999, + "step": 1364 + }, + { + "epoch": 0.2, + "grad_norm": 1.3016687799312983, + "learning_rate": 1.9993986313979074e-06, + "loss": 0.6882, + "step": 1365 + }, + { + "epoch": 0.2, + "grad_norm": 3.917511275703195, + "learning_rate": 1.999395276827095e-06, + "loss": 0.6875, + "step": 1366 + }, + { + "epoch": 0.2, + "grad_norm": 2.2106454311332624, + "learning_rate": 1.9993919129288383e-06, + "loss": 0.6908, + "step": 1367 + }, + { + "epoch": 0.2, + "grad_norm": 6.310291805035572, + "learning_rate": 1.999388539703168e-06, + "loss": 0.6829, + "step": 1368 + }, + { + "epoch": 0.2, + "grad_norm": 1.429168606881153, + "learning_rate": 1.9993851571501164e-06, + "loss": 0.694, + "step": 1369 + }, + { + "epoch": 0.2, + "grad_norm": 10.001206525988767, + "learning_rate": 1.999381765269715e-06, + "loss": 0.6947, + "step": 1370 + }, + { + "epoch": 0.2, + "grad_norm": 4.3587401426970365, + "learning_rate": 1.999378364061995e-06, + "loss": 0.6914, + "step": 1371 + }, + { + "epoch": 0.2, + "grad_norm": 3.5299353773312183, + "learning_rate": 1.999374953526988e-06, + "loss": 0.694, + "step": 1372 + }, + { + "epoch": 0.2, + "grad_norm": 0.9859151292271555, + "learning_rate": 1.9993715336647267e-06, + "loss": 0.6875, + "step": 1373 + }, + { + "epoch": 0.2, + "grad_norm": 11.247166597509331, + "learning_rate": 1.999368104475242e-06, + "loss": 0.7044, + "step": 1374 + }, + { + "epoch": 0.21, + "grad_norm": 1.9503879327995626, + "learning_rate": 1.9993646659585668e-06, + "loss": 0.6849, + "step": 1375 + }, + { + "epoch": 0.21, + "grad_norm": 1.0152628961774994, + "learning_rate": 1.9993612181147323e-06, + "loss": 0.6842, + "step": 1376 + }, + { + "epoch": 0.21, + "grad_norm": 4.078941763629482, + "learning_rate": 1.9993577609437717e-06, + "loss": 0.694, + "step": 1377 + }, + { + "epoch": 0.21, + "grad_norm": 7.659759900662239, + "learning_rate": 1.9993542944457167e-06, + "loss": 0.7044, + "step": 1378 + }, + { + "epoch": 0.21, + "grad_norm": 4.026285832605804, + "learning_rate": 1.9993508186205994e-06, + "loss": 0.6986, + "step": 1379 + }, + { + "epoch": 0.21, + "grad_norm": 5.267818771409319, + "learning_rate": 1.9993473334684526e-06, + "loss": 0.6934, + "step": 1380 + }, + { + "epoch": 0.21, + "grad_norm": 3.0229882663812377, + "learning_rate": 1.9993438389893087e-06, + "loss": 0.7018, + "step": 1381 + }, + { + "epoch": 0.21, + "grad_norm": 3.1615523089778144, + "learning_rate": 1.9993403351832005e-06, + "loss": 0.6888, + "step": 1382 + }, + { + "epoch": 0.21, + "grad_norm": 1.6102151246732936, + "learning_rate": 1.999336822050161e-06, + "loss": 0.6901, + "step": 1383 + }, + { + "epoch": 0.21, + "grad_norm": 7.707922626199945, + "learning_rate": 1.9993332995902217e-06, + "loss": 0.6855, + "step": 1384 + }, + { + "epoch": 0.21, + "grad_norm": 6.291693610377123, + "learning_rate": 1.999329767803417e-06, + "loss": 0.6992, + "step": 1385 + }, + { + "epoch": 0.21, + "grad_norm": 5.62972351143295, + "learning_rate": 1.999326226689779e-06, + "loss": 0.694, + "step": 1386 + }, + { + "epoch": 0.21, + "grad_norm": 0.7236064358296789, + "learning_rate": 1.999322676249341e-06, + "loss": 0.6829, + "step": 1387 + }, + { + "epoch": 0.21, + "grad_norm": 7.370766239780016, + "learning_rate": 1.9993191164821358e-06, + "loss": 0.7012, + "step": 1388 + }, + { + "epoch": 0.21, + "grad_norm": 6.144971691653064, + "learning_rate": 1.999315547388197e-06, + "loss": 0.6966, + "step": 1389 + }, + { + "epoch": 0.21, + "grad_norm": 12.456980046165912, + "learning_rate": 1.999311968967558e-06, + "loss": 0.7064, + "step": 1390 + }, + { + "epoch": 0.21, + "grad_norm": 6.8192011757707585, + "learning_rate": 1.9993083812202517e-06, + "loss": 0.6986, + "step": 1391 + }, + { + "epoch": 0.21, + "grad_norm": 5.393815201126012, + "learning_rate": 1.9993047841463123e-06, + "loss": 0.6973, + "step": 1392 + }, + { + "epoch": 0.21, + "grad_norm": 1.864807960931258, + "learning_rate": 1.999301177745773e-06, + "loss": 0.6921, + "step": 1393 + }, + { + "epoch": 0.21, + "grad_norm": 1.2086541712553573, + "learning_rate": 1.9992975620186672e-06, + "loss": 0.6979, + "step": 1394 + }, + { + "epoch": 0.21, + "grad_norm": 6.372256497596961, + "learning_rate": 1.999293936965029e-06, + "loss": 0.6973, + "step": 1395 + }, + { + "epoch": 0.21, + "grad_norm": 0.8481659987928563, + "learning_rate": 1.9992903025848917e-06, + "loss": 0.6979, + "step": 1396 + }, + { + "epoch": 0.21, + "grad_norm": 2.22787993865055, + "learning_rate": 1.9992866588782903e-06, + "loss": 0.6875, + "step": 1397 + }, + { + "epoch": 0.21, + "grad_norm": 6.968308898670227, + "learning_rate": 1.9992830058452577e-06, + "loss": 0.7018, + "step": 1398 + }, + { + "epoch": 0.21, + "grad_norm": 4.3345894238453235, + "learning_rate": 1.9992793434858287e-06, + "loss": 0.6973, + "step": 1399 + }, + { + "epoch": 0.21, + "grad_norm": 10.72630080141327, + "learning_rate": 1.999275671800037e-06, + "loss": 0.6849, + "step": 1400 + }, + { + "epoch": 0.21, + "grad_norm": 4.0998767153567925, + "learning_rate": 1.999271990787917e-06, + "loss": 0.6908, + "step": 1401 + }, + { + "epoch": 0.21, + "grad_norm": 0.910933972594582, + "learning_rate": 1.9992683004495037e-06, + "loss": 0.6842, + "step": 1402 + }, + { + "epoch": 0.21, + "grad_norm": 4.808772309092146, + "learning_rate": 1.9992646007848306e-06, + "loss": 0.6888, + "step": 1403 + }, + { + "epoch": 0.21, + "grad_norm": 3.3173534659933797, + "learning_rate": 1.9992608917939322e-06, + "loss": 0.6934, + "step": 1404 + }, + { + "epoch": 0.21, + "grad_norm": 6.695833477945199, + "learning_rate": 1.9992571734768444e-06, + "loss": 0.6953, + "step": 1405 + }, + { + "epoch": 0.21, + "grad_norm": 3.4754760115528986, + "learning_rate": 1.9992534458336008e-06, + "loss": 0.6927, + "step": 1406 + }, + { + "epoch": 0.21, + "grad_norm": 5.135721713350302, + "learning_rate": 1.999249708864236e-06, + "loss": 0.6901, + "step": 1407 + }, + { + "epoch": 0.21, + "grad_norm": 7.730005580133342, + "learning_rate": 1.999245962568785e-06, + "loss": 0.6947, + "step": 1408 + }, + { + "epoch": 0.21, + "grad_norm": 2.3095089379911298, + "learning_rate": 1.999242206947284e-06, + "loss": 0.6901, + "step": 1409 + }, + { + "epoch": 0.21, + "grad_norm": 7.336873933932744, + "learning_rate": 1.9992384419997664e-06, + "loss": 0.694, + "step": 1410 + }, + { + "epoch": 0.21, + "grad_norm": 4.168439481729182, + "learning_rate": 1.9992346677262684e-06, + "loss": 0.6868, + "step": 1411 + }, + { + "epoch": 0.21, + "grad_norm": 6.690830641350171, + "learning_rate": 1.999230884126825e-06, + "loss": 0.6973, + "step": 1412 + }, + { + "epoch": 0.21, + "grad_norm": 4.8587240803626335, + "learning_rate": 1.999227091201471e-06, + "loss": 0.6966, + "step": 1413 + }, + { + "epoch": 0.21, + "grad_norm": 8.04496128926026, + "learning_rate": 1.9992232889502427e-06, + "loss": 0.681, + "step": 1414 + }, + { + "epoch": 0.21, + "grad_norm": 4.298348957834557, + "learning_rate": 1.9992194773731744e-06, + "loss": 0.694, + "step": 1415 + }, + { + "epoch": 0.21, + "grad_norm": 6.1621953217606915, + "learning_rate": 1.999215656470303e-06, + "loss": 0.6921, + "step": 1416 + }, + { + "epoch": 0.21, + "grad_norm": 7.7691785339472625, + "learning_rate": 1.999211826241663e-06, + "loss": 0.6901, + "step": 1417 + }, + { + "epoch": 0.21, + "grad_norm": 2.912923096481703, + "learning_rate": 1.999207986687291e-06, + "loss": 0.6816, + "step": 1418 + }, + { + "epoch": 0.21, + "grad_norm": 20.857804353670968, + "learning_rate": 1.9992041378072223e-06, + "loss": 0.7031, + "step": 1419 + }, + { + "epoch": 0.21, + "grad_norm": 1.0207858464368178, + "learning_rate": 1.999200279601493e-06, + "loss": 0.7025, + "step": 1420 + }, + { + "epoch": 0.21, + "grad_norm": 3.4483928215027646, + "learning_rate": 1.9991964120701394e-06, + "loss": 0.6953, + "step": 1421 + }, + { + "epoch": 0.21, + "grad_norm": 11.492184381882627, + "learning_rate": 1.999192535213197e-06, + "loss": 0.707, + "step": 1422 + }, + { + "epoch": 0.21, + "grad_norm": 0.9284906663578417, + "learning_rate": 1.9991886490307025e-06, + "loss": 0.6855, + "step": 1423 + }, + { + "epoch": 0.21, + "grad_norm": 2.1616572208596527, + "learning_rate": 1.9991847535226917e-06, + "loss": 0.6927, + "step": 1424 + }, + { + "epoch": 0.21, + "grad_norm": 4.676605184520193, + "learning_rate": 1.9991808486892013e-06, + "loss": 0.6999, + "step": 1425 + }, + { + "epoch": 0.21, + "grad_norm": 1.9636216958048496, + "learning_rate": 1.999176934530268e-06, + "loss": 0.6862, + "step": 1426 + }, + { + "epoch": 0.21, + "grad_norm": 10.980947716983463, + "learning_rate": 1.999173011045928e-06, + "loss": 0.7018, + "step": 1427 + }, + { + "epoch": 0.21, + "grad_norm": 2.0822602410672646, + "learning_rate": 1.9991690782362177e-06, + "loss": 0.6901, + "step": 1428 + }, + { + "epoch": 0.21, + "grad_norm": 16.253262748887245, + "learning_rate": 1.999165136101174e-06, + "loss": 0.7038, + "step": 1429 + }, + { + "epoch": 0.21, + "grad_norm": 2.915637808423142, + "learning_rate": 1.999161184640834e-06, + "loss": 0.6829, + "step": 1430 + }, + { + "epoch": 0.21, + "grad_norm": 6.328520577440131, + "learning_rate": 1.9991572238552337e-06, + "loss": 0.681, + "step": 1431 + }, + { + "epoch": 0.21, + "grad_norm": 2.4696451365236114, + "learning_rate": 1.9991532537444115e-06, + "loss": 0.6712, + "step": 1432 + }, + { + "epoch": 0.21, + "grad_norm": 4.24023921479907, + "learning_rate": 1.9991492743084033e-06, + "loss": 0.6934, + "step": 1433 + }, + { + "epoch": 0.21, + "grad_norm": 8.367698957957453, + "learning_rate": 1.9991452855472463e-06, + "loss": 0.7103, + "step": 1434 + }, + { + "epoch": 0.21, + "grad_norm": 2.6237104675398717, + "learning_rate": 1.9991412874609778e-06, + "loss": 0.6901, + "step": 1435 + }, + { + "epoch": 0.21, + "grad_norm": 4.290902033317405, + "learning_rate": 1.999137280049636e-06, + "loss": 0.6784, + "step": 1436 + }, + { + "epoch": 0.21, + "grad_norm": 10.79148844343837, + "learning_rate": 1.999133263313257e-06, + "loss": 0.6966, + "step": 1437 + }, + { + "epoch": 0.21, + "grad_norm": 1.1506205384082024, + "learning_rate": 1.9991292372518793e-06, + "loss": 0.6888, + "step": 1438 + }, + { + "epoch": 0.21, + "grad_norm": 5.908192062762571, + "learning_rate": 1.99912520186554e-06, + "loss": 0.694, + "step": 1439 + }, + { + "epoch": 0.21, + "grad_norm": 1.6295205609624253, + "learning_rate": 1.999121157154277e-06, + "loss": 0.6986, + "step": 1440 + }, + { + "epoch": 0.21, + "grad_norm": 2.3088512527684473, + "learning_rate": 1.999117103118127e-06, + "loss": 0.6849, + "step": 1441 + }, + { + "epoch": 0.22, + "grad_norm": 5.250723279714798, + "learning_rate": 1.9991130397571297e-06, + "loss": 0.6908, + "step": 1442 + }, + { + "epoch": 0.22, + "grad_norm": 1.1305960313536874, + "learning_rate": 1.9991089670713215e-06, + "loss": 0.6855, + "step": 1443 + }, + { + "epoch": 0.22, + "grad_norm": 3.593224823939013, + "learning_rate": 1.9991048850607415e-06, + "loss": 0.694, + "step": 1444 + }, + { + "epoch": 0.22, + "grad_norm": 4.504946385621945, + "learning_rate": 1.999100793725427e-06, + "loss": 0.6992, + "step": 1445 + }, + { + "epoch": 0.22, + "grad_norm": 0.9090905998453783, + "learning_rate": 1.999096693065416e-06, + "loss": 0.6855, + "step": 1446 + }, + { + "epoch": 0.22, + "grad_norm": 0.9879595653048917, + "learning_rate": 1.9990925830807476e-06, + "loss": 0.7012, + "step": 1447 + }, + { + "epoch": 0.22, + "grad_norm": 1.4649529417059681, + "learning_rate": 1.9990884637714596e-06, + "loss": 0.6862, + "step": 1448 + }, + { + "epoch": 0.22, + "grad_norm": 7.691447858669326, + "learning_rate": 1.999084335137591e-06, + "loss": 0.6908, + "step": 1449 + }, + { + "epoch": 0.22, + "grad_norm": 3.6248457642800878, + "learning_rate": 1.9990801971791793e-06, + "loss": 0.6973, + "step": 1450 + }, + { + "epoch": 0.22, + "grad_norm": 1.3563952294687065, + "learning_rate": 1.9990760498962645e-06, + "loss": 0.7025, + "step": 1451 + }, + { + "epoch": 0.22, + "grad_norm": 4.381495588679319, + "learning_rate": 1.999071893288884e-06, + "loss": 0.7064, + "step": 1452 + }, + { + "epoch": 0.22, + "grad_norm": 7.558572861422781, + "learning_rate": 1.9990677273570773e-06, + "loss": 0.7064, + "step": 1453 + }, + { + "epoch": 0.22, + "grad_norm": 7.592402613188227, + "learning_rate": 1.9990635521008837e-06, + "loss": 0.6966, + "step": 1454 + }, + { + "epoch": 0.22, + "grad_norm": 4.579184911649686, + "learning_rate": 1.999059367520341e-06, + "loss": 0.6855, + "step": 1455 + }, + { + "epoch": 0.22, + "grad_norm": 8.431303756300698, + "learning_rate": 1.999055173615489e-06, + "loss": 0.7038, + "step": 1456 + }, + { + "epoch": 0.22, + "grad_norm": 1.13784639894286, + "learning_rate": 1.9990509703863667e-06, + "loss": 0.6934, + "step": 1457 + }, + { + "epoch": 0.22, + "grad_norm": 3.479105861444483, + "learning_rate": 1.9990467578330136e-06, + "loss": 0.6882, + "step": 1458 + }, + { + "epoch": 0.22, + "grad_norm": 11.256134638338795, + "learning_rate": 1.999042535955468e-06, + "loss": 0.6986, + "step": 1459 + }, + { + "epoch": 0.22, + "grad_norm": 2.060115163738568, + "learning_rate": 1.9990383047537707e-06, + "loss": 0.681, + "step": 1460 + }, + { + "epoch": 0.22, + "grad_norm": 0.7505406396077473, + "learning_rate": 1.9990340642279604e-06, + "loss": 0.6895, + "step": 1461 + }, + { + "epoch": 0.22, + "grad_norm": 7.135624878872752, + "learning_rate": 1.999029814378077e-06, + "loss": 0.707, + "step": 1462 + }, + { + "epoch": 0.22, + "grad_norm": 2.585701746855526, + "learning_rate": 1.99902555520416e-06, + "loss": 0.6823, + "step": 1463 + }, + { + "epoch": 0.22, + "grad_norm": 2.0488945039833766, + "learning_rate": 1.9990212867062486e-06, + "loss": 0.694, + "step": 1464 + }, + { + "epoch": 0.22, + "grad_norm": 6.155114039202514, + "learning_rate": 1.9990170088843843e-06, + "loss": 0.6829, + "step": 1465 + }, + { + "epoch": 0.22, + "grad_norm": 1.7191715368276308, + "learning_rate": 1.999012721738605e-06, + "loss": 0.6888, + "step": 1466 + }, + { + "epoch": 0.22, + "grad_norm": 1.9869717874914117, + "learning_rate": 1.999008425268952e-06, + "loss": 0.6901, + "step": 1467 + }, + { + "epoch": 0.22, + "grad_norm": 3.692726189469868, + "learning_rate": 1.9990041194754646e-06, + "loss": 0.6895, + "step": 1468 + }, + { + "epoch": 0.22, + "grad_norm": 9.321083723221184, + "learning_rate": 1.9989998043581835e-06, + "loss": 0.7031, + "step": 1469 + }, + { + "epoch": 0.22, + "grad_norm": 1.2173074620702786, + "learning_rate": 1.998995479917149e-06, + "loss": 0.6836, + "step": 1470 + }, + { + "epoch": 0.22, + "grad_norm": 0.7649288252967942, + "learning_rate": 1.9989911461524012e-06, + "loss": 0.6895, + "step": 1471 + }, + { + "epoch": 0.22, + "grad_norm": 3.181114195966454, + "learning_rate": 1.998986803063981e-06, + "loss": 0.6992, + "step": 1472 + }, + { + "epoch": 0.22, + "grad_norm": 1.4363371574313684, + "learning_rate": 1.9989824506519286e-06, + "loss": 0.6999, + "step": 1473 + }, + { + "epoch": 0.22, + "grad_norm": 4.140351508335359, + "learning_rate": 1.9989780889162846e-06, + "loss": 0.6842, + "step": 1474 + }, + { + "epoch": 0.22, + "grad_norm": 0.7905022137777103, + "learning_rate": 1.9989737178570894e-06, + "loss": 0.6992, + "step": 1475 + }, + { + "epoch": 0.22, + "grad_norm": 9.22086757010514, + "learning_rate": 1.9989693374743845e-06, + "loss": 0.694, + "step": 1476 + }, + { + "epoch": 0.22, + "grad_norm": 1.2345930780297119, + "learning_rate": 1.9989649477682104e-06, + "loss": 0.6947, + "step": 1477 + }, + { + "epoch": 0.22, + "grad_norm": 3.3756779669012205, + "learning_rate": 1.998960548738608e-06, + "loss": 0.6973, + "step": 1478 + }, + { + "epoch": 0.22, + "grad_norm": 1.30433761135794, + "learning_rate": 1.9989561403856185e-06, + "loss": 0.6862, + "step": 1479 + }, + { + "epoch": 0.22, + "grad_norm": 0.8785420291651507, + "learning_rate": 1.9989517227092828e-06, + "loss": 0.6895, + "step": 1480 + }, + { + "epoch": 0.22, + "grad_norm": 10.872723880130971, + "learning_rate": 1.9989472957096427e-06, + "loss": 0.7057, + "step": 1481 + }, + { + "epoch": 0.22, + "grad_norm": 15.155010806175204, + "learning_rate": 1.998942859386739e-06, + "loss": 0.6953, + "step": 1482 + }, + { + "epoch": 0.22, + "grad_norm": 9.520662239077758, + "learning_rate": 1.998938413740613e-06, + "loss": 0.696, + "step": 1483 + }, + { + "epoch": 0.22, + "grad_norm": 5.660132682275333, + "learning_rate": 1.9989339587713067e-06, + "loss": 0.6908, + "step": 1484 + }, + { + "epoch": 0.22, + "grad_norm": 0.6538546266801375, + "learning_rate": 1.9989294944788617e-06, + "loss": 0.6973, + "step": 1485 + }, + { + "epoch": 0.22, + "grad_norm": 3.5344956739903575, + "learning_rate": 1.998925020863319e-06, + "loss": 0.6888, + "step": 1486 + }, + { + "epoch": 0.22, + "grad_norm": 3.85707919779448, + "learning_rate": 1.9989205379247206e-06, + "loss": 0.696, + "step": 1487 + }, + { + "epoch": 0.22, + "grad_norm": 6.976320106262834, + "learning_rate": 1.9989160456631088e-06, + "loss": 0.6895, + "step": 1488 + }, + { + "epoch": 0.22, + "grad_norm": 4.240571042001372, + "learning_rate": 1.998911544078525e-06, + "loss": 0.6927, + "step": 1489 + }, + { + "epoch": 0.22, + "grad_norm": 1.236716822426893, + "learning_rate": 1.9989070331710118e-06, + "loss": 0.6882, + "step": 1490 + }, + { + "epoch": 0.22, + "grad_norm": 4.404311902637231, + "learning_rate": 1.9989025129406104e-06, + "loss": 0.6914, + "step": 1491 + }, + { + "epoch": 0.22, + "grad_norm": 1.835022036586463, + "learning_rate": 1.998897983387364e-06, + "loss": 0.6908, + "step": 1492 + }, + { + "epoch": 0.22, + "grad_norm": 6.76728307846668, + "learning_rate": 1.998893444511314e-06, + "loss": 0.6927, + "step": 1493 + }, + { + "epoch": 0.22, + "grad_norm": 4.052613348044037, + "learning_rate": 1.998888896312503e-06, + "loss": 0.6901, + "step": 1494 + }, + { + "epoch": 0.22, + "grad_norm": 3.4850082373742692, + "learning_rate": 1.9988843387909744e-06, + "loss": 0.6999, + "step": 1495 + }, + { + "epoch": 0.22, + "grad_norm": 6.908445880725529, + "learning_rate": 1.998879771946769e-06, + "loss": 0.6973, + "step": 1496 + }, + { + "epoch": 0.22, + "grad_norm": 3.6926802055121524, + "learning_rate": 1.998875195779931e-06, + "loss": 0.6901, + "step": 1497 + }, + { + "epoch": 0.22, + "grad_norm": 7.889265356673741, + "learning_rate": 1.998870610290502e-06, + "loss": 0.694, + "step": 1498 + }, + { + "epoch": 0.22, + "grad_norm": 1.2956619961509452, + "learning_rate": 1.9988660154785254e-06, + "loss": 0.6855, + "step": 1499 + }, + { + "epoch": 0.22, + "grad_norm": 4.85843534583009, + "learning_rate": 1.9988614113440444e-06, + "loss": 0.6908, + "step": 1500 + }, + { + "epoch": 0.22, + "grad_norm": 13.33773110098262, + "learning_rate": 1.998856797887101e-06, + "loss": 0.6999, + "step": 1501 + }, + { + "epoch": 0.22, + "grad_norm": 7.1668265829991595, + "learning_rate": 1.9988521751077387e-06, + "loss": 0.6764, + "step": 1502 + }, + { + "epoch": 0.22, + "grad_norm": 10.80851401889903, + "learning_rate": 1.998847543006001e-06, + "loss": 0.707, + "step": 1503 + }, + { + "epoch": 0.22, + "grad_norm": 8.118119438380875, + "learning_rate": 1.9988429015819306e-06, + "loss": 0.6927, + "step": 1504 + }, + { + "epoch": 0.22, + "grad_norm": 4.902865177160359, + "learning_rate": 1.9988382508355716e-06, + "loss": 0.6992, + "step": 1505 + }, + { + "epoch": 0.22, + "grad_norm": 7.155749987313715, + "learning_rate": 1.9988335907669662e-06, + "loss": 0.694, + "step": 1506 + }, + { + "epoch": 0.22, + "grad_norm": 5.494475652624614, + "learning_rate": 1.998828921376159e-06, + "loss": 0.707, + "step": 1507 + }, + { + "epoch": 0.22, + "grad_norm": 7.366087729928321, + "learning_rate": 1.998824242663193e-06, + "loss": 0.6888, + "step": 1508 + }, + { + "epoch": 0.23, + "grad_norm": 13.236118968825025, + "learning_rate": 1.998819554628112e-06, + "loss": 0.6947, + "step": 1509 + }, + { + "epoch": 0.23, + "grad_norm": 7.307144419395531, + "learning_rate": 1.9988148572709597e-06, + "loss": 0.7025, + "step": 1510 + }, + { + "epoch": 0.23, + "grad_norm": 1.187924429859214, + "learning_rate": 1.99881015059178e-06, + "loss": 0.696, + "step": 1511 + }, + { + "epoch": 0.23, + "grad_norm": 2.7054575518024198, + "learning_rate": 1.9988054345906173e-06, + "loss": 0.6992, + "step": 1512 + }, + { + "epoch": 0.23, + "grad_norm": 7.775694970408079, + "learning_rate": 1.9988007092675143e-06, + "loss": 0.6868, + "step": 1513 + }, + { + "epoch": 0.23, + "grad_norm": 4.343438991173024, + "learning_rate": 1.9987959746225165e-06, + "loss": 0.6829, + "step": 1514 + }, + { + "epoch": 0.23, + "grad_norm": 0.6860879142869721, + "learning_rate": 1.9987912306556674e-06, + "loss": 0.6895, + "step": 1515 + }, + { + "epoch": 0.23, + "grad_norm": 0.8729819539144689, + "learning_rate": 1.9987864773670115e-06, + "loss": 0.6868, + "step": 1516 + }, + { + "epoch": 0.23, + "grad_norm": 3.9283908031681842, + "learning_rate": 1.998781714756593e-06, + "loss": 0.6992, + "step": 1517 + }, + { + "epoch": 0.23, + "grad_norm": 5.977755674504611, + "learning_rate": 1.998776942824457e-06, + "loss": 0.6992, + "step": 1518 + }, + { + "epoch": 0.23, + "grad_norm": 0.7788188399253542, + "learning_rate": 1.998772161570647e-06, + "loss": 0.6934, + "step": 1519 + }, + { + "epoch": 0.23, + "grad_norm": 6.098854203152109, + "learning_rate": 1.9987673709952076e-06, + "loss": 0.6875, + "step": 1520 + }, + { + "epoch": 0.23, + "grad_norm": 11.740205941355486, + "learning_rate": 1.9987625710981848e-06, + "loss": 0.6999, + "step": 1521 + }, + { + "epoch": 0.23, + "grad_norm": 0.8726968589614698, + "learning_rate": 1.998757761879622e-06, + "loss": 0.6908, + "step": 1522 + }, + { + "epoch": 0.23, + "grad_norm": 1.352247531292313, + "learning_rate": 1.998752943339565e-06, + "loss": 0.6882, + "step": 1523 + }, + { + "epoch": 0.23, + "grad_norm": 1.0416261146523942, + "learning_rate": 1.9987481154780584e-06, + "loss": 0.6875, + "step": 1524 + }, + { + "epoch": 0.23, + "grad_norm": 1.157950799133916, + "learning_rate": 1.998743278295147e-06, + "loss": 0.6999, + "step": 1525 + }, + { + "epoch": 0.23, + "grad_norm": 3.903312088936948, + "learning_rate": 1.9987384317908766e-06, + "loss": 0.6973, + "step": 1526 + }, + { + "epoch": 0.23, + "grad_norm": 7.583080539075241, + "learning_rate": 1.9987335759652922e-06, + "loss": 0.6934, + "step": 1527 + }, + { + "epoch": 0.23, + "grad_norm": 3.5340326191452216, + "learning_rate": 1.998728710818439e-06, + "loss": 0.6973, + "step": 1528 + }, + { + "epoch": 0.23, + "grad_norm": 2.699189413875399, + "learning_rate": 1.998723836350362e-06, + "loss": 0.7005, + "step": 1529 + }, + { + "epoch": 0.23, + "grad_norm": 0.7843916675773394, + "learning_rate": 1.9987189525611074e-06, + "loss": 0.694, + "step": 1530 + }, + { + "epoch": 0.23, + "grad_norm": 7.379179760625495, + "learning_rate": 1.9987140594507205e-06, + "loss": 0.694, + "step": 1531 + }, + { + "epoch": 0.23, + "grad_norm": 4.139223742460426, + "learning_rate": 1.9987091570192472e-06, + "loss": 0.6921, + "step": 1532 + }, + { + "epoch": 0.23, + "grad_norm": 1.038470341631544, + "learning_rate": 1.9987042452667324e-06, + "loss": 0.6947, + "step": 1533 + }, + { + "epoch": 0.23, + "grad_norm": 7.519043598297152, + "learning_rate": 1.998699324193223e-06, + "loss": 0.6947, + "step": 1534 + }, + { + "epoch": 0.23, + "grad_norm": 1.7410713482483762, + "learning_rate": 1.9986943937987647e-06, + "loss": 0.6875, + "step": 1535 + }, + { + "epoch": 0.23, + "grad_norm": 4.469017526990286, + "learning_rate": 1.9986894540834028e-06, + "loss": 0.6953, + "step": 1536 + }, + { + "epoch": 0.23, + "grad_norm": 1.8880282111989484, + "learning_rate": 1.998684505047184e-06, + "loss": 0.6908, + "step": 1537 + }, + { + "epoch": 0.23, + "grad_norm": 2.9873381161048735, + "learning_rate": 1.9986795466901546e-06, + "loss": 0.6914, + "step": 1538 + }, + { + "epoch": 0.23, + "grad_norm": 1.9438905555989945, + "learning_rate": 1.9986745790123606e-06, + "loss": 0.6895, + "step": 1539 + }, + { + "epoch": 0.23, + "grad_norm": 1.76083225924013, + "learning_rate": 1.9986696020138486e-06, + "loss": 0.6908, + "step": 1540 + }, + { + "epoch": 0.23, + "grad_norm": 2.824374020722826, + "learning_rate": 1.9986646156946645e-06, + "loss": 0.6953, + "step": 1541 + }, + { + "epoch": 0.23, + "grad_norm": 3.5186004784780445, + "learning_rate": 1.998659620054855e-06, + "loss": 0.6888, + "step": 1542 + }, + { + "epoch": 0.23, + "grad_norm": 1.5857597719825511, + "learning_rate": 1.9986546150944677e-06, + "loss": 0.6842, + "step": 1543 + }, + { + "epoch": 0.23, + "grad_norm": 12.725387017719347, + "learning_rate": 1.9986496008135486e-06, + "loss": 0.7031, + "step": 1544 + }, + { + "epoch": 0.23, + "grad_norm": 8.569918251324989, + "learning_rate": 1.9986445772121437e-06, + "loss": 0.7057, + "step": 1545 + }, + { + "epoch": 0.23, + "grad_norm": 1.8265146908632692, + "learning_rate": 1.998639544290301e-06, + "loss": 0.6934, + "step": 1546 + }, + { + "epoch": 0.23, + "grad_norm": 2.5519952165550555, + "learning_rate": 1.9986345020480677e-06, + "loss": 0.6927, + "step": 1547 + }, + { + "epoch": 0.23, + "grad_norm": 7.080543318930328, + "learning_rate": 1.9986294504854893e-06, + "loss": 0.6953, + "step": 1548 + }, + { + "epoch": 0.23, + "grad_norm": 8.246605901880578, + "learning_rate": 1.9986243896026145e-06, + "loss": 0.6927, + "step": 1549 + }, + { + "epoch": 0.23, + "grad_norm": 6.999942661009999, + "learning_rate": 1.9986193193994896e-06, + "loss": 0.6921, + "step": 1550 + }, + { + "epoch": 0.23, + "grad_norm": 5.113024421214025, + "learning_rate": 1.9986142398761627e-06, + "loss": 0.7012, + "step": 1551 + }, + { + "epoch": 0.23, + "grad_norm": 6.947071652858249, + "learning_rate": 1.9986091510326807e-06, + "loss": 0.6966, + "step": 1552 + }, + { + "epoch": 0.23, + "grad_norm": 4.675297567527166, + "learning_rate": 1.9986040528690906e-06, + "loss": 0.6986, + "step": 1553 + }, + { + "epoch": 0.23, + "grad_norm": 4.830767704950079, + "learning_rate": 1.998598945385441e-06, + "loss": 0.6992, + "step": 1554 + }, + { + "epoch": 0.23, + "grad_norm": 3.552290728559268, + "learning_rate": 1.9985938285817798e-06, + "loss": 0.6921, + "step": 1555 + }, + { + "epoch": 0.23, + "grad_norm": 5.147455377316412, + "learning_rate": 1.998588702458153e-06, + "loss": 0.6947, + "step": 1556 + }, + { + "epoch": 0.23, + "grad_norm": 2.743104247072329, + "learning_rate": 1.99858356701461e-06, + "loss": 0.6868, + "step": 1557 + }, + { + "epoch": 0.23, + "grad_norm": 2.7502941282094118, + "learning_rate": 1.9985784222511983e-06, + "loss": 0.6979, + "step": 1558 + }, + { + "epoch": 0.23, + "grad_norm": 1.1305367453853248, + "learning_rate": 1.998573268167966e-06, + "loss": 0.6947, + "step": 1559 + }, + { + "epoch": 0.23, + "grad_norm": 0.6861034043419411, + "learning_rate": 1.9985681047649607e-06, + "loss": 0.6895, + "step": 1560 + }, + { + "epoch": 0.23, + "grad_norm": 0.6458015220654232, + "learning_rate": 1.998562932042231e-06, + "loss": 0.6888, + "step": 1561 + }, + { + "epoch": 0.23, + "grad_norm": 12.46144657906583, + "learning_rate": 1.9985577499998256e-06, + "loss": 0.7005, + "step": 1562 + }, + { + "epoch": 0.23, + "grad_norm": 1.0152310691816928, + "learning_rate": 1.998552558637792e-06, + "loss": 0.6921, + "step": 1563 + }, + { + "epoch": 0.23, + "grad_norm": 1.627741924089239, + "learning_rate": 1.9985473579561792e-06, + "loss": 0.6908, + "step": 1564 + }, + { + "epoch": 0.23, + "grad_norm": 4.255235829810926, + "learning_rate": 1.9985421479550357e-06, + "loss": 0.6901, + "step": 1565 + }, + { + "epoch": 0.23, + "grad_norm": 3.3494389126895694, + "learning_rate": 1.99853692863441e-06, + "loss": 0.6947, + "step": 1566 + }, + { + "epoch": 0.23, + "grad_norm": 2.1460994405208043, + "learning_rate": 1.9985316999943505e-06, + "loss": 0.6868, + "step": 1567 + }, + { + "epoch": 0.23, + "grad_norm": 3.4084413003414373, + "learning_rate": 1.9985264620349067e-06, + "loss": 0.6882, + "step": 1568 + }, + { + "epoch": 0.23, + "grad_norm": 3.687557630674605, + "learning_rate": 1.998521214756127e-06, + "loss": 0.6868, + "step": 1569 + }, + { + "epoch": 0.23, + "grad_norm": 3.4799908063906995, + "learning_rate": 1.9985159581580602e-06, + "loss": 0.6908, + "step": 1570 + }, + { + "epoch": 0.23, + "grad_norm": 1.4215022144351395, + "learning_rate": 1.9985106922407562e-06, + "loss": 0.6921, + "step": 1571 + }, + { + "epoch": 0.23, + "grad_norm": 2.565014955617805, + "learning_rate": 1.998505417004263e-06, + "loss": 0.6914, + "step": 1572 + }, + { + "epoch": 0.23, + "grad_norm": 0.517504934573657, + "learning_rate": 1.998500132448631e-06, + "loss": 0.6914, + "step": 1573 + }, + { + "epoch": 0.23, + "grad_norm": 1.5359858101084367, + "learning_rate": 1.998494838573909e-06, + "loss": 0.6895, + "step": 1574 + }, + { + "epoch": 0.23, + "grad_norm": 0.5549559454041229, + "learning_rate": 1.998489535380146e-06, + "loss": 0.6973, + "step": 1575 + }, + { + "epoch": 0.24, + "grad_norm": 2.1942567684412024, + "learning_rate": 1.998484222867392e-06, + "loss": 0.6914, + "step": 1576 + }, + { + "epoch": 0.24, + "grad_norm": 1.400683344337876, + "learning_rate": 1.9984789010356965e-06, + "loss": 0.6953, + "step": 1577 + }, + { + "epoch": 0.24, + "grad_norm": 1.0783367419777454, + "learning_rate": 1.998473569885109e-06, + "loss": 0.7012, + "step": 1578 + }, + { + "epoch": 0.24, + "grad_norm": 1.6963836635757945, + "learning_rate": 1.9984682294156796e-06, + "loss": 0.6979, + "step": 1579 + }, + { + "epoch": 0.24, + "grad_norm": 3.4068140699898493, + "learning_rate": 1.9984628796274576e-06, + "loss": 0.6849, + "step": 1580 + }, + { + "epoch": 0.24, + "grad_norm": 9.290381960016264, + "learning_rate": 1.9984575205204933e-06, + "loss": 0.696, + "step": 1581 + }, + { + "epoch": 0.24, + "grad_norm": 2.340362553008042, + "learning_rate": 1.998452152094837e-06, + "loss": 0.6979, + "step": 1582 + }, + { + "epoch": 0.24, + "grad_norm": 1.3370064005579032, + "learning_rate": 1.998446774350538e-06, + "loss": 0.6979, + "step": 1583 + }, + { + "epoch": 0.24, + "grad_norm": 3.8861791204493215, + "learning_rate": 1.9984413872876474e-06, + "loss": 0.6855, + "step": 1584 + }, + { + "epoch": 0.24, + "grad_norm": 2.852711729472746, + "learning_rate": 1.998435990906215e-06, + "loss": 0.694, + "step": 1585 + }, + { + "epoch": 0.24, + "grad_norm": 3.44685151917126, + "learning_rate": 1.9984305852062907e-06, + "loss": 0.6992, + "step": 1586 + }, + { + "epoch": 0.24, + "grad_norm": 3.4189454533842563, + "learning_rate": 1.998425170187926e-06, + "loss": 0.6901, + "step": 1587 + }, + { + "epoch": 0.24, + "grad_norm": 4.452967935859514, + "learning_rate": 1.9984197458511704e-06, + "loss": 0.7005, + "step": 1588 + }, + { + "epoch": 0.24, + "grad_norm": 2.8411422121489402, + "learning_rate": 1.9984143121960755e-06, + "loss": 0.6953, + "step": 1589 + }, + { + "epoch": 0.24, + "grad_norm": 0.6236646724892793, + "learning_rate": 1.998408869222691e-06, + "loss": 0.6862, + "step": 1590 + }, + { + "epoch": 0.24, + "grad_norm": 0.8460367294446257, + "learning_rate": 1.9984034169310687e-06, + "loss": 0.6816, + "step": 1591 + }, + { + "epoch": 0.24, + "grad_norm": 3.407571404430164, + "learning_rate": 1.998397955321259e-06, + "loss": 0.6868, + "step": 1592 + }, + { + "epoch": 0.24, + "grad_norm": 1.6356123729107992, + "learning_rate": 1.9983924843933126e-06, + "loss": 0.6908, + "step": 1593 + }, + { + "epoch": 0.24, + "grad_norm": 5.275019858775861, + "learning_rate": 1.998387004147281e-06, + "loss": 0.6849, + "step": 1594 + }, + { + "epoch": 0.24, + "grad_norm": 4.404253541504839, + "learning_rate": 1.998381514583215e-06, + "loss": 0.694, + "step": 1595 + }, + { + "epoch": 0.24, + "grad_norm": 7.024806797410414, + "learning_rate": 1.9983760157011664e-06, + "loss": 0.7012, + "step": 1596 + }, + { + "epoch": 0.24, + "grad_norm": 9.486182033835039, + "learning_rate": 1.9983705075011856e-06, + "loss": 0.6921, + "step": 1597 + }, + { + "epoch": 0.24, + "grad_norm": 3.322936193216696, + "learning_rate": 1.9983649899833254e-06, + "loss": 0.6947, + "step": 1598 + }, + { + "epoch": 0.24, + "grad_norm": 4.794555007728654, + "learning_rate": 1.998359463147636e-06, + "loss": 0.6934, + "step": 1599 + }, + { + "epoch": 0.24, + "grad_norm": 3.908177061182766, + "learning_rate": 1.998353926994169e-06, + "loss": 0.6986, + "step": 1600 + }, + { + "epoch": 0.24, + "grad_norm": 1.6457877768423204, + "learning_rate": 1.9983483815229773e-06, + "loss": 0.6862, + "step": 1601 + }, + { + "epoch": 0.24, + "grad_norm": 1.0309336261492552, + "learning_rate": 1.9983428267341113e-06, + "loss": 0.6908, + "step": 1602 + }, + { + "epoch": 0.24, + "grad_norm": 1.8714146780875534, + "learning_rate": 1.9983372626276237e-06, + "loss": 0.6934, + "step": 1603 + }, + { + "epoch": 0.24, + "grad_norm": 6.30435298685771, + "learning_rate": 1.998331689203566e-06, + "loss": 0.6914, + "step": 1604 + }, + { + "epoch": 0.24, + "grad_norm": 1.268348980171225, + "learning_rate": 1.9983261064619904e-06, + "loss": 0.6862, + "step": 1605 + }, + { + "epoch": 0.24, + "grad_norm": 1.2901307050246993, + "learning_rate": 1.9983205144029493e-06, + "loss": 0.6816, + "step": 1606 + }, + { + "epoch": 0.24, + "grad_norm": 1.9179867435191638, + "learning_rate": 1.998314913026494e-06, + "loss": 0.6908, + "step": 1607 + }, + { + "epoch": 0.24, + "grad_norm": 14.208068285332532, + "learning_rate": 1.9983093023326778e-06, + "loss": 0.7064, + "step": 1608 + }, + { + "epoch": 0.24, + "grad_norm": 2.1082117493754375, + "learning_rate": 1.998303682321552e-06, + "loss": 0.6947, + "step": 1609 + }, + { + "epoch": 0.24, + "grad_norm": 12.595879245570591, + "learning_rate": 1.99829805299317e-06, + "loss": 0.6868, + "step": 1610 + }, + { + "epoch": 0.24, + "grad_norm": 10.115481429542578, + "learning_rate": 1.998292414347584e-06, + "loss": 0.6927, + "step": 1611 + }, + { + "epoch": 0.24, + "grad_norm": 4.222653229764159, + "learning_rate": 1.998286766384847e-06, + "loss": 0.6862, + "step": 1612 + }, + { + "epoch": 0.24, + "grad_norm": 9.200913433576645, + "learning_rate": 1.998281109105011e-06, + "loss": 0.7038, + "step": 1613 + }, + { + "epoch": 0.24, + "grad_norm": 1.212193992644344, + "learning_rate": 1.9982754425081288e-06, + "loss": 0.696, + "step": 1614 + }, + { + "epoch": 0.24, + "grad_norm": 8.501754666327898, + "learning_rate": 1.998269766594254e-06, + "loss": 0.6908, + "step": 1615 + }, + { + "epoch": 0.24, + "grad_norm": 2.946064345510439, + "learning_rate": 1.998264081363439e-06, + "loss": 0.6966, + "step": 1616 + }, + { + "epoch": 0.24, + "grad_norm": 6.087211222053248, + "learning_rate": 1.998258386815737e-06, + "loss": 0.6953, + "step": 1617 + }, + { + "epoch": 0.24, + "grad_norm": 0.9765365442018128, + "learning_rate": 1.998252682951201e-06, + "loss": 0.6758, + "step": 1618 + }, + { + "epoch": 0.24, + "grad_norm": 7.495186050946474, + "learning_rate": 1.9982469697698846e-06, + "loss": 0.696, + "step": 1619 + }, + { + "epoch": 0.24, + "grad_norm": 4.203571827908287, + "learning_rate": 1.9982412472718413e-06, + "loss": 0.679, + "step": 1620 + }, + { + "epoch": 0.24, + "grad_norm": 1.3931333360358749, + "learning_rate": 1.998235515457124e-06, + "loss": 0.6868, + "step": 1621 + }, + { + "epoch": 0.24, + "grad_norm": 0.7196929798264255, + "learning_rate": 1.9982297743257862e-06, + "loss": 0.6855, + "step": 1622 + }, + { + "epoch": 0.24, + "grad_norm": 5.565883817835107, + "learning_rate": 1.9982240238778817e-06, + "loss": 0.6901, + "step": 1623 + }, + { + "epoch": 0.24, + "grad_norm": 5.569580187592717, + "learning_rate": 1.9982182641134643e-06, + "loss": 0.6953, + "step": 1624 + }, + { + "epoch": 0.24, + "grad_norm": 11.340102309528225, + "learning_rate": 1.9982124950325874e-06, + "loss": 0.696, + "step": 1625 + }, + { + "epoch": 0.24, + "grad_norm": 3.471030253261582, + "learning_rate": 1.9982067166353048e-06, + "loss": 0.681, + "step": 1626 + }, + { + "epoch": 0.24, + "grad_norm": 4.991380963675249, + "learning_rate": 1.9982009289216707e-06, + "loss": 0.6934, + "step": 1627 + }, + { + "epoch": 0.24, + "grad_norm": 8.607249341148323, + "learning_rate": 1.9981951318917394e-06, + "loss": 0.7057, + "step": 1628 + }, + { + "epoch": 0.24, + "grad_norm": 1.251186384191094, + "learning_rate": 1.9981893255455645e-06, + "loss": 0.6862, + "step": 1629 + }, + { + "epoch": 0.24, + "grad_norm": 2.6865127947307337, + "learning_rate": 1.9981835098832e-06, + "loss": 0.6855, + "step": 1630 + }, + { + "epoch": 0.24, + "grad_norm": 4.048699197118374, + "learning_rate": 1.9981776849047008e-06, + "loss": 0.6842, + "step": 1631 + }, + { + "epoch": 0.24, + "grad_norm": 1.4365875265588306, + "learning_rate": 1.998171850610121e-06, + "loss": 0.6895, + "step": 1632 + }, + { + "epoch": 0.24, + "grad_norm": 0.724215268923454, + "learning_rate": 1.9981660069995153e-06, + "loss": 0.6849, + "step": 1633 + }, + { + "epoch": 0.24, + "grad_norm": 1.3947253637787764, + "learning_rate": 1.998160154072938e-06, + "loss": 0.6895, + "step": 1634 + }, + { + "epoch": 0.24, + "grad_norm": 12.326004436044123, + "learning_rate": 1.9981542918304435e-06, + "loss": 0.7161, + "step": 1635 + }, + { + "epoch": 0.24, + "grad_norm": 6.155731769378703, + "learning_rate": 1.9981484202720863e-06, + "loss": 0.6914, + "step": 1636 + }, + { + "epoch": 0.24, + "grad_norm": 8.58989171226234, + "learning_rate": 1.998142539397922e-06, + "loss": 0.6901, + "step": 1637 + }, + { + "epoch": 0.24, + "grad_norm": 0.8744460440956406, + "learning_rate": 1.9981366492080053e-06, + "loss": 0.6921, + "step": 1638 + }, + { + "epoch": 0.24, + "grad_norm": 4.7156451938889905, + "learning_rate": 1.9981307497023907e-06, + "loss": 0.6901, + "step": 1639 + }, + { + "epoch": 0.24, + "grad_norm": 2.2277335393079185, + "learning_rate": 1.9981248408811335e-06, + "loss": 0.6901, + "step": 1640 + }, + { + "epoch": 0.24, + "grad_norm": 2.6315519602757287, + "learning_rate": 1.9981189227442893e-06, + "loss": 0.6973, + "step": 1641 + }, + { + "epoch": 0.24, + "grad_norm": 1.0758100761734088, + "learning_rate": 1.9981129952919123e-06, + "loss": 0.6914, + "step": 1642 + }, + { + "epoch": 0.25, + "grad_norm": 6.129532419892393, + "learning_rate": 1.998107058524059e-06, + "loss": 0.6921, + "step": 1643 + }, + { + "epoch": 0.25, + "grad_norm": 3.8206675906290006, + "learning_rate": 1.998101112440784e-06, + "loss": 0.6829, + "step": 1644 + }, + { + "epoch": 0.25, + "grad_norm": 4.342844322566621, + "learning_rate": 1.998095157042143e-06, + "loss": 0.6842, + "step": 1645 + }, + { + "epoch": 0.25, + "grad_norm": 6.098992497143411, + "learning_rate": 1.9980891923281915e-06, + "loss": 0.6979, + "step": 1646 + }, + { + "epoch": 0.25, + "grad_norm": 3.9625789964917293, + "learning_rate": 1.9980832182989856e-06, + "loss": 0.696, + "step": 1647 + }, + { + "epoch": 0.25, + "grad_norm": 7.035442956401762, + "learning_rate": 1.998077234954581e-06, + "loss": 0.6816, + "step": 1648 + }, + { + "epoch": 0.25, + "grad_norm": 2.0501705085614965, + "learning_rate": 1.9980712422950326e-06, + "loss": 0.7031, + "step": 1649 + }, + { + "epoch": 0.25, + "grad_norm": 15.133108137506468, + "learning_rate": 1.9980652403203975e-06, + "loss": 0.6966, + "step": 1650 + }, + { + "epoch": 0.25, + "grad_norm": 5.294483669339859, + "learning_rate": 1.9980592290307314e-06, + "loss": 0.6901, + "step": 1651 + }, + { + "epoch": 0.25, + "grad_norm": 3.9729553950110086, + "learning_rate": 1.99805320842609e-06, + "loss": 0.6868, + "step": 1652 + }, + { + "epoch": 0.25, + "grad_norm": 1.2450217975116464, + "learning_rate": 1.9980471785065295e-06, + "loss": 0.6934, + "step": 1653 + }, + { + "epoch": 0.25, + "grad_norm": 10.21259511283591, + "learning_rate": 1.998041139272107e-06, + "loss": 0.6973, + "step": 1654 + }, + { + "epoch": 0.25, + "grad_norm": 7.7118678887976655, + "learning_rate": 1.9980350907228776e-06, + "loss": 0.6934, + "step": 1655 + }, + { + "epoch": 0.25, + "grad_norm": 1.4225741404561323, + "learning_rate": 1.998029032858899e-06, + "loss": 0.6868, + "step": 1656 + }, + { + "epoch": 0.25, + "grad_norm": 1.2493721603353753, + "learning_rate": 1.998022965680227e-06, + "loss": 0.6764, + "step": 1657 + }, + { + "epoch": 0.25, + "grad_norm": 0.5228460805188365, + "learning_rate": 1.9980168891869185e-06, + "loss": 0.694, + "step": 1658 + }, + { + "epoch": 0.25, + "grad_norm": 0.8133461729086762, + "learning_rate": 1.99801080337903e-06, + "loss": 0.6823, + "step": 1659 + }, + { + "epoch": 0.25, + "grad_norm": 0.769774113557354, + "learning_rate": 1.9980047082566183e-06, + "loss": 0.6914, + "step": 1660 + }, + { + "epoch": 0.25, + "grad_norm": 2.0010870931287443, + "learning_rate": 1.9979986038197406e-06, + "loss": 0.6973, + "step": 1661 + }, + { + "epoch": 0.25, + "grad_norm": 6.176162478070075, + "learning_rate": 1.9979924900684536e-06, + "loss": 0.6875, + "step": 1662 + }, + { + "epoch": 0.25, + "grad_norm": 3.82348591010482, + "learning_rate": 1.9979863670028142e-06, + "loss": 0.6895, + "step": 1663 + }, + { + "epoch": 0.25, + "grad_norm": 9.027177941968828, + "learning_rate": 1.99798023462288e-06, + "loss": 0.6992, + "step": 1664 + }, + { + "epoch": 0.25, + "grad_norm": 4.817945809438226, + "learning_rate": 1.997974092928708e-06, + "loss": 0.6908, + "step": 1665 + }, + { + "epoch": 0.25, + "grad_norm": 1.0753856827414967, + "learning_rate": 1.9979679419203553e-06, + "loss": 0.6999, + "step": 1666 + }, + { + "epoch": 0.25, + "grad_norm": 1.5443525676291443, + "learning_rate": 1.9979617815978803e-06, + "loss": 0.6862, + "step": 1667 + }, + { + "epoch": 0.25, + "grad_norm": 7.546565204157627, + "learning_rate": 1.997955611961339e-06, + "loss": 0.6999, + "step": 1668 + }, + { + "epoch": 0.25, + "grad_norm": 1.3007517607327679, + "learning_rate": 1.99794943301079e-06, + "loss": 0.681, + "step": 1669 + }, + { + "epoch": 0.25, + "grad_norm": 0.6921311659973253, + "learning_rate": 1.9979432447462904e-06, + "loss": 0.6842, + "step": 1670 + }, + { + "epoch": 0.25, + "grad_norm": 1.064131228141342, + "learning_rate": 1.9979370471678986e-06, + "loss": 0.668, + "step": 1671 + }, + { + "epoch": 0.25, + "grad_norm": 5.270042700125399, + "learning_rate": 1.997930840275672e-06, + "loss": 0.6992, + "step": 1672 + }, + { + "epoch": 0.25, + "grad_norm": 1.5408794992957389, + "learning_rate": 1.9979246240696684e-06, + "loss": 0.6816, + "step": 1673 + }, + { + "epoch": 0.25, + "grad_norm": 4.479269617334028, + "learning_rate": 1.9979183985499463e-06, + "loss": 0.6979, + "step": 1674 + }, + { + "epoch": 0.25, + "grad_norm": 0.9581459056257842, + "learning_rate": 1.9979121637165633e-06, + "loss": 0.6868, + "step": 1675 + }, + { + "epoch": 0.25, + "grad_norm": 5.855720688261195, + "learning_rate": 1.997905919569578e-06, + "loss": 0.6979, + "step": 1676 + }, + { + "epoch": 0.25, + "grad_norm": 3.8365189740512924, + "learning_rate": 1.9978996661090485e-06, + "loss": 0.6947, + "step": 1677 + }, + { + "epoch": 0.25, + "grad_norm": 9.685634819765307, + "learning_rate": 1.997893403335033e-06, + "loss": 0.7044, + "step": 1678 + }, + { + "epoch": 0.25, + "grad_norm": 4.282323274554267, + "learning_rate": 1.9978871312475902e-06, + "loss": 0.6927, + "step": 1679 + }, + { + "epoch": 0.25, + "grad_norm": 2.9007650066797157, + "learning_rate": 1.9978808498467787e-06, + "loss": 0.6973, + "step": 1680 + }, + { + "epoch": 0.25, + "grad_norm": 1.731492378976467, + "learning_rate": 1.997874559132657e-06, + "loss": 0.6901, + "step": 1681 + }, + { + "epoch": 0.25, + "grad_norm": 1.1071475306279925, + "learning_rate": 1.9978682591052838e-06, + "loss": 0.6855, + "step": 1682 + }, + { + "epoch": 0.25, + "grad_norm": 3.5381640978465945, + "learning_rate": 1.9978619497647178e-06, + "loss": 0.6882, + "step": 1683 + }, + { + "epoch": 0.25, + "grad_norm": 8.457767049228258, + "learning_rate": 1.9978556311110177e-06, + "loss": 0.7064, + "step": 1684 + }, + { + "epoch": 0.25, + "grad_norm": 11.610392584250535, + "learning_rate": 1.997849303144243e-06, + "loss": 0.7103, + "step": 1685 + }, + { + "epoch": 0.25, + "grad_norm": 2.3494868122469272, + "learning_rate": 1.9978429658644522e-06, + "loss": 0.6888, + "step": 1686 + }, + { + "epoch": 0.25, + "grad_norm": 0.6646815806652946, + "learning_rate": 1.9978366192717054e-06, + "loss": 0.6921, + "step": 1687 + }, + { + "epoch": 0.25, + "grad_norm": 1.7410519245757201, + "learning_rate": 1.997830263366061e-06, + "loss": 0.6875, + "step": 1688 + }, + { + "epoch": 0.25, + "grad_norm": 1.4916402249873393, + "learning_rate": 1.997823898147578e-06, + "loss": 0.6947, + "step": 1689 + }, + { + "epoch": 0.25, + "grad_norm": 1.9585999806720356, + "learning_rate": 1.9978175236163167e-06, + "loss": 0.6862, + "step": 1690 + }, + { + "epoch": 0.25, + "grad_norm": 5.779386171738725, + "learning_rate": 1.997811139772336e-06, + "loss": 0.6914, + "step": 1691 + }, + { + "epoch": 0.25, + "grad_norm": 2.493897224284139, + "learning_rate": 1.997804746615696e-06, + "loss": 0.6868, + "step": 1692 + }, + { + "epoch": 0.25, + "grad_norm": 3.861236157848409, + "learning_rate": 1.997798344146456e-06, + "loss": 0.6895, + "step": 1693 + }, + { + "epoch": 0.25, + "grad_norm": 2.0461672836730442, + "learning_rate": 1.9977919323646758e-06, + "loss": 0.6849, + "step": 1694 + }, + { + "epoch": 0.25, + "grad_norm": 1.2518969128109747, + "learning_rate": 1.997785511270415e-06, + "loss": 0.6934, + "step": 1695 + }, + { + "epoch": 0.25, + "grad_norm": 9.406059025627673, + "learning_rate": 1.9977790808637344e-06, + "loss": 0.6914, + "step": 1696 + }, + { + "epoch": 0.25, + "grad_norm": 5.3466288991572934, + "learning_rate": 1.9977726411446926e-06, + "loss": 0.6888, + "step": 1697 + }, + { + "epoch": 0.25, + "grad_norm": 7.972945216073848, + "learning_rate": 1.9977661921133507e-06, + "loss": 0.696, + "step": 1698 + }, + { + "epoch": 0.25, + "grad_norm": 3.266333369310395, + "learning_rate": 1.9977597337697692e-06, + "loss": 0.6999, + "step": 1699 + }, + { + "epoch": 0.25, + "grad_norm": 0.7954096992169426, + "learning_rate": 1.9977532661140075e-06, + "loss": 0.6966, + "step": 1700 + }, + { + "epoch": 0.25, + "grad_norm": 0.6706086817385994, + "learning_rate": 1.9977467891461265e-06, + "loss": 0.681, + "step": 1701 + }, + { + "epoch": 0.25, + "grad_norm": 5.493955110601981, + "learning_rate": 1.9977403028661863e-06, + "loss": 0.6784, + "step": 1702 + }, + { + "epoch": 0.25, + "grad_norm": 3.046176234305001, + "learning_rate": 1.9977338072742475e-06, + "loss": 0.694, + "step": 1703 + }, + { + "epoch": 0.25, + "grad_norm": 7.2050430255441755, + "learning_rate": 1.997727302370371e-06, + "loss": 0.6973, + "step": 1704 + }, + { + "epoch": 0.25, + "grad_norm": 1.9165798741680815, + "learning_rate": 1.9977207881546174e-06, + "loss": 0.6862, + "step": 1705 + }, + { + "epoch": 0.25, + "grad_norm": 10.974545277278057, + "learning_rate": 1.9977142646270475e-06, + "loss": 0.6992, + "step": 1706 + }, + { + "epoch": 0.25, + "grad_norm": 7.4194016586406635, + "learning_rate": 1.997707731787722e-06, + "loss": 0.6953, + "step": 1707 + }, + { + "epoch": 0.25, + "grad_norm": 10.1653137033887, + "learning_rate": 1.997701189636702e-06, + "loss": 0.6908, + "step": 1708 + }, + { + "epoch": 0.25, + "grad_norm": 0.5646101730088771, + "learning_rate": 1.9976946381740485e-06, + "loss": 0.6888, + "step": 1709 + }, + { + "epoch": 0.26, + "grad_norm": 4.0750690939107095, + "learning_rate": 1.997688077399823e-06, + "loss": 0.679, + "step": 1710 + }, + { + "epoch": 0.26, + "grad_norm": 4.224457441071677, + "learning_rate": 1.997681507314086e-06, + "loss": 0.6914, + "step": 1711 + }, + { + "epoch": 0.26, + "grad_norm": 1.372484454938346, + "learning_rate": 1.9976749279168998e-06, + "loss": 0.6934, + "step": 1712 + }, + { + "epoch": 0.26, + "grad_norm": 0.7148499435186948, + "learning_rate": 1.9976683392083248e-06, + "loss": 0.6882, + "step": 1713 + }, + { + "epoch": 0.26, + "grad_norm": 3.750872240297091, + "learning_rate": 1.9976617411884233e-06, + "loss": 0.6953, + "step": 1714 + }, + { + "epoch": 0.26, + "grad_norm": 7.497533753151583, + "learning_rate": 1.9976551338572564e-06, + "loss": 0.6953, + "step": 1715 + }, + { + "epoch": 0.26, + "grad_norm": 2.119147856669996, + "learning_rate": 1.9976485172148858e-06, + "loss": 0.6875, + "step": 1716 + }, + { + "epoch": 0.26, + "grad_norm": 13.394086720284962, + "learning_rate": 1.9976418912613734e-06, + "loss": 0.7142, + "step": 1717 + }, + { + "epoch": 0.26, + "grad_norm": 2.6226873700375197, + "learning_rate": 1.9976352559967814e-06, + "loss": 0.6706, + "step": 1718 + }, + { + "epoch": 0.26, + "grad_norm": 6.382407614416505, + "learning_rate": 1.9976286114211705e-06, + "loss": 0.6992, + "step": 1719 + }, + { + "epoch": 0.26, + "grad_norm": 7.35107874584466, + "learning_rate": 1.997621957534604e-06, + "loss": 0.6758, + "step": 1720 + }, + { + "epoch": 0.26, + "grad_norm": 3.4035281063762857, + "learning_rate": 1.997615294337144e-06, + "loss": 0.6934, + "step": 1721 + }, + { + "epoch": 0.26, + "grad_norm": 5.296239625427763, + "learning_rate": 1.9976086218288518e-06, + "loss": 0.6888, + "step": 1722 + }, + { + "epoch": 0.26, + "grad_norm": 1.3308804432730155, + "learning_rate": 1.99760194000979e-06, + "loss": 0.6895, + "step": 1723 + }, + { + "epoch": 0.26, + "grad_norm": 0.9095148328279062, + "learning_rate": 1.9975952488800213e-06, + "loss": 0.6849, + "step": 1724 + }, + { + "epoch": 0.26, + "grad_norm": 7.784512590657828, + "learning_rate": 1.9975885484396077e-06, + "loss": 0.7064, + "step": 1725 + }, + { + "epoch": 0.26, + "grad_norm": 3.209262368316938, + "learning_rate": 1.997581838688612e-06, + "loss": 0.6992, + "step": 1726 + }, + { + "epoch": 0.26, + "grad_norm": 2.082951126823297, + "learning_rate": 1.997575119627097e-06, + "loss": 0.6849, + "step": 1727 + }, + { + "epoch": 0.26, + "grad_norm": 5.052375056917112, + "learning_rate": 1.997568391255125e-06, + "loss": 0.679, + "step": 1728 + }, + { + "epoch": 0.26, + "grad_norm": 5.913467539833512, + "learning_rate": 1.997561653572759e-06, + "loss": 0.6999, + "step": 1729 + }, + { + "epoch": 0.26, + "grad_norm": 2.938791661826796, + "learning_rate": 1.997554906580062e-06, + "loss": 0.6875, + "step": 1730 + }, + { + "epoch": 0.26, + "grad_norm": 1.5866727833732528, + "learning_rate": 1.9975481502770966e-06, + "loss": 0.7005, + "step": 1731 + }, + { + "epoch": 0.26, + "grad_norm": 2.8270200316659735, + "learning_rate": 1.9975413846639266e-06, + "loss": 0.6895, + "step": 1732 + }, + { + "epoch": 0.26, + "grad_norm": 3.975061000904369, + "learning_rate": 1.997534609740614e-06, + "loss": 0.6882, + "step": 1733 + }, + { + "epoch": 0.26, + "grad_norm": 1.0964345335905896, + "learning_rate": 1.9975278255072228e-06, + "loss": 0.6895, + "step": 1734 + }, + { + "epoch": 0.26, + "grad_norm": 4.3693843343620316, + "learning_rate": 1.9975210319638164e-06, + "loss": 0.6966, + "step": 1735 + }, + { + "epoch": 0.26, + "grad_norm": 1.8531611437742384, + "learning_rate": 1.997514229110458e-06, + "loss": 0.6777, + "step": 1736 + }, + { + "epoch": 0.26, + "grad_norm": 6.416044916361979, + "learning_rate": 1.997507416947211e-06, + "loss": 0.696, + "step": 1737 + }, + { + "epoch": 0.26, + "grad_norm": 0.9916692737932127, + "learning_rate": 1.997500595474139e-06, + "loss": 0.6888, + "step": 1738 + }, + { + "epoch": 0.26, + "grad_norm": 4.212238054276413, + "learning_rate": 1.9974937646913057e-06, + "loss": 0.6836, + "step": 1739 + }, + { + "epoch": 0.26, + "grad_norm": 1.3163543866766587, + "learning_rate": 1.997486924598775e-06, + "loss": 0.6992, + "step": 1740 + }, + { + "epoch": 0.26, + "grad_norm": 9.337699268193681, + "learning_rate": 1.9974800751966105e-06, + "loss": 0.7005, + "step": 1741 + }, + { + "epoch": 0.26, + "grad_norm": 10.50079428754215, + "learning_rate": 1.9974732164848766e-06, + "loss": 0.6836, + "step": 1742 + }, + { + "epoch": 0.26, + "grad_norm": 2.7040055422363456, + "learning_rate": 1.9974663484636364e-06, + "loss": 0.6992, + "step": 1743 + }, + { + "epoch": 0.26, + "grad_norm": 1.8170346790101009, + "learning_rate": 1.997459471132955e-06, + "loss": 0.6855, + "step": 1744 + }, + { + "epoch": 0.26, + "grad_norm": 10.234104219270103, + "learning_rate": 1.997452584492896e-06, + "loss": 0.7044, + "step": 1745 + }, + { + "epoch": 0.26, + "grad_norm": 6.265170923681604, + "learning_rate": 1.997445688543524e-06, + "loss": 0.6966, + "step": 1746 + }, + { + "epoch": 0.26, + "grad_norm": 1.7244007768476572, + "learning_rate": 1.997438783284903e-06, + "loss": 0.6784, + "step": 1747 + }, + { + "epoch": 0.26, + "grad_norm": 1.5784947214961316, + "learning_rate": 1.9974318687170974e-06, + "loss": 0.6986, + "step": 1748 + }, + { + "epoch": 0.26, + "grad_norm": 1.9521640937677553, + "learning_rate": 1.997424944840172e-06, + "loss": 0.6927, + "step": 1749 + }, + { + "epoch": 0.26, + "grad_norm": 4.165430748975872, + "learning_rate": 1.997418011654192e-06, + "loss": 0.6823, + "step": 1750 + }, + { + "epoch": 0.26, + "grad_norm": 0.6363373063349678, + "learning_rate": 1.9974110691592205e-06, + "loss": 0.6953, + "step": 1751 + }, + { + "epoch": 0.26, + "grad_norm": 1.6392700780198126, + "learning_rate": 1.997404117355324e-06, + "loss": 0.6829, + "step": 1752 + }, + { + "epoch": 0.26, + "grad_norm": 3.5846631347852327, + "learning_rate": 1.9973971562425665e-06, + "loss": 0.6947, + "step": 1753 + }, + { + "epoch": 0.26, + "grad_norm": 7.686545962127944, + "learning_rate": 1.997390185821013e-06, + "loss": 0.6986, + "step": 1754 + }, + { + "epoch": 0.26, + "grad_norm": 7.185680148764041, + "learning_rate": 1.997383206090729e-06, + "loss": 0.6849, + "step": 1755 + }, + { + "epoch": 0.26, + "grad_norm": 1.3935474859542625, + "learning_rate": 1.9973762170517793e-06, + "loss": 0.6875, + "step": 1756 + }, + { + "epoch": 0.26, + "grad_norm": 4.657229887511287, + "learning_rate": 1.9973692187042288e-06, + "loss": 0.6862, + "step": 1757 + }, + { + "epoch": 0.26, + "grad_norm": 3.4852855762145727, + "learning_rate": 1.9973622110481436e-06, + "loss": 0.6973, + "step": 1758 + }, + { + "epoch": 0.26, + "grad_norm": 4.033273091478141, + "learning_rate": 1.9973551940835886e-06, + "loss": 0.6882, + "step": 1759 + }, + { + "epoch": 0.26, + "grad_norm": 0.6438373277100433, + "learning_rate": 1.997348167810629e-06, + "loss": 0.694, + "step": 1760 + }, + { + "epoch": 0.26, + "grad_norm": 0.6871113432650409, + "learning_rate": 1.997341132229331e-06, + "loss": 0.6901, + "step": 1761 + }, + { + "epoch": 0.26, + "grad_norm": 1.096555649728867, + "learning_rate": 1.99733408733976e-06, + "loss": 0.6921, + "step": 1762 + }, + { + "epoch": 0.26, + "grad_norm": 0.6169829799847172, + "learning_rate": 1.997327033141982e-06, + "loss": 0.6888, + "step": 1763 + }, + { + "epoch": 0.26, + "grad_norm": 3.18899015148802, + "learning_rate": 1.9973199696360623e-06, + "loss": 0.6947, + "step": 1764 + }, + { + "epoch": 0.26, + "grad_norm": 6.972831518197226, + "learning_rate": 1.997312896822067e-06, + "loss": 0.6875, + "step": 1765 + }, + { + "epoch": 0.26, + "grad_norm": 11.342368122517657, + "learning_rate": 1.9973058147000626e-06, + "loss": 0.6973, + "step": 1766 + }, + { + "epoch": 0.26, + "grad_norm": 4.623865043498567, + "learning_rate": 1.9972987232701146e-06, + "loss": 0.6836, + "step": 1767 + }, + { + "epoch": 0.26, + "grad_norm": 1.787001941161843, + "learning_rate": 1.9972916225322897e-06, + "loss": 0.7031, + "step": 1768 + }, + { + "epoch": 0.26, + "grad_norm": 1.820517246483959, + "learning_rate": 1.9972845124866535e-06, + "loss": 0.6823, + "step": 1769 + }, + { + "epoch": 0.26, + "grad_norm": 1.8561942136063438, + "learning_rate": 1.997277393133273e-06, + "loss": 0.6855, + "step": 1770 + }, + { + "epoch": 0.26, + "grad_norm": 4.819000375726261, + "learning_rate": 1.9972702644722146e-06, + "loss": 0.6868, + "step": 1771 + }, + { + "epoch": 0.26, + "grad_norm": 3.631515252285526, + "learning_rate": 1.9972631265035443e-06, + "loss": 0.6921, + "step": 1772 + }, + { + "epoch": 0.26, + "grad_norm": 2.2068231264775475, + "learning_rate": 1.997255979227329e-06, + "loss": 0.6973, + "step": 1773 + }, + { + "epoch": 0.26, + "grad_norm": 0.7782017362437023, + "learning_rate": 1.9972488226436353e-06, + "loss": 0.6999, + "step": 1774 + }, + { + "epoch": 0.26, + "grad_norm": 1.6408663677826503, + "learning_rate": 1.9972416567525304e-06, + "loss": 0.6842, + "step": 1775 + }, + { + "epoch": 0.26, + "grad_norm": 6.099510967985226, + "learning_rate": 1.9972344815540813e-06, + "loss": 0.7005, + "step": 1776 + }, + { + "epoch": 0.27, + "grad_norm": 5.003273339459401, + "learning_rate": 1.997227297048354e-06, + "loss": 0.7012, + "step": 1777 + }, + { + "epoch": 0.27, + "grad_norm": 2.653673829512176, + "learning_rate": 1.997220103235416e-06, + "loss": 0.6855, + "step": 1778 + }, + { + "epoch": 0.27, + "grad_norm": 1.0412659656747654, + "learning_rate": 1.9972129001153353e-06, + "loss": 0.6992, + "step": 1779 + }, + { + "epoch": 0.27, + "grad_norm": 5.2557769981833795, + "learning_rate": 1.997205687688178e-06, + "loss": 0.6914, + "step": 1780 + }, + { + "epoch": 0.27, + "grad_norm": 3.9051840641123827, + "learning_rate": 1.997198465954012e-06, + "loss": 0.6908, + "step": 1781 + }, + { + "epoch": 0.27, + "grad_norm": 0.711262711170887, + "learning_rate": 1.9971912349129045e-06, + "loss": 0.6842, + "step": 1782 + }, + { + "epoch": 0.27, + "grad_norm": 2.6766579493085705, + "learning_rate": 1.997183994564923e-06, + "loss": 0.6836, + "step": 1783 + }, + { + "epoch": 0.27, + "grad_norm": 4.509212374292903, + "learning_rate": 1.997176744910135e-06, + "loss": 0.6855, + "step": 1784 + }, + { + "epoch": 0.27, + "grad_norm": 7.392145817719588, + "learning_rate": 1.997169485948608e-06, + "loss": 0.6797, + "step": 1785 + }, + { + "epoch": 0.27, + "grad_norm": 6.707356216736125, + "learning_rate": 1.9971622176804103e-06, + "loss": 0.6927, + "step": 1786 + }, + { + "epoch": 0.27, + "grad_norm": 11.640765224054396, + "learning_rate": 1.99715494010561e-06, + "loss": 0.7109, + "step": 1787 + }, + { + "epoch": 0.27, + "grad_norm": 7.570185564929984, + "learning_rate": 1.997147653224274e-06, + "loss": 0.6849, + "step": 1788 + }, + { + "epoch": 0.27, + "grad_norm": 4.802075798536925, + "learning_rate": 1.9971403570364704e-06, + "loss": 0.6908, + "step": 1789 + }, + { + "epoch": 0.27, + "grad_norm": 0.9709269649205211, + "learning_rate": 1.9971330515422676e-06, + "loss": 0.6836, + "step": 1790 + }, + { + "epoch": 0.27, + "grad_norm": 1.7189503477870511, + "learning_rate": 1.997125736741734e-06, + "loss": 0.6836, + "step": 1791 + }, + { + "epoch": 0.27, + "grad_norm": 1.1616459812006188, + "learning_rate": 1.9971184126349382e-06, + "loss": 0.6927, + "step": 1792 + }, + { + "epoch": 0.27, + "grad_norm": 8.152126271325931, + "learning_rate": 1.997111079221948e-06, + "loss": 0.694, + "step": 1793 + }, + { + "epoch": 0.27, + "grad_norm": 1.5314171793009081, + "learning_rate": 1.9971037365028317e-06, + "loss": 0.6921, + "step": 1794 + }, + { + "epoch": 0.27, + "grad_norm": 4.850546967332377, + "learning_rate": 1.997096384477658e-06, + "loss": 0.6888, + "step": 1795 + }, + { + "epoch": 0.27, + "grad_norm": 11.376039645538997, + "learning_rate": 1.9970890231464956e-06, + "loss": 0.7083, + "step": 1796 + }, + { + "epoch": 0.27, + "grad_norm": 1.0947510154489883, + "learning_rate": 1.997081652509413e-06, + "loss": 0.6901, + "step": 1797 + }, + { + "epoch": 0.27, + "grad_norm": 5.277309123701361, + "learning_rate": 1.9970742725664793e-06, + "loss": 0.694, + "step": 1798 + }, + { + "epoch": 0.27, + "grad_norm": 3.33957817177092, + "learning_rate": 1.9970668833177633e-06, + "loss": 0.6927, + "step": 1799 + }, + { + "epoch": 0.27, + "grad_norm": 5.000696742253296, + "learning_rate": 1.997059484763334e-06, + "loss": 0.6842, + "step": 1800 + }, + { + "epoch": 0.27, + "grad_norm": 3.6897326725364983, + "learning_rate": 1.99705207690326e-06, + "loss": 0.6823, + "step": 1801 + }, + { + "epoch": 0.27, + "grad_norm": 0.7936201179844063, + "learning_rate": 1.997044659737611e-06, + "loss": 0.6882, + "step": 1802 + }, + { + "epoch": 0.27, + "grad_norm": 4.617304358381109, + "learning_rate": 1.997037233266456e-06, + "loss": 0.681, + "step": 1803 + }, + { + "epoch": 0.27, + "grad_norm": 0.9259132358432596, + "learning_rate": 1.9970297974898642e-06, + "loss": 0.6973, + "step": 1804 + }, + { + "epoch": 0.27, + "grad_norm": 8.92925814616004, + "learning_rate": 1.9970223524079055e-06, + "loss": 0.6992, + "step": 1805 + }, + { + "epoch": 0.27, + "grad_norm": 1.5226413241308028, + "learning_rate": 1.997014898020649e-06, + "loss": 0.6862, + "step": 1806 + }, + { + "epoch": 0.27, + "grad_norm": 4.057630119648912, + "learning_rate": 1.997007434328164e-06, + "loss": 0.6947, + "step": 1807 + }, + { + "epoch": 0.27, + "grad_norm": 5.308825868620024, + "learning_rate": 1.9969999613305206e-06, + "loss": 0.6986, + "step": 1808 + }, + { + "epoch": 0.27, + "grad_norm": 2.5936845615183888, + "learning_rate": 1.9969924790277882e-06, + "loss": 0.6784, + "step": 1809 + }, + { + "epoch": 0.27, + "grad_norm": 7.21511251049453, + "learning_rate": 1.9969849874200372e-06, + "loss": 0.6868, + "step": 1810 + }, + { + "epoch": 0.27, + "grad_norm": 5.326643652977999, + "learning_rate": 1.9969774865073367e-06, + "loss": 0.6901, + "step": 1811 + }, + { + "epoch": 0.27, + "grad_norm": 5.20600371214184, + "learning_rate": 1.9969699762897573e-06, + "loss": 0.7031, + "step": 1812 + }, + { + "epoch": 0.27, + "grad_norm": 3.0854367342669202, + "learning_rate": 1.996962456767369e-06, + "loss": 0.6784, + "step": 1813 + }, + { + "epoch": 0.27, + "grad_norm": 7.60441118441897, + "learning_rate": 1.996954927940242e-06, + "loss": 0.7103, + "step": 1814 + }, + { + "epoch": 0.27, + "grad_norm": 11.414465274849144, + "learning_rate": 1.9969473898084465e-06, + "loss": 0.7116, + "step": 1815 + }, + { + "epoch": 0.27, + "grad_norm": 7.16072729785614, + "learning_rate": 1.9969398423720527e-06, + "loss": 0.7051, + "step": 1816 + }, + { + "epoch": 0.27, + "grad_norm": 6.855196631204361, + "learning_rate": 1.996932285631131e-06, + "loss": 0.6966, + "step": 1817 + }, + { + "epoch": 0.27, + "grad_norm": 6.077452268921966, + "learning_rate": 1.9969247195857524e-06, + "loss": 0.694, + "step": 1818 + }, + { + "epoch": 0.27, + "grad_norm": 3.0139134704569472, + "learning_rate": 1.996917144235987e-06, + "loss": 0.6764, + "step": 1819 + }, + { + "epoch": 0.27, + "grad_norm": 2.1784726360547384, + "learning_rate": 1.996909559581906e-06, + "loss": 0.6882, + "step": 1820 + }, + { + "epoch": 0.27, + "grad_norm": 3.240215594954575, + "learning_rate": 1.99690196562358e-06, + "loss": 0.6934, + "step": 1821 + }, + { + "epoch": 0.27, + "grad_norm": 10.394766724826576, + "learning_rate": 1.9968943623610795e-06, + "loss": 0.681, + "step": 1822 + }, + { + "epoch": 0.27, + "grad_norm": 0.9651335084166299, + "learning_rate": 1.996886749794476e-06, + "loss": 0.6823, + "step": 1823 + }, + { + "epoch": 0.27, + "grad_norm": 1.1569368874526456, + "learning_rate": 1.99687912792384e-06, + "loss": 0.6868, + "step": 1824 + }, + { + "epoch": 0.27, + "grad_norm": 0.5905814838115874, + "learning_rate": 1.9968714967492432e-06, + "loss": 0.6908, + "step": 1825 + }, + { + "epoch": 0.27, + "grad_norm": 4.285263884842644, + "learning_rate": 1.9968638562707563e-06, + "loss": 0.6829, + "step": 1826 + }, + { + "epoch": 0.27, + "grad_norm": 3.631035020141265, + "learning_rate": 1.996856206488451e-06, + "loss": 0.6836, + "step": 1827 + }, + { + "epoch": 0.27, + "grad_norm": 13.858558247776294, + "learning_rate": 1.996848547402399e-06, + "loss": 0.7253, + "step": 1828 + }, + { + "epoch": 0.27, + "grad_norm": 13.13746622115907, + "learning_rate": 1.9968408790126706e-06, + "loss": 0.7155, + "step": 1829 + }, + { + "epoch": 0.27, + "grad_norm": 13.976536693651665, + "learning_rate": 1.9968332013193383e-06, + "loss": 0.7148, + "step": 1830 + }, + { + "epoch": 0.27, + "grad_norm": 8.758264684544402, + "learning_rate": 1.996825514322474e-06, + "loss": 0.6862, + "step": 1831 + }, + { + "epoch": 0.27, + "grad_norm": 4.062785539905678, + "learning_rate": 1.9968178180221488e-06, + "loss": 0.6966, + "step": 1832 + }, + { + "epoch": 0.27, + "grad_norm": 3.4295458426529652, + "learning_rate": 1.996810112418435e-06, + "loss": 0.6908, + "step": 1833 + }, + { + "epoch": 0.27, + "grad_norm": 5.286709703183017, + "learning_rate": 1.996802397511404e-06, + "loss": 0.6849, + "step": 1834 + }, + { + "epoch": 0.27, + "grad_norm": 6.729585664792097, + "learning_rate": 1.996794673301128e-06, + "loss": 0.7064, + "step": 1835 + }, + { + "epoch": 0.27, + "grad_norm": 8.74636538924312, + "learning_rate": 1.996786939787679e-06, + "loss": 0.7005, + "step": 1836 + }, + { + "epoch": 0.27, + "grad_norm": 10.120442166241165, + "learning_rate": 1.9967791969711296e-06, + "loss": 0.7142, + "step": 1837 + }, + { + "epoch": 0.27, + "grad_norm": 10.23511997923501, + "learning_rate": 1.9967714448515517e-06, + "loss": 0.7122, + "step": 1838 + }, + { + "epoch": 0.27, + "grad_norm": 5.815848118828664, + "learning_rate": 1.9967636834290177e-06, + "loss": 0.6868, + "step": 1839 + }, + { + "epoch": 0.27, + "grad_norm": 15.886435585425918, + "learning_rate": 1.9967559127036005e-06, + "loss": 0.7298, + "step": 1840 + }, + { + "epoch": 0.27, + "grad_norm": 5.102158989199764, + "learning_rate": 1.996748132675372e-06, + "loss": 0.6947, + "step": 1841 + }, + { + "epoch": 0.27, + "grad_norm": 0.7854044855762727, + "learning_rate": 1.9967403433444047e-06, + "loss": 0.6777, + "step": 1842 + }, + { + "epoch": 0.27, + "grad_norm": 5.005878827771043, + "learning_rate": 1.9967325447107722e-06, + "loss": 0.6823, + "step": 1843 + }, + { + "epoch": 0.28, + "grad_norm": 2.147309681410732, + "learning_rate": 1.9967247367745462e-06, + "loss": 0.6882, + "step": 1844 + }, + { + "epoch": 0.28, + "grad_norm": 0.8845538128866177, + "learning_rate": 1.9967169195358004e-06, + "loss": 0.6862, + "step": 1845 + }, + { + "epoch": 0.28, + "grad_norm": 2.7872471926071163, + "learning_rate": 1.9967090929946075e-06, + "loss": 0.6849, + "step": 1846 + }, + { + "epoch": 0.28, + "grad_norm": 0.6011196270162839, + "learning_rate": 1.99670125715104e-06, + "loss": 0.6862, + "step": 1847 + }, + { + "epoch": 0.28, + "grad_norm": 2.13816836093163, + "learning_rate": 1.996693412005172e-06, + "loss": 0.6868, + "step": 1848 + }, + { + "epoch": 0.28, + "grad_norm": 5.477994693742774, + "learning_rate": 1.996685557557076e-06, + "loss": 0.6999, + "step": 1849 + }, + { + "epoch": 0.28, + "grad_norm": 6.019945867455358, + "learning_rate": 1.9966776938068257e-06, + "loss": 0.6816, + "step": 1850 + }, + { + "epoch": 0.28, + "grad_norm": 1.1869799744972072, + "learning_rate": 1.9966698207544943e-06, + "loss": 0.696, + "step": 1851 + }, + { + "epoch": 0.28, + "grad_norm": 3.4433053678201366, + "learning_rate": 1.9966619384001554e-06, + "loss": 0.6855, + "step": 1852 + }, + { + "epoch": 0.28, + "grad_norm": 7.564802204959158, + "learning_rate": 1.9966540467438824e-06, + "loss": 0.6953, + "step": 1853 + }, + { + "epoch": 0.28, + "grad_norm": 1.688522866277068, + "learning_rate": 1.996646145785749e-06, + "loss": 0.6771, + "step": 1854 + }, + { + "epoch": 0.28, + "grad_norm": 9.32650200054695, + "learning_rate": 1.9966382355258287e-06, + "loss": 0.7018, + "step": 1855 + }, + { + "epoch": 0.28, + "grad_norm": 11.927511319762765, + "learning_rate": 1.9966303159641963e-06, + "loss": 0.7064, + "step": 1856 + }, + { + "epoch": 0.28, + "grad_norm": 2.6620599294334943, + "learning_rate": 1.9966223871009245e-06, + "loss": 0.6914, + "step": 1857 + }, + { + "epoch": 0.28, + "grad_norm": 5.118484071102985, + "learning_rate": 1.9966144489360883e-06, + "loss": 0.7018, + "step": 1858 + }, + { + "epoch": 0.28, + "grad_norm": 3.401468656066619, + "learning_rate": 1.9966065014697605e-06, + "loss": 0.6855, + "step": 1859 + }, + { + "epoch": 0.28, + "grad_norm": 4.751733145682448, + "learning_rate": 1.996598544702017e-06, + "loss": 0.7038, + "step": 1860 + }, + { + "epoch": 0.28, + "grad_norm": 5.668417159615413, + "learning_rate": 1.9965905786329305e-06, + "loss": 0.6979, + "step": 1861 + }, + { + "epoch": 0.28, + "grad_norm": 2.908337431090505, + "learning_rate": 1.996582603262576e-06, + "loss": 0.6908, + "step": 1862 + }, + { + "epoch": 0.28, + "grad_norm": 4.9360482119529925, + "learning_rate": 1.9965746185910284e-06, + "loss": 0.6868, + "step": 1863 + }, + { + "epoch": 0.28, + "grad_norm": 3.1449463908543964, + "learning_rate": 1.9965666246183613e-06, + "loss": 0.6888, + "step": 1864 + }, + { + "epoch": 0.28, + "grad_norm": 4.369760831957967, + "learning_rate": 1.9965586213446503e-06, + "loss": 0.6888, + "step": 1865 + }, + { + "epoch": 0.28, + "grad_norm": 2.5196420983631027, + "learning_rate": 1.996550608769969e-06, + "loss": 0.6966, + "step": 1866 + }, + { + "epoch": 0.28, + "grad_norm": 2.834207002087839, + "learning_rate": 1.996542586894393e-06, + "loss": 0.6895, + "step": 1867 + }, + { + "epoch": 0.28, + "grad_norm": 1.470202003913432, + "learning_rate": 1.9965345557179967e-06, + "loss": 0.6999, + "step": 1868 + }, + { + "epoch": 0.28, + "grad_norm": 7.086118735865163, + "learning_rate": 1.9965265152408555e-06, + "loss": 0.7018, + "step": 1869 + }, + { + "epoch": 0.28, + "grad_norm": 7.9676629199414934, + "learning_rate": 1.9965184654630444e-06, + "loss": 0.6921, + "step": 1870 + }, + { + "epoch": 0.28, + "grad_norm": 0.9474743772134738, + "learning_rate": 1.996510406384638e-06, + "loss": 0.6914, + "step": 1871 + }, + { + "epoch": 0.28, + "grad_norm": 5.126440495953518, + "learning_rate": 1.996502338005712e-06, + "loss": 0.681, + "step": 1872 + }, + { + "epoch": 0.28, + "grad_norm": 1.0433778154676114, + "learning_rate": 1.9964942603263417e-06, + "loss": 0.6966, + "step": 1873 + }, + { + "epoch": 0.28, + "grad_norm": 3.699484871147893, + "learning_rate": 1.996486173346602e-06, + "loss": 0.6914, + "step": 1874 + }, + { + "epoch": 0.28, + "grad_norm": 5.384461529025496, + "learning_rate": 1.9964780770665686e-06, + "loss": 0.6979, + "step": 1875 + }, + { + "epoch": 0.28, + "grad_norm": 1.2764021990052121, + "learning_rate": 1.9964699714863175e-06, + "loss": 0.7025, + "step": 1876 + }, + { + "epoch": 0.28, + "grad_norm": 1.7493988205425888, + "learning_rate": 1.9964618566059237e-06, + "loss": 0.6901, + "step": 1877 + }, + { + "epoch": 0.28, + "grad_norm": 6.615755667870812, + "learning_rate": 1.996453732425464e-06, + "loss": 0.6888, + "step": 1878 + }, + { + "epoch": 0.28, + "grad_norm": 1.3199750627847446, + "learning_rate": 1.9964455989450127e-06, + "loss": 0.6862, + "step": 1879 + }, + { + "epoch": 0.28, + "grad_norm": 2.2389520369249323, + "learning_rate": 1.996437456164647e-06, + "loss": 0.6986, + "step": 1880 + }, + { + "epoch": 0.28, + "grad_norm": 2.122331511452429, + "learning_rate": 1.996429304084442e-06, + "loss": 0.6862, + "step": 1881 + }, + { + "epoch": 0.28, + "grad_norm": 11.143603309074773, + "learning_rate": 1.9964211427044746e-06, + "loss": 0.7077, + "step": 1882 + }, + { + "epoch": 0.28, + "grad_norm": 1.930489202351744, + "learning_rate": 1.9964129720248203e-06, + "loss": 0.6927, + "step": 1883 + }, + { + "epoch": 0.28, + "grad_norm": 1.6402510000867419, + "learning_rate": 1.9964047920455556e-06, + "loss": 0.6868, + "step": 1884 + }, + { + "epoch": 0.28, + "grad_norm": 3.5645001749695555, + "learning_rate": 1.9963966027667573e-06, + "loss": 0.6882, + "step": 1885 + }, + { + "epoch": 0.28, + "grad_norm": 2.693478430769394, + "learning_rate": 1.9963884041885008e-06, + "loss": 0.6901, + "step": 1886 + }, + { + "epoch": 0.28, + "grad_norm": 7.19566290009337, + "learning_rate": 1.9963801963108634e-06, + "loss": 0.6973, + "step": 1887 + }, + { + "epoch": 0.28, + "grad_norm": 0.6351791331936364, + "learning_rate": 1.9963719791339216e-06, + "loss": 0.6829, + "step": 1888 + }, + { + "epoch": 0.28, + "grad_norm": 6.333981404910573, + "learning_rate": 1.996363752657752e-06, + "loss": 0.6849, + "step": 1889 + }, + { + "epoch": 0.28, + "grad_norm": 2.61981848025029, + "learning_rate": 1.9963555168824314e-06, + "loss": 0.6973, + "step": 1890 + }, + { + "epoch": 0.28, + "grad_norm": 11.460621659705765, + "learning_rate": 1.9963472718080367e-06, + "loss": 0.6979, + "step": 1891 + }, + { + "epoch": 0.28, + "grad_norm": 2.685836276849453, + "learning_rate": 1.996339017434644e-06, + "loss": 0.6868, + "step": 1892 + }, + { + "epoch": 0.28, + "grad_norm": 2.239662494035462, + "learning_rate": 1.996330753762332e-06, + "loss": 0.6908, + "step": 1893 + }, + { + "epoch": 0.28, + "grad_norm": 6.603496469697187, + "learning_rate": 1.996322480791177e-06, + "loss": 0.6855, + "step": 1894 + }, + { + "epoch": 0.28, + "grad_norm": 2.056147299474493, + "learning_rate": 1.996314198521256e-06, + "loss": 0.6986, + "step": 1895 + }, + { + "epoch": 0.28, + "grad_norm": 1.338204910773419, + "learning_rate": 1.9963059069526464e-06, + "loss": 0.6803, + "step": 1896 + }, + { + "epoch": 0.28, + "grad_norm": 2.626400949663979, + "learning_rate": 1.9962976060854257e-06, + "loss": 0.6921, + "step": 1897 + }, + { + "epoch": 0.28, + "grad_norm": 0.8845670711338506, + "learning_rate": 1.996289295919671e-06, + "loss": 0.6855, + "step": 1898 + }, + { + "epoch": 0.28, + "grad_norm": 1.84389074042448, + "learning_rate": 1.996280976455461e-06, + "loss": 0.6868, + "step": 1899 + }, + { + "epoch": 0.28, + "grad_norm": 5.828585062150965, + "learning_rate": 1.996272647692872e-06, + "loss": 0.6849, + "step": 1900 + }, + { + "epoch": 0.28, + "grad_norm": 3.9504482483595247, + "learning_rate": 1.996264309631982e-06, + "loss": 0.707, + "step": 1901 + }, + { + "epoch": 0.28, + "grad_norm": 12.670174495820472, + "learning_rate": 1.9962559622728695e-06, + "loss": 0.707, + "step": 1902 + }, + { + "epoch": 0.28, + "grad_norm": 12.086282584101712, + "learning_rate": 1.996247605615612e-06, + "loss": 0.707, + "step": 1903 + }, + { + "epoch": 0.28, + "grad_norm": 1.83719911486593, + "learning_rate": 1.9962392396602878e-06, + "loss": 0.6914, + "step": 1904 + }, + { + "epoch": 0.28, + "grad_norm": 2.0044266005143085, + "learning_rate": 1.996230864406974e-06, + "loss": 0.6895, + "step": 1905 + }, + { + "epoch": 0.28, + "grad_norm": 2.3680255517894664, + "learning_rate": 1.9962224798557497e-06, + "loss": 0.6849, + "step": 1906 + }, + { + "epoch": 0.28, + "grad_norm": 3.1563352017662876, + "learning_rate": 1.996214086006693e-06, + "loss": 0.7025, + "step": 1907 + }, + { + "epoch": 0.28, + "grad_norm": 1.5163612737206382, + "learning_rate": 1.996205682859882e-06, + "loss": 0.6875, + "step": 1908 + }, + { + "epoch": 0.28, + "grad_norm": 3.6280165036848278, + "learning_rate": 1.9961972704153952e-06, + "loss": 0.6927, + "step": 1909 + }, + { + "epoch": 0.28, + "grad_norm": 5.261634276743366, + "learning_rate": 1.9961888486733114e-06, + "loss": 0.707, + "step": 1910 + }, + { + "epoch": 0.29, + "grad_norm": 5.353699290908804, + "learning_rate": 1.996180417633709e-06, + "loss": 0.6953, + "step": 1911 + }, + { + "epoch": 0.29, + "grad_norm": 8.891270007416212, + "learning_rate": 1.9961719772966664e-06, + "loss": 0.6973, + "step": 1912 + }, + { + "epoch": 0.29, + "grad_norm": 3.8728025191878053, + "learning_rate": 1.996163527662263e-06, + "loss": 0.6842, + "step": 1913 + }, + { + "epoch": 0.29, + "grad_norm": 2.5075655048400134, + "learning_rate": 1.996155068730577e-06, + "loss": 0.6921, + "step": 1914 + }, + { + "epoch": 0.29, + "grad_norm": 3.86181834334752, + "learning_rate": 1.9961466005016878e-06, + "loss": 0.6829, + "step": 1915 + }, + { + "epoch": 0.29, + "grad_norm": 2.1339522825491555, + "learning_rate": 1.9961381229756742e-06, + "loss": 0.6882, + "step": 1916 + }, + { + "epoch": 0.29, + "grad_norm": 3.620767716545684, + "learning_rate": 1.996129636152615e-06, + "loss": 0.6849, + "step": 1917 + }, + { + "epoch": 0.29, + "grad_norm": 3.3745328946192923, + "learning_rate": 1.9961211400325907e-06, + "loss": 0.6842, + "step": 1918 + }, + { + "epoch": 0.29, + "grad_norm": 5.492692221195482, + "learning_rate": 1.996112634615679e-06, + "loss": 0.6849, + "step": 1919 + }, + { + "epoch": 0.29, + "grad_norm": 5.784627357594357, + "learning_rate": 1.9961041199019605e-06, + "loss": 0.6966, + "step": 1920 + }, + { + "epoch": 0.29, + "grad_norm": 1.7349687338935216, + "learning_rate": 1.9960955958915137e-06, + "loss": 0.6888, + "step": 1921 + }, + { + "epoch": 0.29, + "grad_norm": 5.160622704525065, + "learning_rate": 1.996087062584419e-06, + "loss": 0.6914, + "step": 1922 + }, + { + "epoch": 0.29, + "grad_norm": 3.276249184779373, + "learning_rate": 1.9960785199807552e-06, + "loss": 0.6803, + "step": 1923 + }, + { + "epoch": 0.29, + "grad_norm": 4.15907767817942, + "learning_rate": 1.996069968080603e-06, + "loss": 0.6797, + "step": 1924 + }, + { + "epoch": 0.29, + "grad_norm": 6.453320320756302, + "learning_rate": 1.9960614068840413e-06, + "loss": 0.7005, + "step": 1925 + }, + { + "epoch": 0.29, + "grad_norm": 1.5908392047449198, + "learning_rate": 1.9960528363911508e-06, + "loss": 0.6751, + "step": 1926 + }, + { + "epoch": 0.29, + "grad_norm": 7.4526139530227375, + "learning_rate": 1.9960442566020105e-06, + "loss": 0.6947, + "step": 1927 + }, + { + "epoch": 0.29, + "grad_norm": 0.7460881879518189, + "learning_rate": 1.9960356675167015e-06, + "loss": 0.6895, + "step": 1928 + }, + { + "epoch": 0.29, + "grad_norm": 5.096238702013258, + "learning_rate": 1.9960270691353032e-06, + "loss": 0.694, + "step": 1929 + }, + { + "epoch": 0.29, + "grad_norm": 1.5749894085344498, + "learning_rate": 1.9960184614578967e-06, + "loss": 0.6927, + "step": 1930 + }, + { + "epoch": 0.29, + "grad_norm": 3.9218377573638525, + "learning_rate": 1.9960098444845615e-06, + "loss": 0.6888, + "step": 1931 + }, + { + "epoch": 0.29, + "grad_norm": 2.7019652516838155, + "learning_rate": 1.9960012182153785e-06, + "loss": 0.7044, + "step": 1932 + }, + { + "epoch": 0.29, + "grad_norm": 0.7254194187159759, + "learning_rate": 1.9959925826504277e-06, + "loss": 0.6849, + "step": 1933 + }, + { + "epoch": 0.29, + "grad_norm": 4.856728572214746, + "learning_rate": 1.99598393778979e-06, + "loss": 0.6849, + "step": 1934 + }, + { + "epoch": 0.29, + "grad_norm": 8.1163379436324, + "learning_rate": 1.9959752836335466e-06, + "loss": 0.6836, + "step": 1935 + }, + { + "epoch": 0.29, + "grad_norm": 2.708432906291821, + "learning_rate": 1.9959666201817776e-06, + "loss": 0.6895, + "step": 1936 + }, + { + "epoch": 0.29, + "grad_norm": 0.7710740266396161, + "learning_rate": 1.995957947434564e-06, + "loss": 0.6966, + "step": 1937 + }, + { + "epoch": 0.29, + "grad_norm": 2.1705774608443935, + "learning_rate": 1.995949265391987e-06, + "loss": 0.6771, + "step": 1938 + }, + { + "epoch": 0.29, + "grad_norm": 3.3172252601989016, + "learning_rate": 1.9959405740541273e-06, + "loss": 0.6647, + "step": 1939 + }, + { + "epoch": 0.29, + "grad_norm": 0.8530515447323538, + "learning_rate": 1.995931873421066e-06, + "loss": 0.679, + "step": 1940 + }, + { + "epoch": 0.29, + "grad_norm": 1.295527182394645, + "learning_rate": 1.9959231634928847e-06, + "loss": 0.6823, + "step": 1941 + }, + { + "epoch": 0.29, + "grad_norm": 3.973357170008156, + "learning_rate": 1.995914444269664e-06, + "loss": 0.7044, + "step": 1942 + }, + { + "epoch": 0.29, + "grad_norm": 7.261028914164652, + "learning_rate": 1.9959057157514863e-06, + "loss": 0.6862, + "step": 1943 + }, + { + "epoch": 0.29, + "grad_norm": 2.96327467302479, + "learning_rate": 1.9958969779384322e-06, + "loss": 0.679, + "step": 1944 + }, + { + "epoch": 0.29, + "grad_norm": 5.4031760656654, + "learning_rate": 1.9958882308305832e-06, + "loss": 0.6973, + "step": 1945 + }, + { + "epoch": 0.29, + "grad_norm": 1.3500851769793778, + "learning_rate": 1.995879474428022e-06, + "loss": 0.6855, + "step": 1946 + }, + { + "epoch": 0.29, + "grad_norm": 1.234159359955198, + "learning_rate": 1.995870708730829e-06, + "loss": 0.6901, + "step": 1947 + }, + { + "epoch": 0.29, + "grad_norm": 1.2323204904741218, + "learning_rate": 1.9958619337390868e-06, + "loss": 0.6882, + "step": 1948 + }, + { + "epoch": 0.29, + "grad_norm": 8.809498119086575, + "learning_rate": 1.995853149452877e-06, + "loss": 0.6999, + "step": 1949 + }, + { + "epoch": 0.29, + "grad_norm": 3.557884500079079, + "learning_rate": 1.995844355872282e-06, + "loss": 0.724, + "step": 1950 + }, + { + "epoch": 0.29, + "grad_norm": 0.7443323879835331, + "learning_rate": 1.9958355529973832e-06, + "loss": 0.6862, + "step": 1951 + }, + { + "epoch": 0.29, + "grad_norm": 1.6141509585968927, + "learning_rate": 1.9958267408282636e-06, + "loss": 0.6842, + "step": 1952 + }, + { + "epoch": 0.29, + "grad_norm": 8.030571982065727, + "learning_rate": 1.9958179193650046e-06, + "loss": 0.6771, + "step": 1953 + }, + { + "epoch": 0.29, + "grad_norm": 7.905846908170793, + "learning_rate": 1.9958090886076885e-06, + "loss": 0.6895, + "step": 1954 + }, + { + "epoch": 0.29, + "grad_norm": 1.0654355593027525, + "learning_rate": 1.995800248556399e-06, + "loss": 0.6849, + "step": 1955 + }, + { + "epoch": 0.29, + "grad_norm": 3.7699487990756637, + "learning_rate": 1.995791399211217e-06, + "loss": 0.7018, + "step": 1956 + }, + { + "epoch": 0.29, + "grad_norm": 2.424491443056506, + "learning_rate": 1.995782540572226e-06, + "loss": 0.6803, + "step": 1957 + }, + { + "epoch": 0.29, + "grad_norm": 0.630259567715457, + "learning_rate": 1.995773672639509e-06, + "loss": 0.6868, + "step": 1958 + }, + { + "epoch": 0.29, + "grad_norm": 0.7125763102251735, + "learning_rate": 1.9957647954131474e-06, + "loss": 0.6868, + "step": 1959 + }, + { + "epoch": 0.29, + "grad_norm": 0.6909407567367195, + "learning_rate": 1.9957559088932256e-06, + "loss": 0.6973, + "step": 1960 + }, + { + "epoch": 0.29, + "grad_norm": 8.747246562417498, + "learning_rate": 1.995747013079825e-06, + "loss": 0.6816, + "step": 1961 + }, + { + "epoch": 0.29, + "grad_norm": 5.952571070855079, + "learning_rate": 1.99573810797303e-06, + "loss": 0.6868, + "step": 1962 + }, + { + "epoch": 0.29, + "grad_norm": 2.423353296809203, + "learning_rate": 1.9957291935729235e-06, + "loss": 0.6934, + "step": 1963 + }, + { + "epoch": 0.29, + "grad_norm": 5.657788276197407, + "learning_rate": 1.995720269879588e-06, + "loss": 0.6979, + "step": 1964 + }, + { + "epoch": 0.29, + "grad_norm": 9.284360413534841, + "learning_rate": 1.995711336893107e-06, + "loss": 0.6966, + "step": 1965 + }, + { + "epoch": 0.29, + "grad_norm": 5.912613259632315, + "learning_rate": 1.9957023946135646e-06, + "loss": 0.6999, + "step": 1966 + }, + { + "epoch": 0.29, + "grad_norm": 0.7658659288805398, + "learning_rate": 1.9956934430410437e-06, + "loss": 0.6868, + "step": 1967 + }, + { + "epoch": 0.29, + "grad_norm": 2.919863011119986, + "learning_rate": 1.9956844821756272e-06, + "loss": 0.6797, + "step": 1968 + }, + { + "epoch": 0.29, + "grad_norm": 2.393153776045875, + "learning_rate": 1.9956755120173997e-06, + "loss": 0.6947, + "step": 1969 + }, + { + "epoch": 0.29, + "grad_norm": 2.719280163340467, + "learning_rate": 1.9956665325664448e-06, + "loss": 0.6882, + "step": 1970 + }, + { + "epoch": 0.29, + "grad_norm": 6.678259680279034, + "learning_rate": 1.9956575438228455e-06, + "loss": 0.6836, + "step": 1971 + }, + { + "epoch": 0.29, + "grad_norm": 7.868006440444137, + "learning_rate": 1.9956485457866867e-06, + "loss": 0.6927, + "step": 1972 + }, + { + "epoch": 0.29, + "grad_norm": 1.747126413998155, + "learning_rate": 1.995639538458052e-06, + "loss": 0.6875, + "step": 1973 + }, + { + "epoch": 0.29, + "grad_norm": 1.8835950350061261, + "learning_rate": 1.9956305218370253e-06, + "loss": 0.6908, + "step": 1974 + }, + { + "epoch": 0.29, + "grad_norm": 5.85570412564342, + "learning_rate": 1.995621495923691e-06, + "loss": 0.6882, + "step": 1975 + }, + { + "epoch": 0.29, + "grad_norm": 2.857765191539288, + "learning_rate": 1.9956124607181335e-06, + "loss": 0.6888, + "step": 1976 + }, + { + "epoch": 0.29, + "grad_norm": 2.8433322633419253, + "learning_rate": 1.995603416220436e-06, + "loss": 0.6888, + "step": 1977 + }, + { + "epoch": 0.3, + "grad_norm": 7.145107297270967, + "learning_rate": 1.9955943624306846e-06, + "loss": 0.6895, + "step": 1978 + }, + { + "epoch": 0.3, + "grad_norm": 0.7693732202829348, + "learning_rate": 1.9955852993489625e-06, + "loss": 0.6836, + "step": 1979 + }, + { + "epoch": 0.3, + "grad_norm": 1.9277512244403014, + "learning_rate": 1.995576226975355e-06, + "loss": 0.6829, + "step": 1980 + }, + { + "epoch": 0.3, + "grad_norm": 3.4254109300355733, + "learning_rate": 1.9955671453099464e-06, + "loss": 0.6875, + "step": 1981 + }, + { + "epoch": 0.3, + "grad_norm": 2.054021916369393, + "learning_rate": 1.995558054352822e-06, + "loss": 0.6901, + "step": 1982 + }, + { + "epoch": 0.3, + "grad_norm": 2.0560488274605175, + "learning_rate": 1.9955489541040657e-06, + "loss": 0.6953, + "step": 1983 + }, + { + "epoch": 0.3, + "grad_norm": 0.8643900959065074, + "learning_rate": 1.9955398445637634e-06, + "loss": 0.6875, + "step": 1984 + }, + { + "epoch": 0.3, + "grad_norm": 4.504174284950175, + "learning_rate": 1.9955307257319993e-06, + "loss": 0.6803, + "step": 1985 + }, + { + "epoch": 0.3, + "grad_norm": 8.615904781152501, + "learning_rate": 1.9955215976088592e-06, + "loss": 0.6888, + "step": 1986 + }, + { + "epoch": 0.3, + "grad_norm": 1.4978601550127804, + "learning_rate": 1.9955124601944276e-06, + "loss": 0.6849, + "step": 1987 + }, + { + "epoch": 0.3, + "grad_norm": 5.082485108952971, + "learning_rate": 1.9955033134887908e-06, + "loss": 0.6947, + "step": 1988 + }, + { + "epoch": 0.3, + "grad_norm": 8.032246435371706, + "learning_rate": 1.995494157492033e-06, + "loss": 0.6895, + "step": 1989 + }, + { + "epoch": 0.3, + "grad_norm": 0.7230928404605168, + "learning_rate": 1.9954849922042404e-06, + "loss": 0.6823, + "step": 1990 + }, + { + "epoch": 0.3, + "grad_norm": 3.7925944690981708, + "learning_rate": 1.9954758176254983e-06, + "loss": 0.6888, + "step": 1991 + }, + { + "epoch": 0.3, + "grad_norm": 1.525433378470288, + "learning_rate": 1.995466633755892e-06, + "loss": 0.7025, + "step": 1992 + }, + { + "epoch": 0.3, + "grad_norm": 2.6783022990016727, + "learning_rate": 1.995457440595508e-06, + "loss": 0.6771, + "step": 1993 + }, + { + "epoch": 0.3, + "grad_norm": 4.082143867118819, + "learning_rate": 1.9954482381444317e-06, + "loss": 0.694, + "step": 1994 + }, + { + "epoch": 0.3, + "grad_norm": 1.8944470797360826, + "learning_rate": 1.995439026402749e-06, + "loss": 0.6921, + "step": 1995 + }, + { + "epoch": 0.3, + "grad_norm": 13.990696838661297, + "learning_rate": 1.9954298053705456e-06, + "loss": 0.7272, + "step": 1996 + }, + { + "epoch": 0.3, + "grad_norm": 10.72717974060425, + "learning_rate": 1.995420575047908e-06, + "loss": 0.7096, + "step": 1997 + }, + { + "epoch": 0.3, + "grad_norm": 7.803881545073114, + "learning_rate": 1.995411335434922e-06, + "loss": 0.7051, + "step": 1998 + }, + { + "epoch": 0.3, + "grad_norm": 6.156991078650013, + "learning_rate": 1.995402086531674e-06, + "loss": 0.7077, + "step": 1999 + }, + { + "epoch": 0.3, + "grad_norm": 6.152033311169977, + "learning_rate": 1.99539282833825e-06, + "loss": 0.6882, + "step": 2000 + }, + { + "epoch": 0.3, + "grad_norm": 0.9092221035453314, + "learning_rate": 1.995383560854737e-06, + "loss": 0.6966, + "step": 2001 + }, + { + "epoch": 0.3, + "grad_norm": 5.547844913546583, + "learning_rate": 1.9953742840812214e-06, + "loss": 0.696, + "step": 2002 + }, + { + "epoch": 0.3, + "grad_norm": 2.9034034601832075, + "learning_rate": 1.9953649980177892e-06, + "loss": 0.6895, + "step": 2003 + }, + { + "epoch": 0.3, + "grad_norm": 14.361358256025763, + "learning_rate": 1.995355702664528e-06, + "loss": 0.724, + "step": 2004 + }, + { + "epoch": 0.3, + "grad_norm": 3.2372921413990507, + "learning_rate": 1.9953463980215233e-06, + "loss": 0.6875, + "step": 2005 + }, + { + "epoch": 0.3, + "grad_norm": 10.45747902310465, + "learning_rate": 1.995337084088863e-06, + "loss": 0.7051, + "step": 2006 + }, + { + "epoch": 0.3, + "grad_norm": 4.113875162550306, + "learning_rate": 1.995327760866634e-06, + "loss": 0.6875, + "step": 2007 + }, + { + "epoch": 0.3, + "grad_norm": 2.04517824435416, + "learning_rate": 1.995318428354922e-06, + "loss": 0.6849, + "step": 2008 + }, + { + "epoch": 0.3, + "grad_norm": 7.920224868879356, + "learning_rate": 1.995309086553816e-06, + "loss": 0.6921, + "step": 2009 + }, + { + "epoch": 0.3, + "grad_norm": 4.5067930457413405, + "learning_rate": 1.995299735463402e-06, + "loss": 0.6882, + "step": 2010 + }, + { + "epoch": 0.3, + "grad_norm": 5.359690449831231, + "learning_rate": 1.9952903750837674e-06, + "loss": 0.6875, + "step": 2011 + }, + { + "epoch": 0.3, + "grad_norm": 10.455096117509393, + "learning_rate": 1.995281005415e-06, + "loss": 0.694, + "step": 2012 + }, + { + "epoch": 0.3, + "grad_norm": 3.231817309036877, + "learning_rate": 1.995271626457187e-06, + "loss": 0.6966, + "step": 2013 + }, + { + "epoch": 0.3, + "grad_norm": 9.680034279443335, + "learning_rate": 1.9952622382104155e-06, + "loss": 0.709, + "step": 2014 + }, + { + "epoch": 0.3, + "grad_norm": 4.568553459686599, + "learning_rate": 1.9952528406747738e-06, + "loss": 0.6953, + "step": 2015 + }, + { + "epoch": 0.3, + "grad_norm": 4.363112363038669, + "learning_rate": 1.9952434338503494e-06, + "loss": 0.6914, + "step": 2016 + }, + { + "epoch": 0.3, + "grad_norm": 13.092192954961947, + "learning_rate": 1.9952340177372295e-06, + "loss": 0.6966, + "step": 2017 + }, + { + "epoch": 0.3, + "grad_norm": 9.118839387464089, + "learning_rate": 1.9952245923355032e-06, + "loss": 0.6829, + "step": 2018 + }, + { + "epoch": 0.3, + "grad_norm": 0.7804083239797605, + "learning_rate": 1.995215157645257e-06, + "loss": 0.6868, + "step": 2019 + }, + { + "epoch": 0.3, + "grad_norm": 6.160734921234478, + "learning_rate": 1.9952057136665805e-06, + "loss": 0.696, + "step": 2020 + }, + { + "epoch": 0.3, + "grad_norm": 10.257677240953326, + "learning_rate": 1.9951962603995606e-06, + "loss": 0.7005, + "step": 2021 + }, + { + "epoch": 0.3, + "grad_norm": 11.639954529516167, + "learning_rate": 1.995186797844286e-06, + "loss": 0.7116, + "step": 2022 + }, + { + "epoch": 0.3, + "grad_norm": 5.286008533981664, + "learning_rate": 1.995177326000845e-06, + "loss": 0.6927, + "step": 2023 + }, + { + "epoch": 0.3, + "grad_norm": 0.762902588660575, + "learning_rate": 1.9951678448693262e-06, + "loss": 0.6868, + "step": 2024 + }, + { + "epoch": 0.3, + "grad_norm": 8.756253486191087, + "learning_rate": 1.995158354449818e-06, + "loss": 0.707, + "step": 2025 + }, + { + "epoch": 0.3, + "grad_norm": 3.345667565943277, + "learning_rate": 1.9951488547424084e-06, + "loss": 0.6816, + "step": 2026 + }, + { + "epoch": 0.3, + "grad_norm": 10.725159893646314, + "learning_rate": 1.995139345747187e-06, + "loss": 0.7012, + "step": 2027 + }, + { + "epoch": 0.3, + "grad_norm": 6.311623236822453, + "learning_rate": 1.995129827464242e-06, + "loss": 0.6823, + "step": 2028 + }, + { + "epoch": 0.3, + "grad_norm": 5.5340475090147985, + "learning_rate": 1.995120299893662e-06, + "loss": 0.696, + "step": 2029 + }, + { + "epoch": 0.3, + "grad_norm": 1.487341278465876, + "learning_rate": 1.9951107630355364e-06, + "loss": 0.6823, + "step": 2030 + }, + { + "epoch": 0.3, + "grad_norm": 1.8832009858088183, + "learning_rate": 1.995101216889954e-06, + "loss": 0.6986, + "step": 2031 + }, + { + "epoch": 0.3, + "grad_norm": 0.5676434378814575, + "learning_rate": 1.9950916614570042e-06, + "loss": 0.6895, + "step": 2032 + }, + { + "epoch": 0.3, + "grad_norm": 9.761581428312644, + "learning_rate": 1.9950820967367755e-06, + "loss": 0.6953, + "step": 2033 + }, + { + "epoch": 0.3, + "grad_norm": 4.643963584080869, + "learning_rate": 1.995072522729358e-06, + "loss": 0.6921, + "step": 2034 + }, + { + "epoch": 0.3, + "grad_norm": 6.125433435534502, + "learning_rate": 1.9950629394348406e-06, + "loss": 0.6966, + "step": 2035 + }, + { + "epoch": 0.3, + "grad_norm": 6.585322606386857, + "learning_rate": 1.9950533468533126e-06, + "loss": 0.6966, + "step": 2036 + }, + { + "epoch": 0.3, + "grad_norm": 0.5812881738918659, + "learning_rate": 1.9950437449848637e-06, + "loss": 0.6888, + "step": 2037 + }, + { + "epoch": 0.3, + "grad_norm": 8.384283057275008, + "learning_rate": 1.9950341338295836e-06, + "loss": 0.6901, + "step": 2038 + }, + { + "epoch": 0.3, + "grad_norm": 0.6341321847754362, + "learning_rate": 1.995024513387562e-06, + "loss": 0.694, + "step": 2039 + }, + { + "epoch": 0.3, + "grad_norm": 2.001355182830089, + "learning_rate": 1.9950148836588886e-06, + "loss": 0.694, + "step": 2040 + }, + { + "epoch": 0.3, + "grad_norm": 0.7124265257553761, + "learning_rate": 1.995005244643653e-06, + "loss": 0.6816, + "step": 2041 + }, + { + "epoch": 0.3, + "grad_norm": 9.317650816756803, + "learning_rate": 1.994995596341946e-06, + "loss": 0.6882, + "step": 2042 + }, + { + "epoch": 0.3, + "grad_norm": 6.054312913684877, + "learning_rate": 1.9949859387538568e-06, + "loss": 0.6921, + "step": 2043 + }, + { + "epoch": 0.3, + "grad_norm": 2.4258608366876286, + "learning_rate": 1.994976271879476e-06, + "loss": 0.6882, + "step": 2044 + }, + { + "epoch": 0.3, + "grad_norm": 0.6275386363189513, + "learning_rate": 1.9949665957188934e-06, + "loss": 0.6855, + "step": 2045 + }, + { + "epoch": 0.31, + "grad_norm": 0.8862428319677573, + "learning_rate": 1.9949569102721997e-06, + "loss": 0.6758, + "step": 2046 + }, + { + "epoch": 0.31, + "grad_norm": 6.298749701967591, + "learning_rate": 1.994947215539485e-06, + "loss": 0.6973, + "step": 2047 + }, + { + "epoch": 0.31, + "grad_norm": 2.8185325745705474, + "learning_rate": 1.9949375115208405e-06, + "loss": 0.6934, + "step": 2048 + }, + { + "epoch": 0.31, + "grad_norm": 1.4503573954041318, + "learning_rate": 1.9949277982163555e-06, + "loss": 0.6855, + "step": 2049 + }, + { + "epoch": 0.31, + "grad_norm": 5.2956205415081365, + "learning_rate": 1.994918075626122e-06, + "loss": 0.6797, + "step": 2050 + }, + { + "epoch": 0.31, + "grad_norm": 1.2738551365923259, + "learning_rate": 1.99490834375023e-06, + "loss": 0.6829, + "step": 2051 + }, + { + "epoch": 0.31, + "grad_norm": 3.600001360228916, + "learning_rate": 1.99489860258877e-06, + "loss": 0.6875, + "step": 2052 + }, + { + "epoch": 0.31, + "grad_norm": 4.183156788966657, + "learning_rate": 1.9948888521418335e-06, + "loss": 0.6823, + "step": 2053 + }, + { + "epoch": 0.31, + "grad_norm": 8.068161408867821, + "learning_rate": 1.9948790924095118e-06, + "loss": 0.6999, + "step": 2054 + }, + { + "epoch": 0.31, + "grad_norm": 3.7982894192473213, + "learning_rate": 1.994869323391895e-06, + "loss": 0.6992, + "step": 2055 + }, + { + "epoch": 0.31, + "grad_norm": 10.863045846447953, + "learning_rate": 1.994859545089075e-06, + "loss": 0.694, + "step": 2056 + }, + { + "epoch": 0.31, + "grad_norm": 1.846845818957937, + "learning_rate": 1.9948497575011427e-06, + "loss": 0.7038, + "step": 2057 + }, + { + "epoch": 0.31, + "grad_norm": 3.7758328000616714, + "learning_rate": 1.99483996062819e-06, + "loss": 0.6908, + "step": 2058 + }, + { + "epoch": 0.31, + "grad_norm": 2.4756741427466027, + "learning_rate": 1.994830154470308e-06, + "loss": 0.6764, + "step": 2059 + }, + { + "epoch": 0.31, + "grad_norm": 4.592956030895011, + "learning_rate": 1.994820339027588e-06, + "loss": 0.6823, + "step": 2060 + }, + { + "epoch": 0.31, + "grad_norm": 1.4456124638029932, + "learning_rate": 1.9948105143001217e-06, + "loss": 0.6803, + "step": 2061 + }, + { + "epoch": 0.31, + "grad_norm": 8.267826905128766, + "learning_rate": 1.9948006802880012e-06, + "loss": 0.7025, + "step": 2062 + }, + { + "epoch": 0.31, + "grad_norm": 1.1525362073924537, + "learning_rate": 1.994790836991318e-06, + "loss": 0.6784, + "step": 2063 + }, + { + "epoch": 0.31, + "grad_norm": 3.663751417688394, + "learning_rate": 1.9947809844101634e-06, + "loss": 0.6849, + "step": 2064 + }, + { + "epoch": 0.31, + "grad_norm": 13.383155322468333, + "learning_rate": 1.9947711225446303e-06, + "loss": 0.7129, + "step": 2065 + }, + { + "epoch": 0.31, + "grad_norm": 0.7062354148819434, + "learning_rate": 1.9947612513948106e-06, + "loss": 0.6868, + "step": 2066 + }, + { + "epoch": 0.31, + "grad_norm": 5.2864146226542745, + "learning_rate": 1.9947513709607958e-06, + "loss": 0.6895, + "step": 2067 + }, + { + "epoch": 0.31, + "grad_norm": 2.443775392294684, + "learning_rate": 1.9947414812426785e-06, + "loss": 0.6849, + "step": 2068 + }, + { + "epoch": 0.31, + "grad_norm": 2.6710792873400595, + "learning_rate": 1.994731582240551e-06, + "loss": 0.6875, + "step": 2069 + }, + { + "epoch": 0.31, + "grad_norm": 1.8348885362936054, + "learning_rate": 1.9947216739545056e-06, + "loss": 0.6895, + "step": 2070 + }, + { + "epoch": 0.31, + "grad_norm": 3.133164318366473, + "learning_rate": 1.994711756384635e-06, + "loss": 0.7057, + "step": 2071 + }, + { + "epoch": 0.31, + "grad_norm": 5.770695876380639, + "learning_rate": 1.9947018295310315e-06, + "loss": 0.696, + "step": 2072 + }, + { + "epoch": 0.31, + "grad_norm": 5.818196140200967, + "learning_rate": 1.994691893393788e-06, + "loss": 0.6966, + "step": 2073 + }, + { + "epoch": 0.31, + "grad_norm": 1.0202607611252796, + "learning_rate": 1.994681947972997e-06, + "loss": 0.6868, + "step": 2074 + }, + { + "epoch": 0.31, + "grad_norm": 15.040139442168075, + "learning_rate": 1.9946719932687514e-06, + "loss": 0.6829, + "step": 2075 + }, + { + "epoch": 0.31, + "grad_norm": 6.712751633177436, + "learning_rate": 1.994662029281144e-06, + "loss": 0.6914, + "step": 2076 + }, + { + "epoch": 0.31, + "grad_norm": 0.6823353807143185, + "learning_rate": 1.994652056010268e-06, + "loss": 0.6914, + "step": 2077 + }, + { + "epoch": 0.31, + "grad_norm": 1.1601387137138648, + "learning_rate": 1.994642073456217e-06, + "loss": 0.6777, + "step": 2078 + }, + { + "epoch": 0.31, + "grad_norm": 4.490106671360172, + "learning_rate": 1.9946320816190826e-06, + "loss": 0.6862, + "step": 2079 + }, + { + "epoch": 0.31, + "grad_norm": 6.3448887702269285, + "learning_rate": 1.9946220804989596e-06, + "loss": 0.6882, + "step": 2080 + }, + { + "epoch": 0.31, + "grad_norm": 7.424001310876028, + "learning_rate": 1.9946120700959408e-06, + "loss": 0.7005, + "step": 2081 + }, + { + "epoch": 0.31, + "grad_norm": 6.155785122024268, + "learning_rate": 1.994602050410119e-06, + "loss": 0.6895, + "step": 2082 + }, + { + "epoch": 0.31, + "grad_norm": 18.749798656685194, + "learning_rate": 1.994592021441589e-06, + "loss": 0.7402, + "step": 2083 + }, + { + "epoch": 0.31, + "grad_norm": 11.578842641677133, + "learning_rate": 1.9945819831904434e-06, + "loss": 0.694, + "step": 2084 + }, + { + "epoch": 0.31, + "grad_norm": 5.700416161048151, + "learning_rate": 1.994571935656776e-06, + "loss": 0.6862, + "step": 2085 + }, + { + "epoch": 0.31, + "grad_norm": 1.353735769810609, + "learning_rate": 1.994561878840681e-06, + "loss": 0.6934, + "step": 2086 + }, + { + "epoch": 0.31, + "grad_norm": 0.7395966661774404, + "learning_rate": 1.994551812742252e-06, + "loss": 0.6973, + "step": 2087 + }, + { + "epoch": 0.31, + "grad_norm": 10.272285523246088, + "learning_rate": 1.9945417373615833e-06, + "loss": 0.7064, + "step": 2088 + }, + { + "epoch": 0.31, + "grad_norm": 6.584929317478945, + "learning_rate": 1.994531652698768e-06, + "loss": 0.7044, + "step": 2089 + }, + { + "epoch": 0.31, + "grad_norm": 6.085380799831157, + "learning_rate": 1.994521558753901e-06, + "loss": 0.681, + "step": 2090 + }, + { + "epoch": 0.31, + "grad_norm": 2.3346828712935213, + "learning_rate": 1.9945114555270767e-06, + "loss": 0.6647, + "step": 2091 + }, + { + "epoch": 0.31, + "grad_norm": 3.4129533608897633, + "learning_rate": 1.9945013430183887e-06, + "loss": 0.6927, + "step": 2092 + }, + { + "epoch": 0.31, + "grad_norm": 12.808605288489463, + "learning_rate": 1.994491221227932e-06, + "loss": 0.7214, + "step": 2093 + }, + { + "epoch": 0.31, + "grad_norm": 8.878503302309573, + "learning_rate": 1.9944810901558006e-06, + "loss": 0.7116, + "step": 2094 + }, + { + "epoch": 0.31, + "grad_norm": 8.912496176473203, + "learning_rate": 1.994470949802089e-06, + "loss": 0.7051, + "step": 2095 + }, + { + "epoch": 0.31, + "grad_norm": 4.396770571714642, + "learning_rate": 1.9944608001668924e-06, + "loss": 0.7057, + "step": 2096 + }, + { + "epoch": 0.31, + "grad_norm": 2.5606462826800245, + "learning_rate": 1.994450641250305e-06, + "loss": 0.6882, + "step": 2097 + }, + { + "epoch": 0.31, + "grad_norm": 4.329153344028439, + "learning_rate": 1.994440473052422e-06, + "loss": 0.6927, + "step": 2098 + }, + { + "epoch": 0.31, + "grad_norm": 0.6605678929078531, + "learning_rate": 1.994430295573338e-06, + "loss": 0.6888, + "step": 2099 + }, + { + "epoch": 0.31, + "grad_norm": 6.113261500299247, + "learning_rate": 1.9944201088131487e-06, + "loss": 0.6914, + "step": 2100 + }, + { + "epoch": 0.31, + "grad_norm": 2.5880637594148963, + "learning_rate": 1.994409912771948e-06, + "loss": 0.6921, + "step": 2101 + }, + { + "epoch": 0.31, + "grad_norm": 3.2798923135290963, + "learning_rate": 1.9943997074498315e-06, + "loss": 0.6816, + "step": 2102 + }, + { + "epoch": 0.31, + "grad_norm": 4.654119098771072, + "learning_rate": 1.994389492846895e-06, + "loss": 0.6803, + "step": 2103 + }, + { + "epoch": 0.31, + "grad_norm": 6.199313599514428, + "learning_rate": 1.994379268963233e-06, + "loss": 0.6947, + "step": 2104 + }, + { + "epoch": 0.31, + "grad_norm": 6.851114309405642, + "learning_rate": 1.9943690357989417e-06, + "loss": 0.7031, + "step": 2105 + }, + { + "epoch": 0.31, + "grad_norm": 5.220419920568698, + "learning_rate": 1.994358793354116e-06, + "loss": 0.681, + "step": 2106 + }, + { + "epoch": 0.31, + "grad_norm": 8.98584162773109, + "learning_rate": 1.994348541628852e-06, + "loss": 0.7083, + "step": 2107 + }, + { + "epoch": 0.31, + "grad_norm": 3.0433817876059175, + "learning_rate": 1.994338280623245e-06, + "loss": 0.6921, + "step": 2108 + }, + { + "epoch": 0.31, + "grad_norm": 1.7915302427096433, + "learning_rate": 1.994328010337391e-06, + "loss": 0.6836, + "step": 2109 + }, + { + "epoch": 0.31, + "grad_norm": 1.6399370438282699, + "learning_rate": 1.994317730771385e-06, + "loss": 0.7064, + "step": 2110 + }, + { + "epoch": 0.31, + "grad_norm": 2.5503988747216986, + "learning_rate": 1.994307441925325e-06, + "loss": 0.6882, + "step": 2111 + }, + { + "epoch": 0.31, + "grad_norm": 4.597942605622542, + "learning_rate": 1.9942971437993048e-06, + "loss": 0.6777, + "step": 2112 + }, + { + "epoch": 0.32, + "grad_norm": 3.6389595284438983, + "learning_rate": 1.9942868363934216e-06, + "loss": 0.6823, + "step": 2113 + }, + { + "epoch": 0.32, + "grad_norm": 4.112761335468765, + "learning_rate": 1.994276519707771e-06, + "loss": 0.694, + "step": 2114 + }, + { + "epoch": 0.32, + "grad_norm": 4.256079865361053, + "learning_rate": 1.9942661937424504e-06, + "loss": 0.6927, + "step": 2115 + }, + { + "epoch": 0.32, + "grad_norm": 5.989820918350867, + "learning_rate": 1.9942558584975555e-06, + "loss": 0.694, + "step": 2116 + }, + { + "epoch": 0.32, + "grad_norm": 0.5021041033888342, + "learning_rate": 1.9942455139731826e-06, + "loss": 0.6784, + "step": 2117 + }, + { + "epoch": 0.32, + "grad_norm": 13.211411721378317, + "learning_rate": 1.994235160169428e-06, + "loss": 0.7083, + "step": 2118 + }, + { + "epoch": 0.32, + "grad_norm": 3.0393163733592456, + "learning_rate": 1.9942247970863893e-06, + "loss": 0.6927, + "step": 2119 + }, + { + "epoch": 0.32, + "grad_norm": 0.6852619063555013, + "learning_rate": 1.9942144247241625e-06, + "loss": 0.6999, + "step": 2120 + }, + { + "epoch": 0.32, + "grad_norm": 5.776295997754243, + "learning_rate": 1.9942040430828443e-06, + "loss": 0.6888, + "step": 2121 + }, + { + "epoch": 0.32, + "grad_norm": 0.9254284931912148, + "learning_rate": 1.994193652162532e-06, + "loss": 0.681, + "step": 2122 + }, + { + "epoch": 0.32, + "grad_norm": 8.543272595122522, + "learning_rate": 1.9941832519633223e-06, + "loss": 0.696, + "step": 2123 + }, + { + "epoch": 0.32, + "grad_norm": 4.049792294444601, + "learning_rate": 1.9941728424853125e-06, + "loss": 0.7012, + "step": 2124 + }, + { + "epoch": 0.32, + "grad_norm": 5.382386896432917, + "learning_rate": 1.9941624237285996e-06, + "loss": 0.6842, + "step": 2125 + }, + { + "epoch": 0.32, + "grad_norm": 1.7955270672325234, + "learning_rate": 1.9941519956932807e-06, + "loss": 0.6875, + "step": 2126 + }, + { + "epoch": 0.32, + "grad_norm": 3.7404221622733624, + "learning_rate": 1.9941415583794537e-06, + "loss": 0.6901, + "step": 2127 + }, + { + "epoch": 0.32, + "grad_norm": 0.5555171105404972, + "learning_rate": 1.9941311117872155e-06, + "loss": 0.6953, + "step": 2128 + }, + { + "epoch": 0.32, + "grad_norm": 1.5700488862188897, + "learning_rate": 1.994120655916664e-06, + "loss": 0.6934, + "step": 2129 + }, + { + "epoch": 0.32, + "grad_norm": 1.52458283869441, + "learning_rate": 1.9941101907678955e-06, + "loss": 0.6966, + "step": 2130 + }, + { + "epoch": 0.32, + "grad_norm": 4.149561956964611, + "learning_rate": 1.9940997163410095e-06, + "loss": 0.6888, + "step": 2131 + }, + { + "epoch": 0.32, + "grad_norm": 6.502647042932812, + "learning_rate": 1.9940892326361027e-06, + "loss": 0.6732, + "step": 2132 + }, + { + "epoch": 0.32, + "grad_norm": 1.1024101252463365, + "learning_rate": 1.994078739653273e-06, + "loss": 0.6836, + "step": 2133 + }, + { + "epoch": 0.32, + "grad_norm": 14.745992811628698, + "learning_rate": 1.9940682373926192e-06, + "loss": 0.7148, + "step": 2134 + }, + { + "epoch": 0.32, + "grad_norm": 8.299384507796885, + "learning_rate": 1.994057725854238e-06, + "loss": 0.7005, + "step": 2135 + }, + { + "epoch": 0.32, + "grad_norm": 2.535001025142034, + "learning_rate": 1.994047205038228e-06, + "loss": 0.6973, + "step": 2136 + }, + { + "epoch": 0.32, + "grad_norm": 4.600565469445625, + "learning_rate": 1.9940366749446884e-06, + "loss": 0.6745, + "step": 2137 + }, + { + "epoch": 0.32, + "grad_norm": 3.2039056649276487, + "learning_rate": 1.9940261355737157e-06, + "loss": 0.6979, + "step": 2138 + }, + { + "epoch": 0.32, + "grad_norm": 0.7447022415051809, + "learning_rate": 1.9940155869254096e-06, + "loss": 0.6895, + "step": 2139 + }, + { + "epoch": 0.32, + "grad_norm": 6.523029006600892, + "learning_rate": 1.994005028999868e-06, + "loss": 0.696, + "step": 2140 + }, + { + "epoch": 0.32, + "grad_norm": 7.493312344079601, + "learning_rate": 1.9939944617971897e-06, + "loss": 0.6901, + "step": 2141 + }, + { + "epoch": 0.32, + "grad_norm": 5.607744978245882, + "learning_rate": 1.9939838853174732e-06, + "loss": 0.6855, + "step": 2142 + }, + { + "epoch": 0.32, + "grad_norm": 0.6261402375276124, + "learning_rate": 1.993973299560817e-06, + "loss": 0.6745, + "step": 2143 + }, + { + "epoch": 0.32, + "grad_norm": 3.8889716687456835, + "learning_rate": 1.99396270452732e-06, + "loss": 0.6771, + "step": 2144 + }, + { + "epoch": 0.32, + "grad_norm": 3.8106358285679502, + "learning_rate": 1.993952100217081e-06, + "loss": 0.6953, + "step": 2145 + }, + { + "epoch": 0.32, + "grad_norm": 6.352645258122751, + "learning_rate": 1.9939414866302e-06, + "loss": 0.6797, + "step": 2146 + }, + { + "epoch": 0.32, + "grad_norm": 14.700121439960679, + "learning_rate": 1.993930863766775e-06, + "loss": 0.7044, + "step": 2147 + }, + { + "epoch": 0.32, + "grad_norm": 1.7545887794329897, + "learning_rate": 1.9939202316269046e-06, + "loss": 0.6836, + "step": 2148 + }, + { + "epoch": 0.32, + "grad_norm": 10.492695815242632, + "learning_rate": 1.9939095902106892e-06, + "loss": 0.6829, + "step": 2149 + }, + { + "epoch": 0.32, + "grad_norm": 2.556374903893604, + "learning_rate": 1.9938989395182276e-06, + "loss": 0.6888, + "step": 2150 + }, + { + "epoch": 0.32, + "grad_norm": 6.987735533520493, + "learning_rate": 1.9938882795496194e-06, + "loss": 0.6895, + "step": 2151 + }, + { + "epoch": 0.32, + "grad_norm": 4.812515994201732, + "learning_rate": 1.993877610304964e-06, + "loss": 0.6992, + "step": 2152 + }, + { + "epoch": 0.32, + "grad_norm": 2.766116478520033, + "learning_rate": 1.993866931784361e-06, + "loss": 0.694, + "step": 2153 + }, + { + "epoch": 0.32, + "grad_norm": 6.315207898980258, + "learning_rate": 1.9938562439879105e-06, + "loss": 0.6621, + "step": 2154 + }, + { + "epoch": 0.32, + "grad_norm": 5.041173453602977, + "learning_rate": 1.9938455469157114e-06, + "loss": 0.6868, + "step": 2155 + }, + { + "epoch": 0.32, + "grad_norm": 3.1258223265797733, + "learning_rate": 1.993834840567863e-06, + "loss": 0.6979, + "step": 2156 + }, + { + "epoch": 0.32, + "grad_norm": 0.7905317266681554, + "learning_rate": 1.9938241249444673e-06, + "loss": 0.6764, + "step": 2157 + }, + { + "epoch": 0.32, + "grad_norm": 4.371708594732771, + "learning_rate": 1.993813400045623e-06, + "loss": 0.6934, + "step": 2158 + }, + { + "epoch": 0.32, + "grad_norm": 7.493948390560731, + "learning_rate": 1.99380266587143e-06, + "loss": 0.6986, + "step": 2159 + }, + { + "epoch": 0.32, + "grad_norm": 8.12311069258627, + "learning_rate": 1.993791922421989e-06, + "loss": 0.7012, + "step": 2160 + }, + { + "epoch": 0.32, + "grad_norm": 3.9436113594107916, + "learning_rate": 1.9937811696974004e-06, + "loss": 0.6992, + "step": 2161 + }, + { + "epoch": 0.32, + "grad_norm": 2.3673607061302313, + "learning_rate": 1.993770407697764e-06, + "loss": 0.7018, + "step": 2162 + }, + { + "epoch": 0.32, + "grad_norm": 4.658847689327028, + "learning_rate": 1.99375963642318e-06, + "loss": 0.6771, + "step": 2163 + }, + { + "epoch": 0.32, + "grad_norm": 1.87020084339545, + "learning_rate": 1.99374885587375e-06, + "loss": 0.6947, + "step": 2164 + }, + { + "epoch": 0.32, + "grad_norm": 0.7457409902662344, + "learning_rate": 1.9937380660495738e-06, + "loss": 0.6797, + "step": 2165 + }, + { + "epoch": 0.32, + "grad_norm": 11.832011694452225, + "learning_rate": 1.9937272669507526e-06, + "loss": 0.6836, + "step": 2166 + }, + { + "epoch": 0.32, + "grad_norm": 5.820818304251833, + "learning_rate": 1.993716458577387e-06, + "loss": 0.7135, + "step": 2167 + }, + { + "epoch": 0.32, + "grad_norm": 5.414683475710164, + "learning_rate": 1.9937056409295778e-06, + "loss": 0.696, + "step": 2168 + }, + { + "epoch": 0.32, + "grad_norm": 2.1640065440815346, + "learning_rate": 1.9936948140074256e-06, + "loss": 0.6921, + "step": 2169 + }, + { + "epoch": 0.32, + "grad_norm": 3.619055603631372, + "learning_rate": 1.993683977811032e-06, + "loss": 0.6882, + "step": 2170 + }, + { + "epoch": 0.32, + "grad_norm": 3.767654621168968, + "learning_rate": 1.993673132340498e-06, + "loss": 0.7012, + "step": 2171 + }, + { + "epoch": 0.32, + "grad_norm": 4.693632661079222, + "learning_rate": 1.9936622775959244e-06, + "loss": 0.6849, + "step": 2172 + }, + { + "epoch": 0.32, + "grad_norm": 0.7692208279091427, + "learning_rate": 1.9936514135774134e-06, + "loss": 0.6797, + "step": 2173 + }, + { + "epoch": 0.32, + "grad_norm": 0.6299987937048089, + "learning_rate": 1.993640540285066e-06, + "loss": 0.681, + "step": 2174 + }, + { + "epoch": 0.32, + "grad_norm": 0.6619590465918539, + "learning_rate": 1.993629657718983e-06, + "loss": 0.6849, + "step": 2175 + }, + { + "epoch": 0.32, + "grad_norm": 1.2447398219516865, + "learning_rate": 1.9936187658792673e-06, + "loss": 0.6725, + "step": 2176 + }, + { + "epoch": 0.32, + "grad_norm": 5.883013522765589, + "learning_rate": 1.993607864766019e-06, + "loss": 0.6875, + "step": 2177 + }, + { + "epoch": 0.32, + "grad_norm": 0.7726782454043695, + "learning_rate": 1.9935969543793408e-06, + "loss": 0.679, + "step": 2178 + }, + { + "epoch": 0.32, + "grad_norm": 6.413285909322824, + "learning_rate": 1.9935860347193345e-06, + "loss": 0.6803, + "step": 2179 + }, + { + "epoch": 0.33, + "grad_norm": 8.252662784834964, + "learning_rate": 1.9935751057861017e-06, + "loss": 0.6999, + "step": 2180 + }, + { + "epoch": 0.33, + "grad_norm": 1.5745574790869732, + "learning_rate": 1.9935641675797453e-06, + "loss": 0.6875, + "step": 2181 + }, + { + "epoch": 0.33, + "grad_norm": 1.2246222537745581, + "learning_rate": 1.9935532201003658e-06, + "loss": 0.6888, + "step": 2182 + }, + { + "epoch": 0.33, + "grad_norm": 3.9966419847495245, + "learning_rate": 1.9935422633480663e-06, + "loss": 0.6823, + "step": 2183 + }, + { + "epoch": 0.33, + "grad_norm": 0.7194377782750362, + "learning_rate": 1.9935312973229495e-06, + "loss": 0.6999, + "step": 2184 + }, + { + "epoch": 0.33, + "grad_norm": 2.854237634755463, + "learning_rate": 1.993520322025117e-06, + "loss": 0.6862, + "step": 2185 + }, + { + "epoch": 0.33, + "grad_norm": 1.6402124465243826, + "learning_rate": 1.9935093374546716e-06, + "loss": 0.6992, + "step": 2186 + }, + { + "epoch": 0.33, + "grad_norm": 1.9287914395701322, + "learning_rate": 1.9934983436117154e-06, + "loss": 0.6888, + "step": 2187 + }, + { + "epoch": 0.33, + "grad_norm": 0.6292821983492249, + "learning_rate": 1.9934873404963514e-06, + "loss": 0.6908, + "step": 2188 + }, + { + "epoch": 0.33, + "grad_norm": 3.4242998395390254, + "learning_rate": 1.993476328108682e-06, + "loss": 0.6855, + "step": 2189 + }, + { + "epoch": 0.33, + "grad_norm": 1.6929148670356424, + "learning_rate": 1.9934653064488103e-06, + "loss": 0.6947, + "step": 2190 + }, + { + "epoch": 0.33, + "grad_norm": 4.516629257379315, + "learning_rate": 1.9934542755168395e-06, + "loss": 0.6966, + "step": 2191 + }, + { + "epoch": 0.33, + "grad_norm": 4.563783104534915, + "learning_rate": 1.9934432353128716e-06, + "loss": 0.6849, + "step": 2192 + }, + { + "epoch": 0.33, + "grad_norm": 5.320336265445, + "learning_rate": 1.99343218583701e-06, + "loss": 0.6855, + "step": 2193 + }, + { + "epoch": 0.33, + "grad_norm": 0.9682675806463187, + "learning_rate": 1.9934211270893585e-06, + "loss": 0.6882, + "step": 2194 + }, + { + "epoch": 0.33, + "grad_norm": 3.5899030623705874, + "learning_rate": 1.9934100590700196e-06, + "loss": 0.7018, + "step": 2195 + }, + { + "epoch": 0.33, + "grad_norm": 7.891305161653392, + "learning_rate": 1.9933989817790965e-06, + "loss": 0.679, + "step": 2196 + }, + { + "epoch": 0.33, + "grad_norm": 3.742194484886591, + "learning_rate": 1.993387895216693e-06, + "loss": 0.6875, + "step": 2197 + }, + { + "epoch": 0.33, + "grad_norm": 5.56204442573362, + "learning_rate": 1.9933767993829123e-06, + "loss": 0.6745, + "step": 2198 + }, + { + "epoch": 0.33, + "grad_norm": 5.044499938657159, + "learning_rate": 1.9933656942778583e-06, + "loss": 0.6947, + "step": 2199 + }, + { + "epoch": 0.33, + "grad_norm": 6.930336924678023, + "learning_rate": 1.9933545799016343e-06, + "loss": 0.6868, + "step": 2200 + }, + { + "epoch": 0.33, + "grad_norm": 3.4270942823785298, + "learning_rate": 1.9933434562543442e-06, + "loss": 0.6953, + "step": 2201 + }, + { + "epoch": 0.33, + "grad_norm": 2.1157076392385075, + "learning_rate": 1.9933323233360917e-06, + "loss": 0.6999, + "step": 2202 + }, + { + "epoch": 0.33, + "grad_norm": 3.0636803627438773, + "learning_rate": 1.9933211811469807e-06, + "loss": 0.6751, + "step": 2203 + }, + { + "epoch": 0.33, + "grad_norm": 2.8451427765030215, + "learning_rate": 1.9933100296871156e-06, + "loss": 0.6882, + "step": 2204 + }, + { + "epoch": 0.33, + "grad_norm": 6.230688723308228, + "learning_rate": 1.9932988689566e-06, + "loss": 0.6842, + "step": 2205 + }, + { + "epoch": 0.33, + "grad_norm": 2.857625442752405, + "learning_rate": 1.993287698955538e-06, + "loss": 0.6908, + "step": 2206 + }, + { + "epoch": 0.33, + "grad_norm": 6.852961501683365, + "learning_rate": 1.9932765196840343e-06, + "loss": 0.6855, + "step": 2207 + }, + { + "epoch": 0.33, + "grad_norm": 4.025892959034675, + "learning_rate": 1.993265331142193e-06, + "loss": 0.6979, + "step": 2208 + }, + { + "epoch": 0.33, + "grad_norm": 1.9099491630723764, + "learning_rate": 1.9932541333301184e-06, + "loss": 0.7025, + "step": 2209 + }, + { + "epoch": 0.33, + "grad_norm": 3.6872875051754535, + "learning_rate": 1.993242926247915e-06, + "loss": 0.6895, + "step": 2210 + }, + { + "epoch": 0.33, + "grad_norm": 1.6373581972127378, + "learning_rate": 1.9932317098956878e-06, + "loss": 0.6934, + "step": 2211 + }, + { + "epoch": 0.33, + "grad_norm": 3.185894222548832, + "learning_rate": 1.9932204842735413e-06, + "loss": 0.6647, + "step": 2212 + }, + { + "epoch": 0.33, + "grad_norm": 6.644617201697543, + "learning_rate": 1.99320924938158e-06, + "loss": 0.6908, + "step": 2213 + }, + { + "epoch": 0.33, + "grad_norm": 5.8786071939596924, + "learning_rate": 1.993198005219909e-06, + "loss": 0.6908, + "step": 2214 + }, + { + "epoch": 0.33, + "grad_norm": 11.100392856421475, + "learning_rate": 1.993186751788633e-06, + "loss": 0.7064, + "step": 2215 + }, + { + "epoch": 0.33, + "grad_norm": 7.675661861918666, + "learning_rate": 1.993175489087857e-06, + "loss": 0.694, + "step": 2216 + }, + { + "epoch": 0.33, + "grad_norm": 0.8993535007554266, + "learning_rate": 1.993164217117687e-06, + "loss": 0.6823, + "step": 2217 + }, + { + "epoch": 0.33, + "grad_norm": 3.0220853624615156, + "learning_rate": 1.993152935878227e-06, + "loss": 0.681, + "step": 2218 + }, + { + "epoch": 0.33, + "grad_norm": 3.7694124129233306, + "learning_rate": 1.993141645369583e-06, + "loss": 0.6999, + "step": 2219 + }, + { + "epoch": 0.33, + "grad_norm": 7.10813239040174, + "learning_rate": 1.9931303455918603e-06, + "loss": 0.707, + "step": 2220 + }, + { + "epoch": 0.33, + "grad_norm": 7.691324897531362, + "learning_rate": 1.9931190365451644e-06, + "loss": 0.6999, + "step": 2221 + }, + { + "epoch": 0.33, + "grad_norm": 2.470810045590243, + "learning_rate": 1.9931077182296004e-06, + "loss": 0.6947, + "step": 2222 + }, + { + "epoch": 0.33, + "grad_norm": 5.5437045215206044, + "learning_rate": 1.993096390645274e-06, + "loss": 0.668, + "step": 2223 + }, + { + "epoch": 0.33, + "grad_norm": 8.7139427375484, + "learning_rate": 1.9930850537922917e-06, + "loss": 0.6882, + "step": 2224 + }, + { + "epoch": 0.33, + "grad_norm": 2.3890151866918785, + "learning_rate": 1.9930737076707587e-06, + "loss": 0.6895, + "step": 2225 + }, + { + "epoch": 0.33, + "grad_norm": 1.4463552671182922, + "learning_rate": 1.9930623522807807e-06, + "loss": 0.6979, + "step": 2226 + }, + { + "epoch": 0.33, + "grad_norm": 1.911240973851349, + "learning_rate": 1.993050987622464e-06, + "loss": 0.6914, + "step": 2227 + }, + { + "epoch": 0.33, + "grad_norm": 6.237078473459028, + "learning_rate": 1.993039613695915e-06, + "loss": 0.6953, + "step": 2228 + }, + { + "epoch": 0.33, + "grad_norm": 3.2217792942812893, + "learning_rate": 1.9930282305012393e-06, + "loss": 0.6966, + "step": 2229 + }, + { + "epoch": 0.33, + "grad_norm": 2.8912423111094836, + "learning_rate": 1.9930168380385427e-06, + "loss": 0.7018, + "step": 2230 + }, + { + "epoch": 0.33, + "grad_norm": 2.9602712265165847, + "learning_rate": 1.9930054363079325e-06, + "loss": 0.6823, + "step": 2231 + }, + { + "epoch": 0.33, + "grad_norm": 9.524989064059023, + "learning_rate": 1.9929940253095148e-06, + "loss": 0.6927, + "step": 2232 + }, + { + "epoch": 0.33, + "grad_norm": 1.04593519553391, + "learning_rate": 1.992982605043396e-06, + "loss": 0.6855, + "step": 2233 + }, + { + "epoch": 0.33, + "grad_norm": 3.405621452132216, + "learning_rate": 1.992971175509683e-06, + "loss": 0.7038, + "step": 2234 + }, + { + "epoch": 0.33, + "grad_norm": 0.6622529358346688, + "learning_rate": 1.992959736708482e-06, + "loss": 0.6803, + "step": 2235 + }, + { + "epoch": 0.33, + "grad_norm": 2.8725074117294827, + "learning_rate": 1.9929482886399e-06, + "loss": 0.6758, + "step": 2236 + }, + { + "epoch": 0.33, + "grad_norm": 2.894361820810704, + "learning_rate": 1.992936831304044e-06, + "loss": 0.6868, + "step": 2237 + }, + { + "epoch": 0.33, + "grad_norm": 1.190575241060966, + "learning_rate": 1.9929253647010203e-06, + "loss": 0.6921, + "step": 2238 + }, + { + "epoch": 0.33, + "grad_norm": 1.9752207682225762, + "learning_rate": 1.9929138888309368e-06, + "loss": 0.6888, + "step": 2239 + }, + { + "epoch": 0.33, + "grad_norm": 2.561220083584678, + "learning_rate": 1.9929024036939e-06, + "loss": 0.6875, + "step": 2240 + }, + { + "epoch": 0.33, + "grad_norm": 3.6385739978115232, + "learning_rate": 1.992890909290017e-06, + "loss": 0.6829, + "step": 2241 + }, + { + "epoch": 0.33, + "grad_norm": 3.7160125826680717, + "learning_rate": 1.9928794056193955e-06, + "loss": 0.6875, + "step": 2242 + }, + { + "epoch": 0.33, + "grad_norm": 2.813691600158747, + "learning_rate": 1.9928678926821427e-06, + "loss": 0.6927, + "step": 2243 + }, + { + "epoch": 0.33, + "grad_norm": 9.08489813081945, + "learning_rate": 1.9928563704783664e-06, + "loss": 0.6842, + "step": 2244 + }, + { + "epoch": 0.33, + "grad_norm": 7.674841762860167, + "learning_rate": 1.9928448390081734e-06, + "loss": 0.6829, + "step": 2245 + }, + { + "epoch": 0.33, + "grad_norm": 0.7389777782958432, + "learning_rate": 1.992833298271672e-06, + "loss": 0.6901, + "step": 2246 + }, + { + "epoch": 0.34, + "grad_norm": 3.1746677211388077, + "learning_rate": 1.9928217482689693e-06, + "loss": 0.6803, + "step": 2247 + }, + { + "epoch": 0.34, + "grad_norm": 3.034862729827823, + "learning_rate": 1.9928101890001734e-06, + "loss": 0.6667, + "step": 2248 + }, + { + "epoch": 0.34, + "grad_norm": 9.335795386263849, + "learning_rate": 1.9927986204653924e-06, + "loss": 0.6842, + "step": 2249 + }, + { + "epoch": 0.34, + "grad_norm": 7.459900007243406, + "learning_rate": 1.992787042664734e-06, + "loss": 0.6953, + "step": 2250 + }, + { + "epoch": 0.34, + "grad_norm": 4.690398869667616, + "learning_rate": 1.9927754555983066e-06, + "loss": 0.6862, + "step": 2251 + }, + { + "epoch": 0.34, + "grad_norm": 0.5334006077595865, + "learning_rate": 1.992763859266218e-06, + "loss": 0.6875, + "step": 2252 + }, + { + "epoch": 0.34, + "grad_norm": 2.8139741219076213, + "learning_rate": 1.992752253668576e-06, + "loss": 0.6901, + "step": 2253 + }, + { + "epoch": 0.34, + "grad_norm": 1.6290716890214458, + "learning_rate": 1.99274063880549e-06, + "loss": 0.6947, + "step": 2254 + }, + { + "epoch": 0.34, + "grad_norm": 5.37987042440755, + "learning_rate": 1.9927290146770673e-06, + "loss": 0.6823, + "step": 2255 + }, + { + "epoch": 0.34, + "grad_norm": 5.733879051592735, + "learning_rate": 1.9927173812834175e-06, + "loss": 0.6803, + "step": 2256 + }, + { + "epoch": 0.34, + "grad_norm": 8.71290006123081, + "learning_rate": 1.992705738624648e-06, + "loss": 0.7038, + "step": 2257 + }, + { + "epoch": 0.34, + "grad_norm": 1.2253052666038076, + "learning_rate": 1.9926940867008684e-06, + "loss": 0.7057, + "step": 2258 + }, + { + "epoch": 0.34, + "grad_norm": 1.1594701944996562, + "learning_rate": 1.9926824255121874e-06, + "loss": 0.6947, + "step": 2259 + }, + { + "epoch": 0.34, + "grad_norm": 0.8265243891545587, + "learning_rate": 1.9926707550587133e-06, + "loss": 0.709, + "step": 2260 + }, + { + "epoch": 0.34, + "grad_norm": 3.506752072694046, + "learning_rate": 1.992659075340555e-06, + "loss": 0.6882, + "step": 2261 + }, + { + "epoch": 0.34, + "grad_norm": 3.544044121728527, + "learning_rate": 1.992647386357822e-06, + "loss": 0.6947, + "step": 2262 + }, + { + "epoch": 0.34, + "grad_norm": 1.305233831697887, + "learning_rate": 1.992635688110623e-06, + "loss": 0.6803, + "step": 2263 + }, + { + "epoch": 0.34, + "grad_norm": 7.116519307157374, + "learning_rate": 1.9926239805990674e-06, + "loss": 0.6947, + "step": 2264 + }, + { + "epoch": 0.34, + "grad_norm": 3.4685152728352278, + "learning_rate": 1.9926122638232643e-06, + "loss": 0.694, + "step": 2265 + }, + { + "epoch": 0.34, + "grad_norm": 4.223307139261506, + "learning_rate": 1.9926005377833233e-06, + "loss": 0.6934, + "step": 2266 + }, + { + "epoch": 0.34, + "grad_norm": 2.2605626641778476, + "learning_rate": 1.9925888024793538e-06, + "loss": 0.6921, + "step": 2267 + }, + { + "epoch": 0.34, + "grad_norm": 0.9502291520027989, + "learning_rate": 1.992577057911465e-06, + "loss": 0.6953, + "step": 2268 + }, + { + "epoch": 0.34, + "grad_norm": 4.32379508581916, + "learning_rate": 1.992565304079767e-06, + "loss": 0.6921, + "step": 2269 + }, + { + "epoch": 0.34, + "grad_norm": 0.8785238681931542, + "learning_rate": 1.992553540984369e-06, + "loss": 0.6725, + "step": 2270 + }, + { + "epoch": 0.34, + "grad_norm": 1.4347165251415621, + "learning_rate": 1.992541768625381e-06, + "loss": 0.6849, + "step": 2271 + }, + { + "epoch": 0.34, + "grad_norm": 2.143832571028357, + "learning_rate": 1.992529987002913e-06, + "loss": 0.6771, + "step": 2272 + }, + { + "epoch": 0.34, + "grad_norm": 6.927830999202969, + "learning_rate": 1.992518196117075e-06, + "loss": 0.6973, + "step": 2273 + }, + { + "epoch": 0.34, + "grad_norm": 4.671685377824438, + "learning_rate": 1.9925063959679765e-06, + "loss": 0.6901, + "step": 2274 + }, + { + "epoch": 0.34, + "grad_norm": 2.0832242567717163, + "learning_rate": 1.992494586555728e-06, + "loss": 0.6882, + "step": 2275 + }, + { + "epoch": 0.34, + "grad_norm": 2.313836913515264, + "learning_rate": 1.99248276788044e-06, + "loss": 0.6966, + "step": 2276 + }, + { + "epoch": 0.34, + "grad_norm": 3.0414865618147986, + "learning_rate": 1.992470939942223e-06, + "loss": 0.6803, + "step": 2277 + }, + { + "epoch": 0.34, + "grad_norm": 2.3136649696114544, + "learning_rate": 1.9924591027411865e-06, + "loss": 0.6986, + "step": 2278 + }, + { + "epoch": 0.34, + "grad_norm": 2.049608567680274, + "learning_rate": 1.992447256277442e-06, + "loss": 0.6738, + "step": 2279 + }, + { + "epoch": 0.34, + "grad_norm": 5.36033366136174, + "learning_rate": 1.9924354005510985e-06, + "loss": 0.6953, + "step": 2280 + }, + { + "epoch": 0.34, + "grad_norm": 7.628671278934357, + "learning_rate": 1.992423535562268e-06, + "loss": 0.6934, + "step": 2281 + }, + { + "epoch": 0.34, + "grad_norm": 5.037281595458311, + "learning_rate": 1.992411661311061e-06, + "loss": 0.6836, + "step": 2282 + }, + { + "epoch": 0.34, + "grad_norm": 12.913575437574883, + "learning_rate": 1.9923997777975884e-06, + "loss": 0.7025, + "step": 2283 + }, + { + "epoch": 0.34, + "grad_norm": 5.998230790176866, + "learning_rate": 1.9923878850219604e-06, + "loss": 0.7005, + "step": 2284 + }, + { + "epoch": 0.34, + "grad_norm": 1.0455580370656008, + "learning_rate": 1.992375982984289e-06, + "loss": 0.6803, + "step": 2285 + }, + { + "epoch": 0.34, + "grad_norm": 1.935221177736731, + "learning_rate": 1.9923640716846847e-06, + "loss": 0.6797, + "step": 2286 + }, + { + "epoch": 0.34, + "grad_norm": 6.773057780477317, + "learning_rate": 1.9923521511232585e-06, + "loss": 0.6908, + "step": 2287 + }, + { + "epoch": 0.34, + "grad_norm": 3.6323952219347397, + "learning_rate": 1.992340221300122e-06, + "loss": 0.6849, + "step": 2288 + }, + { + "epoch": 0.34, + "grad_norm": 2.8898065495989793, + "learning_rate": 1.9923282822153868e-06, + "loss": 0.6823, + "step": 2289 + }, + { + "epoch": 0.34, + "grad_norm": 3.747321196373675, + "learning_rate": 1.9923163338691636e-06, + "loss": 0.6849, + "step": 2290 + }, + { + "epoch": 0.34, + "grad_norm": 6.095010572132168, + "learning_rate": 1.9923043762615644e-06, + "loss": 0.6803, + "step": 2291 + }, + { + "epoch": 0.34, + "grad_norm": 2.3941613700499444, + "learning_rate": 1.9922924093927004e-06, + "loss": 0.6842, + "step": 2292 + }, + { + "epoch": 0.34, + "grad_norm": 1.967353156313163, + "learning_rate": 1.992280433262684e-06, + "loss": 0.6875, + "step": 2293 + }, + { + "epoch": 0.34, + "grad_norm": 2.4871467802912086, + "learning_rate": 1.9922684478716265e-06, + "loss": 0.6986, + "step": 2294 + }, + { + "epoch": 0.34, + "grad_norm": 1.2362750151710877, + "learning_rate": 1.9922564532196397e-06, + "loss": 0.6641, + "step": 2295 + }, + { + "epoch": 0.34, + "grad_norm": 3.7338515204920957, + "learning_rate": 1.992244449306836e-06, + "loss": 0.6823, + "step": 2296 + }, + { + "epoch": 0.34, + "grad_norm": 4.554037977384171, + "learning_rate": 1.992232436133327e-06, + "loss": 0.6966, + "step": 2297 + }, + { + "epoch": 0.34, + "grad_norm": 8.911124013074286, + "learning_rate": 1.992220413699225e-06, + "loss": 0.6999, + "step": 2298 + }, + { + "epoch": 0.34, + "grad_norm": 7.651910643519013, + "learning_rate": 1.992208382004641e-06, + "loss": 0.6934, + "step": 2299 + }, + { + "epoch": 0.34, + "grad_norm": 5.983009746382285, + "learning_rate": 1.99219634104969e-06, + "loss": 0.7051, + "step": 2300 + }, + { + "epoch": 0.34, + "grad_norm": 1.9496113032164744, + "learning_rate": 1.9921842908344818e-06, + "loss": 0.6934, + "step": 2301 + }, + { + "epoch": 0.34, + "grad_norm": 2.943371883459314, + "learning_rate": 1.9921722313591305e-06, + "loss": 0.6829, + "step": 2302 + }, + { + "epoch": 0.34, + "grad_norm": 2.4816611092904166, + "learning_rate": 1.9921601626237477e-06, + "loss": 0.6908, + "step": 2303 + }, + { + "epoch": 0.34, + "grad_norm": 0.6176177068976274, + "learning_rate": 1.9921480846284462e-06, + "loss": 0.6888, + "step": 2304 + }, + { + "epoch": 0.34, + "grad_norm": 2.1519241763573707, + "learning_rate": 1.992135997373339e-06, + "loss": 0.6888, + "step": 2305 + }, + { + "epoch": 0.34, + "grad_norm": 5.476402791546761, + "learning_rate": 1.9921239008585393e-06, + "loss": 0.6992, + "step": 2306 + }, + { + "epoch": 0.34, + "grad_norm": 0.9518622672244236, + "learning_rate": 1.9921117950841587e-06, + "loss": 0.6868, + "step": 2307 + }, + { + "epoch": 0.34, + "grad_norm": 2.2378999022109305, + "learning_rate": 1.9920996800503117e-06, + "loss": 0.6862, + "step": 2308 + }, + { + "epoch": 0.34, + "grad_norm": 4.452299854438484, + "learning_rate": 1.99208755575711e-06, + "loss": 0.6849, + "step": 2309 + }, + { + "epoch": 0.34, + "grad_norm": 3.1798554515653494, + "learning_rate": 1.9920754222046677e-06, + "loss": 0.6849, + "step": 2310 + }, + { + "epoch": 0.34, + "grad_norm": 0.6060081224609783, + "learning_rate": 1.9920632793930977e-06, + "loss": 0.681, + "step": 2311 + }, + { + "epoch": 0.34, + "grad_norm": 7.713670288982561, + "learning_rate": 1.9920511273225135e-06, + "loss": 0.6803, + "step": 2312 + }, + { + "epoch": 0.34, + "grad_norm": 5.343097783019866, + "learning_rate": 1.9920389659930288e-06, + "loss": 0.6836, + "step": 2313 + }, + { + "epoch": 0.35, + "grad_norm": 0.8065378792795349, + "learning_rate": 1.992026795404756e-06, + "loss": 0.6862, + "step": 2314 + }, + { + "epoch": 0.35, + "grad_norm": 4.2856841709613045, + "learning_rate": 1.9920146155578096e-06, + "loss": 0.6934, + "step": 2315 + }, + { + "epoch": 0.35, + "grad_norm": 1.0772636111970415, + "learning_rate": 1.992002426452303e-06, + "loss": 0.6829, + "step": 2316 + }, + { + "epoch": 0.35, + "grad_norm": 1.237011071970526, + "learning_rate": 1.99199022808835e-06, + "loss": 0.6868, + "step": 2317 + }, + { + "epoch": 0.35, + "grad_norm": 0.8977306885338818, + "learning_rate": 1.9919780204660647e-06, + "loss": 0.6784, + "step": 2318 + }, + { + "epoch": 0.35, + "grad_norm": 6.436987403462461, + "learning_rate": 1.9919658035855605e-06, + "loss": 0.6973, + "step": 2319 + }, + { + "epoch": 0.35, + "grad_norm": 2.3502802905856433, + "learning_rate": 1.991953577446952e-06, + "loss": 0.6777, + "step": 2320 + }, + { + "epoch": 0.35, + "grad_norm": 2.014034904756365, + "learning_rate": 1.991941342050353e-06, + "loss": 0.7038, + "step": 2321 + }, + { + "epoch": 0.35, + "grad_norm": 0.8457710509927011, + "learning_rate": 1.9919290973958777e-06, + "loss": 0.7012, + "step": 2322 + }, + { + "epoch": 0.35, + "grad_norm": 10.636039435009982, + "learning_rate": 1.9919168434836403e-06, + "loss": 0.7018, + "step": 2323 + }, + { + "epoch": 0.35, + "grad_norm": 10.06673171141674, + "learning_rate": 1.9919045803137554e-06, + "loss": 0.7012, + "step": 2324 + }, + { + "epoch": 0.35, + "grad_norm": 8.31036030359721, + "learning_rate": 1.9918923078863375e-06, + "loss": 0.7077, + "step": 2325 + }, + { + "epoch": 0.35, + "grad_norm": 6.776510620727537, + "learning_rate": 1.9918800262015004e-06, + "loss": 0.6836, + "step": 2326 + }, + { + "epoch": 0.35, + "grad_norm": 1.8904436520686976, + "learning_rate": 1.99186773525936e-06, + "loss": 0.6823, + "step": 2327 + }, + { + "epoch": 0.35, + "grad_norm": 9.934085915906719, + "learning_rate": 1.99185543506003e-06, + "loss": 0.7246, + "step": 2328 + }, + { + "epoch": 0.35, + "grad_norm": 11.714077319731466, + "learning_rate": 1.9918431256036255e-06, + "loss": 0.7077, + "step": 2329 + }, + { + "epoch": 0.35, + "grad_norm": 2.6436917216103746, + "learning_rate": 1.9918308068902616e-06, + "loss": 0.696, + "step": 2330 + }, + { + "epoch": 0.35, + "grad_norm": 3.157862924556312, + "learning_rate": 1.9918184789200527e-06, + "loss": 0.6745, + "step": 2331 + }, + { + "epoch": 0.35, + "grad_norm": 4.472181095801415, + "learning_rate": 1.9918061416931146e-06, + "loss": 0.6901, + "step": 2332 + }, + { + "epoch": 0.35, + "grad_norm": 5.167223082985449, + "learning_rate": 1.9917937952095616e-06, + "loss": 0.6888, + "step": 2333 + }, + { + "epoch": 0.35, + "grad_norm": 0.8069277735616647, + "learning_rate": 1.9917814394695098e-06, + "loss": 0.6816, + "step": 2334 + }, + { + "epoch": 0.35, + "grad_norm": 3.775345913348262, + "learning_rate": 1.991769074473074e-06, + "loss": 0.6908, + "step": 2335 + }, + { + "epoch": 0.35, + "grad_norm": 1.5765318134631081, + "learning_rate": 1.99175670022037e-06, + "loss": 0.679, + "step": 2336 + }, + { + "epoch": 0.35, + "grad_norm": 3.7691607434128964, + "learning_rate": 1.991744316711512e-06, + "loss": 0.6921, + "step": 2337 + }, + { + "epoch": 0.35, + "grad_norm": 7.179160236549375, + "learning_rate": 1.9917319239466175e-06, + "loss": 0.6888, + "step": 2338 + }, + { + "epoch": 0.35, + "grad_norm": 1.9772501227003914, + "learning_rate": 1.991719521925801e-06, + "loss": 0.6784, + "step": 2339 + }, + { + "epoch": 0.35, + "grad_norm": 1.6585465663997132, + "learning_rate": 1.9917071106491784e-06, + "loss": 0.6849, + "step": 2340 + }, + { + "epoch": 0.35, + "grad_norm": 1.3183524043315844, + "learning_rate": 1.991694690116866e-06, + "loss": 0.6842, + "step": 2341 + }, + { + "epoch": 0.35, + "grad_norm": 1.1821763842605333, + "learning_rate": 1.991682260328979e-06, + "loss": 0.6895, + "step": 2342 + }, + { + "epoch": 0.35, + "grad_norm": 6.838521368866493, + "learning_rate": 1.991669821285634e-06, + "loss": 0.6966, + "step": 2343 + }, + { + "epoch": 0.35, + "grad_norm": 13.414708481394875, + "learning_rate": 1.991657372986947e-06, + "loss": 0.707, + "step": 2344 + }, + { + "epoch": 0.35, + "grad_norm": 6.369101320292697, + "learning_rate": 1.991644915433034e-06, + "loss": 0.7025, + "step": 2345 + }, + { + "epoch": 0.35, + "grad_norm": 0.6885073685842326, + "learning_rate": 1.991632448624011e-06, + "loss": 0.6908, + "step": 2346 + }, + { + "epoch": 0.35, + "grad_norm": 7.923307268010873, + "learning_rate": 1.9916199725599944e-06, + "loss": 0.6966, + "step": 2347 + }, + { + "epoch": 0.35, + "grad_norm": 3.1471148860358538, + "learning_rate": 1.9916074872411014e-06, + "loss": 0.6797, + "step": 2348 + }, + { + "epoch": 0.35, + "grad_norm": 6.095058859246579, + "learning_rate": 1.9915949926674477e-06, + "loss": 0.6888, + "step": 2349 + }, + { + "epoch": 0.35, + "grad_norm": 6.581398190345814, + "learning_rate": 1.9915824888391506e-06, + "loss": 0.6986, + "step": 2350 + }, + { + "epoch": 0.35, + "grad_norm": 1.0637824721023226, + "learning_rate": 1.991569975756326e-06, + "loss": 0.6868, + "step": 2351 + }, + { + "epoch": 0.35, + "grad_norm": 0.9331366626695359, + "learning_rate": 1.991557453419092e-06, + "loss": 0.6849, + "step": 2352 + }, + { + "epoch": 0.35, + "grad_norm": 6.719258639469793, + "learning_rate": 1.9915449218275636e-06, + "loss": 0.6973, + "step": 2353 + }, + { + "epoch": 0.35, + "grad_norm": 6.361172541045516, + "learning_rate": 1.991532380981859e-06, + "loss": 0.6842, + "step": 2354 + }, + { + "epoch": 0.35, + "grad_norm": 0.7782802076870772, + "learning_rate": 1.9915198308820955e-06, + "loss": 0.7012, + "step": 2355 + }, + { + "epoch": 0.35, + "grad_norm": 8.645311177527937, + "learning_rate": 1.9915072715283894e-06, + "loss": 0.6868, + "step": 2356 + }, + { + "epoch": 0.35, + "grad_norm": 0.45311244108103604, + "learning_rate": 1.991494702920858e-06, + "loss": 0.6849, + "step": 2357 + }, + { + "epoch": 0.35, + "grad_norm": 2.3955219787872766, + "learning_rate": 1.991482125059619e-06, + "loss": 0.6829, + "step": 2358 + }, + { + "epoch": 0.35, + "grad_norm": 3.2463520298662694, + "learning_rate": 1.9914695379447897e-06, + "loss": 0.6862, + "step": 2359 + }, + { + "epoch": 0.35, + "grad_norm": 1.7220375715786762, + "learning_rate": 1.9914569415764875e-06, + "loss": 0.6908, + "step": 2360 + }, + { + "epoch": 0.35, + "grad_norm": 2.6371019877846984, + "learning_rate": 1.9914443359548303e-06, + "loss": 0.6862, + "step": 2361 + }, + { + "epoch": 0.35, + "grad_norm": 10.72273289321425, + "learning_rate": 1.9914317210799352e-06, + "loss": 0.6986, + "step": 2362 + }, + { + "epoch": 0.35, + "grad_norm": 3.2395283359596143, + "learning_rate": 1.99141909695192e-06, + "loss": 0.6862, + "step": 2363 + }, + { + "epoch": 0.35, + "grad_norm": 0.6040482542572079, + "learning_rate": 1.991406463570903e-06, + "loss": 0.6712, + "step": 2364 + }, + { + "epoch": 0.35, + "grad_norm": 9.72296237359763, + "learning_rate": 1.9913938209370015e-06, + "loss": 0.6921, + "step": 2365 + }, + { + "epoch": 0.35, + "grad_norm": 5.762056507314355, + "learning_rate": 1.991381169050334e-06, + "loss": 0.6784, + "step": 2366 + }, + { + "epoch": 0.35, + "grad_norm": 1.7271170560212206, + "learning_rate": 1.9913685079110186e-06, + "loss": 0.6725, + "step": 2367 + }, + { + "epoch": 0.35, + "grad_norm": 2.074174316011279, + "learning_rate": 1.9913558375191726e-06, + "loss": 0.6855, + "step": 2368 + }, + { + "epoch": 0.35, + "grad_norm": 2.5949354362220207, + "learning_rate": 1.9913431578749152e-06, + "loss": 0.6934, + "step": 2369 + }, + { + "epoch": 0.35, + "grad_norm": 4.418178023189968, + "learning_rate": 1.9913304689783644e-06, + "loss": 0.6849, + "step": 2370 + }, + { + "epoch": 0.35, + "grad_norm": 1.35548319673339, + "learning_rate": 1.9913177708296393e-06, + "loss": 0.6882, + "step": 2371 + }, + { + "epoch": 0.35, + "grad_norm": 1.9625451158890335, + "learning_rate": 1.9913050634288567e-06, + "loss": 0.6758, + "step": 2372 + }, + { + "epoch": 0.35, + "grad_norm": 7.964763012693123, + "learning_rate": 1.991292346776137e-06, + "loss": 0.7057, + "step": 2373 + }, + { + "epoch": 0.35, + "grad_norm": 1.749692294516227, + "learning_rate": 1.9912796208715983e-06, + "loss": 0.6979, + "step": 2374 + }, + { + "epoch": 0.35, + "grad_norm": 4.8986423088413105, + "learning_rate": 1.9912668857153587e-06, + "loss": 0.6895, + "step": 2375 + }, + { + "epoch": 0.35, + "grad_norm": 14.423591533609114, + "learning_rate": 1.9912541413075376e-06, + "loss": 0.7214, + "step": 2376 + }, + { + "epoch": 0.35, + "grad_norm": 1.8013112197700947, + "learning_rate": 1.991241387648254e-06, + "loss": 0.6947, + "step": 2377 + }, + { + "epoch": 0.35, + "grad_norm": 2.5569539405450574, + "learning_rate": 1.9912286247376274e-06, + "loss": 0.6797, + "step": 2378 + }, + { + "epoch": 0.35, + "grad_norm": 1.2111193150773094, + "learning_rate": 1.991215852575776e-06, + "loss": 0.6986, + "step": 2379 + }, + { + "epoch": 0.35, + "grad_norm": 5.269252816751187, + "learning_rate": 1.991203071162819e-06, + "loss": 0.6699, + "step": 2380 + }, + { + "epoch": 0.36, + "grad_norm": 5.913521127095258, + "learning_rate": 1.9911902804988766e-06, + "loss": 0.6888, + "step": 2381 + }, + { + "epoch": 0.36, + "grad_norm": 4.570259280884476, + "learning_rate": 1.9911774805840675e-06, + "loss": 0.7083, + "step": 2382 + }, + { + "epoch": 0.36, + "grad_norm": 1.3857210898458456, + "learning_rate": 1.9911646714185117e-06, + "loss": 0.681, + "step": 2383 + }, + { + "epoch": 0.36, + "grad_norm": 1.406032786867253, + "learning_rate": 1.991151853002328e-06, + "loss": 0.6725, + "step": 2384 + }, + { + "epoch": 0.36, + "grad_norm": 4.977565790351673, + "learning_rate": 1.9911390253356363e-06, + "loss": 0.6927, + "step": 2385 + }, + { + "epoch": 0.36, + "grad_norm": 1.0183733445131988, + "learning_rate": 1.9911261884185563e-06, + "loss": 0.6901, + "step": 2386 + }, + { + "epoch": 0.36, + "grad_norm": 1.083822810631499, + "learning_rate": 1.991113342251208e-06, + "loss": 0.6562, + "step": 2387 + }, + { + "epoch": 0.36, + "grad_norm": 4.28944118573875, + "learning_rate": 1.991100486833712e-06, + "loss": 0.6992, + "step": 2388 + }, + { + "epoch": 0.36, + "grad_norm": 6.813179021549531, + "learning_rate": 1.9910876221661867e-06, + "loss": 0.694, + "step": 2389 + }, + { + "epoch": 0.36, + "grad_norm": 8.183013346472238, + "learning_rate": 1.991074748248753e-06, + "loss": 0.7012, + "step": 2390 + }, + { + "epoch": 0.36, + "grad_norm": 0.6052952500584277, + "learning_rate": 1.9910618650815315e-06, + "loss": 0.6862, + "step": 2391 + }, + { + "epoch": 0.36, + "grad_norm": 5.736633709905756, + "learning_rate": 1.991048972664641e-06, + "loss": 0.7012, + "step": 2392 + }, + { + "epoch": 0.36, + "grad_norm": 0.9882256882324154, + "learning_rate": 1.991036070998204e-06, + "loss": 0.6849, + "step": 2393 + }, + { + "epoch": 0.36, + "grad_norm": 1.8717732826680542, + "learning_rate": 1.991023160082339e-06, + "loss": 0.6999, + "step": 2394 + }, + { + "epoch": 0.36, + "grad_norm": 10.814056045632999, + "learning_rate": 1.9910102399171673e-06, + "loss": 0.707, + "step": 2395 + }, + { + "epoch": 0.36, + "grad_norm": 3.1519946595215487, + "learning_rate": 1.9909973105028096e-06, + "loss": 0.6751, + "step": 2396 + }, + { + "epoch": 0.36, + "grad_norm": 6.516383975920999, + "learning_rate": 1.990984371839386e-06, + "loss": 0.6953, + "step": 2397 + }, + { + "epoch": 0.36, + "grad_norm": 5.305978381448393, + "learning_rate": 1.9909714239270177e-06, + "loss": 0.694, + "step": 2398 + }, + { + "epoch": 0.36, + "grad_norm": 5.427928167504524, + "learning_rate": 1.9909584667658258e-06, + "loss": 0.6849, + "step": 2399 + }, + { + "epoch": 0.36, + "grad_norm": 5.705015343842275, + "learning_rate": 1.9909455003559306e-06, + "loss": 0.6927, + "step": 2400 + }, + { + "epoch": 0.36, + "grad_norm": 3.3550374341969866, + "learning_rate": 1.990932524697454e-06, + "loss": 0.6973, + "step": 2401 + }, + { + "epoch": 0.36, + "grad_norm": 1.6529971859164365, + "learning_rate": 1.9909195397905156e-06, + "loss": 0.6966, + "step": 2402 + }, + { + "epoch": 0.36, + "grad_norm": 8.172481873794828, + "learning_rate": 1.990906545635238e-06, + "loss": 0.6836, + "step": 2403 + }, + { + "epoch": 0.36, + "grad_norm": 5.196402323169406, + "learning_rate": 1.990893542231742e-06, + "loss": 0.6784, + "step": 2404 + }, + { + "epoch": 0.36, + "grad_norm": 10.541393542638346, + "learning_rate": 1.990880529580149e-06, + "loss": 0.694, + "step": 2405 + }, + { + "epoch": 0.36, + "grad_norm": 3.2323274672313453, + "learning_rate": 1.9908675076805802e-06, + "loss": 0.6855, + "step": 2406 + }, + { + "epoch": 0.36, + "grad_norm": 9.119114121513338, + "learning_rate": 1.9908544765331576e-06, + "loss": 0.6927, + "step": 2407 + }, + { + "epoch": 0.36, + "grad_norm": 1.0068103032453524, + "learning_rate": 1.990841436138002e-06, + "loss": 0.6966, + "step": 2408 + }, + { + "epoch": 0.36, + "grad_norm": 4.124206148595375, + "learning_rate": 1.990828386495236e-06, + "loss": 0.6855, + "step": 2409 + }, + { + "epoch": 0.36, + "grad_norm": 5.0350839293117575, + "learning_rate": 1.990815327604981e-06, + "loss": 0.6895, + "step": 2410 + }, + { + "epoch": 0.36, + "grad_norm": 7.775156966703518, + "learning_rate": 1.9908022594673592e-06, + "loss": 0.6953, + "step": 2411 + }, + { + "epoch": 0.36, + "grad_norm": 3.630559836103916, + "learning_rate": 1.9907891820824917e-06, + "loss": 0.6868, + "step": 2412 + }, + { + "epoch": 0.36, + "grad_norm": 6.51708831587758, + "learning_rate": 1.9907760954505015e-06, + "loss": 0.6888, + "step": 2413 + }, + { + "epoch": 0.36, + "grad_norm": 2.638509719456267, + "learning_rate": 1.9907629995715103e-06, + "loss": 0.7077, + "step": 2414 + }, + { + "epoch": 0.36, + "grad_norm": 1.0546937761544428, + "learning_rate": 1.9907498944456403e-06, + "loss": 0.6849, + "step": 2415 + }, + { + "epoch": 0.36, + "grad_norm": 10.852517228226061, + "learning_rate": 1.990736780073014e-06, + "loss": 0.6901, + "step": 2416 + }, + { + "epoch": 0.36, + "grad_norm": 11.350049284103575, + "learning_rate": 1.990723656453754e-06, + "loss": 0.6953, + "step": 2417 + }, + { + "epoch": 0.36, + "grad_norm": 0.7407917077462464, + "learning_rate": 1.9907105235879823e-06, + "loss": 0.6842, + "step": 2418 + }, + { + "epoch": 0.36, + "grad_norm": 1.4568428471564547, + "learning_rate": 1.9906973814758216e-06, + "loss": 0.6732, + "step": 2419 + }, + { + "epoch": 0.36, + "grad_norm": 3.0980099492397213, + "learning_rate": 1.9906842301173946e-06, + "loss": 0.6862, + "step": 2420 + }, + { + "epoch": 0.36, + "grad_norm": 0.7228362506623108, + "learning_rate": 1.990671069512824e-06, + "loss": 0.6868, + "step": 2421 + }, + { + "epoch": 0.36, + "grad_norm": 3.060907661308508, + "learning_rate": 1.9906578996622333e-06, + "loss": 0.6986, + "step": 2422 + }, + { + "epoch": 0.36, + "grad_norm": 2.4026712230872165, + "learning_rate": 1.990644720565744e-06, + "loss": 0.6934, + "step": 2423 + }, + { + "epoch": 0.36, + "grad_norm": 14.030935426755939, + "learning_rate": 1.9906315322234804e-06, + "loss": 0.7135, + "step": 2424 + }, + { + "epoch": 0.36, + "grad_norm": 0.6663004120809151, + "learning_rate": 1.990618334635565e-06, + "loss": 0.6888, + "step": 2425 + }, + { + "epoch": 0.36, + "grad_norm": 9.051692268575131, + "learning_rate": 1.990605127802121e-06, + "loss": 0.6882, + "step": 2426 + }, + { + "epoch": 0.36, + "grad_norm": 1.6639793592614784, + "learning_rate": 1.9905919117232717e-06, + "loss": 0.6914, + "step": 2427 + }, + { + "epoch": 0.36, + "grad_norm": 0.8057250878916925, + "learning_rate": 1.9905786863991406e-06, + "loss": 0.6895, + "step": 2428 + }, + { + "epoch": 0.36, + "grad_norm": 5.529140140163367, + "learning_rate": 1.990565451829851e-06, + "loss": 0.6901, + "step": 2429 + }, + { + "epoch": 0.36, + "grad_norm": 5.766092312271186, + "learning_rate": 1.9905522080155264e-06, + "loss": 0.6986, + "step": 2430 + }, + { + "epoch": 0.36, + "grad_norm": 3.881884538546903, + "learning_rate": 1.9905389549562904e-06, + "loss": 0.6823, + "step": 2431 + }, + { + "epoch": 0.36, + "grad_norm": 1.9299842480295004, + "learning_rate": 1.990525692652267e-06, + "loss": 0.6667, + "step": 2432 + }, + { + "epoch": 0.36, + "grad_norm": 7.8484660996013025, + "learning_rate": 1.9905124211035793e-06, + "loss": 0.7064, + "step": 2433 + }, + { + "epoch": 0.36, + "grad_norm": 6.335072796553507, + "learning_rate": 1.990499140310352e-06, + "loss": 0.6855, + "step": 2434 + }, + { + "epoch": 0.36, + "grad_norm": 10.567434184931717, + "learning_rate": 1.990485850272708e-06, + "loss": 0.6823, + "step": 2435 + }, + { + "epoch": 0.36, + "grad_norm": 2.7489772719650274, + "learning_rate": 1.9904725509907727e-06, + "loss": 0.679, + "step": 2436 + }, + { + "epoch": 0.36, + "grad_norm": 5.414622101707826, + "learning_rate": 1.990459242464669e-06, + "loss": 0.681, + "step": 2437 + }, + { + "epoch": 0.36, + "grad_norm": 6.941909127006602, + "learning_rate": 1.990445924694522e-06, + "loss": 0.7005, + "step": 2438 + }, + { + "epoch": 0.36, + "grad_norm": 7.634478378363209, + "learning_rate": 1.9904325976804555e-06, + "loss": 0.6849, + "step": 2439 + }, + { + "epoch": 0.36, + "grad_norm": 8.100449453001142, + "learning_rate": 1.9904192614225935e-06, + "loss": 0.6784, + "step": 2440 + }, + { + "epoch": 0.36, + "grad_norm": 1.8225501584510908, + "learning_rate": 1.9904059159210615e-06, + "loss": 0.7012, + "step": 2441 + }, + { + "epoch": 0.36, + "grad_norm": 10.123599575650918, + "learning_rate": 1.9903925611759834e-06, + "loss": 0.7168, + "step": 2442 + }, + { + "epoch": 0.36, + "grad_norm": 0.9232016891829709, + "learning_rate": 1.9903791971874834e-06, + "loss": 0.6771, + "step": 2443 + }, + { + "epoch": 0.36, + "grad_norm": 3.776691509353578, + "learning_rate": 1.990365823955687e-06, + "loss": 0.6706, + "step": 2444 + }, + { + "epoch": 0.36, + "grad_norm": 6.426914068420084, + "learning_rate": 1.9903524414807198e-06, + "loss": 0.6816, + "step": 2445 + }, + { + "epoch": 0.36, + "grad_norm": 1.071395166967945, + "learning_rate": 1.990339049762705e-06, + "loss": 0.6979, + "step": 2446 + }, + { + "epoch": 0.36, + "grad_norm": 1.3358753345925634, + "learning_rate": 1.9903256488017676e-06, + "loss": 0.7109, + "step": 2447 + }, + { + "epoch": 0.37, + "grad_norm": 14.370873037547973, + "learning_rate": 1.990312238598034e-06, + "loss": 0.6927, + "step": 2448 + }, + { + "epoch": 0.37, + "grad_norm": 7.9713920644756975, + "learning_rate": 1.9902988191516285e-06, + "loss": 0.7005, + "step": 2449 + }, + { + "epoch": 0.37, + "grad_norm": 1.9564420462763736, + "learning_rate": 1.9902853904626765e-06, + "loss": 0.7057, + "step": 2450 + }, + { + "epoch": 0.37, + "grad_norm": 9.7750909577266, + "learning_rate": 1.9902719525313032e-06, + "loss": 0.7051, + "step": 2451 + }, + { + "epoch": 0.37, + "grad_norm": 1.7040609256600752, + "learning_rate": 1.9902585053576346e-06, + "loss": 0.6908, + "step": 2452 + }, + { + "epoch": 0.37, + "grad_norm": 0.45358365792047206, + "learning_rate": 1.990245048941796e-06, + "loss": 0.6803, + "step": 2453 + }, + { + "epoch": 0.37, + "grad_norm": 11.639056699397374, + "learning_rate": 1.9902315832839122e-06, + "loss": 0.709, + "step": 2454 + }, + { + "epoch": 0.37, + "grad_norm": 2.3184729448969468, + "learning_rate": 1.99021810838411e-06, + "loss": 0.6953, + "step": 2455 + }, + { + "epoch": 0.37, + "grad_norm": 1.8977666774821498, + "learning_rate": 1.9902046242425143e-06, + "loss": 0.6849, + "step": 2456 + }, + { + "epoch": 0.37, + "grad_norm": 1.8435245984188402, + "learning_rate": 1.9901911308592512e-06, + "loss": 0.6868, + "step": 2457 + }, + { + "epoch": 0.37, + "grad_norm": 1.5304279024331808, + "learning_rate": 1.9901776282344466e-06, + "loss": 0.6947, + "step": 2458 + }, + { + "epoch": 0.37, + "grad_norm": 2.155596731123696, + "learning_rate": 1.990164116368227e-06, + "loss": 0.6934, + "step": 2459 + }, + { + "epoch": 0.37, + "grad_norm": 1.2006659536436732, + "learning_rate": 1.990150595260718e-06, + "loss": 0.6803, + "step": 2460 + }, + { + "epoch": 0.37, + "grad_norm": 0.8340066213619642, + "learning_rate": 1.990137064912046e-06, + "loss": 0.6875, + "step": 2461 + }, + { + "epoch": 0.37, + "grad_norm": 4.7478393076042735, + "learning_rate": 1.990123525322337e-06, + "loss": 0.6921, + "step": 2462 + }, + { + "epoch": 0.37, + "grad_norm": 3.8443809911615485, + "learning_rate": 1.990109976491718e-06, + "loss": 0.6986, + "step": 2463 + }, + { + "epoch": 0.37, + "grad_norm": 0.9755522452975106, + "learning_rate": 1.9900964184203144e-06, + "loss": 0.6868, + "step": 2464 + }, + { + "epoch": 0.37, + "grad_norm": 2.1869806016008937, + "learning_rate": 1.9900828511082537e-06, + "loss": 0.6875, + "step": 2465 + }, + { + "epoch": 0.37, + "grad_norm": 7.779783717613751, + "learning_rate": 1.990069274555662e-06, + "loss": 0.6986, + "step": 2466 + }, + { + "epoch": 0.37, + "grad_norm": 0.6294631644937062, + "learning_rate": 1.9900556887626668e-06, + "loss": 0.6849, + "step": 2467 + }, + { + "epoch": 0.37, + "grad_norm": 4.91282053933091, + "learning_rate": 1.9900420937293937e-06, + "loss": 0.6927, + "step": 2468 + }, + { + "epoch": 0.37, + "grad_norm": 7.605068472637087, + "learning_rate": 1.9900284894559704e-06, + "loss": 0.6992, + "step": 2469 + }, + { + "epoch": 0.37, + "grad_norm": 1.1219484708551961, + "learning_rate": 1.990014875942524e-06, + "loss": 0.6764, + "step": 2470 + }, + { + "epoch": 0.37, + "grad_norm": 6.508883945726252, + "learning_rate": 1.9900012531891804e-06, + "loss": 0.6855, + "step": 2471 + }, + { + "epoch": 0.37, + "grad_norm": 8.550505027566599, + "learning_rate": 1.989987621196068e-06, + "loss": 0.6921, + "step": 2472 + }, + { + "epoch": 0.37, + "grad_norm": 1.1806279030931466, + "learning_rate": 1.989973979963314e-06, + "loss": 0.6849, + "step": 2473 + }, + { + "epoch": 0.37, + "grad_norm": 5.614877264529312, + "learning_rate": 1.9899603294910445e-06, + "loss": 0.6855, + "step": 2474 + }, + { + "epoch": 0.37, + "grad_norm": 6.343655910128949, + "learning_rate": 1.989946669779388e-06, + "loss": 0.6953, + "step": 2475 + }, + { + "epoch": 0.37, + "grad_norm": 6.544477823431974, + "learning_rate": 1.989933000828472e-06, + "loss": 0.696, + "step": 2476 + }, + { + "epoch": 0.37, + "grad_norm": 2.1820093594172985, + "learning_rate": 1.9899193226384233e-06, + "loss": 0.6927, + "step": 2477 + }, + { + "epoch": 0.37, + "grad_norm": 11.46641626006166, + "learning_rate": 1.9899056352093705e-06, + "loss": 0.7044, + "step": 2478 + }, + { + "epoch": 0.37, + "grad_norm": 4.990002386271633, + "learning_rate": 1.9898919385414404e-06, + "loss": 0.7038, + "step": 2479 + }, + { + "epoch": 0.37, + "grad_norm": 3.5314104933583175, + "learning_rate": 1.9898782326347614e-06, + "loss": 0.6927, + "step": 2480 + }, + { + "epoch": 0.37, + "grad_norm": 1.8827268593009416, + "learning_rate": 1.9898645174894615e-06, + "loss": 0.694, + "step": 2481 + }, + { + "epoch": 0.37, + "grad_norm": 2.1256577840738355, + "learning_rate": 1.9898507931056685e-06, + "loss": 0.679, + "step": 2482 + }, + { + "epoch": 0.37, + "grad_norm": 10.314393813642246, + "learning_rate": 1.9898370594835104e-06, + "loss": 0.6979, + "step": 2483 + }, + { + "epoch": 0.37, + "grad_norm": 2.3923716092039267, + "learning_rate": 1.9898233166231154e-06, + "loss": 0.6927, + "step": 2484 + }, + { + "epoch": 0.37, + "grad_norm": 2.870837542310202, + "learning_rate": 1.989809564524612e-06, + "loss": 0.6823, + "step": 2485 + }, + { + "epoch": 0.37, + "grad_norm": 6.363497887558907, + "learning_rate": 1.9897958031881283e-06, + "loss": 0.6966, + "step": 2486 + }, + { + "epoch": 0.37, + "grad_norm": 5.201004976679577, + "learning_rate": 1.989782032613793e-06, + "loss": 0.6895, + "step": 2487 + }, + { + "epoch": 0.37, + "grad_norm": 3.106759051132948, + "learning_rate": 1.9897682528017345e-06, + "loss": 0.6895, + "step": 2488 + }, + { + "epoch": 0.37, + "grad_norm": 2.552946773501974, + "learning_rate": 1.989754463752081e-06, + "loss": 0.6823, + "step": 2489 + }, + { + "epoch": 0.37, + "grad_norm": 4.790618569149767, + "learning_rate": 1.989740665464962e-06, + "loss": 0.6797, + "step": 2490 + }, + { + "epoch": 0.37, + "grad_norm": 3.4184648571678795, + "learning_rate": 1.9897268579405057e-06, + "loss": 0.694, + "step": 2491 + }, + { + "epoch": 0.37, + "grad_norm": 3.3241428204397026, + "learning_rate": 1.989713041178841e-06, + "loss": 0.6992, + "step": 2492 + }, + { + "epoch": 0.37, + "grad_norm": 2.4204896229942934, + "learning_rate": 1.989699215180097e-06, + "loss": 0.6803, + "step": 2493 + }, + { + "epoch": 0.37, + "grad_norm": 3.0625499754084213, + "learning_rate": 1.9896853799444026e-06, + "loss": 0.6738, + "step": 2494 + }, + { + "epoch": 0.37, + "grad_norm": 1.7694960925696954, + "learning_rate": 1.989671535471887e-06, + "loss": 0.6745, + "step": 2495 + }, + { + "epoch": 0.37, + "grad_norm": 1.6032849241978036, + "learning_rate": 1.98965768176268e-06, + "loss": 0.6947, + "step": 2496 + }, + { + "epoch": 0.37, + "grad_norm": 8.02787621920832, + "learning_rate": 1.98964381881691e-06, + "loss": 0.6986, + "step": 2497 + }, + { + "epoch": 0.37, + "grad_norm": 7.319470313016433, + "learning_rate": 1.9896299466347063e-06, + "loss": 0.694, + "step": 2498 + }, + { + "epoch": 0.37, + "grad_norm": 12.087994119514148, + "learning_rate": 1.989616065216199e-06, + "loss": 0.7305, + "step": 2499 + }, + { + "epoch": 0.37, + "grad_norm": 4.390122342368475, + "learning_rate": 1.9896021745615176e-06, + "loss": 0.7109, + "step": 2500 + }, + { + "epoch": 0.37, + "grad_norm": 13.870564587983292, + "learning_rate": 1.9895882746707917e-06, + "loss": 0.7031, + "step": 2501 + }, + { + "epoch": 0.37, + "grad_norm": 2.30563019271204, + "learning_rate": 1.9895743655441505e-06, + "loss": 0.6855, + "step": 2502 + }, + { + "epoch": 0.37, + "grad_norm": 2.0206765481800906, + "learning_rate": 1.9895604471817247e-06, + "loss": 0.6882, + "step": 2503 + }, + { + "epoch": 0.37, + "grad_norm": 2.524134590533019, + "learning_rate": 1.9895465195836434e-06, + "loss": 0.6882, + "step": 2504 + }, + { + "epoch": 0.37, + "grad_norm": 4.484072501908744, + "learning_rate": 1.989532582750037e-06, + "loss": 0.6934, + "step": 2505 + }, + { + "epoch": 0.37, + "grad_norm": 4.405899945420772, + "learning_rate": 1.9895186366810357e-06, + "loss": 0.6921, + "step": 2506 + }, + { + "epoch": 0.37, + "grad_norm": 4.988694858722698, + "learning_rate": 1.9895046813767693e-06, + "loss": 0.6888, + "step": 2507 + }, + { + "epoch": 0.37, + "grad_norm": 5.405451511544117, + "learning_rate": 1.989490716837368e-06, + "loss": 0.6986, + "step": 2508 + }, + { + "epoch": 0.37, + "grad_norm": 8.13300120815804, + "learning_rate": 1.9894767430629627e-06, + "loss": 0.7018, + "step": 2509 + }, + { + "epoch": 0.37, + "grad_norm": 7.278927858476665, + "learning_rate": 1.9894627600536834e-06, + "loss": 0.7077, + "step": 2510 + }, + { + "epoch": 0.37, + "grad_norm": 1.2163170224776274, + "learning_rate": 1.9894487678096605e-06, + "loss": 0.6914, + "step": 2511 + }, + { + "epoch": 0.37, + "grad_norm": 7.9633531091308205, + "learning_rate": 1.989434766331025e-06, + "loss": 0.6934, + "step": 2512 + }, + { + "epoch": 0.37, + "grad_norm": 10.23942815388428, + "learning_rate": 1.9894207556179067e-06, + "loss": 0.7025, + "step": 2513 + }, + { + "epoch": 0.37, + "grad_norm": 3.1559010983803857, + "learning_rate": 1.9894067356704375e-06, + "loss": 0.6901, + "step": 2514 + }, + { + "epoch": 0.38, + "grad_norm": 2.309370961094347, + "learning_rate": 1.9893927064887477e-06, + "loss": 0.6862, + "step": 2515 + }, + { + "epoch": 0.38, + "grad_norm": 1.2612793501344064, + "learning_rate": 1.989378668072968e-06, + "loss": 0.6901, + "step": 2516 + }, + { + "epoch": 0.38, + "grad_norm": 1.7499554328727192, + "learning_rate": 1.98936462042323e-06, + "loss": 0.6979, + "step": 2517 + }, + { + "epoch": 0.38, + "grad_norm": 4.068414677793844, + "learning_rate": 1.9893505635396645e-06, + "loss": 0.6973, + "step": 2518 + }, + { + "epoch": 0.38, + "grad_norm": 9.301527112922328, + "learning_rate": 1.9893364974224025e-06, + "loss": 0.6973, + "step": 2519 + }, + { + "epoch": 0.38, + "grad_norm": 1.0749205207714458, + "learning_rate": 1.989322422071576e-06, + "loss": 0.6882, + "step": 2520 + }, + { + "epoch": 0.38, + "grad_norm": 2.8923598243039756, + "learning_rate": 1.9893083374873152e-06, + "loss": 0.6758, + "step": 2521 + }, + { + "epoch": 0.38, + "grad_norm": 4.200197691260833, + "learning_rate": 1.9892942436697525e-06, + "loss": 0.6986, + "step": 2522 + }, + { + "epoch": 0.38, + "grad_norm": 6.191720537369071, + "learning_rate": 1.989280140619019e-06, + "loss": 0.7025, + "step": 2523 + }, + { + "epoch": 0.38, + "grad_norm": 7.105171613913576, + "learning_rate": 1.9892660283352465e-06, + "loss": 0.6973, + "step": 2524 + }, + { + "epoch": 0.38, + "grad_norm": 5.501943415268559, + "learning_rate": 1.9892519068185667e-06, + "loss": 0.6986, + "step": 2525 + }, + { + "epoch": 0.38, + "grad_norm": 7.118973312549695, + "learning_rate": 1.9892377760691113e-06, + "loss": 0.6999, + "step": 2526 + }, + { + "epoch": 0.38, + "grad_norm": 1.0712739832200344, + "learning_rate": 1.9892236360870124e-06, + "loss": 0.6868, + "step": 2527 + }, + { + "epoch": 0.38, + "grad_norm": 8.326682877995164, + "learning_rate": 1.989209486872402e-06, + "loss": 0.6855, + "step": 2528 + }, + { + "epoch": 0.38, + "grad_norm": 3.184204564768249, + "learning_rate": 1.9891953284254116e-06, + "loss": 0.6914, + "step": 2529 + }, + { + "epoch": 0.38, + "grad_norm": 6.228720731038672, + "learning_rate": 1.9891811607461736e-06, + "loss": 0.696, + "step": 2530 + }, + { + "epoch": 0.38, + "grad_norm": 1.0880970320917565, + "learning_rate": 1.9891669838348206e-06, + "loss": 0.6914, + "step": 2531 + }, + { + "epoch": 0.38, + "grad_norm": 7.658612568081262, + "learning_rate": 1.9891527976914844e-06, + "loss": 0.6973, + "step": 2532 + }, + { + "epoch": 0.38, + "grad_norm": 1.4229572433802278, + "learning_rate": 1.989138602316298e-06, + "loss": 0.6882, + "step": 2533 + }, + { + "epoch": 0.38, + "grad_norm": 7.746283910362651, + "learning_rate": 1.9891243977093935e-06, + "loss": 0.7051, + "step": 2534 + }, + { + "epoch": 0.38, + "grad_norm": 8.128162435232586, + "learning_rate": 1.9891101838709035e-06, + "loss": 0.696, + "step": 2535 + }, + { + "epoch": 0.38, + "grad_norm": 4.808877845251727, + "learning_rate": 1.9890959608009606e-06, + "loss": 0.6816, + "step": 2536 + }, + { + "epoch": 0.38, + "grad_norm": 3.3961652390914865, + "learning_rate": 1.9890817284996977e-06, + "loss": 0.6901, + "step": 2537 + }, + { + "epoch": 0.38, + "grad_norm": 2.044491323522878, + "learning_rate": 1.989067486967248e-06, + "loss": 0.6914, + "step": 2538 + }, + { + "epoch": 0.38, + "grad_norm": 0.5782453501324029, + "learning_rate": 1.9890532362037435e-06, + "loss": 0.6868, + "step": 2539 + }, + { + "epoch": 0.38, + "grad_norm": 1.570686520973919, + "learning_rate": 1.989038976209318e-06, + "loss": 0.6862, + "step": 2540 + }, + { + "epoch": 0.38, + "grad_norm": 5.339204114613456, + "learning_rate": 1.989024706984103e-06, + "loss": 0.681, + "step": 2541 + }, + { + "epoch": 0.38, + "grad_norm": 0.9403896968106258, + "learning_rate": 1.9890104285282344e-06, + "loss": 0.6979, + "step": 2542 + }, + { + "epoch": 0.38, + "grad_norm": 7.224262089170245, + "learning_rate": 1.9889961408418435e-06, + "loss": 0.679, + "step": 2543 + }, + { + "epoch": 0.38, + "grad_norm": 0.966427092640978, + "learning_rate": 1.9889818439250644e-06, + "loss": 0.6888, + "step": 2544 + }, + { + "epoch": 0.38, + "grad_norm": 2.577174362911111, + "learning_rate": 1.9889675377780296e-06, + "loss": 0.6986, + "step": 2545 + }, + { + "epoch": 0.38, + "grad_norm": 6.396374091397961, + "learning_rate": 1.988953222400874e-06, + "loss": 0.6771, + "step": 2546 + }, + { + "epoch": 0.38, + "grad_norm": 2.3447197087197975, + "learning_rate": 1.98893889779373e-06, + "loss": 0.6816, + "step": 2547 + }, + { + "epoch": 0.38, + "grad_norm": 5.160639552481637, + "learning_rate": 1.988924563956732e-06, + "loss": 0.6895, + "step": 2548 + }, + { + "epoch": 0.38, + "grad_norm": 5.08270093829534, + "learning_rate": 1.9889102208900143e-06, + "loss": 0.6797, + "step": 2549 + }, + { + "epoch": 0.38, + "grad_norm": 2.9904869476129323, + "learning_rate": 1.988895868593709e-06, + "loss": 0.6973, + "step": 2550 + }, + { + "epoch": 0.38, + "grad_norm": 3.360214720341294, + "learning_rate": 1.9888815070679516e-06, + "loss": 0.7051, + "step": 2551 + }, + { + "epoch": 0.38, + "grad_norm": 5.003682656156149, + "learning_rate": 1.988867136312876e-06, + "loss": 0.6862, + "step": 2552 + }, + { + "epoch": 0.38, + "grad_norm": 3.3609459694407446, + "learning_rate": 1.988852756328615e-06, + "loss": 0.6855, + "step": 2553 + }, + { + "epoch": 0.38, + "grad_norm": 0.5091255388247373, + "learning_rate": 1.9888383671153048e-06, + "loss": 0.6842, + "step": 2554 + }, + { + "epoch": 0.38, + "grad_norm": 1.5271510329536506, + "learning_rate": 1.9888239686730783e-06, + "loss": 0.7044, + "step": 2555 + }, + { + "epoch": 0.38, + "grad_norm": 3.6557364583367375, + "learning_rate": 1.98880956100207e-06, + "loss": 0.6849, + "step": 2556 + }, + { + "epoch": 0.38, + "grad_norm": 4.474382556949493, + "learning_rate": 1.988795144102415e-06, + "loss": 0.6947, + "step": 2557 + }, + { + "epoch": 0.38, + "grad_norm": 6.861421726870536, + "learning_rate": 1.988780717974247e-06, + "loss": 0.6966, + "step": 2558 + }, + { + "epoch": 0.38, + "grad_norm": 7.383612523508205, + "learning_rate": 1.9887662826177015e-06, + "loss": 0.6836, + "step": 2559 + }, + { + "epoch": 0.38, + "grad_norm": 5.297961513192983, + "learning_rate": 1.9887518380329126e-06, + "loss": 0.6966, + "step": 2560 + }, + { + "epoch": 0.38, + "grad_norm": 8.553644615588608, + "learning_rate": 1.9887373842200153e-06, + "loss": 0.6921, + "step": 2561 + }, + { + "epoch": 0.38, + "grad_norm": 2.343507402624377, + "learning_rate": 1.988722921179145e-06, + "loss": 0.6992, + "step": 2562 + }, + { + "epoch": 0.38, + "grad_norm": 1.2911529929713519, + "learning_rate": 1.9887084489104358e-06, + "loss": 0.6836, + "step": 2563 + }, + { + "epoch": 0.38, + "grad_norm": 1.3729143338718164, + "learning_rate": 1.9886939674140237e-06, + "loss": 0.6725, + "step": 2564 + }, + { + "epoch": 0.38, + "grad_norm": 9.698378273947798, + "learning_rate": 1.988679476690043e-06, + "loss": 0.7116, + "step": 2565 + }, + { + "epoch": 0.38, + "grad_norm": 11.355851502930417, + "learning_rate": 1.988664976738629e-06, + "loss": 0.7285, + "step": 2566 + }, + { + "epoch": 0.38, + "grad_norm": 2.887997694497216, + "learning_rate": 1.9886504675599177e-06, + "loss": 0.6927, + "step": 2567 + }, + { + "epoch": 0.38, + "grad_norm": 8.362733555006235, + "learning_rate": 1.988635949154044e-06, + "loss": 0.6921, + "step": 2568 + }, + { + "epoch": 0.38, + "grad_norm": 4.567894301704592, + "learning_rate": 1.988621421521144e-06, + "loss": 0.6823, + "step": 2569 + }, + { + "epoch": 0.38, + "grad_norm": 1.1918692219291867, + "learning_rate": 1.988606884661352e-06, + "loss": 0.6745, + "step": 2570 + }, + { + "epoch": 0.38, + "grad_norm": 1.1180804755172373, + "learning_rate": 1.9885923385748057e-06, + "loss": 0.696, + "step": 2571 + }, + { + "epoch": 0.38, + "grad_norm": 0.7959646298185701, + "learning_rate": 1.988577783261639e-06, + "loss": 0.6882, + "step": 2572 + }, + { + "epoch": 0.38, + "grad_norm": 4.972592171104218, + "learning_rate": 1.988563218721988e-06, + "loss": 0.694, + "step": 2573 + }, + { + "epoch": 0.38, + "grad_norm": 4.367959554050634, + "learning_rate": 1.9885486449559897e-06, + "loss": 0.6823, + "step": 2574 + }, + { + "epoch": 0.38, + "grad_norm": 1.6260904374768683, + "learning_rate": 1.9885340619637794e-06, + "loss": 0.6771, + "step": 2575 + }, + { + "epoch": 0.38, + "grad_norm": 3.8123565044747347, + "learning_rate": 1.988519469745493e-06, + "loss": 0.7077, + "step": 2576 + }, + { + "epoch": 0.38, + "grad_norm": 8.201085954711838, + "learning_rate": 1.988504868301267e-06, + "loss": 0.6927, + "step": 2577 + }, + { + "epoch": 0.38, + "grad_norm": 6.833124187569013, + "learning_rate": 1.988490257631238e-06, + "loss": 0.6862, + "step": 2578 + }, + { + "epoch": 0.38, + "grad_norm": 4.203629904482312, + "learning_rate": 1.988475637735542e-06, + "loss": 0.6777, + "step": 2579 + }, + { + "epoch": 0.38, + "grad_norm": 1.377624226742618, + "learning_rate": 1.988461008614315e-06, + "loss": 0.6921, + "step": 2580 + }, + { + "epoch": 0.38, + "grad_norm": 6.419926700781408, + "learning_rate": 1.9884463702676943e-06, + "loss": 0.6986, + "step": 2581 + }, + { + "epoch": 0.39, + "grad_norm": 11.923988936200002, + "learning_rate": 1.9884317226958164e-06, + "loss": 0.694, + "step": 2582 + }, + { + "epoch": 0.39, + "grad_norm": 5.134002478786731, + "learning_rate": 1.9884170658988174e-06, + "loss": 0.6882, + "step": 2583 + }, + { + "epoch": 0.39, + "grad_norm": 0.8001396234115418, + "learning_rate": 1.9884023998768348e-06, + "loss": 0.6934, + "step": 2584 + }, + { + "epoch": 0.39, + "grad_norm": 3.9730952581815666, + "learning_rate": 1.9883877246300047e-06, + "loss": 0.6745, + "step": 2585 + }, + { + "epoch": 0.39, + "grad_norm": 0.6431368729716742, + "learning_rate": 1.988373040158465e-06, + "loss": 0.6934, + "step": 2586 + }, + { + "epoch": 0.39, + "grad_norm": 0.9782081261658017, + "learning_rate": 1.9883583464623523e-06, + "loss": 0.6947, + "step": 2587 + }, + { + "epoch": 0.39, + "grad_norm": 4.505876216294109, + "learning_rate": 1.988343643541804e-06, + "loss": 0.6868, + "step": 2588 + }, + { + "epoch": 0.39, + "grad_norm": 4.175055979481037, + "learning_rate": 1.9883289313969566e-06, + "loss": 0.6758, + "step": 2589 + }, + { + "epoch": 0.39, + "grad_norm": 3.8637605272501063, + "learning_rate": 1.988314210027948e-06, + "loss": 0.6953, + "step": 2590 + }, + { + "epoch": 0.39, + "grad_norm": 1.6215259331134593, + "learning_rate": 1.9882994794349154e-06, + "loss": 0.6771, + "step": 2591 + }, + { + "epoch": 0.39, + "grad_norm": 6.100364727793992, + "learning_rate": 1.988284739617997e-06, + "loss": 0.7044, + "step": 2592 + }, + { + "epoch": 0.39, + "grad_norm": 3.279948491369992, + "learning_rate": 1.9882699905773286e-06, + "loss": 0.6888, + "step": 2593 + }, + { + "epoch": 0.39, + "grad_norm": 1.3428433801793462, + "learning_rate": 1.9882552323130497e-06, + "loss": 0.6862, + "step": 2594 + }, + { + "epoch": 0.39, + "grad_norm": 5.3523923641673905, + "learning_rate": 1.9882404648252974e-06, + "loss": 0.6829, + "step": 2595 + }, + { + "epoch": 0.39, + "grad_norm": 5.423684344977644, + "learning_rate": 1.988225688114209e-06, + "loss": 0.7044, + "step": 2596 + }, + { + "epoch": 0.39, + "grad_norm": 10.843085852199613, + "learning_rate": 1.988210902179923e-06, + "loss": 0.6979, + "step": 2597 + }, + { + "epoch": 0.39, + "grad_norm": 6.103747858253418, + "learning_rate": 1.9881961070225775e-06, + "loss": 0.6986, + "step": 2598 + }, + { + "epoch": 0.39, + "grad_norm": 9.929580848768335, + "learning_rate": 1.98818130264231e-06, + "loss": 0.707, + "step": 2599 + }, + { + "epoch": 0.39, + "grad_norm": 0.9383082321797218, + "learning_rate": 1.9881664890392597e-06, + "loss": 0.6895, + "step": 2600 + }, + { + "epoch": 0.39, + "grad_norm": 1.7627893381612147, + "learning_rate": 1.9881516662135634e-06, + "loss": 0.679, + "step": 2601 + }, + { + "epoch": 0.39, + "grad_norm": 0.5407811695520812, + "learning_rate": 1.988136834165361e-06, + "loss": 0.6927, + "step": 2602 + }, + { + "epoch": 0.39, + "grad_norm": 8.058421154029782, + "learning_rate": 1.9881219928947893e-06, + "loss": 0.6784, + "step": 2603 + }, + { + "epoch": 0.39, + "grad_norm": 2.655288368205597, + "learning_rate": 1.988107142401988e-06, + "loss": 0.6595, + "step": 2604 + }, + { + "epoch": 0.39, + "grad_norm": 5.142414745754062, + "learning_rate": 1.9880922826870956e-06, + "loss": 0.7116, + "step": 2605 + }, + { + "epoch": 0.39, + "grad_norm": 6.5233586964609245, + "learning_rate": 1.98807741375025e-06, + "loss": 0.6921, + "step": 2606 + }, + { + "epoch": 0.39, + "grad_norm": 0.5555740652791237, + "learning_rate": 1.988062535591591e-06, + "loss": 0.6875, + "step": 2607 + }, + { + "epoch": 0.39, + "grad_norm": 9.918239358004728, + "learning_rate": 1.9880476482112563e-06, + "loss": 0.7077, + "step": 2608 + }, + { + "epoch": 0.39, + "grad_norm": 9.690519138243026, + "learning_rate": 1.988032751609386e-06, + "loss": 0.7129, + "step": 2609 + }, + { + "epoch": 0.39, + "grad_norm": 1.3445249332257476, + "learning_rate": 1.988017845786119e-06, + "loss": 0.6862, + "step": 2610 + }, + { + "epoch": 0.39, + "grad_norm": 5.71887353710108, + "learning_rate": 1.9880029307415935e-06, + "loss": 0.6921, + "step": 2611 + }, + { + "epoch": 0.39, + "grad_norm": 2.712302915036871, + "learning_rate": 1.9879880064759497e-06, + "loss": 0.6862, + "step": 2612 + }, + { + "epoch": 0.39, + "grad_norm": 5.267992196909362, + "learning_rate": 1.9879730729893264e-06, + "loss": 0.6914, + "step": 2613 + }, + { + "epoch": 0.39, + "grad_norm": 3.2916644086429314, + "learning_rate": 1.9879581302818627e-06, + "loss": 0.6966, + "step": 2614 + }, + { + "epoch": 0.39, + "grad_norm": 8.838531717493355, + "learning_rate": 1.9879431783536983e-06, + "loss": 0.7005, + "step": 2615 + }, + { + "epoch": 0.39, + "grad_norm": 5.894343176095047, + "learning_rate": 1.9879282172049733e-06, + "loss": 0.6849, + "step": 2616 + }, + { + "epoch": 0.39, + "grad_norm": 0.8670915889466251, + "learning_rate": 1.9879132468358267e-06, + "loss": 0.6882, + "step": 2617 + }, + { + "epoch": 0.39, + "grad_norm": 3.0952748508773924, + "learning_rate": 1.9878982672463987e-06, + "loss": 0.7025, + "step": 2618 + }, + { + "epoch": 0.39, + "grad_norm": 8.07685596855473, + "learning_rate": 1.987883278436828e-06, + "loss": 0.6947, + "step": 2619 + }, + { + "epoch": 0.39, + "grad_norm": 7.756715606257146, + "learning_rate": 1.987868280407256e-06, + "loss": 0.7018, + "step": 2620 + }, + { + "epoch": 0.39, + "grad_norm": 3.085994334046859, + "learning_rate": 1.987853273157822e-06, + "loss": 0.6921, + "step": 2621 + }, + { + "epoch": 0.39, + "grad_norm": 5.4663661541679724, + "learning_rate": 1.987838256688666e-06, + "loss": 0.6823, + "step": 2622 + }, + { + "epoch": 0.39, + "grad_norm": 4.248598407849384, + "learning_rate": 1.987823230999928e-06, + "loss": 0.6855, + "step": 2623 + }, + { + "epoch": 0.39, + "grad_norm": 0.5880769367801058, + "learning_rate": 1.9878081960917484e-06, + "loss": 0.6849, + "step": 2624 + }, + { + "epoch": 0.39, + "grad_norm": 0.6667717391086113, + "learning_rate": 1.9877931519642675e-06, + "loss": 0.6921, + "step": 2625 + }, + { + "epoch": 0.39, + "grad_norm": 9.477423518412438, + "learning_rate": 1.987778098617626e-06, + "loss": 0.6868, + "step": 2626 + }, + { + "epoch": 0.39, + "grad_norm": 3.0616554454686193, + "learning_rate": 1.9877630360519638e-06, + "loss": 0.6862, + "step": 2627 + }, + { + "epoch": 0.39, + "grad_norm": 1.9676764262665398, + "learning_rate": 1.9877479642674222e-06, + "loss": 0.6862, + "step": 2628 + }, + { + "epoch": 0.39, + "grad_norm": 0.9600444396355154, + "learning_rate": 1.9877328832641413e-06, + "loss": 0.6986, + "step": 2629 + }, + { + "epoch": 0.39, + "grad_norm": 3.5349827435882775, + "learning_rate": 1.987717793042262e-06, + "loss": 0.6986, + "step": 2630 + }, + { + "epoch": 0.39, + "grad_norm": 1.634228599136488, + "learning_rate": 1.9877026936019253e-06, + "loss": 0.6764, + "step": 2631 + }, + { + "epoch": 0.39, + "grad_norm": 1.5835123089100809, + "learning_rate": 1.9876875849432723e-06, + "loss": 0.6758, + "step": 2632 + }, + { + "epoch": 0.39, + "grad_norm": 0.6192342819281551, + "learning_rate": 1.987672467066443e-06, + "loss": 0.6992, + "step": 2633 + }, + { + "epoch": 0.39, + "grad_norm": 2.5564616991128437, + "learning_rate": 1.98765733997158e-06, + "loss": 0.6745, + "step": 2634 + }, + { + "epoch": 0.39, + "grad_norm": 1.77237579643837, + "learning_rate": 1.987642203658823e-06, + "loss": 0.6934, + "step": 2635 + }, + { + "epoch": 0.39, + "grad_norm": 3.998979745733603, + "learning_rate": 1.9876270581283144e-06, + "loss": 0.6797, + "step": 2636 + }, + { + "epoch": 0.39, + "grad_norm": 1.4863051061961812, + "learning_rate": 1.9876119033801953e-06, + "loss": 0.6921, + "step": 2637 + }, + { + "epoch": 0.39, + "grad_norm": 1.995896910907803, + "learning_rate": 1.9875967394146065e-06, + "loss": 0.6966, + "step": 2638 + }, + { + "epoch": 0.39, + "grad_norm": 1.6201819163698368, + "learning_rate": 1.9875815662316903e-06, + "loss": 0.6973, + "step": 2639 + }, + { + "epoch": 0.39, + "grad_norm": 6.7264928557656285, + "learning_rate": 1.987566383831588e-06, + "loss": 0.6986, + "step": 2640 + }, + { + "epoch": 0.39, + "grad_norm": 1.5738890968150132, + "learning_rate": 1.987551192214441e-06, + "loss": 0.6875, + "step": 2641 + }, + { + "epoch": 0.39, + "grad_norm": 10.141881969545281, + "learning_rate": 1.9875359913803917e-06, + "loss": 0.6921, + "step": 2642 + }, + { + "epoch": 0.39, + "grad_norm": 1.2367747412108219, + "learning_rate": 1.9875207813295817e-06, + "loss": 0.6816, + "step": 2643 + }, + { + "epoch": 0.39, + "grad_norm": 1.8032678648765017, + "learning_rate": 1.9875055620621524e-06, + "loss": 0.6934, + "step": 2644 + }, + { + "epoch": 0.39, + "grad_norm": 2.2559552551390594, + "learning_rate": 1.987490333578247e-06, + "loss": 0.6927, + "step": 2645 + }, + { + "epoch": 0.39, + "grad_norm": 4.886487870325657, + "learning_rate": 1.987475095878007e-06, + "loss": 0.6693, + "step": 2646 + }, + { + "epoch": 0.39, + "grad_norm": 6.619991433800253, + "learning_rate": 1.9874598489615743e-06, + "loss": 0.6966, + "step": 2647 + }, + { + "epoch": 0.39, + "grad_norm": 2.2365361266484736, + "learning_rate": 1.9874445928290913e-06, + "loss": 0.6875, + "step": 2648 + }, + { + "epoch": 0.4, + "grad_norm": 0.7580965757838912, + "learning_rate": 1.987429327480701e-06, + "loss": 0.6842, + "step": 2649 + }, + { + "epoch": 0.4, + "grad_norm": 2.6546765680032807, + "learning_rate": 1.987414052916545e-06, + "loss": 0.6882, + "step": 2650 + }, + { + "epoch": 0.4, + "grad_norm": 3.9670806851383773, + "learning_rate": 1.9873987691367665e-06, + "loss": 0.6921, + "step": 2651 + }, + { + "epoch": 0.4, + "grad_norm": 9.321901899800652, + "learning_rate": 1.9873834761415083e-06, + "loss": 0.7077, + "step": 2652 + }, + { + "epoch": 0.4, + "grad_norm": 1.3708037430503193, + "learning_rate": 1.9873681739309124e-06, + "loss": 0.6836, + "step": 2653 + }, + { + "epoch": 0.4, + "grad_norm": 3.727222950206262, + "learning_rate": 1.9873528625051223e-06, + "loss": 0.6947, + "step": 2654 + }, + { + "epoch": 0.4, + "grad_norm": 2.9121041766046236, + "learning_rate": 1.9873375418642804e-06, + "loss": 0.6979, + "step": 2655 + }, + { + "epoch": 0.4, + "grad_norm": 1.0180305955746314, + "learning_rate": 1.98732221200853e-06, + "loss": 0.6914, + "step": 2656 + }, + { + "epoch": 0.4, + "grad_norm": 1.6835117834728306, + "learning_rate": 1.987306872938014e-06, + "loss": 0.6973, + "step": 2657 + }, + { + "epoch": 0.4, + "grad_norm": 1.3716689197652405, + "learning_rate": 1.987291524652876e-06, + "loss": 0.6875, + "step": 2658 + }, + { + "epoch": 0.4, + "grad_norm": 2.255876161675588, + "learning_rate": 1.9872761671532584e-06, + "loss": 0.6758, + "step": 2659 + }, + { + "epoch": 0.4, + "grad_norm": 1.2280988959835852, + "learning_rate": 1.9872608004393053e-06, + "loss": 0.6888, + "step": 2660 + }, + { + "epoch": 0.4, + "grad_norm": 1.3142767733801897, + "learning_rate": 1.9872454245111596e-06, + "loss": 0.6868, + "step": 2661 + }, + { + "epoch": 0.4, + "grad_norm": 6.6370487876779, + "learning_rate": 1.9872300393689655e-06, + "loss": 0.7012, + "step": 2662 + }, + { + "epoch": 0.4, + "grad_norm": 4.820782695693171, + "learning_rate": 1.987214645012866e-06, + "loss": 0.6764, + "step": 2663 + }, + { + "epoch": 0.4, + "grad_norm": 3.674497908384114, + "learning_rate": 1.9871992414430045e-06, + "loss": 0.6771, + "step": 2664 + }, + { + "epoch": 0.4, + "grad_norm": 3.1048942397401436, + "learning_rate": 1.9871838286595253e-06, + "loss": 0.694, + "step": 2665 + }, + { + "epoch": 0.4, + "grad_norm": 0.9519413198965808, + "learning_rate": 1.9871684066625726e-06, + "loss": 0.6966, + "step": 2666 + }, + { + "epoch": 0.4, + "grad_norm": 3.4936300647417395, + "learning_rate": 1.9871529754522893e-06, + "loss": 0.6914, + "step": 2667 + }, + { + "epoch": 0.4, + "grad_norm": 2.345638438193076, + "learning_rate": 1.98713753502882e-06, + "loss": 0.6986, + "step": 2668 + }, + { + "epoch": 0.4, + "grad_norm": 4.3720812098801005, + "learning_rate": 1.987122085392309e-06, + "loss": 0.7031, + "step": 2669 + }, + { + "epoch": 0.4, + "grad_norm": 2.0391489284839466, + "learning_rate": 1.9871066265429004e-06, + "loss": 0.6829, + "step": 2670 + }, + { + "epoch": 0.4, + "grad_norm": 2.5045503921982175, + "learning_rate": 1.9870911584807385e-06, + "loss": 0.6953, + "step": 2671 + }, + { + "epoch": 0.4, + "grad_norm": 1.7026557752962024, + "learning_rate": 1.987075681205967e-06, + "loss": 0.6875, + "step": 2672 + }, + { + "epoch": 0.4, + "grad_norm": 2.268882718826785, + "learning_rate": 1.9870601947187314e-06, + "loss": 0.679, + "step": 2673 + }, + { + "epoch": 0.4, + "grad_norm": 4.626731427009179, + "learning_rate": 1.9870446990191754e-06, + "loss": 0.6875, + "step": 2674 + }, + { + "epoch": 0.4, + "grad_norm": 1.2504530052121015, + "learning_rate": 1.987029194107444e-06, + "loss": 0.6862, + "step": 2675 + }, + { + "epoch": 0.4, + "grad_norm": 8.594838894076204, + "learning_rate": 1.9870136799836818e-06, + "loss": 0.6693, + "step": 2676 + }, + { + "epoch": 0.4, + "grad_norm": 3.878473007558301, + "learning_rate": 1.9869981566480334e-06, + "loss": 0.7096, + "step": 2677 + }, + { + "epoch": 0.4, + "grad_norm": 2.9157385946034737, + "learning_rate": 1.9869826241006443e-06, + "loss": 0.6908, + "step": 2678 + }, + { + "epoch": 0.4, + "grad_norm": 0.5754086686427042, + "learning_rate": 1.9869670823416592e-06, + "loss": 0.6986, + "step": 2679 + }, + { + "epoch": 0.4, + "grad_norm": 1.8290270271324258, + "learning_rate": 1.9869515313712226e-06, + "loss": 0.681, + "step": 2680 + }, + { + "epoch": 0.4, + "grad_norm": 1.748220198018641, + "learning_rate": 1.98693597118948e-06, + "loss": 0.6803, + "step": 2681 + }, + { + "epoch": 0.4, + "grad_norm": 0.6268594156246572, + "learning_rate": 1.986920401796577e-06, + "loss": 0.6712, + "step": 2682 + }, + { + "epoch": 0.4, + "grad_norm": 2.7102154609454012, + "learning_rate": 1.9869048231926587e-06, + "loss": 0.6823, + "step": 2683 + }, + { + "epoch": 0.4, + "grad_norm": 6.385010573931936, + "learning_rate": 1.9868892353778707e-06, + "loss": 0.6862, + "step": 2684 + }, + { + "epoch": 0.4, + "grad_norm": 7.0377772262049465, + "learning_rate": 1.9868736383523575e-06, + "loss": 0.6921, + "step": 2685 + }, + { + "epoch": 0.4, + "grad_norm": 4.679484417286736, + "learning_rate": 1.9868580321162656e-06, + "loss": 0.6882, + "step": 2686 + }, + { + "epoch": 0.4, + "grad_norm": 3.07939731224141, + "learning_rate": 1.986842416669741e-06, + "loss": 0.6868, + "step": 2687 + }, + { + "epoch": 0.4, + "grad_norm": 4.112492586476916, + "learning_rate": 1.986826792012928e-06, + "loss": 0.681, + "step": 2688 + }, + { + "epoch": 0.4, + "grad_norm": 2.3441906884793293, + "learning_rate": 1.9868111581459736e-06, + "loss": 0.6895, + "step": 2689 + }, + { + "epoch": 0.4, + "grad_norm": 0.6846142220796325, + "learning_rate": 1.986795515069023e-06, + "loss": 0.6979, + "step": 2690 + }, + { + "epoch": 0.4, + "grad_norm": 4.0017926520403515, + "learning_rate": 1.9867798627822233e-06, + "loss": 0.7031, + "step": 2691 + }, + { + "epoch": 0.4, + "grad_norm": 3.9718905950482846, + "learning_rate": 1.9867642012857197e-06, + "loss": 0.6641, + "step": 2692 + }, + { + "epoch": 0.4, + "grad_norm": 2.8997910096511665, + "learning_rate": 1.986748530579658e-06, + "loss": 0.6888, + "step": 2693 + }, + { + "epoch": 0.4, + "grad_norm": 0.7191583375045364, + "learning_rate": 1.9867328506641856e-06, + "loss": 0.7018, + "step": 2694 + }, + { + "epoch": 0.4, + "grad_norm": 3.219389891542809, + "learning_rate": 1.986717161539448e-06, + "loss": 0.6901, + "step": 2695 + }, + { + "epoch": 0.4, + "grad_norm": 6.552618363196877, + "learning_rate": 1.986701463205592e-06, + "loss": 0.6934, + "step": 2696 + }, + { + "epoch": 0.4, + "grad_norm": 5.26478701861701, + "learning_rate": 1.986685755662764e-06, + "loss": 0.7044, + "step": 2697 + }, + { + "epoch": 0.4, + "grad_norm": 7.42749643515195, + "learning_rate": 1.9866700389111105e-06, + "loss": 0.6888, + "step": 2698 + }, + { + "epoch": 0.4, + "grad_norm": 4.367789698522069, + "learning_rate": 1.9866543129507782e-06, + "loss": 0.6732, + "step": 2699 + }, + { + "epoch": 0.4, + "grad_norm": 9.28198069911321, + "learning_rate": 1.9866385777819137e-06, + "loss": 0.6914, + "step": 2700 + }, + { + "epoch": 0.4, + "grad_norm": 2.311004814212521, + "learning_rate": 1.9866228334046644e-06, + "loss": 0.6999, + "step": 2701 + }, + { + "epoch": 0.4, + "grad_norm": 3.5541657533982574, + "learning_rate": 1.986607079819177e-06, + "loss": 0.6862, + "step": 2702 + }, + { + "epoch": 0.4, + "grad_norm": 0.757408733355727, + "learning_rate": 1.9865913170255985e-06, + "loss": 0.679, + "step": 2703 + }, + { + "epoch": 0.4, + "grad_norm": 1.8874385504836464, + "learning_rate": 1.986575545024076e-06, + "loss": 0.6823, + "step": 2704 + }, + { + "epoch": 0.4, + "grad_norm": 1.8113989763619143, + "learning_rate": 1.9865597638147563e-06, + "loss": 0.681, + "step": 2705 + }, + { + "epoch": 0.4, + "grad_norm": 3.6553560509082157, + "learning_rate": 1.9865439733977875e-06, + "loss": 0.6836, + "step": 2706 + }, + { + "epoch": 0.4, + "grad_norm": 0.8354153736449231, + "learning_rate": 1.9865281737733165e-06, + "loss": 0.6973, + "step": 2707 + }, + { + "epoch": 0.4, + "grad_norm": 1.0546773919768007, + "learning_rate": 1.9865123649414906e-06, + "loss": 0.6914, + "step": 2708 + }, + { + "epoch": 0.4, + "grad_norm": 4.835377104922984, + "learning_rate": 1.9864965469024576e-06, + "loss": 0.6927, + "step": 2709 + }, + { + "epoch": 0.4, + "grad_norm": 0.7260727275038332, + "learning_rate": 1.9864807196563654e-06, + "loss": 0.6836, + "step": 2710 + }, + { + "epoch": 0.4, + "grad_norm": 7.152448676418845, + "learning_rate": 1.986464883203361e-06, + "loss": 0.6895, + "step": 2711 + }, + { + "epoch": 0.4, + "grad_norm": 0.8515444681577244, + "learning_rate": 1.9864490375435928e-06, + "loss": 0.6901, + "step": 2712 + }, + { + "epoch": 0.4, + "grad_norm": 3.172125528005953, + "learning_rate": 1.9864331826772084e-06, + "loss": 0.6888, + "step": 2713 + }, + { + "epoch": 0.4, + "grad_norm": 3.492711778017652, + "learning_rate": 1.986417318604356e-06, + "loss": 0.6973, + "step": 2714 + }, + { + "epoch": 0.4, + "grad_norm": 3.3630461110051915, + "learning_rate": 1.9864014453251838e-06, + "loss": 0.6738, + "step": 2715 + }, + { + "epoch": 0.41, + "grad_norm": 1.089129343275885, + "learning_rate": 1.9863855628398388e-06, + "loss": 0.6751, + "step": 2716 + }, + { + "epoch": 0.41, + "grad_norm": 4.967195283040302, + "learning_rate": 1.986369671148471e-06, + "loss": 0.6999, + "step": 2717 + }, + { + "epoch": 0.41, + "grad_norm": 1.951154276549892, + "learning_rate": 1.9863537702512274e-06, + "loss": 0.6868, + "step": 2718 + }, + { + "epoch": 0.41, + "grad_norm": 3.22213983741441, + "learning_rate": 1.986337860148257e-06, + "loss": 0.7012, + "step": 2719 + }, + { + "epoch": 0.41, + "grad_norm": 3.0031969398455267, + "learning_rate": 1.986321940839708e-06, + "loss": 0.6849, + "step": 2720 + }, + { + "epoch": 0.41, + "grad_norm": 1.6514609929133144, + "learning_rate": 1.986306012325729e-06, + "loss": 0.6908, + "step": 2721 + }, + { + "epoch": 0.41, + "grad_norm": 0.5484900366301101, + "learning_rate": 1.9862900746064694e-06, + "loss": 0.6732, + "step": 2722 + }, + { + "epoch": 0.41, + "grad_norm": 0.5241144229280511, + "learning_rate": 1.9862741276820765e-06, + "loss": 0.6875, + "step": 2723 + }, + { + "epoch": 0.41, + "grad_norm": 3.4937997570975363, + "learning_rate": 1.9862581715527004e-06, + "loss": 0.6686, + "step": 2724 + }, + { + "epoch": 0.41, + "grad_norm": 1.6003811510502914, + "learning_rate": 1.9862422062184893e-06, + "loss": 0.6777, + "step": 2725 + }, + { + "epoch": 0.41, + "grad_norm": 2.842368798043899, + "learning_rate": 1.9862262316795924e-06, + "loss": 0.681, + "step": 2726 + }, + { + "epoch": 0.41, + "grad_norm": 0.538346983459479, + "learning_rate": 1.986210247936159e-06, + "loss": 0.6953, + "step": 2727 + }, + { + "epoch": 0.41, + "grad_norm": 3.3917899658237087, + "learning_rate": 1.9861942549883386e-06, + "loss": 0.6777, + "step": 2728 + }, + { + "epoch": 0.41, + "grad_norm": 2.1614899697109404, + "learning_rate": 1.9861782528362795e-06, + "loss": 0.6667, + "step": 2729 + }, + { + "epoch": 0.41, + "grad_norm": 3.8288953217579222, + "learning_rate": 1.9861622414801316e-06, + "loss": 0.6966, + "step": 2730 + }, + { + "epoch": 0.41, + "grad_norm": 0.8534775802250759, + "learning_rate": 1.9861462209200445e-06, + "loss": 0.6875, + "step": 2731 + }, + { + "epoch": 0.41, + "grad_norm": 5.109905375459795, + "learning_rate": 1.9861301911561673e-06, + "loss": 0.6797, + "step": 2732 + }, + { + "epoch": 0.41, + "grad_norm": 3.2625627705722913, + "learning_rate": 1.98611415218865e-06, + "loss": 0.7077, + "step": 2733 + }, + { + "epoch": 0.41, + "grad_norm": 5.7821169839398125, + "learning_rate": 1.986098104017642e-06, + "loss": 0.6999, + "step": 2734 + }, + { + "epoch": 0.41, + "grad_norm": 5.878907596668762, + "learning_rate": 1.986082046643293e-06, + "loss": 0.7188, + "step": 2735 + }, + { + "epoch": 0.41, + "grad_norm": 8.9842711772866, + "learning_rate": 1.9860659800657532e-06, + "loss": 0.707, + "step": 2736 + }, + { + "epoch": 0.41, + "grad_norm": 8.659735051559842, + "learning_rate": 1.9860499042851726e-06, + "loss": 0.696, + "step": 2737 + }, + { + "epoch": 0.41, + "grad_norm": 3.659683115471302, + "learning_rate": 1.986033819301701e-06, + "loss": 0.6888, + "step": 2738 + }, + { + "epoch": 0.41, + "grad_norm": 3.8590030324471565, + "learning_rate": 1.9860177251154882e-06, + "loss": 0.6842, + "step": 2739 + }, + { + "epoch": 0.41, + "grad_norm": 4.917430947113688, + "learning_rate": 1.986001621726685e-06, + "loss": 0.6784, + "step": 2740 + }, + { + "epoch": 0.41, + "grad_norm": 2.7624334401317565, + "learning_rate": 1.985985509135442e-06, + "loss": 0.6855, + "step": 2741 + }, + { + "epoch": 0.41, + "grad_norm": 1.8466172796581894, + "learning_rate": 1.985969387341908e-06, + "loss": 0.6908, + "step": 2742 + }, + { + "epoch": 0.41, + "grad_norm": 5.050366836334191, + "learning_rate": 1.9859532563462353e-06, + "loss": 0.6927, + "step": 2743 + }, + { + "epoch": 0.41, + "grad_norm": 3.932701441874374, + "learning_rate": 1.9859371161485733e-06, + "loss": 0.6914, + "step": 2744 + }, + { + "epoch": 0.41, + "grad_norm": 0.44804174369772637, + "learning_rate": 1.9859209667490727e-06, + "loss": 0.6842, + "step": 2745 + }, + { + "epoch": 0.41, + "grad_norm": 6.514797002070457, + "learning_rate": 1.985904808147885e-06, + "loss": 0.6914, + "step": 2746 + }, + { + "epoch": 0.41, + "grad_norm": 1.9542797708631272, + "learning_rate": 1.9858886403451606e-06, + "loss": 0.6745, + "step": 2747 + }, + { + "epoch": 0.41, + "grad_norm": 1.6384226097229617, + "learning_rate": 1.9858724633410503e-06, + "loss": 0.6868, + "step": 2748 + }, + { + "epoch": 0.41, + "grad_norm": 1.1396228929314802, + "learning_rate": 1.985856277135705e-06, + "loss": 0.6764, + "step": 2749 + }, + { + "epoch": 0.41, + "grad_norm": 5.1380479814171895, + "learning_rate": 1.9858400817292756e-06, + "loss": 0.6953, + "step": 2750 + }, + { + "epoch": 0.41, + "grad_norm": 3.429319024897904, + "learning_rate": 1.985823877121914e-06, + "loss": 0.6921, + "step": 2751 + }, + { + "epoch": 0.41, + "grad_norm": 5.339529857152535, + "learning_rate": 1.9858076633137707e-06, + "loss": 0.6888, + "step": 2752 + }, + { + "epoch": 0.41, + "grad_norm": 7.0784003687528125, + "learning_rate": 1.9857914403049975e-06, + "loss": 0.6927, + "step": 2753 + }, + { + "epoch": 0.41, + "grad_norm": 5.074400743796269, + "learning_rate": 1.9857752080957456e-06, + "loss": 0.6836, + "step": 2754 + }, + { + "epoch": 0.41, + "grad_norm": 3.223848592735763, + "learning_rate": 1.9857589666861667e-06, + "loss": 0.6895, + "step": 2755 + }, + { + "epoch": 0.41, + "grad_norm": 0.9884451161273982, + "learning_rate": 1.9857427160764115e-06, + "loss": 0.6882, + "step": 2756 + }, + { + "epoch": 0.41, + "grad_norm": 6.60717124309206, + "learning_rate": 1.985726456266633e-06, + "loss": 0.6712, + "step": 2757 + }, + { + "epoch": 0.41, + "grad_norm": 1.4778580158763206, + "learning_rate": 1.985710187256982e-06, + "loss": 0.6764, + "step": 2758 + }, + { + "epoch": 0.41, + "grad_norm": 4.731780131970611, + "learning_rate": 1.985693909047611e-06, + "loss": 0.6784, + "step": 2759 + }, + { + "epoch": 0.41, + "grad_norm": 12.631549828715404, + "learning_rate": 1.9856776216386712e-06, + "loss": 0.7298, + "step": 2760 + }, + { + "epoch": 0.41, + "grad_norm": 11.102163964733258, + "learning_rate": 1.985661325030315e-06, + "loss": 0.7331, + "step": 2761 + }, + { + "epoch": 0.41, + "grad_norm": 11.841821353321926, + "learning_rate": 1.9856450192226943e-06, + "loss": 0.7116, + "step": 2762 + }, + { + "epoch": 0.41, + "grad_norm": 6.310909472733324, + "learning_rate": 1.985628704215962e-06, + "loss": 0.7005, + "step": 2763 + }, + { + "epoch": 0.41, + "grad_norm": 5.534057792793658, + "learning_rate": 1.9856123800102695e-06, + "loss": 0.6901, + "step": 2764 + }, + { + "epoch": 0.41, + "grad_norm": 2.7622786885280552, + "learning_rate": 1.9855960466057695e-06, + "loss": 0.6842, + "step": 2765 + }, + { + "epoch": 0.41, + "grad_norm": 2.196802359194576, + "learning_rate": 1.985579704002615e-06, + "loss": 0.6823, + "step": 2766 + }, + { + "epoch": 0.41, + "grad_norm": 6.427342407535922, + "learning_rate": 1.985563352200957e-06, + "loss": 0.6842, + "step": 2767 + }, + { + "epoch": 0.41, + "grad_norm": 4.336694258914513, + "learning_rate": 1.9855469912009494e-06, + "loss": 0.6868, + "step": 2768 + }, + { + "epoch": 0.41, + "grad_norm": 4.941506695959788, + "learning_rate": 1.985530621002745e-06, + "loss": 0.6842, + "step": 2769 + }, + { + "epoch": 0.41, + "grad_norm": 2.0306985579809607, + "learning_rate": 1.9855142416064957e-06, + "loss": 0.6934, + "step": 2770 + }, + { + "epoch": 0.41, + "grad_norm": 5.667996452414096, + "learning_rate": 1.9854978530123553e-06, + "loss": 0.6999, + "step": 2771 + }, + { + "epoch": 0.41, + "grad_norm": 6.2964053999572505, + "learning_rate": 1.985481455220476e-06, + "loss": 0.6882, + "step": 2772 + }, + { + "epoch": 0.41, + "grad_norm": 3.5837814473607903, + "learning_rate": 1.985465048231011e-06, + "loss": 0.6764, + "step": 2773 + }, + { + "epoch": 0.41, + "grad_norm": 6.236111897915404, + "learning_rate": 1.985448632044114e-06, + "loss": 0.709, + "step": 2774 + }, + { + "epoch": 0.41, + "grad_norm": 5.755348156425577, + "learning_rate": 1.9854322066599367e-06, + "loss": 0.6921, + "step": 2775 + }, + { + "epoch": 0.41, + "grad_norm": 0.5683292233671846, + "learning_rate": 1.985415772078635e-06, + "loss": 0.6973, + "step": 2776 + }, + { + "epoch": 0.41, + "grad_norm": 8.661537131309071, + "learning_rate": 1.9853993283003598e-06, + "loss": 0.6927, + "step": 2777 + }, + { + "epoch": 0.41, + "grad_norm": 0.8487323223658496, + "learning_rate": 1.9853828753252656e-06, + "loss": 0.681, + "step": 2778 + }, + { + "epoch": 0.41, + "grad_norm": 4.424023328218876, + "learning_rate": 1.9853664131535058e-06, + "loss": 0.6921, + "step": 2779 + }, + { + "epoch": 0.41, + "grad_norm": 0.6036215598985739, + "learning_rate": 1.9853499417852343e-06, + "loss": 0.6842, + "step": 2780 + }, + { + "epoch": 0.41, + "grad_norm": 4.202977394696708, + "learning_rate": 1.9853334612206047e-06, + "loss": 0.6901, + "step": 2781 + }, + { + "epoch": 0.41, + "grad_norm": 3.960533624940487, + "learning_rate": 1.985316971459771e-06, + "loss": 0.6882, + "step": 2782 + }, + { + "epoch": 0.42, + "grad_norm": 2.117343319894245, + "learning_rate": 1.9853004725028863e-06, + "loss": 0.6927, + "step": 2783 + }, + { + "epoch": 0.42, + "grad_norm": 3.592332812836963, + "learning_rate": 1.9852839643501055e-06, + "loss": 0.6836, + "step": 2784 + }, + { + "epoch": 0.42, + "grad_norm": 0.8812959296151113, + "learning_rate": 1.9852674470015823e-06, + "loss": 0.6862, + "step": 2785 + }, + { + "epoch": 0.42, + "grad_norm": 0.7218475436475636, + "learning_rate": 1.9852509204574708e-06, + "loss": 0.6947, + "step": 2786 + }, + { + "epoch": 0.42, + "grad_norm": 0.6144154900158058, + "learning_rate": 1.9852343847179255e-06, + "loss": 0.6953, + "step": 2787 + }, + { + "epoch": 0.42, + "grad_norm": 1.1193741440711225, + "learning_rate": 1.9852178397831003e-06, + "loss": 0.694, + "step": 2788 + }, + { + "epoch": 0.42, + "grad_norm": 2.522958153784763, + "learning_rate": 1.98520128565315e-06, + "loss": 0.694, + "step": 2789 + }, + { + "epoch": 0.42, + "grad_norm": 4.3295469625742, + "learning_rate": 1.985184722328229e-06, + "loss": 0.6986, + "step": 2790 + }, + { + "epoch": 0.42, + "grad_norm": 0.6141441852591305, + "learning_rate": 1.985168149808492e-06, + "loss": 0.6829, + "step": 2791 + }, + { + "epoch": 0.42, + "grad_norm": 3.7921908416960766, + "learning_rate": 1.9851515680940932e-06, + "loss": 0.6921, + "step": 2792 + }, + { + "epoch": 0.42, + "grad_norm": 2.5558193099973203, + "learning_rate": 1.985134977185188e-06, + "loss": 0.6862, + "step": 2793 + }, + { + "epoch": 0.42, + "grad_norm": 1.3221552166343626, + "learning_rate": 1.985118377081931e-06, + "loss": 0.6803, + "step": 2794 + }, + { + "epoch": 0.42, + "grad_norm": 6.1479336768016575, + "learning_rate": 1.9851017677844766e-06, + "loss": 0.6823, + "step": 2795 + }, + { + "epoch": 0.42, + "grad_norm": 0.6295515268765236, + "learning_rate": 1.9850851492929807e-06, + "loss": 0.6725, + "step": 2796 + }, + { + "epoch": 0.42, + "grad_norm": 1.4888216822179035, + "learning_rate": 1.985068521607598e-06, + "loss": 0.6868, + "step": 2797 + }, + { + "epoch": 0.42, + "grad_norm": 1.9451100930340404, + "learning_rate": 1.9850518847284833e-06, + "loss": 0.6719, + "step": 2798 + }, + { + "epoch": 0.42, + "grad_norm": 4.205928282549369, + "learning_rate": 1.985035238655792e-06, + "loss": 0.681, + "step": 2799 + }, + { + "epoch": 0.42, + "grad_norm": 6.5844167404223315, + "learning_rate": 1.9850185833896807e-06, + "loss": 0.694, + "step": 2800 + }, + { + "epoch": 0.42, + "grad_norm": 1.665060502971122, + "learning_rate": 1.9850019189303035e-06, + "loss": 0.679, + "step": 2801 + }, + { + "epoch": 0.42, + "grad_norm": 1.7738133681608343, + "learning_rate": 1.9849852452778156e-06, + "loss": 0.7012, + "step": 2802 + }, + { + "epoch": 0.42, + "grad_norm": 1.679612981918824, + "learning_rate": 1.984968562432374e-06, + "loss": 0.6908, + "step": 2803 + }, + { + "epoch": 0.42, + "grad_norm": 3.024200979107528, + "learning_rate": 1.9849518703941335e-06, + "loss": 0.6628, + "step": 2804 + }, + { + "epoch": 0.42, + "grad_norm": 3.3324227959310293, + "learning_rate": 1.98493516916325e-06, + "loss": 0.6836, + "step": 2805 + }, + { + "epoch": 0.42, + "grad_norm": 1.0193929387953227, + "learning_rate": 1.9849184587398796e-06, + "loss": 0.6914, + "step": 2806 + }, + { + "epoch": 0.42, + "grad_norm": 3.372834278037163, + "learning_rate": 1.984901739124178e-06, + "loss": 0.6686, + "step": 2807 + }, + { + "epoch": 0.42, + "grad_norm": 2.247841541979525, + "learning_rate": 1.984885010316302e-06, + "loss": 0.6784, + "step": 2808 + }, + { + "epoch": 0.42, + "grad_norm": 1.8671810165050837, + "learning_rate": 1.984868272316406e-06, + "loss": 0.6686, + "step": 2809 + }, + { + "epoch": 0.42, + "grad_norm": 2.495878249698834, + "learning_rate": 1.984851525124648e-06, + "loss": 0.6745, + "step": 2810 + }, + { + "epoch": 0.42, + "grad_norm": 4.063618152023506, + "learning_rate": 1.9848347687411834e-06, + "loss": 0.6686, + "step": 2811 + }, + { + "epoch": 0.42, + "grad_norm": 1.8554489397687073, + "learning_rate": 1.984818003166169e-06, + "loss": 0.681, + "step": 2812 + }, + { + "epoch": 0.42, + "grad_norm": 3.1723416277576724, + "learning_rate": 1.984801228399761e-06, + "loss": 0.7103, + "step": 2813 + }, + { + "epoch": 0.42, + "grad_norm": 4.420332902254207, + "learning_rate": 1.984784444442116e-06, + "loss": 0.6784, + "step": 2814 + }, + { + "epoch": 0.42, + "grad_norm": 1.6876444547270133, + "learning_rate": 1.9847676512933907e-06, + "loss": 0.7305, + "step": 2815 + }, + { + "epoch": 0.42, + "grad_norm": 1.1406922931153576, + "learning_rate": 1.9847508489537418e-06, + "loss": 0.7051, + "step": 2816 + }, + { + "epoch": 0.42, + "grad_norm": 3.4408440732891874, + "learning_rate": 1.9847340374233263e-06, + "loss": 0.6771, + "step": 2817 + }, + { + "epoch": 0.42, + "grad_norm": 7.587280379546873, + "learning_rate": 1.984717216702301e-06, + "loss": 0.6934, + "step": 2818 + }, + { + "epoch": 0.42, + "grad_norm": 4.076342760524867, + "learning_rate": 1.9847003867908227e-06, + "loss": 0.6908, + "step": 2819 + }, + { + "epoch": 0.42, + "grad_norm": 6.686302628574128, + "learning_rate": 1.984683547689048e-06, + "loss": 0.6921, + "step": 2820 + }, + { + "epoch": 0.42, + "grad_norm": 0.6568980337666327, + "learning_rate": 1.9846666993971355e-06, + "loss": 0.6973, + "step": 2821 + }, + { + "epoch": 0.42, + "grad_norm": 7.76877514055872, + "learning_rate": 1.9846498419152414e-06, + "loss": 0.6908, + "step": 2822 + }, + { + "epoch": 0.42, + "grad_norm": 8.471143336506184, + "learning_rate": 1.984632975243523e-06, + "loss": 0.7083, + "step": 2823 + }, + { + "epoch": 0.42, + "grad_norm": 5.410405917484212, + "learning_rate": 1.9846160993821386e-06, + "loss": 0.7064, + "step": 2824 + }, + { + "epoch": 0.42, + "grad_norm": 11.309959108848512, + "learning_rate": 1.9845992143312444e-06, + "loss": 0.7116, + "step": 2825 + }, + { + "epoch": 0.42, + "grad_norm": 2.585234890645064, + "learning_rate": 1.9845823200909987e-06, + "loss": 0.679, + "step": 2826 + }, + { + "epoch": 0.42, + "grad_norm": 5.0985605534627965, + "learning_rate": 1.9845654166615593e-06, + "loss": 0.6986, + "step": 2827 + }, + { + "epoch": 0.42, + "grad_norm": 0.9206437117341134, + "learning_rate": 1.984548504043084e-06, + "loss": 0.7005, + "step": 2828 + }, + { + "epoch": 0.42, + "grad_norm": 7.7362122055304265, + "learning_rate": 1.9845315822357303e-06, + "loss": 0.6914, + "step": 2829 + }, + { + "epoch": 0.42, + "grad_norm": 5.074714489763068, + "learning_rate": 1.9845146512396562e-06, + "loss": 0.696, + "step": 2830 + }, + { + "epoch": 0.42, + "grad_norm": 0.8017566435732338, + "learning_rate": 1.98449771105502e-06, + "loss": 0.696, + "step": 2831 + }, + { + "epoch": 0.42, + "grad_norm": 2.392953866532917, + "learning_rate": 1.984480761681979e-06, + "loss": 0.6895, + "step": 2832 + }, + { + "epoch": 0.42, + "grad_norm": 2.403439869782384, + "learning_rate": 1.9844638031206923e-06, + "loss": 0.6849, + "step": 2833 + }, + { + "epoch": 0.42, + "grad_norm": 2.534075420709103, + "learning_rate": 1.9844468353713182e-06, + "loss": 0.6842, + "step": 2834 + }, + { + "epoch": 0.42, + "grad_norm": 6.590874140437125, + "learning_rate": 1.9844298584340143e-06, + "loss": 0.6829, + "step": 2835 + }, + { + "epoch": 0.42, + "grad_norm": 2.2902415618907783, + "learning_rate": 1.98441287230894e-06, + "loss": 0.694, + "step": 2836 + }, + { + "epoch": 0.42, + "grad_norm": 3.478084336368275, + "learning_rate": 1.984395876996253e-06, + "loss": 0.6862, + "step": 2837 + }, + { + "epoch": 0.42, + "grad_norm": 2.765842111156135, + "learning_rate": 1.9843788724961123e-06, + "loss": 0.6908, + "step": 2838 + }, + { + "epoch": 0.42, + "grad_norm": 2.9093188917186095, + "learning_rate": 1.9843618588086763e-06, + "loss": 0.6895, + "step": 2839 + }, + { + "epoch": 0.42, + "grad_norm": 1.7698835825266286, + "learning_rate": 1.984344835934104e-06, + "loss": 0.6803, + "step": 2840 + }, + { + "epoch": 0.42, + "grad_norm": 2.0109644368916113, + "learning_rate": 1.9843278038725544e-06, + "loss": 0.6895, + "step": 2841 + }, + { + "epoch": 0.42, + "grad_norm": 1.5014680143037489, + "learning_rate": 1.9843107626241867e-06, + "loss": 0.681, + "step": 2842 + }, + { + "epoch": 0.42, + "grad_norm": 7.863681846398097, + "learning_rate": 1.9842937121891592e-06, + "loss": 0.6914, + "step": 2843 + }, + { + "epoch": 0.42, + "grad_norm": 7.014957755284304, + "learning_rate": 1.9842766525676313e-06, + "loss": 0.6901, + "step": 2844 + }, + { + "epoch": 0.42, + "grad_norm": 0.9323124896237116, + "learning_rate": 1.984259583759763e-06, + "loss": 0.6829, + "step": 2845 + }, + { + "epoch": 0.42, + "grad_norm": 1.7948962071141144, + "learning_rate": 1.984242505765712e-06, + "loss": 0.6875, + "step": 2846 + }, + { + "epoch": 0.42, + "grad_norm": 0.8971376541307475, + "learning_rate": 1.9842254185856395e-06, + "loss": 0.6953, + "step": 2847 + }, + { + "epoch": 0.42, + "grad_norm": 2.15915280694592, + "learning_rate": 1.9842083222197037e-06, + "loss": 0.6836, + "step": 2848 + }, + { + "epoch": 0.42, + "grad_norm": 5.142296534639556, + "learning_rate": 1.984191216668065e-06, + "loss": 0.6875, + "step": 2849 + }, + { + "epoch": 0.43, + "grad_norm": 0.9337368373222449, + "learning_rate": 1.9841741019308824e-06, + "loss": 0.6914, + "step": 2850 + }, + { + "epoch": 0.43, + "grad_norm": 4.3782035351296145, + "learning_rate": 1.9841569780083157e-06, + "loss": 0.6947, + "step": 2851 + }, + { + "epoch": 0.43, + "grad_norm": 0.6096856247817002, + "learning_rate": 1.984139844900525e-06, + "loss": 0.6868, + "step": 2852 + }, + { + "epoch": 0.43, + "grad_norm": 2.866845076098853, + "learning_rate": 1.9841227026076705e-06, + "loss": 0.6868, + "step": 2853 + }, + { + "epoch": 0.43, + "grad_norm": 2.409108875038111, + "learning_rate": 1.9841055511299112e-06, + "loss": 0.6882, + "step": 2854 + }, + { + "epoch": 0.43, + "grad_norm": 1.5443903345292074, + "learning_rate": 1.984088390467408e-06, + "loss": 0.681, + "step": 2855 + }, + { + "epoch": 0.43, + "grad_norm": 9.751815042289893, + "learning_rate": 1.9840712206203208e-06, + "loss": 0.6934, + "step": 2856 + }, + { + "epoch": 0.43, + "grad_norm": 3.6103679544976477, + "learning_rate": 1.9840540415888097e-06, + "loss": 0.6816, + "step": 2857 + }, + { + "epoch": 0.43, + "grad_norm": 5.939559713238672, + "learning_rate": 1.9840368533730356e-06, + "loss": 0.6953, + "step": 2858 + }, + { + "epoch": 0.43, + "grad_norm": 3.112185991895307, + "learning_rate": 1.984019655973158e-06, + "loss": 0.6966, + "step": 2859 + }, + { + "epoch": 0.43, + "grad_norm": 1.1359174797802425, + "learning_rate": 1.9840024493893386e-06, + "loss": 0.7025, + "step": 2860 + }, + { + "epoch": 0.43, + "grad_norm": 3.3901796551167567, + "learning_rate": 1.9839852336217367e-06, + "loss": 0.6849, + "step": 2861 + }, + { + "epoch": 0.43, + "grad_norm": 6.024008269444834, + "learning_rate": 1.983968008670514e-06, + "loss": 0.6803, + "step": 2862 + }, + { + "epoch": 0.43, + "grad_norm": 4.223270358188794, + "learning_rate": 1.9839507745358304e-06, + "loss": 0.707, + "step": 2863 + }, + { + "epoch": 0.43, + "grad_norm": 3.9019642854210512, + "learning_rate": 1.983933531217848e-06, + "loss": 0.694, + "step": 2864 + }, + { + "epoch": 0.43, + "grad_norm": 7.036046558829472, + "learning_rate": 1.983916278716726e-06, + "loss": 0.6999, + "step": 2865 + }, + { + "epoch": 0.43, + "grad_norm": 4.441517441991927, + "learning_rate": 1.9838990170326268e-06, + "loss": 0.6882, + "step": 2866 + }, + { + "epoch": 0.43, + "grad_norm": 3.084624293963832, + "learning_rate": 1.9838817461657112e-06, + "loss": 0.6882, + "step": 2867 + }, + { + "epoch": 0.43, + "grad_norm": 0.9900174878450365, + "learning_rate": 1.9838644661161403e-06, + "loss": 0.6719, + "step": 2868 + }, + { + "epoch": 0.43, + "grad_norm": 7.277037696343748, + "learning_rate": 1.983847176884075e-06, + "loss": 0.6875, + "step": 2869 + }, + { + "epoch": 0.43, + "grad_norm": 1.4100739211661824, + "learning_rate": 1.983829878469677e-06, + "loss": 0.6868, + "step": 2870 + }, + { + "epoch": 0.43, + "grad_norm": 2.076270226452727, + "learning_rate": 1.983812570873108e-06, + "loss": 0.6927, + "step": 2871 + }, + { + "epoch": 0.43, + "grad_norm": 0.8310104262529219, + "learning_rate": 1.983795254094529e-06, + "loss": 0.6855, + "step": 2872 + }, + { + "epoch": 0.43, + "grad_norm": 2.3982457747415395, + "learning_rate": 1.983777928134102e-06, + "loss": 0.6966, + "step": 2873 + }, + { + "epoch": 0.43, + "grad_norm": 0.5246796255081403, + "learning_rate": 1.983760592991989e-06, + "loss": 0.6797, + "step": 2874 + }, + { + "epoch": 0.43, + "grad_norm": 2.8804392173450837, + "learning_rate": 1.983743248668351e-06, + "loss": 0.6842, + "step": 2875 + }, + { + "epoch": 0.43, + "grad_norm": 1.4086110894928587, + "learning_rate": 1.98372589516335e-06, + "loss": 0.6882, + "step": 2876 + }, + { + "epoch": 0.43, + "grad_norm": 5.5067205461054005, + "learning_rate": 1.9837085324771487e-06, + "loss": 0.6732, + "step": 2877 + }, + { + "epoch": 0.43, + "grad_norm": 2.4666578620041753, + "learning_rate": 1.983691160609908e-06, + "loss": 0.7005, + "step": 2878 + }, + { + "epoch": 0.43, + "grad_norm": 6.9415460886607185, + "learning_rate": 1.9836737795617917e-06, + "loss": 0.6992, + "step": 2879 + }, + { + "epoch": 0.43, + "grad_norm": 3.436168981780964, + "learning_rate": 1.9836563893329604e-06, + "loss": 0.6966, + "step": 2880 + }, + { + "epoch": 0.43, + "grad_norm": 10.426235191363586, + "learning_rate": 1.983638989923577e-06, + "loss": 0.6895, + "step": 2881 + }, + { + "epoch": 0.43, + "grad_norm": 5.681513800371783, + "learning_rate": 1.983621581333804e-06, + "loss": 0.6966, + "step": 2882 + }, + { + "epoch": 0.43, + "grad_norm": 1.6286999802364128, + "learning_rate": 1.983604163563804e-06, + "loss": 0.6712, + "step": 2883 + }, + { + "epoch": 0.43, + "grad_norm": 1.658821717781594, + "learning_rate": 1.983586736613739e-06, + "loss": 0.6895, + "step": 2884 + }, + { + "epoch": 0.43, + "grad_norm": 3.594574947073663, + "learning_rate": 1.9835693004837722e-06, + "loss": 0.6842, + "step": 2885 + }, + { + "epoch": 0.43, + "grad_norm": 4.584619281745614, + "learning_rate": 1.9835518551740664e-06, + "loss": 0.6712, + "step": 2886 + }, + { + "epoch": 0.43, + "grad_norm": 2.9189798820864774, + "learning_rate": 1.9835344006847838e-06, + "loss": 0.6947, + "step": 2887 + }, + { + "epoch": 0.43, + "grad_norm": 4.481417150169232, + "learning_rate": 1.9835169370160878e-06, + "loss": 0.6784, + "step": 2888 + }, + { + "epoch": 0.43, + "grad_norm": 2.0083347460247145, + "learning_rate": 1.9834994641681414e-06, + "loss": 0.7038, + "step": 2889 + }, + { + "epoch": 0.43, + "grad_norm": 13.605904371415717, + "learning_rate": 1.983481982141107e-06, + "loss": 0.6966, + "step": 2890 + }, + { + "epoch": 0.43, + "grad_norm": 1.2916364536232898, + "learning_rate": 1.983464490935149e-06, + "loss": 0.6927, + "step": 2891 + }, + { + "epoch": 0.43, + "grad_norm": 2.9243942950983404, + "learning_rate": 1.9834469905504295e-06, + "loss": 0.6888, + "step": 2892 + }, + { + "epoch": 0.43, + "grad_norm": 6.1675771710051785, + "learning_rate": 1.983429480987112e-06, + "loss": 0.6862, + "step": 2893 + }, + { + "epoch": 0.43, + "grad_norm": 6.538487072537848, + "learning_rate": 1.9834119622453604e-06, + "loss": 0.6816, + "step": 2894 + }, + { + "epoch": 0.43, + "grad_norm": 1.3751127499260443, + "learning_rate": 1.9833944343253384e-06, + "loss": 0.6914, + "step": 2895 + }, + { + "epoch": 0.43, + "grad_norm": 6.552077544908535, + "learning_rate": 1.983376897227209e-06, + "loss": 0.6953, + "step": 2896 + }, + { + "epoch": 0.43, + "grad_norm": 4.764331220281935, + "learning_rate": 1.983359350951136e-06, + "loss": 0.6934, + "step": 2897 + }, + { + "epoch": 0.43, + "grad_norm": 9.174791187751516, + "learning_rate": 1.9833417954972827e-06, + "loss": 0.7012, + "step": 2898 + }, + { + "epoch": 0.43, + "grad_norm": 3.228034417054952, + "learning_rate": 1.983324230865814e-06, + "loss": 0.6803, + "step": 2899 + }, + { + "epoch": 0.43, + "grad_norm": 9.388242724081222, + "learning_rate": 1.983306657056893e-06, + "loss": 0.6895, + "step": 2900 + }, + { + "epoch": 0.43, + "grad_norm": 5.349692227747338, + "learning_rate": 1.983289074070684e-06, + "loss": 0.6895, + "step": 2901 + }, + { + "epoch": 0.43, + "grad_norm": 2.3863414049508, + "learning_rate": 1.9832714819073518e-06, + "loss": 0.6914, + "step": 2902 + }, + { + "epoch": 0.43, + "grad_norm": 6.204626097901225, + "learning_rate": 1.9832538805670595e-06, + "loss": 0.6921, + "step": 2903 + }, + { + "epoch": 0.43, + "grad_norm": 3.3475404747315824, + "learning_rate": 1.9832362700499715e-06, + "loss": 0.6888, + "step": 2904 + }, + { + "epoch": 0.43, + "grad_norm": 4.4166165706593405, + "learning_rate": 1.9832186503562528e-06, + "loss": 0.6816, + "step": 2905 + }, + { + "epoch": 0.43, + "grad_norm": 1.9383034069146707, + "learning_rate": 1.9832010214860674e-06, + "loss": 0.7031, + "step": 2906 + }, + { + "epoch": 0.43, + "grad_norm": 1.0100201412852734, + "learning_rate": 1.9831833834395795e-06, + "loss": 0.6719, + "step": 2907 + }, + { + "epoch": 0.43, + "grad_norm": 10.443525120326846, + "learning_rate": 1.9831657362169545e-06, + "loss": 0.7207, + "step": 2908 + }, + { + "epoch": 0.43, + "grad_norm": 2.306117754931878, + "learning_rate": 1.983148079818357e-06, + "loss": 0.696, + "step": 2909 + }, + { + "epoch": 0.43, + "grad_norm": 3.2389685375528456, + "learning_rate": 1.9831304142439507e-06, + "loss": 0.6829, + "step": 2910 + }, + { + "epoch": 0.43, + "grad_norm": 1.3533171955572068, + "learning_rate": 1.983112739493902e-06, + "loss": 0.6849, + "step": 2911 + }, + { + "epoch": 0.43, + "grad_norm": 1.8537546584749542, + "learning_rate": 1.9830950555683753e-06, + "loss": 0.6862, + "step": 2912 + }, + { + "epoch": 0.43, + "grad_norm": 2.84899886376441, + "learning_rate": 1.983077362467535e-06, + "loss": 0.6888, + "step": 2913 + }, + { + "epoch": 0.43, + "grad_norm": 0.9208632820445138, + "learning_rate": 1.983059660191547e-06, + "loss": 0.6888, + "step": 2914 + }, + { + "epoch": 0.43, + "grad_norm": 3.6453340308251554, + "learning_rate": 1.983041948740576e-06, + "loss": 0.6882, + "step": 2915 + }, + { + "epoch": 0.43, + "grad_norm": 4.158513085865327, + "learning_rate": 1.983024228114788e-06, + "loss": 0.6986, + "step": 2916 + }, + { + "epoch": 0.44, + "grad_norm": 1.230352402070574, + "learning_rate": 1.9830064983143474e-06, + "loss": 0.6953, + "step": 2917 + }, + { + "epoch": 0.44, + "grad_norm": 0.743048385381829, + "learning_rate": 1.9829887593394204e-06, + "loss": 0.6953, + "step": 2918 + }, + { + "epoch": 0.44, + "grad_norm": 0.5502402931446573, + "learning_rate": 1.9829710111901728e-06, + "loss": 0.6823, + "step": 2919 + }, + { + "epoch": 0.44, + "grad_norm": 1.2215204787121436, + "learning_rate": 1.9829532538667693e-06, + "loss": 0.6901, + "step": 2920 + }, + { + "epoch": 0.44, + "grad_norm": 5.611428629618499, + "learning_rate": 1.9829354873693764e-06, + "loss": 0.6823, + "step": 2921 + }, + { + "epoch": 0.44, + "grad_norm": 3.589828677148211, + "learning_rate": 1.98291771169816e-06, + "loss": 0.6758, + "step": 2922 + }, + { + "epoch": 0.44, + "grad_norm": 7.869295446781418, + "learning_rate": 1.982899926853285e-06, + "loss": 0.6725, + "step": 2923 + }, + { + "epoch": 0.44, + "grad_norm": 0.6088386110085334, + "learning_rate": 1.9828821328349183e-06, + "loss": 0.6823, + "step": 2924 + }, + { + "epoch": 0.44, + "grad_norm": 2.3440849046551606, + "learning_rate": 1.9828643296432256e-06, + "loss": 0.6855, + "step": 2925 + }, + { + "epoch": 0.44, + "grad_norm": 7.448663052051138, + "learning_rate": 1.9828465172783735e-06, + "loss": 0.6836, + "step": 2926 + }, + { + "epoch": 0.44, + "grad_norm": 3.8886292744355426, + "learning_rate": 1.9828286957405278e-06, + "loss": 0.6921, + "step": 2927 + }, + { + "epoch": 0.44, + "grad_norm": 0.675350587327813, + "learning_rate": 1.982810865029855e-06, + "loss": 0.6777, + "step": 2928 + }, + { + "epoch": 0.44, + "grad_norm": 6.236924628609626, + "learning_rate": 1.9827930251465218e-06, + "loss": 0.6862, + "step": 2929 + }, + { + "epoch": 0.44, + "grad_norm": 4.912686651084785, + "learning_rate": 1.982775176090694e-06, + "loss": 0.6712, + "step": 2930 + }, + { + "epoch": 0.44, + "grad_norm": 0.5194568001863912, + "learning_rate": 1.9827573178625383e-06, + "loss": 0.6862, + "step": 2931 + }, + { + "epoch": 0.44, + "grad_norm": 2.312797906560409, + "learning_rate": 1.9827394504622223e-06, + "loss": 0.696, + "step": 2932 + }, + { + "epoch": 0.44, + "grad_norm": 5.28969733982192, + "learning_rate": 1.9827215738899118e-06, + "loss": 0.7012, + "step": 2933 + }, + { + "epoch": 0.44, + "grad_norm": 1.0874079779805752, + "learning_rate": 1.982703688145774e-06, + "loss": 0.6829, + "step": 2934 + }, + { + "epoch": 0.44, + "grad_norm": 0.7048633908982191, + "learning_rate": 1.9826857932299755e-06, + "loss": 0.7031, + "step": 2935 + }, + { + "epoch": 0.44, + "grad_norm": 1.0042145725106915, + "learning_rate": 1.9826678891426835e-06, + "loss": 0.6875, + "step": 2936 + }, + { + "epoch": 0.44, + "grad_norm": 4.62359593727324, + "learning_rate": 1.9826499758840656e-06, + "loss": 0.6803, + "step": 2937 + }, + { + "epoch": 0.44, + "grad_norm": 4.428462355769395, + "learning_rate": 1.9826320534542883e-06, + "loss": 0.6895, + "step": 2938 + }, + { + "epoch": 0.44, + "grad_norm": 2.3074382277959615, + "learning_rate": 1.9826141218535197e-06, + "loss": 0.6751, + "step": 2939 + }, + { + "epoch": 0.44, + "grad_norm": 1.859843682762858, + "learning_rate": 1.9825961810819258e-06, + "loss": 0.6875, + "step": 2940 + }, + { + "epoch": 0.44, + "grad_norm": 7.161550730706648, + "learning_rate": 1.982578231139675e-06, + "loss": 0.7005, + "step": 2941 + }, + { + "epoch": 0.44, + "grad_norm": 3.1967563341638043, + "learning_rate": 1.9825602720269354e-06, + "loss": 0.6797, + "step": 2942 + }, + { + "epoch": 0.44, + "grad_norm": 3.460090338348227, + "learning_rate": 1.982542303743873e-06, + "loss": 0.6641, + "step": 2943 + }, + { + "epoch": 0.44, + "grad_norm": 4.499817196100047, + "learning_rate": 1.982524326290657e-06, + "loss": 0.6947, + "step": 2944 + }, + { + "epoch": 0.44, + "grad_norm": 1.953142678372877, + "learning_rate": 1.9825063396674542e-06, + "loss": 0.6973, + "step": 2945 + }, + { + "epoch": 0.44, + "grad_norm": 4.761483987678275, + "learning_rate": 1.982488343874433e-06, + "loss": 0.6875, + "step": 2946 + }, + { + "epoch": 0.44, + "grad_norm": 3.7586879932372312, + "learning_rate": 1.9824703389117614e-06, + "loss": 0.6908, + "step": 2947 + }, + { + "epoch": 0.44, + "grad_norm": 1.9850632766021439, + "learning_rate": 1.982452324779607e-06, + "loss": 0.6797, + "step": 2948 + }, + { + "epoch": 0.44, + "grad_norm": 2.2335448190073977, + "learning_rate": 1.982434301478138e-06, + "loss": 0.6654, + "step": 2949 + }, + { + "epoch": 0.44, + "grad_norm": 6.803104456942374, + "learning_rate": 1.982416269007523e-06, + "loss": 0.6784, + "step": 2950 + }, + { + "epoch": 0.44, + "grad_norm": 5.652986902244935, + "learning_rate": 1.9823982273679302e-06, + "loss": 0.7025, + "step": 2951 + }, + { + "epoch": 0.44, + "grad_norm": 2.754084792199685, + "learning_rate": 1.9823801765595275e-06, + "loss": 0.6855, + "step": 2952 + }, + { + "epoch": 0.44, + "grad_norm": 5.938286185736328, + "learning_rate": 1.9823621165824843e-06, + "loss": 0.6973, + "step": 2953 + }, + { + "epoch": 0.44, + "grad_norm": 0.8190869807519975, + "learning_rate": 1.982344047436968e-06, + "loss": 0.6901, + "step": 2954 + }, + { + "epoch": 0.44, + "grad_norm": 12.40123442743233, + "learning_rate": 1.982325969123148e-06, + "loss": 0.7038, + "step": 2955 + }, + { + "epoch": 0.44, + "grad_norm": 2.25763890188319, + "learning_rate": 1.982307881641193e-06, + "loss": 0.6868, + "step": 2956 + }, + { + "epoch": 0.44, + "grad_norm": 2.2506099084517297, + "learning_rate": 1.9822897849912715e-06, + "loss": 0.6849, + "step": 2957 + }, + { + "epoch": 0.44, + "grad_norm": 1.016384314284113, + "learning_rate": 1.9822716791735527e-06, + "loss": 0.7005, + "step": 2958 + }, + { + "epoch": 0.44, + "grad_norm": 4.605293932049143, + "learning_rate": 1.9822535641882054e-06, + "loss": 0.6849, + "step": 2959 + }, + { + "epoch": 0.44, + "grad_norm": 1.7796442160775683, + "learning_rate": 1.982235440035399e-06, + "loss": 0.7116, + "step": 2960 + }, + { + "epoch": 0.44, + "grad_norm": 4.5289449185133, + "learning_rate": 1.982217306715302e-06, + "loss": 0.6849, + "step": 2961 + }, + { + "epoch": 0.44, + "grad_norm": 2.0353120888406284, + "learning_rate": 1.982199164228084e-06, + "loss": 0.6732, + "step": 2962 + }, + { + "epoch": 0.44, + "grad_norm": 4.509309579487616, + "learning_rate": 1.982181012573915e-06, + "loss": 0.6784, + "step": 2963 + }, + { + "epoch": 0.44, + "grad_norm": 4.936904343334161, + "learning_rate": 1.982162851752963e-06, + "loss": 0.7005, + "step": 2964 + }, + { + "epoch": 0.44, + "grad_norm": 1.5509396941786273, + "learning_rate": 1.9821446817653983e-06, + "loss": 0.6758, + "step": 2965 + }, + { + "epoch": 0.44, + "grad_norm": 4.972364033129919, + "learning_rate": 1.9821265026113908e-06, + "loss": 0.6803, + "step": 2966 + }, + { + "epoch": 0.44, + "grad_norm": 2.3049406723365267, + "learning_rate": 1.9821083142911095e-06, + "loss": 0.696, + "step": 2967 + }, + { + "epoch": 0.44, + "grad_norm": 1.239893851389402, + "learning_rate": 1.9820901168047243e-06, + "loss": 0.6849, + "step": 2968 + }, + { + "epoch": 0.44, + "grad_norm": 1.0181668307691347, + "learning_rate": 1.982071910152406e-06, + "loss": 0.6751, + "step": 2969 + }, + { + "epoch": 0.44, + "grad_norm": 6.040217771842623, + "learning_rate": 1.982053694334323e-06, + "loss": 0.6927, + "step": 2970 + }, + { + "epoch": 0.44, + "grad_norm": 3.0452993493226437, + "learning_rate": 1.982035469350646e-06, + "loss": 0.6771, + "step": 2971 + }, + { + "epoch": 0.44, + "grad_norm": 2.440599746729316, + "learning_rate": 1.9820172352015453e-06, + "loss": 0.6738, + "step": 2972 + }, + { + "epoch": 0.44, + "grad_norm": 0.6329439651920824, + "learning_rate": 1.981998991887191e-06, + "loss": 0.6816, + "step": 2973 + }, + { + "epoch": 0.44, + "grad_norm": 5.0261269626771865, + "learning_rate": 1.981980739407753e-06, + "loss": 0.7018, + "step": 2974 + }, + { + "epoch": 0.44, + "grad_norm": 2.959158090131071, + "learning_rate": 1.981962477763402e-06, + "loss": 0.6979, + "step": 2975 + }, + { + "epoch": 0.44, + "grad_norm": 2.3334731164748646, + "learning_rate": 1.9819442069543085e-06, + "loss": 0.6738, + "step": 2976 + }, + { + "epoch": 0.44, + "grad_norm": 4.4899825835431555, + "learning_rate": 1.981925926980643e-06, + "loss": 0.6777, + "step": 2977 + }, + { + "epoch": 0.44, + "grad_norm": 1.4735388022035956, + "learning_rate": 1.9819076378425755e-06, + "loss": 0.6882, + "step": 2978 + }, + { + "epoch": 0.44, + "grad_norm": 0.8066834709537599, + "learning_rate": 1.9818893395402774e-06, + "loss": 0.6751, + "step": 2979 + }, + { + "epoch": 0.44, + "grad_norm": 2.998998907525343, + "learning_rate": 1.9818710320739193e-06, + "loss": 0.6823, + "step": 2980 + }, + { + "epoch": 0.44, + "grad_norm": 7.143724495836616, + "learning_rate": 1.981852715443672e-06, + "loss": 0.694, + "step": 2981 + }, + { + "epoch": 0.44, + "grad_norm": 5.0668050620519995, + "learning_rate": 1.981834389649707e-06, + "loss": 0.6927, + "step": 2982 + }, + { + "epoch": 0.44, + "grad_norm": 5.039039806802821, + "learning_rate": 1.981816054692194e-06, + "loss": 0.6901, + "step": 2983 + }, + { + "epoch": 0.45, + "grad_norm": 8.474797907457845, + "learning_rate": 1.9817977105713053e-06, + "loss": 0.7103, + "step": 2984 + }, + { + "epoch": 0.45, + "grad_norm": 4.363754289014343, + "learning_rate": 1.9817793572872117e-06, + "loss": 0.6979, + "step": 2985 + }, + { + "epoch": 0.45, + "grad_norm": 1.9583961178999942, + "learning_rate": 1.9817609948400847e-06, + "loss": 0.6855, + "step": 2986 + }, + { + "epoch": 0.45, + "grad_norm": 3.919889704207995, + "learning_rate": 1.981742623230095e-06, + "loss": 0.7005, + "step": 2987 + }, + { + "epoch": 0.45, + "grad_norm": 3.7668654726565185, + "learning_rate": 1.9817242424574152e-06, + "loss": 0.6953, + "step": 2988 + }, + { + "epoch": 0.45, + "grad_norm": 1.6350404802972804, + "learning_rate": 1.981705852522216e-06, + "loss": 0.6849, + "step": 2989 + }, + { + "epoch": 0.45, + "grad_norm": 15.544051687930288, + "learning_rate": 1.9816874534246694e-06, + "loss": 0.7005, + "step": 2990 + }, + { + "epoch": 0.45, + "grad_norm": 1.4930473935247102, + "learning_rate": 1.9816690451649464e-06, + "loss": 0.6686, + "step": 2991 + }, + { + "epoch": 0.45, + "grad_norm": 1.6715745134382116, + "learning_rate": 1.98165062774322e-06, + "loss": 0.6712, + "step": 2992 + }, + { + "epoch": 0.45, + "grad_norm": 3.4260709381435643, + "learning_rate": 1.981632201159661e-06, + "loss": 0.6823, + "step": 2993 + }, + { + "epoch": 0.45, + "grad_norm": 0.7000961780690039, + "learning_rate": 1.9816137654144423e-06, + "loss": 0.6836, + "step": 2994 + }, + { + "epoch": 0.45, + "grad_norm": 3.851148847954693, + "learning_rate": 1.9815953205077355e-06, + "loss": 0.6901, + "step": 2995 + }, + { + "epoch": 0.45, + "grad_norm": 8.729830574640234, + "learning_rate": 1.9815768664397125e-06, + "loss": 0.7051, + "step": 2996 + }, + { + "epoch": 0.45, + "grad_norm": 3.005634936858878, + "learning_rate": 1.981558403210546e-06, + "loss": 0.6816, + "step": 2997 + }, + { + "epoch": 0.45, + "grad_norm": 7.598300080061404, + "learning_rate": 1.981539930820408e-06, + "loss": 0.6829, + "step": 2998 + }, + { + "epoch": 0.45, + "grad_norm": 1.5122823996541779, + "learning_rate": 1.981521449269471e-06, + "loss": 0.6836, + "step": 2999 + }, + { + "epoch": 0.45, + "grad_norm": 1.095502539072187, + "learning_rate": 1.9815029585579075e-06, + "loss": 0.6895, + "step": 3000 + }, + { + "epoch": 0.45, + "grad_norm": 1.7473129055130263, + "learning_rate": 1.98148445868589e-06, + "loss": 0.6934, + "step": 3001 + }, + { + "epoch": 0.45, + "grad_norm": 7.190014414186361, + "learning_rate": 1.981465949653592e-06, + "loss": 0.6758, + "step": 3002 + }, + { + "epoch": 0.45, + "grad_norm": 1.6847211165585547, + "learning_rate": 1.9814474314611844e-06, + "loss": 0.6895, + "step": 3003 + }, + { + "epoch": 0.45, + "grad_norm": 5.341259585037921, + "learning_rate": 1.9814289041088416e-06, + "loss": 0.6784, + "step": 3004 + }, + { + "epoch": 0.45, + "grad_norm": 0.5318821760324388, + "learning_rate": 1.981410367596736e-06, + "loss": 0.6855, + "step": 3005 + }, + { + "epoch": 0.45, + "grad_norm": 0.5404281238233862, + "learning_rate": 1.981391821925041e-06, + "loss": 0.6842, + "step": 3006 + }, + { + "epoch": 0.45, + "grad_norm": 5.3512874756181805, + "learning_rate": 1.981373267093929e-06, + "loss": 0.6816, + "step": 3007 + }, + { + "epoch": 0.45, + "grad_norm": 3.070356223054269, + "learning_rate": 1.9813547031035734e-06, + "loss": 0.707, + "step": 3008 + }, + { + "epoch": 0.45, + "grad_norm": 2.3232122530146664, + "learning_rate": 1.981336129954148e-06, + "loss": 0.7064, + "step": 3009 + }, + { + "epoch": 0.45, + "grad_norm": 2.1800787950993494, + "learning_rate": 1.981317547645825e-06, + "loss": 0.6816, + "step": 3010 + }, + { + "epoch": 0.45, + "grad_norm": 4.746604948832256, + "learning_rate": 1.9812989561787794e-06, + "loss": 0.6719, + "step": 3011 + }, + { + "epoch": 0.45, + "grad_norm": 0.665345387857187, + "learning_rate": 1.9812803555531832e-06, + "loss": 0.6849, + "step": 3012 + }, + { + "epoch": 0.45, + "grad_norm": 9.26303624619338, + "learning_rate": 1.981261745769211e-06, + "loss": 0.6973, + "step": 3013 + }, + { + "epoch": 0.45, + "grad_norm": 0.642725879884296, + "learning_rate": 1.9812431268270363e-06, + "loss": 0.7012, + "step": 3014 + }, + { + "epoch": 0.45, + "grad_norm": 5.031519257166516, + "learning_rate": 1.981224498726832e-06, + "loss": 0.6875, + "step": 3015 + }, + { + "epoch": 0.45, + "grad_norm": 3.9982833082505334, + "learning_rate": 1.9812058614687736e-06, + "loss": 0.6914, + "step": 3016 + }, + { + "epoch": 0.45, + "grad_norm": 2.0288851025933043, + "learning_rate": 1.9811872150530337e-06, + "loss": 0.6882, + "step": 3017 + }, + { + "epoch": 0.45, + "grad_norm": 7.268683834228295, + "learning_rate": 1.9811685594797867e-06, + "loss": 0.6888, + "step": 3018 + }, + { + "epoch": 0.45, + "grad_norm": 3.0887147207803434, + "learning_rate": 1.981149894749207e-06, + "loss": 0.6777, + "step": 3019 + }, + { + "epoch": 0.45, + "grad_norm": 5.225650666921747, + "learning_rate": 1.9811312208614682e-06, + "loss": 0.6706, + "step": 3020 + }, + { + "epoch": 0.45, + "grad_norm": 1.885039701234593, + "learning_rate": 1.981112537816745e-06, + "loss": 0.6882, + "step": 3021 + }, + { + "epoch": 0.45, + "grad_norm": 5.123319857493474, + "learning_rate": 1.981093845615212e-06, + "loss": 0.6816, + "step": 3022 + }, + { + "epoch": 0.45, + "grad_norm": 5.08778630212537, + "learning_rate": 1.981075144257043e-06, + "loss": 0.6855, + "step": 3023 + }, + { + "epoch": 0.45, + "grad_norm": 0.9991380599653888, + "learning_rate": 1.981056433742413e-06, + "loss": 0.6888, + "step": 3024 + }, + { + "epoch": 0.45, + "grad_norm": 3.500843690123799, + "learning_rate": 1.9810377140714968e-06, + "loss": 0.6803, + "step": 3025 + }, + { + "epoch": 0.45, + "grad_norm": 4.839228570160179, + "learning_rate": 1.981018985244469e-06, + "loss": 0.6777, + "step": 3026 + }, + { + "epoch": 0.45, + "grad_norm": 0.8671371565519894, + "learning_rate": 1.981000247261504e-06, + "loss": 0.6868, + "step": 3027 + }, + { + "epoch": 0.45, + "grad_norm": 0.5698405523029593, + "learning_rate": 1.9809815001227766e-06, + "loss": 0.6784, + "step": 3028 + }, + { + "epoch": 0.45, + "grad_norm": 1.2988255533424193, + "learning_rate": 1.980962743828463e-06, + "loss": 0.6816, + "step": 3029 + }, + { + "epoch": 0.45, + "grad_norm": 0.8827309517194194, + "learning_rate": 1.9809439783787364e-06, + "loss": 0.6868, + "step": 3030 + }, + { + "epoch": 0.45, + "grad_norm": 5.891610035360143, + "learning_rate": 1.980925203773773e-06, + "loss": 0.707, + "step": 3031 + }, + { + "epoch": 0.45, + "grad_norm": 4.561779767771808, + "learning_rate": 1.9809064200137486e-06, + "loss": 0.6862, + "step": 3032 + }, + { + "epoch": 0.45, + "grad_norm": 1.3286953387100038, + "learning_rate": 1.9808876270988374e-06, + "loss": 0.6816, + "step": 3033 + }, + { + "epoch": 0.45, + "grad_norm": 2.872056980126314, + "learning_rate": 1.9808688250292153e-06, + "loss": 0.6908, + "step": 3034 + }, + { + "epoch": 0.45, + "grad_norm": 1.2222248795447732, + "learning_rate": 1.9808500138050576e-06, + "loss": 0.6888, + "step": 3035 + }, + { + "epoch": 0.45, + "grad_norm": 5.481345347824915, + "learning_rate": 1.9808311934265397e-06, + "loss": 0.6947, + "step": 3036 + }, + { + "epoch": 0.45, + "grad_norm": 4.579566988658866, + "learning_rate": 1.980812363893838e-06, + "loss": 0.7018, + "step": 3037 + }, + { + "epoch": 0.45, + "grad_norm": 5.620609810909627, + "learning_rate": 1.980793525207128e-06, + "loss": 0.6751, + "step": 3038 + }, + { + "epoch": 0.45, + "grad_norm": 1.6499587938995328, + "learning_rate": 1.980774677366585e-06, + "loss": 0.6947, + "step": 3039 + }, + { + "epoch": 0.45, + "grad_norm": 8.73001415913525, + "learning_rate": 1.980755820372385e-06, + "loss": 0.7142, + "step": 3040 + }, + { + "epoch": 0.45, + "grad_norm": 6.034940298166839, + "learning_rate": 1.9807369542247042e-06, + "loss": 0.7096, + "step": 3041 + }, + { + "epoch": 0.45, + "grad_norm": 0.6388102757567165, + "learning_rate": 1.980718078923719e-06, + "loss": 0.6934, + "step": 3042 + }, + { + "epoch": 0.45, + "grad_norm": 4.642097218635058, + "learning_rate": 1.980699194469605e-06, + "loss": 0.6973, + "step": 3043 + }, + { + "epoch": 0.45, + "grad_norm": 5.571573736339227, + "learning_rate": 1.9806803008625388e-06, + "loss": 0.6895, + "step": 3044 + }, + { + "epoch": 0.45, + "grad_norm": 7.2308109888543015, + "learning_rate": 1.9806613981026967e-06, + "loss": 0.6862, + "step": 3045 + }, + { + "epoch": 0.45, + "grad_norm": 3.2930830366725536, + "learning_rate": 1.9806424861902545e-06, + "loss": 0.6868, + "step": 3046 + }, + { + "epoch": 0.45, + "grad_norm": 4.185624109974866, + "learning_rate": 1.98062356512539e-06, + "loss": 0.6908, + "step": 3047 + }, + { + "epoch": 0.45, + "grad_norm": 3.6173211158051823, + "learning_rate": 1.9806046349082787e-06, + "loss": 0.6816, + "step": 3048 + }, + { + "epoch": 0.45, + "grad_norm": 3.1448249015430365, + "learning_rate": 1.9805856955390975e-06, + "loss": 0.6842, + "step": 3049 + }, + { + "epoch": 0.45, + "grad_norm": 3.2351895998987525, + "learning_rate": 1.9805667470180235e-06, + "loss": 0.6914, + "step": 3050 + }, + { + "epoch": 0.46, + "grad_norm": 0.4224246656823515, + "learning_rate": 1.9805477893452332e-06, + "loss": 0.7018, + "step": 3051 + }, + { + "epoch": 0.46, + "grad_norm": 3.209761235118297, + "learning_rate": 1.9805288225209037e-06, + "loss": 0.6862, + "step": 3052 + }, + { + "epoch": 0.46, + "grad_norm": 2.8490707822653216, + "learning_rate": 1.980509846545212e-06, + "loss": 0.6836, + "step": 3053 + }, + { + "epoch": 0.46, + "grad_norm": 1.6386159549401704, + "learning_rate": 1.9804908614183353e-06, + "loss": 0.6712, + "step": 3054 + }, + { + "epoch": 0.46, + "grad_norm": 2.2122976882876313, + "learning_rate": 1.9804718671404504e-06, + "loss": 0.6914, + "step": 3055 + }, + { + "epoch": 0.46, + "grad_norm": 0.3486249264636195, + "learning_rate": 1.9804528637117352e-06, + "loss": 0.6914, + "step": 3056 + }, + { + "epoch": 0.46, + "grad_norm": 3.605385072947633, + "learning_rate": 1.9804338511323665e-06, + "loss": 0.694, + "step": 3057 + }, + { + "epoch": 0.46, + "grad_norm": 0.5957200727418956, + "learning_rate": 1.9804148294025224e-06, + "loss": 0.6875, + "step": 3058 + }, + { + "epoch": 0.46, + "grad_norm": 1.929055666992203, + "learning_rate": 1.9803957985223795e-06, + "loss": 0.6875, + "step": 3059 + }, + { + "epoch": 0.46, + "grad_norm": 1.4504051351529255, + "learning_rate": 1.980376758492116e-06, + "loss": 0.6842, + "step": 3060 + }, + { + "epoch": 0.46, + "grad_norm": 1.0061971564687076, + "learning_rate": 1.9803577093119092e-06, + "loss": 0.6862, + "step": 3061 + }, + { + "epoch": 0.46, + "grad_norm": 1.7195847209323913, + "learning_rate": 1.980338650981938e-06, + "loss": 0.6992, + "step": 3062 + }, + { + "epoch": 0.46, + "grad_norm": 3.5569311415913263, + "learning_rate": 1.9803195835023785e-06, + "loss": 0.6777, + "step": 3063 + }, + { + "epoch": 0.46, + "grad_norm": 1.8578800202553742, + "learning_rate": 1.9803005068734102e-06, + "loss": 0.6875, + "step": 3064 + }, + { + "epoch": 0.46, + "grad_norm": 7.289533506676331, + "learning_rate": 1.980281421095211e-06, + "loss": 0.6999, + "step": 3065 + }, + { + "epoch": 0.46, + "grad_norm": 11.313166783725178, + "learning_rate": 1.980262326167958e-06, + "loss": 0.7012, + "step": 3066 + }, + { + "epoch": 0.46, + "grad_norm": 3.150554952939769, + "learning_rate": 1.98024322209183e-06, + "loss": 0.6816, + "step": 3067 + }, + { + "epoch": 0.46, + "grad_norm": 3.8180990240893173, + "learning_rate": 1.9802241088670055e-06, + "loss": 0.6908, + "step": 3068 + }, + { + "epoch": 0.46, + "grad_norm": 6.506800437297779, + "learning_rate": 1.980204986493663e-06, + "loss": 0.694, + "step": 3069 + }, + { + "epoch": 0.46, + "grad_norm": 1.4804253797473008, + "learning_rate": 1.98018585497198e-06, + "loss": 0.6797, + "step": 3070 + }, + { + "epoch": 0.46, + "grad_norm": 1.1908431436041762, + "learning_rate": 1.980166714302136e-06, + "loss": 0.681, + "step": 3071 + }, + { + "epoch": 0.46, + "grad_norm": 0.48236764373473195, + "learning_rate": 1.9801475644843094e-06, + "loss": 0.6862, + "step": 3072 + }, + { + "epoch": 0.46, + "grad_norm": 12.390393574196901, + "learning_rate": 1.9801284055186793e-06, + "loss": 0.7057, + "step": 3073 + }, + { + "epoch": 0.46, + "grad_norm": 3.4080996165653383, + "learning_rate": 1.9801092374054233e-06, + "loss": 0.6836, + "step": 3074 + }, + { + "epoch": 0.46, + "grad_norm": 9.601176259288163, + "learning_rate": 1.9800900601447218e-06, + "loss": 0.6986, + "step": 3075 + }, + { + "epoch": 0.46, + "grad_norm": 0.5490763487122594, + "learning_rate": 1.9800708737367527e-06, + "loss": 0.6875, + "step": 3076 + }, + { + "epoch": 0.46, + "grad_norm": 2.80999226758673, + "learning_rate": 1.9800516781816955e-06, + "loss": 0.679, + "step": 3077 + }, + { + "epoch": 0.46, + "grad_norm": 1.9419144568328839, + "learning_rate": 1.9800324734797294e-06, + "loss": 0.6888, + "step": 3078 + }, + { + "epoch": 0.46, + "grad_norm": 4.061369564914996, + "learning_rate": 1.980013259631034e-06, + "loss": 0.6784, + "step": 3079 + }, + { + "epoch": 0.46, + "grad_norm": 1.27162224541536, + "learning_rate": 1.9799940366357877e-06, + "loss": 0.6712, + "step": 3080 + }, + { + "epoch": 0.46, + "grad_norm": 1.450923681868789, + "learning_rate": 1.9799748044941706e-06, + "loss": 0.6875, + "step": 3081 + }, + { + "epoch": 0.46, + "grad_norm": 7.608364224197894, + "learning_rate": 1.9799555632063617e-06, + "loss": 0.6745, + "step": 3082 + }, + { + "epoch": 0.46, + "grad_norm": 8.24115139384331, + "learning_rate": 1.979936312772541e-06, + "loss": 0.6751, + "step": 3083 + }, + { + "epoch": 0.46, + "grad_norm": 2.2770122702708595, + "learning_rate": 1.9799170531928883e-06, + "loss": 0.6855, + "step": 3084 + }, + { + "epoch": 0.46, + "grad_norm": 0.6007879974874349, + "learning_rate": 1.9798977844675824e-06, + "loss": 0.6836, + "step": 3085 + }, + { + "epoch": 0.46, + "grad_norm": 1.8994664174253713, + "learning_rate": 1.9798785065968046e-06, + "loss": 0.6973, + "step": 3086 + }, + { + "epoch": 0.46, + "grad_norm": 9.722164426947481, + "learning_rate": 1.9798592195807337e-06, + "loss": 0.7174, + "step": 3087 + }, + { + "epoch": 0.46, + "grad_norm": 5.566801349011789, + "learning_rate": 1.97983992341955e-06, + "loss": 0.6927, + "step": 3088 + }, + { + "epoch": 0.46, + "grad_norm": 3.067255822129543, + "learning_rate": 1.979820618113434e-06, + "loss": 0.6745, + "step": 3089 + }, + { + "epoch": 0.46, + "grad_norm": 6.245193725312089, + "learning_rate": 1.979801303662565e-06, + "loss": 0.6686, + "step": 3090 + }, + { + "epoch": 0.46, + "grad_norm": 5.460472073798966, + "learning_rate": 1.979781980067124e-06, + "loss": 0.6947, + "step": 3091 + }, + { + "epoch": 0.46, + "grad_norm": 2.4888863832974013, + "learning_rate": 1.979762647327291e-06, + "loss": 0.6693, + "step": 3092 + }, + { + "epoch": 0.46, + "grad_norm": 1.5755371231368285, + "learning_rate": 1.9797433054432466e-06, + "loss": 0.6764, + "step": 3093 + }, + { + "epoch": 0.46, + "grad_norm": 2.1816493552453124, + "learning_rate": 1.9797239544151717e-06, + "loss": 0.6855, + "step": 3094 + }, + { + "epoch": 0.46, + "grad_norm": 6.12593606572561, + "learning_rate": 1.979704594243246e-06, + "loss": 0.7214, + "step": 3095 + }, + { + "epoch": 0.46, + "grad_norm": 7.770985360455103, + "learning_rate": 1.979685224927651e-06, + "loss": 0.6855, + "step": 3096 + }, + { + "epoch": 0.46, + "grad_norm": 2.996820863572172, + "learning_rate": 1.979665846468567e-06, + "loss": 0.679, + "step": 3097 + }, + { + "epoch": 0.46, + "grad_norm": 6.40609538343781, + "learning_rate": 1.979646458866175e-06, + "loss": 0.7064, + "step": 3098 + }, + { + "epoch": 0.46, + "grad_norm": 5.934518010256724, + "learning_rate": 1.979627062120656e-06, + "loss": 0.7018, + "step": 3099 + }, + { + "epoch": 0.46, + "grad_norm": 6.463181670159533, + "learning_rate": 1.9796076562321907e-06, + "loss": 0.6992, + "step": 3100 + }, + { + "epoch": 0.46, + "grad_norm": 0.8634566468414097, + "learning_rate": 1.979588241200961e-06, + "loss": 0.6797, + "step": 3101 + }, + { + "epoch": 0.46, + "grad_norm": 1.6623828959711031, + "learning_rate": 1.9795688170271473e-06, + "loss": 0.6855, + "step": 3102 + }, + { + "epoch": 0.46, + "grad_norm": 4.280319351030144, + "learning_rate": 1.9795493837109312e-06, + "loss": 0.6908, + "step": 3103 + }, + { + "epoch": 0.46, + "grad_norm": 3.3468649334265073, + "learning_rate": 1.9795299412524945e-06, + "loss": 0.6784, + "step": 3104 + }, + { + "epoch": 0.46, + "grad_norm": 6.595235206046532, + "learning_rate": 1.9795104896520177e-06, + "loss": 0.6934, + "step": 3105 + }, + { + "epoch": 0.46, + "grad_norm": 0.7053996227419984, + "learning_rate": 1.9794910289096832e-06, + "loss": 0.6855, + "step": 3106 + }, + { + "epoch": 0.46, + "grad_norm": 3.192819364515391, + "learning_rate": 1.979471559025672e-06, + "loss": 0.6855, + "step": 3107 + }, + { + "epoch": 0.46, + "grad_norm": 5.844064114213031, + "learning_rate": 1.9794520800001665e-06, + "loss": 0.694, + "step": 3108 + }, + { + "epoch": 0.46, + "grad_norm": 1.2892493775719498, + "learning_rate": 1.979432591833348e-06, + "loss": 0.6771, + "step": 3109 + }, + { + "epoch": 0.46, + "grad_norm": 4.890348921410966, + "learning_rate": 1.9794130945253988e-06, + "loss": 0.7044, + "step": 3110 + }, + { + "epoch": 0.46, + "grad_norm": 1.8090223341101273, + "learning_rate": 1.9793935880765e-06, + "loss": 0.6895, + "step": 3111 + }, + { + "epoch": 0.46, + "grad_norm": 4.029605692229435, + "learning_rate": 1.9793740724868343e-06, + "loss": 0.6888, + "step": 3112 + }, + { + "epoch": 0.46, + "grad_norm": 0.8095400605675465, + "learning_rate": 1.979354547756584e-06, + "loss": 0.6842, + "step": 3113 + }, + { + "epoch": 0.46, + "grad_norm": 1.2418810900796013, + "learning_rate": 1.979335013885931e-06, + "loss": 0.6771, + "step": 3114 + }, + { + "epoch": 0.46, + "grad_norm": 0.6418164356880279, + "learning_rate": 1.9793154708750577e-06, + "loss": 0.6875, + "step": 3115 + }, + { + "epoch": 0.46, + "grad_norm": 1.0467008757835796, + "learning_rate": 1.9792959187241463e-06, + "loss": 0.7005, + "step": 3116 + }, + { + "epoch": 0.46, + "grad_norm": 7.095986423668685, + "learning_rate": 1.97927635743338e-06, + "loss": 0.6836, + "step": 3117 + }, + { + "epoch": 0.47, + "grad_norm": 6.123545668893634, + "learning_rate": 1.9792567870029404e-06, + "loss": 0.6849, + "step": 3118 + }, + { + "epoch": 0.47, + "grad_norm": 2.4769749478176855, + "learning_rate": 1.979237207433011e-06, + "loss": 0.7005, + "step": 3119 + }, + { + "epoch": 0.47, + "grad_norm": 0.4759592786363758, + "learning_rate": 1.9792176187237738e-06, + "loss": 0.6849, + "step": 3120 + }, + { + "epoch": 0.47, + "grad_norm": 0.5444092936943171, + "learning_rate": 1.9791980208754124e-06, + "loss": 0.694, + "step": 3121 + }, + { + "epoch": 0.47, + "grad_norm": 1.8001784854042797, + "learning_rate": 1.9791784138881085e-06, + "loss": 0.6888, + "step": 3122 + }, + { + "epoch": 0.47, + "grad_norm": 2.1077847962964187, + "learning_rate": 1.9791587977620467e-06, + "loss": 0.6758, + "step": 3123 + }, + { + "epoch": 0.47, + "grad_norm": 7.120421271526948, + "learning_rate": 1.9791391724974087e-06, + "loss": 0.6914, + "step": 3124 + }, + { + "epoch": 0.47, + "grad_norm": 1.5663823177729, + "learning_rate": 1.9791195380943783e-06, + "loss": 0.6849, + "step": 3125 + }, + { + "epoch": 0.47, + "grad_norm": 1.5503785283606837, + "learning_rate": 1.979099894553139e-06, + "loss": 0.6836, + "step": 3126 + }, + { + "epoch": 0.47, + "grad_norm": 5.952104790536909, + "learning_rate": 1.9790802418738733e-06, + "loss": 0.6862, + "step": 3127 + }, + { + "epoch": 0.47, + "grad_norm": 3.373124378082684, + "learning_rate": 1.9790605800567656e-06, + "loss": 0.6803, + "step": 3128 + }, + { + "epoch": 0.47, + "grad_norm": 1.513180872535552, + "learning_rate": 1.9790409091019985e-06, + "loss": 0.6979, + "step": 3129 + }, + { + "epoch": 0.47, + "grad_norm": 4.309178305240091, + "learning_rate": 1.9790212290097564e-06, + "loss": 0.6823, + "step": 3130 + }, + { + "epoch": 0.47, + "grad_norm": 1.900546845770343, + "learning_rate": 1.9790015397802226e-06, + "loss": 0.6855, + "step": 3131 + }, + { + "epoch": 0.47, + "grad_norm": 3.1943681065000575, + "learning_rate": 1.9789818414135805e-06, + "loss": 0.6842, + "step": 3132 + }, + { + "epoch": 0.47, + "grad_norm": 0.7446065636686301, + "learning_rate": 1.9789621339100143e-06, + "loss": 0.7044, + "step": 3133 + }, + { + "epoch": 0.47, + "grad_norm": 2.885902753936155, + "learning_rate": 1.978942417269708e-06, + "loss": 0.6882, + "step": 3134 + }, + { + "epoch": 0.47, + "grad_norm": 0.7895814502682835, + "learning_rate": 1.9789226914928458e-06, + "loss": 0.6836, + "step": 3135 + }, + { + "epoch": 0.47, + "grad_norm": 5.931185466702865, + "learning_rate": 1.978902956579611e-06, + "loss": 0.681, + "step": 3136 + }, + { + "epoch": 0.47, + "grad_norm": 4.419213592076695, + "learning_rate": 1.978883212530189e-06, + "loss": 0.6771, + "step": 3137 + }, + { + "epoch": 0.47, + "grad_norm": 5.848586021560974, + "learning_rate": 1.978863459344763e-06, + "loss": 0.694, + "step": 3138 + }, + { + "epoch": 0.47, + "grad_norm": 1.6176643055528444, + "learning_rate": 1.978843697023518e-06, + "loss": 0.6641, + "step": 3139 + }, + { + "epoch": 0.47, + "grad_norm": 2.816241456110386, + "learning_rate": 1.9788239255666382e-06, + "loss": 0.7031, + "step": 3140 + }, + { + "epoch": 0.47, + "grad_norm": 2.1503289482804866, + "learning_rate": 1.978804144974308e-06, + "loss": 0.6914, + "step": 3141 + }, + { + "epoch": 0.47, + "grad_norm": 0.4837945471749851, + "learning_rate": 1.978784355246712e-06, + "loss": 0.6849, + "step": 3142 + }, + { + "epoch": 0.47, + "grad_norm": 4.290993868966678, + "learning_rate": 1.978764556384035e-06, + "loss": 0.7051, + "step": 3143 + }, + { + "epoch": 0.47, + "grad_norm": 5.957995389652617, + "learning_rate": 1.9787447483864625e-06, + "loss": 0.6908, + "step": 3144 + }, + { + "epoch": 0.47, + "grad_norm": 7.571557587348527, + "learning_rate": 1.978724931254178e-06, + "loss": 0.6901, + "step": 3145 + }, + { + "epoch": 0.47, + "grad_norm": 3.3630641854819934, + "learning_rate": 1.9787051049873678e-06, + "loss": 0.6823, + "step": 3146 + }, + { + "epoch": 0.47, + "grad_norm": 4.782754133877477, + "learning_rate": 1.978685269586216e-06, + "loss": 0.6901, + "step": 3147 + }, + { + "epoch": 0.47, + "grad_norm": 4.790946497331579, + "learning_rate": 1.9786654250509076e-06, + "loss": 0.6693, + "step": 3148 + }, + { + "epoch": 0.47, + "grad_norm": 4.2907092523086945, + "learning_rate": 1.978645571381629e-06, + "loss": 0.6797, + "step": 3149 + }, + { + "epoch": 0.47, + "grad_norm": 5.312085215335766, + "learning_rate": 1.9786257085785642e-06, + "loss": 0.6921, + "step": 3150 + }, + { + "epoch": 0.47, + "grad_norm": 2.3546338580277455, + "learning_rate": 1.9786058366418993e-06, + "loss": 0.6836, + "step": 3151 + }, + { + "epoch": 0.47, + "grad_norm": 0.7824321650539378, + "learning_rate": 1.9785859555718192e-06, + "loss": 0.6771, + "step": 3152 + }, + { + "epoch": 0.47, + "grad_norm": 5.738421438018369, + "learning_rate": 1.97856606536851e-06, + "loss": 0.6947, + "step": 3153 + }, + { + "epoch": 0.47, + "grad_norm": 1.6922949229824384, + "learning_rate": 1.9785461660321578e-06, + "loss": 0.679, + "step": 3154 + }, + { + "epoch": 0.47, + "grad_norm": 1.1054509615341181, + "learning_rate": 1.9785262575629475e-06, + "loss": 0.6914, + "step": 3155 + }, + { + "epoch": 0.47, + "grad_norm": 1.4513629334856608, + "learning_rate": 1.9785063399610648e-06, + "loss": 0.6849, + "step": 3156 + }, + { + "epoch": 0.47, + "grad_norm": 4.839868274767603, + "learning_rate": 1.9784864132266956e-06, + "loss": 0.6882, + "step": 3157 + }, + { + "epoch": 0.47, + "grad_norm": 1.110496693433575, + "learning_rate": 1.9784664773600267e-06, + "loss": 0.6836, + "step": 3158 + }, + { + "epoch": 0.47, + "grad_norm": 6.125679102509735, + "learning_rate": 1.9784465323612433e-06, + "loss": 0.6908, + "step": 3159 + }, + { + "epoch": 0.47, + "grad_norm": 0.9634466873050622, + "learning_rate": 1.978426578230532e-06, + "loss": 0.6816, + "step": 3160 + }, + { + "epoch": 0.47, + "grad_norm": 9.46791446551438, + "learning_rate": 1.978406614968079e-06, + "loss": 0.6966, + "step": 3161 + }, + { + "epoch": 0.47, + "grad_norm": 1.211263389798821, + "learning_rate": 1.9783866425740707e-06, + "loss": 0.668, + "step": 3162 + }, + { + "epoch": 0.47, + "grad_norm": 2.6875625531498124, + "learning_rate": 1.978366661048693e-06, + "loss": 0.681, + "step": 3163 + }, + { + "epoch": 0.47, + "grad_norm": 8.514977396087717, + "learning_rate": 1.978346670392133e-06, + "loss": 0.6888, + "step": 3164 + }, + { + "epoch": 0.47, + "grad_norm": 1.1603785741067136, + "learning_rate": 1.9783266706045767e-06, + "loss": 0.6803, + "step": 3165 + }, + { + "epoch": 0.47, + "grad_norm": 9.98944050582575, + "learning_rate": 1.9783066616862116e-06, + "loss": 0.6862, + "step": 3166 + }, + { + "epoch": 0.47, + "grad_norm": 3.554000629230342, + "learning_rate": 1.978286643637223e-06, + "loss": 0.696, + "step": 3167 + }, + { + "epoch": 0.47, + "grad_norm": 2.3892983198873132, + "learning_rate": 1.9782666164577995e-06, + "loss": 0.7161, + "step": 3168 + }, + { + "epoch": 0.47, + "grad_norm": 1.1537967071489592, + "learning_rate": 1.9782465801481265e-06, + "loss": 0.6803, + "step": 3169 + }, + { + "epoch": 0.47, + "grad_norm": 1.3745157566439712, + "learning_rate": 1.9782265347083926e-06, + "loss": 0.6888, + "step": 3170 + }, + { + "epoch": 0.47, + "grad_norm": 4.541854149005662, + "learning_rate": 1.978206480138783e-06, + "loss": 0.6842, + "step": 3171 + }, + { + "epoch": 0.47, + "grad_norm": 5.225189474643609, + "learning_rate": 1.9781864164394863e-06, + "loss": 0.6927, + "step": 3172 + }, + { + "epoch": 0.47, + "grad_norm": 1.1019482922089705, + "learning_rate": 1.978166343610689e-06, + "loss": 0.6758, + "step": 3173 + }, + { + "epoch": 0.47, + "grad_norm": 5.23974543670924, + "learning_rate": 1.978146261652579e-06, + "loss": 0.6784, + "step": 3174 + }, + { + "epoch": 0.47, + "grad_norm": 0.6310076828406525, + "learning_rate": 1.9781261705653433e-06, + "loss": 0.6888, + "step": 3175 + }, + { + "epoch": 0.47, + "grad_norm": 6.856283569827794, + "learning_rate": 1.9781060703491694e-06, + "loss": 0.7018, + "step": 3176 + }, + { + "epoch": 0.47, + "grad_norm": 1.1240757057175668, + "learning_rate": 1.978085961004245e-06, + "loss": 0.6888, + "step": 3177 + }, + { + "epoch": 0.47, + "grad_norm": 1.4943456458351791, + "learning_rate": 1.978065842530758e-06, + "loss": 0.6875, + "step": 3178 + }, + { + "epoch": 0.47, + "grad_norm": 1.6205423297282175, + "learning_rate": 1.978045714928896e-06, + "loss": 0.6888, + "step": 3179 + }, + { + "epoch": 0.47, + "grad_norm": 2.4113996770183737, + "learning_rate": 1.978025578198847e-06, + "loss": 0.6901, + "step": 3180 + }, + { + "epoch": 0.47, + "grad_norm": 0.8342517494952572, + "learning_rate": 1.978005432340799e-06, + "loss": 0.6745, + "step": 3181 + }, + { + "epoch": 0.47, + "grad_norm": 4.499637199839961, + "learning_rate": 1.9779852773549392e-06, + "loss": 0.696, + "step": 3182 + }, + { + "epoch": 0.47, + "grad_norm": 1.0795354837540567, + "learning_rate": 1.9779651132414564e-06, + "loss": 0.6875, + "step": 3183 + }, + { + "epoch": 0.47, + "grad_norm": 3.4728151444845357, + "learning_rate": 1.977944940000539e-06, + "loss": 0.6855, + "step": 3184 + }, + { + "epoch": 0.48, + "grad_norm": 5.055917290179741, + "learning_rate": 1.977924757632375e-06, + "loss": 0.6842, + "step": 3185 + }, + { + "epoch": 0.48, + "grad_norm": 5.873640771382161, + "learning_rate": 1.9779045661371524e-06, + "loss": 0.6953, + "step": 3186 + }, + { + "epoch": 0.48, + "grad_norm": 3.701080894688713, + "learning_rate": 1.97788436551506e-06, + "loss": 0.6855, + "step": 3187 + }, + { + "epoch": 0.48, + "grad_norm": 4.090749271043163, + "learning_rate": 1.977864155766287e-06, + "loss": 0.6745, + "step": 3188 + }, + { + "epoch": 0.48, + "grad_norm": 2.8937717390396545, + "learning_rate": 1.9778439368910207e-06, + "loss": 0.6706, + "step": 3189 + }, + { + "epoch": 0.48, + "grad_norm": 0.4791623139427181, + "learning_rate": 1.9778237088894506e-06, + "loss": 0.6771, + "step": 3190 + }, + { + "epoch": 0.48, + "grad_norm": 3.5289294016794748, + "learning_rate": 1.9778034717617655e-06, + "loss": 0.6914, + "step": 3191 + }, + { + "epoch": 0.48, + "grad_norm": 2.0485687699946835, + "learning_rate": 1.977783225508154e-06, + "loss": 0.6784, + "step": 3192 + }, + { + "epoch": 0.48, + "grad_norm": 1.0011782797541044, + "learning_rate": 1.977762970128805e-06, + "loss": 0.6895, + "step": 3193 + }, + { + "epoch": 0.48, + "grad_norm": 0.6646457633774874, + "learning_rate": 1.9777427056239083e-06, + "loss": 0.6842, + "step": 3194 + }, + { + "epoch": 0.48, + "grad_norm": 1.3299521923024087, + "learning_rate": 1.9777224319936517e-06, + "loss": 0.6771, + "step": 3195 + }, + { + "epoch": 0.48, + "grad_norm": 2.784086674291647, + "learning_rate": 1.9777021492382257e-06, + "loss": 0.6823, + "step": 3196 + }, + { + "epoch": 0.48, + "grad_norm": 2.8683339436940476, + "learning_rate": 1.9776818573578188e-06, + "loss": 0.7109, + "step": 3197 + }, + { + "epoch": 0.48, + "grad_norm": 3.284697093993913, + "learning_rate": 1.977661556352621e-06, + "loss": 0.6862, + "step": 3198 + }, + { + "epoch": 0.48, + "grad_norm": 2.715911547702389, + "learning_rate": 1.977641246222821e-06, + "loss": 0.6921, + "step": 3199 + }, + { + "epoch": 0.48, + "grad_norm": 0.9846319673194248, + "learning_rate": 1.977620926968609e-06, + "loss": 0.6732, + "step": 3200 + }, + { + "epoch": 0.48, + "grad_norm": 1.237267171582451, + "learning_rate": 1.977600598590175e-06, + "loss": 0.696, + "step": 3201 + }, + { + "epoch": 0.48, + "grad_norm": 4.305396875139353, + "learning_rate": 1.9775802610877075e-06, + "loss": 0.6829, + "step": 3202 + }, + { + "epoch": 0.48, + "grad_norm": 1.2812228240558374, + "learning_rate": 1.977559914461397e-06, + "loss": 0.6888, + "step": 3203 + }, + { + "epoch": 0.48, + "grad_norm": 0.9567158316038523, + "learning_rate": 1.977539558711434e-06, + "loss": 0.6947, + "step": 3204 + }, + { + "epoch": 0.48, + "grad_norm": 0.574192649093933, + "learning_rate": 1.9775191938380067e-06, + "loss": 0.6882, + "step": 3205 + }, + { + "epoch": 0.48, + "grad_norm": 3.5283333885822636, + "learning_rate": 1.977498819841307e-06, + "loss": 0.6829, + "step": 3206 + }, + { + "epoch": 0.48, + "grad_norm": 4.751969002772794, + "learning_rate": 1.9774784367215245e-06, + "loss": 0.6914, + "step": 3207 + }, + { + "epoch": 0.48, + "grad_norm": 12.869140748538175, + "learning_rate": 1.9774580444788493e-06, + "loss": 0.6934, + "step": 3208 + }, + { + "epoch": 0.48, + "grad_norm": 3.741184888896479, + "learning_rate": 1.9774376431134713e-06, + "loss": 0.6947, + "step": 3209 + }, + { + "epoch": 0.48, + "grad_norm": 4.564794990079831, + "learning_rate": 1.977417232625582e-06, + "loss": 0.7096, + "step": 3210 + }, + { + "epoch": 0.48, + "grad_norm": 3.434101042827853, + "learning_rate": 1.9773968130153708e-06, + "loss": 0.6986, + "step": 3211 + }, + { + "epoch": 0.48, + "grad_norm": 3.4724457007738105, + "learning_rate": 1.9773763842830286e-06, + "loss": 0.694, + "step": 3212 + }, + { + "epoch": 0.48, + "grad_norm": 6.701568572469744, + "learning_rate": 1.977355946428746e-06, + "loss": 0.6927, + "step": 3213 + }, + { + "epoch": 0.48, + "grad_norm": 4.121051720184709, + "learning_rate": 1.9773354994527144e-06, + "loss": 0.681, + "step": 3214 + }, + { + "epoch": 0.48, + "grad_norm": 4.95548868412497, + "learning_rate": 1.977315043355124e-06, + "loss": 0.6882, + "step": 3215 + }, + { + "epoch": 0.48, + "grad_norm": 6.713676629895356, + "learning_rate": 1.9772945781361658e-06, + "loss": 0.6979, + "step": 3216 + }, + { + "epoch": 0.48, + "grad_norm": 0.9124886252456418, + "learning_rate": 1.977274103796031e-06, + "loss": 0.6836, + "step": 3217 + }, + { + "epoch": 0.48, + "grad_norm": 5.489435696740453, + "learning_rate": 1.9772536203349105e-06, + "loss": 0.6849, + "step": 3218 + }, + { + "epoch": 0.48, + "grad_norm": 0.4752922358659086, + "learning_rate": 1.9772331277529957e-06, + "loss": 0.6816, + "step": 3219 + }, + { + "epoch": 0.48, + "grad_norm": 4.3456563486886495, + "learning_rate": 1.9772126260504774e-06, + "loss": 0.6868, + "step": 3220 + }, + { + "epoch": 0.48, + "grad_norm": 7.070441279154482, + "learning_rate": 1.9771921152275475e-06, + "loss": 0.6784, + "step": 3221 + }, + { + "epoch": 0.48, + "grad_norm": 0.8884287837042225, + "learning_rate": 1.9771715952843967e-06, + "loss": 0.6771, + "step": 3222 + }, + { + "epoch": 0.48, + "grad_norm": 1.3922062132908177, + "learning_rate": 1.9771510662212175e-06, + "loss": 0.6719, + "step": 3223 + }, + { + "epoch": 0.48, + "grad_norm": 3.830760557042786, + "learning_rate": 1.977130528038201e-06, + "loss": 0.679, + "step": 3224 + }, + { + "epoch": 0.48, + "grad_norm": 1.619705467286738, + "learning_rate": 1.9771099807355383e-06, + "loss": 0.6953, + "step": 3225 + }, + { + "epoch": 0.48, + "grad_norm": 3.3140175122388698, + "learning_rate": 1.9770894243134224e-06, + "loss": 0.6823, + "step": 3226 + }, + { + "epoch": 0.48, + "grad_norm": 3.6185197424751987, + "learning_rate": 1.9770688587720443e-06, + "loss": 0.6895, + "step": 3227 + }, + { + "epoch": 0.48, + "grad_norm": 5.439482805233935, + "learning_rate": 1.977048284111596e-06, + "loss": 0.6882, + "step": 3228 + }, + { + "epoch": 0.48, + "grad_norm": 2.947458979317542, + "learning_rate": 1.9770277003322694e-06, + "loss": 0.6777, + "step": 3229 + }, + { + "epoch": 0.48, + "grad_norm": 5.1090149011045956, + "learning_rate": 1.977007107434257e-06, + "loss": 0.6862, + "step": 3230 + }, + { + "epoch": 0.48, + "grad_norm": 2.450512685268132, + "learning_rate": 1.976986505417751e-06, + "loss": 0.7018, + "step": 3231 + }, + { + "epoch": 0.48, + "grad_norm": 7.029601460282169, + "learning_rate": 1.9769658942829438e-06, + "loss": 0.6901, + "step": 3232 + }, + { + "epoch": 0.48, + "grad_norm": 3.410877023442343, + "learning_rate": 1.976945274030027e-06, + "loss": 0.6797, + "step": 3233 + }, + { + "epoch": 0.48, + "grad_norm": 6.5418604978458195, + "learning_rate": 1.976924644659194e-06, + "loss": 0.6797, + "step": 3234 + }, + { + "epoch": 0.48, + "grad_norm": 3.7875968193007177, + "learning_rate": 1.976904006170637e-06, + "loss": 0.681, + "step": 3235 + }, + { + "epoch": 0.48, + "grad_norm": 0.6777601504351839, + "learning_rate": 1.976883358564548e-06, + "loss": 0.6868, + "step": 3236 + }, + { + "epoch": 0.48, + "grad_norm": 5.591694628528006, + "learning_rate": 1.9768627018411204e-06, + "loss": 0.7168, + "step": 3237 + }, + { + "epoch": 0.48, + "grad_norm": 0.917307346225396, + "learning_rate": 1.976842036000547e-06, + "loss": 0.668, + "step": 3238 + }, + { + "epoch": 0.48, + "grad_norm": 4.6262652171877425, + "learning_rate": 1.9768213610430205e-06, + "loss": 0.6823, + "step": 3239 + }, + { + "epoch": 0.48, + "grad_norm": 11.2222881570338, + "learning_rate": 1.9768006769687336e-06, + "loss": 0.7038, + "step": 3240 + }, + { + "epoch": 0.48, + "grad_norm": 4.815212508878316, + "learning_rate": 1.97677998377788e-06, + "loss": 0.6895, + "step": 3241 + }, + { + "epoch": 0.48, + "grad_norm": 0.7839076563718684, + "learning_rate": 1.976759281470652e-06, + "loss": 0.6921, + "step": 3242 + }, + { + "epoch": 0.48, + "grad_norm": 4.806298260905028, + "learning_rate": 1.9767385700472433e-06, + "loss": 0.6979, + "step": 3243 + }, + { + "epoch": 0.48, + "grad_norm": 4.829183429487906, + "learning_rate": 1.9767178495078474e-06, + "loss": 0.6868, + "step": 3244 + }, + { + "epoch": 0.48, + "grad_norm": 2.026530236427871, + "learning_rate": 1.976697119852657e-06, + "loss": 0.6862, + "step": 3245 + }, + { + "epoch": 0.48, + "grad_norm": 2.200221609644321, + "learning_rate": 1.976676381081867e-06, + "loss": 0.6888, + "step": 3246 + }, + { + "epoch": 0.48, + "grad_norm": 2.1458053295661967, + "learning_rate": 1.976655633195669e-06, + "loss": 0.6758, + "step": 3247 + }, + { + "epoch": 0.48, + "grad_norm": 6.205258485067019, + "learning_rate": 1.976634876194258e-06, + "loss": 0.707, + "step": 3248 + }, + { + "epoch": 0.48, + "grad_norm": 2.091915619212243, + "learning_rate": 1.9766141100778267e-06, + "loss": 0.6895, + "step": 3249 + }, + { + "epoch": 0.48, + "grad_norm": 9.31549808298204, + "learning_rate": 1.97659333484657e-06, + "loss": 0.6927, + "step": 3250 + }, + { + "epoch": 0.48, + "grad_norm": 3.124659388040409, + "learning_rate": 1.9765725505006816e-06, + "loss": 0.6934, + "step": 3251 + }, + { + "epoch": 0.49, + "grad_norm": 6.797567308433581, + "learning_rate": 1.976551757040355e-06, + "loss": 0.6973, + "step": 3252 + }, + { + "epoch": 0.49, + "grad_norm": 3.9509824216691043, + "learning_rate": 1.9765309544657846e-06, + "loss": 0.6914, + "step": 3253 + }, + { + "epoch": 0.49, + "grad_norm": 5.99546773219133, + "learning_rate": 1.9765101427771644e-06, + "loss": 0.679, + "step": 3254 + }, + { + "epoch": 0.49, + "grad_norm": 6.962684914037917, + "learning_rate": 1.9764893219746886e-06, + "loss": 0.6712, + "step": 3255 + }, + { + "epoch": 0.49, + "grad_norm": 2.029995132521225, + "learning_rate": 1.9764684920585517e-06, + "loss": 0.6816, + "step": 3256 + }, + { + "epoch": 0.49, + "grad_norm": 2.1625626853613817, + "learning_rate": 1.9764476530289476e-06, + "loss": 0.6654, + "step": 3257 + }, + { + "epoch": 0.49, + "grad_norm": 5.52859377110642, + "learning_rate": 1.9764268048860715e-06, + "loss": 0.7005, + "step": 3258 + }, + { + "epoch": 0.49, + "grad_norm": 2.650758810665243, + "learning_rate": 1.976405947630118e-06, + "loss": 0.6829, + "step": 3259 + }, + { + "epoch": 0.49, + "grad_norm": 2.5638487598684265, + "learning_rate": 1.976385081261281e-06, + "loss": 0.6784, + "step": 3260 + }, + { + "epoch": 0.49, + "grad_norm": 11.382418692174264, + "learning_rate": 1.976364205779756e-06, + "loss": 0.7038, + "step": 3261 + }, + { + "epoch": 0.49, + "grad_norm": 3.6718842473538005, + "learning_rate": 1.9763433211857373e-06, + "loss": 0.6979, + "step": 3262 + }, + { + "epoch": 0.49, + "grad_norm": 5.545840614315828, + "learning_rate": 1.97632242747942e-06, + "loss": 0.6999, + "step": 3263 + }, + { + "epoch": 0.49, + "grad_norm": 2.855623247258206, + "learning_rate": 1.976301524660999e-06, + "loss": 0.6797, + "step": 3264 + }, + { + "epoch": 0.49, + "grad_norm": 3.145725021894822, + "learning_rate": 1.97628061273067e-06, + "loss": 0.6895, + "step": 3265 + }, + { + "epoch": 0.49, + "grad_norm": 9.037232270813643, + "learning_rate": 1.9762596916886272e-06, + "loss": 0.6908, + "step": 3266 + }, + { + "epoch": 0.49, + "grad_norm": 3.8814046269320985, + "learning_rate": 1.976238761535067e-06, + "loss": 0.6849, + "step": 3267 + }, + { + "epoch": 0.49, + "grad_norm": 9.669737067370233, + "learning_rate": 1.9762178222701835e-06, + "loss": 0.6953, + "step": 3268 + }, + { + "epoch": 0.49, + "grad_norm": 7.520963169935468, + "learning_rate": 1.976196873894173e-06, + "loss": 0.6882, + "step": 3269 + }, + { + "epoch": 0.49, + "grad_norm": 1.7703623664838655, + "learning_rate": 1.976175916407231e-06, + "loss": 0.6855, + "step": 3270 + }, + { + "epoch": 0.49, + "grad_norm": 1.7264869124374673, + "learning_rate": 1.9761549498095524e-06, + "loss": 0.7005, + "step": 3271 + }, + { + "epoch": 0.49, + "grad_norm": 6.176167628930603, + "learning_rate": 1.9761339741013337e-06, + "loss": 0.6608, + "step": 3272 + }, + { + "epoch": 0.49, + "grad_norm": 2.7902274772194886, + "learning_rate": 1.97611298928277e-06, + "loss": 0.6855, + "step": 3273 + }, + { + "epoch": 0.49, + "grad_norm": 3.3939808733735704, + "learning_rate": 1.9760919953540575e-06, + "loss": 0.6842, + "step": 3274 + }, + { + "epoch": 0.49, + "grad_norm": 4.263406925910313, + "learning_rate": 1.976070992315392e-06, + "loss": 0.6829, + "step": 3275 + }, + { + "epoch": 0.49, + "grad_norm": 6.227676742273933, + "learning_rate": 1.97604998016697e-06, + "loss": 0.6934, + "step": 3276 + }, + { + "epoch": 0.49, + "grad_norm": 6.083990229748188, + "learning_rate": 1.9760289589089867e-06, + "loss": 0.6771, + "step": 3277 + }, + { + "epoch": 0.49, + "grad_norm": 1.1855306305078672, + "learning_rate": 1.9760079285416392e-06, + "loss": 0.679, + "step": 3278 + }, + { + "epoch": 0.49, + "grad_norm": 2.109721992684601, + "learning_rate": 1.975986889065123e-06, + "loss": 0.6751, + "step": 3279 + }, + { + "epoch": 0.49, + "grad_norm": 0.7736510804056428, + "learning_rate": 1.975965840479635e-06, + "loss": 0.6992, + "step": 3280 + }, + { + "epoch": 0.49, + "grad_norm": 1.1705813823116469, + "learning_rate": 1.9759447827853715e-06, + "loss": 0.6999, + "step": 3281 + }, + { + "epoch": 0.49, + "grad_norm": 0.6512400038837048, + "learning_rate": 1.9759237159825296e-06, + "loss": 0.6868, + "step": 3282 + }, + { + "epoch": 0.49, + "grad_norm": 0.9779481182638695, + "learning_rate": 1.9759026400713048e-06, + "loss": 0.6855, + "step": 3283 + }, + { + "epoch": 0.49, + "grad_norm": 4.726258935864044, + "learning_rate": 1.9758815550518944e-06, + "loss": 0.6888, + "step": 3284 + }, + { + "epoch": 0.49, + "grad_norm": 9.97527462791273, + "learning_rate": 1.975860460924495e-06, + "loss": 0.6986, + "step": 3285 + }, + { + "epoch": 0.49, + "grad_norm": 8.295662860823034, + "learning_rate": 1.975839357689304e-06, + "loss": 0.6777, + "step": 3286 + }, + { + "epoch": 0.49, + "grad_norm": 9.876417632005305, + "learning_rate": 1.9758182453465176e-06, + "loss": 0.694, + "step": 3287 + }, + { + "epoch": 0.49, + "grad_norm": 1.364861267238436, + "learning_rate": 1.9757971238963337e-06, + "loss": 0.6764, + "step": 3288 + }, + { + "epoch": 0.49, + "grad_norm": 6.75848089779755, + "learning_rate": 1.9757759933389485e-06, + "loss": 0.6901, + "step": 3289 + }, + { + "epoch": 0.49, + "grad_norm": 0.6878947564477006, + "learning_rate": 1.9757548536745595e-06, + "loss": 0.6908, + "step": 3290 + }, + { + "epoch": 0.49, + "grad_norm": 1.155391841072309, + "learning_rate": 1.9757337049033647e-06, + "loss": 0.6901, + "step": 3291 + }, + { + "epoch": 0.49, + "grad_norm": 0.629703296849899, + "learning_rate": 1.9757125470255608e-06, + "loss": 0.6764, + "step": 3292 + }, + { + "epoch": 0.49, + "grad_norm": 1.888671761701647, + "learning_rate": 1.9756913800413454e-06, + "loss": 0.6797, + "step": 3293 + }, + { + "epoch": 0.49, + "grad_norm": 4.240432642882757, + "learning_rate": 1.975670203950916e-06, + "loss": 0.6947, + "step": 3294 + }, + { + "epoch": 0.49, + "grad_norm": 4.894565450735684, + "learning_rate": 1.97564901875447e-06, + "loss": 0.6823, + "step": 3295 + }, + { + "epoch": 0.49, + "grad_norm": 6.266176515557385, + "learning_rate": 1.975627824452206e-06, + "loss": 0.6868, + "step": 3296 + }, + { + "epoch": 0.49, + "grad_norm": 7.177237161868175, + "learning_rate": 1.9756066210443205e-06, + "loss": 0.6764, + "step": 3297 + }, + { + "epoch": 0.49, + "grad_norm": 3.5380795626753176, + "learning_rate": 1.9755854085310128e-06, + "loss": 0.6921, + "step": 3298 + }, + { + "epoch": 0.49, + "grad_norm": 2.639159906494914, + "learning_rate": 1.97556418691248e-06, + "loss": 0.6914, + "step": 3299 + }, + { + "epoch": 0.49, + "grad_norm": 4.674913730153612, + "learning_rate": 1.9755429561889205e-06, + "loss": 0.6758, + "step": 3300 + }, + { + "epoch": 0.49, + "grad_norm": 2.258477485617365, + "learning_rate": 1.9755217163605323e-06, + "loss": 0.7057, + "step": 3301 + }, + { + "epoch": 0.49, + "grad_norm": 2.8488720246807766, + "learning_rate": 1.9755004674275133e-06, + "loss": 0.7018, + "step": 3302 + }, + { + "epoch": 0.49, + "grad_norm": 2.0653352010691512, + "learning_rate": 1.9754792093900623e-06, + "loss": 0.6803, + "step": 3303 + }, + { + "epoch": 0.49, + "grad_norm": 2.8957337320031256, + "learning_rate": 1.9754579422483777e-06, + "loss": 0.6895, + "step": 3304 + }, + { + "epoch": 0.49, + "grad_norm": 1.6677429485528539, + "learning_rate": 1.975436666002658e-06, + "loss": 0.6634, + "step": 3305 + }, + { + "epoch": 0.49, + "grad_norm": 2.053992740815659, + "learning_rate": 1.9754153806531014e-06, + "loss": 0.6784, + "step": 3306 + }, + { + "epoch": 0.49, + "grad_norm": 8.897608083293544, + "learning_rate": 1.9753940861999067e-06, + "loss": 0.6901, + "step": 3307 + }, + { + "epoch": 0.49, + "grad_norm": 5.950826675011799, + "learning_rate": 1.9753727826432727e-06, + "loss": 0.6829, + "step": 3308 + }, + { + "epoch": 0.49, + "grad_norm": 4.596036642857945, + "learning_rate": 1.9753514699833987e-06, + "loss": 0.6914, + "step": 3309 + }, + { + "epoch": 0.49, + "grad_norm": 0.6179665552469439, + "learning_rate": 1.9753301482204827e-06, + "loss": 0.6895, + "step": 3310 + }, + { + "epoch": 0.49, + "grad_norm": 6.411225204944122, + "learning_rate": 1.9753088173547243e-06, + "loss": 0.6862, + "step": 3311 + }, + { + "epoch": 0.49, + "grad_norm": 8.387351195051, + "learning_rate": 1.975287477386323e-06, + "loss": 0.6764, + "step": 3312 + }, + { + "epoch": 0.49, + "grad_norm": 4.157387857010507, + "learning_rate": 1.975266128315477e-06, + "loss": 0.6673, + "step": 3313 + }, + { + "epoch": 0.49, + "grad_norm": 2.6578078581880082, + "learning_rate": 1.975244770142386e-06, + "loss": 0.6908, + "step": 3314 + }, + { + "epoch": 0.49, + "grad_norm": 3.8998208076332337, + "learning_rate": 1.975223402867249e-06, + "loss": 0.6842, + "step": 3315 + }, + { + "epoch": 0.49, + "grad_norm": 2.548777635914467, + "learning_rate": 1.975202026490266e-06, + "loss": 0.6836, + "step": 3316 + }, + { + "epoch": 0.49, + "grad_norm": 7.325521154278511, + "learning_rate": 1.9751806410116363e-06, + "loss": 0.6875, + "step": 3317 + }, + { + "epoch": 0.49, + "grad_norm": 4.512749568100071, + "learning_rate": 1.975159246431559e-06, + "loss": 0.6908, + "step": 3318 + }, + { + "epoch": 0.5, + "grad_norm": 0.6883724593412025, + "learning_rate": 1.975137842750235e-06, + "loss": 0.6706, + "step": 3319 + }, + { + "epoch": 0.5, + "grad_norm": 1.8655522182117052, + "learning_rate": 1.9751164299678628e-06, + "loss": 0.6673, + "step": 3320 + }, + { + "epoch": 0.5, + "grad_norm": 7.53756834925391, + "learning_rate": 1.9750950080846428e-06, + "loss": 0.6927, + "step": 3321 + }, + { + "epoch": 0.5, + "grad_norm": 2.7862594726318175, + "learning_rate": 1.9750735771007746e-06, + "loss": 0.6777, + "step": 3322 + }, + { + "epoch": 0.5, + "grad_norm": 0.8566338713907861, + "learning_rate": 1.975052137016459e-06, + "loss": 0.6901, + "step": 3323 + }, + { + "epoch": 0.5, + "grad_norm": 5.567276370376521, + "learning_rate": 1.9750306878318954e-06, + "loss": 0.666, + "step": 3324 + }, + { + "epoch": 0.5, + "grad_norm": 2.6966696367296312, + "learning_rate": 1.9750092295472837e-06, + "loss": 0.6725, + "step": 3325 + }, + { + "epoch": 0.5, + "grad_norm": 4.195573655291586, + "learning_rate": 1.974987762162825e-06, + "loss": 0.6901, + "step": 3326 + }, + { + "epoch": 0.5, + "grad_norm": 5.623480590120584, + "learning_rate": 1.9749662856787196e-06, + "loss": 0.6999, + "step": 3327 + }, + { + "epoch": 0.5, + "grad_norm": 4.808847223700556, + "learning_rate": 1.9749448000951674e-06, + "loss": 0.7005, + "step": 3328 + }, + { + "epoch": 0.5, + "grad_norm": 4.767234438183642, + "learning_rate": 1.9749233054123688e-06, + "loss": 0.6725, + "step": 3329 + }, + { + "epoch": 0.5, + "grad_norm": 1.105522415312454, + "learning_rate": 1.974901801630525e-06, + "loss": 0.7064, + "step": 3330 + }, + { + "epoch": 0.5, + "grad_norm": 8.571682383524815, + "learning_rate": 1.9748802887498368e-06, + "loss": 0.7272, + "step": 3331 + }, + { + "epoch": 0.5, + "grad_norm": 1.3803970632002722, + "learning_rate": 1.974858766770504e-06, + "loss": 0.6842, + "step": 3332 + }, + { + "epoch": 0.5, + "grad_norm": 3.2087318687352293, + "learning_rate": 1.9748372356927287e-06, + "loss": 0.6901, + "step": 3333 + }, + { + "epoch": 0.5, + "grad_norm": 3.0998794070113997, + "learning_rate": 1.974815695516711e-06, + "loss": 0.6862, + "step": 3334 + }, + { + "epoch": 0.5, + "grad_norm": 1.6727383526237685, + "learning_rate": 1.9747941462426524e-06, + "loss": 0.6751, + "step": 3335 + }, + { + "epoch": 0.5, + "grad_norm": 0.49524900299519936, + "learning_rate": 1.9747725878707536e-06, + "loss": 0.6875, + "step": 3336 + }, + { + "epoch": 0.5, + "grad_norm": 2.9258425313195144, + "learning_rate": 1.974751020401216e-06, + "loss": 0.6829, + "step": 3337 + }, + { + "epoch": 0.5, + "grad_norm": 5.155417326644257, + "learning_rate": 1.974729443834241e-06, + "loss": 0.6816, + "step": 3338 + }, + { + "epoch": 0.5, + "grad_norm": 7.620606880131691, + "learning_rate": 1.9747078581700303e-06, + "loss": 0.681, + "step": 3339 + }, + { + "epoch": 0.5, + "grad_norm": 4.909173677120375, + "learning_rate": 1.9746862634087846e-06, + "loss": 0.6901, + "step": 3340 + }, + { + "epoch": 0.5, + "grad_norm": 3.527612516959471, + "learning_rate": 1.974664659550706e-06, + "loss": 0.6823, + "step": 3341 + }, + { + "epoch": 0.5, + "grad_norm": 5.174780775016768, + "learning_rate": 1.9746430465959955e-06, + "loss": 0.6868, + "step": 3342 + }, + { + "epoch": 0.5, + "grad_norm": 1.6190434973703973, + "learning_rate": 1.9746214245448555e-06, + "loss": 0.6895, + "step": 3343 + }, + { + "epoch": 0.5, + "grad_norm": 1.5660529815352286, + "learning_rate": 1.9745997933974877e-06, + "loss": 0.6882, + "step": 3344 + }, + { + "epoch": 0.5, + "grad_norm": 0.4855282996370439, + "learning_rate": 1.9745781531540935e-06, + "loss": 0.679, + "step": 3345 + }, + { + "epoch": 0.5, + "grad_norm": 1.67928680699654, + "learning_rate": 1.9745565038148753e-06, + "loss": 0.6745, + "step": 3346 + }, + { + "epoch": 0.5, + "grad_norm": 1.4977537921944974, + "learning_rate": 1.9745348453800353e-06, + "loss": 0.6875, + "step": 3347 + }, + { + "epoch": 0.5, + "grad_norm": 6.903304482088259, + "learning_rate": 1.9745131778497754e-06, + "loss": 0.696, + "step": 3348 + }, + { + "epoch": 0.5, + "grad_norm": 4.968913842109329, + "learning_rate": 1.9744915012242976e-06, + "loss": 0.6882, + "step": 3349 + }, + { + "epoch": 0.5, + "grad_norm": 0.6851488706112824, + "learning_rate": 1.9744698155038044e-06, + "loss": 0.6667, + "step": 3350 + }, + { + "epoch": 0.5, + "grad_norm": 6.365047811938867, + "learning_rate": 1.9744481206884986e-06, + "loss": 0.6816, + "step": 3351 + }, + { + "epoch": 0.5, + "grad_norm": 2.0738252816291767, + "learning_rate": 1.974426416778582e-06, + "loss": 0.7005, + "step": 3352 + }, + { + "epoch": 0.5, + "grad_norm": 3.6735569613085954, + "learning_rate": 1.974404703774258e-06, + "loss": 0.679, + "step": 3353 + }, + { + "epoch": 0.5, + "grad_norm": 2.2708622181928604, + "learning_rate": 1.974382981675728e-06, + "loss": 0.6934, + "step": 3354 + }, + { + "epoch": 0.5, + "grad_norm": 2.4703755619621663, + "learning_rate": 1.974361250483196e-06, + "loss": 0.6875, + "step": 3355 + }, + { + "epoch": 0.5, + "grad_norm": 5.370989447158303, + "learning_rate": 1.974339510196864e-06, + "loss": 0.6751, + "step": 3356 + }, + { + "epoch": 0.5, + "grad_norm": 0.6357387647437704, + "learning_rate": 1.974317760816935e-06, + "loss": 0.6764, + "step": 3357 + }, + { + "epoch": 0.5, + "grad_norm": 1.654476052073029, + "learning_rate": 1.9742960023436125e-06, + "loss": 0.7064, + "step": 3358 + }, + { + "epoch": 0.5, + "grad_norm": 0.8265202147657396, + "learning_rate": 1.9742742347770987e-06, + "loss": 0.6921, + "step": 3359 + }, + { + "epoch": 0.5, + "grad_norm": 3.7733901390983804, + "learning_rate": 1.974252458117598e-06, + "loss": 0.7129, + "step": 3360 + }, + { + "epoch": 0.5, + "grad_norm": 0.6108400596358946, + "learning_rate": 1.9742306723653126e-06, + "loss": 0.6758, + "step": 3361 + }, + { + "epoch": 0.5, + "grad_norm": 3.0373548157480106, + "learning_rate": 1.9742088775204463e-06, + "loss": 0.6777, + "step": 3362 + }, + { + "epoch": 0.5, + "grad_norm": 9.603030146372593, + "learning_rate": 1.9741870735832026e-06, + "loss": 0.7122, + "step": 3363 + }, + { + "epoch": 0.5, + "grad_norm": 2.7803452407166143, + "learning_rate": 1.9741652605537845e-06, + "loss": 0.6634, + "step": 3364 + }, + { + "epoch": 0.5, + "grad_norm": 8.260100907694952, + "learning_rate": 1.974143438432396e-06, + "loss": 0.7018, + "step": 3365 + }, + { + "epoch": 0.5, + "grad_norm": 7.013374249200411, + "learning_rate": 1.9741216072192405e-06, + "loss": 0.6921, + "step": 3366 + }, + { + "epoch": 0.5, + "grad_norm": 2.3688172864286163, + "learning_rate": 1.9740997669145216e-06, + "loss": 0.6751, + "step": 3367 + }, + { + "epoch": 0.5, + "grad_norm": 3.4821060446714345, + "learning_rate": 1.974077917518444e-06, + "loss": 0.6777, + "step": 3368 + }, + { + "epoch": 0.5, + "grad_norm": 1.3904326133410947, + "learning_rate": 1.974056059031211e-06, + "loss": 0.7031, + "step": 3369 + }, + { + "epoch": 0.5, + "grad_norm": 0.40134713973787095, + "learning_rate": 1.974034191453027e-06, + "loss": 0.6921, + "step": 3370 + }, + { + "epoch": 0.5, + "grad_norm": 9.509672299770044, + "learning_rate": 1.9740123147840946e-06, + "loss": 0.709, + "step": 3371 + }, + { + "epoch": 0.5, + "grad_norm": 1.4500144497196439, + "learning_rate": 1.9739904290246203e-06, + "loss": 0.6973, + "step": 3372 + }, + { + "epoch": 0.5, + "grad_norm": 1.9298564443761266, + "learning_rate": 1.9739685341748065e-06, + "loss": 0.6868, + "step": 3373 + }, + { + "epoch": 0.5, + "grad_norm": 6.7345621254960975, + "learning_rate": 1.9739466302348583e-06, + "loss": 0.6888, + "step": 3374 + }, + { + "epoch": 0.5, + "grad_norm": 2.6911850873565273, + "learning_rate": 1.9739247172049803e-06, + "loss": 0.6725, + "step": 3375 + }, + { + "epoch": 0.5, + "grad_norm": 2.416030717942761, + "learning_rate": 1.9739027950853766e-06, + "loss": 0.6908, + "step": 3376 + }, + { + "epoch": 0.5, + "grad_norm": 4.336010569297612, + "learning_rate": 1.973880863876252e-06, + "loss": 0.6862, + "step": 3377 + }, + { + "epoch": 0.5, + "grad_norm": 4.063957410018514, + "learning_rate": 1.973858923577811e-06, + "loss": 0.6868, + "step": 3378 + }, + { + "epoch": 0.5, + "grad_norm": 2.4177671234467364, + "learning_rate": 1.973836974190259e-06, + "loss": 0.6934, + "step": 3379 + }, + { + "epoch": 0.5, + "grad_norm": 0.8060641778416959, + "learning_rate": 1.9738150157138e-06, + "loss": 0.6816, + "step": 3380 + }, + { + "epoch": 0.5, + "grad_norm": 3.962816767343177, + "learning_rate": 1.9737930481486397e-06, + "loss": 0.679, + "step": 3381 + }, + { + "epoch": 0.5, + "grad_norm": 1.4004184491637006, + "learning_rate": 1.9737710714949826e-06, + "loss": 0.6849, + "step": 3382 + }, + { + "epoch": 0.5, + "grad_norm": 0.8550269831682967, + "learning_rate": 1.9737490857530342e-06, + "loss": 0.6829, + "step": 3383 + }, + { + "epoch": 0.5, + "grad_norm": 1.7576637563188244, + "learning_rate": 1.973727090922999e-06, + "loss": 0.6771, + "step": 3384 + }, + { + "epoch": 0.5, + "grad_norm": 2.6499516543530457, + "learning_rate": 1.9737050870050832e-06, + "loss": 0.6712, + "step": 3385 + }, + { + "epoch": 0.5, + "grad_norm": 4.01776548098043, + "learning_rate": 1.973683073999491e-06, + "loss": 0.6673, + "step": 3386 + }, + { + "epoch": 0.51, + "grad_norm": 6.210598318876258, + "learning_rate": 1.9736610519064296e-06, + "loss": 0.7012, + "step": 3387 + }, + { + "epoch": 0.51, + "grad_norm": 2.42208949176125, + "learning_rate": 1.973639020726103e-06, + "loss": 0.694, + "step": 3388 + }, + { + "epoch": 0.51, + "grad_norm": 1.4191383287396413, + "learning_rate": 1.973616980458717e-06, + "loss": 0.6758, + "step": 3389 + }, + { + "epoch": 0.51, + "grad_norm": 9.340346460109826, + "learning_rate": 1.9735949311044777e-06, + "loss": 0.7038, + "step": 3390 + }, + { + "epoch": 0.51, + "grad_norm": 7.179790137653763, + "learning_rate": 1.9735728726635913e-06, + "loss": 0.7005, + "step": 3391 + }, + { + "epoch": 0.51, + "grad_norm": 5.5593125403531, + "learning_rate": 1.973550805136263e-06, + "loss": 0.6758, + "step": 3392 + }, + { + "epoch": 0.51, + "grad_norm": 4.639485195211408, + "learning_rate": 1.9735287285226984e-06, + "loss": 0.6816, + "step": 3393 + }, + { + "epoch": 0.51, + "grad_norm": 4.163846421354855, + "learning_rate": 1.9735066428231047e-06, + "loss": 0.6673, + "step": 3394 + }, + { + "epoch": 0.51, + "grad_norm": 4.119726741517224, + "learning_rate": 1.973484548037687e-06, + "loss": 0.6823, + "step": 3395 + }, + { + "epoch": 0.51, + "grad_norm": 1.6744236532698147, + "learning_rate": 1.9734624441666524e-06, + "loss": 0.6986, + "step": 3396 + }, + { + "epoch": 0.51, + "grad_norm": 3.4433849237001746, + "learning_rate": 1.9734403312102064e-06, + "loss": 0.7077, + "step": 3397 + }, + { + "epoch": 0.51, + "grad_norm": 10.898067417143446, + "learning_rate": 1.9734182091685557e-06, + "loss": 0.7057, + "step": 3398 + }, + { + "epoch": 0.51, + "grad_norm": 9.276185537706004, + "learning_rate": 1.973396078041907e-06, + "loss": 0.7266, + "step": 3399 + }, + { + "epoch": 0.51, + "grad_norm": 1.5939320468611955, + "learning_rate": 1.973373937830466e-06, + "loss": 0.6921, + "step": 3400 + }, + { + "epoch": 0.51, + "grad_norm": 2.1719486102175383, + "learning_rate": 1.9733517885344406e-06, + "loss": 0.6725, + "step": 3401 + }, + { + "epoch": 0.51, + "grad_norm": 4.599858942357769, + "learning_rate": 1.9733296301540367e-06, + "loss": 0.6758, + "step": 3402 + }, + { + "epoch": 0.51, + "grad_norm": 6.491464837209866, + "learning_rate": 1.973307462689461e-06, + "loss": 0.6777, + "step": 3403 + }, + { + "epoch": 0.51, + "grad_norm": 5.259275704803548, + "learning_rate": 1.9732852861409213e-06, + "loss": 0.7064, + "step": 3404 + }, + { + "epoch": 0.51, + "grad_norm": 1.4881136313926255, + "learning_rate": 1.973263100508623e-06, + "loss": 0.6901, + "step": 3405 + }, + { + "epoch": 0.51, + "grad_norm": 1.5672711098791419, + "learning_rate": 1.973240905792775e-06, + "loss": 0.6979, + "step": 3406 + }, + { + "epoch": 0.51, + "grad_norm": 0.7358751663539989, + "learning_rate": 1.973218701993583e-06, + "loss": 0.6615, + "step": 3407 + }, + { + "epoch": 0.51, + "grad_norm": 4.176678602909893, + "learning_rate": 1.9731964891112555e-06, + "loss": 0.6882, + "step": 3408 + }, + { + "epoch": 0.51, + "grad_norm": 3.3711094134759825, + "learning_rate": 1.9731742671459984e-06, + "loss": 0.6875, + "step": 3409 + }, + { + "epoch": 0.51, + "grad_norm": 4.975793610190349, + "learning_rate": 1.9731520360980204e-06, + "loss": 0.6849, + "step": 3410 + }, + { + "epoch": 0.51, + "grad_norm": 0.5650723558685778, + "learning_rate": 1.973129795967528e-06, + "loss": 0.6699, + "step": 3411 + }, + { + "epoch": 0.51, + "grad_norm": 1.6757287760872324, + "learning_rate": 1.973107546754729e-06, + "loss": 0.6927, + "step": 3412 + }, + { + "epoch": 0.51, + "grad_norm": 8.765419865062475, + "learning_rate": 1.973085288459831e-06, + "loss": 0.6855, + "step": 3413 + }, + { + "epoch": 0.51, + "grad_norm": 2.0286961882341252, + "learning_rate": 1.973063021083043e-06, + "loss": 0.6777, + "step": 3414 + }, + { + "epoch": 0.51, + "grad_norm": 3.7802510067681516, + "learning_rate": 1.9730407446245707e-06, + "loss": 0.6901, + "step": 3415 + }, + { + "epoch": 0.51, + "grad_norm": 6.461290771343401, + "learning_rate": 1.973018459084624e-06, + "loss": 0.6979, + "step": 3416 + }, + { + "epoch": 0.51, + "grad_norm": 2.627316982960048, + "learning_rate": 1.9729961644634092e-06, + "loss": 0.6901, + "step": 3417 + }, + { + "epoch": 0.51, + "grad_norm": 1.4872305703821873, + "learning_rate": 1.9729738607611354e-06, + "loss": 0.6823, + "step": 3418 + }, + { + "epoch": 0.51, + "grad_norm": 7.432848250033093, + "learning_rate": 1.9729515479780106e-06, + "loss": 0.6973, + "step": 3419 + }, + { + "epoch": 0.51, + "grad_norm": 4.673441857016266, + "learning_rate": 1.972929226114243e-06, + "loss": 0.6751, + "step": 3420 + }, + { + "epoch": 0.51, + "grad_norm": 6.6723192945413174, + "learning_rate": 1.9729068951700405e-06, + "loss": 0.7005, + "step": 3421 + }, + { + "epoch": 0.51, + "grad_norm": 5.84766082380395, + "learning_rate": 1.9728845551456123e-06, + "loss": 0.6745, + "step": 3422 + }, + { + "epoch": 0.51, + "grad_norm": 0.6209749611232093, + "learning_rate": 1.9728622060411663e-06, + "loss": 0.6693, + "step": 3423 + }, + { + "epoch": 0.51, + "grad_norm": 2.2475647034978725, + "learning_rate": 1.9728398478569113e-06, + "loss": 0.6797, + "step": 3424 + }, + { + "epoch": 0.51, + "grad_norm": 5.373444561172873, + "learning_rate": 1.972817480593056e-06, + "loss": 0.6784, + "step": 3425 + }, + { + "epoch": 0.51, + "grad_norm": 8.059216054167099, + "learning_rate": 1.972795104249809e-06, + "loss": 0.6797, + "step": 3426 + }, + { + "epoch": 0.51, + "grad_norm": 0.9047732841123061, + "learning_rate": 1.9727727188273796e-06, + "loss": 0.6745, + "step": 3427 + }, + { + "epoch": 0.51, + "grad_norm": 1.5668799930909005, + "learning_rate": 1.972750324325976e-06, + "loss": 0.7031, + "step": 3428 + }, + { + "epoch": 0.51, + "grad_norm": 1.7197317793902867, + "learning_rate": 1.972727920745808e-06, + "loss": 0.6621, + "step": 3429 + }, + { + "epoch": 0.51, + "grad_norm": 3.680379040128478, + "learning_rate": 1.972705508087084e-06, + "loss": 0.7005, + "step": 3430 + }, + { + "epoch": 0.51, + "grad_norm": 5.412655686590141, + "learning_rate": 1.972683086350014e-06, + "loss": 0.6706, + "step": 3431 + }, + { + "epoch": 0.51, + "grad_norm": 2.055456646537599, + "learning_rate": 1.9726606555348063e-06, + "loss": 0.6667, + "step": 3432 + }, + { + "epoch": 0.51, + "grad_norm": 1.6866615264787044, + "learning_rate": 1.9726382156416704e-06, + "loss": 0.7018, + "step": 3433 + }, + { + "epoch": 0.51, + "grad_norm": 11.931593070633431, + "learning_rate": 1.9726157666708164e-06, + "loss": 0.6999, + "step": 3434 + }, + { + "epoch": 0.51, + "grad_norm": 6.182346886249022, + "learning_rate": 1.9725933086224534e-06, + "loss": 0.6986, + "step": 3435 + }, + { + "epoch": 0.51, + "grad_norm": 2.348344024201573, + "learning_rate": 1.972570841496791e-06, + "loss": 0.6686, + "step": 3436 + }, + { + "epoch": 0.51, + "grad_norm": 1.5472192061395664, + "learning_rate": 1.972548365294039e-06, + "loss": 0.679, + "step": 3437 + }, + { + "epoch": 0.51, + "grad_norm": 3.151894273438815, + "learning_rate": 1.9725258800144072e-06, + "loss": 0.6862, + "step": 3438 + }, + { + "epoch": 0.51, + "grad_norm": 7.154757413874521, + "learning_rate": 1.972503385658105e-06, + "loss": 0.6693, + "step": 3439 + }, + { + "epoch": 0.51, + "grad_norm": 4.3339650556923335, + "learning_rate": 1.972480882225343e-06, + "loss": 0.6979, + "step": 3440 + }, + { + "epoch": 0.51, + "grad_norm": 5.352015649922371, + "learning_rate": 1.972458369716331e-06, + "loss": 0.6999, + "step": 3441 + }, + { + "epoch": 0.51, + "grad_norm": 0.683042256838884, + "learning_rate": 1.9724358481312782e-06, + "loss": 0.6595, + "step": 3442 + }, + { + "epoch": 0.51, + "grad_norm": 6.146119138132416, + "learning_rate": 1.9724133174703967e-06, + "loss": 0.6673, + "step": 3443 + }, + { + "epoch": 0.51, + "grad_norm": 6.583989486930955, + "learning_rate": 1.9723907777338955e-06, + "loss": 0.653, + "step": 3444 + }, + { + "epoch": 0.51, + "grad_norm": 2.972556836634812, + "learning_rate": 1.9723682289219847e-06, + "loss": 0.6673, + "step": 3445 + }, + { + "epoch": 0.51, + "grad_norm": 5.173897157272226, + "learning_rate": 1.9723456710348752e-06, + "loss": 0.681, + "step": 3446 + }, + { + "epoch": 0.51, + "grad_norm": 2.0394465586600914, + "learning_rate": 1.972323104072778e-06, + "loss": 0.6745, + "step": 3447 + }, + { + "epoch": 0.51, + "grad_norm": 8.036557524533068, + "learning_rate": 1.972300528035903e-06, + "loss": 0.6986, + "step": 3448 + }, + { + "epoch": 0.51, + "grad_norm": 3.174873144487973, + "learning_rate": 1.9722779429244615e-06, + "loss": 0.6823, + "step": 3449 + }, + { + "epoch": 0.51, + "grad_norm": 6.15295608644208, + "learning_rate": 1.9722553487386635e-06, + "loss": 0.7044, + "step": 3450 + }, + { + "epoch": 0.51, + "grad_norm": 3.2314183448091325, + "learning_rate": 1.972232745478721e-06, + "loss": 0.6927, + "step": 3451 + }, + { + "epoch": 0.51, + "grad_norm": 5.9524156776247406, + "learning_rate": 1.972210133144844e-06, + "loss": 0.6803, + "step": 3452 + }, + { + "epoch": 0.51, + "grad_norm": 6.338643200112965, + "learning_rate": 1.9721875117372436e-06, + "loss": 0.7044, + "step": 3453 + }, + { + "epoch": 0.52, + "grad_norm": 1.253169578899131, + "learning_rate": 1.9721648812561312e-06, + "loss": 0.6686, + "step": 3454 + }, + { + "epoch": 0.52, + "grad_norm": 7.570495594635108, + "learning_rate": 1.9721422417017185e-06, + "loss": 0.6875, + "step": 3455 + }, + { + "epoch": 0.52, + "grad_norm": 6.950044866577773, + "learning_rate": 1.972119593074216e-06, + "loss": 0.7194, + "step": 3456 + }, + { + "epoch": 0.52, + "grad_norm": 7.809143305014506, + "learning_rate": 1.972096935373835e-06, + "loss": 0.7012, + "step": 3457 + }, + { + "epoch": 0.52, + "grad_norm": 1.1935977538267588, + "learning_rate": 1.9720742686007877e-06, + "loss": 0.6628, + "step": 3458 + }, + { + "epoch": 0.52, + "grad_norm": 0.9984507083670017, + "learning_rate": 1.9720515927552856e-06, + "loss": 0.6895, + "step": 3459 + }, + { + "epoch": 0.52, + "grad_norm": 8.64769386355576, + "learning_rate": 1.9720289078375397e-06, + "loss": 0.7103, + "step": 3460 + }, + { + "epoch": 0.52, + "grad_norm": 6.96266057180559, + "learning_rate": 1.9720062138477617e-06, + "loss": 0.7031, + "step": 3461 + }, + { + "epoch": 0.52, + "grad_norm": 1.4929795022950665, + "learning_rate": 1.9719835107861643e-06, + "loss": 0.6823, + "step": 3462 + }, + { + "epoch": 0.52, + "grad_norm": 8.082481383542202, + "learning_rate": 1.9719607986529584e-06, + "loss": 0.7038, + "step": 3463 + }, + { + "epoch": 0.52, + "grad_norm": 0.8796348955787175, + "learning_rate": 1.9719380774483566e-06, + "loss": 0.6947, + "step": 3464 + }, + { + "epoch": 0.52, + "grad_norm": 1.237659606778615, + "learning_rate": 1.9719153471725707e-06, + "loss": 0.6849, + "step": 3465 + }, + { + "epoch": 0.52, + "grad_norm": 5.630863879332325, + "learning_rate": 1.971892607825813e-06, + "loss": 0.6641, + "step": 3466 + }, + { + "epoch": 0.52, + "grad_norm": 4.031999373858843, + "learning_rate": 1.9718698594082955e-06, + "loss": 0.6836, + "step": 3467 + }, + { + "epoch": 0.52, + "grad_norm": 0.5623814697033072, + "learning_rate": 1.971847101920231e-06, + "loss": 0.6862, + "step": 3468 + }, + { + "epoch": 0.52, + "grad_norm": 0.41463565902898497, + "learning_rate": 1.9718243353618315e-06, + "loss": 0.6751, + "step": 3469 + }, + { + "epoch": 0.52, + "grad_norm": 0.47233601594434366, + "learning_rate": 1.971801559733309e-06, + "loss": 0.6823, + "step": 3470 + }, + { + "epoch": 0.52, + "grad_norm": 1.5154637996464804, + "learning_rate": 1.971778775034877e-06, + "loss": 0.6816, + "step": 3471 + }, + { + "epoch": 0.52, + "grad_norm": 3.4323571288888672, + "learning_rate": 1.971755981266748e-06, + "loss": 0.694, + "step": 3472 + }, + { + "epoch": 0.52, + "grad_norm": 4.504332895306242, + "learning_rate": 1.971733178429134e-06, + "loss": 0.709, + "step": 3473 + }, + { + "epoch": 0.52, + "grad_norm": 4.302195392780128, + "learning_rate": 1.9717103665222486e-06, + "loss": 0.6947, + "step": 3474 + }, + { + "epoch": 0.52, + "grad_norm": 7.737602488665911, + "learning_rate": 1.971687545546304e-06, + "loss": 0.6908, + "step": 3475 + }, + { + "epoch": 0.52, + "grad_norm": 8.942699006562348, + "learning_rate": 1.971664715501514e-06, + "loss": 0.6855, + "step": 3476 + }, + { + "epoch": 0.52, + "grad_norm": 5.020594557264396, + "learning_rate": 1.971641876388091e-06, + "loss": 0.6836, + "step": 3477 + }, + { + "epoch": 0.52, + "grad_norm": 1.9746718402464425, + "learning_rate": 1.971619028206249e-06, + "loss": 0.6927, + "step": 3478 + }, + { + "epoch": 0.52, + "grad_norm": 3.37789546515368, + "learning_rate": 1.9715961709562004e-06, + "loss": 0.6777, + "step": 3479 + }, + { + "epoch": 0.52, + "grad_norm": 1.5697077598547327, + "learning_rate": 1.971573304638159e-06, + "loss": 0.679, + "step": 3480 + }, + { + "epoch": 0.52, + "grad_norm": 1.4307041771557598, + "learning_rate": 1.9715504292523374e-06, + "loss": 0.6882, + "step": 3481 + }, + { + "epoch": 0.52, + "grad_norm": 0.651042718783252, + "learning_rate": 1.97152754479895e-06, + "loss": 0.6732, + "step": 3482 + }, + { + "epoch": 0.52, + "grad_norm": 5.950060242976906, + "learning_rate": 1.9715046512782102e-06, + "loss": 0.6842, + "step": 3483 + }, + { + "epoch": 0.52, + "grad_norm": 2.909974442637807, + "learning_rate": 1.971481748690332e-06, + "loss": 0.6797, + "step": 3484 + }, + { + "epoch": 0.52, + "grad_norm": 2.12926832986189, + "learning_rate": 1.971458837035528e-06, + "loss": 0.6895, + "step": 3485 + }, + { + "epoch": 0.52, + "grad_norm": 4.3182424592967354, + "learning_rate": 1.971435916314013e-06, + "loss": 0.6842, + "step": 3486 + }, + { + "epoch": 0.52, + "grad_norm": 4.371839102953884, + "learning_rate": 1.971412986526001e-06, + "loss": 0.6986, + "step": 3487 + }, + { + "epoch": 0.52, + "grad_norm": 0.7151416399873288, + "learning_rate": 1.9713900476717055e-06, + "loss": 0.679, + "step": 3488 + }, + { + "epoch": 0.52, + "grad_norm": 3.4126880930906602, + "learning_rate": 1.9713670997513403e-06, + "loss": 0.6901, + "step": 3489 + }, + { + "epoch": 0.52, + "grad_norm": 4.874493086521593, + "learning_rate": 1.9713441427651205e-06, + "loss": 0.681, + "step": 3490 + }, + { + "epoch": 0.52, + "grad_norm": 2.013089565084478, + "learning_rate": 1.97132117671326e-06, + "loss": 0.6803, + "step": 3491 + }, + { + "epoch": 0.52, + "grad_norm": 3.9811021932495185, + "learning_rate": 1.971298201595973e-06, + "loss": 0.6803, + "step": 3492 + }, + { + "epoch": 0.52, + "grad_norm": 1.4922514240319815, + "learning_rate": 1.9712752174134743e-06, + "loss": 0.6895, + "step": 3493 + }, + { + "epoch": 0.52, + "grad_norm": 5.853107151547969, + "learning_rate": 1.9712522241659774e-06, + "loss": 0.6803, + "step": 3494 + }, + { + "epoch": 0.52, + "grad_norm": 8.295170432881223, + "learning_rate": 1.971229221853698e-06, + "loss": 0.6803, + "step": 3495 + }, + { + "epoch": 0.52, + "grad_norm": 1.7223700720786268, + "learning_rate": 1.9712062104768505e-06, + "loss": 0.6855, + "step": 3496 + }, + { + "epoch": 0.52, + "grad_norm": 7.277492689767486, + "learning_rate": 1.971183190035649e-06, + "loss": 0.6882, + "step": 3497 + }, + { + "epoch": 0.52, + "grad_norm": 3.6410049247753724, + "learning_rate": 1.9711601605303093e-06, + "loss": 0.681, + "step": 3498 + }, + { + "epoch": 0.52, + "grad_norm": 8.797761189828696, + "learning_rate": 1.971137121961046e-06, + "loss": 0.6693, + "step": 3499 + }, + { + "epoch": 0.52, + "grad_norm": 8.883366757211563, + "learning_rate": 1.9711140743280737e-06, + "loss": 0.6842, + "step": 3500 + }, + { + "epoch": 0.52, + "grad_norm": 3.8834279009480444, + "learning_rate": 1.9710910176316083e-06, + "loss": 0.6979, + "step": 3501 + }, + { + "epoch": 0.52, + "grad_norm": 1.1539081807068086, + "learning_rate": 1.9710679518718645e-06, + "loss": 0.6855, + "step": 3502 + }, + { + "epoch": 0.52, + "grad_norm": 1.1147280753314195, + "learning_rate": 1.9710448770490575e-06, + "loss": 0.6921, + "step": 3503 + }, + { + "epoch": 0.52, + "grad_norm": 5.362755197060164, + "learning_rate": 1.971021793163403e-06, + "loss": 0.6921, + "step": 3504 + }, + { + "epoch": 0.52, + "grad_norm": 9.33803268305054, + "learning_rate": 1.9709987002151157e-06, + "loss": 0.6829, + "step": 3505 + }, + { + "epoch": 0.52, + "grad_norm": 0.7872107027454743, + "learning_rate": 1.9709755982044124e-06, + "loss": 0.6823, + "step": 3506 + }, + { + "epoch": 0.52, + "grad_norm": 4.63260940608839, + "learning_rate": 1.9709524871315073e-06, + "loss": 0.7122, + "step": 3507 + }, + { + "epoch": 0.52, + "grad_norm": 6.8551426872696535, + "learning_rate": 1.970929366996617e-06, + "loss": 0.681, + "step": 3508 + }, + { + "epoch": 0.52, + "grad_norm": 5.091326443572131, + "learning_rate": 1.970906237799957e-06, + "loss": 0.6745, + "step": 3509 + }, + { + "epoch": 0.52, + "grad_norm": 4.0139197541761105, + "learning_rate": 1.9708830995417435e-06, + "loss": 0.6875, + "step": 3510 + }, + { + "epoch": 0.52, + "grad_norm": 3.0493741642247048, + "learning_rate": 1.970859952222192e-06, + "loss": 0.6771, + "step": 3511 + }, + { + "epoch": 0.52, + "grad_norm": 0.9615595052686716, + "learning_rate": 1.9708367958415188e-06, + "loss": 0.6816, + "step": 3512 + }, + { + "epoch": 0.52, + "grad_norm": 2.505168203879857, + "learning_rate": 1.9708136303999392e-06, + "loss": 0.6829, + "step": 3513 + }, + { + "epoch": 0.52, + "grad_norm": 3.9920653945079048, + "learning_rate": 1.970790455897671e-06, + "loss": 0.6777, + "step": 3514 + }, + { + "epoch": 0.52, + "grad_norm": 3.698138205763462, + "learning_rate": 1.9707672723349297e-06, + "loss": 0.6816, + "step": 3515 + }, + { + "epoch": 0.52, + "grad_norm": 4.342773871708384, + "learning_rate": 1.970744079711931e-06, + "loss": 0.6829, + "step": 3516 + }, + { + "epoch": 0.52, + "grad_norm": 4.335851762071092, + "learning_rate": 1.970720878028892e-06, + "loss": 0.6842, + "step": 3517 + }, + { + "epoch": 0.52, + "grad_norm": 3.2425745466147653, + "learning_rate": 1.9706976672860296e-06, + "loss": 0.694, + "step": 3518 + }, + { + "epoch": 0.52, + "grad_norm": 4.665765807376907, + "learning_rate": 1.97067444748356e-06, + "loss": 0.6732, + "step": 3519 + }, + { + "epoch": 0.52, + "grad_norm": 2.234647784529134, + "learning_rate": 1.9706512186216996e-06, + "loss": 0.6947, + "step": 3520 + }, + { + "epoch": 0.53, + "grad_norm": 6.219799714259669, + "learning_rate": 1.9706279807006657e-06, + "loss": 0.6829, + "step": 3521 + }, + { + "epoch": 0.53, + "grad_norm": 0.6340866786377587, + "learning_rate": 1.970604733720675e-06, + "loss": 0.6966, + "step": 3522 + }, + { + "epoch": 0.53, + "grad_norm": 3.4183458307990087, + "learning_rate": 1.970581477681945e-06, + "loss": 0.6816, + "step": 3523 + }, + { + "epoch": 0.53, + "grad_norm": 2.758255444165924, + "learning_rate": 1.9705582125846914e-06, + "loss": 0.6725, + "step": 3524 + }, + { + "epoch": 0.53, + "grad_norm": 5.918031474809785, + "learning_rate": 1.9705349384291326e-06, + "loss": 0.6914, + "step": 3525 + }, + { + "epoch": 0.53, + "grad_norm": 3.783067578654024, + "learning_rate": 1.9705116552154857e-06, + "loss": 0.6966, + "step": 3526 + }, + { + "epoch": 0.53, + "grad_norm": 5.433904413299071, + "learning_rate": 1.970488362943967e-06, + "loss": 0.6882, + "step": 3527 + }, + { + "epoch": 0.53, + "grad_norm": 0.7680183571638314, + "learning_rate": 1.970465061614795e-06, + "loss": 0.6673, + "step": 3528 + }, + { + "epoch": 0.53, + "grad_norm": 2.896723069816814, + "learning_rate": 1.970441751228186e-06, + "loss": 0.6947, + "step": 3529 + }, + { + "epoch": 0.53, + "grad_norm": 6.427634833547123, + "learning_rate": 1.9704184317843596e-06, + "loss": 0.6797, + "step": 3530 + }, + { + "epoch": 0.53, + "grad_norm": 6.754002834436968, + "learning_rate": 1.9703951032835315e-06, + "loss": 0.7031, + "step": 3531 + }, + { + "epoch": 0.53, + "grad_norm": 5.8718522954910535, + "learning_rate": 1.97037176572592e-06, + "loss": 0.6732, + "step": 3532 + }, + { + "epoch": 0.53, + "grad_norm": 0.6399882499125903, + "learning_rate": 1.970348419111743e-06, + "loss": 0.7083, + "step": 3533 + }, + { + "epoch": 0.53, + "grad_norm": 0.8800335070831883, + "learning_rate": 1.9703250634412186e-06, + "loss": 0.6699, + "step": 3534 + }, + { + "epoch": 0.53, + "grad_norm": 4.5370472614907635, + "learning_rate": 1.9703016987145642e-06, + "loss": 0.6966, + "step": 3535 + }, + { + "epoch": 0.53, + "grad_norm": 1.635494803024828, + "learning_rate": 1.9702783249319986e-06, + "loss": 0.7005, + "step": 3536 + }, + { + "epoch": 0.53, + "grad_norm": 3.5219030681291437, + "learning_rate": 1.9702549420937397e-06, + "loss": 0.7018, + "step": 3537 + }, + { + "epoch": 0.53, + "grad_norm": 5.993109168566994, + "learning_rate": 1.9702315502000053e-06, + "loss": 0.6992, + "step": 3538 + }, + { + "epoch": 0.53, + "grad_norm": 5.451277227844012, + "learning_rate": 1.970208149251014e-06, + "loss": 0.6628, + "step": 3539 + }, + { + "epoch": 0.53, + "grad_norm": 1.6951726991547227, + "learning_rate": 1.9701847392469847e-06, + "loss": 0.6829, + "step": 3540 + }, + { + "epoch": 0.53, + "grad_norm": 0.7996316804926198, + "learning_rate": 1.970161320188135e-06, + "loss": 0.6908, + "step": 3541 + }, + { + "epoch": 0.53, + "grad_norm": 2.5732038826822525, + "learning_rate": 1.970137892074684e-06, + "loss": 0.6745, + "step": 3542 + }, + { + "epoch": 0.53, + "grad_norm": 2.9587112298861995, + "learning_rate": 1.9701144549068505e-06, + "loss": 0.6784, + "step": 3543 + }, + { + "epoch": 0.53, + "grad_norm": 3.5288555034870566, + "learning_rate": 1.970091008684853e-06, + "loss": 0.7038, + "step": 3544 + }, + { + "epoch": 0.53, + "grad_norm": 1.350544149744056, + "learning_rate": 1.9700675534089097e-06, + "loss": 0.6673, + "step": 3545 + }, + { + "epoch": 0.53, + "grad_norm": 4.9332381898261914, + "learning_rate": 1.9700440890792403e-06, + "loss": 0.6875, + "step": 3546 + }, + { + "epoch": 0.53, + "grad_norm": 6.187464901320088, + "learning_rate": 1.9700206156960643e-06, + "loss": 0.694, + "step": 3547 + }, + { + "epoch": 0.53, + "grad_norm": 4.656791502303474, + "learning_rate": 1.9699971332595994e-06, + "loss": 0.6699, + "step": 3548 + }, + { + "epoch": 0.53, + "grad_norm": 0.7756807291794633, + "learning_rate": 1.969973641770066e-06, + "loss": 0.6888, + "step": 3549 + }, + { + "epoch": 0.53, + "grad_norm": 0.496462882828012, + "learning_rate": 1.9699501412276825e-06, + "loss": 0.6927, + "step": 3550 + }, + { + "epoch": 0.53, + "grad_norm": 6.666027201886202, + "learning_rate": 1.9699266316326685e-06, + "loss": 0.6732, + "step": 3551 + }, + { + "epoch": 0.53, + "grad_norm": 3.918367989190711, + "learning_rate": 1.9699031129852436e-06, + "loss": 0.707, + "step": 3552 + }, + { + "epoch": 0.53, + "grad_norm": 0.5588843518984536, + "learning_rate": 1.969879585285627e-06, + "loss": 0.6829, + "step": 3553 + }, + { + "epoch": 0.53, + "grad_norm": 1.1195752522419609, + "learning_rate": 1.969856048534039e-06, + "loss": 0.6816, + "step": 3554 + }, + { + "epoch": 0.53, + "grad_norm": 1.621024908198184, + "learning_rate": 1.969832502730698e-06, + "loss": 0.6862, + "step": 3555 + }, + { + "epoch": 0.53, + "grad_norm": 2.9564199039169945, + "learning_rate": 1.969808947875825e-06, + "loss": 0.707, + "step": 3556 + }, + { + "epoch": 0.53, + "grad_norm": 7.151435337942184, + "learning_rate": 1.9697853839696395e-06, + "loss": 0.6849, + "step": 3557 + }, + { + "epoch": 0.53, + "grad_norm": 1.3763400516511493, + "learning_rate": 1.969761811012361e-06, + "loss": 0.679, + "step": 3558 + }, + { + "epoch": 0.53, + "grad_norm": 4.963512337798386, + "learning_rate": 1.9697382290042094e-06, + "loss": 0.6797, + "step": 3559 + }, + { + "epoch": 0.53, + "grad_norm": 3.1302384136339416, + "learning_rate": 1.9697146379454057e-06, + "loss": 0.6712, + "step": 3560 + }, + { + "epoch": 0.53, + "grad_norm": 3.3243947225862134, + "learning_rate": 1.969691037836169e-06, + "loss": 0.6875, + "step": 3561 + }, + { + "epoch": 0.53, + "grad_norm": 1.3051323047794186, + "learning_rate": 1.9696674286767204e-06, + "loss": 0.681, + "step": 3562 + }, + { + "epoch": 0.53, + "grad_norm": 0.9891936539987681, + "learning_rate": 1.9696438104672798e-06, + "loss": 0.694, + "step": 3563 + }, + { + "epoch": 0.53, + "grad_norm": 0.9505473676023859, + "learning_rate": 1.969620183208068e-06, + "loss": 0.6673, + "step": 3564 + }, + { + "epoch": 0.53, + "grad_norm": 0.810352792794082, + "learning_rate": 1.969596546899305e-06, + "loss": 0.6888, + "step": 3565 + }, + { + "epoch": 0.53, + "grad_norm": 7.672511325520642, + "learning_rate": 1.9695729015412117e-06, + "loss": 0.6966, + "step": 3566 + }, + { + "epoch": 0.53, + "grad_norm": 3.8768071903047288, + "learning_rate": 1.969549247134009e-06, + "loss": 0.6836, + "step": 3567 + }, + { + "epoch": 0.53, + "grad_norm": 2.5330383739598616, + "learning_rate": 1.9695255836779174e-06, + "loss": 0.6764, + "step": 3568 + }, + { + "epoch": 0.53, + "grad_norm": 1.1392415679453876, + "learning_rate": 1.9695019111731575e-06, + "loss": 0.7064, + "step": 3569 + }, + { + "epoch": 0.53, + "grad_norm": 2.4732741409610273, + "learning_rate": 1.9694782296199507e-06, + "loss": 0.7025, + "step": 3570 + }, + { + "epoch": 0.53, + "grad_norm": 3.463378462623523, + "learning_rate": 1.969454539018518e-06, + "loss": 0.6719, + "step": 3571 + }, + { + "epoch": 0.53, + "grad_norm": 1.4473621218775345, + "learning_rate": 1.96943083936908e-06, + "loss": 0.6738, + "step": 3572 + }, + { + "epoch": 0.53, + "grad_norm": 3.462615171688518, + "learning_rate": 1.9694071306718585e-06, + "loss": 0.679, + "step": 3573 + }, + { + "epoch": 0.53, + "grad_norm": 4.916905013888394, + "learning_rate": 1.9693834129270747e-06, + "loss": 0.6882, + "step": 3574 + }, + { + "epoch": 0.53, + "grad_norm": 5.0835019955868175, + "learning_rate": 1.9693596861349495e-06, + "loss": 0.6576, + "step": 3575 + }, + { + "epoch": 0.53, + "grad_norm": 7.82407869543537, + "learning_rate": 1.969335950295705e-06, + "loss": 0.694, + "step": 3576 + }, + { + "epoch": 0.53, + "grad_norm": 6.4955395580249995, + "learning_rate": 1.9693122054095617e-06, + "loss": 0.6842, + "step": 3577 + }, + { + "epoch": 0.53, + "grad_norm": 0.711701973488923, + "learning_rate": 1.9692884514767424e-06, + "loss": 0.6641, + "step": 3578 + }, + { + "epoch": 0.53, + "grad_norm": 1.0821108887887332, + "learning_rate": 1.9692646884974677e-06, + "loss": 0.679, + "step": 3579 + }, + { + "epoch": 0.53, + "grad_norm": 1.939240837516655, + "learning_rate": 1.9692409164719607e-06, + "loss": 0.6608, + "step": 3580 + }, + { + "epoch": 0.53, + "grad_norm": 1.3810394695217574, + "learning_rate": 1.9692171354004417e-06, + "loss": 0.6953, + "step": 3581 + }, + { + "epoch": 0.53, + "grad_norm": 1.8251247493628804, + "learning_rate": 1.9691933452831343e-06, + "loss": 0.6654, + "step": 3582 + }, + { + "epoch": 0.53, + "grad_norm": 1.1184837110688253, + "learning_rate": 1.969169546120259e-06, + "loss": 0.6901, + "step": 3583 + }, + { + "epoch": 0.53, + "grad_norm": 2.8933573181919225, + "learning_rate": 1.969145737912039e-06, + "loss": 0.6797, + "step": 3584 + }, + { + "epoch": 0.53, + "grad_norm": 4.5895607220243, + "learning_rate": 1.969121920658696e-06, + "loss": 0.6647, + "step": 3585 + }, + { + "epoch": 0.53, + "grad_norm": 7.73171948947845, + "learning_rate": 1.9690980943604523e-06, + "loss": 0.6855, + "step": 3586 + }, + { + "epoch": 0.53, + "grad_norm": 2.866092360914073, + "learning_rate": 1.9690742590175304e-06, + "loss": 0.6764, + "step": 3587 + }, + { + "epoch": 0.54, + "grad_norm": 5.597298374348386, + "learning_rate": 1.969050414630153e-06, + "loss": 0.707, + "step": 3588 + }, + { + "epoch": 0.54, + "grad_norm": 1.9303640017597252, + "learning_rate": 1.969026561198542e-06, + "loss": 0.7188, + "step": 3589 + }, + { + "epoch": 0.54, + "grad_norm": 1.170024686038682, + "learning_rate": 1.969002698722921e-06, + "loss": 0.6738, + "step": 3590 + }, + { + "epoch": 0.54, + "grad_norm": 2.2735795298093024, + "learning_rate": 1.9689788272035116e-06, + "loss": 0.6875, + "step": 3591 + }, + { + "epoch": 0.54, + "grad_norm": 2.961760363617031, + "learning_rate": 1.9689549466405373e-06, + "loss": 0.6868, + "step": 3592 + }, + { + "epoch": 0.54, + "grad_norm": 2.5807653095794687, + "learning_rate": 1.9689310570342208e-06, + "loss": 0.6966, + "step": 3593 + }, + { + "epoch": 0.54, + "grad_norm": 3.3708786255471144, + "learning_rate": 1.968907158384785e-06, + "loss": 0.6784, + "step": 3594 + }, + { + "epoch": 0.54, + "grad_norm": 4.167946159089024, + "learning_rate": 1.9688832506924527e-06, + "loss": 0.6842, + "step": 3595 + }, + { + "epoch": 0.54, + "grad_norm": 1.7934129563619194, + "learning_rate": 1.968859333957448e-06, + "loss": 0.6829, + "step": 3596 + }, + { + "epoch": 0.54, + "grad_norm": 1.839411804374205, + "learning_rate": 1.968835408179993e-06, + "loss": 0.6895, + "step": 3597 + }, + { + "epoch": 0.54, + "grad_norm": 5.347911374383133, + "learning_rate": 1.968811473360312e-06, + "loss": 0.6823, + "step": 3598 + }, + { + "epoch": 0.54, + "grad_norm": 5.079747059220617, + "learning_rate": 1.968787529498627e-06, + "loss": 0.6849, + "step": 3599 + }, + { + "epoch": 0.54, + "grad_norm": 3.6836367966392416, + "learning_rate": 1.9687635765951627e-06, + "loss": 0.6673, + "step": 3600 + }, + { + "epoch": 0.54, + "grad_norm": 0.7495840620639854, + "learning_rate": 1.9687396146501425e-06, + "loss": 0.6745, + "step": 3601 + }, + { + "epoch": 0.54, + "grad_norm": 1.7337724534988834, + "learning_rate": 1.9687156436637896e-06, + "loss": 0.6979, + "step": 3602 + }, + { + "epoch": 0.54, + "grad_norm": 0.43907575434639706, + "learning_rate": 1.968691663636328e-06, + "loss": 0.6719, + "step": 3603 + }, + { + "epoch": 0.54, + "grad_norm": 2.696253727765155, + "learning_rate": 1.9686676745679815e-06, + "loss": 0.6901, + "step": 3604 + }, + { + "epoch": 0.54, + "grad_norm": 0.7366895016029822, + "learning_rate": 1.9686436764589734e-06, + "loss": 0.6732, + "step": 3605 + }, + { + "epoch": 0.54, + "grad_norm": 10.034748150432366, + "learning_rate": 1.968619669309529e-06, + "loss": 0.6882, + "step": 3606 + }, + { + "epoch": 0.54, + "grad_norm": 0.664784857869408, + "learning_rate": 1.9685956531198707e-06, + "loss": 0.6875, + "step": 3607 + }, + { + "epoch": 0.54, + "grad_norm": 0.5213461740754184, + "learning_rate": 1.968571627890224e-06, + "loss": 0.6732, + "step": 3608 + }, + { + "epoch": 0.54, + "grad_norm": 3.4857547658388524, + "learning_rate": 1.968547593620813e-06, + "loss": 0.6882, + "step": 3609 + }, + { + "epoch": 0.54, + "grad_norm": 1.5288782611238645, + "learning_rate": 1.968523550311861e-06, + "loss": 0.696, + "step": 3610 + }, + { + "epoch": 0.54, + "grad_norm": 1.6778919992781658, + "learning_rate": 1.968499497963593e-06, + "loss": 0.6719, + "step": 3611 + }, + { + "epoch": 0.54, + "grad_norm": 1.144118482638643, + "learning_rate": 1.9684754365762345e-06, + "loss": 0.6732, + "step": 3612 + }, + { + "epoch": 0.54, + "grad_norm": 0.8914479605053919, + "learning_rate": 1.968451366150008e-06, + "loss": 0.6875, + "step": 3613 + }, + { + "epoch": 0.54, + "grad_norm": 3.3866457955456535, + "learning_rate": 1.9684272866851397e-06, + "loss": 0.6979, + "step": 3614 + }, + { + "epoch": 0.54, + "grad_norm": 1.6757865785873396, + "learning_rate": 1.968403198181854e-06, + "loss": 0.6738, + "step": 3615 + }, + { + "epoch": 0.54, + "grad_norm": 2.472663236032101, + "learning_rate": 1.9683791006403756e-06, + "loss": 0.6745, + "step": 3616 + }, + { + "epoch": 0.54, + "grad_norm": 0.8884580043697836, + "learning_rate": 1.9683549940609293e-06, + "loss": 0.6732, + "step": 3617 + }, + { + "epoch": 0.54, + "grad_norm": 1.184649030279349, + "learning_rate": 1.96833087844374e-06, + "loss": 0.6758, + "step": 3618 + }, + { + "epoch": 0.54, + "grad_norm": 3.3210561339896136, + "learning_rate": 1.9683067537890333e-06, + "loss": 0.6882, + "step": 3619 + }, + { + "epoch": 0.54, + "grad_norm": 1.8510506182891955, + "learning_rate": 1.9682826200970343e-06, + "loss": 0.6784, + "step": 3620 + }, + { + "epoch": 0.54, + "grad_norm": 3.138523162273659, + "learning_rate": 1.9682584773679675e-06, + "loss": 0.7012, + "step": 3621 + }, + { + "epoch": 0.54, + "grad_norm": 1.4625007672276469, + "learning_rate": 1.9682343256020587e-06, + "loss": 0.6868, + "step": 3622 + }, + { + "epoch": 0.54, + "grad_norm": 1.9637723972345444, + "learning_rate": 1.9682101647995335e-06, + "loss": 0.6966, + "step": 3623 + }, + { + "epoch": 0.54, + "grad_norm": 2.1567341033791125, + "learning_rate": 1.9681859949606174e-06, + "loss": 0.6621, + "step": 3624 + }, + { + "epoch": 0.54, + "grad_norm": 2.3594543712297003, + "learning_rate": 1.968161816085535e-06, + "loss": 0.7005, + "step": 3625 + }, + { + "epoch": 0.54, + "grad_norm": 3.3398909804120573, + "learning_rate": 1.9681376281745137e-06, + "loss": 0.6927, + "step": 3626 + }, + { + "epoch": 0.54, + "grad_norm": 0.7974214204358928, + "learning_rate": 1.9681134312277777e-06, + "loss": 0.6732, + "step": 3627 + }, + { + "epoch": 0.54, + "grad_norm": 3.353501886564669, + "learning_rate": 1.9680892252455536e-06, + "loss": 0.6849, + "step": 3628 + }, + { + "epoch": 0.54, + "grad_norm": 2.3803021995330034, + "learning_rate": 1.968065010228067e-06, + "loss": 0.6895, + "step": 3629 + }, + { + "epoch": 0.54, + "grad_norm": 6.8477894712186345, + "learning_rate": 1.9680407861755446e-06, + "loss": 0.6901, + "step": 3630 + }, + { + "epoch": 0.54, + "grad_norm": 1.4670733729152745, + "learning_rate": 1.9680165530882116e-06, + "loss": 0.6745, + "step": 3631 + }, + { + "epoch": 0.54, + "grad_norm": 4.2319592792209315, + "learning_rate": 1.9679923109662942e-06, + "loss": 0.6947, + "step": 3632 + }, + { + "epoch": 0.54, + "grad_norm": 0.790642825941291, + "learning_rate": 1.9679680598100196e-06, + "loss": 0.6927, + "step": 3633 + }, + { + "epoch": 0.54, + "grad_norm": 7.179558056193482, + "learning_rate": 1.967943799619613e-06, + "loss": 0.6875, + "step": 3634 + }, + { + "epoch": 0.54, + "grad_norm": 2.5740789574946907, + "learning_rate": 1.967919530395301e-06, + "loss": 0.6947, + "step": 3635 + }, + { + "epoch": 0.54, + "grad_norm": 0.5191899145524347, + "learning_rate": 1.967895252137311e-06, + "loss": 0.6862, + "step": 3636 + }, + { + "epoch": 0.54, + "grad_norm": 5.468106084507407, + "learning_rate": 1.967870964845869e-06, + "loss": 0.6758, + "step": 3637 + }, + { + "epoch": 0.54, + "grad_norm": 2.94370922670345, + "learning_rate": 1.9678466685212017e-06, + "loss": 0.6797, + "step": 3638 + }, + { + "epoch": 0.54, + "grad_norm": 3.0996475215432615, + "learning_rate": 1.9678223631635357e-06, + "loss": 0.6764, + "step": 3639 + }, + { + "epoch": 0.54, + "grad_norm": 8.298703800001254, + "learning_rate": 1.9677980487730977e-06, + "loss": 0.6901, + "step": 3640 + }, + { + "epoch": 0.54, + "grad_norm": 2.474378098522654, + "learning_rate": 1.967773725350115e-06, + "loss": 0.6921, + "step": 3641 + }, + { + "epoch": 0.54, + "grad_norm": 2.8032390544675003, + "learning_rate": 1.967749392894815e-06, + "loss": 0.6855, + "step": 3642 + }, + { + "epoch": 0.54, + "grad_norm": 6.386196891552804, + "learning_rate": 1.9677250514074243e-06, + "loss": 0.6908, + "step": 3643 + }, + { + "epoch": 0.54, + "grad_norm": 1.737834247611416, + "learning_rate": 1.9677007008881698e-06, + "loss": 0.6641, + "step": 3644 + }, + { + "epoch": 0.54, + "grad_norm": 1.2124108183276345, + "learning_rate": 1.967676341337279e-06, + "loss": 0.6921, + "step": 3645 + }, + { + "epoch": 0.54, + "grad_norm": 2.0414629178265122, + "learning_rate": 1.9676519727549796e-06, + "loss": 0.6758, + "step": 3646 + }, + { + "epoch": 0.54, + "grad_norm": 6.236631700128787, + "learning_rate": 1.967627595141499e-06, + "loss": 0.6979, + "step": 3647 + }, + { + "epoch": 0.54, + "grad_norm": 9.119853535283669, + "learning_rate": 1.967603208497064e-06, + "loss": 0.6953, + "step": 3648 + }, + { + "epoch": 0.54, + "grad_norm": 0.9116912866995548, + "learning_rate": 1.9675788128219026e-06, + "loss": 0.6764, + "step": 3649 + }, + { + "epoch": 0.54, + "grad_norm": 2.2768312782923426, + "learning_rate": 1.967554408116243e-06, + "loss": 0.6719, + "step": 3650 + }, + { + "epoch": 0.54, + "grad_norm": 0.5868529843443566, + "learning_rate": 1.9675299943803123e-06, + "loss": 0.6882, + "step": 3651 + }, + { + "epoch": 0.54, + "grad_norm": 7.176932832604509, + "learning_rate": 1.9675055716143384e-06, + "loss": 0.6849, + "step": 3652 + }, + { + "epoch": 0.54, + "grad_norm": 3.610695680340132, + "learning_rate": 1.96748113981855e-06, + "loss": 0.6777, + "step": 3653 + }, + { + "epoch": 0.54, + "grad_norm": 0.6709460427609155, + "learning_rate": 1.967456698993174e-06, + "loss": 0.6725, + "step": 3654 + }, + { + "epoch": 0.55, + "grad_norm": 5.206862379209847, + "learning_rate": 1.9674322491384395e-06, + "loss": 0.6686, + "step": 3655 + }, + { + "epoch": 0.55, + "grad_norm": 0.7974632559499063, + "learning_rate": 1.967407790254574e-06, + "loss": 0.6751, + "step": 3656 + }, + { + "epoch": 0.55, + "grad_norm": 9.722734748150511, + "learning_rate": 1.967383322341806e-06, + "loss": 0.6888, + "step": 3657 + }, + { + "epoch": 0.55, + "grad_norm": 5.93457476926023, + "learning_rate": 1.967358845400364e-06, + "loss": 0.6855, + "step": 3658 + }, + { + "epoch": 0.55, + "grad_norm": 1.0784467740433386, + "learning_rate": 1.9673343594304766e-06, + "loss": 0.7044, + "step": 3659 + }, + { + "epoch": 0.55, + "grad_norm": 2.3538162933011684, + "learning_rate": 1.9673098644323718e-06, + "loss": 0.7077, + "step": 3660 + }, + { + "epoch": 0.55, + "grad_norm": 2.045816313115465, + "learning_rate": 1.9672853604062785e-06, + "loss": 0.668, + "step": 3661 + }, + { + "epoch": 0.55, + "grad_norm": 4.755817336486679, + "learning_rate": 1.9672608473524256e-06, + "loss": 0.6797, + "step": 3662 + }, + { + "epoch": 0.55, + "grad_norm": 2.987639788974708, + "learning_rate": 1.967236325271041e-06, + "loss": 0.6719, + "step": 3663 + }, + { + "epoch": 0.55, + "grad_norm": 2.4914882600162653, + "learning_rate": 1.9672117941623552e-06, + "loss": 0.6868, + "step": 3664 + }, + { + "epoch": 0.55, + "grad_norm": 1.6932894896623811, + "learning_rate": 1.9671872540265957e-06, + "loss": 0.6771, + "step": 3665 + }, + { + "epoch": 0.55, + "grad_norm": 5.45853153974924, + "learning_rate": 1.967162704863992e-06, + "loss": 0.679, + "step": 3666 + }, + { + "epoch": 0.55, + "grad_norm": 3.2795099993686705, + "learning_rate": 1.9671381466747734e-06, + "loss": 0.6758, + "step": 3667 + }, + { + "epoch": 0.55, + "grad_norm": 1.208288087877923, + "learning_rate": 1.967113579459169e-06, + "loss": 0.6973, + "step": 3668 + }, + { + "epoch": 0.55, + "grad_norm": 1.3029182384520326, + "learning_rate": 1.9670890032174077e-06, + "loss": 0.6803, + "step": 3669 + }, + { + "epoch": 0.55, + "grad_norm": 4.743093302535752, + "learning_rate": 1.9670644179497194e-06, + "loss": 0.696, + "step": 3670 + }, + { + "epoch": 0.55, + "grad_norm": 5.164164298831938, + "learning_rate": 1.9670398236563333e-06, + "loss": 0.6921, + "step": 3671 + }, + { + "epoch": 0.55, + "grad_norm": 2.1586169021232084, + "learning_rate": 1.9670152203374792e-06, + "loss": 0.6973, + "step": 3672 + }, + { + "epoch": 0.55, + "grad_norm": 4.39785026097882, + "learning_rate": 1.966990607993386e-06, + "loss": 0.6849, + "step": 3673 + }, + { + "epoch": 0.55, + "grad_norm": 3.2243715982332164, + "learning_rate": 1.9669659866242845e-06, + "loss": 0.6895, + "step": 3674 + }, + { + "epoch": 0.55, + "grad_norm": 1.0234968358374699, + "learning_rate": 1.966941356230404e-06, + "loss": 0.6693, + "step": 3675 + }, + { + "epoch": 0.55, + "grad_norm": 3.8485748209311277, + "learning_rate": 1.966916716811974e-06, + "loss": 0.6634, + "step": 3676 + }, + { + "epoch": 0.55, + "grad_norm": 8.842783213630202, + "learning_rate": 1.966892068369225e-06, + "loss": 0.7168, + "step": 3677 + }, + { + "epoch": 0.55, + "grad_norm": 3.653873396743471, + "learning_rate": 1.9668674109023865e-06, + "loss": 0.7018, + "step": 3678 + }, + { + "epoch": 0.55, + "grad_norm": 4.271410374701863, + "learning_rate": 1.966842744411689e-06, + "loss": 0.6888, + "step": 3679 + }, + { + "epoch": 0.55, + "grad_norm": 1.1183514469547402, + "learning_rate": 1.9668180688973627e-06, + "loss": 0.6784, + "step": 3680 + }, + { + "epoch": 0.55, + "grad_norm": 1.1175565394092517, + "learning_rate": 1.966793384359638e-06, + "loss": 0.6842, + "step": 3681 + }, + { + "epoch": 0.55, + "grad_norm": 8.132639851412952, + "learning_rate": 1.9667686907987454e-06, + "loss": 0.6862, + "step": 3682 + }, + { + "epoch": 0.55, + "grad_norm": 1.3075110608724112, + "learning_rate": 1.9667439882149148e-06, + "loss": 0.6953, + "step": 3683 + }, + { + "epoch": 0.55, + "grad_norm": 0.9777636161047825, + "learning_rate": 1.966719276608377e-06, + "loss": 0.6602, + "step": 3684 + }, + { + "epoch": 0.55, + "grad_norm": 6.015347643454091, + "learning_rate": 1.9666945559793628e-06, + "loss": 0.6803, + "step": 3685 + }, + { + "epoch": 0.55, + "grad_norm": 0.5503637940131926, + "learning_rate": 1.9666698263281026e-06, + "loss": 0.6966, + "step": 3686 + }, + { + "epoch": 0.55, + "grad_norm": 2.2468316281595104, + "learning_rate": 1.9666450876548273e-06, + "loss": 0.6732, + "step": 3687 + }, + { + "epoch": 0.55, + "grad_norm": 0.4925165240348916, + "learning_rate": 1.9666203399597685e-06, + "loss": 0.6836, + "step": 3688 + }, + { + "epoch": 0.55, + "grad_norm": 0.8151180705138686, + "learning_rate": 1.966595583243156e-06, + "loss": 0.6888, + "step": 3689 + }, + { + "epoch": 0.55, + "grad_norm": 9.704402345831273, + "learning_rate": 1.966570817505222e-06, + "loss": 0.6992, + "step": 3690 + }, + { + "epoch": 0.55, + "grad_norm": 1.4741413411692523, + "learning_rate": 1.9665460427461967e-06, + "loss": 0.6803, + "step": 3691 + }, + { + "epoch": 0.55, + "grad_norm": 0.8118607788144223, + "learning_rate": 1.966521258966312e-06, + "loss": 0.6732, + "step": 3692 + }, + { + "epoch": 0.55, + "grad_norm": 2.5080881957703163, + "learning_rate": 1.9664964661657985e-06, + "loss": 0.6973, + "step": 3693 + }, + { + "epoch": 0.55, + "grad_norm": 1.5791105844964657, + "learning_rate": 1.9664716643448884e-06, + "loss": 0.6875, + "step": 3694 + }, + { + "epoch": 0.55, + "grad_norm": 4.345589788877535, + "learning_rate": 1.9664468535038126e-06, + "loss": 0.6921, + "step": 3695 + }, + { + "epoch": 0.55, + "grad_norm": 0.5242076335566082, + "learning_rate": 1.966422033642803e-06, + "loss": 0.6855, + "step": 3696 + }, + { + "epoch": 0.55, + "grad_norm": 2.859772626761817, + "learning_rate": 1.9663972047620907e-06, + "loss": 0.6986, + "step": 3697 + }, + { + "epoch": 0.55, + "grad_norm": 5.451286746267935, + "learning_rate": 1.9663723668619084e-06, + "loss": 0.6738, + "step": 3698 + }, + { + "epoch": 0.55, + "grad_norm": 0.582202455098912, + "learning_rate": 1.966347519942487e-06, + "loss": 0.6745, + "step": 3699 + }, + { + "epoch": 0.55, + "grad_norm": 1.6844207369904334, + "learning_rate": 1.9663226640040587e-06, + "loss": 0.6823, + "step": 3700 + }, + { + "epoch": 0.55, + "grad_norm": 3.5988983407193347, + "learning_rate": 1.9662977990468556e-06, + "loss": 0.6934, + "step": 3701 + }, + { + "epoch": 0.55, + "grad_norm": 1.5606673237990367, + "learning_rate": 1.9662729250711098e-06, + "loss": 0.6875, + "step": 3702 + }, + { + "epoch": 0.55, + "grad_norm": 0.7383977010906838, + "learning_rate": 1.9662480420770532e-06, + "loss": 0.6784, + "step": 3703 + }, + { + "epoch": 0.55, + "grad_norm": 1.5577519233570305, + "learning_rate": 1.966223150064918e-06, + "loss": 0.6745, + "step": 3704 + }, + { + "epoch": 0.55, + "grad_norm": 9.401823679669317, + "learning_rate": 1.966198249034937e-06, + "loss": 0.6738, + "step": 3705 + }, + { + "epoch": 0.55, + "grad_norm": 1.486324266732481, + "learning_rate": 1.9661733389873422e-06, + "loss": 0.6823, + "step": 3706 + }, + { + "epoch": 0.55, + "grad_norm": 8.026627755623926, + "learning_rate": 1.966148419922366e-06, + "loss": 0.6745, + "step": 3707 + }, + { + "epoch": 0.55, + "grad_norm": 4.780836996663193, + "learning_rate": 1.9661234918402413e-06, + "loss": 0.7005, + "step": 3708 + }, + { + "epoch": 0.55, + "grad_norm": 3.807484319477287, + "learning_rate": 1.9660985547412006e-06, + "loss": 0.6901, + "step": 3709 + }, + { + "epoch": 0.55, + "grad_norm": 3.3065071552048613, + "learning_rate": 1.9660736086254768e-06, + "loss": 0.6999, + "step": 3710 + }, + { + "epoch": 0.55, + "grad_norm": 4.831822834866347, + "learning_rate": 1.9660486534933023e-06, + "loss": 0.6855, + "step": 3711 + }, + { + "epoch": 0.55, + "grad_norm": 4.587260624263724, + "learning_rate": 1.9660236893449107e-06, + "loss": 0.6966, + "step": 3712 + }, + { + "epoch": 0.55, + "grad_norm": 5.74296962552928, + "learning_rate": 1.9659987161805344e-06, + "loss": 0.6901, + "step": 3713 + }, + { + "epoch": 0.55, + "grad_norm": 4.728369726001187, + "learning_rate": 1.9659737340004067e-06, + "loss": 0.6667, + "step": 3714 + }, + { + "epoch": 0.55, + "grad_norm": 0.8050416409455917, + "learning_rate": 1.96594874280476e-06, + "loss": 0.6725, + "step": 3715 + }, + { + "epoch": 0.55, + "grad_norm": 2.091136714092812, + "learning_rate": 1.965923742593829e-06, + "loss": 0.7194, + "step": 3716 + }, + { + "epoch": 0.55, + "grad_norm": 2.3675373669400974, + "learning_rate": 1.965898733367846e-06, + "loss": 0.6777, + "step": 3717 + }, + { + "epoch": 0.55, + "grad_norm": 5.559252503320548, + "learning_rate": 1.9658737151270454e-06, + "loss": 0.6973, + "step": 3718 + }, + { + "epoch": 0.55, + "grad_norm": 7.977029440042897, + "learning_rate": 1.965848687871659e-06, + "loss": 0.7096, + "step": 3719 + }, + { + "epoch": 0.55, + "grad_norm": 4.1410566658459755, + "learning_rate": 1.965823651601922e-06, + "loss": 0.6829, + "step": 3720 + }, + { + "epoch": 0.55, + "grad_norm": 0.43638493301439035, + "learning_rate": 1.9657986063180673e-06, + "loss": 0.6895, + "step": 3721 + }, + { + "epoch": 0.56, + "grad_norm": 2.8091999605402336, + "learning_rate": 1.965773552020329e-06, + "loss": 0.6934, + "step": 3722 + }, + { + "epoch": 0.56, + "grad_norm": 3.4207734051158596, + "learning_rate": 1.96574848870894e-06, + "loss": 0.6953, + "step": 3723 + }, + { + "epoch": 0.56, + "grad_norm": 2.4149243265913563, + "learning_rate": 1.9657234163841357e-06, + "loss": 0.6914, + "step": 3724 + }, + { + "epoch": 0.56, + "grad_norm": 3.303171284325851, + "learning_rate": 1.965698335046149e-06, + "loss": 0.6764, + "step": 3725 + }, + { + "epoch": 0.56, + "grad_norm": 5.531516925916454, + "learning_rate": 1.9656732446952146e-06, + "loss": 0.6764, + "step": 3726 + }, + { + "epoch": 0.56, + "grad_norm": 3.7864803366761004, + "learning_rate": 1.965648145331566e-06, + "loss": 0.6862, + "step": 3727 + }, + { + "epoch": 0.56, + "grad_norm": 1.894009984196398, + "learning_rate": 1.965623036955438e-06, + "loss": 0.6868, + "step": 3728 + }, + { + "epoch": 0.56, + "grad_norm": 6.049867504624438, + "learning_rate": 1.965597919567065e-06, + "loss": 0.6908, + "step": 3729 + }, + { + "epoch": 0.56, + "grad_norm": 3.565187305079142, + "learning_rate": 1.965572793166681e-06, + "loss": 0.6888, + "step": 3730 + }, + { + "epoch": 0.56, + "grad_norm": 9.52069314700066, + "learning_rate": 1.965547657754521e-06, + "loss": 0.7051, + "step": 3731 + }, + { + "epoch": 0.56, + "grad_norm": 3.360214296857064, + "learning_rate": 1.965522513330819e-06, + "loss": 0.6849, + "step": 3732 + }, + { + "epoch": 0.56, + "grad_norm": 6.500339432686903, + "learning_rate": 1.9654973598958105e-06, + "loss": 0.694, + "step": 3733 + }, + { + "epoch": 0.56, + "grad_norm": 7.894216184673713, + "learning_rate": 1.965472197449729e-06, + "loss": 0.6803, + "step": 3734 + }, + { + "epoch": 0.56, + "grad_norm": 2.218971054787895, + "learning_rate": 1.9654470259928108e-06, + "loss": 0.6855, + "step": 3735 + }, + { + "epoch": 0.56, + "grad_norm": 4.8019126164047625, + "learning_rate": 1.96542184552529e-06, + "loss": 0.6764, + "step": 3736 + }, + { + "epoch": 0.56, + "grad_norm": 1.6287788868470976, + "learning_rate": 1.9653966560474012e-06, + "loss": 0.696, + "step": 3737 + }, + { + "epoch": 0.56, + "grad_norm": 4.812260055488206, + "learning_rate": 1.9653714575593805e-06, + "loss": 0.696, + "step": 3738 + }, + { + "epoch": 0.56, + "grad_norm": 0.9485862299119361, + "learning_rate": 1.9653462500614627e-06, + "loss": 0.6875, + "step": 3739 + }, + { + "epoch": 0.56, + "grad_norm": 3.9881288306921268, + "learning_rate": 1.965321033553883e-06, + "loss": 0.7018, + "step": 3740 + }, + { + "epoch": 0.56, + "grad_norm": 11.4488239336601, + "learning_rate": 1.965295808036876e-06, + "loss": 0.6654, + "step": 3741 + }, + { + "epoch": 0.56, + "grad_norm": 8.467402850939774, + "learning_rate": 1.9652705735106786e-06, + "loss": 0.6966, + "step": 3742 + }, + { + "epoch": 0.56, + "grad_norm": 4.09520792189266, + "learning_rate": 1.9652453299755256e-06, + "loss": 0.6862, + "step": 3743 + }, + { + "epoch": 0.56, + "grad_norm": 0.768131398368025, + "learning_rate": 1.965220077431652e-06, + "loss": 0.6732, + "step": 3744 + }, + { + "epoch": 0.56, + "grad_norm": 8.007908546397788, + "learning_rate": 1.9651948158792945e-06, + "loss": 0.7077, + "step": 3745 + }, + { + "epoch": 0.56, + "grad_norm": 7.34080261520013, + "learning_rate": 1.9651695453186883e-06, + "loss": 0.6908, + "step": 3746 + }, + { + "epoch": 0.56, + "grad_norm": 0.5106959085884515, + "learning_rate": 1.9651442657500693e-06, + "loss": 0.6738, + "step": 3747 + }, + { + "epoch": 0.56, + "grad_norm": 1.6145137563879117, + "learning_rate": 1.9651189771736737e-06, + "loss": 0.7005, + "step": 3748 + }, + { + "epoch": 0.56, + "grad_norm": 4.342788949678008, + "learning_rate": 1.9650936795897375e-06, + "loss": 0.7109, + "step": 3749 + }, + { + "epoch": 0.56, + "grad_norm": 0.7402105330432001, + "learning_rate": 1.965068372998496e-06, + "loss": 0.6784, + "step": 3750 + }, + { + "epoch": 0.56, + "grad_norm": 2.9639859169529568, + "learning_rate": 1.9650430574001867e-06, + "loss": 0.6797, + "step": 3751 + }, + { + "epoch": 0.56, + "grad_norm": 0.6637639697435727, + "learning_rate": 1.9650177327950452e-06, + "loss": 0.679, + "step": 3752 + }, + { + "epoch": 0.56, + "grad_norm": 1.1894620700007232, + "learning_rate": 1.964992399183307e-06, + "loss": 0.6921, + "step": 3753 + }, + { + "epoch": 0.56, + "grad_norm": 2.847707077095592, + "learning_rate": 1.9649670565652104e-06, + "loss": 0.668, + "step": 3754 + }, + { + "epoch": 0.56, + "grad_norm": 7.587382270285597, + "learning_rate": 1.9649417049409906e-06, + "loss": 0.6973, + "step": 3755 + }, + { + "epoch": 0.56, + "grad_norm": 0.9383416732049389, + "learning_rate": 1.9649163443108844e-06, + "loss": 0.6816, + "step": 3756 + }, + { + "epoch": 0.56, + "grad_norm": 6.292110936324973, + "learning_rate": 1.9648909746751286e-06, + "loss": 0.6751, + "step": 3757 + }, + { + "epoch": 0.56, + "grad_norm": 0.6547609057392265, + "learning_rate": 1.96486559603396e-06, + "loss": 0.6875, + "step": 3758 + }, + { + "epoch": 0.56, + "grad_norm": 0.874749821957016, + "learning_rate": 1.9648402083876154e-06, + "loss": 0.6829, + "step": 3759 + }, + { + "epoch": 0.56, + "grad_norm": 4.1454685957970225, + "learning_rate": 1.964814811736332e-06, + "loss": 0.6764, + "step": 3760 + }, + { + "epoch": 0.56, + "grad_norm": 1.9813448207539253, + "learning_rate": 1.9647894060803463e-06, + "loss": 0.6745, + "step": 3761 + }, + { + "epoch": 0.56, + "grad_norm": 0.6220997319214924, + "learning_rate": 1.9647639914198964e-06, + "loss": 0.6947, + "step": 3762 + }, + { + "epoch": 0.56, + "grad_norm": 2.687799170564092, + "learning_rate": 1.9647385677552182e-06, + "loss": 0.6712, + "step": 3763 + }, + { + "epoch": 0.56, + "grad_norm": 0.766074453562593, + "learning_rate": 1.96471313508655e-06, + "loss": 0.668, + "step": 3764 + }, + { + "epoch": 0.56, + "grad_norm": 1.0895104110279157, + "learning_rate": 1.964687693414129e-06, + "loss": 0.6927, + "step": 3765 + }, + { + "epoch": 0.56, + "grad_norm": 2.8148210378090583, + "learning_rate": 1.964662242738192e-06, + "loss": 0.6895, + "step": 3766 + }, + { + "epoch": 0.56, + "grad_norm": 1.4260581386490567, + "learning_rate": 1.964636783058977e-06, + "loss": 0.6836, + "step": 3767 + }, + { + "epoch": 0.56, + "grad_norm": 3.3243533445575006, + "learning_rate": 1.9646113143767217e-06, + "loss": 0.6803, + "step": 3768 + }, + { + "epoch": 0.56, + "grad_norm": 3.361578499152654, + "learning_rate": 1.964585836691664e-06, + "loss": 0.6816, + "step": 3769 + }, + { + "epoch": 0.56, + "grad_norm": 1.0160336827603258, + "learning_rate": 1.964560350004041e-06, + "loss": 0.6686, + "step": 3770 + }, + { + "epoch": 0.56, + "grad_norm": 4.002235608822072, + "learning_rate": 1.964534854314091e-06, + "loss": 0.7051, + "step": 3771 + }, + { + "epoch": 0.56, + "grad_norm": 2.919036141585281, + "learning_rate": 1.9645093496220516e-06, + "loss": 0.6895, + "step": 3772 + }, + { + "epoch": 0.56, + "grad_norm": 3.7562778488848814, + "learning_rate": 1.9644838359281617e-06, + "loss": 0.6719, + "step": 3773 + }, + { + "epoch": 0.56, + "grad_norm": 5.515015585848878, + "learning_rate": 1.9644583132326586e-06, + "loss": 0.6777, + "step": 3774 + }, + { + "epoch": 0.56, + "grad_norm": 5.439102268246573, + "learning_rate": 1.964432781535781e-06, + "loss": 0.6934, + "step": 3775 + }, + { + "epoch": 0.56, + "grad_norm": 4.945146635713698, + "learning_rate": 1.9644072408377666e-06, + "loss": 0.681, + "step": 3776 + }, + { + "epoch": 0.56, + "grad_norm": 0.6824169195298266, + "learning_rate": 1.9643816911388545e-06, + "loss": 0.6888, + "step": 3777 + }, + { + "epoch": 0.56, + "grad_norm": 1.0421027320810763, + "learning_rate": 1.9643561324392826e-06, + "loss": 0.6966, + "step": 3778 + }, + { + "epoch": 0.56, + "grad_norm": 4.358988525563421, + "learning_rate": 1.9643305647392894e-06, + "loss": 0.6673, + "step": 3779 + }, + { + "epoch": 0.56, + "grad_norm": 1.4682808180660423, + "learning_rate": 1.964304988039114e-06, + "loss": 0.6875, + "step": 3780 + }, + { + "epoch": 0.56, + "grad_norm": 1.5684492397625667, + "learning_rate": 1.9642794023389953e-06, + "loss": 0.6875, + "step": 3781 + }, + { + "epoch": 0.56, + "grad_norm": 1.1814185831359894, + "learning_rate": 1.964253807639171e-06, + "loss": 0.681, + "step": 3782 + }, + { + "epoch": 0.56, + "grad_norm": 1.6768923125984199, + "learning_rate": 1.964228203939881e-06, + "loss": 0.6764, + "step": 3783 + }, + { + "epoch": 0.56, + "grad_norm": 8.610517694559695, + "learning_rate": 1.9642025912413637e-06, + "loss": 0.6901, + "step": 3784 + }, + { + "epoch": 0.56, + "grad_norm": 8.778411729644477, + "learning_rate": 1.9641769695438582e-06, + "loss": 0.7135, + "step": 3785 + }, + { + "epoch": 0.56, + "grad_norm": 0.6968882853839965, + "learning_rate": 1.9641513388476043e-06, + "loss": 0.6992, + "step": 3786 + }, + { + "epoch": 0.56, + "grad_norm": 2.6507938719020636, + "learning_rate": 1.9641256991528404e-06, + "loss": 0.666, + "step": 3787 + }, + { + "epoch": 0.56, + "grad_norm": 1.8006640764358124, + "learning_rate": 1.964100050459806e-06, + "loss": 0.679, + "step": 3788 + }, + { + "epoch": 0.57, + "grad_norm": 3.3698174584285154, + "learning_rate": 1.9640743927687407e-06, + "loss": 0.6966, + "step": 3789 + }, + { + "epoch": 0.57, + "grad_norm": 5.408037566255943, + "learning_rate": 1.9640487260798838e-06, + "loss": 0.6777, + "step": 3790 + }, + { + "epoch": 0.57, + "grad_norm": 5.228237462574511, + "learning_rate": 1.964023050393475e-06, + "loss": 0.6725, + "step": 3791 + }, + { + "epoch": 0.57, + "grad_norm": 1.967235259995864, + "learning_rate": 1.9639973657097533e-06, + "loss": 0.6882, + "step": 3792 + }, + { + "epoch": 0.57, + "grad_norm": 8.634036786048584, + "learning_rate": 1.9639716720289595e-06, + "loss": 0.6953, + "step": 3793 + }, + { + "epoch": 0.57, + "grad_norm": 3.8944883823131704, + "learning_rate": 1.963945969351333e-06, + "loss": 0.6934, + "step": 3794 + }, + { + "epoch": 0.57, + "grad_norm": 5.259666153234905, + "learning_rate": 1.9639202576771127e-06, + "loss": 0.6764, + "step": 3795 + }, + { + "epoch": 0.57, + "grad_norm": 1.529404331940159, + "learning_rate": 1.96389453700654e-06, + "loss": 0.6914, + "step": 3796 + }, + { + "epoch": 0.57, + "grad_norm": 1.8649649564085848, + "learning_rate": 1.963868807339854e-06, + "loss": 0.6803, + "step": 3797 + }, + { + "epoch": 0.57, + "grad_norm": 0.445947854360228, + "learning_rate": 1.9638430686772957e-06, + "loss": 0.6855, + "step": 3798 + }, + { + "epoch": 0.57, + "grad_norm": 2.3558316152081895, + "learning_rate": 1.963817321019104e-06, + "loss": 0.6777, + "step": 3799 + }, + { + "epoch": 0.57, + "grad_norm": 1.8825141108701058, + "learning_rate": 1.963791564365521e-06, + "loss": 0.6797, + "step": 3800 + }, + { + "epoch": 0.57, + "grad_norm": 3.0583886434641583, + "learning_rate": 1.9637657987167855e-06, + "loss": 0.6934, + "step": 3801 + }, + { + "epoch": 0.57, + "grad_norm": 3.1042933697449695, + "learning_rate": 1.9637400240731386e-06, + "loss": 0.681, + "step": 3802 + }, + { + "epoch": 0.57, + "grad_norm": 1.5831845611433404, + "learning_rate": 1.963714240434821e-06, + "loss": 0.6829, + "step": 3803 + }, + { + "epoch": 0.57, + "grad_norm": 2.674935211522944, + "learning_rate": 1.963688447802073e-06, + "loss": 0.6868, + "step": 3804 + }, + { + "epoch": 0.57, + "grad_norm": 2.74603858750652, + "learning_rate": 1.9636626461751352e-06, + "loss": 0.6895, + "step": 3805 + }, + { + "epoch": 0.57, + "grad_norm": 1.1469455373624822, + "learning_rate": 1.9636368355542493e-06, + "loss": 0.694, + "step": 3806 + }, + { + "epoch": 0.57, + "grad_norm": 1.887776179581965, + "learning_rate": 1.963611015939655e-06, + "loss": 0.6641, + "step": 3807 + }, + { + "epoch": 0.57, + "grad_norm": 1.8925983118316652, + "learning_rate": 1.9635851873315946e-06, + "loss": 0.6693, + "step": 3808 + }, + { + "epoch": 0.57, + "grad_norm": 1.5382339129168408, + "learning_rate": 1.963559349730308e-06, + "loss": 0.6875, + "step": 3809 + }, + { + "epoch": 0.57, + "grad_norm": 4.403266530005583, + "learning_rate": 1.963533503136037e-06, + "loss": 0.6771, + "step": 3810 + }, + { + "epoch": 0.57, + "grad_norm": 2.6534269853631822, + "learning_rate": 1.963507647549022e-06, + "loss": 0.6882, + "step": 3811 + }, + { + "epoch": 0.57, + "grad_norm": 0.9898416118320187, + "learning_rate": 1.9634817829695056e-06, + "loss": 0.6771, + "step": 3812 + }, + { + "epoch": 0.57, + "grad_norm": 5.242468137685684, + "learning_rate": 1.9634559093977285e-06, + "loss": 0.6816, + "step": 3813 + }, + { + "epoch": 0.57, + "grad_norm": 7.060421819538773, + "learning_rate": 1.9634300268339315e-06, + "loss": 0.6979, + "step": 3814 + }, + { + "epoch": 0.57, + "grad_norm": 3.2217601153456448, + "learning_rate": 1.9634041352783575e-06, + "loss": 0.6823, + "step": 3815 + }, + { + "epoch": 0.57, + "grad_norm": 0.668473946186744, + "learning_rate": 1.9633782347312475e-06, + "loss": 0.6699, + "step": 3816 + }, + { + "epoch": 0.57, + "grad_norm": 2.475856252313151, + "learning_rate": 1.9633523251928426e-06, + "loss": 0.6621, + "step": 3817 + }, + { + "epoch": 0.57, + "grad_norm": 2.8736827739386337, + "learning_rate": 1.963326406663386e-06, + "loss": 0.6647, + "step": 3818 + }, + { + "epoch": 0.57, + "grad_norm": 9.924302773097319, + "learning_rate": 1.9633004791431186e-06, + "loss": 0.7122, + "step": 3819 + }, + { + "epoch": 0.57, + "grad_norm": 4.312038455022098, + "learning_rate": 1.9632745426322827e-06, + "loss": 0.6803, + "step": 3820 + }, + { + "epoch": 0.57, + "grad_norm": 2.119407166037436, + "learning_rate": 1.96324859713112e-06, + "loss": 0.6921, + "step": 3821 + }, + { + "epoch": 0.57, + "grad_norm": 9.310626114666542, + "learning_rate": 1.963222642639873e-06, + "loss": 0.7454, + "step": 3822 + }, + { + "epoch": 0.57, + "grad_norm": 1.2868273519900852, + "learning_rate": 1.963196679158784e-06, + "loss": 0.6536, + "step": 3823 + }, + { + "epoch": 0.57, + "grad_norm": 1.7198580871750435, + "learning_rate": 1.963170706688095e-06, + "loss": 0.6771, + "step": 3824 + }, + { + "epoch": 0.57, + "grad_norm": 4.32653098888482, + "learning_rate": 1.9631447252280495e-06, + "loss": 0.6771, + "step": 3825 + }, + { + "epoch": 0.57, + "grad_norm": 7.934977087537867, + "learning_rate": 1.9631187347788886e-06, + "loss": 0.6895, + "step": 3826 + }, + { + "epoch": 0.57, + "grad_norm": 0.9031189483375662, + "learning_rate": 1.9630927353408553e-06, + "loss": 0.6908, + "step": 3827 + }, + { + "epoch": 0.57, + "grad_norm": 2.718749364216564, + "learning_rate": 1.963066726914192e-06, + "loss": 0.6758, + "step": 3828 + }, + { + "epoch": 0.57, + "grad_norm": 3.2509816412787074, + "learning_rate": 1.963040709499142e-06, + "loss": 0.6758, + "step": 3829 + }, + { + "epoch": 0.57, + "grad_norm": 2.3003813917126092, + "learning_rate": 1.963014683095948e-06, + "loss": 0.681, + "step": 3830 + }, + { + "epoch": 0.57, + "grad_norm": 0.7828907468602248, + "learning_rate": 1.962988647704853e-06, + "loss": 0.6758, + "step": 3831 + }, + { + "epoch": 0.57, + "grad_norm": 4.687745550747511, + "learning_rate": 1.9629626033260993e-06, + "loss": 0.6921, + "step": 3832 + }, + { + "epoch": 0.57, + "grad_norm": 7.771063519583802, + "learning_rate": 1.962936549959931e-06, + "loss": 0.6921, + "step": 3833 + }, + { + "epoch": 0.57, + "grad_norm": 0.9591017085094589, + "learning_rate": 1.9629104876065903e-06, + "loss": 0.6706, + "step": 3834 + }, + { + "epoch": 0.57, + "grad_norm": 2.169516335284841, + "learning_rate": 1.962884416266321e-06, + "loss": 0.6999, + "step": 3835 + }, + { + "epoch": 0.57, + "grad_norm": 3.420780077259901, + "learning_rate": 1.9628583359393664e-06, + "loss": 0.668, + "step": 3836 + }, + { + "epoch": 0.57, + "grad_norm": 0.8307227312236624, + "learning_rate": 1.9628322466259696e-06, + "loss": 0.6816, + "step": 3837 + }, + { + "epoch": 0.57, + "grad_norm": 4.602254495244231, + "learning_rate": 1.9628061483263744e-06, + "loss": 0.6647, + "step": 3838 + }, + { + "epoch": 0.57, + "grad_norm": 5.625919157186785, + "learning_rate": 1.962780041040824e-06, + "loss": 0.6719, + "step": 3839 + }, + { + "epoch": 0.57, + "grad_norm": 0.7242649040625995, + "learning_rate": 1.9627539247695627e-06, + "loss": 0.6862, + "step": 3840 + }, + { + "epoch": 0.57, + "grad_norm": 4.828321940687699, + "learning_rate": 1.9627277995128336e-06, + "loss": 0.6732, + "step": 3841 + }, + { + "epoch": 0.57, + "grad_norm": 3.606434434434044, + "learning_rate": 1.962701665270881e-06, + "loss": 0.6908, + "step": 3842 + }, + { + "epoch": 0.57, + "grad_norm": 1.2954440937993605, + "learning_rate": 1.9626755220439485e-06, + "loss": 0.6927, + "step": 3843 + }, + { + "epoch": 0.57, + "grad_norm": 0.8828609766103519, + "learning_rate": 1.9626493698322805e-06, + "loss": 0.6953, + "step": 3844 + }, + { + "epoch": 0.57, + "grad_norm": 1.501729568661364, + "learning_rate": 1.9626232086361204e-06, + "loss": 0.6803, + "step": 3845 + }, + { + "epoch": 0.57, + "grad_norm": 6.422370457228134, + "learning_rate": 1.962597038455713e-06, + "loss": 0.679, + "step": 3846 + }, + { + "epoch": 0.57, + "grad_norm": 3.8012538895946655, + "learning_rate": 1.9625708592913026e-06, + "loss": 0.6738, + "step": 3847 + }, + { + "epoch": 0.57, + "grad_norm": 1.9125844000420422, + "learning_rate": 1.9625446711431327e-06, + "loss": 0.7025, + "step": 3848 + }, + { + "epoch": 0.57, + "grad_norm": 2.3605454755531463, + "learning_rate": 1.9625184740114482e-06, + "loss": 0.696, + "step": 3849 + }, + { + "epoch": 0.57, + "grad_norm": 4.478191205448045, + "learning_rate": 1.962492267896494e-06, + "loss": 0.6875, + "step": 3850 + }, + { + "epoch": 0.57, + "grad_norm": 0.6699388530126948, + "learning_rate": 1.9624660527985144e-06, + "loss": 0.6777, + "step": 3851 + }, + { + "epoch": 0.57, + "grad_norm": 0.5313534483540796, + "learning_rate": 1.962439828717754e-06, + "loss": 0.6797, + "step": 3852 + }, + { + "epoch": 0.57, + "grad_norm": 0.858563632842911, + "learning_rate": 1.9624135956544574e-06, + "loss": 0.6855, + "step": 3853 + }, + { + "epoch": 0.57, + "grad_norm": 3.8496338630241174, + "learning_rate": 1.96238735360887e-06, + "loss": 0.6719, + "step": 3854 + }, + { + "epoch": 0.57, + "grad_norm": 2.491726328350103, + "learning_rate": 1.9623611025812355e-06, + "loss": 0.6849, + "step": 3855 + }, + { + "epoch": 0.58, + "grad_norm": 6.101991390128922, + "learning_rate": 1.9623348425718004e-06, + "loss": 0.6875, + "step": 3856 + }, + { + "epoch": 0.58, + "grad_norm": 7.5565806986850985, + "learning_rate": 1.962308573580809e-06, + "loss": 0.6719, + "step": 3857 + }, + { + "epoch": 0.58, + "grad_norm": 2.3322255037096307, + "learning_rate": 1.9622822956085064e-06, + "loss": 0.6979, + "step": 3858 + }, + { + "epoch": 0.58, + "grad_norm": 0.9808085616265378, + "learning_rate": 1.9622560086551384e-06, + "loss": 0.6576, + "step": 3859 + }, + { + "epoch": 0.58, + "grad_norm": 3.4297151042146607, + "learning_rate": 1.9622297127209494e-06, + "loss": 0.6823, + "step": 3860 + }, + { + "epoch": 0.58, + "grad_norm": 1.40022625642795, + "learning_rate": 1.962203407806186e-06, + "loss": 0.6784, + "step": 3861 + }, + { + "epoch": 0.58, + "grad_norm": 2.4582127421950792, + "learning_rate": 1.9621770939110926e-06, + "loss": 0.6953, + "step": 3862 + }, + { + "epoch": 0.58, + "grad_norm": 3.4860815685403375, + "learning_rate": 1.9621507710359153e-06, + "loss": 0.6992, + "step": 3863 + }, + { + "epoch": 0.58, + "grad_norm": 1.0733231293517567, + "learning_rate": 1.9621244391809e-06, + "loss": 0.6901, + "step": 3864 + }, + { + "epoch": 0.58, + "grad_norm": 1.0293980443204485, + "learning_rate": 1.9620980983462924e-06, + "loss": 0.6706, + "step": 3865 + }, + { + "epoch": 0.58, + "grad_norm": 3.8379042224033117, + "learning_rate": 1.962071748532338e-06, + "loss": 0.6836, + "step": 3866 + }, + { + "epoch": 0.58, + "grad_norm": 1.5384935170930933, + "learning_rate": 1.962045389739283e-06, + "loss": 0.6745, + "step": 3867 + }, + { + "epoch": 0.58, + "grad_norm": 2.9751888672796776, + "learning_rate": 1.962019021967373e-06, + "loss": 0.6966, + "step": 3868 + }, + { + "epoch": 0.58, + "grad_norm": 6.7809107706631995, + "learning_rate": 1.9619926452168543e-06, + "loss": 0.6758, + "step": 3869 + }, + { + "epoch": 0.58, + "grad_norm": 2.1502845582152235, + "learning_rate": 1.9619662594879737e-06, + "loss": 0.6719, + "step": 3870 + }, + { + "epoch": 0.58, + "grad_norm": 3.7175504573936635, + "learning_rate": 1.9619398647809762e-06, + "loss": 0.6803, + "step": 3871 + }, + { + "epoch": 0.58, + "grad_norm": 0.8297620886708922, + "learning_rate": 1.9619134610961097e-06, + "loss": 0.6719, + "step": 3872 + }, + { + "epoch": 0.58, + "grad_norm": 0.6243641411692787, + "learning_rate": 1.9618870484336193e-06, + "loss": 0.6764, + "step": 3873 + }, + { + "epoch": 0.58, + "grad_norm": 8.117417899724257, + "learning_rate": 1.961860626793752e-06, + "loss": 0.6725, + "step": 3874 + }, + { + "epoch": 0.58, + "grad_norm": 1.229875635156921, + "learning_rate": 1.9618341961767546e-06, + "loss": 0.681, + "step": 3875 + }, + { + "epoch": 0.58, + "grad_norm": 3.5917029141551127, + "learning_rate": 1.9618077565828736e-06, + "loss": 0.7005, + "step": 3876 + }, + { + "epoch": 0.58, + "grad_norm": 10.518349856283, + "learning_rate": 1.9617813080123556e-06, + "loss": 0.7077, + "step": 3877 + }, + { + "epoch": 0.58, + "grad_norm": 0.6023187303669425, + "learning_rate": 1.9617548504654475e-06, + "loss": 0.6816, + "step": 3878 + }, + { + "epoch": 0.58, + "grad_norm": 3.8100896292642643, + "learning_rate": 1.9617283839423965e-06, + "loss": 0.6647, + "step": 3879 + }, + { + "epoch": 0.58, + "grad_norm": 5.331182411383859, + "learning_rate": 1.961701908443449e-06, + "loss": 0.6712, + "step": 3880 + }, + { + "epoch": 0.58, + "grad_norm": 6.634358883948567, + "learning_rate": 1.9616754239688533e-06, + "loss": 0.7077, + "step": 3881 + }, + { + "epoch": 0.58, + "grad_norm": 0.8345240397324116, + "learning_rate": 1.9616489305188554e-06, + "loss": 0.6927, + "step": 3882 + }, + { + "epoch": 0.58, + "grad_norm": 1.052974541459646, + "learning_rate": 1.9616224280937035e-06, + "loss": 0.6927, + "step": 3883 + }, + { + "epoch": 0.58, + "grad_norm": 3.5922564288409395, + "learning_rate": 1.9615959166936437e-06, + "loss": 0.7077, + "step": 3884 + }, + { + "epoch": 0.58, + "grad_norm": 2.5842648740754557, + "learning_rate": 1.9615693963189248e-06, + "loss": 0.6999, + "step": 3885 + }, + { + "epoch": 0.58, + "grad_norm": 0.6896679043815622, + "learning_rate": 1.9615428669697935e-06, + "loss": 0.6875, + "step": 3886 + }, + { + "epoch": 0.58, + "grad_norm": 2.274744898127028, + "learning_rate": 1.961516328646497e-06, + "loss": 0.668, + "step": 3887 + }, + { + "epoch": 0.58, + "grad_norm": 1.96287002777643, + "learning_rate": 1.961489781349284e-06, + "loss": 0.6868, + "step": 3888 + }, + { + "epoch": 0.58, + "grad_norm": 6.819331479056289, + "learning_rate": 1.961463225078402e-06, + "loss": 0.7064, + "step": 3889 + }, + { + "epoch": 0.58, + "grad_norm": 7.367772802069378, + "learning_rate": 1.9614366598340984e-06, + "loss": 0.6999, + "step": 3890 + }, + { + "epoch": 0.58, + "grad_norm": 0.931555628527842, + "learning_rate": 1.961410085616622e-06, + "loss": 0.679, + "step": 3891 + }, + { + "epoch": 0.58, + "grad_norm": 6.511273571548001, + "learning_rate": 1.96138350242622e-06, + "loss": 0.6797, + "step": 3892 + }, + { + "epoch": 0.58, + "grad_norm": 3.582701370324537, + "learning_rate": 1.9613569102631403e-06, + "loss": 0.6921, + "step": 3893 + }, + { + "epoch": 0.58, + "grad_norm": 2.2743651792970305, + "learning_rate": 1.9613303091276317e-06, + "loss": 0.6777, + "step": 3894 + }, + { + "epoch": 0.58, + "grad_norm": 5.668922470201758, + "learning_rate": 1.9613036990199423e-06, + "loss": 0.6999, + "step": 3895 + }, + { + "epoch": 0.58, + "grad_norm": 6.420067242913327, + "learning_rate": 1.961277079940321e-06, + "loss": 0.6999, + "step": 3896 + }, + { + "epoch": 0.58, + "grad_norm": 7.787314281498529, + "learning_rate": 1.961250451889015e-06, + "loss": 0.6784, + "step": 3897 + }, + { + "epoch": 0.58, + "grad_norm": 2.3040130575266384, + "learning_rate": 1.961223814866274e-06, + "loss": 0.6836, + "step": 3898 + }, + { + "epoch": 0.58, + "grad_norm": 0.7393538497158081, + "learning_rate": 1.961197168872346e-06, + "loss": 0.6862, + "step": 3899 + }, + { + "epoch": 0.58, + "grad_norm": 4.174397950270958, + "learning_rate": 1.96117051390748e-06, + "loss": 0.6895, + "step": 3900 + }, + { + "epoch": 0.58, + "grad_norm": 5.552105489514348, + "learning_rate": 1.961143849971924e-06, + "loss": 0.6855, + "step": 3901 + }, + { + "epoch": 0.58, + "grad_norm": 1.9620320726923441, + "learning_rate": 1.9611171770659276e-06, + "loss": 0.6764, + "step": 3902 + }, + { + "epoch": 0.58, + "grad_norm": 1.450534173513307, + "learning_rate": 1.96109049518974e-06, + "loss": 0.6855, + "step": 3903 + }, + { + "epoch": 0.58, + "grad_norm": 4.617473614216905, + "learning_rate": 1.9610638043436097e-06, + "loss": 0.6868, + "step": 3904 + }, + { + "epoch": 0.58, + "grad_norm": 1.5668238659891371, + "learning_rate": 1.9610371045277857e-06, + "loss": 0.6927, + "step": 3905 + }, + { + "epoch": 0.58, + "grad_norm": 1.7820542585544608, + "learning_rate": 1.9610103957425176e-06, + "loss": 0.6966, + "step": 3906 + }, + { + "epoch": 0.58, + "grad_norm": 1.3573145538801878, + "learning_rate": 1.9609836779880545e-06, + "loss": 0.6986, + "step": 3907 + }, + { + "epoch": 0.58, + "grad_norm": 2.5338790594619836, + "learning_rate": 1.9609569512646454e-06, + "loss": 0.6895, + "step": 3908 + }, + { + "epoch": 0.58, + "grad_norm": 3.7075086817340472, + "learning_rate": 1.9609302155725407e-06, + "loss": 0.7057, + "step": 3909 + }, + { + "epoch": 0.58, + "grad_norm": 4.90564956342266, + "learning_rate": 1.9609034709119888e-06, + "loss": 0.6908, + "step": 3910 + }, + { + "epoch": 0.58, + "grad_norm": 2.8730321773919334, + "learning_rate": 1.96087671728324e-06, + "loss": 0.6719, + "step": 3911 + }, + { + "epoch": 0.58, + "grad_norm": 2.7458495144023884, + "learning_rate": 1.960849954686544e-06, + "loss": 0.6797, + "step": 3912 + }, + { + "epoch": 0.58, + "grad_norm": 7.519976055124276, + "learning_rate": 1.96082318312215e-06, + "loss": 0.6849, + "step": 3913 + }, + { + "epoch": 0.58, + "grad_norm": 0.6314889539762295, + "learning_rate": 1.960796402590309e-06, + "loss": 0.6836, + "step": 3914 + }, + { + "epoch": 0.58, + "grad_norm": 4.500973895720287, + "learning_rate": 1.9607696130912696e-06, + "loss": 0.6855, + "step": 3915 + }, + { + "epoch": 0.58, + "grad_norm": 2.784727286440555, + "learning_rate": 1.9607428146252825e-06, + "loss": 0.679, + "step": 3916 + }, + { + "epoch": 0.58, + "grad_norm": 1.3693557616687644, + "learning_rate": 1.9607160071925978e-06, + "loss": 0.6771, + "step": 3917 + }, + { + "epoch": 0.58, + "grad_norm": 0.8853740135386574, + "learning_rate": 1.960689190793466e-06, + "loss": 0.6797, + "step": 3918 + }, + { + "epoch": 0.58, + "grad_norm": 5.102566016979119, + "learning_rate": 1.960662365428136e-06, + "loss": 0.6875, + "step": 3919 + }, + { + "epoch": 0.58, + "grad_norm": 4.341281006733201, + "learning_rate": 1.96063553109686e-06, + "loss": 0.6868, + "step": 3920 + }, + { + "epoch": 0.58, + "grad_norm": 2.6532126380887497, + "learning_rate": 1.9606086877998877e-06, + "loss": 0.6732, + "step": 3921 + }, + { + "epoch": 0.58, + "grad_norm": 1.2995527461786816, + "learning_rate": 1.9605818355374693e-06, + "loss": 0.6882, + "step": 3922 + }, + { + "epoch": 0.59, + "grad_norm": 3.187022434909524, + "learning_rate": 1.960554974309856e-06, + "loss": 0.6615, + "step": 3923 + }, + { + "epoch": 0.59, + "grad_norm": 2.405513982947554, + "learning_rate": 1.9605281041172975e-06, + "loss": 0.6732, + "step": 3924 + }, + { + "epoch": 0.59, + "grad_norm": 5.175441911497109, + "learning_rate": 1.9605012249600457e-06, + "loss": 0.6595, + "step": 3925 + }, + { + "epoch": 0.59, + "grad_norm": 0.577844835985271, + "learning_rate": 1.960474336838351e-06, + "loss": 0.6797, + "step": 3926 + }, + { + "epoch": 0.59, + "grad_norm": 1.2809690172249064, + "learning_rate": 1.9604474397524644e-06, + "loss": 0.681, + "step": 3927 + }, + { + "epoch": 0.59, + "grad_norm": 3.1819926849794826, + "learning_rate": 1.960420533702637e-06, + "loss": 0.6589, + "step": 3928 + }, + { + "epoch": 0.59, + "grad_norm": 8.994122820097765, + "learning_rate": 1.9603936186891196e-06, + "loss": 0.666, + "step": 3929 + }, + { + "epoch": 0.59, + "grad_norm": 2.0055661193507377, + "learning_rate": 1.9603666947121637e-06, + "loss": 0.6921, + "step": 3930 + }, + { + "epoch": 0.59, + "grad_norm": 2.3750371687971645, + "learning_rate": 1.960339761772021e-06, + "loss": 0.6478, + "step": 3931 + }, + { + "epoch": 0.59, + "grad_norm": 1.9185301810686155, + "learning_rate": 1.9603128198689416e-06, + "loss": 0.6888, + "step": 3932 + }, + { + "epoch": 0.59, + "grad_norm": 1.8272131350141942, + "learning_rate": 1.960285869003178e-06, + "loss": 0.7207, + "step": 3933 + }, + { + "epoch": 0.59, + "grad_norm": 5.892668916990297, + "learning_rate": 1.9602589091749814e-06, + "loss": 0.6738, + "step": 3934 + }, + { + "epoch": 0.59, + "grad_norm": 8.997270861776444, + "learning_rate": 1.960231940384604e-06, + "loss": 0.7311, + "step": 3935 + }, + { + "epoch": 0.59, + "grad_norm": 4.366762084384216, + "learning_rate": 1.960204962632296e-06, + "loss": 0.6823, + "step": 3936 + }, + { + "epoch": 0.59, + "grad_norm": 1.2174896554451022, + "learning_rate": 1.960177975918311e-06, + "loss": 0.6914, + "step": 3937 + }, + { + "epoch": 0.59, + "grad_norm": 4.326649322533905, + "learning_rate": 1.9601509802428998e-06, + "loss": 0.6634, + "step": 3938 + }, + { + "epoch": 0.59, + "grad_norm": 0.9605417845694567, + "learning_rate": 1.9601239756063146e-06, + "loss": 0.7109, + "step": 3939 + }, + { + "epoch": 0.59, + "grad_norm": 5.682364880252619, + "learning_rate": 1.9600969620088073e-06, + "loss": 0.7135, + "step": 3940 + }, + { + "epoch": 0.59, + "grad_norm": 2.3441056391409982, + "learning_rate": 1.96006993945063e-06, + "loss": 0.7051, + "step": 3941 + }, + { + "epoch": 0.59, + "grad_norm": 4.744669484340444, + "learning_rate": 1.9600429079320354e-06, + "loss": 0.6836, + "step": 3942 + }, + { + "epoch": 0.59, + "grad_norm": 1.1607581795770285, + "learning_rate": 1.9600158674532752e-06, + "loss": 0.6797, + "step": 3943 + }, + { + "epoch": 0.59, + "grad_norm": 1.9618435704119774, + "learning_rate": 1.959988818014602e-06, + "loss": 0.7181, + "step": 3944 + }, + { + "epoch": 0.59, + "grad_norm": 4.451522302465227, + "learning_rate": 1.9599617596162687e-06, + "loss": 0.6868, + "step": 3945 + }, + { + "epoch": 0.59, + "grad_norm": 3.455942226359356, + "learning_rate": 1.959934692258527e-06, + "loss": 0.6699, + "step": 3946 + }, + { + "epoch": 0.59, + "grad_norm": 6.37638994941778, + "learning_rate": 1.95990761594163e-06, + "loss": 0.7005, + "step": 3947 + }, + { + "epoch": 0.59, + "grad_norm": 3.5476921122875082, + "learning_rate": 1.9598805306658305e-06, + "loss": 0.6862, + "step": 3948 + }, + { + "epoch": 0.59, + "grad_norm": 1.4307311102620996, + "learning_rate": 1.959853436431381e-06, + "loss": 0.6947, + "step": 3949 + }, + { + "epoch": 0.59, + "grad_norm": 3.7359321415145472, + "learning_rate": 1.959826333238534e-06, + "loss": 0.6751, + "step": 3950 + }, + { + "epoch": 0.59, + "grad_norm": 0.9723810924923737, + "learning_rate": 1.9597992210875437e-06, + "loss": 0.6823, + "step": 3951 + }, + { + "epoch": 0.59, + "grad_norm": 5.17342934726576, + "learning_rate": 1.9597720999786623e-06, + "loss": 0.6875, + "step": 3952 + }, + { + "epoch": 0.59, + "grad_norm": 6.028101116324916, + "learning_rate": 1.9597449699121427e-06, + "loss": 0.6784, + "step": 3953 + }, + { + "epoch": 0.59, + "grad_norm": 1.8474883908780593, + "learning_rate": 1.9597178308882387e-06, + "loss": 0.6908, + "step": 3954 + }, + { + "epoch": 0.59, + "grad_norm": 1.4615231514906086, + "learning_rate": 1.9596906829072027e-06, + "loss": 0.6829, + "step": 3955 + }, + { + "epoch": 0.59, + "grad_norm": 5.481818475765517, + "learning_rate": 1.9596635259692892e-06, + "loss": 0.6803, + "step": 3956 + }, + { + "epoch": 0.59, + "grad_norm": 4.095893466086206, + "learning_rate": 1.959636360074751e-06, + "loss": 0.6921, + "step": 3957 + }, + { + "epoch": 0.59, + "grad_norm": 0.48145398992197186, + "learning_rate": 1.959609185223842e-06, + "loss": 0.6823, + "step": 3958 + }, + { + "epoch": 0.59, + "grad_norm": 3.699358222158069, + "learning_rate": 1.9595820014168157e-06, + "loss": 0.6829, + "step": 3959 + }, + { + "epoch": 0.59, + "grad_norm": 9.446082113708135, + "learning_rate": 1.9595548086539253e-06, + "loss": 0.6921, + "step": 3960 + }, + { + "epoch": 0.59, + "grad_norm": 3.8202587940209765, + "learning_rate": 1.959527606935425e-06, + "loss": 0.696, + "step": 3961 + }, + { + "epoch": 0.59, + "grad_norm": 4.497576533677218, + "learning_rate": 1.959500396261569e-06, + "loss": 0.6803, + "step": 3962 + }, + { + "epoch": 0.59, + "grad_norm": 2.1184523794685934, + "learning_rate": 1.959473176632611e-06, + "loss": 0.6803, + "step": 3963 + }, + { + "epoch": 0.59, + "grad_norm": 4.332206006103665, + "learning_rate": 1.959445948048805e-06, + "loss": 0.6862, + "step": 3964 + }, + { + "epoch": 0.59, + "grad_norm": 4.68970349252278, + "learning_rate": 1.959418710510405e-06, + "loss": 0.694, + "step": 3965 + }, + { + "epoch": 0.59, + "grad_norm": 1.8720707643990415, + "learning_rate": 1.959391464017665e-06, + "loss": 0.6875, + "step": 3966 + }, + { + "epoch": 0.59, + "grad_norm": 5.5223639259689445, + "learning_rate": 1.95936420857084e-06, + "loss": 0.6927, + "step": 3967 + }, + { + "epoch": 0.59, + "grad_norm": 3.210141192386821, + "learning_rate": 1.9593369441701835e-06, + "loss": 0.6973, + "step": 3968 + }, + { + "epoch": 0.59, + "grad_norm": 0.8251436365953215, + "learning_rate": 1.9593096708159512e-06, + "loss": 0.6901, + "step": 3969 + }, + { + "epoch": 0.59, + "grad_norm": 0.952787767280889, + "learning_rate": 1.9592823885083965e-06, + "loss": 0.6849, + "step": 3970 + }, + { + "epoch": 0.59, + "grad_norm": 0.47106175284083895, + "learning_rate": 1.959255097247774e-06, + "loss": 0.6855, + "step": 3971 + }, + { + "epoch": 0.59, + "grad_norm": 2.5801544550086577, + "learning_rate": 1.9592277970343397e-06, + "loss": 0.681, + "step": 3972 + }, + { + "epoch": 0.59, + "grad_norm": 4.0360853326189705, + "learning_rate": 1.9592004878683477e-06, + "loss": 0.6771, + "step": 3973 + }, + { + "epoch": 0.59, + "grad_norm": 2.0331613824577497, + "learning_rate": 1.9591731697500517e-06, + "loss": 0.6706, + "step": 3974 + }, + { + "epoch": 0.59, + "grad_norm": 2.0880831188245446, + "learning_rate": 1.9591458426797084e-06, + "loss": 0.6908, + "step": 3975 + }, + { + "epoch": 0.59, + "grad_norm": 0.6235330072136963, + "learning_rate": 1.959118506657572e-06, + "loss": 0.6934, + "step": 3976 + }, + { + "epoch": 0.59, + "grad_norm": 6.50897127400615, + "learning_rate": 1.9590911616838974e-06, + "loss": 0.6901, + "step": 3977 + }, + { + "epoch": 0.59, + "grad_norm": 2.2132503647731894, + "learning_rate": 1.9590638077589404e-06, + "loss": 0.6816, + "step": 3978 + }, + { + "epoch": 0.59, + "grad_norm": 0.5797205435082124, + "learning_rate": 1.9590364448829563e-06, + "loss": 0.6842, + "step": 3979 + }, + { + "epoch": 0.59, + "grad_norm": 0.5154246648316986, + "learning_rate": 1.9590090730562e-06, + "loss": 0.6842, + "step": 3980 + }, + { + "epoch": 0.59, + "grad_norm": 3.696614340214914, + "learning_rate": 1.9589816922789274e-06, + "loss": 0.6921, + "step": 3981 + }, + { + "epoch": 0.59, + "grad_norm": 0.8819309561142313, + "learning_rate": 1.9589543025513933e-06, + "loss": 0.6738, + "step": 3982 + }, + { + "epoch": 0.59, + "grad_norm": 4.8373643823969354, + "learning_rate": 1.9589269038738545e-06, + "loss": 0.6706, + "step": 3983 + }, + { + "epoch": 0.59, + "grad_norm": 7.246328179295314, + "learning_rate": 1.958899496246566e-06, + "loss": 0.6868, + "step": 3984 + }, + { + "epoch": 0.59, + "grad_norm": 1.026683009599949, + "learning_rate": 1.9588720796697837e-06, + "loss": 0.6712, + "step": 3985 + }, + { + "epoch": 0.59, + "grad_norm": 5.787480409276666, + "learning_rate": 1.9588446541437633e-06, + "loss": 0.679, + "step": 3986 + }, + { + "epoch": 0.59, + "grad_norm": 0.8466456183937954, + "learning_rate": 1.958817219668761e-06, + "loss": 0.6764, + "step": 3987 + }, + { + "epoch": 0.59, + "grad_norm": 1.9838579890845673, + "learning_rate": 1.958789776245033e-06, + "loss": 0.6738, + "step": 3988 + }, + { + "epoch": 0.59, + "grad_norm": 0.935653188206649, + "learning_rate": 1.958762323872835e-06, + "loss": 0.6934, + "step": 3989 + }, + { + "epoch": 0.6, + "grad_norm": 3.460340974416339, + "learning_rate": 1.9587348625524235e-06, + "loss": 0.7012, + "step": 3990 + }, + { + "epoch": 0.6, + "grad_norm": 2.56482380113766, + "learning_rate": 1.9587073922840547e-06, + "loss": 0.7038, + "step": 3991 + }, + { + "epoch": 0.6, + "grad_norm": 3.786009789012627, + "learning_rate": 1.9586799130679853e-06, + "loss": 0.6979, + "step": 3992 + }, + { + "epoch": 0.6, + "grad_norm": 0.6839484663140704, + "learning_rate": 1.958652424904471e-06, + "loss": 0.6615, + "step": 3993 + }, + { + "epoch": 0.6, + "grad_norm": 1.5774055451904492, + "learning_rate": 1.958624927793769e-06, + "loss": 0.6875, + "step": 3994 + }, + { + "epoch": 0.6, + "grad_norm": 1.3932997062958101, + "learning_rate": 1.958597421736136e-06, + "loss": 0.6927, + "step": 3995 + }, + { + "epoch": 0.6, + "grad_norm": 3.303530239188593, + "learning_rate": 1.9585699067318285e-06, + "loss": 0.6914, + "step": 3996 + }, + { + "epoch": 0.6, + "grad_norm": 1.1386108801105643, + "learning_rate": 1.958542382781103e-06, + "loss": 0.6699, + "step": 3997 + }, + { + "epoch": 0.6, + "grad_norm": 4.066223885972834, + "learning_rate": 1.9585148498842168e-06, + "loss": 0.6602, + "step": 3998 + }, + { + "epoch": 0.6, + "grad_norm": 1.9795628169799635, + "learning_rate": 1.9584873080414268e-06, + "loss": 0.6973, + "step": 3999 + }, + { + "epoch": 0.6, + "grad_norm": 2.156355283906168, + "learning_rate": 1.95845975725299e-06, + "loss": 0.6855, + "step": 4000 + }, + { + "epoch": 0.6, + "grad_norm": 8.616608917664683, + "learning_rate": 1.9584321975191634e-06, + "loss": 0.7057, + "step": 4001 + }, + { + "epoch": 0.6, + "grad_norm": 1.3174820690563822, + "learning_rate": 1.9584046288402046e-06, + "loss": 0.6582, + "step": 4002 + }, + { + "epoch": 0.6, + "grad_norm": 4.7891821846453935, + "learning_rate": 1.9583770512163704e-06, + "loss": 0.696, + "step": 4003 + }, + { + "epoch": 0.6, + "grad_norm": 1.412268070830071, + "learning_rate": 1.958349464647918e-06, + "loss": 0.6693, + "step": 4004 + }, + { + "epoch": 0.6, + "grad_norm": 1.0452410112915203, + "learning_rate": 1.9583218691351064e-06, + "loss": 0.696, + "step": 4005 + }, + { + "epoch": 0.6, + "grad_norm": 6.895086306830729, + "learning_rate": 1.9582942646781912e-06, + "loss": 0.6927, + "step": 4006 + }, + { + "epoch": 0.6, + "grad_norm": 3.7135245958281526, + "learning_rate": 1.958266651277431e-06, + "loss": 0.6751, + "step": 4007 + }, + { + "epoch": 0.6, + "grad_norm": 2.4472277938719698, + "learning_rate": 1.9582390289330837e-06, + "loss": 0.6738, + "step": 4008 + }, + { + "epoch": 0.6, + "grad_norm": 2.1797451293710517, + "learning_rate": 1.958211397645407e-06, + "loss": 0.6784, + "step": 4009 + }, + { + "epoch": 0.6, + "grad_norm": 2.572481716305568, + "learning_rate": 1.9581837574146578e-06, + "loss": 0.6699, + "step": 4010 + }, + { + "epoch": 0.6, + "grad_norm": 3.014340972075616, + "learning_rate": 1.958156108241095e-06, + "loss": 0.6947, + "step": 4011 + }, + { + "epoch": 0.6, + "grad_norm": 5.464016429637632, + "learning_rate": 1.958128450124977e-06, + "loss": 0.6693, + "step": 4012 + }, + { + "epoch": 0.6, + "grad_norm": 1.3502021434163196, + "learning_rate": 1.958100783066561e-06, + "loss": 0.6914, + "step": 4013 + }, + { + "epoch": 0.6, + "grad_norm": 0.5099893046547015, + "learning_rate": 1.9580731070661057e-06, + "loss": 0.6947, + "step": 4014 + }, + { + "epoch": 0.6, + "grad_norm": 4.83322625940944, + "learning_rate": 1.9580454221238696e-06, + "loss": 0.6823, + "step": 4015 + }, + { + "epoch": 0.6, + "grad_norm": 4.192420526147238, + "learning_rate": 1.958017728240111e-06, + "loss": 0.6751, + "step": 4016 + }, + { + "epoch": 0.6, + "grad_norm": 2.90321364124129, + "learning_rate": 1.957990025415088e-06, + "loss": 0.6882, + "step": 4017 + }, + { + "epoch": 0.6, + "grad_norm": 3.6265197757381697, + "learning_rate": 1.9579623136490594e-06, + "loss": 0.6934, + "step": 4018 + }, + { + "epoch": 0.6, + "grad_norm": 2.761812727392289, + "learning_rate": 1.9579345929422835e-06, + "loss": 0.6953, + "step": 4019 + }, + { + "epoch": 0.6, + "grad_norm": 6.52918903678139, + "learning_rate": 1.95790686329502e-06, + "loss": 0.6764, + "step": 4020 + }, + { + "epoch": 0.6, + "grad_norm": 4.141868885031518, + "learning_rate": 1.9578791247075266e-06, + "loss": 0.6764, + "step": 4021 + }, + { + "epoch": 0.6, + "grad_norm": 3.496821379513251, + "learning_rate": 1.9578513771800623e-06, + "loss": 0.6862, + "step": 4022 + }, + { + "epoch": 0.6, + "grad_norm": 3.1557683368975713, + "learning_rate": 1.957823620712887e-06, + "loss": 0.7025, + "step": 4023 + }, + { + "epoch": 0.6, + "grad_norm": 6.142439758743948, + "learning_rate": 1.9577958553062588e-06, + "loss": 0.7018, + "step": 4024 + }, + { + "epoch": 0.6, + "grad_norm": 4.163085933674491, + "learning_rate": 1.9577680809604375e-06, + "loss": 0.6562, + "step": 4025 + }, + { + "epoch": 0.6, + "grad_norm": 3.7145914203334987, + "learning_rate": 1.9577402976756814e-06, + "loss": 0.6868, + "step": 4026 + }, + { + "epoch": 0.6, + "grad_norm": 9.895840964447812, + "learning_rate": 1.957712505452251e-06, + "loss": 0.6784, + "step": 4027 + }, + { + "epoch": 0.6, + "grad_norm": 3.1594590330718444, + "learning_rate": 1.9576847042904047e-06, + "loss": 0.6667, + "step": 4028 + }, + { + "epoch": 0.6, + "grad_norm": 1.0794737136176846, + "learning_rate": 1.9576568941904024e-06, + "loss": 0.6842, + "step": 4029 + }, + { + "epoch": 0.6, + "grad_norm": 4.004487958916995, + "learning_rate": 1.957629075152504e-06, + "loss": 0.7148, + "step": 4030 + }, + { + "epoch": 0.6, + "grad_norm": 4.3398644028515125, + "learning_rate": 1.9576012471769686e-06, + "loss": 0.7012, + "step": 4031 + }, + { + "epoch": 0.6, + "grad_norm": 4.4921513027963345, + "learning_rate": 1.9575734102640558e-06, + "loss": 0.6829, + "step": 4032 + }, + { + "epoch": 0.6, + "grad_norm": 4.404839557672954, + "learning_rate": 1.957545564414026e-06, + "loss": 0.7064, + "step": 4033 + }, + { + "epoch": 0.6, + "grad_norm": 2.5573902984541856, + "learning_rate": 1.9575177096271386e-06, + "loss": 0.6999, + "step": 4034 + }, + { + "epoch": 0.6, + "grad_norm": 0.938654157431562, + "learning_rate": 1.9574898459036535e-06, + "loss": 0.6849, + "step": 4035 + }, + { + "epoch": 0.6, + "grad_norm": 0.9264968961721327, + "learning_rate": 1.9574619732438315e-06, + "loss": 0.6699, + "step": 4036 + }, + { + "epoch": 0.6, + "grad_norm": 5.9247590474089495, + "learning_rate": 1.957434091647932e-06, + "loss": 0.6823, + "step": 4037 + }, + { + "epoch": 0.6, + "grad_norm": 2.1660168307275596, + "learning_rate": 1.9574062011162155e-06, + "loss": 0.6947, + "step": 4038 + }, + { + "epoch": 0.6, + "grad_norm": 1.4541799612820976, + "learning_rate": 1.957378301648942e-06, + "loss": 0.6803, + "step": 4039 + }, + { + "epoch": 0.6, + "grad_norm": 0.6280357913784317, + "learning_rate": 1.9573503932463723e-06, + "loss": 0.694, + "step": 4040 + }, + { + "epoch": 0.6, + "grad_norm": 7.599817030930131, + "learning_rate": 1.9573224759087666e-06, + "loss": 0.6901, + "step": 4041 + }, + { + "epoch": 0.6, + "grad_norm": 1.8656981710514706, + "learning_rate": 1.957294549636386e-06, + "loss": 0.6823, + "step": 4042 + }, + { + "epoch": 0.6, + "grad_norm": 2.65739125697555, + "learning_rate": 1.95726661442949e-06, + "loss": 0.6823, + "step": 4043 + }, + { + "epoch": 0.6, + "grad_norm": 6.615779307213479, + "learning_rate": 1.9572386702883406e-06, + "loss": 0.6921, + "step": 4044 + }, + { + "epoch": 0.6, + "grad_norm": 3.1292103854380584, + "learning_rate": 1.957210717213198e-06, + "loss": 0.7057, + "step": 4045 + }, + { + "epoch": 0.6, + "grad_norm": 0.9394376220672672, + "learning_rate": 1.957182755204323e-06, + "loss": 0.6836, + "step": 4046 + }, + { + "epoch": 0.6, + "grad_norm": 2.8294437181966376, + "learning_rate": 1.9571547842619767e-06, + "loss": 0.6667, + "step": 4047 + }, + { + "epoch": 0.6, + "grad_norm": 1.6863182660208595, + "learning_rate": 1.9571268043864204e-06, + "loss": 0.6829, + "step": 4048 + }, + { + "epoch": 0.6, + "grad_norm": 4.1507089730724935, + "learning_rate": 1.9570988155779144e-06, + "loss": 0.6921, + "step": 4049 + }, + { + "epoch": 0.6, + "grad_norm": 2.993811303083122, + "learning_rate": 1.957070817836721e-06, + "loss": 0.6816, + "step": 4050 + }, + { + "epoch": 0.6, + "grad_norm": 2.6062388042739855, + "learning_rate": 1.957042811163101e-06, + "loss": 0.6738, + "step": 4051 + }, + { + "epoch": 0.6, + "grad_norm": 5.909260849054104, + "learning_rate": 1.9570147955573155e-06, + "loss": 0.6934, + "step": 4052 + }, + { + "epoch": 0.6, + "grad_norm": 4.647787474408874, + "learning_rate": 1.9569867710196262e-06, + "loss": 0.6836, + "step": 4053 + }, + { + "epoch": 0.6, + "grad_norm": 0.6769760047530439, + "learning_rate": 1.956958737550295e-06, + "loss": 0.6842, + "step": 4054 + }, + { + "epoch": 0.6, + "grad_norm": 6.445888198285777, + "learning_rate": 1.9569306951495834e-06, + "loss": 0.6797, + "step": 4055 + }, + { + "epoch": 0.6, + "grad_norm": 2.1654036358245947, + "learning_rate": 1.956902643817753e-06, + "loss": 0.696, + "step": 4056 + }, + { + "epoch": 0.61, + "grad_norm": 10.176039156737428, + "learning_rate": 1.9568745835550654e-06, + "loss": 0.6855, + "step": 4057 + }, + { + "epoch": 0.61, + "grad_norm": 0.8835827114953932, + "learning_rate": 1.956846514361783e-06, + "loss": 0.6803, + "step": 4058 + }, + { + "epoch": 0.61, + "grad_norm": 1.545362533324015, + "learning_rate": 1.956818436238167e-06, + "loss": 0.6914, + "step": 4059 + }, + { + "epoch": 0.61, + "grad_norm": 1.707571300534269, + "learning_rate": 1.95679034918448e-06, + "loss": 0.6882, + "step": 4060 + }, + { + "epoch": 0.61, + "grad_norm": 3.023370447782207, + "learning_rate": 1.9567622532009842e-06, + "loss": 0.7005, + "step": 4061 + }, + { + "epoch": 0.61, + "grad_norm": 10.622025922313558, + "learning_rate": 1.9567341482879417e-06, + "loss": 0.7161, + "step": 4062 + }, + { + "epoch": 0.61, + "grad_norm": 1.3582454804070343, + "learning_rate": 1.9567060344456146e-06, + "loss": 0.6699, + "step": 4063 + }, + { + "epoch": 0.61, + "grad_norm": 8.05576808461557, + "learning_rate": 1.9566779116742656e-06, + "loss": 0.6849, + "step": 4064 + }, + { + "epoch": 0.61, + "grad_norm": 3.802588523011091, + "learning_rate": 1.9566497799741572e-06, + "loss": 0.6686, + "step": 4065 + }, + { + "epoch": 0.61, + "grad_norm": 4.044516525304923, + "learning_rate": 1.956621639345552e-06, + "loss": 0.6784, + "step": 4066 + }, + { + "epoch": 0.61, + "grad_norm": 1.2919354596622716, + "learning_rate": 1.956593489788712e-06, + "loss": 0.6732, + "step": 4067 + }, + { + "epoch": 0.61, + "grad_norm": 3.1860309055783156, + "learning_rate": 1.9565653313039008e-06, + "loss": 0.6686, + "step": 4068 + }, + { + "epoch": 0.61, + "grad_norm": 3.0705129014451185, + "learning_rate": 1.9565371638913803e-06, + "loss": 0.6797, + "step": 4069 + }, + { + "epoch": 0.61, + "grad_norm": 1.6001056723213165, + "learning_rate": 1.9565089875514137e-06, + "loss": 0.696, + "step": 4070 + }, + { + "epoch": 0.61, + "grad_norm": 3.232888759091858, + "learning_rate": 1.9564808022842645e-06, + "loss": 0.6862, + "step": 4071 + }, + { + "epoch": 0.61, + "grad_norm": 8.53416863475318, + "learning_rate": 1.9564526080901957e-06, + "loss": 0.7057, + "step": 4072 + }, + { + "epoch": 0.61, + "grad_norm": 4.407478958887545, + "learning_rate": 1.9564244049694695e-06, + "loss": 0.6758, + "step": 4073 + }, + { + "epoch": 0.61, + "grad_norm": 0.8395975075415735, + "learning_rate": 1.95639619292235e-06, + "loss": 0.6823, + "step": 4074 + }, + { + "epoch": 0.61, + "grad_norm": 1.9959129523698793, + "learning_rate": 1.9563679719491004e-06, + "loss": 0.6784, + "step": 4075 + }, + { + "epoch": 0.61, + "grad_norm": 0.6186639873051004, + "learning_rate": 1.9563397420499836e-06, + "loss": 0.666, + "step": 4076 + }, + { + "epoch": 0.61, + "grad_norm": 5.325254652146319, + "learning_rate": 1.9563115032252635e-06, + "loss": 0.6914, + "step": 4077 + }, + { + "epoch": 0.61, + "grad_norm": 6.986802091238508, + "learning_rate": 1.956283255475204e-06, + "loss": 0.6764, + "step": 4078 + }, + { + "epoch": 0.61, + "grad_norm": 0.6346504051153264, + "learning_rate": 1.9562549988000678e-06, + "loss": 0.6784, + "step": 4079 + }, + { + "epoch": 0.61, + "grad_norm": 8.714287781105607, + "learning_rate": 1.956226733200119e-06, + "loss": 0.6868, + "step": 4080 + }, + { + "epoch": 0.61, + "grad_norm": 6.320011142950522, + "learning_rate": 1.956198458675622e-06, + "loss": 0.696, + "step": 4081 + }, + { + "epoch": 0.61, + "grad_norm": 3.85701226962054, + "learning_rate": 1.9561701752268404e-06, + "loss": 0.6654, + "step": 4082 + }, + { + "epoch": 0.61, + "grad_norm": 10.371641274754898, + "learning_rate": 1.9561418828540374e-06, + "loss": 0.6999, + "step": 4083 + }, + { + "epoch": 0.61, + "grad_norm": 0.7646446542977416, + "learning_rate": 1.956113581557478e-06, + "loss": 0.6875, + "step": 4084 + }, + { + "epoch": 0.61, + "grad_norm": 0.6472112592791361, + "learning_rate": 1.956085271337426e-06, + "loss": 0.6823, + "step": 4085 + }, + { + "epoch": 0.61, + "grad_norm": 3.83600242593057, + "learning_rate": 1.9560569521941454e-06, + "loss": 0.6842, + "step": 4086 + }, + { + "epoch": 0.61, + "grad_norm": 7.964624928329822, + "learning_rate": 1.9560286241279008e-06, + "loss": 0.7279, + "step": 4087 + }, + { + "epoch": 0.61, + "grad_norm": 6.207567676620581, + "learning_rate": 1.9560002871389565e-06, + "loss": 0.6999, + "step": 4088 + }, + { + "epoch": 0.61, + "grad_norm": 1.739990399510825, + "learning_rate": 1.955971941227577e-06, + "loss": 0.6764, + "step": 4089 + }, + { + "epoch": 0.61, + "grad_norm": 8.021430939342034, + "learning_rate": 1.955943586394027e-06, + "loss": 0.6895, + "step": 4090 + }, + { + "epoch": 0.61, + "grad_norm": 0.7897120484412785, + "learning_rate": 1.9559152226385707e-06, + "loss": 0.6777, + "step": 4091 + }, + { + "epoch": 0.61, + "grad_norm": 0.7126435579093503, + "learning_rate": 1.9558868499614733e-06, + "loss": 0.6934, + "step": 4092 + }, + { + "epoch": 0.61, + "grad_norm": 4.207362716406132, + "learning_rate": 1.9558584683629992e-06, + "loss": 0.6628, + "step": 4093 + }, + { + "epoch": 0.61, + "grad_norm": 1.3618056747164842, + "learning_rate": 1.9558300778434137e-06, + "loss": 0.6823, + "step": 4094 + }, + { + "epoch": 0.61, + "grad_norm": 0.6256646483694887, + "learning_rate": 1.9558016784029814e-06, + "loss": 0.6706, + "step": 4095 + }, + { + "epoch": 0.61, + "grad_norm": 6.000378567130769, + "learning_rate": 1.9557732700419677e-06, + "loss": 0.6738, + "step": 4096 + }, + { + "epoch": 0.61, + "grad_norm": 5.444108857460889, + "learning_rate": 1.955744852760637e-06, + "loss": 0.6855, + "step": 4097 + }, + { + "epoch": 0.61, + "grad_norm": 6.497843608850112, + "learning_rate": 1.9557164265592555e-06, + "loss": 0.6751, + "step": 4098 + }, + { + "epoch": 0.61, + "grad_norm": 6.1969008282766405, + "learning_rate": 1.9556879914380883e-06, + "loss": 0.6771, + "step": 4099 + }, + { + "epoch": 0.61, + "grad_norm": 1.7060918600121062, + "learning_rate": 1.9556595473974e-06, + "loss": 0.6712, + "step": 4100 + }, + { + "epoch": 0.61, + "grad_norm": 4.100710352774798, + "learning_rate": 1.9556310944374573e-06, + "loss": 0.6908, + "step": 4101 + }, + { + "epoch": 0.61, + "grad_norm": 1.8267797665845775, + "learning_rate": 1.9556026325585244e-06, + "loss": 0.724, + "step": 4102 + }, + { + "epoch": 0.61, + "grad_norm": 0.9916786860139205, + "learning_rate": 1.9555741617608682e-06, + "loss": 0.679, + "step": 4103 + }, + { + "epoch": 0.61, + "grad_norm": 0.6988267948305379, + "learning_rate": 1.9555456820447535e-06, + "loss": 0.6888, + "step": 4104 + }, + { + "epoch": 0.61, + "grad_norm": 0.8748325617345508, + "learning_rate": 1.9555171934104465e-06, + "loss": 0.7038, + "step": 4105 + }, + { + "epoch": 0.61, + "grad_norm": 2.273041168778825, + "learning_rate": 1.955488695858213e-06, + "loss": 0.6777, + "step": 4106 + }, + { + "epoch": 0.61, + "grad_norm": 1.1385330700828, + "learning_rate": 1.955460189388319e-06, + "loss": 0.6719, + "step": 4107 + }, + { + "epoch": 0.61, + "grad_norm": 11.471802393442134, + "learning_rate": 1.9554316740010303e-06, + "loss": 0.6914, + "step": 4108 + }, + { + "epoch": 0.61, + "grad_norm": 0.60225655354363, + "learning_rate": 1.9554031496966137e-06, + "loss": 0.6966, + "step": 4109 + }, + { + "epoch": 0.61, + "grad_norm": 4.880698833904422, + "learning_rate": 1.955374616475335e-06, + "loss": 0.6621, + "step": 4110 + }, + { + "epoch": 0.61, + "grad_norm": 1.1981205891515465, + "learning_rate": 1.9553460743374607e-06, + "loss": 0.6868, + "step": 4111 + }, + { + "epoch": 0.61, + "grad_norm": 1.4394497775676525, + "learning_rate": 1.9553175232832566e-06, + "loss": 0.7083, + "step": 4112 + }, + { + "epoch": 0.61, + "grad_norm": 0.4563103330701213, + "learning_rate": 1.9552889633129897e-06, + "loss": 0.6882, + "step": 4113 + }, + { + "epoch": 0.61, + "grad_norm": 3.8958873668022056, + "learning_rate": 1.955260394426926e-06, + "loss": 0.6836, + "step": 4114 + }, + { + "epoch": 0.61, + "grad_norm": 5.288636772308869, + "learning_rate": 1.9552318166253332e-06, + "loss": 0.6842, + "step": 4115 + }, + { + "epoch": 0.61, + "grad_norm": 0.7377629867514531, + "learning_rate": 1.9552032299084772e-06, + "loss": 0.6953, + "step": 4116 + }, + { + "epoch": 0.61, + "grad_norm": 0.9824247007454538, + "learning_rate": 1.955174634276625e-06, + "loss": 0.6868, + "step": 4117 + }, + { + "epoch": 0.61, + "grad_norm": 0.33270897654326337, + "learning_rate": 1.9551460297300434e-06, + "loss": 0.6842, + "step": 4118 + }, + { + "epoch": 0.61, + "grad_norm": 5.206620515780319, + "learning_rate": 1.9551174162689992e-06, + "loss": 0.681, + "step": 4119 + }, + { + "epoch": 0.61, + "grad_norm": 2.3663717289747557, + "learning_rate": 1.95508879389376e-06, + "loss": 0.6934, + "step": 4120 + }, + { + "epoch": 0.61, + "grad_norm": 0.37055828989072204, + "learning_rate": 1.9550601626045925e-06, + "loss": 0.6855, + "step": 4121 + }, + { + "epoch": 0.61, + "grad_norm": 3.1188731659655793, + "learning_rate": 1.9550315224017644e-06, + "loss": 0.6888, + "step": 4122 + }, + { + "epoch": 0.61, + "grad_norm": 7.965669399036667, + "learning_rate": 1.955002873285542e-06, + "loss": 0.6992, + "step": 4123 + }, + { + "epoch": 0.62, + "grad_norm": 9.371014056858924, + "learning_rate": 1.9549742152561937e-06, + "loss": 0.709, + "step": 4124 + }, + { + "epoch": 0.62, + "grad_norm": 0.39357063278099597, + "learning_rate": 1.954945548313987e-06, + "loss": 0.6829, + "step": 4125 + }, + { + "epoch": 0.62, + "grad_norm": 5.771296444108307, + "learning_rate": 1.9549168724591883e-06, + "loss": 0.6882, + "step": 4126 + }, + { + "epoch": 0.62, + "grad_norm": 1.7696142205659247, + "learning_rate": 1.9548881876920663e-06, + "loss": 0.6745, + "step": 4127 + }, + { + "epoch": 0.62, + "grad_norm": 1.8356061137736008, + "learning_rate": 1.9548594940128884e-06, + "loss": 0.6868, + "step": 4128 + }, + { + "epoch": 0.62, + "grad_norm": 1.1730499246704151, + "learning_rate": 1.9548307914219223e-06, + "loss": 0.6862, + "step": 4129 + }, + { + "epoch": 0.62, + "grad_norm": 0.5062654750385044, + "learning_rate": 1.9548020799194364e-06, + "loss": 0.6953, + "step": 4130 + }, + { + "epoch": 0.62, + "grad_norm": 7.2937594168301185, + "learning_rate": 1.9547733595056977e-06, + "loss": 0.7012, + "step": 4131 + }, + { + "epoch": 0.62, + "grad_norm": 3.863578859767148, + "learning_rate": 1.954744630180975e-06, + "loss": 0.6947, + "step": 4132 + }, + { + "epoch": 0.62, + "grad_norm": 0.8225237834697268, + "learning_rate": 1.9547158919455364e-06, + "loss": 0.6758, + "step": 4133 + }, + { + "epoch": 0.62, + "grad_norm": 2.7607928872730487, + "learning_rate": 1.9546871447996496e-06, + "loss": 0.6895, + "step": 4134 + }, + { + "epoch": 0.62, + "grad_norm": 2.5869722276612857, + "learning_rate": 1.9546583887435835e-06, + "loss": 0.6947, + "step": 4135 + }, + { + "epoch": 0.62, + "grad_norm": 2.3321112713247905, + "learning_rate": 1.954629623777606e-06, + "loss": 0.6947, + "step": 4136 + }, + { + "epoch": 0.62, + "grad_norm": 1.7952671108436595, + "learning_rate": 1.9546008499019862e-06, + "loss": 0.6751, + "step": 4137 + }, + { + "epoch": 0.62, + "grad_norm": 1.6342282794371352, + "learning_rate": 1.954572067116992e-06, + "loss": 0.679, + "step": 4138 + }, + { + "epoch": 0.62, + "grad_norm": 3.1927859685353313, + "learning_rate": 1.9545432754228923e-06, + "loss": 0.6992, + "step": 4139 + }, + { + "epoch": 0.62, + "grad_norm": 0.4323577460198612, + "learning_rate": 1.9545144748199557e-06, + "loss": 0.6836, + "step": 4140 + }, + { + "epoch": 0.62, + "grad_norm": 0.7256016321269123, + "learning_rate": 1.9544856653084514e-06, + "loss": 0.6842, + "step": 4141 + }, + { + "epoch": 0.62, + "grad_norm": 0.4373553795964231, + "learning_rate": 1.9544568468886477e-06, + "loss": 0.6797, + "step": 4142 + }, + { + "epoch": 0.62, + "grad_norm": 0.6636642458744444, + "learning_rate": 1.9544280195608137e-06, + "loss": 0.6777, + "step": 4143 + }, + { + "epoch": 0.62, + "grad_norm": 6.314647349541706, + "learning_rate": 1.954399183325219e-06, + "loss": 0.6686, + "step": 4144 + }, + { + "epoch": 0.62, + "grad_norm": 3.958120659583271, + "learning_rate": 1.954370338182132e-06, + "loss": 0.6986, + "step": 4145 + }, + { + "epoch": 0.62, + "grad_norm": 0.4028061784981056, + "learning_rate": 1.9543414841318223e-06, + "loss": 0.694, + "step": 4146 + }, + { + "epoch": 0.62, + "grad_norm": 1.0193035869633762, + "learning_rate": 1.954312621174559e-06, + "loss": 0.6751, + "step": 4147 + }, + { + "epoch": 0.62, + "grad_norm": 0.7613367163371814, + "learning_rate": 1.954283749310612e-06, + "loss": 0.6771, + "step": 4148 + }, + { + "epoch": 0.62, + "grad_norm": 3.789599544089084, + "learning_rate": 1.95425486854025e-06, + "loss": 0.6868, + "step": 4149 + }, + { + "epoch": 0.62, + "grad_norm": 0.7952586511569316, + "learning_rate": 1.9542259788637435e-06, + "loss": 0.6615, + "step": 4150 + }, + { + "epoch": 0.62, + "grad_norm": 1.0726554058488729, + "learning_rate": 1.954197080281361e-06, + "loss": 0.6699, + "step": 4151 + }, + { + "epoch": 0.62, + "grad_norm": 5.421653641474186, + "learning_rate": 1.9541681727933726e-06, + "loss": 0.6764, + "step": 4152 + }, + { + "epoch": 0.62, + "grad_norm": 3.0985999135880866, + "learning_rate": 1.9541392564000487e-06, + "loss": 0.696, + "step": 4153 + }, + { + "epoch": 0.62, + "grad_norm": 2.233185013152147, + "learning_rate": 1.9541103311016587e-06, + "loss": 0.696, + "step": 4154 + }, + { + "epoch": 0.62, + "grad_norm": 1.2588425071331848, + "learning_rate": 1.9540813968984727e-06, + "loss": 0.681, + "step": 4155 + }, + { + "epoch": 0.62, + "grad_norm": 6.076989831941326, + "learning_rate": 1.95405245379076e-06, + "loss": 0.6888, + "step": 4156 + }, + { + "epoch": 0.62, + "grad_norm": 1.1664806921232618, + "learning_rate": 1.954023501778792e-06, + "loss": 0.6803, + "step": 4157 + }, + { + "epoch": 0.62, + "grad_norm": 5.097725117590515, + "learning_rate": 1.953994540862838e-06, + "loss": 0.6953, + "step": 4158 + }, + { + "epoch": 0.62, + "grad_norm": 2.352728659664546, + "learning_rate": 1.953965571043169e-06, + "loss": 0.6719, + "step": 4159 + }, + { + "epoch": 0.62, + "grad_norm": 0.6753098493358649, + "learning_rate": 1.9539365923200546e-06, + "loss": 0.6927, + "step": 4160 + }, + { + "epoch": 0.62, + "grad_norm": 1.2304742896238494, + "learning_rate": 1.953907604693766e-06, + "loss": 0.6882, + "step": 4161 + }, + { + "epoch": 0.62, + "grad_norm": 8.204827068015192, + "learning_rate": 1.953878608164573e-06, + "loss": 0.6953, + "step": 4162 + }, + { + "epoch": 0.62, + "grad_norm": 4.097934337331125, + "learning_rate": 1.953849602732746e-06, + "loss": 0.7077, + "step": 4163 + }, + { + "epoch": 0.62, + "grad_norm": 0.9781291020207437, + "learning_rate": 1.9538205883985573e-06, + "loss": 0.6719, + "step": 4164 + }, + { + "epoch": 0.62, + "grad_norm": 6.182942485114819, + "learning_rate": 1.9537915651622763e-06, + "loss": 0.6647, + "step": 4165 + }, + { + "epoch": 0.62, + "grad_norm": 3.3307320409059273, + "learning_rate": 1.953762533024174e-06, + "loss": 0.7005, + "step": 4166 + }, + { + "epoch": 0.62, + "grad_norm": 1.5592081163328801, + "learning_rate": 1.953733491984522e-06, + "loss": 0.6816, + "step": 4167 + }, + { + "epoch": 0.62, + "grad_norm": 10.069126444951943, + "learning_rate": 1.953704442043591e-06, + "loss": 0.6979, + "step": 4168 + }, + { + "epoch": 0.62, + "grad_norm": 1.3536039385477654, + "learning_rate": 1.953675383201652e-06, + "loss": 0.6829, + "step": 4169 + }, + { + "epoch": 0.62, + "grad_norm": 0.6492139097338323, + "learning_rate": 1.9536463154589764e-06, + "loss": 0.6602, + "step": 4170 + }, + { + "epoch": 0.62, + "grad_norm": 4.501588515828579, + "learning_rate": 1.9536172388158358e-06, + "loss": 0.6999, + "step": 4171 + }, + { + "epoch": 0.62, + "grad_norm": 0.5745831126512009, + "learning_rate": 1.9535881532725006e-06, + "loss": 0.6719, + "step": 4172 + }, + { + "epoch": 0.62, + "grad_norm": 4.2532796874893695, + "learning_rate": 1.953559058829243e-06, + "loss": 0.679, + "step": 4173 + }, + { + "epoch": 0.62, + "grad_norm": 2.263019057028312, + "learning_rate": 1.953529955486334e-06, + "loss": 0.679, + "step": 4174 + }, + { + "epoch": 0.62, + "grad_norm": 2.2060340306904744, + "learning_rate": 1.953500843244046e-06, + "loss": 0.6803, + "step": 4175 + }, + { + "epoch": 0.62, + "grad_norm": 2.037520951983313, + "learning_rate": 1.9534717221026504e-06, + "loss": 0.6777, + "step": 4176 + }, + { + "epoch": 0.62, + "grad_norm": 1.5349442380853524, + "learning_rate": 1.953442592062419e-06, + "loss": 0.6888, + "step": 4177 + }, + { + "epoch": 0.62, + "grad_norm": 7.059904043121049, + "learning_rate": 1.953413453123623e-06, + "loss": 0.6888, + "step": 4178 + }, + { + "epoch": 0.62, + "grad_norm": 7.9625035810334825, + "learning_rate": 1.9533843052865355e-06, + "loss": 0.6628, + "step": 4179 + }, + { + "epoch": 0.62, + "grad_norm": 3.1875511784796746, + "learning_rate": 1.953355148551427e-06, + "loss": 0.6803, + "step": 4180 + }, + { + "epoch": 0.62, + "grad_norm": 0.7767569224966479, + "learning_rate": 1.9533259829185717e-06, + "loss": 0.6888, + "step": 4181 + }, + { + "epoch": 0.62, + "grad_norm": 0.6620502875382707, + "learning_rate": 1.9532968083882404e-06, + "loss": 0.6875, + "step": 4182 + }, + { + "epoch": 0.62, + "grad_norm": 4.630677885057153, + "learning_rate": 1.9532676249607055e-06, + "loss": 0.6999, + "step": 4183 + }, + { + "epoch": 0.62, + "grad_norm": 2.3271571628425898, + "learning_rate": 1.9532384326362393e-06, + "loss": 0.6543, + "step": 4184 + }, + { + "epoch": 0.62, + "grad_norm": 2.5869078419518403, + "learning_rate": 1.953209231415115e-06, + "loss": 0.6953, + "step": 4185 + }, + { + "epoch": 0.62, + "grad_norm": 3.4746859434770783, + "learning_rate": 1.9531800212976046e-06, + "loss": 0.6647, + "step": 4186 + }, + { + "epoch": 0.62, + "grad_norm": 2.6278224802940695, + "learning_rate": 1.9531508022839803e-06, + "loss": 0.6901, + "step": 4187 + }, + { + "epoch": 0.62, + "grad_norm": 1.029294787302973, + "learning_rate": 1.9531215743745157e-06, + "loss": 0.6836, + "step": 4188 + }, + { + "epoch": 0.62, + "grad_norm": 3.2995445680001887, + "learning_rate": 1.953092337569483e-06, + "loss": 0.6868, + "step": 4189 + }, + { + "epoch": 0.62, + "grad_norm": 1.3348678564195486, + "learning_rate": 1.9530630918691554e-06, + "loss": 0.6569, + "step": 4190 + }, + { + "epoch": 0.63, + "grad_norm": 1.6771494539552827, + "learning_rate": 1.953033837273806e-06, + "loss": 0.6738, + "step": 4191 + }, + { + "epoch": 0.63, + "grad_norm": 4.2115554499909456, + "learning_rate": 1.953004573783707e-06, + "loss": 0.6823, + "step": 4192 + }, + { + "epoch": 0.63, + "grad_norm": 1.3577845221855702, + "learning_rate": 1.9529753013991315e-06, + "loss": 0.694, + "step": 4193 + }, + { + "epoch": 0.63, + "grad_norm": 0.5538696066719634, + "learning_rate": 1.952946020120354e-06, + "loss": 0.6979, + "step": 4194 + }, + { + "epoch": 0.63, + "grad_norm": 1.6036596213420913, + "learning_rate": 1.9529167299476473e-06, + "loss": 0.696, + "step": 4195 + }, + { + "epoch": 0.63, + "grad_norm": 1.1044746158946455, + "learning_rate": 1.952887430881284e-06, + "loss": 0.6725, + "step": 4196 + }, + { + "epoch": 0.63, + "grad_norm": 3.3262003781287484, + "learning_rate": 1.952858122921538e-06, + "loss": 0.696, + "step": 4197 + }, + { + "epoch": 0.63, + "grad_norm": 1.1232938500682852, + "learning_rate": 1.9528288060686828e-06, + "loss": 0.6914, + "step": 4198 + }, + { + "epoch": 0.63, + "grad_norm": 6.367097872435004, + "learning_rate": 1.9527994803229923e-06, + "loss": 0.6868, + "step": 4199 + }, + { + "epoch": 0.63, + "grad_norm": 0.89099790012284, + "learning_rate": 1.95277014568474e-06, + "loss": 0.6829, + "step": 4200 + }, + { + "epoch": 0.63, + "grad_norm": 5.569110182526372, + "learning_rate": 1.9527408021541998e-06, + "loss": 0.6875, + "step": 4201 + }, + { + "epoch": 0.63, + "grad_norm": 0.6137793974212262, + "learning_rate": 1.952711449731645e-06, + "loss": 0.679, + "step": 4202 + }, + { + "epoch": 0.63, + "grad_norm": 2.842952597803891, + "learning_rate": 1.9526820884173503e-06, + "loss": 0.6934, + "step": 4203 + }, + { + "epoch": 0.63, + "grad_norm": 7.651999818308638, + "learning_rate": 1.9526527182115894e-06, + "loss": 0.6973, + "step": 4204 + }, + { + "epoch": 0.63, + "grad_norm": 1.491120958266712, + "learning_rate": 1.9526233391146364e-06, + "loss": 0.6895, + "step": 4205 + }, + { + "epoch": 0.63, + "grad_norm": 0.5344651143132962, + "learning_rate": 1.9525939511267656e-06, + "loss": 0.6908, + "step": 4206 + }, + { + "epoch": 0.63, + "grad_norm": 5.108809957656665, + "learning_rate": 1.952564554248251e-06, + "loss": 0.7031, + "step": 4207 + }, + { + "epoch": 0.63, + "grad_norm": 3.762282069038684, + "learning_rate": 1.9525351484793672e-06, + "loss": 0.6992, + "step": 4208 + }, + { + "epoch": 0.63, + "grad_norm": 0.8149851562075997, + "learning_rate": 1.9525057338203886e-06, + "loss": 0.6803, + "step": 4209 + }, + { + "epoch": 0.63, + "grad_norm": 4.24451328240114, + "learning_rate": 1.95247631027159e-06, + "loss": 0.6758, + "step": 4210 + }, + { + "epoch": 0.63, + "grad_norm": 1.7875648647998053, + "learning_rate": 1.9524468778332456e-06, + "loss": 0.7031, + "step": 4211 + }, + { + "epoch": 0.63, + "grad_norm": 2.5473316313109122, + "learning_rate": 1.9524174365056303e-06, + "loss": 0.6803, + "step": 4212 + }, + { + "epoch": 0.63, + "grad_norm": 5.4017270566081255, + "learning_rate": 1.952387986289019e-06, + "loss": 0.6797, + "step": 4213 + }, + { + "epoch": 0.63, + "grad_norm": 1.5022385189284118, + "learning_rate": 1.952358527183686e-06, + "loss": 0.694, + "step": 4214 + }, + { + "epoch": 0.63, + "grad_norm": 4.1407055450349715, + "learning_rate": 1.952329059189907e-06, + "loss": 0.6732, + "step": 4215 + }, + { + "epoch": 0.63, + "grad_norm": 1.5096519130289492, + "learning_rate": 1.9522995823079565e-06, + "loss": 0.6888, + "step": 4216 + }, + { + "epoch": 0.63, + "grad_norm": 6.3971554366135015, + "learning_rate": 1.9522700965381097e-06, + "loss": 0.6999, + "step": 4217 + }, + { + "epoch": 0.63, + "grad_norm": 3.2900704012049786, + "learning_rate": 1.9522406018806423e-06, + "loss": 0.707, + "step": 4218 + }, + { + "epoch": 0.63, + "grad_norm": 0.7502533806694988, + "learning_rate": 1.952211098335829e-06, + "loss": 0.6849, + "step": 4219 + }, + { + "epoch": 0.63, + "grad_norm": 0.8532153567730596, + "learning_rate": 1.952181585903945e-06, + "loss": 0.681, + "step": 4220 + }, + { + "epoch": 0.63, + "grad_norm": 4.797489761589331, + "learning_rate": 1.952152064585266e-06, + "loss": 0.6732, + "step": 4221 + }, + { + "epoch": 0.63, + "grad_norm": 3.4033773467514252, + "learning_rate": 1.952122534380068e-06, + "loss": 0.6829, + "step": 4222 + }, + { + "epoch": 0.63, + "grad_norm": 0.6956200073348711, + "learning_rate": 1.952092995288626e-06, + "loss": 0.6868, + "step": 4223 + }, + { + "epoch": 0.63, + "grad_norm": 7.056607873959254, + "learning_rate": 1.952063447311216e-06, + "loss": 0.6888, + "step": 4224 + }, + { + "epoch": 0.63, + "grad_norm": 2.1266547858163816, + "learning_rate": 1.9520338904481135e-06, + "loss": 0.6628, + "step": 4225 + }, + { + "epoch": 0.63, + "grad_norm": 2.0757548653500533, + "learning_rate": 1.9520043246995943e-06, + "loss": 0.6875, + "step": 4226 + }, + { + "epoch": 0.63, + "grad_norm": 3.493999202167379, + "learning_rate": 1.951974750065935e-06, + "loss": 0.6849, + "step": 4227 + }, + { + "epoch": 0.63, + "grad_norm": 5.615832187022963, + "learning_rate": 1.9519451665474108e-06, + "loss": 0.6868, + "step": 4228 + }, + { + "epoch": 0.63, + "grad_norm": 0.6395909485293386, + "learning_rate": 1.9519155741442983e-06, + "loss": 0.668, + "step": 4229 + }, + { + "epoch": 0.63, + "grad_norm": 4.878014997303444, + "learning_rate": 1.9518859728568736e-06, + "loss": 0.6953, + "step": 4230 + }, + { + "epoch": 0.63, + "grad_norm": 0.8150771902779476, + "learning_rate": 1.951856362685413e-06, + "loss": 0.6953, + "step": 4231 + }, + { + "epoch": 0.63, + "grad_norm": 0.5658059223610259, + "learning_rate": 1.9518267436301923e-06, + "loss": 0.6862, + "step": 4232 + }, + { + "epoch": 0.63, + "grad_norm": 1.5131029539483583, + "learning_rate": 1.9517971156914892e-06, + "loss": 0.6719, + "step": 4233 + }, + { + "epoch": 0.63, + "grad_norm": 4.29689791621872, + "learning_rate": 1.951767478869579e-06, + "loss": 0.6706, + "step": 4234 + }, + { + "epoch": 0.63, + "grad_norm": 4.859550788658977, + "learning_rate": 1.951737833164739e-06, + "loss": 0.6914, + "step": 4235 + }, + { + "epoch": 0.63, + "grad_norm": 0.9930812360316752, + "learning_rate": 1.9517081785772454e-06, + "loss": 0.6836, + "step": 4236 + }, + { + "epoch": 0.63, + "grad_norm": 2.6470354263144658, + "learning_rate": 1.951678515107375e-06, + "loss": 0.6797, + "step": 4237 + }, + { + "epoch": 0.63, + "grad_norm": 1.496471279670719, + "learning_rate": 1.951648842755405e-06, + "loss": 0.6947, + "step": 4238 + }, + { + "epoch": 0.63, + "grad_norm": 2.3761182373226597, + "learning_rate": 1.9516191615216122e-06, + "loss": 0.6745, + "step": 4239 + }, + { + "epoch": 0.63, + "grad_norm": 5.178042417691724, + "learning_rate": 1.951589471406274e-06, + "loss": 0.6738, + "step": 4240 + }, + { + "epoch": 0.63, + "grad_norm": 3.900531376684805, + "learning_rate": 1.951559772409667e-06, + "loss": 0.6901, + "step": 4241 + }, + { + "epoch": 0.63, + "grad_norm": 1.8350422135642965, + "learning_rate": 1.951530064532068e-06, + "loss": 0.6647, + "step": 4242 + }, + { + "epoch": 0.63, + "grad_norm": 7.216153854702794, + "learning_rate": 1.9515003477737552e-06, + "loss": 0.6751, + "step": 4243 + }, + { + "epoch": 0.63, + "grad_norm": 1.3312152029019744, + "learning_rate": 1.9514706221350054e-06, + "loss": 0.6816, + "step": 4244 + }, + { + "epoch": 0.63, + "grad_norm": 2.763801843815841, + "learning_rate": 1.951440887616096e-06, + "loss": 0.6758, + "step": 4245 + }, + { + "epoch": 0.63, + "grad_norm": 2.1947618432423197, + "learning_rate": 1.951411144217305e-06, + "loss": 0.6953, + "step": 4246 + }, + { + "epoch": 0.63, + "grad_norm": 0.8276324411985485, + "learning_rate": 1.9513813919389094e-06, + "loss": 0.7044, + "step": 4247 + }, + { + "epoch": 0.63, + "grad_norm": 1.1362251051128947, + "learning_rate": 1.951351630781187e-06, + "loss": 0.7005, + "step": 4248 + }, + { + "epoch": 0.63, + "grad_norm": 3.6518930202249202, + "learning_rate": 1.951321860744416e-06, + "loss": 0.7018, + "step": 4249 + }, + { + "epoch": 0.63, + "grad_norm": 2.2967724380389973, + "learning_rate": 1.9512920818288736e-06, + "loss": 0.6758, + "step": 4250 + }, + { + "epoch": 0.63, + "grad_norm": 3.4184246520790906, + "learning_rate": 1.9512622940348385e-06, + "loss": 0.6784, + "step": 4251 + }, + { + "epoch": 0.63, + "grad_norm": 1.1591135079325599, + "learning_rate": 1.951232497362588e-06, + "loss": 0.6953, + "step": 4252 + }, + { + "epoch": 0.63, + "grad_norm": 5.22689536997153, + "learning_rate": 1.9512026918124005e-06, + "loss": 0.6953, + "step": 4253 + }, + { + "epoch": 0.63, + "grad_norm": 0.5943987232104725, + "learning_rate": 1.9511728773845545e-06, + "loss": 0.6842, + "step": 4254 + }, + { + "epoch": 0.63, + "grad_norm": 0.9414894759686177, + "learning_rate": 1.951143054079327e-06, + "loss": 0.6764, + "step": 4255 + }, + { + "epoch": 0.63, + "grad_norm": 5.792655612533685, + "learning_rate": 1.9511132218969984e-06, + "loss": 0.696, + "step": 4256 + }, + { + "epoch": 0.63, + "grad_norm": 4.988067031721738, + "learning_rate": 1.9510833808378453e-06, + "loss": 0.6882, + "step": 4257 + }, + { + "epoch": 0.64, + "grad_norm": 1.281325675666828, + "learning_rate": 1.951053530902147e-06, + "loss": 0.6823, + "step": 4258 + }, + { + "epoch": 0.64, + "grad_norm": 3.292294698704286, + "learning_rate": 1.9510236720901823e-06, + "loss": 0.6882, + "step": 4259 + }, + { + "epoch": 0.64, + "grad_norm": 2.7791734572008657, + "learning_rate": 1.9509938044022293e-06, + "loss": 0.6816, + "step": 4260 + }, + { + "epoch": 0.64, + "grad_norm": 3.588255214465422, + "learning_rate": 1.950963927838567e-06, + "loss": 0.7018, + "step": 4261 + }, + { + "epoch": 0.64, + "grad_norm": 1.8209263828135651, + "learning_rate": 1.950934042399474e-06, + "loss": 0.681, + "step": 4262 + }, + { + "epoch": 0.64, + "grad_norm": 4.430803174304989, + "learning_rate": 1.9509041480852298e-06, + "loss": 0.6712, + "step": 4263 + }, + { + "epoch": 0.64, + "grad_norm": 2.7466610499611543, + "learning_rate": 1.9508742448961134e-06, + "loss": 0.6706, + "step": 4264 + }, + { + "epoch": 0.64, + "grad_norm": 6.753661906305285, + "learning_rate": 1.9508443328324033e-06, + "loss": 0.6628, + "step": 4265 + }, + { + "epoch": 0.64, + "grad_norm": 1.1517968950312667, + "learning_rate": 1.9508144118943786e-06, + "loss": 0.6901, + "step": 4266 + }, + { + "epoch": 0.64, + "grad_norm": 3.627687314713695, + "learning_rate": 1.9507844820823196e-06, + "loss": 0.6868, + "step": 4267 + }, + { + "epoch": 0.64, + "grad_norm": 0.8751765480975096, + "learning_rate": 1.950754543396504e-06, + "loss": 0.6855, + "step": 4268 + }, + { + "epoch": 0.64, + "grad_norm": 1.241915554717159, + "learning_rate": 1.950724595837213e-06, + "loss": 0.6725, + "step": 4269 + }, + { + "epoch": 0.64, + "grad_norm": 2.8847298841668723, + "learning_rate": 1.950694639404725e-06, + "loss": 0.6842, + "step": 4270 + }, + { + "epoch": 0.64, + "grad_norm": 6.242748178616972, + "learning_rate": 1.95066467409932e-06, + "loss": 0.6706, + "step": 4271 + }, + { + "epoch": 0.64, + "grad_norm": 2.3804360737743244, + "learning_rate": 1.9506346999212773e-06, + "loss": 0.7005, + "step": 4272 + }, + { + "epoch": 0.64, + "grad_norm": 1.3987439011334675, + "learning_rate": 1.950604716870877e-06, + "loss": 0.6699, + "step": 4273 + }, + { + "epoch": 0.64, + "grad_norm": 4.01586975444677, + "learning_rate": 1.9505747249483985e-06, + "loss": 0.6882, + "step": 4274 + }, + { + "epoch": 0.64, + "grad_norm": 3.2110924533431375, + "learning_rate": 1.9505447241541225e-06, + "loss": 0.6816, + "step": 4275 + }, + { + "epoch": 0.64, + "grad_norm": 0.6177476342523456, + "learning_rate": 1.950514714488328e-06, + "loss": 0.6764, + "step": 4276 + }, + { + "epoch": 0.64, + "grad_norm": 3.4337799901849095, + "learning_rate": 1.950484695951296e-06, + "loss": 0.7005, + "step": 4277 + }, + { + "epoch": 0.64, + "grad_norm": 5.92464401649624, + "learning_rate": 1.9504546685433065e-06, + "loss": 0.6999, + "step": 4278 + }, + { + "epoch": 0.64, + "grad_norm": 7.713926938065606, + "learning_rate": 1.9504246322646393e-06, + "loss": 0.7077, + "step": 4279 + }, + { + "epoch": 0.64, + "grad_norm": 5.967359792617599, + "learning_rate": 1.950394587115575e-06, + "loss": 0.6914, + "step": 4280 + }, + { + "epoch": 0.64, + "grad_norm": 1.4890057304549755, + "learning_rate": 1.9503645330963936e-06, + "loss": 0.6725, + "step": 4281 + }, + { + "epoch": 0.64, + "grad_norm": 1.4012441099220403, + "learning_rate": 1.950334470207376e-06, + "loss": 0.6784, + "step": 4282 + }, + { + "epoch": 0.64, + "grad_norm": 1.2047922653123764, + "learning_rate": 1.9503043984488032e-06, + "loss": 0.6953, + "step": 4283 + }, + { + "epoch": 0.64, + "grad_norm": 5.930934544421041, + "learning_rate": 1.950274317820955e-06, + "loss": 0.694, + "step": 4284 + }, + { + "epoch": 0.64, + "grad_norm": 4.878269287197783, + "learning_rate": 1.950244228324113e-06, + "loss": 0.6862, + "step": 4285 + }, + { + "epoch": 0.64, + "grad_norm": 6.031999151454824, + "learning_rate": 1.9502141299585574e-06, + "loss": 0.6934, + "step": 4286 + }, + { + "epoch": 0.64, + "grad_norm": 4.868490087082502, + "learning_rate": 1.9501840227245688e-06, + "loss": 0.6986, + "step": 4287 + }, + { + "epoch": 0.64, + "grad_norm": 4.536906763276398, + "learning_rate": 1.9501539066224293e-06, + "loss": 0.6888, + "step": 4288 + }, + { + "epoch": 0.64, + "grad_norm": 0.5276224796173875, + "learning_rate": 1.950123781652419e-06, + "loss": 0.6895, + "step": 4289 + }, + { + "epoch": 0.64, + "grad_norm": 3.075286948627591, + "learning_rate": 1.95009364781482e-06, + "loss": 0.6686, + "step": 4290 + }, + { + "epoch": 0.64, + "grad_norm": 4.168320125447261, + "learning_rate": 1.9500635051099125e-06, + "loss": 0.6745, + "step": 4291 + }, + { + "epoch": 0.64, + "grad_norm": 8.574887066778595, + "learning_rate": 1.9500333535379783e-06, + "loss": 0.6693, + "step": 4292 + }, + { + "epoch": 0.64, + "grad_norm": 0.9456125014092531, + "learning_rate": 1.9500031930992985e-06, + "loss": 0.6979, + "step": 4293 + }, + { + "epoch": 0.64, + "grad_norm": 1.7558959573554094, + "learning_rate": 1.9499730237941553e-06, + "loss": 0.6992, + "step": 4294 + }, + { + "epoch": 0.64, + "grad_norm": 0.7659452347343915, + "learning_rate": 1.94994284562283e-06, + "loss": 0.6927, + "step": 4295 + }, + { + "epoch": 0.64, + "grad_norm": 4.6122501946205325, + "learning_rate": 1.9499126585856036e-06, + "loss": 0.6992, + "step": 4296 + }, + { + "epoch": 0.64, + "grad_norm": 6.051910683049865, + "learning_rate": 1.949882462682759e-06, + "loss": 0.6914, + "step": 4297 + }, + { + "epoch": 0.64, + "grad_norm": 7.089321703464452, + "learning_rate": 1.9498522579145767e-06, + "loss": 0.6784, + "step": 4298 + }, + { + "epoch": 0.64, + "grad_norm": 2.050157378339197, + "learning_rate": 1.9498220442813397e-06, + "loss": 0.6862, + "step": 4299 + }, + { + "epoch": 0.64, + "grad_norm": 7.356213331073814, + "learning_rate": 1.9497918217833295e-06, + "loss": 0.6914, + "step": 4300 + }, + { + "epoch": 0.64, + "grad_norm": 0.4529404746195979, + "learning_rate": 1.949761590420828e-06, + "loss": 0.6966, + "step": 4301 + }, + { + "epoch": 0.64, + "grad_norm": 1.5083628881595825, + "learning_rate": 1.949731350194118e-06, + "loss": 0.6784, + "step": 4302 + }, + { + "epoch": 0.64, + "grad_norm": 0.877819051702648, + "learning_rate": 1.9497011011034806e-06, + "loss": 0.6849, + "step": 4303 + }, + { + "epoch": 0.64, + "grad_norm": 4.079601677664231, + "learning_rate": 1.9496708431491993e-06, + "loss": 0.6829, + "step": 4304 + }, + { + "epoch": 0.64, + "grad_norm": 1.7089216811009107, + "learning_rate": 1.9496405763315564e-06, + "loss": 0.6953, + "step": 4305 + }, + { + "epoch": 0.64, + "grad_norm": 3.2872231477241165, + "learning_rate": 1.9496103006508335e-06, + "loss": 0.681, + "step": 4306 + }, + { + "epoch": 0.64, + "grad_norm": 1.4574389038013815, + "learning_rate": 1.949580016107314e-06, + "loss": 0.681, + "step": 4307 + }, + { + "epoch": 0.64, + "grad_norm": 4.793967237736386, + "learning_rate": 1.94954972270128e-06, + "loss": 0.6686, + "step": 4308 + }, + { + "epoch": 0.64, + "grad_norm": 1.0809151738695877, + "learning_rate": 1.9495194204330145e-06, + "loss": 0.6647, + "step": 4309 + }, + { + "epoch": 0.64, + "grad_norm": 2.309023910451725, + "learning_rate": 1.9494891093028002e-06, + "loss": 0.6738, + "step": 4310 + }, + { + "epoch": 0.64, + "grad_norm": 0.6984581433107327, + "learning_rate": 1.9494587893109204e-06, + "loss": 0.6823, + "step": 4311 + }, + { + "epoch": 0.64, + "grad_norm": 8.19293156032089, + "learning_rate": 1.9494284604576572e-06, + "loss": 0.7109, + "step": 4312 + }, + { + "epoch": 0.64, + "grad_norm": 1.0359375095252503, + "learning_rate": 1.949398122743295e-06, + "loss": 0.6849, + "step": 4313 + }, + { + "epoch": 0.64, + "grad_norm": 1.6782089182816349, + "learning_rate": 1.9493677761681156e-06, + "loss": 0.6849, + "step": 4314 + }, + { + "epoch": 0.64, + "grad_norm": 6.079978014081709, + "learning_rate": 1.949337420732403e-06, + "loss": 0.6973, + "step": 4315 + }, + { + "epoch": 0.64, + "grad_norm": 6.870849983594082, + "learning_rate": 1.9493070564364404e-06, + "loss": 0.679, + "step": 4316 + }, + { + "epoch": 0.64, + "grad_norm": 2.6735773472121735, + "learning_rate": 1.9492766832805106e-06, + "loss": 0.6836, + "step": 4317 + }, + { + "epoch": 0.64, + "grad_norm": 4.938652164543074, + "learning_rate": 1.9492463012648977e-06, + "loss": 0.6992, + "step": 4318 + }, + { + "epoch": 0.64, + "grad_norm": 0.7865246755938715, + "learning_rate": 1.9492159103898857e-06, + "loss": 0.6602, + "step": 4319 + }, + { + "epoch": 0.64, + "grad_norm": 1.5105029439294169, + "learning_rate": 1.949185510655757e-06, + "loss": 0.6686, + "step": 4320 + }, + { + "epoch": 0.64, + "grad_norm": 2.7904261433842974, + "learning_rate": 1.9491551020627964e-06, + "loss": 0.6921, + "step": 4321 + }, + { + "epoch": 0.64, + "grad_norm": 4.2514237794542575, + "learning_rate": 1.949124684611287e-06, + "loss": 0.6842, + "step": 4322 + }, + { + "epoch": 0.64, + "grad_norm": 3.325815613504528, + "learning_rate": 1.949094258301513e-06, + "loss": 0.6582, + "step": 4323 + }, + { + "epoch": 0.64, + "grad_norm": 2.267815698957933, + "learning_rate": 1.9490638231337586e-06, + "loss": 0.7122, + "step": 4324 + }, + { + "epoch": 0.65, + "grad_norm": 6.234217114717781, + "learning_rate": 1.9490333791083075e-06, + "loss": 0.6927, + "step": 4325 + }, + { + "epoch": 0.65, + "grad_norm": 3.0771540441453915, + "learning_rate": 1.949002926225444e-06, + "loss": 0.6953, + "step": 4326 + }, + { + "epoch": 0.65, + "grad_norm": 2.1471262746690676, + "learning_rate": 1.9489724644854523e-06, + "loss": 0.6914, + "step": 4327 + }, + { + "epoch": 0.65, + "grad_norm": 3.0249691638698506, + "learning_rate": 1.9489419938886166e-06, + "loss": 0.679, + "step": 4328 + }, + { + "epoch": 0.65, + "grad_norm": 1.4648978351805138, + "learning_rate": 1.9489115144352213e-06, + "loss": 0.6966, + "step": 4329 + }, + { + "epoch": 0.65, + "grad_norm": 1.9727681920127664, + "learning_rate": 1.9488810261255507e-06, + "loss": 0.6777, + "step": 4330 + }, + { + "epoch": 0.65, + "grad_norm": 1.1087194130833946, + "learning_rate": 1.9488505289598897e-06, + "loss": 0.6641, + "step": 4331 + }, + { + "epoch": 0.65, + "grad_norm": 1.9033568688307647, + "learning_rate": 1.948820022938523e-06, + "loss": 0.6849, + "step": 4332 + }, + { + "epoch": 0.65, + "grad_norm": 1.3286449572478056, + "learning_rate": 1.948789508061735e-06, + "loss": 0.6693, + "step": 4333 + }, + { + "epoch": 0.65, + "grad_norm": 2.7580054100779687, + "learning_rate": 1.9487589843298107e-06, + "loss": 0.6849, + "step": 4334 + }, + { + "epoch": 0.65, + "grad_norm": 2.900332060570873, + "learning_rate": 1.9487284517430347e-06, + "loss": 0.7057, + "step": 4335 + }, + { + "epoch": 0.65, + "grad_norm": 1.6716299347900863, + "learning_rate": 1.9486979103016923e-06, + "loss": 0.7057, + "step": 4336 + }, + { + "epoch": 0.65, + "grad_norm": 1.4048284874156887, + "learning_rate": 1.9486673600060684e-06, + "loss": 0.6758, + "step": 4337 + }, + { + "epoch": 0.65, + "grad_norm": 4.6307656768430645, + "learning_rate": 1.948636800856448e-06, + "loss": 0.6868, + "step": 4338 + }, + { + "epoch": 0.65, + "grad_norm": 0.587420095198285, + "learning_rate": 1.9486062328531168e-06, + "loss": 0.7005, + "step": 4339 + }, + { + "epoch": 0.65, + "grad_norm": 3.777053382143625, + "learning_rate": 1.9485756559963595e-06, + "loss": 0.6764, + "step": 4340 + }, + { + "epoch": 0.65, + "grad_norm": 3.3982073176858623, + "learning_rate": 1.9485450702864617e-06, + "loss": 0.6842, + "step": 4341 + }, + { + "epoch": 0.65, + "grad_norm": 0.7308977851650629, + "learning_rate": 1.948514475723709e-06, + "loss": 0.6712, + "step": 4342 + }, + { + "epoch": 0.65, + "grad_norm": 1.7333472808400587, + "learning_rate": 1.948483872308387e-06, + "loss": 0.679, + "step": 4343 + }, + { + "epoch": 0.65, + "grad_norm": 1.2486683087005188, + "learning_rate": 1.9484532600407807e-06, + "loss": 0.6895, + "step": 4344 + }, + { + "epoch": 0.65, + "grad_norm": 0.5921630853592034, + "learning_rate": 1.9484226389211763e-06, + "loss": 0.6777, + "step": 4345 + }, + { + "epoch": 0.65, + "grad_norm": 3.326334511880046, + "learning_rate": 1.9483920089498594e-06, + "loss": 0.6836, + "step": 4346 + }, + { + "epoch": 0.65, + "grad_norm": 2.4165419184286665, + "learning_rate": 1.948361370127116e-06, + "loss": 0.6803, + "step": 4347 + }, + { + "epoch": 0.65, + "grad_norm": 2.3751323085217537, + "learning_rate": 1.9483307224532324e-06, + "loss": 0.6849, + "step": 4348 + }, + { + "epoch": 0.65, + "grad_norm": 1.3783681780979369, + "learning_rate": 1.9483000659284944e-06, + "loss": 0.6908, + "step": 4349 + }, + { + "epoch": 0.65, + "grad_norm": 2.2133982767950084, + "learning_rate": 1.948269400553188e-06, + "loss": 0.694, + "step": 4350 + }, + { + "epoch": 0.65, + "grad_norm": 0.8294361372790812, + "learning_rate": 1.9482387263275992e-06, + "loss": 0.6966, + "step": 4351 + }, + { + "epoch": 0.65, + "grad_norm": 4.329670723616537, + "learning_rate": 1.9482080432520146e-06, + "loss": 0.6823, + "step": 4352 + }, + { + "epoch": 0.65, + "grad_norm": 0.765836533871267, + "learning_rate": 1.9481773513267205e-06, + "loss": 0.6966, + "step": 4353 + }, + { + "epoch": 0.65, + "grad_norm": 1.8607326979339438, + "learning_rate": 1.9481466505520034e-06, + "loss": 0.6797, + "step": 4354 + }, + { + "epoch": 0.65, + "grad_norm": 1.351875379493428, + "learning_rate": 1.9481159409281493e-06, + "loss": 0.6621, + "step": 4355 + }, + { + "epoch": 0.65, + "grad_norm": 3.182812969928094, + "learning_rate": 1.948085222455446e-06, + "loss": 0.6953, + "step": 4356 + }, + { + "epoch": 0.65, + "grad_norm": 2.363700813012888, + "learning_rate": 1.9480544951341787e-06, + "loss": 0.6803, + "step": 4357 + }, + { + "epoch": 0.65, + "grad_norm": 3.405297432278351, + "learning_rate": 1.9480237589646352e-06, + "loss": 0.6654, + "step": 4358 + }, + { + "epoch": 0.65, + "grad_norm": 2.600485065031592, + "learning_rate": 1.9479930139471025e-06, + "loss": 0.6751, + "step": 4359 + }, + { + "epoch": 0.65, + "grad_norm": 0.8677072320390962, + "learning_rate": 1.9479622600818667e-06, + "loss": 0.6699, + "step": 4360 + }, + { + "epoch": 0.65, + "grad_norm": 3.421250626387373, + "learning_rate": 1.9479314973692156e-06, + "loss": 0.6888, + "step": 4361 + }, + { + "epoch": 0.65, + "grad_norm": 4.7297049919060665, + "learning_rate": 1.9479007258094357e-06, + "loss": 0.6719, + "step": 4362 + }, + { + "epoch": 0.65, + "grad_norm": 4.316573485676514, + "learning_rate": 1.9478699454028147e-06, + "loss": 0.6842, + "step": 4363 + }, + { + "epoch": 0.65, + "grad_norm": 2.2139762191696604, + "learning_rate": 1.9478391561496398e-06, + "loss": 0.6855, + "step": 4364 + }, + { + "epoch": 0.65, + "grad_norm": 1.0189600125491063, + "learning_rate": 1.947808358050198e-06, + "loss": 0.6914, + "step": 4365 + }, + { + "epoch": 0.65, + "grad_norm": 0.8671354683724904, + "learning_rate": 1.9477775511047773e-06, + "loss": 0.681, + "step": 4366 + }, + { + "epoch": 0.65, + "grad_norm": 1.2902563280247406, + "learning_rate": 1.9477467353136646e-06, + "loss": 0.6823, + "step": 4367 + }, + { + "epoch": 0.65, + "grad_norm": 5.073163610009197, + "learning_rate": 1.947715910677148e-06, + "loss": 0.6673, + "step": 4368 + }, + { + "epoch": 0.65, + "grad_norm": 1.3491883786747287, + "learning_rate": 1.947685077195515e-06, + "loss": 0.6751, + "step": 4369 + }, + { + "epoch": 0.65, + "grad_norm": 1.2824940628825348, + "learning_rate": 1.947654234869053e-06, + "loss": 0.694, + "step": 4370 + }, + { + "epoch": 0.65, + "grad_norm": 0.49976697195277625, + "learning_rate": 1.947623383698051e-06, + "loss": 0.6849, + "step": 4371 + }, + { + "epoch": 0.65, + "grad_norm": 1.9529297216522556, + "learning_rate": 1.9475925236827952e-06, + "loss": 0.6914, + "step": 4372 + }, + { + "epoch": 0.65, + "grad_norm": 2.931378011967905, + "learning_rate": 1.947561654823575e-06, + "loss": 0.6901, + "step": 4373 + }, + { + "epoch": 0.65, + "grad_norm": 4.942184431558608, + "learning_rate": 1.947530777120678e-06, + "loss": 0.7064, + "step": 4374 + }, + { + "epoch": 0.65, + "grad_norm": 3.4684169878002087, + "learning_rate": 1.9474998905743927e-06, + "loss": 0.6875, + "step": 4375 + }, + { + "epoch": 0.65, + "grad_norm": 2.7668009966715554, + "learning_rate": 1.947468995185007e-06, + "loss": 0.6979, + "step": 4376 + }, + { + "epoch": 0.65, + "grad_norm": 0.8272532038400406, + "learning_rate": 1.9474380909528092e-06, + "loss": 0.6992, + "step": 4377 + }, + { + "epoch": 0.65, + "grad_norm": 2.416958448320274, + "learning_rate": 1.9474071778780884e-06, + "loss": 0.6836, + "step": 4378 + }, + { + "epoch": 0.65, + "grad_norm": 1.3525924142179386, + "learning_rate": 1.947376255961132e-06, + "loss": 0.6693, + "step": 4379 + }, + { + "epoch": 0.65, + "grad_norm": 2.3702277315489835, + "learning_rate": 1.9473453252022295e-06, + "loss": 0.6829, + "step": 4380 + }, + { + "epoch": 0.65, + "grad_norm": 0.5217406226207262, + "learning_rate": 1.9473143856016695e-06, + "loss": 0.6777, + "step": 4381 + }, + { + "epoch": 0.65, + "grad_norm": 2.0683768552183257, + "learning_rate": 1.94728343715974e-06, + "loss": 0.6953, + "step": 4382 + }, + { + "epoch": 0.65, + "grad_norm": 1.0310525170935267, + "learning_rate": 1.947252479876731e-06, + "loss": 0.6647, + "step": 4383 + }, + { + "epoch": 0.65, + "grad_norm": 3.220494888601468, + "learning_rate": 1.9472215137529307e-06, + "loss": 0.6888, + "step": 4384 + }, + { + "epoch": 0.65, + "grad_norm": 1.879314633750355, + "learning_rate": 1.947190538788628e-06, + "loss": 0.6855, + "step": 4385 + }, + { + "epoch": 0.65, + "grad_norm": 2.822143404832425, + "learning_rate": 1.9471595549841123e-06, + "loss": 0.666, + "step": 4386 + }, + { + "epoch": 0.65, + "grad_norm": 2.6464637669115394, + "learning_rate": 1.947128562339673e-06, + "loss": 0.6921, + "step": 4387 + }, + { + "epoch": 0.65, + "grad_norm": 3.7614317701416926, + "learning_rate": 1.947097560855599e-06, + "loss": 0.696, + "step": 4388 + }, + { + "epoch": 0.65, + "grad_norm": 1.2037413421216303, + "learning_rate": 1.9470665505321793e-06, + "loss": 0.6901, + "step": 4389 + }, + { + "epoch": 0.65, + "grad_norm": 1.7983608265969113, + "learning_rate": 1.947035531369704e-06, + "loss": 0.6966, + "step": 4390 + }, + { + "epoch": 0.65, + "grad_norm": 0.6130462140384142, + "learning_rate": 1.9470045033684624e-06, + "loss": 0.6777, + "step": 4391 + }, + { + "epoch": 0.66, + "grad_norm": 0.7517741302292564, + "learning_rate": 1.946973466528744e-06, + "loss": 0.6836, + "step": 4392 + }, + { + "epoch": 0.66, + "grad_norm": 3.5946600441459164, + "learning_rate": 1.9469424208508383e-06, + "loss": 0.694, + "step": 4393 + }, + { + "epoch": 0.66, + "grad_norm": 5.191788412910382, + "learning_rate": 1.9469113663350357e-06, + "loss": 0.6921, + "step": 4394 + }, + { + "epoch": 0.66, + "grad_norm": 3.2469101954723882, + "learning_rate": 1.946880302981625e-06, + "loss": 0.6673, + "step": 4395 + }, + { + "epoch": 0.66, + "grad_norm": 1.8406551817464591, + "learning_rate": 1.9468492307908973e-06, + "loss": 0.6882, + "step": 4396 + }, + { + "epoch": 0.66, + "grad_norm": 0.8798229236463184, + "learning_rate": 1.9468181497631413e-06, + "loss": 0.6875, + "step": 4397 + }, + { + "epoch": 0.66, + "grad_norm": 1.9231330201302144, + "learning_rate": 1.946787059898648e-06, + "loss": 0.6829, + "step": 4398 + }, + { + "epoch": 0.66, + "grad_norm": 5.585985003214602, + "learning_rate": 1.9467559611977076e-06, + "loss": 0.6914, + "step": 4399 + }, + { + "epoch": 0.66, + "grad_norm": 4.7923941192961665, + "learning_rate": 1.94672485366061e-06, + "loss": 0.6855, + "step": 4400 + }, + { + "epoch": 0.66, + "grad_norm": 0.6291324706549148, + "learning_rate": 1.9466937372876456e-06, + "loss": 0.6732, + "step": 4401 + }, + { + "epoch": 0.66, + "grad_norm": 3.608010312911152, + "learning_rate": 1.946662612079105e-06, + "loss": 0.6706, + "step": 4402 + }, + { + "epoch": 0.66, + "grad_norm": 0.7329307818655871, + "learning_rate": 1.9466314780352783e-06, + "loss": 0.7057, + "step": 4403 + }, + { + "epoch": 0.66, + "grad_norm": 2.881452448581508, + "learning_rate": 1.9466003351564565e-06, + "loss": 0.6849, + "step": 4404 + }, + { + "epoch": 0.66, + "grad_norm": 1.3781140623353432, + "learning_rate": 1.94656918344293e-06, + "loss": 0.6745, + "step": 4405 + }, + { + "epoch": 0.66, + "grad_norm": 4.691734821699818, + "learning_rate": 1.9465380228949894e-06, + "loss": 0.679, + "step": 4406 + }, + { + "epoch": 0.66, + "grad_norm": 4.59487716052632, + "learning_rate": 1.9465068535129256e-06, + "loss": 0.6986, + "step": 4407 + }, + { + "epoch": 0.66, + "grad_norm": 1.7356463189215126, + "learning_rate": 1.9464756752970298e-06, + "loss": 0.6901, + "step": 4408 + }, + { + "epoch": 0.66, + "grad_norm": 1.0857762887894151, + "learning_rate": 1.946444488247593e-06, + "loss": 0.6908, + "step": 4409 + }, + { + "epoch": 0.66, + "grad_norm": 8.988311728841555, + "learning_rate": 1.946413292364906e-06, + "loss": 0.7129, + "step": 4410 + }, + { + "epoch": 0.66, + "grad_norm": 2.39642472018077, + "learning_rate": 1.94638208764926e-06, + "loss": 0.6901, + "step": 4411 + }, + { + "epoch": 0.66, + "grad_norm": 0.5048648408853417, + "learning_rate": 1.9463508741009464e-06, + "loss": 0.6706, + "step": 4412 + }, + { + "epoch": 0.66, + "grad_norm": 1.4917016133146708, + "learning_rate": 1.9463196517202564e-06, + "loss": 0.6914, + "step": 4413 + }, + { + "epoch": 0.66, + "grad_norm": 0.5057303307208255, + "learning_rate": 1.9462884205074814e-06, + "loss": 0.6738, + "step": 4414 + }, + { + "epoch": 0.66, + "grad_norm": 5.375863162184344, + "learning_rate": 1.9462571804629124e-06, + "loss": 0.6771, + "step": 4415 + }, + { + "epoch": 0.66, + "grad_norm": 1.8565615525484866, + "learning_rate": 1.946225931586842e-06, + "loss": 0.6589, + "step": 4416 + }, + { + "epoch": 0.66, + "grad_norm": 1.3419218738313277, + "learning_rate": 1.9461946738795613e-06, + "loss": 0.6771, + "step": 4417 + }, + { + "epoch": 0.66, + "grad_norm": 3.480835783019396, + "learning_rate": 1.946163407341362e-06, + "loss": 0.6895, + "step": 4418 + }, + { + "epoch": 0.66, + "grad_norm": 1.7547548958047632, + "learning_rate": 1.9461321319725354e-06, + "loss": 0.6751, + "step": 4419 + }, + { + "epoch": 0.66, + "grad_norm": 5.82808719962887, + "learning_rate": 1.9461008477733746e-06, + "loss": 0.6901, + "step": 4420 + }, + { + "epoch": 0.66, + "grad_norm": 3.434418992631334, + "learning_rate": 1.9460695547441707e-06, + "loss": 0.6732, + "step": 4421 + }, + { + "epoch": 0.66, + "grad_norm": 0.8153891654331717, + "learning_rate": 1.946038252885216e-06, + "loss": 0.6751, + "step": 4422 + }, + { + "epoch": 0.66, + "grad_norm": 5.813716032375922, + "learning_rate": 1.9460069421968022e-06, + "loss": 0.6979, + "step": 4423 + }, + { + "epoch": 0.66, + "grad_norm": 0.8368987299761954, + "learning_rate": 1.9459756226792226e-06, + "loss": 0.6465, + "step": 4424 + }, + { + "epoch": 0.66, + "grad_norm": 2.6988408326246804, + "learning_rate": 1.9459442943327686e-06, + "loss": 0.6901, + "step": 4425 + }, + { + "epoch": 0.66, + "grad_norm": 4.946238401390829, + "learning_rate": 1.9459129571577325e-06, + "loss": 0.6921, + "step": 4426 + }, + { + "epoch": 0.66, + "grad_norm": 3.1602362708425598, + "learning_rate": 1.9458816111544075e-06, + "loss": 0.653, + "step": 4427 + }, + { + "epoch": 0.66, + "grad_norm": 5.369064955539727, + "learning_rate": 1.9458502563230857e-06, + "loss": 0.7246, + "step": 4428 + }, + { + "epoch": 0.66, + "grad_norm": 4.090456062617542, + "learning_rate": 1.94581889266406e-06, + "loss": 0.696, + "step": 4429 + }, + { + "epoch": 0.66, + "grad_norm": 2.276506730823712, + "learning_rate": 1.9457875201776225e-06, + "loss": 0.6908, + "step": 4430 + }, + { + "epoch": 0.66, + "grad_norm": 2.758927564452291, + "learning_rate": 1.9457561388640668e-06, + "loss": 0.6562, + "step": 4431 + }, + { + "epoch": 0.66, + "grad_norm": 3.0098241805145065, + "learning_rate": 1.9457247487236853e-06, + "loss": 0.6745, + "step": 4432 + }, + { + "epoch": 0.66, + "grad_norm": 3.0239623739047623, + "learning_rate": 1.945693349756771e-06, + "loss": 0.7018, + "step": 4433 + }, + { + "epoch": 0.66, + "grad_norm": 4.446224221835294, + "learning_rate": 1.945661941963617e-06, + "loss": 0.6797, + "step": 4434 + }, + { + "epoch": 0.66, + "grad_norm": 0.5452059324312172, + "learning_rate": 1.945630525344517e-06, + "loss": 0.6914, + "step": 4435 + }, + { + "epoch": 0.66, + "grad_norm": 0.8526536442244393, + "learning_rate": 1.945599099899763e-06, + "loss": 0.6875, + "step": 4436 + }, + { + "epoch": 0.66, + "grad_norm": 3.4464613511929025, + "learning_rate": 1.9455676656296495e-06, + "loss": 0.696, + "step": 4437 + }, + { + "epoch": 0.66, + "grad_norm": 4.021951193286683, + "learning_rate": 1.945536222534469e-06, + "loss": 0.6803, + "step": 4438 + }, + { + "epoch": 0.66, + "grad_norm": 2.1825320190189594, + "learning_rate": 1.945504770614515e-06, + "loss": 0.7077, + "step": 4439 + }, + { + "epoch": 0.66, + "grad_norm": 1.859129745292048, + "learning_rate": 1.9454733098700823e-06, + "loss": 0.6901, + "step": 4440 + }, + { + "epoch": 0.66, + "grad_norm": 2.1618030872798353, + "learning_rate": 1.945441840301463e-06, + "loss": 0.6732, + "step": 4441 + }, + { + "epoch": 0.66, + "grad_norm": 4.044289831434746, + "learning_rate": 1.9454103619089513e-06, + "loss": 0.6673, + "step": 4442 + }, + { + "epoch": 0.66, + "grad_norm": 8.374127785904934, + "learning_rate": 1.9453788746928415e-06, + "loss": 0.6901, + "step": 4443 + }, + { + "epoch": 0.66, + "grad_norm": 3.3716015120982927, + "learning_rate": 1.9453473786534266e-06, + "loss": 0.6842, + "step": 4444 + }, + { + "epoch": 0.66, + "grad_norm": 1.5287641785493735, + "learning_rate": 1.945315873791001e-06, + "loss": 0.6758, + "step": 4445 + }, + { + "epoch": 0.66, + "grad_norm": 0.9341198326521367, + "learning_rate": 1.9452843601058595e-06, + "loss": 0.6836, + "step": 4446 + }, + { + "epoch": 0.66, + "grad_norm": 2.3926565205622605, + "learning_rate": 1.9452528375982947e-06, + "loss": 0.6836, + "step": 4447 + }, + { + "epoch": 0.66, + "grad_norm": 5.991209034245841, + "learning_rate": 1.945221306268602e-06, + "loss": 0.7044, + "step": 4448 + }, + { + "epoch": 0.66, + "grad_norm": 4.597124017174842, + "learning_rate": 1.9451897661170747e-06, + "loss": 0.6771, + "step": 4449 + }, + { + "epoch": 0.66, + "grad_norm": 6.632686887299034, + "learning_rate": 1.9451582171440083e-06, + "loss": 0.6901, + "step": 4450 + }, + { + "epoch": 0.66, + "grad_norm": 5.271062306661701, + "learning_rate": 1.9451266593496963e-06, + "loss": 0.6797, + "step": 4451 + }, + { + "epoch": 0.66, + "grad_norm": 3.5793495587635134, + "learning_rate": 1.9450950927344337e-06, + "loss": 0.6836, + "step": 4452 + }, + { + "epoch": 0.66, + "grad_norm": 2.596703318096197, + "learning_rate": 1.945063517298515e-06, + "loss": 0.6855, + "step": 4453 + }, + { + "epoch": 0.66, + "grad_norm": 0.6333842612003067, + "learning_rate": 1.9450319330422353e-06, + "loss": 0.6953, + "step": 4454 + }, + { + "epoch": 0.66, + "grad_norm": 2.7585047593706293, + "learning_rate": 1.9450003399658887e-06, + "loss": 0.6908, + "step": 4455 + }, + { + "epoch": 0.66, + "grad_norm": 1.9282247555783196, + "learning_rate": 1.9449687380697704e-06, + "loss": 0.6816, + "step": 4456 + }, + { + "epoch": 0.66, + "grad_norm": 2.4376482218422666, + "learning_rate": 1.944937127354175e-06, + "loss": 0.6862, + "step": 4457 + }, + { + "epoch": 0.66, + "grad_norm": 0.4913275593076998, + "learning_rate": 1.9449055078193985e-06, + "loss": 0.6934, + "step": 4458 + }, + { + "epoch": 0.67, + "grad_norm": 5.37468055125571, + "learning_rate": 1.9448738794657347e-06, + "loss": 0.6842, + "step": 4459 + }, + { + "epoch": 0.67, + "grad_norm": 8.273009961139964, + "learning_rate": 1.9448422422934797e-06, + "loss": 0.6934, + "step": 4460 + }, + { + "epoch": 0.67, + "grad_norm": 3.6272721626614826, + "learning_rate": 1.944810596302928e-06, + "loss": 0.7044, + "step": 4461 + }, + { + "epoch": 0.67, + "grad_norm": 2.666803547850005, + "learning_rate": 1.944778941494376e-06, + "loss": 0.6855, + "step": 4462 + }, + { + "epoch": 0.67, + "grad_norm": 2.3494164470287937, + "learning_rate": 1.9447472778681186e-06, + "loss": 0.6738, + "step": 4463 + }, + { + "epoch": 0.67, + "grad_norm": 5.484934935798241, + "learning_rate": 1.9447156054244515e-06, + "loss": 0.6934, + "step": 4464 + }, + { + "epoch": 0.67, + "grad_norm": 3.730993656123881, + "learning_rate": 1.9446839241636695e-06, + "loss": 0.6882, + "step": 4465 + }, + { + "epoch": 0.67, + "grad_norm": 1.0753801581306104, + "learning_rate": 1.944652234086069e-06, + "loss": 0.694, + "step": 4466 + }, + { + "epoch": 0.67, + "grad_norm": 0.5826377735389067, + "learning_rate": 1.944620535191946e-06, + "loss": 0.7025, + "step": 4467 + }, + { + "epoch": 0.67, + "grad_norm": 2.167371823007878, + "learning_rate": 1.9445888274815956e-06, + "loss": 0.6771, + "step": 4468 + }, + { + "epoch": 0.67, + "grad_norm": 3.6326354687504567, + "learning_rate": 1.9445571109553147e-06, + "loss": 0.6979, + "step": 4469 + }, + { + "epoch": 0.67, + "grad_norm": 4.297225190105134, + "learning_rate": 1.9445253856133982e-06, + "loss": 0.6862, + "step": 4470 + }, + { + "epoch": 0.67, + "grad_norm": 0.7096168341963641, + "learning_rate": 1.944493651456143e-06, + "loss": 0.6758, + "step": 4471 + }, + { + "epoch": 0.67, + "grad_norm": 3.5238738142498804, + "learning_rate": 1.9444619084838448e-06, + "loss": 0.7005, + "step": 4472 + }, + { + "epoch": 0.67, + "grad_norm": 1.949112231654392, + "learning_rate": 1.9444301566968003e-06, + "loss": 0.6882, + "step": 4473 + }, + { + "epoch": 0.67, + "grad_norm": 2.6456974172194725, + "learning_rate": 1.9443983960953057e-06, + "loss": 0.6823, + "step": 4474 + }, + { + "epoch": 0.67, + "grad_norm": 7.25248477437888, + "learning_rate": 1.9443666266796574e-06, + "loss": 0.6836, + "step": 4475 + }, + { + "epoch": 0.67, + "grad_norm": 3.558523480775492, + "learning_rate": 1.944334848450152e-06, + "loss": 0.6895, + "step": 4476 + }, + { + "epoch": 0.67, + "grad_norm": 0.7330063602752914, + "learning_rate": 1.9443030614070855e-06, + "loss": 0.6725, + "step": 4477 + }, + { + "epoch": 0.67, + "grad_norm": 5.259327728322829, + "learning_rate": 1.9442712655507552e-06, + "loss": 0.6771, + "step": 4478 + }, + { + "epoch": 0.67, + "grad_norm": 4.079146534839978, + "learning_rate": 1.9442394608814576e-06, + "loss": 0.6979, + "step": 4479 + }, + { + "epoch": 0.67, + "grad_norm": 1.3279138812555993, + "learning_rate": 1.9442076473994894e-06, + "loss": 0.6875, + "step": 4480 + }, + { + "epoch": 0.67, + "grad_norm": 2.1297716223602507, + "learning_rate": 1.944175825105148e-06, + "loss": 0.6953, + "step": 4481 + }, + { + "epoch": 0.67, + "grad_norm": 2.8673988063173215, + "learning_rate": 1.94414399399873e-06, + "loss": 0.6797, + "step": 4482 + }, + { + "epoch": 0.67, + "grad_norm": 0.5055607322771046, + "learning_rate": 1.944112154080533e-06, + "loss": 0.6986, + "step": 4483 + }, + { + "epoch": 0.67, + "grad_norm": 3.8463054372337657, + "learning_rate": 1.9440803053508535e-06, + "loss": 0.6777, + "step": 4484 + }, + { + "epoch": 0.67, + "grad_norm": 0.5637274754653178, + "learning_rate": 1.944048447809989e-06, + "loss": 0.6849, + "step": 4485 + }, + { + "epoch": 0.67, + "grad_norm": 1.4530317955261198, + "learning_rate": 1.944016581458237e-06, + "loss": 0.6875, + "step": 4486 + }, + { + "epoch": 0.67, + "grad_norm": 3.2373233305755265, + "learning_rate": 1.9439847062958946e-06, + "loss": 0.6855, + "step": 4487 + }, + { + "epoch": 0.67, + "grad_norm": 2.3906604546366252, + "learning_rate": 1.9439528223232596e-06, + "loss": 0.6777, + "step": 4488 + }, + { + "epoch": 0.67, + "grad_norm": 2.474020146038208, + "learning_rate": 1.943920929540629e-06, + "loss": 0.668, + "step": 4489 + }, + { + "epoch": 0.67, + "grad_norm": 5.029656803372926, + "learning_rate": 1.9438890279483015e-06, + "loss": 0.6862, + "step": 4490 + }, + { + "epoch": 0.67, + "grad_norm": 4.433063723603159, + "learning_rate": 1.9438571175465738e-06, + "loss": 0.6875, + "step": 4491 + }, + { + "epoch": 0.67, + "grad_norm": 0.6069535475237148, + "learning_rate": 1.943825198335744e-06, + "loss": 0.6875, + "step": 4492 + }, + { + "epoch": 0.67, + "grad_norm": 2.959295535895532, + "learning_rate": 1.9437932703161104e-06, + "loss": 0.6699, + "step": 4493 + }, + { + "epoch": 0.67, + "grad_norm": 0.6779930709621932, + "learning_rate": 1.943761333487971e-06, + "loss": 0.694, + "step": 4494 + }, + { + "epoch": 0.67, + "grad_norm": 0.4652695697542516, + "learning_rate": 1.9437293878516235e-06, + "loss": 0.6777, + "step": 4495 + }, + { + "epoch": 0.67, + "grad_norm": 3.0402796631238846, + "learning_rate": 1.943697433407366e-06, + "loss": 0.679, + "step": 4496 + }, + { + "epoch": 0.67, + "grad_norm": 0.7788726205529323, + "learning_rate": 1.943665470155497e-06, + "loss": 0.6562, + "step": 4497 + }, + { + "epoch": 0.67, + "grad_norm": 1.5988299652088636, + "learning_rate": 1.9436334980963145e-06, + "loss": 0.6914, + "step": 4498 + }, + { + "epoch": 0.67, + "grad_norm": 3.3274627202503626, + "learning_rate": 1.9436015172301175e-06, + "loss": 0.6699, + "step": 4499 + }, + { + "epoch": 0.67, + "grad_norm": 1.2771882119094313, + "learning_rate": 1.9435695275572037e-06, + "loss": 0.6895, + "step": 4500 + }, + { + "epoch": 0.67, + "grad_norm": 2.602329109873144, + "learning_rate": 1.943537529077872e-06, + "loss": 0.6725, + "step": 4501 + }, + { + "epoch": 0.67, + "grad_norm": 10.442106365750087, + "learning_rate": 1.9435055217924215e-06, + "loss": 0.7122, + "step": 4502 + }, + { + "epoch": 0.67, + "grad_norm": 2.2121675723382177, + "learning_rate": 1.9434735057011506e-06, + "loss": 0.6901, + "step": 4503 + }, + { + "epoch": 0.67, + "grad_norm": 3.261770171577307, + "learning_rate": 1.9434414808043577e-06, + "loss": 0.6667, + "step": 4504 + }, + { + "epoch": 0.67, + "grad_norm": 2.854971591545948, + "learning_rate": 1.9434094471023423e-06, + "loss": 0.6999, + "step": 4505 + }, + { + "epoch": 0.67, + "grad_norm": 1.1737911843219793, + "learning_rate": 1.9433774045954024e-06, + "loss": 0.6673, + "step": 4506 + }, + { + "epoch": 0.67, + "grad_norm": 2.0513650821418796, + "learning_rate": 1.9433453532838384e-06, + "loss": 0.6771, + "step": 4507 + }, + { + "epoch": 0.67, + "grad_norm": 4.424208376937677, + "learning_rate": 1.9433132931679488e-06, + "loss": 0.6647, + "step": 4508 + }, + { + "epoch": 0.67, + "grad_norm": 6.227153442774269, + "learning_rate": 1.9432812242480326e-06, + "loss": 0.681, + "step": 4509 + }, + { + "epoch": 0.67, + "grad_norm": 1.491386646865371, + "learning_rate": 1.9432491465243893e-06, + "loss": 0.668, + "step": 4510 + }, + { + "epoch": 0.67, + "grad_norm": 6.709356891211863, + "learning_rate": 1.9432170599973182e-06, + "loss": 0.6745, + "step": 4511 + }, + { + "epoch": 0.67, + "grad_norm": 3.873794444505156, + "learning_rate": 1.9431849646671193e-06, + "loss": 0.6647, + "step": 4512 + }, + { + "epoch": 0.67, + "grad_norm": 5.072238036828796, + "learning_rate": 1.9431528605340915e-06, + "loss": 0.6803, + "step": 4513 + }, + { + "epoch": 0.67, + "grad_norm": 7.9409862249645755, + "learning_rate": 1.9431207475985343e-06, + "loss": 0.7135, + "step": 4514 + }, + { + "epoch": 0.67, + "grad_norm": 6.52709849628212, + "learning_rate": 1.9430886258607483e-06, + "loss": 0.6706, + "step": 4515 + }, + { + "epoch": 0.67, + "grad_norm": 2.107384897927765, + "learning_rate": 1.9430564953210326e-06, + "loss": 0.7096, + "step": 4516 + }, + { + "epoch": 0.67, + "grad_norm": 1.5726964407584751, + "learning_rate": 1.943024355979687e-06, + "loss": 0.6771, + "step": 4517 + }, + { + "epoch": 0.67, + "grad_norm": 1.1633452070132904, + "learning_rate": 1.9429922078370118e-06, + "loss": 0.6641, + "step": 4518 + }, + { + "epoch": 0.67, + "grad_norm": 4.081431071863616, + "learning_rate": 1.942960050893307e-06, + "loss": 0.6706, + "step": 4519 + }, + { + "epoch": 0.67, + "grad_norm": 1.4174695441415859, + "learning_rate": 1.9429278851488727e-06, + "loss": 0.6673, + "step": 4520 + }, + { + "epoch": 0.67, + "grad_norm": 1.7693233782674973, + "learning_rate": 1.942895710604009e-06, + "loss": 0.6882, + "step": 4521 + }, + { + "epoch": 0.67, + "grad_norm": 2.1217481984540107, + "learning_rate": 1.9428635272590163e-06, + "loss": 0.6719, + "step": 4522 + }, + { + "epoch": 0.67, + "grad_norm": 0.6043258634952771, + "learning_rate": 1.9428313351141946e-06, + "loss": 0.6686, + "step": 4523 + }, + { + "epoch": 0.67, + "grad_norm": 3.7409610363667816, + "learning_rate": 1.942799134169845e-06, + "loss": 0.6758, + "step": 4524 + }, + { + "epoch": 0.67, + "grad_norm": 3.9092279158515453, + "learning_rate": 1.9427669244262678e-06, + "loss": 0.6725, + "step": 4525 + }, + { + "epoch": 0.68, + "grad_norm": 3.3611230185687204, + "learning_rate": 1.942734705883763e-06, + "loss": 0.6706, + "step": 4526 + }, + { + "epoch": 0.68, + "grad_norm": 1.768420774942328, + "learning_rate": 1.942702478542632e-06, + "loss": 0.6947, + "step": 4527 + }, + { + "epoch": 0.68, + "grad_norm": 1.3556815073004074, + "learning_rate": 1.9426702424031757e-06, + "loss": 0.6908, + "step": 4528 + }, + { + "epoch": 0.68, + "grad_norm": 1.6824785269343197, + "learning_rate": 1.942637997465695e-06, + "loss": 0.6673, + "step": 4529 + }, + { + "epoch": 0.68, + "grad_norm": 2.491803567979242, + "learning_rate": 1.9426057437304897e-06, + "loss": 0.6927, + "step": 4530 + }, + { + "epoch": 0.68, + "grad_norm": 4.397241884377238, + "learning_rate": 1.942573481197862e-06, + "loss": 0.6706, + "step": 4531 + }, + { + "epoch": 0.68, + "grad_norm": 0.9564569572119014, + "learning_rate": 1.9425412098681125e-06, + "loss": 0.6745, + "step": 4532 + }, + { + "epoch": 0.68, + "grad_norm": 1.0311001211924835, + "learning_rate": 1.9425089297415428e-06, + "loss": 0.6901, + "step": 4533 + }, + { + "epoch": 0.68, + "grad_norm": 1.8319664594694054, + "learning_rate": 1.9424766408184536e-06, + "loss": 0.6647, + "step": 4534 + }, + { + "epoch": 0.68, + "grad_norm": 0.9738818949631047, + "learning_rate": 1.9424443430991467e-06, + "loss": 0.666, + "step": 4535 + }, + { + "epoch": 0.68, + "grad_norm": 3.8381553877449273, + "learning_rate": 1.942412036583923e-06, + "loss": 0.6855, + "step": 4536 + }, + { + "epoch": 0.68, + "grad_norm": 0.8460506378320364, + "learning_rate": 1.9423797212730854e-06, + "loss": 0.6771, + "step": 4537 + }, + { + "epoch": 0.68, + "grad_norm": 2.1448586300393755, + "learning_rate": 1.9423473971669336e-06, + "loss": 0.6895, + "step": 4538 + }, + { + "epoch": 0.68, + "grad_norm": 2.639377282857431, + "learning_rate": 1.9423150642657706e-06, + "loss": 0.7038, + "step": 4539 + }, + { + "epoch": 0.68, + "grad_norm": 4.77735006872778, + "learning_rate": 1.9422827225698976e-06, + "loss": 0.6628, + "step": 4540 + }, + { + "epoch": 0.68, + "grad_norm": 4.75121689969217, + "learning_rate": 1.942250372079617e-06, + "loss": 0.6966, + "step": 4541 + }, + { + "epoch": 0.68, + "grad_norm": 1.2054110573387624, + "learning_rate": 1.94221801279523e-06, + "loss": 0.668, + "step": 4542 + }, + { + "epoch": 0.68, + "grad_norm": 6.550101206531818, + "learning_rate": 1.942185644717039e-06, + "loss": 0.7227, + "step": 4543 + }, + { + "epoch": 0.68, + "grad_norm": 5.080166159260949, + "learning_rate": 1.942153267845346e-06, + "loss": 0.6901, + "step": 4544 + }, + { + "epoch": 0.68, + "grad_norm": 4.13194905544647, + "learning_rate": 1.9421208821804538e-06, + "loss": 0.6654, + "step": 4545 + }, + { + "epoch": 0.68, + "grad_norm": 1.5310239771397824, + "learning_rate": 1.9420884877226634e-06, + "loss": 0.6849, + "step": 4546 + }, + { + "epoch": 0.68, + "grad_norm": 0.5775369755171336, + "learning_rate": 1.942056084472278e-06, + "loss": 0.696, + "step": 4547 + }, + { + "epoch": 0.68, + "grad_norm": 1.5186996941781012, + "learning_rate": 1.9420236724296e-06, + "loss": 0.696, + "step": 4548 + }, + { + "epoch": 0.68, + "grad_norm": 0.8463125515033169, + "learning_rate": 1.9419912515949317e-06, + "loss": 0.6803, + "step": 4549 + }, + { + "epoch": 0.68, + "grad_norm": 3.1959849505269413, + "learning_rate": 1.9419588219685756e-06, + "loss": 0.6829, + "step": 4550 + }, + { + "epoch": 0.68, + "grad_norm": 1.9348287254665508, + "learning_rate": 1.9419263835508345e-06, + "loss": 0.6868, + "step": 4551 + }, + { + "epoch": 0.68, + "grad_norm": 4.133311905997823, + "learning_rate": 1.941893936342011e-06, + "loss": 0.6608, + "step": 4552 + }, + { + "epoch": 0.68, + "grad_norm": 4.6933538181010155, + "learning_rate": 1.941861480342409e-06, + "loss": 0.6758, + "step": 4553 + }, + { + "epoch": 0.68, + "grad_norm": 4.604582313028975, + "learning_rate": 1.94182901555233e-06, + "loss": 0.6966, + "step": 4554 + }, + { + "epoch": 0.68, + "grad_norm": 1.4421770381464052, + "learning_rate": 1.941796541972077e-06, + "loss": 0.6712, + "step": 4555 + }, + { + "epoch": 0.68, + "grad_norm": 0.5976224543042137, + "learning_rate": 1.941764059601954e-06, + "loss": 0.6777, + "step": 4556 + }, + { + "epoch": 0.68, + "grad_norm": 5.91872124383277, + "learning_rate": 1.9417315684422636e-06, + "loss": 0.6875, + "step": 4557 + }, + { + "epoch": 0.68, + "grad_norm": 0.7695945220833449, + "learning_rate": 1.9416990684933094e-06, + "loss": 0.7161, + "step": 4558 + }, + { + "epoch": 0.68, + "grad_norm": 0.6708754921987589, + "learning_rate": 1.9416665597553945e-06, + "loss": 0.6953, + "step": 4559 + }, + { + "epoch": 0.68, + "grad_norm": 1.9548697104759052, + "learning_rate": 1.9416340422288224e-06, + "loss": 0.6725, + "step": 4560 + }, + { + "epoch": 0.68, + "grad_norm": 4.4910015454085555, + "learning_rate": 1.9416015159138963e-06, + "loss": 0.696, + "step": 4561 + }, + { + "epoch": 0.68, + "grad_norm": 0.6832168845830883, + "learning_rate": 1.94156898081092e-06, + "loss": 0.6868, + "step": 4562 + }, + { + "epoch": 0.68, + "grad_norm": 0.6023346518104261, + "learning_rate": 1.941536436920197e-06, + "loss": 0.666, + "step": 4563 + }, + { + "epoch": 0.68, + "grad_norm": 9.801018197804071, + "learning_rate": 1.9415038842420317e-06, + "loss": 0.6901, + "step": 4564 + }, + { + "epoch": 0.68, + "grad_norm": 3.193880325749101, + "learning_rate": 1.941471322776727e-06, + "loss": 0.7051, + "step": 4565 + }, + { + "epoch": 0.68, + "grad_norm": 11.826071329496873, + "learning_rate": 1.941438752524587e-06, + "loss": 0.7201, + "step": 4566 + }, + { + "epoch": 0.68, + "grad_norm": 1.6260820803764442, + "learning_rate": 1.941406173485916e-06, + "loss": 0.7129, + "step": 4567 + }, + { + "epoch": 0.68, + "grad_norm": 1.3227032298345793, + "learning_rate": 1.941373585661018e-06, + "loss": 0.6888, + "step": 4568 + }, + { + "epoch": 0.68, + "grad_norm": 1.9832203388888803, + "learning_rate": 1.941340989050197e-06, + "loss": 0.6947, + "step": 4569 + }, + { + "epoch": 0.68, + "grad_norm": 1.7948478883573509, + "learning_rate": 1.9413083836537578e-06, + "loss": 0.6842, + "step": 4570 + }, + { + "epoch": 0.68, + "grad_norm": 4.135369273172944, + "learning_rate": 1.9412757694720036e-06, + "loss": 0.6908, + "step": 4571 + }, + { + "epoch": 0.68, + "grad_norm": 1.781969236539982, + "learning_rate": 1.9412431465052397e-06, + "loss": 0.6784, + "step": 4572 + }, + { + "epoch": 0.68, + "grad_norm": 0.7793513455982067, + "learning_rate": 1.94121051475377e-06, + "loss": 0.6999, + "step": 4573 + }, + { + "epoch": 0.68, + "grad_norm": 2.4821255388712866, + "learning_rate": 1.9411778742179e-06, + "loss": 0.7005, + "step": 4574 + }, + { + "epoch": 0.68, + "grad_norm": 5.090045253576871, + "learning_rate": 1.941145224897933e-06, + "loss": 0.6816, + "step": 4575 + }, + { + "epoch": 0.68, + "grad_norm": 0.4860060996668661, + "learning_rate": 1.9411125667941743e-06, + "loss": 0.6914, + "step": 4576 + }, + { + "epoch": 0.68, + "grad_norm": 0.4607806522813893, + "learning_rate": 1.941079899906929e-06, + "loss": 0.6816, + "step": 4577 + }, + { + "epoch": 0.68, + "grad_norm": 1.152073746174679, + "learning_rate": 1.941047224236502e-06, + "loss": 0.668, + "step": 4578 + }, + { + "epoch": 0.68, + "grad_norm": 5.117409988734107, + "learning_rate": 1.9410145397831976e-06, + "loss": 0.6875, + "step": 4579 + }, + { + "epoch": 0.68, + "grad_norm": 3.662626142526115, + "learning_rate": 1.940981846547322e-06, + "loss": 0.6895, + "step": 4580 + }, + { + "epoch": 0.68, + "grad_norm": 3.9039331611767536, + "learning_rate": 1.9409491445291786e-06, + "loss": 0.6829, + "step": 4581 + }, + { + "epoch": 0.68, + "grad_norm": 1.4406917652842848, + "learning_rate": 1.9409164337290744e-06, + "loss": 0.6842, + "step": 4582 + }, + { + "epoch": 0.68, + "grad_norm": 1.4245709936119697, + "learning_rate": 1.9408837141473133e-06, + "loss": 0.6751, + "step": 4583 + }, + { + "epoch": 0.68, + "grad_norm": 6.559608506518679, + "learning_rate": 1.940850985784202e-06, + "loss": 0.6908, + "step": 4584 + }, + { + "epoch": 0.68, + "grad_norm": 1.7958033917158036, + "learning_rate": 1.940818248640045e-06, + "loss": 0.7064, + "step": 4585 + }, + { + "epoch": 0.68, + "grad_norm": 7.274157686631596, + "learning_rate": 1.940785502715148e-06, + "loss": 0.694, + "step": 4586 + }, + { + "epoch": 0.68, + "grad_norm": 2.541694874234467, + "learning_rate": 1.9407527480098165e-06, + "loss": 0.6888, + "step": 4587 + }, + { + "epoch": 0.68, + "grad_norm": 1.9770902666905674, + "learning_rate": 1.9407199845243566e-06, + "loss": 0.6654, + "step": 4588 + }, + { + "epoch": 0.68, + "grad_norm": 1.5045303272787047, + "learning_rate": 1.940687212259074e-06, + "loss": 0.6908, + "step": 4589 + }, + { + "epoch": 0.68, + "grad_norm": 6.158744163828834, + "learning_rate": 1.9406544312142742e-06, + "loss": 0.6868, + "step": 4590 + }, + { + "epoch": 0.68, + "grad_norm": 1.0421864790926838, + "learning_rate": 1.940621641390264e-06, + "loss": 0.6699, + "step": 4591 + }, + { + "epoch": 0.68, + "grad_norm": 4.212733672204899, + "learning_rate": 1.940588842787348e-06, + "loss": 0.6654, + "step": 4592 + }, + { + "epoch": 0.69, + "grad_norm": 3.1965852630733687, + "learning_rate": 1.9405560354058338e-06, + "loss": 0.6868, + "step": 4593 + }, + { + "epoch": 0.69, + "grad_norm": 2.191612091870629, + "learning_rate": 1.9405232192460266e-06, + "loss": 0.6647, + "step": 4594 + }, + { + "epoch": 0.69, + "grad_norm": 1.1328304435626406, + "learning_rate": 1.940490394308233e-06, + "loss": 0.6738, + "step": 4595 + }, + { + "epoch": 0.69, + "grad_norm": 5.105980342280992, + "learning_rate": 1.9404575605927597e-06, + "loss": 0.6855, + "step": 4596 + }, + { + "epoch": 0.69, + "grad_norm": 8.29571355236885, + "learning_rate": 1.9404247180999124e-06, + "loss": 0.6888, + "step": 4597 + }, + { + "epoch": 0.69, + "grad_norm": 1.1031783298403148, + "learning_rate": 1.9403918668299983e-06, + "loss": 0.6914, + "step": 4598 + }, + { + "epoch": 0.69, + "grad_norm": 1.902911569538608, + "learning_rate": 1.9403590067833236e-06, + "loss": 0.6562, + "step": 4599 + }, + { + "epoch": 0.69, + "grad_norm": 1.6732791008764911, + "learning_rate": 1.9403261379601954e-06, + "loss": 0.6849, + "step": 4600 + }, + { + "epoch": 0.69, + "grad_norm": 1.1867861133550601, + "learning_rate": 1.94029326036092e-06, + "loss": 0.6654, + "step": 4601 + }, + { + "epoch": 0.69, + "grad_norm": 3.3704623743513067, + "learning_rate": 1.9402603739858045e-06, + "loss": 0.7038, + "step": 4602 + }, + { + "epoch": 0.69, + "grad_norm": 2.631443757590841, + "learning_rate": 1.9402274788351554e-06, + "loss": 0.6908, + "step": 4603 + }, + { + "epoch": 0.69, + "grad_norm": 2.1038144127974814, + "learning_rate": 1.9401945749092806e-06, + "loss": 0.6966, + "step": 4604 + }, + { + "epoch": 0.69, + "grad_norm": 3.716205210903247, + "learning_rate": 1.9401616622084863e-06, + "loss": 0.6947, + "step": 4605 + }, + { + "epoch": 0.69, + "grad_norm": 2.3581841159752353, + "learning_rate": 1.9401287407330803e-06, + "loss": 0.7064, + "step": 4606 + }, + { + "epoch": 0.69, + "grad_norm": 1.9112253305738953, + "learning_rate": 1.9400958104833696e-06, + "loss": 0.6842, + "step": 4607 + }, + { + "epoch": 0.69, + "grad_norm": 2.8081084849806484, + "learning_rate": 1.9400628714596614e-06, + "loss": 0.6888, + "step": 4608 + }, + { + "epoch": 0.69, + "grad_norm": 2.071337354338524, + "learning_rate": 1.9400299236622636e-06, + "loss": 0.6738, + "step": 4609 + }, + { + "epoch": 0.69, + "grad_norm": 4.761038167713839, + "learning_rate": 1.939996967091483e-06, + "loss": 0.7057, + "step": 4610 + }, + { + "epoch": 0.69, + "grad_norm": 0.6922450186893402, + "learning_rate": 1.939964001747628e-06, + "loss": 0.7051, + "step": 4611 + }, + { + "epoch": 0.69, + "grad_norm": 4.902314129637913, + "learning_rate": 1.939931027631006e-06, + "loss": 0.6986, + "step": 4612 + }, + { + "epoch": 0.69, + "grad_norm": 1.5284157303821155, + "learning_rate": 1.939898044741924e-06, + "loss": 0.694, + "step": 4613 + }, + { + "epoch": 0.69, + "grad_norm": 0.43321576273256107, + "learning_rate": 1.9398650530806905e-06, + "loss": 0.6908, + "step": 4614 + }, + { + "epoch": 0.69, + "grad_norm": 1.0995080259872902, + "learning_rate": 1.9398320526476137e-06, + "loss": 0.6784, + "step": 4615 + }, + { + "epoch": 0.69, + "grad_norm": 5.951948478299124, + "learning_rate": 1.939799043443001e-06, + "loss": 0.6803, + "step": 4616 + }, + { + "epoch": 0.69, + "grad_norm": 0.48029235469037884, + "learning_rate": 1.9397660254671606e-06, + "loss": 0.6914, + "step": 4617 + }, + { + "epoch": 0.69, + "grad_norm": 0.7072911522755735, + "learning_rate": 1.939732998720401e-06, + "loss": 0.6882, + "step": 4618 + }, + { + "epoch": 0.69, + "grad_norm": 2.4989518687888888, + "learning_rate": 1.93969996320303e-06, + "loss": 0.694, + "step": 4619 + }, + { + "epoch": 0.69, + "grad_norm": 1.3906962090221362, + "learning_rate": 1.9396669189153566e-06, + "loss": 0.6738, + "step": 4620 + }, + { + "epoch": 0.69, + "grad_norm": 1.5765388704891201, + "learning_rate": 1.9396338658576886e-06, + "loss": 0.6901, + "step": 4621 + }, + { + "epoch": 0.69, + "grad_norm": 0.5349766711656379, + "learning_rate": 1.9396008040303343e-06, + "loss": 0.6927, + "step": 4622 + }, + { + "epoch": 0.69, + "grad_norm": 1.366399486455268, + "learning_rate": 1.9395677334336027e-06, + "loss": 0.6803, + "step": 4623 + }, + { + "epoch": 0.69, + "grad_norm": 8.152531925475314, + "learning_rate": 1.9395346540678024e-06, + "loss": 0.6842, + "step": 4624 + }, + { + "epoch": 0.69, + "grad_norm": 7.489004114544809, + "learning_rate": 1.939501565933242e-06, + "loss": 0.6777, + "step": 4625 + }, + { + "epoch": 0.69, + "grad_norm": 2.276888325049014, + "learning_rate": 1.9394684690302307e-06, + "loss": 0.6908, + "step": 4626 + }, + { + "epoch": 0.69, + "grad_norm": 1.2579125823539852, + "learning_rate": 1.939435363359077e-06, + "loss": 0.6797, + "step": 4627 + }, + { + "epoch": 0.69, + "grad_norm": 1.8014572297154965, + "learning_rate": 1.93940224892009e-06, + "loss": 0.6862, + "step": 4628 + }, + { + "epoch": 0.69, + "grad_norm": 4.044169160887862, + "learning_rate": 1.939369125713579e-06, + "loss": 0.6797, + "step": 4629 + }, + { + "epoch": 0.69, + "grad_norm": 4.030245279370902, + "learning_rate": 1.939335993739852e-06, + "loss": 0.6934, + "step": 4630 + }, + { + "epoch": 0.69, + "grad_norm": 3.6422805630580934, + "learning_rate": 1.9393028529992198e-06, + "loss": 0.6836, + "step": 4631 + }, + { + "epoch": 0.69, + "grad_norm": 1.7123181271403418, + "learning_rate": 1.9392697034919907e-06, + "loss": 0.6855, + "step": 4632 + }, + { + "epoch": 0.69, + "grad_norm": 6.812792267988028, + "learning_rate": 1.9392365452184743e-06, + "loss": 0.6816, + "step": 4633 + }, + { + "epoch": 0.69, + "grad_norm": 2.388922974100465, + "learning_rate": 1.9392033781789804e-06, + "loss": 0.6875, + "step": 4634 + }, + { + "epoch": 0.69, + "grad_norm": 2.309722557100435, + "learning_rate": 1.939170202373818e-06, + "loss": 0.679, + "step": 4635 + }, + { + "epoch": 0.69, + "grad_norm": 8.264028712008374, + "learning_rate": 1.939137017803298e-06, + "loss": 0.6934, + "step": 4636 + }, + { + "epoch": 0.69, + "grad_norm": 0.8334782513440696, + "learning_rate": 1.9391038244677283e-06, + "loss": 0.6901, + "step": 4637 + }, + { + "epoch": 0.69, + "grad_norm": 5.962769984861091, + "learning_rate": 1.9390706223674197e-06, + "loss": 0.6784, + "step": 4638 + }, + { + "epoch": 0.69, + "grad_norm": 5.576555473821989, + "learning_rate": 1.939037411502682e-06, + "loss": 0.6855, + "step": 4639 + }, + { + "epoch": 0.69, + "grad_norm": 2.9320754513093252, + "learning_rate": 1.939004191873825e-06, + "loss": 0.679, + "step": 4640 + }, + { + "epoch": 0.69, + "grad_norm": 0.7055094360350899, + "learning_rate": 1.938970963481159e-06, + "loss": 0.6882, + "step": 4641 + }, + { + "epoch": 0.69, + "grad_norm": 0.9942293914644408, + "learning_rate": 1.938937726324994e-06, + "loss": 0.6888, + "step": 4642 + }, + { + "epoch": 0.69, + "grad_norm": 2.201398924085725, + "learning_rate": 1.9389044804056398e-06, + "loss": 0.6895, + "step": 4643 + }, + { + "epoch": 0.69, + "grad_norm": 8.162824661151475, + "learning_rate": 1.938871225723407e-06, + "loss": 0.707, + "step": 4644 + }, + { + "epoch": 0.69, + "grad_norm": 3.1187850897245024, + "learning_rate": 1.9388379622786067e-06, + "loss": 0.696, + "step": 4645 + }, + { + "epoch": 0.69, + "grad_norm": 1.7693301994142272, + "learning_rate": 1.9388046900715483e-06, + "loss": 0.6771, + "step": 4646 + }, + { + "epoch": 0.69, + "grad_norm": 1.0500943973166057, + "learning_rate": 1.9387714091025426e-06, + "loss": 0.6706, + "step": 4647 + }, + { + "epoch": 0.69, + "grad_norm": 0.7340264217449948, + "learning_rate": 1.9387381193719005e-06, + "loss": 0.6797, + "step": 4648 + }, + { + "epoch": 0.69, + "grad_norm": 8.431567362695743, + "learning_rate": 1.9387048208799324e-06, + "loss": 0.7064, + "step": 4649 + }, + { + "epoch": 0.69, + "grad_norm": 1.5451260324124858, + "learning_rate": 1.9386715136269493e-06, + "loss": 0.696, + "step": 4650 + }, + { + "epoch": 0.69, + "grad_norm": 1.3724832428135445, + "learning_rate": 1.938638197613262e-06, + "loss": 0.6816, + "step": 4651 + }, + { + "epoch": 0.69, + "grad_norm": 3.6049657404921454, + "learning_rate": 1.938604872839181e-06, + "loss": 0.6693, + "step": 4652 + }, + { + "epoch": 0.69, + "grad_norm": 4.829689903957837, + "learning_rate": 1.938571539305018e-06, + "loss": 0.6849, + "step": 4653 + }, + { + "epoch": 0.69, + "grad_norm": 1.9040573660471478, + "learning_rate": 1.9385381970110835e-06, + "loss": 0.6797, + "step": 4654 + }, + { + "epoch": 0.69, + "grad_norm": 0.6407854203235669, + "learning_rate": 1.938504845957689e-06, + "loss": 0.6712, + "step": 4655 + }, + { + "epoch": 0.69, + "grad_norm": 1.9332937127317347, + "learning_rate": 1.9384714861451466e-06, + "loss": 0.6758, + "step": 4656 + }, + { + "epoch": 0.69, + "grad_norm": 4.805330744579195, + "learning_rate": 1.938438117573766e-06, + "loss": 0.6758, + "step": 4657 + }, + { + "epoch": 0.69, + "grad_norm": 2.7622381374425076, + "learning_rate": 1.9384047402438597e-06, + "loss": 0.7044, + "step": 4658 + }, + { + "epoch": 0.69, + "grad_norm": 1.90286324806706, + "learning_rate": 1.9383713541557393e-06, + "loss": 0.6901, + "step": 4659 + }, + { + "epoch": 0.7, + "grad_norm": 4.3355062697444104, + "learning_rate": 1.9383379593097158e-06, + "loss": 0.6849, + "step": 4660 + }, + { + "epoch": 0.7, + "grad_norm": 1.4162581772560805, + "learning_rate": 1.938304555706101e-06, + "loss": 0.6615, + "step": 4661 + }, + { + "epoch": 0.7, + "grad_norm": 1.072647290292624, + "learning_rate": 1.938271143345207e-06, + "loss": 0.679, + "step": 4662 + }, + { + "epoch": 0.7, + "grad_norm": 0.9667173621117834, + "learning_rate": 1.9382377222273455e-06, + "loss": 0.6842, + "step": 4663 + }, + { + "epoch": 0.7, + "grad_norm": 4.5347195404446765, + "learning_rate": 1.938204292352828e-06, + "loss": 0.6667, + "step": 4664 + }, + { + "epoch": 0.7, + "grad_norm": 0.5122963144739505, + "learning_rate": 1.9381708537219672e-06, + "loss": 0.6764, + "step": 4665 + }, + { + "epoch": 0.7, + "grad_norm": 1.3574941183975835, + "learning_rate": 1.9381374063350745e-06, + "loss": 0.6836, + "step": 4666 + }, + { + "epoch": 0.7, + "grad_norm": 0.9042500665137655, + "learning_rate": 1.9381039501924627e-06, + "loss": 0.6816, + "step": 4667 + }, + { + "epoch": 0.7, + "grad_norm": 7.180596290318754, + "learning_rate": 1.938070485294444e-06, + "loss": 0.6875, + "step": 4668 + }, + { + "epoch": 0.7, + "grad_norm": 4.996751309845459, + "learning_rate": 1.93803701164133e-06, + "loss": 0.6992, + "step": 4669 + }, + { + "epoch": 0.7, + "grad_norm": 0.7256676786052247, + "learning_rate": 1.938003529233434e-06, + "loss": 0.679, + "step": 4670 + }, + { + "epoch": 0.7, + "grad_norm": 3.990306077184205, + "learning_rate": 1.937970038071068e-06, + "loss": 0.6973, + "step": 4671 + }, + { + "epoch": 0.7, + "grad_norm": 4.18639492310863, + "learning_rate": 1.937936538154545e-06, + "loss": 0.6862, + "step": 4672 + }, + { + "epoch": 0.7, + "grad_norm": 2.060703978102196, + "learning_rate": 1.9379030294841767e-06, + "loss": 0.6536, + "step": 4673 + }, + { + "epoch": 0.7, + "grad_norm": 8.559172301312083, + "learning_rate": 1.9378695120602766e-06, + "loss": 0.707, + "step": 4674 + }, + { + "epoch": 0.7, + "grad_norm": 2.0076470548913052, + "learning_rate": 1.937835985883158e-06, + "loss": 0.6751, + "step": 4675 + }, + { + "epoch": 0.7, + "grad_norm": 1.318673220223257, + "learning_rate": 1.9378024509531324e-06, + "loss": 0.6927, + "step": 4676 + }, + { + "epoch": 0.7, + "grad_norm": 2.5907134846551685, + "learning_rate": 1.9377689072705142e-06, + "loss": 0.6784, + "step": 4677 + }, + { + "epoch": 0.7, + "grad_norm": 1.7645098208841323, + "learning_rate": 1.9377353548356156e-06, + "loss": 0.7038, + "step": 4678 + }, + { + "epoch": 0.7, + "grad_norm": 2.141752957352954, + "learning_rate": 1.93770179364875e-06, + "loss": 0.6784, + "step": 4679 + }, + { + "epoch": 0.7, + "grad_norm": 1.6716262009492748, + "learning_rate": 1.9376682237102305e-06, + "loss": 0.6966, + "step": 4680 + }, + { + "epoch": 0.7, + "grad_norm": 2.404699284317886, + "learning_rate": 1.937634645020371e-06, + "loss": 0.6979, + "step": 4681 + }, + { + "epoch": 0.7, + "grad_norm": 0.5102397014080715, + "learning_rate": 1.9376010575794843e-06, + "loss": 0.6901, + "step": 4682 + }, + { + "epoch": 0.7, + "grad_norm": 1.2771539211791687, + "learning_rate": 1.937567461387884e-06, + "loss": 0.6745, + "step": 4683 + }, + { + "epoch": 0.7, + "grad_norm": 7.427584872747752, + "learning_rate": 1.9375338564458833e-06, + "loss": 0.6895, + "step": 4684 + }, + { + "epoch": 0.7, + "grad_norm": 1.5400507097097709, + "learning_rate": 1.9375002427537965e-06, + "loss": 0.696, + "step": 4685 + }, + { + "epoch": 0.7, + "grad_norm": 10.46290758991668, + "learning_rate": 1.937466620311937e-06, + "loss": 0.6829, + "step": 4686 + }, + { + "epoch": 0.7, + "grad_norm": 1.264838348116425, + "learning_rate": 1.9374329891206183e-06, + "loss": 0.6829, + "step": 4687 + }, + { + "epoch": 0.7, + "grad_norm": 0.40562465563685296, + "learning_rate": 1.937399349180155e-06, + "loss": 0.6823, + "step": 4688 + }, + { + "epoch": 0.7, + "grad_norm": 6.229247823206577, + "learning_rate": 1.9373657004908606e-06, + "loss": 0.6927, + "step": 4689 + }, + { + "epoch": 0.7, + "grad_norm": 3.809314744133374, + "learning_rate": 1.937332043053049e-06, + "loss": 0.6914, + "step": 4690 + }, + { + "epoch": 0.7, + "grad_norm": 1.134862857159744, + "learning_rate": 1.937298376867035e-06, + "loss": 0.6842, + "step": 4691 + }, + { + "epoch": 0.7, + "grad_norm": 5.30343672667496, + "learning_rate": 1.9372647019331324e-06, + "loss": 0.6849, + "step": 4692 + }, + { + "epoch": 0.7, + "grad_norm": 3.8673964334679187, + "learning_rate": 1.937231018251655e-06, + "loss": 0.668, + "step": 4693 + }, + { + "epoch": 0.7, + "grad_norm": 0.48503154231043155, + "learning_rate": 1.9371973258229177e-06, + "loss": 0.7057, + "step": 4694 + }, + { + "epoch": 0.7, + "grad_norm": 2.0028639269518895, + "learning_rate": 1.9371636246472353e-06, + "loss": 0.6888, + "step": 4695 + }, + { + "epoch": 0.7, + "grad_norm": 3.9851796343864776, + "learning_rate": 1.9371299147249212e-06, + "loss": 0.679, + "step": 4696 + }, + { + "epoch": 0.7, + "grad_norm": 1.0748006714747054, + "learning_rate": 1.937096196056291e-06, + "loss": 0.6712, + "step": 4697 + }, + { + "epoch": 0.7, + "grad_norm": 8.188295816898359, + "learning_rate": 1.9370624686416595e-06, + "loss": 0.6882, + "step": 4698 + }, + { + "epoch": 0.7, + "grad_norm": 8.36381881542718, + "learning_rate": 1.9370287324813407e-06, + "loss": 0.6803, + "step": 4699 + }, + { + "epoch": 0.7, + "grad_norm": 2.536210286141105, + "learning_rate": 1.9369949875756503e-06, + "loss": 0.6738, + "step": 4700 + }, + { + "epoch": 0.7, + "grad_norm": 2.3298107119892935, + "learning_rate": 1.9369612339249025e-06, + "loss": 0.6764, + "step": 4701 + }, + { + "epoch": 0.7, + "grad_norm": 2.9167886841043678, + "learning_rate": 1.936927471529413e-06, + "loss": 0.6973, + "step": 4702 + }, + { + "epoch": 0.7, + "grad_norm": 4.80969929752904, + "learning_rate": 1.936893700389496e-06, + "loss": 0.6764, + "step": 4703 + }, + { + "epoch": 0.7, + "grad_norm": 3.9844260137917855, + "learning_rate": 1.9368599205054673e-06, + "loss": 0.6803, + "step": 4704 + }, + { + "epoch": 0.7, + "grad_norm": 1.4044342580330442, + "learning_rate": 1.9368261318776424e-06, + "loss": 0.6719, + "step": 4705 + }, + { + "epoch": 0.7, + "grad_norm": 10.233125246594602, + "learning_rate": 1.936792334506336e-06, + "loss": 0.6908, + "step": 4706 + }, + { + "epoch": 0.7, + "grad_norm": 2.2507680459068187, + "learning_rate": 1.9367585283918643e-06, + "loss": 0.6699, + "step": 4707 + }, + { + "epoch": 0.7, + "grad_norm": 4.429741707267216, + "learning_rate": 1.9367247135345424e-06, + "loss": 0.7038, + "step": 4708 + }, + { + "epoch": 0.7, + "grad_norm": 4.038027453097706, + "learning_rate": 1.9366908899346855e-06, + "loss": 0.6849, + "step": 4709 + }, + { + "epoch": 0.7, + "grad_norm": 4.336757968952408, + "learning_rate": 1.93665705759261e-06, + "loss": 0.6621, + "step": 4710 + }, + { + "epoch": 0.7, + "grad_norm": 3.6943542037059798, + "learning_rate": 1.936623216508631e-06, + "loss": 0.6986, + "step": 4711 + }, + { + "epoch": 0.7, + "grad_norm": 3.5011315046830234, + "learning_rate": 1.936589366683065e-06, + "loss": 0.6608, + "step": 4712 + }, + { + "epoch": 0.7, + "grad_norm": 0.5426806018302771, + "learning_rate": 1.936555508116228e-06, + "loss": 0.6745, + "step": 4713 + }, + { + "epoch": 0.7, + "grad_norm": 0.9216644750365588, + "learning_rate": 1.936521640808435e-06, + "loss": 0.6803, + "step": 4714 + }, + { + "epoch": 0.7, + "grad_norm": 2.164147654887322, + "learning_rate": 1.936487764760003e-06, + "loss": 0.7103, + "step": 4715 + }, + { + "epoch": 0.7, + "grad_norm": 2.1065701646908095, + "learning_rate": 1.936453879971248e-06, + "loss": 0.6504, + "step": 4716 + }, + { + "epoch": 0.7, + "grad_norm": 0.7134259955839363, + "learning_rate": 1.936419986442486e-06, + "loss": 0.6927, + "step": 4717 + }, + { + "epoch": 0.7, + "grad_norm": 4.273895737880805, + "learning_rate": 1.9363860841740332e-06, + "loss": 0.679, + "step": 4718 + }, + { + "epoch": 0.7, + "grad_norm": 1.3338901915702712, + "learning_rate": 1.9363521731662063e-06, + "loss": 0.696, + "step": 4719 + }, + { + "epoch": 0.7, + "grad_norm": 1.04318545323967, + "learning_rate": 1.936318253419322e-06, + "loss": 0.6973, + "step": 4720 + }, + { + "epoch": 0.7, + "grad_norm": 1.8390104691785973, + "learning_rate": 1.9362843249336966e-06, + "loss": 0.6914, + "step": 4721 + }, + { + "epoch": 0.7, + "grad_norm": 0.7596618804808181, + "learning_rate": 1.936250387709647e-06, + "loss": 0.6582, + "step": 4722 + }, + { + "epoch": 0.7, + "grad_norm": 3.524548645169548, + "learning_rate": 1.9362164417474892e-06, + "loss": 0.6803, + "step": 4723 + }, + { + "epoch": 0.7, + "grad_norm": 4.44283140021041, + "learning_rate": 1.936182487047541e-06, + "loss": 0.6868, + "step": 4724 + }, + { + "epoch": 0.7, + "grad_norm": 1.0635924257935725, + "learning_rate": 1.9361485236101186e-06, + "loss": 0.6764, + "step": 4725 + }, + { + "epoch": 0.7, + "grad_norm": 2.503073328880959, + "learning_rate": 1.936114551435539e-06, + "loss": 0.6738, + "step": 4726 + }, + { + "epoch": 0.7, + "grad_norm": 1.431551312123146, + "learning_rate": 1.93608057052412e-06, + "loss": 0.6836, + "step": 4727 + }, + { + "epoch": 0.71, + "grad_norm": 4.5782745051659335, + "learning_rate": 1.9360465808761782e-06, + "loss": 0.666, + "step": 4728 + }, + { + "epoch": 0.71, + "grad_norm": 3.6159602708004788, + "learning_rate": 1.9360125824920307e-06, + "loss": 0.6908, + "step": 4729 + }, + { + "epoch": 0.71, + "grad_norm": 1.324174106467415, + "learning_rate": 1.935978575371995e-06, + "loss": 0.6966, + "step": 4730 + }, + { + "epoch": 0.71, + "grad_norm": 3.936741860816285, + "learning_rate": 1.9359445595163885e-06, + "loss": 0.6719, + "step": 4731 + }, + { + "epoch": 0.71, + "grad_norm": 1.339589666343436, + "learning_rate": 1.9359105349255286e-06, + "loss": 0.696, + "step": 4732 + }, + { + "epoch": 0.71, + "grad_norm": 2.618661932007486, + "learning_rate": 1.935876501599733e-06, + "loss": 0.681, + "step": 4733 + }, + { + "epoch": 0.71, + "grad_norm": 5.817252262091888, + "learning_rate": 1.9358424595393196e-06, + "loss": 0.6615, + "step": 4734 + }, + { + "epoch": 0.71, + "grad_norm": 0.8203009766566176, + "learning_rate": 1.9358084087446052e-06, + "loss": 0.6966, + "step": 4735 + }, + { + "epoch": 0.71, + "grad_norm": 4.226761122452084, + "learning_rate": 1.9357743492159085e-06, + "loss": 0.679, + "step": 4736 + }, + { + "epoch": 0.71, + "grad_norm": 4.260295417811491, + "learning_rate": 1.935740280953547e-06, + "loss": 0.6829, + "step": 4737 + }, + { + "epoch": 0.71, + "grad_norm": 3.9819618960716285, + "learning_rate": 1.935706203957838e-06, + "loss": 0.6862, + "step": 4738 + }, + { + "epoch": 0.71, + "grad_norm": 0.7901860537090722, + "learning_rate": 1.9356721182291015e-06, + "loss": 0.6725, + "step": 4739 + }, + { + "epoch": 0.71, + "grad_norm": 2.190699392434953, + "learning_rate": 1.9356380237676534e-06, + "loss": 0.6927, + "step": 4740 + }, + { + "epoch": 0.71, + "grad_norm": 6.2533694581646095, + "learning_rate": 1.935603920573813e-06, + "loss": 0.6673, + "step": 4741 + }, + { + "epoch": 0.71, + "grad_norm": 4.328039485646577, + "learning_rate": 1.935569808647899e-06, + "loss": 0.6921, + "step": 4742 + }, + { + "epoch": 0.71, + "grad_norm": 2.501475157301153, + "learning_rate": 1.935535687990229e-06, + "loss": 0.6654, + "step": 4743 + }, + { + "epoch": 0.71, + "grad_norm": 1.5967532397356112, + "learning_rate": 1.9355015586011218e-06, + "loss": 0.6803, + "step": 4744 + }, + { + "epoch": 0.71, + "grad_norm": 2.760411030872428, + "learning_rate": 1.9354674204808952e-06, + "loss": 0.6836, + "step": 4745 + }, + { + "epoch": 0.71, + "grad_norm": 8.500827878200978, + "learning_rate": 1.935433273629869e-06, + "loss": 0.6816, + "step": 4746 + }, + { + "epoch": 0.71, + "grad_norm": 0.7157121858565523, + "learning_rate": 1.9353991180483615e-06, + "loss": 0.6895, + "step": 4747 + }, + { + "epoch": 0.71, + "grad_norm": 1.911587810757561, + "learning_rate": 1.9353649537366908e-06, + "loss": 0.7057, + "step": 4748 + }, + { + "epoch": 0.71, + "grad_norm": 0.9902374880181716, + "learning_rate": 1.9353307806951762e-06, + "loss": 0.6927, + "step": 4749 + }, + { + "epoch": 0.71, + "grad_norm": 0.8512459901324817, + "learning_rate": 1.935296598924137e-06, + "loss": 0.6758, + "step": 4750 + }, + { + "epoch": 0.71, + "grad_norm": 1.4569899543879241, + "learning_rate": 1.935262408423892e-06, + "loss": 0.6921, + "step": 4751 + }, + { + "epoch": 0.71, + "grad_norm": 4.698130247834169, + "learning_rate": 1.9352282091947607e-06, + "loss": 0.6855, + "step": 4752 + }, + { + "epoch": 0.71, + "grad_norm": 7.271337173325692, + "learning_rate": 1.935194001237061e-06, + "loss": 0.6803, + "step": 4753 + }, + { + "epoch": 0.71, + "grad_norm": 1.9274806634794124, + "learning_rate": 1.935159784551113e-06, + "loss": 0.6758, + "step": 4754 + }, + { + "epoch": 0.71, + "grad_norm": 0.6282085576176926, + "learning_rate": 1.9351255591372363e-06, + "loss": 0.7005, + "step": 4755 + }, + { + "epoch": 0.71, + "grad_norm": 0.8926071223787373, + "learning_rate": 1.93509132499575e-06, + "loss": 0.6914, + "step": 4756 + }, + { + "epoch": 0.71, + "grad_norm": 5.988473113161297, + "learning_rate": 1.935057082126974e-06, + "loss": 0.7012, + "step": 4757 + }, + { + "epoch": 0.71, + "grad_norm": 3.2764787731415606, + "learning_rate": 1.9350228305312274e-06, + "loss": 0.6862, + "step": 4758 + }, + { + "epoch": 0.71, + "grad_norm": 0.44006405003422755, + "learning_rate": 1.9349885702088296e-06, + "loss": 0.6803, + "step": 4759 + }, + { + "epoch": 0.71, + "grad_norm": 0.47164011349076307, + "learning_rate": 1.934954301160101e-06, + "loss": 0.6816, + "step": 4760 + }, + { + "epoch": 0.71, + "grad_norm": 7.5166321132277245, + "learning_rate": 1.9349200233853617e-06, + "loss": 0.6784, + "step": 4761 + }, + { + "epoch": 0.71, + "grad_norm": 2.8731434034551975, + "learning_rate": 1.9348857368849307e-06, + "loss": 0.6777, + "step": 4762 + }, + { + "epoch": 0.71, + "grad_norm": 3.6945732966507476, + "learning_rate": 1.9348514416591282e-06, + "loss": 0.6947, + "step": 4763 + }, + { + "epoch": 0.71, + "grad_norm": 1.3380508464297933, + "learning_rate": 1.934817137708275e-06, + "loss": 0.6764, + "step": 4764 + }, + { + "epoch": 0.71, + "grad_norm": 3.5748463426479633, + "learning_rate": 1.9347828250326903e-06, + "loss": 0.6699, + "step": 4765 + }, + { + "epoch": 0.71, + "grad_norm": 0.8175450690724022, + "learning_rate": 1.9347485036326952e-06, + "loss": 0.6875, + "step": 4766 + }, + { + "epoch": 0.71, + "grad_norm": 2.733459672202913, + "learning_rate": 1.9347141735086098e-06, + "loss": 0.6673, + "step": 4767 + }, + { + "epoch": 0.71, + "grad_norm": 1.1405375170040082, + "learning_rate": 1.934679834660754e-06, + "loss": 0.6829, + "step": 4768 + }, + { + "epoch": 0.71, + "grad_norm": 6.8076122851726115, + "learning_rate": 1.934645487089449e-06, + "loss": 0.6999, + "step": 4769 + }, + { + "epoch": 0.71, + "grad_norm": 1.149052122327441, + "learning_rate": 1.9346111307950145e-06, + "loss": 0.6771, + "step": 4770 + }, + { + "epoch": 0.71, + "grad_norm": 1.4210328857023806, + "learning_rate": 1.934576765777772e-06, + "loss": 0.6875, + "step": 4771 + }, + { + "epoch": 0.71, + "grad_norm": 0.6824997095602688, + "learning_rate": 1.934542392038042e-06, + "loss": 0.6706, + "step": 4772 + }, + { + "epoch": 0.71, + "grad_norm": 1.3553910188688991, + "learning_rate": 1.9345080095761446e-06, + "loss": 0.6836, + "step": 4773 + }, + { + "epoch": 0.71, + "grad_norm": 4.036396851781547, + "learning_rate": 1.934473618392402e-06, + "loss": 0.6751, + "step": 4774 + }, + { + "epoch": 0.71, + "grad_norm": 4.60146947142649, + "learning_rate": 1.9344392184871344e-06, + "loss": 0.6868, + "step": 4775 + }, + { + "epoch": 0.71, + "grad_norm": 2.7828410158209875, + "learning_rate": 1.9344048098606626e-06, + "loss": 0.6667, + "step": 4776 + }, + { + "epoch": 0.71, + "grad_norm": 0.6122713179931263, + "learning_rate": 1.9343703925133086e-06, + "loss": 0.6836, + "step": 4777 + }, + { + "epoch": 0.71, + "grad_norm": 4.1803457236847965, + "learning_rate": 1.9343359664453924e-06, + "loss": 0.6908, + "step": 4778 + }, + { + "epoch": 0.71, + "grad_norm": 2.3284736049581793, + "learning_rate": 1.9343015316572366e-06, + "loss": 0.7044, + "step": 4779 + }, + { + "epoch": 0.71, + "grad_norm": 7.507894860967619, + "learning_rate": 1.9342670881491615e-06, + "loss": 0.679, + "step": 4780 + }, + { + "epoch": 0.71, + "grad_norm": 1.777319125047473, + "learning_rate": 1.9342326359214892e-06, + "loss": 0.6647, + "step": 4781 + }, + { + "epoch": 0.71, + "grad_norm": 2.6333920618551887, + "learning_rate": 1.9341981749745415e-06, + "loss": 0.7025, + "step": 4782 + }, + { + "epoch": 0.71, + "grad_norm": 6.659029691386153, + "learning_rate": 1.934163705308639e-06, + "loss": 0.6855, + "step": 4783 + }, + { + "epoch": 0.71, + "grad_norm": 1.1478235497831506, + "learning_rate": 1.9341292269241042e-06, + "loss": 0.6699, + "step": 4784 + }, + { + "epoch": 0.71, + "grad_norm": 2.34493244920275, + "learning_rate": 1.934094739821259e-06, + "loss": 0.6452, + "step": 4785 + }, + { + "epoch": 0.71, + "grad_norm": 1.08735303422741, + "learning_rate": 1.934060244000425e-06, + "loss": 0.6432, + "step": 4786 + }, + { + "epoch": 0.71, + "grad_norm": 2.940494644845843, + "learning_rate": 1.934025739461924e-06, + "loss": 0.6523, + "step": 4787 + }, + { + "epoch": 0.71, + "grad_norm": 2.504691662797854, + "learning_rate": 1.9339912262060782e-06, + "loss": 0.6621, + "step": 4788 + }, + { + "epoch": 0.71, + "grad_norm": 2.478116278441096, + "learning_rate": 1.933956704233209e-06, + "loss": 0.6901, + "step": 4789 + }, + { + "epoch": 0.71, + "grad_norm": 2.203140323880074, + "learning_rate": 1.9339221735436402e-06, + "loss": 0.6712, + "step": 4790 + }, + { + "epoch": 0.71, + "grad_norm": 2.0228433958510115, + "learning_rate": 1.933887634137693e-06, + "loss": 0.6712, + "step": 4791 + }, + { + "epoch": 0.71, + "grad_norm": 0.9935846318528497, + "learning_rate": 1.9338530860156897e-06, + "loss": 0.6719, + "step": 4792 + }, + { + "epoch": 0.71, + "grad_norm": 7.842351267554437, + "learning_rate": 1.933818529177953e-06, + "loss": 0.6699, + "step": 4793 + }, + { + "epoch": 0.71, + "grad_norm": 5.318055206513973, + "learning_rate": 1.9337839636248054e-06, + "loss": 0.7129, + "step": 4794 + }, + { + "epoch": 0.72, + "grad_norm": 2.44835555780951, + "learning_rate": 1.9337493893565692e-06, + "loss": 0.7168, + "step": 4795 + }, + { + "epoch": 0.72, + "grad_norm": 5.118500488542466, + "learning_rate": 1.933714806373568e-06, + "loss": 0.6771, + "step": 4796 + }, + { + "epoch": 0.72, + "grad_norm": 4.011353402884152, + "learning_rate": 1.933680214676123e-06, + "loss": 0.679, + "step": 4797 + }, + { + "epoch": 0.72, + "grad_norm": 1.0582177162501545, + "learning_rate": 1.933645614264559e-06, + "loss": 0.6849, + "step": 4798 + }, + { + "epoch": 0.72, + "grad_norm": 2.0213217637991154, + "learning_rate": 1.9336110051391976e-06, + "loss": 0.679, + "step": 4799 + }, + { + "epoch": 0.72, + "grad_norm": 2.683027723790547, + "learning_rate": 1.9335763873003617e-06, + "loss": 0.6686, + "step": 4800 + }, + { + "epoch": 0.72, + "grad_norm": 0.5804456009816411, + "learning_rate": 1.9335417607483757e-06, + "loss": 0.6777, + "step": 4801 + }, + { + "epoch": 0.72, + "grad_norm": 2.018551991431303, + "learning_rate": 1.933507125483561e-06, + "loss": 0.6927, + "step": 4802 + }, + { + "epoch": 0.72, + "grad_norm": 4.67658878308912, + "learning_rate": 1.933472481506242e-06, + "loss": 0.6654, + "step": 4803 + }, + { + "epoch": 0.72, + "grad_norm": 5.605523441799136, + "learning_rate": 1.933437828816742e-06, + "loss": 0.6797, + "step": 4804 + }, + { + "epoch": 0.72, + "grad_norm": 0.879618411392443, + "learning_rate": 1.9334031674153843e-06, + "loss": 0.6823, + "step": 4805 + }, + { + "epoch": 0.72, + "grad_norm": 1.0593024664615671, + "learning_rate": 1.933368497302492e-06, + "loss": 0.6979, + "step": 4806 + }, + { + "epoch": 0.72, + "grad_norm": 7.09109944768797, + "learning_rate": 1.933333818478389e-06, + "loss": 0.7116, + "step": 4807 + }, + { + "epoch": 0.72, + "grad_norm": 0.7299671790988922, + "learning_rate": 1.933299130943399e-06, + "loss": 0.6777, + "step": 4808 + }, + { + "epoch": 0.72, + "grad_norm": 3.8288869813274156, + "learning_rate": 1.9332644346978456e-06, + "loss": 0.7103, + "step": 4809 + }, + { + "epoch": 0.72, + "grad_norm": 0.9641520504813703, + "learning_rate": 1.9332297297420527e-06, + "loss": 0.7077, + "step": 4810 + }, + { + "epoch": 0.72, + "grad_norm": 1.2114051706880922, + "learning_rate": 1.933195016076344e-06, + "loss": 0.6855, + "step": 4811 + }, + { + "epoch": 0.72, + "grad_norm": 4.778711965223833, + "learning_rate": 1.933160293701044e-06, + "loss": 0.6686, + "step": 4812 + }, + { + "epoch": 0.72, + "grad_norm": 6.526955233166704, + "learning_rate": 1.9331255626164764e-06, + "loss": 0.6986, + "step": 4813 + }, + { + "epoch": 0.72, + "grad_norm": 0.9041673651011326, + "learning_rate": 1.9330908228229655e-06, + "loss": 0.7012, + "step": 4814 + }, + { + "epoch": 0.72, + "grad_norm": 2.2597785956342897, + "learning_rate": 1.933056074320835e-06, + "loss": 0.6719, + "step": 4815 + }, + { + "epoch": 0.72, + "grad_norm": 3.1500662853532466, + "learning_rate": 1.93302131711041e-06, + "loss": 0.679, + "step": 4816 + }, + { + "epoch": 0.72, + "grad_norm": 4.302249606942449, + "learning_rate": 1.9329865511920147e-06, + "loss": 0.6706, + "step": 4817 + }, + { + "epoch": 0.72, + "grad_norm": 6.364260913368708, + "learning_rate": 1.932951776565973e-06, + "loss": 0.6803, + "step": 4818 + }, + { + "epoch": 0.72, + "grad_norm": 0.703383425462866, + "learning_rate": 1.9329169932326104e-06, + "loss": 0.6888, + "step": 4819 + }, + { + "epoch": 0.72, + "grad_norm": 5.661113455527881, + "learning_rate": 1.9328822011922505e-06, + "loss": 0.7012, + "step": 4820 + }, + { + "epoch": 0.72, + "grad_norm": 2.9871203670481905, + "learning_rate": 1.9328474004452182e-06, + "loss": 0.6882, + "step": 4821 + }, + { + "epoch": 0.72, + "grad_norm": 1.1914845884041976, + "learning_rate": 1.9328125909918397e-06, + "loss": 0.6751, + "step": 4822 + }, + { + "epoch": 0.72, + "grad_norm": 1.4761895412445973, + "learning_rate": 1.932777772832438e-06, + "loss": 0.681, + "step": 4823 + }, + { + "epoch": 0.72, + "grad_norm": 2.9710654787820716, + "learning_rate": 1.932742945967339e-06, + "loss": 0.6862, + "step": 4824 + }, + { + "epoch": 0.72, + "grad_norm": 4.097237621990895, + "learning_rate": 1.9327081103968677e-06, + "loss": 0.6816, + "step": 4825 + }, + { + "epoch": 0.72, + "grad_norm": 1.97096905301523, + "learning_rate": 1.932673266121349e-06, + "loss": 0.6745, + "step": 4826 + }, + { + "epoch": 0.72, + "grad_norm": 8.282163856823253, + "learning_rate": 1.9326384131411083e-06, + "loss": 0.6947, + "step": 4827 + }, + { + "epoch": 0.72, + "grad_norm": 2.8740956642757367, + "learning_rate": 1.932603551456471e-06, + "loss": 0.6999, + "step": 4828 + }, + { + "epoch": 0.72, + "grad_norm": 1.3149310785724895, + "learning_rate": 1.932568681067762e-06, + "loss": 0.6868, + "step": 4829 + }, + { + "epoch": 0.72, + "grad_norm": 2.5379296576069397, + "learning_rate": 1.9325338019753066e-06, + "loss": 0.6849, + "step": 4830 + }, + { + "epoch": 0.72, + "grad_norm": 2.4095332363379685, + "learning_rate": 1.9324989141794314e-06, + "loss": 0.6712, + "step": 4831 + }, + { + "epoch": 0.72, + "grad_norm": 0.6025840244580812, + "learning_rate": 1.9324640176804613e-06, + "loss": 0.6908, + "step": 4832 + }, + { + "epoch": 0.72, + "grad_norm": 1.0574032954461843, + "learning_rate": 1.9324291124787223e-06, + "loss": 0.6823, + "step": 4833 + }, + { + "epoch": 0.72, + "grad_norm": 2.8287160340356876, + "learning_rate": 1.9323941985745393e-06, + "loss": 0.6999, + "step": 4834 + }, + { + "epoch": 0.72, + "grad_norm": 0.9647063406388364, + "learning_rate": 1.9323592759682392e-06, + "loss": 0.6816, + "step": 4835 + }, + { + "epoch": 0.72, + "grad_norm": 4.3024598600067145, + "learning_rate": 1.9323243446601473e-06, + "loss": 0.6999, + "step": 4836 + }, + { + "epoch": 0.72, + "grad_norm": 4.683719107656228, + "learning_rate": 1.93228940465059e-06, + "loss": 0.7012, + "step": 4837 + }, + { + "epoch": 0.72, + "grad_norm": 1.4945575073555126, + "learning_rate": 1.9322544559398933e-06, + "loss": 0.7083, + "step": 4838 + }, + { + "epoch": 0.72, + "grad_norm": 2.071414703426855, + "learning_rate": 1.932219498528383e-06, + "loss": 0.696, + "step": 4839 + }, + { + "epoch": 0.72, + "grad_norm": 0.5837215785761619, + "learning_rate": 1.9321845324163863e-06, + "loss": 0.6829, + "step": 4840 + }, + { + "epoch": 0.72, + "grad_norm": 6.492044713642034, + "learning_rate": 1.932149557604228e-06, + "loss": 0.6927, + "step": 4841 + }, + { + "epoch": 0.72, + "grad_norm": 1.226776791423348, + "learning_rate": 1.932114574092236e-06, + "loss": 0.6745, + "step": 4842 + }, + { + "epoch": 0.72, + "grad_norm": 3.3535013555897124, + "learning_rate": 1.932079581880737e-06, + "loss": 0.6914, + "step": 4843 + }, + { + "epoch": 0.72, + "grad_norm": 0.7618557491594706, + "learning_rate": 1.932044580970056e-06, + "loss": 0.6823, + "step": 4844 + }, + { + "epoch": 0.72, + "grad_norm": 1.768398068886179, + "learning_rate": 1.93200957136052e-06, + "loss": 0.6849, + "step": 4845 + }, + { + "epoch": 0.72, + "grad_norm": 1.900467271572317, + "learning_rate": 1.931974553052457e-06, + "loss": 0.6862, + "step": 4846 + }, + { + "epoch": 0.72, + "grad_norm": 6.178028141489832, + "learning_rate": 1.9319395260461926e-06, + "loss": 0.6947, + "step": 4847 + }, + { + "epoch": 0.72, + "grad_norm": 2.1506840716215048, + "learning_rate": 1.931904490342055e-06, + "loss": 0.6934, + "step": 4848 + }, + { + "epoch": 0.72, + "grad_norm": 5.593564323434981, + "learning_rate": 1.9318694459403697e-06, + "loss": 0.7038, + "step": 4849 + }, + { + "epoch": 0.72, + "grad_norm": 0.5813161853309251, + "learning_rate": 1.9318343928414642e-06, + "loss": 0.6569, + "step": 4850 + }, + { + "epoch": 0.72, + "grad_norm": 1.4045850487523526, + "learning_rate": 1.9317993310456666e-06, + "loss": 0.6764, + "step": 4851 + }, + { + "epoch": 0.72, + "grad_norm": 4.111368666135262, + "learning_rate": 1.931764260553303e-06, + "loss": 0.6725, + "step": 4852 + }, + { + "epoch": 0.72, + "grad_norm": 0.44293673332907635, + "learning_rate": 1.9317291813647014e-06, + "loss": 0.6823, + "step": 4853 + }, + { + "epoch": 0.72, + "grad_norm": 8.444094241288651, + "learning_rate": 1.931694093480189e-06, + "loss": 0.696, + "step": 4854 + }, + { + "epoch": 0.72, + "grad_norm": 1.3410750405202139, + "learning_rate": 1.9316589969000928e-06, + "loss": 0.6973, + "step": 4855 + }, + { + "epoch": 0.72, + "grad_norm": 3.514422924009261, + "learning_rate": 1.9316238916247412e-06, + "loss": 0.666, + "step": 4856 + }, + { + "epoch": 0.72, + "grad_norm": 7.711712311216047, + "learning_rate": 1.9315887776544614e-06, + "loss": 0.681, + "step": 4857 + }, + { + "epoch": 0.72, + "grad_norm": 4.192810095349653, + "learning_rate": 1.9315536549895806e-06, + "loss": 0.6849, + "step": 4858 + }, + { + "epoch": 0.72, + "grad_norm": 1.094871938715038, + "learning_rate": 1.931518523630428e-06, + "loss": 0.6764, + "step": 4859 + }, + { + "epoch": 0.72, + "grad_norm": 2.9763434301952265, + "learning_rate": 1.9314833835773305e-06, + "loss": 0.6797, + "step": 4860 + }, + { + "epoch": 0.72, + "grad_norm": 0.5823219461355453, + "learning_rate": 1.9314482348306154e-06, + "loss": 0.6921, + "step": 4861 + }, + { + "epoch": 0.73, + "grad_norm": 7.667478039476154, + "learning_rate": 1.9314130773906124e-06, + "loss": 0.6693, + "step": 4862 + }, + { + "epoch": 0.73, + "grad_norm": 7.192927064703819, + "learning_rate": 1.9313779112576482e-06, + "loss": 0.6784, + "step": 4863 + }, + { + "epoch": 0.73, + "grad_norm": 0.7843012355411408, + "learning_rate": 1.931342736432052e-06, + "loss": 0.681, + "step": 4864 + }, + { + "epoch": 0.73, + "grad_norm": 1.3602339497066105, + "learning_rate": 1.9313075529141517e-06, + "loss": 0.6725, + "step": 4865 + }, + { + "epoch": 0.73, + "grad_norm": 2.3795427291566726, + "learning_rate": 1.9312723607042754e-06, + "loss": 0.6647, + "step": 4866 + }, + { + "epoch": 0.73, + "grad_norm": 0.8602730480372975, + "learning_rate": 1.931237159802752e-06, + "loss": 0.7044, + "step": 4867 + }, + { + "epoch": 0.73, + "grad_norm": 1.458576514069478, + "learning_rate": 1.9312019502099095e-06, + "loss": 0.681, + "step": 4868 + }, + { + "epoch": 0.73, + "grad_norm": 3.020187800305548, + "learning_rate": 1.931166731926077e-06, + "loss": 0.6842, + "step": 4869 + }, + { + "epoch": 0.73, + "grad_norm": 6.509785113024289, + "learning_rate": 1.9311315049515835e-06, + "loss": 0.7057, + "step": 4870 + }, + { + "epoch": 0.73, + "grad_norm": 1.466447460738513, + "learning_rate": 1.9310962692867566e-06, + "loss": 0.679, + "step": 4871 + }, + { + "epoch": 0.73, + "grad_norm": 0.889253449165543, + "learning_rate": 1.9310610249319265e-06, + "loss": 0.7057, + "step": 4872 + }, + { + "epoch": 0.73, + "grad_norm": 1.0701796699400525, + "learning_rate": 1.931025771887421e-06, + "loss": 0.6842, + "step": 4873 + }, + { + "epoch": 0.73, + "grad_norm": 0.5272820838147309, + "learning_rate": 1.93099051015357e-06, + "loss": 0.6732, + "step": 4874 + }, + { + "epoch": 0.73, + "grad_norm": 6.803979473935386, + "learning_rate": 1.930955239730702e-06, + "loss": 0.6992, + "step": 4875 + }, + { + "epoch": 0.73, + "grad_norm": 0.7469564985082762, + "learning_rate": 1.9309199606191468e-06, + "loss": 0.6849, + "step": 4876 + }, + { + "epoch": 0.73, + "grad_norm": 1.3342868974996094, + "learning_rate": 1.930884672819233e-06, + "loss": 0.681, + "step": 4877 + }, + { + "epoch": 0.73, + "grad_norm": 2.552625904880203, + "learning_rate": 1.93084937633129e-06, + "loss": 0.7188, + "step": 4878 + }, + { + "epoch": 0.73, + "grad_norm": 1.7833128443550443, + "learning_rate": 1.930814071155648e-06, + "loss": 0.7044, + "step": 4879 + }, + { + "epoch": 0.73, + "grad_norm": 0.6472803861643478, + "learning_rate": 1.930778757292635e-06, + "loss": 0.7038, + "step": 4880 + }, + { + "epoch": 0.73, + "grad_norm": 2.20181805763803, + "learning_rate": 1.9307434347425826e-06, + "loss": 0.6745, + "step": 4881 + }, + { + "epoch": 0.73, + "grad_norm": 1.181166007401924, + "learning_rate": 1.9307081035058187e-06, + "loss": 0.6947, + "step": 4882 + }, + { + "epoch": 0.73, + "grad_norm": 3.8674981130133568, + "learning_rate": 1.9306727635826744e-06, + "loss": 0.6921, + "step": 4883 + }, + { + "epoch": 0.73, + "grad_norm": 0.6744968433119748, + "learning_rate": 1.930637414973478e-06, + "loss": 0.6732, + "step": 4884 + }, + { + "epoch": 0.73, + "grad_norm": 4.908725200891529, + "learning_rate": 1.9306020576785614e-06, + "loss": 0.6771, + "step": 4885 + }, + { + "epoch": 0.73, + "grad_norm": 1.9640514356983547, + "learning_rate": 1.9305666916982526e-06, + "loss": 0.6699, + "step": 4886 + }, + { + "epoch": 0.73, + "grad_norm": 1.0227575086083238, + "learning_rate": 1.930531317032883e-06, + "loss": 0.6764, + "step": 4887 + }, + { + "epoch": 0.73, + "grad_norm": 5.502954997791241, + "learning_rate": 1.930495933682782e-06, + "loss": 0.694, + "step": 4888 + }, + { + "epoch": 0.73, + "grad_norm": 1.4755718944077802, + "learning_rate": 1.930460541648281e-06, + "loss": 0.6797, + "step": 4889 + }, + { + "epoch": 0.73, + "grad_norm": 1.8979654587446695, + "learning_rate": 1.9304251409297083e-06, + "loss": 0.6745, + "step": 4890 + }, + { + "epoch": 0.73, + "grad_norm": 5.409270328869719, + "learning_rate": 1.930389731527396e-06, + "loss": 0.6849, + "step": 4891 + }, + { + "epoch": 0.73, + "grad_norm": 1.5710711223811027, + "learning_rate": 1.9303543134416743e-06, + "loss": 0.668, + "step": 4892 + }, + { + "epoch": 0.73, + "grad_norm": 0.41211547365088724, + "learning_rate": 1.9303188866728733e-06, + "loss": 0.6875, + "step": 4893 + }, + { + "epoch": 0.73, + "grad_norm": 1.1757427441753383, + "learning_rate": 1.930283451221324e-06, + "loss": 0.6751, + "step": 4894 + }, + { + "epoch": 0.73, + "grad_norm": 4.988298893754278, + "learning_rate": 1.930248007087357e-06, + "loss": 0.6823, + "step": 4895 + }, + { + "epoch": 0.73, + "grad_norm": 4.151908656105882, + "learning_rate": 1.930212554271303e-06, + "loss": 0.6751, + "step": 4896 + }, + { + "epoch": 0.73, + "grad_norm": 6.727377806565963, + "learning_rate": 1.9301770927734928e-06, + "loss": 0.6777, + "step": 4897 + }, + { + "epoch": 0.73, + "grad_norm": 1.0191023382990247, + "learning_rate": 1.9301416225942575e-06, + "loss": 0.6882, + "step": 4898 + }, + { + "epoch": 0.73, + "grad_norm": 2.896541971508066, + "learning_rate": 1.9301061437339285e-06, + "loss": 0.6745, + "step": 4899 + }, + { + "epoch": 0.73, + "grad_norm": 5.046957729426741, + "learning_rate": 1.9300706561928366e-06, + "loss": 0.6895, + "step": 4900 + }, + { + "epoch": 0.73, + "grad_norm": 4.295547510333853, + "learning_rate": 1.930035159971313e-06, + "loss": 0.6615, + "step": 4901 + }, + { + "epoch": 0.73, + "grad_norm": 1.6533316818745891, + "learning_rate": 1.9299996550696886e-06, + "loss": 0.6751, + "step": 4902 + }, + { + "epoch": 0.73, + "grad_norm": 0.5599859899418057, + "learning_rate": 1.9299641414882954e-06, + "loss": 0.6764, + "step": 4903 + }, + { + "epoch": 0.73, + "grad_norm": 4.313150212360428, + "learning_rate": 1.9299286192274644e-06, + "loss": 0.679, + "step": 4904 + }, + { + "epoch": 0.73, + "grad_norm": 2.7313283601224674, + "learning_rate": 1.9298930882875277e-06, + "loss": 0.653, + "step": 4905 + }, + { + "epoch": 0.73, + "grad_norm": 3.955871965122361, + "learning_rate": 1.9298575486688165e-06, + "loss": 0.6758, + "step": 4906 + }, + { + "epoch": 0.73, + "grad_norm": 3.529941413050044, + "learning_rate": 1.9298220003716627e-06, + "loss": 0.6751, + "step": 4907 + }, + { + "epoch": 0.73, + "grad_norm": 2.5498294659664578, + "learning_rate": 1.9297864433963976e-06, + "loss": 0.668, + "step": 4908 + }, + { + "epoch": 0.73, + "grad_norm": 1.0081631105768638, + "learning_rate": 1.9297508777433537e-06, + "loss": 0.6634, + "step": 4909 + }, + { + "epoch": 0.73, + "grad_norm": 4.858559082031405, + "learning_rate": 1.9297153034128626e-06, + "loss": 0.6914, + "step": 4910 + }, + { + "epoch": 0.73, + "grad_norm": 2.5780912093885147, + "learning_rate": 1.929679720405256e-06, + "loss": 0.6927, + "step": 4911 + }, + { + "epoch": 0.73, + "grad_norm": 3.3093655424778645, + "learning_rate": 1.929644128720867e-06, + "loss": 0.6836, + "step": 4912 + }, + { + "epoch": 0.73, + "grad_norm": 1.0937248099823926, + "learning_rate": 1.929608528360027e-06, + "loss": 0.6764, + "step": 4913 + }, + { + "epoch": 0.73, + "grad_norm": 1.714855652708421, + "learning_rate": 1.929572919323068e-06, + "loss": 0.7116, + "step": 4914 + }, + { + "epoch": 0.73, + "grad_norm": 2.922132561602789, + "learning_rate": 1.929537301610323e-06, + "loss": 0.6803, + "step": 4915 + }, + { + "epoch": 0.73, + "grad_norm": 0.7503868014149488, + "learning_rate": 1.929501675222124e-06, + "loss": 0.6602, + "step": 4916 + }, + { + "epoch": 0.73, + "grad_norm": 0.9095548547436259, + "learning_rate": 1.9294660401588034e-06, + "loss": 0.6823, + "step": 4917 + }, + { + "epoch": 0.73, + "grad_norm": 2.728892838187432, + "learning_rate": 1.9294303964206944e-06, + "loss": 0.6947, + "step": 4918 + }, + { + "epoch": 0.73, + "grad_norm": 3.60910156058529, + "learning_rate": 1.9293947440081295e-06, + "loss": 0.6888, + "step": 4919 + }, + { + "epoch": 0.73, + "grad_norm": 3.188943512551134, + "learning_rate": 1.929359082921441e-06, + "loss": 0.6523, + "step": 4920 + }, + { + "epoch": 0.73, + "grad_norm": 0.7560486432045961, + "learning_rate": 1.929323413160962e-06, + "loss": 0.6602, + "step": 4921 + }, + { + "epoch": 0.73, + "grad_norm": 1.7771123668653983, + "learning_rate": 1.9292877347270254e-06, + "loss": 0.6777, + "step": 4922 + }, + { + "epoch": 0.73, + "grad_norm": 0.7825665282424902, + "learning_rate": 1.9292520476199643e-06, + "loss": 0.7038, + "step": 4923 + }, + { + "epoch": 0.73, + "grad_norm": 7.371672901896353, + "learning_rate": 1.9292163518401116e-06, + "loss": 0.6738, + "step": 4924 + }, + { + "epoch": 0.73, + "grad_norm": 2.3571344162193255, + "learning_rate": 1.9291806473878004e-06, + "loss": 0.6745, + "step": 4925 + }, + { + "epoch": 0.73, + "grad_norm": 4.691561697207337, + "learning_rate": 1.929144934263364e-06, + "loss": 0.6543, + "step": 4926 + }, + { + "epoch": 0.73, + "grad_norm": 6.346419882685644, + "learning_rate": 1.9291092124671364e-06, + "loss": 0.6777, + "step": 4927 + }, + { + "epoch": 0.73, + "grad_norm": 1.3210312706564717, + "learning_rate": 1.9290734819994495e-06, + "loss": 0.6693, + "step": 4928 + }, + { + "epoch": 0.74, + "grad_norm": 1.3039413845740808, + "learning_rate": 1.929037742860638e-06, + "loss": 0.6719, + "step": 4929 + }, + { + "epoch": 0.74, + "grad_norm": 1.1881783982580865, + "learning_rate": 1.9290019950510352e-06, + "loss": 0.6986, + "step": 4930 + }, + { + "epoch": 0.74, + "grad_norm": 2.23541688811473, + "learning_rate": 1.9289662385709745e-06, + "loss": 0.6823, + "step": 4931 + }, + { + "epoch": 0.74, + "grad_norm": 0.7114828035702128, + "learning_rate": 1.9289304734207896e-06, + "loss": 0.6706, + "step": 4932 + }, + { + "epoch": 0.74, + "grad_norm": 1.5801684420637143, + "learning_rate": 1.9288946996008147e-06, + "loss": 0.6908, + "step": 4933 + }, + { + "epoch": 0.74, + "grad_norm": 5.190670487336772, + "learning_rate": 1.928858917111383e-06, + "loss": 0.6693, + "step": 4934 + }, + { + "epoch": 0.74, + "grad_norm": 2.77416365660095, + "learning_rate": 1.9288231259528295e-06, + "loss": 0.6667, + "step": 4935 + }, + { + "epoch": 0.74, + "grad_norm": 4.07134536937542, + "learning_rate": 1.928787326125487e-06, + "loss": 0.6654, + "step": 4936 + }, + { + "epoch": 0.74, + "grad_norm": 1.3966395819139361, + "learning_rate": 1.9287515176296905e-06, + "loss": 0.6758, + "step": 4937 + }, + { + "epoch": 0.74, + "grad_norm": 2.7090847938867517, + "learning_rate": 1.928715700465774e-06, + "loss": 0.6842, + "step": 4938 + }, + { + "epoch": 0.74, + "grad_norm": 1.1289329657048472, + "learning_rate": 1.928679874634072e-06, + "loss": 0.6719, + "step": 4939 + }, + { + "epoch": 0.74, + "grad_norm": 1.837610139356017, + "learning_rate": 1.9286440401349185e-06, + "loss": 0.6849, + "step": 4940 + }, + { + "epoch": 0.74, + "grad_norm": 1.1912708142563047, + "learning_rate": 1.928608196968648e-06, + "loss": 0.7044, + "step": 4941 + }, + { + "epoch": 0.74, + "grad_norm": 2.817560046085256, + "learning_rate": 1.9285723451355947e-06, + "loss": 0.7044, + "step": 4942 + }, + { + "epoch": 0.74, + "grad_norm": 3.225185842434117, + "learning_rate": 1.9285364846360943e-06, + "loss": 0.6823, + "step": 4943 + }, + { + "epoch": 0.74, + "grad_norm": 8.116987180930424, + "learning_rate": 1.92850061547048e-06, + "loss": 0.6934, + "step": 4944 + }, + { + "epoch": 0.74, + "grad_norm": 4.964040430071127, + "learning_rate": 1.9284647376390878e-06, + "loss": 0.7135, + "step": 4945 + }, + { + "epoch": 0.74, + "grad_norm": 4.685634485379869, + "learning_rate": 1.9284288511422518e-06, + "loss": 0.7174, + "step": 4946 + }, + { + "epoch": 0.74, + "grad_norm": 3.087043876424659, + "learning_rate": 1.9283929559803072e-06, + "loss": 0.6882, + "step": 4947 + }, + { + "epoch": 0.74, + "grad_norm": 3.573703830216191, + "learning_rate": 1.9283570521535896e-06, + "loss": 0.6999, + "step": 4948 + }, + { + "epoch": 0.74, + "grad_norm": 8.363711225073967, + "learning_rate": 1.928321139662433e-06, + "loss": 0.6751, + "step": 4949 + }, + { + "epoch": 0.74, + "grad_norm": 1.082334204755485, + "learning_rate": 1.9282852185071735e-06, + "loss": 0.6849, + "step": 4950 + }, + { + "epoch": 0.74, + "grad_norm": 0.671989980258979, + "learning_rate": 1.9282492886881453e-06, + "loss": 0.6693, + "step": 4951 + }, + { + "epoch": 0.74, + "grad_norm": 5.965944427552028, + "learning_rate": 1.928213350205685e-06, + "loss": 0.6953, + "step": 4952 + }, + { + "epoch": 0.74, + "grad_norm": 7.389377937417062, + "learning_rate": 1.9281774030601274e-06, + "loss": 0.6966, + "step": 4953 + }, + { + "epoch": 0.74, + "grad_norm": 6.20933766820728, + "learning_rate": 1.9281414472518076e-06, + "loss": 0.6921, + "step": 4954 + }, + { + "epoch": 0.74, + "grad_norm": 13.795159031252245, + "learning_rate": 1.928105482781062e-06, + "loss": 0.7201, + "step": 4955 + }, + { + "epoch": 0.74, + "grad_norm": 1.5232960396566528, + "learning_rate": 1.9280695096482255e-06, + "loss": 0.6849, + "step": 4956 + }, + { + "epoch": 0.74, + "grad_norm": 2.8629527247831907, + "learning_rate": 1.9280335278536345e-06, + "loss": 0.6797, + "step": 4957 + }, + { + "epoch": 0.74, + "grad_norm": 1.427192450883182, + "learning_rate": 1.9279975373976247e-06, + "loss": 0.666, + "step": 4958 + }, + { + "epoch": 0.74, + "grad_norm": 3.804231897328308, + "learning_rate": 1.9279615382805314e-06, + "loss": 0.6868, + "step": 4959 + }, + { + "epoch": 0.74, + "grad_norm": 1.701918667055409, + "learning_rate": 1.927925530502691e-06, + "loss": 0.6758, + "step": 4960 + }, + { + "epoch": 0.74, + "grad_norm": 2.248470440495258, + "learning_rate": 1.92788951406444e-06, + "loss": 0.6777, + "step": 4961 + }, + { + "epoch": 0.74, + "grad_norm": 0.3519338271586586, + "learning_rate": 1.9278534889661135e-06, + "loss": 0.6888, + "step": 4962 + }, + { + "epoch": 0.74, + "grad_norm": 1.4767752717640106, + "learning_rate": 1.9278174552080485e-06, + "loss": 0.6797, + "step": 4963 + }, + { + "epoch": 0.74, + "grad_norm": 2.9453993855064584, + "learning_rate": 1.9277814127905815e-06, + "loss": 0.6712, + "step": 4964 + }, + { + "epoch": 0.74, + "grad_norm": 3.3102769367643474, + "learning_rate": 1.9277453617140482e-06, + "loss": 0.6777, + "step": 4965 + }, + { + "epoch": 0.74, + "grad_norm": 1.8627439582804972, + "learning_rate": 1.9277093019787855e-06, + "loss": 0.6875, + "step": 4966 + }, + { + "epoch": 0.74, + "grad_norm": 2.350142317384139, + "learning_rate": 1.92767323358513e-06, + "loss": 0.6908, + "step": 4967 + }, + { + "epoch": 0.74, + "grad_norm": 2.898591886834909, + "learning_rate": 1.927637156533418e-06, + "loss": 0.6934, + "step": 4968 + }, + { + "epoch": 0.74, + "grad_norm": 5.4887686936169855, + "learning_rate": 1.9276010708239863e-06, + "loss": 0.6914, + "step": 4969 + }, + { + "epoch": 0.74, + "grad_norm": 4.897003739104938, + "learning_rate": 1.9275649764571716e-06, + "loss": 0.6621, + "step": 4970 + }, + { + "epoch": 0.74, + "grad_norm": 1.5992636446537338, + "learning_rate": 1.9275288734333113e-06, + "loss": 0.6745, + "step": 4971 + }, + { + "epoch": 0.74, + "grad_norm": 4.113937454338086, + "learning_rate": 1.927492761752742e-06, + "loss": 0.7031, + "step": 4972 + }, + { + "epoch": 0.74, + "grad_norm": 1.6428630163077593, + "learning_rate": 1.9274566414158008e-06, + "loss": 0.6908, + "step": 4973 + }, + { + "epoch": 0.74, + "grad_norm": 1.1770472780381784, + "learning_rate": 1.9274205124228243e-06, + "loss": 0.6829, + "step": 4974 + }, + { + "epoch": 0.74, + "grad_norm": 7.612131812646875, + "learning_rate": 1.9273843747741507e-06, + "loss": 0.679, + "step": 4975 + }, + { + "epoch": 0.74, + "grad_norm": 0.8995111366233651, + "learning_rate": 1.927348228470116e-06, + "loss": 0.6855, + "step": 4976 + }, + { + "epoch": 0.74, + "grad_norm": 3.300950206224386, + "learning_rate": 1.9273120735110593e-06, + "loss": 0.6771, + "step": 4977 + }, + { + "epoch": 0.74, + "grad_norm": 2.6477723227721746, + "learning_rate": 1.927275909897316e-06, + "loss": 0.6882, + "step": 4978 + }, + { + "epoch": 0.74, + "grad_norm": 2.8413344042524926, + "learning_rate": 1.927239737629225e-06, + "loss": 0.6745, + "step": 4979 + }, + { + "epoch": 0.74, + "grad_norm": 2.7923351059468877, + "learning_rate": 1.9272035567071236e-06, + "loss": 0.6751, + "step": 4980 + }, + { + "epoch": 0.74, + "grad_norm": 1.090852599227851, + "learning_rate": 1.9271673671313497e-06, + "loss": 0.6842, + "step": 4981 + }, + { + "epoch": 0.74, + "grad_norm": 4.3562233634599545, + "learning_rate": 1.92713116890224e-06, + "loss": 0.6745, + "step": 4982 + }, + { + "epoch": 0.74, + "grad_norm": 1.3368601672200886, + "learning_rate": 1.927094962020134e-06, + "loss": 0.6862, + "step": 4983 + }, + { + "epoch": 0.74, + "grad_norm": 1.2958641298507503, + "learning_rate": 1.927058746485368e-06, + "loss": 0.666, + "step": 4984 + }, + { + "epoch": 0.74, + "grad_norm": 3.6941027017982444, + "learning_rate": 1.927022522298281e-06, + "loss": 0.6797, + "step": 4985 + }, + { + "epoch": 0.74, + "grad_norm": 2.8138429971941274, + "learning_rate": 1.926986289459211e-06, + "loss": 0.6862, + "step": 4986 + }, + { + "epoch": 0.74, + "grad_norm": 3.344991240428279, + "learning_rate": 1.926950047968496e-06, + "loss": 0.6999, + "step": 4987 + }, + { + "epoch": 0.74, + "grad_norm": 1.4068046796540017, + "learning_rate": 1.926913797826474e-06, + "loss": 0.6966, + "step": 4988 + }, + { + "epoch": 0.74, + "grad_norm": 0.8760816243735755, + "learning_rate": 1.9268775390334833e-06, + "loss": 0.6699, + "step": 4989 + }, + { + "epoch": 0.74, + "grad_norm": 1.8227795777591125, + "learning_rate": 1.926841271589863e-06, + "loss": 0.7018, + "step": 4990 + }, + { + "epoch": 0.74, + "grad_norm": 5.234012610643305, + "learning_rate": 1.9268049954959513e-06, + "loss": 0.6719, + "step": 4991 + }, + { + "epoch": 0.74, + "grad_norm": 1.475439457551577, + "learning_rate": 1.926768710752086e-06, + "loss": 0.6758, + "step": 4992 + }, + { + "epoch": 0.74, + "grad_norm": 4.497804155510151, + "learning_rate": 1.9267324173586065e-06, + "loss": 0.6882, + "step": 4993 + }, + { + "epoch": 0.74, + "grad_norm": 0.7563075304587514, + "learning_rate": 1.9266961153158515e-06, + "loss": 0.6842, + "step": 4994 + }, + { + "epoch": 0.74, + "grad_norm": 0.6852941566759778, + "learning_rate": 1.92665980462416e-06, + "loss": 0.6517, + "step": 4995 + }, + { + "epoch": 0.75, + "grad_norm": 6.556780286815626, + "learning_rate": 1.9266234852838707e-06, + "loss": 0.7031, + "step": 4996 + }, + { + "epoch": 0.75, + "grad_norm": 5.30182299073212, + "learning_rate": 1.9265871572953216e-06, + "loss": 0.709, + "step": 4997 + }, + { + "epoch": 0.75, + "grad_norm": 0.9225669512296288, + "learning_rate": 1.9265508206588534e-06, + "loss": 0.6641, + "step": 4998 + }, + { + "epoch": 0.75, + "grad_norm": 0.665375040077518, + "learning_rate": 1.9265144753748042e-06, + "loss": 0.6764, + "step": 4999 + }, + { + "epoch": 0.75, + "grad_norm": 2.4677024852500926, + "learning_rate": 1.9264781214435135e-06, + "loss": 0.6784, + "step": 5000 + }, + { + "epoch": 0.75, + "grad_norm": 6.235362007897166, + "learning_rate": 1.9264417588653208e-06, + "loss": 0.696, + "step": 5001 + }, + { + "epoch": 0.75, + "grad_norm": 6.040671178791531, + "learning_rate": 1.926405387640565e-06, + "loss": 0.6803, + "step": 5002 + }, + { + "epoch": 0.75, + "grad_norm": 8.680628162912633, + "learning_rate": 1.9263690077695856e-06, + "loss": 0.6634, + "step": 5003 + }, + { + "epoch": 0.75, + "grad_norm": 6.980207976591543, + "learning_rate": 1.9263326192527224e-06, + "loss": 0.7005, + "step": 5004 + }, + { + "epoch": 0.75, + "grad_norm": 2.178139501553892, + "learning_rate": 1.926296222090315e-06, + "loss": 0.6849, + "step": 5005 + }, + { + "epoch": 0.75, + "grad_norm": 1.4756439748336987, + "learning_rate": 1.926259816282703e-06, + "loss": 0.6693, + "step": 5006 + }, + { + "epoch": 0.75, + "grad_norm": 1.251353429792244, + "learning_rate": 1.9262234018302265e-06, + "loss": 0.6888, + "step": 5007 + }, + { + "epoch": 0.75, + "grad_norm": 1.27773521472061, + "learning_rate": 1.9261869787332246e-06, + "loss": 0.6855, + "step": 5008 + }, + { + "epoch": 0.75, + "grad_norm": 2.4195316758980603, + "learning_rate": 1.926150546992038e-06, + "loss": 0.6751, + "step": 5009 + }, + { + "epoch": 0.75, + "grad_norm": 3.0931471661542207, + "learning_rate": 1.9261141066070065e-06, + "loss": 0.7012, + "step": 5010 + }, + { + "epoch": 0.75, + "grad_norm": 0.8792416090201662, + "learning_rate": 1.92607765757847e-06, + "loss": 0.681, + "step": 5011 + }, + { + "epoch": 0.75, + "grad_norm": 2.885849906607889, + "learning_rate": 1.926041199906769e-06, + "loss": 0.6732, + "step": 5012 + }, + { + "epoch": 0.75, + "grad_norm": 7.997705431581159, + "learning_rate": 1.9260047335922437e-06, + "loss": 0.6895, + "step": 5013 + }, + { + "epoch": 0.75, + "grad_norm": 2.9516609621684893, + "learning_rate": 1.9259682586352337e-06, + "loss": 0.679, + "step": 5014 + }, + { + "epoch": 0.75, + "grad_norm": 7.276436004388301, + "learning_rate": 1.925931775036081e-06, + "loss": 0.679, + "step": 5015 + }, + { + "epoch": 0.75, + "grad_norm": 0.6713646635021844, + "learning_rate": 1.9258952827951243e-06, + "loss": 0.6842, + "step": 5016 + }, + { + "epoch": 0.75, + "grad_norm": 1.0854940687513635, + "learning_rate": 1.9258587819127055e-06, + "loss": 0.6706, + "step": 5017 + }, + { + "epoch": 0.75, + "grad_norm": 4.475552975476553, + "learning_rate": 1.925822272389165e-06, + "loss": 0.6901, + "step": 5018 + }, + { + "epoch": 0.75, + "grad_norm": 1.1594535894303395, + "learning_rate": 1.9257857542248424e-06, + "loss": 0.7018, + "step": 5019 + }, + { + "epoch": 0.75, + "grad_norm": 2.7205634832668975, + "learning_rate": 1.92574922742008e-06, + "loss": 0.6751, + "step": 5020 + }, + { + "epoch": 0.75, + "grad_norm": 2.9518311929724317, + "learning_rate": 1.9257126919752187e-06, + "loss": 0.679, + "step": 5021 + }, + { + "epoch": 0.75, + "grad_norm": 0.9182452715593398, + "learning_rate": 1.9256761478905984e-06, + "loss": 0.6855, + "step": 5022 + }, + { + "epoch": 0.75, + "grad_norm": 1.0697037667747646, + "learning_rate": 1.925639595166561e-06, + "loss": 0.679, + "step": 5023 + }, + { + "epoch": 0.75, + "grad_norm": 1.0920663331525802, + "learning_rate": 1.9256030338034473e-06, + "loss": 0.6712, + "step": 5024 + }, + { + "epoch": 0.75, + "grad_norm": 2.379844244906793, + "learning_rate": 1.9255664638015988e-06, + "loss": 0.707, + "step": 5025 + }, + { + "epoch": 0.75, + "grad_norm": 0.4610782970706937, + "learning_rate": 1.9255298851613567e-06, + "loss": 0.6803, + "step": 5026 + }, + { + "epoch": 0.75, + "grad_norm": 1.7365221308191854, + "learning_rate": 1.925493297883062e-06, + "loss": 0.6738, + "step": 5027 + }, + { + "epoch": 0.75, + "grad_norm": 1.9129392784287338, + "learning_rate": 1.9254567019670567e-06, + "loss": 0.6992, + "step": 5028 + }, + { + "epoch": 0.75, + "grad_norm": 1.3312896297587973, + "learning_rate": 1.9254200974136818e-06, + "loss": 0.6979, + "step": 5029 + }, + { + "epoch": 0.75, + "grad_norm": 7.159286672087882, + "learning_rate": 1.9253834842232796e-06, + "loss": 0.6764, + "step": 5030 + }, + { + "epoch": 0.75, + "grad_norm": 5.078987409089902, + "learning_rate": 1.9253468623961916e-06, + "loss": 0.6784, + "step": 5031 + }, + { + "epoch": 0.75, + "grad_norm": 5.556176306713169, + "learning_rate": 1.9253102319327594e-06, + "loss": 0.6784, + "step": 5032 + }, + { + "epoch": 0.75, + "grad_norm": 0.468821417577688, + "learning_rate": 1.925273592833325e-06, + "loss": 0.6901, + "step": 5033 + }, + { + "epoch": 0.75, + "grad_norm": 0.4087137983407362, + "learning_rate": 1.9252369450982303e-06, + "loss": 0.6842, + "step": 5034 + }, + { + "epoch": 0.75, + "grad_norm": 1.2078590940117218, + "learning_rate": 1.9252002887278173e-06, + "loss": 0.6875, + "step": 5035 + }, + { + "epoch": 0.75, + "grad_norm": 3.180172708686467, + "learning_rate": 1.925163623722428e-06, + "loss": 0.679, + "step": 5036 + }, + { + "epoch": 0.75, + "grad_norm": 2.888708470428105, + "learning_rate": 1.925126950082405e-06, + "loss": 0.6875, + "step": 5037 + }, + { + "epoch": 0.75, + "grad_norm": 0.9487924503393504, + "learning_rate": 1.9250902678080906e-06, + "loss": 0.6686, + "step": 5038 + }, + { + "epoch": 0.75, + "grad_norm": 0.8835830313133762, + "learning_rate": 1.925053576899827e-06, + "loss": 0.6758, + "step": 5039 + }, + { + "epoch": 0.75, + "grad_norm": 1.078781603888221, + "learning_rate": 1.9250168773579565e-06, + "loss": 0.6751, + "step": 5040 + }, + { + "epoch": 0.75, + "grad_norm": 6.390257063566098, + "learning_rate": 1.924980169182821e-06, + "loss": 0.6901, + "step": 5041 + }, + { + "epoch": 0.75, + "grad_norm": 6.891990122582198, + "learning_rate": 1.9249434523747647e-06, + "loss": 0.6895, + "step": 5042 + }, + { + "epoch": 0.75, + "grad_norm": 3.979403857475553, + "learning_rate": 1.9249067269341287e-06, + "loss": 0.6797, + "step": 5043 + }, + { + "epoch": 0.75, + "grad_norm": 7.316251558693802, + "learning_rate": 1.9248699928612563e-06, + "loss": 0.7057, + "step": 5044 + }, + { + "epoch": 0.75, + "grad_norm": 1.2879678296380832, + "learning_rate": 1.924833250156491e-06, + "loss": 0.6686, + "step": 5045 + }, + { + "epoch": 0.75, + "grad_norm": 1.3526386780402346, + "learning_rate": 1.924796498820175e-06, + "loss": 0.6803, + "step": 5046 + }, + { + "epoch": 0.75, + "grad_norm": 2.1373946731790063, + "learning_rate": 1.9247597388526514e-06, + "loss": 0.7044, + "step": 5047 + }, + { + "epoch": 0.75, + "grad_norm": 5.358717341205867, + "learning_rate": 1.9247229702542637e-06, + "loss": 0.6771, + "step": 5048 + }, + { + "epoch": 0.75, + "grad_norm": 3.60547527930621, + "learning_rate": 1.924686193025354e-06, + "loss": 0.6784, + "step": 5049 + }, + { + "epoch": 0.75, + "grad_norm": 3.522107764561483, + "learning_rate": 1.924649407166267e-06, + "loss": 0.6686, + "step": 5050 + }, + { + "epoch": 0.75, + "grad_norm": 5.0058930531065515, + "learning_rate": 1.924612612677345e-06, + "loss": 0.6992, + "step": 5051 + }, + { + "epoch": 0.75, + "grad_norm": 3.017835360158522, + "learning_rate": 1.9245758095589316e-06, + "loss": 0.6888, + "step": 5052 + }, + { + "epoch": 0.75, + "grad_norm": 6.268360649033404, + "learning_rate": 1.924538997811371e-06, + "loss": 0.7168, + "step": 5053 + }, + { + "epoch": 0.75, + "grad_norm": 0.5914305988879397, + "learning_rate": 1.9245021774350058e-06, + "loss": 0.6966, + "step": 5054 + }, + { + "epoch": 0.75, + "grad_norm": 1.7640576706324542, + "learning_rate": 1.9244653484301797e-06, + "loss": 0.6758, + "step": 5055 + }, + { + "epoch": 0.75, + "grad_norm": 0.9374689532352106, + "learning_rate": 1.924428510797237e-06, + "loss": 0.6738, + "step": 5056 + }, + { + "epoch": 0.75, + "grad_norm": 1.5484279591033343, + "learning_rate": 1.9243916645365217e-06, + "loss": 0.6901, + "step": 5057 + }, + { + "epoch": 0.75, + "grad_norm": 0.4922954185377094, + "learning_rate": 1.9243548096483766e-06, + "loss": 0.6888, + "step": 5058 + }, + { + "epoch": 0.75, + "grad_norm": 0.9842899301560162, + "learning_rate": 1.924317946133147e-06, + "loss": 0.6842, + "step": 5059 + }, + { + "epoch": 0.75, + "grad_norm": 4.132164335608764, + "learning_rate": 1.9242810739911753e-06, + "loss": 0.6875, + "step": 5060 + }, + { + "epoch": 0.75, + "grad_norm": 1.2676773428143349, + "learning_rate": 1.924244193222807e-06, + "loss": 0.6875, + "step": 5061 + }, + { + "epoch": 0.75, + "grad_norm": 0.4363286709270197, + "learning_rate": 1.9242073038283864e-06, + "loss": 0.6855, + "step": 5062 + }, + { + "epoch": 0.76, + "grad_norm": 9.66501636383612, + "learning_rate": 1.924170405808257e-06, + "loss": 0.709, + "step": 5063 + }, + { + "epoch": 0.76, + "grad_norm": 0.9759044912108429, + "learning_rate": 1.9241334991627633e-06, + "loss": 0.6882, + "step": 5064 + }, + { + "epoch": 0.76, + "grad_norm": 3.927849892276194, + "learning_rate": 1.92409658389225e-06, + "loss": 0.6836, + "step": 5065 + }, + { + "epoch": 0.76, + "grad_norm": 2.9557717330740916, + "learning_rate": 1.9240596599970613e-06, + "loss": 0.6875, + "step": 5066 + }, + { + "epoch": 0.76, + "grad_norm": 1.367905457830349, + "learning_rate": 1.9240227274775424e-06, + "loss": 0.679, + "step": 5067 + }, + { + "epoch": 0.76, + "grad_norm": 3.7195188507687895, + "learning_rate": 1.923985786334038e-06, + "loss": 0.6849, + "step": 5068 + }, + { + "epoch": 0.76, + "grad_norm": 2.5439244493506443, + "learning_rate": 1.923948836566892e-06, + "loss": 0.6712, + "step": 5069 + }, + { + "epoch": 0.76, + "grad_norm": 4.84645698442471, + "learning_rate": 1.92391187817645e-06, + "loss": 0.6934, + "step": 5070 + }, + { + "epoch": 0.76, + "grad_norm": 1.9897220314723627, + "learning_rate": 1.9238749111630564e-06, + "loss": 0.6855, + "step": 5071 + }, + { + "epoch": 0.76, + "grad_norm": 1.899576100271968, + "learning_rate": 1.923837935527057e-06, + "loss": 0.6914, + "step": 5072 + }, + { + "epoch": 0.76, + "grad_norm": 0.35769802992667216, + "learning_rate": 1.9238009512687964e-06, + "loss": 0.6934, + "step": 5073 + }, + { + "epoch": 0.76, + "grad_norm": 1.5641032556240462, + "learning_rate": 1.9237639583886196e-06, + "loss": 0.6829, + "step": 5074 + }, + { + "epoch": 0.76, + "grad_norm": 0.9801106135547746, + "learning_rate": 1.9237269568868726e-06, + "loss": 0.6836, + "step": 5075 + }, + { + "epoch": 0.76, + "grad_norm": 2.8544003578366586, + "learning_rate": 1.9236899467638997e-06, + "loss": 0.6875, + "step": 5076 + }, + { + "epoch": 0.76, + "grad_norm": 3.516142267571946, + "learning_rate": 1.923652928020047e-06, + "loss": 0.6908, + "step": 5077 + }, + { + "epoch": 0.76, + "grad_norm": 2.1846956517658245, + "learning_rate": 1.9236159006556596e-06, + "loss": 0.6868, + "step": 5078 + }, + { + "epoch": 0.76, + "grad_norm": 1.0924800446198513, + "learning_rate": 1.9235788646710835e-06, + "loss": 0.6927, + "step": 5079 + }, + { + "epoch": 0.76, + "grad_norm": 0.5082021854981877, + "learning_rate": 1.9235418200666644e-06, + "loss": 0.6829, + "step": 5080 + }, + { + "epoch": 0.76, + "grad_norm": 3.445286597021625, + "learning_rate": 1.9235047668427474e-06, + "loss": 0.651, + "step": 5081 + }, + { + "epoch": 0.76, + "grad_norm": 2.096615740263812, + "learning_rate": 1.923467704999679e-06, + "loss": 0.6849, + "step": 5082 + }, + { + "epoch": 0.76, + "grad_norm": 3.150325518482485, + "learning_rate": 1.923430634537805e-06, + "loss": 0.6784, + "step": 5083 + }, + { + "epoch": 0.76, + "grad_norm": 3.155652584983833, + "learning_rate": 1.9233935554574707e-06, + "loss": 0.6875, + "step": 5084 + }, + { + "epoch": 0.76, + "grad_norm": 2.513793350033659, + "learning_rate": 1.923356467759023e-06, + "loss": 0.6882, + "step": 5085 + }, + { + "epoch": 0.76, + "grad_norm": 1.0487666299749516, + "learning_rate": 1.923319371442808e-06, + "loss": 0.651, + "step": 5086 + }, + { + "epoch": 0.76, + "grad_norm": 4.481452187750333, + "learning_rate": 1.9232822665091715e-06, + "loss": 0.6738, + "step": 5087 + }, + { + "epoch": 0.76, + "grad_norm": 2.8317638351774193, + "learning_rate": 1.9232451529584597e-06, + "loss": 0.6823, + "step": 5088 + }, + { + "epoch": 0.76, + "grad_norm": 3.6147671711419296, + "learning_rate": 1.9232080307910193e-06, + "loss": 0.6921, + "step": 5089 + }, + { + "epoch": 0.76, + "grad_norm": 1.703316874442366, + "learning_rate": 1.9231709000071968e-06, + "loss": 0.6816, + "step": 5090 + }, + { + "epoch": 0.76, + "grad_norm": 0.6638682068353373, + "learning_rate": 1.9231337606073386e-06, + "loss": 0.6992, + "step": 5091 + }, + { + "epoch": 0.76, + "grad_norm": 1.0648726866890255, + "learning_rate": 1.9230966125917917e-06, + "loss": 0.6751, + "step": 5092 + }, + { + "epoch": 0.76, + "grad_norm": 1.0664979226280602, + "learning_rate": 1.923059455960902e-06, + "loss": 0.6888, + "step": 5093 + }, + { + "epoch": 0.76, + "grad_norm": 1.8614393887535705, + "learning_rate": 1.923022290715017e-06, + "loss": 0.6634, + "step": 5094 + }, + { + "epoch": 0.76, + "grad_norm": 3.1719576239263763, + "learning_rate": 1.9229851168544833e-06, + "loss": 0.6823, + "step": 5095 + }, + { + "epoch": 0.76, + "grad_norm": 5.370267530443113, + "learning_rate": 1.922947934379648e-06, + "loss": 0.7044, + "step": 5096 + }, + { + "epoch": 0.76, + "grad_norm": 6.177519122869517, + "learning_rate": 1.922910743290858e-06, + "loss": 0.6803, + "step": 5097 + }, + { + "epoch": 0.76, + "grad_norm": 0.9738941242934729, + "learning_rate": 1.9228735435884606e-06, + "loss": 0.6868, + "step": 5098 + }, + { + "epoch": 0.76, + "grad_norm": 7.599335711753268, + "learning_rate": 1.922836335272802e-06, + "loss": 0.7083, + "step": 5099 + }, + { + "epoch": 0.76, + "grad_norm": 2.0898182652425534, + "learning_rate": 1.9227991183442312e-06, + "loss": 0.7018, + "step": 5100 + }, + { + "epoch": 0.76, + "grad_norm": 0.9550285927866295, + "learning_rate": 1.922761892803094e-06, + "loss": 0.679, + "step": 5101 + }, + { + "epoch": 0.76, + "grad_norm": 0.5512685273157563, + "learning_rate": 1.9227246586497388e-06, + "loss": 0.679, + "step": 5102 + }, + { + "epoch": 0.76, + "grad_norm": 5.176006293319146, + "learning_rate": 1.9226874158845125e-06, + "loss": 0.6895, + "step": 5103 + }, + { + "epoch": 0.76, + "grad_norm": 4.464016722144726, + "learning_rate": 1.922650164507763e-06, + "loss": 0.6927, + "step": 5104 + }, + { + "epoch": 0.76, + "grad_norm": 2.765636745534902, + "learning_rate": 1.922612904519838e-06, + "loss": 0.6738, + "step": 5105 + }, + { + "epoch": 0.76, + "grad_norm": 8.158399617529925, + "learning_rate": 1.922575635921085e-06, + "loss": 0.6621, + "step": 5106 + }, + { + "epoch": 0.76, + "grad_norm": 0.9302735844059835, + "learning_rate": 1.9225383587118522e-06, + "loss": 0.6855, + "step": 5107 + }, + { + "epoch": 0.76, + "grad_norm": 3.076948147620345, + "learning_rate": 1.9225010728924876e-06, + "loss": 0.6699, + "step": 5108 + }, + { + "epoch": 0.76, + "grad_norm": 2.964419031790629, + "learning_rate": 1.9224637784633384e-06, + "loss": 0.7064, + "step": 5109 + }, + { + "epoch": 0.76, + "grad_norm": 2.1877509814991303, + "learning_rate": 1.9224264754247536e-06, + "loss": 0.722, + "step": 5110 + }, + { + "epoch": 0.76, + "grad_norm": 0.8432882536882317, + "learning_rate": 1.9223891637770805e-06, + "loss": 0.666, + "step": 5111 + }, + { + "epoch": 0.76, + "grad_norm": 4.8028406646836785, + "learning_rate": 1.9223518435206677e-06, + "loss": 0.6914, + "step": 5112 + }, + { + "epoch": 0.76, + "grad_norm": 6.754136568555937, + "learning_rate": 1.922314514655864e-06, + "loss": 0.6953, + "step": 5113 + }, + { + "epoch": 0.76, + "grad_norm": 4.129251857541044, + "learning_rate": 1.922277177183017e-06, + "loss": 0.7018, + "step": 5114 + }, + { + "epoch": 0.76, + "grad_norm": 1.698065941617655, + "learning_rate": 1.922239831102476e-06, + "loss": 0.6712, + "step": 5115 + }, + { + "epoch": 0.76, + "grad_norm": 2.4267135470626373, + "learning_rate": 1.9222024764145884e-06, + "loss": 0.6921, + "step": 5116 + }, + { + "epoch": 0.76, + "grad_norm": 2.1564375085516394, + "learning_rate": 1.9221651131197044e-06, + "loss": 0.6966, + "step": 5117 + }, + { + "epoch": 0.76, + "grad_norm": 1.195961346495355, + "learning_rate": 1.9221277412181715e-06, + "loss": 0.6868, + "step": 5118 + }, + { + "epoch": 0.76, + "grad_norm": 7.251588066294339, + "learning_rate": 1.9220903607103383e-06, + "loss": 0.7051, + "step": 5119 + }, + { + "epoch": 0.76, + "grad_norm": 3.400041616204578, + "learning_rate": 1.922052971596555e-06, + "loss": 0.6836, + "step": 5120 + }, + { + "epoch": 0.76, + "grad_norm": 2.7798029255793986, + "learning_rate": 1.922015573877169e-06, + "loss": 0.6986, + "step": 5121 + }, + { + "epoch": 0.76, + "grad_norm": 3.7199763748484393, + "learning_rate": 1.9219781675525307e-06, + "loss": 0.6797, + "step": 5122 + }, + { + "epoch": 0.76, + "grad_norm": 5.415319209793105, + "learning_rate": 1.9219407526229887e-06, + "loss": 0.6777, + "step": 5123 + }, + { + "epoch": 0.76, + "grad_norm": 4.450410273717887, + "learning_rate": 1.921903329088892e-06, + "loss": 0.681, + "step": 5124 + }, + { + "epoch": 0.76, + "grad_norm": 0.42904463676618404, + "learning_rate": 1.9218658969505896e-06, + "loss": 0.6823, + "step": 5125 + }, + { + "epoch": 0.76, + "grad_norm": 2.2359442415492023, + "learning_rate": 1.9218284562084318e-06, + "loss": 0.6797, + "step": 5126 + }, + { + "epoch": 0.76, + "grad_norm": 0.3816113996183385, + "learning_rate": 1.921791006862767e-06, + "loss": 0.6836, + "step": 5127 + }, + { + "epoch": 0.76, + "grad_norm": 0.3974052019865893, + "learning_rate": 1.9217535489139456e-06, + "loss": 0.6784, + "step": 5128 + }, + { + "epoch": 0.76, + "grad_norm": 4.347835601366578, + "learning_rate": 1.9217160823623165e-06, + "loss": 0.6836, + "step": 5129 + }, + { + "epoch": 0.77, + "grad_norm": 0.438141495820985, + "learning_rate": 1.92167860720823e-06, + "loss": 0.6745, + "step": 5130 + }, + { + "epoch": 0.77, + "grad_norm": 0.5378566037000513, + "learning_rate": 1.9216411234520354e-06, + "loss": 0.6764, + "step": 5131 + }, + { + "epoch": 0.77, + "grad_norm": 1.8068297991961422, + "learning_rate": 1.921603631094083e-06, + "loss": 0.6686, + "step": 5132 + }, + { + "epoch": 0.77, + "grad_norm": 3.8653719826809416, + "learning_rate": 1.921566130134722e-06, + "loss": 0.6908, + "step": 5133 + }, + { + "epoch": 0.77, + "grad_norm": 3.5350584471433684, + "learning_rate": 1.921528620574303e-06, + "loss": 0.6947, + "step": 5134 + }, + { + "epoch": 0.77, + "grad_norm": 6.652431647328719, + "learning_rate": 1.9214911024131757e-06, + "loss": 0.6901, + "step": 5135 + }, + { + "epoch": 0.77, + "grad_norm": 3.0279637099977132, + "learning_rate": 1.9214535756516906e-06, + "loss": 0.6927, + "step": 5136 + }, + { + "epoch": 0.77, + "grad_norm": 2.879161441808333, + "learning_rate": 1.9214160402901974e-06, + "loss": 0.6842, + "step": 5137 + }, + { + "epoch": 0.77, + "grad_norm": 0.6630115003699333, + "learning_rate": 1.921378496329047e-06, + "loss": 0.6908, + "step": 5138 + }, + { + "epoch": 0.77, + "grad_norm": 2.577469028790561, + "learning_rate": 1.92134094376859e-06, + "loss": 0.6914, + "step": 5139 + }, + { + "epoch": 0.77, + "grad_norm": 0.7176898854911028, + "learning_rate": 1.9213033826091763e-06, + "loss": 0.6855, + "step": 5140 + }, + { + "epoch": 0.77, + "grad_norm": 2.069033149795668, + "learning_rate": 1.921265812851157e-06, + "loss": 0.6595, + "step": 5141 + }, + { + "epoch": 0.77, + "grad_norm": 0.6987482834839938, + "learning_rate": 1.9212282344948816e-06, + "loss": 0.6849, + "step": 5142 + }, + { + "epoch": 0.77, + "grad_norm": 1.0777941495240506, + "learning_rate": 1.921190647540702e-06, + "loss": 0.6654, + "step": 5143 + }, + { + "epoch": 0.77, + "grad_norm": 0.9767904417191738, + "learning_rate": 1.921153051988969e-06, + "loss": 0.668, + "step": 5144 + }, + { + "epoch": 0.77, + "grad_norm": 7.6471950457733975, + "learning_rate": 1.9211154478400326e-06, + "loss": 0.696, + "step": 5145 + }, + { + "epoch": 0.77, + "grad_norm": 9.368620248420106, + "learning_rate": 1.9210778350942446e-06, + "loss": 0.6868, + "step": 5146 + }, + { + "epoch": 0.77, + "grad_norm": 8.406773833897521, + "learning_rate": 1.9210402137519556e-06, + "loss": 0.6797, + "step": 5147 + }, + { + "epoch": 0.77, + "grad_norm": 0.7296456720431217, + "learning_rate": 1.9210025838135168e-06, + "loss": 0.6647, + "step": 5148 + }, + { + "epoch": 0.77, + "grad_norm": 2.2679879012481376, + "learning_rate": 1.920964945279279e-06, + "loss": 0.6569, + "step": 5149 + }, + { + "epoch": 0.77, + "grad_norm": 3.580213805994913, + "learning_rate": 1.9209272981495947e-06, + "loss": 0.6784, + "step": 5150 + }, + { + "epoch": 0.77, + "grad_norm": 0.6671725614384765, + "learning_rate": 1.920889642424814e-06, + "loss": 0.6934, + "step": 5151 + }, + { + "epoch": 0.77, + "grad_norm": 2.3314603902642217, + "learning_rate": 1.920851978105289e-06, + "loss": 0.668, + "step": 5152 + }, + { + "epoch": 0.77, + "grad_norm": 4.492567497229608, + "learning_rate": 1.9208143051913706e-06, + "loss": 0.7051, + "step": 5153 + }, + { + "epoch": 0.77, + "grad_norm": 1.0690737422091536, + "learning_rate": 1.9207766236834115e-06, + "loss": 0.6719, + "step": 5154 + }, + { + "epoch": 0.77, + "grad_norm": 4.206787292206513, + "learning_rate": 1.9207389335817627e-06, + "loss": 0.6699, + "step": 5155 + }, + { + "epoch": 0.77, + "grad_norm": 4.349409370228818, + "learning_rate": 1.9207012348867754e-06, + "loss": 0.7083, + "step": 5156 + }, + { + "epoch": 0.77, + "grad_norm": 4.087562509256677, + "learning_rate": 1.9206635275988028e-06, + "loss": 0.6849, + "step": 5157 + }, + { + "epoch": 0.77, + "grad_norm": 0.6183786906549782, + "learning_rate": 1.9206258117181954e-06, + "loss": 0.6914, + "step": 5158 + }, + { + "epoch": 0.77, + "grad_norm": 2.2032132540721903, + "learning_rate": 1.920588087245306e-06, + "loss": 0.6842, + "step": 5159 + }, + { + "epoch": 0.77, + "grad_norm": 0.7948104076811605, + "learning_rate": 1.920550354180487e-06, + "loss": 0.6621, + "step": 5160 + }, + { + "epoch": 0.77, + "grad_norm": 9.153245675561223, + "learning_rate": 1.92051261252409e-06, + "loss": 0.7292, + "step": 5161 + }, + { + "epoch": 0.77, + "grad_norm": 7.169394859917476, + "learning_rate": 1.9204748622764675e-06, + "loss": 0.6784, + "step": 5162 + }, + { + "epoch": 0.77, + "grad_norm": 8.407339503085876, + "learning_rate": 1.9204371034379714e-06, + "loss": 0.6966, + "step": 5163 + }, + { + "epoch": 0.77, + "grad_norm": 0.7385966937479674, + "learning_rate": 1.9203993360089544e-06, + "loss": 0.6966, + "step": 5164 + }, + { + "epoch": 0.77, + "grad_norm": 3.5201668655439455, + "learning_rate": 1.9203615599897693e-06, + "loss": 0.6829, + "step": 5165 + }, + { + "epoch": 0.77, + "grad_norm": 7.89701701905455, + "learning_rate": 1.920323775380768e-06, + "loss": 0.6836, + "step": 5166 + }, + { + "epoch": 0.77, + "grad_norm": 3.552017736013891, + "learning_rate": 1.9202859821823038e-06, + "loss": 0.6615, + "step": 5167 + }, + { + "epoch": 0.77, + "grad_norm": 1.0013523298104055, + "learning_rate": 1.920248180394729e-06, + "loss": 0.6784, + "step": 5168 + }, + { + "epoch": 0.77, + "grad_norm": 0.7027260826675716, + "learning_rate": 1.9202103700183967e-06, + "loss": 0.6875, + "step": 5169 + }, + { + "epoch": 0.77, + "grad_norm": 0.5259123602246278, + "learning_rate": 1.9201725510536594e-06, + "loss": 0.6589, + "step": 5170 + }, + { + "epoch": 0.77, + "grad_norm": 5.827988211831475, + "learning_rate": 1.9201347235008704e-06, + "loss": 0.6842, + "step": 5171 + }, + { + "epoch": 0.77, + "grad_norm": 3.71231156332935, + "learning_rate": 1.9200968873603827e-06, + "loss": 0.6842, + "step": 5172 + }, + { + "epoch": 0.77, + "grad_norm": 2.8925409277378984, + "learning_rate": 1.9200590426325493e-06, + "loss": 0.6777, + "step": 5173 + }, + { + "epoch": 0.77, + "grad_norm": 1.5074998039780627, + "learning_rate": 1.9200211893177237e-06, + "loss": 0.6569, + "step": 5174 + }, + { + "epoch": 0.77, + "grad_norm": 2.4832061483873415, + "learning_rate": 1.9199833274162587e-06, + "loss": 0.6706, + "step": 5175 + }, + { + "epoch": 0.77, + "grad_norm": 3.9575151831079998, + "learning_rate": 1.919945456928508e-06, + "loss": 0.6934, + "step": 5176 + }, + { + "epoch": 0.77, + "grad_norm": 1.4786507388511414, + "learning_rate": 1.919907577854825e-06, + "loss": 0.6758, + "step": 5177 + }, + { + "epoch": 0.77, + "grad_norm": 0.5983247167125327, + "learning_rate": 1.9198696901955634e-06, + "loss": 0.6725, + "step": 5178 + }, + { + "epoch": 0.77, + "grad_norm": 3.0549328915758487, + "learning_rate": 1.919831793951077e-06, + "loss": 0.6745, + "step": 5179 + }, + { + "epoch": 0.77, + "grad_norm": 3.3648938759223146, + "learning_rate": 1.919793889121718e-06, + "loss": 0.681, + "step": 5180 + }, + { + "epoch": 0.77, + "grad_norm": 2.403790200164839, + "learning_rate": 1.9197559757078424e-06, + "loss": 0.6934, + "step": 5181 + }, + { + "epoch": 0.77, + "grad_norm": 6.11836677863783, + "learning_rate": 1.9197180537098024e-06, + "loss": 0.7311, + "step": 5182 + }, + { + "epoch": 0.77, + "grad_norm": 4.414646113241014, + "learning_rate": 1.919680123127953e-06, + "loss": 0.7064, + "step": 5183 + }, + { + "epoch": 0.77, + "grad_norm": 2.204501634827672, + "learning_rate": 1.919642183962647e-06, + "loss": 0.6784, + "step": 5184 + }, + { + "epoch": 0.77, + "grad_norm": 1.0108367609058428, + "learning_rate": 1.919604236214239e-06, + "loss": 0.6875, + "step": 5185 + }, + { + "epoch": 0.77, + "grad_norm": 4.415862161554144, + "learning_rate": 1.919566279883084e-06, + "loss": 0.6908, + "step": 5186 + }, + { + "epoch": 0.77, + "grad_norm": 0.6080045967786997, + "learning_rate": 1.9195283149695355e-06, + "loss": 0.6842, + "step": 5187 + }, + { + "epoch": 0.77, + "grad_norm": 2.230580366397807, + "learning_rate": 1.9194903414739476e-06, + "loss": 0.6999, + "step": 5188 + }, + { + "epoch": 0.77, + "grad_norm": 7.01421943827295, + "learning_rate": 1.9194523593966754e-06, + "loss": 0.7135, + "step": 5189 + }, + { + "epoch": 0.77, + "grad_norm": 6.725345443931961, + "learning_rate": 1.9194143687380726e-06, + "loss": 0.709, + "step": 5190 + }, + { + "epoch": 0.77, + "grad_norm": 3.24374121028374, + "learning_rate": 1.919376369498494e-06, + "loss": 0.6973, + "step": 5191 + }, + { + "epoch": 0.77, + "grad_norm": 1.4779383777087123, + "learning_rate": 1.919338361678295e-06, + "loss": 0.6882, + "step": 5192 + }, + { + "epoch": 0.77, + "grad_norm": 4.2791807040524015, + "learning_rate": 1.9193003452778296e-06, + "loss": 0.6908, + "step": 5193 + }, + { + "epoch": 0.77, + "grad_norm": 3.786264890643834, + "learning_rate": 1.9192623202974527e-06, + "loss": 0.6953, + "step": 5194 + }, + { + "epoch": 0.77, + "grad_norm": 7.790419040334855, + "learning_rate": 1.919224286737519e-06, + "loss": 0.7064, + "step": 5195 + }, + { + "epoch": 0.77, + "grad_norm": 2.2549362854425765, + "learning_rate": 1.9191862445983846e-06, + "loss": 0.6777, + "step": 5196 + }, + { + "epoch": 0.78, + "grad_norm": 6.1796302284878175, + "learning_rate": 1.919148193880403e-06, + "loss": 0.6602, + "step": 5197 + }, + { + "epoch": 0.78, + "grad_norm": 1.9709865988004287, + "learning_rate": 1.9191101345839302e-06, + "loss": 0.6777, + "step": 5198 + }, + { + "epoch": 0.78, + "grad_norm": 3.337124593507324, + "learning_rate": 1.919072066709321e-06, + "loss": 0.6816, + "step": 5199 + }, + { + "epoch": 0.78, + "grad_norm": 3.2278142648941435, + "learning_rate": 1.9190339902569314e-06, + "loss": 0.681, + "step": 5200 + }, + { + "epoch": 0.78, + "grad_norm": 1.3178904478544053, + "learning_rate": 1.9189959052271163e-06, + "loss": 0.6842, + "step": 5201 + }, + { + "epoch": 0.78, + "grad_norm": 0.9592556234508612, + "learning_rate": 1.9189578116202307e-06, + "loss": 0.6849, + "step": 5202 + }, + { + "epoch": 0.78, + "grad_norm": 2.1412437432476477, + "learning_rate": 1.9189197094366304e-06, + "loss": 0.6732, + "step": 5203 + }, + { + "epoch": 0.78, + "grad_norm": 4.984087261848875, + "learning_rate": 1.9188815986766717e-06, + "loss": 0.7096, + "step": 5204 + }, + { + "epoch": 0.78, + "grad_norm": 4.251028001144143, + "learning_rate": 1.9188434793407097e-06, + "loss": 0.6908, + "step": 5205 + }, + { + "epoch": 0.78, + "grad_norm": 5.233816076135131, + "learning_rate": 1.9188053514291e-06, + "loss": 0.6875, + "step": 5206 + }, + { + "epoch": 0.78, + "grad_norm": 2.4467441939200607, + "learning_rate": 1.918767214942199e-06, + "loss": 0.679, + "step": 5207 + }, + { + "epoch": 0.78, + "grad_norm": 2.426166902273389, + "learning_rate": 1.9187290698803623e-06, + "loss": 0.6836, + "step": 5208 + }, + { + "epoch": 0.78, + "grad_norm": 1.0403718120229577, + "learning_rate": 1.918690916243946e-06, + "loss": 0.6849, + "step": 5209 + }, + { + "epoch": 0.78, + "grad_norm": 5.656421133283909, + "learning_rate": 1.9186527540333056e-06, + "loss": 0.6803, + "step": 5210 + }, + { + "epoch": 0.78, + "grad_norm": 3.775241068350747, + "learning_rate": 1.9186145832487984e-06, + "loss": 0.6784, + "step": 5211 + }, + { + "epoch": 0.78, + "grad_norm": 0.593782398176366, + "learning_rate": 1.91857640389078e-06, + "loss": 0.6992, + "step": 5212 + }, + { + "epoch": 0.78, + "grad_norm": 1.0268466256823021, + "learning_rate": 1.9185382159596063e-06, + "loss": 0.6816, + "step": 5213 + }, + { + "epoch": 0.78, + "grad_norm": 2.836752197483894, + "learning_rate": 1.918500019455635e-06, + "loss": 0.6934, + "step": 5214 + }, + { + "epoch": 0.78, + "grad_norm": 2.5497102406794387, + "learning_rate": 1.918461814379221e-06, + "loss": 0.7025, + "step": 5215 + }, + { + "epoch": 0.78, + "grad_norm": 1.4426955628474918, + "learning_rate": 1.9184236007307215e-06, + "loss": 0.6615, + "step": 5216 + }, + { + "epoch": 0.78, + "grad_norm": 0.5875161068361667, + "learning_rate": 1.9183853785104943e-06, + "loss": 0.696, + "step": 5217 + }, + { + "epoch": 0.78, + "grad_norm": 8.526577055827364, + "learning_rate": 1.9183471477188946e-06, + "loss": 0.6908, + "step": 5218 + }, + { + "epoch": 0.78, + "grad_norm": 1.7313920883726799, + "learning_rate": 1.9183089083562795e-06, + "loss": 0.6836, + "step": 5219 + }, + { + "epoch": 0.78, + "grad_norm": 3.0074573706843046, + "learning_rate": 1.9182706604230068e-06, + "loss": 0.6641, + "step": 5220 + }, + { + "epoch": 0.78, + "grad_norm": 0.9659421020134467, + "learning_rate": 1.9182324039194323e-06, + "loss": 0.6745, + "step": 5221 + }, + { + "epoch": 0.78, + "grad_norm": 1.1290928762331447, + "learning_rate": 1.9181941388459134e-06, + "loss": 0.6829, + "step": 5222 + }, + { + "epoch": 0.78, + "grad_norm": 4.580297989375622, + "learning_rate": 1.918155865202808e-06, + "loss": 0.7051, + "step": 5223 + }, + { + "epoch": 0.78, + "grad_norm": 2.9765386971620598, + "learning_rate": 1.9181175829904724e-06, + "loss": 0.6823, + "step": 5224 + }, + { + "epoch": 0.78, + "grad_norm": 0.5185311776456006, + "learning_rate": 1.9180792922092643e-06, + "loss": 0.6751, + "step": 5225 + }, + { + "epoch": 0.78, + "grad_norm": 6.022780066980127, + "learning_rate": 1.918040992859541e-06, + "loss": 0.696, + "step": 5226 + }, + { + "epoch": 0.78, + "grad_norm": 2.130163845614596, + "learning_rate": 1.9180026849416598e-06, + "loss": 0.6582, + "step": 5227 + }, + { + "epoch": 0.78, + "grad_norm": 4.995861165441475, + "learning_rate": 1.917964368455978e-06, + "loss": 0.6803, + "step": 5228 + }, + { + "epoch": 0.78, + "grad_norm": 2.3214532064959865, + "learning_rate": 1.917926043402854e-06, + "loss": 0.6758, + "step": 5229 + }, + { + "epoch": 0.78, + "grad_norm": 1.6320866197685475, + "learning_rate": 1.9178877097826454e-06, + "loss": 0.6908, + "step": 5230 + }, + { + "epoch": 0.78, + "grad_norm": 2.033214131841345, + "learning_rate": 1.917849367595709e-06, + "loss": 0.6686, + "step": 5231 + }, + { + "epoch": 0.78, + "grad_norm": 1.2068698611908288, + "learning_rate": 1.9178110168424037e-06, + "loss": 0.6908, + "step": 5232 + }, + { + "epoch": 0.78, + "grad_norm": 5.677288833735646, + "learning_rate": 1.9177726575230868e-06, + "loss": 0.7018, + "step": 5233 + }, + { + "epoch": 0.78, + "grad_norm": 0.7747602144250911, + "learning_rate": 1.9177342896381166e-06, + "loss": 0.6803, + "step": 5234 + }, + { + "epoch": 0.78, + "grad_norm": 2.5969152241165303, + "learning_rate": 1.9176959131878513e-06, + "loss": 0.6849, + "step": 5235 + }, + { + "epoch": 0.78, + "grad_norm": 4.545123520734429, + "learning_rate": 1.9176575281726486e-06, + "loss": 0.6738, + "step": 5236 + }, + { + "epoch": 0.78, + "grad_norm": 1.9536482287193089, + "learning_rate": 1.917619134592867e-06, + "loss": 0.6667, + "step": 5237 + }, + { + "epoch": 0.78, + "grad_norm": 1.9253076182618392, + "learning_rate": 1.9175807324488653e-06, + "loss": 0.6895, + "step": 5238 + }, + { + "epoch": 0.78, + "grad_norm": 1.914853437550412, + "learning_rate": 1.917542321741001e-06, + "loss": 0.6829, + "step": 5239 + }, + { + "epoch": 0.78, + "grad_norm": 3.3136230302829746, + "learning_rate": 1.917503902469633e-06, + "loss": 0.6934, + "step": 5240 + }, + { + "epoch": 0.78, + "grad_norm": 0.7862442091318633, + "learning_rate": 1.91746547463512e-06, + "loss": 0.6699, + "step": 5241 + }, + { + "epoch": 0.78, + "grad_norm": 6.8683164058673105, + "learning_rate": 1.9174270382378207e-06, + "loss": 0.6725, + "step": 5242 + }, + { + "epoch": 0.78, + "grad_norm": 0.6709064552454, + "learning_rate": 1.917388593278094e-06, + "loss": 0.6784, + "step": 5243 + }, + { + "epoch": 0.78, + "grad_norm": 1.9736205452264726, + "learning_rate": 1.917350139756298e-06, + "loss": 0.6784, + "step": 5244 + }, + { + "epoch": 0.78, + "grad_norm": 1.2535789510524122, + "learning_rate": 1.917311677672792e-06, + "loss": 0.6608, + "step": 5245 + }, + { + "epoch": 0.78, + "grad_norm": 1.141781675782395, + "learning_rate": 1.9172732070279357e-06, + "loss": 0.6803, + "step": 5246 + }, + { + "epoch": 0.78, + "grad_norm": 8.701634714227708, + "learning_rate": 1.917234727822087e-06, + "loss": 0.7155, + "step": 5247 + }, + { + "epoch": 0.78, + "grad_norm": 3.889265067961097, + "learning_rate": 1.917196240055605e-06, + "loss": 0.6947, + "step": 5248 + }, + { + "epoch": 0.78, + "grad_norm": 3.70915310801982, + "learning_rate": 1.91715774372885e-06, + "loss": 0.7168, + "step": 5249 + }, + { + "epoch": 0.78, + "grad_norm": 4.3154800181256245, + "learning_rate": 1.9171192388421807e-06, + "loss": 0.6706, + "step": 5250 + }, + { + "epoch": 0.78, + "grad_norm": 2.6917591974403074, + "learning_rate": 1.917080725395956e-06, + "loss": 0.6803, + "step": 5251 + }, + { + "epoch": 0.78, + "grad_norm": 7.908787650576514, + "learning_rate": 1.917042203390536e-06, + "loss": 0.707, + "step": 5252 + }, + { + "epoch": 0.78, + "grad_norm": 2.040648821685389, + "learning_rate": 1.91700367282628e-06, + "loss": 0.6973, + "step": 5253 + }, + { + "epoch": 0.78, + "grad_norm": 4.010462416914631, + "learning_rate": 1.9169651337035473e-06, + "loss": 0.6927, + "step": 5254 + }, + { + "epoch": 0.78, + "grad_norm": 1.2680119221735622, + "learning_rate": 1.9169265860226987e-06, + "loss": 0.6647, + "step": 5255 + }, + { + "epoch": 0.78, + "grad_norm": 2.285494867645112, + "learning_rate": 1.916888029784093e-06, + "loss": 0.6803, + "step": 5256 + }, + { + "epoch": 0.78, + "grad_norm": 4.918505295409657, + "learning_rate": 1.9168494649880896e-06, + "loss": 0.7025, + "step": 5257 + }, + { + "epoch": 0.78, + "grad_norm": 4.658532867539108, + "learning_rate": 1.9168108916350492e-06, + "loss": 0.6921, + "step": 5258 + }, + { + "epoch": 0.78, + "grad_norm": 5.272965671237271, + "learning_rate": 1.916772309725332e-06, + "loss": 0.7044, + "step": 5259 + }, + { + "epoch": 0.78, + "grad_norm": 4.493318732001443, + "learning_rate": 1.9167337192592978e-06, + "loss": 0.6842, + "step": 5260 + }, + { + "epoch": 0.78, + "grad_norm": 0.8061856568224763, + "learning_rate": 1.9166951202373063e-06, + "loss": 0.6849, + "step": 5261 + }, + { + "epoch": 0.78, + "grad_norm": 2.0656746380419118, + "learning_rate": 1.916656512659719e-06, + "loss": 0.6882, + "step": 5262 + }, + { + "epoch": 0.78, + "grad_norm": 6.182325528756099, + "learning_rate": 1.9166178965268944e-06, + "loss": 0.6803, + "step": 5263 + }, + { + "epoch": 0.79, + "grad_norm": 2.970524274152441, + "learning_rate": 1.9165792718391944e-06, + "loss": 0.6862, + "step": 5264 + }, + { + "epoch": 0.79, + "grad_norm": 2.1149962621253136, + "learning_rate": 1.916540638596979e-06, + "loss": 0.681, + "step": 5265 + }, + { + "epoch": 0.79, + "grad_norm": 2.415960623323468, + "learning_rate": 1.916501996800609e-06, + "loss": 0.6908, + "step": 5266 + }, + { + "epoch": 0.79, + "grad_norm": 1.3872293135724258, + "learning_rate": 1.916463346450444e-06, + "loss": 0.6816, + "step": 5267 + }, + { + "epoch": 0.79, + "grad_norm": 1.0939619116808057, + "learning_rate": 1.9164246875468464e-06, + "loss": 0.6771, + "step": 5268 + }, + { + "epoch": 0.79, + "grad_norm": 2.5416882648567247, + "learning_rate": 1.9163860200901756e-06, + "loss": 0.6725, + "step": 5269 + }, + { + "epoch": 0.79, + "grad_norm": 1.666106239728497, + "learning_rate": 1.9163473440807934e-06, + "loss": 0.6751, + "step": 5270 + }, + { + "epoch": 0.79, + "grad_norm": 3.412183348302087, + "learning_rate": 1.9163086595190605e-06, + "loss": 0.6738, + "step": 5271 + }, + { + "epoch": 0.79, + "grad_norm": 6.880662307553055, + "learning_rate": 1.9162699664053373e-06, + "loss": 0.6764, + "step": 5272 + }, + { + "epoch": 0.79, + "grad_norm": 3.4360233820959825, + "learning_rate": 1.916231264739986e-06, + "loss": 0.6855, + "step": 5273 + }, + { + "epoch": 0.79, + "grad_norm": 1.7942903640245291, + "learning_rate": 1.916192554523367e-06, + "loss": 0.6758, + "step": 5274 + }, + { + "epoch": 0.79, + "grad_norm": 0.5170136060518417, + "learning_rate": 1.9161538357558417e-06, + "loss": 0.6751, + "step": 5275 + }, + { + "epoch": 0.79, + "grad_norm": 2.316361163671118, + "learning_rate": 1.916115108437772e-06, + "loss": 0.6628, + "step": 5276 + }, + { + "epoch": 0.79, + "grad_norm": 1.719055550256077, + "learning_rate": 1.916076372569519e-06, + "loss": 0.6647, + "step": 5277 + }, + { + "epoch": 0.79, + "grad_norm": 0.5550965442359631, + "learning_rate": 1.916037628151444e-06, + "loss": 0.6901, + "step": 5278 + }, + { + "epoch": 0.79, + "grad_norm": 7.631050830378552, + "learning_rate": 1.915998875183909e-06, + "loss": 0.6888, + "step": 5279 + }, + { + "epoch": 0.79, + "grad_norm": 0.5204219917111207, + "learning_rate": 1.915960113667275e-06, + "loss": 0.6875, + "step": 5280 + }, + { + "epoch": 0.79, + "grad_norm": 5.197663041055757, + "learning_rate": 1.9159213436019048e-06, + "loss": 0.6608, + "step": 5281 + }, + { + "epoch": 0.79, + "grad_norm": 1.4816972804690998, + "learning_rate": 1.9158825649881592e-06, + "loss": 0.6725, + "step": 5282 + }, + { + "epoch": 0.79, + "grad_norm": 4.859748873202748, + "learning_rate": 1.915843777826401e-06, + "loss": 0.6908, + "step": 5283 + }, + { + "epoch": 0.79, + "grad_norm": 2.3738369645438135, + "learning_rate": 1.9158049821169918e-06, + "loss": 0.696, + "step": 5284 + }, + { + "epoch": 0.79, + "grad_norm": 3.3946684401357903, + "learning_rate": 1.9157661778602936e-06, + "loss": 0.6621, + "step": 5285 + }, + { + "epoch": 0.79, + "grad_norm": 4.658156802998355, + "learning_rate": 1.915727365056669e-06, + "loss": 0.6654, + "step": 5286 + }, + { + "epoch": 0.79, + "grad_norm": 2.273495561876913, + "learning_rate": 1.9156885437064795e-06, + "loss": 0.6882, + "step": 5287 + }, + { + "epoch": 0.79, + "grad_norm": 1.8730964249208273, + "learning_rate": 1.915649713810088e-06, + "loss": 0.6764, + "step": 5288 + }, + { + "epoch": 0.79, + "grad_norm": 0.6957977434864091, + "learning_rate": 1.9156108753678568e-06, + "loss": 0.6686, + "step": 5289 + }, + { + "epoch": 0.79, + "grad_norm": 1.2821212744970933, + "learning_rate": 1.915572028380148e-06, + "loss": 0.6901, + "step": 5290 + }, + { + "epoch": 0.79, + "grad_norm": 4.0016874962768085, + "learning_rate": 1.915533172847325e-06, + "loss": 0.6855, + "step": 5291 + }, + { + "epoch": 0.79, + "grad_norm": 3.4925523149771247, + "learning_rate": 1.9154943087697495e-06, + "loss": 0.6986, + "step": 5292 + }, + { + "epoch": 0.79, + "grad_norm": 5.565772404423167, + "learning_rate": 1.915455436147785e-06, + "loss": 0.6875, + "step": 5293 + }, + { + "epoch": 0.79, + "grad_norm": 4.479693330475718, + "learning_rate": 1.915416554981794e-06, + "loss": 0.666, + "step": 5294 + }, + { + "epoch": 0.79, + "grad_norm": 11.432220701239354, + "learning_rate": 1.915377665272139e-06, + "loss": 0.7227, + "step": 5295 + }, + { + "epoch": 0.79, + "grad_norm": 1.4716210598302022, + "learning_rate": 1.9153387670191834e-06, + "loss": 0.6934, + "step": 5296 + }, + { + "epoch": 0.79, + "grad_norm": 2.7209885257498145, + "learning_rate": 1.9152998602232898e-06, + "loss": 0.6927, + "step": 5297 + }, + { + "epoch": 0.79, + "grad_norm": 3.2204283054769767, + "learning_rate": 1.9152609448848224e-06, + "loss": 0.6908, + "step": 5298 + }, + { + "epoch": 0.79, + "grad_norm": 2.4519340467516675, + "learning_rate": 1.9152220210041433e-06, + "loss": 0.653, + "step": 5299 + }, + { + "epoch": 0.79, + "grad_norm": 5.270508461687908, + "learning_rate": 1.915183088581616e-06, + "loss": 0.679, + "step": 5300 + }, + { + "epoch": 0.79, + "grad_norm": 2.8152637561545615, + "learning_rate": 1.915144147617604e-06, + "loss": 0.6536, + "step": 5301 + }, + { + "epoch": 0.79, + "grad_norm": 3.5082242778265713, + "learning_rate": 1.915105198112471e-06, + "loss": 0.6784, + "step": 5302 + }, + { + "epoch": 0.79, + "grad_norm": 0.5566355306947642, + "learning_rate": 1.91506624006658e-06, + "loss": 0.6836, + "step": 5303 + }, + { + "epoch": 0.79, + "grad_norm": 0.665023721051778, + "learning_rate": 1.9150272734802952e-06, + "loss": 0.6543, + "step": 5304 + }, + { + "epoch": 0.79, + "grad_norm": 2.781433302139859, + "learning_rate": 1.9149882983539797e-06, + "loss": 0.6921, + "step": 5305 + }, + { + "epoch": 0.79, + "grad_norm": 2.102891479409787, + "learning_rate": 1.9149493146879975e-06, + "loss": 0.6921, + "step": 5306 + }, + { + "epoch": 0.79, + "grad_norm": 0.8749213259659145, + "learning_rate": 1.9149103224827123e-06, + "loss": 0.6667, + "step": 5307 + }, + { + "epoch": 0.79, + "grad_norm": 3.077293123415245, + "learning_rate": 1.9148713217384883e-06, + "loss": 0.6673, + "step": 5308 + }, + { + "epoch": 0.79, + "grad_norm": 4.169989496138715, + "learning_rate": 1.9148323124556895e-06, + "loss": 0.6947, + "step": 5309 + }, + { + "epoch": 0.79, + "grad_norm": 5.46093231984859, + "learning_rate": 1.9147932946346796e-06, + "loss": 0.6875, + "step": 5310 + }, + { + "epoch": 0.79, + "grad_norm": 3.638656427932575, + "learning_rate": 1.9147542682758227e-06, + "loss": 0.6484, + "step": 5311 + }, + { + "epoch": 0.79, + "grad_norm": 3.2558826446806655, + "learning_rate": 1.9147152333794836e-06, + "loss": 0.6888, + "step": 5312 + }, + { + "epoch": 0.79, + "grad_norm": 0.7210099547041231, + "learning_rate": 1.9146761899460265e-06, + "loss": 0.679, + "step": 5313 + }, + { + "epoch": 0.79, + "grad_norm": 1.7445369107447437, + "learning_rate": 1.9146371379758156e-06, + "loss": 0.6569, + "step": 5314 + }, + { + "epoch": 0.79, + "grad_norm": 9.283186825976683, + "learning_rate": 1.9145980774692156e-06, + "loss": 0.6758, + "step": 5315 + }, + { + "epoch": 0.79, + "grad_norm": 1.8484434690737648, + "learning_rate": 1.9145590084265906e-06, + "loss": 0.6868, + "step": 5316 + }, + { + "epoch": 0.79, + "grad_norm": 1.127849239617561, + "learning_rate": 1.9145199308483053e-06, + "loss": 0.6816, + "step": 5317 + }, + { + "epoch": 0.79, + "grad_norm": 4.252005523332189, + "learning_rate": 1.914480844734725e-06, + "loss": 0.6973, + "step": 5318 + }, + { + "epoch": 0.79, + "grad_norm": 5.741612831472131, + "learning_rate": 1.9144417500862134e-06, + "loss": 0.707, + "step": 5319 + }, + { + "epoch": 0.79, + "grad_norm": 4.801829654910554, + "learning_rate": 1.914402646903137e-06, + "loss": 0.6712, + "step": 5320 + }, + { + "epoch": 0.79, + "grad_norm": 0.5623531298403651, + "learning_rate": 1.914363535185859e-06, + "loss": 0.6895, + "step": 5321 + }, + { + "epoch": 0.79, + "grad_norm": 0.7383243392260396, + "learning_rate": 1.914324414934746e-06, + "loss": 0.7018, + "step": 5322 + }, + { + "epoch": 0.79, + "grad_norm": 1.3067305133821983, + "learning_rate": 1.9142852861501622e-06, + "loss": 0.6862, + "step": 5323 + }, + { + "epoch": 0.79, + "grad_norm": 0.6765967510045325, + "learning_rate": 1.914246148832473e-06, + "loss": 0.6621, + "step": 5324 + }, + { + "epoch": 0.79, + "grad_norm": 7.106651762843581, + "learning_rate": 1.9142070029820432e-06, + "loss": 0.6979, + "step": 5325 + }, + { + "epoch": 0.79, + "grad_norm": 1.4214601018212831, + "learning_rate": 1.9141678485992386e-06, + "loss": 0.6803, + "step": 5326 + }, + { + "epoch": 0.79, + "grad_norm": 2.2960984626937355, + "learning_rate": 1.914128685684425e-06, + "loss": 0.6855, + "step": 5327 + }, + { + "epoch": 0.79, + "grad_norm": 9.153183350090814, + "learning_rate": 1.9140895142379674e-06, + "loss": 0.6823, + "step": 5328 + }, + { + "epoch": 0.79, + "grad_norm": 2.1486123137013178, + "learning_rate": 1.9140503342602315e-06, + "loss": 0.6816, + "step": 5329 + }, + { + "epoch": 0.79, + "grad_norm": 1.7171794153103246, + "learning_rate": 1.9140111457515832e-06, + "loss": 0.6895, + "step": 5330 + }, + { + "epoch": 0.8, + "grad_norm": 5.079561887053426, + "learning_rate": 1.913971948712388e-06, + "loss": 0.6803, + "step": 5331 + }, + { + "epoch": 0.8, + "grad_norm": 5.0161156232739295, + "learning_rate": 1.9139327431430117e-06, + "loss": 0.6784, + "step": 5332 + }, + { + "epoch": 0.8, + "grad_norm": 2.587731313638269, + "learning_rate": 1.91389352904382e-06, + "loss": 0.6784, + "step": 5333 + }, + { + "epoch": 0.8, + "grad_norm": 2.5342202865902017, + "learning_rate": 1.913854306415179e-06, + "loss": 0.6895, + "step": 5334 + }, + { + "epoch": 0.8, + "grad_norm": 6.797199066881254, + "learning_rate": 1.9138150752574556e-06, + "loss": 0.7051, + "step": 5335 + }, + { + "epoch": 0.8, + "grad_norm": 3.773504582919025, + "learning_rate": 1.9137758355710153e-06, + "loss": 0.6836, + "step": 5336 + }, + { + "epoch": 0.8, + "grad_norm": 6.224965559867011, + "learning_rate": 1.9137365873562238e-06, + "loss": 0.6803, + "step": 5337 + }, + { + "epoch": 0.8, + "grad_norm": 3.6001877528190334, + "learning_rate": 1.913697330613448e-06, + "loss": 0.6797, + "step": 5338 + }, + { + "epoch": 0.8, + "grad_norm": 0.6132890414950464, + "learning_rate": 1.913658065343055e-06, + "loss": 0.6777, + "step": 5339 + }, + { + "epoch": 0.8, + "grad_norm": 1.7805824530101828, + "learning_rate": 1.9136187915454095e-06, + "loss": 0.6901, + "step": 5340 + }, + { + "epoch": 0.8, + "grad_norm": 0.9730190710510055, + "learning_rate": 1.9135795092208794e-06, + "loss": 0.6816, + "step": 5341 + }, + { + "epoch": 0.8, + "grad_norm": 0.6385307753946438, + "learning_rate": 1.9135402183698306e-06, + "loss": 0.6947, + "step": 5342 + }, + { + "epoch": 0.8, + "grad_norm": 4.318265106387141, + "learning_rate": 1.913500918992631e-06, + "loss": 0.7012, + "step": 5343 + }, + { + "epoch": 0.8, + "grad_norm": 3.5337561410901617, + "learning_rate": 1.913461611089646e-06, + "loss": 0.6842, + "step": 5344 + }, + { + "epoch": 0.8, + "grad_norm": 0.8099436391747247, + "learning_rate": 1.913422294661243e-06, + "loss": 0.6745, + "step": 5345 + }, + { + "epoch": 0.8, + "grad_norm": 3.5390630386747435, + "learning_rate": 1.913382969707789e-06, + "loss": 0.6842, + "step": 5346 + }, + { + "epoch": 0.8, + "grad_norm": 4.8294675761157695, + "learning_rate": 1.913343636229651e-06, + "loss": 0.6999, + "step": 5347 + }, + { + "epoch": 0.8, + "grad_norm": 1.247949305267284, + "learning_rate": 1.913304294227196e-06, + "loss": 0.6836, + "step": 5348 + }, + { + "epoch": 0.8, + "grad_norm": 5.699146846387004, + "learning_rate": 1.913264943700791e-06, + "loss": 0.6966, + "step": 5349 + }, + { + "epoch": 0.8, + "grad_norm": 1.6646424836688758, + "learning_rate": 1.913225584650804e-06, + "loss": 0.694, + "step": 5350 + }, + { + "epoch": 0.8, + "grad_norm": 1.51395552537227, + "learning_rate": 1.9131862170776014e-06, + "loss": 0.6732, + "step": 5351 + }, + { + "epoch": 0.8, + "grad_norm": 1.2922720442069453, + "learning_rate": 1.913146840981551e-06, + "loss": 0.6914, + "step": 5352 + }, + { + "epoch": 0.8, + "grad_norm": 0.34595253883062527, + "learning_rate": 1.9131074563630213e-06, + "loss": 0.6797, + "step": 5353 + }, + { + "epoch": 0.8, + "grad_norm": 8.200146886002653, + "learning_rate": 1.913068063222378e-06, + "loss": 0.6855, + "step": 5354 + }, + { + "epoch": 0.8, + "grad_norm": 0.5326391770291503, + "learning_rate": 1.91302866155999e-06, + "loss": 0.6589, + "step": 5355 + }, + { + "epoch": 0.8, + "grad_norm": 0.6851408250296119, + "learning_rate": 1.912989251376225e-06, + "loss": 0.681, + "step": 5356 + }, + { + "epoch": 0.8, + "grad_norm": 1.5101138886230872, + "learning_rate": 1.9129498326714505e-06, + "loss": 0.694, + "step": 5357 + }, + { + "epoch": 0.8, + "grad_norm": 8.736553762294813, + "learning_rate": 1.9129104054460343e-06, + "loss": 0.6797, + "step": 5358 + }, + { + "epoch": 0.8, + "grad_norm": 0.5324244521895196, + "learning_rate": 1.9128709697003446e-06, + "loss": 0.6908, + "step": 5359 + }, + { + "epoch": 0.8, + "grad_norm": 1.3702247524028457, + "learning_rate": 1.9128315254347494e-06, + "loss": 0.6771, + "step": 5360 + }, + { + "epoch": 0.8, + "grad_norm": 3.484332641869686, + "learning_rate": 1.912792072649617e-06, + "loss": 0.6921, + "step": 5361 + }, + { + "epoch": 0.8, + "grad_norm": 2.473137049966319, + "learning_rate": 1.9127526113453153e-06, + "loss": 0.6836, + "step": 5362 + }, + { + "epoch": 0.8, + "grad_norm": 2.92384623372227, + "learning_rate": 1.912713141522213e-06, + "loss": 0.6686, + "step": 5363 + }, + { + "epoch": 0.8, + "grad_norm": 3.4304416872233325, + "learning_rate": 1.912673663180678e-06, + "loss": 0.6992, + "step": 5364 + }, + { + "epoch": 0.8, + "grad_norm": 0.9237269121174538, + "learning_rate": 1.912634176321079e-06, + "loss": 0.6895, + "step": 5365 + }, + { + "epoch": 0.8, + "grad_norm": 2.4156732686293485, + "learning_rate": 1.9125946809437843e-06, + "loss": 0.6914, + "step": 5366 + }, + { + "epoch": 0.8, + "grad_norm": 2.53281265184044, + "learning_rate": 1.912555177049163e-06, + "loss": 0.681, + "step": 5367 + }, + { + "epoch": 0.8, + "grad_norm": 3.3254405582154636, + "learning_rate": 1.9125156646375837e-06, + "loss": 0.6784, + "step": 5368 + }, + { + "epoch": 0.8, + "grad_norm": 1.4487463843697181, + "learning_rate": 1.912476143709415e-06, + "loss": 0.6771, + "step": 5369 + }, + { + "epoch": 0.8, + "grad_norm": 4.54781793833766, + "learning_rate": 1.9124366142650253e-06, + "loss": 0.6667, + "step": 5370 + }, + { + "epoch": 0.8, + "grad_norm": 0.7970463352233956, + "learning_rate": 1.912397076304784e-06, + "loss": 0.6803, + "step": 5371 + }, + { + "epoch": 0.8, + "grad_norm": 5.725442708099446, + "learning_rate": 1.9123575298290607e-06, + "loss": 0.694, + "step": 5372 + }, + { + "epoch": 0.8, + "grad_norm": 3.194644993104653, + "learning_rate": 1.9123179748382235e-06, + "loss": 0.7155, + "step": 5373 + }, + { + "epoch": 0.8, + "grad_norm": 3.194943323083387, + "learning_rate": 1.912278411332642e-06, + "loss": 0.681, + "step": 5374 + }, + { + "epoch": 0.8, + "grad_norm": 0.5296387165706179, + "learning_rate": 1.9122388393126852e-06, + "loss": 0.668, + "step": 5375 + }, + { + "epoch": 0.8, + "grad_norm": 3.07366605031229, + "learning_rate": 1.912199258778723e-06, + "loss": 0.6947, + "step": 5376 + }, + { + "epoch": 0.8, + "grad_norm": 10.14571884465069, + "learning_rate": 1.9121596697311243e-06, + "loss": 0.7005, + "step": 5377 + }, + { + "epoch": 0.8, + "grad_norm": 3.073495102478522, + "learning_rate": 1.912120072170259e-06, + "loss": 0.7005, + "step": 5378 + }, + { + "epoch": 0.8, + "grad_norm": 0.45309732440097916, + "learning_rate": 1.912080466096496e-06, + "loss": 0.6777, + "step": 5379 + }, + { + "epoch": 0.8, + "grad_norm": 2.687754273636608, + "learning_rate": 1.9120408515102053e-06, + "loss": 0.6751, + "step": 5380 + }, + { + "epoch": 0.8, + "grad_norm": 12.259371757352644, + "learning_rate": 1.912001228411757e-06, + "loss": 0.6842, + "step": 5381 + }, + { + "epoch": 0.8, + "grad_norm": 0.45632645494601815, + "learning_rate": 1.9119615968015207e-06, + "loss": 0.6771, + "step": 5382 + }, + { + "epoch": 0.8, + "grad_norm": 4.102308754263552, + "learning_rate": 1.911921956679866e-06, + "loss": 0.6914, + "step": 5383 + }, + { + "epoch": 0.8, + "grad_norm": 1.7484208674912138, + "learning_rate": 1.911882308047163e-06, + "loss": 0.6953, + "step": 5384 + }, + { + "epoch": 0.8, + "grad_norm": 2.25626350528375, + "learning_rate": 1.9118426509037816e-06, + "loss": 0.6829, + "step": 5385 + }, + { + "epoch": 0.8, + "grad_norm": 1.9833362424703695, + "learning_rate": 1.911802985250092e-06, + "loss": 0.6706, + "step": 5386 + }, + { + "epoch": 0.8, + "grad_norm": 4.604006485342938, + "learning_rate": 1.9117633110864648e-06, + "loss": 0.6901, + "step": 5387 + }, + { + "epoch": 0.8, + "grad_norm": 0.7123367964736932, + "learning_rate": 1.91172362841327e-06, + "loss": 0.6849, + "step": 5388 + }, + { + "epoch": 0.8, + "grad_norm": 1.4356696554466584, + "learning_rate": 1.911683937230878e-06, + "loss": 0.6738, + "step": 5389 + }, + { + "epoch": 0.8, + "grad_norm": 4.493886691473135, + "learning_rate": 1.911644237539659e-06, + "loss": 0.6719, + "step": 5390 + }, + { + "epoch": 0.8, + "grad_norm": 1.5655741208924117, + "learning_rate": 1.9116045293399836e-06, + "loss": 0.6979, + "step": 5391 + }, + { + "epoch": 0.8, + "grad_norm": 4.96454369782311, + "learning_rate": 1.9115648126322227e-06, + "loss": 0.6771, + "step": 5392 + }, + { + "epoch": 0.8, + "grad_norm": 0.658096152052235, + "learning_rate": 1.911525087416747e-06, + "loss": 0.6699, + "step": 5393 + }, + { + "epoch": 0.8, + "grad_norm": 5.584702528784411, + "learning_rate": 1.911485353693926e-06, + "loss": 0.6888, + "step": 5394 + }, + { + "epoch": 0.8, + "grad_norm": 2.3950599651343487, + "learning_rate": 1.911445611464133e-06, + "loss": 0.7038, + "step": 5395 + }, + { + "epoch": 0.8, + "grad_norm": 3.603041488486046, + "learning_rate": 1.911405860727737e-06, + "loss": 0.6497, + "step": 5396 + }, + { + "epoch": 0.8, + "grad_norm": 3.469833042841585, + "learning_rate": 1.911366101485109e-06, + "loss": 0.7064, + "step": 5397 + }, + { + "epoch": 0.81, + "grad_norm": 4.04644743112094, + "learning_rate": 1.9113263337366213e-06, + "loss": 0.6777, + "step": 5398 + }, + { + "epoch": 0.81, + "grad_norm": 2.845410376886332, + "learning_rate": 1.911286557482644e-06, + "loss": 0.6934, + "step": 5399 + }, + { + "epoch": 0.81, + "grad_norm": 5.204116562993643, + "learning_rate": 1.9112467727235485e-06, + "loss": 0.6908, + "step": 5400 + }, + { + "epoch": 0.81, + "grad_norm": 2.5993468315424737, + "learning_rate": 1.9112069794597064e-06, + "loss": 0.7174, + "step": 5401 + }, + { + "epoch": 0.81, + "grad_norm": 1.2862311561163153, + "learning_rate": 1.911167177691489e-06, + "loss": 0.679, + "step": 5402 + }, + { + "epoch": 0.81, + "grad_norm": 5.804012966127382, + "learning_rate": 1.9111273674192683e-06, + "loss": 0.6797, + "step": 5403 + }, + { + "epoch": 0.81, + "grad_norm": 3.978158372640676, + "learning_rate": 1.9110875486434142e-06, + "loss": 0.6908, + "step": 5404 + }, + { + "epoch": 0.81, + "grad_norm": 0.7761711891364356, + "learning_rate": 1.9110477213643e-06, + "loss": 0.6628, + "step": 5405 + }, + { + "epoch": 0.81, + "grad_norm": 6.798688805756057, + "learning_rate": 1.911007885582297e-06, + "loss": 0.6921, + "step": 5406 + }, + { + "epoch": 0.81, + "grad_norm": 5.4303536648137625, + "learning_rate": 1.910968041297777e-06, + "loss": 0.6914, + "step": 5407 + }, + { + "epoch": 0.81, + "grad_norm": 6.575375114305759, + "learning_rate": 1.910928188511111e-06, + "loss": 0.6986, + "step": 5408 + }, + { + "epoch": 0.81, + "grad_norm": 2.7154241064974287, + "learning_rate": 1.910888327222672e-06, + "loss": 0.6816, + "step": 5409 + }, + { + "epoch": 0.81, + "grad_norm": 2.5471353877724394, + "learning_rate": 1.9108484574328325e-06, + "loss": 0.6797, + "step": 5410 + }, + { + "epoch": 0.81, + "grad_norm": 4.1427653372782505, + "learning_rate": 1.910808579141963e-06, + "loss": 0.6725, + "step": 5411 + }, + { + "epoch": 0.81, + "grad_norm": 3.587760062097268, + "learning_rate": 1.9107686923504365e-06, + "loss": 0.6693, + "step": 5412 + }, + { + "epoch": 0.81, + "grad_norm": 6.115576815671119, + "learning_rate": 1.910728797058625e-06, + "loss": 0.6732, + "step": 5413 + }, + { + "epoch": 0.81, + "grad_norm": 3.3650511644413115, + "learning_rate": 1.910688893266902e-06, + "loss": 0.7018, + "step": 5414 + }, + { + "epoch": 0.81, + "grad_norm": 1.229592613513115, + "learning_rate": 1.910648980975638e-06, + "loss": 0.6855, + "step": 5415 + }, + { + "epoch": 0.81, + "grad_norm": 2.4685420857483593, + "learning_rate": 1.910609060185207e-06, + "loss": 0.6803, + "step": 5416 + }, + { + "epoch": 0.81, + "grad_norm": 5.36210639265082, + "learning_rate": 1.910569130895981e-06, + "loss": 0.6868, + "step": 5417 + }, + { + "epoch": 0.81, + "grad_norm": 1.0175201314164835, + "learning_rate": 1.9105291931083326e-06, + "loss": 0.6771, + "step": 5418 + }, + { + "epoch": 0.81, + "grad_norm": 3.806481632579675, + "learning_rate": 1.9104892468226353e-06, + "loss": 0.6849, + "step": 5419 + }, + { + "epoch": 0.81, + "grad_norm": 1.270693115808768, + "learning_rate": 1.910449292039261e-06, + "loss": 0.6914, + "step": 5420 + }, + { + "epoch": 0.81, + "grad_norm": 2.264300145001483, + "learning_rate": 1.910409328758583e-06, + "loss": 0.6823, + "step": 5421 + }, + { + "epoch": 0.81, + "grad_norm": 1.3159868320991486, + "learning_rate": 1.910369356980974e-06, + "loss": 0.6797, + "step": 5422 + }, + { + "epoch": 0.81, + "grad_norm": 1.106489621120441, + "learning_rate": 1.9103293767068075e-06, + "loss": 0.6934, + "step": 5423 + }, + { + "epoch": 0.81, + "grad_norm": 4.486634394969297, + "learning_rate": 1.9102893879364562e-06, + "loss": 0.6634, + "step": 5424 + }, + { + "epoch": 0.81, + "grad_norm": 0.9443697107489045, + "learning_rate": 1.9102493906702938e-06, + "loss": 0.694, + "step": 5425 + }, + { + "epoch": 0.81, + "grad_norm": 1.728547745336364, + "learning_rate": 1.9102093849086932e-06, + "loss": 0.6517, + "step": 5426 + }, + { + "epoch": 0.81, + "grad_norm": 2.505511492014726, + "learning_rate": 1.9101693706520278e-06, + "loss": 0.6758, + "step": 5427 + }, + { + "epoch": 0.81, + "grad_norm": 5.697622274783304, + "learning_rate": 1.9101293479006713e-06, + "loss": 0.6699, + "step": 5428 + }, + { + "epoch": 0.81, + "grad_norm": 2.828783352672083, + "learning_rate": 1.9100893166549974e-06, + "loss": 0.6816, + "step": 5429 + }, + { + "epoch": 0.81, + "grad_norm": 2.3223891530755743, + "learning_rate": 1.910049276915379e-06, + "loss": 0.6647, + "step": 5430 + }, + { + "epoch": 0.81, + "grad_norm": 0.6683240222460095, + "learning_rate": 1.9100092286821905e-06, + "loss": 0.6654, + "step": 5431 + }, + { + "epoch": 0.81, + "grad_norm": 4.022447940787219, + "learning_rate": 1.909969171955805e-06, + "loss": 0.6855, + "step": 5432 + }, + { + "epoch": 0.81, + "grad_norm": 4.885855760577506, + "learning_rate": 1.9099291067365973e-06, + "loss": 0.6895, + "step": 5433 + }, + { + "epoch": 0.81, + "grad_norm": 2.140346290803164, + "learning_rate": 1.9098890330249405e-06, + "loss": 0.6895, + "step": 5434 + }, + { + "epoch": 0.81, + "grad_norm": 1.4187799986356722, + "learning_rate": 1.909848950821209e-06, + "loss": 0.6569, + "step": 5435 + }, + { + "epoch": 0.81, + "grad_norm": 4.2591301474942895, + "learning_rate": 1.9098088601257766e-06, + "loss": 0.6829, + "step": 5436 + }, + { + "epoch": 0.81, + "grad_norm": 4.6451428066093845, + "learning_rate": 1.9097687609390174e-06, + "loss": 0.6732, + "step": 5437 + }, + { + "epoch": 0.81, + "grad_norm": 1.5786572909423557, + "learning_rate": 1.9097286532613065e-06, + "loss": 0.6875, + "step": 5438 + }, + { + "epoch": 0.81, + "grad_norm": 1.2131650221200236, + "learning_rate": 1.9096885370930173e-06, + "loss": 0.6914, + "step": 5439 + }, + { + "epoch": 0.81, + "grad_norm": 1.574565004786006, + "learning_rate": 1.9096484124345243e-06, + "loss": 0.6947, + "step": 5440 + }, + { + "epoch": 0.81, + "grad_norm": 2.195612088441409, + "learning_rate": 1.9096082792862027e-06, + "loss": 0.6777, + "step": 5441 + }, + { + "epoch": 0.81, + "grad_norm": 4.483404293811685, + "learning_rate": 1.9095681376484264e-06, + "loss": 0.6862, + "step": 5442 + }, + { + "epoch": 0.81, + "grad_norm": 6.033055000413567, + "learning_rate": 1.9095279875215704e-06, + "loss": 0.6875, + "step": 5443 + }, + { + "epoch": 0.81, + "grad_norm": 3.0127968042269733, + "learning_rate": 1.909487828906009e-06, + "loss": 0.6849, + "step": 5444 + }, + { + "epoch": 0.81, + "grad_norm": 1.6976555580714874, + "learning_rate": 1.909447661802117e-06, + "loss": 0.6751, + "step": 5445 + }, + { + "epoch": 0.81, + "grad_norm": 2.1213807824219213, + "learning_rate": 1.90940748621027e-06, + "loss": 0.6758, + "step": 5446 + }, + { + "epoch": 0.81, + "grad_norm": 3.6034238017014153, + "learning_rate": 1.9093673021308425e-06, + "loss": 0.6751, + "step": 5447 + }, + { + "epoch": 0.81, + "grad_norm": 2.75919847411345, + "learning_rate": 1.9093271095642096e-06, + "loss": 0.6712, + "step": 5448 + }, + { + "epoch": 0.81, + "grad_norm": 2.9659816763235773, + "learning_rate": 1.909286908510746e-06, + "loss": 0.6764, + "step": 5449 + }, + { + "epoch": 0.81, + "grad_norm": 0.5304222834784398, + "learning_rate": 1.9092466989708273e-06, + "loss": 0.6875, + "step": 5450 + }, + { + "epoch": 0.81, + "grad_norm": 0.4893162266451367, + "learning_rate": 1.909206480944829e-06, + "loss": 0.6823, + "step": 5451 + }, + { + "epoch": 0.81, + "grad_norm": 0.5157187869858134, + "learning_rate": 1.909166254433126e-06, + "loss": 0.6745, + "step": 5452 + }, + { + "epoch": 0.81, + "grad_norm": 5.245056060820955, + "learning_rate": 1.909126019436094e-06, + "loss": 0.6797, + "step": 5453 + }, + { + "epoch": 0.81, + "grad_norm": 4.419531017761062, + "learning_rate": 1.9090857759541087e-06, + "loss": 0.668, + "step": 5454 + }, + { + "epoch": 0.81, + "grad_norm": 3.0155505750481986, + "learning_rate": 1.9090455239875455e-06, + "loss": 0.6764, + "step": 5455 + }, + { + "epoch": 0.81, + "grad_norm": 1.1002494346397511, + "learning_rate": 1.9090052635367796e-06, + "loss": 0.6934, + "step": 5456 + }, + { + "epoch": 0.81, + "grad_norm": 0.8368075602841207, + "learning_rate": 1.9089649946021876e-06, + "loss": 0.666, + "step": 5457 + }, + { + "epoch": 0.81, + "grad_norm": 0.4973382378709329, + "learning_rate": 1.9089247171841448e-06, + "loss": 0.6803, + "step": 5458 + }, + { + "epoch": 0.81, + "grad_norm": 0.5197707097464869, + "learning_rate": 1.908884431283027e-06, + "loss": 0.6823, + "step": 5459 + }, + { + "epoch": 0.81, + "grad_norm": 4.210714597457763, + "learning_rate": 1.9088441368992104e-06, + "loss": 0.6986, + "step": 5460 + }, + { + "epoch": 0.81, + "grad_norm": 1.0234567090809092, + "learning_rate": 1.9088038340330715e-06, + "loss": 0.6901, + "step": 5461 + }, + { + "epoch": 0.81, + "grad_norm": 4.530196872891378, + "learning_rate": 1.9087635226849857e-06, + "loss": 0.6953, + "step": 5462 + }, + { + "epoch": 0.81, + "grad_norm": 4.568936466158479, + "learning_rate": 1.9087232028553298e-06, + "loss": 0.7057, + "step": 5463 + }, + { + "epoch": 0.81, + "grad_norm": 1.367526944736571, + "learning_rate": 1.9086828745444795e-06, + "loss": 0.6452, + "step": 5464 + }, + { + "epoch": 0.82, + "grad_norm": 3.6787337315248867, + "learning_rate": 1.9086425377528122e-06, + "loss": 0.6712, + "step": 5465 + }, + { + "epoch": 0.82, + "grad_norm": 10.083053758443006, + "learning_rate": 1.9086021924807036e-06, + "loss": 0.6706, + "step": 5466 + }, + { + "epoch": 0.82, + "grad_norm": 3.127501774504163, + "learning_rate": 1.9085618387285297e-06, + "loss": 0.6764, + "step": 5467 + }, + { + "epoch": 0.82, + "grad_norm": 1.1875166386616898, + "learning_rate": 1.9085214764966688e-06, + "loss": 0.666, + "step": 5468 + }, + { + "epoch": 0.82, + "grad_norm": 2.1779443526238365, + "learning_rate": 1.908481105785496e-06, + "loss": 0.6693, + "step": 5469 + }, + { + "epoch": 0.82, + "grad_norm": 6.201222501312542, + "learning_rate": 1.9084407265953887e-06, + "loss": 0.6986, + "step": 5470 + }, + { + "epoch": 0.82, + "grad_norm": 1.2077790508883879, + "learning_rate": 1.908400338926724e-06, + "loss": 0.6901, + "step": 5471 + }, + { + "epoch": 0.82, + "grad_norm": 2.5832507215831164, + "learning_rate": 1.9083599427798785e-06, + "loss": 0.6836, + "step": 5472 + }, + { + "epoch": 0.82, + "grad_norm": 1.3914376823415122, + "learning_rate": 1.9083195381552293e-06, + "loss": 0.6908, + "step": 5473 + }, + { + "epoch": 0.82, + "grad_norm": 2.899767592830175, + "learning_rate": 1.9082791250531534e-06, + "loss": 0.7018, + "step": 5474 + }, + { + "epoch": 0.82, + "grad_norm": 5.214299218729961, + "learning_rate": 1.908238703474028e-06, + "loss": 0.6875, + "step": 5475 + }, + { + "epoch": 0.82, + "grad_norm": 6.810100046663468, + "learning_rate": 1.9081982734182308e-06, + "loss": 0.6908, + "step": 5476 + }, + { + "epoch": 0.82, + "grad_norm": 2.0585482000393385, + "learning_rate": 1.9081578348861387e-06, + "loss": 0.7168, + "step": 5477 + }, + { + "epoch": 0.82, + "grad_norm": 1.5606487313478286, + "learning_rate": 1.9081173878781293e-06, + "loss": 0.6914, + "step": 5478 + }, + { + "epoch": 0.82, + "grad_norm": 4.392562032238874, + "learning_rate": 1.90807693239458e-06, + "loss": 0.6771, + "step": 5479 + }, + { + "epoch": 0.82, + "grad_norm": 0.6865232715480841, + "learning_rate": 1.9080364684358683e-06, + "loss": 0.7025, + "step": 5480 + }, + { + "epoch": 0.82, + "grad_norm": 1.6170390365593836, + "learning_rate": 1.9079959960023718e-06, + "loss": 0.6751, + "step": 5481 + }, + { + "epoch": 0.82, + "grad_norm": 2.9726649503527964, + "learning_rate": 1.907955515094468e-06, + "loss": 0.6777, + "step": 5482 + }, + { + "epoch": 0.82, + "grad_norm": 2.2448436159621483, + "learning_rate": 1.907915025712536e-06, + "loss": 0.679, + "step": 5483 + }, + { + "epoch": 0.82, + "grad_norm": 2.838720190070574, + "learning_rate": 1.9078745278569526e-06, + "loss": 0.6953, + "step": 5484 + }, + { + "epoch": 0.82, + "grad_norm": 5.292577110669105, + "learning_rate": 1.9078340215280956e-06, + "loss": 0.6745, + "step": 5485 + }, + { + "epoch": 0.82, + "grad_norm": 0.9722802901473545, + "learning_rate": 1.907793506726343e-06, + "loss": 0.7038, + "step": 5486 + }, + { + "epoch": 0.82, + "grad_norm": 7.192141468498595, + "learning_rate": 1.907752983452074e-06, + "loss": 0.7018, + "step": 5487 + }, + { + "epoch": 0.82, + "grad_norm": 0.527450883436419, + "learning_rate": 1.907712451705666e-06, + "loss": 0.6855, + "step": 5488 + }, + { + "epoch": 0.82, + "grad_norm": 2.048926085471254, + "learning_rate": 1.9076719114874974e-06, + "loss": 0.6823, + "step": 5489 + }, + { + "epoch": 0.82, + "grad_norm": 3.1854030253616425, + "learning_rate": 1.9076313627979468e-06, + "loss": 0.6836, + "step": 5490 + }, + { + "epoch": 0.82, + "grad_norm": 2.8601122331909363, + "learning_rate": 1.9075908056373916e-06, + "loss": 0.7051, + "step": 5491 + }, + { + "epoch": 0.82, + "grad_norm": 2.7418189526086105, + "learning_rate": 1.907550240006212e-06, + "loss": 0.6862, + "step": 5492 + }, + { + "epoch": 0.82, + "grad_norm": 4.923063926991367, + "learning_rate": 1.9075096659047855e-06, + "loss": 0.6914, + "step": 5493 + }, + { + "epoch": 0.82, + "grad_norm": 0.9930556726369295, + "learning_rate": 1.907469083333491e-06, + "loss": 0.7064, + "step": 5494 + }, + { + "epoch": 0.82, + "grad_norm": 3.3706675597206615, + "learning_rate": 1.9074284922927072e-06, + "loss": 0.6738, + "step": 5495 + }, + { + "epoch": 0.82, + "grad_norm": 1.4865047555726787, + "learning_rate": 1.907387892782813e-06, + "loss": 0.6836, + "step": 5496 + }, + { + "epoch": 0.82, + "grad_norm": 1.9638886750769846, + "learning_rate": 1.9073472848041874e-06, + "loss": 0.679, + "step": 5497 + }, + { + "epoch": 0.82, + "grad_norm": 0.5084467427604431, + "learning_rate": 1.9073066683572097e-06, + "loss": 0.6882, + "step": 5498 + }, + { + "epoch": 0.82, + "grad_norm": 2.923361927211544, + "learning_rate": 1.907266043442258e-06, + "loss": 0.6875, + "step": 5499 + }, + { + "epoch": 0.82, + "grad_norm": 1.0933380003287518, + "learning_rate": 1.9072254100597124e-06, + "loss": 0.6693, + "step": 5500 + }, + { + "epoch": 0.82, + "grad_norm": 6.153848342356767, + "learning_rate": 1.907184768209952e-06, + "loss": 0.6868, + "step": 5501 + }, + { + "epoch": 0.82, + "grad_norm": 0.8926452488264023, + "learning_rate": 1.9071441178933557e-06, + "loss": 0.6647, + "step": 5502 + }, + { + "epoch": 0.82, + "grad_norm": 1.0496568133996464, + "learning_rate": 1.9071034591103032e-06, + "loss": 0.6836, + "step": 5503 + }, + { + "epoch": 0.82, + "grad_norm": 0.7173697014190534, + "learning_rate": 1.9070627918611739e-06, + "loss": 0.6836, + "step": 5504 + }, + { + "epoch": 0.82, + "grad_norm": 0.7879396707748789, + "learning_rate": 1.9070221161463473e-06, + "loss": 0.6634, + "step": 5505 + }, + { + "epoch": 0.82, + "grad_norm": 7.672777599451549, + "learning_rate": 1.9069814319662033e-06, + "loss": 0.6986, + "step": 5506 + }, + { + "epoch": 0.82, + "grad_norm": 2.980769777113529, + "learning_rate": 1.906940739321121e-06, + "loss": 0.6797, + "step": 5507 + }, + { + "epoch": 0.82, + "grad_norm": 1.1697154073411653, + "learning_rate": 1.9069000382114809e-06, + "loss": 0.6732, + "step": 5508 + }, + { + "epoch": 0.82, + "grad_norm": 4.38316977616134, + "learning_rate": 1.9068593286376622e-06, + "loss": 0.7057, + "step": 5509 + }, + { + "epoch": 0.82, + "grad_norm": 3.1103146584092167, + "learning_rate": 1.9068186106000454e-06, + "loss": 0.6829, + "step": 5510 + }, + { + "epoch": 0.82, + "grad_norm": 4.310400243416692, + "learning_rate": 1.9067778840990102e-06, + "loss": 0.6816, + "step": 5511 + }, + { + "epoch": 0.82, + "grad_norm": 5.392394439902363, + "learning_rate": 1.906737149134937e-06, + "loss": 0.6855, + "step": 5512 + }, + { + "epoch": 0.82, + "grad_norm": 1.4131561576491556, + "learning_rate": 1.9066964057082058e-06, + "loss": 0.6895, + "step": 5513 + }, + { + "epoch": 0.82, + "grad_norm": 0.8291043737926767, + "learning_rate": 1.9066556538191968e-06, + "loss": 0.6751, + "step": 5514 + }, + { + "epoch": 0.82, + "grad_norm": 3.5228480341394834, + "learning_rate": 1.9066148934682904e-06, + "loss": 0.6908, + "step": 5515 + }, + { + "epoch": 0.82, + "grad_norm": 1.0042488976515052, + "learning_rate": 1.906574124655867e-06, + "loss": 0.6745, + "step": 5516 + }, + { + "epoch": 0.82, + "grad_norm": 7.466600628882615, + "learning_rate": 1.9065333473823073e-06, + "loss": 0.694, + "step": 5517 + }, + { + "epoch": 0.82, + "grad_norm": 2.235110594055505, + "learning_rate": 1.9064925616479913e-06, + "loss": 0.6816, + "step": 5518 + }, + { + "epoch": 0.82, + "grad_norm": 0.5911316528705001, + "learning_rate": 1.9064517674533004e-06, + "loss": 0.6523, + "step": 5519 + }, + { + "epoch": 0.82, + "grad_norm": 4.656923950951761, + "learning_rate": 1.9064109647986149e-06, + "loss": 0.6641, + "step": 5520 + }, + { + "epoch": 0.82, + "grad_norm": 3.9541338671619948, + "learning_rate": 1.9063701536843157e-06, + "loss": 0.6862, + "step": 5521 + }, + { + "epoch": 0.82, + "grad_norm": 2.1524806527486637, + "learning_rate": 1.9063293341107838e-06, + "loss": 0.6589, + "step": 5522 + }, + { + "epoch": 0.82, + "grad_norm": 6.075029383816859, + "learning_rate": 1.9062885060783998e-06, + "loss": 0.694, + "step": 5523 + }, + { + "epoch": 0.82, + "grad_norm": 1.4309766803121466, + "learning_rate": 1.9062476695875451e-06, + "loss": 0.6934, + "step": 5524 + }, + { + "epoch": 0.82, + "grad_norm": 4.266341590520029, + "learning_rate": 1.9062068246386008e-06, + "loss": 0.6667, + "step": 5525 + }, + { + "epoch": 0.82, + "grad_norm": 2.349668617910969, + "learning_rate": 1.9061659712319483e-06, + "loss": 0.6699, + "step": 5526 + }, + { + "epoch": 0.82, + "grad_norm": 2.8623251883377696, + "learning_rate": 1.9061251093679682e-06, + "loss": 0.6836, + "step": 5527 + }, + { + "epoch": 0.82, + "grad_norm": 1.9611527222895082, + "learning_rate": 1.9060842390470429e-06, + "loss": 0.6921, + "step": 5528 + }, + { + "epoch": 0.82, + "grad_norm": 0.8719238644133777, + "learning_rate": 1.9060433602695528e-06, + "loss": 0.6979, + "step": 5529 + }, + { + "epoch": 0.82, + "grad_norm": 4.346726851977964, + "learning_rate": 1.9060024730358801e-06, + "loss": 0.6699, + "step": 5530 + }, + { + "epoch": 0.82, + "grad_norm": 8.461802456751439, + "learning_rate": 1.905961577346406e-06, + "loss": 0.6849, + "step": 5531 + }, + { + "epoch": 0.83, + "grad_norm": 3.1045062609887557, + "learning_rate": 1.9059206732015125e-06, + "loss": 0.6934, + "step": 5532 + }, + { + "epoch": 0.83, + "grad_norm": 3.8535395331046938, + "learning_rate": 1.9058797606015813e-06, + "loss": 0.6973, + "step": 5533 + }, + { + "epoch": 0.83, + "grad_norm": 3.509742372142129, + "learning_rate": 1.9058388395469942e-06, + "loss": 0.6921, + "step": 5534 + }, + { + "epoch": 0.83, + "grad_norm": 0.6754059615603349, + "learning_rate": 1.9057979100381329e-06, + "loss": 0.6738, + "step": 5535 + }, + { + "epoch": 0.83, + "grad_norm": 1.2091607398730064, + "learning_rate": 1.9057569720753798e-06, + "loss": 0.6777, + "step": 5536 + }, + { + "epoch": 0.83, + "grad_norm": 1.6992908401957099, + "learning_rate": 1.9057160256591168e-06, + "loss": 0.6784, + "step": 5537 + }, + { + "epoch": 0.83, + "grad_norm": 7.501900314119416, + "learning_rate": 1.905675070789726e-06, + "loss": 0.7122, + "step": 5538 + }, + { + "epoch": 0.83, + "grad_norm": 6.716357896633141, + "learning_rate": 1.9056341074675895e-06, + "loss": 0.7051, + "step": 5539 + }, + { + "epoch": 0.83, + "grad_norm": 1.3457222333882157, + "learning_rate": 1.9055931356930898e-06, + "loss": 0.679, + "step": 5540 + }, + { + "epoch": 0.83, + "grad_norm": 1.4058457797311832, + "learning_rate": 1.9055521554666095e-06, + "loss": 0.7083, + "step": 5541 + }, + { + "epoch": 0.83, + "grad_norm": 1.8939512833677639, + "learning_rate": 1.9055111667885306e-06, + "loss": 0.6875, + "step": 5542 + }, + { + "epoch": 0.83, + "grad_norm": 3.6144577402911784, + "learning_rate": 1.9054701696592361e-06, + "loss": 0.6927, + "step": 5543 + }, + { + "epoch": 0.83, + "grad_norm": 2.432947596930979, + "learning_rate": 1.9054291640791086e-06, + "loss": 0.6816, + "step": 5544 + }, + { + "epoch": 0.83, + "grad_norm": 2.516394563642583, + "learning_rate": 1.9053881500485303e-06, + "loss": 0.6836, + "step": 5545 + }, + { + "epoch": 0.83, + "grad_norm": 1.3286123520151523, + "learning_rate": 1.9053471275678845e-06, + "loss": 0.7005, + "step": 5546 + }, + { + "epoch": 0.83, + "grad_norm": 4.433038387176593, + "learning_rate": 1.9053060966375537e-06, + "loss": 0.6816, + "step": 5547 + }, + { + "epoch": 0.83, + "grad_norm": 2.3560453839975755, + "learning_rate": 1.9052650572579213e-06, + "loss": 0.6602, + "step": 5548 + }, + { + "epoch": 0.83, + "grad_norm": 0.4569986654387582, + "learning_rate": 1.9052240094293696e-06, + "loss": 0.6979, + "step": 5549 + }, + { + "epoch": 0.83, + "grad_norm": 4.488576314997142, + "learning_rate": 1.9051829531522826e-06, + "loss": 0.6693, + "step": 5550 + }, + { + "epoch": 0.83, + "grad_norm": 0.9611763885294521, + "learning_rate": 1.905141888427043e-06, + "loss": 0.6823, + "step": 5551 + }, + { + "epoch": 0.83, + "grad_norm": 0.9498372537585407, + "learning_rate": 1.905100815254034e-06, + "loss": 0.6816, + "step": 5552 + }, + { + "epoch": 0.83, + "grad_norm": 3.008252723955906, + "learning_rate": 1.9050597336336392e-06, + "loss": 0.6829, + "step": 5553 + }, + { + "epoch": 0.83, + "grad_norm": 0.9034304034041019, + "learning_rate": 1.9050186435662417e-06, + "loss": 0.6777, + "step": 5554 + }, + { + "epoch": 0.83, + "grad_norm": 0.9399570766670484, + "learning_rate": 1.9049775450522252e-06, + "loss": 0.6771, + "step": 5555 + }, + { + "epoch": 0.83, + "grad_norm": 0.37099021733811455, + "learning_rate": 1.9049364380919734e-06, + "loss": 0.6849, + "step": 5556 + }, + { + "epoch": 0.83, + "grad_norm": 0.40486151236802925, + "learning_rate": 1.9048953226858696e-06, + "loss": 0.6751, + "step": 5557 + }, + { + "epoch": 0.83, + "grad_norm": 6.239667488169959, + "learning_rate": 1.9048541988342976e-06, + "loss": 0.7005, + "step": 5558 + }, + { + "epoch": 0.83, + "grad_norm": 6.340978068999794, + "learning_rate": 1.9048130665376418e-06, + "loss": 0.6992, + "step": 5559 + }, + { + "epoch": 0.83, + "grad_norm": 5.332801192347401, + "learning_rate": 1.9047719257962854e-06, + "loss": 0.6901, + "step": 5560 + }, + { + "epoch": 0.83, + "grad_norm": 0.5013617200877036, + "learning_rate": 1.9047307766106125e-06, + "loss": 0.6712, + "step": 5561 + }, + { + "epoch": 0.83, + "grad_norm": 1.6822386088326282, + "learning_rate": 1.9046896189810073e-06, + "loss": 0.6895, + "step": 5562 + }, + { + "epoch": 0.83, + "grad_norm": 3.6510129512975134, + "learning_rate": 1.9046484529078539e-06, + "loss": 0.6745, + "step": 5563 + }, + { + "epoch": 0.83, + "grad_norm": 0.430354723579045, + "learning_rate": 1.9046072783915366e-06, + "loss": 0.679, + "step": 5564 + }, + { + "epoch": 0.83, + "grad_norm": 4.942884948437109, + "learning_rate": 1.9045660954324394e-06, + "loss": 0.7122, + "step": 5565 + }, + { + "epoch": 0.83, + "grad_norm": 0.4561347428470864, + "learning_rate": 1.9045249040309468e-06, + "loss": 0.668, + "step": 5566 + }, + { + "epoch": 0.83, + "grad_norm": 1.8533093115338986, + "learning_rate": 1.9044837041874432e-06, + "loss": 0.6803, + "step": 5567 + }, + { + "epoch": 0.83, + "grad_norm": 3.9691231415096038, + "learning_rate": 1.9044424959023137e-06, + "loss": 0.6986, + "step": 5568 + }, + { + "epoch": 0.83, + "grad_norm": 5.244948029067535, + "learning_rate": 1.9044012791759419e-06, + "loss": 0.6882, + "step": 5569 + }, + { + "epoch": 0.83, + "grad_norm": 3.452168806051766, + "learning_rate": 1.9043600540087133e-06, + "loss": 0.6823, + "step": 5570 + }, + { + "epoch": 0.83, + "grad_norm": 5.48430303310187, + "learning_rate": 1.904318820401012e-06, + "loss": 0.7012, + "step": 5571 + }, + { + "epoch": 0.83, + "grad_norm": 4.521539043906775, + "learning_rate": 1.9042775783532234e-06, + "loss": 0.6777, + "step": 5572 + }, + { + "epoch": 0.83, + "grad_norm": 2.563345467283827, + "learning_rate": 1.9042363278657323e-06, + "loss": 0.6992, + "step": 5573 + }, + { + "epoch": 0.83, + "grad_norm": 3.5468957896889424, + "learning_rate": 1.9041950689389235e-06, + "loss": 0.6921, + "step": 5574 + }, + { + "epoch": 0.83, + "grad_norm": 4.269298626370951, + "learning_rate": 1.904153801573182e-06, + "loss": 0.6654, + "step": 5575 + }, + { + "epoch": 0.83, + "grad_norm": 0.8760341906137397, + "learning_rate": 1.9041125257688932e-06, + "loss": 0.6992, + "step": 5576 + }, + { + "epoch": 0.83, + "grad_norm": 1.8323690228524838, + "learning_rate": 1.9040712415264423e-06, + "loss": 0.6751, + "step": 5577 + }, + { + "epoch": 0.83, + "grad_norm": 1.403663640539477, + "learning_rate": 1.9040299488462144e-06, + "loss": 0.6868, + "step": 5578 + }, + { + "epoch": 0.83, + "grad_norm": 1.300935966256576, + "learning_rate": 1.9039886477285952e-06, + "loss": 0.6758, + "step": 5579 + }, + { + "epoch": 0.83, + "grad_norm": 2.3230239139010567, + "learning_rate": 1.90394733817397e-06, + "loss": 0.6973, + "step": 5580 + }, + { + "epoch": 0.83, + "grad_norm": 8.580814893298534, + "learning_rate": 1.9039060201827244e-06, + "loss": 0.7077, + "step": 5581 + }, + { + "epoch": 0.83, + "grad_norm": 4.842883049786627, + "learning_rate": 1.9038646937552438e-06, + "loss": 0.696, + "step": 5582 + }, + { + "epoch": 0.83, + "grad_norm": 4.191514461587872, + "learning_rate": 1.9038233588919142e-06, + "loss": 0.6914, + "step": 5583 + }, + { + "epoch": 0.83, + "grad_norm": 6.231342903077598, + "learning_rate": 1.903782015593121e-06, + "loss": 0.694, + "step": 5584 + }, + { + "epoch": 0.83, + "grad_norm": 2.380486649447479, + "learning_rate": 1.9037406638592507e-06, + "loss": 0.6953, + "step": 5585 + }, + { + "epoch": 0.83, + "grad_norm": 1.150370148568734, + "learning_rate": 1.9036993036906884e-06, + "loss": 0.6823, + "step": 5586 + }, + { + "epoch": 0.83, + "grad_norm": 1.5452880380155316, + "learning_rate": 1.9036579350878213e-06, + "loss": 0.6947, + "step": 5587 + }, + { + "epoch": 0.83, + "grad_norm": 0.8786276789392509, + "learning_rate": 1.9036165580510342e-06, + "loss": 0.6882, + "step": 5588 + }, + { + "epoch": 0.83, + "grad_norm": 1.8833083630213898, + "learning_rate": 1.9035751725807142e-06, + "loss": 0.6777, + "step": 5589 + }, + { + "epoch": 0.83, + "grad_norm": 3.206979991748056, + "learning_rate": 1.9035337786772469e-06, + "loss": 0.6823, + "step": 5590 + }, + { + "epoch": 0.83, + "grad_norm": 3.878651980265916, + "learning_rate": 1.903492376341019e-06, + "loss": 0.6908, + "step": 5591 + }, + { + "epoch": 0.83, + "grad_norm": 2.9663714503514953, + "learning_rate": 1.903450965572417e-06, + "loss": 0.6764, + "step": 5592 + }, + { + "epoch": 0.83, + "grad_norm": 0.48835185226363453, + "learning_rate": 1.903409546371827e-06, + "loss": 0.6875, + "step": 5593 + }, + { + "epoch": 0.83, + "grad_norm": 0.602337492842773, + "learning_rate": 1.9033681187396362e-06, + "loss": 0.6556, + "step": 5594 + }, + { + "epoch": 0.83, + "grad_norm": 4.110398669312392, + "learning_rate": 1.9033266826762305e-06, + "loss": 0.6868, + "step": 5595 + }, + { + "epoch": 0.83, + "grad_norm": 5.562402089130054, + "learning_rate": 1.903285238181997e-06, + "loss": 0.6725, + "step": 5596 + }, + { + "epoch": 0.83, + "grad_norm": 6.931575463062654, + "learning_rate": 1.9032437852573226e-06, + "loss": 0.696, + "step": 5597 + }, + { + "epoch": 0.83, + "grad_norm": 2.9534272532505077, + "learning_rate": 1.9032023239025942e-06, + "loss": 0.6829, + "step": 5598 + }, + { + "epoch": 0.84, + "grad_norm": 0.4356155227780331, + "learning_rate": 1.9031608541181985e-06, + "loss": 0.6699, + "step": 5599 + }, + { + "epoch": 0.84, + "grad_norm": 1.7873276854621354, + "learning_rate": 1.9031193759045227e-06, + "loss": 0.694, + "step": 5600 + }, + { + "epoch": 0.84, + "grad_norm": 0.8829814252582762, + "learning_rate": 1.9030778892619538e-06, + "loss": 0.6751, + "step": 5601 + }, + { + "epoch": 0.84, + "grad_norm": 4.798677522154087, + "learning_rate": 1.9030363941908792e-06, + "loss": 0.6647, + "step": 5602 + }, + { + "epoch": 0.84, + "grad_norm": 0.5138221677469433, + "learning_rate": 1.9029948906916862e-06, + "loss": 0.6758, + "step": 5603 + }, + { + "epoch": 0.84, + "grad_norm": 3.466451203212498, + "learning_rate": 1.9029533787647618e-06, + "loss": 0.6999, + "step": 5604 + }, + { + "epoch": 0.84, + "grad_norm": 1.212157115313361, + "learning_rate": 1.9029118584104938e-06, + "loss": 0.6706, + "step": 5605 + }, + { + "epoch": 0.84, + "grad_norm": 1.2818718778166838, + "learning_rate": 1.9028703296292691e-06, + "loss": 0.6855, + "step": 5606 + }, + { + "epoch": 0.84, + "grad_norm": 6.667039711939353, + "learning_rate": 1.9028287924214762e-06, + "loss": 0.7057, + "step": 5607 + }, + { + "epoch": 0.84, + "grad_norm": 0.8327088386838178, + "learning_rate": 1.9027872467875025e-06, + "loss": 0.681, + "step": 5608 + }, + { + "epoch": 0.84, + "grad_norm": 2.1723759375396887, + "learning_rate": 1.9027456927277355e-06, + "loss": 0.6654, + "step": 5609 + }, + { + "epoch": 0.84, + "grad_norm": 0.9202725400245598, + "learning_rate": 1.9027041302425629e-06, + "loss": 0.6855, + "step": 5610 + }, + { + "epoch": 0.84, + "grad_norm": 1.2735595024550885, + "learning_rate": 1.902662559332373e-06, + "loss": 0.6947, + "step": 5611 + }, + { + "epoch": 0.84, + "grad_norm": 0.8415535711084681, + "learning_rate": 1.9026209799975534e-06, + "loss": 0.6719, + "step": 5612 + }, + { + "epoch": 0.84, + "grad_norm": 4.411468942860319, + "learning_rate": 1.9025793922384927e-06, + "loss": 0.6654, + "step": 5613 + }, + { + "epoch": 0.84, + "grad_norm": 3.195490451419684, + "learning_rate": 1.9025377960555784e-06, + "loss": 0.6855, + "step": 5614 + }, + { + "epoch": 0.84, + "grad_norm": 0.7242749983763039, + "learning_rate": 1.9024961914491992e-06, + "loss": 0.6823, + "step": 5615 + }, + { + "epoch": 0.84, + "grad_norm": 0.9947161117837513, + "learning_rate": 1.9024545784197431e-06, + "loss": 0.6797, + "step": 5616 + }, + { + "epoch": 0.84, + "grad_norm": 1.7771540212277601, + "learning_rate": 1.9024129569675987e-06, + "loss": 0.6888, + "step": 5617 + }, + { + "epoch": 0.84, + "grad_norm": 3.3340339921474116, + "learning_rate": 1.9023713270931548e-06, + "loss": 0.6888, + "step": 5618 + }, + { + "epoch": 0.84, + "grad_norm": 1.8666583528263951, + "learning_rate": 1.9023296887967987e-06, + "loss": 0.6868, + "step": 5619 + }, + { + "epoch": 0.84, + "grad_norm": 2.3747849869845643, + "learning_rate": 1.9022880420789202e-06, + "loss": 0.6999, + "step": 5620 + }, + { + "epoch": 0.84, + "grad_norm": 3.9209564921938944, + "learning_rate": 1.9022463869399077e-06, + "loss": 0.6855, + "step": 5621 + }, + { + "epoch": 0.84, + "grad_norm": 1.3106768035126835, + "learning_rate": 1.9022047233801497e-06, + "loss": 0.6667, + "step": 5622 + }, + { + "epoch": 0.84, + "grad_norm": 3.9586487126789836, + "learning_rate": 1.9021630514000355e-06, + "loss": 0.6816, + "step": 5623 + }, + { + "epoch": 0.84, + "grad_norm": 6.16192282235105, + "learning_rate": 1.9021213709999535e-06, + "loss": 0.681, + "step": 5624 + }, + { + "epoch": 0.84, + "grad_norm": 0.7369812568023882, + "learning_rate": 1.902079682180293e-06, + "loss": 0.6712, + "step": 5625 + }, + { + "epoch": 0.84, + "grad_norm": 3.0804726017346984, + "learning_rate": 1.902037984941443e-06, + "loss": 0.6777, + "step": 5626 + }, + { + "epoch": 0.84, + "grad_norm": 1.467095060668563, + "learning_rate": 1.901996279283793e-06, + "loss": 0.6849, + "step": 5627 + }, + { + "epoch": 0.84, + "grad_norm": 2.0884931465249004, + "learning_rate": 1.9019545652077317e-06, + "loss": 0.7057, + "step": 5628 + }, + { + "epoch": 0.84, + "grad_norm": 0.8913512300780436, + "learning_rate": 1.901912842713649e-06, + "loss": 0.6934, + "step": 5629 + }, + { + "epoch": 0.84, + "grad_norm": 0.5115748136588367, + "learning_rate": 1.9018711118019332e-06, + "loss": 0.6849, + "step": 5630 + }, + { + "epoch": 0.84, + "grad_norm": 3.8678658689964105, + "learning_rate": 1.9018293724729753e-06, + "loss": 0.6738, + "step": 5631 + }, + { + "epoch": 0.84, + "grad_norm": 0.466789935749207, + "learning_rate": 1.9017876247271638e-06, + "loss": 0.6823, + "step": 5632 + }, + { + "epoch": 0.84, + "grad_norm": 2.555571829713764, + "learning_rate": 1.901745868564889e-06, + "loss": 0.696, + "step": 5633 + }, + { + "epoch": 0.84, + "grad_norm": 2.3945097179950103, + "learning_rate": 1.9017041039865398e-06, + "loss": 0.6764, + "step": 5634 + }, + { + "epoch": 0.84, + "grad_norm": 3.498458678144041, + "learning_rate": 1.9016623309925067e-06, + "loss": 0.6888, + "step": 5635 + }, + { + "epoch": 0.84, + "grad_norm": 0.7875264296671552, + "learning_rate": 1.9016205495831793e-06, + "loss": 0.6797, + "step": 5636 + }, + { + "epoch": 0.84, + "grad_norm": 0.4539165680269443, + "learning_rate": 1.9015787597589475e-06, + "loss": 0.6602, + "step": 5637 + }, + { + "epoch": 0.84, + "grad_norm": 0.4877404641015688, + "learning_rate": 1.9015369615202013e-06, + "loss": 0.6758, + "step": 5638 + }, + { + "epoch": 0.84, + "grad_norm": 2.447523559393441, + "learning_rate": 1.901495154867331e-06, + "loss": 0.6947, + "step": 5639 + }, + { + "epoch": 0.84, + "grad_norm": 5.24603456146483, + "learning_rate": 1.9014533398007269e-06, + "loss": 0.7005, + "step": 5640 + }, + { + "epoch": 0.84, + "grad_norm": 0.9774354686953249, + "learning_rate": 1.901411516320779e-06, + "loss": 0.6829, + "step": 5641 + }, + { + "epoch": 0.84, + "grad_norm": 2.8100876478671406, + "learning_rate": 1.9013696844278775e-06, + "loss": 0.681, + "step": 5642 + }, + { + "epoch": 0.84, + "grad_norm": 5.6299978435474936, + "learning_rate": 1.901327844122413e-06, + "loss": 0.6836, + "step": 5643 + }, + { + "epoch": 0.84, + "grad_norm": 0.7054331390989067, + "learning_rate": 1.9012859954047763e-06, + "loss": 0.6862, + "step": 5644 + }, + { + "epoch": 0.84, + "grad_norm": 0.47364210208110985, + "learning_rate": 1.9012441382753576e-06, + "loss": 0.6745, + "step": 5645 + }, + { + "epoch": 0.84, + "grad_norm": 3.9266122350726373, + "learning_rate": 1.901202272734548e-06, + "loss": 0.6914, + "step": 5646 + }, + { + "epoch": 0.84, + "grad_norm": 4.606600163849815, + "learning_rate": 1.9011603987827372e-06, + "loss": 0.6641, + "step": 5647 + }, + { + "epoch": 0.84, + "grad_norm": 3.7917842462473805, + "learning_rate": 1.901118516420317e-06, + "loss": 0.679, + "step": 5648 + }, + { + "epoch": 0.84, + "grad_norm": 4.546589532309159, + "learning_rate": 1.9010766256476777e-06, + "loss": 0.6823, + "step": 5649 + }, + { + "epoch": 0.84, + "grad_norm": 1.7628491356510734, + "learning_rate": 1.901034726465211e-06, + "loss": 0.6732, + "step": 5650 + }, + { + "epoch": 0.84, + "grad_norm": 3.4612057823070486, + "learning_rate": 1.9009928188733076e-06, + "loss": 0.6823, + "step": 5651 + }, + { + "epoch": 0.84, + "grad_norm": 0.90974246344727, + "learning_rate": 1.9009509028723583e-06, + "loss": 0.6797, + "step": 5652 + }, + { + "epoch": 0.84, + "grad_norm": 1.3321216712494932, + "learning_rate": 1.9009089784627545e-06, + "loss": 0.6562, + "step": 5653 + }, + { + "epoch": 0.84, + "grad_norm": 1.663047754354477, + "learning_rate": 1.9008670456448876e-06, + "loss": 0.6803, + "step": 5654 + }, + { + "epoch": 0.84, + "grad_norm": 2.795307518788447, + "learning_rate": 1.900825104419149e-06, + "loss": 0.6816, + "step": 5655 + }, + { + "epoch": 0.84, + "grad_norm": 0.6479373717707054, + "learning_rate": 1.9007831547859299e-06, + "loss": 0.6882, + "step": 5656 + }, + { + "epoch": 0.84, + "grad_norm": 1.2747659057648204, + "learning_rate": 1.900741196745622e-06, + "loss": 0.6927, + "step": 5657 + }, + { + "epoch": 0.84, + "grad_norm": 1.1489486306586663, + "learning_rate": 1.9006992302986168e-06, + "loss": 0.6888, + "step": 5658 + }, + { + "epoch": 0.84, + "grad_norm": 3.0631560679075096, + "learning_rate": 1.9006572554453059e-06, + "loss": 0.6934, + "step": 5659 + }, + { + "epoch": 0.84, + "grad_norm": 4.292291383592662, + "learning_rate": 1.9006152721860815e-06, + "loss": 0.668, + "step": 5660 + }, + { + "epoch": 0.84, + "grad_norm": 2.886055996123708, + "learning_rate": 1.900573280521335e-06, + "loss": 0.7174, + "step": 5661 + }, + { + "epoch": 0.84, + "grad_norm": 2.9485468562059807, + "learning_rate": 1.9005312804514584e-06, + "loss": 0.6862, + "step": 5662 + }, + { + "epoch": 0.84, + "grad_norm": 2.633852948573134, + "learning_rate": 1.9004892719768437e-06, + "loss": 0.6901, + "step": 5663 + }, + { + "epoch": 0.84, + "grad_norm": 1.4864091909031532, + "learning_rate": 1.900447255097883e-06, + "loss": 0.6921, + "step": 5664 + }, + { + "epoch": 0.84, + "grad_norm": 2.3226959674003864, + "learning_rate": 1.9004052298149684e-06, + "loss": 0.6745, + "step": 5665 + }, + { + "epoch": 0.85, + "grad_norm": 2.609281726305633, + "learning_rate": 1.9003631961284925e-06, + "loss": 0.6771, + "step": 5666 + }, + { + "epoch": 0.85, + "grad_norm": 1.2314401198019638, + "learning_rate": 1.9003211540388467e-06, + "loss": 0.6849, + "step": 5667 + }, + { + "epoch": 0.85, + "grad_norm": 4.268316054100911, + "learning_rate": 1.9002791035464245e-06, + "loss": 0.679, + "step": 5668 + }, + { + "epoch": 0.85, + "grad_norm": 4.818254798224212, + "learning_rate": 1.9002370446516175e-06, + "loss": 0.6797, + "step": 5669 + }, + { + "epoch": 0.85, + "grad_norm": 2.1032159842152875, + "learning_rate": 1.9001949773548184e-06, + "loss": 0.6979, + "step": 5670 + }, + { + "epoch": 0.85, + "grad_norm": 0.7182901217918238, + "learning_rate": 1.9001529016564204e-06, + "loss": 0.6836, + "step": 5671 + }, + { + "epoch": 0.85, + "grad_norm": 3.3622848907564897, + "learning_rate": 1.9001108175568153e-06, + "loss": 0.6927, + "step": 5672 + }, + { + "epoch": 0.85, + "grad_norm": 5.0556311899896045, + "learning_rate": 1.9000687250563967e-06, + "loss": 0.6921, + "step": 5673 + }, + { + "epoch": 0.85, + "grad_norm": 1.8013458967099603, + "learning_rate": 1.900026624155557e-06, + "loss": 0.6842, + "step": 5674 + }, + { + "epoch": 0.85, + "grad_norm": 2.1382900199681067, + "learning_rate": 1.8999845148546892e-06, + "loss": 0.6797, + "step": 5675 + }, + { + "epoch": 0.85, + "grad_norm": 2.2795776092559787, + "learning_rate": 1.8999423971541862e-06, + "loss": 0.6712, + "step": 5676 + }, + { + "epoch": 0.85, + "grad_norm": 2.1939675239421175, + "learning_rate": 1.8999002710544412e-06, + "loss": 0.6719, + "step": 5677 + }, + { + "epoch": 0.85, + "grad_norm": 4.16970920615337, + "learning_rate": 1.8998581365558478e-06, + "loss": 0.6888, + "step": 5678 + }, + { + "epoch": 0.85, + "grad_norm": 1.3312513650899354, + "learning_rate": 1.8998159936587986e-06, + "loss": 0.6921, + "step": 5679 + }, + { + "epoch": 0.85, + "grad_norm": 1.4192313603325126, + "learning_rate": 1.899773842363687e-06, + "loss": 0.6953, + "step": 5680 + }, + { + "epoch": 0.85, + "grad_norm": 1.7602624522432158, + "learning_rate": 1.8997316826709064e-06, + "loss": 0.6667, + "step": 5681 + }, + { + "epoch": 0.85, + "grad_norm": 5.162450537788318, + "learning_rate": 1.8996895145808506e-06, + "loss": 0.6823, + "step": 5682 + }, + { + "epoch": 0.85, + "grad_norm": 4.501587496565855, + "learning_rate": 1.8996473380939133e-06, + "loss": 0.6836, + "step": 5683 + }, + { + "epoch": 0.85, + "grad_norm": 0.6496453754647847, + "learning_rate": 1.8996051532104878e-06, + "loss": 0.6771, + "step": 5684 + }, + { + "epoch": 0.85, + "grad_norm": 2.0876229669716295, + "learning_rate": 1.8995629599309675e-06, + "loss": 0.6751, + "step": 5685 + }, + { + "epoch": 0.85, + "grad_norm": 0.6179285748744595, + "learning_rate": 1.8995207582557469e-06, + "loss": 0.6855, + "step": 5686 + }, + { + "epoch": 0.85, + "grad_norm": 1.045979277780798, + "learning_rate": 1.899478548185219e-06, + "loss": 0.707, + "step": 5687 + }, + { + "epoch": 0.85, + "grad_norm": 3.1419130575104504, + "learning_rate": 1.8994363297197789e-06, + "loss": 0.6829, + "step": 5688 + }, + { + "epoch": 0.85, + "grad_norm": 3.7241946836669815, + "learning_rate": 1.8993941028598197e-06, + "loss": 0.6895, + "step": 5689 + }, + { + "epoch": 0.85, + "grad_norm": 6.165313805286444, + "learning_rate": 1.8993518676057357e-06, + "loss": 0.6712, + "step": 5690 + }, + { + "epoch": 0.85, + "grad_norm": 2.671142936661056, + "learning_rate": 1.8993096239579215e-06, + "loss": 0.6966, + "step": 5691 + }, + { + "epoch": 0.85, + "grad_norm": 0.6938901904841335, + "learning_rate": 1.8992673719167707e-06, + "loss": 0.6784, + "step": 5692 + }, + { + "epoch": 0.85, + "grad_norm": 2.976364326847813, + "learning_rate": 1.899225111482678e-06, + "loss": 0.6914, + "step": 5693 + }, + { + "epoch": 0.85, + "grad_norm": 2.5372933123006596, + "learning_rate": 1.8991828426560378e-06, + "loss": 0.6973, + "step": 5694 + }, + { + "epoch": 0.85, + "grad_norm": 4.520438450410901, + "learning_rate": 1.899140565437245e-06, + "loss": 0.6803, + "step": 5695 + }, + { + "epoch": 0.85, + "grad_norm": 4.155176673604168, + "learning_rate": 1.8990982798266933e-06, + "loss": 0.6621, + "step": 5696 + }, + { + "epoch": 0.85, + "grad_norm": 2.275211006486067, + "learning_rate": 1.899055985824778e-06, + "loss": 0.6784, + "step": 5697 + }, + { + "epoch": 0.85, + "grad_norm": 0.697621767700265, + "learning_rate": 1.8990136834318938e-06, + "loss": 0.6784, + "step": 5698 + }, + { + "epoch": 0.85, + "grad_norm": 0.619970370030874, + "learning_rate": 1.8989713726484356e-06, + "loss": 0.6803, + "step": 5699 + }, + { + "epoch": 0.85, + "grad_norm": 1.7955639190223, + "learning_rate": 1.8989290534747977e-06, + "loss": 0.6764, + "step": 5700 + }, + { + "epoch": 0.85, + "grad_norm": 0.4822126077620442, + "learning_rate": 1.8988867259113754e-06, + "loss": 0.6895, + "step": 5701 + }, + { + "epoch": 0.85, + "grad_norm": 5.864200902362447, + "learning_rate": 1.898844389958564e-06, + "loss": 0.6947, + "step": 5702 + }, + { + "epoch": 0.85, + "grad_norm": 4.914384355045049, + "learning_rate": 1.8988020456167584e-06, + "loss": 0.6927, + "step": 5703 + }, + { + "epoch": 0.85, + "grad_norm": 3.3245644076642944, + "learning_rate": 1.8987596928863535e-06, + "loss": 0.6699, + "step": 5704 + }, + { + "epoch": 0.85, + "grad_norm": 0.8173784235488852, + "learning_rate": 1.8987173317677454e-06, + "loss": 0.6829, + "step": 5705 + }, + { + "epoch": 0.85, + "grad_norm": 4.014796859110358, + "learning_rate": 1.8986749622613287e-06, + "loss": 0.6667, + "step": 5706 + }, + { + "epoch": 0.85, + "grad_norm": 0.6230632678191484, + "learning_rate": 1.8986325843674994e-06, + "loss": 0.6823, + "step": 5707 + }, + { + "epoch": 0.85, + "grad_norm": 7.741686756394081, + "learning_rate": 1.8985901980866523e-06, + "loss": 0.6862, + "step": 5708 + }, + { + "epoch": 0.85, + "grad_norm": 0.8348524352382031, + "learning_rate": 1.8985478034191837e-06, + "loss": 0.6654, + "step": 5709 + }, + { + "epoch": 0.85, + "grad_norm": 2.0213473651315232, + "learning_rate": 1.8985054003654888e-06, + "loss": 0.6947, + "step": 5710 + }, + { + "epoch": 0.85, + "grad_norm": 5.559204799202996, + "learning_rate": 1.8984629889259636e-06, + "loss": 0.6725, + "step": 5711 + }, + { + "epoch": 0.85, + "grad_norm": 3.5003665957501666, + "learning_rate": 1.8984205691010039e-06, + "loss": 0.6784, + "step": 5712 + }, + { + "epoch": 0.85, + "grad_norm": 1.8532288938012103, + "learning_rate": 1.8983781408910056e-06, + "loss": 0.6895, + "step": 5713 + }, + { + "epoch": 0.85, + "grad_norm": 1.8333742487654896, + "learning_rate": 1.8983357042963646e-06, + "loss": 0.6849, + "step": 5714 + }, + { + "epoch": 0.85, + "grad_norm": 3.5439757805085557, + "learning_rate": 1.898293259317477e-06, + "loss": 0.7064, + "step": 5715 + }, + { + "epoch": 0.85, + "grad_norm": 4.8210389609459146, + "learning_rate": 1.898250805954739e-06, + "loss": 0.6868, + "step": 5716 + }, + { + "epoch": 0.85, + "grad_norm": 0.9681914765678747, + "learning_rate": 1.8982083442085468e-06, + "loss": 0.7031, + "step": 5717 + }, + { + "epoch": 0.85, + "grad_norm": 2.842885709181293, + "learning_rate": 1.8981658740792967e-06, + "loss": 0.6947, + "step": 5718 + }, + { + "epoch": 0.85, + "grad_norm": 1.9563318077930694, + "learning_rate": 1.898123395567385e-06, + "loss": 0.6654, + "step": 5719 + }, + { + "epoch": 0.85, + "grad_norm": 1.4257672989071417, + "learning_rate": 1.8980809086732083e-06, + "loss": 0.6947, + "step": 5720 + }, + { + "epoch": 0.85, + "grad_norm": 5.6611321303945985, + "learning_rate": 1.8980384133971631e-06, + "loss": 0.6784, + "step": 5721 + }, + { + "epoch": 0.85, + "grad_norm": 1.926105846832941, + "learning_rate": 1.897995909739646e-06, + "loss": 0.694, + "step": 5722 + }, + { + "epoch": 0.85, + "grad_norm": 3.602894842187153, + "learning_rate": 1.8979533977010535e-06, + "loss": 0.6797, + "step": 5723 + }, + { + "epoch": 0.85, + "grad_norm": 0.49426812100402306, + "learning_rate": 1.8979108772817827e-06, + "loss": 0.6882, + "step": 5724 + }, + { + "epoch": 0.85, + "grad_norm": 4.1738719648178995, + "learning_rate": 1.89786834848223e-06, + "loss": 0.6947, + "step": 5725 + }, + { + "epoch": 0.85, + "grad_norm": 1.6008662314574131, + "learning_rate": 1.8978258113027929e-06, + "loss": 0.6719, + "step": 5726 + }, + { + "epoch": 0.85, + "grad_norm": 2.508465716852739, + "learning_rate": 1.8977832657438682e-06, + "loss": 0.6641, + "step": 5727 + }, + { + "epoch": 0.85, + "grad_norm": 1.1801666278230776, + "learning_rate": 1.8977407118058526e-06, + "loss": 0.6966, + "step": 5728 + }, + { + "epoch": 0.85, + "grad_norm": 3.327992056767213, + "learning_rate": 1.8976981494891433e-06, + "loss": 0.6862, + "step": 5729 + }, + { + "epoch": 0.85, + "grad_norm": 1.9599689774164932, + "learning_rate": 1.8976555787941381e-06, + "loss": 0.6712, + "step": 5730 + }, + { + "epoch": 0.85, + "grad_norm": 4.15644934040858, + "learning_rate": 1.8976129997212342e-06, + "loss": 0.6725, + "step": 5731 + }, + { + "epoch": 0.85, + "grad_norm": 3.1989128849456323, + "learning_rate": 1.8975704122708285e-06, + "loss": 0.679, + "step": 5732 + }, + { + "epoch": 0.86, + "grad_norm": 3.3938393842617263, + "learning_rate": 1.8975278164433188e-06, + "loss": 0.6667, + "step": 5733 + }, + { + "epoch": 0.86, + "grad_norm": 0.6724216429388632, + "learning_rate": 1.8974852122391028e-06, + "loss": 0.6673, + "step": 5734 + }, + { + "epoch": 0.86, + "grad_norm": 1.3791972402556618, + "learning_rate": 1.8974425996585777e-06, + "loss": 0.6777, + "step": 5735 + }, + { + "epoch": 0.86, + "grad_norm": 8.502808565867802, + "learning_rate": 1.8973999787021416e-06, + "loss": 0.6842, + "step": 5736 + }, + { + "epoch": 0.86, + "grad_norm": 1.8394726413444533, + "learning_rate": 1.8973573493701922e-06, + "loss": 0.6849, + "step": 5737 + }, + { + "epoch": 0.86, + "grad_norm": 0.9943399301836768, + "learning_rate": 1.8973147116631272e-06, + "loss": 0.6888, + "step": 5738 + }, + { + "epoch": 0.86, + "grad_norm": 4.946187425246753, + "learning_rate": 1.8972720655813446e-06, + "loss": 0.7077, + "step": 5739 + }, + { + "epoch": 0.86, + "grad_norm": 2.3827342521395303, + "learning_rate": 1.8972294111252424e-06, + "loss": 0.679, + "step": 5740 + }, + { + "epoch": 0.86, + "grad_norm": 5.704261143629431, + "learning_rate": 1.8971867482952192e-06, + "loss": 0.6621, + "step": 5741 + }, + { + "epoch": 0.86, + "grad_norm": 0.6356937418647707, + "learning_rate": 1.897144077091672e-06, + "loss": 0.7005, + "step": 5742 + }, + { + "epoch": 0.86, + "grad_norm": 0.8999155694635049, + "learning_rate": 1.8971013975150004e-06, + "loss": 0.6855, + "step": 5743 + }, + { + "epoch": 0.86, + "grad_norm": 0.5128330486874426, + "learning_rate": 1.8970587095656019e-06, + "loss": 0.6803, + "step": 5744 + }, + { + "epoch": 0.86, + "grad_norm": 3.133493931812541, + "learning_rate": 1.8970160132438752e-06, + "loss": 0.6966, + "step": 5745 + }, + { + "epoch": 0.86, + "grad_norm": 1.666324787951686, + "learning_rate": 1.8969733085502186e-06, + "loss": 0.707, + "step": 5746 + }, + { + "epoch": 0.86, + "grad_norm": 1.8917005388006758, + "learning_rate": 1.8969305954850307e-06, + "loss": 0.7031, + "step": 5747 + }, + { + "epoch": 0.86, + "grad_norm": 5.5573490739220945, + "learning_rate": 1.8968878740487104e-06, + "loss": 0.6797, + "step": 5748 + }, + { + "epoch": 0.86, + "grad_norm": 6.255828571839916, + "learning_rate": 1.8968451442416562e-06, + "loss": 0.6797, + "step": 5749 + }, + { + "epoch": 0.86, + "grad_norm": 1.5190873977692851, + "learning_rate": 1.8968024060642672e-06, + "loss": 0.6868, + "step": 5750 + }, + { + "epoch": 0.86, + "grad_norm": 3.3823135655857297, + "learning_rate": 1.8967596595169418e-06, + "loss": 0.6771, + "step": 5751 + }, + { + "epoch": 0.86, + "grad_norm": 6.332256324552277, + "learning_rate": 1.8967169046000793e-06, + "loss": 0.6947, + "step": 5752 + }, + { + "epoch": 0.86, + "grad_norm": 1.124390502297776, + "learning_rate": 1.8966741413140784e-06, + "loss": 0.6986, + "step": 5753 + }, + { + "epoch": 0.86, + "grad_norm": 2.568240637280728, + "learning_rate": 1.8966313696593387e-06, + "loss": 0.6901, + "step": 5754 + }, + { + "epoch": 0.86, + "grad_norm": 2.901360787174524, + "learning_rate": 1.8965885896362587e-06, + "loss": 0.6895, + "step": 5755 + }, + { + "epoch": 0.86, + "grad_norm": 4.443272921963464, + "learning_rate": 1.8965458012452387e-06, + "loss": 0.6751, + "step": 5756 + }, + { + "epoch": 0.86, + "grad_norm": 1.788279089285079, + "learning_rate": 1.8965030044866772e-06, + "loss": 0.6888, + "step": 5757 + }, + { + "epoch": 0.86, + "grad_norm": 1.1022001648949211, + "learning_rate": 1.8964601993609736e-06, + "loss": 0.6751, + "step": 5758 + }, + { + "epoch": 0.86, + "grad_norm": 4.494154346413941, + "learning_rate": 1.8964173858685281e-06, + "loss": 0.6771, + "step": 5759 + }, + { + "epoch": 0.86, + "grad_norm": 0.4342255340163689, + "learning_rate": 1.8963745640097396e-06, + "loss": 0.6797, + "step": 5760 + }, + { + "epoch": 0.86, + "grad_norm": 6.8755156056215, + "learning_rate": 1.8963317337850083e-06, + "loss": 0.6634, + "step": 5761 + }, + { + "epoch": 0.86, + "grad_norm": 2.031523884533938, + "learning_rate": 1.8962888951947333e-06, + "loss": 0.6758, + "step": 5762 + }, + { + "epoch": 0.86, + "grad_norm": 0.44709465204532595, + "learning_rate": 1.896246048239315e-06, + "loss": 0.6719, + "step": 5763 + }, + { + "epoch": 0.86, + "grad_norm": 2.702700835291541, + "learning_rate": 1.896203192919153e-06, + "loss": 0.6556, + "step": 5764 + }, + { + "epoch": 0.86, + "grad_norm": 3.2049375198457573, + "learning_rate": 1.8961603292346475e-06, + "loss": 0.6888, + "step": 5765 + }, + { + "epoch": 0.86, + "grad_norm": 3.1166873011126075, + "learning_rate": 1.8961174571861982e-06, + "loss": 0.6764, + "step": 5766 + }, + { + "epoch": 0.86, + "grad_norm": 1.5150526580454737, + "learning_rate": 1.8960745767742055e-06, + "loss": 0.6882, + "step": 5767 + }, + { + "epoch": 0.86, + "grad_norm": 2.2996652037854877, + "learning_rate": 1.8960316879990694e-06, + "loss": 0.6803, + "step": 5768 + }, + { + "epoch": 0.86, + "grad_norm": 4.310140596784811, + "learning_rate": 1.8959887908611906e-06, + "loss": 0.6966, + "step": 5769 + }, + { + "epoch": 0.86, + "grad_norm": 1.069711691065855, + "learning_rate": 1.895945885360969e-06, + "loss": 0.7025, + "step": 5770 + }, + { + "epoch": 0.86, + "grad_norm": 3.7457525986711184, + "learning_rate": 1.8959029714988052e-06, + "loss": 0.6947, + "step": 5771 + }, + { + "epoch": 0.86, + "grad_norm": 4.371233379472939, + "learning_rate": 1.8958600492750997e-06, + "loss": 0.6771, + "step": 5772 + }, + { + "epoch": 0.86, + "grad_norm": 1.4163868494304255, + "learning_rate": 1.8958171186902533e-06, + "loss": 0.6908, + "step": 5773 + }, + { + "epoch": 0.86, + "grad_norm": 0.7168400184118793, + "learning_rate": 1.8957741797446664e-06, + "loss": 0.6784, + "step": 5774 + }, + { + "epoch": 0.86, + "grad_norm": 2.648158237595941, + "learning_rate": 1.8957312324387397e-06, + "loss": 0.6816, + "step": 5775 + }, + { + "epoch": 0.86, + "grad_norm": 2.5789768314206953, + "learning_rate": 1.8956882767728745e-06, + "loss": 0.6764, + "step": 5776 + }, + { + "epoch": 0.86, + "grad_norm": 0.7682562086457699, + "learning_rate": 1.8956453127474714e-06, + "loss": 0.6875, + "step": 5777 + }, + { + "epoch": 0.86, + "grad_norm": 1.8211506118783203, + "learning_rate": 1.895602340362931e-06, + "loss": 0.6823, + "step": 5778 + }, + { + "epoch": 0.86, + "grad_norm": 3.2107388912122157, + "learning_rate": 1.8955593596196554e-06, + "loss": 0.6888, + "step": 5779 + }, + { + "epoch": 0.86, + "grad_norm": 1.9290956476111283, + "learning_rate": 1.8955163705180443e-06, + "loss": 0.6764, + "step": 5780 + }, + { + "epoch": 0.86, + "grad_norm": 0.5513879908553017, + "learning_rate": 1.8954733730585005e-06, + "loss": 0.6842, + "step": 5781 + }, + { + "epoch": 0.86, + "grad_norm": 1.805813814807205, + "learning_rate": 1.8954303672414241e-06, + "loss": 0.6888, + "step": 5782 + }, + { + "epoch": 0.86, + "grad_norm": 2.5199415785741963, + "learning_rate": 1.8953873530672166e-06, + "loss": 0.6947, + "step": 5783 + }, + { + "epoch": 0.86, + "grad_norm": 3.8217027052960235, + "learning_rate": 1.8953443305362802e-06, + "loss": 0.7018, + "step": 5784 + }, + { + "epoch": 0.86, + "grad_norm": 1.7638888891201998, + "learning_rate": 1.8953012996490162e-06, + "loss": 0.6764, + "step": 5785 + }, + { + "epoch": 0.86, + "grad_norm": 5.106118886001696, + "learning_rate": 1.8952582604058254e-06, + "loss": 0.6862, + "step": 5786 + }, + { + "epoch": 0.86, + "grad_norm": 3.739461829905946, + "learning_rate": 1.8952152128071104e-06, + "loss": 0.6758, + "step": 5787 + }, + { + "epoch": 0.86, + "grad_norm": 0.9247422163003521, + "learning_rate": 1.8951721568532725e-06, + "loss": 0.6816, + "step": 5788 + }, + { + "epoch": 0.86, + "grad_norm": 1.3759019666367902, + "learning_rate": 1.895129092544714e-06, + "loss": 0.6797, + "step": 5789 + }, + { + "epoch": 0.86, + "grad_norm": 2.7740902232028155, + "learning_rate": 1.8950860198818361e-06, + "loss": 0.6634, + "step": 5790 + }, + { + "epoch": 0.86, + "grad_norm": 1.214971342747412, + "learning_rate": 1.8950429388650415e-06, + "loss": 0.6966, + "step": 5791 + }, + { + "epoch": 0.86, + "grad_norm": 1.1881031022766677, + "learning_rate": 1.8949998494947319e-06, + "loss": 0.679, + "step": 5792 + }, + { + "epoch": 0.86, + "grad_norm": 2.633135843959463, + "learning_rate": 1.8949567517713097e-06, + "loss": 0.6823, + "step": 5793 + }, + { + "epoch": 0.86, + "grad_norm": 1.853101150842682, + "learning_rate": 1.8949136456951766e-06, + "loss": 0.7044, + "step": 5794 + }, + { + "epoch": 0.86, + "grad_norm": 1.7656448721831561, + "learning_rate": 1.894870531266736e-06, + "loss": 0.6797, + "step": 5795 + }, + { + "epoch": 0.86, + "grad_norm": 1.011784662986079, + "learning_rate": 1.894827408486389e-06, + "loss": 0.6589, + "step": 5796 + }, + { + "epoch": 0.86, + "grad_norm": 2.7702292713759276, + "learning_rate": 1.8947842773545389e-06, + "loss": 0.722, + "step": 5797 + }, + { + "epoch": 0.86, + "grad_norm": 1.0258817792746755, + "learning_rate": 1.8947411378715879e-06, + "loss": 0.7031, + "step": 5798 + }, + { + "epoch": 0.86, + "grad_norm": 4.751800823656043, + "learning_rate": 1.8946979900379388e-06, + "loss": 0.6862, + "step": 5799 + }, + { + "epoch": 0.87, + "grad_norm": 4.357393362439956, + "learning_rate": 1.894654833853994e-06, + "loss": 0.6738, + "step": 5800 + }, + { + "epoch": 0.87, + "grad_norm": 3.7818072722063145, + "learning_rate": 1.8946116693201568e-06, + "loss": 0.6758, + "step": 5801 + }, + { + "epoch": 0.87, + "grad_norm": 3.800922021928231, + "learning_rate": 1.8945684964368298e-06, + "loss": 0.6966, + "step": 5802 + }, + { + "epoch": 0.87, + "grad_norm": 5.531842322887951, + "learning_rate": 1.8945253152044156e-06, + "loss": 0.6927, + "step": 5803 + }, + { + "epoch": 0.87, + "grad_norm": 0.454003294842343, + "learning_rate": 1.8944821256233179e-06, + "loss": 0.6764, + "step": 5804 + }, + { + "epoch": 0.87, + "grad_norm": 3.8202808576101277, + "learning_rate": 1.8944389276939395e-06, + "loss": 0.6966, + "step": 5805 + }, + { + "epoch": 0.87, + "grad_norm": 3.606672496239756, + "learning_rate": 1.894395721416683e-06, + "loss": 0.6914, + "step": 5806 + }, + { + "epoch": 0.87, + "grad_norm": 6.236503181948538, + "learning_rate": 1.8943525067919524e-06, + "loss": 0.6895, + "step": 5807 + }, + { + "epoch": 0.87, + "grad_norm": 0.9955553539231059, + "learning_rate": 1.8943092838201506e-06, + "loss": 0.6667, + "step": 5808 + }, + { + "epoch": 0.87, + "grad_norm": 1.1023162167295946, + "learning_rate": 1.8942660525016814e-06, + "loss": 0.6712, + "step": 5809 + }, + { + "epoch": 0.87, + "grad_norm": 0.6686722433276703, + "learning_rate": 1.894222812836948e-06, + "loss": 0.6816, + "step": 5810 + }, + { + "epoch": 0.87, + "grad_norm": 7.399511712353142, + "learning_rate": 1.894179564826354e-06, + "loss": 0.6979, + "step": 5811 + }, + { + "epoch": 0.87, + "grad_norm": 4.6391403586305575, + "learning_rate": 1.894136308470303e-06, + "loss": 0.668, + "step": 5812 + }, + { + "epoch": 0.87, + "grad_norm": 5.833517886114246, + "learning_rate": 1.8940930437691987e-06, + "loss": 0.694, + "step": 5813 + }, + { + "epoch": 0.87, + "grad_norm": 6.077642087509309, + "learning_rate": 1.894049770723445e-06, + "loss": 0.6855, + "step": 5814 + }, + { + "epoch": 0.87, + "grad_norm": 1.3675848617130593, + "learning_rate": 1.8940064893334457e-06, + "loss": 0.6953, + "step": 5815 + }, + { + "epoch": 0.87, + "grad_norm": 1.208380805314563, + "learning_rate": 1.893963199599605e-06, + "loss": 0.6797, + "step": 5816 + }, + { + "epoch": 0.87, + "grad_norm": 5.690730937779325, + "learning_rate": 1.8939199015223263e-06, + "loss": 0.6738, + "step": 5817 + }, + { + "epoch": 0.87, + "grad_norm": 2.934613076297435, + "learning_rate": 1.8938765951020144e-06, + "loss": 0.6751, + "step": 5818 + }, + { + "epoch": 0.87, + "grad_norm": 1.1703147519309798, + "learning_rate": 1.893833280339073e-06, + "loss": 0.6862, + "step": 5819 + }, + { + "epoch": 0.87, + "grad_norm": 0.5219298205235605, + "learning_rate": 1.8937899572339067e-06, + "loss": 0.6868, + "step": 5820 + }, + { + "epoch": 0.87, + "grad_norm": 4.059285012352195, + "learning_rate": 1.8937466257869197e-06, + "loss": 0.6725, + "step": 5821 + }, + { + "epoch": 0.87, + "grad_norm": 0.5137517708945837, + "learning_rate": 1.8937032859985162e-06, + "loss": 0.6842, + "step": 5822 + }, + { + "epoch": 0.87, + "grad_norm": 1.9922617329216517, + "learning_rate": 1.8936599378691013e-06, + "loss": 0.6758, + "step": 5823 + }, + { + "epoch": 0.87, + "grad_norm": 2.4831263613722503, + "learning_rate": 1.8936165813990788e-06, + "loss": 0.6868, + "step": 5824 + }, + { + "epoch": 0.87, + "grad_norm": 2.612737356264833, + "learning_rate": 1.893573216588854e-06, + "loss": 0.6693, + "step": 5825 + }, + { + "epoch": 0.87, + "grad_norm": 1.5551833567318094, + "learning_rate": 1.8935298434388311e-06, + "loss": 0.6686, + "step": 5826 + }, + { + "epoch": 0.87, + "grad_norm": 1.8010150289318558, + "learning_rate": 1.8934864619494154e-06, + "loss": 0.6628, + "step": 5827 + }, + { + "epoch": 0.87, + "grad_norm": 1.3892560768271918, + "learning_rate": 1.8934430721210113e-06, + "loss": 0.6582, + "step": 5828 + }, + { + "epoch": 0.87, + "grad_norm": 1.256292750928203, + "learning_rate": 1.893399673954024e-06, + "loss": 0.6725, + "step": 5829 + }, + { + "epoch": 0.87, + "grad_norm": 7.008553588340591, + "learning_rate": 1.8933562674488587e-06, + "loss": 0.709, + "step": 5830 + }, + { + "epoch": 0.87, + "grad_norm": 1.3406687320482205, + "learning_rate": 1.8933128526059201e-06, + "loss": 0.6908, + "step": 5831 + }, + { + "epoch": 0.87, + "grad_norm": 1.676525064600641, + "learning_rate": 1.8932694294256138e-06, + "loss": 0.6849, + "step": 5832 + }, + { + "epoch": 0.87, + "grad_norm": 3.775650452545772, + "learning_rate": 1.893225997908345e-06, + "loss": 0.6764, + "step": 5833 + }, + { + "epoch": 0.87, + "grad_norm": 1.7644023279080867, + "learning_rate": 1.8931825580545188e-06, + "loss": 0.6667, + "step": 5834 + }, + { + "epoch": 0.87, + "grad_norm": 0.526754605958477, + "learning_rate": 1.893139109864541e-06, + "loss": 0.666, + "step": 5835 + }, + { + "epoch": 0.87, + "grad_norm": 2.819768382311779, + "learning_rate": 1.8930956533388165e-06, + "loss": 0.6732, + "step": 5836 + }, + { + "epoch": 0.87, + "grad_norm": 1.5601533813945314, + "learning_rate": 1.8930521884777515e-06, + "loss": 0.6732, + "step": 5837 + }, + { + "epoch": 0.87, + "grad_norm": 4.5258274123014814, + "learning_rate": 1.8930087152817516e-06, + "loss": 0.6784, + "step": 5838 + }, + { + "epoch": 0.87, + "grad_norm": 4.330371758496266, + "learning_rate": 1.8929652337512222e-06, + "loss": 0.6803, + "step": 5839 + }, + { + "epoch": 0.87, + "grad_norm": 1.4728203896536765, + "learning_rate": 1.8929217438865694e-06, + "loss": 0.6953, + "step": 5840 + }, + { + "epoch": 0.87, + "grad_norm": 5.432364209702959, + "learning_rate": 1.8928782456881991e-06, + "loss": 0.6875, + "step": 5841 + }, + { + "epoch": 0.87, + "grad_norm": 1.5534916818862385, + "learning_rate": 1.892834739156517e-06, + "loss": 0.6706, + "step": 5842 + }, + { + "epoch": 0.87, + "grad_norm": 4.608751771932098, + "learning_rate": 1.8927912242919295e-06, + "loss": 0.709, + "step": 5843 + }, + { + "epoch": 0.87, + "grad_norm": 2.933560807432667, + "learning_rate": 1.8927477010948425e-06, + "loss": 0.6823, + "step": 5844 + }, + { + "epoch": 0.87, + "grad_norm": 7.781365263533117, + "learning_rate": 1.892704169565662e-06, + "loss": 0.6836, + "step": 5845 + }, + { + "epoch": 0.87, + "grad_norm": 1.3718834895666472, + "learning_rate": 1.8926606297047948e-06, + "loss": 0.6628, + "step": 5846 + }, + { + "epoch": 0.87, + "grad_norm": 0.6804790363111145, + "learning_rate": 1.892617081512647e-06, + "loss": 0.6862, + "step": 5847 + }, + { + "epoch": 0.87, + "grad_norm": 4.324371175776372, + "learning_rate": 1.892573524989625e-06, + "loss": 0.6914, + "step": 5848 + }, + { + "epoch": 0.87, + "grad_norm": 5.622174614262544, + "learning_rate": 1.8925299601361355e-06, + "loss": 0.6921, + "step": 5849 + }, + { + "epoch": 0.87, + "grad_norm": 2.589615376608115, + "learning_rate": 1.8924863869525845e-06, + "loss": 0.679, + "step": 5850 + }, + { + "epoch": 0.87, + "grad_norm": 3.2902023712430863, + "learning_rate": 1.8924428054393795e-06, + "loss": 0.6758, + "step": 5851 + }, + { + "epoch": 0.87, + "grad_norm": 2.1070032329039132, + "learning_rate": 1.8923992155969269e-06, + "loss": 0.6699, + "step": 5852 + }, + { + "epoch": 0.87, + "grad_norm": 1.4635753781230223, + "learning_rate": 1.8923556174256333e-06, + "loss": 0.6875, + "step": 5853 + }, + { + "epoch": 0.87, + "grad_norm": 1.9253173454152992, + "learning_rate": 1.892312010925906e-06, + "loss": 0.6699, + "step": 5854 + }, + { + "epoch": 0.87, + "grad_norm": 4.111958646948024, + "learning_rate": 1.8922683960981516e-06, + "loss": 0.6654, + "step": 5855 + }, + { + "epoch": 0.87, + "grad_norm": 4.192286610640333, + "learning_rate": 1.8922247729427775e-06, + "loss": 0.668, + "step": 5856 + }, + { + "epoch": 0.87, + "grad_norm": 3.6941452017418777, + "learning_rate": 1.8921811414601904e-06, + "loss": 0.7018, + "step": 5857 + }, + { + "epoch": 0.87, + "grad_norm": 3.618936338705843, + "learning_rate": 1.8921375016507981e-06, + "loss": 0.6934, + "step": 5858 + }, + { + "epoch": 0.87, + "grad_norm": 1.2670768683580413, + "learning_rate": 1.8920938535150076e-06, + "loss": 0.6823, + "step": 5859 + }, + { + "epoch": 0.87, + "grad_norm": 4.805105141163526, + "learning_rate": 1.8920501970532262e-06, + "loss": 0.7051, + "step": 5860 + }, + { + "epoch": 0.87, + "grad_norm": 2.2069087195617576, + "learning_rate": 1.8920065322658614e-06, + "loss": 0.6895, + "step": 5861 + }, + { + "epoch": 0.87, + "grad_norm": 0.534131838856608, + "learning_rate": 1.8919628591533207e-06, + "loss": 0.6654, + "step": 5862 + }, + { + "epoch": 0.87, + "grad_norm": 0.6444501275079458, + "learning_rate": 1.8919191777160118e-06, + "loss": 0.6693, + "step": 5863 + }, + { + "epoch": 0.87, + "grad_norm": 1.1831584762964416, + "learning_rate": 1.8918754879543425e-06, + "loss": 0.6738, + "step": 5864 + }, + { + "epoch": 0.87, + "grad_norm": 0.600372815109467, + "learning_rate": 1.8918317898687202e-06, + "loss": 0.7005, + "step": 5865 + }, + { + "epoch": 0.87, + "grad_norm": 1.9924697311166937, + "learning_rate": 1.891788083459553e-06, + "loss": 0.6758, + "step": 5866 + }, + { + "epoch": 0.88, + "grad_norm": 2.5143550842509055, + "learning_rate": 1.8917443687272488e-06, + "loss": 0.6764, + "step": 5867 + }, + { + "epoch": 0.88, + "grad_norm": 0.907324040742769, + "learning_rate": 1.8917006456722152e-06, + "loss": 0.7031, + "step": 5868 + }, + { + "epoch": 0.88, + "grad_norm": 3.0125717732973536, + "learning_rate": 1.891656914294861e-06, + "loss": 0.6927, + "step": 5869 + }, + { + "epoch": 0.88, + "grad_norm": 6.734529918561692, + "learning_rate": 1.891613174595594e-06, + "loss": 0.6602, + "step": 5870 + }, + { + "epoch": 0.88, + "grad_norm": 0.9217786698436028, + "learning_rate": 1.891569426574822e-06, + "loss": 0.6712, + "step": 5871 + }, + { + "epoch": 0.88, + "grad_norm": 0.9558600033936074, + "learning_rate": 1.8915256702329541e-06, + "loss": 0.6842, + "step": 5872 + }, + { + "epoch": 0.88, + "grad_norm": 3.318633759390756, + "learning_rate": 1.8914819055703983e-06, + "loss": 0.7155, + "step": 5873 + }, + { + "epoch": 0.88, + "grad_norm": 1.0878083081310888, + "learning_rate": 1.8914381325875629e-06, + "loss": 0.6934, + "step": 5874 + }, + { + "epoch": 0.88, + "grad_norm": 2.4704283982135102, + "learning_rate": 1.8913943512848563e-06, + "loss": 0.7044, + "step": 5875 + }, + { + "epoch": 0.88, + "grad_norm": 3.6186704503992715, + "learning_rate": 1.8913505616626878e-06, + "loss": 0.6927, + "step": 5876 + }, + { + "epoch": 0.88, + "grad_norm": 7.485747614842573, + "learning_rate": 1.8913067637214655e-06, + "loss": 0.6725, + "step": 5877 + }, + { + "epoch": 0.88, + "grad_norm": 0.7166016379946732, + "learning_rate": 1.8912629574615984e-06, + "loss": 0.6667, + "step": 5878 + }, + { + "epoch": 0.88, + "grad_norm": 4.794814159080404, + "learning_rate": 1.8912191428834951e-06, + "loss": 0.7129, + "step": 5879 + }, + { + "epoch": 0.88, + "grad_norm": 3.2210114579946, + "learning_rate": 1.8911753199875648e-06, + "loss": 0.6862, + "step": 5880 + }, + { + "epoch": 0.88, + "grad_norm": 1.1127979818657205, + "learning_rate": 1.8911314887742168e-06, + "loss": 0.6706, + "step": 5881 + }, + { + "epoch": 0.88, + "grad_norm": 2.850934110301945, + "learning_rate": 1.8910876492438593e-06, + "loss": 0.707, + "step": 5882 + }, + { + "epoch": 0.88, + "grad_norm": 4.795126670901922, + "learning_rate": 1.8910438013969024e-06, + "loss": 0.6719, + "step": 5883 + }, + { + "epoch": 0.88, + "grad_norm": 2.0936531079642187, + "learning_rate": 1.8909999452337545e-06, + "loss": 0.6719, + "step": 5884 + }, + { + "epoch": 0.88, + "grad_norm": 3.2772097598776853, + "learning_rate": 1.8909560807548255e-06, + "loss": 0.6927, + "step": 5885 + }, + { + "epoch": 0.88, + "grad_norm": 6.761772082556573, + "learning_rate": 1.8909122079605247e-06, + "loss": 0.6999, + "step": 5886 + }, + { + "epoch": 0.88, + "grad_norm": 3.211275611370532, + "learning_rate": 1.8908683268512614e-06, + "loss": 0.6829, + "step": 5887 + }, + { + "epoch": 0.88, + "grad_norm": 1.4619287689467533, + "learning_rate": 1.8908244374274452e-06, + "loss": 0.6849, + "step": 5888 + }, + { + "epoch": 0.88, + "grad_norm": 0.6767189936812046, + "learning_rate": 1.8907805396894859e-06, + "loss": 0.6868, + "step": 5889 + }, + { + "epoch": 0.88, + "grad_norm": 0.43577764960217996, + "learning_rate": 1.8907366336377927e-06, + "loss": 0.6868, + "step": 5890 + }, + { + "epoch": 0.88, + "grad_norm": 0.6090718795628086, + "learning_rate": 1.8906927192727759e-06, + "loss": 0.681, + "step": 5891 + }, + { + "epoch": 0.88, + "grad_norm": 2.825757279787833, + "learning_rate": 1.8906487965948451e-06, + "loss": 0.681, + "step": 5892 + }, + { + "epoch": 0.88, + "grad_norm": 1.641065283158043, + "learning_rate": 1.8906048656044104e-06, + "loss": 0.6712, + "step": 5893 + }, + { + "epoch": 0.88, + "grad_norm": 1.5885401642951331, + "learning_rate": 1.890560926301882e-06, + "loss": 0.6895, + "step": 5894 + }, + { + "epoch": 0.88, + "grad_norm": 6.413559631683134, + "learning_rate": 1.8905169786876693e-06, + "loss": 0.6953, + "step": 5895 + }, + { + "epoch": 0.88, + "grad_norm": 5.860726293033074, + "learning_rate": 1.8904730227621832e-06, + "loss": 0.6732, + "step": 5896 + }, + { + "epoch": 0.88, + "grad_norm": 0.5648291127917496, + "learning_rate": 1.8904290585258335e-06, + "loss": 0.6797, + "step": 5897 + }, + { + "epoch": 0.88, + "grad_norm": 4.577450353460008, + "learning_rate": 1.8903850859790307e-06, + "loss": 0.6784, + "step": 5898 + }, + { + "epoch": 0.88, + "grad_norm": 1.840325015190047, + "learning_rate": 1.890341105122185e-06, + "loss": 0.6895, + "step": 5899 + }, + { + "epoch": 0.88, + "grad_norm": 0.9034068546529915, + "learning_rate": 1.890297115955707e-06, + "loss": 0.6608, + "step": 5900 + }, + { + "epoch": 0.88, + "grad_norm": 0.7630881266200054, + "learning_rate": 1.8902531184800075e-06, + "loss": 0.6921, + "step": 5901 + }, + { + "epoch": 0.88, + "grad_norm": 0.3625475664607728, + "learning_rate": 1.8902091126954968e-06, + "loss": 0.6849, + "step": 5902 + }, + { + "epoch": 0.88, + "grad_norm": 2.4314372818807035, + "learning_rate": 1.8901650986025857e-06, + "loss": 0.694, + "step": 5903 + }, + { + "epoch": 0.88, + "grad_norm": 2.5332524884365424, + "learning_rate": 1.890121076201685e-06, + "loss": 0.6686, + "step": 5904 + }, + { + "epoch": 0.88, + "grad_norm": 2.5649505613036236, + "learning_rate": 1.8900770454932059e-06, + "loss": 0.6914, + "step": 5905 + }, + { + "epoch": 0.88, + "grad_norm": 0.6316613864772797, + "learning_rate": 1.8900330064775584e-06, + "loss": 0.6816, + "step": 5906 + }, + { + "epoch": 0.88, + "grad_norm": 0.887035442894278, + "learning_rate": 1.8899889591551546e-06, + "loss": 0.6908, + "step": 5907 + }, + { + "epoch": 0.88, + "grad_norm": 1.8189305509948062, + "learning_rate": 1.889944903526405e-06, + "loss": 0.6784, + "step": 5908 + }, + { + "epoch": 0.88, + "grad_norm": 2.9793176759356355, + "learning_rate": 1.8899008395917208e-06, + "loss": 0.6732, + "step": 5909 + }, + { + "epoch": 0.88, + "grad_norm": 5.3726819635631, + "learning_rate": 1.8898567673515137e-06, + "loss": 0.6621, + "step": 5910 + }, + { + "epoch": 0.88, + "grad_norm": 6.098438036010641, + "learning_rate": 1.8898126868061945e-06, + "loss": 0.668, + "step": 5911 + }, + { + "epoch": 0.88, + "grad_norm": 2.9382499594386378, + "learning_rate": 1.8897685979561748e-06, + "loss": 0.7077, + "step": 5912 + }, + { + "epoch": 0.88, + "grad_norm": 3.865377381800785, + "learning_rate": 1.889724500801866e-06, + "loss": 0.7077, + "step": 5913 + }, + { + "epoch": 0.88, + "grad_norm": 4.138046610383268, + "learning_rate": 1.8896803953436798e-06, + "loss": 0.6771, + "step": 5914 + }, + { + "epoch": 0.88, + "grad_norm": 1.4453037558732833, + "learning_rate": 1.889636281582028e-06, + "loss": 0.7103, + "step": 5915 + }, + { + "epoch": 0.88, + "grad_norm": 2.051366075502716, + "learning_rate": 1.889592159517322e-06, + "loss": 0.7025, + "step": 5916 + }, + { + "epoch": 0.88, + "grad_norm": 4.216645825007843, + "learning_rate": 1.8895480291499735e-06, + "loss": 0.6745, + "step": 5917 + }, + { + "epoch": 0.88, + "grad_norm": 1.5540214809603665, + "learning_rate": 1.8895038904803945e-06, + "loss": 0.6862, + "step": 5918 + }, + { + "epoch": 0.88, + "grad_norm": 2.27693836321633, + "learning_rate": 1.8894597435089974e-06, + "loss": 0.6966, + "step": 5919 + }, + { + "epoch": 0.88, + "grad_norm": 1.6258670775758, + "learning_rate": 1.8894155882361937e-06, + "loss": 0.6888, + "step": 5920 + }, + { + "epoch": 0.88, + "grad_norm": 4.075423097732612, + "learning_rate": 1.8893714246623958e-06, + "loss": 0.6634, + "step": 5921 + }, + { + "epoch": 0.88, + "grad_norm": 0.8817476753651244, + "learning_rate": 1.8893272527880155e-06, + "loss": 0.6934, + "step": 5922 + }, + { + "epoch": 0.88, + "grad_norm": 1.89645213567567, + "learning_rate": 1.8892830726134654e-06, + "loss": 0.6634, + "step": 5923 + }, + { + "epoch": 0.88, + "grad_norm": 3.7417658063265917, + "learning_rate": 1.8892388841391578e-06, + "loss": 0.7174, + "step": 5924 + }, + { + "epoch": 0.88, + "grad_norm": 6.594037589980852, + "learning_rate": 1.889194687365505e-06, + "loss": 0.7135, + "step": 5925 + }, + { + "epoch": 0.88, + "grad_norm": 3.2523053612761523, + "learning_rate": 1.8891504822929197e-06, + "loss": 0.6751, + "step": 5926 + }, + { + "epoch": 0.88, + "grad_norm": 0.4811828970389134, + "learning_rate": 1.889106268921814e-06, + "loss": 0.6829, + "step": 5927 + }, + { + "epoch": 0.88, + "grad_norm": 2.041427434761512, + "learning_rate": 1.8890620472526008e-06, + "loss": 0.6823, + "step": 5928 + }, + { + "epoch": 0.88, + "grad_norm": 1.91807170467695, + "learning_rate": 1.8890178172856932e-06, + "loss": 0.6836, + "step": 5929 + }, + { + "epoch": 0.88, + "grad_norm": 0.44572824626082763, + "learning_rate": 1.8889735790215036e-06, + "loss": 0.6855, + "step": 5930 + }, + { + "epoch": 0.88, + "grad_norm": 0.7626532393770764, + "learning_rate": 1.888929332460445e-06, + "loss": 0.6855, + "step": 5931 + }, + { + "epoch": 0.88, + "grad_norm": 2.1850451194984397, + "learning_rate": 1.8888850776029303e-06, + "loss": 0.6914, + "step": 5932 + }, + { + "epoch": 0.88, + "grad_norm": 3.8370436254553395, + "learning_rate": 1.8888408144493726e-06, + "loss": 0.696, + "step": 5933 + }, + { + "epoch": 0.89, + "grad_norm": 5.53940411260967, + "learning_rate": 1.888796543000185e-06, + "loss": 0.6895, + "step": 5934 + }, + { + "epoch": 0.89, + "grad_norm": 7.955040093059553, + "learning_rate": 1.8887522632557804e-06, + "loss": 0.6973, + "step": 5935 + }, + { + "epoch": 0.89, + "grad_norm": 0.9333274597250709, + "learning_rate": 1.8887079752165726e-06, + "loss": 0.6693, + "step": 5936 + }, + { + "epoch": 0.89, + "grad_norm": 0.457839301694274, + "learning_rate": 1.8886636788829746e-06, + "loss": 0.6953, + "step": 5937 + }, + { + "epoch": 0.89, + "grad_norm": 3.6493551063465595, + "learning_rate": 1.8886193742554e-06, + "loss": 0.681, + "step": 5938 + }, + { + "epoch": 0.89, + "grad_norm": 1.0924019833406042, + "learning_rate": 1.888575061334262e-06, + "loss": 0.7005, + "step": 5939 + }, + { + "epoch": 0.89, + "grad_norm": 2.7148133088168906, + "learning_rate": 1.8885307401199743e-06, + "loss": 0.6849, + "step": 5940 + }, + { + "epoch": 0.89, + "grad_norm": 1.0493661953047873, + "learning_rate": 1.888486410612951e-06, + "loss": 0.6901, + "step": 5941 + }, + { + "epoch": 0.89, + "grad_norm": 0.8734875794944658, + "learning_rate": 1.8884420728136051e-06, + "loss": 0.6868, + "step": 5942 + }, + { + "epoch": 0.89, + "grad_norm": 2.766719024882909, + "learning_rate": 1.888397726722351e-06, + "loss": 0.6745, + "step": 5943 + }, + { + "epoch": 0.89, + "grad_norm": 2.5105398788238382, + "learning_rate": 1.8883533723396024e-06, + "loss": 0.6836, + "step": 5944 + }, + { + "epoch": 0.89, + "grad_norm": 2.1367296478964994, + "learning_rate": 1.888309009665773e-06, + "loss": 0.6725, + "step": 5945 + }, + { + "epoch": 0.89, + "grad_norm": 1.9293876189110004, + "learning_rate": 1.888264638701277e-06, + "loss": 0.6797, + "step": 5946 + }, + { + "epoch": 0.89, + "grad_norm": 5.140309529467257, + "learning_rate": 1.8882202594465285e-06, + "loss": 0.6745, + "step": 5947 + }, + { + "epoch": 0.89, + "grad_norm": 4.974879032212208, + "learning_rate": 1.8881758719019423e-06, + "loss": 0.6992, + "step": 5948 + }, + { + "epoch": 0.89, + "grad_norm": 3.628904218236352, + "learning_rate": 1.8881314760679316e-06, + "loss": 0.6849, + "step": 5949 + }, + { + "epoch": 0.89, + "grad_norm": 4.217959444648854, + "learning_rate": 1.8880870719449118e-06, + "loss": 0.681, + "step": 5950 + }, + { + "epoch": 0.89, + "grad_norm": 2.354262813531418, + "learning_rate": 1.8880426595332964e-06, + "loss": 0.6895, + "step": 5951 + }, + { + "epoch": 0.89, + "grad_norm": 3.099917001343992, + "learning_rate": 1.8879982388335005e-06, + "loss": 0.6686, + "step": 5952 + }, + { + "epoch": 0.89, + "grad_norm": 4.550786764285134, + "learning_rate": 1.8879538098459382e-06, + "loss": 0.6712, + "step": 5953 + }, + { + "epoch": 0.89, + "grad_norm": 4.696528847182234, + "learning_rate": 1.8879093725710248e-06, + "loss": 0.694, + "step": 5954 + }, + { + "epoch": 0.89, + "grad_norm": 1.6839875085368214, + "learning_rate": 1.8878649270091747e-06, + "loss": 0.6901, + "step": 5955 + }, + { + "epoch": 0.89, + "grad_norm": 1.5739838542425084, + "learning_rate": 1.8878204731608024e-06, + "loss": 0.6855, + "step": 5956 + }, + { + "epoch": 0.89, + "grad_norm": 4.2162247976892795, + "learning_rate": 1.8877760110263235e-06, + "loss": 0.6888, + "step": 5957 + }, + { + "epoch": 0.89, + "grad_norm": 3.1096659028454647, + "learning_rate": 1.8877315406061525e-06, + "loss": 0.6816, + "step": 5958 + }, + { + "epoch": 0.89, + "grad_norm": 3.210146364766292, + "learning_rate": 1.8876870619007047e-06, + "loss": 0.6934, + "step": 5959 + }, + { + "epoch": 0.89, + "grad_norm": 4.301320020345402, + "learning_rate": 1.8876425749103946e-06, + "loss": 0.6992, + "step": 5960 + }, + { + "epoch": 0.89, + "grad_norm": 1.882241065143536, + "learning_rate": 1.8875980796356381e-06, + "loss": 0.6667, + "step": 5961 + }, + { + "epoch": 0.89, + "grad_norm": 7.023078893505202, + "learning_rate": 1.8875535760768501e-06, + "loss": 0.7096, + "step": 5962 + }, + { + "epoch": 0.89, + "grad_norm": 0.7222346473053234, + "learning_rate": 1.8875090642344462e-06, + "loss": 0.6816, + "step": 5963 + }, + { + "epoch": 0.89, + "grad_norm": 3.5341470858919406, + "learning_rate": 1.8874645441088415e-06, + "loss": 0.6888, + "step": 5964 + }, + { + "epoch": 0.89, + "grad_norm": 2.0479986040652913, + "learning_rate": 1.887420015700452e-06, + "loss": 0.6732, + "step": 5965 + }, + { + "epoch": 0.89, + "grad_norm": 1.266359507095793, + "learning_rate": 1.887375479009693e-06, + "loss": 0.6693, + "step": 5966 + }, + { + "epoch": 0.89, + "grad_norm": 3.1028948965096506, + "learning_rate": 1.88733093403698e-06, + "loss": 0.6699, + "step": 5967 + }, + { + "epoch": 0.89, + "grad_norm": 8.883463622843717, + "learning_rate": 1.887286380782729e-06, + "loss": 0.7038, + "step": 5968 + }, + { + "epoch": 0.89, + "grad_norm": 1.3449302022011655, + "learning_rate": 1.8872418192473557e-06, + "loss": 0.6693, + "step": 5969 + }, + { + "epoch": 0.89, + "grad_norm": 1.2579460540730516, + "learning_rate": 1.8871972494312761e-06, + "loss": 0.6816, + "step": 5970 + }, + { + "epoch": 0.89, + "grad_norm": 0.61156020828246, + "learning_rate": 1.887152671334906e-06, + "loss": 0.6732, + "step": 5971 + }, + { + "epoch": 0.89, + "grad_norm": 2.065771089784037, + "learning_rate": 1.8871080849586617e-06, + "loss": 0.6758, + "step": 5972 + }, + { + "epoch": 0.89, + "grad_norm": 1.354945728155412, + "learning_rate": 1.8870634903029594e-06, + "loss": 0.6706, + "step": 5973 + }, + { + "epoch": 0.89, + "grad_norm": 1.8217570628738906, + "learning_rate": 1.8870188873682147e-06, + "loss": 0.6823, + "step": 5974 + }, + { + "epoch": 0.89, + "grad_norm": 0.760994323332844, + "learning_rate": 1.8869742761548446e-06, + "loss": 0.6523, + "step": 5975 + }, + { + "epoch": 0.89, + "grad_norm": 2.7635657313363198, + "learning_rate": 1.8869296566632648e-06, + "loss": 0.6693, + "step": 5976 + }, + { + "epoch": 0.89, + "grad_norm": 0.48390272040181975, + "learning_rate": 1.8868850288938925e-06, + "loss": 0.681, + "step": 5977 + }, + { + "epoch": 0.89, + "grad_norm": 4.872528385455043, + "learning_rate": 1.8868403928471436e-06, + "loss": 0.6947, + "step": 5978 + }, + { + "epoch": 0.89, + "grad_norm": 9.65368588047377, + "learning_rate": 1.8867957485234345e-06, + "loss": 0.6855, + "step": 5979 + }, + { + "epoch": 0.89, + "grad_norm": 3.972599505078751, + "learning_rate": 1.8867510959231827e-06, + "loss": 0.6751, + "step": 5980 + }, + { + "epoch": 0.89, + "grad_norm": 0.6433032202100978, + "learning_rate": 1.8867064350468045e-06, + "loss": 0.6888, + "step": 5981 + }, + { + "epoch": 0.89, + "grad_norm": 2.6182420667879165, + "learning_rate": 1.8866617658947166e-06, + "loss": 0.7031, + "step": 5982 + }, + { + "epoch": 0.89, + "grad_norm": 4.9580094472208405, + "learning_rate": 1.8866170884673358e-06, + "loss": 0.6842, + "step": 5983 + }, + { + "epoch": 0.89, + "grad_norm": 7.502343373471326, + "learning_rate": 1.8865724027650797e-06, + "loss": 0.7018, + "step": 5984 + }, + { + "epoch": 0.89, + "grad_norm": 5.270028322515516, + "learning_rate": 1.8865277087883649e-06, + "loss": 0.6803, + "step": 5985 + }, + { + "epoch": 0.89, + "grad_norm": 1.6640080874880314, + "learning_rate": 1.8864830065376083e-06, + "loss": 0.6647, + "step": 5986 + }, + { + "epoch": 0.89, + "grad_norm": 9.816141769688084, + "learning_rate": 1.8864382960132278e-06, + "loss": 0.6901, + "step": 5987 + }, + { + "epoch": 0.89, + "grad_norm": 0.5171170591267734, + "learning_rate": 1.8863935772156401e-06, + "loss": 0.6699, + "step": 5988 + }, + { + "epoch": 0.89, + "grad_norm": 1.6090296088861038, + "learning_rate": 1.8863488501452626e-06, + "loss": 0.679, + "step": 5989 + }, + { + "epoch": 0.89, + "grad_norm": 1.4244379743871818, + "learning_rate": 1.8863041148025133e-06, + "loss": 0.6992, + "step": 5990 + }, + { + "epoch": 0.89, + "grad_norm": 9.099237869868617, + "learning_rate": 1.8862593711878087e-06, + "loss": 0.6816, + "step": 5991 + }, + { + "epoch": 0.89, + "grad_norm": 0.682223145391384, + "learning_rate": 1.8862146193015675e-06, + "loss": 0.7012, + "step": 5992 + }, + { + "epoch": 0.89, + "grad_norm": 0.4458408719530503, + "learning_rate": 1.8861698591442068e-06, + "loss": 0.6771, + "step": 5993 + }, + { + "epoch": 0.89, + "grad_norm": 7.129857328436434, + "learning_rate": 1.8861250907161443e-06, + "loss": 0.6523, + "step": 5994 + }, + { + "epoch": 0.89, + "grad_norm": 4.493944845580579, + "learning_rate": 1.886080314017798e-06, + "loss": 0.6556, + "step": 5995 + }, + { + "epoch": 0.89, + "grad_norm": 0.4859104721704812, + "learning_rate": 1.886035529049586e-06, + "loss": 0.6914, + "step": 5996 + }, + { + "epoch": 0.89, + "grad_norm": 0.5293741387621629, + "learning_rate": 1.8859907358119257e-06, + "loss": 0.6875, + "step": 5997 + }, + { + "epoch": 0.89, + "grad_norm": 2.350066854853013, + "learning_rate": 1.8859459343052357e-06, + "loss": 0.6973, + "step": 5998 + }, + { + "epoch": 0.89, + "grad_norm": 0.8626651927702126, + "learning_rate": 1.885901124529934e-06, + "loss": 0.696, + "step": 5999 + }, + { + "epoch": 0.89, + "grad_norm": 0.5733025126743571, + "learning_rate": 1.8858563064864389e-06, + "loss": 0.668, + "step": 6000 + }, + { + "epoch": 0.9, + "grad_norm": 4.464890044198618, + "learning_rate": 1.8858114801751683e-06, + "loss": 0.666, + "step": 6001 + }, + { + "epoch": 0.9, + "grad_norm": 1.1018674650440876, + "learning_rate": 1.8857666455965408e-06, + "loss": 0.6693, + "step": 6002 + }, + { + "epoch": 0.9, + "grad_norm": 4.0180103345636065, + "learning_rate": 1.8857218027509749e-06, + "loss": 0.7018, + "step": 6003 + }, + { + "epoch": 0.9, + "grad_norm": 5.024105126265531, + "learning_rate": 1.8856769516388891e-06, + "loss": 0.6667, + "step": 6004 + }, + { + "epoch": 0.9, + "grad_norm": 1.032977837141615, + "learning_rate": 1.8856320922607019e-06, + "loss": 0.668, + "step": 6005 + }, + { + "epoch": 0.9, + "grad_norm": 5.297719463975125, + "learning_rate": 1.8855872246168323e-06, + "loss": 0.6908, + "step": 6006 + }, + { + "epoch": 0.9, + "grad_norm": 3.735384013978389, + "learning_rate": 1.8855423487076987e-06, + "loss": 0.6836, + "step": 6007 + }, + { + "epoch": 0.9, + "grad_norm": 3.0994179840775584, + "learning_rate": 1.8854974645337198e-06, + "loss": 0.6745, + "step": 6008 + }, + { + "epoch": 0.9, + "grad_norm": 5.282317948435615, + "learning_rate": 1.885452572095315e-06, + "loss": 0.6868, + "step": 6009 + }, + { + "epoch": 0.9, + "grad_norm": 1.9676656816860187, + "learning_rate": 1.8854076713929032e-06, + "loss": 0.6621, + "step": 6010 + }, + { + "epoch": 0.9, + "grad_norm": 1.6008820889491513, + "learning_rate": 1.8853627624269031e-06, + "loss": 0.6777, + "step": 6011 + }, + { + "epoch": 0.9, + "grad_norm": 0.9562639656838388, + "learning_rate": 1.8853178451977341e-06, + "loss": 0.6686, + "step": 6012 + }, + { + "epoch": 0.9, + "grad_norm": 5.8237289057665285, + "learning_rate": 1.885272919705815e-06, + "loss": 0.6882, + "step": 6013 + }, + { + "epoch": 0.9, + "grad_norm": 3.213926761254298, + "learning_rate": 1.885227985951566e-06, + "loss": 0.6706, + "step": 6014 + }, + { + "epoch": 0.9, + "grad_norm": 2.483793956601741, + "learning_rate": 1.8851830439354056e-06, + "loss": 0.6725, + "step": 6015 + }, + { + "epoch": 0.9, + "grad_norm": 3.907963826185097, + "learning_rate": 1.8851380936577537e-06, + "loss": 0.6921, + "step": 6016 + }, + { + "epoch": 0.9, + "grad_norm": 4.691494089253121, + "learning_rate": 1.8850931351190297e-06, + "loss": 0.6921, + "step": 6017 + }, + { + "epoch": 0.9, + "grad_norm": 1.2814534323095372, + "learning_rate": 1.885048168319653e-06, + "loss": 0.6732, + "step": 6018 + }, + { + "epoch": 0.9, + "grad_norm": 1.8851839864016313, + "learning_rate": 1.8850031932600436e-06, + "loss": 0.7005, + "step": 6019 + }, + { + "epoch": 0.9, + "grad_norm": 4.886868335372431, + "learning_rate": 1.8849582099406215e-06, + "loss": 0.6751, + "step": 6020 + }, + { + "epoch": 0.9, + "grad_norm": 3.4141747015703596, + "learning_rate": 1.8849132183618055e-06, + "loss": 0.6882, + "step": 6021 + }, + { + "epoch": 0.9, + "grad_norm": 2.4247014842118753, + "learning_rate": 1.8848682185240166e-06, + "loss": 0.6875, + "step": 6022 + }, + { + "epoch": 0.9, + "grad_norm": 5.692255152552669, + "learning_rate": 1.8848232104276742e-06, + "loss": 0.6738, + "step": 6023 + }, + { + "epoch": 0.9, + "grad_norm": 5.825028458782002, + "learning_rate": 1.8847781940731985e-06, + "loss": 0.6764, + "step": 6024 + }, + { + "epoch": 0.9, + "grad_norm": 1.2113287202557455, + "learning_rate": 1.8847331694610098e-06, + "loss": 0.7083, + "step": 6025 + }, + { + "epoch": 0.9, + "grad_norm": 5.836678885344998, + "learning_rate": 1.884688136591528e-06, + "loss": 0.6875, + "step": 6026 + }, + { + "epoch": 0.9, + "grad_norm": 4.188091833741748, + "learning_rate": 1.884643095465174e-06, + "loss": 0.6855, + "step": 6027 + }, + { + "epoch": 0.9, + "grad_norm": 3.4188851253359345, + "learning_rate": 1.8845980460823674e-06, + "loss": 0.6882, + "step": 6028 + }, + { + "epoch": 0.9, + "grad_norm": 5.987620387229338, + "learning_rate": 1.884552988443529e-06, + "loss": 0.6738, + "step": 6029 + }, + { + "epoch": 0.9, + "grad_norm": 1.4630166285180066, + "learning_rate": 1.8845079225490794e-06, + "loss": 0.6803, + "step": 6030 + }, + { + "epoch": 0.9, + "grad_norm": 5.858384491786981, + "learning_rate": 1.8844628483994392e-06, + "loss": 0.6823, + "step": 6031 + }, + { + "epoch": 0.9, + "grad_norm": 1.9890080263092869, + "learning_rate": 1.8844177659950292e-06, + "loss": 0.6797, + "step": 6032 + }, + { + "epoch": 0.9, + "grad_norm": 2.299530144488247, + "learning_rate": 1.8843726753362693e-06, + "loss": 0.679, + "step": 6033 + }, + { + "epoch": 0.9, + "grad_norm": 2.3923171786144533, + "learning_rate": 1.8843275764235815e-06, + "loss": 0.6868, + "step": 6034 + }, + { + "epoch": 0.9, + "grad_norm": 2.106263975899353, + "learning_rate": 1.8842824692573864e-06, + "loss": 0.6693, + "step": 6035 + }, + { + "epoch": 0.9, + "grad_norm": 2.8993429348511293, + "learning_rate": 1.8842373538381048e-06, + "loss": 0.6836, + "step": 6036 + }, + { + "epoch": 0.9, + "grad_norm": 0.5801176283785634, + "learning_rate": 1.8841922301661572e-06, + "loss": 0.7038, + "step": 6037 + }, + { + "epoch": 0.9, + "grad_norm": 1.5682367403428263, + "learning_rate": 1.8841470982419658e-06, + "loss": 0.6823, + "step": 6038 + }, + { + "epoch": 0.9, + "grad_norm": 0.48531001128223716, + "learning_rate": 1.8841019580659513e-06, + "loss": 0.6849, + "step": 6039 + }, + { + "epoch": 0.9, + "grad_norm": 1.5797631729245536, + "learning_rate": 1.884056809638535e-06, + "loss": 0.6784, + "step": 6040 + }, + { + "epoch": 0.9, + "grad_norm": 5.175786152753635, + "learning_rate": 1.8840116529601383e-06, + "loss": 0.6855, + "step": 6041 + }, + { + "epoch": 0.9, + "grad_norm": 0.8698695707318717, + "learning_rate": 1.8839664880311825e-06, + "loss": 0.6751, + "step": 6042 + }, + { + "epoch": 0.9, + "grad_norm": 1.8673113832849126, + "learning_rate": 1.8839213148520896e-06, + "loss": 0.6732, + "step": 6043 + }, + { + "epoch": 0.9, + "grad_norm": 4.557522412613059, + "learning_rate": 1.8838761334232806e-06, + "loss": 0.6725, + "step": 6044 + }, + { + "epoch": 0.9, + "grad_norm": 2.0060818851257154, + "learning_rate": 1.8838309437451776e-06, + "loss": 0.6589, + "step": 6045 + }, + { + "epoch": 0.9, + "grad_norm": 2.5011743499824566, + "learning_rate": 1.8837857458182022e-06, + "loss": 0.694, + "step": 6046 + }, + { + "epoch": 0.9, + "grad_norm": 0.7398474972018584, + "learning_rate": 1.8837405396427765e-06, + "loss": 0.6908, + "step": 6047 + }, + { + "epoch": 0.9, + "grad_norm": 3.005524495762091, + "learning_rate": 1.883695325219322e-06, + "loss": 0.6934, + "step": 6048 + }, + { + "epoch": 0.9, + "grad_norm": 3.833593368916091, + "learning_rate": 1.8836501025482612e-06, + "loss": 0.6934, + "step": 6049 + }, + { + "epoch": 0.9, + "grad_norm": 5.373829072090312, + "learning_rate": 1.8836048716300153e-06, + "loss": 0.6686, + "step": 6050 + }, + { + "epoch": 0.9, + "grad_norm": 1.8542914568335578, + "learning_rate": 1.8835596324650073e-06, + "loss": 0.6855, + "step": 6051 + }, + { + "epoch": 0.9, + "grad_norm": 0.5754427868132655, + "learning_rate": 1.883514385053659e-06, + "loss": 0.6751, + "step": 6052 + }, + { + "epoch": 0.9, + "grad_norm": 2.2453027302569004, + "learning_rate": 1.883469129396393e-06, + "loss": 0.6745, + "step": 6053 + }, + { + "epoch": 0.9, + "grad_norm": 3.712817185198309, + "learning_rate": 1.8834238654936314e-06, + "loss": 0.6784, + "step": 6054 + }, + { + "epoch": 0.9, + "grad_norm": 6.136373523454478, + "learning_rate": 1.8833785933457965e-06, + "loss": 0.6771, + "step": 6055 + }, + { + "epoch": 0.9, + "grad_norm": 0.7011422092847055, + "learning_rate": 1.8833333129533114e-06, + "loss": 0.6725, + "step": 6056 + }, + { + "epoch": 0.9, + "grad_norm": 2.819230125584055, + "learning_rate": 1.8832880243165983e-06, + "loss": 0.6986, + "step": 6057 + }, + { + "epoch": 0.9, + "grad_norm": 1.5934520530716723, + "learning_rate": 1.8832427274360798e-06, + "loss": 0.7161, + "step": 6058 + }, + { + "epoch": 0.9, + "grad_norm": 7.249379468633216, + "learning_rate": 1.8831974223121789e-06, + "loss": 0.6816, + "step": 6059 + }, + { + "epoch": 0.9, + "grad_norm": 0.8795514849344822, + "learning_rate": 1.8831521089453184e-06, + "loss": 0.6823, + "step": 6060 + }, + { + "epoch": 0.9, + "grad_norm": 1.9291867566870287, + "learning_rate": 1.883106787335921e-06, + "loss": 0.6888, + "step": 6061 + }, + { + "epoch": 0.9, + "grad_norm": 1.3753892241503238, + "learning_rate": 1.88306145748441e-06, + "loss": 0.7018, + "step": 6062 + }, + { + "epoch": 0.9, + "grad_norm": 1.9561582806983127, + "learning_rate": 1.8830161193912085e-06, + "loss": 0.6875, + "step": 6063 + }, + { + "epoch": 0.9, + "grad_norm": 5.049211700488271, + "learning_rate": 1.8829707730567393e-06, + "loss": 0.6934, + "step": 6064 + }, + { + "epoch": 0.9, + "grad_norm": 2.301690386702933, + "learning_rate": 1.8829254184814257e-06, + "loss": 0.6934, + "step": 6065 + }, + { + "epoch": 0.9, + "grad_norm": 0.644302390746597, + "learning_rate": 1.8828800556656912e-06, + "loss": 0.6758, + "step": 6066 + }, + { + "epoch": 0.9, + "grad_norm": 3.4000054373051016, + "learning_rate": 1.8828346846099588e-06, + "loss": 0.6745, + "step": 6067 + }, + { + "epoch": 0.9, + "grad_norm": 2.625655059547706, + "learning_rate": 1.8827893053146525e-06, + "loss": 0.6745, + "step": 6068 + }, + { + "epoch": 0.91, + "grad_norm": 0.3812251696247182, + "learning_rate": 1.8827439177801954e-06, + "loss": 0.6882, + "step": 6069 + }, + { + "epoch": 0.91, + "grad_norm": 0.747622111789243, + "learning_rate": 1.8826985220070112e-06, + "loss": 0.6842, + "step": 6070 + }, + { + "epoch": 0.91, + "grad_norm": 2.6521795223549995, + "learning_rate": 1.8826531179955238e-06, + "loss": 0.6641, + "step": 6071 + }, + { + "epoch": 0.91, + "grad_norm": 1.5038471575312717, + "learning_rate": 1.8826077057461567e-06, + "loss": 0.6947, + "step": 6072 + }, + { + "epoch": 0.91, + "grad_norm": 1.732587895363916, + "learning_rate": 1.8825622852593338e-06, + "loss": 0.6914, + "step": 6073 + }, + { + "epoch": 0.91, + "grad_norm": 2.4037292313188967, + "learning_rate": 1.8825168565354791e-06, + "loss": 0.6849, + "step": 6074 + }, + { + "epoch": 0.91, + "grad_norm": 6.102885447773246, + "learning_rate": 1.8824714195750167e-06, + "loss": 0.6738, + "step": 6075 + }, + { + "epoch": 0.91, + "grad_norm": 2.215545158815517, + "learning_rate": 1.8824259743783704e-06, + "loss": 0.6901, + "step": 6076 + }, + { + "epoch": 0.91, + "grad_norm": 2.5760456386281643, + "learning_rate": 1.8823805209459645e-06, + "loss": 0.6777, + "step": 6077 + }, + { + "epoch": 0.91, + "grad_norm": 0.6581935610562029, + "learning_rate": 1.882335059278223e-06, + "loss": 0.681, + "step": 6078 + }, + { + "epoch": 0.91, + "grad_norm": 2.0802602734461058, + "learning_rate": 1.8822895893755704e-06, + "loss": 0.6764, + "step": 6079 + }, + { + "epoch": 0.91, + "grad_norm": 3.5928051125234637, + "learning_rate": 1.8822441112384309e-06, + "loss": 0.6712, + "step": 6080 + }, + { + "epoch": 0.91, + "grad_norm": 1.5896587694785094, + "learning_rate": 1.8821986248672292e-06, + "loss": 0.6953, + "step": 6081 + }, + { + "epoch": 0.91, + "grad_norm": 4.306732847694067, + "learning_rate": 1.8821531302623898e-06, + "loss": 0.6725, + "step": 6082 + }, + { + "epoch": 0.91, + "grad_norm": 2.0119654174088866, + "learning_rate": 1.8821076274243372e-06, + "loss": 0.6738, + "step": 6083 + }, + { + "epoch": 0.91, + "grad_norm": 7.439744356028769, + "learning_rate": 1.8820621163534962e-06, + "loss": 0.694, + "step": 6084 + }, + { + "epoch": 0.91, + "grad_norm": 2.4374615239910034, + "learning_rate": 1.8820165970502912e-06, + "loss": 0.7012, + "step": 6085 + }, + { + "epoch": 0.91, + "grad_norm": 1.6399016218744307, + "learning_rate": 1.8819710695151474e-06, + "loss": 0.6758, + "step": 6086 + }, + { + "epoch": 0.91, + "grad_norm": 0.5691375358758142, + "learning_rate": 1.8819255337484899e-06, + "loss": 0.6849, + "step": 6087 + }, + { + "epoch": 0.91, + "grad_norm": 0.5235024591318997, + "learning_rate": 1.8818799897507432e-06, + "loss": 0.6823, + "step": 6088 + }, + { + "epoch": 0.91, + "grad_norm": 2.4741234495134394, + "learning_rate": 1.8818344375223322e-06, + "loss": 0.6803, + "step": 6089 + }, + { + "epoch": 0.91, + "grad_norm": 3.573905421543656, + "learning_rate": 1.8817888770636828e-06, + "loss": 0.6751, + "step": 6090 + }, + { + "epoch": 0.91, + "grad_norm": 0.9823815099891096, + "learning_rate": 1.8817433083752198e-06, + "loss": 0.6934, + "step": 6091 + }, + { + "epoch": 0.91, + "grad_norm": 0.8530606579148706, + "learning_rate": 1.8816977314573686e-06, + "loss": 0.6732, + "step": 6092 + }, + { + "epoch": 0.91, + "grad_norm": 3.747748897878654, + "learning_rate": 1.8816521463105542e-06, + "loss": 0.6719, + "step": 6093 + }, + { + "epoch": 0.91, + "grad_norm": 7.036632743830533, + "learning_rate": 1.8816065529352025e-06, + "loss": 0.6823, + "step": 6094 + }, + { + "epoch": 0.91, + "grad_norm": 0.6975322767156973, + "learning_rate": 1.881560951331739e-06, + "loss": 0.6706, + "step": 6095 + }, + { + "epoch": 0.91, + "grad_norm": 4.968407483522623, + "learning_rate": 1.881515341500589e-06, + "loss": 0.681, + "step": 6096 + }, + { + "epoch": 0.91, + "grad_norm": 2.0612025723979013, + "learning_rate": 1.8814697234421786e-06, + "loss": 0.6576, + "step": 6097 + }, + { + "epoch": 0.91, + "grad_norm": 4.163307171707782, + "learning_rate": 1.8814240971569331e-06, + "loss": 0.681, + "step": 6098 + }, + { + "epoch": 0.91, + "grad_norm": 2.830706652031418, + "learning_rate": 1.8813784626452787e-06, + "loss": 0.6868, + "step": 6099 + }, + { + "epoch": 0.91, + "grad_norm": 2.8388815101065354, + "learning_rate": 1.8813328199076407e-06, + "loss": 0.6914, + "step": 6100 + }, + { + "epoch": 0.91, + "grad_norm": 0.7948955102673124, + "learning_rate": 1.8812871689444463e-06, + "loss": 0.6875, + "step": 6101 + }, + { + "epoch": 0.91, + "grad_norm": 3.1503160342313863, + "learning_rate": 1.8812415097561205e-06, + "loss": 0.6862, + "step": 6102 + }, + { + "epoch": 0.91, + "grad_norm": 1.7518234846080198, + "learning_rate": 1.8811958423430897e-06, + "loss": 0.7031, + "step": 6103 + }, + { + "epoch": 0.91, + "grad_norm": 2.9546062676564278, + "learning_rate": 1.88115016670578e-06, + "loss": 0.6888, + "step": 6104 + }, + { + "epoch": 0.91, + "grad_norm": 2.8461077758234157, + "learning_rate": 1.881104482844618e-06, + "loss": 0.681, + "step": 6105 + }, + { + "epoch": 0.91, + "grad_norm": 1.7311730123993399, + "learning_rate": 1.88105879076003e-06, + "loss": 0.6921, + "step": 6106 + }, + { + "epoch": 0.91, + "grad_norm": 2.3936392127326522, + "learning_rate": 1.8810130904524426e-06, + "loss": 0.6777, + "step": 6107 + }, + { + "epoch": 0.91, + "grad_norm": 3.2089286827172594, + "learning_rate": 1.8809673819222817e-06, + "loss": 0.6725, + "step": 6108 + }, + { + "epoch": 0.91, + "grad_norm": 2.239619932410184, + "learning_rate": 1.8809216651699745e-06, + "loss": 0.6764, + "step": 6109 + }, + { + "epoch": 0.91, + "grad_norm": 0.5862825706348201, + "learning_rate": 1.8808759401959476e-06, + "loss": 0.6816, + "step": 6110 + }, + { + "epoch": 0.91, + "grad_norm": 4.721777406138191, + "learning_rate": 1.8808302070006275e-06, + "loss": 0.653, + "step": 6111 + }, + { + "epoch": 0.91, + "grad_norm": 2.865296966448042, + "learning_rate": 1.8807844655844414e-06, + "loss": 0.6849, + "step": 6112 + }, + { + "epoch": 0.91, + "grad_norm": 3.4061529564410282, + "learning_rate": 1.8807387159478157e-06, + "loss": 0.6712, + "step": 6113 + }, + { + "epoch": 0.91, + "grad_norm": 3.5545552404940826, + "learning_rate": 1.8806929580911777e-06, + "loss": 0.6784, + "step": 6114 + }, + { + "epoch": 0.91, + "grad_norm": 2.125158433099163, + "learning_rate": 1.8806471920149547e-06, + "loss": 0.6523, + "step": 6115 + }, + { + "epoch": 0.91, + "grad_norm": 1.3051107475380646, + "learning_rate": 1.8806014177195733e-06, + "loss": 0.6784, + "step": 6116 + }, + { + "epoch": 0.91, + "grad_norm": 5.886778035529972, + "learning_rate": 1.8805556352054613e-06, + "loss": 0.6608, + "step": 6117 + }, + { + "epoch": 0.91, + "grad_norm": 3.071038101137393, + "learning_rate": 1.8805098444730452e-06, + "loss": 0.6576, + "step": 6118 + }, + { + "epoch": 0.91, + "grad_norm": 4.091279952869818, + "learning_rate": 1.8804640455227531e-06, + "loss": 0.6829, + "step": 6119 + }, + { + "epoch": 0.91, + "grad_norm": 0.7177715068237671, + "learning_rate": 1.8804182383550122e-06, + "loss": 0.6842, + "step": 6120 + }, + { + "epoch": 0.91, + "grad_norm": 0.9655902614752728, + "learning_rate": 1.8803724229702501e-06, + "loss": 0.6452, + "step": 6121 + }, + { + "epoch": 0.91, + "grad_norm": 3.0890980844881497, + "learning_rate": 1.8803265993688943e-06, + "loss": 0.6842, + "step": 6122 + }, + { + "epoch": 0.91, + "grad_norm": 0.6961885699445306, + "learning_rate": 1.8802807675513725e-06, + "loss": 0.6797, + "step": 6123 + }, + { + "epoch": 0.91, + "grad_norm": 2.8874562142894775, + "learning_rate": 1.8802349275181125e-06, + "loss": 0.6934, + "step": 6124 + }, + { + "epoch": 0.91, + "grad_norm": 2.515963800772158, + "learning_rate": 1.880189079269542e-06, + "loss": 0.6738, + "step": 6125 + }, + { + "epoch": 0.91, + "grad_norm": 1.368333420988293, + "learning_rate": 1.880143222806089e-06, + "loss": 0.709, + "step": 6126 + }, + { + "epoch": 0.91, + "grad_norm": 4.182270738279412, + "learning_rate": 1.8800973581281814e-06, + "loss": 0.6836, + "step": 6127 + }, + { + "epoch": 0.91, + "grad_norm": 7.637709790225781, + "learning_rate": 1.8800514852362475e-06, + "loss": 0.6895, + "step": 6128 + }, + { + "epoch": 0.91, + "grad_norm": 0.5813215020100773, + "learning_rate": 1.8800056041307148e-06, + "loss": 0.6992, + "step": 6129 + }, + { + "epoch": 0.91, + "grad_norm": 1.2914265030524057, + "learning_rate": 1.8799597148120126e-06, + "loss": 0.6751, + "step": 6130 + }, + { + "epoch": 0.91, + "grad_norm": 0.7370414241684106, + "learning_rate": 1.8799138172805683e-06, + "loss": 0.6816, + "step": 6131 + }, + { + "epoch": 0.91, + "grad_norm": 0.831891766932348, + "learning_rate": 1.8798679115368103e-06, + "loss": 0.6595, + "step": 6132 + }, + { + "epoch": 0.91, + "grad_norm": 2.875448559305072, + "learning_rate": 1.8798219975811676e-06, + "loss": 0.6777, + "step": 6133 + }, + { + "epoch": 0.91, + "grad_norm": 3.344723877938696, + "learning_rate": 1.8797760754140684e-06, + "loss": 0.6947, + "step": 6134 + }, + { + "epoch": 0.91, + "grad_norm": 1.7203431243155316, + "learning_rate": 1.8797301450359408e-06, + "loss": 0.6699, + "step": 6135 + }, + { + "epoch": 0.92, + "grad_norm": 6.511045571179299, + "learning_rate": 1.8796842064472144e-06, + "loss": 0.6829, + "step": 6136 + }, + { + "epoch": 0.92, + "grad_norm": 4.906644915568238, + "learning_rate": 1.8796382596483173e-06, + "loss": 0.6751, + "step": 6137 + }, + { + "epoch": 0.92, + "grad_norm": 3.1582376234805443, + "learning_rate": 1.8795923046396789e-06, + "loss": 0.6947, + "step": 6138 + }, + { + "epoch": 0.92, + "grad_norm": 3.826278594034733, + "learning_rate": 1.8795463414217272e-06, + "loss": 0.6797, + "step": 6139 + }, + { + "epoch": 0.92, + "grad_norm": 2.5220173386385696, + "learning_rate": 1.8795003699948922e-06, + "loss": 0.6725, + "step": 6140 + }, + { + "epoch": 0.92, + "grad_norm": 6.174973510640079, + "learning_rate": 1.8794543903596022e-06, + "loss": 0.7057, + "step": 6141 + }, + { + "epoch": 0.92, + "grad_norm": 3.311719883047002, + "learning_rate": 1.8794084025162866e-06, + "loss": 0.6803, + "step": 6142 + }, + { + "epoch": 0.92, + "grad_norm": 0.5769429967538163, + "learning_rate": 1.8793624064653747e-06, + "loss": 0.6927, + "step": 6143 + }, + { + "epoch": 0.92, + "grad_norm": 1.4678049200116807, + "learning_rate": 1.8793164022072956e-06, + "loss": 0.6758, + "step": 6144 + }, + { + "epoch": 0.92, + "grad_norm": 0.7472165687822417, + "learning_rate": 1.8792703897424788e-06, + "loss": 0.6771, + "step": 6145 + }, + { + "epoch": 0.92, + "grad_norm": 1.230286841998463, + "learning_rate": 1.8792243690713536e-06, + "loss": 0.6862, + "step": 6146 + }, + { + "epoch": 0.92, + "grad_norm": 0.7084804649065791, + "learning_rate": 1.87917834019435e-06, + "loss": 0.6855, + "step": 6147 + }, + { + "epoch": 0.92, + "grad_norm": 7.263563409104156, + "learning_rate": 1.8791323031118966e-06, + "loss": 0.6979, + "step": 6148 + }, + { + "epoch": 0.92, + "grad_norm": 1.7830303290470717, + "learning_rate": 1.8790862578244239e-06, + "loss": 0.696, + "step": 6149 + }, + { + "epoch": 0.92, + "grad_norm": 4.784585536310086, + "learning_rate": 1.8790402043323618e-06, + "loss": 0.6862, + "step": 6150 + }, + { + "epoch": 0.92, + "grad_norm": 3.5987106435521583, + "learning_rate": 1.8789941426361394e-06, + "loss": 0.6764, + "step": 6151 + }, + { + "epoch": 0.92, + "grad_norm": 2.2000544226692056, + "learning_rate": 1.878948072736187e-06, + "loss": 0.6999, + "step": 6152 + }, + { + "epoch": 0.92, + "grad_norm": 3.005173350914075, + "learning_rate": 1.8789019946329345e-06, + "loss": 0.6816, + "step": 6153 + }, + { + "epoch": 0.92, + "grad_norm": 0.7949680110606188, + "learning_rate": 1.8788559083268118e-06, + "loss": 0.681, + "step": 6154 + }, + { + "epoch": 0.92, + "grad_norm": 1.0443842233319707, + "learning_rate": 1.8788098138182493e-06, + "loss": 0.6823, + "step": 6155 + }, + { + "epoch": 0.92, + "grad_norm": 0.7691718212915561, + "learning_rate": 1.878763711107677e-06, + "loss": 0.7038, + "step": 6156 + }, + { + "epoch": 0.92, + "grad_norm": 0.8000935614452913, + "learning_rate": 1.8787176001955257e-06, + "loss": 0.6868, + "step": 6157 + }, + { + "epoch": 0.92, + "grad_norm": 3.7336689690708518, + "learning_rate": 1.8786714810822248e-06, + "loss": 0.6953, + "step": 6158 + }, + { + "epoch": 0.92, + "grad_norm": 4.442379505638934, + "learning_rate": 1.8786253537682054e-06, + "loss": 0.6745, + "step": 6159 + }, + { + "epoch": 0.92, + "grad_norm": 6.927132345459361, + "learning_rate": 1.878579218253898e-06, + "loss": 0.6875, + "step": 6160 + }, + { + "epoch": 0.92, + "grad_norm": 2.6588504172178866, + "learning_rate": 1.878533074539733e-06, + "loss": 0.7109, + "step": 6161 + }, + { + "epoch": 0.92, + "grad_norm": 2.1138596000204593, + "learning_rate": 1.878486922626141e-06, + "loss": 0.6615, + "step": 6162 + }, + { + "epoch": 0.92, + "grad_norm": 2.887555794877082, + "learning_rate": 1.878440762513553e-06, + "loss": 0.6849, + "step": 6163 + }, + { + "epoch": 0.92, + "grad_norm": 4.33471874772784, + "learning_rate": 1.8783945942023998e-06, + "loss": 0.6849, + "step": 6164 + }, + { + "epoch": 0.92, + "grad_norm": 1.0915350581439025, + "learning_rate": 1.8783484176931119e-06, + "loss": 0.6719, + "step": 6165 + }, + { + "epoch": 0.92, + "grad_norm": 4.415383543217328, + "learning_rate": 1.8783022329861208e-06, + "loss": 0.679, + "step": 6166 + }, + { + "epoch": 0.92, + "grad_norm": 1.007995383306124, + "learning_rate": 1.878256040081857e-06, + "loss": 0.6745, + "step": 6167 + }, + { + "epoch": 0.92, + "grad_norm": 2.9124652323397013, + "learning_rate": 1.8782098389807521e-06, + "loss": 0.6882, + "step": 6168 + }, + { + "epoch": 0.92, + "grad_norm": 1.1793158942752007, + "learning_rate": 1.878163629683237e-06, + "loss": 0.6855, + "step": 6169 + }, + { + "epoch": 0.92, + "grad_norm": 1.1752960552591762, + "learning_rate": 1.8781174121897431e-06, + "loss": 0.6816, + "step": 6170 + }, + { + "epoch": 0.92, + "grad_norm": 0.42368932577792084, + "learning_rate": 1.8780711865007017e-06, + "loss": 0.6979, + "step": 6171 + }, + { + "epoch": 0.92, + "grad_norm": 1.2523429348503428, + "learning_rate": 1.8780249526165446e-06, + "loss": 0.6816, + "step": 6172 + }, + { + "epoch": 0.92, + "grad_norm": 1.3966047113780293, + "learning_rate": 1.8779787105377026e-06, + "loss": 0.6979, + "step": 6173 + }, + { + "epoch": 0.92, + "grad_norm": 4.7908563339441566, + "learning_rate": 1.8779324602646079e-06, + "loss": 0.6764, + "step": 6174 + }, + { + "epoch": 0.92, + "grad_norm": 0.6067756891932039, + "learning_rate": 1.8778862017976917e-06, + "loss": 0.6927, + "step": 6175 + }, + { + "epoch": 0.92, + "grad_norm": 3.764104854925303, + "learning_rate": 1.877839935137386e-06, + "loss": 0.6797, + "step": 6176 + }, + { + "epoch": 0.92, + "grad_norm": 0.5998041885855347, + "learning_rate": 1.8777936602841225e-06, + "loss": 0.6738, + "step": 6177 + }, + { + "epoch": 0.92, + "grad_norm": 4.055760726803181, + "learning_rate": 1.877747377238333e-06, + "loss": 0.694, + "step": 6178 + }, + { + "epoch": 0.92, + "grad_norm": 1.2231527383597511, + "learning_rate": 1.8777010860004502e-06, + "loss": 0.6719, + "step": 6179 + }, + { + "epoch": 0.92, + "grad_norm": 0.9556769392410246, + "learning_rate": 1.877654786570905e-06, + "loss": 0.6823, + "step": 6180 + }, + { + "epoch": 0.92, + "grad_norm": 7.3567632170013075, + "learning_rate": 1.8776084789501303e-06, + "loss": 0.6654, + "step": 6181 + }, + { + "epoch": 0.92, + "grad_norm": 6.89278727280193, + "learning_rate": 1.877562163138558e-06, + "loss": 0.6764, + "step": 6182 + }, + { + "epoch": 0.92, + "grad_norm": 8.726852024726883, + "learning_rate": 1.8775158391366205e-06, + "loss": 0.7025, + "step": 6183 + }, + { + "epoch": 0.92, + "grad_norm": 1.2911447818926565, + "learning_rate": 1.8774695069447498e-06, + "loss": 0.7025, + "step": 6184 + }, + { + "epoch": 0.92, + "grad_norm": 5.0384862800241805, + "learning_rate": 1.8774231665633787e-06, + "loss": 0.6699, + "step": 6185 + }, + { + "epoch": 0.92, + "grad_norm": 0.772990816042378, + "learning_rate": 1.8773768179929398e-06, + "loss": 0.6771, + "step": 6186 + }, + { + "epoch": 0.92, + "grad_norm": 0.6252504806886091, + "learning_rate": 1.8773304612338654e-06, + "loss": 0.6641, + "step": 6187 + }, + { + "epoch": 0.92, + "grad_norm": 4.66257344769312, + "learning_rate": 1.877284096286588e-06, + "loss": 0.6803, + "step": 6188 + }, + { + "epoch": 0.92, + "grad_norm": 2.680656282957612, + "learning_rate": 1.877237723151541e-06, + "loss": 0.6895, + "step": 6189 + }, + { + "epoch": 0.92, + "grad_norm": 1.762970558650049, + "learning_rate": 1.8771913418291561e-06, + "loss": 0.6875, + "step": 6190 + }, + { + "epoch": 0.92, + "grad_norm": 0.48959903768348756, + "learning_rate": 1.8771449523198672e-06, + "loss": 0.6641, + "step": 6191 + }, + { + "epoch": 0.92, + "grad_norm": 5.992626117754683, + "learning_rate": 1.8770985546241066e-06, + "loss": 0.6882, + "step": 6192 + }, + { + "epoch": 0.92, + "grad_norm": 4.638161507322233, + "learning_rate": 1.8770521487423082e-06, + "loss": 0.709, + "step": 6193 + }, + { + "epoch": 0.92, + "grad_norm": 2.920234515492739, + "learning_rate": 1.877005734674904e-06, + "loss": 0.6719, + "step": 6194 + }, + { + "epoch": 0.92, + "grad_norm": 2.4117581499734904, + "learning_rate": 1.8769593124223283e-06, + "loss": 0.6901, + "step": 6195 + }, + { + "epoch": 0.92, + "grad_norm": 3.170472830161039, + "learning_rate": 1.8769128819850134e-06, + "loss": 0.6803, + "step": 6196 + }, + { + "epoch": 0.92, + "grad_norm": 0.43099257150497483, + "learning_rate": 1.8768664433633927e-06, + "loss": 0.6836, + "step": 6197 + }, + { + "epoch": 0.92, + "grad_norm": 3.1015737566736146, + "learning_rate": 1.8768199965579005e-06, + "loss": 0.6686, + "step": 6198 + }, + { + "epoch": 0.92, + "grad_norm": 0.47372490111568766, + "learning_rate": 1.8767735415689694e-06, + "loss": 0.6823, + "step": 6199 + }, + { + "epoch": 0.92, + "grad_norm": 3.213520214559056, + "learning_rate": 1.8767270783970335e-06, + "loss": 0.7044, + "step": 6200 + }, + { + "epoch": 0.92, + "grad_norm": 3.911652983107594, + "learning_rate": 1.8766806070425262e-06, + "loss": 0.6927, + "step": 6201 + }, + { + "epoch": 0.92, + "grad_norm": 4.07928120727656, + "learning_rate": 1.876634127505881e-06, + "loss": 0.6777, + "step": 6202 + }, + { + "epoch": 0.93, + "grad_norm": 2.4741495352108123, + "learning_rate": 1.8765876397875322e-06, + "loss": 0.6895, + "step": 6203 + }, + { + "epoch": 0.93, + "grad_norm": 8.732014873990217, + "learning_rate": 1.8765411438879135e-06, + "loss": 0.6992, + "step": 6204 + }, + { + "epoch": 0.93, + "grad_norm": 3.8560390742360773, + "learning_rate": 1.8764946398074586e-06, + "loss": 0.7044, + "step": 6205 + }, + { + "epoch": 0.93, + "grad_norm": 5.399775164065908, + "learning_rate": 1.876448127546602e-06, + "loss": 0.681, + "step": 6206 + }, + { + "epoch": 0.93, + "grad_norm": 0.7469605188153049, + "learning_rate": 1.8764016071057772e-06, + "loss": 0.7083, + "step": 6207 + }, + { + "epoch": 0.93, + "grad_norm": 3.2750115664609463, + "learning_rate": 1.8763550784854187e-06, + "loss": 0.6549, + "step": 6208 + }, + { + "epoch": 0.93, + "grad_norm": 3.0786187351003744, + "learning_rate": 1.876308541685961e-06, + "loss": 0.6882, + "step": 6209 + }, + { + "epoch": 0.93, + "grad_norm": 4.1234799373092645, + "learning_rate": 1.8762619967078378e-06, + "loss": 0.7057, + "step": 6210 + }, + { + "epoch": 0.93, + "grad_norm": 7.089771538981556, + "learning_rate": 1.8762154435514845e-06, + "loss": 0.6986, + "step": 6211 + }, + { + "epoch": 0.93, + "grad_norm": 1.0122832079675768, + "learning_rate": 1.8761688822173346e-06, + "loss": 0.6862, + "step": 6212 + }, + { + "epoch": 0.93, + "grad_norm": 4.891328979444926, + "learning_rate": 1.876122312705823e-06, + "loss": 0.7018, + "step": 6213 + }, + { + "epoch": 0.93, + "grad_norm": 1.7117377800899076, + "learning_rate": 1.8760757350173844e-06, + "loss": 0.7038, + "step": 6214 + }, + { + "epoch": 0.93, + "grad_norm": 3.916992244684363, + "learning_rate": 1.8760291491524533e-06, + "loss": 0.6934, + "step": 6215 + }, + { + "epoch": 0.93, + "grad_norm": 3.789389599519961, + "learning_rate": 1.8759825551114651e-06, + "loss": 0.6725, + "step": 6216 + }, + { + "epoch": 0.93, + "grad_norm": 2.1242620647151282, + "learning_rate": 1.8759359528948542e-06, + "loss": 0.6829, + "step": 6217 + }, + { + "epoch": 0.93, + "grad_norm": 5.511137271169675, + "learning_rate": 1.8758893425030554e-06, + "loss": 0.679, + "step": 6218 + }, + { + "epoch": 0.93, + "grad_norm": 5.423259460439939, + "learning_rate": 1.8758427239365037e-06, + "loss": 0.6947, + "step": 6219 + }, + { + "epoch": 0.93, + "grad_norm": 0.8327346823704251, + "learning_rate": 1.8757960971956348e-06, + "loss": 0.6862, + "step": 6220 + }, + { + "epoch": 0.93, + "grad_norm": 2.1178465778429723, + "learning_rate": 1.8757494622808833e-06, + "loss": 0.7012, + "step": 6221 + }, + { + "epoch": 0.93, + "grad_norm": 2.300442613107654, + "learning_rate": 1.8757028191926846e-06, + "loss": 0.6829, + "step": 6222 + }, + { + "epoch": 0.93, + "grad_norm": 6.569528078995323, + "learning_rate": 1.8756561679314741e-06, + "loss": 0.6712, + "step": 6223 + }, + { + "epoch": 0.93, + "grad_norm": 0.9803021800471802, + "learning_rate": 1.8756095084976872e-06, + "loss": 0.6738, + "step": 6224 + }, + { + "epoch": 0.93, + "grad_norm": 4.773072760404691, + "learning_rate": 1.8755628408917592e-06, + "loss": 0.681, + "step": 6225 + }, + { + "epoch": 0.93, + "grad_norm": 4.678198647734124, + "learning_rate": 1.875516165114126e-06, + "loss": 0.6882, + "step": 6226 + }, + { + "epoch": 0.93, + "grad_norm": 3.0541082316048915, + "learning_rate": 1.8754694811652227e-06, + "loss": 0.694, + "step": 6227 + }, + { + "epoch": 0.93, + "grad_norm": 3.6174128407044464, + "learning_rate": 1.8754227890454855e-06, + "loss": 0.6803, + "step": 6228 + }, + { + "epoch": 0.93, + "grad_norm": 5.099745211660662, + "learning_rate": 1.8753760887553498e-06, + "loss": 0.7038, + "step": 6229 + }, + { + "epoch": 0.93, + "grad_norm": 8.596312507813074, + "learning_rate": 1.8753293802952519e-06, + "loss": 0.6973, + "step": 6230 + }, + { + "epoch": 0.93, + "grad_norm": 6.709070635868949, + "learning_rate": 1.8752826636656275e-06, + "loss": 0.6888, + "step": 6231 + }, + { + "epoch": 0.93, + "grad_norm": 1.3548520113515385, + "learning_rate": 1.8752359388669124e-06, + "loss": 0.6829, + "step": 6232 + }, + { + "epoch": 0.93, + "grad_norm": 2.122192036229687, + "learning_rate": 1.8751892058995428e-06, + "loss": 0.6823, + "step": 6233 + }, + { + "epoch": 0.93, + "grad_norm": 8.725494726286474, + "learning_rate": 1.875142464763955e-06, + "loss": 0.6895, + "step": 6234 + }, + { + "epoch": 0.93, + "grad_norm": 3.74234457272886, + "learning_rate": 1.8750957154605853e-06, + "loss": 0.6986, + "step": 6235 + }, + { + "epoch": 0.93, + "grad_norm": 3.665128370446475, + "learning_rate": 1.87504895798987e-06, + "loss": 0.679, + "step": 6236 + }, + { + "epoch": 0.93, + "grad_norm": 0.8172416450028654, + "learning_rate": 1.8750021923522454e-06, + "loss": 0.6823, + "step": 6237 + }, + { + "epoch": 0.93, + "grad_norm": 1.770881007929116, + "learning_rate": 1.8749554185481477e-06, + "loss": 0.668, + "step": 6238 + }, + { + "epoch": 0.93, + "grad_norm": 1.1257195244116043, + "learning_rate": 1.8749086365780138e-06, + "loss": 0.668, + "step": 6239 + }, + { + "epoch": 0.93, + "grad_norm": 0.6992854328853659, + "learning_rate": 1.8748618464422803e-06, + "loss": 0.6764, + "step": 6240 + }, + { + "epoch": 0.93, + "grad_norm": 5.220862317281296, + "learning_rate": 1.8748150481413835e-06, + "loss": 0.7025, + "step": 6241 + }, + { + "epoch": 0.93, + "grad_norm": 1.0888690261172767, + "learning_rate": 1.8747682416757611e-06, + "loss": 0.6654, + "step": 6242 + }, + { + "epoch": 0.93, + "grad_norm": 0.7128149675491544, + "learning_rate": 1.874721427045849e-06, + "loss": 0.6764, + "step": 6243 + }, + { + "epoch": 0.93, + "grad_norm": 2.76593434843563, + "learning_rate": 1.8746746042520844e-06, + "loss": 0.6693, + "step": 6244 + }, + { + "epoch": 0.93, + "grad_norm": 0.8636407149233667, + "learning_rate": 1.8746277732949043e-06, + "loss": 0.6992, + "step": 6245 + }, + { + "epoch": 0.93, + "grad_norm": 8.730573719157167, + "learning_rate": 1.874580934174746e-06, + "loss": 0.6803, + "step": 6246 + }, + { + "epoch": 0.93, + "grad_norm": 3.5759570786694206, + "learning_rate": 1.8745340868920466e-06, + "loss": 0.6927, + "step": 6247 + }, + { + "epoch": 0.93, + "grad_norm": 9.042437754233315, + "learning_rate": 1.8744872314472429e-06, + "loss": 0.7109, + "step": 6248 + }, + { + "epoch": 0.93, + "grad_norm": 1.9542384002794655, + "learning_rate": 1.8744403678407727e-06, + "loss": 0.666, + "step": 6249 + }, + { + "epoch": 0.93, + "grad_norm": 1.2615086104423334, + "learning_rate": 1.8743934960730732e-06, + "loss": 0.6953, + "step": 6250 + }, + { + "epoch": 0.93, + "grad_norm": 4.931363069421727, + "learning_rate": 1.874346616144582e-06, + "loss": 0.6868, + "step": 6251 + }, + { + "epoch": 0.93, + "grad_norm": 2.19963196113797, + "learning_rate": 1.8742997280557365e-06, + "loss": 0.6647, + "step": 6252 + }, + { + "epoch": 0.93, + "grad_norm": 3.0907548105702705, + "learning_rate": 1.874252831806974e-06, + "loss": 0.6823, + "step": 6253 + }, + { + "epoch": 0.93, + "grad_norm": 0.6148818686446623, + "learning_rate": 1.874205927398733e-06, + "loss": 0.668, + "step": 6254 + }, + { + "epoch": 0.93, + "grad_norm": 1.0457647683471298, + "learning_rate": 1.8741590148314503e-06, + "loss": 0.6667, + "step": 6255 + }, + { + "epoch": 0.93, + "grad_norm": 3.996532194238737, + "learning_rate": 1.8741120941055647e-06, + "loss": 0.6654, + "step": 6256 + }, + { + "epoch": 0.93, + "grad_norm": 1.2424508840007804, + "learning_rate": 1.8740651652215132e-06, + "loss": 0.6582, + "step": 6257 + }, + { + "epoch": 0.93, + "grad_norm": 1.3859099977996234, + "learning_rate": 1.8740182281797345e-06, + "loss": 0.6842, + "step": 6258 + }, + { + "epoch": 0.93, + "grad_norm": 5.220474949698748, + "learning_rate": 1.8739712829806661e-06, + "loss": 0.6966, + "step": 6259 + }, + { + "epoch": 0.93, + "grad_norm": 1.4660018772786287, + "learning_rate": 1.8739243296247467e-06, + "loss": 0.7122, + "step": 6260 + }, + { + "epoch": 0.93, + "grad_norm": 4.057186452608995, + "learning_rate": 1.873877368112414e-06, + "loss": 0.6719, + "step": 6261 + }, + { + "epoch": 0.93, + "grad_norm": 0.7197859060769284, + "learning_rate": 1.8738303984441067e-06, + "loss": 0.7064, + "step": 6262 + }, + { + "epoch": 0.93, + "grad_norm": 3.1761484566829647, + "learning_rate": 1.873783420620263e-06, + "loss": 0.6634, + "step": 6263 + }, + { + "epoch": 0.93, + "grad_norm": 2.6610769664748393, + "learning_rate": 1.8737364346413212e-06, + "loss": 0.6979, + "step": 6264 + }, + { + "epoch": 0.93, + "grad_norm": 2.369118725330069, + "learning_rate": 1.8736894405077202e-06, + "loss": 0.6751, + "step": 6265 + }, + { + "epoch": 0.93, + "grad_norm": 8.218109587547842, + "learning_rate": 1.8736424382198984e-06, + "loss": 0.6699, + "step": 6266 + }, + { + "epoch": 0.93, + "grad_norm": 0.9074205967013995, + "learning_rate": 1.8735954277782943e-06, + "loss": 0.6934, + "step": 6267 + }, + { + "epoch": 0.93, + "grad_norm": 0.6348583982404096, + "learning_rate": 1.8735484091833467e-06, + "loss": 0.6732, + "step": 6268 + }, + { + "epoch": 0.93, + "grad_norm": 2.6078110744219813, + "learning_rate": 1.8735013824354949e-06, + "loss": 0.6758, + "step": 6269 + }, + { + "epoch": 0.94, + "grad_norm": 3.380131220304053, + "learning_rate": 1.8734543475351773e-06, + "loss": 0.6855, + "step": 6270 + }, + { + "epoch": 0.94, + "grad_norm": 6.494520971262866, + "learning_rate": 1.8734073044828329e-06, + "loss": 0.7077, + "step": 6271 + }, + { + "epoch": 0.94, + "grad_norm": 3.839757802408945, + "learning_rate": 1.8733602532789011e-06, + "loss": 0.6816, + "step": 6272 + }, + { + "epoch": 0.94, + "grad_norm": 7.625046060090498, + "learning_rate": 1.8733131939238205e-06, + "loss": 0.694, + "step": 6273 + }, + { + "epoch": 0.94, + "grad_norm": 4.280144576894384, + "learning_rate": 1.8732661264180312e-06, + "loss": 0.6823, + "step": 6274 + }, + { + "epoch": 0.94, + "grad_norm": 0.8676595532734265, + "learning_rate": 1.8732190507619713e-06, + "loss": 0.6895, + "step": 6275 + }, + { + "epoch": 0.94, + "grad_norm": 0.5719908606782107, + "learning_rate": 1.873171966956081e-06, + "loss": 0.6667, + "step": 6276 + }, + { + "epoch": 0.94, + "grad_norm": 0.850119741079184, + "learning_rate": 1.8731248750007994e-06, + "loss": 0.7109, + "step": 6277 + }, + { + "epoch": 0.94, + "grad_norm": 0.8129121605873284, + "learning_rate": 1.8730777748965665e-06, + "loss": 0.6849, + "step": 6278 + }, + { + "epoch": 0.94, + "grad_norm": 0.7742996290679682, + "learning_rate": 1.8730306666438214e-06, + "loss": 0.6771, + "step": 6279 + }, + { + "epoch": 0.94, + "grad_norm": 9.238361725159281, + "learning_rate": 1.8729835502430035e-06, + "loss": 0.6823, + "step": 6280 + }, + { + "epoch": 0.94, + "grad_norm": 2.4325220317156653, + "learning_rate": 1.8729364256945535e-06, + "loss": 0.6842, + "step": 6281 + }, + { + "epoch": 0.94, + "grad_norm": 1.8268717321215315, + "learning_rate": 1.8728892929989102e-06, + "loss": 0.6888, + "step": 6282 + }, + { + "epoch": 0.94, + "grad_norm": 1.2387164580027976, + "learning_rate": 1.872842152156514e-06, + "loss": 0.6927, + "step": 6283 + }, + { + "epoch": 0.94, + "grad_norm": 0.4272862608445212, + "learning_rate": 1.8727950031678051e-06, + "loss": 0.6771, + "step": 6284 + }, + { + "epoch": 0.94, + "grad_norm": 3.39555614108365, + "learning_rate": 1.872747846033223e-06, + "loss": 0.6934, + "step": 6285 + }, + { + "epoch": 0.94, + "grad_norm": 1.439729036997235, + "learning_rate": 1.872700680753208e-06, + "loss": 0.6868, + "step": 6286 + }, + { + "epoch": 0.94, + "grad_norm": 0.8308285238606361, + "learning_rate": 1.8726535073282004e-06, + "loss": 0.6569, + "step": 6287 + }, + { + "epoch": 0.94, + "grad_norm": 5.317387553130564, + "learning_rate": 1.8726063257586407e-06, + "loss": 0.6882, + "step": 6288 + }, + { + "epoch": 0.94, + "grad_norm": 2.2035453560848635, + "learning_rate": 1.8725591360449686e-06, + "loss": 0.6706, + "step": 6289 + }, + { + "epoch": 0.94, + "grad_norm": 4.210964923766792, + "learning_rate": 1.8725119381876252e-06, + "loss": 0.6921, + "step": 6290 + }, + { + "epoch": 0.94, + "grad_norm": 0.5766066455151826, + "learning_rate": 1.8724647321870505e-06, + "loss": 0.6823, + "step": 6291 + }, + { + "epoch": 0.94, + "grad_norm": 3.023601252861584, + "learning_rate": 1.8724175180436856e-06, + "loss": 0.6966, + "step": 6292 + }, + { + "epoch": 0.94, + "grad_norm": 1.5920727368910712, + "learning_rate": 1.8723702957579706e-06, + "loss": 0.6764, + "step": 6293 + }, + { + "epoch": 0.94, + "grad_norm": 5.447167117391492, + "learning_rate": 1.8723230653303466e-06, + "loss": 0.6654, + "step": 6294 + }, + { + "epoch": 0.94, + "grad_norm": 0.6729492348200283, + "learning_rate": 1.8722758267612542e-06, + "loss": 0.6751, + "step": 6295 + }, + { + "epoch": 0.94, + "grad_norm": 1.0004912980126912, + "learning_rate": 1.8722285800511343e-06, + "loss": 0.6934, + "step": 6296 + }, + { + "epoch": 0.94, + "grad_norm": 1.5936413208780447, + "learning_rate": 1.872181325200428e-06, + "loss": 0.6777, + "step": 6297 + }, + { + "epoch": 0.94, + "grad_norm": 4.070956954593622, + "learning_rate": 1.8721340622095764e-06, + "loss": 0.6823, + "step": 6298 + }, + { + "epoch": 0.94, + "grad_norm": 0.7523567726323714, + "learning_rate": 1.8720867910790204e-06, + "loss": 0.6719, + "step": 6299 + }, + { + "epoch": 0.94, + "grad_norm": 4.207954318165495, + "learning_rate": 1.8720395118092012e-06, + "loss": 0.6999, + "step": 6300 + }, + { + "epoch": 0.94, + "grad_norm": 2.746730324248295, + "learning_rate": 1.8719922244005602e-06, + "loss": 0.6914, + "step": 6301 + }, + { + "epoch": 0.94, + "grad_norm": 1.227737950327188, + "learning_rate": 1.8719449288535385e-06, + "loss": 0.6895, + "step": 6302 + }, + { + "epoch": 0.94, + "grad_norm": 3.4923046203866352, + "learning_rate": 1.8718976251685778e-06, + "loss": 0.6595, + "step": 6303 + }, + { + "epoch": 0.94, + "grad_norm": 7.009896339404022, + "learning_rate": 1.8718503133461194e-06, + "loss": 0.666, + "step": 6304 + }, + { + "epoch": 0.94, + "grad_norm": 0.771069584633817, + "learning_rate": 1.871802993386605e-06, + "loss": 0.6979, + "step": 6305 + }, + { + "epoch": 0.94, + "grad_norm": 1.9572312615091352, + "learning_rate": 1.8717556652904762e-06, + "loss": 0.6836, + "step": 6306 + }, + { + "epoch": 0.94, + "grad_norm": 0.5454868832478174, + "learning_rate": 1.8717083290581745e-06, + "loss": 0.6647, + "step": 6307 + }, + { + "epoch": 0.94, + "grad_norm": 1.1465355237270876, + "learning_rate": 1.871660984690142e-06, + "loss": 0.6738, + "step": 6308 + }, + { + "epoch": 0.94, + "grad_norm": 0.8528575399931512, + "learning_rate": 1.8716136321868206e-06, + "loss": 0.6797, + "step": 6309 + }, + { + "epoch": 0.94, + "grad_norm": 5.030579952003167, + "learning_rate": 1.8715662715486518e-06, + "loss": 0.6647, + "step": 6310 + }, + { + "epoch": 0.94, + "grad_norm": 6.900582823308894, + "learning_rate": 1.871518902776078e-06, + "loss": 0.7031, + "step": 6311 + }, + { + "epoch": 0.94, + "grad_norm": 4.120375172574917, + "learning_rate": 1.8714715258695413e-06, + "loss": 0.6777, + "step": 6312 + }, + { + "epoch": 0.94, + "grad_norm": 3.462334119493579, + "learning_rate": 1.8714241408294838e-06, + "loss": 0.6842, + "step": 6313 + }, + { + "epoch": 0.94, + "grad_norm": 2.35800051442324, + "learning_rate": 1.8713767476563476e-06, + "loss": 0.6908, + "step": 6314 + }, + { + "epoch": 0.94, + "grad_norm": 5.413083751682385, + "learning_rate": 1.871329346350575e-06, + "loss": 0.6855, + "step": 6315 + }, + { + "epoch": 0.94, + "grad_norm": 1.2444652078019927, + "learning_rate": 1.8712819369126089e-06, + "loss": 0.6927, + "step": 6316 + }, + { + "epoch": 0.94, + "grad_norm": 3.0191343268061455, + "learning_rate": 1.8712345193428912e-06, + "loss": 0.6921, + "step": 6317 + }, + { + "epoch": 0.94, + "grad_norm": 1.0463152051433844, + "learning_rate": 1.8711870936418647e-06, + "loss": 0.6621, + "step": 6318 + }, + { + "epoch": 0.94, + "grad_norm": 0.8377718287648469, + "learning_rate": 1.8711396598099722e-06, + "loss": 0.6888, + "step": 6319 + }, + { + "epoch": 0.94, + "grad_norm": 1.1419936687700452, + "learning_rate": 1.871092217847656e-06, + "loss": 0.679, + "step": 6320 + }, + { + "epoch": 0.94, + "grad_norm": 2.802783431523835, + "learning_rate": 1.871044767755359e-06, + "loss": 0.6641, + "step": 6321 + }, + { + "epoch": 0.94, + "grad_norm": 3.6187657467982977, + "learning_rate": 1.8709973095335245e-06, + "loss": 0.6654, + "step": 6322 + }, + { + "epoch": 0.94, + "grad_norm": 2.036502248466032, + "learning_rate": 1.8709498431825947e-06, + "loss": 0.6777, + "step": 6323 + }, + { + "epoch": 0.94, + "grad_norm": 8.280633613306184, + "learning_rate": 1.8709023687030135e-06, + "loss": 0.6934, + "step": 6324 + }, + { + "epoch": 0.94, + "grad_norm": 0.7229858940460838, + "learning_rate": 1.8708548860952232e-06, + "loss": 0.6699, + "step": 6325 + }, + { + "epoch": 0.94, + "grad_norm": 2.4842633096475866, + "learning_rate": 1.8708073953596671e-06, + "loss": 0.6719, + "step": 6326 + }, + { + "epoch": 0.94, + "grad_norm": 1.597433447215373, + "learning_rate": 1.8707598964967888e-06, + "loss": 0.6771, + "step": 6327 + }, + { + "epoch": 0.94, + "grad_norm": 0.672477327729963, + "learning_rate": 1.870712389507031e-06, + "loss": 0.6758, + "step": 6328 + }, + { + "epoch": 0.94, + "grad_norm": 4.444736119344417, + "learning_rate": 1.870664874390838e-06, + "loss": 0.6973, + "step": 6329 + }, + { + "epoch": 0.94, + "grad_norm": 0.9711076045324156, + "learning_rate": 1.8706173511486522e-06, + "loss": 0.6842, + "step": 6330 + }, + { + "epoch": 0.94, + "grad_norm": 1.11304512187729, + "learning_rate": 1.8705698197809178e-06, + "loss": 0.7096, + "step": 6331 + }, + { + "epoch": 0.94, + "grad_norm": 4.073661286570759, + "learning_rate": 1.8705222802880787e-06, + "loss": 0.6777, + "step": 6332 + }, + { + "epoch": 0.94, + "grad_norm": 3.6190454901551754, + "learning_rate": 1.8704747326705777e-06, + "loss": 0.6706, + "step": 6333 + }, + { + "epoch": 0.94, + "grad_norm": 0.6044836753483153, + "learning_rate": 1.8704271769288593e-06, + "loss": 0.6875, + "step": 6334 + }, + { + "epoch": 0.94, + "grad_norm": 9.518053833179852, + "learning_rate": 1.870379613063367e-06, + "loss": 0.7057, + "step": 6335 + }, + { + "epoch": 0.94, + "grad_norm": 3.926092525193877, + "learning_rate": 1.8703320410745448e-06, + "loss": 0.679, + "step": 6336 + }, + { + "epoch": 0.95, + "grad_norm": 2.2826763369435925, + "learning_rate": 1.8702844609628368e-06, + "loss": 0.6745, + "step": 6337 + }, + { + "epoch": 0.95, + "grad_norm": 2.814634235917494, + "learning_rate": 1.8702368727286868e-06, + "loss": 0.6868, + "step": 6338 + }, + { + "epoch": 0.95, + "grad_norm": 2.9165641780489953, + "learning_rate": 1.870189276372539e-06, + "loss": 0.6966, + "step": 6339 + }, + { + "epoch": 0.95, + "grad_norm": 4.238290712501135, + "learning_rate": 1.870141671894838e-06, + "loss": 0.6764, + "step": 6340 + }, + { + "epoch": 0.95, + "grad_norm": 3.6428603310169407, + "learning_rate": 1.8700940592960277e-06, + "loss": 0.6986, + "step": 6341 + }, + { + "epoch": 0.95, + "grad_norm": 1.624972013718136, + "learning_rate": 1.8700464385765527e-06, + "loss": 0.6608, + "step": 6342 + }, + { + "epoch": 0.95, + "grad_norm": 1.3511328512295284, + "learning_rate": 1.8699988097368574e-06, + "loss": 0.6862, + "step": 6343 + }, + { + "epoch": 0.95, + "grad_norm": 2.716909166682827, + "learning_rate": 1.869951172777386e-06, + "loss": 0.6973, + "step": 6344 + }, + { + "epoch": 0.95, + "grad_norm": 0.5084294036224327, + "learning_rate": 1.8699035276985833e-06, + "loss": 0.694, + "step": 6345 + }, + { + "epoch": 0.95, + "grad_norm": 2.3746068391683752, + "learning_rate": 1.8698558745008943e-06, + "loss": 0.6777, + "step": 6346 + }, + { + "epoch": 0.95, + "grad_norm": 0.45533324359123084, + "learning_rate": 1.8698082131847634e-06, + "loss": 0.6777, + "step": 6347 + }, + { + "epoch": 0.95, + "grad_norm": 8.176578124564198, + "learning_rate": 1.8697605437506357e-06, + "loss": 0.6719, + "step": 6348 + }, + { + "epoch": 0.95, + "grad_norm": 3.4933949889730083, + "learning_rate": 1.869712866198956e-06, + "loss": 0.6777, + "step": 6349 + }, + { + "epoch": 0.95, + "grad_norm": 5.192171252690165, + "learning_rate": 1.869665180530169e-06, + "loss": 0.6855, + "step": 6350 + }, + { + "epoch": 0.95, + "grad_norm": 4.739042826572142, + "learning_rate": 1.8696174867447196e-06, + "loss": 0.6823, + "step": 6351 + }, + { + "epoch": 0.95, + "grad_norm": 3.0008461999517513, + "learning_rate": 1.8695697848430537e-06, + "loss": 0.6777, + "step": 6352 + }, + { + "epoch": 0.95, + "grad_norm": 1.853204439543093, + "learning_rate": 1.8695220748256162e-06, + "loss": 0.6452, + "step": 6353 + }, + { + "epoch": 0.95, + "grad_norm": 0.6458549594395147, + "learning_rate": 1.869474356692852e-06, + "loss": 0.6751, + "step": 6354 + }, + { + "epoch": 0.95, + "grad_norm": 5.939562448553499, + "learning_rate": 1.869426630445207e-06, + "loss": 0.6979, + "step": 6355 + }, + { + "epoch": 0.95, + "grad_norm": 1.939998288446473, + "learning_rate": 1.869378896083126e-06, + "loss": 0.6953, + "step": 6356 + }, + { + "epoch": 0.95, + "grad_norm": 2.11341337253842, + "learning_rate": 1.8693311536070552e-06, + "loss": 0.6999, + "step": 6357 + }, + { + "epoch": 0.95, + "grad_norm": 2.345180576275583, + "learning_rate": 1.8692834030174399e-06, + "loss": 0.6895, + "step": 6358 + }, + { + "epoch": 0.95, + "grad_norm": 5.2903788118546355, + "learning_rate": 1.8692356443147254e-06, + "loss": 0.6816, + "step": 6359 + }, + { + "epoch": 0.95, + "grad_norm": 0.5256284356701191, + "learning_rate": 1.8691878774993581e-06, + "loss": 0.6641, + "step": 6360 + }, + { + "epoch": 0.95, + "grad_norm": 1.4217666691508826, + "learning_rate": 1.8691401025717833e-06, + "loss": 0.6882, + "step": 6361 + }, + { + "epoch": 0.95, + "grad_norm": 1.3108511023037845, + "learning_rate": 1.869092319532447e-06, + "loss": 0.6914, + "step": 6362 + }, + { + "epoch": 0.95, + "grad_norm": 3.085935680086019, + "learning_rate": 1.8690445283817952e-06, + "loss": 0.694, + "step": 6363 + }, + { + "epoch": 0.95, + "grad_norm": 2.8371678039053996, + "learning_rate": 1.8689967291202741e-06, + "loss": 0.6712, + "step": 6364 + }, + { + "epoch": 0.95, + "grad_norm": 4.2091557343865516, + "learning_rate": 1.8689489217483296e-06, + "loss": 0.696, + "step": 6365 + }, + { + "epoch": 0.95, + "grad_norm": 1.071197562054763, + "learning_rate": 1.8689011062664082e-06, + "loss": 0.6732, + "step": 6366 + }, + { + "epoch": 0.95, + "grad_norm": 3.386498337379059, + "learning_rate": 1.8688532826749556e-06, + "loss": 0.6836, + "step": 6367 + }, + { + "epoch": 0.95, + "grad_norm": 0.5007268505865077, + "learning_rate": 1.8688054509744187e-06, + "loss": 0.666, + "step": 6368 + }, + { + "epoch": 0.95, + "grad_norm": 2.0857825663126883, + "learning_rate": 1.8687576111652437e-06, + "loss": 0.7096, + "step": 6369 + }, + { + "epoch": 0.95, + "grad_norm": 6.629661668696504, + "learning_rate": 1.868709763247877e-06, + "loss": 0.6895, + "step": 6370 + }, + { + "epoch": 0.95, + "grad_norm": 0.4718685446535111, + "learning_rate": 1.8686619072227652e-06, + "loss": 0.6699, + "step": 6371 + }, + { + "epoch": 0.95, + "grad_norm": 6.222120425591797, + "learning_rate": 1.8686140430903551e-06, + "loss": 0.6914, + "step": 6372 + }, + { + "epoch": 0.95, + "grad_norm": 0.4730439920787831, + "learning_rate": 1.8685661708510933e-06, + "loss": 0.6712, + "step": 6373 + }, + { + "epoch": 0.95, + "grad_norm": 1.59522467344479, + "learning_rate": 1.8685182905054267e-06, + "loss": 0.6868, + "step": 6374 + }, + { + "epoch": 0.95, + "grad_norm": 0.6628048464614543, + "learning_rate": 1.868470402053802e-06, + "loss": 0.6634, + "step": 6375 + }, + { + "epoch": 0.95, + "grad_norm": 0.47551583851560647, + "learning_rate": 1.868422505496666e-06, + "loss": 0.6628, + "step": 6376 + }, + { + "epoch": 0.95, + "grad_norm": 2.585425464656077, + "learning_rate": 1.8683746008344663e-06, + "loss": 0.6549, + "step": 6377 + }, + { + "epoch": 0.95, + "grad_norm": 4.343447437874227, + "learning_rate": 1.8683266880676496e-06, + "loss": 0.6908, + "step": 6378 + }, + { + "epoch": 0.95, + "grad_norm": 3.9803498272616835, + "learning_rate": 1.8682787671966632e-06, + "loss": 0.6836, + "step": 6379 + }, + { + "epoch": 0.95, + "grad_norm": 2.622632566826875, + "learning_rate": 1.868230838221954e-06, + "loss": 0.7122, + "step": 6380 + }, + { + "epoch": 0.95, + "grad_norm": 7.220735935778717, + "learning_rate": 1.8681829011439696e-06, + "loss": 0.6927, + "step": 6381 + }, + { + "epoch": 0.95, + "grad_norm": 1.7474905086165569, + "learning_rate": 1.8681349559631576e-06, + "loss": 0.6953, + "step": 6382 + }, + { + "epoch": 0.95, + "grad_norm": 2.290641382147447, + "learning_rate": 1.868087002679965e-06, + "loss": 0.6823, + "step": 6383 + }, + { + "epoch": 0.95, + "grad_norm": 1.9603038488069617, + "learning_rate": 1.86803904129484e-06, + "loss": 0.6764, + "step": 6384 + }, + { + "epoch": 0.95, + "grad_norm": 3.1236613251160716, + "learning_rate": 1.86799107180823e-06, + "loss": 0.6738, + "step": 6385 + }, + { + "epoch": 0.95, + "grad_norm": 2.4607759629164634, + "learning_rate": 1.8679430942205822e-06, + "loss": 0.6628, + "step": 6386 + }, + { + "epoch": 0.95, + "grad_norm": 3.3933059638614957, + "learning_rate": 1.8678951085323447e-06, + "loss": 0.6738, + "step": 6387 + }, + { + "epoch": 0.95, + "grad_norm": 0.6976756339955079, + "learning_rate": 1.8678471147439657e-06, + "loss": 0.6979, + "step": 6388 + }, + { + "epoch": 0.95, + "grad_norm": 1.899196384277235, + "learning_rate": 1.8677991128558926e-06, + "loss": 0.6836, + "step": 6389 + }, + { + "epoch": 0.95, + "grad_norm": 3.086466876819816, + "learning_rate": 1.8677511028685736e-06, + "loss": 0.6862, + "step": 6390 + }, + { + "epoch": 0.95, + "grad_norm": 0.4778593222442257, + "learning_rate": 1.8677030847824572e-06, + "loss": 0.6836, + "step": 6391 + }, + { + "epoch": 0.95, + "grad_norm": 3.1351900178871137, + "learning_rate": 1.8676550585979906e-06, + "loss": 0.6947, + "step": 6392 + }, + { + "epoch": 0.95, + "grad_norm": 1.2729538016293718, + "learning_rate": 1.867607024315623e-06, + "loss": 0.6816, + "step": 6393 + }, + { + "epoch": 0.95, + "grad_norm": 0.5566849531719532, + "learning_rate": 1.867558981935802e-06, + "loss": 0.6628, + "step": 6394 + }, + { + "epoch": 0.95, + "grad_norm": 3.5837046206963903, + "learning_rate": 1.8675109314589767e-06, + "loss": 0.6927, + "step": 6395 + }, + { + "epoch": 0.95, + "grad_norm": 9.13061587514612, + "learning_rate": 1.867462872885595e-06, + "loss": 0.7057, + "step": 6396 + }, + { + "epoch": 0.95, + "grad_norm": 6.546731594174343, + "learning_rate": 1.8674148062161055e-06, + "loss": 0.7031, + "step": 6397 + }, + { + "epoch": 0.95, + "grad_norm": 1.6916345295396396, + "learning_rate": 1.8673667314509572e-06, + "loss": 0.6628, + "step": 6398 + }, + { + "epoch": 0.95, + "grad_norm": 1.8922350003484478, + "learning_rate": 1.8673186485905982e-06, + "loss": 0.6706, + "step": 6399 + }, + { + "epoch": 0.95, + "grad_norm": 7.815315041634688, + "learning_rate": 1.8672705576354775e-06, + "loss": 0.6771, + "step": 6400 + }, + { + "epoch": 0.95, + "grad_norm": 1.1532363334440052, + "learning_rate": 1.8672224585860441e-06, + "loss": 0.6862, + "step": 6401 + }, + { + "epoch": 0.95, + "grad_norm": 3.163913610023156, + "learning_rate": 1.8671743514427468e-06, + "loss": 0.6973, + "step": 6402 + }, + { + "epoch": 0.95, + "grad_norm": 2.4469873055913753, + "learning_rate": 1.8671262362060346e-06, + "loss": 0.6764, + "step": 6403 + }, + { + "epoch": 0.96, + "grad_norm": 1.3363082246572333, + "learning_rate": 1.8670781128763565e-06, + "loss": 0.7148, + "step": 6404 + }, + { + "epoch": 0.96, + "grad_norm": 3.451811659617845, + "learning_rate": 1.8670299814541616e-06, + "loss": 0.6966, + "step": 6405 + }, + { + "epoch": 0.96, + "grad_norm": 2.6326615390110235, + "learning_rate": 1.8669818419398995e-06, + "loss": 0.6842, + "step": 6406 + }, + { + "epoch": 0.96, + "grad_norm": 0.5491910773721497, + "learning_rate": 1.866933694334019e-06, + "loss": 0.6986, + "step": 6407 + }, + { + "epoch": 0.96, + "grad_norm": 2.2817817659420436, + "learning_rate": 1.8668855386369698e-06, + "loss": 0.6895, + "step": 6408 + }, + { + "epoch": 0.96, + "grad_norm": 1.9241537719026507, + "learning_rate": 1.866837374849201e-06, + "loss": 0.6973, + "step": 6409 + }, + { + "epoch": 0.96, + "grad_norm": 0.42150719431791617, + "learning_rate": 1.8667892029711623e-06, + "loss": 0.6758, + "step": 6410 + }, + { + "epoch": 0.96, + "grad_norm": 2.6865000810355846, + "learning_rate": 1.8667410230033035e-06, + "loss": 0.6908, + "step": 6411 + }, + { + "epoch": 0.96, + "grad_norm": 2.613917957760698, + "learning_rate": 1.866692834946074e-06, + "loss": 0.6842, + "step": 6412 + }, + { + "epoch": 0.96, + "grad_norm": 1.106876195386032, + "learning_rate": 1.8666446387999237e-06, + "loss": 0.6862, + "step": 6413 + }, + { + "epoch": 0.96, + "grad_norm": 2.163918517097381, + "learning_rate": 1.8665964345653022e-06, + "loss": 0.6875, + "step": 6414 + }, + { + "epoch": 0.96, + "grad_norm": 2.6732776979781585, + "learning_rate": 1.8665482222426595e-06, + "loss": 0.681, + "step": 6415 + }, + { + "epoch": 0.96, + "grad_norm": 2.227815610393997, + "learning_rate": 1.866500001832446e-06, + "loss": 0.679, + "step": 6416 + }, + { + "epoch": 0.96, + "grad_norm": 2.130377400824001, + "learning_rate": 1.8664517733351111e-06, + "loss": 0.6758, + "step": 6417 + }, + { + "epoch": 0.96, + "grad_norm": 5.962267077684355, + "learning_rate": 1.8664035367511054e-06, + "loss": 0.6895, + "step": 6418 + }, + { + "epoch": 0.96, + "grad_norm": 5.700854151168571, + "learning_rate": 1.8663552920808786e-06, + "loss": 0.7025, + "step": 6419 + }, + { + "epoch": 0.96, + "grad_norm": 3.5220790211565816, + "learning_rate": 1.866307039324881e-06, + "loss": 0.6569, + "step": 6420 + }, + { + "epoch": 0.96, + "grad_norm": 5.827481383965972, + "learning_rate": 1.8662587784835635e-06, + "loss": 0.681, + "step": 6421 + }, + { + "epoch": 0.96, + "grad_norm": 4.010988992430959, + "learning_rate": 1.8662105095573762e-06, + "loss": 0.6758, + "step": 6422 + }, + { + "epoch": 0.96, + "grad_norm": 0.4419541804231049, + "learning_rate": 1.8661622325467695e-06, + "loss": 0.6908, + "step": 6423 + }, + { + "epoch": 0.96, + "grad_norm": 0.40991730881231336, + "learning_rate": 1.8661139474521944e-06, + "loss": 0.6764, + "step": 6424 + }, + { + "epoch": 0.96, + "grad_norm": 3.0202695774040063, + "learning_rate": 1.8660656542741009e-06, + "loss": 0.6634, + "step": 6425 + }, + { + "epoch": 0.96, + "grad_norm": 1.2277915006733935, + "learning_rate": 1.86601735301294e-06, + "loss": 0.6706, + "step": 6426 + }, + { + "epoch": 0.96, + "grad_norm": 0.5577591927404695, + "learning_rate": 1.8659690436691629e-06, + "loss": 0.6927, + "step": 6427 + }, + { + "epoch": 0.96, + "grad_norm": 2.724051879227557, + "learning_rate": 1.86592072624322e-06, + "loss": 0.6725, + "step": 6428 + }, + { + "epoch": 0.96, + "grad_norm": 4.986872745246057, + "learning_rate": 1.8658724007355624e-06, + "loss": 0.6973, + "step": 6429 + }, + { + "epoch": 0.96, + "grad_norm": 4.705646844256053, + "learning_rate": 1.865824067146641e-06, + "loss": 0.7057, + "step": 6430 + }, + { + "epoch": 0.96, + "grad_norm": 4.971739332316349, + "learning_rate": 1.865775725476907e-06, + "loss": 0.6764, + "step": 6431 + }, + { + "epoch": 0.96, + "grad_norm": 2.74685527328894, + "learning_rate": 1.8657273757268115e-06, + "loss": 0.6947, + "step": 6432 + }, + { + "epoch": 0.96, + "grad_norm": 4.832057754792157, + "learning_rate": 1.865679017896806e-06, + "loss": 0.7012, + "step": 6433 + }, + { + "epoch": 0.96, + "grad_norm": 2.244650090206115, + "learning_rate": 1.8656306519873417e-06, + "loss": 0.6836, + "step": 6434 + }, + { + "epoch": 0.96, + "grad_norm": 5.034400209936846, + "learning_rate": 1.86558227799887e-06, + "loss": 0.6777, + "step": 6435 + }, + { + "epoch": 0.96, + "grad_norm": 3.8959592766348585, + "learning_rate": 1.8655338959318423e-06, + "loss": 0.6862, + "step": 6436 + }, + { + "epoch": 0.96, + "grad_norm": 1.4347423139688538, + "learning_rate": 1.8654855057867102e-06, + "loss": 0.6712, + "step": 6437 + }, + { + "epoch": 0.96, + "grad_norm": 0.6693169785789351, + "learning_rate": 1.8654371075639253e-06, + "loss": 0.6667, + "step": 6438 + }, + { + "epoch": 0.96, + "grad_norm": 3.015907136284261, + "learning_rate": 1.8653887012639392e-06, + "loss": 0.6647, + "step": 6439 + }, + { + "epoch": 0.96, + "grad_norm": 5.31228266790519, + "learning_rate": 1.865340286887204e-06, + "loss": 0.6816, + "step": 6440 + }, + { + "epoch": 0.96, + "grad_norm": 3.9887796508574636, + "learning_rate": 1.8652918644341714e-06, + "loss": 0.7064, + "step": 6441 + }, + { + "epoch": 0.96, + "grad_norm": 4.3851588314110295, + "learning_rate": 1.8652434339052934e-06, + "loss": 0.6882, + "step": 6442 + }, + { + "epoch": 0.96, + "grad_norm": 3.001408423505329, + "learning_rate": 1.8651949953010219e-06, + "loss": 0.6953, + "step": 6443 + }, + { + "epoch": 0.96, + "grad_norm": 1.7902391339197112, + "learning_rate": 1.8651465486218088e-06, + "loss": 0.6712, + "step": 6444 + }, + { + "epoch": 0.96, + "grad_norm": 3.5151305964195654, + "learning_rate": 1.8650980938681064e-06, + "loss": 0.6921, + "step": 6445 + }, + { + "epoch": 0.96, + "grad_norm": 4.367413048591461, + "learning_rate": 1.8650496310403671e-06, + "loss": 0.6797, + "step": 6446 + }, + { + "epoch": 0.96, + "grad_norm": 2.4421736775200973, + "learning_rate": 1.8650011601390431e-06, + "loss": 0.6719, + "step": 6447 + }, + { + "epoch": 0.96, + "grad_norm": 2.7839325058440494, + "learning_rate": 1.8649526811645869e-06, + "loss": 0.6654, + "step": 6448 + }, + { + "epoch": 0.96, + "grad_norm": 1.5219396190323475, + "learning_rate": 1.8649041941174505e-06, + "loss": 0.6829, + "step": 6449 + }, + { + "epoch": 0.96, + "grad_norm": 1.1192936053754634, + "learning_rate": 1.8648556989980872e-06, + "loss": 0.6634, + "step": 6450 + }, + { + "epoch": 0.96, + "grad_norm": 1.7103247990793193, + "learning_rate": 1.8648071958069489e-06, + "loss": 0.6947, + "step": 6451 + }, + { + "epoch": 0.96, + "grad_norm": 0.9160580319929172, + "learning_rate": 1.8647586845444886e-06, + "loss": 0.6745, + "step": 6452 + }, + { + "epoch": 0.96, + "grad_norm": 5.908656525476134, + "learning_rate": 1.864710165211159e-06, + "loss": 0.6816, + "step": 6453 + }, + { + "epoch": 0.96, + "grad_norm": 4.7894273485312375, + "learning_rate": 1.864661637807413e-06, + "loss": 0.6992, + "step": 6454 + }, + { + "epoch": 0.96, + "grad_norm": 8.146249696515614, + "learning_rate": 1.8646131023337034e-06, + "loss": 0.6986, + "step": 6455 + }, + { + "epoch": 0.96, + "grad_norm": 1.61330951811665, + "learning_rate": 1.8645645587904834e-06, + "loss": 0.6816, + "step": 6456 + }, + { + "epoch": 0.96, + "grad_norm": 0.8654281424910488, + "learning_rate": 1.864516007178206e-06, + "loss": 0.6569, + "step": 6457 + }, + { + "epoch": 0.96, + "grad_norm": 1.5311728292071833, + "learning_rate": 1.8644674474973238e-06, + "loss": 0.6732, + "step": 6458 + }, + { + "epoch": 0.96, + "grad_norm": 12.414689679016519, + "learning_rate": 1.864418879748291e-06, + "loss": 0.679, + "step": 6459 + }, + { + "epoch": 0.96, + "grad_norm": 0.5603552981069959, + "learning_rate": 1.86437030393156e-06, + "loss": 0.6947, + "step": 6460 + }, + { + "epoch": 0.96, + "grad_norm": 4.140813313138478, + "learning_rate": 1.8643217200475845e-06, + "loss": 0.668, + "step": 6461 + }, + { + "epoch": 0.96, + "grad_norm": 2.4993217203469524, + "learning_rate": 1.8642731280968182e-06, + "loss": 0.6673, + "step": 6462 + }, + { + "epoch": 0.96, + "grad_norm": 4.854226818901376, + "learning_rate": 1.8642245280797142e-06, + "loss": 0.6758, + "step": 6463 + }, + { + "epoch": 0.96, + "grad_norm": 2.9708753324087507, + "learning_rate": 1.8641759199967262e-06, + "loss": 0.6647, + "step": 6464 + }, + { + "epoch": 0.96, + "grad_norm": 3.9485293791262137, + "learning_rate": 1.864127303848308e-06, + "loss": 0.6751, + "step": 6465 + }, + { + "epoch": 0.96, + "grad_norm": 1.2598455562122965, + "learning_rate": 1.8640786796349134e-06, + "loss": 0.7018, + "step": 6466 + }, + { + "epoch": 0.96, + "grad_norm": 4.17488338446789, + "learning_rate": 1.864030047356996e-06, + "loss": 0.6758, + "step": 6467 + }, + { + "epoch": 0.96, + "grad_norm": 4.991952364457256, + "learning_rate": 1.8639814070150095e-06, + "loss": 0.6784, + "step": 6468 + }, + { + "epoch": 0.96, + "grad_norm": 7.4476865449194545, + "learning_rate": 1.8639327586094083e-06, + "loss": 0.7012, + "step": 6469 + }, + { + "epoch": 0.96, + "grad_norm": 0.8088379183298321, + "learning_rate": 1.8638841021406462e-06, + "loss": 0.6816, + "step": 6470 + }, + { + "epoch": 0.97, + "grad_norm": 0.705259790953747, + "learning_rate": 1.8638354376091776e-06, + "loss": 0.6725, + "step": 6471 + }, + { + "epoch": 0.97, + "grad_norm": 1.6911709316770736, + "learning_rate": 1.8637867650154562e-06, + "loss": 0.6914, + "step": 6472 + }, + { + "epoch": 0.97, + "grad_norm": 1.1295306972520838, + "learning_rate": 1.8637380843599366e-06, + "loss": 0.666, + "step": 6473 + }, + { + "epoch": 0.97, + "grad_norm": 2.067461421380177, + "learning_rate": 1.8636893956430731e-06, + "loss": 0.707, + "step": 6474 + }, + { + "epoch": 0.97, + "grad_norm": 0.9202734876682023, + "learning_rate": 1.86364069886532e-06, + "loss": 0.7116, + "step": 6475 + }, + { + "epoch": 0.97, + "grad_norm": 2.037852400352243, + "learning_rate": 1.8635919940271322e-06, + "loss": 0.6732, + "step": 6476 + }, + { + "epoch": 0.97, + "grad_norm": 6.607658140279162, + "learning_rate": 1.8635432811289635e-06, + "loss": 0.6784, + "step": 6477 + }, + { + "epoch": 0.97, + "grad_norm": 0.8953560269646634, + "learning_rate": 1.8634945601712692e-06, + "loss": 0.6849, + "step": 6478 + }, + { + "epoch": 0.97, + "grad_norm": 1.115265978631081, + "learning_rate": 1.8634458311545038e-06, + "loss": 0.6973, + "step": 6479 + }, + { + "epoch": 0.97, + "grad_norm": 0.7233522873435755, + "learning_rate": 1.863397094079122e-06, + "loss": 0.6777, + "step": 6480 + }, + { + "epoch": 0.97, + "grad_norm": 2.865075858277898, + "learning_rate": 1.8633483489455792e-06, + "loss": 0.707, + "step": 6481 + }, + { + "epoch": 0.97, + "grad_norm": 1.4416650886711402, + "learning_rate": 1.8632995957543293e-06, + "loss": 0.6797, + "step": 6482 + }, + { + "epoch": 0.97, + "grad_norm": 4.521443264094885, + "learning_rate": 1.8632508345058283e-06, + "loss": 0.6927, + "step": 6483 + }, + { + "epoch": 0.97, + "grad_norm": 0.6147457689246825, + "learning_rate": 1.863202065200531e-06, + "loss": 0.6582, + "step": 6484 + }, + { + "epoch": 0.97, + "grad_norm": 0.8167173762332465, + "learning_rate": 1.8631532878388922e-06, + "loss": 0.6686, + "step": 6485 + }, + { + "epoch": 0.97, + "grad_norm": 1.3013890947521987, + "learning_rate": 1.8631045024213676e-06, + "loss": 0.679, + "step": 6486 + }, + { + "epoch": 0.97, + "grad_norm": 0.8329728766728385, + "learning_rate": 1.8630557089484125e-06, + "loss": 0.6947, + "step": 6487 + }, + { + "epoch": 0.97, + "grad_norm": 0.6159446220399283, + "learning_rate": 1.8630069074204817e-06, + "loss": 0.6738, + "step": 6488 + }, + { + "epoch": 0.97, + "grad_norm": 2.864905539828197, + "learning_rate": 1.8629580978380315e-06, + "loss": 0.694, + "step": 6489 + }, + { + "epoch": 0.97, + "grad_norm": 3.449908623039939, + "learning_rate": 1.862909280201517e-06, + "loss": 0.7103, + "step": 6490 + }, + { + "epoch": 0.97, + "grad_norm": 4.685455837716848, + "learning_rate": 1.8628604545113937e-06, + "loss": 0.6771, + "step": 6491 + }, + { + "epoch": 0.97, + "grad_norm": 6.186574083754597, + "learning_rate": 1.8628116207681174e-06, + "loss": 0.6888, + "step": 6492 + }, + { + "epoch": 0.97, + "grad_norm": 3.780252998917692, + "learning_rate": 1.8627627789721442e-06, + "loss": 0.6641, + "step": 6493 + }, + { + "epoch": 0.97, + "grad_norm": 3.3631368798719343, + "learning_rate": 1.8627139291239296e-06, + "loss": 0.696, + "step": 6494 + }, + { + "epoch": 0.97, + "grad_norm": 0.9940566821054434, + "learning_rate": 1.8626650712239295e-06, + "loss": 0.6699, + "step": 6495 + }, + { + "epoch": 0.97, + "grad_norm": 5.258161102214819, + "learning_rate": 1.8626162052726002e-06, + "loss": 0.6803, + "step": 6496 + }, + { + "epoch": 0.97, + "grad_norm": 0.627750563137641, + "learning_rate": 1.8625673312703976e-06, + "loss": 0.6816, + "step": 6497 + }, + { + "epoch": 0.97, + "grad_norm": 3.8121729641142985, + "learning_rate": 1.8625184492177777e-06, + "loss": 0.6875, + "step": 6498 + }, + { + "epoch": 0.97, + "grad_norm": 1.556215429904746, + "learning_rate": 1.8624695591151967e-06, + "loss": 0.6816, + "step": 6499 + }, + { + "epoch": 0.97, + "grad_norm": 3.4797513590850158, + "learning_rate": 1.862420660963111e-06, + "loss": 0.6738, + "step": 6500 + }, + { + "epoch": 0.97, + "grad_norm": 4.150940912526497, + "learning_rate": 1.8623717547619773e-06, + "loss": 0.6862, + "step": 6501 + }, + { + "epoch": 0.97, + "grad_norm": 5.529064270111706, + "learning_rate": 1.8623228405122514e-06, + "loss": 0.6849, + "step": 6502 + }, + { + "epoch": 0.97, + "grad_norm": 6.915102244967386, + "learning_rate": 1.8622739182143908e-06, + "loss": 0.6901, + "step": 6503 + }, + { + "epoch": 0.97, + "grad_norm": 2.391118047256188, + "learning_rate": 1.8622249878688509e-06, + "loss": 0.6934, + "step": 6504 + }, + { + "epoch": 0.97, + "grad_norm": 3.695184602114022, + "learning_rate": 1.862176049476089e-06, + "loss": 0.6797, + "step": 6505 + }, + { + "epoch": 0.97, + "grad_norm": 1.4096515255291737, + "learning_rate": 1.862127103036562e-06, + "loss": 0.6758, + "step": 6506 + }, + { + "epoch": 0.97, + "grad_norm": 2.9122170326725634, + "learning_rate": 1.8620781485507263e-06, + "loss": 0.6797, + "step": 6507 + }, + { + "epoch": 0.97, + "grad_norm": 1.5943104845108107, + "learning_rate": 1.8620291860190389e-06, + "loss": 0.7031, + "step": 6508 + }, + { + "epoch": 0.97, + "grad_norm": 1.7666537087704017, + "learning_rate": 1.861980215441957e-06, + "loss": 0.668, + "step": 6509 + }, + { + "epoch": 0.97, + "grad_norm": 4.004670085357831, + "learning_rate": 1.8619312368199376e-06, + "loss": 0.6868, + "step": 6510 + }, + { + "epoch": 0.97, + "grad_norm": 3.2067031399889196, + "learning_rate": 1.861882250153438e-06, + "loss": 0.6836, + "step": 6511 + }, + { + "epoch": 0.97, + "grad_norm": 3.927663312147311, + "learning_rate": 1.8618332554429145e-06, + "loss": 0.6823, + "step": 6512 + }, + { + "epoch": 0.97, + "grad_norm": 2.0521064042255817, + "learning_rate": 1.8617842526888254e-06, + "loss": 0.653, + "step": 6513 + }, + { + "epoch": 0.97, + "grad_norm": 3.8270247258073082, + "learning_rate": 1.8617352418916277e-06, + "loss": 0.6771, + "step": 6514 + }, + { + "epoch": 0.97, + "grad_norm": 9.625115786405678, + "learning_rate": 1.8616862230517785e-06, + "loss": 0.7031, + "step": 6515 + }, + { + "epoch": 0.97, + "grad_norm": 4.340144293689997, + "learning_rate": 1.861637196169736e-06, + "loss": 0.6953, + "step": 6516 + }, + { + "epoch": 0.97, + "grad_norm": 0.5785219610972918, + "learning_rate": 1.861588161245957e-06, + "loss": 0.6764, + "step": 6517 + }, + { + "epoch": 0.97, + "grad_norm": 0.4856632784365987, + "learning_rate": 1.8615391182808997e-06, + "loss": 0.6797, + "step": 6518 + }, + { + "epoch": 0.97, + "grad_norm": 0.7986161979252567, + "learning_rate": 1.8614900672750214e-06, + "loss": 0.6999, + "step": 6519 + }, + { + "epoch": 0.97, + "grad_norm": 1.404199554947715, + "learning_rate": 1.8614410082287804e-06, + "loss": 0.6816, + "step": 6520 + }, + { + "epoch": 0.97, + "grad_norm": 0.7750257619326295, + "learning_rate": 1.861391941142634e-06, + "loss": 0.6745, + "step": 6521 + }, + { + "epoch": 0.97, + "grad_norm": 0.6574910260217555, + "learning_rate": 1.8613428660170408e-06, + "loss": 0.6875, + "step": 6522 + }, + { + "epoch": 0.97, + "grad_norm": 1.8376295139625045, + "learning_rate": 1.861293782852458e-06, + "loss": 0.6803, + "step": 6523 + }, + { + "epoch": 0.97, + "grad_norm": 2.3346456286257644, + "learning_rate": 1.8612446916493442e-06, + "loss": 0.6901, + "step": 6524 + }, + { + "epoch": 0.97, + "grad_norm": 1.5279595751096138, + "learning_rate": 1.861195592408158e-06, + "loss": 0.6797, + "step": 6525 + }, + { + "epoch": 0.97, + "grad_norm": 1.4389594953042424, + "learning_rate": 1.8611464851293568e-06, + "loss": 0.6673, + "step": 6526 + }, + { + "epoch": 0.97, + "grad_norm": 0.48641433781952215, + "learning_rate": 1.8610973698133993e-06, + "loss": 0.6693, + "step": 6527 + }, + { + "epoch": 0.97, + "grad_norm": 2.6532099510290372, + "learning_rate": 1.861048246460744e-06, + "loss": 0.6934, + "step": 6528 + }, + { + "epoch": 0.97, + "grad_norm": 7.331790457962405, + "learning_rate": 1.8609991150718493e-06, + "loss": 0.6771, + "step": 6529 + }, + { + "epoch": 0.97, + "grad_norm": 1.0979788911287165, + "learning_rate": 1.8609499756471736e-06, + "loss": 0.7122, + "step": 6530 + }, + { + "epoch": 0.97, + "grad_norm": 4.499578895174726, + "learning_rate": 1.8609008281871757e-06, + "loss": 0.6647, + "step": 6531 + }, + { + "epoch": 0.97, + "grad_norm": 1.5648775651618354, + "learning_rate": 1.8608516726923142e-06, + "loss": 0.6862, + "step": 6532 + }, + { + "epoch": 0.97, + "grad_norm": 0.5076083517645148, + "learning_rate": 1.860802509163048e-06, + "loss": 0.6745, + "step": 6533 + }, + { + "epoch": 0.97, + "grad_norm": 2.360200260004777, + "learning_rate": 1.860753337599836e-06, + "loss": 0.7103, + "step": 6534 + }, + { + "epoch": 0.97, + "grad_norm": 3.7789808795443838, + "learning_rate": 1.8607041580031366e-06, + "loss": 0.7012, + "step": 6535 + }, + { + "epoch": 0.97, + "grad_norm": 2.021089349907555, + "learning_rate": 1.8606549703734093e-06, + "loss": 0.6947, + "step": 6536 + }, + { + "epoch": 0.97, + "grad_norm": 1.1340446019616115, + "learning_rate": 1.860605774711113e-06, + "loss": 0.6836, + "step": 6537 + }, + { + "epoch": 0.98, + "grad_norm": 4.069607612008416, + "learning_rate": 1.860556571016707e-06, + "loss": 0.6745, + "step": 6538 + }, + { + "epoch": 0.98, + "grad_norm": 1.6095251627356104, + "learning_rate": 1.8605073592906506e-06, + "loss": 0.6771, + "step": 6539 + }, + { + "epoch": 0.98, + "grad_norm": 7.155409176034626, + "learning_rate": 1.8604581395334028e-06, + "loss": 0.7018, + "step": 6540 + }, + { + "epoch": 0.98, + "grad_norm": 2.5795875776156305, + "learning_rate": 1.860408911745423e-06, + "loss": 0.6921, + "step": 6541 + }, + { + "epoch": 0.98, + "grad_norm": 7.735267800231069, + "learning_rate": 1.8603596759271707e-06, + "loss": 0.6732, + "step": 6542 + }, + { + "epoch": 0.98, + "grad_norm": 1.5149768344540373, + "learning_rate": 1.8603104320791054e-06, + "loss": 0.6816, + "step": 6543 + }, + { + "epoch": 0.98, + "grad_norm": 5.2079978174206945, + "learning_rate": 1.860261180201687e-06, + "loss": 0.6927, + "step": 6544 + }, + { + "epoch": 0.98, + "grad_norm": 1.8672946484224378, + "learning_rate": 1.8602119202953748e-06, + "loss": 0.6888, + "step": 6545 + }, + { + "epoch": 0.98, + "grad_norm": 4.906850876074825, + "learning_rate": 1.8601626523606286e-06, + "loss": 0.6953, + "step": 6546 + }, + { + "epoch": 0.98, + "grad_norm": 3.289961318004927, + "learning_rate": 1.8601133763979084e-06, + "loss": 0.6842, + "step": 6547 + }, + { + "epoch": 0.98, + "grad_norm": 0.5100168389694729, + "learning_rate": 1.8600640924076737e-06, + "loss": 0.6751, + "step": 6548 + }, + { + "epoch": 0.98, + "grad_norm": 2.480536854988924, + "learning_rate": 1.8600148003903853e-06, + "loss": 0.6849, + "step": 6549 + }, + { + "epoch": 0.98, + "grad_norm": 3.103792370758966, + "learning_rate": 1.8599655003465025e-06, + "loss": 0.6934, + "step": 6550 + }, + { + "epoch": 0.98, + "grad_norm": 2.649421897395156, + "learning_rate": 1.8599161922764854e-06, + "loss": 0.6901, + "step": 6551 + }, + { + "epoch": 0.98, + "grad_norm": 1.6143573991309292, + "learning_rate": 1.8598668761807948e-06, + "loss": 0.6999, + "step": 6552 + }, + { + "epoch": 0.98, + "grad_norm": 1.5480858665790658, + "learning_rate": 1.8598175520598902e-06, + "loss": 0.6979, + "step": 6553 + }, + { + "epoch": 0.98, + "grad_norm": 3.5781258410240753, + "learning_rate": 1.859768219914233e-06, + "loss": 0.6868, + "step": 6554 + }, + { + "epoch": 0.98, + "grad_norm": 4.637923115576538, + "learning_rate": 1.8597188797442823e-06, + "loss": 0.6875, + "step": 6555 + }, + { + "epoch": 0.98, + "grad_norm": 4.668758966057671, + "learning_rate": 1.8596695315504993e-06, + "loss": 0.6921, + "step": 6556 + }, + { + "epoch": 0.98, + "grad_norm": 4.5034688363064665, + "learning_rate": 1.8596201753333447e-06, + "loss": 0.6745, + "step": 6557 + }, + { + "epoch": 0.98, + "grad_norm": 2.2528078132914384, + "learning_rate": 1.8595708110932793e-06, + "loss": 0.6901, + "step": 6558 + }, + { + "epoch": 0.98, + "grad_norm": 2.014753801975447, + "learning_rate": 1.8595214388307631e-06, + "loss": 0.6823, + "step": 6559 + }, + { + "epoch": 0.98, + "grad_norm": 1.562989455909385, + "learning_rate": 1.8594720585462573e-06, + "loss": 0.6706, + "step": 6560 + }, + { + "epoch": 0.98, + "grad_norm": 2.357388145873125, + "learning_rate": 1.8594226702402232e-06, + "loss": 0.6901, + "step": 6561 + }, + { + "epoch": 0.98, + "grad_norm": 2.5372940359902505, + "learning_rate": 1.8593732739131206e-06, + "loss": 0.6882, + "step": 6562 + }, + { + "epoch": 0.98, + "grad_norm": 2.480352146935645, + "learning_rate": 1.8593238695654119e-06, + "loss": 0.6784, + "step": 6563 + }, + { + "epoch": 0.98, + "grad_norm": 2.8273814816110705, + "learning_rate": 1.8592744571975572e-06, + "loss": 0.679, + "step": 6564 + }, + { + "epoch": 0.98, + "grad_norm": 10.046103293501954, + "learning_rate": 1.859225036810018e-06, + "loss": 0.6934, + "step": 6565 + }, + { + "epoch": 0.98, + "grad_norm": 2.631271749268336, + "learning_rate": 1.8591756084032556e-06, + "loss": 0.6882, + "step": 6566 + }, + { + "epoch": 0.98, + "grad_norm": 0.7126056735689793, + "learning_rate": 1.8591261719777313e-06, + "loss": 0.668, + "step": 6567 + }, + { + "epoch": 0.98, + "grad_norm": 1.5869595094254465, + "learning_rate": 1.8590767275339066e-06, + "loss": 0.6836, + "step": 6568 + }, + { + "epoch": 0.98, + "grad_norm": 2.288063662124137, + "learning_rate": 1.8590272750722427e-06, + "loss": 0.6979, + "step": 6569 + }, + { + "epoch": 0.98, + "grad_norm": 1.2096128256718401, + "learning_rate": 1.8589778145932012e-06, + "loss": 0.6868, + "step": 6570 + }, + { + "epoch": 0.98, + "grad_norm": 3.791514354104738, + "learning_rate": 1.8589283460972438e-06, + "loss": 0.6914, + "step": 6571 + }, + { + "epoch": 0.98, + "grad_norm": 0.4542403515596616, + "learning_rate": 1.8588788695848322e-06, + "loss": 0.6719, + "step": 6572 + }, + { + "epoch": 0.98, + "grad_norm": 3.11658147359153, + "learning_rate": 1.8588293850564281e-06, + "loss": 0.681, + "step": 6573 + }, + { + "epoch": 0.98, + "grad_norm": 1.983505906396281, + "learning_rate": 1.8587798925124935e-06, + "loss": 0.6979, + "step": 6574 + }, + { + "epoch": 0.98, + "grad_norm": 1.5823974445573312, + "learning_rate": 1.8587303919534903e-06, + "loss": 0.6823, + "step": 6575 + }, + { + "epoch": 0.98, + "grad_norm": 2.5278051171261926, + "learning_rate": 1.8586808833798804e-06, + "loss": 0.696, + "step": 6576 + }, + { + "epoch": 0.98, + "grad_norm": 1.402897681737694, + "learning_rate": 1.8586313667921257e-06, + "loss": 0.6706, + "step": 6577 + }, + { + "epoch": 0.98, + "grad_norm": 4.244199077412784, + "learning_rate": 1.8585818421906886e-06, + "loss": 0.7025, + "step": 6578 + }, + { + "epoch": 0.98, + "grad_norm": 0.9012311891545943, + "learning_rate": 1.8585323095760312e-06, + "loss": 0.6719, + "step": 6579 + }, + { + "epoch": 0.98, + "grad_norm": 5.3964444735998764, + "learning_rate": 1.8584827689486157e-06, + "loss": 0.6842, + "step": 6580 + }, + { + "epoch": 0.98, + "grad_norm": 0.5057728026577213, + "learning_rate": 1.8584332203089047e-06, + "loss": 0.6725, + "step": 6581 + }, + { + "epoch": 0.98, + "grad_norm": 1.6211927793668222, + "learning_rate": 1.8583836636573609e-06, + "loss": 0.6797, + "step": 6582 + }, + { + "epoch": 0.98, + "grad_norm": 1.341967456958294, + "learning_rate": 1.858334098994446e-06, + "loss": 0.679, + "step": 6583 + }, + { + "epoch": 0.98, + "grad_norm": 1.2553617348124797, + "learning_rate": 1.8582845263206232e-06, + "loss": 0.7012, + "step": 6584 + }, + { + "epoch": 0.98, + "grad_norm": 0.9644592782751504, + "learning_rate": 1.8582349456363552e-06, + "loss": 0.6895, + "step": 6585 + }, + { + "epoch": 0.98, + "grad_norm": 3.560210909026044, + "learning_rate": 1.8581853569421042e-06, + "loss": 0.6914, + "step": 6586 + }, + { + "epoch": 0.98, + "grad_norm": 2.9785200542383454, + "learning_rate": 1.8581357602383334e-06, + "loss": 0.7096, + "step": 6587 + }, + { + "epoch": 0.98, + "grad_norm": 0.4615147990402858, + "learning_rate": 1.858086155525506e-06, + "loss": 0.6875, + "step": 6588 + }, + { + "epoch": 0.98, + "grad_norm": 5.0759530758384805, + "learning_rate": 1.8580365428040846e-06, + "loss": 0.6784, + "step": 6589 + }, + { + "epoch": 0.98, + "grad_norm": 4.868382791996868, + "learning_rate": 1.857986922074532e-06, + "loss": 0.6849, + "step": 6590 + }, + { + "epoch": 0.98, + "grad_norm": 0.9550530177558854, + "learning_rate": 1.8579372933373115e-06, + "loss": 0.6901, + "step": 6591 + }, + { + "epoch": 0.98, + "grad_norm": 6.198385405419071, + "learning_rate": 1.8578876565928866e-06, + "loss": 0.6927, + "step": 6592 + }, + { + "epoch": 0.98, + "grad_norm": 3.6939826871007115, + "learning_rate": 1.8578380118417203e-06, + "loss": 0.6615, + "step": 6593 + }, + { + "epoch": 0.98, + "grad_norm": 2.1640903531856512, + "learning_rate": 1.8577883590842758e-06, + "loss": 0.6784, + "step": 6594 + }, + { + "epoch": 0.98, + "grad_norm": 0.5112385047429189, + "learning_rate": 1.8577386983210168e-06, + "loss": 0.6738, + "step": 6595 + }, + { + "epoch": 0.98, + "grad_norm": 5.474669637103727, + "learning_rate": 1.8576890295524069e-06, + "loss": 0.6816, + "step": 6596 + }, + { + "epoch": 0.98, + "grad_norm": 1.673180834185709, + "learning_rate": 1.857639352778909e-06, + "loss": 0.6615, + "step": 6597 + }, + { + "epoch": 0.98, + "grad_norm": 4.865655573141371, + "learning_rate": 1.8575896680009877e-06, + "loss": 0.6875, + "step": 6598 + }, + { + "epoch": 0.98, + "grad_norm": 1.0894246853448633, + "learning_rate": 1.8575399752191059e-06, + "loss": 0.6816, + "step": 6599 + }, + { + "epoch": 0.98, + "grad_norm": 3.242867241386685, + "learning_rate": 1.8574902744337275e-06, + "loss": 0.6562, + "step": 6600 + }, + { + "epoch": 0.98, + "grad_norm": 2.544371352883825, + "learning_rate": 1.8574405656453168e-06, + "loss": 0.6862, + "step": 6601 + }, + { + "epoch": 0.98, + "grad_norm": 3.2615602583967744, + "learning_rate": 1.8573908488543376e-06, + "loss": 0.6973, + "step": 6602 + }, + { + "epoch": 0.98, + "grad_norm": 3.0667185621977655, + "learning_rate": 1.8573411240612533e-06, + "loss": 0.6641, + "step": 6603 + }, + { + "epoch": 0.98, + "grad_norm": 4.054851608120882, + "learning_rate": 1.8572913912665289e-06, + "loss": 0.6686, + "step": 6604 + }, + { + "epoch": 0.99, + "grad_norm": 1.5240045087569118, + "learning_rate": 1.8572416504706279e-06, + "loss": 0.6829, + "step": 6605 + }, + { + "epoch": 0.99, + "grad_norm": 1.840106068879677, + "learning_rate": 1.8571919016740148e-06, + "loss": 0.6947, + "step": 6606 + }, + { + "epoch": 0.99, + "grad_norm": 3.49720404256001, + "learning_rate": 1.8571421448771538e-06, + "loss": 0.6816, + "step": 6607 + }, + { + "epoch": 0.99, + "grad_norm": 0.5263428055195744, + "learning_rate": 1.8570923800805097e-06, + "loss": 0.6777, + "step": 6608 + }, + { + "epoch": 0.99, + "grad_norm": 3.7224609062243235, + "learning_rate": 1.8570426072845464e-06, + "loss": 0.6888, + "step": 6609 + }, + { + "epoch": 0.99, + "grad_norm": 1.2224810402466941, + "learning_rate": 1.8569928264897286e-06, + "loss": 0.6562, + "step": 6610 + }, + { + "epoch": 0.99, + "grad_norm": 1.9308882780157302, + "learning_rate": 1.856943037696521e-06, + "loss": 0.6699, + "step": 6611 + }, + { + "epoch": 0.99, + "grad_norm": 0.8639326540476732, + "learning_rate": 1.8568932409053884e-06, + "loss": 0.6706, + "step": 6612 + }, + { + "epoch": 0.99, + "grad_norm": 0.5615012683678361, + "learning_rate": 1.856843436116795e-06, + "loss": 0.6758, + "step": 6613 + }, + { + "epoch": 0.99, + "grad_norm": 1.9408903351538997, + "learning_rate": 1.8567936233312062e-06, + "loss": 0.6595, + "step": 6614 + }, + { + "epoch": 0.99, + "grad_norm": 0.640154243178397, + "learning_rate": 1.856743802549087e-06, + "loss": 0.6966, + "step": 6615 + }, + { + "epoch": 0.99, + "grad_norm": 1.8371276934852536, + "learning_rate": 1.856693973770902e-06, + "loss": 0.6777, + "step": 6616 + }, + { + "epoch": 0.99, + "grad_norm": 0.6062073609712393, + "learning_rate": 1.8566441369971163e-06, + "loss": 0.6751, + "step": 6617 + }, + { + "epoch": 0.99, + "grad_norm": 2.195840305925646, + "learning_rate": 1.856594292228195e-06, + "loss": 0.6549, + "step": 6618 + }, + { + "epoch": 0.99, + "grad_norm": 4.449298558374584, + "learning_rate": 1.8565444394646038e-06, + "loss": 0.7051, + "step": 6619 + }, + { + "epoch": 0.99, + "grad_norm": 0.6344704585174077, + "learning_rate": 1.8564945787068074e-06, + "loss": 0.651, + "step": 6620 + }, + { + "epoch": 0.99, + "grad_norm": 2.9550805344990505, + "learning_rate": 1.856444709955271e-06, + "loss": 0.7083, + "step": 6621 + }, + { + "epoch": 0.99, + "grad_norm": 0.6193653291611428, + "learning_rate": 1.856394833210461e-06, + "loss": 0.6641, + "step": 6622 + }, + { + "epoch": 0.99, + "grad_norm": 8.418866017271391, + "learning_rate": 1.8563449484728419e-06, + "loss": 0.6882, + "step": 6623 + }, + { + "epoch": 0.99, + "grad_norm": 5.067953102070022, + "learning_rate": 1.8562950557428797e-06, + "loss": 0.6992, + "step": 6624 + }, + { + "epoch": 0.99, + "grad_norm": 6.991946527817631, + "learning_rate": 1.85624515502104e-06, + "loss": 0.6732, + "step": 6625 + }, + { + "epoch": 0.99, + "grad_norm": 2.332281733353921, + "learning_rate": 1.8561952463077887e-06, + "loss": 0.6745, + "step": 6626 + }, + { + "epoch": 0.99, + "grad_norm": 0.5982639518140968, + "learning_rate": 1.8561453296035913e-06, + "loss": 0.6699, + "step": 6627 + }, + { + "epoch": 0.99, + "grad_norm": 3.987178369159516, + "learning_rate": 1.856095404908914e-06, + "loss": 0.6777, + "step": 6628 + }, + { + "epoch": 0.99, + "grad_norm": 3.9306820417671373, + "learning_rate": 1.8560454722242224e-06, + "loss": 0.6921, + "step": 6629 + }, + { + "epoch": 0.99, + "grad_norm": 1.7133499875243237, + "learning_rate": 1.8559955315499826e-06, + "loss": 0.679, + "step": 6630 + }, + { + "epoch": 0.99, + "grad_norm": 2.0009228360291886, + "learning_rate": 1.8559455828866612e-06, + "loss": 0.6842, + "step": 6631 + }, + { + "epoch": 0.99, + "grad_norm": 5.400798653350712, + "learning_rate": 1.8558956262347235e-06, + "loss": 0.6882, + "step": 6632 + }, + { + "epoch": 0.99, + "grad_norm": 4.036996349281076, + "learning_rate": 1.8558456615946366e-06, + "loss": 0.6771, + "step": 6633 + }, + { + "epoch": 0.99, + "grad_norm": 0.5424438364928837, + "learning_rate": 1.8557956889668664e-06, + "loss": 0.6569, + "step": 6634 + }, + { + "epoch": 0.99, + "grad_norm": 7.424413127840211, + "learning_rate": 1.8557457083518792e-06, + "loss": 0.6901, + "step": 6635 + }, + { + "epoch": 0.99, + "grad_norm": 1.3490981281537682, + "learning_rate": 1.855695719750142e-06, + "loss": 0.6745, + "step": 6636 + }, + { + "epoch": 0.99, + "grad_norm": 2.795498238583546, + "learning_rate": 1.8556457231621202e-06, + "loss": 0.6947, + "step": 6637 + }, + { + "epoch": 0.99, + "grad_norm": 4.040450839826729, + "learning_rate": 1.855595718588282e-06, + "loss": 0.6562, + "step": 6638 + }, + { + "epoch": 0.99, + "grad_norm": 2.3663001290175596, + "learning_rate": 1.8555457060290929e-06, + "loss": 0.6914, + "step": 6639 + }, + { + "epoch": 0.99, + "grad_norm": 2.5086286399934044, + "learning_rate": 1.8554956854850201e-06, + "loss": 0.7109, + "step": 6640 + }, + { + "epoch": 0.99, + "grad_norm": 4.116064733597686, + "learning_rate": 1.8554456569565305e-06, + "loss": 0.6875, + "step": 6641 + }, + { + "epoch": 0.99, + "grad_norm": 1.03916978610033, + "learning_rate": 1.8553956204440907e-06, + "loss": 0.6628, + "step": 6642 + }, + { + "epoch": 0.99, + "grad_norm": 0.9525227747864292, + "learning_rate": 1.8553455759481684e-06, + "loss": 0.6816, + "step": 6643 + }, + { + "epoch": 0.99, + "grad_norm": 6.873851551920292, + "learning_rate": 1.8552955234692298e-06, + "loss": 0.7025, + "step": 6644 + }, + { + "epoch": 0.99, + "grad_norm": 2.4889355746045236, + "learning_rate": 1.8552454630077426e-06, + "loss": 0.7201, + "step": 6645 + }, + { + "epoch": 0.99, + "grad_norm": 1.1776546745445406, + "learning_rate": 1.8551953945641738e-06, + "loss": 0.6829, + "step": 6646 + }, + { + "epoch": 0.99, + "grad_norm": 1.4942071925553457, + "learning_rate": 1.8551453181389906e-06, + "loss": 0.6862, + "step": 6647 + }, + { + "epoch": 0.99, + "grad_norm": 4.451260514137733, + "learning_rate": 1.8550952337326606e-06, + "loss": 0.668, + "step": 6648 + }, + { + "epoch": 0.99, + "grad_norm": 1.7699360379328808, + "learning_rate": 1.8550451413456517e-06, + "loss": 0.6803, + "step": 6649 + }, + { + "epoch": 0.99, + "grad_norm": 0.5106385091673709, + "learning_rate": 1.85499504097843e-06, + "loss": 0.6595, + "step": 6650 + }, + { + "epoch": 0.99, + "grad_norm": 1.3858164945546978, + "learning_rate": 1.8549449326314645e-06, + "loss": 0.6836, + "step": 6651 + }, + { + "epoch": 0.99, + "grad_norm": 0.44224441975790196, + "learning_rate": 1.8548948163052222e-06, + "loss": 0.6953, + "step": 6652 + }, + { + "epoch": 0.99, + "grad_norm": 3.801687938947746, + "learning_rate": 1.8548446920001712e-06, + "loss": 0.6908, + "step": 6653 + }, + { + "epoch": 0.99, + "grad_norm": 1.6913857077573615, + "learning_rate": 1.8547945597167786e-06, + "loss": 0.6725, + "step": 6654 + }, + { + "epoch": 0.99, + "grad_norm": 3.7086335553375545, + "learning_rate": 1.8547444194555132e-06, + "loss": 0.6908, + "step": 6655 + }, + { + "epoch": 0.99, + "grad_norm": 1.037105814863272, + "learning_rate": 1.8546942712168424e-06, + "loss": 0.6849, + "step": 6656 + }, + { + "epoch": 0.99, + "grad_norm": 4.488230493530531, + "learning_rate": 1.8546441150012346e-06, + "loss": 0.679, + "step": 6657 + }, + { + "epoch": 0.99, + "grad_norm": 0.4618132980791901, + "learning_rate": 1.8545939508091575e-06, + "loss": 0.6895, + "step": 6658 + }, + { + "epoch": 0.99, + "grad_norm": 1.7537446658013136, + "learning_rate": 1.8545437786410794e-06, + "loss": 0.6712, + "step": 6659 + }, + { + "epoch": 0.99, + "grad_norm": 1.2739831478639936, + "learning_rate": 1.8544935984974687e-06, + "loss": 0.6836, + "step": 6660 + }, + { + "epoch": 0.99, + "grad_norm": 2.142022932527656, + "learning_rate": 1.8544434103787939e-06, + "loss": 0.7031, + "step": 6661 + }, + { + "epoch": 0.99, + "grad_norm": 6.223524017626008, + "learning_rate": 1.8543932142855227e-06, + "loss": 0.6849, + "step": 6662 + }, + { + "epoch": 0.99, + "grad_norm": 2.7713765894803797, + "learning_rate": 1.8543430102181245e-06, + "loss": 0.7012, + "step": 6663 + }, + { + "epoch": 0.99, + "grad_norm": 1.1423801923504797, + "learning_rate": 1.854292798177067e-06, + "loss": 0.6771, + "step": 6664 + }, + { + "epoch": 0.99, + "grad_norm": 1.7029727941254367, + "learning_rate": 1.8542425781628197e-06, + "loss": 0.6829, + "step": 6665 + }, + { + "epoch": 0.99, + "grad_norm": 1.2024348599348444, + "learning_rate": 1.8541923501758508e-06, + "loss": 0.6751, + "step": 6666 + }, + { + "epoch": 0.99, + "grad_norm": 0.9945017284871528, + "learning_rate": 1.8541421142166291e-06, + "loss": 0.6706, + "step": 6667 + }, + { + "epoch": 0.99, + "grad_norm": 0.5320563327890223, + "learning_rate": 1.8540918702856234e-06, + "loss": 0.6764, + "step": 6668 + }, + { + "epoch": 0.99, + "grad_norm": 4.878222847483, + "learning_rate": 1.8540416183833027e-06, + "loss": 0.6999, + "step": 6669 + }, + { + "epoch": 0.99, + "grad_norm": 1.7383066578296853, + "learning_rate": 1.8539913585101363e-06, + "loss": 0.6725, + "step": 6670 + }, + { + "epoch": 0.99, + "grad_norm": 2.5661593154181657, + "learning_rate": 1.853941090666593e-06, + "loss": 0.6647, + "step": 6671 + }, + { + "epoch": 1.0, + "grad_norm": 0.40965815823185214, + "learning_rate": 1.8538908148531418e-06, + "loss": 0.6751, + "step": 6672 + }, + { + "epoch": 1.0, + "grad_norm": 0.46381971202293953, + "learning_rate": 1.853840531070252e-06, + "loss": 0.6725, + "step": 6673 + }, + { + "epoch": 1.0, + "grad_norm": 4.4992269712789925, + "learning_rate": 1.8537902393183935e-06, + "loss": 0.6712, + "step": 6674 + }, + { + "epoch": 1.0, + "grad_norm": 1.9171604285525736, + "learning_rate": 1.853739939598035e-06, + "loss": 0.681, + "step": 6675 + }, + { + "epoch": 1.0, + "grad_norm": 0.4751883412307195, + "learning_rate": 1.853689631909646e-06, + "loss": 0.6602, + "step": 6676 + }, + { + "epoch": 1.0, + "grad_norm": 1.8786727352684027, + "learning_rate": 1.8536393162536963e-06, + "loss": 0.6921, + "step": 6677 + }, + { + "epoch": 1.0, + "grad_norm": 3.6142197022294877, + "learning_rate": 1.8535889926306552e-06, + "loss": 0.6901, + "step": 6678 + }, + { + "epoch": 1.0, + "grad_norm": 3.8995436956808702, + "learning_rate": 1.8535386610409925e-06, + "loss": 0.6855, + "step": 6679 + }, + { + "epoch": 1.0, + "grad_norm": 1.6129631250119167, + "learning_rate": 1.8534883214851781e-06, + "loss": 0.707, + "step": 6680 + }, + { + "epoch": 1.0, + "grad_norm": 2.220523500475032, + "learning_rate": 1.8534379739636818e-06, + "loss": 0.6797, + "step": 6681 + }, + { + "epoch": 1.0, + "grad_norm": 5.67938625309235, + "learning_rate": 1.8533876184769735e-06, + "loss": 0.6882, + "step": 6682 + }, + { + "epoch": 1.0, + "grad_norm": 2.274947275043748, + "learning_rate": 1.8533372550255228e-06, + "loss": 0.709, + "step": 6683 + }, + { + "epoch": 1.0, + "grad_norm": 0.48727245797344587, + "learning_rate": 1.8532868836098e-06, + "loss": 0.6901, + "step": 6684 + }, + { + "epoch": 1.0, + "grad_norm": 3.7634126293764205, + "learning_rate": 1.8532365042302756e-06, + "loss": 0.6751, + "step": 6685 + }, + { + "epoch": 1.0, + "grad_norm": 2.164177434757322, + "learning_rate": 1.8531861168874192e-06, + "loss": 0.6947, + "step": 6686 + }, + { + "epoch": 1.0, + "grad_norm": 3.244135252581009, + "learning_rate": 1.8531357215817015e-06, + "loss": 0.6777, + "step": 6687 + }, + { + "epoch": 1.0, + "grad_norm": 0.48730153262053794, + "learning_rate": 1.8530853183135925e-06, + "loss": 0.6764, + "step": 6688 + }, + { + "epoch": 1.0, + "grad_norm": 1.64185547126021, + "learning_rate": 1.853034907083563e-06, + "loss": 0.6608, + "step": 6689 + }, + { + "epoch": 1.0, + "grad_norm": 0.7217067731860382, + "learning_rate": 1.852984487892083e-06, + "loss": 0.6634, + "step": 6690 + }, + { + "epoch": 1.0, + "grad_norm": 3.7133663960426704, + "learning_rate": 1.8529340607396236e-06, + "loss": 0.6934, + "step": 6691 + }, + { + "epoch": 1.0, + "grad_norm": 4.864273730652383, + "learning_rate": 1.8528836256266552e-06, + "loss": 0.6797, + "step": 6692 + }, + { + "epoch": 1.0, + "grad_norm": 3.2994075298404586, + "learning_rate": 1.852833182553648e-06, + "loss": 0.6862, + "step": 6693 + }, + { + "epoch": 1.0, + "grad_norm": 1.5191310130089788, + "learning_rate": 1.8527827315210737e-06, + "loss": 0.6842, + "step": 6694 + }, + { + "epoch": 1.0, + "grad_norm": 4.973775898472121, + "learning_rate": 1.8527322725294028e-06, + "loss": 0.6686, + "step": 6695 + }, + { + "epoch": 1.0, + "grad_norm": 1.6388192113420652, + "learning_rate": 1.852681805579106e-06, + "loss": 0.6745, + "step": 6696 + }, + { + "epoch": 1.0, + "grad_norm": 1.7884944810996695, + "learning_rate": 1.8526313306706544e-06, + "loss": 0.6855, + "step": 6697 + }, + { + "epoch": 1.0, + "grad_norm": 2.716976365409744, + "learning_rate": 1.8525808478045194e-06, + "loss": 0.6751, + "step": 6698 + }, + { + "epoch": 1.0, + "grad_norm": 1.1371545334006539, + "learning_rate": 1.8525303569811719e-06, + "loss": 0.6771, + "step": 6699 + }, + { + "epoch": 1.0, + "grad_norm": 2.020738827440076, + "learning_rate": 1.8524798582010831e-06, + "loss": 0.6868, + "step": 6700 + }, + { + "epoch": 1.0, + "grad_norm": 5.05472135232009, + "learning_rate": 1.8524293514647247e-06, + "loss": 0.6738, + "step": 6701 + }, + { + "epoch": 1.0, + "grad_norm": 1.3671468363030728, + "learning_rate": 1.8523788367725675e-06, + "loss": 0.6842, + "step": 6702 + }, + { + "epoch": 1.0, + "grad_norm": 2.7057062988042238, + "learning_rate": 1.852328314125083e-06, + "loss": 0.6706, + "step": 6703 + }, + { + "epoch": 1.0, + "grad_norm": 3.718673217343224, + "learning_rate": 1.8522777835227434e-06, + "loss": 0.6934, + "step": 6704 + }, + { + "epoch": 1.0, + "grad_norm": 2.637707467072178, + "learning_rate": 1.8522272449660199e-06, + "loss": 0.6803, + "step": 6705 + }, + { + "epoch": 1.0, + "grad_norm": 2.816426554616545, + "learning_rate": 1.852176698455384e-06, + "loss": 0.6556, + "step": 6706 + }, + { + "epoch": 1.0, + "grad_norm": 0.6390331347292056, + "learning_rate": 1.8521261439913075e-06, + "loss": 0.6738, + "step": 6707 + }, + { + "epoch": 1.0, + "grad_norm": 1.9787469629136438, + "learning_rate": 1.8520755815742626e-06, + "loss": 0.6797, + "step": 6708 + }, + { + "epoch": 1.0, + "grad_norm": 1.9478654962043869, + "learning_rate": 1.8520250112047209e-06, + "loss": 0.6699, + "step": 6709 + }, + { + "epoch": 1.0, + "grad_norm": 1.3241198080804524, + "learning_rate": 1.851974432883154e-06, + "loss": 0.6901, + "step": 6710 + }, + { + "epoch": 1.0, + "grad_norm": 5.876353607340394, + "learning_rate": 1.851923846610035e-06, + "loss": 0.6602, + "step": 6711 + }, + { + "epoch": 1.0, + "grad_norm": 2.277558455076857, + "learning_rate": 1.851873252385835e-06, + "loss": 0.6719, + "step": 6712 + }, + { + "epoch": 1.0, + "grad_norm": 0.7360719693991303, + "learning_rate": 1.8518226502110266e-06, + "loss": 0.6654, + "step": 6713 + }, + { + "epoch": 1.0, + "grad_norm": 2.511294108832639, + "learning_rate": 1.851772040086082e-06, + "loss": 0.6855, + "step": 6714 + }, + { + "epoch": 1.0, + "grad_norm": 2.002974574498549, + "learning_rate": 1.851721422011474e-06, + "loss": 0.6667, + "step": 6715 + }, + { + "epoch": 1.0, + "grad_norm": 2.3600523646879643, + "learning_rate": 1.8516707959876743e-06, + "loss": 0.681, + "step": 6716 + }, + { + "epoch": 1.0, + "grad_norm": 0.8120373159762168, + "learning_rate": 1.8516201620151558e-06, + "loss": 0.6536, + "step": 6717 + }, + { + "epoch": 1.0, + "grad_norm": 2.6770606664083743, + "learning_rate": 1.851569520094391e-06, + "loss": 0.6797, + "step": 6718 + }, + { + "epoch": 1.0, + "grad_norm": 1.5771173178695845, + "learning_rate": 1.8515188702258529e-06, + "loss": 0.6576, + "step": 6719 + }, + { + "epoch": 1.0, + "grad_norm": 1.9903552812354621, + "learning_rate": 1.8514682124100136e-06, + "loss": 0.7025, + "step": 6720 + }, + { + "epoch": 1.0, + "grad_norm": 2.377402397193773, + "learning_rate": 1.851417546647346e-06, + "loss": 0.681, + "step": 6721 + }, + { + "epoch": 1.0, + "grad_norm": 1.150047670111178, + "learning_rate": 1.8513668729383235e-06, + "loss": 0.6719, + "step": 6722 + }, + { + "epoch": 1.0, + "grad_norm": 4.246290968722913, + "learning_rate": 1.8513161912834186e-06, + "loss": 0.6673, + "step": 6723 + }, + { + "epoch": 1.0, + "grad_norm": 2.4138079748270416, + "learning_rate": 1.8512655016831042e-06, + "loss": 0.6732, + "step": 6724 + }, + { + "epoch": 1.0, + "grad_norm": 2.343296935360941, + "learning_rate": 1.8512148041378536e-06, + "loss": 0.6771, + "step": 6725 + }, + { + "epoch": 1.0, + "grad_norm": 4.769514021095052, + "learning_rate": 1.8511640986481402e-06, + "loss": 0.6647, + "step": 6726 + }, + { + "epoch": 1.0, + "grad_norm": 1.4894220056178296, + "learning_rate": 1.8511133852144369e-06, + "loss": 0.6973, + "step": 6727 + }, + { + "epoch": 1.0, + "grad_norm": 4.435300556584951, + "learning_rate": 1.851062663837217e-06, + "loss": 0.6654, + "step": 6728 + }, + { + "epoch": 1.0, + "grad_norm": 2.0519547394250592, + "learning_rate": 1.8510119345169542e-06, + "loss": 0.6758, + "step": 6729 + }, + { + "epoch": 1.0, + "grad_norm": 0.8635415782863484, + "learning_rate": 1.8509611972541214e-06, + "loss": 0.6849, + "step": 6730 + }, + { + "epoch": 1.0, + "grad_norm": 0.752279199469599, + "learning_rate": 1.8509104520491929e-06, + "loss": 0.6927, + "step": 6731 + }, + { + "epoch": 1.0, + "grad_norm": 8.598727998330249, + "learning_rate": 1.8508596989026418e-06, + "loss": 0.7018, + "step": 6732 + }, + { + "epoch": 1.0, + "grad_norm": 0.9670847674620684, + "learning_rate": 1.8508089378149417e-06, + "loss": 0.6836, + "step": 6733 + }, + { + "epoch": 1.0, + "grad_norm": 0.8969656455640703, + "learning_rate": 1.8507581687865665e-06, + "loss": 0.6706, + "step": 6734 + }, + { + "epoch": 1.0, + "grad_norm": 0.7653807459713625, + "learning_rate": 1.8507073918179902e-06, + "loss": 0.6595, + "step": 6735 + }, + { + "epoch": 1.0, + "grad_norm": 2.901754647266317, + "learning_rate": 1.8506566069096868e-06, + "loss": 0.6517, + "step": 6736 + }, + { + "epoch": 1.0, + "grad_norm": 1.1623412040532268, + "learning_rate": 1.8506058140621297e-06, + "loss": 0.666, + "step": 6737 + }, + { + "epoch": 1.0, + "grad_norm": 4.226294731660979, + "learning_rate": 1.8505550132757934e-06, + "loss": 0.6921, + "step": 6738 + }, + { + "epoch": 1.01, + "grad_norm": 0.5805963236388242, + "learning_rate": 1.8505042045511522e-06, + "loss": 0.6621, + "step": 6739 + }, + { + "epoch": 1.01, + "grad_norm": 0.5986910524191317, + "learning_rate": 1.8504533878886798e-06, + "loss": 0.6816, + "step": 6740 + }, + { + "epoch": 1.01, + "grad_norm": 0.7388439311335336, + "learning_rate": 1.8504025632888507e-06, + "loss": 0.6634, + "step": 6741 + }, + { + "epoch": 1.01, + "grad_norm": 3.283468511101873, + "learning_rate": 1.8503517307521396e-06, + "loss": 0.6784, + "step": 6742 + }, + { + "epoch": 1.01, + "grad_norm": 0.7981056347414509, + "learning_rate": 1.85030089027902e-06, + "loss": 0.6673, + "step": 6743 + }, + { + "epoch": 1.01, + "grad_norm": 3.8614589987564436, + "learning_rate": 1.8502500418699675e-06, + "loss": 0.6973, + "step": 6744 + }, + { + "epoch": 1.01, + "grad_norm": 4.4928893732392075, + "learning_rate": 1.850199185525456e-06, + "loss": 0.6803, + "step": 6745 + }, + { + "epoch": 1.01, + "grad_norm": 5.076279403684931, + "learning_rate": 1.8501483212459603e-06, + "loss": 0.6914, + "step": 6746 + }, + { + "epoch": 1.01, + "grad_norm": 0.9875889973875243, + "learning_rate": 1.850097449031955e-06, + "loss": 0.6732, + "step": 6747 + }, + { + "epoch": 1.01, + "grad_norm": 0.9541449811965922, + "learning_rate": 1.8500465688839153e-06, + "loss": 0.6999, + "step": 6748 + }, + { + "epoch": 1.01, + "grad_norm": 6.428990584058827, + "learning_rate": 1.8499956808023156e-06, + "loss": 0.6921, + "step": 6749 + }, + { + "epoch": 1.01, + "grad_norm": 1.0720778019924604, + "learning_rate": 1.849944784787631e-06, + "loss": 0.6745, + "step": 6750 + }, + { + "epoch": 1.01, + "grad_norm": 2.23513168856775, + "learning_rate": 1.8498938808403368e-06, + "loss": 0.6738, + "step": 6751 + }, + { + "epoch": 1.01, + "grad_norm": 0.8691727099860858, + "learning_rate": 1.8498429689609077e-06, + "loss": 0.6986, + "step": 6752 + }, + { + "epoch": 1.01, + "grad_norm": 0.9868938937371229, + "learning_rate": 1.849792049149819e-06, + "loss": 0.6608, + "step": 6753 + }, + { + "epoch": 1.01, + "grad_norm": 4.24219265277345, + "learning_rate": 1.849741121407546e-06, + "loss": 0.6829, + "step": 6754 + }, + { + "epoch": 1.01, + "grad_norm": 0.6468302482115987, + "learning_rate": 1.8496901857345638e-06, + "loss": 0.6855, + "step": 6755 + }, + { + "epoch": 1.01, + "grad_norm": 2.0079755807610695, + "learning_rate": 1.8496392421313482e-06, + "loss": 0.6634, + "step": 6756 + }, + { + "epoch": 1.01, + "grad_norm": 0.6824658922518377, + "learning_rate": 1.8495882905983745e-06, + "loss": 0.6908, + "step": 6757 + }, + { + "epoch": 1.01, + "grad_norm": 3.7156191020573557, + "learning_rate": 1.8495373311361178e-06, + "loss": 0.681, + "step": 6758 + }, + { + "epoch": 1.01, + "grad_norm": 6.50217442669622, + "learning_rate": 1.8494863637450542e-06, + "loss": 0.6934, + "step": 6759 + }, + { + "epoch": 1.01, + "grad_norm": 3.563188533605097, + "learning_rate": 1.8494353884256593e-06, + "loss": 0.7012, + "step": 6760 + }, + { + "epoch": 1.01, + "grad_norm": 1.6401980665392026, + "learning_rate": 1.849384405178409e-06, + "loss": 0.6569, + "step": 6761 + }, + { + "epoch": 1.01, + "grad_norm": 0.7588296227375738, + "learning_rate": 1.8493334140037787e-06, + "loss": 0.6725, + "step": 6762 + }, + { + "epoch": 1.01, + "grad_norm": 2.3643372236103, + "learning_rate": 1.8492824149022446e-06, + "loss": 0.6497, + "step": 6763 + }, + { + "epoch": 1.01, + "grad_norm": 0.6473290324342449, + "learning_rate": 1.8492314078742823e-06, + "loss": 0.653, + "step": 6764 + }, + { + "epoch": 1.01, + "grad_norm": 1.849080898445266, + "learning_rate": 1.8491803929203687e-06, + "loss": 0.6497, + "step": 6765 + }, + { + "epoch": 1.01, + "grad_norm": 0.9333701705194225, + "learning_rate": 1.8491293700409789e-06, + "loss": 0.6615, + "step": 6766 + }, + { + "epoch": 1.01, + "grad_norm": 1.6253975661730835, + "learning_rate": 1.8490783392365897e-06, + "loss": 0.6732, + "step": 6767 + }, + { + "epoch": 1.01, + "grad_norm": 4.500039846260997, + "learning_rate": 1.8490273005076776e-06, + "loss": 0.6523, + "step": 6768 + }, + { + "epoch": 1.01, + "grad_norm": 2.0349895059843996, + "learning_rate": 1.8489762538547184e-06, + "loss": 0.6888, + "step": 6769 + }, + { + "epoch": 1.01, + "grad_norm": 6.232445569354863, + "learning_rate": 1.8489251992781887e-06, + "loss": 0.6797, + "step": 6770 + }, + { + "epoch": 1.01, + "grad_norm": 0.9427093178274546, + "learning_rate": 1.8488741367785651e-06, + "loss": 0.6921, + "step": 6771 + }, + { + "epoch": 1.01, + "grad_norm": 2.2103804010433064, + "learning_rate": 1.8488230663563241e-06, + "loss": 0.6725, + "step": 6772 + }, + { + "epoch": 1.01, + "grad_norm": 2.0624014580042362, + "learning_rate": 1.8487719880119423e-06, + "loss": 0.6719, + "step": 6773 + }, + { + "epoch": 1.01, + "grad_norm": 4.783392885548068, + "learning_rate": 1.8487209017458967e-06, + "loss": 0.6882, + "step": 6774 + }, + { + "epoch": 1.01, + "grad_norm": 0.9607285016975315, + "learning_rate": 1.8486698075586636e-06, + "loss": 0.666, + "step": 6775 + }, + { + "epoch": 1.01, + "grad_norm": 2.859620780855016, + "learning_rate": 1.8486187054507202e-06, + "loss": 0.6934, + "step": 6776 + }, + { + "epoch": 1.01, + "grad_norm": 1.6127144290083153, + "learning_rate": 1.8485675954225433e-06, + "loss": 0.6973, + "step": 6777 + }, + { + "epoch": 1.01, + "grad_norm": 7.535248662080887, + "learning_rate": 1.8485164774746103e-06, + "loss": 0.6745, + "step": 6778 + }, + { + "epoch": 1.01, + "grad_norm": 1.263475649774124, + "learning_rate": 1.8484653516073977e-06, + "loss": 0.7188, + "step": 6779 + }, + { + "epoch": 1.01, + "grad_norm": 1.9928857678271354, + "learning_rate": 1.848414217821383e-06, + "loss": 0.7279, + "step": 6780 + }, + { + "epoch": 1.01, + "grad_norm": 2.348088253370447, + "learning_rate": 1.8483630761170433e-06, + "loss": 0.6901, + "step": 6781 + }, + { + "epoch": 1.01, + "grad_norm": 3.7003022291404304, + "learning_rate": 1.8483119264948561e-06, + "loss": 0.6771, + "step": 6782 + }, + { + "epoch": 1.01, + "grad_norm": 0.7689156347858895, + "learning_rate": 1.8482607689552987e-06, + "loss": 0.6536, + "step": 6783 + }, + { + "epoch": 1.01, + "grad_norm": 1.359722194302439, + "learning_rate": 1.8482096034988483e-06, + "loss": 0.6686, + "step": 6784 + }, + { + "epoch": 1.01, + "grad_norm": 4.920130466515206, + "learning_rate": 1.848158430125983e-06, + "loss": 0.6927, + "step": 6785 + }, + { + "epoch": 1.01, + "grad_norm": 0.8162601658576183, + "learning_rate": 1.8481072488371798e-06, + "loss": 0.6823, + "step": 6786 + }, + { + "epoch": 1.01, + "grad_norm": 2.1297573839471067, + "learning_rate": 1.8480560596329169e-06, + "loss": 0.7057, + "step": 6787 + }, + { + "epoch": 1.01, + "grad_norm": 0.9166980454613854, + "learning_rate": 1.8480048625136715e-06, + "loss": 0.681, + "step": 6788 + }, + { + "epoch": 1.01, + "grad_norm": 1.886696251920872, + "learning_rate": 1.847953657479922e-06, + "loss": 0.6589, + "step": 6789 + }, + { + "epoch": 1.01, + "grad_norm": 1.5002320147457997, + "learning_rate": 1.8479024445321458e-06, + "loss": 0.668, + "step": 6790 + }, + { + "epoch": 1.01, + "grad_norm": 4.245867714994213, + "learning_rate": 1.8478512236708212e-06, + "loss": 0.6842, + "step": 6791 + }, + { + "epoch": 1.01, + "grad_norm": 1.892279196622061, + "learning_rate": 1.8477999948964263e-06, + "loss": 0.6712, + "step": 6792 + }, + { + "epoch": 1.01, + "grad_norm": 0.8260073805473119, + "learning_rate": 1.8477487582094386e-06, + "loss": 0.6751, + "step": 6793 + }, + { + "epoch": 1.01, + "grad_norm": 2.681289355190913, + "learning_rate": 1.8476975136103373e-06, + "loss": 0.6862, + "step": 6794 + }, + { + "epoch": 1.01, + "grad_norm": 2.3691621522813286, + "learning_rate": 1.8476462610995997e-06, + "loss": 0.6855, + "step": 6795 + }, + { + "epoch": 1.01, + "grad_norm": 2.4499993852873923, + "learning_rate": 1.847595000677705e-06, + "loss": 0.6934, + "step": 6796 + }, + { + "epoch": 1.01, + "grad_norm": 0.6242882453956032, + "learning_rate": 1.847543732345131e-06, + "loss": 0.6966, + "step": 6797 + }, + { + "epoch": 1.01, + "grad_norm": 3.152548475449875, + "learning_rate": 1.8474924561023561e-06, + "loss": 0.6927, + "step": 6798 + }, + { + "epoch": 1.01, + "grad_norm": 0.7881780141028234, + "learning_rate": 1.8474411719498596e-06, + "loss": 0.681, + "step": 6799 + }, + { + "epoch": 1.01, + "grad_norm": 0.5134501397178901, + "learning_rate": 1.8473898798881195e-06, + "loss": 0.6862, + "step": 6800 + }, + { + "epoch": 1.01, + "grad_norm": 2.946344838914507, + "learning_rate": 1.8473385799176149e-06, + "loss": 0.6901, + "step": 6801 + }, + { + "epoch": 1.01, + "grad_norm": 2.9832002540945646, + "learning_rate": 1.8472872720388243e-06, + "loss": 0.6771, + "step": 6802 + }, + { + "epoch": 1.01, + "grad_norm": 2.0176979328212648, + "learning_rate": 1.8472359562522266e-06, + "loss": 0.6712, + "step": 6803 + }, + { + "epoch": 1.01, + "grad_norm": 3.22598011632839, + "learning_rate": 1.8471846325583006e-06, + "loss": 0.6927, + "step": 6804 + }, + { + "epoch": 1.01, + "grad_norm": 2.3068081165127663, + "learning_rate": 1.8471333009575258e-06, + "loss": 0.7051, + "step": 6805 + }, + { + "epoch": 1.02, + "grad_norm": 3.3202310620423963, + "learning_rate": 1.847081961450381e-06, + "loss": 0.6777, + "step": 6806 + }, + { + "epoch": 1.02, + "grad_norm": 0.8876538262049494, + "learning_rate": 1.847030614037345e-06, + "loss": 0.6908, + "step": 6807 + }, + { + "epoch": 1.02, + "grad_norm": 2.4747738475843284, + "learning_rate": 1.8469792587188978e-06, + "loss": 0.6745, + "step": 6808 + }, + { + "epoch": 1.02, + "grad_norm": 0.7015275967678047, + "learning_rate": 1.846927895495518e-06, + "loss": 0.6829, + "step": 6809 + }, + { + "epoch": 1.02, + "grad_norm": 3.6840199879397773, + "learning_rate": 1.8468765243676854e-06, + "loss": 0.6908, + "step": 6810 + }, + { + "epoch": 1.02, + "grad_norm": 2.113101731279391, + "learning_rate": 1.8468251453358792e-06, + "loss": 0.6816, + "step": 6811 + }, + { + "epoch": 1.02, + "grad_norm": 2.1883197005175115, + "learning_rate": 1.8467737584005792e-06, + "loss": 0.6595, + "step": 6812 + }, + { + "epoch": 1.02, + "grad_norm": 1.6054219936871439, + "learning_rate": 1.8467223635622647e-06, + "loss": 0.6803, + "step": 6813 + }, + { + "epoch": 1.02, + "grad_norm": 4.591804572484437, + "learning_rate": 1.8466709608214154e-06, + "loss": 0.681, + "step": 6814 + }, + { + "epoch": 1.02, + "grad_norm": 2.534461086297991, + "learning_rate": 1.8466195501785111e-06, + "loss": 0.6615, + "step": 6815 + }, + { + "epoch": 1.02, + "grad_norm": 6.223742389347049, + "learning_rate": 1.846568131634032e-06, + "loss": 0.6608, + "step": 6816 + }, + { + "epoch": 1.02, + "grad_norm": 5.341380339532581, + "learning_rate": 1.8465167051884576e-06, + "loss": 0.6576, + "step": 6817 + }, + { + "epoch": 1.02, + "grad_norm": 5.0096767939913205, + "learning_rate": 1.8464652708422676e-06, + "loss": 0.6654, + "step": 6818 + }, + { + "epoch": 1.02, + "grad_norm": 1.5056742475524838, + "learning_rate": 1.8464138285959425e-06, + "loss": 0.6719, + "step": 6819 + }, + { + "epoch": 1.02, + "grad_norm": 2.7821469951168276, + "learning_rate": 1.8463623784499625e-06, + "loss": 0.6738, + "step": 6820 + }, + { + "epoch": 1.02, + "grad_norm": 0.8871329757752502, + "learning_rate": 1.8463109204048072e-06, + "loss": 0.6602, + "step": 6821 + }, + { + "epoch": 1.02, + "grad_norm": 7.370536852511089, + "learning_rate": 1.8462594544609575e-06, + "loss": 0.6797, + "step": 6822 + }, + { + "epoch": 1.02, + "grad_norm": 6.793526410445219, + "learning_rate": 1.8462079806188937e-06, + "loss": 0.7018, + "step": 6823 + }, + { + "epoch": 1.02, + "grad_norm": 4.54626507929667, + "learning_rate": 1.8461564988790959e-06, + "loss": 0.6784, + "step": 6824 + }, + { + "epoch": 1.02, + "grad_norm": 4.146513123512455, + "learning_rate": 1.8461050092420442e-06, + "loss": 0.7233, + "step": 6825 + }, + { + "epoch": 1.02, + "grad_norm": 5.5671859259583245, + "learning_rate": 1.8460535117082201e-06, + "loss": 0.7148, + "step": 6826 + }, + { + "epoch": 1.02, + "grad_norm": 4.743574757752061, + "learning_rate": 1.8460020062781034e-06, + "loss": 0.681, + "step": 6827 + }, + { + "epoch": 1.02, + "grad_norm": 7.26795205504625, + "learning_rate": 1.8459504929521753e-06, + "loss": 0.6875, + "step": 6828 + }, + { + "epoch": 1.02, + "grad_norm": 1.5567677934238149, + "learning_rate": 1.8458989717309165e-06, + "loss": 0.6992, + "step": 6829 + }, + { + "epoch": 1.02, + "grad_norm": 5.83705093921404, + "learning_rate": 1.8458474426148078e-06, + "loss": 0.6836, + "step": 6830 + }, + { + "epoch": 1.02, + "grad_norm": 2.5240174576706944, + "learning_rate": 1.8457959056043299e-06, + "loss": 0.6908, + "step": 6831 + }, + { + "epoch": 1.02, + "grad_norm": 4.5703641450804815, + "learning_rate": 1.8457443606999643e-06, + "loss": 0.6999, + "step": 6832 + }, + { + "epoch": 1.02, + "grad_norm": 8.486355979846582, + "learning_rate": 1.8456928079021918e-06, + "loss": 0.7135, + "step": 6833 + }, + { + "epoch": 1.02, + "grad_norm": 9.784229076067088, + "learning_rate": 1.8456412472114935e-06, + "loss": 0.7129, + "step": 6834 + }, + { + "epoch": 1.02, + "grad_norm": 7.392795874577879, + "learning_rate": 1.8455896786283502e-06, + "loss": 0.7272, + "step": 6835 + }, + { + "epoch": 1.02, + "grad_norm": 3.7605321019874602, + "learning_rate": 1.8455381021532445e-06, + "loss": 0.6855, + "step": 6836 + }, + { + "epoch": 1.02, + "grad_norm": 3.8535966518411713, + "learning_rate": 1.845486517786656e-06, + "loss": 0.6888, + "step": 6837 + }, + { + "epoch": 1.02, + "grad_norm": 0.5845611958135027, + "learning_rate": 1.8454349255290675e-06, + "loss": 0.6862, + "step": 6838 + }, + { + "epoch": 1.02, + "grad_norm": 2.8199569647728384, + "learning_rate": 1.8453833253809604e-06, + "loss": 0.6686, + "step": 6839 + }, + { + "epoch": 1.02, + "grad_norm": 0.602976990987608, + "learning_rate": 1.8453317173428153e-06, + "loss": 0.6764, + "step": 6840 + }, + { + "epoch": 1.02, + "grad_norm": 1.38774768559328, + "learning_rate": 1.8452801014151152e-06, + "loss": 0.6569, + "step": 6841 + }, + { + "epoch": 1.02, + "grad_norm": 2.6857888398945486, + "learning_rate": 1.8452284775983405e-06, + "loss": 0.6966, + "step": 6842 + }, + { + "epoch": 1.02, + "grad_norm": 3.1336735946827443, + "learning_rate": 1.8451768458929742e-06, + "loss": 0.6992, + "step": 6843 + }, + { + "epoch": 1.02, + "grad_norm": 1.726299019781025, + "learning_rate": 1.8451252062994974e-06, + "loss": 0.6569, + "step": 6844 + }, + { + "epoch": 1.02, + "grad_norm": 4.48094486264641, + "learning_rate": 1.8450735588183921e-06, + "loss": 0.696, + "step": 6845 + }, + { + "epoch": 1.02, + "grad_norm": 1.3898627905067968, + "learning_rate": 1.845021903450141e-06, + "loss": 0.6719, + "step": 6846 + }, + { + "epoch": 1.02, + "grad_norm": 7.459733471666695, + "learning_rate": 1.8449702401952254e-06, + "loss": 0.7025, + "step": 6847 + }, + { + "epoch": 1.02, + "grad_norm": 3.7739061747570144, + "learning_rate": 1.844918569054128e-06, + "loss": 0.6966, + "step": 6848 + }, + { + "epoch": 1.02, + "grad_norm": 3.276687393683117, + "learning_rate": 1.844866890027331e-06, + "loss": 0.6875, + "step": 6849 + }, + { + "epoch": 1.02, + "grad_norm": 6.344548947354527, + "learning_rate": 1.8448152031153162e-06, + "loss": 0.679, + "step": 6850 + }, + { + "epoch": 1.02, + "grad_norm": 3.5871306742574416, + "learning_rate": 1.8447635083185666e-06, + "loss": 0.696, + "step": 6851 + }, + { + "epoch": 1.02, + "grad_norm": 0.5492273559652694, + "learning_rate": 1.8447118056375645e-06, + "loss": 0.6803, + "step": 6852 + }, + { + "epoch": 1.02, + "grad_norm": 3.5077825969968197, + "learning_rate": 1.8446600950727924e-06, + "loss": 0.6947, + "step": 6853 + }, + { + "epoch": 1.02, + "grad_norm": 5.111806280403876, + "learning_rate": 1.8446083766247332e-06, + "loss": 0.681, + "step": 6854 + }, + { + "epoch": 1.02, + "grad_norm": 0.519155202004483, + "learning_rate": 1.8445566502938687e-06, + "loss": 0.6745, + "step": 6855 + }, + { + "epoch": 1.02, + "grad_norm": 2.493245256524762, + "learning_rate": 1.844504916080683e-06, + "loss": 0.6758, + "step": 6856 + }, + { + "epoch": 1.02, + "grad_norm": 0.7329255473633075, + "learning_rate": 1.8444531739856579e-06, + "loss": 0.6816, + "step": 6857 + }, + { + "epoch": 1.02, + "grad_norm": 3.636959941096662, + "learning_rate": 1.8444014240092764e-06, + "loss": 0.6777, + "step": 6858 + }, + { + "epoch": 1.02, + "grad_norm": 6.147229976036517, + "learning_rate": 1.8443496661520221e-06, + "loss": 0.6549, + "step": 6859 + }, + { + "epoch": 1.02, + "grad_norm": 2.825591211529964, + "learning_rate": 1.8442979004143775e-06, + "loss": 0.6849, + "step": 6860 + }, + { + "epoch": 1.02, + "grad_norm": 2.06005844833952, + "learning_rate": 1.8442461267968258e-06, + "loss": 0.6751, + "step": 6861 + }, + { + "epoch": 1.02, + "grad_norm": 2.1582273944328443, + "learning_rate": 1.8441943452998507e-06, + "loss": 0.6875, + "step": 6862 + }, + { + "epoch": 1.02, + "grad_norm": 7.3410479180595525, + "learning_rate": 1.844142555923935e-06, + "loss": 0.6738, + "step": 6863 + }, + { + "epoch": 1.02, + "grad_norm": 3.84869403029646, + "learning_rate": 1.8440907586695622e-06, + "loss": 0.681, + "step": 6864 + }, + { + "epoch": 1.02, + "grad_norm": 2.5669986497674095, + "learning_rate": 1.8440389535372156e-06, + "loss": 0.6712, + "step": 6865 + }, + { + "epoch": 1.02, + "grad_norm": 1.48592874865449, + "learning_rate": 1.8439871405273788e-06, + "loss": 0.6458, + "step": 6866 + }, + { + "epoch": 1.02, + "grad_norm": 2.0473770861850884, + "learning_rate": 1.8439353196405354e-06, + "loss": 0.6621, + "step": 6867 + }, + { + "epoch": 1.02, + "grad_norm": 2.5760914234304084, + "learning_rate": 1.843883490877169e-06, + "loss": 0.653, + "step": 6868 + }, + { + "epoch": 1.02, + "grad_norm": 1.0546314329624293, + "learning_rate": 1.8438316542377637e-06, + "loss": 0.6816, + "step": 6869 + }, + { + "epoch": 1.02, + "grad_norm": 4.006537285587918, + "learning_rate": 1.8437798097228026e-06, + "loss": 0.6699, + "step": 6870 + }, + { + "epoch": 1.02, + "grad_norm": 2.3865864664023255, + "learning_rate": 1.8437279573327697e-06, + "loss": 0.679, + "step": 6871 + }, + { + "epoch": 1.02, + "grad_norm": 1.9350968381060383, + "learning_rate": 1.8436760970681496e-06, + "loss": 0.7005, + "step": 6872 + }, + { + "epoch": 1.03, + "grad_norm": 0.8416284279218665, + "learning_rate": 1.8436242289294258e-06, + "loss": 0.6777, + "step": 6873 + }, + { + "epoch": 1.03, + "grad_norm": 0.9135277141268533, + "learning_rate": 1.843572352917082e-06, + "loss": 0.6712, + "step": 6874 + }, + { + "epoch": 1.03, + "grad_norm": 2.8085342249713965, + "learning_rate": 1.8435204690316035e-06, + "loss": 0.6927, + "step": 6875 + }, + { + "epoch": 1.03, + "grad_norm": 7.298518372166924, + "learning_rate": 1.8434685772734735e-06, + "loss": 0.681, + "step": 6876 + }, + { + "epoch": 1.03, + "grad_norm": 1.8455938241087184, + "learning_rate": 1.8434166776431765e-06, + "loss": 0.6868, + "step": 6877 + }, + { + "epoch": 1.03, + "grad_norm": 0.8962717913764853, + "learning_rate": 1.8433647701411973e-06, + "loss": 0.7005, + "step": 6878 + }, + { + "epoch": 1.03, + "grad_norm": 2.732344821712517, + "learning_rate": 1.8433128547680203e-06, + "loss": 0.6706, + "step": 6879 + }, + { + "epoch": 1.03, + "grad_norm": 1.037623592887603, + "learning_rate": 1.8432609315241296e-06, + "loss": 0.6901, + "step": 6880 + }, + { + "epoch": 1.03, + "grad_norm": 1.5777038483573511, + "learning_rate": 1.8432090004100101e-06, + "loss": 0.666, + "step": 6881 + }, + { + "epoch": 1.03, + "grad_norm": 3.1971842647950766, + "learning_rate": 1.8431570614261465e-06, + "loss": 0.6979, + "step": 6882 + }, + { + "epoch": 1.03, + "grad_norm": 5.1466900255531325, + "learning_rate": 1.8431051145730234e-06, + "loss": 0.7044, + "step": 6883 + }, + { + "epoch": 1.03, + "grad_norm": 1.3437441223671833, + "learning_rate": 1.8430531598511256e-06, + "loss": 0.7038, + "step": 6884 + }, + { + "epoch": 1.03, + "grad_norm": 1.9387235880536544, + "learning_rate": 1.8430011972609382e-06, + "loss": 0.6758, + "step": 6885 + }, + { + "epoch": 1.03, + "grad_norm": 1.7270623785115673, + "learning_rate": 1.842949226802946e-06, + "loss": 0.6719, + "step": 6886 + }, + { + "epoch": 1.03, + "grad_norm": 1.447833666845079, + "learning_rate": 1.8428972484776344e-06, + "loss": 0.6829, + "step": 6887 + }, + { + "epoch": 1.03, + "grad_norm": 2.959904541970192, + "learning_rate": 1.842845262285488e-06, + "loss": 0.6719, + "step": 6888 + }, + { + "epoch": 1.03, + "grad_norm": 4.186131468049194, + "learning_rate": 1.8427932682269924e-06, + "loss": 0.6953, + "step": 6889 + }, + { + "epoch": 1.03, + "grad_norm": 0.7545612609435365, + "learning_rate": 1.8427412663026325e-06, + "loss": 0.6842, + "step": 6890 + }, + { + "epoch": 1.03, + "grad_norm": 0.6868139495211089, + "learning_rate": 1.842689256512894e-06, + "loss": 0.6803, + "step": 6891 + }, + { + "epoch": 1.03, + "grad_norm": 0.9244783126988934, + "learning_rate": 1.8426372388582618e-06, + "loss": 0.7064, + "step": 6892 + }, + { + "epoch": 1.03, + "grad_norm": 2.4867175131117807, + "learning_rate": 1.842585213339222e-06, + "loss": 0.6725, + "step": 6893 + }, + { + "epoch": 1.03, + "grad_norm": 0.8925272278774259, + "learning_rate": 1.84253317995626e-06, + "loss": 0.6458, + "step": 6894 + }, + { + "epoch": 1.03, + "grad_norm": 0.5775900129256245, + "learning_rate": 1.8424811387098612e-06, + "loss": 0.6784, + "step": 6895 + }, + { + "epoch": 1.03, + "grad_norm": 1.8617580796913886, + "learning_rate": 1.8424290896005115e-06, + "loss": 0.6738, + "step": 6896 + }, + { + "epoch": 1.03, + "grad_norm": 3.2349797975031005, + "learning_rate": 1.8423770326286965e-06, + "loss": 0.6836, + "step": 6897 + }, + { + "epoch": 1.03, + "grad_norm": 0.46622570146322917, + "learning_rate": 1.8423249677949023e-06, + "loss": 0.681, + "step": 6898 + }, + { + "epoch": 1.03, + "grad_norm": 0.9244993868671156, + "learning_rate": 1.8422728950996144e-06, + "loss": 0.6536, + "step": 6899 + }, + { + "epoch": 1.03, + "grad_norm": 4.828594571400317, + "learning_rate": 1.8422208145433194e-06, + "loss": 0.6589, + "step": 6900 + }, + { + "epoch": 1.03, + "grad_norm": 0.6155074860157153, + "learning_rate": 1.8421687261265029e-06, + "loss": 0.6706, + "step": 6901 + }, + { + "epoch": 1.03, + "grad_norm": 1.0112566083522554, + "learning_rate": 1.8421166298496513e-06, + "loss": 0.6745, + "step": 6902 + }, + { + "epoch": 1.03, + "grad_norm": 0.7183687606726806, + "learning_rate": 1.8420645257132505e-06, + "loss": 0.6836, + "step": 6903 + }, + { + "epoch": 1.03, + "grad_norm": 0.5455898485403436, + "learning_rate": 1.842012413717787e-06, + "loss": 0.6738, + "step": 6904 + }, + { + "epoch": 1.03, + "grad_norm": 4.010068654465921, + "learning_rate": 1.8419602938637474e-06, + "loss": 0.6576, + "step": 6905 + }, + { + "epoch": 1.03, + "grad_norm": 2.3982428409225247, + "learning_rate": 1.841908166151618e-06, + "loss": 0.6829, + "step": 6906 + }, + { + "epoch": 1.03, + "grad_norm": 2.092633869139048, + "learning_rate": 1.841856030581885e-06, + "loss": 0.6725, + "step": 6907 + }, + { + "epoch": 1.03, + "grad_norm": 6.446780600503024, + "learning_rate": 1.8418038871550353e-06, + "loss": 0.668, + "step": 6908 + }, + { + "epoch": 1.03, + "grad_norm": 0.7290841500291892, + "learning_rate": 1.8417517358715554e-06, + "loss": 0.7018, + "step": 6909 + }, + { + "epoch": 1.03, + "grad_norm": 2.482568988596345, + "learning_rate": 1.8416995767319324e-06, + "loss": 0.6641, + "step": 6910 + }, + { + "epoch": 1.03, + "grad_norm": 1.736489909309705, + "learning_rate": 1.8416474097366525e-06, + "loss": 0.6745, + "step": 6911 + }, + { + "epoch": 1.03, + "grad_norm": 1.5301430403258154, + "learning_rate": 1.8415952348862029e-06, + "loss": 0.6628, + "step": 6912 + }, + { + "epoch": 1.03, + "grad_norm": 1.8495765900201016, + "learning_rate": 1.8415430521810706e-06, + "loss": 0.679, + "step": 6913 + }, + { + "epoch": 1.03, + "grad_norm": 0.7872846333844105, + "learning_rate": 1.8414908616217426e-06, + "loss": 0.6816, + "step": 6914 + }, + { + "epoch": 1.03, + "grad_norm": 1.7907305768924149, + "learning_rate": 1.8414386632087062e-06, + "loss": 0.7057, + "step": 6915 + }, + { + "epoch": 1.03, + "grad_norm": 5.441456922668186, + "learning_rate": 1.8413864569424479e-06, + "loss": 0.6634, + "step": 6916 + }, + { + "epoch": 1.03, + "grad_norm": 4.721025016187932, + "learning_rate": 1.8413342428234558e-06, + "loss": 0.6686, + "step": 6917 + }, + { + "epoch": 1.03, + "grad_norm": 1.6620451435087231, + "learning_rate": 1.8412820208522164e-06, + "loss": 0.6771, + "step": 6918 + }, + { + "epoch": 1.03, + "grad_norm": 6.4200338184336925, + "learning_rate": 1.8412297910292178e-06, + "loss": 0.6888, + "step": 6919 + }, + { + "epoch": 1.03, + "grad_norm": 5.0996543466124225, + "learning_rate": 1.841177553354947e-06, + "loss": 0.6582, + "step": 6920 + }, + { + "epoch": 1.03, + "grad_norm": 3.4004504670486972, + "learning_rate": 1.8411253078298918e-06, + "loss": 0.6745, + "step": 6921 + }, + { + "epoch": 1.03, + "grad_norm": 4.746607041329221, + "learning_rate": 1.8410730544545398e-06, + "loss": 0.6602, + "step": 6922 + }, + { + "epoch": 1.03, + "grad_norm": 2.9535937351691386, + "learning_rate": 1.8410207932293784e-06, + "loss": 0.6484, + "step": 6923 + }, + { + "epoch": 1.03, + "grad_norm": 2.879216796409624, + "learning_rate": 1.8409685241548958e-06, + "loss": 0.6745, + "step": 6924 + }, + { + "epoch": 1.03, + "grad_norm": 1.2062075838555837, + "learning_rate": 1.8409162472315796e-06, + "loss": 0.7025, + "step": 6925 + }, + { + "epoch": 1.03, + "grad_norm": 1.0314565775203697, + "learning_rate": 1.8408639624599175e-06, + "loss": 0.6855, + "step": 6926 + }, + { + "epoch": 1.03, + "grad_norm": 1.5363197379065352, + "learning_rate": 1.8408116698403976e-06, + "loss": 0.6953, + "step": 6927 + }, + { + "epoch": 1.03, + "grad_norm": 1.1033920491683755, + "learning_rate": 1.8407593693735082e-06, + "loss": 0.6816, + "step": 6928 + }, + { + "epoch": 1.03, + "grad_norm": 5.2546460950721, + "learning_rate": 1.8407070610597373e-06, + "loss": 0.7005, + "step": 6929 + }, + { + "epoch": 1.03, + "grad_norm": 1.3052991027655403, + "learning_rate": 1.8406547448995729e-06, + "loss": 0.6751, + "step": 6930 + }, + { + "epoch": 1.03, + "grad_norm": 1.0937849344584176, + "learning_rate": 1.8406024208935036e-06, + "loss": 0.6602, + "step": 6931 + }, + { + "epoch": 1.03, + "grad_norm": 1.2059286828470344, + "learning_rate": 1.8405500890420174e-06, + "loss": 0.6777, + "step": 6932 + }, + { + "epoch": 1.03, + "grad_norm": 2.0978159782057326, + "learning_rate": 1.8404977493456028e-06, + "loss": 0.6882, + "step": 6933 + }, + { + "epoch": 1.03, + "grad_norm": 1.5986559514920609, + "learning_rate": 1.8404454018047485e-06, + "loss": 0.6745, + "step": 6934 + }, + { + "epoch": 1.03, + "grad_norm": 6.45814365090293, + "learning_rate": 1.840393046419943e-06, + "loss": 0.6999, + "step": 6935 + }, + { + "epoch": 1.03, + "grad_norm": 3.7208103549842235, + "learning_rate": 1.840340683191675e-06, + "loss": 0.6875, + "step": 6936 + }, + { + "epoch": 1.03, + "grad_norm": 4.751942688760206, + "learning_rate": 1.8402883121204328e-06, + "loss": 0.6803, + "step": 6937 + }, + { + "epoch": 1.03, + "grad_norm": 2.4289532171646524, + "learning_rate": 1.8402359332067055e-06, + "loss": 0.6615, + "step": 6938 + }, + { + "epoch": 1.03, + "grad_norm": 0.733425970956521, + "learning_rate": 1.8401835464509822e-06, + "loss": 0.6842, + "step": 6939 + }, + { + "epoch": 1.04, + "grad_norm": 4.270432978966753, + "learning_rate": 1.8401311518537512e-06, + "loss": 0.679, + "step": 6940 + }, + { + "epoch": 1.04, + "grad_norm": 1.3152279964866898, + "learning_rate": 1.8400787494155021e-06, + "loss": 0.6764, + "step": 6941 + }, + { + "epoch": 1.04, + "grad_norm": 0.6473599619457052, + "learning_rate": 1.8400263391367237e-06, + "loss": 0.6777, + "step": 6942 + }, + { + "epoch": 1.04, + "grad_norm": 2.364327521105375, + "learning_rate": 1.8399739210179051e-06, + "loss": 0.6986, + "step": 6943 + }, + { + "epoch": 1.04, + "grad_norm": 1.0384858046276955, + "learning_rate": 1.839921495059536e-06, + "loss": 0.6712, + "step": 6944 + }, + { + "epoch": 1.04, + "grad_norm": 2.5241656680404483, + "learning_rate": 1.8398690612621049e-06, + "loss": 0.6868, + "step": 6945 + }, + { + "epoch": 1.04, + "grad_norm": 2.1875306119866975, + "learning_rate": 1.8398166196261013e-06, + "loss": 0.6921, + "step": 6946 + }, + { + "epoch": 1.04, + "grad_norm": 1.9611383852443112, + "learning_rate": 1.8397641701520155e-06, + "loss": 0.6732, + "step": 6947 + }, + { + "epoch": 1.04, + "grad_norm": 0.6943692075029656, + "learning_rate": 1.8397117128403364e-06, + "loss": 0.6823, + "step": 6948 + }, + { + "epoch": 1.04, + "grad_norm": 4.043145169576151, + "learning_rate": 1.839659247691553e-06, + "loss": 0.6888, + "step": 6949 + }, + { + "epoch": 1.04, + "grad_norm": 3.203874719225912, + "learning_rate": 1.8396067747061563e-06, + "loss": 0.6771, + "step": 6950 + }, + { + "epoch": 1.04, + "grad_norm": 1.9771288780094791, + "learning_rate": 1.839554293884635e-06, + "loss": 0.666, + "step": 6951 + }, + { + "epoch": 1.04, + "grad_norm": 5.76779142078125, + "learning_rate": 1.8395018052274794e-06, + "loss": 0.6842, + "step": 6952 + }, + { + "epoch": 1.04, + "grad_norm": 1.2628157201539942, + "learning_rate": 1.839449308735179e-06, + "loss": 0.6908, + "step": 6953 + }, + { + "epoch": 1.04, + "grad_norm": 3.653751092667478, + "learning_rate": 1.839396804408224e-06, + "loss": 0.668, + "step": 6954 + }, + { + "epoch": 1.04, + "grad_norm": 1.8261520290584325, + "learning_rate": 1.8393442922471046e-06, + "loss": 0.6712, + "step": 6955 + }, + { + "epoch": 1.04, + "grad_norm": 5.384425379013841, + "learning_rate": 1.8392917722523105e-06, + "loss": 0.6771, + "step": 6956 + }, + { + "epoch": 1.04, + "grad_norm": 1.2142774320123755, + "learning_rate": 1.839239244424332e-06, + "loss": 0.6725, + "step": 6957 + }, + { + "epoch": 1.04, + "grad_norm": 4.769118311425131, + "learning_rate": 1.8391867087636595e-06, + "loss": 0.6829, + "step": 6958 + }, + { + "epoch": 1.04, + "grad_norm": 1.9249647672719445, + "learning_rate": 1.8391341652707831e-06, + "loss": 0.6771, + "step": 6959 + }, + { + "epoch": 1.04, + "grad_norm": 5.3117360219265475, + "learning_rate": 1.8390816139461936e-06, + "loss": 0.6589, + "step": 6960 + }, + { + "epoch": 1.04, + "grad_norm": 1.1455509043009917, + "learning_rate": 1.839029054790381e-06, + "loss": 0.6738, + "step": 6961 + }, + { + "epoch": 1.04, + "grad_norm": 2.1397213511152007, + "learning_rate": 1.838976487803836e-06, + "loss": 0.6699, + "step": 6962 + }, + { + "epoch": 1.04, + "grad_norm": 0.8229949503959654, + "learning_rate": 1.8389239129870496e-06, + "loss": 0.6895, + "step": 6963 + }, + { + "epoch": 1.04, + "grad_norm": 1.3713543696825754, + "learning_rate": 1.838871330340512e-06, + "loss": 0.6771, + "step": 6964 + }, + { + "epoch": 1.04, + "grad_norm": 4.7191832740174045, + "learning_rate": 1.8388187398647137e-06, + "loss": 0.6979, + "step": 6965 + }, + { + "epoch": 1.04, + "grad_norm": 2.9399288275359434, + "learning_rate": 1.8387661415601462e-06, + "loss": 0.6862, + "step": 6966 + }, + { + "epoch": 1.04, + "grad_norm": 4.124261985915034, + "learning_rate": 1.8387135354273001e-06, + "loss": 0.7038, + "step": 6967 + }, + { + "epoch": 1.04, + "grad_norm": 0.6263400972907583, + "learning_rate": 1.8386609214666663e-06, + "loss": 0.6895, + "step": 6968 + }, + { + "epoch": 1.04, + "grad_norm": 1.1864622125295454, + "learning_rate": 1.8386082996787362e-06, + "loss": 0.6667, + "step": 6969 + }, + { + "epoch": 1.04, + "grad_norm": 4.077218208657157, + "learning_rate": 1.8385556700640005e-06, + "loss": 0.6816, + "step": 6970 + }, + { + "epoch": 1.04, + "grad_norm": 2.595069878526633, + "learning_rate": 1.8385030326229506e-06, + "loss": 0.6882, + "step": 6971 + }, + { + "epoch": 1.04, + "grad_norm": 6.332737378531102, + "learning_rate": 1.8384503873560778e-06, + "loss": 0.668, + "step": 6972 + }, + { + "epoch": 1.04, + "grad_norm": 3.337600198430594, + "learning_rate": 1.8383977342638735e-06, + "loss": 0.6901, + "step": 6973 + }, + { + "epoch": 1.04, + "grad_norm": 2.134323736464271, + "learning_rate": 1.8383450733468286e-06, + "loss": 0.6803, + "step": 6974 + }, + { + "epoch": 1.04, + "grad_norm": 2.4152988419774273, + "learning_rate": 1.8382924046054352e-06, + "loss": 0.6523, + "step": 6975 + }, + { + "epoch": 1.04, + "grad_norm": 2.193997142127918, + "learning_rate": 1.8382397280401849e-06, + "loss": 0.6706, + "step": 6976 + }, + { + "epoch": 1.04, + "grad_norm": 1.0256347334423845, + "learning_rate": 1.838187043651569e-06, + "loss": 0.6615, + "step": 6977 + }, + { + "epoch": 1.04, + "grad_norm": 2.039229828522437, + "learning_rate": 1.838134351440079e-06, + "loss": 0.6934, + "step": 6978 + }, + { + "epoch": 1.04, + "grad_norm": 1.9066970600833497, + "learning_rate": 1.8380816514062072e-06, + "loss": 0.6706, + "step": 6979 + }, + { + "epoch": 1.04, + "grad_norm": 0.6945582231200902, + "learning_rate": 1.8380289435504453e-06, + "loss": 0.6758, + "step": 6980 + }, + { + "epoch": 1.04, + "grad_norm": 2.981273892688787, + "learning_rate": 1.837976227873285e-06, + "loss": 0.6797, + "step": 6981 + }, + { + "epoch": 1.04, + "grad_norm": 0.7763977050198088, + "learning_rate": 1.8379235043752184e-06, + "loss": 0.6966, + "step": 6982 + }, + { + "epoch": 1.04, + "grad_norm": 5.091701919774396, + "learning_rate": 1.8378707730567378e-06, + "loss": 0.6595, + "step": 6983 + }, + { + "epoch": 1.04, + "grad_norm": 1.5979210897815055, + "learning_rate": 1.8378180339183352e-06, + "loss": 0.6862, + "step": 6984 + }, + { + "epoch": 1.04, + "grad_norm": 2.596255551888981, + "learning_rate": 1.8377652869605024e-06, + "loss": 0.6836, + "step": 6985 + }, + { + "epoch": 1.04, + "grad_norm": 0.6734389724006666, + "learning_rate": 1.8377125321837324e-06, + "loss": 0.6504, + "step": 6986 + }, + { + "epoch": 1.04, + "grad_norm": 1.4666357107382213, + "learning_rate": 1.8376597695885173e-06, + "loss": 0.6803, + "step": 6987 + }, + { + "epoch": 1.04, + "grad_norm": 1.2334569889172549, + "learning_rate": 1.8376069991753497e-06, + "loss": 0.6836, + "step": 6988 + }, + { + "epoch": 1.04, + "grad_norm": 4.547190201995633, + "learning_rate": 1.8375542209447214e-06, + "loss": 0.7292, + "step": 6989 + }, + { + "epoch": 1.04, + "grad_norm": 6.344378800543547, + "learning_rate": 1.8375014348971257e-06, + "loss": 0.6979, + "step": 6990 + }, + { + "epoch": 1.04, + "grad_norm": 2.0010954992925227, + "learning_rate": 1.8374486410330552e-06, + "loss": 0.6934, + "step": 6991 + }, + { + "epoch": 1.04, + "grad_norm": 1.3691443768000584, + "learning_rate": 1.8373958393530023e-06, + "loss": 0.6615, + "step": 6992 + }, + { + "epoch": 1.04, + "grad_norm": 0.5633820955065174, + "learning_rate": 1.83734302985746e-06, + "loss": 0.6777, + "step": 6993 + }, + { + "epoch": 1.04, + "grad_norm": 2.8846716164747783, + "learning_rate": 1.8372902125469212e-06, + "loss": 0.6764, + "step": 6994 + }, + { + "epoch": 1.04, + "grad_norm": 1.7361719144036136, + "learning_rate": 1.8372373874218785e-06, + "loss": 0.6686, + "step": 6995 + }, + { + "epoch": 1.04, + "grad_norm": 1.113671541942443, + "learning_rate": 1.8371845544828256e-06, + "loss": 0.6849, + "step": 6996 + }, + { + "epoch": 1.04, + "grad_norm": 0.9394454652473033, + "learning_rate": 1.8371317137302552e-06, + "loss": 0.6582, + "step": 6997 + }, + { + "epoch": 1.04, + "grad_norm": 3.405104783736042, + "learning_rate": 1.8370788651646603e-06, + "loss": 0.6536, + "step": 6998 + }, + { + "epoch": 1.04, + "grad_norm": 0.9202913370748615, + "learning_rate": 1.8370260087865343e-06, + "loss": 0.6882, + "step": 6999 + }, + { + "epoch": 1.04, + "grad_norm": 3.4779870369863732, + "learning_rate": 1.8369731445963708e-06, + "loss": 0.6771, + "step": 7000 + }, + { + "epoch": 1.04, + "grad_norm": 0.7515071177596435, + "learning_rate": 1.8369202725946629e-06, + "loss": 0.7129, + "step": 7001 + }, + { + "epoch": 1.04, + "grad_norm": 1.5627329832001353, + "learning_rate": 1.8368673927819038e-06, + "loss": 0.6647, + "step": 7002 + }, + { + "epoch": 1.04, + "grad_norm": 1.317286729163837, + "learning_rate": 1.8368145051585876e-06, + "loss": 0.6745, + "step": 7003 + }, + { + "epoch": 1.04, + "grad_norm": 0.977568786908681, + "learning_rate": 1.8367616097252075e-06, + "loss": 0.7083, + "step": 7004 + }, + { + "epoch": 1.04, + "grad_norm": 4.853377848605319, + "learning_rate": 1.8367087064822573e-06, + "loss": 0.6595, + "step": 7005 + }, + { + "epoch": 1.04, + "grad_norm": 2.2250366702646063, + "learning_rate": 1.8366557954302307e-06, + "loss": 0.6836, + "step": 7006 + }, + { + "epoch": 1.05, + "grad_norm": 1.0299108540073463, + "learning_rate": 1.8366028765696214e-06, + "loss": 0.6732, + "step": 7007 + }, + { + "epoch": 1.05, + "grad_norm": 2.888239416443442, + "learning_rate": 1.8365499499009239e-06, + "loss": 0.6927, + "step": 7008 + }, + { + "epoch": 1.05, + "grad_norm": 3.433977631771741, + "learning_rate": 1.8364970154246314e-06, + "loss": 0.6634, + "step": 7009 + }, + { + "epoch": 1.05, + "grad_norm": 1.3760901383260116, + "learning_rate": 1.8364440731412385e-06, + "loss": 0.6947, + "step": 7010 + }, + { + "epoch": 1.05, + "grad_norm": 3.64617304957815, + "learning_rate": 1.8363911230512387e-06, + "loss": 0.6966, + "step": 7011 + }, + { + "epoch": 1.05, + "grad_norm": 1.4230207030324622, + "learning_rate": 1.8363381651551268e-06, + "loss": 0.6842, + "step": 7012 + }, + { + "epoch": 1.05, + "grad_norm": 0.5803123828959674, + "learning_rate": 1.836285199453397e-06, + "loss": 0.6621, + "step": 7013 + }, + { + "epoch": 1.05, + "grad_norm": 2.392937707968575, + "learning_rate": 1.8362322259465434e-06, + "loss": 0.681, + "step": 7014 + }, + { + "epoch": 1.05, + "grad_norm": 0.7102843290360398, + "learning_rate": 1.8361792446350606e-06, + "loss": 0.6497, + "step": 7015 + }, + { + "epoch": 1.05, + "grad_norm": 1.272671658516419, + "learning_rate": 1.8361262555194427e-06, + "loss": 0.6615, + "step": 7016 + }, + { + "epoch": 1.05, + "grad_norm": 2.0379674039137496, + "learning_rate": 1.8360732586001847e-06, + "loss": 0.6803, + "step": 7017 + }, + { + "epoch": 1.05, + "grad_norm": 4.765639151578921, + "learning_rate": 1.8360202538777808e-06, + "loss": 0.6829, + "step": 7018 + }, + { + "epoch": 1.05, + "grad_norm": 2.1939243796631627, + "learning_rate": 1.8359672413527263e-06, + "loss": 0.6686, + "step": 7019 + }, + { + "epoch": 1.05, + "grad_norm": 1.1868010972083478, + "learning_rate": 1.8359142210255155e-06, + "loss": 0.668, + "step": 7020 + }, + { + "epoch": 1.05, + "grad_norm": 0.7580533381631278, + "learning_rate": 1.8358611928966434e-06, + "loss": 0.6836, + "step": 7021 + }, + { + "epoch": 1.05, + "grad_norm": 0.5991755028224198, + "learning_rate": 1.8358081569666046e-06, + "loss": 0.6849, + "step": 7022 + }, + { + "epoch": 1.05, + "grad_norm": 3.5383124683610108, + "learning_rate": 1.8357551132358946e-06, + "loss": 0.6686, + "step": 7023 + }, + { + "epoch": 1.05, + "grad_norm": 0.9656800938949044, + "learning_rate": 1.835702061705008e-06, + "loss": 0.6823, + "step": 7024 + }, + { + "epoch": 1.05, + "grad_norm": 3.4750192350790954, + "learning_rate": 1.8356490023744402e-06, + "loss": 0.6764, + "step": 7025 + }, + { + "epoch": 1.05, + "grad_norm": 4.382053122914745, + "learning_rate": 1.8355959352446867e-06, + "loss": 0.6823, + "step": 7026 + }, + { + "epoch": 1.05, + "grad_norm": 2.8408335095165076, + "learning_rate": 1.835542860316242e-06, + "loss": 0.6908, + "step": 7027 + }, + { + "epoch": 1.05, + "grad_norm": 3.343764557749404, + "learning_rate": 1.835489777589602e-06, + "loss": 0.6803, + "step": 7028 + }, + { + "epoch": 1.05, + "grad_norm": 2.3736603994956686, + "learning_rate": 1.835436687065262e-06, + "loss": 0.6556, + "step": 7029 + }, + { + "epoch": 1.05, + "grad_norm": 1.4261664402564007, + "learning_rate": 1.8353835887437176e-06, + "loss": 0.6803, + "step": 7030 + }, + { + "epoch": 1.05, + "grad_norm": 0.862417439171769, + "learning_rate": 1.8353304826254642e-06, + "loss": 0.6569, + "step": 7031 + }, + { + "epoch": 1.05, + "grad_norm": 3.0600305148018583, + "learning_rate": 1.8352773687109976e-06, + "loss": 0.6823, + "step": 7032 + }, + { + "epoch": 1.05, + "grad_norm": 4.5791622268553, + "learning_rate": 1.8352242470008138e-06, + "loss": 0.6934, + "step": 7033 + }, + { + "epoch": 1.05, + "grad_norm": 4.967024117235244, + "learning_rate": 1.835171117495408e-06, + "loss": 0.6908, + "step": 7034 + }, + { + "epoch": 1.05, + "grad_norm": 5.714683802174773, + "learning_rate": 1.835117980195276e-06, + "loss": 0.6947, + "step": 7035 + }, + { + "epoch": 1.05, + "grad_norm": 3.6159900552154034, + "learning_rate": 1.835064835100914e-06, + "loss": 0.7005, + "step": 7036 + }, + { + "epoch": 1.05, + "grad_norm": 4.350253435669832, + "learning_rate": 1.8350116822128185e-06, + "loss": 0.7109, + "step": 7037 + }, + { + "epoch": 1.05, + "grad_norm": 0.8063935848577485, + "learning_rate": 1.8349585215314848e-06, + "loss": 0.6836, + "step": 7038 + }, + { + "epoch": 1.05, + "grad_norm": 1.6775123943310326, + "learning_rate": 1.8349053530574096e-06, + "loss": 0.7083, + "step": 7039 + }, + { + "epoch": 1.05, + "grad_norm": 2.356741164768443, + "learning_rate": 1.834852176791089e-06, + "loss": 0.6868, + "step": 7040 + }, + { + "epoch": 1.05, + "grad_norm": 2.834046022470828, + "learning_rate": 1.834798992733019e-06, + "loss": 0.6673, + "step": 7041 + }, + { + "epoch": 1.05, + "grad_norm": 1.1969754518600044, + "learning_rate": 1.834745800883696e-06, + "loss": 0.6595, + "step": 7042 + }, + { + "epoch": 1.05, + "grad_norm": 0.9415839029212515, + "learning_rate": 1.834692601243617e-06, + "loss": 0.6673, + "step": 7043 + }, + { + "epoch": 1.05, + "grad_norm": 1.2085031179095451, + "learning_rate": 1.8346393938132779e-06, + "loss": 0.6901, + "step": 7044 + }, + { + "epoch": 1.05, + "grad_norm": 0.9042916590903297, + "learning_rate": 1.8345861785931755e-06, + "loss": 0.6667, + "step": 7045 + }, + { + "epoch": 1.05, + "grad_norm": 1.1537465296068696, + "learning_rate": 1.8345329555838064e-06, + "loss": 0.6478, + "step": 7046 + }, + { + "epoch": 1.05, + "grad_norm": 2.548910073912032, + "learning_rate": 1.8344797247856678e-06, + "loss": 0.6764, + "step": 7047 + }, + { + "epoch": 1.05, + "grad_norm": 6.414736194592172, + "learning_rate": 1.8344264861992557e-06, + "loss": 0.6947, + "step": 7048 + }, + { + "epoch": 1.05, + "grad_norm": 3.370108422160381, + "learning_rate": 1.8343732398250677e-06, + "loss": 0.6921, + "step": 7049 + }, + { + "epoch": 1.05, + "grad_norm": 2.2963286140406933, + "learning_rate": 1.8343199856636003e-06, + "loss": 0.6875, + "step": 7050 + }, + { + "epoch": 1.05, + "grad_norm": 3.868637484090843, + "learning_rate": 1.834266723715351e-06, + "loss": 0.6823, + "step": 7051 + }, + { + "epoch": 1.05, + "grad_norm": 5.983246709335134, + "learning_rate": 1.834213453980816e-06, + "loss": 0.6751, + "step": 7052 + }, + { + "epoch": 1.05, + "grad_norm": 1.159870374616478, + "learning_rate": 1.8341601764604934e-06, + "loss": 0.6745, + "step": 7053 + }, + { + "epoch": 1.05, + "grad_norm": 1.096160772447296, + "learning_rate": 1.8341068911548802e-06, + "loss": 0.6927, + "step": 7054 + }, + { + "epoch": 1.05, + "grad_norm": 5.248613302362108, + "learning_rate": 1.8340535980644738e-06, + "loss": 0.6823, + "step": 7055 + }, + { + "epoch": 1.05, + "grad_norm": 2.739246342192815, + "learning_rate": 1.834000297189771e-06, + "loss": 0.681, + "step": 7056 + }, + { + "epoch": 1.05, + "grad_norm": 2.3974164951541277, + "learning_rate": 1.8339469885312698e-06, + "loss": 0.6823, + "step": 7057 + }, + { + "epoch": 1.05, + "grad_norm": 1.9973841852427423, + "learning_rate": 1.8338936720894677e-06, + "loss": 0.6888, + "step": 7058 + }, + { + "epoch": 1.05, + "grad_norm": 3.703986942988171, + "learning_rate": 1.833840347864862e-06, + "loss": 0.6699, + "step": 7059 + }, + { + "epoch": 1.05, + "grad_norm": 3.167274639556455, + "learning_rate": 1.833787015857951e-06, + "loss": 0.6764, + "step": 7060 + }, + { + "epoch": 1.05, + "grad_norm": 4.098702202560682, + "learning_rate": 1.8337336760692317e-06, + "loss": 0.6888, + "step": 7061 + }, + { + "epoch": 1.05, + "grad_norm": 0.7047720948018918, + "learning_rate": 1.8336803284992022e-06, + "loss": 0.6478, + "step": 7062 + }, + { + "epoch": 1.05, + "grad_norm": 1.6449284414239116, + "learning_rate": 1.8336269731483606e-06, + "loss": 0.6549, + "step": 7063 + }, + { + "epoch": 1.05, + "grad_norm": 2.83699967647789, + "learning_rate": 1.833573610017205e-06, + "loss": 0.6836, + "step": 7064 + }, + { + "epoch": 1.05, + "grad_norm": 2.6733760496729064, + "learning_rate": 1.8335202391062327e-06, + "loss": 0.6419, + "step": 7065 + }, + { + "epoch": 1.05, + "grad_norm": 1.4561065467831493, + "learning_rate": 1.8334668604159425e-06, + "loss": 0.6745, + "step": 7066 + }, + { + "epoch": 1.05, + "grad_norm": 2.6063076575877604, + "learning_rate": 1.8334134739468326e-06, + "loss": 0.6842, + "step": 7067 + }, + { + "epoch": 1.05, + "grad_norm": 1.3693995719979304, + "learning_rate": 1.8333600796994006e-06, + "loss": 0.6686, + "step": 7068 + }, + { + "epoch": 1.05, + "grad_norm": 5.414226897615028, + "learning_rate": 1.8333066776741458e-06, + "loss": 0.6999, + "step": 7069 + }, + { + "epoch": 1.05, + "grad_norm": 3.8130650576760945, + "learning_rate": 1.8332532678715659e-06, + "loss": 0.6569, + "step": 7070 + }, + { + "epoch": 1.05, + "grad_norm": 2.043659346053398, + "learning_rate": 1.8331998502921592e-06, + "loss": 0.6803, + "step": 7071 + }, + { + "epoch": 1.05, + "grad_norm": 0.7739993252773123, + "learning_rate": 1.833146424936425e-06, + "loss": 0.6667, + "step": 7072 + }, + { + "epoch": 1.05, + "grad_norm": 1.5353832494233224, + "learning_rate": 1.8330929918048615e-06, + "loss": 0.6823, + "step": 7073 + }, + { + "epoch": 1.06, + "grad_norm": 1.0520049406899967, + "learning_rate": 1.8330395508979673e-06, + "loss": 0.6686, + "step": 7074 + }, + { + "epoch": 1.06, + "grad_norm": 2.9255779063344547, + "learning_rate": 1.8329861022162414e-06, + "loss": 0.6816, + "step": 7075 + }, + { + "epoch": 1.06, + "grad_norm": 4.241967781676939, + "learning_rate": 1.8329326457601826e-06, + "loss": 0.666, + "step": 7076 + }, + { + "epoch": 1.06, + "grad_norm": 3.3355874517721813, + "learning_rate": 1.8328791815302895e-06, + "loss": 0.6738, + "step": 7077 + }, + { + "epoch": 1.06, + "grad_norm": 1.7544157387399582, + "learning_rate": 1.8328257095270616e-06, + "loss": 0.7005, + "step": 7078 + }, + { + "epoch": 1.06, + "grad_norm": 2.4829915035580905, + "learning_rate": 1.8327722297509976e-06, + "loss": 0.6836, + "step": 7079 + }, + { + "epoch": 1.06, + "grad_norm": 2.0371745250605926, + "learning_rate": 1.8327187422025968e-06, + "loss": 0.6751, + "step": 7080 + }, + { + "epoch": 1.06, + "grad_norm": 2.613898731058296, + "learning_rate": 1.832665246882358e-06, + "loss": 0.6693, + "step": 7081 + }, + { + "epoch": 1.06, + "grad_norm": 6.421058742754259, + "learning_rate": 1.8326117437907812e-06, + "loss": 0.6849, + "step": 7082 + }, + { + "epoch": 1.06, + "grad_norm": 1.145287160892493, + "learning_rate": 1.832558232928365e-06, + "loss": 0.7109, + "step": 7083 + }, + { + "epoch": 1.06, + "grad_norm": 0.9418527697637538, + "learning_rate": 1.8325047142956098e-06, + "loss": 0.6641, + "step": 7084 + }, + { + "epoch": 1.06, + "grad_norm": 1.164228164370004, + "learning_rate": 1.832451187893014e-06, + "loss": 0.6738, + "step": 7085 + }, + { + "epoch": 1.06, + "grad_norm": 2.2808652913065455, + "learning_rate": 1.832397653721078e-06, + "loss": 0.6908, + "step": 7086 + }, + { + "epoch": 1.06, + "grad_norm": 1.9065155679666836, + "learning_rate": 1.8323441117803008e-06, + "loss": 0.6725, + "step": 7087 + }, + { + "epoch": 1.06, + "grad_norm": 0.9176741998491389, + "learning_rate": 1.8322905620711824e-06, + "loss": 0.7096, + "step": 7088 + }, + { + "epoch": 1.06, + "grad_norm": 7.438834112762495, + "learning_rate": 1.8322370045942224e-06, + "loss": 0.6999, + "step": 7089 + }, + { + "epoch": 1.06, + "grad_norm": 1.0637657947511272, + "learning_rate": 1.8321834393499211e-06, + "loss": 0.6576, + "step": 7090 + }, + { + "epoch": 1.06, + "grad_norm": 3.1275629763338513, + "learning_rate": 1.8321298663387782e-06, + "loss": 0.6712, + "step": 7091 + }, + { + "epoch": 1.06, + "grad_norm": 1.8867842510907793, + "learning_rate": 1.8320762855612934e-06, + "loss": 0.6777, + "step": 7092 + }, + { + "epoch": 1.06, + "grad_norm": 5.829661851413002, + "learning_rate": 1.8320226970179675e-06, + "loss": 0.6628, + "step": 7093 + }, + { + "epoch": 1.06, + "grad_norm": 2.433348019740995, + "learning_rate": 1.8319691007092998e-06, + "loss": 0.6374, + "step": 7094 + }, + { + "epoch": 1.06, + "grad_norm": 0.6073094696283741, + "learning_rate": 1.831915496635791e-06, + "loss": 0.6986, + "step": 7095 + }, + { + "epoch": 1.06, + "grad_norm": 2.5758318576645767, + "learning_rate": 1.8318618847979415e-06, + "loss": 0.6797, + "step": 7096 + }, + { + "epoch": 1.06, + "grad_norm": 3.8607217185433162, + "learning_rate": 1.8318082651962512e-06, + "loss": 0.6862, + "step": 7097 + }, + { + "epoch": 1.06, + "grad_norm": 0.6558368760628744, + "learning_rate": 1.8317546378312208e-06, + "loss": 0.681, + "step": 7098 + }, + { + "epoch": 1.06, + "grad_norm": 0.5878091754615791, + "learning_rate": 1.831701002703351e-06, + "loss": 0.6706, + "step": 7099 + }, + { + "epoch": 1.06, + "grad_norm": 3.780955245189495, + "learning_rate": 1.831647359813142e-06, + "loss": 0.7012, + "step": 7100 + }, + { + "epoch": 1.06, + "grad_norm": 1.3831449689587536, + "learning_rate": 1.831593709161095e-06, + "loss": 0.6706, + "step": 7101 + }, + { + "epoch": 1.06, + "grad_norm": 0.6485505473150068, + "learning_rate": 1.8315400507477098e-06, + "loss": 0.6816, + "step": 7102 + }, + { + "epoch": 1.06, + "grad_norm": 0.6992079315930506, + "learning_rate": 1.831486384573488e-06, + "loss": 0.6947, + "step": 7103 + }, + { + "epoch": 1.06, + "grad_norm": 4.315835060746847, + "learning_rate": 1.8314327106389305e-06, + "loss": 0.696, + "step": 7104 + }, + { + "epoch": 1.06, + "grad_norm": 1.068222775989483, + "learning_rate": 1.831379028944538e-06, + "loss": 0.6647, + "step": 7105 + }, + { + "epoch": 1.06, + "grad_norm": 0.7831143212259006, + "learning_rate": 1.831325339490811e-06, + "loss": 0.6595, + "step": 7106 + }, + { + "epoch": 1.06, + "grad_norm": 1.3409327893443241, + "learning_rate": 1.8312716422782516e-06, + "loss": 0.6654, + "step": 7107 + }, + { + "epoch": 1.06, + "grad_norm": 3.3547332150329514, + "learning_rate": 1.8312179373073602e-06, + "loss": 0.7025, + "step": 7108 + }, + { + "epoch": 1.06, + "grad_norm": 0.5616662622358014, + "learning_rate": 1.831164224578638e-06, + "loss": 0.679, + "step": 7109 + }, + { + "epoch": 1.06, + "grad_norm": 1.4894623158981564, + "learning_rate": 1.831110504092587e-06, + "loss": 0.6829, + "step": 7110 + }, + { + "epoch": 1.06, + "grad_norm": 0.591774693859202, + "learning_rate": 1.8310567758497082e-06, + "loss": 0.6979, + "step": 7111 + }, + { + "epoch": 1.06, + "grad_norm": 2.0140090189874584, + "learning_rate": 1.8310030398505029e-06, + "loss": 0.6647, + "step": 7112 + }, + { + "epoch": 1.06, + "grad_norm": 0.8090386604577016, + "learning_rate": 1.8309492960954727e-06, + "loss": 0.6719, + "step": 7113 + }, + { + "epoch": 1.06, + "grad_norm": 1.5246803069542283, + "learning_rate": 1.830895544585119e-06, + "loss": 0.6901, + "step": 7114 + }, + { + "epoch": 1.06, + "grad_norm": 2.214566598921988, + "learning_rate": 1.830841785319944e-06, + "loss": 0.6934, + "step": 7115 + }, + { + "epoch": 1.06, + "grad_norm": 0.719441105514477, + "learning_rate": 1.8307880183004492e-06, + "loss": 0.6576, + "step": 7116 + }, + { + "epoch": 1.06, + "grad_norm": 4.549942026861599, + "learning_rate": 1.8307342435271363e-06, + "loss": 0.6738, + "step": 7117 + }, + { + "epoch": 1.06, + "grad_norm": 3.0030333206415847, + "learning_rate": 1.830680461000507e-06, + "loss": 0.6784, + "step": 7118 + }, + { + "epoch": 1.06, + "grad_norm": 6.211669397183332, + "learning_rate": 1.8306266707210638e-06, + "loss": 0.696, + "step": 7119 + }, + { + "epoch": 1.06, + "grad_norm": 2.1889396266005487, + "learning_rate": 1.8305728726893084e-06, + "loss": 0.6868, + "step": 7120 + }, + { + "epoch": 1.06, + "grad_norm": 1.2982743871531153, + "learning_rate": 1.830519066905743e-06, + "loss": 0.6589, + "step": 7121 + }, + { + "epoch": 1.06, + "grad_norm": 1.4667908707828778, + "learning_rate": 1.8304652533708693e-06, + "loss": 0.696, + "step": 7122 + }, + { + "epoch": 1.06, + "grad_norm": 1.9845752403432624, + "learning_rate": 1.8304114320851902e-06, + "loss": 0.6862, + "step": 7123 + }, + { + "epoch": 1.06, + "grad_norm": 2.8500376316686333, + "learning_rate": 1.8303576030492079e-06, + "loss": 0.668, + "step": 7124 + }, + { + "epoch": 1.06, + "grad_norm": 3.7197156305823906, + "learning_rate": 1.8303037662634245e-06, + "loss": 0.6693, + "step": 7125 + }, + { + "epoch": 1.06, + "grad_norm": 1.2185052699093195, + "learning_rate": 1.8302499217283427e-06, + "loss": 0.6621, + "step": 7126 + }, + { + "epoch": 1.06, + "grad_norm": 1.5947070808856378, + "learning_rate": 1.8301960694444648e-06, + "loss": 0.6777, + "step": 7127 + }, + { + "epoch": 1.06, + "grad_norm": 0.8400563742313745, + "learning_rate": 1.8301422094122937e-06, + "loss": 0.6667, + "step": 7128 + }, + { + "epoch": 1.06, + "grad_norm": 0.68660014272437, + "learning_rate": 1.830088341632332e-06, + "loss": 0.6882, + "step": 7129 + }, + { + "epoch": 1.06, + "grad_norm": 0.7481005985080551, + "learning_rate": 1.8300344661050822e-06, + "loss": 0.6654, + "step": 7130 + }, + { + "epoch": 1.06, + "grad_norm": 0.7023636061915226, + "learning_rate": 1.8299805828310472e-06, + "loss": 0.6953, + "step": 7131 + }, + { + "epoch": 1.06, + "grad_norm": 1.018536929551939, + "learning_rate": 1.8299266918107304e-06, + "loss": 0.6764, + "step": 7132 + }, + { + "epoch": 1.06, + "grad_norm": 2.664367336900198, + "learning_rate": 1.8298727930446343e-06, + "loss": 0.6439, + "step": 7133 + }, + { + "epoch": 1.06, + "grad_norm": 2.0819923613714475, + "learning_rate": 1.8298188865332618e-06, + "loss": 0.6595, + "step": 7134 + }, + { + "epoch": 1.06, + "grad_norm": 3.0883833333567643, + "learning_rate": 1.8297649722771164e-06, + "loss": 0.6797, + "step": 7135 + }, + { + "epoch": 1.06, + "grad_norm": 1.6726029025847993, + "learning_rate": 1.8297110502767012e-06, + "loss": 0.6992, + "step": 7136 + }, + { + "epoch": 1.06, + "grad_norm": 4.98475117547987, + "learning_rate": 1.8296571205325196e-06, + "loss": 0.6536, + "step": 7137 + }, + { + "epoch": 1.06, + "grad_norm": 5.064657017056068, + "learning_rate": 1.8296031830450743e-06, + "loss": 0.6947, + "step": 7138 + }, + { + "epoch": 1.06, + "grad_norm": 5.930640029110142, + "learning_rate": 1.8295492378148692e-06, + "loss": 0.679, + "step": 7139 + }, + { + "epoch": 1.06, + "grad_norm": 7.895001787762332, + "learning_rate": 1.829495284842408e-06, + "loss": 0.6882, + "step": 7140 + }, + { + "epoch": 1.07, + "grad_norm": 0.8725393644923981, + "learning_rate": 1.8294413241281936e-06, + "loss": 0.6973, + "step": 7141 + }, + { + "epoch": 1.07, + "grad_norm": 1.122493286024838, + "learning_rate": 1.8293873556727299e-06, + "loss": 0.668, + "step": 7142 + }, + { + "epoch": 1.07, + "grad_norm": 1.2553465094330782, + "learning_rate": 1.829333379476521e-06, + "loss": 0.6888, + "step": 7143 + }, + { + "epoch": 1.07, + "grad_norm": 2.393837768572665, + "learning_rate": 1.82927939554007e-06, + "loss": 0.6549, + "step": 7144 + }, + { + "epoch": 1.07, + "grad_norm": 0.7231988229694165, + "learning_rate": 1.8292254038638814e-06, + "loss": 0.6543, + "step": 7145 + }, + { + "epoch": 1.07, + "grad_norm": 1.811848895045366, + "learning_rate": 1.8291714044484587e-06, + "loss": 0.6751, + "step": 7146 + }, + { + "epoch": 1.07, + "grad_norm": 0.9250192571570942, + "learning_rate": 1.829117397294306e-06, + "loss": 0.6445, + "step": 7147 + }, + { + "epoch": 1.07, + "grad_norm": 0.780615651245685, + "learning_rate": 1.829063382401927e-06, + "loss": 0.6888, + "step": 7148 + }, + { + "epoch": 1.07, + "grad_norm": 5.308465232190144, + "learning_rate": 1.8290093597718265e-06, + "loss": 0.653, + "step": 7149 + }, + { + "epoch": 1.07, + "grad_norm": 1.1299893450139793, + "learning_rate": 1.8289553294045082e-06, + "loss": 0.6667, + "step": 7150 + }, + { + "epoch": 1.07, + "grad_norm": 4.592146774197155, + "learning_rate": 1.8289012913004761e-06, + "loss": 0.681, + "step": 7151 + }, + { + "epoch": 1.07, + "grad_norm": 3.0895376568318818, + "learning_rate": 1.8288472454602357e-06, + "loss": 0.6966, + "step": 7152 + }, + { + "epoch": 1.07, + "grad_norm": 5.448997777537957, + "learning_rate": 1.8287931918842903e-06, + "loss": 0.6725, + "step": 7153 + }, + { + "epoch": 1.07, + "grad_norm": 4.490837770199781, + "learning_rate": 1.8287391305731447e-06, + "loss": 0.6764, + "step": 7154 + }, + { + "epoch": 1.07, + "grad_norm": 0.7827388865572432, + "learning_rate": 1.8286850615273033e-06, + "loss": 0.6576, + "step": 7155 + }, + { + "epoch": 1.07, + "grad_norm": 4.507032510163258, + "learning_rate": 1.8286309847472712e-06, + "loss": 0.6797, + "step": 7156 + }, + { + "epoch": 1.07, + "grad_norm": 0.7288834014024398, + "learning_rate": 1.8285769002335528e-06, + "loss": 0.6536, + "step": 7157 + }, + { + "epoch": 1.07, + "grad_norm": 1.8462799790474498, + "learning_rate": 1.828522807986653e-06, + "loss": 0.6777, + "step": 7158 + }, + { + "epoch": 1.07, + "grad_norm": 1.0347034584825534, + "learning_rate": 1.8284687080070765e-06, + "loss": 0.6901, + "step": 7159 + }, + { + "epoch": 1.07, + "grad_norm": 5.8856747078086675, + "learning_rate": 1.828414600295328e-06, + "loss": 0.681, + "step": 7160 + }, + { + "epoch": 1.07, + "grad_norm": 1.0938659917803564, + "learning_rate": 1.8283604848519133e-06, + "loss": 0.7109, + "step": 7161 + }, + { + "epoch": 1.07, + "grad_norm": 3.313004380069963, + "learning_rate": 1.8283063616773365e-06, + "loss": 0.6647, + "step": 7162 + }, + { + "epoch": 1.07, + "grad_norm": 6.399306801123899, + "learning_rate": 1.8282522307721036e-06, + "loss": 0.6908, + "step": 7163 + }, + { + "epoch": 1.07, + "grad_norm": 1.8794713219587187, + "learning_rate": 1.8281980921367187e-06, + "loss": 0.707, + "step": 7164 + }, + { + "epoch": 1.07, + "grad_norm": 0.7911568987651573, + "learning_rate": 1.8281439457716883e-06, + "loss": 0.6842, + "step": 7165 + }, + { + "epoch": 1.07, + "grad_norm": 8.367730568527076, + "learning_rate": 1.8280897916775171e-06, + "loss": 0.7012, + "step": 7166 + }, + { + "epoch": 1.07, + "grad_norm": 4.079620023988491, + "learning_rate": 1.8280356298547107e-06, + "loss": 0.6751, + "step": 7167 + }, + { + "epoch": 1.07, + "grad_norm": 1.2288528031954034, + "learning_rate": 1.8279814603037744e-06, + "loss": 0.6569, + "step": 7168 + }, + { + "epoch": 1.07, + "grad_norm": 5.191628126923919, + "learning_rate": 1.8279272830252141e-06, + "loss": 0.6719, + "step": 7169 + }, + { + "epoch": 1.07, + "grad_norm": 1.3774048713310256, + "learning_rate": 1.827873098019535e-06, + "loss": 0.6751, + "step": 7170 + }, + { + "epoch": 1.07, + "grad_norm": 0.9328467261113137, + "learning_rate": 1.8278189052872434e-06, + "loss": 0.681, + "step": 7171 + }, + { + "epoch": 1.07, + "grad_norm": 4.011964023855743, + "learning_rate": 1.8277647048288446e-06, + "loss": 0.694, + "step": 7172 + }, + { + "epoch": 1.07, + "grad_norm": 2.4626937807322165, + "learning_rate": 1.8277104966448445e-06, + "loss": 0.6621, + "step": 7173 + }, + { + "epoch": 1.07, + "grad_norm": 1.2838270588967253, + "learning_rate": 1.8276562807357492e-06, + "loss": 0.6829, + "step": 7174 + }, + { + "epoch": 1.07, + "grad_norm": 3.493320112359465, + "learning_rate": 1.8276020571020645e-06, + "loss": 0.6953, + "step": 7175 + }, + { + "epoch": 1.07, + "grad_norm": 8.378856429860503, + "learning_rate": 1.827547825744297e-06, + "loss": 0.7246, + "step": 7176 + }, + { + "epoch": 1.07, + "grad_norm": 0.9105295387153779, + "learning_rate": 1.827493586662952e-06, + "loss": 0.6908, + "step": 7177 + }, + { + "epoch": 1.07, + "grad_norm": 1.5124779298081124, + "learning_rate": 1.8274393398585364e-06, + "loss": 0.6771, + "step": 7178 + }, + { + "epoch": 1.07, + "grad_norm": 2.3978326503351686, + "learning_rate": 1.8273850853315563e-06, + "loss": 0.6784, + "step": 7179 + }, + { + "epoch": 1.07, + "grad_norm": 2.628863490098975, + "learning_rate": 1.827330823082518e-06, + "loss": 0.6816, + "step": 7180 + }, + { + "epoch": 1.07, + "grad_norm": 3.820554613126918, + "learning_rate": 1.8272765531119278e-06, + "loss": 0.6855, + "step": 7181 + }, + { + "epoch": 1.07, + "grad_norm": 2.2747043616282467, + "learning_rate": 1.8272222754202925e-06, + "loss": 0.6901, + "step": 7182 + }, + { + "epoch": 1.07, + "grad_norm": 5.3651217706141745, + "learning_rate": 1.8271679900081188e-06, + "loss": 0.6771, + "step": 7183 + }, + { + "epoch": 1.07, + "grad_norm": 2.41804721840154, + "learning_rate": 1.8271136968759127e-06, + "loss": 0.6595, + "step": 7184 + }, + { + "epoch": 1.07, + "grad_norm": 0.4776630275436415, + "learning_rate": 1.8270593960241815e-06, + "loss": 0.6751, + "step": 7185 + }, + { + "epoch": 1.07, + "grad_norm": 3.0457857723017736, + "learning_rate": 1.8270050874534317e-06, + "loss": 0.6953, + "step": 7186 + }, + { + "epoch": 1.07, + "grad_norm": 1.0117875184710476, + "learning_rate": 1.8269507711641703e-06, + "loss": 0.6569, + "step": 7187 + }, + { + "epoch": 1.07, + "grad_norm": 4.397715053551542, + "learning_rate": 1.8268964471569043e-06, + "loss": 0.6829, + "step": 7188 + }, + { + "epoch": 1.07, + "grad_norm": 7.44298530613423, + "learning_rate": 1.8268421154321408e-06, + "loss": 0.6927, + "step": 7189 + }, + { + "epoch": 1.07, + "grad_norm": 2.6569572557851524, + "learning_rate": 1.8267877759903865e-06, + "loss": 0.6947, + "step": 7190 + }, + { + "epoch": 1.07, + "grad_norm": 3.821664258323817, + "learning_rate": 1.8267334288321488e-06, + "loss": 0.694, + "step": 7191 + }, + { + "epoch": 1.07, + "grad_norm": 1.0331601738282739, + "learning_rate": 1.8266790739579348e-06, + "loss": 0.6797, + "step": 7192 + }, + { + "epoch": 1.07, + "grad_norm": 2.4569526445610297, + "learning_rate": 1.826624711368252e-06, + "loss": 0.7057, + "step": 7193 + }, + { + "epoch": 1.07, + "grad_norm": 4.6579171583529515, + "learning_rate": 1.8265703410636078e-06, + "loss": 0.6908, + "step": 7194 + }, + { + "epoch": 1.07, + "grad_norm": 0.9853346568440725, + "learning_rate": 1.826515963044509e-06, + "loss": 0.6882, + "step": 7195 + }, + { + "epoch": 1.07, + "grad_norm": 2.452527100848695, + "learning_rate": 1.8264615773114642e-06, + "loss": 0.6895, + "step": 7196 + }, + { + "epoch": 1.07, + "grad_norm": 3.29990569452902, + "learning_rate": 1.8264071838649802e-06, + "loss": 0.6712, + "step": 7197 + }, + { + "epoch": 1.07, + "grad_norm": 1.9880747729484978, + "learning_rate": 1.826352782705565e-06, + "loss": 0.6751, + "step": 7198 + }, + { + "epoch": 1.07, + "grad_norm": 1.1322188464290015, + "learning_rate": 1.8262983738337262e-06, + "loss": 0.6732, + "step": 7199 + }, + { + "epoch": 1.07, + "grad_norm": 0.8162466579494263, + "learning_rate": 1.8262439572499714e-06, + "loss": 0.6868, + "step": 7200 + }, + { + "epoch": 1.07, + "grad_norm": 0.7012387563637887, + "learning_rate": 1.8261895329548085e-06, + "loss": 0.6758, + "step": 7201 + }, + { + "epoch": 1.07, + "grad_norm": 1.0074085419752015, + "learning_rate": 1.826135100948746e-06, + "loss": 0.6829, + "step": 7202 + }, + { + "epoch": 1.07, + "grad_norm": 2.641593131293144, + "learning_rate": 1.8260806612322914e-06, + "loss": 0.6868, + "step": 7203 + }, + { + "epoch": 1.07, + "grad_norm": 0.4831365053489164, + "learning_rate": 1.826026213805953e-06, + "loss": 0.6732, + "step": 7204 + }, + { + "epoch": 1.07, + "grad_norm": 1.511460862267005, + "learning_rate": 1.8259717586702388e-06, + "loss": 0.6764, + "step": 7205 + }, + { + "epoch": 1.07, + "grad_norm": 5.176765971464415, + "learning_rate": 1.8259172958256571e-06, + "loss": 0.6764, + "step": 7206 + }, + { + "epoch": 1.07, + "grad_norm": 0.5003229581766684, + "learning_rate": 1.8258628252727163e-06, + "loss": 0.6758, + "step": 7207 + }, + { + "epoch": 1.08, + "grad_norm": 0.6432396051821214, + "learning_rate": 1.8258083470119248e-06, + "loss": 0.6719, + "step": 7208 + }, + { + "epoch": 1.08, + "grad_norm": 2.550932963129105, + "learning_rate": 1.825753861043791e-06, + "loss": 0.6849, + "step": 7209 + }, + { + "epoch": 1.08, + "grad_norm": 0.7549553789483441, + "learning_rate": 1.8256993673688232e-06, + "loss": 0.6686, + "step": 7210 + }, + { + "epoch": 1.08, + "grad_norm": 2.2692915580637507, + "learning_rate": 1.8256448659875302e-06, + "loss": 0.679, + "step": 7211 + }, + { + "epoch": 1.08, + "grad_norm": 2.1828980392023642, + "learning_rate": 1.8255903569004207e-06, + "loss": 0.6862, + "step": 7212 + }, + { + "epoch": 1.08, + "grad_norm": 6.916138136774664, + "learning_rate": 1.8255358401080034e-06, + "loss": 0.6823, + "step": 7213 + }, + { + "epoch": 1.08, + "grad_norm": 4.262551025677481, + "learning_rate": 1.825481315610787e-06, + "loss": 0.6914, + "step": 7214 + }, + { + "epoch": 1.08, + "grad_norm": 3.762819860865685, + "learning_rate": 1.8254267834092806e-06, + "loss": 0.696, + "step": 7215 + }, + { + "epoch": 1.08, + "grad_norm": 0.928085939323418, + "learning_rate": 1.8253722435039934e-06, + "loss": 0.6654, + "step": 7216 + }, + { + "epoch": 1.08, + "grad_norm": 3.142880588442651, + "learning_rate": 1.8253176958954335e-06, + "loss": 0.6771, + "step": 7217 + }, + { + "epoch": 1.08, + "grad_norm": 0.8624308768891542, + "learning_rate": 1.8252631405841108e-06, + "loss": 0.6608, + "step": 7218 + }, + { + "epoch": 1.08, + "grad_norm": 3.0004008257723553, + "learning_rate": 1.825208577570534e-06, + "loss": 0.6602, + "step": 7219 + }, + { + "epoch": 1.08, + "grad_norm": 1.023849740757716, + "learning_rate": 1.8251540068552127e-06, + "loss": 0.6348, + "step": 7220 + }, + { + "epoch": 1.08, + "grad_norm": 0.746580065802468, + "learning_rate": 1.8250994284386563e-06, + "loss": 0.7025, + "step": 7221 + }, + { + "epoch": 1.08, + "grad_norm": 3.2842821155739337, + "learning_rate": 1.8250448423213738e-06, + "loss": 0.681, + "step": 7222 + }, + { + "epoch": 1.08, + "grad_norm": 1.2588069678947211, + "learning_rate": 1.8249902485038748e-06, + "loss": 0.6504, + "step": 7223 + }, + { + "epoch": 1.08, + "grad_norm": 0.8756941924562145, + "learning_rate": 1.8249356469866689e-06, + "loss": 0.6882, + "step": 7224 + }, + { + "epoch": 1.08, + "grad_norm": 3.611007346884863, + "learning_rate": 1.8248810377702655e-06, + "loss": 0.6608, + "step": 7225 + }, + { + "epoch": 1.08, + "grad_norm": 0.6727869828445554, + "learning_rate": 1.8248264208551744e-06, + "loss": 0.6803, + "step": 7226 + }, + { + "epoch": 1.08, + "grad_norm": 4.158336505348912, + "learning_rate": 1.8247717962419059e-06, + "loss": 0.6725, + "step": 7227 + }, + { + "epoch": 1.08, + "grad_norm": 0.8519612609706468, + "learning_rate": 1.8247171639309686e-06, + "loss": 0.6732, + "step": 7228 + }, + { + "epoch": 1.08, + "grad_norm": 1.4801638048264887, + "learning_rate": 1.8246625239228735e-06, + "loss": 0.6654, + "step": 7229 + }, + { + "epoch": 1.08, + "grad_norm": 2.1285689361748696, + "learning_rate": 1.8246078762181299e-06, + "loss": 0.6777, + "step": 7230 + }, + { + "epoch": 1.08, + "grad_norm": 2.3175490192092947, + "learning_rate": 1.8245532208172483e-06, + "loss": 0.6914, + "step": 7231 + }, + { + "epoch": 1.08, + "grad_norm": 2.288671401336204, + "learning_rate": 1.8244985577207384e-06, + "loss": 0.6882, + "step": 7232 + }, + { + "epoch": 1.08, + "grad_norm": 1.604287822433012, + "learning_rate": 1.8244438869291105e-06, + "loss": 0.696, + "step": 7233 + }, + { + "epoch": 1.08, + "grad_norm": 3.7356461130024967, + "learning_rate": 1.8243892084428749e-06, + "loss": 0.6562, + "step": 7234 + }, + { + "epoch": 1.08, + "grad_norm": 4.170473047683846, + "learning_rate": 1.824334522262542e-06, + "loss": 0.6576, + "step": 7235 + }, + { + "epoch": 1.08, + "grad_norm": 4.035565481748346, + "learning_rate": 1.8242798283886222e-06, + "loss": 0.7005, + "step": 7236 + }, + { + "epoch": 1.08, + "grad_norm": 3.514232294241727, + "learning_rate": 1.8242251268216257e-06, + "loss": 0.6719, + "step": 7237 + }, + { + "epoch": 1.08, + "grad_norm": 1.1134423136785534, + "learning_rate": 1.824170417562063e-06, + "loss": 0.6882, + "step": 7238 + }, + { + "epoch": 1.08, + "grad_norm": 3.115282872323748, + "learning_rate": 1.8241157006104454e-06, + "loss": 0.6738, + "step": 7239 + }, + { + "epoch": 1.08, + "grad_norm": 0.7414741925828613, + "learning_rate": 1.8240609759672829e-06, + "loss": 0.681, + "step": 7240 + }, + { + "epoch": 1.08, + "grad_norm": 2.390962461987225, + "learning_rate": 1.8240062436330863e-06, + "loss": 0.6803, + "step": 7241 + }, + { + "epoch": 1.08, + "grad_norm": 1.3859121240853152, + "learning_rate": 1.8239515036083668e-06, + "loss": 0.7018, + "step": 7242 + }, + { + "epoch": 1.08, + "grad_norm": 0.8648658801407387, + "learning_rate": 1.8238967558936346e-06, + "loss": 0.6628, + "step": 7243 + }, + { + "epoch": 1.08, + "grad_norm": 1.7218604798839086, + "learning_rate": 1.8238420004894018e-06, + "loss": 0.6751, + "step": 7244 + }, + { + "epoch": 1.08, + "grad_norm": 0.6164395512634436, + "learning_rate": 1.8237872373961783e-06, + "loss": 0.6855, + "step": 7245 + }, + { + "epoch": 1.08, + "grad_norm": 0.8614405707937205, + "learning_rate": 1.8237324666144756e-06, + "loss": 0.6706, + "step": 7246 + }, + { + "epoch": 1.08, + "grad_norm": 6.413588578963587, + "learning_rate": 1.8236776881448051e-06, + "loss": 0.6966, + "step": 7247 + }, + { + "epoch": 1.08, + "grad_norm": 6.2265618151280435, + "learning_rate": 1.8236229019876779e-06, + "loss": 0.7246, + "step": 7248 + }, + { + "epoch": 1.08, + "grad_norm": 2.4038043931386883, + "learning_rate": 1.8235681081436055e-06, + "loss": 0.6771, + "step": 7249 + }, + { + "epoch": 1.08, + "grad_norm": 0.9489115739120588, + "learning_rate": 1.8235133066130985e-06, + "loss": 0.6738, + "step": 7250 + }, + { + "epoch": 1.08, + "grad_norm": 4.43265597586846, + "learning_rate": 1.8234584973966697e-06, + "loss": 0.6914, + "step": 7251 + }, + { + "epoch": 1.08, + "grad_norm": 5.827574654502057, + "learning_rate": 1.8234036804948294e-06, + "loss": 0.6973, + "step": 7252 + }, + { + "epoch": 1.08, + "grad_norm": 1.688850582341655, + "learning_rate": 1.82334885590809e-06, + "loss": 0.6829, + "step": 7253 + }, + { + "epoch": 1.08, + "grad_norm": 5.590475978032138, + "learning_rate": 1.8232940236369626e-06, + "loss": 0.6621, + "step": 7254 + }, + { + "epoch": 1.08, + "grad_norm": 1.550958989934827, + "learning_rate": 1.82323918368196e-06, + "loss": 0.6908, + "step": 7255 + }, + { + "epoch": 1.08, + "grad_norm": 1.4356949457406927, + "learning_rate": 1.8231843360435922e-06, + "loss": 0.668, + "step": 7256 + }, + { + "epoch": 1.08, + "grad_norm": 1.939032730613223, + "learning_rate": 1.8231294807223732e-06, + "loss": 0.6758, + "step": 7257 + }, + { + "epoch": 1.08, + "grad_norm": 3.015201329416802, + "learning_rate": 1.8230746177188134e-06, + "loss": 0.7038, + "step": 7258 + }, + { + "epoch": 1.08, + "grad_norm": 0.847811533322734, + "learning_rate": 1.8230197470334257e-06, + "loss": 0.6686, + "step": 7259 + }, + { + "epoch": 1.08, + "grad_norm": 3.5890241856303056, + "learning_rate": 1.8229648686667219e-06, + "loss": 0.6797, + "step": 7260 + }, + { + "epoch": 1.08, + "grad_norm": 0.99580732868676, + "learning_rate": 1.822909982619214e-06, + "loss": 0.6803, + "step": 7261 + }, + { + "epoch": 1.08, + "grad_norm": 1.5563389471011442, + "learning_rate": 1.8228550888914144e-06, + "loss": 0.6712, + "step": 7262 + }, + { + "epoch": 1.08, + "grad_norm": 0.8490884794071439, + "learning_rate": 1.8228001874838355e-06, + "loss": 0.6764, + "step": 7263 + }, + { + "epoch": 1.08, + "grad_norm": 6.118003981602806, + "learning_rate": 1.82274527839699e-06, + "loss": 0.7038, + "step": 7264 + }, + { + "epoch": 1.08, + "grad_norm": 2.5945271507390184, + "learning_rate": 1.8226903616313898e-06, + "loss": 0.6908, + "step": 7265 + }, + { + "epoch": 1.08, + "grad_norm": 3.7151394878185116, + "learning_rate": 1.822635437187548e-06, + "loss": 0.6862, + "step": 7266 + }, + { + "epoch": 1.08, + "grad_norm": 2.844451492519643, + "learning_rate": 1.8225805050659763e-06, + "loss": 0.6927, + "step": 7267 + }, + { + "epoch": 1.08, + "grad_norm": 4.404662446899828, + "learning_rate": 1.8225255652671887e-06, + "loss": 0.6673, + "step": 7268 + }, + { + "epoch": 1.08, + "grad_norm": 5.025830924132563, + "learning_rate": 1.8224706177916969e-06, + "loss": 0.6908, + "step": 7269 + }, + { + "epoch": 1.08, + "grad_norm": 1.3792988807393225, + "learning_rate": 1.8224156626400143e-06, + "loss": 0.6921, + "step": 7270 + }, + { + "epoch": 1.08, + "grad_norm": 1.2575384476749831, + "learning_rate": 1.8223606998126533e-06, + "loss": 0.6589, + "step": 7271 + }, + { + "epoch": 1.08, + "grad_norm": 1.8498914567682243, + "learning_rate": 1.8223057293101272e-06, + "loss": 0.6745, + "step": 7272 + }, + { + "epoch": 1.08, + "grad_norm": 0.6862255179012066, + "learning_rate": 1.822250751132949e-06, + "loss": 0.6621, + "step": 7273 + }, + { + "epoch": 1.08, + "grad_norm": 0.885609041553421, + "learning_rate": 1.822195765281632e-06, + "loss": 0.6641, + "step": 7274 + }, + { + "epoch": 1.09, + "grad_norm": 1.479952549554041, + "learning_rate": 1.8221407717566891e-06, + "loss": 0.6797, + "step": 7275 + }, + { + "epoch": 1.09, + "grad_norm": 7.71083422965666, + "learning_rate": 1.8220857705586336e-06, + "loss": 0.7096, + "step": 7276 + }, + { + "epoch": 1.09, + "grad_norm": 4.45208222561637, + "learning_rate": 1.8220307616879792e-06, + "loss": 0.696, + "step": 7277 + }, + { + "epoch": 1.09, + "grad_norm": 0.5181100455214666, + "learning_rate": 1.8219757451452387e-06, + "loss": 0.6771, + "step": 7278 + }, + { + "epoch": 1.09, + "grad_norm": 5.245630340260268, + "learning_rate": 1.821920720930926e-06, + "loss": 0.6921, + "step": 7279 + }, + { + "epoch": 1.09, + "grad_norm": 0.6053442829314937, + "learning_rate": 1.8218656890455545e-06, + "loss": 0.6797, + "step": 7280 + }, + { + "epoch": 1.09, + "grad_norm": 5.853241498775552, + "learning_rate": 1.8218106494896376e-06, + "loss": 0.6621, + "step": 7281 + }, + { + "epoch": 1.09, + "grad_norm": 4.162077099395584, + "learning_rate": 1.8217556022636898e-06, + "loss": 0.6901, + "step": 7282 + }, + { + "epoch": 1.09, + "grad_norm": 0.8047995224498609, + "learning_rate": 1.821700547368224e-06, + "loss": 0.709, + "step": 7283 + }, + { + "epoch": 1.09, + "grad_norm": 1.7856592835133405, + "learning_rate": 1.821645484803754e-06, + "loss": 0.6699, + "step": 7284 + }, + { + "epoch": 1.09, + "grad_norm": 1.9020861139425744, + "learning_rate": 1.8215904145707946e-06, + "loss": 0.6706, + "step": 7285 + }, + { + "epoch": 1.09, + "grad_norm": 0.9606914277161466, + "learning_rate": 1.821535336669859e-06, + "loss": 0.6751, + "step": 7286 + }, + { + "epoch": 1.09, + "grad_norm": 3.548752009630068, + "learning_rate": 1.8214802511014613e-06, + "loss": 0.6536, + "step": 7287 + }, + { + "epoch": 1.09, + "grad_norm": 2.010477509571199, + "learning_rate": 1.8214251578661157e-06, + "loss": 0.6686, + "step": 7288 + }, + { + "epoch": 1.09, + "grad_norm": 4.420643835377131, + "learning_rate": 1.8213700569643367e-06, + "loss": 0.6725, + "step": 7289 + }, + { + "epoch": 1.09, + "grad_norm": 5.193478600395327, + "learning_rate": 1.821314948396638e-06, + "loss": 0.7083, + "step": 7290 + }, + { + "epoch": 1.09, + "grad_norm": 3.745861726908077, + "learning_rate": 1.8212598321635348e-06, + "loss": 0.6693, + "step": 7291 + }, + { + "epoch": 1.09, + "grad_norm": 8.588415569582555, + "learning_rate": 1.8212047082655405e-06, + "loss": 0.7135, + "step": 7292 + }, + { + "epoch": 1.09, + "grad_norm": 0.6345385314792242, + "learning_rate": 1.8211495767031702e-06, + "loss": 0.6654, + "step": 7293 + }, + { + "epoch": 1.09, + "grad_norm": 5.206962898158289, + "learning_rate": 1.821094437476938e-06, + "loss": 0.6784, + "step": 7294 + }, + { + "epoch": 1.09, + "grad_norm": 1.9622535647533401, + "learning_rate": 1.8210392905873594e-06, + "loss": 0.6745, + "step": 7295 + }, + { + "epoch": 1.09, + "grad_norm": 1.1039316846726348, + "learning_rate": 1.8209841360349478e-06, + "loss": 0.6908, + "step": 7296 + }, + { + "epoch": 1.09, + "grad_norm": 1.2427515911951277, + "learning_rate": 1.8209289738202195e-06, + "loss": 0.6615, + "step": 7297 + }, + { + "epoch": 1.09, + "grad_norm": 3.2883550535264217, + "learning_rate": 1.8208738039436878e-06, + "loss": 0.6764, + "step": 7298 + }, + { + "epoch": 1.09, + "grad_norm": 3.5696664602468267, + "learning_rate": 1.8208186264058686e-06, + "loss": 0.6992, + "step": 7299 + }, + { + "epoch": 1.09, + "grad_norm": 4.446002163835734, + "learning_rate": 1.8207634412072764e-06, + "loss": 0.6895, + "step": 7300 + }, + { + "epoch": 1.09, + "grad_norm": 2.904273261273821, + "learning_rate": 1.8207082483484264e-06, + "loss": 0.6901, + "step": 7301 + }, + { + "epoch": 1.09, + "grad_norm": 4.158244697655491, + "learning_rate": 1.8206530478298338e-06, + "loss": 0.6934, + "step": 7302 + }, + { + "epoch": 1.09, + "grad_norm": 4.498355288883931, + "learning_rate": 1.8205978396520139e-06, + "loss": 0.6868, + "step": 7303 + }, + { + "epoch": 1.09, + "grad_norm": 1.0112532393369063, + "learning_rate": 1.8205426238154815e-06, + "loss": 0.6797, + "step": 7304 + }, + { + "epoch": 1.09, + "grad_norm": 2.195683819179623, + "learning_rate": 1.8204874003207521e-06, + "loss": 0.6973, + "step": 7305 + }, + { + "epoch": 1.09, + "grad_norm": 6.079315500513975, + "learning_rate": 1.8204321691683417e-06, + "loss": 0.6986, + "step": 7306 + }, + { + "epoch": 1.09, + "grad_norm": 2.452095793576954, + "learning_rate": 1.820376930358765e-06, + "loss": 0.6908, + "step": 7307 + }, + { + "epoch": 1.09, + "grad_norm": 2.7622837628871335, + "learning_rate": 1.8203216838925378e-06, + "loss": 0.6719, + "step": 7308 + }, + { + "epoch": 1.09, + "grad_norm": 4.703265467220527, + "learning_rate": 1.820266429770176e-06, + "loss": 0.6764, + "step": 7309 + }, + { + "epoch": 1.09, + "grad_norm": 1.1653929371788205, + "learning_rate": 1.8202111679921952e-06, + "loss": 0.6732, + "step": 7310 + }, + { + "epoch": 1.09, + "grad_norm": 3.6106249104559636, + "learning_rate": 1.8201558985591108e-06, + "loss": 0.6875, + "step": 7311 + }, + { + "epoch": 1.09, + "grad_norm": 3.4644560459645546, + "learning_rate": 1.820100621471439e-06, + "loss": 0.6693, + "step": 7312 + }, + { + "epoch": 1.09, + "grad_norm": 2.284947940980634, + "learning_rate": 1.8200453367296954e-06, + "loss": 0.6751, + "step": 7313 + }, + { + "epoch": 1.09, + "grad_norm": 2.917079534103123, + "learning_rate": 1.8199900443343964e-06, + "loss": 0.694, + "step": 7314 + }, + { + "epoch": 1.09, + "grad_norm": 4.446710306975443, + "learning_rate": 1.8199347442860576e-06, + "loss": 0.6829, + "step": 7315 + }, + { + "epoch": 1.09, + "grad_norm": 1.5978505463378918, + "learning_rate": 1.8198794365851952e-06, + "loss": 0.6686, + "step": 7316 + }, + { + "epoch": 1.09, + "grad_norm": 1.7342930317697398, + "learning_rate": 1.8198241212323255e-06, + "loss": 0.6602, + "step": 7317 + }, + { + "epoch": 1.09, + "grad_norm": 5.371602154211714, + "learning_rate": 1.8197687982279652e-06, + "loss": 0.7155, + "step": 7318 + }, + { + "epoch": 1.09, + "grad_norm": 3.1732238280835503, + "learning_rate": 1.81971346757263e-06, + "loss": 0.6921, + "step": 7319 + }, + { + "epoch": 1.09, + "grad_norm": 0.8585707511386347, + "learning_rate": 1.8196581292668363e-06, + "loss": 0.6908, + "step": 7320 + }, + { + "epoch": 1.09, + "grad_norm": 1.3372387155157546, + "learning_rate": 1.819602783311101e-06, + "loss": 0.666, + "step": 7321 + }, + { + "epoch": 1.09, + "grad_norm": 0.6535190569311565, + "learning_rate": 1.8195474297059402e-06, + "loss": 0.6628, + "step": 7322 + }, + { + "epoch": 1.09, + "grad_norm": 1.9156334101171626, + "learning_rate": 1.819492068451871e-06, + "loss": 0.6686, + "step": 7323 + }, + { + "epoch": 1.09, + "grad_norm": 2.538964535660808, + "learning_rate": 1.8194366995494099e-06, + "loss": 0.6797, + "step": 7324 + }, + { + "epoch": 1.09, + "grad_norm": 1.2253679602573568, + "learning_rate": 1.8193813229990736e-06, + "loss": 0.6777, + "step": 7325 + }, + { + "epoch": 1.09, + "grad_norm": 1.432463853302201, + "learning_rate": 1.8193259388013789e-06, + "loss": 0.6842, + "step": 7326 + }, + { + "epoch": 1.09, + "grad_norm": 1.0414396318232162, + "learning_rate": 1.8192705469568427e-06, + "loss": 0.6849, + "step": 7327 + }, + { + "epoch": 1.09, + "grad_norm": 0.5467557681450665, + "learning_rate": 1.8192151474659823e-06, + "loss": 0.6634, + "step": 7328 + }, + { + "epoch": 1.09, + "grad_norm": 2.943883252819132, + "learning_rate": 1.8191597403293139e-06, + "loss": 0.6738, + "step": 7329 + }, + { + "epoch": 1.09, + "grad_norm": 0.6340104576013441, + "learning_rate": 1.8191043255473557e-06, + "loss": 0.6732, + "step": 7330 + }, + { + "epoch": 1.09, + "grad_norm": 2.535982142854951, + "learning_rate": 1.8190489031206244e-06, + "loss": 0.6556, + "step": 7331 + }, + { + "epoch": 1.09, + "grad_norm": 4.3062476194407955, + "learning_rate": 1.818993473049637e-06, + "loss": 0.6751, + "step": 7332 + }, + { + "epoch": 1.09, + "grad_norm": 4.577564985050159, + "learning_rate": 1.818938035334911e-06, + "loss": 0.694, + "step": 7333 + }, + { + "epoch": 1.09, + "grad_norm": 4.491623922932199, + "learning_rate": 1.8188825899769641e-06, + "loss": 0.6829, + "step": 7334 + }, + { + "epoch": 1.09, + "grad_norm": 4.298712607506904, + "learning_rate": 1.8188271369763135e-06, + "loss": 0.6803, + "step": 7335 + }, + { + "epoch": 1.09, + "grad_norm": 0.7644155072853887, + "learning_rate": 1.818771676333477e-06, + "loss": 0.6895, + "step": 7336 + }, + { + "epoch": 1.09, + "grad_norm": 2.167106924107878, + "learning_rate": 1.818716208048972e-06, + "loss": 0.6901, + "step": 7337 + }, + { + "epoch": 1.09, + "grad_norm": 1.5036572660005894, + "learning_rate": 1.818660732123316e-06, + "loss": 0.6491, + "step": 7338 + }, + { + "epoch": 1.09, + "grad_norm": 0.9515413784541786, + "learning_rate": 1.818605248557027e-06, + "loss": 0.6999, + "step": 7339 + }, + { + "epoch": 1.09, + "grad_norm": 0.714075746794432, + "learning_rate": 1.818549757350623e-06, + "loss": 0.6465, + "step": 7340 + }, + { + "epoch": 1.09, + "grad_norm": 0.964469578102391, + "learning_rate": 1.818494258504622e-06, + "loss": 0.6751, + "step": 7341 + }, + { + "epoch": 1.1, + "grad_norm": 1.4752674202094782, + "learning_rate": 1.8184387520195413e-06, + "loss": 0.6836, + "step": 7342 + }, + { + "epoch": 1.1, + "grad_norm": 2.097073558943924, + "learning_rate": 1.8183832378958992e-06, + "loss": 0.6549, + "step": 7343 + }, + { + "epoch": 1.1, + "grad_norm": 2.5708253841725948, + "learning_rate": 1.818327716134214e-06, + "loss": 0.6628, + "step": 7344 + }, + { + "epoch": 1.1, + "grad_norm": 2.0246153177139243, + "learning_rate": 1.818272186735004e-06, + "loss": 0.6367, + "step": 7345 + }, + { + "epoch": 1.1, + "grad_norm": 3.9150234184610793, + "learning_rate": 1.8182166496987877e-06, + "loss": 0.6686, + "step": 7346 + }, + { + "epoch": 1.1, + "grad_norm": 0.666075020224906, + "learning_rate": 1.8181611050260825e-06, + "loss": 0.6712, + "step": 7347 + }, + { + "epoch": 1.1, + "grad_norm": 1.8311192875195532, + "learning_rate": 1.8181055527174077e-06, + "loss": 0.6576, + "step": 7348 + }, + { + "epoch": 1.1, + "grad_norm": 1.5708408289516476, + "learning_rate": 1.818049992773281e-06, + "loss": 0.6901, + "step": 7349 + }, + { + "epoch": 1.1, + "grad_norm": 1.4516844673964295, + "learning_rate": 1.817994425194222e-06, + "loss": 0.696, + "step": 7350 + }, + { + "epoch": 1.1, + "grad_norm": 2.7551328238280623, + "learning_rate": 1.817938849980748e-06, + "loss": 0.6667, + "step": 7351 + }, + { + "epoch": 1.1, + "grad_norm": 2.395360402281314, + "learning_rate": 1.817883267133379e-06, + "loss": 0.6712, + "step": 7352 + }, + { + "epoch": 1.1, + "grad_norm": 3.3146416169021085, + "learning_rate": 1.8178276766526328e-06, + "loss": 0.6927, + "step": 7353 + }, + { + "epoch": 1.1, + "grad_norm": 5.638615196137087, + "learning_rate": 1.8177720785390286e-06, + "loss": 0.6823, + "step": 7354 + }, + { + "epoch": 1.1, + "grad_norm": 3.1186853260894107, + "learning_rate": 1.8177164727930854e-06, + "loss": 0.6576, + "step": 7355 + }, + { + "epoch": 1.1, + "grad_norm": 4.192832104972316, + "learning_rate": 1.817660859415322e-06, + "loss": 0.6296, + "step": 7356 + }, + { + "epoch": 1.1, + "grad_norm": 1.0710196301529344, + "learning_rate": 1.8176052384062572e-06, + "loss": 0.6803, + "step": 7357 + }, + { + "epoch": 1.1, + "grad_norm": 2.0734618465058556, + "learning_rate": 1.8175496097664107e-06, + "loss": 0.6868, + "step": 7358 + }, + { + "epoch": 1.1, + "grad_norm": 0.8253790245483167, + "learning_rate": 1.8174939734963016e-06, + "loss": 0.6699, + "step": 7359 + }, + { + "epoch": 1.1, + "grad_norm": 2.8235521334862264, + "learning_rate": 1.8174383295964484e-06, + "loss": 0.7064, + "step": 7360 + }, + { + "epoch": 1.1, + "grad_norm": 2.1513808931810323, + "learning_rate": 1.8173826780673713e-06, + "loss": 0.6569, + "step": 7361 + }, + { + "epoch": 1.1, + "grad_norm": 1.6464916350495298, + "learning_rate": 1.8173270189095892e-06, + "loss": 0.7096, + "step": 7362 + }, + { + "epoch": 1.1, + "grad_norm": 1.657598591124363, + "learning_rate": 1.8172713521236221e-06, + "loss": 0.6549, + "step": 7363 + }, + { + "epoch": 1.1, + "grad_norm": 2.884582368669517, + "learning_rate": 1.817215677709989e-06, + "loss": 0.6576, + "step": 7364 + }, + { + "epoch": 1.1, + "grad_norm": 1.0030600759049453, + "learning_rate": 1.8171599956692096e-06, + "loss": 0.6732, + "step": 7365 + }, + { + "epoch": 1.1, + "grad_norm": 2.157471651205081, + "learning_rate": 1.8171043060018039e-06, + "loss": 0.6582, + "step": 7366 + }, + { + "epoch": 1.1, + "grad_norm": 1.1281737061516737, + "learning_rate": 1.8170486087082913e-06, + "loss": 0.6712, + "step": 7367 + }, + { + "epoch": 1.1, + "grad_norm": 1.078446639876991, + "learning_rate": 1.8169929037891917e-06, + "loss": 0.6712, + "step": 7368 + }, + { + "epoch": 1.1, + "grad_norm": 1.7693363932531958, + "learning_rate": 1.8169371912450253e-06, + "loss": 0.6836, + "step": 7369 + }, + { + "epoch": 1.1, + "grad_norm": 1.8384220030824798, + "learning_rate": 1.8168814710763117e-06, + "loss": 0.6842, + "step": 7370 + }, + { + "epoch": 1.1, + "grad_norm": 3.2582535172315374, + "learning_rate": 1.8168257432835712e-06, + "loss": 0.6849, + "step": 7371 + }, + { + "epoch": 1.1, + "grad_norm": 2.567503854037778, + "learning_rate": 1.8167700078673237e-06, + "loss": 0.6712, + "step": 7372 + }, + { + "epoch": 1.1, + "grad_norm": 1.866268917741038, + "learning_rate": 1.8167142648280895e-06, + "loss": 0.6882, + "step": 7373 + }, + { + "epoch": 1.1, + "grad_norm": 3.558779836355342, + "learning_rate": 1.8166585141663888e-06, + "loss": 0.6543, + "step": 7374 + }, + { + "epoch": 1.1, + "grad_norm": 3.610884515653038, + "learning_rate": 1.8166027558827422e-06, + "loss": 0.6855, + "step": 7375 + }, + { + "epoch": 1.1, + "grad_norm": 0.9037057453848019, + "learning_rate": 1.8165469899776698e-06, + "loss": 0.6855, + "step": 7376 + }, + { + "epoch": 1.1, + "grad_norm": 2.6234508324458097, + "learning_rate": 1.8164912164516918e-06, + "loss": 0.6615, + "step": 7377 + }, + { + "epoch": 1.1, + "grad_norm": 1.5995290080951414, + "learning_rate": 1.8164354353053293e-06, + "loss": 0.668, + "step": 7378 + }, + { + "epoch": 1.1, + "grad_norm": 1.7468512962832143, + "learning_rate": 1.8163796465391025e-06, + "loss": 0.6738, + "step": 7379 + }, + { + "epoch": 1.1, + "grad_norm": 1.638707163080237, + "learning_rate": 1.8163238501535325e-06, + "loss": 0.6816, + "step": 7380 + }, + { + "epoch": 1.1, + "grad_norm": 0.6717338891912779, + "learning_rate": 1.81626804614914e-06, + "loss": 0.6797, + "step": 7381 + }, + { + "epoch": 1.1, + "grad_norm": 2.476124658814075, + "learning_rate": 1.8162122345264452e-06, + "loss": 0.6849, + "step": 7382 + }, + { + "epoch": 1.1, + "grad_norm": 1.1714280601872489, + "learning_rate": 1.8161564152859696e-06, + "loss": 0.6738, + "step": 7383 + }, + { + "epoch": 1.1, + "grad_norm": 0.7211587233605898, + "learning_rate": 1.8161005884282341e-06, + "loss": 0.6738, + "step": 7384 + }, + { + "epoch": 1.1, + "grad_norm": 2.4193974005508254, + "learning_rate": 1.8160447539537596e-06, + "loss": 0.6569, + "step": 7385 + }, + { + "epoch": 1.1, + "grad_norm": 2.8390843097495075, + "learning_rate": 1.815988911863067e-06, + "loss": 0.6693, + "step": 7386 + }, + { + "epoch": 1.1, + "grad_norm": 1.63407455291123, + "learning_rate": 1.815933062156678e-06, + "loss": 0.6771, + "step": 7387 + }, + { + "epoch": 1.1, + "grad_norm": 3.70146598519997, + "learning_rate": 1.8158772048351137e-06, + "loss": 0.6641, + "step": 7388 + }, + { + "epoch": 1.1, + "grad_norm": 1.7910540403449544, + "learning_rate": 1.815821339898895e-06, + "loss": 0.6602, + "step": 7389 + }, + { + "epoch": 1.1, + "grad_norm": 0.709365246314985, + "learning_rate": 1.8157654673485437e-06, + "loss": 0.6608, + "step": 7390 + }, + { + "epoch": 1.1, + "grad_norm": 1.390079436104886, + "learning_rate": 1.815709587184581e-06, + "loss": 0.6569, + "step": 7391 + }, + { + "epoch": 1.1, + "grad_norm": 0.9941120198522088, + "learning_rate": 1.8156536994075286e-06, + "loss": 0.6406, + "step": 7392 + }, + { + "epoch": 1.1, + "grad_norm": 5.312360633505529, + "learning_rate": 1.8155978040179083e-06, + "loss": 0.6947, + "step": 7393 + }, + { + "epoch": 1.1, + "grad_norm": 1.755330680774744, + "learning_rate": 1.8155419010162415e-06, + "loss": 0.6478, + "step": 7394 + }, + { + "epoch": 1.1, + "grad_norm": 11.35350342414453, + "learning_rate": 1.8154859904030499e-06, + "loss": 0.7194, + "step": 7395 + }, + { + "epoch": 1.1, + "grad_norm": 2.4054499009692765, + "learning_rate": 1.8154300721788555e-06, + "loss": 0.6758, + "step": 7396 + }, + { + "epoch": 1.1, + "grad_norm": 1.68690345114268, + "learning_rate": 1.8153741463441802e-06, + "loss": 0.6471, + "step": 7397 + }, + { + "epoch": 1.1, + "grad_norm": 1.2409545132511568, + "learning_rate": 1.8153182128995458e-06, + "loss": 0.6589, + "step": 7398 + }, + { + "epoch": 1.1, + "grad_norm": 1.209366698127717, + "learning_rate": 1.8152622718454747e-06, + "loss": 0.7051, + "step": 7399 + }, + { + "epoch": 1.1, + "grad_norm": 2.6726476025321464, + "learning_rate": 1.8152063231824881e-06, + "loss": 0.6276, + "step": 7400 + }, + { + "epoch": 1.1, + "grad_norm": 0.9724992978372485, + "learning_rate": 1.8151503669111095e-06, + "loss": 0.7025, + "step": 7401 + }, + { + "epoch": 1.1, + "grad_norm": 1.246168017843598, + "learning_rate": 1.81509440303186e-06, + "loss": 0.6849, + "step": 7402 + }, + { + "epoch": 1.1, + "grad_norm": 2.0657024843777365, + "learning_rate": 1.8150384315452624e-06, + "loss": 0.6543, + "step": 7403 + }, + { + "epoch": 1.1, + "grad_norm": 1.2156204417498613, + "learning_rate": 1.8149824524518392e-06, + "loss": 0.7318, + "step": 7404 + }, + { + "epoch": 1.1, + "grad_norm": 1.6976943517183212, + "learning_rate": 1.8149264657521125e-06, + "loss": 0.6374, + "step": 7405 + }, + { + "epoch": 1.1, + "grad_norm": 4.220025297707143, + "learning_rate": 1.8148704714466052e-06, + "loss": 0.679, + "step": 7406 + }, + { + "epoch": 1.1, + "grad_norm": 7.297845838011607, + "learning_rate": 1.8148144695358397e-06, + "loss": 0.6634, + "step": 7407 + }, + { + "epoch": 1.1, + "grad_norm": 1.6374127442840511, + "learning_rate": 1.8147584600203386e-06, + "loss": 0.6816, + "step": 7408 + }, + { + "epoch": 1.1, + "grad_norm": 0.8232200118292949, + "learning_rate": 1.8147024429006247e-06, + "loss": 0.6595, + "step": 7409 + }, + { + "epoch": 1.11, + "grad_norm": 3.071467748720222, + "learning_rate": 1.8146464181772207e-06, + "loss": 0.6771, + "step": 7410 + }, + { + "epoch": 1.11, + "grad_norm": 3.306792533001335, + "learning_rate": 1.81459038585065e-06, + "loss": 0.6413, + "step": 7411 + }, + { + "epoch": 1.11, + "grad_norm": 2.9861320149313504, + "learning_rate": 1.814534345921435e-06, + "loss": 0.7038, + "step": 7412 + }, + { + "epoch": 1.11, + "grad_norm": 6.047620720223633, + "learning_rate": 1.8144782983900991e-06, + "loss": 0.6816, + "step": 7413 + }, + { + "epoch": 1.11, + "grad_norm": 1.9051101911509063, + "learning_rate": 1.814422243257165e-06, + "loss": 0.6784, + "step": 7414 + }, + { + "epoch": 1.11, + "grad_norm": 0.783235014172375, + "learning_rate": 1.814366180523156e-06, + "loss": 0.6686, + "step": 7415 + }, + { + "epoch": 1.11, + "grad_norm": 0.7211469997115514, + "learning_rate": 1.8143101101885959e-06, + "loss": 0.6693, + "step": 7416 + }, + { + "epoch": 1.11, + "grad_norm": 4.211111660702086, + "learning_rate": 1.814254032254007e-06, + "loss": 0.6615, + "step": 7417 + }, + { + "epoch": 1.11, + "grad_norm": 4.501408604671979, + "learning_rate": 1.8141979467199135e-06, + "loss": 0.6647, + "step": 7418 + }, + { + "epoch": 1.11, + "grad_norm": 3.7506102500558174, + "learning_rate": 1.8141418535868384e-06, + "loss": 0.6595, + "step": 7419 + }, + { + "epoch": 1.11, + "grad_norm": 2.854502921454748, + "learning_rate": 1.8140857528553056e-06, + "loss": 0.7038, + "step": 7420 + }, + { + "epoch": 1.11, + "grad_norm": 4.795361278944093, + "learning_rate": 1.8140296445258381e-06, + "loss": 0.6947, + "step": 7421 + }, + { + "epoch": 1.11, + "grad_norm": 3.0318837588760323, + "learning_rate": 1.8139735285989603e-06, + "loss": 0.6751, + "step": 7422 + }, + { + "epoch": 1.11, + "grad_norm": 3.228939163648049, + "learning_rate": 1.8139174050751956e-06, + "loss": 0.6602, + "step": 7423 + }, + { + "epoch": 1.11, + "grad_norm": 2.6799274933346187, + "learning_rate": 1.8138612739550675e-06, + "loss": 0.6764, + "step": 7424 + }, + { + "epoch": 1.11, + "grad_norm": 1.2009593439566362, + "learning_rate": 1.8138051352391002e-06, + "loss": 0.6667, + "step": 7425 + }, + { + "epoch": 1.11, + "grad_norm": 3.539075810977762, + "learning_rate": 1.8137489889278178e-06, + "loss": 0.7044, + "step": 7426 + }, + { + "epoch": 1.11, + "grad_norm": 3.8631749442845145, + "learning_rate": 1.8136928350217438e-06, + "loss": 0.6992, + "step": 7427 + }, + { + "epoch": 1.11, + "grad_norm": 1.594575398297396, + "learning_rate": 1.8136366735214028e-06, + "loss": 0.681, + "step": 7428 + }, + { + "epoch": 1.11, + "grad_norm": 2.074182313175987, + "learning_rate": 1.813580504427319e-06, + "loss": 0.6862, + "step": 7429 + }, + { + "epoch": 1.11, + "grad_norm": 2.874297670247067, + "learning_rate": 1.813524327740016e-06, + "loss": 0.6543, + "step": 7430 + }, + { + "epoch": 1.11, + "grad_norm": 0.7862888374147761, + "learning_rate": 1.8134681434600188e-06, + "loss": 0.6908, + "step": 7431 + }, + { + "epoch": 1.11, + "grad_norm": 7.559298502436818, + "learning_rate": 1.8134119515878512e-06, + "loss": 0.6784, + "step": 7432 + }, + { + "epoch": 1.11, + "grad_norm": 3.154031563479754, + "learning_rate": 1.813355752124038e-06, + "loss": 0.6595, + "step": 7433 + }, + { + "epoch": 1.11, + "grad_norm": 0.647130323849457, + "learning_rate": 1.8132995450691037e-06, + "loss": 0.6823, + "step": 7434 + }, + { + "epoch": 1.11, + "grad_norm": 1.044757567970662, + "learning_rate": 1.8132433304235731e-06, + "loss": 0.653, + "step": 7435 + }, + { + "epoch": 1.11, + "grad_norm": 2.738758878940332, + "learning_rate": 1.8131871081879703e-06, + "loss": 0.6868, + "step": 7436 + }, + { + "epoch": 1.11, + "grad_norm": 2.2970145628911456, + "learning_rate": 1.8131308783628203e-06, + "loss": 0.6706, + "step": 7437 + }, + { + "epoch": 1.11, + "grad_norm": 3.589299039594592, + "learning_rate": 1.8130746409486479e-06, + "loss": 0.6738, + "step": 7438 + }, + { + "epoch": 1.11, + "grad_norm": 1.887556779427573, + "learning_rate": 1.8130183959459782e-06, + "loss": 0.6608, + "step": 7439 + }, + { + "epoch": 1.11, + "grad_norm": 7.171406718979038, + "learning_rate": 1.8129621433553355e-06, + "loss": 0.7012, + "step": 7440 + }, + { + "epoch": 1.11, + "grad_norm": 2.0479817299387615, + "learning_rate": 1.8129058831772456e-06, + "loss": 0.6771, + "step": 7441 + }, + { + "epoch": 1.11, + "grad_norm": 2.3350695161433777, + "learning_rate": 1.812849615412233e-06, + "loss": 0.6908, + "step": 7442 + }, + { + "epoch": 1.11, + "grad_norm": 4.109990288868045, + "learning_rate": 1.8127933400608231e-06, + "loss": 0.6712, + "step": 7443 + }, + { + "epoch": 1.11, + "grad_norm": 5.022849046992747, + "learning_rate": 1.812737057123541e-06, + "loss": 0.6673, + "step": 7444 + }, + { + "epoch": 1.11, + "grad_norm": 1.2017391629570933, + "learning_rate": 1.812680766600912e-06, + "loss": 0.6758, + "step": 7445 + }, + { + "epoch": 1.11, + "grad_norm": 5.822878484226666, + "learning_rate": 1.8126244684934618e-06, + "loss": 0.6719, + "step": 7446 + }, + { + "epoch": 1.11, + "grad_norm": 2.280064777244867, + "learning_rate": 1.8125681628017153e-06, + "loss": 0.6589, + "step": 7447 + }, + { + "epoch": 1.11, + "grad_norm": 0.9139561040483585, + "learning_rate": 1.8125118495261984e-06, + "loss": 0.6745, + "step": 7448 + }, + { + "epoch": 1.11, + "grad_norm": 4.084579442532635, + "learning_rate": 1.8124555286674368e-06, + "loss": 0.6986, + "step": 7449 + }, + { + "epoch": 1.11, + "grad_norm": 3.5210945476900135, + "learning_rate": 1.8123992002259556e-06, + "loss": 0.709, + "step": 7450 + }, + { + "epoch": 1.11, + "grad_norm": 7.164327252449194, + "learning_rate": 1.8123428642022808e-06, + "loss": 0.7148, + "step": 7451 + }, + { + "epoch": 1.11, + "grad_norm": 0.9361740831176277, + "learning_rate": 1.8122865205969384e-06, + "loss": 0.6582, + "step": 7452 + }, + { + "epoch": 1.11, + "grad_norm": 4.317850487711159, + "learning_rate": 1.812230169410454e-06, + "loss": 0.6921, + "step": 7453 + }, + { + "epoch": 1.11, + "grad_norm": 3.386036967297382, + "learning_rate": 1.8121738106433536e-06, + "loss": 0.6842, + "step": 7454 + }, + { + "epoch": 1.11, + "grad_norm": 4.063789407138934, + "learning_rate": 1.8121174442961631e-06, + "loss": 0.6693, + "step": 7455 + }, + { + "epoch": 1.11, + "grad_norm": 0.8900427268545426, + "learning_rate": 1.8120610703694089e-06, + "loss": 0.6751, + "step": 7456 + }, + { + "epoch": 1.11, + "grad_norm": 3.5727328616383436, + "learning_rate": 1.8120046888636166e-06, + "loss": 0.6589, + "step": 7457 + }, + { + "epoch": 1.11, + "grad_norm": 1.4590570848815785, + "learning_rate": 1.811948299779313e-06, + "loss": 0.666, + "step": 7458 + }, + { + "epoch": 1.11, + "grad_norm": 2.3519682754377125, + "learning_rate": 1.8118919031170239e-06, + "loss": 0.6699, + "step": 7459 + }, + { + "epoch": 1.11, + "grad_norm": 7.002936151289689, + "learning_rate": 1.8118354988772762e-06, + "loss": 0.6855, + "step": 7460 + }, + { + "epoch": 1.11, + "grad_norm": 1.6817965871287766, + "learning_rate": 1.8117790870605955e-06, + "loss": 0.6732, + "step": 7461 + }, + { + "epoch": 1.11, + "grad_norm": 0.8578749115726428, + "learning_rate": 1.8117226676675093e-06, + "loss": 0.6641, + "step": 7462 + }, + { + "epoch": 1.11, + "grad_norm": 0.8394723975080728, + "learning_rate": 1.8116662406985433e-06, + "loss": 0.6699, + "step": 7463 + }, + { + "epoch": 1.11, + "grad_norm": 0.9692659444234067, + "learning_rate": 1.8116098061542245e-06, + "loss": 0.6562, + "step": 7464 + }, + { + "epoch": 1.11, + "grad_norm": 2.6630423928603166, + "learning_rate": 1.8115533640350798e-06, + "loss": 0.6764, + "step": 7465 + }, + { + "epoch": 1.11, + "grad_norm": 1.0906300264421864, + "learning_rate": 1.8114969143416357e-06, + "loss": 0.6797, + "step": 7466 + }, + { + "epoch": 1.11, + "grad_norm": 0.7001420991698185, + "learning_rate": 1.8114404570744193e-06, + "loss": 0.6823, + "step": 7467 + }, + { + "epoch": 1.11, + "grad_norm": 1.140641470927915, + "learning_rate": 1.811383992233957e-06, + "loss": 0.6732, + "step": 7468 + }, + { + "epoch": 1.11, + "grad_norm": 0.6957741612706475, + "learning_rate": 1.8113275198207765e-06, + "loss": 0.6693, + "step": 7469 + }, + { + "epoch": 1.11, + "grad_norm": 2.006250481069956, + "learning_rate": 1.8112710398354042e-06, + "loss": 0.6647, + "step": 7470 + }, + { + "epoch": 1.11, + "grad_norm": 2.374998924345669, + "learning_rate": 1.8112145522783674e-06, + "loss": 0.679, + "step": 7471 + }, + { + "epoch": 1.11, + "grad_norm": 1.4614748328545484, + "learning_rate": 1.811158057150194e-06, + "loss": 0.7077, + "step": 7472 + }, + { + "epoch": 1.11, + "grad_norm": 1.8188884610326317, + "learning_rate": 1.8111015544514104e-06, + "loss": 0.6797, + "step": 7473 + }, + { + "epoch": 1.11, + "grad_norm": 2.8168562722041752, + "learning_rate": 1.8110450441825443e-06, + "loss": 0.6797, + "step": 7474 + }, + { + "epoch": 1.11, + "grad_norm": 1.3744881897208594, + "learning_rate": 1.8109885263441232e-06, + "loss": 0.6855, + "step": 7475 + }, + { + "epoch": 1.11, + "grad_norm": 3.1591363163093504, + "learning_rate": 1.8109320009366742e-06, + "loss": 0.6816, + "step": 7476 + }, + { + "epoch": 1.12, + "grad_norm": 6.188685326346386, + "learning_rate": 1.8108754679607255e-06, + "loss": 0.6777, + "step": 7477 + }, + { + "epoch": 1.12, + "grad_norm": 6.129483610433367, + "learning_rate": 1.810818927416804e-06, + "loss": 0.6979, + "step": 7478 + }, + { + "epoch": 1.12, + "grad_norm": 2.2074322934501134, + "learning_rate": 1.8107623793054376e-06, + "loss": 0.6888, + "step": 7479 + }, + { + "epoch": 1.12, + "grad_norm": 1.50012326733529, + "learning_rate": 1.8107058236271546e-06, + "loss": 0.6576, + "step": 7480 + }, + { + "epoch": 1.12, + "grad_norm": 0.6885374358271288, + "learning_rate": 1.8106492603824824e-06, + "loss": 0.6543, + "step": 7481 + }, + { + "epoch": 1.12, + "grad_norm": 3.17685301066144, + "learning_rate": 1.8105926895719488e-06, + "loss": 0.6387, + "step": 7482 + }, + { + "epoch": 1.12, + "grad_norm": 1.7838102199623187, + "learning_rate": 1.8105361111960819e-06, + "loss": 0.6413, + "step": 7483 + }, + { + "epoch": 1.12, + "grad_norm": 0.635143733535203, + "learning_rate": 1.81047952525541e-06, + "loss": 0.6797, + "step": 7484 + }, + { + "epoch": 1.12, + "grad_norm": 0.8647866344980731, + "learning_rate": 1.8104229317504612e-06, + "loss": 0.6862, + "step": 7485 + }, + { + "epoch": 1.12, + "grad_norm": 1.3076419574708238, + "learning_rate": 1.810366330681763e-06, + "loss": 0.707, + "step": 7486 + }, + { + "epoch": 1.12, + "grad_norm": 4.326463229573156, + "learning_rate": 1.8103097220498445e-06, + "loss": 0.6719, + "step": 7487 + }, + { + "epoch": 1.12, + "grad_norm": 5.0644750783504575, + "learning_rate": 1.8102531058552338e-06, + "loss": 0.6738, + "step": 7488 + }, + { + "epoch": 1.12, + "grad_norm": 0.8610392402938268, + "learning_rate": 1.8101964820984587e-06, + "loss": 0.6621, + "step": 7489 + }, + { + "epoch": 1.12, + "grad_norm": 2.1237346262859065, + "learning_rate": 1.8101398507800485e-06, + "loss": 0.6589, + "step": 7490 + }, + { + "epoch": 1.12, + "grad_norm": 4.84637333903065, + "learning_rate": 1.8100832119005314e-06, + "loss": 0.6947, + "step": 7491 + }, + { + "epoch": 1.12, + "grad_norm": 3.5983043534674333, + "learning_rate": 1.8100265654604363e-06, + "loss": 0.6751, + "step": 7492 + }, + { + "epoch": 1.12, + "grad_norm": 2.770662429888964, + "learning_rate": 1.8099699114602913e-06, + "loss": 0.6901, + "step": 7493 + }, + { + "epoch": 1.12, + "grad_norm": 3.147796203883097, + "learning_rate": 1.8099132499006255e-06, + "loss": 0.6608, + "step": 7494 + }, + { + "epoch": 1.12, + "grad_norm": 3.5235746980898215, + "learning_rate": 1.809856580781968e-06, + "loss": 0.6966, + "step": 7495 + }, + { + "epoch": 1.12, + "grad_norm": 2.3028747386039123, + "learning_rate": 1.8097999041048473e-06, + "loss": 0.668, + "step": 7496 + }, + { + "epoch": 1.12, + "grad_norm": 0.8327062366379931, + "learning_rate": 1.809743219869792e-06, + "loss": 0.668, + "step": 7497 + }, + { + "epoch": 1.12, + "grad_norm": 1.0063478943933934, + "learning_rate": 1.809686528077332e-06, + "loss": 0.6667, + "step": 7498 + }, + { + "epoch": 1.12, + "grad_norm": 1.8835783369446857, + "learning_rate": 1.8096298287279962e-06, + "loss": 0.7038, + "step": 7499 + }, + { + "epoch": 1.12, + "grad_norm": 1.1117979636322437, + "learning_rate": 1.8095731218223136e-06, + "loss": 0.696, + "step": 7500 + }, + { + "epoch": 1.12, + "grad_norm": 1.843536497705405, + "learning_rate": 1.8095164073608133e-06, + "loss": 0.6986, + "step": 7501 + }, + { + "epoch": 1.12, + "grad_norm": 2.8691561988045935, + "learning_rate": 1.809459685344025e-06, + "loss": 0.6693, + "step": 7502 + }, + { + "epoch": 1.12, + "grad_norm": 4.3558886598279996, + "learning_rate": 1.8094029557724775e-06, + "loss": 0.6641, + "step": 7503 + }, + { + "epoch": 1.12, + "grad_norm": 5.2727315181331615, + "learning_rate": 1.8093462186467007e-06, + "loss": 0.7161, + "step": 7504 + }, + { + "epoch": 1.12, + "grad_norm": 5.002513491050286, + "learning_rate": 1.8092894739672242e-06, + "loss": 0.6764, + "step": 7505 + }, + { + "epoch": 1.12, + "grad_norm": 3.786840256765564, + "learning_rate": 1.8092327217345774e-06, + "loss": 0.6921, + "step": 7506 + }, + { + "epoch": 1.12, + "grad_norm": 0.6146775006896171, + "learning_rate": 1.80917596194929e-06, + "loss": 0.6823, + "step": 7507 + }, + { + "epoch": 1.12, + "grad_norm": 4.103851086018532, + "learning_rate": 1.809119194611892e-06, + "loss": 0.6875, + "step": 7508 + }, + { + "epoch": 1.12, + "grad_norm": 0.5061712704456871, + "learning_rate": 1.8090624197229127e-06, + "loss": 0.6745, + "step": 7509 + }, + { + "epoch": 1.12, + "grad_norm": 0.9693285248173811, + "learning_rate": 1.8090056372828827e-06, + "loss": 0.6927, + "step": 7510 + }, + { + "epoch": 1.12, + "grad_norm": 4.865292122453303, + "learning_rate": 1.8089488472923314e-06, + "loss": 0.7116, + "step": 7511 + }, + { + "epoch": 1.12, + "grad_norm": 3.0181438895280626, + "learning_rate": 1.8088920497517886e-06, + "loss": 0.6686, + "step": 7512 + }, + { + "epoch": 1.12, + "grad_norm": 1.228773634596141, + "learning_rate": 1.8088352446617852e-06, + "loss": 0.6875, + "step": 7513 + }, + { + "epoch": 1.12, + "grad_norm": 0.746568875911184, + "learning_rate": 1.8087784320228507e-06, + "loss": 0.679, + "step": 7514 + }, + { + "epoch": 1.12, + "grad_norm": 0.4726451854121043, + "learning_rate": 1.8087216118355157e-06, + "loss": 0.6927, + "step": 7515 + }, + { + "epoch": 1.12, + "grad_norm": 1.2567364288555065, + "learning_rate": 1.8086647841003102e-06, + "loss": 0.6549, + "step": 7516 + }, + { + "epoch": 1.12, + "grad_norm": 3.227911099152864, + "learning_rate": 1.808607948817765e-06, + "loss": 0.6908, + "step": 7517 + }, + { + "epoch": 1.12, + "grad_norm": 2.174689050145776, + "learning_rate": 1.80855110598841e-06, + "loss": 0.6882, + "step": 7518 + }, + { + "epoch": 1.12, + "grad_norm": 5.480781010628385, + "learning_rate": 1.8084942556127762e-06, + "loss": 0.6797, + "step": 7519 + }, + { + "epoch": 1.12, + "grad_norm": 1.8322838469324212, + "learning_rate": 1.808437397691394e-06, + "loss": 0.6947, + "step": 7520 + }, + { + "epoch": 1.12, + "grad_norm": 5.14145841750839, + "learning_rate": 1.8083805322247943e-06, + "loss": 0.6797, + "step": 7521 + }, + { + "epoch": 1.12, + "grad_norm": 4.690988987277324, + "learning_rate": 1.8083236592135074e-06, + "loss": 0.6888, + "step": 7522 + }, + { + "epoch": 1.12, + "grad_norm": 0.8431280863966504, + "learning_rate": 1.808266778658064e-06, + "loss": 0.6823, + "step": 7523 + }, + { + "epoch": 1.12, + "grad_norm": 1.8229386395718508, + "learning_rate": 1.8082098905589956e-06, + "loss": 0.6771, + "step": 7524 + }, + { + "epoch": 1.12, + "grad_norm": 2.4838792494698287, + "learning_rate": 1.808152994916833e-06, + "loss": 0.6634, + "step": 7525 + }, + { + "epoch": 1.12, + "grad_norm": 1.5838489007897552, + "learning_rate": 1.8080960917321067e-06, + "loss": 0.6738, + "step": 7526 + }, + { + "epoch": 1.12, + "grad_norm": 0.7101960323037394, + "learning_rate": 1.8080391810053482e-06, + "loss": 0.679, + "step": 7527 + }, + { + "epoch": 1.12, + "grad_norm": 0.5264364370657599, + "learning_rate": 1.8079822627370886e-06, + "loss": 0.6699, + "step": 7528 + }, + { + "epoch": 1.12, + "grad_norm": 6.148951034759593, + "learning_rate": 1.807925336927859e-06, + "loss": 0.6699, + "step": 7529 + }, + { + "epoch": 1.12, + "grad_norm": 5.333525885722905, + "learning_rate": 1.8078684035781908e-06, + "loss": 0.6725, + "step": 7530 + }, + { + "epoch": 1.12, + "grad_norm": 5.603991033669679, + "learning_rate": 1.8078114626886155e-06, + "loss": 0.668, + "step": 7531 + }, + { + "epoch": 1.12, + "grad_norm": 2.437984005609779, + "learning_rate": 1.8077545142596641e-06, + "loss": 0.6758, + "step": 7532 + }, + { + "epoch": 1.12, + "grad_norm": 1.9099102754128967, + "learning_rate": 1.8076975582918688e-06, + "loss": 0.6562, + "step": 7533 + }, + { + "epoch": 1.12, + "grad_norm": 0.9225076332537238, + "learning_rate": 1.8076405947857604e-06, + "loss": 0.666, + "step": 7534 + }, + { + "epoch": 1.12, + "grad_norm": 3.609217843738097, + "learning_rate": 1.8075836237418711e-06, + "loss": 0.6986, + "step": 7535 + }, + { + "epoch": 1.12, + "grad_norm": 2.131980565189938, + "learning_rate": 1.8075266451607326e-06, + "loss": 0.6764, + "step": 7536 + }, + { + "epoch": 1.12, + "grad_norm": 1.083496758177941, + "learning_rate": 1.807469659042876e-06, + "loss": 0.7181, + "step": 7537 + }, + { + "epoch": 1.12, + "grad_norm": 0.607767993055828, + "learning_rate": 1.807412665388834e-06, + "loss": 0.6634, + "step": 7538 + }, + { + "epoch": 1.12, + "grad_norm": 1.5955013008143215, + "learning_rate": 1.807355664199138e-06, + "loss": 0.6855, + "step": 7539 + }, + { + "epoch": 1.12, + "grad_norm": 2.929734350664788, + "learning_rate": 1.8072986554743204e-06, + "loss": 0.6712, + "step": 7540 + }, + { + "epoch": 1.12, + "grad_norm": 3.377912137628293, + "learning_rate": 1.8072416392149128e-06, + "loss": 0.6947, + "step": 7541 + }, + { + "epoch": 1.12, + "grad_norm": 0.9136570524474887, + "learning_rate": 1.8071846154214478e-06, + "loss": 0.6693, + "step": 7542 + }, + { + "epoch": 1.12, + "grad_norm": 3.137962851604011, + "learning_rate": 1.8071275840944574e-06, + "loss": 0.6921, + "step": 7543 + }, + { + "epoch": 1.13, + "grad_norm": 1.97197551177776, + "learning_rate": 1.807070545234474e-06, + "loss": 0.6471, + "step": 7544 + }, + { + "epoch": 1.13, + "grad_norm": 0.935983874036902, + "learning_rate": 1.8070134988420294e-06, + "loss": 0.6647, + "step": 7545 + }, + { + "epoch": 1.13, + "grad_norm": 2.434261854365477, + "learning_rate": 1.8069564449176569e-06, + "loss": 0.6751, + "step": 7546 + }, + { + "epoch": 1.13, + "grad_norm": 1.222903349115411, + "learning_rate": 1.8068993834618881e-06, + "loss": 0.6589, + "step": 7547 + }, + { + "epoch": 1.13, + "grad_norm": 1.898999451098594, + "learning_rate": 1.8068423144752562e-06, + "loss": 0.6504, + "step": 7548 + }, + { + "epoch": 1.13, + "grad_norm": 1.669774827320959, + "learning_rate": 1.8067852379582938e-06, + "loss": 0.6927, + "step": 7549 + }, + { + "epoch": 1.13, + "grad_norm": 1.606608500213705, + "learning_rate": 1.8067281539115334e-06, + "loss": 0.6914, + "step": 7550 + }, + { + "epoch": 1.13, + "grad_norm": 1.2849575857037947, + "learning_rate": 1.8066710623355075e-06, + "loss": 0.6549, + "step": 7551 + }, + { + "epoch": 1.13, + "grad_norm": 1.195914983342289, + "learning_rate": 1.8066139632307493e-06, + "loss": 0.6751, + "step": 7552 + }, + { + "epoch": 1.13, + "grad_norm": 2.251206976640443, + "learning_rate": 1.8065568565977918e-06, + "loss": 0.6784, + "step": 7553 + }, + { + "epoch": 1.13, + "grad_norm": 2.6825415500575036, + "learning_rate": 1.8064997424371678e-06, + "loss": 0.6882, + "step": 7554 + }, + { + "epoch": 1.13, + "grad_norm": 3.147696510597642, + "learning_rate": 1.8064426207494105e-06, + "loss": 0.6419, + "step": 7555 + }, + { + "epoch": 1.13, + "grad_norm": 2.7200530733070782, + "learning_rate": 1.8063854915350527e-06, + "loss": 0.6719, + "step": 7556 + }, + { + "epoch": 1.13, + "grad_norm": 0.8209595675097672, + "learning_rate": 1.806328354794628e-06, + "loss": 0.6868, + "step": 7557 + }, + { + "epoch": 1.13, + "grad_norm": 3.201186403860847, + "learning_rate": 1.8062712105286692e-06, + "loss": 0.6608, + "step": 7558 + }, + { + "epoch": 1.13, + "grad_norm": 4.979132480935657, + "learning_rate": 1.80621405873771e-06, + "loss": 0.6797, + "step": 7559 + }, + { + "epoch": 1.13, + "grad_norm": 1.088505957815975, + "learning_rate": 1.8061568994222838e-06, + "loss": 0.6589, + "step": 7560 + }, + { + "epoch": 1.13, + "grad_norm": 6.049429410140051, + "learning_rate": 1.8060997325829237e-06, + "loss": 0.694, + "step": 7561 + }, + { + "epoch": 1.13, + "grad_norm": 2.386192162871459, + "learning_rate": 1.8060425582201637e-06, + "loss": 0.679, + "step": 7562 + }, + { + "epoch": 1.13, + "grad_norm": 1.1252765526389015, + "learning_rate": 1.8059853763345374e-06, + "loss": 0.6673, + "step": 7563 + }, + { + "epoch": 1.13, + "grad_norm": 7.273206086870948, + "learning_rate": 1.805928186926578e-06, + "loss": 0.6797, + "step": 7564 + }, + { + "epoch": 1.13, + "grad_norm": 1.5946057621225835, + "learning_rate": 1.8058709899968196e-06, + "loss": 0.6842, + "step": 7565 + }, + { + "epoch": 1.13, + "grad_norm": 3.665616878193305, + "learning_rate": 1.805813785545796e-06, + "loss": 0.6758, + "step": 7566 + }, + { + "epoch": 1.13, + "grad_norm": 1.6577070922644357, + "learning_rate": 1.8057565735740413e-06, + "loss": 0.7181, + "step": 7567 + }, + { + "epoch": 1.13, + "grad_norm": 0.8365990366036306, + "learning_rate": 1.805699354082089e-06, + "loss": 0.6921, + "step": 7568 + }, + { + "epoch": 1.13, + "grad_norm": 6.25919253816712, + "learning_rate": 1.8056421270704733e-06, + "loss": 0.6745, + "step": 7569 + }, + { + "epoch": 1.13, + "grad_norm": 0.9614005393781139, + "learning_rate": 1.8055848925397285e-06, + "loss": 0.6803, + "step": 7570 + }, + { + "epoch": 1.13, + "grad_norm": 3.8710553908608563, + "learning_rate": 1.8055276504903887e-06, + "loss": 0.681, + "step": 7571 + }, + { + "epoch": 1.13, + "grad_norm": 5.085116015438871, + "learning_rate": 1.8054704009229882e-06, + "loss": 0.6517, + "step": 7572 + }, + { + "epoch": 1.13, + "grad_norm": 1.9629052290813893, + "learning_rate": 1.805413143838061e-06, + "loss": 0.6797, + "step": 7573 + }, + { + "epoch": 1.13, + "grad_norm": 1.5520019394998228, + "learning_rate": 1.8053558792361418e-06, + "loss": 0.6895, + "step": 7574 + }, + { + "epoch": 1.13, + "grad_norm": 5.25787238732106, + "learning_rate": 1.805298607117765e-06, + "loss": 0.6849, + "step": 7575 + }, + { + "epoch": 1.13, + "grad_norm": 3.85942339000292, + "learning_rate": 1.805241327483465e-06, + "loss": 0.6771, + "step": 7576 + }, + { + "epoch": 1.13, + "grad_norm": 4.1769526921093, + "learning_rate": 1.8051840403337763e-06, + "loss": 0.6764, + "step": 7577 + }, + { + "epoch": 1.13, + "grad_norm": 3.1661644450370683, + "learning_rate": 1.8051267456692342e-06, + "loss": 0.6855, + "step": 7578 + }, + { + "epoch": 1.13, + "grad_norm": 4.062424552757287, + "learning_rate": 1.8050694434903729e-06, + "loss": 0.6882, + "step": 7579 + }, + { + "epoch": 1.13, + "grad_norm": 5.078536836145203, + "learning_rate": 1.8050121337977269e-06, + "loss": 0.7038, + "step": 7580 + }, + { + "epoch": 1.13, + "grad_norm": 3.2800640099616403, + "learning_rate": 1.8049548165918317e-06, + "loss": 0.7103, + "step": 7581 + }, + { + "epoch": 1.13, + "grad_norm": 4.710676898316017, + "learning_rate": 1.8048974918732221e-06, + "loss": 0.6706, + "step": 7582 + }, + { + "epoch": 1.13, + "grad_norm": 0.5475756222666774, + "learning_rate": 1.804840159642433e-06, + "loss": 0.6738, + "step": 7583 + }, + { + "epoch": 1.13, + "grad_norm": 1.5592494140925217, + "learning_rate": 1.8047828198999993e-06, + "loss": 0.6751, + "step": 7584 + }, + { + "epoch": 1.13, + "grad_norm": 2.721883561216686, + "learning_rate": 1.8047254726464565e-06, + "loss": 0.6882, + "step": 7585 + }, + { + "epoch": 1.13, + "grad_norm": 3.668818341949005, + "learning_rate": 1.8046681178823397e-06, + "loss": 0.6992, + "step": 7586 + }, + { + "epoch": 1.13, + "grad_norm": 4.233536348011961, + "learning_rate": 1.8046107556081843e-06, + "loss": 0.6673, + "step": 7587 + }, + { + "epoch": 1.13, + "grad_norm": 3.517986593718916, + "learning_rate": 1.8045533858245254e-06, + "loss": 0.6784, + "step": 7588 + }, + { + "epoch": 1.13, + "grad_norm": 4.293488914623266, + "learning_rate": 1.8044960085318985e-06, + "loss": 0.6862, + "step": 7589 + }, + { + "epoch": 1.13, + "grad_norm": 2.0222847671825352, + "learning_rate": 1.8044386237308396e-06, + "loss": 0.681, + "step": 7590 + }, + { + "epoch": 1.13, + "grad_norm": 1.661788749768422, + "learning_rate": 1.8043812314218836e-06, + "loss": 0.6882, + "step": 7591 + }, + { + "epoch": 1.13, + "grad_norm": 5.78896236758053, + "learning_rate": 1.8043238316055665e-06, + "loss": 0.6654, + "step": 7592 + }, + { + "epoch": 1.13, + "grad_norm": 3.088842327490802, + "learning_rate": 1.8042664242824239e-06, + "loss": 0.6712, + "step": 7593 + }, + { + "epoch": 1.13, + "grad_norm": 1.1995852199074506, + "learning_rate": 1.8042090094529915e-06, + "loss": 0.6914, + "step": 7594 + }, + { + "epoch": 1.13, + "grad_norm": 6.769872195500121, + "learning_rate": 1.8041515871178054e-06, + "loss": 0.6803, + "step": 7595 + }, + { + "epoch": 1.13, + "grad_norm": 0.8050425109111234, + "learning_rate": 1.8040941572774017e-06, + "loss": 0.6764, + "step": 7596 + }, + { + "epoch": 1.13, + "grad_norm": 3.6555225107403007, + "learning_rate": 1.804036719932316e-06, + "loss": 0.6888, + "step": 7597 + }, + { + "epoch": 1.13, + "grad_norm": 0.8959446175575153, + "learning_rate": 1.8039792750830843e-06, + "loss": 0.6849, + "step": 7598 + }, + { + "epoch": 1.13, + "grad_norm": 3.673662521236642, + "learning_rate": 1.803921822730243e-06, + "loss": 0.6849, + "step": 7599 + }, + { + "epoch": 1.13, + "grad_norm": 0.8386573954097687, + "learning_rate": 1.8038643628743283e-06, + "loss": 0.6901, + "step": 7600 + }, + { + "epoch": 1.13, + "grad_norm": 1.8105993441026929, + "learning_rate": 1.8038068955158762e-06, + "loss": 0.696, + "step": 7601 + }, + { + "epoch": 1.13, + "grad_norm": 0.9852521147196218, + "learning_rate": 1.8037494206554236e-06, + "loss": 0.6855, + "step": 7602 + }, + { + "epoch": 1.13, + "grad_norm": 4.717775374514062, + "learning_rate": 1.8036919382935062e-06, + "loss": 0.6784, + "step": 7603 + }, + { + "epoch": 1.13, + "grad_norm": 0.7227757234534503, + "learning_rate": 1.8036344484306612e-06, + "loss": 0.6582, + "step": 7604 + }, + { + "epoch": 1.13, + "grad_norm": 2.306418348991347, + "learning_rate": 1.8035769510674248e-06, + "loss": 0.6725, + "step": 7605 + }, + { + "epoch": 1.13, + "grad_norm": 2.118874478120104, + "learning_rate": 1.8035194462043336e-06, + "loss": 0.6953, + "step": 7606 + }, + { + "epoch": 1.13, + "grad_norm": 1.7554950055270702, + "learning_rate": 1.8034619338419241e-06, + "loss": 0.7181, + "step": 7607 + }, + { + "epoch": 1.13, + "grad_norm": 0.5475297103468596, + "learning_rate": 1.8034044139807337e-06, + "loss": 0.694, + "step": 7608 + }, + { + "epoch": 1.13, + "grad_norm": 0.7673883577884323, + "learning_rate": 1.8033468866212984e-06, + "loss": 0.6719, + "step": 7609 + }, + { + "epoch": 1.13, + "grad_norm": 4.566406176004526, + "learning_rate": 1.8032893517641558e-06, + "loss": 0.6921, + "step": 7610 + }, + { + "epoch": 1.14, + "grad_norm": 1.077974056531398, + "learning_rate": 1.8032318094098429e-06, + "loss": 0.6667, + "step": 7611 + }, + { + "epoch": 1.14, + "grad_norm": 1.827401908515998, + "learning_rate": 1.8031742595588961e-06, + "loss": 0.6712, + "step": 7612 + }, + { + "epoch": 1.14, + "grad_norm": 2.3069577166083164, + "learning_rate": 1.8031167022118533e-06, + "loss": 0.6803, + "step": 7613 + }, + { + "epoch": 1.14, + "grad_norm": 0.77605051462597, + "learning_rate": 1.8030591373692507e-06, + "loss": 0.6771, + "step": 7614 + }, + { + "epoch": 1.14, + "grad_norm": 2.16249480122612, + "learning_rate": 1.8030015650316265e-06, + "loss": 0.6901, + "step": 7615 + }, + { + "epoch": 1.14, + "grad_norm": 2.9233867419366804, + "learning_rate": 1.802943985199518e-06, + "loss": 0.6569, + "step": 7616 + }, + { + "epoch": 1.14, + "grad_norm": 2.72898237860487, + "learning_rate": 1.8028863978734617e-06, + "loss": 0.6777, + "step": 7617 + }, + { + "epoch": 1.14, + "grad_norm": 1.8820677478236003, + "learning_rate": 1.802828803053996e-06, + "loss": 0.6862, + "step": 7618 + }, + { + "epoch": 1.14, + "grad_norm": 2.873911713482087, + "learning_rate": 1.8027712007416579e-06, + "loss": 0.6829, + "step": 7619 + }, + { + "epoch": 1.14, + "grad_norm": 1.8588987857386263, + "learning_rate": 1.8027135909369852e-06, + "loss": 0.6771, + "step": 7620 + }, + { + "epoch": 1.14, + "grad_norm": 1.4625633682853765, + "learning_rate": 1.8026559736405157e-06, + "loss": 0.6823, + "step": 7621 + }, + { + "epoch": 1.14, + "grad_norm": 2.0147163890899398, + "learning_rate": 1.8025983488527869e-06, + "loss": 0.6621, + "step": 7622 + }, + { + "epoch": 1.14, + "grad_norm": 1.70556788710403, + "learning_rate": 1.8025407165743366e-06, + "loss": 0.6686, + "step": 7623 + }, + { + "epoch": 1.14, + "grad_norm": 1.0528672405552248, + "learning_rate": 1.8024830768057034e-06, + "loss": 0.6849, + "step": 7624 + }, + { + "epoch": 1.14, + "grad_norm": 7.331674201040157, + "learning_rate": 1.8024254295474242e-06, + "loss": 0.696, + "step": 7625 + }, + { + "epoch": 1.14, + "grad_norm": 1.3041863542790884, + "learning_rate": 1.8023677748000377e-06, + "loss": 0.6725, + "step": 7626 + }, + { + "epoch": 1.14, + "grad_norm": 2.2488717514478656, + "learning_rate": 1.8023101125640815e-06, + "loss": 0.6693, + "step": 7627 + }, + { + "epoch": 1.14, + "grad_norm": 1.913000167363831, + "learning_rate": 1.8022524428400944e-06, + "loss": 0.7057, + "step": 7628 + }, + { + "epoch": 1.14, + "grad_norm": 1.9833764039891042, + "learning_rate": 1.8021947656286143e-06, + "loss": 0.6628, + "step": 7629 + }, + { + "epoch": 1.14, + "grad_norm": 0.6279920421410395, + "learning_rate": 1.802137080930179e-06, + "loss": 0.6576, + "step": 7630 + }, + { + "epoch": 1.14, + "grad_norm": 1.0905432897852878, + "learning_rate": 1.8020793887453279e-06, + "loss": 0.6947, + "step": 7631 + }, + { + "epoch": 1.14, + "grad_norm": 0.7284796469401006, + "learning_rate": 1.802021689074599e-06, + "loss": 0.6914, + "step": 7632 + }, + { + "epoch": 1.14, + "grad_norm": 1.6911146999000104, + "learning_rate": 1.8019639819185303e-06, + "loss": 0.6758, + "step": 7633 + }, + { + "epoch": 1.14, + "grad_norm": 2.5125293710240713, + "learning_rate": 1.801906267277661e-06, + "loss": 0.6706, + "step": 7634 + }, + { + "epoch": 1.14, + "grad_norm": 5.677544889074706, + "learning_rate": 1.8018485451525297e-06, + "loss": 0.6888, + "step": 7635 + }, + { + "epoch": 1.14, + "grad_norm": 1.4711265657761836, + "learning_rate": 1.8017908155436752e-06, + "loss": 0.6816, + "step": 7636 + }, + { + "epoch": 1.14, + "grad_norm": 1.1638968775460339, + "learning_rate": 1.8017330784516358e-06, + "loss": 0.6686, + "step": 7637 + }, + { + "epoch": 1.14, + "grad_norm": 0.7433601287423109, + "learning_rate": 1.8016753338769508e-06, + "loss": 0.653, + "step": 7638 + }, + { + "epoch": 1.14, + "grad_norm": 1.4943865008663848, + "learning_rate": 1.8016175818201589e-06, + "loss": 0.6732, + "step": 7639 + }, + { + "epoch": 1.14, + "grad_norm": 1.0633218252561578, + "learning_rate": 1.8015598222817994e-06, + "loss": 0.6816, + "step": 7640 + }, + { + "epoch": 1.14, + "grad_norm": 1.1564019377545627, + "learning_rate": 1.801502055262411e-06, + "loss": 0.6849, + "step": 7641 + }, + { + "epoch": 1.14, + "grad_norm": 1.3150928774295731, + "learning_rate": 1.8014442807625333e-06, + "loss": 0.6745, + "step": 7642 + }, + { + "epoch": 1.14, + "grad_norm": 1.8543607606667492, + "learning_rate": 1.8013864987827048e-06, + "loss": 0.6784, + "step": 7643 + }, + { + "epoch": 1.14, + "grad_norm": 0.6510217403630376, + "learning_rate": 1.8013287093234654e-06, + "loss": 0.6719, + "step": 7644 + }, + { + "epoch": 1.14, + "grad_norm": 1.815408691078686, + "learning_rate": 1.8012709123853546e-06, + "loss": 0.6582, + "step": 7645 + }, + { + "epoch": 1.14, + "grad_norm": 4.316483294696491, + "learning_rate": 1.8012131079689113e-06, + "loss": 0.6953, + "step": 7646 + }, + { + "epoch": 1.14, + "grad_norm": 3.4836833338591493, + "learning_rate": 1.8011552960746753e-06, + "loss": 0.6908, + "step": 7647 + }, + { + "epoch": 1.14, + "grad_norm": 3.7214655217295354, + "learning_rate": 1.801097476703186e-06, + "loss": 0.696, + "step": 7648 + }, + { + "epoch": 1.14, + "grad_norm": 1.3316668660724003, + "learning_rate": 1.8010396498549831e-06, + "loss": 0.6764, + "step": 7649 + }, + { + "epoch": 1.14, + "grad_norm": 1.7377984472200811, + "learning_rate": 1.8009818155306062e-06, + "loss": 0.6647, + "step": 7650 + }, + { + "epoch": 1.14, + "grad_norm": 0.7126061357460048, + "learning_rate": 1.8009239737305952e-06, + "loss": 0.668, + "step": 7651 + }, + { + "epoch": 1.14, + "grad_norm": 0.7044008972144451, + "learning_rate": 1.8008661244554904e-06, + "loss": 0.6647, + "step": 7652 + }, + { + "epoch": 1.14, + "grad_norm": 3.7321693127011963, + "learning_rate": 1.8008082677058307e-06, + "loss": 0.6895, + "step": 7653 + }, + { + "epoch": 1.14, + "grad_norm": 4.530402431481635, + "learning_rate": 1.8007504034821567e-06, + "loss": 0.6641, + "step": 7654 + }, + { + "epoch": 1.14, + "grad_norm": 3.0813244913599878, + "learning_rate": 1.8006925317850086e-06, + "loss": 0.6803, + "step": 7655 + }, + { + "epoch": 1.14, + "grad_norm": 0.6582225522940939, + "learning_rate": 1.800634652614926e-06, + "loss": 0.6693, + "step": 7656 + }, + { + "epoch": 1.14, + "grad_norm": 3.3074171232590674, + "learning_rate": 1.8005767659724494e-06, + "loss": 0.679, + "step": 7657 + }, + { + "epoch": 1.14, + "grad_norm": 0.6442741725402705, + "learning_rate": 1.8005188718581192e-06, + "loss": 0.6706, + "step": 7658 + }, + { + "epoch": 1.14, + "grad_norm": 1.7202350219814713, + "learning_rate": 1.8004609702724755e-06, + "loss": 0.6751, + "step": 7659 + }, + { + "epoch": 1.14, + "grad_norm": 1.6567683269219928, + "learning_rate": 1.800403061216059e-06, + "loss": 0.6914, + "step": 7660 + }, + { + "epoch": 1.14, + "grad_norm": 0.8174121777220171, + "learning_rate": 1.8003451446894097e-06, + "loss": 0.6667, + "step": 7661 + }, + { + "epoch": 1.14, + "grad_norm": 2.1604744201226627, + "learning_rate": 1.8002872206930684e-06, + "loss": 0.668, + "step": 7662 + }, + { + "epoch": 1.14, + "grad_norm": 2.1243566734021337, + "learning_rate": 1.8002292892275757e-06, + "loss": 0.6836, + "step": 7663 + }, + { + "epoch": 1.14, + "grad_norm": 1.8992691585918293, + "learning_rate": 1.8001713502934721e-06, + "loss": 0.668, + "step": 7664 + }, + { + "epoch": 1.14, + "grad_norm": 2.761678079785416, + "learning_rate": 1.800113403891299e-06, + "loss": 0.6764, + "step": 7665 + }, + { + "epoch": 1.14, + "grad_norm": 2.3645323917720185, + "learning_rate": 1.8000554500215963e-06, + "loss": 0.6901, + "step": 7666 + }, + { + "epoch": 1.14, + "grad_norm": 2.851037064712901, + "learning_rate": 1.7999974886849057e-06, + "loss": 0.679, + "step": 7667 + }, + { + "epoch": 1.14, + "grad_norm": 0.6460324426036594, + "learning_rate": 1.7999395198817675e-06, + "loss": 0.666, + "step": 7668 + }, + { + "epoch": 1.14, + "grad_norm": 2.536610200968378, + "learning_rate": 1.7998815436127233e-06, + "loss": 0.6712, + "step": 7669 + }, + { + "epoch": 1.14, + "grad_norm": 1.0337562998783156, + "learning_rate": 1.7998235598783136e-06, + "loss": 0.6882, + "step": 7670 + }, + { + "epoch": 1.14, + "grad_norm": 1.4379033526136926, + "learning_rate": 1.79976556867908e-06, + "loss": 0.679, + "step": 7671 + }, + { + "epoch": 1.14, + "grad_norm": 1.7639174020770754, + "learning_rate": 1.7997075700155636e-06, + "loss": 0.6673, + "step": 7672 + }, + { + "epoch": 1.14, + "grad_norm": 0.8602320386126858, + "learning_rate": 1.799649563888306e-06, + "loss": 0.6504, + "step": 7673 + }, + { + "epoch": 1.14, + "grad_norm": 3.2929180759072705, + "learning_rate": 1.7995915502978479e-06, + "loss": 0.6823, + "step": 7674 + }, + { + "epoch": 1.14, + "grad_norm": 0.8621906323296638, + "learning_rate": 1.7995335292447314e-06, + "loss": 0.6842, + "step": 7675 + }, + { + "epoch": 1.14, + "grad_norm": 2.4793209083607355, + "learning_rate": 1.7994755007294975e-06, + "loss": 0.6458, + "step": 7676 + }, + { + "epoch": 1.14, + "grad_norm": 3.240575538628971, + "learning_rate": 1.7994174647526882e-06, + "loss": 0.6803, + "step": 7677 + }, + { + "epoch": 1.15, + "grad_norm": 1.2764855203501082, + "learning_rate": 1.799359421314845e-06, + "loss": 0.7012, + "step": 7678 + }, + { + "epoch": 1.15, + "grad_norm": 3.058882128380825, + "learning_rate": 1.7993013704165096e-06, + "loss": 0.6914, + "step": 7679 + }, + { + "epoch": 1.15, + "grad_norm": 0.7090049233116296, + "learning_rate": 1.799243312058224e-06, + "loss": 0.6855, + "step": 7680 + }, + { + "epoch": 1.15, + "grad_norm": 0.902497994039548, + "learning_rate": 1.7991852462405297e-06, + "loss": 0.6719, + "step": 7681 + }, + { + "epoch": 1.15, + "grad_norm": 2.4244781386860668, + "learning_rate": 1.799127172963969e-06, + "loss": 0.694, + "step": 7682 + }, + { + "epoch": 1.15, + "grad_norm": 1.8463256752231572, + "learning_rate": 1.7990690922290836e-06, + "loss": 0.6556, + "step": 7683 + }, + { + "epoch": 1.15, + "grad_norm": 1.6420198310295337, + "learning_rate": 1.7990110040364158e-06, + "loss": 0.6641, + "step": 7684 + }, + { + "epoch": 1.15, + "grad_norm": 1.9040746937414061, + "learning_rate": 1.7989529083865074e-06, + "loss": 0.6504, + "step": 7685 + }, + { + "epoch": 1.15, + "grad_norm": 4.615386656898934, + "learning_rate": 1.7988948052799011e-06, + "loss": 0.6855, + "step": 7686 + }, + { + "epoch": 1.15, + "grad_norm": 2.5050257325620753, + "learning_rate": 1.7988366947171387e-06, + "loss": 0.6934, + "step": 7687 + }, + { + "epoch": 1.15, + "grad_norm": 0.7486647427762501, + "learning_rate": 1.7987785766987628e-06, + "loss": 0.6836, + "step": 7688 + }, + { + "epoch": 1.15, + "grad_norm": 0.8067099314376366, + "learning_rate": 1.798720451225316e-06, + "loss": 0.6855, + "step": 7689 + }, + { + "epoch": 1.15, + "grad_norm": 4.872677996591358, + "learning_rate": 1.7986623182973407e-06, + "loss": 0.6836, + "step": 7690 + }, + { + "epoch": 1.15, + "grad_norm": 3.5587365668727973, + "learning_rate": 1.7986041779153788e-06, + "loss": 0.6914, + "step": 7691 + }, + { + "epoch": 1.15, + "grad_norm": 7.92027012264793, + "learning_rate": 1.798546030079974e-06, + "loss": 0.6901, + "step": 7692 + }, + { + "epoch": 1.15, + "grad_norm": 8.20706187422276, + "learning_rate": 1.7984878747916686e-06, + "loss": 0.6602, + "step": 7693 + }, + { + "epoch": 1.15, + "grad_norm": 0.709565947254308, + "learning_rate": 1.7984297120510047e-06, + "loss": 0.6751, + "step": 7694 + }, + { + "epoch": 1.15, + "grad_norm": 0.6611301700096069, + "learning_rate": 1.798371541858526e-06, + "loss": 0.6836, + "step": 7695 + }, + { + "epoch": 1.15, + "grad_norm": 0.7730624850690595, + "learning_rate": 1.798313364214775e-06, + "loss": 0.6771, + "step": 7696 + }, + { + "epoch": 1.15, + "grad_norm": 1.2222696782636524, + "learning_rate": 1.798255179120295e-06, + "loss": 0.6641, + "step": 7697 + }, + { + "epoch": 1.15, + "grad_norm": 0.7854539244191486, + "learning_rate": 1.7981969865756286e-06, + "loss": 0.6621, + "step": 7698 + }, + { + "epoch": 1.15, + "grad_norm": 0.7641282138726972, + "learning_rate": 1.7981387865813192e-06, + "loss": 0.6758, + "step": 7699 + }, + { + "epoch": 1.15, + "grad_norm": 0.8238649219920461, + "learning_rate": 1.7980805791379098e-06, + "loss": 0.6634, + "step": 7700 + }, + { + "epoch": 1.15, + "grad_norm": 1.914251722630238, + "learning_rate": 1.798022364245944e-06, + "loss": 0.679, + "step": 7701 + }, + { + "epoch": 1.15, + "grad_norm": 3.8312808372179883, + "learning_rate": 1.7979641419059647e-06, + "loss": 0.6465, + "step": 7702 + }, + { + "epoch": 1.15, + "grad_norm": 4.451683486082455, + "learning_rate": 1.7979059121185156e-06, + "loss": 0.6699, + "step": 7703 + }, + { + "epoch": 1.15, + "grad_norm": 2.4118686358917665, + "learning_rate": 1.79784767488414e-06, + "loss": 0.6647, + "step": 7704 + }, + { + "epoch": 1.15, + "grad_norm": 1.8000380976923656, + "learning_rate": 1.7977894302033817e-06, + "loss": 0.6895, + "step": 7705 + }, + { + "epoch": 1.15, + "grad_norm": 3.792137585193667, + "learning_rate": 1.797731178076784e-06, + "loss": 0.6901, + "step": 7706 + }, + { + "epoch": 1.15, + "grad_norm": 1.6083425535522864, + "learning_rate": 1.7976729185048905e-06, + "loss": 0.6706, + "step": 7707 + }, + { + "epoch": 1.15, + "grad_norm": 1.9007979498536047, + "learning_rate": 1.7976146514882453e-06, + "loss": 0.6855, + "step": 7708 + }, + { + "epoch": 1.15, + "grad_norm": 1.8577842398276514, + "learning_rate": 1.7975563770273916e-06, + "loss": 0.6699, + "step": 7709 + }, + { + "epoch": 1.15, + "grad_norm": 5.729927946622599, + "learning_rate": 1.7974980951228741e-06, + "loss": 0.6699, + "step": 7710 + }, + { + "epoch": 1.15, + "grad_norm": 7.381285019062765, + "learning_rate": 1.7974398057752365e-06, + "loss": 0.6868, + "step": 7711 + }, + { + "epoch": 1.15, + "grad_norm": 4.982175070767714, + "learning_rate": 1.7973815089850222e-06, + "loss": 0.6849, + "step": 7712 + }, + { + "epoch": 1.15, + "grad_norm": 2.486677248813751, + "learning_rate": 1.797323204752776e-06, + "loss": 0.6849, + "step": 7713 + }, + { + "epoch": 1.15, + "grad_norm": 1.7754461950899618, + "learning_rate": 1.7972648930790419e-06, + "loss": 0.6699, + "step": 7714 + }, + { + "epoch": 1.15, + "grad_norm": 1.2826547543087938, + "learning_rate": 1.797206573964364e-06, + "loss": 0.6562, + "step": 7715 + }, + { + "epoch": 1.15, + "grad_norm": 1.896602920353304, + "learning_rate": 1.7971482474092867e-06, + "loss": 0.627, + "step": 7716 + }, + { + "epoch": 1.15, + "grad_norm": 2.8968878697381424, + "learning_rate": 1.797089913414354e-06, + "loss": 0.6829, + "step": 7717 + }, + { + "epoch": 1.15, + "grad_norm": 2.3746363426494526, + "learning_rate": 1.7970315719801111e-06, + "loss": 0.694, + "step": 7718 + }, + { + "epoch": 1.15, + "grad_norm": 1.4987919495230626, + "learning_rate": 1.7969732231071015e-06, + "loss": 0.6439, + "step": 7719 + }, + { + "epoch": 1.15, + "grad_norm": 1.020005368370342, + "learning_rate": 1.796914866795871e-06, + "loss": 0.6641, + "step": 7720 + }, + { + "epoch": 1.15, + "grad_norm": 3.7010513166065717, + "learning_rate": 1.796856503046963e-06, + "loss": 0.6999, + "step": 7721 + }, + { + "epoch": 1.15, + "grad_norm": 4.0333052954342214, + "learning_rate": 1.796798131860923e-06, + "loss": 0.6706, + "step": 7722 + }, + { + "epoch": 1.15, + "grad_norm": 1.0684640918297597, + "learning_rate": 1.7967397532382958e-06, + "loss": 0.6908, + "step": 7723 + }, + { + "epoch": 1.15, + "grad_norm": 0.9179957567360484, + "learning_rate": 1.7966813671796258e-06, + "loss": 0.7018, + "step": 7724 + }, + { + "epoch": 1.15, + "grad_norm": 4.246830860249278, + "learning_rate": 1.7966229736854581e-06, + "loss": 0.6921, + "step": 7725 + }, + { + "epoch": 1.15, + "grad_norm": 5.253782275788872, + "learning_rate": 1.7965645727563379e-06, + "loss": 0.7201, + "step": 7726 + }, + { + "epoch": 1.15, + "grad_norm": 2.4786647675207774, + "learning_rate": 1.7965061643928098e-06, + "loss": 0.668, + "step": 7727 + }, + { + "epoch": 1.15, + "grad_norm": 0.7278085348762666, + "learning_rate": 1.7964477485954195e-06, + "loss": 0.6745, + "step": 7728 + }, + { + "epoch": 1.15, + "grad_norm": 2.7509335524244545, + "learning_rate": 1.7963893253647118e-06, + "loss": 0.6654, + "step": 7729 + }, + { + "epoch": 1.15, + "grad_norm": 3.0562883463994512, + "learning_rate": 1.7963308947012322e-06, + "loss": 0.6751, + "step": 7730 + }, + { + "epoch": 1.15, + "grad_norm": 2.2520167129031368, + "learning_rate": 1.7962724566055257e-06, + "loss": 0.6608, + "step": 7731 + }, + { + "epoch": 1.15, + "grad_norm": 5.1775281506416535, + "learning_rate": 1.7962140110781383e-06, + "loss": 0.6927, + "step": 7732 + }, + { + "epoch": 1.15, + "grad_norm": 3.3425833661761555, + "learning_rate": 1.7961555581196148e-06, + "loss": 0.6829, + "step": 7733 + }, + { + "epoch": 1.15, + "grad_norm": 1.1647753398770249, + "learning_rate": 1.7960970977305011e-06, + "loss": 0.6868, + "step": 7734 + }, + { + "epoch": 1.15, + "grad_norm": 1.2063848603523009, + "learning_rate": 1.796038629911343e-06, + "loss": 0.6816, + "step": 7735 + }, + { + "epoch": 1.15, + "grad_norm": 3.3066458084771693, + "learning_rate": 1.7959801546626858e-06, + "loss": 0.6673, + "step": 7736 + }, + { + "epoch": 1.15, + "grad_norm": 2.7929008774938664, + "learning_rate": 1.7959216719850756e-06, + "loss": 0.6621, + "step": 7737 + }, + { + "epoch": 1.15, + "grad_norm": 5.905249937941882, + "learning_rate": 1.795863181879058e-06, + "loss": 0.6595, + "step": 7738 + }, + { + "epoch": 1.15, + "grad_norm": 2.9678752920989715, + "learning_rate": 1.795804684345179e-06, + "loss": 0.6953, + "step": 7739 + }, + { + "epoch": 1.15, + "grad_norm": 1.5681519058718973, + "learning_rate": 1.7957461793839843e-06, + "loss": 0.6706, + "step": 7740 + }, + { + "epoch": 1.15, + "grad_norm": 1.7791911142008936, + "learning_rate": 1.79568766699602e-06, + "loss": 0.696, + "step": 7741 + }, + { + "epoch": 1.15, + "grad_norm": 2.010687611626237, + "learning_rate": 1.7956291471818325e-06, + "loss": 0.668, + "step": 7742 + }, + { + "epoch": 1.15, + "grad_norm": 1.5677406660249715, + "learning_rate": 1.795570619941968e-06, + "loss": 0.6855, + "step": 7743 + }, + { + "epoch": 1.15, + "grad_norm": 4.809522251690051, + "learning_rate": 1.7955120852769725e-06, + "loss": 0.6829, + "step": 7744 + }, + { + "epoch": 1.16, + "grad_norm": 1.071355047416568, + "learning_rate": 1.795453543187392e-06, + "loss": 0.6849, + "step": 7745 + }, + { + "epoch": 1.16, + "grad_norm": 6.172592845794552, + "learning_rate": 1.7953949936737738e-06, + "loss": 0.6914, + "step": 7746 + }, + { + "epoch": 1.16, + "grad_norm": 0.9592761127688448, + "learning_rate": 1.7953364367366633e-06, + "loss": 0.6868, + "step": 7747 + }, + { + "epoch": 1.16, + "grad_norm": 5.922126682600098, + "learning_rate": 1.7952778723766078e-06, + "loss": 0.6927, + "step": 7748 + }, + { + "epoch": 1.16, + "grad_norm": 1.945590891629098, + "learning_rate": 1.7952193005941535e-06, + "loss": 0.6836, + "step": 7749 + }, + { + "epoch": 1.16, + "grad_norm": 1.274677979322328, + "learning_rate": 1.795160721389847e-06, + "loss": 0.6764, + "step": 7750 + }, + { + "epoch": 1.16, + "grad_norm": 3.3709740765293046, + "learning_rate": 1.7951021347642351e-06, + "loss": 0.6549, + "step": 7751 + }, + { + "epoch": 1.16, + "grad_norm": 1.2527401144597425, + "learning_rate": 1.795043540717865e-06, + "loss": 0.6536, + "step": 7752 + }, + { + "epoch": 1.16, + "grad_norm": 0.8720641726672703, + "learning_rate": 1.794984939251283e-06, + "loss": 0.6582, + "step": 7753 + }, + { + "epoch": 1.16, + "grad_norm": 0.5301154840030979, + "learning_rate": 1.7949263303650364e-06, + "loss": 0.6608, + "step": 7754 + }, + { + "epoch": 1.16, + "grad_norm": 3.798644986553411, + "learning_rate": 1.7948677140596717e-06, + "loss": 0.6979, + "step": 7755 + }, + { + "epoch": 1.16, + "grad_norm": 4.226580659670496, + "learning_rate": 1.7948090903357369e-06, + "loss": 0.694, + "step": 7756 + }, + { + "epoch": 1.16, + "grad_norm": 1.4894040159646234, + "learning_rate": 1.7947504591937782e-06, + "loss": 0.6908, + "step": 7757 + }, + { + "epoch": 1.16, + "grad_norm": 3.0571542564194827, + "learning_rate": 1.7946918206343433e-06, + "loss": 0.6673, + "step": 7758 + }, + { + "epoch": 1.16, + "grad_norm": 1.96881781981754, + "learning_rate": 1.7946331746579792e-06, + "loss": 0.6738, + "step": 7759 + }, + { + "epoch": 1.16, + "grad_norm": 1.7319453711046187, + "learning_rate": 1.7945745212652335e-06, + "loss": 0.6699, + "step": 7760 + }, + { + "epoch": 1.16, + "grad_norm": 0.9835389613572618, + "learning_rate": 1.7945158604566537e-06, + "loss": 0.6868, + "step": 7761 + }, + { + "epoch": 1.16, + "grad_norm": 1.645275000242452, + "learning_rate": 1.794457192232787e-06, + "loss": 0.696, + "step": 7762 + }, + { + "epoch": 1.16, + "grad_norm": 0.9182084702999613, + "learning_rate": 1.7943985165941807e-06, + "loss": 0.6914, + "step": 7763 + }, + { + "epoch": 1.16, + "grad_norm": 3.790009710521913, + "learning_rate": 1.7943398335413833e-06, + "loss": 0.6777, + "step": 7764 + }, + { + "epoch": 1.16, + "grad_norm": 2.4789472793081817, + "learning_rate": 1.7942811430749419e-06, + "loss": 0.6732, + "step": 7765 + }, + { + "epoch": 1.16, + "grad_norm": 3.369976394148154, + "learning_rate": 1.7942224451954043e-06, + "loss": 0.6797, + "step": 7766 + }, + { + "epoch": 1.16, + "grad_norm": 2.494576614069379, + "learning_rate": 1.7941637399033183e-06, + "loss": 0.6842, + "step": 7767 + }, + { + "epoch": 1.16, + "grad_norm": 1.9201558611079763, + "learning_rate": 1.7941050271992317e-06, + "loss": 0.6888, + "step": 7768 + }, + { + "epoch": 1.16, + "grad_norm": 2.5602585811720093, + "learning_rate": 1.794046307083693e-06, + "loss": 0.6836, + "step": 7769 + }, + { + "epoch": 1.16, + "grad_norm": 4.357086977225307, + "learning_rate": 1.7939875795572497e-06, + "loss": 0.6875, + "step": 7770 + }, + { + "epoch": 1.16, + "grad_norm": 1.0134529627358908, + "learning_rate": 1.79392884462045e-06, + "loss": 0.6901, + "step": 7771 + }, + { + "epoch": 1.16, + "grad_norm": 1.1982878722337058, + "learning_rate": 1.7938701022738426e-06, + "loss": 0.6686, + "step": 7772 + }, + { + "epoch": 1.16, + "grad_norm": 6.166853113079343, + "learning_rate": 1.793811352517975e-06, + "loss": 0.6517, + "step": 7773 + }, + { + "epoch": 1.16, + "grad_norm": 0.5577454264587861, + "learning_rate": 1.793752595353396e-06, + "loss": 0.6764, + "step": 7774 + }, + { + "epoch": 1.16, + "grad_norm": 4.067197973901514, + "learning_rate": 1.7936938307806537e-06, + "loss": 0.6992, + "step": 7775 + }, + { + "epoch": 1.16, + "grad_norm": 1.8915828727167665, + "learning_rate": 1.7936350588002966e-06, + "loss": 0.6751, + "step": 7776 + }, + { + "epoch": 1.16, + "grad_norm": 0.7963194913601637, + "learning_rate": 1.7935762794128736e-06, + "loss": 0.6823, + "step": 7777 + }, + { + "epoch": 1.16, + "grad_norm": 2.2152593837015018, + "learning_rate": 1.7935174926189329e-06, + "loss": 0.6823, + "step": 7778 + }, + { + "epoch": 1.16, + "grad_norm": 2.786164760128728, + "learning_rate": 1.7934586984190231e-06, + "loss": 0.6751, + "step": 7779 + }, + { + "epoch": 1.16, + "grad_norm": 3.5140771216782394, + "learning_rate": 1.7933998968136934e-06, + "loss": 0.6803, + "step": 7780 + }, + { + "epoch": 1.16, + "grad_norm": 5.267925931375109, + "learning_rate": 1.7933410878034922e-06, + "loss": 0.6868, + "step": 7781 + }, + { + "epoch": 1.16, + "grad_norm": 3.2037706268299297, + "learning_rate": 1.7932822713889683e-06, + "loss": 0.696, + "step": 7782 + }, + { + "epoch": 1.16, + "grad_norm": 2.6812708221366717, + "learning_rate": 1.793223447570671e-06, + "loss": 0.6816, + "step": 7783 + }, + { + "epoch": 1.16, + "grad_norm": 0.7999344490447936, + "learning_rate": 1.793164616349149e-06, + "loss": 0.6914, + "step": 7784 + }, + { + "epoch": 1.16, + "grad_norm": 4.8825652094393135, + "learning_rate": 1.7931057777249513e-06, + "loss": 0.6764, + "step": 7785 + }, + { + "epoch": 1.16, + "grad_norm": 0.5822375803419783, + "learning_rate": 1.7930469316986274e-06, + "loss": 0.6732, + "step": 7786 + }, + { + "epoch": 1.16, + "grad_norm": 3.127682883839617, + "learning_rate": 1.7929880782707265e-06, + "loss": 0.6803, + "step": 7787 + }, + { + "epoch": 1.16, + "grad_norm": 0.7193864297953111, + "learning_rate": 1.7929292174417976e-06, + "loss": 0.6725, + "step": 7788 + }, + { + "epoch": 1.16, + "grad_norm": 3.3676507900463424, + "learning_rate": 1.7928703492123903e-06, + "loss": 0.6888, + "step": 7789 + }, + { + "epoch": 1.16, + "grad_norm": 2.115137801443432, + "learning_rate": 1.7928114735830537e-06, + "loss": 0.7135, + "step": 7790 + }, + { + "epoch": 1.16, + "grad_norm": 0.7537600839874847, + "learning_rate": 1.7927525905543377e-06, + "loss": 0.6842, + "step": 7791 + }, + { + "epoch": 1.16, + "grad_norm": 2.0724550092619083, + "learning_rate": 1.7926937001267917e-06, + "loss": 0.666, + "step": 7792 + }, + { + "epoch": 1.16, + "grad_norm": 3.969141832248279, + "learning_rate": 1.7926348023009652e-06, + "loss": 0.6536, + "step": 7793 + }, + { + "epoch": 1.16, + "grad_norm": 5.721789623611519, + "learning_rate": 1.7925758970774081e-06, + "loss": 0.6706, + "step": 7794 + }, + { + "epoch": 1.16, + "grad_norm": 2.727969651922138, + "learning_rate": 1.79251698445667e-06, + "loss": 0.668, + "step": 7795 + }, + { + "epoch": 1.16, + "grad_norm": 0.938828465655528, + "learning_rate": 1.7924580644393004e-06, + "loss": 0.6699, + "step": 7796 + }, + { + "epoch": 1.16, + "grad_norm": 5.132714051512522, + "learning_rate": 1.7923991370258503e-06, + "loss": 0.6842, + "step": 7797 + }, + { + "epoch": 1.16, + "grad_norm": 5.637297053023925, + "learning_rate": 1.7923402022168683e-06, + "loss": 0.681, + "step": 7798 + }, + { + "epoch": 1.16, + "grad_norm": 2.4657858942763267, + "learning_rate": 1.7922812600129056e-06, + "loss": 0.6999, + "step": 7799 + }, + { + "epoch": 1.16, + "grad_norm": 1.3595864158556599, + "learning_rate": 1.7922223104145115e-06, + "loss": 0.7129, + "step": 7800 + }, + { + "epoch": 1.16, + "grad_norm": 1.6375380282976904, + "learning_rate": 1.7921633534222367e-06, + "loss": 0.6686, + "step": 7801 + }, + { + "epoch": 1.16, + "grad_norm": 1.1306572616689996, + "learning_rate": 1.7921043890366313e-06, + "loss": 0.6999, + "step": 7802 + }, + { + "epoch": 1.16, + "grad_norm": 2.5860298359402867, + "learning_rate": 1.7920454172582456e-06, + "loss": 0.6641, + "step": 7803 + }, + { + "epoch": 1.16, + "grad_norm": 2.618784335327758, + "learning_rate": 1.7919864380876299e-06, + "loss": 0.6908, + "step": 7804 + }, + { + "epoch": 1.16, + "grad_norm": 0.4922627112146273, + "learning_rate": 1.7919274515253348e-06, + "loss": 0.6868, + "step": 7805 + }, + { + "epoch": 1.16, + "grad_norm": 5.3399386185012965, + "learning_rate": 1.7918684575719107e-06, + "loss": 0.7051, + "step": 7806 + }, + { + "epoch": 1.16, + "grad_norm": 4.770604689889783, + "learning_rate": 1.7918094562279085e-06, + "loss": 0.6777, + "step": 7807 + }, + { + "epoch": 1.16, + "grad_norm": 0.8664519438639625, + "learning_rate": 1.7917504474938783e-06, + "loss": 0.6699, + "step": 7808 + }, + { + "epoch": 1.16, + "grad_norm": 1.8511234916020909, + "learning_rate": 1.7916914313703712e-06, + "loss": 0.6491, + "step": 7809 + }, + { + "epoch": 1.16, + "grad_norm": 1.5366092303544405, + "learning_rate": 1.7916324078579381e-06, + "loss": 0.6628, + "step": 7810 + }, + { + "epoch": 1.16, + "grad_norm": 1.2535611596830032, + "learning_rate": 1.7915733769571296e-06, + "loss": 0.6745, + "step": 7811 + }, + { + "epoch": 1.17, + "grad_norm": 0.6989698018631837, + "learning_rate": 1.7915143386684969e-06, + "loss": 0.6549, + "step": 7812 + }, + { + "epoch": 1.17, + "grad_norm": 2.770061012684887, + "learning_rate": 1.7914552929925906e-06, + "loss": 0.6999, + "step": 7813 + }, + { + "epoch": 1.17, + "grad_norm": 1.7031130536575814, + "learning_rate": 1.7913962399299623e-06, + "loss": 0.6836, + "step": 7814 + }, + { + "epoch": 1.17, + "grad_norm": 3.0188497144046393, + "learning_rate": 1.791337179481163e-06, + "loss": 0.6842, + "step": 7815 + }, + { + "epoch": 1.17, + "grad_norm": 4.633438038751276, + "learning_rate": 1.7912781116467435e-06, + "loss": 0.6888, + "step": 7816 + }, + { + "epoch": 1.17, + "grad_norm": 1.546104309533672, + "learning_rate": 1.7912190364272556e-06, + "loss": 0.6751, + "step": 7817 + }, + { + "epoch": 1.17, + "grad_norm": 1.8107740013214264, + "learning_rate": 1.7911599538232504e-06, + "loss": 0.6901, + "step": 7818 + }, + { + "epoch": 1.17, + "grad_norm": 6.818262716955003, + "learning_rate": 1.7911008638352797e-06, + "loss": 0.6602, + "step": 7819 + }, + { + "epoch": 1.17, + "grad_norm": 1.0083262569019058, + "learning_rate": 1.7910417664638943e-06, + "loss": 0.6615, + "step": 7820 + }, + { + "epoch": 1.17, + "grad_norm": 4.0939511501646315, + "learning_rate": 1.790982661709646e-06, + "loss": 0.6784, + "step": 7821 + }, + { + "epoch": 1.17, + "grad_norm": 3.1041883891571533, + "learning_rate": 1.790923549573087e-06, + "loss": 0.6862, + "step": 7822 + }, + { + "epoch": 1.17, + "grad_norm": 2.606323764405245, + "learning_rate": 1.7908644300547683e-06, + "loss": 0.6725, + "step": 7823 + }, + { + "epoch": 1.17, + "grad_norm": 0.6777755136689322, + "learning_rate": 1.790805303155242e-06, + "loss": 0.6849, + "step": 7824 + }, + { + "epoch": 1.17, + "grad_norm": 1.0733145766109056, + "learning_rate": 1.79074616887506e-06, + "loss": 0.6868, + "step": 7825 + }, + { + "epoch": 1.17, + "grad_norm": 3.335278430313521, + "learning_rate": 1.790687027214774e-06, + "loss": 0.681, + "step": 7826 + }, + { + "epoch": 1.17, + "grad_norm": 2.030192285457427, + "learning_rate": 1.790627878174936e-06, + "loss": 0.696, + "step": 7827 + }, + { + "epoch": 1.17, + "grad_norm": 8.420089349482867, + "learning_rate": 1.790568721756098e-06, + "loss": 0.7168, + "step": 7828 + }, + { + "epoch": 1.17, + "grad_norm": 2.526764596879638, + "learning_rate": 1.7905095579588124e-06, + "loss": 0.7038, + "step": 7829 + }, + { + "epoch": 1.17, + "grad_norm": 5.768692363621557, + "learning_rate": 1.7904503867836311e-06, + "loss": 0.6771, + "step": 7830 + }, + { + "epoch": 1.17, + "grad_norm": 1.3119813015634803, + "learning_rate": 1.7903912082311063e-06, + "loss": 0.6784, + "step": 7831 + }, + { + "epoch": 1.17, + "grad_norm": 1.1091801939447323, + "learning_rate": 1.7903320223017908e-06, + "loss": 0.6758, + "step": 7832 + }, + { + "epoch": 1.17, + "grad_norm": 5.1107835722489785, + "learning_rate": 1.7902728289962363e-06, + "loss": 0.6621, + "step": 7833 + }, + { + "epoch": 1.17, + "grad_norm": 3.8227012289882496, + "learning_rate": 1.7902136283149956e-06, + "loss": 0.6673, + "step": 7834 + }, + { + "epoch": 1.17, + "grad_norm": 1.1607004155851004, + "learning_rate": 1.7901544202586215e-06, + "loss": 0.6745, + "step": 7835 + }, + { + "epoch": 1.17, + "grad_norm": 1.775673700870771, + "learning_rate": 1.7900952048276664e-06, + "loss": 0.6823, + "step": 7836 + }, + { + "epoch": 1.17, + "grad_norm": 0.5296733646690788, + "learning_rate": 1.7900359820226822e-06, + "loss": 0.6758, + "step": 7837 + }, + { + "epoch": 1.17, + "grad_norm": 4.27673254186823, + "learning_rate": 1.789976751844223e-06, + "loss": 0.6921, + "step": 7838 + }, + { + "epoch": 1.17, + "grad_norm": 3.5395910987470143, + "learning_rate": 1.7899175142928407e-06, + "loss": 0.6862, + "step": 7839 + }, + { + "epoch": 1.17, + "grad_norm": 2.4062493103788114, + "learning_rate": 1.7898582693690884e-06, + "loss": 0.6712, + "step": 7840 + }, + { + "epoch": 1.17, + "grad_norm": 2.895738713371363, + "learning_rate": 1.7897990170735188e-06, + "loss": 0.6732, + "step": 7841 + }, + { + "epoch": 1.17, + "grad_norm": 2.0942953944778817, + "learning_rate": 1.7897397574066852e-06, + "loss": 0.7057, + "step": 7842 + }, + { + "epoch": 1.17, + "grad_norm": 4.423828388713636, + "learning_rate": 1.789680490369141e-06, + "loss": 0.6895, + "step": 7843 + }, + { + "epoch": 1.17, + "grad_norm": 1.1955719972870693, + "learning_rate": 1.7896212159614385e-06, + "loss": 0.6803, + "step": 7844 + }, + { + "epoch": 1.17, + "grad_norm": 2.140832320569366, + "learning_rate": 1.7895619341841316e-06, + "loss": 0.6836, + "step": 7845 + }, + { + "epoch": 1.17, + "grad_norm": 0.9938169050670111, + "learning_rate": 1.789502645037773e-06, + "loss": 0.6673, + "step": 7846 + }, + { + "epoch": 1.17, + "grad_norm": 1.2279627375179472, + "learning_rate": 1.789443348522917e-06, + "loss": 0.6647, + "step": 7847 + }, + { + "epoch": 1.17, + "grad_norm": 0.6622756295551799, + "learning_rate": 1.789384044640116e-06, + "loss": 0.6758, + "step": 7848 + }, + { + "epoch": 1.17, + "grad_norm": 2.511959044916248, + "learning_rate": 1.789324733389924e-06, + "loss": 0.6764, + "step": 7849 + }, + { + "epoch": 1.17, + "grad_norm": 1.550935299872667, + "learning_rate": 1.7892654147728943e-06, + "loss": 0.6576, + "step": 7850 + }, + { + "epoch": 1.17, + "grad_norm": 1.468767341108722, + "learning_rate": 1.7892060887895808e-06, + "loss": 0.6738, + "step": 7851 + }, + { + "epoch": 1.17, + "grad_norm": 1.1712737390263919, + "learning_rate": 1.7891467554405372e-06, + "loss": 0.6862, + "step": 7852 + }, + { + "epoch": 1.17, + "grad_norm": 6.718496770290687, + "learning_rate": 1.789087414726317e-06, + "loss": 0.6888, + "step": 7853 + }, + { + "epoch": 1.17, + "grad_norm": 2.7164746055104216, + "learning_rate": 1.7890280666474742e-06, + "loss": 0.653, + "step": 7854 + }, + { + "epoch": 1.17, + "grad_norm": 2.042885591913077, + "learning_rate": 1.7889687112045627e-06, + "loss": 0.6621, + "step": 7855 + }, + { + "epoch": 1.17, + "grad_norm": 1.078562105331707, + "learning_rate": 1.7889093483981364e-06, + "loss": 0.6784, + "step": 7856 + }, + { + "epoch": 1.17, + "grad_norm": 4.188102271644591, + "learning_rate": 1.7888499782287495e-06, + "loss": 0.7064, + "step": 7857 + }, + { + "epoch": 1.17, + "grad_norm": 1.0722500551948888, + "learning_rate": 1.7887906006969558e-06, + "loss": 0.6895, + "step": 7858 + }, + { + "epoch": 1.17, + "grad_norm": 1.5678436076358448, + "learning_rate": 1.7887312158033097e-06, + "loss": 0.6439, + "step": 7859 + }, + { + "epoch": 1.17, + "grad_norm": 3.8691149746865143, + "learning_rate": 1.7886718235483656e-06, + "loss": 0.6523, + "step": 7860 + }, + { + "epoch": 1.17, + "grad_norm": 1.6564549473558243, + "learning_rate": 1.7886124239326775e-06, + "loss": 0.6927, + "step": 7861 + }, + { + "epoch": 1.17, + "grad_norm": 0.9078109855203387, + "learning_rate": 1.7885530169567997e-06, + "loss": 0.6836, + "step": 7862 + }, + { + "epoch": 1.17, + "grad_norm": 0.8045019854372317, + "learning_rate": 1.7884936026212874e-06, + "loss": 0.6927, + "step": 7863 + }, + { + "epoch": 1.17, + "grad_norm": 1.7163109547824256, + "learning_rate": 1.788434180926694e-06, + "loss": 0.6628, + "step": 7864 + }, + { + "epoch": 1.17, + "grad_norm": 1.0913257430704493, + "learning_rate": 1.788374751873575e-06, + "loss": 0.6628, + "step": 7865 + }, + { + "epoch": 1.17, + "grad_norm": 1.0661987762305503, + "learning_rate": 1.7883153154624845e-06, + "loss": 0.6699, + "step": 7866 + }, + { + "epoch": 1.17, + "grad_norm": 3.528210744882716, + "learning_rate": 1.7882558716939777e-06, + "loss": 0.6725, + "step": 7867 + }, + { + "epoch": 1.17, + "grad_norm": 2.355570332792845, + "learning_rate": 1.788196420568609e-06, + "loss": 0.6888, + "step": 7868 + }, + { + "epoch": 1.17, + "grad_norm": 1.7690713052946618, + "learning_rate": 1.7881369620869332e-06, + "loss": 0.6517, + "step": 7869 + }, + { + "epoch": 1.17, + "grad_norm": 2.527208900338788, + "learning_rate": 1.7880774962495058e-06, + "loss": 0.6816, + "step": 7870 + }, + { + "epoch": 1.17, + "grad_norm": 1.0300832617404059, + "learning_rate": 1.7880180230568814e-06, + "loss": 0.6738, + "step": 7871 + }, + { + "epoch": 1.17, + "grad_norm": 3.035702393803674, + "learning_rate": 1.787958542509615e-06, + "loss": 0.6908, + "step": 7872 + }, + { + "epoch": 1.17, + "grad_norm": 2.304000348507327, + "learning_rate": 1.7878990546082618e-06, + "loss": 0.7012, + "step": 7873 + }, + { + "epoch": 1.17, + "grad_norm": 2.084515549847242, + "learning_rate": 1.787839559353377e-06, + "loss": 0.6504, + "step": 7874 + }, + { + "epoch": 1.17, + "grad_norm": 1.5846048033438138, + "learning_rate": 1.7877800567455162e-06, + "loss": 0.679, + "step": 7875 + }, + { + "epoch": 1.17, + "grad_norm": 1.3150627783233995, + "learning_rate": 1.787720546785234e-06, + "loss": 0.681, + "step": 7876 + }, + { + "epoch": 1.17, + "grad_norm": 2.5905230084477364, + "learning_rate": 1.7876610294730868e-06, + "loss": 0.6908, + "step": 7877 + }, + { + "epoch": 1.17, + "grad_norm": 2.394880933463845, + "learning_rate": 1.787601504809629e-06, + "loss": 0.6868, + "step": 7878 + }, + { + "epoch": 1.18, + "grad_norm": 0.7939081157118376, + "learning_rate": 1.7875419727954171e-06, + "loss": 0.6589, + "step": 7879 + }, + { + "epoch": 1.18, + "grad_norm": 3.59718884197319, + "learning_rate": 1.7874824334310063e-06, + "loss": 0.6908, + "step": 7880 + }, + { + "epoch": 1.18, + "grad_norm": 3.7958202352931103, + "learning_rate": 1.787422886716952e-06, + "loss": 0.6895, + "step": 7881 + }, + { + "epoch": 1.18, + "grad_norm": 1.4892046959438265, + "learning_rate": 1.7873633326538108e-06, + "loss": 0.6751, + "step": 7882 + }, + { + "epoch": 1.18, + "grad_norm": 0.8071554345701145, + "learning_rate": 1.7873037712421375e-06, + "loss": 0.6471, + "step": 7883 + }, + { + "epoch": 1.18, + "grad_norm": 4.453655701529692, + "learning_rate": 1.7872442024824887e-06, + "loss": 0.6888, + "step": 7884 + }, + { + "epoch": 1.18, + "grad_norm": 0.8391129683386065, + "learning_rate": 1.7871846263754202e-06, + "loss": 0.6589, + "step": 7885 + }, + { + "epoch": 1.18, + "grad_norm": 1.1360570450212177, + "learning_rate": 1.7871250429214878e-06, + "loss": 0.666, + "step": 7886 + }, + { + "epoch": 1.18, + "grad_norm": 3.652918105758686, + "learning_rate": 1.7870654521212475e-06, + "loss": 0.6895, + "step": 7887 + }, + { + "epoch": 1.18, + "grad_norm": 1.950263418861133, + "learning_rate": 1.7870058539752563e-06, + "loss": 0.6706, + "step": 7888 + }, + { + "epoch": 1.18, + "grad_norm": 1.1501042424357886, + "learning_rate": 1.7869462484840696e-06, + "loss": 0.6927, + "step": 7889 + }, + { + "epoch": 1.18, + "grad_norm": 4.092878997989544, + "learning_rate": 1.786886635648244e-06, + "loss": 0.6758, + "step": 7890 + }, + { + "epoch": 1.18, + "grad_norm": 3.020340264145261, + "learning_rate": 1.7868270154683358e-06, + "loss": 0.638, + "step": 7891 + }, + { + "epoch": 1.18, + "grad_norm": 0.8159307075578195, + "learning_rate": 1.7867673879449014e-06, + "loss": 0.6465, + "step": 7892 + }, + { + "epoch": 1.18, + "grad_norm": 3.5277703634514417, + "learning_rate": 1.7867077530784976e-06, + "loss": 0.696, + "step": 7893 + }, + { + "epoch": 1.18, + "grad_norm": 2.408823539258897, + "learning_rate": 1.7866481108696805e-06, + "loss": 0.681, + "step": 7894 + }, + { + "epoch": 1.18, + "grad_norm": 0.9047898830552344, + "learning_rate": 1.7865884613190072e-06, + "loss": 0.7259, + "step": 7895 + }, + { + "epoch": 1.18, + "grad_norm": 2.189877047452228, + "learning_rate": 1.7865288044270343e-06, + "loss": 0.6947, + "step": 7896 + }, + { + "epoch": 1.18, + "grad_norm": 1.9476294688055775, + "learning_rate": 1.7864691401943183e-06, + "loss": 0.6497, + "step": 7897 + }, + { + "epoch": 1.18, + "grad_norm": 0.641365426872345, + "learning_rate": 1.7864094686214166e-06, + "loss": 0.6738, + "step": 7898 + }, + { + "epoch": 1.18, + "grad_norm": 1.7423940343499047, + "learning_rate": 1.7863497897088853e-06, + "loss": 0.6706, + "step": 7899 + }, + { + "epoch": 1.18, + "grad_norm": 1.3199560012847718, + "learning_rate": 1.7862901034572823e-06, + "loss": 0.681, + "step": 7900 + }, + { + "epoch": 1.18, + "grad_norm": 1.8648846994924737, + "learning_rate": 1.786230409867164e-06, + "loss": 0.6999, + "step": 7901 + }, + { + "epoch": 1.18, + "grad_norm": 0.7709000094172318, + "learning_rate": 1.7861707089390878e-06, + "loss": 0.6732, + "step": 7902 + }, + { + "epoch": 1.18, + "grad_norm": 1.9999503811074812, + "learning_rate": 1.7861110006736108e-06, + "loss": 0.681, + "step": 7903 + }, + { + "epoch": 1.18, + "grad_norm": 0.8528893335036821, + "learning_rate": 1.7860512850712905e-06, + "loss": 0.6543, + "step": 7904 + }, + { + "epoch": 1.18, + "grad_norm": 3.143598037572271, + "learning_rate": 1.785991562132684e-06, + "loss": 0.6966, + "step": 7905 + }, + { + "epoch": 1.18, + "grad_norm": 3.5922974029607277, + "learning_rate": 1.7859318318583485e-06, + "loss": 0.6966, + "step": 7906 + }, + { + "epoch": 1.18, + "grad_norm": 3.517864618865297, + "learning_rate": 1.7858720942488419e-06, + "loss": 0.6862, + "step": 7907 + }, + { + "epoch": 1.18, + "grad_norm": 4.168829847277345, + "learning_rate": 1.7858123493047215e-06, + "loss": 0.6673, + "step": 7908 + }, + { + "epoch": 1.18, + "grad_norm": 1.2710237589520341, + "learning_rate": 1.785752597026545e-06, + "loss": 0.6738, + "step": 7909 + }, + { + "epoch": 1.18, + "grad_norm": 0.730331934840108, + "learning_rate": 1.7856928374148698e-06, + "loss": 0.6452, + "step": 7910 + }, + { + "epoch": 1.18, + "grad_norm": 1.5419906940710313, + "learning_rate": 1.7856330704702543e-06, + "loss": 0.6725, + "step": 7911 + }, + { + "epoch": 1.18, + "grad_norm": 4.082030789160582, + "learning_rate": 1.7855732961932554e-06, + "loss": 0.6699, + "step": 7912 + }, + { + "epoch": 1.18, + "grad_norm": 3.0702445645397023, + "learning_rate": 1.7855135145844318e-06, + "loss": 0.6478, + "step": 7913 + }, + { + "epoch": 1.18, + "grad_norm": 5.750838442164652, + "learning_rate": 1.785453725644341e-06, + "loss": 0.6895, + "step": 7914 + }, + { + "epoch": 1.18, + "grad_norm": 3.099383694734257, + "learning_rate": 1.7853939293735409e-06, + "loss": 0.6406, + "step": 7915 + }, + { + "epoch": 1.18, + "grad_norm": 6.986689545055561, + "learning_rate": 1.7853341257725903e-06, + "loss": 0.6517, + "step": 7916 + }, + { + "epoch": 1.18, + "grad_norm": 3.4381911507471634, + "learning_rate": 1.7852743148420463e-06, + "loss": 0.6901, + "step": 7917 + }, + { + "epoch": 1.18, + "grad_norm": 1.3752327509555435, + "learning_rate": 1.785214496582468e-06, + "loss": 0.6816, + "step": 7918 + }, + { + "epoch": 1.18, + "grad_norm": 3.491070518630572, + "learning_rate": 1.7851546709944133e-06, + "loss": 0.6693, + "step": 7919 + }, + { + "epoch": 1.18, + "grad_norm": 0.9510820805400848, + "learning_rate": 1.7850948380784405e-06, + "loss": 0.6927, + "step": 7920 + }, + { + "epoch": 1.18, + "grad_norm": 5.308565853938603, + "learning_rate": 1.7850349978351083e-06, + "loss": 0.6901, + "step": 7921 + }, + { + "epoch": 1.18, + "grad_norm": 1.332693711298609, + "learning_rate": 1.784975150264975e-06, + "loss": 0.6803, + "step": 7922 + }, + { + "epoch": 1.18, + "grad_norm": 0.8676530021458798, + "learning_rate": 1.784915295368599e-06, + "loss": 0.6413, + "step": 7923 + }, + { + "epoch": 1.18, + "grad_norm": 2.3466798584739488, + "learning_rate": 1.7848554331465395e-06, + "loss": 0.6628, + "step": 7924 + }, + { + "epoch": 1.18, + "grad_norm": 0.9520166403694907, + "learning_rate": 1.7847955635993545e-06, + "loss": 0.6855, + "step": 7925 + }, + { + "epoch": 1.18, + "grad_norm": 1.3083587590780483, + "learning_rate": 1.7847356867276035e-06, + "loss": 0.6719, + "step": 7926 + }, + { + "epoch": 1.18, + "grad_norm": 1.0150824454268024, + "learning_rate": 1.7846758025318445e-06, + "loss": 0.6634, + "step": 7927 + }, + { + "epoch": 1.18, + "grad_norm": 7.364132045253592, + "learning_rate": 1.7846159110126371e-06, + "loss": 0.6745, + "step": 7928 + }, + { + "epoch": 1.18, + "grad_norm": 3.2870558460036676, + "learning_rate": 1.7845560121705398e-06, + "loss": 0.6445, + "step": 7929 + }, + { + "epoch": 1.18, + "grad_norm": 4.968728887711417, + "learning_rate": 1.784496106006112e-06, + "loss": 0.6895, + "step": 7930 + }, + { + "epoch": 1.18, + "grad_norm": 2.442889441067201, + "learning_rate": 1.7844361925199129e-06, + "loss": 0.6745, + "step": 7931 + }, + { + "epoch": 1.18, + "grad_norm": 4.890978634476137, + "learning_rate": 1.7843762717125013e-06, + "loss": 0.694, + "step": 7932 + }, + { + "epoch": 1.18, + "grad_norm": 3.840322491246795, + "learning_rate": 1.7843163435844363e-06, + "loss": 0.709, + "step": 7933 + }, + { + "epoch": 1.18, + "grad_norm": 2.2702196349291084, + "learning_rate": 1.784256408136278e-06, + "loss": 0.651, + "step": 7934 + }, + { + "epoch": 1.18, + "grad_norm": 2.10172839197904, + "learning_rate": 1.784196465368585e-06, + "loss": 0.6699, + "step": 7935 + }, + { + "epoch": 1.18, + "grad_norm": 2.9436887931254234, + "learning_rate": 1.7841365152819167e-06, + "loss": 0.6738, + "step": 7936 + }, + { + "epoch": 1.18, + "grad_norm": 2.458863935309173, + "learning_rate": 1.7840765578768332e-06, + "loss": 0.6751, + "step": 7937 + }, + { + "epoch": 1.18, + "grad_norm": 0.9564249001482956, + "learning_rate": 1.7840165931538942e-06, + "loss": 0.6413, + "step": 7938 + }, + { + "epoch": 1.18, + "grad_norm": 0.9442963067437598, + "learning_rate": 1.7839566211136588e-06, + "loss": 0.6628, + "step": 7939 + }, + { + "epoch": 1.18, + "grad_norm": 6.937356444127516, + "learning_rate": 1.7838966417566867e-06, + "loss": 0.7168, + "step": 7940 + }, + { + "epoch": 1.18, + "grad_norm": 4.4815541617559, + "learning_rate": 1.7838366550835383e-06, + "loss": 0.6855, + "step": 7941 + }, + { + "epoch": 1.18, + "grad_norm": 4.664942771364098, + "learning_rate": 1.7837766610947727e-06, + "loss": 0.6732, + "step": 7942 + }, + { + "epoch": 1.18, + "grad_norm": 6.433777405836866, + "learning_rate": 1.7837166597909504e-06, + "loss": 0.6667, + "step": 7943 + }, + { + "epoch": 1.18, + "grad_norm": 5.113767461735031, + "learning_rate": 1.7836566511726312e-06, + "loss": 0.6803, + "step": 7944 + }, + { + "epoch": 1.18, + "grad_norm": 2.410123632088717, + "learning_rate": 1.7835966352403752e-06, + "loss": 0.6999, + "step": 7945 + }, + { + "epoch": 1.19, + "grad_norm": 0.6894405204513954, + "learning_rate": 1.7835366119947425e-06, + "loss": 0.6536, + "step": 7946 + }, + { + "epoch": 1.19, + "grad_norm": 3.182220008108242, + "learning_rate": 1.7834765814362931e-06, + "loss": 0.679, + "step": 7947 + }, + { + "epoch": 1.19, + "grad_norm": 1.4204137131175842, + "learning_rate": 1.7834165435655877e-06, + "loss": 0.6888, + "step": 7948 + }, + { + "epoch": 1.19, + "grad_norm": 1.2880087934299782, + "learning_rate": 1.7833564983831864e-06, + "loss": 0.6706, + "step": 7949 + }, + { + "epoch": 1.19, + "grad_norm": 3.222439323796202, + "learning_rate": 1.7832964458896496e-06, + "loss": 0.6686, + "step": 7950 + }, + { + "epoch": 1.19, + "grad_norm": 3.4469166927573047, + "learning_rate": 1.7832363860855378e-06, + "loss": 0.6849, + "step": 7951 + }, + { + "epoch": 1.19, + "grad_norm": 7.868087156625793, + "learning_rate": 1.7831763189714115e-06, + "loss": 0.7246, + "step": 7952 + }, + { + "epoch": 1.19, + "grad_norm": 1.0086318327468549, + "learning_rate": 1.7831162445478315e-06, + "loss": 0.6458, + "step": 7953 + }, + { + "epoch": 1.19, + "grad_norm": 1.4247742100726828, + "learning_rate": 1.7830561628153583e-06, + "loss": 0.6829, + "step": 7954 + }, + { + "epoch": 1.19, + "grad_norm": 0.9177633193130327, + "learning_rate": 1.7829960737745525e-06, + "loss": 0.694, + "step": 7955 + }, + { + "epoch": 1.19, + "grad_norm": 1.4600022021923327, + "learning_rate": 1.7829359774259748e-06, + "loss": 0.6777, + "step": 7956 + }, + { + "epoch": 1.19, + "grad_norm": 1.0931654007952838, + "learning_rate": 1.782875873770187e-06, + "loss": 0.6732, + "step": 7957 + }, + { + "epoch": 1.19, + "grad_norm": 0.8617228561561906, + "learning_rate": 1.7828157628077492e-06, + "loss": 0.6751, + "step": 7958 + }, + { + "epoch": 1.19, + "grad_norm": 1.7067090315121163, + "learning_rate": 1.7827556445392226e-06, + "loss": 0.668, + "step": 7959 + }, + { + "epoch": 1.19, + "grad_norm": 5.233583842575185, + "learning_rate": 1.7826955189651684e-06, + "loss": 0.6901, + "step": 7960 + }, + { + "epoch": 1.19, + "grad_norm": 1.1522335458775923, + "learning_rate": 1.7826353860861478e-06, + "loss": 0.6615, + "step": 7961 + }, + { + "epoch": 1.19, + "grad_norm": 6.3246815518683475, + "learning_rate": 1.782575245902722e-06, + "loss": 0.6803, + "step": 7962 + }, + { + "epoch": 1.19, + "grad_norm": 0.821674598862052, + "learning_rate": 1.7825150984154519e-06, + "loss": 0.6947, + "step": 7963 + }, + { + "epoch": 1.19, + "grad_norm": 4.351427949773968, + "learning_rate": 1.7824549436248994e-06, + "loss": 0.6699, + "step": 7964 + }, + { + "epoch": 1.19, + "grad_norm": 0.6110387476100635, + "learning_rate": 1.7823947815316255e-06, + "loss": 0.6719, + "step": 7965 + }, + { + "epoch": 1.19, + "grad_norm": 0.6450160499577823, + "learning_rate": 1.7823346121361922e-06, + "loss": 0.666, + "step": 7966 + }, + { + "epoch": 1.19, + "grad_norm": 1.1320956812878602, + "learning_rate": 1.7822744354391607e-06, + "loss": 0.6549, + "step": 7967 + }, + { + "epoch": 1.19, + "grad_norm": 1.747341497806228, + "learning_rate": 1.7822142514410928e-06, + "loss": 0.6536, + "step": 7968 + }, + { + "epoch": 1.19, + "grad_norm": 0.7152508730975571, + "learning_rate": 1.7821540601425498e-06, + "loss": 0.6882, + "step": 7969 + }, + { + "epoch": 1.19, + "grad_norm": 0.7956987645243097, + "learning_rate": 1.7820938615440939e-06, + "loss": 0.6732, + "step": 7970 + }, + { + "epoch": 1.19, + "grad_norm": 6.478169271202926, + "learning_rate": 1.7820336556462873e-06, + "loss": 0.6934, + "step": 7971 + }, + { + "epoch": 1.19, + "grad_norm": 3.0125276187481203, + "learning_rate": 1.781973442449691e-06, + "loss": 0.7116, + "step": 7972 + }, + { + "epoch": 1.19, + "grad_norm": 3.5735387055490753, + "learning_rate": 1.781913221954867e-06, + "loss": 0.7103, + "step": 7973 + }, + { + "epoch": 1.19, + "grad_norm": 0.8854485201221655, + "learning_rate": 1.7818529941623783e-06, + "loss": 0.6673, + "step": 7974 + }, + { + "epoch": 1.19, + "grad_norm": 1.5193541434253923, + "learning_rate": 1.7817927590727865e-06, + "loss": 0.6797, + "step": 7975 + }, + { + "epoch": 1.19, + "grad_norm": 2.081196683720705, + "learning_rate": 1.7817325166866536e-06, + "loss": 0.6927, + "step": 7976 + }, + { + "epoch": 1.19, + "grad_norm": 4.942453064251408, + "learning_rate": 1.781672267004542e-06, + "loss": 0.6738, + "step": 7977 + }, + { + "epoch": 1.19, + "grad_norm": 1.6570171562823817, + "learning_rate": 1.7816120100270137e-06, + "loss": 0.6764, + "step": 7978 + }, + { + "epoch": 1.19, + "grad_norm": 1.5099272251602496, + "learning_rate": 1.781551745754632e-06, + "loss": 0.6745, + "step": 7979 + }, + { + "epoch": 1.19, + "grad_norm": 0.578074794207129, + "learning_rate": 1.7814914741879583e-06, + "loss": 0.6738, + "step": 7980 + }, + { + "epoch": 1.19, + "grad_norm": 3.4057032251435264, + "learning_rate": 1.7814311953275559e-06, + "loss": 0.6868, + "step": 7981 + }, + { + "epoch": 1.19, + "grad_norm": 2.638941873021235, + "learning_rate": 1.7813709091739865e-06, + "loss": 0.6895, + "step": 7982 + }, + { + "epoch": 1.19, + "grad_norm": 3.603285314739194, + "learning_rate": 1.7813106157278138e-06, + "loss": 0.666, + "step": 7983 + }, + { + "epoch": 1.19, + "grad_norm": 1.7153938745575241, + "learning_rate": 1.7812503149895998e-06, + "loss": 0.6836, + "step": 7984 + }, + { + "epoch": 1.19, + "grad_norm": 7.227496627039111, + "learning_rate": 1.7811900069599077e-06, + "loss": 0.6797, + "step": 7985 + }, + { + "epoch": 1.19, + "grad_norm": 6.3893677565695315, + "learning_rate": 1.7811296916393e-06, + "loss": 0.6686, + "step": 7986 + }, + { + "epoch": 1.19, + "grad_norm": 0.8502642582693448, + "learning_rate": 1.7810693690283398e-06, + "loss": 0.6667, + "step": 7987 + }, + { + "epoch": 1.19, + "grad_norm": 3.380344279100638, + "learning_rate": 1.78100903912759e-06, + "loss": 0.6738, + "step": 7988 + }, + { + "epoch": 1.19, + "grad_norm": 2.2538357037385603, + "learning_rate": 1.7809487019376141e-06, + "loss": 0.6667, + "step": 7989 + }, + { + "epoch": 1.19, + "grad_norm": 4.349372436846222, + "learning_rate": 1.7808883574589747e-06, + "loss": 0.6888, + "step": 7990 + }, + { + "epoch": 1.19, + "grad_norm": 4.18419856557202, + "learning_rate": 1.7808280056922354e-06, + "loss": 0.6849, + "step": 7991 + }, + { + "epoch": 1.19, + "grad_norm": 0.7470866689820885, + "learning_rate": 1.780767646637959e-06, + "loss": 0.6432, + "step": 7992 + }, + { + "epoch": 1.19, + "grad_norm": 3.5877345973873287, + "learning_rate": 1.7807072802967094e-06, + "loss": 0.6986, + "step": 7993 + }, + { + "epoch": 1.19, + "grad_norm": 8.074806282209495, + "learning_rate": 1.7806469066690496e-06, + "loss": 0.6992, + "step": 7994 + }, + { + "epoch": 1.19, + "grad_norm": 0.6698641834329676, + "learning_rate": 1.7805865257555429e-06, + "loss": 0.6908, + "step": 7995 + }, + { + "epoch": 1.19, + "grad_norm": 1.2830441886957746, + "learning_rate": 1.7805261375567533e-06, + "loss": 0.6647, + "step": 7996 + }, + { + "epoch": 1.19, + "grad_norm": 1.8126479250228622, + "learning_rate": 1.7804657420732444e-06, + "loss": 0.6497, + "step": 7997 + }, + { + "epoch": 1.19, + "grad_norm": 1.0351347549727155, + "learning_rate": 1.7804053393055795e-06, + "loss": 0.7083, + "step": 7998 + }, + { + "epoch": 1.19, + "grad_norm": 1.7345615639081828, + "learning_rate": 1.7803449292543227e-06, + "loss": 0.6849, + "step": 7999 + }, + { + "epoch": 1.19, + "grad_norm": 2.9525567106600885, + "learning_rate": 1.7802845119200374e-06, + "loss": 0.6615, + "step": 8000 + }, + { + "epoch": 1.19, + "grad_norm": 1.161357617841448, + "learning_rate": 1.7802240873032881e-06, + "loss": 0.6589, + "step": 8001 + }, + { + "epoch": 1.19, + "grad_norm": 3.2837661028103695, + "learning_rate": 1.7801636554046381e-06, + "loss": 0.6712, + "step": 8002 + }, + { + "epoch": 1.19, + "grad_norm": 3.3913921682423656, + "learning_rate": 1.780103216224652e-06, + "loss": 0.638, + "step": 8003 + }, + { + "epoch": 1.19, + "grad_norm": 0.8902145191422632, + "learning_rate": 1.7800427697638934e-06, + "loss": 0.6836, + "step": 8004 + }, + { + "epoch": 1.19, + "grad_norm": 0.8584177940055723, + "learning_rate": 1.7799823160229268e-06, + "loss": 0.6745, + "step": 8005 + }, + { + "epoch": 1.19, + "grad_norm": 4.034260779743276, + "learning_rate": 1.7799218550023159e-06, + "loss": 0.6667, + "step": 8006 + }, + { + "epoch": 1.19, + "grad_norm": 0.9446492269179192, + "learning_rate": 1.7798613867026257e-06, + "loss": 0.7005, + "step": 8007 + }, + { + "epoch": 1.19, + "grad_norm": 4.9809564130333115, + "learning_rate": 1.7798009111244202e-06, + "loss": 0.6562, + "step": 8008 + }, + { + "epoch": 1.19, + "grad_norm": 1.640571225648137, + "learning_rate": 1.7797404282682638e-06, + "loss": 0.6725, + "step": 8009 + }, + { + "epoch": 1.19, + "grad_norm": 3.3611688690712156, + "learning_rate": 1.7796799381347207e-06, + "loss": 0.6725, + "step": 8010 + }, + { + "epoch": 1.19, + "grad_norm": 3.740362735025016, + "learning_rate": 1.7796194407243562e-06, + "loss": 0.6706, + "step": 8011 + }, + { + "epoch": 1.19, + "grad_norm": 2.231486523829095, + "learning_rate": 1.7795589360377342e-06, + "loss": 0.6855, + "step": 8012 + }, + { + "epoch": 1.2, + "grad_norm": 3.861443682891015, + "learning_rate": 1.77949842407542e-06, + "loss": 0.7025, + "step": 8013 + }, + { + "epoch": 1.2, + "grad_norm": 1.0317858106698439, + "learning_rate": 1.7794379048379779e-06, + "loss": 0.6536, + "step": 8014 + }, + { + "epoch": 1.2, + "grad_norm": 1.3639826533633104, + "learning_rate": 1.7793773783259732e-06, + "loss": 0.737, + "step": 8015 + }, + { + "epoch": 1.2, + "grad_norm": 1.563400757632636, + "learning_rate": 1.77931684453997e-06, + "loss": 0.6764, + "step": 8016 + }, + { + "epoch": 1.2, + "grad_norm": 4.160214259569214, + "learning_rate": 1.7792563034805341e-06, + "loss": 0.6556, + "step": 8017 + }, + { + "epoch": 1.2, + "grad_norm": 3.345619208943855, + "learning_rate": 1.7791957551482301e-06, + "loss": 0.6842, + "step": 8018 + }, + { + "epoch": 1.2, + "grad_norm": 3.9681055944688124, + "learning_rate": 1.7791351995436236e-06, + "loss": 0.6823, + "step": 8019 + }, + { + "epoch": 1.2, + "grad_norm": 1.8828944687789668, + "learning_rate": 1.779074636667279e-06, + "loss": 0.681, + "step": 8020 + }, + { + "epoch": 1.2, + "grad_norm": 2.5982300331165873, + "learning_rate": 1.7790140665197617e-06, + "loss": 0.681, + "step": 8021 + }, + { + "epoch": 1.2, + "grad_norm": 0.8973937710767076, + "learning_rate": 1.7789534891016374e-06, + "loss": 0.6673, + "step": 8022 + }, + { + "epoch": 1.2, + "grad_norm": 2.405715007041798, + "learning_rate": 1.7788929044134714e-06, + "loss": 0.6829, + "step": 8023 + }, + { + "epoch": 1.2, + "grad_norm": 4.017293228407578, + "learning_rate": 1.778832312455829e-06, + "loss": 0.6888, + "step": 8024 + }, + { + "epoch": 1.2, + "grad_norm": 3.668766780908194, + "learning_rate": 1.778771713229276e-06, + "loss": 0.6882, + "step": 8025 + }, + { + "epoch": 1.2, + "grad_norm": 6.166410294233481, + "learning_rate": 1.7787111067343774e-06, + "loss": 0.6966, + "step": 8026 + }, + { + "epoch": 1.2, + "grad_norm": 4.910014908049617, + "learning_rate": 1.7786504929716992e-06, + "loss": 0.6973, + "step": 8027 + }, + { + "epoch": 1.2, + "grad_norm": 4.288497264643473, + "learning_rate": 1.778589871941807e-06, + "loss": 0.709, + "step": 8028 + }, + { + "epoch": 1.2, + "grad_norm": 2.613456522682263, + "learning_rate": 1.778529243645267e-06, + "loss": 0.7077, + "step": 8029 + }, + { + "epoch": 1.2, + "grad_norm": 1.3990243345662794, + "learning_rate": 1.7784686080826448e-06, + "loss": 0.6777, + "step": 8030 + }, + { + "epoch": 1.2, + "grad_norm": 0.6208506932638238, + "learning_rate": 1.7784079652545056e-06, + "loss": 0.668, + "step": 8031 + }, + { + "epoch": 1.2, + "grad_norm": 2.6920250793065694, + "learning_rate": 1.7783473151614165e-06, + "loss": 0.6797, + "step": 8032 + }, + { + "epoch": 1.2, + "grad_norm": 2.872956856068526, + "learning_rate": 1.778286657803943e-06, + "loss": 0.6829, + "step": 8033 + }, + { + "epoch": 1.2, + "grad_norm": 2.4115288510290065, + "learning_rate": 1.7782259931826514e-06, + "loss": 0.6771, + "step": 8034 + }, + { + "epoch": 1.2, + "grad_norm": 0.6086474496094524, + "learning_rate": 1.7781653212981076e-06, + "loss": 0.6738, + "step": 8035 + }, + { + "epoch": 1.2, + "grad_norm": 6.150130938832348, + "learning_rate": 1.778104642150878e-06, + "loss": 0.651, + "step": 8036 + }, + { + "epoch": 1.2, + "grad_norm": 3.087015834008969, + "learning_rate": 1.7780439557415291e-06, + "loss": 0.6842, + "step": 8037 + }, + { + "epoch": 1.2, + "grad_norm": 2.661034417544444, + "learning_rate": 1.7779832620706271e-06, + "loss": 0.6771, + "step": 8038 + }, + { + "epoch": 1.2, + "grad_norm": 5.211064548691608, + "learning_rate": 1.7779225611387384e-06, + "loss": 0.6855, + "step": 8039 + }, + { + "epoch": 1.2, + "grad_norm": 5.088418688993373, + "learning_rate": 1.77786185294643e-06, + "loss": 0.6842, + "step": 8040 + }, + { + "epoch": 1.2, + "grad_norm": 0.9878617530880718, + "learning_rate": 1.7778011374942678e-06, + "loss": 0.6901, + "step": 8041 + }, + { + "epoch": 1.2, + "grad_norm": 2.3430529960934066, + "learning_rate": 1.777740414782819e-06, + "loss": 0.6888, + "step": 8042 + }, + { + "epoch": 1.2, + "grad_norm": 2.4551193520815238, + "learning_rate": 1.7776796848126501e-06, + "loss": 0.6686, + "step": 8043 + }, + { + "epoch": 1.2, + "grad_norm": 0.5856795747523836, + "learning_rate": 1.7776189475843277e-06, + "loss": 0.666, + "step": 8044 + }, + { + "epoch": 1.2, + "grad_norm": 0.9560912376916344, + "learning_rate": 1.7775582030984191e-06, + "loss": 0.6771, + "step": 8045 + }, + { + "epoch": 1.2, + "grad_norm": 2.091863257253711, + "learning_rate": 1.777497451355491e-06, + "loss": 0.6888, + "step": 8046 + }, + { + "epoch": 1.2, + "grad_norm": 3.054007853427597, + "learning_rate": 1.7774366923561105e-06, + "loss": 0.6816, + "step": 8047 + }, + { + "epoch": 1.2, + "grad_norm": 2.3384654683286454, + "learning_rate": 1.7773759261008446e-06, + "loss": 0.6934, + "step": 8048 + }, + { + "epoch": 1.2, + "grad_norm": 0.7666109214286962, + "learning_rate": 1.77731515259026e-06, + "loss": 0.6797, + "step": 8049 + }, + { + "epoch": 1.2, + "grad_norm": 2.0538099440078033, + "learning_rate": 1.7772543718249246e-06, + "loss": 0.6842, + "step": 8050 + }, + { + "epoch": 1.2, + "grad_norm": 1.0120625580681797, + "learning_rate": 1.7771935838054057e-06, + "loss": 0.6999, + "step": 8051 + }, + { + "epoch": 1.2, + "grad_norm": 2.42515998816361, + "learning_rate": 1.77713278853227e-06, + "loss": 0.6816, + "step": 8052 + }, + { + "epoch": 1.2, + "grad_norm": 4.319768201391808, + "learning_rate": 1.7770719860060854e-06, + "loss": 0.6836, + "step": 8053 + }, + { + "epoch": 1.2, + "grad_norm": 3.263374913898832, + "learning_rate": 1.7770111762274194e-06, + "loss": 0.6849, + "step": 8054 + }, + { + "epoch": 1.2, + "grad_norm": 1.5559779862371266, + "learning_rate": 1.7769503591968391e-06, + "loss": 0.6641, + "step": 8055 + }, + { + "epoch": 1.2, + "grad_norm": 2.4199208332517297, + "learning_rate": 1.7768895349149125e-06, + "loss": 0.6667, + "step": 8056 + }, + { + "epoch": 1.2, + "grad_norm": 0.6925805848341369, + "learning_rate": 1.7768287033822072e-06, + "loss": 0.6725, + "step": 8057 + }, + { + "epoch": 1.2, + "grad_norm": 4.659568174932323, + "learning_rate": 1.776767864599291e-06, + "loss": 0.653, + "step": 8058 + }, + { + "epoch": 1.2, + "grad_norm": 2.925555844122031, + "learning_rate": 1.7767070185667314e-06, + "loss": 0.6654, + "step": 8059 + }, + { + "epoch": 1.2, + "grad_norm": 2.5664422297620746, + "learning_rate": 1.7766461652850966e-06, + "loss": 0.7038, + "step": 8060 + }, + { + "epoch": 1.2, + "grad_norm": 1.317980239282158, + "learning_rate": 1.7765853047549544e-06, + "loss": 0.6732, + "step": 8061 + }, + { + "epoch": 1.2, + "grad_norm": 1.676664032482997, + "learning_rate": 1.7765244369768731e-06, + "loss": 0.6497, + "step": 8062 + }, + { + "epoch": 1.2, + "grad_norm": 8.426516133245375, + "learning_rate": 1.7764635619514203e-06, + "loss": 0.6986, + "step": 8063 + }, + { + "epoch": 1.2, + "grad_norm": 2.4099508765833413, + "learning_rate": 1.7764026796791645e-06, + "loss": 0.679, + "step": 8064 + }, + { + "epoch": 1.2, + "grad_norm": 2.0063990605849567, + "learning_rate": 1.776341790160674e-06, + "loss": 0.6608, + "step": 8065 + }, + { + "epoch": 1.2, + "grad_norm": 1.523918549671847, + "learning_rate": 1.7762808933965165e-06, + "loss": 0.6777, + "step": 8066 + }, + { + "epoch": 1.2, + "grad_norm": 2.6840534031114682, + "learning_rate": 1.7762199893872612e-06, + "loss": 0.6947, + "step": 8067 + }, + { + "epoch": 1.2, + "grad_norm": 1.3282259198474333, + "learning_rate": 1.776159078133476e-06, + "loss": 0.6543, + "step": 8068 + }, + { + "epoch": 1.2, + "grad_norm": 1.3509381452333078, + "learning_rate": 1.7760981596357292e-06, + "loss": 0.6855, + "step": 8069 + }, + { + "epoch": 1.2, + "grad_norm": 2.4702261360293503, + "learning_rate": 1.77603723389459e-06, + "loss": 0.6536, + "step": 8070 + }, + { + "epoch": 1.2, + "grad_norm": 1.7210330133967864, + "learning_rate": 1.7759763009106263e-06, + "loss": 0.6829, + "step": 8071 + }, + { + "epoch": 1.2, + "grad_norm": 3.1228160081058776, + "learning_rate": 1.7759153606844075e-06, + "loss": 0.679, + "step": 8072 + }, + { + "epoch": 1.2, + "grad_norm": 4.5228855206910525, + "learning_rate": 1.7758544132165015e-06, + "loss": 0.6895, + "step": 8073 + }, + { + "epoch": 1.2, + "grad_norm": 4.219122414757881, + "learning_rate": 1.7757934585074784e-06, + "loss": 0.6829, + "step": 8074 + }, + { + "epoch": 1.2, + "grad_norm": 0.9453232687609646, + "learning_rate": 1.775732496557906e-06, + "loss": 0.6999, + "step": 8075 + }, + { + "epoch": 1.2, + "grad_norm": 2.597482666800879, + "learning_rate": 1.7756715273683534e-06, + "loss": 0.6543, + "step": 8076 + }, + { + "epoch": 1.2, + "grad_norm": 2.089912729771731, + "learning_rate": 1.77561055093939e-06, + "loss": 0.6751, + "step": 8077 + }, + { + "epoch": 1.2, + "grad_norm": 4.4072179290658875, + "learning_rate": 1.775549567271585e-06, + "loss": 0.679, + "step": 8078 + }, + { + "epoch": 1.2, + "grad_norm": 2.393404095200482, + "learning_rate": 1.775488576365507e-06, + "loss": 0.6517, + "step": 8079 + }, + { + "epoch": 1.21, + "grad_norm": 2.6771229606177025, + "learning_rate": 1.7754275782217256e-06, + "loss": 0.6471, + "step": 8080 + }, + { + "epoch": 1.21, + "grad_norm": 1.2860233299192523, + "learning_rate": 1.77536657284081e-06, + "loss": 0.6576, + "step": 8081 + }, + { + "epoch": 1.21, + "grad_norm": 3.3256924145602005, + "learning_rate": 1.7753055602233298e-06, + "loss": 0.6738, + "step": 8082 + }, + { + "epoch": 1.21, + "grad_norm": 2.446997543658499, + "learning_rate": 1.775244540369854e-06, + "loss": 0.6569, + "step": 8083 + }, + { + "epoch": 1.21, + "grad_norm": 3.9618897517948515, + "learning_rate": 1.7751835132809525e-06, + "loss": 0.6927, + "step": 8084 + }, + { + "epoch": 1.21, + "grad_norm": 1.1478375543495918, + "learning_rate": 1.7751224789571944e-06, + "loss": 0.6921, + "step": 8085 + }, + { + "epoch": 1.21, + "grad_norm": 1.8881801315753346, + "learning_rate": 1.7750614373991502e-06, + "loss": 0.6497, + "step": 8086 + }, + { + "epoch": 1.21, + "grad_norm": 1.3581606964092017, + "learning_rate": 1.7750003886073886e-06, + "loss": 0.6732, + "step": 8087 + }, + { + "epoch": 1.21, + "grad_norm": 4.030436913760241, + "learning_rate": 1.7749393325824799e-06, + "loss": 0.6816, + "step": 8088 + }, + { + "epoch": 1.21, + "grad_norm": 3.6628104862269426, + "learning_rate": 1.774878269324994e-06, + "loss": 0.6829, + "step": 8089 + }, + { + "epoch": 1.21, + "grad_norm": 1.1591816743438672, + "learning_rate": 1.7748171988355007e-06, + "loss": 0.6908, + "step": 8090 + }, + { + "epoch": 1.21, + "grad_norm": 3.1259747470057437, + "learning_rate": 1.7747561211145698e-06, + "loss": 0.6543, + "step": 8091 + }, + { + "epoch": 1.21, + "grad_norm": 6.125890679222732, + "learning_rate": 1.7746950361627717e-06, + "loss": 0.6842, + "step": 8092 + }, + { + "epoch": 1.21, + "grad_norm": 1.2230837284195801, + "learning_rate": 1.7746339439806764e-06, + "loss": 0.6595, + "step": 8093 + }, + { + "epoch": 1.21, + "grad_norm": 1.3053116297129619, + "learning_rate": 1.7745728445688537e-06, + "loss": 0.6738, + "step": 8094 + }, + { + "epoch": 1.21, + "grad_norm": 2.62276005492471, + "learning_rate": 1.7745117379278742e-06, + "loss": 0.7109, + "step": 8095 + }, + { + "epoch": 1.21, + "grad_norm": 3.661674291563569, + "learning_rate": 1.7744506240583084e-06, + "loss": 0.7331, + "step": 8096 + }, + { + "epoch": 1.21, + "grad_norm": 4.982856647550589, + "learning_rate": 1.774389502960726e-06, + "loss": 0.6751, + "step": 8097 + }, + { + "epoch": 1.21, + "grad_norm": 1.5805552531362586, + "learning_rate": 1.7743283746356986e-06, + "loss": 0.7044, + "step": 8098 + }, + { + "epoch": 1.21, + "grad_norm": 0.9647756602075948, + "learning_rate": 1.7742672390837954e-06, + "loss": 0.6862, + "step": 8099 + }, + { + "epoch": 1.21, + "grad_norm": 1.9312912556320647, + "learning_rate": 1.7742060963055879e-06, + "loss": 0.6829, + "step": 8100 + }, + { + "epoch": 1.21, + "grad_norm": 1.291728778518983, + "learning_rate": 1.774144946301646e-06, + "loss": 0.6764, + "step": 8101 + }, + { + "epoch": 1.21, + "grad_norm": 1.5697339194800999, + "learning_rate": 1.774083789072541e-06, + "loss": 0.6738, + "step": 8102 + }, + { + "epoch": 1.21, + "grad_norm": 1.5915870815538846, + "learning_rate": 1.774022624618844e-06, + "loss": 0.6738, + "step": 8103 + }, + { + "epoch": 1.21, + "grad_norm": 5.816467937397159, + "learning_rate": 1.7739614529411248e-06, + "loss": 0.6999, + "step": 8104 + }, + { + "epoch": 1.21, + "grad_norm": 5.0007518427289535, + "learning_rate": 1.7739002740399554e-06, + "loss": 0.707, + "step": 8105 + }, + { + "epoch": 1.21, + "grad_norm": 2.044770120101599, + "learning_rate": 1.7738390879159061e-06, + "loss": 0.6654, + "step": 8106 + }, + { + "epoch": 1.21, + "grad_norm": 1.4674720852466587, + "learning_rate": 1.773777894569548e-06, + "loss": 0.6777, + "step": 8107 + }, + { + "epoch": 1.21, + "grad_norm": 3.038328493601596, + "learning_rate": 1.7737166940014525e-06, + "loss": 0.6465, + "step": 8108 + }, + { + "epoch": 1.21, + "grad_norm": 5.787118152520135, + "learning_rate": 1.7736554862121908e-06, + "loss": 0.6764, + "step": 8109 + }, + { + "epoch": 1.21, + "grad_norm": 0.6846141538367396, + "learning_rate": 1.7735942712023339e-06, + "loss": 0.6738, + "step": 8110 + }, + { + "epoch": 1.21, + "grad_norm": 3.7898289905964875, + "learning_rate": 1.7735330489724534e-06, + "loss": 0.6823, + "step": 8111 + }, + { + "epoch": 1.21, + "grad_norm": 1.38388505380564, + "learning_rate": 1.7734718195231203e-06, + "loss": 0.6654, + "step": 8112 + }, + { + "epoch": 1.21, + "grad_norm": 0.8729885540222594, + "learning_rate": 1.7734105828549066e-06, + "loss": 0.7051, + "step": 8113 + }, + { + "epoch": 1.21, + "grad_norm": 2.2246780821067125, + "learning_rate": 1.7733493389683832e-06, + "loss": 0.6882, + "step": 8114 + }, + { + "epoch": 1.21, + "grad_norm": 4.340450906066482, + "learning_rate": 1.7732880878641222e-06, + "loss": 0.6868, + "step": 8115 + }, + { + "epoch": 1.21, + "grad_norm": 2.2329795023766303, + "learning_rate": 1.7732268295426951e-06, + "loss": 0.6719, + "step": 8116 + }, + { + "epoch": 1.21, + "grad_norm": 1.7131971619986077, + "learning_rate": 1.7731655640046735e-06, + "loss": 0.6966, + "step": 8117 + }, + { + "epoch": 1.21, + "grad_norm": 3.5456415576942315, + "learning_rate": 1.7731042912506294e-06, + "loss": 0.6764, + "step": 8118 + }, + { + "epoch": 1.21, + "grad_norm": 3.802590835355331, + "learning_rate": 1.7730430112811348e-06, + "loss": 0.6999, + "step": 8119 + }, + { + "epoch": 1.21, + "grad_norm": 1.1053457201304784, + "learning_rate": 1.7729817240967612e-06, + "loss": 0.7122, + "step": 8120 + }, + { + "epoch": 1.21, + "grad_norm": 4.299477964562134, + "learning_rate": 1.7729204296980806e-06, + "loss": 0.6686, + "step": 8121 + }, + { + "epoch": 1.21, + "grad_norm": 2.3040373090961817, + "learning_rate": 1.7728591280856655e-06, + "loss": 0.6986, + "step": 8122 + }, + { + "epoch": 1.21, + "grad_norm": 0.9413246850467756, + "learning_rate": 1.7727978192600878e-06, + "loss": 0.6855, + "step": 8123 + }, + { + "epoch": 1.21, + "grad_norm": 3.601575715707658, + "learning_rate": 1.7727365032219198e-06, + "loss": 0.6745, + "step": 8124 + }, + { + "epoch": 1.21, + "grad_norm": 2.2999375867158327, + "learning_rate": 1.7726751799717338e-06, + "loss": 0.6641, + "step": 8125 + }, + { + "epoch": 1.21, + "grad_norm": 1.2938969326953424, + "learning_rate": 1.7726138495101018e-06, + "loss": 0.696, + "step": 8126 + }, + { + "epoch": 1.21, + "grad_norm": 1.78409393261442, + "learning_rate": 1.7725525118375963e-06, + "loss": 0.666, + "step": 8127 + }, + { + "epoch": 1.21, + "grad_norm": 5.392146370622397, + "learning_rate": 1.77249116695479e-06, + "loss": 0.6712, + "step": 8128 + }, + { + "epoch": 1.21, + "grad_norm": 7.765871937694265, + "learning_rate": 1.7724298148622553e-06, + "loss": 0.694, + "step": 8129 + }, + { + "epoch": 1.21, + "grad_norm": 4.127001047675583, + "learning_rate": 1.772368455560565e-06, + "loss": 0.679, + "step": 8130 + }, + { + "epoch": 1.21, + "grad_norm": 1.1189215638281813, + "learning_rate": 1.7723070890502915e-06, + "loss": 0.6693, + "step": 8131 + }, + { + "epoch": 1.21, + "grad_norm": 1.0053365032068624, + "learning_rate": 1.7722457153320076e-06, + "loss": 0.6836, + "step": 8132 + }, + { + "epoch": 1.21, + "grad_norm": 4.743162437287045, + "learning_rate": 1.772184334406286e-06, + "loss": 0.7051, + "step": 8133 + }, + { + "epoch": 1.21, + "grad_norm": 3.202820901249202, + "learning_rate": 1.7721229462736998e-06, + "loss": 0.6888, + "step": 8134 + }, + { + "epoch": 1.21, + "grad_norm": 3.2227190539325856, + "learning_rate": 1.772061550934822e-06, + "loss": 0.6816, + "step": 8135 + }, + { + "epoch": 1.21, + "grad_norm": 0.9296078476351898, + "learning_rate": 1.7720001483902254e-06, + "loss": 0.6797, + "step": 8136 + }, + { + "epoch": 1.21, + "grad_norm": 1.4094857244218904, + "learning_rate": 1.7719387386404832e-06, + "loss": 0.6738, + "step": 8137 + }, + { + "epoch": 1.21, + "grad_norm": 3.61277867099441, + "learning_rate": 1.7718773216861682e-06, + "loss": 0.6712, + "step": 8138 + }, + { + "epoch": 1.21, + "grad_norm": 4.355966743994376, + "learning_rate": 1.7718158975278543e-06, + "loss": 0.6921, + "step": 8139 + }, + { + "epoch": 1.21, + "grad_norm": 1.7519678321582035, + "learning_rate": 1.771754466166114e-06, + "loss": 0.6836, + "step": 8140 + }, + { + "epoch": 1.21, + "grad_norm": 1.141081157598876, + "learning_rate": 1.7716930276015214e-06, + "loss": 0.6823, + "step": 8141 + }, + { + "epoch": 1.21, + "grad_norm": 2.1453353169098195, + "learning_rate": 1.7716315818346495e-06, + "loss": 0.6836, + "step": 8142 + }, + { + "epoch": 1.21, + "grad_norm": 1.8122510155121356, + "learning_rate": 1.7715701288660715e-06, + "loss": 0.6608, + "step": 8143 + }, + { + "epoch": 1.21, + "grad_norm": 1.2510360251088728, + "learning_rate": 1.7715086686963612e-06, + "loss": 0.694, + "step": 8144 + }, + { + "epoch": 1.21, + "grad_norm": 0.7149843358698816, + "learning_rate": 1.7714472013260926e-06, + "loss": 0.6751, + "step": 8145 + }, + { + "epoch": 1.21, + "grad_norm": 0.896976054499155, + "learning_rate": 1.771385726755839e-06, + "loss": 0.6966, + "step": 8146 + }, + { + "epoch": 1.22, + "grad_norm": 0.6931386332700419, + "learning_rate": 1.771324244986174e-06, + "loss": 0.6673, + "step": 8147 + }, + { + "epoch": 1.22, + "grad_norm": 3.34364711354458, + "learning_rate": 1.7712627560176715e-06, + "loss": 0.6615, + "step": 8148 + }, + { + "epoch": 1.22, + "grad_norm": 2.7357485941270645, + "learning_rate": 1.7712012598509059e-06, + "loss": 0.6641, + "step": 8149 + }, + { + "epoch": 1.22, + "grad_norm": 0.7956279782142406, + "learning_rate": 1.7711397564864505e-06, + "loss": 0.6673, + "step": 8150 + }, + { + "epoch": 1.22, + "grad_norm": 0.7082800316012083, + "learning_rate": 1.7710782459248796e-06, + "loss": 0.651, + "step": 8151 + }, + { + "epoch": 1.22, + "grad_norm": 4.264009610533429, + "learning_rate": 1.7710167281667672e-06, + "loss": 0.6803, + "step": 8152 + }, + { + "epoch": 1.22, + "grad_norm": 1.7197508724951214, + "learning_rate": 1.7709552032126875e-06, + "loss": 0.6947, + "step": 8153 + }, + { + "epoch": 1.22, + "grad_norm": 2.4313470798188477, + "learning_rate": 1.7708936710632148e-06, + "loss": 0.666, + "step": 8154 + }, + { + "epoch": 1.22, + "grad_norm": 2.52813779146507, + "learning_rate": 1.7708321317189233e-06, + "loss": 0.6882, + "step": 8155 + }, + { + "epoch": 1.22, + "grad_norm": 3.8424798962576836, + "learning_rate": 1.7707705851803873e-06, + "loss": 0.6693, + "step": 8156 + }, + { + "epoch": 1.22, + "grad_norm": 3.8992045368418347, + "learning_rate": 1.770709031448181e-06, + "loss": 0.6549, + "step": 8157 + }, + { + "epoch": 1.22, + "grad_norm": 2.582451843038587, + "learning_rate": 1.7706474705228794e-06, + "loss": 0.7168, + "step": 8158 + }, + { + "epoch": 1.22, + "grad_norm": 7.243826918951421, + "learning_rate": 1.7705859024050568e-06, + "loss": 0.7259, + "step": 8159 + }, + { + "epoch": 1.22, + "grad_norm": 1.1529046162296575, + "learning_rate": 1.770524327095288e-06, + "loss": 0.6628, + "step": 8160 + }, + { + "epoch": 1.22, + "grad_norm": 6.214789892082113, + "learning_rate": 1.7704627445941474e-06, + "loss": 0.7044, + "step": 8161 + }, + { + "epoch": 1.22, + "grad_norm": 0.7529585416549178, + "learning_rate": 1.7704011549022097e-06, + "loss": 0.6862, + "step": 8162 + }, + { + "epoch": 1.22, + "grad_norm": 2.149854405522985, + "learning_rate": 1.77033955802005e-06, + "loss": 0.6797, + "step": 8163 + }, + { + "epoch": 1.22, + "grad_norm": 3.4119175255615324, + "learning_rate": 1.770277953948243e-06, + "loss": 0.694, + "step": 8164 + }, + { + "epoch": 1.22, + "grad_norm": 1.2092186990065747, + "learning_rate": 1.7702163426873638e-06, + "loss": 0.6921, + "step": 8165 + }, + { + "epoch": 1.22, + "grad_norm": 2.3605048872060412, + "learning_rate": 1.7701547242379876e-06, + "loss": 0.7005, + "step": 8166 + }, + { + "epoch": 1.22, + "grad_norm": 0.7633541672538737, + "learning_rate": 1.770093098600689e-06, + "loss": 0.6934, + "step": 8167 + }, + { + "epoch": 1.22, + "grad_norm": 0.6695014281244466, + "learning_rate": 1.7700314657760434e-06, + "loss": 0.668, + "step": 8168 + }, + { + "epoch": 1.22, + "grad_norm": 1.6385630013588564, + "learning_rate": 1.769969825764626e-06, + "loss": 0.6732, + "step": 8169 + }, + { + "epoch": 1.22, + "grad_norm": 0.6879260657156457, + "learning_rate": 1.7699081785670123e-06, + "loss": 0.668, + "step": 8170 + }, + { + "epoch": 1.22, + "grad_norm": 0.7459719294779475, + "learning_rate": 1.769846524183777e-06, + "loss": 0.668, + "step": 8171 + }, + { + "epoch": 1.22, + "grad_norm": 2.788413024224751, + "learning_rate": 1.7697848626154966e-06, + "loss": 0.6829, + "step": 8172 + }, + { + "epoch": 1.22, + "grad_norm": 0.6855647258704008, + "learning_rate": 1.7697231938627458e-06, + "loss": 0.6732, + "step": 8173 + }, + { + "epoch": 1.22, + "grad_norm": 3.008385905203185, + "learning_rate": 1.7696615179261003e-06, + "loss": 0.707, + "step": 8174 + }, + { + "epoch": 1.22, + "grad_norm": 4.15455542580627, + "learning_rate": 1.7695998348061357e-06, + "loss": 0.6758, + "step": 8175 + }, + { + "epoch": 1.22, + "grad_norm": 0.980500633382956, + "learning_rate": 1.7695381445034281e-06, + "loss": 0.6706, + "step": 8176 + }, + { + "epoch": 1.22, + "grad_norm": 1.520957677985349, + "learning_rate": 1.7694764470185525e-06, + "loss": 0.6634, + "step": 8177 + }, + { + "epoch": 1.22, + "grad_norm": 3.379698861131609, + "learning_rate": 1.7694147423520856e-06, + "loss": 0.6777, + "step": 8178 + }, + { + "epoch": 1.22, + "grad_norm": 1.2209084734528144, + "learning_rate": 1.7693530305046025e-06, + "loss": 0.6491, + "step": 8179 + }, + { + "epoch": 1.22, + "grad_norm": 5.6179639975104045, + "learning_rate": 1.76929131147668e-06, + "loss": 0.6686, + "step": 8180 + }, + { + "epoch": 1.22, + "grad_norm": 1.1146303887758724, + "learning_rate": 1.7692295852688931e-06, + "loss": 0.6934, + "step": 8181 + }, + { + "epoch": 1.22, + "grad_norm": 6.783286621290227, + "learning_rate": 1.7691678518818188e-06, + "loss": 0.681, + "step": 8182 + }, + { + "epoch": 1.22, + "grad_norm": 0.6912562622230408, + "learning_rate": 1.7691061113160328e-06, + "loss": 0.6823, + "step": 8183 + }, + { + "epoch": 1.22, + "grad_norm": 1.3899496637048219, + "learning_rate": 1.7690443635721115e-06, + "loss": 0.6471, + "step": 8184 + }, + { + "epoch": 1.22, + "grad_norm": 6.736847900391969, + "learning_rate": 1.7689826086506311e-06, + "loss": 0.6745, + "step": 8185 + }, + { + "epoch": 1.22, + "grad_norm": 2.5359549229810434, + "learning_rate": 1.7689208465521682e-06, + "loss": 0.6712, + "step": 8186 + }, + { + "epoch": 1.22, + "grad_norm": 1.1757375208855845, + "learning_rate": 1.7688590772772987e-06, + "loss": 0.6549, + "step": 8187 + }, + { + "epoch": 1.22, + "grad_norm": 0.7010415049602778, + "learning_rate": 1.7687973008265998e-06, + "loss": 0.6673, + "step": 8188 + }, + { + "epoch": 1.22, + "grad_norm": 2.0060363621947706, + "learning_rate": 1.7687355172006476e-06, + "loss": 0.7005, + "step": 8189 + }, + { + "epoch": 1.22, + "grad_norm": 2.3995823685756776, + "learning_rate": 1.7686737264000184e-06, + "loss": 0.6797, + "step": 8190 + }, + { + "epoch": 1.22, + "grad_norm": 2.3964460786265, + "learning_rate": 1.76861192842529e-06, + "loss": 0.6901, + "step": 8191 + }, + { + "epoch": 1.22, + "grad_norm": 3.8270871460565923, + "learning_rate": 1.7685501232770381e-06, + "loss": 0.7012, + "step": 8192 + }, + { + "epoch": 1.22, + "grad_norm": 1.605134420658266, + "learning_rate": 1.7684883109558399e-06, + "loss": 0.6445, + "step": 8193 + }, + { + "epoch": 1.22, + "grad_norm": 2.2137068557397392, + "learning_rate": 1.7684264914622724e-06, + "loss": 0.6595, + "step": 8194 + }, + { + "epoch": 1.22, + "grad_norm": 1.2496740608981667, + "learning_rate": 1.7683646647969125e-06, + "loss": 0.6667, + "step": 8195 + }, + { + "epoch": 1.22, + "grad_norm": 1.424385354956029, + "learning_rate": 1.7683028309603373e-06, + "loss": 0.6621, + "step": 8196 + }, + { + "epoch": 1.22, + "grad_norm": 0.9142815550875967, + "learning_rate": 1.7682409899531238e-06, + "loss": 0.6888, + "step": 8197 + }, + { + "epoch": 1.22, + "grad_norm": 3.116326621822263, + "learning_rate": 1.7681791417758495e-06, + "loss": 0.6725, + "step": 8198 + }, + { + "epoch": 1.22, + "grad_norm": 4.175419675368888, + "learning_rate": 1.7681172864290909e-06, + "loss": 0.6471, + "step": 8199 + }, + { + "epoch": 1.22, + "grad_norm": 1.4248473542400282, + "learning_rate": 1.768055423913426e-06, + "loss": 0.666, + "step": 8200 + }, + { + "epoch": 1.22, + "grad_norm": 0.9246134005067722, + "learning_rate": 1.7679935542294318e-06, + "loss": 0.6803, + "step": 8201 + }, + { + "epoch": 1.22, + "grad_norm": 0.867645803507274, + "learning_rate": 1.7679316773776857e-06, + "loss": 0.6882, + "step": 8202 + }, + { + "epoch": 1.22, + "grad_norm": 2.2035800219688637, + "learning_rate": 1.7678697933587653e-06, + "loss": 0.6719, + "step": 8203 + }, + { + "epoch": 1.22, + "grad_norm": 0.935903801952249, + "learning_rate": 1.7678079021732484e-06, + "loss": 0.6758, + "step": 8204 + }, + { + "epoch": 1.22, + "grad_norm": 2.3187789304899704, + "learning_rate": 1.7677460038217125e-06, + "loss": 0.6764, + "step": 8205 + }, + { + "epoch": 1.22, + "grad_norm": 2.6409512757403943, + "learning_rate": 1.7676840983047355e-06, + "loss": 0.7083, + "step": 8206 + }, + { + "epoch": 1.22, + "grad_norm": 0.9184512983405025, + "learning_rate": 1.7676221856228946e-06, + "loss": 0.6686, + "step": 8207 + }, + { + "epoch": 1.22, + "grad_norm": 3.014943328206297, + "learning_rate": 1.7675602657767677e-06, + "loss": 0.6777, + "step": 8208 + }, + { + "epoch": 1.22, + "grad_norm": 4.565836661722948, + "learning_rate": 1.7674983387669334e-06, + "loss": 0.6589, + "step": 8209 + }, + { + "epoch": 1.22, + "grad_norm": 1.5143088786371517, + "learning_rate": 1.7674364045939689e-06, + "loss": 0.6654, + "step": 8210 + }, + { + "epoch": 1.22, + "grad_norm": 3.2323895371490914, + "learning_rate": 1.7673744632584526e-06, + "loss": 0.7051, + "step": 8211 + }, + { + "epoch": 1.22, + "grad_norm": 3.548704437811231, + "learning_rate": 1.7673125147609626e-06, + "loss": 0.6719, + "step": 8212 + }, + { + "epoch": 1.22, + "grad_norm": 2.0199018355121914, + "learning_rate": 1.7672505591020771e-06, + "loss": 0.6608, + "step": 8213 + }, + { + "epoch": 1.23, + "grad_norm": 1.2698697524866405, + "learning_rate": 1.7671885962823744e-06, + "loss": 0.6829, + "step": 8214 + }, + { + "epoch": 1.23, + "grad_norm": 0.7031258780292228, + "learning_rate": 1.7671266263024327e-06, + "loss": 0.6882, + "step": 8215 + }, + { + "epoch": 1.23, + "grad_norm": 1.136399521484059, + "learning_rate": 1.7670646491628304e-06, + "loss": 0.7012, + "step": 8216 + }, + { + "epoch": 1.23, + "grad_norm": 1.0313037312679345, + "learning_rate": 1.7670026648641457e-06, + "loss": 0.6647, + "step": 8217 + }, + { + "epoch": 1.23, + "grad_norm": 2.075095039569451, + "learning_rate": 1.7669406734069572e-06, + "loss": 0.6829, + "step": 8218 + }, + { + "epoch": 1.23, + "grad_norm": 1.5573693606319854, + "learning_rate": 1.7668786747918438e-06, + "loss": 0.6784, + "step": 8219 + }, + { + "epoch": 1.23, + "grad_norm": 1.962938850727921, + "learning_rate": 1.7668166690193837e-06, + "loss": 0.6862, + "step": 8220 + }, + { + "epoch": 1.23, + "grad_norm": 1.7720515640590893, + "learning_rate": 1.766754656090156e-06, + "loss": 0.6849, + "step": 8221 + }, + { + "epoch": 1.23, + "grad_norm": 0.6142316470200578, + "learning_rate": 1.7666926360047393e-06, + "loss": 0.6927, + "step": 8222 + }, + { + "epoch": 1.23, + "grad_norm": 1.2526534836131513, + "learning_rate": 1.7666306087637125e-06, + "loss": 0.6719, + "step": 8223 + }, + { + "epoch": 1.23, + "grad_norm": 1.034490408875806, + "learning_rate": 1.7665685743676541e-06, + "loss": 0.6673, + "step": 8224 + }, + { + "epoch": 1.23, + "grad_norm": 3.3818874180539242, + "learning_rate": 1.7665065328171438e-06, + "loss": 0.6602, + "step": 8225 + }, + { + "epoch": 1.23, + "grad_norm": 7.290855308578177, + "learning_rate": 1.76644448411276e-06, + "loss": 0.6953, + "step": 8226 + }, + { + "epoch": 1.23, + "grad_norm": 4.841056557950499, + "learning_rate": 1.766382428255082e-06, + "loss": 0.7233, + "step": 8227 + }, + { + "epoch": 1.23, + "grad_norm": 5.511663957425788, + "learning_rate": 1.766320365244689e-06, + "loss": 0.6986, + "step": 8228 + }, + { + "epoch": 1.23, + "grad_norm": 2.5184178009884994, + "learning_rate": 1.7662582950821604e-06, + "loss": 0.696, + "step": 8229 + }, + { + "epoch": 1.23, + "grad_norm": 0.6621913900281183, + "learning_rate": 1.7661962177680754e-06, + "loss": 0.6888, + "step": 8230 + }, + { + "epoch": 1.23, + "grad_norm": 2.6650757063575634, + "learning_rate": 1.766134133303013e-06, + "loss": 0.6914, + "step": 8231 + }, + { + "epoch": 1.23, + "grad_norm": 3.0452233633113086, + "learning_rate": 1.7660720416875534e-06, + "loss": 0.6738, + "step": 8232 + }, + { + "epoch": 1.23, + "grad_norm": 1.2661483744980122, + "learning_rate": 1.7660099429222757e-06, + "loss": 0.6986, + "step": 8233 + }, + { + "epoch": 1.23, + "grad_norm": 1.420322240065862, + "learning_rate": 1.7659478370077591e-06, + "loss": 0.6608, + "step": 8234 + }, + { + "epoch": 1.23, + "grad_norm": 2.8641081392238763, + "learning_rate": 1.765885723944584e-06, + "loss": 0.6673, + "step": 8235 + }, + { + "epoch": 1.23, + "grad_norm": 0.6541503785422895, + "learning_rate": 1.7658236037333294e-06, + "loss": 0.6999, + "step": 8236 + }, + { + "epoch": 1.23, + "grad_norm": 1.4978601258745312, + "learning_rate": 1.7657614763745755e-06, + "loss": 0.6673, + "step": 8237 + }, + { + "epoch": 1.23, + "grad_norm": 4.0714027312415375, + "learning_rate": 1.7656993418689022e-06, + "loss": 0.6738, + "step": 8238 + }, + { + "epoch": 1.23, + "grad_norm": 4.648879169107128, + "learning_rate": 1.7656372002168886e-06, + "loss": 0.6914, + "step": 8239 + }, + { + "epoch": 1.23, + "grad_norm": 1.4318971321190677, + "learning_rate": 1.7655750514191158e-06, + "loss": 0.6797, + "step": 8240 + }, + { + "epoch": 1.23, + "grad_norm": 2.195239957366541, + "learning_rate": 1.7655128954761635e-06, + "loss": 0.6771, + "step": 8241 + }, + { + "epoch": 1.23, + "grad_norm": 1.3276709569288978, + "learning_rate": 1.7654507323886114e-06, + "loss": 0.6523, + "step": 8242 + }, + { + "epoch": 1.23, + "grad_norm": 0.7777173560939453, + "learning_rate": 1.76538856215704e-06, + "loss": 0.6784, + "step": 8243 + }, + { + "epoch": 1.23, + "grad_norm": 0.5651129849808804, + "learning_rate": 1.7653263847820292e-06, + "loss": 0.6868, + "step": 8244 + }, + { + "epoch": 1.23, + "grad_norm": 1.0633022339523535, + "learning_rate": 1.7652642002641597e-06, + "loss": 0.6621, + "step": 8245 + }, + { + "epoch": 1.23, + "grad_norm": 4.651009633298751, + "learning_rate": 1.765202008604012e-06, + "loss": 0.681, + "step": 8246 + }, + { + "epoch": 1.23, + "grad_norm": 1.1803580753357932, + "learning_rate": 1.7651398098021662e-06, + "loss": 0.6706, + "step": 8247 + }, + { + "epoch": 1.23, + "grad_norm": 1.4081566319215593, + "learning_rate": 1.7650776038592027e-06, + "loss": 0.6868, + "step": 8248 + }, + { + "epoch": 1.23, + "grad_norm": 0.6338749896252511, + "learning_rate": 1.7650153907757024e-06, + "loss": 0.6868, + "step": 8249 + }, + { + "epoch": 1.23, + "grad_norm": 2.6172661787155778, + "learning_rate": 1.7649531705522456e-06, + "loss": 0.666, + "step": 8250 + }, + { + "epoch": 1.23, + "grad_norm": 5.842230117963634, + "learning_rate": 1.7648909431894134e-06, + "loss": 0.6823, + "step": 8251 + }, + { + "epoch": 1.23, + "grad_norm": 4.818393758993432, + "learning_rate": 1.7648287086877863e-06, + "loss": 0.6836, + "step": 8252 + }, + { + "epoch": 1.23, + "grad_norm": 1.1326406186725255, + "learning_rate": 1.7647664670479455e-06, + "loss": 0.6667, + "step": 8253 + }, + { + "epoch": 1.23, + "grad_norm": 0.6568878366838505, + "learning_rate": 1.7647042182704712e-06, + "loss": 0.6628, + "step": 8254 + }, + { + "epoch": 1.23, + "grad_norm": 3.8146669391815524, + "learning_rate": 1.764641962355945e-06, + "loss": 0.6849, + "step": 8255 + }, + { + "epoch": 1.23, + "grad_norm": 2.7339644212500485, + "learning_rate": 1.7645796993049477e-06, + "loss": 0.6999, + "step": 8256 + }, + { + "epoch": 1.23, + "grad_norm": 2.508111828057744, + "learning_rate": 1.7645174291180607e-06, + "loss": 0.679, + "step": 8257 + }, + { + "epoch": 1.23, + "grad_norm": 3.7166855283642843, + "learning_rate": 1.7644551517958646e-06, + "loss": 0.6686, + "step": 8258 + }, + { + "epoch": 1.23, + "grad_norm": 0.6974609093641714, + "learning_rate": 1.764392867338941e-06, + "loss": 0.6803, + "step": 8259 + }, + { + "epoch": 1.23, + "grad_norm": 3.089076446921345, + "learning_rate": 1.7643305757478713e-06, + "loss": 0.6927, + "step": 8260 + }, + { + "epoch": 1.23, + "grad_norm": 0.8677492948187838, + "learning_rate": 1.7642682770232367e-06, + "loss": 0.6628, + "step": 8261 + }, + { + "epoch": 1.23, + "grad_norm": 2.3050916289694765, + "learning_rate": 1.7642059711656186e-06, + "loss": 0.6576, + "step": 8262 + }, + { + "epoch": 1.23, + "grad_norm": 4.140313732100836, + "learning_rate": 1.7641436581755988e-06, + "loss": 0.6706, + "step": 8263 + }, + { + "epoch": 1.23, + "grad_norm": 1.0493675425011166, + "learning_rate": 1.7640813380537585e-06, + "loss": 0.6966, + "step": 8264 + }, + { + "epoch": 1.23, + "grad_norm": 0.7798459607596451, + "learning_rate": 1.7640190108006796e-06, + "loss": 0.6758, + "step": 8265 + }, + { + "epoch": 1.23, + "grad_norm": 4.8160955311859865, + "learning_rate": 1.7639566764169436e-06, + "loss": 0.666, + "step": 8266 + }, + { + "epoch": 1.23, + "grad_norm": 1.7588300322555348, + "learning_rate": 1.7638943349031323e-06, + "loss": 0.6862, + "step": 8267 + }, + { + "epoch": 1.23, + "grad_norm": 3.3849728026728627, + "learning_rate": 1.763831986259828e-06, + "loss": 0.6556, + "step": 8268 + }, + { + "epoch": 1.23, + "grad_norm": 1.136590102870004, + "learning_rate": 1.7637696304876118e-06, + "loss": 0.6458, + "step": 8269 + }, + { + "epoch": 1.23, + "grad_norm": 1.0981610571080447, + "learning_rate": 1.7637072675870664e-06, + "loss": 0.6647, + "step": 8270 + }, + { + "epoch": 1.23, + "grad_norm": 2.3013501861278263, + "learning_rate": 1.7636448975587733e-06, + "loss": 0.7122, + "step": 8271 + }, + { + "epoch": 1.23, + "grad_norm": 3.830554378136484, + "learning_rate": 1.7635825204033148e-06, + "loss": 0.6855, + "step": 8272 + }, + { + "epoch": 1.23, + "grad_norm": 3.50938204458683, + "learning_rate": 1.7635201361212732e-06, + "loss": 0.6732, + "step": 8273 + }, + { + "epoch": 1.23, + "grad_norm": 1.9859260435078687, + "learning_rate": 1.763457744713231e-06, + "loss": 0.6732, + "step": 8274 + }, + { + "epoch": 1.23, + "grad_norm": 1.6658647293460294, + "learning_rate": 1.7633953461797698e-06, + "loss": 0.6953, + "step": 8275 + }, + { + "epoch": 1.23, + "grad_norm": 4.93105816920559, + "learning_rate": 1.7633329405214722e-06, + "loss": 0.668, + "step": 8276 + }, + { + "epoch": 1.23, + "grad_norm": 2.8291309886595624, + "learning_rate": 1.7632705277389209e-06, + "loss": 0.6862, + "step": 8277 + }, + { + "epoch": 1.23, + "grad_norm": 1.3334361972311257, + "learning_rate": 1.7632081078326984e-06, + "loss": 0.6673, + "step": 8278 + }, + { + "epoch": 1.23, + "grad_norm": 6.672718668178125, + "learning_rate": 1.7631456808033866e-06, + "loss": 0.696, + "step": 8279 + }, + { + "epoch": 1.23, + "grad_norm": 3.4459198810863274, + "learning_rate": 1.7630832466515692e-06, + "loss": 0.6745, + "step": 8280 + }, + { + "epoch": 1.24, + "grad_norm": 4.011819732756287, + "learning_rate": 1.7630208053778284e-06, + "loss": 0.6732, + "step": 8281 + }, + { + "epoch": 1.24, + "grad_norm": 2.4273266034255836, + "learning_rate": 1.7629583569827467e-06, + "loss": 0.6771, + "step": 8282 + }, + { + "epoch": 1.24, + "grad_norm": 0.6821066302691414, + "learning_rate": 1.7628959014669074e-06, + "loss": 0.6784, + "step": 8283 + }, + { + "epoch": 1.24, + "grad_norm": 1.2279563110993712, + "learning_rate": 1.762833438830893e-06, + "loss": 0.6523, + "step": 8284 + }, + { + "epoch": 1.24, + "grad_norm": 2.4964478225566977, + "learning_rate": 1.7627709690752867e-06, + "loss": 0.6797, + "step": 8285 + }, + { + "epoch": 1.24, + "grad_norm": 0.7613035736281238, + "learning_rate": 1.7627084922006716e-06, + "loss": 0.6628, + "step": 8286 + }, + { + "epoch": 1.24, + "grad_norm": 3.308376154090149, + "learning_rate": 1.7626460082076307e-06, + "loss": 0.6628, + "step": 8287 + }, + { + "epoch": 1.24, + "grad_norm": 4.7640308642098965, + "learning_rate": 1.762583517096747e-06, + "loss": 0.6725, + "step": 8288 + }, + { + "epoch": 1.24, + "grad_norm": 2.2261520742141516, + "learning_rate": 1.762521018868604e-06, + "loss": 0.6829, + "step": 8289 + }, + { + "epoch": 1.24, + "grad_norm": 2.7437878893456173, + "learning_rate": 1.7624585135237849e-06, + "loss": 0.6829, + "step": 8290 + }, + { + "epoch": 1.24, + "grad_norm": 4.01606947286814, + "learning_rate": 1.762396001062873e-06, + "loss": 0.6556, + "step": 8291 + }, + { + "epoch": 1.24, + "grad_norm": 1.5427473632290218, + "learning_rate": 1.7623334814864517e-06, + "loss": 0.6634, + "step": 8292 + }, + { + "epoch": 1.24, + "grad_norm": 3.1379605067542924, + "learning_rate": 1.7622709547951047e-06, + "loss": 0.6602, + "step": 8293 + }, + { + "epoch": 1.24, + "grad_norm": 0.9523811727279502, + "learning_rate": 1.7622084209894155e-06, + "loss": 0.6641, + "step": 8294 + }, + { + "epoch": 1.24, + "grad_norm": 3.132693475938538, + "learning_rate": 1.7621458800699677e-06, + "loss": 0.6999, + "step": 8295 + }, + { + "epoch": 1.24, + "grad_norm": 2.609455016406023, + "learning_rate": 1.7620833320373452e-06, + "loss": 0.679, + "step": 8296 + }, + { + "epoch": 1.24, + "grad_norm": 3.210516049181611, + "learning_rate": 1.7620207768921311e-06, + "loss": 0.6764, + "step": 8297 + }, + { + "epoch": 1.24, + "grad_norm": 0.881287192366005, + "learning_rate": 1.7619582146349101e-06, + "loss": 0.6784, + "step": 8298 + }, + { + "epoch": 1.24, + "grad_norm": 2.0740147524569013, + "learning_rate": 1.7618956452662654e-06, + "loss": 0.6745, + "step": 8299 + }, + { + "epoch": 1.24, + "grad_norm": 0.9332850608717427, + "learning_rate": 1.7618330687867815e-06, + "loss": 0.679, + "step": 8300 + }, + { + "epoch": 1.24, + "grad_norm": 1.272861602782909, + "learning_rate": 1.761770485197042e-06, + "loss": 0.6621, + "step": 8301 + }, + { + "epoch": 1.24, + "grad_norm": 4.669319098657656, + "learning_rate": 1.7617078944976313e-06, + "loss": 0.651, + "step": 8302 + }, + { + "epoch": 1.24, + "grad_norm": 0.8736833930219595, + "learning_rate": 1.7616452966891335e-06, + "loss": 0.6602, + "step": 8303 + }, + { + "epoch": 1.24, + "grad_norm": 5.660221529832421, + "learning_rate": 1.7615826917721328e-06, + "loss": 0.6836, + "step": 8304 + }, + { + "epoch": 1.24, + "grad_norm": 2.4802027157881805, + "learning_rate": 1.7615200797472136e-06, + "loss": 0.7005, + "step": 8305 + }, + { + "epoch": 1.24, + "grad_norm": 1.2186244419074548, + "learning_rate": 1.76145746061496e-06, + "loss": 0.6719, + "step": 8306 + }, + { + "epoch": 1.24, + "grad_norm": 3.2813224927459954, + "learning_rate": 1.7613948343759563e-06, + "loss": 0.64, + "step": 8307 + }, + { + "epoch": 1.24, + "grad_norm": 6.063402143950922, + "learning_rate": 1.7613322010307877e-06, + "loss": 0.6693, + "step": 8308 + }, + { + "epoch": 1.24, + "grad_norm": 0.8668916099258407, + "learning_rate": 1.7612695605800383e-06, + "loss": 0.6732, + "step": 8309 + }, + { + "epoch": 1.24, + "grad_norm": 1.2391618177511563, + "learning_rate": 1.7612069130242926e-06, + "loss": 0.666, + "step": 8310 + }, + { + "epoch": 1.24, + "grad_norm": 2.5290692136380013, + "learning_rate": 1.7611442583641356e-06, + "loss": 0.6641, + "step": 8311 + }, + { + "epoch": 1.24, + "grad_norm": 1.460435191437211, + "learning_rate": 1.7610815966001518e-06, + "loss": 0.6556, + "step": 8312 + }, + { + "epoch": 1.24, + "grad_norm": 2.849453255899012, + "learning_rate": 1.761018927732926e-06, + "loss": 0.6875, + "step": 8313 + }, + { + "epoch": 1.24, + "grad_norm": 0.9174316576932129, + "learning_rate": 1.7609562517630437e-06, + "loss": 0.6829, + "step": 8314 + }, + { + "epoch": 1.24, + "grad_norm": 5.652115576801831, + "learning_rate": 1.760893568691089e-06, + "loss": 0.6641, + "step": 8315 + }, + { + "epoch": 1.24, + "grad_norm": 5.287898989315849, + "learning_rate": 1.7608308785176476e-06, + "loss": 0.6517, + "step": 8316 + }, + { + "epoch": 1.24, + "grad_norm": 1.028516130335906, + "learning_rate": 1.7607681812433039e-06, + "loss": 0.6764, + "step": 8317 + }, + { + "epoch": 1.24, + "grad_norm": 3.6258883119933003, + "learning_rate": 1.760705476868644e-06, + "loss": 0.6719, + "step": 8318 + }, + { + "epoch": 1.24, + "grad_norm": 3.7932115266417235, + "learning_rate": 1.7606427653942527e-06, + "loss": 0.6836, + "step": 8319 + }, + { + "epoch": 1.24, + "grad_norm": 0.9315254493509603, + "learning_rate": 1.7605800468207145e-06, + "loss": 0.6836, + "step": 8320 + }, + { + "epoch": 1.24, + "grad_norm": 1.4611501884078304, + "learning_rate": 1.7605173211486161e-06, + "loss": 0.7025, + "step": 8321 + }, + { + "epoch": 1.24, + "grad_norm": 2.256899317167705, + "learning_rate": 1.760454588378542e-06, + "loss": 0.6764, + "step": 8322 + }, + { + "epoch": 1.24, + "grad_norm": 3.4705917822017156, + "learning_rate": 1.760391848511078e-06, + "loss": 0.7272, + "step": 8323 + }, + { + "epoch": 1.24, + "grad_norm": 0.9986078069386555, + "learning_rate": 1.7603291015468096e-06, + "loss": 0.7083, + "step": 8324 + }, + { + "epoch": 1.24, + "grad_norm": 1.6899027446325254, + "learning_rate": 1.7602663474863224e-06, + "loss": 0.6569, + "step": 8325 + }, + { + "epoch": 1.24, + "grad_norm": 6.670374723060532, + "learning_rate": 1.7602035863302024e-06, + "loss": 0.7018, + "step": 8326 + }, + { + "epoch": 1.24, + "grad_norm": 2.127358785799581, + "learning_rate": 1.7601408180790349e-06, + "loss": 0.6595, + "step": 8327 + }, + { + "epoch": 1.24, + "grad_norm": 1.6549007358110601, + "learning_rate": 1.760078042733406e-06, + "loss": 0.6914, + "step": 8328 + }, + { + "epoch": 1.24, + "grad_norm": 1.7585197971847122, + "learning_rate": 1.7600152602939016e-06, + "loss": 0.6706, + "step": 8329 + }, + { + "epoch": 1.24, + "grad_norm": 0.6326049604782407, + "learning_rate": 1.7599524707611074e-06, + "loss": 0.6738, + "step": 8330 + }, + { + "epoch": 1.24, + "grad_norm": 1.1621457043205343, + "learning_rate": 1.7598896741356097e-06, + "loss": 0.681, + "step": 8331 + }, + { + "epoch": 1.24, + "grad_norm": 1.097046593798738, + "learning_rate": 1.7598268704179944e-06, + "loss": 0.6816, + "step": 8332 + }, + { + "epoch": 1.24, + "grad_norm": 1.9852633995692814, + "learning_rate": 1.7597640596088477e-06, + "loss": 0.6615, + "step": 8333 + }, + { + "epoch": 1.24, + "grad_norm": 3.7821357844372407, + "learning_rate": 1.759701241708756e-06, + "loss": 0.6829, + "step": 8334 + }, + { + "epoch": 1.24, + "grad_norm": 1.342516555943925, + "learning_rate": 1.7596384167183054e-06, + "loss": 0.6602, + "step": 8335 + }, + { + "epoch": 1.24, + "grad_norm": 3.1340567845804848, + "learning_rate": 1.759575584638082e-06, + "loss": 0.666, + "step": 8336 + }, + { + "epoch": 1.24, + "grad_norm": 0.8813073871417317, + "learning_rate": 1.7595127454686727e-06, + "loss": 0.6497, + "step": 8337 + }, + { + "epoch": 1.24, + "grad_norm": 3.129347412582638, + "learning_rate": 1.7594498992106638e-06, + "loss": 0.6797, + "step": 8338 + }, + { + "epoch": 1.24, + "grad_norm": 1.1310518934610587, + "learning_rate": 1.7593870458646419e-06, + "loss": 0.6667, + "step": 8339 + }, + { + "epoch": 1.24, + "grad_norm": 4.433729236853284, + "learning_rate": 1.7593241854311933e-06, + "loss": 0.694, + "step": 8340 + }, + { + "epoch": 1.24, + "grad_norm": 0.6592604820942095, + "learning_rate": 1.759261317910905e-06, + "loss": 0.6764, + "step": 8341 + }, + { + "epoch": 1.24, + "grad_norm": 2.404018492965414, + "learning_rate": 1.759198443304364e-06, + "loss": 0.6641, + "step": 8342 + }, + { + "epoch": 1.24, + "grad_norm": 0.7519227063821126, + "learning_rate": 1.7591355616121564e-06, + "loss": 0.6628, + "step": 8343 + }, + { + "epoch": 1.24, + "grad_norm": 2.0037361141013075, + "learning_rate": 1.7590726728348693e-06, + "loss": 0.6868, + "step": 8344 + }, + { + "epoch": 1.24, + "grad_norm": 0.9447979089266481, + "learning_rate": 1.75900977697309e-06, + "loss": 0.6634, + "step": 8345 + }, + { + "epoch": 1.24, + "grad_norm": 2.1969674060010647, + "learning_rate": 1.7589468740274058e-06, + "loss": 0.6641, + "step": 8346 + }, + { + "epoch": 1.24, + "grad_norm": 4.079204188332253, + "learning_rate": 1.7588839639984028e-06, + "loss": 0.6771, + "step": 8347 + }, + { + "epoch": 1.25, + "grad_norm": 0.848377640271139, + "learning_rate": 1.758821046886669e-06, + "loss": 0.6868, + "step": 8348 + }, + { + "epoch": 1.25, + "grad_norm": 1.7066958657695377, + "learning_rate": 1.7587581226927907e-06, + "loss": 0.6699, + "step": 8349 + }, + { + "epoch": 1.25, + "grad_norm": 1.7888790568363018, + "learning_rate": 1.7586951914173562e-06, + "loss": 0.6855, + "step": 8350 + }, + { + "epoch": 1.25, + "grad_norm": 3.18674036823073, + "learning_rate": 1.7586322530609523e-06, + "loss": 0.681, + "step": 8351 + }, + { + "epoch": 1.25, + "grad_norm": 0.9589627868263441, + "learning_rate": 1.7585693076241666e-06, + "loss": 0.6667, + "step": 8352 + }, + { + "epoch": 1.25, + "grad_norm": 3.3247333178740246, + "learning_rate": 1.758506355107586e-06, + "loss": 0.6901, + "step": 8353 + }, + { + "epoch": 1.25, + "grad_norm": 3.016680637587108, + "learning_rate": 1.758443395511799e-06, + "loss": 0.6823, + "step": 8354 + }, + { + "epoch": 1.25, + "grad_norm": 1.767005654574231, + "learning_rate": 1.7583804288373926e-06, + "loss": 0.7155, + "step": 8355 + }, + { + "epoch": 1.25, + "grad_norm": 1.494591412663007, + "learning_rate": 1.7583174550849545e-06, + "loss": 0.6608, + "step": 8356 + }, + { + "epoch": 1.25, + "grad_norm": 1.991625312523906, + "learning_rate": 1.7582544742550727e-06, + "loss": 0.6823, + "step": 8357 + }, + { + "epoch": 1.25, + "grad_norm": 1.9540029281986886, + "learning_rate": 1.7581914863483345e-06, + "loss": 0.6582, + "step": 8358 + }, + { + "epoch": 1.25, + "grad_norm": 0.879703079239246, + "learning_rate": 1.7581284913653286e-06, + "loss": 0.6758, + "step": 8359 + }, + { + "epoch": 1.25, + "grad_norm": 4.194850350146564, + "learning_rate": 1.758065489306642e-06, + "loss": 0.6888, + "step": 8360 + }, + { + "epoch": 1.25, + "grad_norm": 6.48458269731789, + "learning_rate": 1.7580024801728634e-06, + "loss": 0.6868, + "step": 8361 + }, + { + "epoch": 1.25, + "grad_norm": 3.212130830681786, + "learning_rate": 1.7579394639645804e-06, + "loss": 0.6732, + "step": 8362 + }, + { + "epoch": 1.25, + "grad_norm": 4.56138677896627, + "learning_rate": 1.7578764406823816e-06, + "loss": 0.6582, + "step": 8363 + }, + { + "epoch": 1.25, + "grad_norm": 3.0201426949192203, + "learning_rate": 1.7578134103268546e-06, + "loss": 0.6745, + "step": 8364 + }, + { + "epoch": 1.25, + "grad_norm": 1.609016438603995, + "learning_rate": 1.7577503728985886e-06, + "loss": 0.6895, + "step": 8365 + }, + { + "epoch": 1.25, + "grad_norm": 0.6862526804486803, + "learning_rate": 1.757687328398171e-06, + "loss": 0.6758, + "step": 8366 + }, + { + "epoch": 1.25, + "grad_norm": 5.386661140066468, + "learning_rate": 1.7576242768261907e-06, + "loss": 0.6823, + "step": 8367 + }, + { + "epoch": 1.25, + "grad_norm": 1.9160264309554997, + "learning_rate": 1.7575612181832357e-06, + "loss": 0.6836, + "step": 8368 + }, + { + "epoch": 1.25, + "grad_norm": 1.121099031846933, + "learning_rate": 1.7574981524698949e-06, + "loss": 0.6953, + "step": 8369 + }, + { + "epoch": 1.25, + "grad_norm": 7.670652409003985, + "learning_rate": 1.7574350796867571e-06, + "loss": 0.7155, + "step": 8370 + }, + { + "epoch": 1.25, + "grad_norm": 2.2735770808466538, + "learning_rate": 1.7573719998344107e-06, + "loss": 0.6471, + "step": 8371 + }, + { + "epoch": 1.25, + "grad_norm": 5.818522136716526, + "learning_rate": 1.7573089129134444e-06, + "loss": 0.6836, + "step": 8372 + }, + { + "epoch": 1.25, + "grad_norm": 0.6972529539280158, + "learning_rate": 1.757245818924447e-06, + "loss": 0.668, + "step": 8373 + }, + { + "epoch": 1.25, + "grad_norm": 3.8258724564074846, + "learning_rate": 1.7571827178680074e-06, + "loss": 0.6673, + "step": 8374 + }, + { + "epoch": 1.25, + "grad_norm": 0.6538235971643236, + "learning_rate": 1.7571196097447145e-06, + "loss": 0.6777, + "step": 8375 + }, + { + "epoch": 1.25, + "grad_norm": 0.8044351287810149, + "learning_rate": 1.7570564945551575e-06, + "loss": 0.6797, + "step": 8376 + }, + { + "epoch": 1.25, + "grad_norm": 4.171918306651579, + "learning_rate": 1.756993372299925e-06, + "loss": 0.7012, + "step": 8377 + }, + { + "epoch": 1.25, + "grad_norm": 0.8719582003037981, + "learning_rate": 1.7569302429796066e-06, + "loss": 0.6465, + "step": 8378 + }, + { + "epoch": 1.25, + "grad_norm": 4.6208352479236705, + "learning_rate": 1.7568671065947914e-06, + "loss": 0.6823, + "step": 8379 + }, + { + "epoch": 1.25, + "grad_norm": 1.4511780941652335, + "learning_rate": 1.7568039631460684e-06, + "loss": 0.6738, + "step": 8380 + }, + { + "epoch": 1.25, + "grad_norm": 2.0374594133032975, + "learning_rate": 1.7567408126340273e-06, + "loss": 0.6699, + "step": 8381 + }, + { + "epoch": 1.25, + "grad_norm": 1.3014559488107746, + "learning_rate": 1.7566776550592567e-06, + "loss": 0.6862, + "step": 8382 + }, + { + "epoch": 1.25, + "grad_norm": 0.6393199796144529, + "learning_rate": 1.7566144904223472e-06, + "loss": 0.6712, + "step": 8383 + }, + { + "epoch": 1.25, + "grad_norm": 3.0177075989965076, + "learning_rate": 1.7565513187238875e-06, + "loss": 0.668, + "step": 8384 + }, + { + "epoch": 1.25, + "grad_norm": 2.671396750294299, + "learning_rate": 1.7564881399644678e-06, + "loss": 0.7064, + "step": 8385 + }, + { + "epoch": 1.25, + "grad_norm": 0.7728410231934479, + "learning_rate": 1.756424954144677e-06, + "loss": 0.6823, + "step": 8386 + }, + { + "epoch": 1.25, + "grad_norm": 0.7434251375106967, + "learning_rate": 1.7563617612651051e-06, + "loss": 0.6882, + "step": 8387 + }, + { + "epoch": 1.25, + "grad_norm": 3.431861873114253, + "learning_rate": 1.7562985613263423e-06, + "loss": 0.6686, + "step": 8388 + }, + { + "epoch": 1.25, + "grad_norm": 0.8348608798646953, + "learning_rate": 1.7562353543289782e-06, + "loss": 0.6745, + "step": 8389 + }, + { + "epoch": 1.25, + "grad_norm": 3.3927432572574223, + "learning_rate": 1.7561721402736022e-06, + "loss": 0.681, + "step": 8390 + }, + { + "epoch": 1.25, + "grad_norm": 0.6576142932746482, + "learning_rate": 1.7561089191608055e-06, + "loss": 0.6849, + "step": 8391 + }, + { + "epoch": 1.25, + "grad_norm": 4.598980616556941, + "learning_rate": 1.7560456909911767e-06, + "loss": 0.6445, + "step": 8392 + }, + { + "epoch": 1.25, + "grad_norm": 1.1807334197107011, + "learning_rate": 1.755982455765307e-06, + "loss": 0.6667, + "step": 8393 + }, + { + "epoch": 1.25, + "grad_norm": 3.3391822098745902, + "learning_rate": 1.7559192134837857e-06, + "loss": 0.6699, + "step": 8394 + }, + { + "epoch": 1.25, + "grad_norm": 5.450186613038512, + "learning_rate": 1.755855964147204e-06, + "loss": 0.6816, + "step": 8395 + }, + { + "epoch": 1.25, + "grad_norm": 2.189824052796164, + "learning_rate": 1.7557927077561517e-06, + "loss": 0.681, + "step": 8396 + }, + { + "epoch": 1.25, + "grad_norm": 1.7744826928917756, + "learning_rate": 1.755729444311219e-06, + "loss": 0.6947, + "step": 8397 + }, + { + "epoch": 1.25, + "grad_norm": 2.3816991739978386, + "learning_rate": 1.7556661738129968e-06, + "loss": 0.6693, + "step": 8398 + }, + { + "epoch": 1.25, + "grad_norm": 4.4022262539816515, + "learning_rate": 1.7556028962620753e-06, + "loss": 0.6634, + "step": 8399 + }, + { + "epoch": 1.25, + "grad_norm": 2.8543828536630036, + "learning_rate": 1.755539611659045e-06, + "loss": 0.6777, + "step": 8400 + }, + { + "epoch": 1.25, + "grad_norm": 2.914908692458497, + "learning_rate": 1.755476320004497e-06, + "loss": 0.696, + "step": 8401 + }, + { + "epoch": 1.25, + "grad_norm": 1.6007482498641905, + "learning_rate": 1.7554130212990213e-06, + "loss": 0.6725, + "step": 8402 + }, + { + "epoch": 1.25, + "grad_norm": 0.8778609484805571, + "learning_rate": 1.7553497155432094e-06, + "loss": 0.6732, + "step": 8403 + }, + { + "epoch": 1.25, + "grad_norm": 3.526416975642028, + "learning_rate": 1.7552864027376515e-06, + "loss": 0.6484, + "step": 8404 + }, + { + "epoch": 1.25, + "grad_norm": 2.7700909587899667, + "learning_rate": 1.7552230828829391e-06, + "loss": 0.6699, + "step": 8405 + }, + { + "epoch": 1.25, + "grad_norm": 5.002554269648387, + "learning_rate": 1.7551597559796628e-06, + "loss": 0.6816, + "step": 8406 + }, + { + "epoch": 1.25, + "grad_norm": 3.331346050318256, + "learning_rate": 1.7550964220284134e-06, + "loss": 0.6543, + "step": 8407 + }, + { + "epoch": 1.25, + "grad_norm": 2.5982351407367377, + "learning_rate": 1.7550330810297827e-06, + "loss": 0.6855, + "step": 8408 + }, + { + "epoch": 1.25, + "grad_norm": 2.4116218569050676, + "learning_rate": 1.7549697329843614e-06, + "loss": 0.6608, + "step": 8409 + }, + { + "epoch": 1.25, + "grad_norm": 2.184525734103381, + "learning_rate": 1.7549063778927406e-06, + "loss": 0.6875, + "step": 8410 + }, + { + "epoch": 1.25, + "grad_norm": 2.328063305922707, + "learning_rate": 1.754843015755512e-06, + "loss": 0.6849, + "step": 8411 + }, + { + "epoch": 1.25, + "grad_norm": 2.327250988484939, + "learning_rate": 1.754779646573267e-06, + "loss": 0.679, + "step": 8412 + }, + { + "epoch": 1.25, + "grad_norm": 0.830545073770895, + "learning_rate": 1.7547162703465966e-06, + "loss": 0.6667, + "step": 8413 + }, + { + "epoch": 1.25, + "grad_norm": 1.5516215888219336, + "learning_rate": 1.7546528870760924e-06, + "loss": 0.7096, + "step": 8414 + }, + { + "epoch": 1.26, + "grad_norm": 5.4850047005465346, + "learning_rate": 1.754589496762346e-06, + "loss": 0.6986, + "step": 8415 + }, + { + "epoch": 1.26, + "grad_norm": 1.6891284067867605, + "learning_rate": 1.7545260994059494e-06, + "loss": 0.6758, + "step": 8416 + }, + { + "epoch": 1.26, + "grad_norm": 0.7914590027524395, + "learning_rate": 1.754462695007494e-06, + "loss": 0.679, + "step": 8417 + }, + { + "epoch": 1.26, + "grad_norm": 2.805396171168065, + "learning_rate": 1.754399283567571e-06, + "loss": 0.6712, + "step": 8418 + }, + { + "epoch": 1.26, + "grad_norm": 0.7520450412408397, + "learning_rate": 1.7543358650867733e-06, + "loss": 0.6719, + "step": 8419 + }, + { + "epoch": 1.26, + "grad_norm": 1.0838520953632196, + "learning_rate": 1.754272439565692e-06, + "loss": 0.6882, + "step": 8420 + }, + { + "epoch": 1.26, + "grad_norm": 1.2959431158327688, + "learning_rate": 1.7542090070049196e-06, + "loss": 0.6934, + "step": 8421 + }, + { + "epoch": 1.26, + "grad_norm": 1.5706381789350388, + "learning_rate": 1.7541455674050476e-06, + "loss": 0.6608, + "step": 8422 + }, + { + "epoch": 1.26, + "grad_norm": 0.8076908740572829, + "learning_rate": 1.7540821207666686e-06, + "loss": 0.6621, + "step": 8423 + }, + { + "epoch": 1.26, + "grad_norm": 2.3118656839738723, + "learning_rate": 1.754018667090374e-06, + "loss": 0.6842, + "step": 8424 + }, + { + "epoch": 1.26, + "grad_norm": 5.810183228098484, + "learning_rate": 1.7539552063767569e-06, + "loss": 0.6921, + "step": 8425 + }, + { + "epoch": 1.26, + "grad_norm": 2.086831530379851, + "learning_rate": 1.753891738626409e-06, + "loss": 0.7031, + "step": 8426 + }, + { + "epoch": 1.26, + "grad_norm": 0.7810366730235022, + "learning_rate": 1.753828263839923e-06, + "loss": 0.6921, + "step": 8427 + }, + { + "epoch": 1.26, + "grad_norm": 5.4050330248622425, + "learning_rate": 1.753764782017891e-06, + "loss": 0.6986, + "step": 8428 + }, + { + "epoch": 1.26, + "grad_norm": 0.8879981037843245, + "learning_rate": 1.7537012931609057e-06, + "loss": 0.6562, + "step": 8429 + }, + { + "epoch": 1.26, + "grad_norm": 2.3186191162567225, + "learning_rate": 1.7536377972695596e-06, + "loss": 0.6803, + "step": 8430 + }, + { + "epoch": 1.26, + "grad_norm": 2.7416603316820205, + "learning_rate": 1.753574294344445e-06, + "loss": 0.6927, + "step": 8431 + }, + { + "epoch": 1.26, + "grad_norm": 1.0906804983254494, + "learning_rate": 1.7535107843861549e-06, + "loss": 0.6647, + "step": 8432 + }, + { + "epoch": 1.26, + "grad_norm": 2.011846511821355, + "learning_rate": 1.7534472673952824e-06, + "loss": 0.6855, + "step": 8433 + }, + { + "epoch": 1.26, + "grad_norm": 2.157095885151558, + "learning_rate": 1.7533837433724199e-06, + "loss": 0.6699, + "step": 8434 + }, + { + "epoch": 1.26, + "grad_norm": 2.552822556937054, + "learning_rate": 1.7533202123181598e-06, + "loss": 0.6758, + "step": 8435 + }, + { + "epoch": 1.26, + "grad_norm": 1.574972218792734, + "learning_rate": 1.7532566742330958e-06, + "loss": 0.6706, + "step": 8436 + }, + { + "epoch": 1.26, + "grad_norm": 0.5157255471205197, + "learning_rate": 1.753193129117821e-06, + "loss": 0.6751, + "step": 8437 + }, + { + "epoch": 1.26, + "grad_norm": 1.9557284813526632, + "learning_rate": 1.753129576972928e-06, + "loss": 0.7031, + "step": 8438 + }, + { + "epoch": 1.26, + "grad_norm": 2.2562223596306046, + "learning_rate": 1.7530660177990098e-06, + "loss": 0.6895, + "step": 8439 + }, + { + "epoch": 1.26, + "grad_norm": 2.737662381901969, + "learning_rate": 1.7530024515966598e-06, + "loss": 0.6777, + "step": 8440 + }, + { + "epoch": 1.26, + "grad_norm": 3.3039489062171543, + "learning_rate": 1.7529388783664713e-06, + "loss": 0.638, + "step": 8441 + }, + { + "epoch": 1.26, + "grad_norm": 6.098959373330654, + "learning_rate": 1.7528752981090383e-06, + "loss": 0.6589, + "step": 8442 + }, + { + "epoch": 1.26, + "grad_norm": 1.9731203982068775, + "learning_rate": 1.752811710824953e-06, + "loss": 0.6458, + "step": 8443 + }, + { + "epoch": 1.26, + "grad_norm": 1.352818939948505, + "learning_rate": 1.7527481165148093e-06, + "loss": 0.6699, + "step": 8444 + }, + { + "epoch": 1.26, + "grad_norm": 2.6471607673525788, + "learning_rate": 1.752684515179201e-06, + "loss": 0.6849, + "step": 8445 + }, + { + "epoch": 1.26, + "grad_norm": 5.106137288408484, + "learning_rate": 1.7526209068187217e-06, + "loss": 0.6829, + "step": 8446 + }, + { + "epoch": 1.26, + "grad_norm": 7.292993190055021, + "learning_rate": 1.7525572914339648e-06, + "loss": 0.7077, + "step": 8447 + }, + { + "epoch": 1.26, + "grad_norm": 1.5182245949063482, + "learning_rate": 1.7524936690255242e-06, + "loss": 0.6836, + "step": 8448 + }, + { + "epoch": 1.26, + "grad_norm": 3.7012192165064492, + "learning_rate": 1.7524300395939937e-06, + "loss": 0.6706, + "step": 8449 + }, + { + "epoch": 1.26, + "grad_norm": 3.3398245350739004, + "learning_rate": 1.752366403139967e-06, + "loss": 0.6706, + "step": 8450 + }, + { + "epoch": 1.26, + "grad_norm": 2.6163281105078706, + "learning_rate": 1.7523027596640386e-06, + "loss": 0.6849, + "step": 8451 + }, + { + "epoch": 1.26, + "grad_norm": 0.6806198612944929, + "learning_rate": 1.7522391091668011e-06, + "loss": 0.6758, + "step": 8452 + }, + { + "epoch": 1.26, + "grad_norm": 3.765977024080551, + "learning_rate": 1.75217545164885e-06, + "loss": 0.6634, + "step": 8453 + }, + { + "epoch": 1.26, + "grad_norm": 1.7441317326981092, + "learning_rate": 1.7521117871107788e-06, + "loss": 0.6868, + "step": 8454 + }, + { + "epoch": 1.26, + "grad_norm": 0.6532830212415591, + "learning_rate": 1.7520481155531818e-06, + "loss": 0.6634, + "step": 8455 + }, + { + "epoch": 1.26, + "grad_norm": 2.3050592490081656, + "learning_rate": 1.7519844369766533e-06, + "loss": 0.6738, + "step": 8456 + }, + { + "epoch": 1.26, + "grad_norm": 1.3245890249469847, + "learning_rate": 1.7519207513817875e-06, + "loss": 0.6816, + "step": 8457 + }, + { + "epoch": 1.26, + "grad_norm": 1.8068534545994837, + "learning_rate": 1.751857058769179e-06, + "loss": 0.6934, + "step": 8458 + }, + { + "epoch": 1.26, + "grad_norm": 3.2594090500664876, + "learning_rate": 1.7517933591394219e-06, + "loss": 0.681, + "step": 8459 + }, + { + "epoch": 1.26, + "grad_norm": 5.027494340832862, + "learning_rate": 1.7517296524931107e-06, + "loss": 0.6667, + "step": 8460 + }, + { + "epoch": 1.26, + "grad_norm": 3.1231782745605874, + "learning_rate": 1.7516659388308406e-06, + "loss": 0.6738, + "step": 8461 + }, + { + "epoch": 1.26, + "grad_norm": 3.4134572715618794, + "learning_rate": 1.7516022181532055e-06, + "loss": 0.6608, + "step": 8462 + }, + { + "epoch": 1.26, + "grad_norm": 3.0322509247733525, + "learning_rate": 1.7515384904608005e-06, + "loss": 0.6712, + "step": 8463 + }, + { + "epoch": 1.26, + "grad_norm": 5.375843737721812, + "learning_rate": 1.7514747557542205e-06, + "loss": 0.6816, + "step": 8464 + }, + { + "epoch": 1.26, + "grad_norm": 0.9143281235583178, + "learning_rate": 1.7514110140340596e-06, + "loss": 0.6764, + "step": 8465 + }, + { + "epoch": 1.26, + "grad_norm": 1.3006801213672434, + "learning_rate": 1.7513472653009139e-06, + "loss": 0.6439, + "step": 8466 + }, + { + "epoch": 1.26, + "grad_norm": 5.4287916874273705, + "learning_rate": 1.7512835095553773e-06, + "loss": 0.6628, + "step": 8467 + }, + { + "epoch": 1.26, + "grad_norm": 3.0127081171517234, + "learning_rate": 1.7512197467980453e-06, + "loss": 0.6823, + "step": 8468 + }, + { + "epoch": 1.26, + "grad_norm": 2.940401075450121, + "learning_rate": 1.7511559770295133e-06, + "loss": 0.6484, + "step": 8469 + }, + { + "epoch": 1.26, + "grad_norm": 1.9585260602875434, + "learning_rate": 1.751092200250376e-06, + "loss": 0.6615, + "step": 8470 + }, + { + "epoch": 1.26, + "grad_norm": 0.9803752905750682, + "learning_rate": 1.7510284164612287e-06, + "loss": 0.666, + "step": 8471 + }, + { + "epoch": 1.26, + "grad_norm": 1.1797890838387168, + "learning_rate": 1.7509646256626666e-06, + "loss": 0.6895, + "step": 8472 + }, + { + "epoch": 1.26, + "grad_norm": 3.3088519543094574, + "learning_rate": 1.7509008278552856e-06, + "loss": 0.6732, + "step": 8473 + }, + { + "epoch": 1.26, + "grad_norm": 1.8493948075611242, + "learning_rate": 1.7508370230396805e-06, + "loss": 0.6823, + "step": 8474 + }, + { + "epoch": 1.26, + "grad_norm": 2.920594344817036, + "learning_rate": 1.7507732112164471e-06, + "loss": 0.6992, + "step": 8475 + }, + { + "epoch": 1.26, + "grad_norm": 1.1530470608442853, + "learning_rate": 1.750709392386181e-06, + "loss": 0.6712, + "step": 8476 + }, + { + "epoch": 1.26, + "grad_norm": 3.559354429457932, + "learning_rate": 1.7506455665494774e-06, + "loss": 0.6823, + "step": 8477 + }, + { + "epoch": 1.26, + "grad_norm": 1.0754780402718862, + "learning_rate": 1.7505817337069328e-06, + "loss": 0.6354, + "step": 8478 + }, + { + "epoch": 1.26, + "grad_norm": 1.9250994044382344, + "learning_rate": 1.7505178938591426e-06, + "loss": 0.6953, + "step": 8479 + }, + { + "epoch": 1.26, + "grad_norm": 0.9806830647790007, + "learning_rate": 1.7504540470067022e-06, + "loss": 0.6706, + "step": 8480 + }, + { + "epoch": 1.26, + "grad_norm": 1.7192833508689822, + "learning_rate": 1.7503901931502079e-06, + "loss": 0.6842, + "step": 8481 + }, + { + "epoch": 1.27, + "grad_norm": 2.481953064756009, + "learning_rate": 1.7503263322902555e-06, + "loss": 0.6751, + "step": 8482 + }, + { + "epoch": 1.27, + "grad_norm": 0.9273338406407773, + "learning_rate": 1.750262464427441e-06, + "loss": 0.6478, + "step": 8483 + }, + { + "epoch": 1.27, + "grad_norm": 1.284239175085265, + "learning_rate": 1.7501985895623607e-06, + "loss": 0.6471, + "step": 8484 + }, + { + "epoch": 1.27, + "grad_norm": 1.9056733640241625, + "learning_rate": 1.7501347076956108e-06, + "loss": 0.6497, + "step": 8485 + }, + { + "epoch": 1.27, + "grad_norm": 2.09495563120318, + "learning_rate": 1.7500708188277873e-06, + "loss": 0.6947, + "step": 8486 + }, + { + "epoch": 1.27, + "grad_norm": 0.9567989487105927, + "learning_rate": 1.7500069229594862e-06, + "loss": 0.6647, + "step": 8487 + }, + { + "epoch": 1.27, + "grad_norm": 0.8777924239302743, + "learning_rate": 1.7499430200913046e-06, + "loss": 0.6855, + "step": 8488 + }, + { + "epoch": 1.27, + "grad_norm": 1.9828746078637152, + "learning_rate": 1.749879110223838e-06, + "loss": 0.6829, + "step": 8489 + }, + { + "epoch": 1.27, + "grad_norm": 0.9142780010096849, + "learning_rate": 1.7498151933576835e-06, + "loss": 0.6823, + "step": 8490 + }, + { + "epoch": 1.27, + "grad_norm": 1.5064803930640844, + "learning_rate": 1.7497512694934375e-06, + "loss": 0.668, + "step": 8491 + }, + { + "epoch": 1.27, + "grad_norm": 1.437777595836175, + "learning_rate": 1.7496873386316968e-06, + "loss": 0.7142, + "step": 8492 + }, + { + "epoch": 1.27, + "grad_norm": 0.8442274039133193, + "learning_rate": 1.7496234007730577e-06, + "loss": 0.6745, + "step": 8493 + }, + { + "epoch": 1.27, + "grad_norm": 1.0838284109248855, + "learning_rate": 1.749559455918117e-06, + "loss": 0.6654, + "step": 8494 + }, + { + "epoch": 1.27, + "grad_norm": 0.7630137615554881, + "learning_rate": 1.7494955040674718e-06, + "loss": 0.681, + "step": 8495 + }, + { + "epoch": 1.27, + "grad_norm": 1.1896222241889551, + "learning_rate": 1.7494315452217188e-06, + "loss": 0.6523, + "step": 8496 + }, + { + "epoch": 1.27, + "grad_norm": 4.963320570602255, + "learning_rate": 1.7493675793814549e-06, + "loss": 0.7142, + "step": 8497 + }, + { + "epoch": 1.27, + "grad_norm": 1.2481704954078534, + "learning_rate": 1.749303606547277e-06, + "loss": 0.6602, + "step": 8498 + }, + { + "epoch": 1.27, + "grad_norm": 2.976789133883316, + "learning_rate": 1.7492396267197824e-06, + "loss": 0.6842, + "step": 8499 + }, + { + "epoch": 1.27, + "grad_norm": 1.7762452648919975, + "learning_rate": 1.749175639899568e-06, + "loss": 0.6816, + "step": 8500 + }, + { + "epoch": 1.27, + "grad_norm": 1.2088855440228539, + "learning_rate": 1.749111646087231e-06, + "loss": 0.6452, + "step": 8501 + }, + { + "epoch": 1.27, + "grad_norm": 3.1065632862748913, + "learning_rate": 1.749047645283369e-06, + "loss": 0.6628, + "step": 8502 + }, + { + "epoch": 1.27, + "grad_norm": 4.104437801511193, + "learning_rate": 1.748983637488579e-06, + "loss": 0.6751, + "step": 8503 + }, + { + "epoch": 1.27, + "grad_norm": 0.8746949728965709, + "learning_rate": 1.7489196227034585e-06, + "loss": 0.6712, + "step": 8504 + }, + { + "epoch": 1.27, + "grad_norm": 1.795150525095519, + "learning_rate": 1.748855600928605e-06, + "loss": 0.6829, + "step": 8505 + }, + { + "epoch": 1.27, + "grad_norm": 3.4478281827080473, + "learning_rate": 1.7487915721646156e-06, + "loss": 0.7103, + "step": 8506 + }, + { + "epoch": 1.27, + "grad_norm": 7.732142088541005, + "learning_rate": 1.7487275364120883e-06, + "loss": 0.6999, + "step": 8507 + }, + { + "epoch": 1.27, + "grad_norm": 2.536941319082531, + "learning_rate": 1.748663493671621e-06, + "loss": 0.6706, + "step": 8508 + }, + { + "epoch": 1.27, + "grad_norm": 4.801299316366276, + "learning_rate": 1.748599443943811e-06, + "loss": 0.6771, + "step": 8509 + }, + { + "epoch": 1.27, + "grad_norm": 2.3614798437817095, + "learning_rate": 1.748535387229256e-06, + "loss": 0.6901, + "step": 8510 + }, + { + "epoch": 1.27, + "grad_norm": 2.4729496307429395, + "learning_rate": 1.7484713235285542e-06, + "loss": 0.6862, + "step": 8511 + }, + { + "epoch": 1.27, + "grad_norm": 2.4753724907635064, + "learning_rate": 1.7484072528423032e-06, + "loss": 0.7109, + "step": 8512 + }, + { + "epoch": 1.27, + "grad_norm": 2.316171453622901, + "learning_rate": 1.7483431751711014e-06, + "loss": 0.6803, + "step": 8513 + }, + { + "epoch": 1.27, + "grad_norm": 0.7096170867602362, + "learning_rate": 1.7482790905155464e-06, + "loss": 0.6966, + "step": 8514 + }, + { + "epoch": 1.27, + "grad_norm": 4.526390170704584, + "learning_rate": 1.7482149988762366e-06, + "loss": 0.6745, + "step": 8515 + }, + { + "epoch": 1.27, + "grad_norm": 1.543338276950688, + "learning_rate": 1.74815090025377e-06, + "loss": 0.653, + "step": 8516 + }, + { + "epoch": 1.27, + "grad_norm": 3.846694830341309, + "learning_rate": 1.7480867946487449e-06, + "loss": 0.6829, + "step": 8517 + }, + { + "epoch": 1.27, + "grad_norm": 2.7884385230418576, + "learning_rate": 1.7480226820617592e-06, + "loss": 0.6849, + "step": 8518 + }, + { + "epoch": 1.27, + "grad_norm": 1.2420483678246617, + "learning_rate": 1.7479585624934118e-06, + "loss": 0.6693, + "step": 8519 + }, + { + "epoch": 1.27, + "grad_norm": 0.6881122698977804, + "learning_rate": 1.7478944359443014e-06, + "loss": 0.6855, + "step": 8520 + }, + { + "epoch": 1.27, + "grad_norm": 0.8132351310639084, + "learning_rate": 1.7478303024150259e-06, + "loss": 0.707, + "step": 8521 + }, + { + "epoch": 1.27, + "grad_norm": 5.1978628563605405, + "learning_rate": 1.7477661619061838e-06, + "loss": 0.6868, + "step": 8522 + }, + { + "epoch": 1.27, + "grad_norm": 3.1267955714651876, + "learning_rate": 1.7477020144183743e-06, + "loss": 0.6784, + "step": 8523 + }, + { + "epoch": 1.27, + "grad_norm": 1.8116351982818888, + "learning_rate": 1.7476378599521957e-06, + "loss": 0.6647, + "step": 8524 + }, + { + "epoch": 1.27, + "grad_norm": 2.6960964516078314, + "learning_rate": 1.7475736985082468e-06, + "loss": 0.6647, + "step": 8525 + }, + { + "epoch": 1.27, + "grad_norm": 1.4487680890802155, + "learning_rate": 1.7475095300871263e-06, + "loss": 0.6615, + "step": 8526 + }, + { + "epoch": 1.27, + "grad_norm": 1.8111870107296688, + "learning_rate": 1.7474453546894333e-06, + "loss": 0.6758, + "step": 8527 + }, + { + "epoch": 1.27, + "grad_norm": 1.1792483444198771, + "learning_rate": 1.7473811723157668e-06, + "loss": 0.679, + "step": 8528 + }, + { + "epoch": 1.27, + "grad_norm": 2.589244876786775, + "learning_rate": 1.7473169829667259e-06, + "loss": 0.679, + "step": 8529 + }, + { + "epoch": 1.27, + "grad_norm": 1.0317367902242551, + "learning_rate": 1.7472527866429095e-06, + "loss": 0.6712, + "step": 8530 + }, + { + "epoch": 1.27, + "grad_norm": 3.806616933824005, + "learning_rate": 1.7471885833449162e-06, + "loss": 0.6862, + "step": 8531 + }, + { + "epoch": 1.27, + "grad_norm": 2.079459072355763, + "learning_rate": 1.7471243730733462e-06, + "loss": 0.668, + "step": 8532 + }, + { + "epoch": 1.27, + "grad_norm": 6.378378146181488, + "learning_rate": 1.7470601558287986e-06, + "loss": 0.6875, + "step": 8533 + }, + { + "epoch": 1.27, + "grad_norm": 1.108626539142797, + "learning_rate": 1.7469959316118722e-06, + "loss": 0.6849, + "step": 8534 + }, + { + "epoch": 1.27, + "grad_norm": 3.7244307756990667, + "learning_rate": 1.7469317004231666e-06, + "loss": 0.6849, + "step": 8535 + }, + { + "epoch": 1.27, + "grad_norm": 0.7105755623878527, + "learning_rate": 1.7468674622632814e-06, + "loss": 0.6576, + "step": 8536 + }, + { + "epoch": 1.27, + "grad_norm": 1.1031419772696784, + "learning_rate": 1.7468032171328163e-06, + "loss": 0.6855, + "step": 8537 + }, + { + "epoch": 1.27, + "grad_norm": 1.6976314273814137, + "learning_rate": 1.7467389650323708e-06, + "loss": 0.6921, + "step": 8538 + }, + { + "epoch": 1.27, + "grad_norm": 5.103390802417593, + "learning_rate": 1.746674705962544e-06, + "loss": 0.6868, + "step": 8539 + }, + { + "epoch": 1.27, + "grad_norm": 2.9158051022246703, + "learning_rate": 1.7466104399239368e-06, + "loss": 0.6634, + "step": 8540 + }, + { + "epoch": 1.27, + "grad_norm": 5.429036637466942, + "learning_rate": 1.7465461669171479e-06, + "loss": 0.6777, + "step": 8541 + }, + { + "epoch": 1.27, + "grad_norm": 3.2845180189060077, + "learning_rate": 1.7464818869427778e-06, + "loss": 0.6725, + "step": 8542 + }, + { + "epoch": 1.27, + "grad_norm": 0.7314876439224441, + "learning_rate": 1.7464176000014262e-06, + "loss": 0.6621, + "step": 8543 + }, + { + "epoch": 1.27, + "grad_norm": 2.592626015204027, + "learning_rate": 1.746353306093693e-06, + "loss": 0.7077, + "step": 8544 + }, + { + "epoch": 1.27, + "grad_norm": 2.380233018967456, + "learning_rate": 1.7462890052201783e-06, + "loss": 0.6732, + "step": 8545 + }, + { + "epoch": 1.27, + "grad_norm": 4.440335367801142, + "learning_rate": 1.746224697381483e-06, + "loss": 0.6738, + "step": 8546 + }, + { + "epoch": 1.27, + "grad_norm": 5.446996739734463, + "learning_rate": 1.746160382578206e-06, + "loss": 0.6517, + "step": 8547 + }, + { + "epoch": 1.27, + "grad_norm": 1.5230542555389208, + "learning_rate": 1.7460960608109483e-06, + "loss": 0.6478, + "step": 8548 + }, + { + "epoch": 1.28, + "grad_norm": 1.9883243999463356, + "learning_rate": 1.74603173208031e-06, + "loss": 0.6771, + "step": 8549 + }, + { + "epoch": 1.28, + "grad_norm": 0.8473990682574692, + "learning_rate": 1.7459673963868916e-06, + "loss": 0.6855, + "step": 8550 + }, + { + "epoch": 1.28, + "grad_norm": 1.0587778563531656, + "learning_rate": 1.7459030537312936e-06, + "loss": 0.696, + "step": 8551 + }, + { + "epoch": 1.28, + "grad_norm": 0.7882983785286577, + "learning_rate": 1.7458387041141162e-06, + "loss": 0.6641, + "step": 8552 + }, + { + "epoch": 1.28, + "grad_norm": 2.013509860283508, + "learning_rate": 1.7457743475359606e-06, + "loss": 0.6862, + "step": 8553 + }, + { + "epoch": 1.28, + "grad_norm": 5.955328087559887, + "learning_rate": 1.7457099839974268e-06, + "loss": 0.6719, + "step": 8554 + }, + { + "epoch": 1.28, + "grad_norm": 2.3427087310290355, + "learning_rate": 1.745645613499116e-06, + "loss": 0.7057, + "step": 8555 + }, + { + "epoch": 1.28, + "grad_norm": 3.619607951049931, + "learning_rate": 1.7455812360416285e-06, + "loss": 0.6927, + "step": 8556 + }, + { + "epoch": 1.28, + "grad_norm": 0.7867250471669366, + "learning_rate": 1.7455168516255655e-06, + "loss": 0.6439, + "step": 8557 + }, + { + "epoch": 1.28, + "grad_norm": 2.2132963992102743, + "learning_rate": 1.7454524602515277e-06, + "loss": 0.6536, + "step": 8558 + }, + { + "epoch": 1.28, + "grad_norm": 3.841489482460608, + "learning_rate": 1.7453880619201163e-06, + "loss": 0.6686, + "step": 8559 + }, + { + "epoch": 1.28, + "grad_norm": 2.417478351794759, + "learning_rate": 1.7453236566319318e-06, + "loss": 0.6706, + "step": 8560 + }, + { + "epoch": 1.28, + "grad_norm": 3.453820469197068, + "learning_rate": 1.7452592443875762e-06, + "loss": 0.694, + "step": 8561 + }, + { + "epoch": 1.28, + "grad_norm": 0.9712870083912312, + "learning_rate": 1.74519482518765e-06, + "loss": 0.6882, + "step": 8562 + }, + { + "epoch": 1.28, + "grad_norm": 4.360277202157476, + "learning_rate": 1.7451303990327546e-06, + "loss": 0.6823, + "step": 8563 + }, + { + "epoch": 1.28, + "grad_norm": 2.214270964471408, + "learning_rate": 1.7450659659234911e-06, + "loss": 0.6764, + "step": 8564 + }, + { + "epoch": 1.28, + "grad_norm": 4.479412739817417, + "learning_rate": 1.7450015258604615e-06, + "loss": 0.6712, + "step": 8565 + }, + { + "epoch": 1.28, + "grad_norm": 4.545678446674888, + "learning_rate": 1.7449370788442664e-06, + "loss": 0.6673, + "step": 8566 + }, + { + "epoch": 1.28, + "grad_norm": 0.8112376211292398, + "learning_rate": 1.7448726248755077e-06, + "loss": 0.6536, + "step": 8567 + }, + { + "epoch": 1.28, + "grad_norm": 1.793086598890094, + "learning_rate": 1.7448081639547868e-06, + "loss": 0.6621, + "step": 8568 + }, + { + "epoch": 1.28, + "grad_norm": 1.7385051992679705, + "learning_rate": 1.7447436960827059e-06, + "loss": 0.6549, + "step": 8569 + }, + { + "epoch": 1.28, + "grad_norm": 2.584117114376808, + "learning_rate": 1.744679221259866e-06, + "loss": 0.6888, + "step": 8570 + }, + { + "epoch": 1.28, + "grad_norm": 3.4853189058853213, + "learning_rate": 1.7446147394868688e-06, + "loss": 0.6673, + "step": 8571 + }, + { + "epoch": 1.28, + "grad_norm": 3.568714974176752, + "learning_rate": 1.7445502507643167e-06, + "loss": 0.6725, + "step": 8572 + }, + { + "epoch": 1.28, + "grad_norm": 2.2802926761397955, + "learning_rate": 1.7444857550928114e-06, + "loss": 0.6855, + "step": 8573 + }, + { + "epoch": 1.28, + "grad_norm": 0.8066935255361387, + "learning_rate": 1.7444212524729544e-06, + "loss": 0.6771, + "step": 8574 + }, + { + "epoch": 1.28, + "grad_norm": 4.448783248434127, + "learning_rate": 1.7443567429053482e-06, + "loss": 0.6647, + "step": 8575 + }, + { + "epoch": 1.28, + "grad_norm": 2.7239523387010345, + "learning_rate": 1.7442922263905944e-06, + "loss": 0.6901, + "step": 8576 + }, + { + "epoch": 1.28, + "grad_norm": 0.9451374629370396, + "learning_rate": 1.7442277029292956e-06, + "loss": 0.6543, + "step": 8577 + }, + { + "epoch": 1.28, + "grad_norm": 1.573702380359975, + "learning_rate": 1.7441631725220537e-06, + "loss": 0.6803, + "step": 8578 + }, + { + "epoch": 1.28, + "grad_norm": 1.1455324298318579, + "learning_rate": 1.7440986351694716e-06, + "loss": 0.6927, + "step": 8579 + }, + { + "epoch": 1.28, + "grad_norm": 1.029363590719137, + "learning_rate": 1.7440340908721508e-06, + "loss": 0.6895, + "step": 8580 + }, + { + "epoch": 1.28, + "grad_norm": 0.8245273979157051, + "learning_rate": 1.7439695396306936e-06, + "loss": 0.6536, + "step": 8581 + }, + { + "epoch": 1.28, + "grad_norm": 1.6905307557643108, + "learning_rate": 1.7439049814457036e-06, + "loss": 0.6973, + "step": 8582 + }, + { + "epoch": 1.28, + "grad_norm": 2.8510217774342013, + "learning_rate": 1.7438404163177824e-06, + "loss": 0.6868, + "step": 8583 + }, + { + "epoch": 1.28, + "grad_norm": 2.338330397663637, + "learning_rate": 1.7437758442475326e-06, + "loss": 0.6693, + "step": 8584 + }, + { + "epoch": 1.28, + "grad_norm": 5.3751631707923275, + "learning_rate": 1.7437112652355571e-06, + "loss": 0.7298, + "step": 8585 + }, + { + "epoch": 1.28, + "grad_norm": 1.6305762958686594, + "learning_rate": 1.7436466792824587e-06, + "loss": 0.6764, + "step": 8586 + }, + { + "epoch": 1.28, + "grad_norm": 0.928691895041909, + "learning_rate": 1.7435820863888404e-06, + "loss": 0.679, + "step": 8587 + }, + { + "epoch": 1.28, + "grad_norm": 2.992309083185412, + "learning_rate": 1.7435174865553042e-06, + "loss": 0.679, + "step": 8588 + }, + { + "epoch": 1.28, + "grad_norm": 0.7163559668236464, + "learning_rate": 1.7434528797824539e-06, + "loss": 0.6686, + "step": 8589 + }, + { + "epoch": 1.28, + "grad_norm": 1.9373885826063735, + "learning_rate": 1.743388266070892e-06, + "loss": 0.6693, + "step": 8590 + }, + { + "epoch": 1.28, + "grad_norm": 2.5760188621246303, + "learning_rate": 1.7433236454212214e-06, + "loss": 0.6628, + "step": 8591 + }, + { + "epoch": 1.28, + "grad_norm": 2.205869437416245, + "learning_rate": 1.7432590178340458e-06, + "loss": 0.6654, + "step": 8592 + }, + { + "epoch": 1.28, + "grad_norm": 9.57333716043537, + "learning_rate": 1.743194383309968e-06, + "loss": 0.7142, + "step": 8593 + }, + { + "epoch": 1.28, + "grad_norm": 2.5165106773053543, + "learning_rate": 1.743129741849591e-06, + "loss": 0.6706, + "step": 8594 + }, + { + "epoch": 1.28, + "grad_norm": 3.690107670131931, + "learning_rate": 1.7430650934535192e-06, + "loss": 0.6816, + "step": 8595 + }, + { + "epoch": 1.28, + "grad_norm": 2.086692497805524, + "learning_rate": 1.7430004381223545e-06, + "loss": 0.6758, + "step": 8596 + }, + { + "epoch": 1.28, + "grad_norm": 6.1108308936250895, + "learning_rate": 1.7429357758567011e-06, + "loss": 0.6842, + "step": 8597 + }, + { + "epoch": 1.28, + "grad_norm": 2.966299553508105, + "learning_rate": 1.7428711066571623e-06, + "loss": 0.6732, + "step": 8598 + }, + { + "epoch": 1.28, + "grad_norm": 1.6110687143046547, + "learning_rate": 1.742806430524342e-06, + "loss": 0.6803, + "step": 8599 + }, + { + "epoch": 1.28, + "grad_norm": 3.287697503617918, + "learning_rate": 1.7427417474588437e-06, + "loss": 0.6868, + "step": 8600 + }, + { + "epoch": 1.28, + "grad_norm": 5.012531085532044, + "learning_rate": 1.7426770574612708e-06, + "loss": 0.6699, + "step": 8601 + }, + { + "epoch": 1.28, + "grad_norm": 2.9346150418282706, + "learning_rate": 1.7426123605322272e-06, + "loss": 0.6361, + "step": 8602 + }, + { + "epoch": 1.28, + "grad_norm": 0.6303003476220026, + "learning_rate": 1.7425476566723169e-06, + "loss": 0.6719, + "step": 8603 + }, + { + "epoch": 1.28, + "grad_norm": 2.0417090105381916, + "learning_rate": 1.7424829458821434e-06, + "loss": 0.6543, + "step": 8604 + }, + { + "epoch": 1.28, + "grad_norm": 2.9034032314276104, + "learning_rate": 1.742418228162311e-06, + "loss": 0.6934, + "step": 8605 + }, + { + "epoch": 1.28, + "grad_norm": 3.3248426229828385, + "learning_rate": 1.7423535035134238e-06, + "loss": 0.6699, + "step": 8606 + }, + { + "epoch": 1.28, + "grad_norm": 2.062324806491953, + "learning_rate": 1.7422887719360854e-06, + "loss": 0.6562, + "step": 8607 + }, + { + "epoch": 1.28, + "grad_norm": 6.74948101398741, + "learning_rate": 1.7422240334309005e-06, + "loss": 0.6595, + "step": 8608 + }, + { + "epoch": 1.28, + "grad_norm": 3.1906481618090172, + "learning_rate": 1.742159287998473e-06, + "loss": 0.6647, + "step": 8609 + }, + { + "epoch": 1.28, + "grad_norm": 2.816044648132965, + "learning_rate": 1.7420945356394072e-06, + "loss": 0.6751, + "step": 8610 + }, + { + "epoch": 1.28, + "grad_norm": 2.1575065802310935, + "learning_rate": 1.7420297763543076e-06, + "loss": 0.7142, + "step": 8611 + }, + { + "epoch": 1.28, + "grad_norm": 0.764068950059657, + "learning_rate": 1.7419650101437785e-06, + "loss": 0.6725, + "step": 8612 + }, + { + "epoch": 1.28, + "grad_norm": 3.3113165421020896, + "learning_rate": 1.741900237008424e-06, + "loss": 0.6764, + "step": 8613 + }, + { + "epoch": 1.28, + "grad_norm": 5.101864599003163, + "learning_rate": 1.7418354569488493e-06, + "loss": 0.6882, + "step": 8614 + }, + { + "epoch": 1.28, + "grad_norm": 3.666652163127594, + "learning_rate": 1.7417706699656586e-06, + "loss": 0.6888, + "step": 8615 + }, + { + "epoch": 1.29, + "grad_norm": 2.069993944491255, + "learning_rate": 1.7417058760594565e-06, + "loss": 0.6758, + "step": 8616 + }, + { + "epoch": 1.29, + "grad_norm": 2.9229938310977372, + "learning_rate": 1.741641075230848e-06, + "loss": 0.694, + "step": 8617 + }, + { + "epoch": 1.29, + "grad_norm": 1.7653551451982106, + "learning_rate": 1.741576267480438e-06, + "loss": 0.651, + "step": 8618 + }, + { + "epoch": 1.29, + "grad_norm": 1.998201188288485, + "learning_rate": 1.7415114528088307e-06, + "loss": 0.7031, + "step": 8619 + }, + { + "epoch": 1.29, + "grad_norm": 1.705855057853482, + "learning_rate": 1.7414466312166317e-06, + "loss": 0.6927, + "step": 8620 + }, + { + "epoch": 1.29, + "grad_norm": 2.003262680919501, + "learning_rate": 1.7413818027044456e-06, + "loss": 0.6745, + "step": 8621 + }, + { + "epoch": 1.29, + "grad_norm": 1.0170995028702727, + "learning_rate": 1.7413169672728774e-06, + "loss": 0.6882, + "step": 8622 + }, + { + "epoch": 1.29, + "grad_norm": 1.2978838986957328, + "learning_rate": 1.7412521249225325e-06, + "loss": 0.6888, + "step": 8623 + }, + { + "epoch": 1.29, + "grad_norm": 1.8108160283538626, + "learning_rate": 1.741187275654016e-06, + "loss": 0.681, + "step": 8624 + }, + { + "epoch": 1.29, + "grad_norm": 2.311653748428833, + "learning_rate": 1.7411224194679332e-06, + "loss": 0.6842, + "step": 8625 + }, + { + "epoch": 1.29, + "grad_norm": 1.9223820298381749, + "learning_rate": 1.7410575563648892e-06, + "loss": 0.6751, + "step": 8626 + }, + { + "epoch": 1.29, + "grad_norm": 6.427443275437862, + "learning_rate": 1.7409926863454896e-06, + "loss": 0.6979, + "step": 8627 + }, + { + "epoch": 1.29, + "grad_norm": 0.710968981056539, + "learning_rate": 1.7409278094103397e-06, + "loss": 0.6686, + "step": 8628 + }, + { + "epoch": 1.29, + "grad_norm": 3.226504440699719, + "learning_rate": 1.7408629255600449e-06, + "loss": 0.681, + "step": 8629 + }, + { + "epoch": 1.29, + "grad_norm": 2.5816441811236768, + "learning_rate": 1.7407980347952108e-06, + "loss": 0.6797, + "step": 8630 + }, + { + "epoch": 1.29, + "grad_norm": 0.957668221765823, + "learning_rate": 1.7407331371164435e-06, + "loss": 0.6868, + "step": 8631 + }, + { + "epoch": 1.29, + "grad_norm": 1.1463433221233437, + "learning_rate": 1.7406682325243482e-06, + "loss": 0.707, + "step": 8632 + }, + { + "epoch": 1.29, + "grad_norm": 2.3913961877465364, + "learning_rate": 1.7406033210195309e-06, + "loss": 0.6686, + "step": 8633 + }, + { + "epoch": 1.29, + "grad_norm": 1.386073872657376, + "learning_rate": 1.7405384026025969e-06, + "loss": 0.6706, + "step": 8634 + }, + { + "epoch": 1.29, + "grad_norm": 3.7645145232151362, + "learning_rate": 1.7404734772741527e-06, + "loss": 0.6855, + "step": 8635 + }, + { + "epoch": 1.29, + "grad_norm": 1.8205902170339454, + "learning_rate": 1.7404085450348042e-06, + "loss": 0.6641, + "step": 8636 + }, + { + "epoch": 1.29, + "grad_norm": 3.0050577797160725, + "learning_rate": 1.7403436058851572e-06, + "loss": 0.679, + "step": 8637 + }, + { + "epoch": 1.29, + "grad_norm": 2.6981148302400655, + "learning_rate": 1.7402786598258178e-06, + "loss": 0.6979, + "step": 8638 + }, + { + "epoch": 1.29, + "grad_norm": 0.613579036528833, + "learning_rate": 1.7402137068573924e-06, + "loss": 0.6706, + "step": 8639 + }, + { + "epoch": 1.29, + "grad_norm": 0.59763358761898, + "learning_rate": 1.7401487469804867e-06, + "loss": 0.6882, + "step": 8640 + }, + { + "epoch": 1.29, + "grad_norm": 1.8730100243426784, + "learning_rate": 1.7400837801957073e-06, + "loss": 0.6621, + "step": 8641 + }, + { + "epoch": 1.29, + "grad_norm": 3.0577151055701948, + "learning_rate": 1.7400188065036607e-06, + "loss": 0.6641, + "step": 8642 + }, + { + "epoch": 1.29, + "grad_norm": 0.9431599979935923, + "learning_rate": 1.739953825904953e-06, + "loss": 0.6849, + "step": 8643 + }, + { + "epoch": 1.29, + "grad_norm": 2.9852734243642334, + "learning_rate": 1.7398888384001911e-06, + "loss": 0.6751, + "step": 8644 + }, + { + "epoch": 1.29, + "grad_norm": 2.010915227218595, + "learning_rate": 1.739823843989981e-06, + "loss": 0.6751, + "step": 8645 + }, + { + "epoch": 1.29, + "grad_norm": 2.84799394774548, + "learning_rate": 1.7397588426749294e-06, + "loss": 0.6699, + "step": 8646 + }, + { + "epoch": 1.29, + "grad_norm": 0.9834780734940043, + "learning_rate": 1.7396938344556435e-06, + "loss": 0.6712, + "step": 8647 + }, + { + "epoch": 1.29, + "grad_norm": 0.5863694362322177, + "learning_rate": 1.7396288193327292e-06, + "loss": 0.6628, + "step": 8648 + }, + { + "epoch": 1.29, + "grad_norm": 4.688169907241975, + "learning_rate": 1.739563797306794e-06, + "loss": 0.6836, + "step": 8649 + }, + { + "epoch": 1.29, + "grad_norm": 4.806665307385985, + "learning_rate": 1.7394987683784446e-06, + "loss": 0.6882, + "step": 8650 + }, + { + "epoch": 1.29, + "grad_norm": 0.8972501823720941, + "learning_rate": 1.7394337325482874e-06, + "loss": 0.6725, + "step": 8651 + }, + { + "epoch": 1.29, + "grad_norm": 1.5953402561630483, + "learning_rate": 1.7393686898169299e-06, + "loss": 0.6868, + "step": 8652 + }, + { + "epoch": 1.29, + "grad_norm": 0.8327396892990179, + "learning_rate": 1.7393036401849792e-06, + "loss": 0.6536, + "step": 8653 + }, + { + "epoch": 1.29, + "grad_norm": 1.2306636862181517, + "learning_rate": 1.7392385836530422e-06, + "loss": 0.6673, + "step": 8654 + }, + { + "epoch": 1.29, + "grad_norm": 4.252107476667633, + "learning_rate": 1.7391735202217262e-06, + "loss": 0.6947, + "step": 8655 + }, + { + "epoch": 1.29, + "grad_norm": 3.471484149013074, + "learning_rate": 1.739108449891638e-06, + "loss": 0.6706, + "step": 8656 + }, + { + "epoch": 1.29, + "grad_norm": 2.7682815078226106, + "learning_rate": 1.7390433726633856e-06, + "loss": 0.6699, + "step": 8657 + }, + { + "epoch": 1.29, + "grad_norm": 1.532289143632362, + "learning_rate": 1.7389782885375758e-06, + "loss": 0.7038, + "step": 8658 + }, + { + "epoch": 1.29, + "grad_norm": 0.7280841199444859, + "learning_rate": 1.7389131975148166e-06, + "loss": 0.707, + "step": 8659 + }, + { + "epoch": 1.29, + "grad_norm": 3.1837131753588235, + "learning_rate": 1.7388480995957152e-06, + "loss": 0.6921, + "step": 8660 + }, + { + "epoch": 1.29, + "grad_norm": 4.812125550496958, + "learning_rate": 1.738782994780879e-06, + "loss": 0.696, + "step": 8661 + }, + { + "epoch": 1.29, + "grad_norm": 1.025326709442193, + "learning_rate": 1.7387178830709154e-06, + "loss": 0.6842, + "step": 8662 + }, + { + "epoch": 1.29, + "grad_norm": 1.1277862933142069, + "learning_rate": 1.7386527644664328e-06, + "loss": 0.6777, + "step": 8663 + }, + { + "epoch": 1.29, + "grad_norm": 0.8314189126826788, + "learning_rate": 1.7385876389680388e-06, + "loss": 0.6706, + "step": 8664 + }, + { + "epoch": 1.29, + "grad_norm": 1.6649068976514594, + "learning_rate": 1.738522506576341e-06, + "loss": 0.6608, + "step": 8665 + }, + { + "epoch": 1.29, + "grad_norm": 2.3073782178497555, + "learning_rate": 1.7384573672919468e-06, + "loss": 0.6686, + "step": 8666 + }, + { + "epoch": 1.29, + "grad_norm": 0.8469043215876674, + "learning_rate": 1.7383922211154654e-06, + "loss": 0.6908, + "step": 8667 + }, + { + "epoch": 1.29, + "grad_norm": 4.752106714556416, + "learning_rate": 1.7383270680475038e-06, + "loss": 0.6595, + "step": 8668 + }, + { + "epoch": 1.29, + "grad_norm": 2.498684657089412, + "learning_rate": 1.7382619080886704e-06, + "loss": 0.6686, + "step": 8669 + }, + { + "epoch": 1.29, + "grad_norm": 1.359397435968665, + "learning_rate": 1.7381967412395732e-06, + "loss": 0.6348, + "step": 8670 + }, + { + "epoch": 1.29, + "grad_norm": 4.08454247820394, + "learning_rate": 1.7381315675008205e-06, + "loss": 0.6582, + "step": 8671 + }, + { + "epoch": 1.29, + "grad_norm": 4.484554262160724, + "learning_rate": 1.738066386873021e-06, + "loss": 0.6328, + "step": 8672 + }, + { + "epoch": 1.29, + "grad_norm": 2.1632319371045567, + "learning_rate": 1.7380011993567823e-06, + "loss": 0.6771, + "step": 8673 + }, + { + "epoch": 1.29, + "grad_norm": 4.022571126956604, + "learning_rate": 1.7379360049527133e-06, + "loss": 0.6875, + "step": 8674 + }, + { + "epoch": 1.29, + "grad_norm": 2.7836973973995485, + "learning_rate": 1.7378708036614222e-06, + "loss": 0.6829, + "step": 8675 + }, + { + "epoch": 1.29, + "grad_norm": 3.928155181410454, + "learning_rate": 1.7378055954835177e-06, + "loss": 0.6387, + "step": 8676 + }, + { + "epoch": 1.29, + "grad_norm": 4.627353419571463, + "learning_rate": 1.7377403804196083e-06, + "loss": 0.6777, + "step": 8677 + }, + { + "epoch": 1.29, + "grad_norm": 4.444197965302287, + "learning_rate": 1.7376751584703028e-06, + "loss": 0.6992, + "step": 8678 + }, + { + "epoch": 1.29, + "grad_norm": 3.9624734928780465, + "learning_rate": 1.7376099296362096e-06, + "loss": 0.748, + "step": 8679 + }, + { + "epoch": 1.29, + "grad_norm": 2.285291308348364, + "learning_rate": 1.7375446939179378e-06, + "loss": 0.679, + "step": 8680 + }, + { + "epoch": 1.29, + "grad_norm": 2.2793882373826264, + "learning_rate": 1.737479451316096e-06, + "loss": 0.6712, + "step": 8681 + }, + { + "epoch": 1.29, + "grad_norm": 1.9039189443790199, + "learning_rate": 1.7374142018312934e-06, + "loss": 0.6914, + "step": 8682 + }, + { + "epoch": 1.3, + "grad_norm": 0.7969641499298772, + "learning_rate": 1.7373489454641388e-06, + "loss": 0.6803, + "step": 8683 + }, + { + "epoch": 1.3, + "grad_norm": 3.327779195188425, + "learning_rate": 1.7372836822152412e-06, + "loss": 0.6484, + "step": 8684 + }, + { + "epoch": 1.3, + "grad_norm": 2.0298270317414167, + "learning_rate": 1.73721841208521e-06, + "loss": 0.6784, + "step": 8685 + }, + { + "epoch": 1.3, + "grad_norm": 1.2142224313450019, + "learning_rate": 1.7371531350746543e-06, + "loss": 0.6452, + "step": 8686 + }, + { + "epoch": 1.3, + "grad_norm": 1.9102664863547918, + "learning_rate": 1.7370878511841828e-06, + "loss": 0.6751, + "step": 8687 + }, + { + "epoch": 1.3, + "grad_norm": 0.7710773208201414, + "learning_rate": 1.7370225604144055e-06, + "loss": 0.6621, + "step": 8688 + }, + { + "epoch": 1.3, + "grad_norm": 3.0713174801580645, + "learning_rate": 1.7369572627659313e-06, + "loss": 0.6914, + "step": 8689 + }, + { + "epoch": 1.3, + "grad_norm": 0.9477469575993099, + "learning_rate": 1.7368919582393697e-06, + "loss": 0.6647, + "step": 8690 + }, + { + "epoch": 1.3, + "grad_norm": 1.3923085444066967, + "learning_rate": 1.7368266468353303e-06, + "loss": 0.681, + "step": 8691 + }, + { + "epoch": 1.3, + "grad_norm": 2.1160131243573588, + "learning_rate": 1.7367613285544228e-06, + "loss": 0.6745, + "step": 8692 + }, + { + "epoch": 1.3, + "grad_norm": 4.299833819630635, + "learning_rate": 1.7366960033972568e-06, + "loss": 0.6615, + "step": 8693 + }, + { + "epoch": 1.3, + "grad_norm": 1.194810764555172, + "learning_rate": 1.7366306713644416e-06, + "loss": 0.6732, + "step": 8694 + }, + { + "epoch": 1.3, + "grad_norm": 2.8071008126246544, + "learning_rate": 1.7365653324565872e-06, + "loss": 0.6419, + "step": 8695 + }, + { + "epoch": 1.3, + "grad_norm": 1.9362980430046428, + "learning_rate": 1.7364999866743038e-06, + "loss": 0.6868, + "step": 8696 + }, + { + "epoch": 1.3, + "grad_norm": 1.6912331673251444, + "learning_rate": 1.7364346340182003e-06, + "loss": 0.6699, + "step": 8697 + }, + { + "epoch": 1.3, + "grad_norm": 1.8860868063441683, + "learning_rate": 1.7363692744888877e-06, + "loss": 0.7012, + "step": 8698 + }, + { + "epoch": 1.3, + "grad_norm": 2.1570679798159986, + "learning_rate": 1.7363039080869752e-06, + "loss": 0.6667, + "step": 8699 + }, + { + "epoch": 1.3, + "grad_norm": 2.0029080309189116, + "learning_rate": 1.7362385348130735e-06, + "loss": 0.7012, + "step": 8700 + }, + { + "epoch": 1.3, + "grad_norm": 0.948924524147168, + "learning_rate": 1.7361731546677923e-06, + "loss": 0.6706, + "step": 8701 + }, + { + "epoch": 1.3, + "grad_norm": 1.7674398817469692, + "learning_rate": 1.736107767651742e-06, + "loss": 0.6862, + "step": 8702 + }, + { + "epoch": 1.3, + "grad_norm": 1.5489592754562365, + "learning_rate": 1.7360423737655328e-06, + "loss": 0.6849, + "step": 8703 + }, + { + "epoch": 1.3, + "grad_norm": 2.9386395700676684, + "learning_rate": 1.735976973009775e-06, + "loss": 0.6947, + "step": 8704 + }, + { + "epoch": 1.3, + "grad_norm": 1.0207471574654423, + "learning_rate": 1.7359115653850789e-06, + "loss": 0.6706, + "step": 8705 + }, + { + "epoch": 1.3, + "grad_norm": 2.74410453800444, + "learning_rate": 1.7358461508920552e-06, + "loss": 0.6992, + "step": 8706 + }, + { + "epoch": 1.3, + "grad_norm": 2.9776932656124546, + "learning_rate": 1.7357807295313142e-06, + "loss": 0.6908, + "step": 8707 + }, + { + "epoch": 1.3, + "grad_norm": 1.2469403953987217, + "learning_rate": 1.7357153013034668e-06, + "loss": 0.6829, + "step": 8708 + }, + { + "epoch": 1.3, + "grad_norm": 1.6431222642616439, + "learning_rate": 1.7356498662091235e-06, + "loss": 0.6725, + "step": 8709 + }, + { + "epoch": 1.3, + "grad_norm": 1.8700562847706494, + "learning_rate": 1.7355844242488945e-06, + "loss": 0.6745, + "step": 8710 + }, + { + "epoch": 1.3, + "grad_norm": 1.4760935818938612, + "learning_rate": 1.7355189754233912e-06, + "loss": 0.681, + "step": 8711 + }, + { + "epoch": 1.3, + "grad_norm": 2.0946056643397983, + "learning_rate": 1.7354535197332245e-06, + "loss": 0.6842, + "step": 8712 + }, + { + "epoch": 1.3, + "grad_norm": 5.966866482135429, + "learning_rate": 1.7353880571790049e-06, + "loss": 0.6914, + "step": 8713 + }, + { + "epoch": 1.3, + "grad_norm": 1.1104615051732918, + "learning_rate": 1.7353225877613435e-06, + "loss": 0.6706, + "step": 8714 + }, + { + "epoch": 1.3, + "grad_norm": 2.534656421975636, + "learning_rate": 1.7352571114808514e-06, + "loss": 0.6556, + "step": 8715 + }, + { + "epoch": 1.3, + "grad_norm": 2.4051887440715234, + "learning_rate": 1.7351916283381396e-06, + "loss": 0.6875, + "step": 8716 + }, + { + "epoch": 1.3, + "grad_norm": 1.4476133685196044, + "learning_rate": 1.7351261383338191e-06, + "loss": 0.6576, + "step": 8717 + }, + { + "epoch": 1.3, + "grad_norm": 4.941265053633604, + "learning_rate": 1.7350606414685015e-06, + "loss": 0.6914, + "step": 8718 + }, + { + "epoch": 1.3, + "grad_norm": 2.1895700186682023, + "learning_rate": 1.7349951377427983e-06, + "loss": 0.6895, + "step": 8719 + }, + { + "epoch": 1.3, + "grad_norm": 1.075940149057537, + "learning_rate": 1.7349296271573202e-06, + "loss": 0.6569, + "step": 8720 + }, + { + "epoch": 1.3, + "grad_norm": 3.6263566082799543, + "learning_rate": 1.7348641097126787e-06, + "loss": 0.6719, + "step": 8721 + }, + { + "epoch": 1.3, + "grad_norm": 3.881612400436302, + "learning_rate": 1.7347985854094857e-06, + "loss": 0.6947, + "step": 8722 + }, + { + "epoch": 1.3, + "grad_norm": 3.6598521973784672, + "learning_rate": 1.7347330542483527e-06, + "loss": 0.6758, + "step": 8723 + }, + { + "epoch": 1.3, + "grad_norm": 2.763836633188647, + "learning_rate": 1.7346675162298907e-06, + "loss": 0.6868, + "step": 8724 + }, + { + "epoch": 1.3, + "grad_norm": 3.078072228822213, + "learning_rate": 1.7346019713547121e-06, + "loss": 0.6836, + "step": 8725 + }, + { + "epoch": 1.3, + "grad_norm": 2.2415446583076757, + "learning_rate": 1.7345364196234283e-06, + "loss": 0.666, + "step": 8726 + }, + { + "epoch": 1.3, + "grad_norm": 1.149863561297166, + "learning_rate": 1.7344708610366513e-06, + "loss": 0.6569, + "step": 8727 + }, + { + "epoch": 1.3, + "grad_norm": 0.6856837107617393, + "learning_rate": 1.7344052955949925e-06, + "loss": 0.681, + "step": 8728 + }, + { + "epoch": 1.3, + "grad_norm": 1.869814296368122, + "learning_rate": 1.7343397232990639e-06, + "loss": 0.6602, + "step": 8729 + }, + { + "epoch": 1.3, + "grad_norm": 3.3809752666395663, + "learning_rate": 1.734274144149478e-06, + "loss": 0.6966, + "step": 8730 + }, + { + "epoch": 1.3, + "grad_norm": 2.053519122936646, + "learning_rate": 1.734208558146847e-06, + "loss": 0.6693, + "step": 8731 + }, + { + "epoch": 1.3, + "grad_norm": 4.031355299793135, + "learning_rate": 1.734142965291782e-06, + "loss": 0.6875, + "step": 8732 + }, + { + "epoch": 1.3, + "grad_norm": 2.53050077064896, + "learning_rate": 1.734077365584896e-06, + "loss": 0.6875, + "step": 8733 + }, + { + "epoch": 1.3, + "grad_norm": 4.348940896979694, + "learning_rate": 1.734011759026801e-06, + "loss": 0.6738, + "step": 8734 + }, + { + "epoch": 1.3, + "grad_norm": 1.082292300900019, + "learning_rate": 1.7339461456181094e-06, + "loss": 0.6725, + "step": 8735 + }, + { + "epoch": 1.3, + "grad_norm": 0.6367561327672218, + "learning_rate": 1.7338805253594336e-06, + "loss": 0.6615, + "step": 8736 + }, + { + "epoch": 1.3, + "grad_norm": 0.7630645943007089, + "learning_rate": 1.733814898251386e-06, + "loss": 0.6576, + "step": 8737 + }, + { + "epoch": 1.3, + "grad_norm": 6.015022626700978, + "learning_rate": 1.7337492642945786e-06, + "loss": 0.6868, + "step": 8738 + }, + { + "epoch": 1.3, + "grad_norm": 0.764733522115458, + "learning_rate": 1.7336836234896249e-06, + "loss": 0.666, + "step": 8739 + }, + { + "epoch": 1.3, + "grad_norm": 0.945018509097129, + "learning_rate": 1.7336179758371365e-06, + "loss": 0.6875, + "step": 8740 + }, + { + "epoch": 1.3, + "grad_norm": 2.103060267797686, + "learning_rate": 1.7335523213377272e-06, + "loss": 0.6947, + "step": 8741 + }, + { + "epoch": 1.3, + "grad_norm": 1.7893749942705137, + "learning_rate": 1.733486659992009e-06, + "loss": 0.6738, + "step": 8742 + }, + { + "epoch": 1.3, + "grad_norm": 0.8073967038313508, + "learning_rate": 1.733420991800595e-06, + "loss": 0.6868, + "step": 8743 + }, + { + "epoch": 1.3, + "grad_norm": 1.5582940260119535, + "learning_rate": 1.733355316764098e-06, + "loss": 0.6927, + "step": 8744 + }, + { + "epoch": 1.3, + "grad_norm": 1.1357760714649452, + "learning_rate": 1.733289634883131e-06, + "loss": 0.681, + "step": 8745 + }, + { + "epoch": 1.3, + "grad_norm": 1.6953873607791772, + "learning_rate": 1.7332239461583069e-06, + "loss": 0.668, + "step": 8746 + }, + { + "epoch": 1.3, + "grad_norm": 3.213975427753867, + "learning_rate": 1.7331582505902392e-06, + "loss": 0.7188, + "step": 8747 + }, + { + "epoch": 1.3, + "grad_norm": 1.544165665782981, + "learning_rate": 1.7330925481795401e-06, + "loss": 0.7038, + "step": 8748 + }, + { + "epoch": 1.3, + "grad_norm": 2.0097536983474775, + "learning_rate": 1.7330268389268238e-06, + "loss": 0.6751, + "step": 8749 + }, + { + "epoch": 1.3, + "grad_norm": 1.132360379088005, + "learning_rate": 1.7329611228327032e-06, + "loss": 0.6706, + "step": 8750 + }, + { + "epoch": 1.31, + "grad_norm": 0.7264294314559426, + "learning_rate": 1.7328953998977915e-06, + "loss": 0.6693, + "step": 8751 + }, + { + "epoch": 1.31, + "grad_norm": 3.262001565668996, + "learning_rate": 1.7328296701227023e-06, + "loss": 0.6842, + "step": 8752 + }, + { + "epoch": 1.31, + "grad_norm": 1.6458084443989123, + "learning_rate": 1.732763933508049e-06, + "loss": 0.6712, + "step": 8753 + }, + { + "epoch": 1.31, + "grad_norm": 0.6676803155124256, + "learning_rate": 1.732698190054445e-06, + "loss": 0.6589, + "step": 8754 + }, + { + "epoch": 1.31, + "grad_norm": 3.60158415297403, + "learning_rate": 1.7326324397625043e-06, + "loss": 0.6764, + "step": 8755 + }, + { + "epoch": 1.31, + "grad_norm": 1.7065413562882679, + "learning_rate": 1.7325666826328397e-06, + "loss": 0.6862, + "step": 8756 + }, + { + "epoch": 1.31, + "grad_norm": 3.5092030985363736, + "learning_rate": 1.732500918666066e-06, + "loss": 0.7077, + "step": 8757 + }, + { + "epoch": 1.31, + "grad_norm": 0.8455426533221444, + "learning_rate": 1.7324351478627963e-06, + "loss": 0.6882, + "step": 8758 + }, + { + "epoch": 1.31, + "grad_norm": 3.1377082761407857, + "learning_rate": 1.7323693702236444e-06, + "loss": 0.6654, + "step": 8759 + }, + { + "epoch": 1.31, + "grad_norm": 1.501693476572139, + "learning_rate": 1.7323035857492245e-06, + "loss": 0.6888, + "step": 8760 + }, + { + "epoch": 1.31, + "grad_norm": 2.096526735235738, + "learning_rate": 1.7322377944401505e-06, + "loss": 0.6836, + "step": 8761 + }, + { + "epoch": 1.31, + "grad_norm": 1.1829377107060752, + "learning_rate": 1.7321719962970364e-06, + "loss": 0.6725, + "step": 8762 + }, + { + "epoch": 1.31, + "grad_norm": 2.17619494895312, + "learning_rate": 1.7321061913204961e-06, + "loss": 0.7096, + "step": 8763 + }, + { + "epoch": 1.31, + "grad_norm": 4.792708162060531, + "learning_rate": 1.7320403795111445e-06, + "loss": 0.6927, + "step": 8764 + }, + { + "epoch": 1.31, + "grad_norm": 1.406465476165838, + "learning_rate": 1.7319745608695949e-06, + "loss": 0.6719, + "step": 8765 + }, + { + "epoch": 1.31, + "grad_norm": 2.309530222601276, + "learning_rate": 1.7319087353964622e-06, + "loss": 0.6706, + "step": 8766 + }, + { + "epoch": 1.31, + "grad_norm": 0.6506438277895225, + "learning_rate": 1.7318429030923602e-06, + "loss": 0.6621, + "step": 8767 + }, + { + "epoch": 1.31, + "grad_norm": 4.642996833054726, + "learning_rate": 1.7317770639579038e-06, + "loss": 0.668, + "step": 8768 + }, + { + "epoch": 1.31, + "grad_norm": 1.4150116397612136, + "learning_rate": 1.7317112179937077e-06, + "loss": 0.6895, + "step": 8769 + }, + { + "epoch": 1.31, + "grad_norm": 0.793736012766643, + "learning_rate": 1.7316453652003858e-06, + "loss": 0.6966, + "step": 8770 + }, + { + "epoch": 1.31, + "grad_norm": 1.2481956908295462, + "learning_rate": 1.7315795055785529e-06, + "loss": 0.6914, + "step": 8771 + }, + { + "epoch": 1.31, + "grad_norm": 1.2894044121976247, + "learning_rate": 1.731513639128824e-06, + "loss": 0.6628, + "step": 8772 + }, + { + "epoch": 1.31, + "grad_norm": 1.6046972370431685, + "learning_rate": 1.7314477658518136e-06, + "loss": 0.6686, + "step": 8773 + }, + { + "epoch": 1.31, + "grad_norm": 1.9419026595249262, + "learning_rate": 1.7313818857481364e-06, + "loss": 0.6543, + "step": 8774 + }, + { + "epoch": 1.31, + "grad_norm": 4.2929727748964135, + "learning_rate": 1.7313159988184073e-06, + "loss": 0.6764, + "step": 8775 + }, + { + "epoch": 1.31, + "grad_norm": 1.1695867997158182, + "learning_rate": 1.7312501050632414e-06, + "loss": 0.6699, + "step": 8776 + }, + { + "epoch": 1.31, + "grad_norm": 1.516213890860853, + "learning_rate": 1.7311842044832536e-06, + "loss": 0.6947, + "step": 8777 + }, + { + "epoch": 1.31, + "grad_norm": 1.938374668105928, + "learning_rate": 1.731118297079059e-06, + "loss": 0.696, + "step": 8778 + }, + { + "epoch": 1.31, + "grad_norm": 0.6090860249218704, + "learning_rate": 1.7310523828512729e-06, + "loss": 0.6745, + "step": 8779 + }, + { + "epoch": 1.31, + "grad_norm": 1.6695876715514129, + "learning_rate": 1.7309864618005098e-06, + "loss": 0.7227, + "step": 8780 + }, + { + "epoch": 1.31, + "grad_norm": 2.3421695492315915, + "learning_rate": 1.7309205339273855e-06, + "loss": 0.6582, + "step": 8781 + }, + { + "epoch": 1.31, + "grad_norm": 0.6231930666731943, + "learning_rate": 1.7308545992325154e-06, + "loss": 0.6641, + "step": 8782 + }, + { + "epoch": 1.31, + "grad_norm": 1.0798289190483816, + "learning_rate": 1.7307886577165148e-06, + "loss": 0.6387, + "step": 8783 + }, + { + "epoch": 1.31, + "grad_norm": 0.8075747139747669, + "learning_rate": 1.7307227093799988e-06, + "loss": 0.6829, + "step": 8784 + }, + { + "epoch": 1.31, + "grad_norm": 1.9408456653316726, + "learning_rate": 1.7306567542235833e-06, + "loss": 0.6426, + "step": 8785 + }, + { + "epoch": 1.31, + "grad_norm": 0.746036704031865, + "learning_rate": 1.7305907922478836e-06, + "loss": 0.6914, + "step": 8786 + }, + { + "epoch": 1.31, + "grad_norm": 2.7895115780652353, + "learning_rate": 1.7305248234535156e-06, + "loss": 0.6849, + "step": 8787 + }, + { + "epoch": 1.31, + "grad_norm": 5.665032227658894, + "learning_rate": 1.7304588478410945e-06, + "loss": 0.6816, + "step": 8788 + }, + { + "epoch": 1.31, + "grad_norm": 0.7330684481812412, + "learning_rate": 1.7303928654112364e-06, + "loss": 0.668, + "step": 8789 + }, + { + "epoch": 1.31, + "grad_norm": 1.0858575120099037, + "learning_rate": 1.7303268761645572e-06, + "loss": 0.6784, + "step": 8790 + }, + { + "epoch": 1.31, + "grad_norm": 7.3811784318036375, + "learning_rate": 1.7302608801016727e-06, + "loss": 0.6777, + "step": 8791 + }, + { + "epoch": 1.31, + "grad_norm": 0.7366448046178056, + "learning_rate": 1.7301948772231988e-06, + "loss": 0.6803, + "step": 8792 + }, + { + "epoch": 1.31, + "grad_norm": 2.662478302697057, + "learning_rate": 1.7301288675297515e-06, + "loss": 0.6673, + "step": 8793 + }, + { + "epoch": 1.31, + "grad_norm": 2.6038862071963496, + "learning_rate": 1.730062851021947e-06, + "loss": 0.6615, + "step": 8794 + }, + { + "epoch": 1.31, + "grad_norm": 3.2796677297832253, + "learning_rate": 1.7299968277004014e-06, + "loss": 0.6862, + "step": 8795 + }, + { + "epoch": 1.31, + "grad_norm": 0.8293653164791005, + "learning_rate": 1.7299307975657307e-06, + "loss": 0.6445, + "step": 8796 + }, + { + "epoch": 1.31, + "grad_norm": 2.270043409634695, + "learning_rate": 1.7298647606185513e-06, + "loss": 0.6602, + "step": 8797 + }, + { + "epoch": 1.31, + "grad_norm": 0.7367892257334551, + "learning_rate": 1.7297987168594796e-06, + "loss": 0.6478, + "step": 8798 + }, + { + "epoch": 1.31, + "grad_norm": 1.325972397404438, + "learning_rate": 1.729732666289132e-06, + "loss": 0.6686, + "step": 8799 + }, + { + "epoch": 1.31, + "grad_norm": 1.7246127718698703, + "learning_rate": 1.7296666089081247e-06, + "loss": 0.6602, + "step": 8800 + }, + { + "epoch": 1.31, + "grad_norm": 1.6692165898234563, + "learning_rate": 1.7296005447170746e-06, + "loss": 0.6849, + "step": 8801 + }, + { + "epoch": 1.31, + "grad_norm": 3.570445799429104, + "learning_rate": 1.729534473716598e-06, + "loss": 0.6751, + "step": 8802 + }, + { + "epoch": 1.31, + "grad_norm": 2.2538519743644607, + "learning_rate": 1.7294683959073118e-06, + "loss": 0.6823, + "step": 8803 + }, + { + "epoch": 1.31, + "grad_norm": 1.971970884688457, + "learning_rate": 1.7294023112898324e-06, + "loss": 0.6797, + "step": 8804 + }, + { + "epoch": 1.31, + "grad_norm": 2.9892619208855646, + "learning_rate": 1.7293362198647767e-06, + "loss": 0.6927, + "step": 8805 + }, + { + "epoch": 1.31, + "grad_norm": 3.2512484747594534, + "learning_rate": 1.7292701216327616e-06, + "loss": 0.6836, + "step": 8806 + }, + { + "epoch": 1.31, + "grad_norm": 3.616089950621106, + "learning_rate": 1.729204016594404e-06, + "loss": 0.7135, + "step": 8807 + }, + { + "epoch": 1.31, + "grad_norm": 1.3096177416523338, + "learning_rate": 1.729137904750321e-06, + "loss": 0.6582, + "step": 8808 + }, + { + "epoch": 1.31, + "grad_norm": 1.2663430258016568, + "learning_rate": 1.7290717861011292e-06, + "loss": 0.668, + "step": 8809 + }, + { + "epoch": 1.31, + "grad_norm": 10.532387295330262, + "learning_rate": 1.7290056606474458e-06, + "loss": 0.709, + "step": 8810 + }, + { + "epoch": 1.31, + "grad_norm": 0.8276412091954252, + "learning_rate": 1.7289395283898885e-06, + "loss": 0.6413, + "step": 8811 + }, + { + "epoch": 1.31, + "grad_norm": 7.047795965201187, + "learning_rate": 1.7288733893290738e-06, + "loss": 0.6966, + "step": 8812 + }, + { + "epoch": 1.31, + "grad_norm": 1.2277070043818499, + "learning_rate": 1.7288072434656193e-06, + "loss": 0.6693, + "step": 8813 + }, + { + "epoch": 1.31, + "grad_norm": 1.0181676698722704, + "learning_rate": 1.7287410908001427e-06, + "loss": 0.6719, + "step": 8814 + }, + { + "epoch": 1.31, + "grad_norm": 1.1475741581814176, + "learning_rate": 1.7286749313332607e-06, + "loss": 0.696, + "step": 8815 + }, + { + "epoch": 1.31, + "grad_norm": 1.761581955998432, + "learning_rate": 1.7286087650655913e-06, + "loss": 0.7044, + "step": 8816 + }, + { + "epoch": 1.31, + "grad_norm": 3.793193886477874, + "learning_rate": 1.7285425919977517e-06, + "loss": 0.6842, + "step": 8817 + }, + { + "epoch": 1.32, + "grad_norm": 5.472404567877617, + "learning_rate": 1.7284764121303599e-06, + "loss": 0.6816, + "step": 8818 + }, + { + "epoch": 1.32, + "grad_norm": 6.322948390153908, + "learning_rate": 1.7284102254640332e-06, + "loss": 0.6803, + "step": 8819 + }, + { + "epoch": 1.32, + "grad_norm": 1.2262786865714033, + "learning_rate": 1.7283440319993892e-06, + "loss": 0.6842, + "step": 8820 + }, + { + "epoch": 1.32, + "grad_norm": 1.4032273959672943, + "learning_rate": 1.7282778317370461e-06, + "loss": 0.6699, + "step": 8821 + }, + { + "epoch": 1.32, + "grad_norm": 1.6172637314183689, + "learning_rate": 1.728211624677622e-06, + "loss": 0.6829, + "step": 8822 + }, + { + "epoch": 1.32, + "grad_norm": 4.7704463771251495, + "learning_rate": 1.728145410821734e-06, + "loss": 0.6882, + "step": 8823 + }, + { + "epoch": 1.32, + "grad_norm": 1.752821834819236, + "learning_rate": 1.7280791901700003e-06, + "loss": 0.6667, + "step": 8824 + }, + { + "epoch": 1.32, + "grad_norm": 4.5211493736056285, + "learning_rate": 1.7280129627230396e-06, + "loss": 0.6823, + "step": 8825 + }, + { + "epoch": 1.32, + "grad_norm": 3.074552822050978, + "learning_rate": 1.7279467284814693e-06, + "loss": 0.6608, + "step": 8826 + }, + { + "epoch": 1.32, + "grad_norm": 5.509814231368676, + "learning_rate": 1.7278804874459078e-06, + "loss": 0.6523, + "step": 8827 + }, + { + "epoch": 1.32, + "grad_norm": 4.354120908684249, + "learning_rate": 1.7278142396169732e-06, + "loss": 0.6719, + "step": 8828 + }, + { + "epoch": 1.32, + "grad_norm": 1.2789932232434846, + "learning_rate": 1.7277479849952842e-06, + "loss": 0.6608, + "step": 8829 + }, + { + "epoch": 1.32, + "grad_norm": 0.7642396549000932, + "learning_rate": 1.727681723581459e-06, + "loss": 0.6543, + "step": 8830 + }, + { + "epoch": 1.32, + "grad_norm": 1.4931908921745864, + "learning_rate": 1.7276154553761155e-06, + "loss": 0.6628, + "step": 8831 + }, + { + "epoch": 1.32, + "grad_norm": 3.2458546730350175, + "learning_rate": 1.727549180379873e-06, + "loss": 0.6686, + "step": 8832 + }, + { + "epoch": 1.32, + "grad_norm": 1.3218320176732796, + "learning_rate": 1.7274828985933493e-06, + "loss": 0.6517, + "step": 8833 + }, + { + "epoch": 1.32, + "grad_norm": 0.6610253919258459, + "learning_rate": 1.7274166100171637e-06, + "loss": 0.6908, + "step": 8834 + }, + { + "epoch": 1.32, + "grad_norm": 2.9143594586844466, + "learning_rate": 1.7273503146519343e-06, + "loss": 0.6621, + "step": 8835 + }, + { + "epoch": 1.32, + "grad_norm": 1.204021390281893, + "learning_rate": 1.7272840124982802e-06, + "loss": 0.6719, + "step": 8836 + }, + { + "epoch": 1.32, + "grad_norm": 2.35358689688705, + "learning_rate": 1.7272177035568203e-06, + "loss": 0.6842, + "step": 8837 + }, + { + "epoch": 1.32, + "grad_norm": 5.257972536602473, + "learning_rate": 1.727151387828173e-06, + "loss": 0.6908, + "step": 8838 + }, + { + "epoch": 1.32, + "grad_norm": 1.9515137471920472, + "learning_rate": 1.7270850653129574e-06, + "loss": 0.6758, + "step": 8839 + }, + { + "epoch": 1.32, + "grad_norm": 1.3344057620165082, + "learning_rate": 1.7270187360117927e-06, + "loss": 0.6556, + "step": 8840 + }, + { + "epoch": 1.32, + "grad_norm": 2.0141991366948298, + "learning_rate": 1.726952399925298e-06, + "loss": 0.6595, + "step": 8841 + }, + { + "epoch": 1.32, + "grad_norm": 5.7674301586904955, + "learning_rate": 1.726886057054092e-06, + "loss": 0.6797, + "step": 8842 + }, + { + "epoch": 1.32, + "grad_norm": 0.9421841285468386, + "learning_rate": 1.7268197073987945e-06, + "loss": 0.6823, + "step": 8843 + }, + { + "epoch": 1.32, + "grad_norm": 5.718578400755337, + "learning_rate": 1.726753350960024e-06, + "loss": 0.6895, + "step": 8844 + }, + { + "epoch": 1.32, + "grad_norm": 0.7499969339068997, + "learning_rate": 1.7266869877384002e-06, + "loss": 0.6491, + "step": 8845 + }, + { + "epoch": 1.32, + "grad_norm": 2.6011021176138267, + "learning_rate": 1.726620617734543e-06, + "loss": 0.6536, + "step": 8846 + }, + { + "epoch": 1.32, + "grad_norm": 0.8786071417302911, + "learning_rate": 1.726554240949071e-06, + "loss": 0.6523, + "step": 8847 + }, + { + "epoch": 1.32, + "grad_norm": 0.9079358381779234, + "learning_rate": 1.7264878573826037e-06, + "loss": 0.6732, + "step": 8848 + }, + { + "epoch": 1.32, + "grad_norm": 6.905143704816516, + "learning_rate": 1.7264214670357613e-06, + "loss": 0.6992, + "step": 8849 + }, + { + "epoch": 1.32, + "grad_norm": 1.7374506965879248, + "learning_rate": 1.726355069909163e-06, + "loss": 0.681, + "step": 8850 + }, + { + "epoch": 1.32, + "grad_norm": 1.5982772796360636, + "learning_rate": 1.726288666003429e-06, + "loss": 0.6914, + "step": 8851 + }, + { + "epoch": 1.32, + "grad_norm": 3.49288101615905, + "learning_rate": 1.7262222553191784e-06, + "loss": 0.6888, + "step": 8852 + }, + { + "epoch": 1.32, + "grad_norm": 4.724715173183867, + "learning_rate": 1.7261558378570312e-06, + "loss": 0.7168, + "step": 8853 + }, + { + "epoch": 1.32, + "grad_norm": 1.82568094355982, + "learning_rate": 1.7260894136176072e-06, + "loss": 0.6504, + "step": 8854 + }, + { + "epoch": 1.32, + "grad_norm": 2.691939716701735, + "learning_rate": 1.7260229826015266e-06, + "loss": 0.6712, + "step": 8855 + }, + { + "epoch": 1.32, + "grad_norm": 1.2936571956201168, + "learning_rate": 1.7259565448094094e-06, + "loss": 0.6517, + "step": 8856 + }, + { + "epoch": 1.32, + "grad_norm": 0.786189050787584, + "learning_rate": 1.7258901002418753e-06, + "loss": 0.6777, + "step": 8857 + }, + { + "epoch": 1.32, + "grad_norm": 5.322120231698932, + "learning_rate": 1.725823648899545e-06, + "loss": 0.6901, + "step": 8858 + }, + { + "epoch": 1.32, + "grad_norm": 3.2151469451858095, + "learning_rate": 1.7257571907830383e-06, + "loss": 0.6868, + "step": 8859 + }, + { + "epoch": 1.32, + "grad_norm": 1.8478687153285074, + "learning_rate": 1.7256907258929756e-06, + "loss": 0.666, + "step": 8860 + }, + { + "epoch": 1.32, + "grad_norm": 0.7828230892233147, + "learning_rate": 1.7256242542299772e-06, + "loss": 0.6536, + "step": 8861 + }, + { + "epoch": 1.32, + "grad_norm": 2.9979085334748747, + "learning_rate": 1.7255577757946633e-06, + "loss": 0.6862, + "step": 8862 + }, + { + "epoch": 1.32, + "grad_norm": 7.1892228103731854, + "learning_rate": 1.7254912905876547e-06, + "loss": 0.6862, + "step": 8863 + }, + { + "epoch": 1.32, + "grad_norm": 1.188542450658862, + "learning_rate": 1.7254247986095715e-06, + "loss": 0.6615, + "step": 8864 + }, + { + "epoch": 1.32, + "grad_norm": 2.794873462052883, + "learning_rate": 1.7253582998610345e-06, + "loss": 0.6862, + "step": 8865 + }, + { + "epoch": 1.32, + "grad_norm": 2.7327366212053796, + "learning_rate": 1.7252917943426648e-06, + "loss": 0.6771, + "step": 8866 + }, + { + "epoch": 1.32, + "grad_norm": 3.0358409935197996, + "learning_rate": 1.7252252820550822e-06, + "loss": 0.6608, + "step": 8867 + }, + { + "epoch": 1.32, + "grad_norm": 0.7723390622333534, + "learning_rate": 1.725158762998908e-06, + "loss": 0.6699, + "step": 8868 + }, + { + "epoch": 1.32, + "grad_norm": 3.1653314200966682, + "learning_rate": 1.725092237174763e-06, + "loss": 0.6738, + "step": 8869 + }, + { + "epoch": 1.32, + "grad_norm": 1.6681059897154251, + "learning_rate": 1.7250257045832678e-06, + "loss": 0.696, + "step": 8870 + }, + { + "epoch": 1.32, + "grad_norm": 2.031042182273189, + "learning_rate": 1.724959165225044e-06, + "loss": 0.6842, + "step": 8871 + }, + { + "epoch": 1.32, + "grad_norm": 7.331151586476754, + "learning_rate": 1.7248926191007118e-06, + "loss": 0.6745, + "step": 8872 + }, + { + "epoch": 1.32, + "grad_norm": 1.942590142772518, + "learning_rate": 1.7248260662108929e-06, + "loss": 0.6706, + "step": 8873 + }, + { + "epoch": 1.32, + "grad_norm": 1.92331399125714, + "learning_rate": 1.7247595065562083e-06, + "loss": 0.6973, + "step": 8874 + }, + { + "epoch": 1.32, + "grad_norm": 0.8686324440851039, + "learning_rate": 1.7246929401372788e-06, + "loss": 0.6745, + "step": 8875 + }, + { + "epoch": 1.32, + "grad_norm": 4.063176679380842, + "learning_rate": 1.7246263669547263e-06, + "loss": 0.6855, + "step": 8876 + }, + { + "epoch": 1.32, + "grad_norm": 2.5072139027826363, + "learning_rate": 1.7245597870091718e-06, + "loss": 0.6667, + "step": 8877 + }, + { + "epoch": 1.32, + "grad_norm": 3.755698548754282, + "learning_rate": 1.7244932003012365e-06, + "loss": 0.6836, + "step": 8878 + }, + { + "epoch": 1.32, + "grad_norm": 3.168152652898917, + "learning_rate": 1.7244266068315423e-06, + "loss": 0.6999, + "step": 8879 + }, + { + "epoch": 1.32, + "grad_norm": 1.5914601078408233, + "learning_rate": 1.7243600066007104e-06, + "loss": 0.6732, + "step": 8880 + }, + { + "epoch": 1.32, + "grad_norm": 0.7269442003154436, + "learning_rate": 1.7242933996093627e-06, + "loss": 0.6829, + "step": 8881 + }, + { + "epoch": 1.32, + "grad_norm": 5.956251529599911, + "learning_rate": 1.7242267858581206e-06, + "loss": 0.681, + "step": 8882 + }, + { + "epoch": 1.32, + "grad_norm": 4.542881693866246, + "learning_rate": 1.7241601653476058e-06, + "loss": 0.6615, + "step": 8883 + }, + { + "epoch": 1.32, + "grad_norm": 2.947873306369048, + "learning_rate": 1.72409353807844e-06, + "loss": 0.6576, + "step": 8884 + }, + { + "epoch": 1.33, + "grad_norm": 4.057290921221241, + "learning_rate": 1.7240269040512452e-06, + "loss": 0.6992, + "step": 8885 + }, + { + "epoch": 1.33, + "grad_norm": 0.7908925774558436, + "learning_rate": 1.7239602632666435e-06, + "loss": 0.6667, + "step": 8886 + }, + { + "epoch": 1.33, + "grad_norm": 4.466658879034757, + "learning_rate": 1.7238936157252564e-06, + "loss": 0.7012, + "step": 8887 + }, + { + "epoch": 1.33, + "grad_norm": 0.7707319025035271, + "learning_rate": 1.723826961427706e-06, + "loss": 0.6634, + "step": 8888 + }, + { + "epoch": 1.33, + "grad_norm": 4.58518602616954, + "learning_rate": 1.723760300374615e-06, + "loss": 0.7096, + "step": 8889 + }, + { + "epoch": 1.33, + "grad_norm": 2.9002180163407503, + "learning_rate": 1.723693632566605e-06, + "loss": 0.6803, + "step": 8890 + }, + { + "epoch": 1.33, + "grad_norm": 1.3758428543721204, + "learning_rate": 1.7236269580042983e-06, + "loss": 0.6673, + "step": 8891 + }, + { + "epoch": 1.33, + "grad_norm": 5.1519109447752305, + "learning_rate": 1.723560276688317e-06, + "loss": 0.6777, + "step": 8892 + }, + { + "epoch": 1.33, + "grad_norm": 1.8467363705955273, + "learning_rate": 1.723493588619284e-06, + "loss": 0.638, + "step": 8893 + }, + { + "epoch": 1.33, + "grad_norm": 0.7214537435057161, + "learning_rate": 1.723426893797821e-06, + "loss": 0.668, + "step": 8894 + }, + { + "epoch": 1.33, + "grad_norm": 2.681073888970992, + "learning_rate": 1.723360192224551e-06, + "loss": 0.6699, + "step": 8895 + }, + { + "epoch": 1.33, + "grad_norm": 0.8049452680503447, + "learning_rate": 1.7232934839000962e-06, + "loss": 0.6934, + "step": 8896 + }, + { + "epoch": 1.33, + "grad_norm": 0.9155597768029925, + "learning_rate": 1.72322676882508e-06, + "loss": 0.6888, + "step": 8897 + }, + { + "epoch": 1.33, + "grad_norm": 2.7875551277240542, + "learning_rate": 1.7231600470001238e-06, + "loss": 0.6816, + "step": 8898 + }, + { + "epoch": 1.33, + "grad_norm": 0.815487254358227, + "learning_rate": 1.7230933184258509e-06, + "loss": 0.6621, + "step": 8899 + }, + { + "epoch": 1.33, + "grad_norm": 3.848603312303086, + "learning_rate": 1.723026583102884e-06, + "loss": 0.6725, + "step": 8900 + }, + { + "epoch": 1.33, + "grad_norm": 4.080170171640607, + "learning_rate": 1.7229598410318467e-06, + "loss": 0.6712, + "step": 8901 + }, + { + "epoch": 1.33, + "grad_norm": 1.5944293916933554, + "learning_rate": 1.7228930922133608e-06, + "loss": 0.696, + "step": 8902 + }, + { + "epoch": 1.33, + "grad_norm": 3.375237080500811, + "learning_rate": 1.7228263366480497e-06, + "loss": 0.6758, + "step": 8903 + }, + { + "epoch": 1.33, + "grad_norm": 1.3231627018656, + "learning_rate": 1.7227595743365367e-06, + "loss": 0.6751, + "step": 8904 + }, + { + "epoch": 1.33, + "grad_norm": 1.928961404732532, + "learning_rate": 1.7226928052794445e-06, + "loss": 0.6569, + "step": 8905 + }, + { + "epoch": 1.33, + "grad_norm": 0.8805288856145922, + "learning_rate": 1.7226260294773964e-06, + "loss": 0.6523, + "step": 8906 + }, + { + "epoch": 1.33, + "grad_norm": 0.8145969214093784, + "learning_rate": 1.722559246931016e-06, + "loss": 0.6615, + "step": 8907 + }, + { + "epoch": 1.33, + "grad_norm": 1.9340631496124185, + "learning_rate": 1.7224924576409258e-06, + "loss": 0.6706, + "step": 8908 + }, + { + "epoch": 1.33, + "grad_norm": 2.4959148367664916, + "learning_rate": 1.72242566160775e-06, + "loss": 0.6745, + "step": 8909 + }, + { + "epoch": 1.33, + "grad_norm": 3.9246124123760664, + "learning_rate": 1.7223588588321115e-06, + "loss": 0.6777, + "step": 8910 + }, + { + "epoch": 1.33, + "grad_norm": 1.2117500490849693, + "learning_rate": 1.7222920493146336e-06, + "loss": 0.6882, + "step": 8911 + }, + { + "epoch": 1.33, + "grad_norm": 4.627028522380986, + "learning_rate": 1.7222252330559403e-06, + "loss": 0.7279, + "step": 8912 + }, + { + "epoch": 1.33, + "grad_norm": 2.0943449256372277, + "learning_rate": 1.722158410056655e-06, + "loss": 0.6875, + "step": 8913 + }, + { + "epoch": 1.33, + "grad_norm": 1.1980721884834362, + "learning_rate": 1.7220915803174014e-06, + "loss": 0.6758, + "step": 8914 + }, + { + "epoch": 1.33, + "grad_norm": 1.0099090905423724, + "learning_rate": 1.7220247438388037e-06, + "loss": 0.694, + "step": 8915 + }, + { + "epoch": 1.33, + "grad_norm": 1.281954987246569, + "learning_rate": 1.7219579006214845e-06, + "loss": 0.6836, + "step": 8916 + }, + { + "epoch": 1.33, + "grad_norm": 3.526386016605229, + "learning_rate": 1.721891050666069e-06, + "loss": 0.6602, + "step": 8917 + }, + { + "epoch": 1.33, + "grad_norm": 1.3694532711039822, + "learning_rate": 1.7218241939731798e-06, + "loss": 0.6868, + "step": 8918 + }, + { + "epoch": 1.33, + "grad_norm": 2.631463680950229, + "learning_rate": 1.7217573305434421e-06, + "loss": 0.6693, + "step": 8919 + }, + { + "epoch": 1.33, + "grad_norm": 2.088072656772336, + "learning_rate": 1.721690460377479e-06, + "loss": 0.681, + "step": 8920 + }, + { + "epoch": 1.33, + "grad_norm": 2.0426219532016585, + "learning_rate": 1.7216235834759155e-06, + "loss": 0.666, + "step": 8921 + }, + { + "epoch": 1.33, + "grad_norm": 1.8122273206066355, + "learning_rate": 1.7215566998393748e-06, + "loss": 0.6803, + "step": 8922 + }, + { + "epoch": 1.33, + "grad_norm": 2.222676558823178, + "learning_rate": 1.721489809468482e-06, + "loss": 0.6654, + "step": 8923 + }, + { + "epoch": 1.33, + "grad_norm": 1.3833223822896643, + "learning_rate": 1.7214229123638608e-06, + "loss": 0.7031, + "step": 8924 + }, + { + "epoch": 1.33, + "grad_norm": 1.9727163282368998, + "learning_rate": 1.7213560085261358e-06, + "loss": 0.7005, + "step": 8925 + }, + { + "epoch": 1.33, + "grad_norm": 1.8045025472350986, + "learning_rate": 1.7212890979559314e-06, + "loss": 0.679, + "step": 8926 + }, + { + "epoch": 1.33, + "grad_norm": 1.1839241453551494, + "learning_rate": 1.7212221806538719e-06, + "loss": 0.6549, + "step": 8927 + }, + { + "epoch": 1.33, + "grad_norm": 1.022170277905354, + "learning_rate": 1.7211552566205821e-06, + "loss": 0.6934, + "step": 8928 + }, + { + "epoch": 1.33, + "grad_norm": 0.945872877354285, + "learning_rate": 1.7210883258566864e-06, + "loss": 0.6842, + "step": 8929 + }, + { + "epoch": 1.33, + "grad_norm": 1.0462772155134767, + "learning_rate": 1.7210213883628098e-06, + "loss": 0.6895, + "step": 8930 + }, + { + "epoch": 1.33, + "grad_norm": 1.7245322848630193, + "learning_rate": 1.7209544441395769e-06, + "loss": 0.6693, + "step": 8931 + }, + { + "epoch": 1.33, + "grad_norm": 1.0098884328115327, + "learning_rate": 1.7208874931876123e-06, + "loss": 0.6777, + "step": 8932 + }, + { + "epoch": 1.33, + "grad_norm": 5.655052754527628, + "learning_rate": 1.7208205355075406e-06, + "loss": 0.6999, + "step": 8933 + }, + { + "epoch": 1.33, + "grad_norm": 4.558723715190061, + "learning_rate": 1.7207535710999877e-06, + "loss": 0.6654, + "step": 8934 + }, + { + "epoch": 1.33, + "grad_norm": 0.8786926064027464, + "learning_rate": 1.7206865999655776e-06, + "loss": 0.6784, + "step": 8935 + }, + { + "epoch": 1.33, + "grad_norm": 3.540243090942768, + "learning_rate": 1.7206196221049359e-06, + "loss": 0.6602, + "step": 8936 + }, + { + "epoch": 1.33, + "grad_norm": 0.9562317582188503, + "learning_rate": 1.7205526375186875e-06, + "loss": 0.7116, + "step": 8937 + }, + { + "epoch": 1.33, + "grad_norm": 1.752739640187885, + "learning_rate": 1.7204856462074575e-06, + "loss": 0.6764, + "step": 8938 + }, + { + "epoch": 1.33, + "grad_norm": 4.062872676567447, + "learning_rate": 1.7204186481718712e-06, + "loss": 0.6829, + "step": 8939 + }, + { + "epoch": 1.33, + "grad_norm": 1.292550801676416, + "learning_rate": 1.7203516434125543e-06, + "loss": 0.6582, + "step": 8940 + }, + { + "epoch": 1.33, + "grad_norm": 1.0596487887995374, + "learning_rate": 1.7202846319301317e-06, + "loss": 0.6738, + "step": 8941 + }, + { + "epoch": 1.33, + "grad_norm": 2.852387304909222, + "learning_rate": 1.7202176137252287e-06, + "loss": 0.6914, + "step": 8942 + }, + { + "epoch": 1.33, + "grad_norm": 2.1730734840099117, + "learning_rate": 1.720150588798471e-06, + "loss": 0.6745, + "step": 8943 + }, + { + "epoch": 1.33, + "grad_norm": 2.235266270874155, + "learning_rate": 1.7200835571504843e-06, + "loss": 0.6888, + "step": 8944 + }, + { + "epoch": 1.33, + "grad_norm": 1.1521314577594863, + "learning_rate": 1.7200165187818942e-06, + "loss": 0.6654, + "step": 8945 + }, + { + "epoch": 1.33, + "grad_norm": 4.0906972084446815, + "learning_rate": 1.719949473693326e-06, + "loss": 0.6452, + "step": 8946 + }, + { + "epoch": 1.33, + "grad_norm": 0.7090749711913243, + "learning_rate": 1.7198824218854058e-06, + "loss": 0.6862, + "step": 8947 + }, + { + "epoch": 1.33, + "grad_norm": 0.850040034957921, + "learning_rate": 1.7198153633587595e-06, + "loss": 0.6484, + "step": 8948 + }, + { + "epoch": 1.33, + "grad_norm": 1.083662407807541, + "learning_rate": 1.7197482981140126e-06, + "loss": 0.6608, + "step": 8949 + }, + { + "epoch": 1.33, + "grad_norm": 0.892679997056147, + "learning_rate": 1.7196812261517912e-06, + "loss": 0.6712, + "step": 8950 + }, + { + "epoch": 1.33, + "grad_norm": 2.257342204904894, + "learning_rate": 1.719614147472721e-06, + "loss": 0.6829, + "step": 8951 + }, + { + "epoch": 1.34, + "grad_norm": 2.219260950588435, + "learning_rate": 1.7195470620774288e-06, + "loss": 0.6771, + "step": 8952 + }, + { + "epoch": 1.34, + "grad_norm": 2.9024747058986833, + "learning_rate": 1.7194799699665399e-06, + "loss": 0.6348, + "step": 8953 + }, + { + "epoch": 1.34, + "grad_norm": 0.9710137471050568, + "learning_rate": 1.719412871140681e-06, + "loss": 0.7259, + "step": 8954 + }, + { + "epoch": 1.34, + "grad_norm": 3.338944812860526, + "learning_rate": 1.7193457656004778e-06, + "loss": 0.6719, + "step": 8955 + }, + { + "epoch": 1.34, + "grad_norm": 3.6506175841493045, + "learning_rate": 1.7192786533465573e-06, + "loss": 0.681, + "step": 8956 + }, + { + "epoch": 1.34, + "grad_norm": 7.253399055398987, + "learning_rate": 1.7192115343795454e-06, + "loss": 0.7083, + "step": 8957 + }, + { + "epoch": 1.34, + "grad_norm": 1.0840830239781793, + "learning_rate": 1.7191444087000688e-06, + "loss": 0.651, + "step": 8958 + }, + { + "epoch": 1.34, + "grad_norm": 2.6009415904154167, + "learning_rate": 1.7190772763087538e-06, + "loss": 0.6784, + "step": 8959 + }, + { + "epoch": 1.34, + "grad_norm": 3.0742297735226773, + "learning_rate": 1.719010137206227e-06, + "loss": 0.6745, + "step": 8960 + }, + { + "epoch": 1.34, + "grad_norm": 4.2027862734183685, + "learning_rate": 1.7189429913931147e-06, + "loss": 0.679, + "step": 8961 + }, + { + "epoch": 1.34, + "grad_norm": 1.084060899449546, + "learning_rate": 1.718875838870044e-06, + "loss": 0.694, + "step": 8962 + }, + { + "epoch": 1.34, + "grad_norm": 5.54030144316122, + "learning_rate": 1.7188086796376417e-06, + "loss": 0.6934, + "step": 8963 + }, + { + "epoch": 1.34, + "grad_norm": 2.2235415731819725, + "learning_rate": 1.7187415136965341e-06, + "loss": 0.6634, + "step": 8964 + }, + { + "epoch": 1.34, + "grad_norm": 7.884145322856984, + "learning_rate": 1.718674341047349e-06, + "loss": 0.722, + "step": 8965 + }, + { + "epoch": 1.34, + "grad_norm": 2.410164089873417, + "learning_rate": 1.7186071616907124e-06, + "loss": 0.6452, + "step": 8966 + }, + { + "epoch": 1.34, + "grad_norm": 7.506247751634845, + "learning_rate": 1.7185399756272514e-06, + "loss": 0.6764, + "step": 8967 + }, + { + "epoch": 1.34, + "grad_norm": 4.352671192525237, + "learning_rate": 1.7184727828575934e-06, + "loss": 0.6875, + "step": 8968 + }, + { + "epoch": 1.34, + "grad_norm": 4.768524571885998, + "learning_rate": 1.7184055833823655e-06, + "loss": 0.6816, + "step": 8969 + }, + { + "epoch": 1.34, + "grad_norm": 3.1496836739007787, + "learning_rate": 1.7183383772021947e-06, + "loss": 0.6934, + "step": 8970 + }, + { + "epoch": 1.34, + "grad_norm": 1.6178470161154719, + "learning_rate": 1.7182711643177082e-06, + "loss": 0.6797, + "step": 8971 + }, + { + "epoch": 1.34, + "grad_norm": 7.741940453811778, + "learning_rate": 1.7182039447295334e-06, + "loss": 0.6992, + "step": 8972 + }, + { + "epoch": 1.34, + "grad_norm": 2.3112629370397975, + "learning_rate": 1.7181367184382975e-06, + "loss": 0.6647, + "step": 8973 + }, + { + "epoch": 1.34, + "grad_norm": 3.553267187452984, + "learning_rate": 1.7180694854446284e-06, + "loss": 0.6784, + "step": 8974 + }, + { + "epoch": 1.34, + "grad_norm": 2.661821915354852, + "learning_rate": 1.7180022457491535e-06, + "loss": 0.6686, + "step": 8975 + }, + { + "epoch": 1.34, + "grad_norm": 1.0747408209109655, + "learning_rate": 1.7179349993524999e-06, + "loss": 0.6562, + "step": 8976 + }, + { + "epoch": 1.34, + "grad_norm": 1.8771873195393747, + "learning_rate": 1.7178677462552953e-06, + "loss": 0.6966, + "step": 8977 + }, + { + "epoch": 1.34, + "grad_norm": 1.5189881553442321, + "learning_rate": 1.7178004864581675e-06, + "loss": 0.6621, + "step": 8978 + }, + { + "epoch": 1.34, + "grad_norm": 6.078066732786764, + "learning_rate": 1.7177332199617445e-06, + "loss": 0.7181, + "step": 8979 + }, + { + "epoch": 1.34, + "grad_norm": 1.8254588586703813, + "learning_rate": 1.7176659467666536e-06, + "loss": 0.6868, + "step": 8980 + }, + { + "epoch": 1.34, + "grad_norm": 1.2976231202819928, + "learning_rate": 1.7175986668735234e-06, + "loss": 0.6556, + "step": 8981 + }, + { + "epoch": 1.34, + "grad_norm": 1.640736012450371, + "learning_rate": 1.7175313802829812e-06, + "loss": 0.6693, + "step": 8982 + }, + { + "epoch": 1.34, + "grad_norm": 2.3723374220918134, + "learning_rate": 1.717464086995655e-06, + "loss": 0.6706, + "step": 8983 + }, + { + "epoch": 1.34, + "grad_norm": 2.215788691671679, + "learning_rate": 1.7173967870121733e-06, + "loss": 0.6647, + "step": 8984 + }, + { + "epoch": 1.34, + "grad_norm": 3.867268714902981, + "learning_rate": 1.7173294803331637e-06, + "loss": 0.6615, + "step": 8985 + }, + { + "epoch": 1.34, + "grad_norm": 1.2745063296705421, + "learning_rate": 1.7172621669592547e-06, + "loss": 0.6562, + "step": 8986 + }, + { + "epoch": 1.34, + "grad_norm": 2.403608603923426, + "learning_rate": 1.7171948468910746e-06, + "loss": 0.6465, + "step": 8987 + }, + { + "epoch": 1.34, + "grad_norm": 1.3433467692233598, + "learning_rate": 1.7171275201292512e-06, + "loss": 0.666, + "step": 8988 + }, + { + "epoch": 1.34, + "grad_norm": 2.5717171019738085, + "learning_rate": 1.7170601866744136e-06, + "loss": 0.6914, + "step": 8989 + }, + { + "epoch": 1.34, + "grad_norm": 3.9930685814672735, + "learning_rate": 1.7169928465271898e-06, + "loss": 0.7096, + "step": 8990 + }, + { + "epoch": 1.34, + "grad_norm": 0.8856419301827728, + "learning_rate": 1.7169254996882082e-06, + "loss": 0.6947, + "step": 8991 + }, + { + "epoch": 1.34, + "grad_norm": 3.544012088366, + "learning_rate": 1.7168581461580977e-06, + "loss": 0.6732, + "step": 8992 + }, + { + "epoch": 1.34, + "grad_norm": 3.084472836823921, + "learning_rate": 1.7167907859374867e-06, + "loss": 0.679, + "step": 8993 + }, + { + "epoch": 1.34, + "grad_norm": 1.9000898613294417, + "learning_rate": 1.7167234190270038e-06, + "loss": 0.6901, + "step": 8994 + }, + { + "epoch": 1.34, + "grad_norm": 1.178034544564859, + "learning_rate": 1.7166560454272778e-06, + "loss": 0.6914, + "step": 8995 + }, + { + "epoch": 1.34, + "grad_norm": 4.375699448575205, + "learning_rate": 1.7165886651389378e-06, + "loss": 0.6634, + "step": 8996 + }, + { + "epoch": 1.34, + "grad_norm": 1.0653827565566047, + "learning_rate": 1.7165212781626122e-06, + "loss": 0.6764, + "step": 8997 + }, + { + "epoch": 1.34, + "grad_norm": 6.3482200380249125, + "learning_rate": 1.7164538844989301e-06, + "loss": 0.6771, + "step": 8998 + }, + { + "epoch": 1.34, + "grad_norm": 2.083264404060306, + "learning_rate": 1.7163864841485208e-06, + "loss": 0.6667, + "step": 8999 + }, + { + "epoch": 1.34, + "grad_norm": 3.074538824067115, + "learning_rate": 1.7163190771120132e-06, + "loss": 0.6693, + "step": 9000 + }, + { + "epoch": 1.34, + "grad_norm": 1.7788532985539869, + "learning_rate": 1.7162516633900357e-06, + "loss": 0.6855, + "step": 9001 + }, + { + "epoch": 1.34, + "grad_norm": 4.725358093385939, + "learning_rate": 1.7161842429832185e-06, + "loss": 0.6882, + "step": 9002 + }, + { + "epoch": 1.34, + "grad_norm": 2.384930412897024, + "learning_rate": 1.7161168158921905e-06, + "loss": 0.7005, + "step": 9003 + }, + { + "epoch": 1.34, + "grad_norm": 3.3553788760032313, + "learning_rate": 1.7160493821175806e-06, + "loss": 0.6784, + "step": 9004 + }, + { + "epoch": 1.34, + "grad_norm": 1.1353755295399244, + "learning_rate": 1.7159819416600184e-06, + "loss": 0.6549, + "step": 9005 + }, + { + "epoch": 1.34, + "grad_norm": 3.1368043743697673, + "learning_rate": 1.7159144945201337e-06, + "loss": 0.6576, + "step": 9006 + }, + { + "epoch": 1.34, + "grad_norm": 2.9340387595810604, + "learning_rate": 1.7158470406985558e-06, + "loss": 0.6628, + "step": 9007 + }, + { + "epoch": 1.34, + "grad_norm": 0.7174178562105439, + "learning_rate": 1.7157795801959143e-06, + "loss": 0.6745, + "step": 9008 + }, + { + "epoch": 1.34, + "grad_norm": 2.3276216843714925, + "learning_rate": 1.7157121130128384e-06, + "loss": 0.6868, + "step": 9009 + }, + { + "epoch": 1.34, + "grad_norm": 2.1672819462742563, + "learning_rate": 1.7156446391499578e-06, + "loss": 0.6745, + "step": 9010 + }, + { + "epoch": 1.34, + "grad_norm": 3.5629731749653266, + "learning_rate": 1.715577158607903e-06, + "loss": 0.6699, + "step": 9011 + }, + { + "epoch": 1.34, + "grad_norm": 3.505547713333929, + "learning_rate": 1.715509671387303e-06, + "loss": 0.668, + "step": 9012 + }, + { + "epoch": 1.34, + "grad_norm": 3.490603145544369, + "learning_rate": 1.715442177488788e-06, + "loss": 0.6562, + "step": 9013 + }, + { + "epoch": 1.34, + "grad_norm": 2.0346071929544385, + "learning_rate": 1.715374676912988e-06, + "loss": 0.6699, + "step": 9014 + }, + { + "epoch": 1.34, + "grad_norm": 2.0249043576247887, + "learning_rate": 1.7153071696605328e-06, + "loss": 0.6615, + "step": 9015 + }, + { + "epoch": 1.34, + "grad_norm": 0.8217915899058746, + "learning_rate": 1.7152396557320525e-06, + "loss": 0.6699, + "step": 9016 + }, + { + "epoch": 1.34, + "grad_norm": 0.892651502458677, + "learning_rate": 1.7151721351281774e-06, + "loss": 0.668, + "step": 9017 + }, + { + "epoch": 1.34, + "grad_norm": 0.9720393540597205, + "learning_rate": 1.7151046078495374e-06, + "loss": 0.6751, + "step": 9018 + }, + { + "epoch": 1.35, + "grad_norm": 2.0197153100111715, + "learning_rate": 1.7150370738967627e-06, + "loss": 0.6953, + "step": 9019 + }, + { + "epoch": 1.35, + "grad_norm": 1.119644306477457, + "learning_rate": 1.714969533270484e-06, + "loss": 0.6927, + "step": 9020 + }, + { + "epoch": 1.35, + "grad_norm": 3.9104934637935873, + "learning_rate": 1.7149019859713314e-06, + "loss": 0.6615, + "step": 9021 + }, + { + "epoch": 1.35, + "grad_norm": 0.915249049429312, + "learning_rate": 1.714834431999935e-06, + "loss": 0.6758, + "step": 9022 + }, + { + "epoch": 1.35, + "grad_norm": 2.4187473346374686, + "learning_rate": 1.714766871356926e-06, + "loss": 0.6934, + "step": 9023 + }, + { + "epoch": 1.35, + "grad_norm": 0.7675734208695795, + "learning_rate": 1.7146993040429345e-06, + "loss": 0.6784, + "step": 9024 + }, + { + "epoch": 1.35, + "grad_norm": 2.427500884512883, + "learning_rate": 1.7146317300585911e-06, + "loss": 0.6797, + "step": 9025 + }, + { + "epoch": 1.35, + "grad_norm": 1.7647487942347952, + "learning_rate": 1.7145641494045266e-06, + "loss": 0.6647, + "step": 9026 + }, + { + "epoch": 1.35, + "grad_norm": 2.109123530498625, + "learning_rate": 1.7144965620813718e-06, + "loss": 0.6432, + "step": 9027 + }, + { + "epoch": 1.35, + "grad_norm": 0.9875505046055676, + "learning_rate": 1.7144289680897574e-06, + "loss": 0.6764, + "step": 9028 + }, + { + "epoch": 1.35, + "grad_norm": 0.874156930572412, + "learning_rate": 1.7143613674303141e-06, + "loss": 0.6549, + "step": 9029 + }, + { + "epoch": 1.35, + "grad_norm": 0.97330863411936, + "learning_rate": 1.714293760103673e-06, + "loss": 0.666, + "step": 9030 + }, + { + "epoch": 1.35, + "grad_norm": 2.211470045528498, + "learning_rate": 1.7142261461104652e-06, + "loss": 0.6615, + "step": 9031 + }, + { + "epoch": 1.35, + "grad_norm": 2.8280705405206157, + "learning_rate": 1.7141585254513216e-06, + "loss": 0.6738, + "step": 9032 + }, + { + "epoch": 1.35, + "grad_norm": 1.8074704181700543, + "learning_rate": 1.7140908981268733e-06, + "loss": 0.6556, + "step": 9033 + }, + { + "epoch": 1.35, + "grad_norm": 2.848355039218546, + "learning_rate": 1.7140232641377512e-06, + "loss": 0.6589, + "step": 9034 + }, + { + "epoch": 1.35, + "grad_norm": 1.6001004303403215, + "learning_rate": 1.7139556234845874e-06, + "loss": 0.6523, + "step": 9035 + }, + { + "epoch": 1.35, + "grad_norm": 2.50740861022956, + "learning_rate": 1.7138879761680122e-06, + "loss": 0.6895, + "step": 9036 + }, + { + "epoch": 1.35, + "grad_norm": 3.6293222014507354, + "learning_rate": 1.7138203221886576e-06, + "loss": 0.7012, + "step": 9037 + }, + { + "epoch": 1.35, + "grad_norm": 2.3326909990901106, + "learning_rate": 1.7137526615471546e-06, + "loss": 0.6595, + "step": 9038 + }, + { + "epoch": 1.35, + "grad_norm": 1.041469772606308, + "learning_rate": 1.7136849942441352e-06, + "loss": 0.6758, + "step": 9039 + }, + { + "epoch": 1.35, + "grad_norm": 0.9683833265499198, + "learning_rate": 1.7136173202802304e-06, + "loss": 0.6823, + "step": 9040 + }, + { + "epoch": 1.35, + "grad_norm": 3.938900594767453, + "learning_rate": 1.713549639656072e-06, + "loss": 0.6732, + "step": 9041 + }, + { + "epoch": 1.35, + "grad_norm": 3.1086422282866333, + "learning_rate": 1.7134819523722919e-06, + "loss": 0.6523, + "step": 9042 + }, + { + "epoch": 1.35, + "grad_norm": 1.8357929820014929, + "learning_rate": 1.7134142584295215e-06, + "loss": 0.6686, + "step": 9043 + }, + { + "epoch": 1.35, + "grad_norm": 0.8671366356368536, + "learning_rate": 1.7133465578283927e-06, + "loss": 0.666, + "step": 9044 + }, + { + "epoch": 1.35, + "grad_norm": 5.284288204450902, + "learning_rate": 1.7132788505695374e-06, + "loss": 0.6882, + "step": 9045 + }, + { + "epoch": 1.35, + "grad_norm": 0.8948781987477697, + "learning_rate": 1.7132111366535877e-06, + "loss": 0.6634, + "step": 9046 + }, + { + "epoch": 1.35, + "grad_norm": 2.541112436110081, + "learning_rate": 1.713143416081175e-06, + "loss": 0.6771, + "step": 9047 + }, + { + "epoch": 1.35, + "grad_norm": 1.2783511799684615, + "learning_rate": 1.7130756888529324e-06, + "loss": 0.6562, + "step": 9048 + }, + { + "epoch": 1.35, + "grad_norm": 3.1659786065043574, + "learning_rate": 1.7130079549694906e-06, + "loss": 0.6478, + "step": 9049 + }, + { + "epoch": 1.35, + "grad_norm": 2.3106995162728103, + "learning_rate": 1.712940214431483e-06, + "loss": 0.6641, + "step": 9050 + }, + { + "epoch": 1.35, + "grad_norm": 0.9946008180456891, + "learning_rate": 1.7128724672395412e-06, + "loss": 0.6823, + "step": 9051 + }, + { + "epoch": 1.35, + "grad_norm": 1.687744686691879, + "learning_rate": 1.7128047133942976e-06, + "loss": 0.6777, + "step": 9052 + }, + { + "epoch": 1.35, + "grad_norm": 2.3503451745715793, + "learning_rate": 1.7127369528963844e-06, + "loss": 0.6855, + "step": 9053 + }, + { + "epoch": 1.35, + "grad_norm": 1.1352038693910043, + "learning_rate": 1.712669185746434e-06, + "loss": 0.7064, + "step": 9054 + }, + { + "epoch": 1.35, + "grad_norm": 1.3773344460794612, + "learning_rate": 1.7126014119450796e-06, + "loss": 0.6888, + "step": 9055 + }, + { + "epoch": 1.35, + "grad_norm": 4.511860599937161, + "learning_rate": 1.712533631492953e-06, + "loss": 0.6914, + "step": 9056 + }, + { + "epoch": 1.35, + "grad_norm": 0.894526713962332, + "learning_rate": 1.712465844390687e-06, + "loss": 0.6823, + "step": 9057 + }, + { + "epoch": 1.35, + "grad_norm": 1.9740906651503238, + "learning_rate": 1.7123980506389144e-06, + "loss": 0.6621, + "step": 9058 + }, + { + "epoch": 1.35, + "grad_norm": 2.749367763918816, + "learning_rate": 1.7123302502382676e-06, + "loss": 0.7025, + "step": 9059 + }, + { + "epoch": 1.35, + "grad_norm": 3.3987918344718833, + "learning_rate": 1.7122624431893798e-06, + "loss": 0.6927, + "step": 9060 + }, + { + "epoch": 1.35, + "grad_norm": 3.1280028527367096, + "learning_rate": 1.7121946294928834e-06, + "loss": 0.6712, + "step": 9061 + }, + { + "epoch": 1.35, + "grad_norm": 4.350331840125984, + "learning_rate": 1.7121268091494114e-06, + "loss": 0.6895, + "step": 9062 + }, + { + "epoch": 1.35, + "grad_norm": 1.4263239966174013, + "learning_rate": 1.7120589821595973e-06, + "loss": 0.6732, + "step": 9063 + }, + { + "epoch": 1.35, + "grad_norm": 0.6434971649063116, + "learning_rate": 1.7119911485240736e-06, + "loss": 0.6738, + "step": 9064 + }, + { + "epoch": 1.35, + "grad_norm": 1.4170458487773365, + "learning_rate": 1.7119233082434734e-06, + "loss": 0.6686, + "step": 9065 + }, + { + "epoch": 1.35, + "grad_norm": 1.9560868793371984, + "learning_rate": 1.7118554613184302e-06, + "loss": 0.6693, + "step": 9066 + }, + { + "epoch": 1.35, + "grad_norm": 0.7210832389355212, + "learning_rate": 1.711787607749577e-06, + "loss": 0.6745, + "step": 9067 + }, + { + "epoch": 1.35, + "grad_norm": 0.6597669433952074, + "learning_rate": 1.7117197475375468e-06, + "loss": 0.6777, + "step": 9068 + }, + { + "epoch": 1.35, + "grad_norm": 1.8732359824918814, + "learning_rate": 1.7116518806829739e-06, + "loss": 0.6875, + "step": 9069 + }, + { + "epoch": 1.35, + "grad_norm": 2.1662406421361484, + "learning_rate": 1.7115840071864904e-06, + "loss": 0.6771, + "step": 9070 + }, + { + "epoch": 1.35, + "grad_norm": 0.7226183457878842, + "learning_rate": 1.7115161270487306e-06, + "loss": 0.6836, + "step": 9071 + }, + { + "epoch": 1.35, + "grad_norm": 3.2130716590572, + "learning_rate": 1.7114482402703281e-06, + "loss": 0.679, + "step": 9072 + }, + { + "epoch": 1.35, + "grad_norm": 0.8223146986778914, + "learning_rate": 1.7113803468519162e-06, + "loss": 0.6536, + "step": 9073 + }, + { + "epoch": 1.35, + "grad_norm": 2.407373747077449, + "learning_rate": 1.7113124467941286e-06, + "loss": 0.6406, + "step": 9074 + }, + { + "epoch": 1.35, + "grad_norm": 1.024380876847112, + "learning_rate": 1.7112445400975988e-06, + "loss": 0.6673, + "step": 9075 + }, + { + "epoch": 1.35, + "grad_norm": 0.755740003041189, + "learning_rate": 1.711176626762961e-06, + "loss": 0.6562, + "step": 9076 + }, + { + "epoch": 1.35, + "grad_norm": 0.8439304433221655, + "learning_rate": 1.7111087067908487e-06, + "loss": 0.6849, + "step": 9077 + }, + { + "epoch": 1.35, + "grad_norm": 1.7354205028354484, + "learning_rate": 1.7110407801818962e-06, + "loss": 0.6641, + "step": 9078 + }, + { + "epoch": 1.35, + "grad_norm": 2.7390369337098868, + "learning_rate": 1.710972846936737e-06, + "loss": 0.6934, + "step": 9079 + }, + { + "epoch": 1.35, + "grad_norm": 0.9272767780723676, + "learning_rate": 1.7109049070560055e-06, + "loss": 0.6816, + "step": 9080 + }, + { + "epoch": 1.35, + "grad_norm": 9.028404461263111, + "learning_rate": 1.7108369605403357e-06, + "loss": 0.6992, + "step": 9081 + }, + { + "epoch": 1.35, + "grad_norm": 0.7974605530165487, + "learning_rate": 1.7107690073903616e-06, + "loss": 0.6576, + "step": 9082 + }, + { + "epoch": 1.35, + "grad_norm": 4.013495534104482, + "learning_rate": 1.7107010476067175e-06, + "loss": 0.7109, + "step": 9083 + }, + { + "epoch": 1.35, + "grad_norm": 3.321939947300424, + "learning_rate": 1.7106330811900374e-06, + "loss": 0.6602, + "step": 9084 + }, + { + "epoch": 1.35, + "grad_norm": 1.0039742453854488, + "learning_rate": 1.710565108140956e-06, + "loss": 0.6458, + "step": 9085 + }, + { + "epoch": 1.36, + "grad_norm": 4.325559642864358, + "learning_rate": 1.710497128460108e-06, + "loss": 0.6862, + "step": 9086 + }, + { + "epoch": 1.36, + "grad_norm": 1.878290321759715, + "learning_rate": 1.7104291421481272e-06, + "loss": 0.7038, + "step": 9087 + }, + { + "epoch": 1.36, + "grad_norm": 3.1012525503769233, + "learning_rate": 1.7103611492056485e-06, + "loss": 0.6862, + "step": 9088 + }, + { + "epoch": 1.36, + "grad_norm": 3.5048180602999537, + "learning_rate": 1.7102931496333062e-06, + "loss": 0.6862, + "step": 9089 + }, + { + "epoch": 1.36, + "grad_norm": 1.351036201653561, + "learning_rate": 1.7102251434317352e-06, + "loss": 0.7109, + "step": 9090 + }, + { + "epoch": 1.36, + "grad_norm": 5.818849148778638, + "learning_rate": 1.71015713060157e-06, + "loss": 0.6758, + "step": 9091 + }, + { + "epoch": 1.36, + "grad_norm": 1.8614200668414003, + "learning_rate": 1.7100891111434456e-06, + "loss": 0.6973, + "step": 9092 + }, + { + "epoch": 1.36, + "grad_norm": 4.492782335610241, + "learning_rate": 1.7100210850579968e-06, + "loss": 0.6777, + "step": 9093 + }, + { + "epoch": 1.36, + "grad_norm": 5.010033144454473, + "learning_rate": 1.7099530523458583e-06, + "loss": 0.679, + "step": 9094 + }, + { + "epoch": 1.36, + "grad_norm": 2.5303941367958394, + "learning_rate": 1.7098850130076653e-06, + "loss": 0.6745, + "step": 9095 + }, + { + "epoch": 1.36, + "grad_norm": 2.489790973865067, + "learning_rate": 1.7098169670440524e-06, + "loss": 0.6641, + "step": 9096 + }, + { + "epoch": 1.36, + "grad_norm": 2.590792985774741, + "learning_rate": 1.7097489144556553e-06, + "loss": 0.6595, + "step": 9097 + }, + { + "epoch": 1.36, + "grad_norm": 0.8175106743161007, + "learning_rate": 1.7096808552431089e-06, + "loss": 0.6979, + "step": 9098 + }, + { + "epoch": 1.36, + "grad_norm": 0.8251667946520137, + "learning_rate": 1.709612789407048e-06, + "loss": 0.6966, + "step": 9099 + }, + { + "epoch": 1.36, + "grad_norm": 2.717400213158441, + "learning_rate": 1.7095447169481082e-06, + "loss": 0.653, + "step": 9100 + }, + { + "epoch": 1.36, + "grad_norm": 0.6602295525474112, + "learning_rate": 1.709476637866925e-06, + "loss": 0.6745, + "step": 9101 + }, + { + "epoch": 1.36, + "grad_norm": 0.6393800728247847, + "learning_rate": 1.7094085521641338e-06, + "loss": 0.6758, + "step": 9102 + }, + { + "epoch": 1.36, + "grad_norm": 0.9733177461150109, + "learning_rate": 1.7093404598403693e-06, + "loss": 0.6608, + "step": 9103 + }, + { + "epoch": 1.36, + "grad_norm": 1.223699148599248, + "learning_rate": 1.7092723608962678e-06, + "loss": 0.6771, + "step": 9104 + }, + { + "epoch": 1.36, + "grad_norm": 3.8676541560964033, + "learning_rate": 1.7092042553324646e-06, + "loss": 0.6719, + "step": 9105 + }, + { + "epoch": 1.36, + "grad_norm": 2.5129203623465743, + "learning_rate": 1.7091361431495955e-06, + "loss": 0.7044, + "step": 9106 + }, + { + "epoch": 1.36, + "grad_norm": 2.228429862898769, + "learning_rate": 1.709068024348296e-06, + "loss": 0.6628, + "step": 9107 + }, + { + "epoch": 1.36, + "grad_norm": 2.54767917846934, + "learning_rate": 1.708999898929202e-06, + "loss": 0.6745, + "step": 9108 + }, + { + "epoch": 1.36, + "grad_norm": 0.8309529874787356, + "learning_rate": 1.7089317668929492e-06, + "loss": 0.6842, + "step": 9109 + }, + { + "epoch": 1.36, + "grad_norm": 5.3499200550488455, + "learning_rate": 1.7088636282401735e-06, + "loss": 0.668, + "step": 9110 + }, + { + "epoch": 1.36, + "grad_norm": 0.7949153142228425, + "learning_rate": 1.7087954829715108e-06, + "loss": 0.6738, + "step": 9111 + }, + { + "epoch": 1.36, + "grad_norm": 2.9300368443364615, + "learning_rate": 1.7087273310875972e-06, + "loss": 0.6875, + "step": 9112 + }, + { + "epoch": 1.36, + "grad_norm": 0.8036696797588058, + "learning_rate": 1.7086591725890686e-06, + "loss": 0.638, + "step": 9113 + }, + { + "epoch": 1.36, + "grad_norm": 1.5640424537198738, + "learning_rate": 1.7085910074765616e-06, + "loss": 0.6901, + "step": 9114 + }, + { + "epoch": 1.36, + "grad_norm": 0.8000382282718134, + "learning_rate": 1.708522835750712e-06, + "loss": 0.6836, + "step": 9115 + }, + { + "epoch": 1.36, + "grad_norm": 4.7430277704671875, + "learning_rate": 1.7084546574121558e-06, + "loss": 0.6738, + "step": 9116 + }, + { + "epoch": 1.36, + "grad_norm": 3.946832770091644, + "learning_rate": 1.7083864724615297e-06, + "loss": 0.696, + "step": 9117 + }, + { + "epoch": 1.36, + "grad_norm": 2.725773843208377, + "learning_rate": 1.7083182808994705e-06, + "loss": 0.6842, + "step": 9118 + }, + { + "epoch": 1.36, + "grad_norm": 4.704531900905469, + "learning_rate": 1.7082500827266137e-06, + "loss": 0.6953, + "step": 9119 + }, + { + "epoch": 1.36, + "grad_norm": 0.9647894785419417, + "learning_rate": 1.7081818779435964e-06, + "loss": 0.7044, + "step": 9120 + }, + { + "epoch": 1.36, + "grad_norm": 1.50612295057099, + "learning_rate": 1.7081136665510547e-06, + "loss": 0.6771, + "step": 9121 + }, + { + "epoch": 1.36, + "grad_norm": 0.8464149537748066, + "learning_rate": 1.708045448549626e-06, + "loss": 0.6621, + "step": 9122 + }, + { + "epoch": 1.36, + "grad_norm": 0.8605808096500263, + "learning_rate": 1.7079772239399462e-06, + "loss": 0.6953, + "step": 9123 + }, + { + "epoch": 1.36, + "grad_norm": 6.017125772260472, + "learning_rate": 1.7079089927226525e-06, + "loss": 0.6803, + "step": 9124 + }, + { + "epoch": 1.36, + "grad_norm": 4.039000481028101, + "learning_rate": 1.7078407548983816e-06, + "loss": 0.6895, + "step": 9125 + }, + { + "epoch": 1.36, + "grad_norm": 1.9734383884144266, + "learning_rate": 1.7077725104677701e-06, + "loss": 0.6992, + "step": 9126 + }, + { + "epoch": 1.36, + "grad_norm": 2.5348185884774788, + "learning_rate": 1.7077042594314554e-06, + "loss": 0.7031, + "step": 9127 + }, + { + "epoch": 1.36, + "grad_norm": 2.435702693767827, + "learning_rate": 1.7076360017900742e-06, + "loss": 0.6621, + "step": 9128 + }, + { + "epoch": 1.36, + "grad_norm": 2.3869823993023864, + "learning_rate": 1.7075677375442635e-06, + "loss": 0.6921, + "step": 9129 + }, + { + "epoch": 1.36, + "grad_norm": 0.567544583706478, + "learning_rate": 1.7074994666946605e-06, + "loss": 0.6803, + "step": 9130 + }, + { + "epoch": 1.36, + "grad_norm": 3.4869485978466845, + "learning_rate": 1.7074311892419026e-06, + "loss": 0.6686, + "step": 9131 + }, + { + "epoch": 1.36, + "grad_norm": 1.3029160208151074, + "learning_rate": 1.7073629051866267e-06, + "loss": 0.6751, + "step": 9132 + }, + { + "epoch": 1.36, + "grad_norm": 2.3148795680212846, + "learning_rate": 1.7072946145294704e-06, + "loss": 0.6771, + "step": 9133 + }, + { + "epoch": 1.36, + "grad_norm": 1.2751413563973062, + "learning_rate": 1.7072263172710709e-06, + "loss": 0.6628, + "step": 9134 + }, + { + "epoch": 1.36, + "grad_norm": 6.191143974686877, + "learning_rate": 1.7071580134120656e-06, + "loss": 0.7018, + "step": 9135 + }, + { + "epoch": 1.36, + "grad_norm": 6.49999325251229, + "learning_rate": 1.707089702953092e-06, + "loss": 0.694, + "step": 9136 + }, + { + "epoch": 1.36, + "grad_norm": 1.065365630302391, + "learning_rate": 1.7070213858947877e-06, + "loss": 0.6738, + "step": 9137 + }, + { + "epoch": 1.36, + "grad_norm": 3.874218493775202, + "learning_rate": 1.7069530622377902e-06, + "loss": 0.681, + "step": 9138 + }, + { + "epoch": 1.36, + "grad_norm": 2.531184477262752, + "learning_rate": 1.706884731982737e-06, + "loss": 0.6895, + "step": 9139 + }, + { + "epoch": 1.36, + "grad_norm": 0.7390628639342364, + "learning_rate": 1.7068163951302664e-06, + "loss": 0.6667, + "step": 9140 + }, + { + "epoch": 1.36, + "grad_norm": 3.508200683477671, + "learning_rate": 1.706748051681016e-06, + "loss": 0.6966, + "step": 9141 + }, + { + "epoch": 1.36, + "grad_norm": 0.8896050996807681, + "learning_rate": 1.7066797016356233e-06, + "loss": 0.6784, + "step": 9142 + }, + { + "epoch": 1.36, + "grad_norm": 2.5103271688703384, + "learning_rate": 1.7066113449947267e-06, + "loss": 0.6921, + "step": 9143 + }, + { + "epoch": 1.36, + "grad_norm": 2.8714912983514442, + "learning_rate": 1.7065429817589638e-06, + "loss": 0.6712, + "step": 9144 + }, + { + "epoch": 1.36, + "grad_norm": 0.6069904328109078, + "learning_rate": 1.7064746119289728e-06, + "loss": 0.6719, + "step": 9145 + }, + { + "epoch": 1.36, + "grad_norm": 1.9651477494093823, + "learning_rate": 1.7064062355053917e-06, + "loss": 0.6549, + "step": 9146 + }, + { + "epoch": 1.36, + "grad_norm": 3.4323148038383406, + "learning_rate": 1.7063378524888585e-06, + "loss": 0.6908, + "step": 9147 + }, + { + "epoch": 1.36, + "grad_norm": 2.9316522834444787, + "learning_rate": 1.706269462880012e-06, + "loss": 0.6908, + "step": 9148 + }, + { + "epoch": 1.36, + "grad_norm": 1.0844973967009657, + "learning_rate": 1.70620106667949e-06, + "loss": 0.6738, + "step": 9149 + }, + { + "epoch": 1.36, + "grad_norm": 1.4672287281384517, + "learning_rate": 1.706132663887931e-06, + "loss": 0.6621, + "step": 9150 + }, + { + "epoch": 1.36, + "grad_norm": 1.6019516846017468, + "learning_rate": 1.7060642545059736e-06, + "loss": 0.6758, + "step": 9151 + }, + { + "epoch": 1.36, + "grad_norm": 1.028753949485863, + "learning_rate": 1.7059958385342559e-06, + "loss": 0.6549, + "step": 9152 + }, + { + "epoch": 1.37, + "grad_norm": 1.6074441456602453, + "learning_rate": 1.7059274159734165e-06, + "loss": 0.6849, + "step": 9153 + }, + { + "epoch": 1.37, + "grad_norm": 4.0613249609574735, + "learning_rate": 1.7058589868240942e-06, + "loss": 0.6888, + "step": 9154 + }, + { + "epoch": 1.37, + "grad_norm": 3.5841330069596347, + "learning_rate": 1.7057905510869276e-06, + "loss": 0.7057, + "step": 9155 + }, + { + "epoch": 1.37, + "grad_norm": 1.1745438346160024, + "learning_rate": 1.7057221087625556e-06, + "loss": 0.6719, + "step": 9156 + }, + { + "epoch": 1.37, + "grad_norm": 2.0862078183324, + "learning_rate": 1.7056536598516162e-06, + "loss": 0.679, + "step": 9157 + }, + { + "epoch": 1.37, + "grad_norm": 1.0018854958153858, + "learning_rate": 1.705585204354749e-06, + "loss": 0.681, + "step": 9158 + }, + { + "epoch": 1.37, + "grad_norm": 0.7166568353622885, + "learning_rate": 1.705516742272593e-06, + "loss": 0.6777, + "step": 9159 + }, + { + "epoch": 1.37, + "grad_norm": 0.965679843057323, + "learning_rate": 1.7054482736057864e-06, + "loss": 0.6641, + "step": 9160 + }, + { + "epoch": 1.37, + "grad_norm": 2.313719889776869, + "learning_rate": 1.7053797983549688e-06, + "loss": 0.6986, + "step": 9161 + }, + { + "epoch": 1.37, + "grad_norm": 1.266355297620178, + "learning_rate": 1.7053113165207793e-06, + "loss": 0.6621, + "step": 9162 + }, + { + "epoch": 1.37, + "grad_norm": 2.261614024405649, + "learning_rate": 1.705242828103857e-06, + "loss": 0.6829, + "step": 9163 + }, + { + "epoch": 1.37, + "grad_norm": 4.451123486434517, + "learning_rate": 1.7051743331048406e-06, + "loss": 0.6992, + "step": 9164 + }, + { + "epoch": 1.37, + "grad_norm": 0.7129984101409237, + "learning_rate": 1.7051058315243702e-06, + "loss": 0.6836, + "step": 9165 + }, + { + "epoch": 1.37, + "grad_norm": 1.3421253669107116, + "learning_rate": 1.7050373233630846e-06, + "loss": 0.6829, + "step": 9166 + }, + { + "epoch": 1.37, + "grad_norm": 0.7734909927153423, + "learning_rate": 1.7049688086216232e-06, + "loss": 0.6602, + "step": 9167 + }, + { + "epoch": 1.37, + "grad_norm": 2.8995125575614367, + "learning_rate": 1.7049002873006258e-06, + "loss": 0.6602, + "step": 9168 + }, + { + "epoch": 1.37, + "grad_norm": 0.8490212454214706, + "learning_rate": 1.7048317594007316e-06, + "loss": 0.6849, + "step": 9169 + }, + { + "epoch": 1.37, + "grad_norm": 3.095084113345477, + "learning_rate": 1.7047632249225798e-06, + "loss": 0.6738, + "step": 9170 + }, + { + "epoch": 1.37, + "grad_norm": 0.6568380624402392, + "learning_rate": 1.704694683866811e-06, + "loss": 0.6738, + "step": 9171 + }, + { + "epoch": 1.37, + "grad_norm": 0.748982304955807, + "learning_rate": 1.704626136234064e-06, + "loss": 0.6855, + "step": 9172 + }, + { + "epoch": 1.37, + "grad_norm": 5.0512377617443445, + "learning_rate": 1.7045575820249797e-06, + "loss": 0.709, + "step": 9173 + }, + { + "epoch": 1.37, + "grad_norm": 0.7080458453296153, + "learning_rate": 1.7044890212401966e-06, + "loss": 0.6634, + "step": 9174 + }, + { + "epoch": 1.37, + "grad_norm": 1.7710001355294278, + "learning_rate": 1.7044204538803553e-06, + "loss": 0.6686, + "step": 9175 + }, + { + "epoch": 1.37, + "grad_norm": 4.461655425232297, + "learning_rate": 1.7043518799460955e-06, + "loss": 0.6576, + "step": 9176 + }, + { + "epoch": 1.37, + "grad_norm": 5.183856585613049, + "learning_rate": 1.7042832994380575e-06, + "loss": 0.6654, + "step": 9177 + }, + { + "epoch": 1.37, + "grad_norm": 2.5882430614906857, + "learning_rate": 1.704214712356881e-06, + "loss": 0.6862, + "step": 9178 + }, + { + "epoch": 1.37, + "grad_norm": 1.791024639901277, + "learning_rate": 1.7041461187032065e-06, + "loss": 0.6725, + "step": 9179 + }, + { + "epoch": 1.37, + "grad_norm": 3.7885994989919434, + "learning_rate": 1.704077518477674e-06, + "loss": 0.6615, + "step": 9180 + }, + { + "epoch": 1.37, + "grad_norm": 3.1591723940737797, + "learning_rate": 1.7040089116809236e-06, + "loss": 0.6979, + "step": 9181 + }, + { + "epoch": 1.37, + "grad_norm": 7.945481342484333, + "learning_rate": 1.7039402983135957e-06, + "loss": 0.6712, + "step": 9182 + }, + { + "epoch": 1.37, + "grad_norm": 3.6319068602947735, + "learning_rate": 1.7038716783763309e-06, + "loss": 0.6829, + "step": 9183 + }, + { + "epoch": 1.37, + "grad_norm": 1.21590405605672, + "learning_rate": 1.7038030518697695e-06, + "loss": 0.6641, + "step": 9184 + }, + { + "epoch": 1.37, + "grad_norm": 2.6655017144368864, + "learning_rate": 1.7037344187945516e-06, + "loss": 0.6849, + "step": 9185 + }, + { + "epoch": 1.37, + "grad_norm": 4.685988350735724, + "learning_rate": 1.7036657791513186e-06, + "loss": 0.6706, + "step": 9186 + }, + { + "epoch": 1.37, + "grad_norm": 0.9414390392878395, + "learning_rate": 1.7035971329407106e-06, + "loss": 0.6706, + "step": 9187 + }, + { + "epoch": 1.37, + "grad_norm": 3.492355920422382, + "learning_rate": 1.703528480163368e-06, + "loss": 0.6829, + "step": 9188 + }, + { + "epoch": 1.37, + "grad_norm": 5.810618030144529, + "learning_rate": 1.7034598208199324e-06, + "loss": 0.6784, + "step": 9189 + }, + { + "epoch": 1.37, + "grad_norm": 3.6100306352710447, + "learning_rate": 1.7033911549110438e-06, + "loss": 0.6725, + "step": 9190 + }, + { + "epoch": 1.37, + "grad_norm": 3.0278234902747703, + "learning_rate": 1.7033224824373431e-06, + "loss": 0.6758, + "step": 9191 + }, + { + "epoch": 1.37, + "grad_norm": 2.555473390876823, + "learning_rate": 1.7032538033994718e-06, + "loss": 0.6543, + "step": 9192 + }, + { + "epoch": 1.37, + "grad_norm": 0.7213471878150616, + "learning_rate": 1.7031851177980706e-06, + "loss": 0.6686, + "step": 9193 + }, + { + "epoch": 1.37, + "grad_norm": 0.8962557986173637, + "learning_rate": 1.7031164256337801e-06, + "loss": 0.6947, + "step": 9194 + }, + { + "epoch": 1.37, + "grad_norm": 4.762864164715707, + "learning_rate": 1.7030477269072423e-06, + "loss": 0.6914, + "step": 9195 + }, + { + "epoch": 1.37, + "grad_norm": 3.6149481963694345, + "learning_rate": 1.7029790216190975e-06, + "loss": 0.6725, + "step": 9196 + }, + { + "epoch": 1.37, + "grad_norm": 4.604933797800781, + "learning_rate": 1.7029103097699877e-06, + "loss": 0.6615, + "step": 9197 + }, + { + "epoch": 1.37, + "grad_norm": 2.3555465400871327, + "learning_rate": 1.7028415913605536e-06, + "loss": 0.6888, + "step": 9198 + }, + { + "epoch": 1.37, + "grad_norm": 0.9710991646282793, + "learning_rate": 1.7027728663914366e-06, + "loss": 0.6764, + "step": 9199 + }, + { + "epoch": 1.37, + "grad_norm": 0.7729877208009758, + "learning_rate": 1.7027041348632786e-06, + "loss": 0.651, + "step": 9200 + }, + { + "epoch": 1.37, + "grad_norm": 1.9186274370402197, + "learning_rate": 1.7026353967767206e-06, + "loss": 0.679, + "step": 9201 + }, + { + "epoch": 1.37, + "grad_norm": 2.753907342009414, + "learning_rate": 1.7025666521324048e-06, + "loss": 0.6862, + "step": 9202 + }, + { + "epoch": 1.37, + "grad_norm": 3.3031610119966466, + "learning_rate": 1.7024979009309716e-06, + "loss": 0.6953, + "step": 9203 + }, + { + "epoch": 1.37, + "grad_norm": 0.986126659338499, + "learning_rate": 1.7024291431730636e-06, + "loss": 0.6576, + "step": 9204 + }, + { + "epoch": 1.37, + "grad_norm": 2.5504480955600592, + "learning_rate": 1.7023603788593222e-06, + "loss": 0.6478, + "step": 9205 + }, + { + "epoch": 1.37, + "grad_norm": 0.9136992614309805, + "learning_rate": 1.7022916079903898e-06, + "loss": 0.6738, + "step": 9206 + }, + { + "epoch": 1.37, + "grad_norm": 1.6162410039276436, + "learning_rate": 1.702222830566907e-06, + "loss": 0.6966, + "step": 9207 + }, + { + "epoch": 1.37, + "grad_norm": 1.100546818149164, + "learning_rate": 1.7021540465895169e-06, + "loss": 0.6602, + "step": 9208 + }, + { + "epoch": 1.37, + "grad_norm": 0.8854581471213354, + "learning_rate": 1.7020852560588608e-06, + "loss": 0.6582, + "step": 9209 + }, + { + "epoch": 1.37, + "grad_norm": 5.07030333328606, + "learning_rate": 1.702016458975581e-06, + "loss": 0.6921, + "step": 9210 + }, + { + "epoch": 1.37, + "grad_norm": 0.843672335659326, + "learning_rate": 1.7019476553403193e-06, + "loss": 0.6816, + "step": 9211 + }, + { + "epoch": 1.37, + "grad_norm": 0.7588860282312812, + "learning_rate": 1.7018788451537183e-06, + "loss": 0.6797, + "step": 9212 + }, + { + "epoch": 1.37, + "grad_norm": 1.747944481720809, + "learning_rate": 1.70181002841642e-06, + "loss": 0.6797, + "step": 9213 + }, + { + "epoch": 1.37, + "grad_norm": 3.86937832862698, + "learning_rate": 1.7017412051290662e-06, + "loss": 0.6484, + "step": 9214 + }, + { + "epoch": 1.37, + "grad_norm": 1.257944195346281, + "learning_rate": 1.7016723752922998e-06, + "loss": 0.6699, + "step": 9215 + }, + { + "epoch": 1.37, + "grad_norm": 3.8346938858305233, + "learning_rate": 1.7016035389067637e-06, + "loss": 0.6777, + "step": 9216 + }, + { + "epoch": 1.37, + "grad_norm": 2.5600545979239464, + "learning_rate": 1.7015346959730989e-06, + "loss": 0.6725, + "step": 9217 + }, + { + "epoch": 1.37, + "grad_norm": 3.215169648204341, + "learning_rate": 1.701465846491949e-06, + "loss": 0.6862, + "step": 9218 + }, + { + "epoch": 1.37, + "grad_norm": 0.9276252050051682, + "learning_rate": 1.7013969904639564e-06, + "loss": 0.651, + "step": 9219 + }, + { + "epoch": 1.38, + "grad_norm": 0.9777845171284809, + "learning_rate": 1.7013281278897638e-06, + "loss": 0.6784, + "step": 9220 + }, + { + "epoch": 1.38, + "grad_norm": 1.9540514205283603, + "learning_rate": 1.7012592587700137e-06, + "loss": 0.6673, + "step": 9221 + }, + { + "epoch": 1.38, + "grad_norm": 0.93581571617637, + "learning_rate": 1.7011903831053492e-06, + "loss": 0.6842, + "step": 9222 + }, + { + "epoch": 1.38, + "grad_norm": 3.754659321422443, + "learning_rate": 1.7011215008964124e-06, + "loss": 0.6634, + "step": 9223 + }, + { + "epoch": 1.38, + "grad_norm": 1.0643051295250954, + "learning_rate": 1.7010526121438465e-06, + "loss": 0.6908, + "step": 9224 + }, + { + "epoch": 1.38, + "grad_norm": 3.2003032003878245, + "learning_rate": 1.7009837168482947e-06, + "loss": 0.653, + "step": 9225 + }, + { + "epoch": 1.38, + "grad_norm": 2.519125353024893, + "learning_rate": 1.7009148150104002e-06, + "loss": 0.6784, + "step": 9226 + }, + { + "epoch": 1.38, + "grad_norm": 2.846464295347159, + "learning_rate": 1.7008459066308057e-06, + "loss": 0.6842, + "step": 9227 + }, + { + "epoch": 1.38, + "grad_norm": 1.4014656416218518, + "learning_rate": 1.700776991710154e-06, + "loss": 0.6673, + "step": 9228 + }, + { + "epoch": 1.38, + "grad_norm": 2.806527408636845, + "learning_rate": 1.7007080702490888e-06, + "loss": 0.6699, + "step": 9229 + }, + { + "epoch": 1.38, + "grad_norm": 1.7363400644522, + "learning_rate": 1.7006391422482533e-06, + "loss": 0.694, + "step": 9230 + }, + { + "epoch": 1.38, + "grad_norm": 1.632746499560052, + "learning_rate": 1.7005702077082907e-06, + "loss": 0.6771, + "step": 9231 + }, + { + "epoch": 1.38, + "grad_norm": 0.9638449625533294, + "learning_rate": 1.7005012666298444e-06, + "loss": 0.6361, + "step": 9232 + }, + { + "epoch": 1.38, + "grad_norm": 1.9142968489034125, + "learning_rate": 1.700432319013558e-06, + "loss": 0.6758, + "step": 9233 + }, + { + "epoch": 1.38, + "grad_norm": 1.071139256007235, + "learning_rate": 1.7003633648600746e-06, + "loss": 0.6908, + "step": 9234 + }, + { + "epoch": 1.38, + "grad_norm": 1.0301639055777452, + "learning_rate": 1.7002944041700381e-06, + "loss": 0.6426, + "step": 9235 + }, + { + "epoch": 1.38, + "grad_norm": 3.2929957754705725, + "learning_rate": 1.700225436944092e-06, + "loss": 0.6641, + "step": 9236 + }, + { + "epoch": 1.38, + "grad_norm": 1.2580719448363882, + "learning_rate": 1.7001564631828797e-06, + "loss": 0.6458, + "step": 9237 + }, + { + "epoch": 1.38, + "grad_norm": 1.4174130799584796, + "learning_rate": 1.7000874828870452e-06, + "loss": 0.6895, + "step": 9238 + }, + { + "epoch": 1.38, + "grad_norm": 1.5543424245884347, + "learning_rate": 1.700018496057233e-06, + "loss": 0.6445, + "step": 9239 + }, + { + "epoch": 1.38, + "grad_norm": 4.001391578860759, + "learning_rate": 1.6999495026940855e-06, + "loss": 0.6842, + "step": 9240 + }, + { + "epoch": 1.38, + "grad_norm": 1.9184626453559437, + "learning_rate": 1.6998805027982477e-06, + "loss": 0.653, + "step": 9241 + }, + { + "epoch": 1.38, + "grad_norm": 1.3882352023017643, + "learning_rate": 1.6998114963703635e-06, + "loss": 0.6829, + "step": 9242 + }, + { + "epoch": 1.38, + "grad_norm": 1.3240905247038905, + "learning_rate": 1.6997424834110766e-06, + "loss": 0.7077, + "step": 9243 + }, + { + "epoch": 1.38, + "grad_norm": 2.4071501734525, + "learning_rate": 1.6996734639210308e-06, + "loss": 0.6589, + "step": 9244 + }, + { + "epoch": 1.38, + "grad_norm": 2.9893327972856283, + "learning_rate": 1.6996044379008709e-06, + "loss": 0.6309, + "step": 9245 + }, + { + "epoch": 1.38, + "grad_norm": 1.7437779847913333, + "learning_rate": 1.6995354053512413e-06, + "loss": 0.666, + "step": 9246 + }, + { + "epoch": 1.38, + "grad_norm": 2.7830669891356767, + "learning_rate": 1.6994663662727854e-06, + "loss": 0.6693, + "step": 9247 + }, + { + "epoch": 1.38, + "grad_norm": 1.1010201619915208, + "learning_rate": 1.699397320666148e-06, + "loss": 0.6595, + "step": 9248 + }, + { + "epoch": 1.38, + "grad_norm": 2.9205206977296005, + "learning_rate": 1.699328268531974e-06, + "loss": 0.7142, + "step": 9249 + }, + { + "epoch": 1.38, + "grad_norm": 1.4787205168402404, + "learning_rate": 1.6992592098709073e-06, + "loss": 0.6823, + "step": 9250 + }, + { + "epoch": 1.38, + "grad_norm": 1.4743904948749715, + "learning_rate": 1.6991901446835924e-06, + "loss": 0.6647, + "step": 9251 + }, + { + "epoch": 1.38, + "grad_norm": 9.083251947336375, + "learning_rate": 1.6991210729706743e-06, + "loss": 0.7409, + "step": 9252 + }, + { + "epoch": 1.38, + "grad_norm": 2.4641272462664014, + "learning_rate": 1.699051994732797e-06, + "loss": 0.6549, + "step": 9253 + }, + { + "epoch": 1.38, + "grad_norm": 2.4412885383149336, + "learning_rate": 1.6989829099706058e-06, + "loss": 0.6504, + "step": 9254 + }, + { + "epoch": 1.38, + "grad_norm": 1.1034240417070411, + "learning_rate": 1.6989138186847453e-06, + "loss": 0.6882, + "step": 9255 + }, + { + "epoch": 1.38, + "grad_norm": 1.8608770423778367, + "learning_rate": 1.6988447208758606e-06, + "loss": 0.6803, + "step": 9256 + }, + { + "epoch": 1.38, + "grad_norm": 4.4692086561065985, + "learning_rate": 1.6987756165445956e-06, + "loss": 0.679, + "step": 9257 + }, + { + "epoch": 1.38, + "grad_norm": 2.4619958212595745, + "learning_rate": 1.6987065056915966e-06, + "loss": 0.6699, + "step": 9258 + }, + { + "epoch": 1.38, + "grad_norm": 2.1558976759213753, + "learning_rate": 1.698637388317508e-06, + "loss": 0.6999, + "step": 9259 + }, + { + "epoch": 1.38, + "grad_norm": 1.1765687926866297, + "learning_rate": 1.6985682644229746e-06, + "loss": 0.6595, + "step": 9260 + }, + { + "epoch": 1.38, + "grad_norm": 7.614954625654576, + "learning_rate": 1.6984991340086418e-06, + "loss": 0.696, + "step": 9261 + }, + { + "epoch": 1.38, + "grad_norm": 2.8402896958533694, + "learning_rate": 1.6984299970751553e-06, + "loss": 0.6615, + "step": 9262 + }, + { + "epoch": 1.38, + "grad_norm": 2.1821108291014037, + "learning_rate": 1.6983608536231593e-06, + "loss": 0.7012, + "step": 9263 + }, + { + "epoch": 1.38, + "grad_norm": 1.4323500174207981, + "learning_rate": 1.6982917036532998e-06, + "loss": 0.6471, + "step": 9264 + }, + { + "epoch": 1.38, + "grad_norm": 1.3236960389341141, + "learning_rate": 1.6982225471662222e-06, + "loss": 0.6855, + "step": 9265 + }, + { + "epoch": 1.38, + "grad_norm": 1.2490745638099767, + "learning_rate": 1.6981533841625717e-06, + "loss": 0.7012, + "step": 9266 + }, + { + "epoch": 1.38, + "grad_norm": 0.6293106911619734, + "learning_rate": 1.6980842146429942e-06, + "loss": 0.6777, + "step": 9267 + }, + { + "epoch": 1.38, + "grad_norm": 3.1816183059661998, + "learning_rate": 1.6980150386081347e-06, + "loss": 0.6478, + "step": 9268 + }, + { + "epoch": 1.38, + "grad_norm": 1.2338383032353804, + "learning_rate": 1.697945856058639e-06, + "loss": 0.6823, + "step": 9269 + }, + { + "epoch": 1.38, + "grad_norm": 4.360867952892332, + "learning_rate": 1.697876666995153e-06, + "loss": 0.7064, + "step": 9270 + }, + { + "epoch": 1.38, + "grad_norm": 2.5198799400694996, + "learning_rate": 1.6978074714183226e-06, + "loss": 0.6582, + "step": 9271 + }, + { + "epoch": 1.38, + "grad_norm": 2.2388099687397123, + "learning_rate": 1.6977382693287933e-06, + "loss": 0.6719, + "step": 9272 + }, + { + "epoch": 1.38, + "grad_norm": 0.722195216090002, + "learning_rate": 1.697669060727211e-06, + "loss": 0.6868, + "step": 9273 + }, + { + "epoch": 1.38, + "grad_norm": 1.3493618016380704, + "learning_rate": 1.6975998456142215e-06, + "loss": 0.6628, + "step": 9274 + }, + { + "epoch": 1.38, + "grad_norm": 1.020946446508889, + "learning_rate": 1.6975306239904713e-06, + "loss": 0.6914, + "step": 9275 + }, + { + "epoch": 1.38, + "grad_norm": 8.194982463635421, + "learning_rate": 1.6974613958566058e-06, + "loss": 0.6842, + "step": 9276 + }, + { + "epoch": 1.38, + "grad_norm": 2.6387306422896812, + "learning_rate": 1.6973921612132713e-06, + "loss": 0.6647, + "step": 9277 + }, + { + "epoch": 1.38, + "grad_norm": 3.0802581081939264, + "learning_rate": 1.697322920061114e-06, + "loss": 0.6758, + "step": 9278 + }, + { + "epoch": 1.38, + "grad_norm": 3.7604496611494076, + "learning_rate": 1.6972536724007807e-06, + "loss": 0.6738, + "step": 9279 + }, + { + "epoch": 1.38, + "grad_norm": 1.2110662509351289, + "learning_rate": 1.697184418232917e-06, + "loss": 0.6901, + "step": 9280 + }, + { + "epoch": 1.38, + "grad_norm": 4.167766531472868, + "learning_rate": 1.6971151575581694e-06, + "loss": 0.6777, + "step": 9281 + }, + { + "epoch": 1.38, + "grad_norm": 0.9565195007208538, + "learning_rate": 1.6970458903771844e-06, + "loss": 0.6771, + "step": 9282 + }, + { + "epoch": 1.38, + "grad_norm": 1.4605601189022863, + "learning_rate": 1.6969766166906085e-06, + "loss": 0.6576, + "step": 9283 + }, + { + "epoch": 1.38, + "grad_norm": 0.666487977870133, + "learning_rate": 1.696907336499088e-06, + "loss": 0.6816, + "step": 9284 + }, + { + "epoch": 1.38, + "grad_norm": 2.320914187840947, + "learning_rate": 1.69683804980327e-06, + "loss": 0.6615, + "step": 9285 + }, + { + "epoch": 1.38, + "grad_norm": 1.5169158859572844, + "learning_rate": 1.6967687566038008e-06, + "loss": 0.6582, + "step": 9286 + }, + { + "epoch": 1.39, + "grad_norm": 3.595790475012993, + "learning_rate": 1.696699456901327e-06, + "loss": 0.6686, + "step": 9287 + }, + { + "epoch": 1.39, + "grad_norm": 2.5560296537805414, + "learning_rate": 1.696630150696496e-06, + "loss": 0.6693, + "step": 9288 + }, + { + "epoch": 1.39, + "grad_norm": 2.742913319059911, + "learning_rate": 1.6965608379899536e-06, + "loss": 0.6979, + "step": 9289 + }, + { + "epoch": 1.39, + "grad_norm": 3.380188957222264, + "learning_rate": 1.6964915187823477e-06, + "loss": 0.6927, + "step": 9290 + }, + { + "epoch": 1.39, + "grad_norm": 3.0113947751765453, + "learning_rate": 1.6964221930743245e-06, + "loss": 0.6901, + "step": 9291 + }, + { + "epoch": 1.39, + "grad_norm": 2.41076979382996, + "learning_rate": 1.696352860866532e-06, + "loss": 0.6641, + "step": 9292 + }, + { + "epoch": 1.39, + "grad_norm": 3.809427436592986, + "learning_rate": 1.696283522159616e-06, + "loss": 0.6719, + "step": 9293 + }, + { + "epoch": 1.39, + "grad_norm": 1.08766669923799, + "learning_rate": 1.6962141769542247e-06, + "loss": 0.6536, + "step": 9294 + }, + { + "epoch": 1.39, + "grad_norm": 1.8587126851771527, + "learning_rate": 1.6961448252510047e-06, + "loss": 0.6712, + "step": 9295 + }, + { + "epoch": 1.39, + "grad_norm": 1.4867019979892033, + "learning_rate": 1.6960754670506037e-06, + "loss": 0.6777, + "step": 9296 + }, + { + "epoch": 1.39, + "grad_norm": 0.8130152066729462, + "learning_rate": 1.6960061023536687e-06, + "loss": 0.7005, + "step": 9297 + }, + { + "epoch": 1.39, + "grad_norm": 2.548251305563582, + "learning_rate": 1.695936731160847e-06, + "loss": 0.668, + "step": 9298 + }, + { + "epoch": 1.39, + "grad_norm": 2.9710310387617938, + "learning_rate": 1.6958673534727865e-06, + "loss": 0.6882, + "step": 9299 + }, + { + "epoch": 1.39, + "grad_norm": 3.296568962646546, + "learning_rate": 1.6957979692901345e-06, + "loss": 0.6712, + "step": 9300 + }, + { + "epoch": 1.39, + "grad_norm": 9.356611060151176, + "learning_rate": 1.6957285786135384e-06, + "loss": 0.7018, + "step": 9301 + }, + { + "epoch": 1.39, + "grad_norm": 1.5458378032170146, + "learning_rate": 1.6956591814436458e-06, + "loss": 0.6582, + "step": 9302 + }, + { + "epoch": 1.39, + "grad_norm": 4.936353252863101, + "learning_rate": 1.6955897777811048e-06, + "loss": 0.6868, + "step": 9303 + }, + { + "epoch": 1.39, + "grad_norm": 3.965068365278645, + "learning_rate": 1.6955203676265625e-06, + "loss": 0.6673, + "step": 9304 + }, + { + "epoch": 1.39, + "grad_norm": 2.306067799302311, + "learning_rate": 1.6954509509806675e-06, + "loss": 0.6908, + "step": 9305 + }, + { + "epoch": 1.39, + "grad_norm": 1.3387753185445157, + "learning_rate": 1.6953815278440674e-06, + "loss": 0.6745, + "step": 9306 + }, + { + "epoch": 1.39, + "grad_norm": 0.7358375387814916, + "learning_rate": 1.6953120982174096e-06, + "loss": 0.6901, + "step": 9307 + }, + { + "epoch": 1.39, + "grad_norm": 3.8954066360837714, + "learning_rate": 1.6952426621013429e-06, + "loss": 0.6797, + "step": 9308 + }, + { + "epoch": 1.39, + "grad_norm": 4.069287756809119, + "learning_rate": 1.6951732194965147e-06, + "loss": 0.6732, + "step": 9309 + }, + { + "epoch": 1.39, + "grad_norm": 1.751977673360162, + "learning_rate": 1.6951037704035733e-06, + "loss": 0.6693, + "step": 9310 + }, + { + "epoch": 1.39, + "grad_norm": 1.7975903164056686, + "learning_rate": 1.695034314823167e-06, + "loss": 0.6732, + "step": 9311 + }, + { + "epoch": 1.39, + "grad_norm": 3.6775739291675027, + "learning_rate": 1.694964852755944e-06, + "loss": 0.6784, + "step": 9312 + }, + { + "epoch": 1.39, + "grad_norm": 4.990000246189686, + "learning_rate": 1.6948953842025525e-06, + "loss": 0.6556, + "step": 9313 + }, + { + "epoch": 1.39, + "grad_norm": 1.9453625766833835, + "learning_rate": 1.694825909163641e-06, + "loss": 0.6908, + "step": 9314 + }, + { + "epoch": 1.39, + "grad_norm": 5.06918278363075, + "learning_rate": 1.6947564276398579e-06, + "loss": 0.6901, + "step": 9315 + }, + { + "epoch": 1.39, + "grad_norm": 2.2908503805802036, + "learning_rate": 1.6946869396318512e-06, + "loss": 0.6842, + "step": 9316 + }, + { + "epoch": 1.39, + "grad_norm": 0.893141167889451, + "learning_rate": 1.6946174451402705e-06, + "loss": 0.6816, + "step": 9317 + }, + { + "epoch": 1.39, + "grad_norm": 3.0611304873604026, + "learning_rate": 1.6945479441657634e-06, + "loss": 0.6986, + "step": 9318 + }, + { + "epoch": 1.39, + "grad_norm": 0.7537534323367286, + "learning_rate": 1.6944784367089788e-06, + "loss": 0.6732, + "step": 9319 + }, + { + "epoch": 1.39, + "grad_norm": 3.2480598692004086, + "learning_rate": 1.6944089227705658e-06, + "loss": 0.6764, + "step": 9320 + }, + { + "epoch": 1.39, + "grad_norm": 0.8459384925908271, + "learning_rate": 1.694339402351173e-06, + "loss": 0.666, + "step": 9321 + }, + { + "epoch": 1.39, + "grad_norm": 4.087434198676277, + "learning_rate": 1.6942698754514488e-06, + "loss": 0.6784, + "step": 9322 + }, + { + "epoch": 1.39, + "grad_norm": 2.454286272509224, + "learning_rate": 1.6942003420720425e-06, + "loss": 0.679, + "step": 9323 + }, + { + "epoch": 1.39, + "grad_norm": 4.000519503421371, + "learning_rate": 1.6941308022136032e-06, + "loss": 0.681, + "step": 9324 + }, + { + "epoch": 1.39, + "grad_norm": 1.3366397708062216, + "learning_rate": 1.6940612558767796e-06, + "loss": 0.6725, + "step": 9325 + }, + { + "epoch": 1.39, + "grad_norm": 2.1641869959608444, + "learning_rate": 1.6939917030622212e-06, + "loss": 0.6849, + "step": 9326 + }, + { + "epoch": 1.39, + "grad_norm": 1.332850682359284, + "learning_rate": 1.6939221437705765e-06, + "loss": 0.6888, + "step": 9327 + }, + { + "epoch": 1.39, + "grad_norm": 3.16652152219531, + "learning_rate": 1.6938525780024953e-06, + "loss": 0.6725, + "step": 9328 + }, + { + "epoch": 1.39, + "grad_norm": 2.421387783317438, + "learning_rate": 1.6937830057586267e-06, + "loss": 0.7051, + "step": 9329 + }, + { + "epoch": 1.39, + "grad_norm": 1.520427668749435, + "learning_rate": 1.6937134270396198e-06, + "loss": 0.6699, + "step": 9330 + }, + { + "epoch": 1.39, + "grad_norm": 0.6553375080138726, + "learning_rate": 1.6936438418461242e-06, + "loss": 0.6784, + "step": 9331 + }, + { + "epoch": 1.39, + "grad_norm": 3.427728941110382, + "learning_rate": 1.6935742501787891e-06, + "loss": 0.6647, + "step": 9332 + }, + { + "epoch": 1.39, + "grad_norm": 0.7017076628636436, + "learning_rate": 1.6935046520382642e-06, + "loss": 0.6888, + "step": 9333 + }, + { + "epoch": 1.39, + "grad_norm": 0.7553809924917252, + "learning_rate": 1.6934350474251992e-06, + "loss": 0.696, + "step": 9334 + }, + { + "epoch": 1.39, + "grad_norm": 1.507619793209572, + "learning_rate": 1.6933654363402433e-06, + "loss": 0.6719, + "step": 9335 + }, + { + "epoch": 1.39, + "grad_norm": 1.7693307359645383, + "learning_rate": 1.693295818784047e-06, + "loss": 0.7188, + "step": 9336 + }, + { + "epoch": 1.39, + "grad_norm": 0.901234000321596, + "learning_rate": 1.6932261947572592e-06, + "loss": 0.7038, + "step": 9337 + }, + { + "epoch": 1.39, + "grad_norm": 1.083228949780308, + "learning_rate": 1.6931565642605301e-06, + "loss": 0.668, + "step": 9338 + }, + { + "epoch": 1.39, + "grad_norm": 4.957144282707367, + "learning_rate": 1.6930869272945094e-06, + "loss": 0.6979, + "step": 9339 + }, + { + "epoch": 1.39, + "grad_norm": 0.8313734226464049, + "learning_rate": 1.693017283859847e-06, + "loss": 0.6738, + "step": 9340 + }, + { + "epoch": 1.39, + "grad_norm": 1.1737405455026442, + "learning_rate": 1.6929476339571934e-06, + "loss": 0.6797, + "step": 9341 + }, + { + "epoch": 1.39, + "grad_norm": 0.8515904359096019, + "learning_rate": 1.692877977587198e-06, + "loss": 0.6784, + "step": 9342 + }, + { + "epoch": 1.39, + "grad_norm": 0.8844046893646197, + "learning_rate": 1.692808314750511e-06, + "loss": 0.6667, + "step": 9343 + }, + { + "epoch": 1.39, + "grad_norm": 0.5609913310669468, + "learning_rate": 1.692738645447783e-06, + "loss": 0.6803, + "step": 9344 + }, + { + "epoch": 1.39, + "grad_norm": 2.8378815906557593, + "learning_rate": 1.6926689696796636e-06, + "loss": 0.666, + "step": 9345 + }, + { + "epoch": 1.39, + "grad_norm": 1.3169628596681522, + "learning_rate": 1.6925992874468035e-06, + "loss": 0.6777, + "step": 9346 + }, + { + "epoch": 1.39, + "grad_norm": 0.920950314146007, + "learning_rate": 1.6925295987498534e-06, + "loss": 0.6784, + "step": 9347 + }, + { + "epoch": 1.39, + "grad_norm": 0.5845963492813664, + "learning_rate": 1.6924599035894632e-06, + "loss": 0.6816, + "step": 9348 + }, + { + "epoch": 1.39, + "grad_norm": 4.35927708818465, + "learning_rate": 1.6923902019662831e-06, + "loss": 0.6745, + "step": 9349 + }, + { + "epoch": 1.39, + "grad_norm": 4.446936550602115, + "learning_rate": 1.6923204938809644e-06, + "loss": 0.6595, + "step": 9350 + }, + { + "epoch": 1.39, + "grad_norm": 5.567365107014403, + "learning_rate": 1.6922507793341572e-06, + "loss": 0.6738, + "step": 9351 + }, + { + "epoch": 1.39, + "grad_norm": 5.250154305495749, + "learning_rate": 1.6921810583265123e-06, + "loss": 0.6745, + "step": 9352 + }, + { + "epoch": 1.39, + "grad_norm": 2.7169281886020977, + "learning_rate": 1.6921113308586804e-06, + "loss": 0.668, + "step": 9353 + }, + { + "epoch": 1.4, + "grad_norm": 3.1883472457990956, + "learning_rate": 1.6920415969313123e-06, + "loss": 0.6908, + "step": 9354 + }, + { + "epoch": 1.4, + "grad_norm": 3.9470986701941384, + "learning_rate": 1.6919718565450585e-06, + "loss": 0.6875, + "step": 9355 + }, + { + "epoch": 1.4, + "grad_norm": 3.0638073370622294, + "learning_rate": 1.6919021097005701e-06, + "loss": 0.6758, + "step": 9356 + }, + { + "epoch": 1.4, + "grad_norm": 4.105468540000647, + "learning_rate": 1.6918323563984984e-06, + "loss": 0.6582, + "step": 9357 + }, + { + "epoch": 1.4, + "grad_norm": 7.0073942942611955, + "learning_rate": 1.691762596639494e-06, + "loss": 0.696, + "step": 9358 + }, + { + "epoch": 1.4, + "grad_norm": 0.8150929049624134, + "learning_rate": 1.6916928304242084e-06, + "loss": 0.6667, + "step": 9359 + }, + { + "epoch": 1.4, + "grad_norm": 4.180566894279906, + "learning_rate": 1.6916230577532922e-06, + "loss": 0.6829, + "step": 9360 + }, + { + "epoch": 1.4, + "grad_norm": 2.5772733079251298, + "learning_rate": 1.6915532786273968e-06, + "loss": 0.6686, + "step": 9361 + }, + { + "epoch": 1.4, + "grad_norm": 2.850781702565314, + "learning_rate": 1.6914834930471736e-06, + "loss": 0.6719, + "step": 9362 + }, + { + "epoch": 1.4, + "grad_norm": 0.7407921056957307, + "learning_rate": 1.691413701013274e-06, + "loss": 0.6914, + "step": 9363 + }, + { + "epoch": 1.4, + "grad_norm": 0.8528674590491975, + "learning_rate": 1.6913439025263488e-06, + "loss": 0.6608, + "step": 9364 + }, + { + "epoch": 1.4, + "grad_norm": 1.006634154149408, + "learning_rate": 1.69127409758705e-06, + "loss": 0.6947, + "step": 9365 + }, + { + "epoch": 1.4, + "grad_norm": 2.8251215074600293, + "learning_rate": 1.6912042861960285e-06, + "loss": 0.6895, + "step": 9366 + }, + { + "epoch": 1.4, + "grad_norm": 2.286712713562804, + "learning_rate": 1.6911344683539368e-06, + "loss": 0.6823, + "step": 9367 + }, + { + "epoch": 1.4, + "grad_norm": 4.0504496079296235, + "learning_rate": 1.6910646440614258e-06, + "loss": 0.6855, + "step": 9368 + }, + { + "epoch": 1.4, + "grad_norm": 0.9176122146603708, + "learning_rate": 1.6909948133191473e-06, + "loss": 0.6738, + "step": 9369 + }, + { + "epoch": 1.4, + "grad_norm": 1.6792945963371197, + "learning_rate": 1.6909249761277533e-06, + "loss": 0.679, + "step": 9370 + }, + { + "epoch": 1.4, + "grad_norm": 2.9747039970563067, + "learning_rate": 1.6908551324878953e-06, + "loss": 0.6823, + "step": 9371 + }, + { + "epoch": 1.4, + "grad_norm": 2.5939745533216465, + "learning_rate": 1.6907852824002253e-06, + "loss": 0.6549, + "step": 9372 + }, + { + "epoch": 1.4, + "grad_norm": 1.637600148775928, + "learning_rate": 1.6907154258653948e-06, + "loss": 0.6751, + "step": 9373 + }, + { + "epoch": 1.4, + "grad_norm": 1.3133373305441025, + "learning_rate": 1.6906455628840568e-06, + "loss": 0.6419, + "step": 9374 + }, + { + "epoch": 1.4, + "grad_norm": 0.899026958475936, + "learning_rate": 1.6905756934568622e-06, + "loss": 0.6784, + "step": 9375 + }, + { + "epoch": 1.4, + "grad_norm": 1.6043946509922165, + "learning_rate": 1.6905058175844637e-06, + "loss": 0.6536, + "step": 9376 + }, + { + "epoch": 1.4, + "grad_norm": 4.696957891930431, + "learning_rate": 1.6904359352675135e-06, + "loss": 0.6608, + "step": 9377 + }, + { + "epoch": 1.4, + "grad_norm": 0.8894273941730979, + "learning_rate": 1.6903660465066634e-06, + "loss": 0.6693, + "step": 9378 + }, + { + "epoch": 1.4, + "grad_norm": 1.7946605556350421, + "learning_rate": 1.690296151302566e-06, + "loss": 0.6953, + "step": 9379 + }, + { + "epoch": 1.4, + "grad_norm": 3.2766195226355226, + "learning_rate": 1.6902262496558737e-06, + "loss": 0.6751, + "step": 9380 + }, + { + "epoch": 1.4, + "grad_norm": 1.7969676734748228, + "learning_rate": 1.6901563415672388e-06, + "loss": 0.668, + "step": 9381 + }, + { + "epoch": 1.4, + "grad_norm": 2.646379725268616, + "learning_rate": 1.6900864270373134e-06, + "loss": 0.6673, + "step": 9382 + }, + { + "epoch": 1.4, + "grad_norm": 4.783697781402308, + "learning_rate": 1.6900165060667507e-06, + "loss": 0.7077, + "step": 9383 + }, + { + "epoch": 1.4, + "grad_norm": 0.9004929651885427, + "learning_rate": 1.6899465786562029e-06, + "loss": 0.6478, + "step": 9384 + }, + { + "epoch": 1.4, + "grad_norm": 1.0191617058793816, + "learning_rate": 1.6898766448063223e-06, + "loss": 0.6693, + "step": 9385 + }, + { + "epoch": 1.4, + "grad_norm": 1.837662663821036, + "learning_rate": 1.689806704517762e-06, + "loss": 0.6426, + "step": 9386 + }, + { + "epoch": 1.4, + "grad_norm": 3.1366253537216506, + "learning_rate": 1.6897367577911755e-06, + "loss": 0.6634, + "step": 9387 + }, + { + "epoch": 1.4, + "grad_norm": 3.113216514182604, + "learning_rate": 1.689666804627214e-06, + "loss": 0.6777, + "step": 9388 + }, + { + "epoch": 1.4, + "grad_norm": 2.778054926002196, + "learning_rate": 1.6895968450265316e-06, + "loss": 0.6992, + "step": 9389 + }, + { + "epoch": 1.4, + "grad_norm": 3.5357119429453845, + "learning_rate": 1.6895268789897808e-06, + "loss": 0.6445, + "step": 9390 + }, + { + "epoch": 1.4, + "grad_norm": 4.115141194756565, + "learning_rate": 1.6894569065176147e-06, + "loss": 0.6732, + "step": 9391 + }, + { + "epoch": 1.4, + "grad_norm": 6.279064424789582, + "learning_rate": 1.6893869276106862e-06, + "loss": 0.7161, + "step": 9392 + }, + { + "epoch": 1.4, + "grad_norm": 1.4014780891383043, + "learning_rate": 1.6893169422696486e-06, + "loss": 0.6582, + "step": 9393 + }, + { + "epoch": 1.4, + "grad_norm": 3.3764270229088553, + "learning_rate": 1.6892469504951553e-06, + "loss": 0.7285, + "step": 9394 + }, + { + "epoch": 1.4, + "grad_norm": 2.7998921900386944, + "learning_rate": 1.6891769522878587e-06, + "loss": 0.6615, + "step": 9395 + }, + { + "epoch": 1.4, + "grad_norm": 5.9316786878579295, + "learning_rate": 1.6891069476484128e-06, + "loss": 0.6797, + "step": 9396 + }, + { + "epoch": 1.4, + "grad_norm": 1.3995602102541627, + "learning_rate": 1.689036936577471e-06, + "loss": 0.6686, + "step": 9397 + }, + { + "epoch": 1.4, + "grad_norm": 2.064932272305162, + "learning_rate": 1.6889669190756866e-06, + "loss": 0.64, + "step": 9398 + }, + { + "epoch": 1.4, + "grad_norm": 1.7061225565275777, + "learning_rate": 1.6888968951437128e-06, + "loss": 0.6654, + "step": 9399 + }, + { + "epoch": 1.4, + "grad_norm": 1.0928339093536188, + "learning_rate": 1.6888268647822032e-06, + "loss": 0.6901, + "step": 9400 + }, + { + "epoch": 1.4, + "grad_norm": 4.492561742717643, + "learning_rate": 1.688756827991812e-06, + "loss": 0.6647, + "step": 9401 + }, + { + "epoch": 1.4, + "grad_norm": 5.264074640237215, + "learning_rate": 1.6886867847731922e-06, + "loss": 0.7031, + "step": 9402 + }, + { + "epoch": 1.4, + "grad_norm": 0.8550715634422468, + "learning_rate": 1.6886167351269978e-06, + "loss": 0.6654, + "step": 9403 + }, + { + "epoch": 1.4, + "grad_norm": 3.70893022801471, + "learning_rate": 1.6885466790538826e-06, + "loss": 0.6908, + "step": 9404 + }, + { + "epoch": 1.4, + "grad_norm": 1.0155061281974755, + "learning_rate": 1.6884766165545e-06, + "loss": 0.6608, + "step": 9405 + }, + { + "epoch": 1.4, + "grad_norm": 5.474309964405541, + "learning_rate": 1.6884065476295044e-06, + "loss": 0.6725, + "step": 9406 + }, + { + "epoch": 1.4, + "grad_norm": 6.161578842596586, + "learning_rate": 1.6883364722795498e-06, + "loss": 0.6582, + "step": 9407 + }, + { + "epoch": 1.4, + "grad_norm": 2.5119190749676243, + "learning_rate": 1.68826639050529e-06, + "loss": 0.6686, + "step": 9408 + }, + { + "epoch": 1.4, + "grad_norm": 0.7934815191151756, + "learning_rate": 1.688196302307379e-06, + "loss": 0.6706, + "step": 9409 + }, + { + "epoch": 1.4, + "grad_norm": 2.740643978788661, + "learning_rate": 1.6881262076864711e-06, + "loss": 0.6719, + "step": 9410 + }, + { + "epoch": 1.4, + "grad_norm": 3.831169632217328, + "learning_rate": 1.6880561066432203e-06, + "loss": 0.6543, + "step": 9411 + }, + { + "epoch": 1.4, + "grad_norm": 1.7716168953286284, + "learning_rate": 1.6879859991782813e-06, + "loss": 0.6686, + "step": 9412 + }, + { + "epoch": 1.4, + "grad_norm": 1.1891441527450586, + "learning_rate": 1.6879158852923076e-06, + "loss": 0.6725, + "step": 9413 + }, + { + "epoch": 1.4, + "grad_norm": 5.07082621642335, + "learning_rate": 1.6878457649859548e-06, + "loss": 0.7122, + "step": 9414 + }, + { + "epoch": 1.4, + "grad_norm": 1.0180750962768559, + "learning_rate": 1.6877756382598762e-06, + "loss": 0.6224, + "step": 9415 + }, + { + "epoch": 1.4, + "grad_norm": 6.656445890763785, + "learning_rate": 1.6877055051147267e-06, + "loss": 0.6719, + "step": 9416 + }, + { + "epoch": 1.4, + "grad_norm": 2.683727727664329, + "learning_rate": 1.687635365551161e-06, + "loss": 0.6582, + "step": 9417 + }, + { + "epoch": 1.4, + "grad_norm": 0.8546732376565696, + "learning_rate": 1.687565219569834e-06, + "loss": 0.668, + "step": 9418 + }, + { + "epoch": 1.4, + "grad_norm": 0.9346118892582027, + "learning_rate": 1.6874950671713994e-06, + "loss": 0.6693, + "step": 9419 + }, + { + "epoch": 1.4, + "grad_norm": 4.836182612071113, + "learning_rate": 1.687424908356513e-06, + "loss": 0.6855, + "step": 9420 + }, + { + "epoch": 1.41, + "grad_norm": 1.0953967571725485, + "learning_rate": 1.6873547431258288e-06, + "loss": 0.6576, + "step": 9421 + }, + { + "epoch": 1.41, + "grad_norm": 3.6879593264151707, + "learning_rate": 1.687284571480002e-06, + "loss": 0.6836, + "step": 9422 + }, + { + "epoch": 1.41, + "grad_norm": 3.185626434297404, + "learning_rate": 1.6872143934196878e-06, + "loss": 0.6706, + "step": 9423 + }, + { + "epoch": 1.41, + "grad_norm": 1.0679871414618531, + "learning_rate": 1.6871442089455407e-06, + "loss": 0.6549, + "step": 9424 + }, + { + "epoch": 1.41, + "grad_norm": 2.517089891405334, + "learning_rate": 1.687074018058216e-06, + "loss": 0.6803, + "step": 9425 + }, + { + "epoch": 1.41, + "grad_norm": 2.0183511683858577, + "learning_rate": 1.687003820758369e-06, + "loss": 0.6523, + "step": 9426 + }, + { + "epoch": 1.41, + "grad_norm": 0.9465096852469961, + "learning_rate": 1.6869336170466541e-06, + "loss": 0.6712, + "step": 9427 + }, + { + "epoch": 1.41, + "grad_norm": 6.918348642576582, + "learning_rate": 1.6868634069237273e-06, + "loss": 0.7044, + "step": 9428 + }, + { + "epoch": 1.41, + "grad_norm": 3.1941449585144936, + "learning_rate": 1.6867931903902433e-06, + "loss": 0.6953, + "step": 9429 + }, + { + "epoch": 1.41, + "grad_norm": 1.1340850892730312, + "learning_rate": 1.6867229674468582e-06, + "loss": 0.6927, + "step": 9430 + }, + { + "epoch": 1.41, + "grad_norm": 3.4546227107958103, + "learning_rate": 1.6866527380942267e-06, + "loss": 0.6543, + "step": 9431 + }, + { + "epoch": 1.41, + "grad_norm": 1.4511453730651218, + "learning_rate": 1.686582502333004e-06, + "loss": 0.6719, + "step": 9432 + }, + { + "epoch": 1.41, + "grad_norm": 4.161099628772299, + "learning_rate": 1.6865122601638463e-06, + "loss": 0.6953, + "step": 9433 + }, + { + "epoch": 1.41, + "grad_norm": 1.5199968843923886, + "learning_rate": 1.6864420115874094e-06, + "loss": 0.6478, + "step": 9434 + }, + { + "epoch": 1.41, + "grad_norm": 2.269144450952579, + "learning_rate": 1.6863717566043484e-06, + "loss": 0.6719, + "step": 9435 + }, + { + "epoch": 1.41, + "grad_norm": 2.0550287767882534, + "learning_rate": 1.6863014952153188e-06, + "loss": 0.6999, + "step": 9436 + }, + { + "epoch": 1.41, + "grad_norm": 4.784207435200631, + "learning_rate": 1.686231227420977e-06, + "loss": 0.668, + "step": 9437 + }, + { + "epoch": 1.41, + "grad_norm": 2.820037458513027, + "learning_rate": 1.686160953221978e-06, + "loss": 0.6992, + "step": 9438 + }, + { + "epoch": 1.41, + "grad_norm": 0.9485436552014328, + "learning_rate": 1.6860906726189783e-06, + "loss": 0.6784, + "step": 9439 + }, + { + "epoch": 1.41, + "grad_norm": 6.3003058839431505, + "learning_rate": 1.686020385612634e-06, + "loss": 0.7109, + "step": 9440 + }, + { + "epoch": 1.41, + "grad_norm": 0.9216149090716865, + "learning_rate": 1.6859500922036003e-06, + "loss": 0.6673, + "step": 9441 + }, + { + "epoch": 1.41, + "grad_norm": 2.0455685168900786, + "learning_rate": 1.6858797923925341e-06, + "loss": 0.6803, + "step": 9442 + }, + { + "epoch": 1.41, + "grad_norm": 3.1439178418596687, + "learning_rate": 1.6858094861800912e-06, + "loss": 0.6595, + "step": 9443 + }, + { + "epoch": 1.41, + "grad_norm": 4.24636718381097, + "learning_rate": 1.6857391735669273e-06, + "loss": 0.6855, + "step": 9444 + }, + { + "epoch": 1.41, + "grad_norm": 2.0471175078297352, + "learning_rate": 1.6856688545536992e-06, + "loss": 0.6803, + "step": 9445 + }, + { + "epoch": 1.41, + "grad_norm": 5.081322515106662, + "learning_rate": 1.685598529141063e-06, + "loss": 0.6712, + "step": 9446 + }, + { + "epoch": 1.41, + "grad_norm": 1.372979690987149, + "learning_rate": 1.6855281973296753e-06, + "loss": 0.6699, + "step": 9447 + }, + { + "epoch": 1.41, + "grad_norm": 1.1197689475692203, + "learning_rate": 1.6854578591201922e-06, + "loss": 0.6693, + "step": 9448 + }, + { + "epoch": 1.41, + "grad_norm": 1.0738605304826307, + "learning_rate": 1.6853875145132706e-06, + "loss": 0.6895, + "step": 9449 + }, + { + "epoch": 1.41, + "grad_norm": 3.6163463130067712, + "learning_rate": 1.6853171635095662e-06, + "loss": 0.681, + "step": 9450 + }, + { + "epoch": 1.41, + "grad_norm": 2.5948615146172496, + "learning_rate": 1.6852468061097364e-06, + "loss": 0.6647, + "step": 9451 + }, + { + "epoch": 1.41, + "grad_norm": 3.2132380597939316, + "learning_rate": 1.6851764423144375e-06, + "loss": 0.6862, + "step": 9452 + }, + { + "epoch": 1.41, + "grad_norm": 0.874119342516642, + "learning_rate": 1.6851060721243262e-06, + "loss": 0.6569, + "step": 9453 + }, + { + "epoch": 1.41, + "grad_norm": 4.234991554377252, + "learning_rate": 1.6850356955400593e-06, + "loss": 0.668, + "step": 9454 + }, + { + "epoch": 1.41, + "grad_norm": 1.1654921024287894, + "learning_rate": 1.6849653125622938e-06, + "loss": 0.679, + "step": 9455 + }, + { + "epoch": 1.41, + "grad_norm": 3.5876076225891507, + "learning_rate": 1.6848949231916864e-06, + "loss": 0.6908, + "step": 9456 + }, + { + "epoch": 1.41, + "grad_norm": 0.7171485961424795, + "learning_rate": 1.6848245274288942e-06, + "loss": 0.6777, + "step": 9457 + }, + { + "epoch": 1.41, + "grad_norm": 0.7386109512663415, + "learning_rate": 1.684754125274574e-06, + "loss": 0.6615, + "step": 9458 + }, + { + "epoch": 1.41, + "grad_norm": 3.8154856911262427, + "learning_rate": 1.684683716729383e-06, + "loss": 0.6641, + "step": 9459 + }, + { + "epoch": 1.41, + "grad_norm": 0.8159785496003683, + "learning_rate": 1.6846133017939788e-06, + "loss": 0.6686, + "step": 9460 + }, + { + "epoch": 1.41, + "grad_norm": 1.3839417750423542, + "learning_rate": 1.6845428804690176e-06, + "loss": 0.6621, + "step": 9461 + }, + { + "epoch": 1.41, + "grad_norm": 1.0558139623113854, + "learning_rate": 1.6844724527551571e-06, + "loss": 0.6862, + "step": 9462 + }, + { + "epoch": 1.41, + "grad_norm": 3.842692750213089, + "learning_rate": 1.684402018653055e-06, + "loss": 0.6699, + "step": 9463 + }, + { + "epoch": 1.41, + "grad_norm": 1.573705511646552, + "learning_rate": 1.684331578163368e-06, + "loss": 0.6901, + "step": 9464 + }, + { + "epoch": 1.41, + "grad_norm": 0.7585211988995161, + "learning_rate": 1.6842611312867542e-06, + "loss": 0.6602, + "step": 9465 + }, + { + "epoch": 1.41, + "grad_norm": 4.7219635226735255, + "learning_rate": 1.6841906780238704e-06, + "loss": 0.666, + "step": 9466 + }, + { + "epoch": 1.41, + "grad_norm": 3.5152354726942137, + "learning_rate": 1.6841202183753747e-06, + "loss": 0.6686, + "step": 9467 + }, + { + "epoch": 1.41, + "grad_norm": 2.7825971387068322, + "learning_rate": 1.6840497523419243e-06, + "loss": 0.6771, + "step": 9468 + }, + { + "epoch": 1.41, + "grad_norm": 4.026658063191153, + "learning_rate": 1.6839792799241771e-06, + "loss": 0.6797, + "step": 9469 + }, + { + "epoch": 1.41, + "grad_norm": 0.8743055383823879, + "learning_rate": 1.6839088011227913e-06, + "loss": 0.6497, + "step": 9470 + }, + { + "epoch": 1.41, + "grad_norm": 2.285448521932554, + "learning_rate": 1.6838383159384236e-06, + "loss": 0.6634, + "step": 9471 + }, + { + "epoch": 1.41, + "grad_norm": 3.916469271916746, + "learning_rate": 1.6837678243717325e-06, + "loss": 0.6875, + "step": 9472 + }, + { + "epoch": 1.41, + "grad_norm": 1.1258012591044695, + "learning_rate": 1.6836973264233762e-06, + "loss": 0.6888, + "step": 9473 + }, + { + "epoch": 1.41, + "grad_norm": 2.55754232376877, + "learning_rate": 1.6836268220940117e-06, + "loss": 0.6374, + "step": 9474 + }, + { + "epoch": 1.41, + "grad_norm": 1.0140192122446967, + "learning_rate": 1.683556311384298e-06, + "loss": 0.6699, + "step": 9475 + }, + { + "epoch": 1.41, + "grad_norm": 1.1256606316162507, + "learning_rate": 1.6834857942948928e-06, + "loss": 0.668, + "step": 9476 + }, + { + "epoch": 1.41, + "grad_norm": 0.9802641930659526, + "learning_rate": 1.6834152708264542e-06, + "loss": 0.7025, + "step": 9477 + }, + { + "epoch": 1.41, + "grad_norm": 2.4445690619439913, + "learning_rate": 1.6833447409796403e-06, + "loss": 0.7174, + "step": 9478 + }, + { + "epoch": 1.41, + "grad_norm": 6.025900856740322, + "learning_rate": 1.6832742047551096e-06, + "loss": 0.6712, + "step": 9479 + }, + { + "epoch": 1.41, + "grad_norm": 3.563850114418661, + "learning_rate": 1.6832036621535207e-06, + "loss": 0.6673, + "step": 9480 + }, + { + "epoch": 1.41, + "grad_norm": 2.5304133806400975, + "learning_rate": 1.683133113175531e-06, + "loss": 0.6966, + "step": 9481 + }, + { + "epoch": 1.41, + "grad_norm": 0.9375636910255628, + "learning_rate": 1.6830625578217999e-06, + "loss": 0.6895, + "step": 9482 + }, + { + "epoch": 1.41, + "grad_norm": 1.7744897329136857, + "learning_rate": 1.6829919960929852e-06, + "loss": 0.6725, + "step": 9483 + }, + { + "epoch": 1.41, + "grad_norm": 2.710363822398303, + "learning_rate": 1.682921427989746e-06, + "loss": 0.638, + "step": 9484 + }, + { + "epoch": 1.41, + "grad_norm": 0.8190450684233177, + "learning_rate": 1.6828508535127407e-06, + "loss": 0.6901, + "step": 9485 + }, + { + "epoch": 1.41, + "grad_norm": 1.2355217774363692, + "learning_rate": 1.6827802726626284e-06, + "loss": 0.6934, + "step": 9486 + }, + { + "epoch": 1.41, + "grad_norm": 2.2804217090171095, + "learning_rate": 1.682709685440067e-06, + "loss": 0.6842, + "step": 9487 + }, + { + "epoch": 1.42, + "grad_norm": 1.1774272376141146, + "learning_rate": 1.6826390918457156e-06, + "loss": 0.6725, + "step": 9488 + }, + { + "epoch": 1.42, + "grad_norm": 0.7968430968198743, + "learning_rate": 1.6825684918802334e-06, + "loss": 0.6517, + "step": 9489 + }, + { + "epoch": 1.42, + "grad_norm": 1.727738582364127, + "learning_rate": 1.6824978855442793e-06, + "loss": 0.6738, + "step": 9490 + }, + { + "epoch": 1.42, + "grad_norm": 4.510988702282659, + "learning_rate": 1.6824272728385118e-06, + "loss": 0.7064, + "step": 9491 + }, + { + "epoch": 1.42, + "grad_norm": 0.7338102177417876, + "learning_rate": 1.6823566537635902e-06, + "loss": 0.6803, + "step": 9492 + }, + { + "epoch": 1.42, + "grad_norm": 0.7890686248308613, + "learning_rate": 1.6822860283201737e-06, + "loss": 0.679, + "step": 9493 + }, + { + "epoch": 1.42, + "grad_norm": 2.5444183716030784, + "learning_rate": 1.6822153965089214e-06, + "loss": 0.668, + "step": 9494 + }, + { + "epoch": 1.42, + "grad_norm": 0.8905975786057961, + "learning_rate": 1.6821447583304922e-06, + "loss": 0.679, + "step": 9495 + }, + { + "epoch": 1.42, + "grad_norm": 6.005566232584522, + "learning_rate": 1.6820741137855462e-06, + "loss": 0.6875, + "step": 9496 + }, + { + "epoch": 1.42, + "grad_norm": 4.151636785115559, + "learning_rate": 1.6820034628747417e-06, + "loss": 0.6608, + "step": 9497 + }, + { + "epoch": 1.42, + "grad_norm": 3.908389906764061, + "learning_rate": 1.6819328055987386e-06, + "loss": 0.6868, + "step": 9498 + }, + { + "epoch": 1.42, + "grad_norm": 2.0816483072461582, + "learning_rate": 1.6818621419581964e-06, + "loss": 0.6628, + "step": 9499 + }, + { + "epoch": 1.42, + "grad_norm": 0.7860002157910276, + "learning_rate": 1.6817914719537748e-06, + "loss": 0.6908, + "step": 9500 + }, + { + "epoch": 1.42, + "grad_norm": 3.1403342576183664, + "learning_rate": 1.6817207955861327e-06, + "loss": 0.6693, + "step": 9501 + }, + { + "epoch": 1.42, + "grad_norm": 1.4538150051969574, + "learning_rate": 1.6816501128559302e-06, + "loss": 0.6823, + "step": 9502 + }, + { + "epoch": 1.42, + "grad_norm": 1.7007248967961794, + "learning_rate": 1.6815794237638271e-06, + "loss": 0.6816, + "step": 9503 + }, + { + "epoch": 1.42, + "grad_norm": 1.368701576403718, + "learning_rate": 1.681508728310483e-06, + "loss": 0.6621, + "step": 9504 + }, + { + "epoch": 1.42, + "grad_norm": 3.31489975735538, + "learning_rate": 1.6814380264965574e-06, + "loss": 0.6986, + "step": 9505 + }, + { + "epoch": 1.42, + "grad_norm": 0.8937389275453913, + "learning_rate": 1.6813673183227106e-06, + "loss": 0.6751, + "step": 9506 + }, + { + "epoch": 1.42, + "grad_norm": 0.7520955674839375, + "learning_rate": 1.6812966037896024e-06, + "loss": 0.6589, + "step": 9507 + }, + { + "epoch": 1.42, + "grad_norm": 0.6896347064535673, + "learning_rate": 1.6812258828978924e-06, + "loss": 0.6816, + "step": 9508 + }, + { + "epoch": 1.42, + "grad_norm": 5.676676351526415, + "learning_rate": 1.6811551556482413e-06, + "loss": 0.6803, + "step": 9509 + }, + { + "epoch": 1.42, + "grad_norm": 2.148687662317602, + "learning_rate": 1.6810844220413086e-06, + "loss": 0.6523, + "step": 9510 + }, + { + "epoch": 1.42, + "grad_norm": 0.9295917333969211, + "learning_rate": 1.6810136820777552e-06, + "loss": 0.7038, + "step": 9511 + }, + { + "epoch": 1.42, + "grad_norm": 6.070397914437808, + "learning_rate": 1.6809429357582405e-06, + "loss": 0.6979, + "step": 9512 + }, + { + "epoch": 1.42, + "grad_norm": 1.1205715248378458, + "learning_rate": 1.6808721830834256e-06, + "loss": 0.6602, + "step": 9513 + }, + { + "epoch": 1.42, + "grad_norm": 1.7143439998145027, + "learning_rate": 1.6808014240539701e-06, + "loss": 0.6725, + "step": 9514 + }, + { + "epoch": 1.42, + "grad_norm": 1.3728416065521893, + "learning_rate": 1.6807306586705347e-06, + "loss": 0.6576, + "step": 9515 + }, + { + "epoch": 1.42, + "grad_norm": 5.826664803912482, + "learning_rate": 1.68065988693378e-06, + "loss": 0.666, + "step": 9516 + }, + { + "epoch": 1.42, + "grad_norm": 3.2549173782513803, + "learning_rate": 1.6805891088443662e-06, + "loss": 0.6823, + "step": 9517 + }, + { + "epoch": 1.42, + "grad_norm": 0.8270671202152395, + "learning_rate": 1.6805183244029541e-06, + "loss": 0.679, + "step": 9518 + }, + { + "epoch": 1.42, + "grad_norm": 1.071884995927598, + "learning_rate": 1.6804475336102044e-06, + "loss": 0.6419, + "step": 9519 + }, + { + "epoch": 1.42, + "grad_norm": 1.1921822155296558, + "learning_rate": 1.6803767364667778e-06, + "loss": 0.6595, + "step": 9520 + }, + { + "epoch": 1.42, + "grad_norm": 1.4268249283393655, + "learning_rate": 1.6803059329733346e-06, + "loss": 0.6497, + "step": 9521 + }, + { + "epoch": 1.42, + "grad_norm": 4.729064493976697, + "learning_rate": 1.680235123130536e-06, + "loss": 0.7038, + "step": 9522 + }, + { + "epoch": 1.42, + "grad_norm": 3.5042403478764372, + "learning_rate": 1.6801643069390435e-06, + "loss": 0.7044, + "step": 9523 + }, + { + "epoch": 1.42, + "grad_norm": 2.590225790011658, + "learning_rate": 1.6800934843995167e-06, + "loss": 0.6719, + "step": 9524 + }, + { + "epoch": 1.42, + "grad_norm": 1.2176712358619817, + "learning_rate": 1.6800226555126173e-06, + "loss": 0.6986, + "step": 9525 + }, + { + "epoch": 1.42, + "grad_norm": 3.6768008836468007, + "learning_rate": 1.6799518202790064e-06, + "loss": 0.6602, + "step": 9526 + }, + { + "epoch": 1.42, + "grad_norm": 1.2054494416072576, + "learning_rate": 1.6798809786993454e-06, + "loss": 0.6387, + "step": 9527 + }, + { + "epoch": 1.42, + "grad_norm": 2.39989150701562, + "learning_rate": 1.6798101307742947e-06, + "loss": 0.668, + "step": 9528 + }, + { + "epoch": 1.42, + "grad_norm": 0.8740192996012748, + "learning_rate": 1.6797392765045156e-06, + "loss": 0.6888, + "step": 9529 + }, + { + "epoch": 1.42, + "grad_norm": 0.6984164991486733, + "learning_rate": 1.6796684158906705e-06, + "loss": 0.6634, + "step": 9530 + }, + { + "epoch": 1.42, + "grad_norm": 7.0219652218048285, + "learning_rate": 1.6795975489334193e-06, + "loss": 0.6628, + "step": 9531 + }, + { + "epoch": 1.42, + "grad_norm": 3.45182528376254, + "learning_rate": 1.6795266756334244e-06, + "loss": 0.6816, + "step": 9532 + }, + { + "epoch": 1.42, + "grad_norm": 1.378099410512111, + "learning_rate": 1.6794557959913468e-06, + "loss": 0.6973, + "step": 9533 + }, + { + "epoch": 1.42, + "grad_norm": 1.456752570052077, + "learning_rate": 1.6793849100078485e-06, + "loss": 0.6895, + "step": 9534 + }, + { + "epoch": 1.42, + "grad_norm": 3.301773763546872, + "learning_rate": 1.6793140176835904e-06, + "loss": 0.6491, + "step": 9535 + }, + { + "epoch": 1.42, + "grad_norm": 0.8644499893119629, + "learning_rate": 1.6792431190192348e-06, + "loss": 0.6921, + "step": 9536 + }, + { + "epoch": 1.42, + "grad_norm": 0.9007932079597792, + "learning_rate": 1.679172214015443e-06, + "loss": 0.6725, + "step": 9537 + }, + { + "epoch": 1.42, + "grad_norm": 1.8064218117990218, + "learning_rate": 1.6791013026728766e-06, + "loss": 0.6895, + "step": 9538 + }, + { + "epoch": 1.42, + "grad_norm": 2.025246198336835, + "learning_rate": 1.6790303849921981e-06, + "loss": 0.666, + "step": 9539 + }, + { + "epoch": 1.42, + "grad_norm": 1.0923385463632698, + "learning_rate": 1.6789594609740687e-06, + "loss": 0.653, + "step": 9540 + }, + { + "epoch": 1.42, + "grad_norm": 4.723089034627031, + "learning_rate": 1.6788885306191504e-06, + "loss": 0.6868, + "step": 9541 + }, + { + "epoch": 1.42, + "grad_norm": 1.3609184016493776, + "learning_rate": 1.6788175939281058e-06, + "loss": 0.6712, + "step": 9542 + }, + { + "epoch": 1.42, + "grad_norm": 5.694594441518031, + "learning_rate": 1.6787466509015964e-06, + "loss": 0.7201, + "step": 9543 + }, + { + "epoch": 1.42, + "grad_norm": 1.7850621354176779, + "learning_rate": 1.6786757015402846e-06, + "loss": 0.6693, + "step": 9544 + }, + { + "epoch": 1.42, + "grad_norm": 2.5107322594001444, + "learning_rate": 1.6786047458448324e-06, + "loss": 0.6589, + "step": 9545 + }, + { + "epoch": 1.42, + "grad_norm": 4.623678888888987, + "learning_rate": 1.678533783815902e-06, + "loss": 0.6842, + "step": 9546 + }, + { + "epoch": 1.42, + "grad_norm": 3.7427515282302597, + "learning_rate": 1.6784628154541558e-06, + "loss": 0.6699, + "step": 9547 + }, + { + "epoch": 1.42, + "grad_norm": 2.3715119829295537, + "learning_rate": 1.6783918407602563e-06, + "loss": 0.6419, + "step": 9548 + }, + { + "epoch": 1.42, + "grad_norm": 1.0744454267967012, + "learning_rate": 1.6783208597348657e-06, + "loss": 0.6829, + "step": 9549 + }, + { + "epoch": 1.42, + "grad_norm": 0.7690346538082838, + "learning_rate": 1.6782498723786463e-06, + "loss": 0.668, + "step": 9550 + }, + { + "epoch": 1.42, + "grad_norm": 1.6091673257773724, + "learning_rate": 1.678178878692261e-06, + "loss": 0.6582, + "step": 9551 + }, + { + "epoch": 1.42, + "grad_norm": 1.6273066237536522, + "learning_rate": 1.6781078786763722e-06, + "loss": 0.6855, + "step": 9552 + }, + { + "epoch": 1.42, + "grad_norm": 2.8373166564547065, + "learning_rate": 1.6780368723316428e-06, + "loss": 0.6543, + "step": 9553 + }, + { + "epoch": 1.42, + "grad_norm": 1.0507414529695513, + "learning_rate": 1.6779658596587352e-06, + "loss": 0.6732, + "step": 9554 + }, + { + "epoch": 1.43, + "grad_norm": 0.8343834746425348, + "learning_rate": 1.6778948406583125e-06, + "loss": 0.6628, + "step": 9555 + }, + { + "epoch": 1.43, + "grad_norm": 1.8935135885596721, + "learning_rate": 1.677823815331037e-06, + "loss": 0.6374, + "step": 9556 + }, + { + "epoch": 1.43, + "grad_norm": 4.48577661596623, + "learning_rate": 1.677752783677572e-06, + "loss": 0.6602, + "step": 9557 + }, + { + "epoch": 1.43, + "grad_norm": 2.6501129204688043, + "learning_rate": 1.6776817456985803e-06, + "loss": 0.6777, + "step": 9558 + }, + { + "epoch": 1.43, + "grad_norm": 1.7743504590288344, + "learning_rate": 1.677610701394725e-06, + "loss": 0.6686, + "step": 9559 + }, + { + "epoch": 1.43, + "grad_norm": 3.9084133981082347, + "learning_rate": 1.6775396507666689e-06, + "loss": 0.6914, + "step": 9560 + }, + { + "epoch": 1.43, + "grad_norm": 0.9272438050433522, + "learning_rate": 1.6774685938150754e-06, + "loss": 0.6667, + "step": 9561 + }, + { + "epoch": 1.43, + "grad_norm": 4.633723655984862, + "learning_rate": 1.677397530540608e-06, + "loss": 0.6908, + "step": 9562 + }, + { + "epoch": 1.43, + "grad_norm": 2.9835246389006103, + "learning_rate": 1.6773264609439287e-06, + "loss": 0.6589, + "step": 9563 + }, + { + "epoch": 1.43, + "grad_norm": 1.611949815451516, + "learning_rate": 1.6772553850257022e-06, + "loss": 0.6393, + "step": 9564 + }, + { + "epoch": 1.43, + "grad_norm": 1.7884290154364095, + "learning_rate": 1.6771843027865912e-06, + "loss": 0.6999, + "step": 9565 + }, + { + "epoch": 1.43, + "grad_norm": 3.3029032388658894, + "learning_rate": 1.6771132142272592e-06, + "loss": 0.6595, + "step": 9566 + }, + { + "epoch": 1.43, + "grad_norm": 1.6797538121690359, + "learning_rate": 1.6770421193483698e-06, + "loss": 0.6934, + "step": 9567 + }, + { + "epoch": 1.43, + "grad_norm": 3.482809965198993, + "learning_rate": 1.6769710181505863e-06, + "loss": 0.6934, + "step": 9568 + }, + { + "epoch": 1.43, + "grad_norm": 3.318838690733213, + "learning_rate": 1.6768999106345726e-06, + "loss": 0.6634, + "step": 9569 + }, + { + "epoch": 1.43, + "grad_norm": 0.9876597078985407, + "learning_rate": 1.6768287968009923e-06, + "loss": 0.6491, + "step": 9570 + }, + { + "epoch": 1.43, + "grad_norm": 4.705540263104218, + "learning_rate": 1.6767576766505086e-06, + "loss": 0.6823, + "step": 9571 + }, + { + "epoch": 1.43, + "grad_norm": 2.998442153282063, + "learning_rate": 1.6766865501837857e-06, + "loss": 0.6582, + "step": 9572 + }, + { + "epoch": 1.43, + "grad_norm": 1.413893095578076, + "learning_rate": 1.6766154174014876e-06, + "loss": 0.6771, + "step": 9573 + }, + { + "epoch": 1.43, + "grad_norm": 1.6060252687346341, + "learning_rate": 1.6765442783042778e-06, + "loss": 0.6589, + "step": 9574 + }, + { + "epoch": 1.43, + "grad_norm": 1.5287492618693583, + "learning_rate": 1.6764731328928203e-06, + "loss": 0.6751, + "step": 9575 + }, + { + "epoch": 1.43, + "grad_norm": 2.3538618683915717, + "learning_rate": 1.6764019811677797e-06, + "loss": 0.6595, + "step": 9576 + }, + { + "epoch": 1.43, + "grad_norm": 1.0648060792179486, + "learning_rate": 1.676330823129819e-06, + "loss": 0.6543, + "step": 9577 + }, + { + "epoch": 1.43, + "grad_norm": 5.688173476836239, + "learning_rate": 1.6762596587796031e-06, + "loss": 0.6576, + "step": 9578 + }, + { + "epoch": 1.43, + "grad_norm": 3.362489724366235, + "learning_rate": 1.6761884881177963e-06, + "loss": 0.6875, + "step": 9579 + }, + { + "epoch": 1.43, + "grad_norm": 1.2523422281552372, + "learning_rate": 1.6761173111450624e-06, + "loss": 0.679, + "step": 9580 + }, + { + "epoch": 1.43, + "grad_norm": 5.86204480888129, + "learning_rate": 1.6760461278620657e-06, + "loss": 0.6673, + "step": 9581 + }, + { + "epoch": 1.43, + "grad_norm": 1.1242457915406445, + "learning_rate": 1.6759749382694707e-06, + "loss": 0.694, + "step": 9582 + }, + { + "epoch": 1.43, + "grad_norm": 0.8028932646019833, + "learning_rate": 1.6759037423679422e-06, + "loss": 0.6927, + "step": 9583 + }, + { + "epoch": 1.43, + "grad_norm": 4.285348736083158, + "learning_rate": 1.6758325401581436e-06, + "loss": 0.666, + "step": 9584 + }, + { + "epoch": 1.43, + "grad_norm": 1.0072473117865912, + "learning_rate": 1.6757613316407405e-06, + "loss": 0.6875, + "step": 9585 + }, + { + "epoch": 1.43, + "grad_norm": 0.864085001826599, + "learning_rate": 1.6756901168163975e-06, + "loss": 0.6712, + "step": 9586 + }, + { + "epoch": 1.43, + "grad_norm": 2.34249932812976, + "learning_rate": 1.6756188956857783e-06, + "loss": 0.6849, + "step": 9587 + }, + { + "epoch": 1.43, + "grad_norm": 2.9789253460393126, + "learning_rate": 1.6755476682495486e-06, + "loss": 0.6862, + "step": 9588 + }, + { + "epoch": 1.43, + "grad_norm": 0.8578171814383369, + "learning_rate": 1.6754764345083728e-06, + "loss": 0.679, + "step": 9589 + }, + { + "epoch": 1.43, + "grad_norm": 2.1859304537147928, + "learning_rate": 1.6754051944629154e-06, + "loss": 0.6712, + "step": 9590 + }, + { + "epoch": 1.43, + "grad_norm": 0.9135208047183097, + "learning_rate": 1.6753339481138418e-06, + "loss": 0.6439, + "step": 9591 + }, + { + "epoch": 1.43, + "grad_norm": 1.291172598780422, + "learning_rate": 1.6752626954618165e-06, + "loss": 0.6673, + "step": 9592 + }, + { + "epoch": 1.43, + "grad_norm": 2.280402110755857, + "learning_rate": 1.675191436507505e-06, + "loss": 0.709, + "step": 9593 + }, + { + "epoch": 1.43, + "grad_norm": 0.7131740886402314, + "learning_rate": 1.675120171251572e-06, + "loss": 0.6693, + "step": 9594 + }, + { + "epoch": 1.43, + "grad_norm": 0.7351501419976892, + "learning_rate": 1.6750488996946825e-06, + "loss": 0.6777, + "step": 9595 + }, + { + "epoch": 1.43, + "grad_norm": 0.8318229076332505, + "learning_rate": 1.6749776218375024e-06, + "loss": 0.6595, + "step": 9596 + }, + { + "epoch": 1.43, + "grad_norm": 0.6961597984382141, + "learning_rate": 1.6749063376806963e-06, + "loss": 0.6966, + "step": 9597 + }, + { + "epoch": 1.43, + "grad_norm": 1.8282434695401781, + "learning_rate": 1.6748350472249296e-06, + "loss": 0.6862, + "step": 9598 + }, + { + "epoch": 1.43, + "grad_norm": 0.6735552813162105, + "learning_rate": 1.6747637504708676e-06, + "loss": 0.681, + "step": 9599 + }, + { + "epoch": 1.43, + "grad_norm": 6.764530279016239, + "learning_rate": 1.6746924474191761e-06, + "loss": 0.6641, + "step": 9600 + }, + { + "epoch": 1.43, + "grad_norm": 1.1189664337287515, + "learning_rate": 1.67462113807052e-06, + "loss": 0.6738, + "step": 9601 + }, + { + "epoch": 1.43, + "grad_norm": 1.2194363658483989, + "learning_rate": 1.6745498224255656e-06, + "loss": 0.6816, + "step": 9602 + }, + { + "epoch": 1.43, + "grad_norm": 1.2920823120334126, + "learning_rate": 1.6744785004849778e-06, + "loss": 0.679, + "step": 9603 + }, + { + "epoch": 1.43, + "grad_norm": 3.635593517775977, + "learning_rate": 1.6744071722494222e-06, + "loss": 0.6803, + "step": 9604 + }, + { + "epoch": 1.43, + "grad_norm": 1.4033331400555151, + "learning_rate": 1.6743358377195654e-06, + "loss": 0.6615, + "step": 9605 + }, + { + "epoch": 1.43, + "grad_norm": 0.8232074387664571, + "learning_rate": 1.674264496896072e-06, + "loss": 0.6562, + "step": 9606 + }, + { + "epoch": 1.43, + "grad_norm": 5.24869857921295, + "learning_rate": 1.674193149779609e-06, + "loss": 0.6901, + "step": 9607 + }, + { + "epoch": 1.43, + "grad_norm": 2.1656225018438873, + "learning_rate": 1.6741217963708415e-06, + "loss": 0.679, + "step": 9608 + }, + { + "epoch": 1.43, + "grad_norm": 2.542307421521935, + "learning_rate": 1.6740504366704355e-06, + "loss": 0.6725, + "step": 9609 + }, + { + "epoch": 1.43, + "grad_norm": 1.6037936362184697, + "learning_rate": 1.6739790706790573e-06, + "loss": 0.6797, + "step": 9610 + }, + { + "epoch": 1.43, + "grad_norm": 1.8995054730865857, + "learning_rate": 1.6739076983973727e-06, + "loss": 0.6667, + "step": 9611 + }, + { + "epoch": 1.43, + "grad_norm": 0.9390299011212142, + "learning_rate": 1.6738363198260482e-06, + "loss": 0.6426, + "step": 9612 + }, + { + "epoch": 1.43, + "grad_norm": 4.236726693810282, + "learning_rate": 1.6737649349657495e-06, + "loss": 0.681, + "step": 9613 + }, + { + "epoch": 1.43, + "grad_norm": 0.8448031134491082, + "learning_rate": 1.6736935438171434e-06, + "loss": 0.6855, + "step": 9614 + }, + { + "epoch": 1.43, + "grad_norm": 1.5177577669101523, + "learning_rate": 1.6736221463808955e-06, + "loss": 0.6758, + "step": 9615 + }, + { + "epoch": 1.43, + "grad_norm": 5.238479662743005, + "learning_rate": 1.6735507426576728e-06, + "loss": 0.6875, + "step": 9616 + }, + { + "epoch": 1.43, + "grad_norm": 1.253235804237378, + "learning_rate": 1.6734793326481413e-06, + "loss": 0.6602, + "step": 9617 + }, + { + "epoch": 1.43, + "grad_norm": 2.9911227638338227, + "learning_rate": 1.6734079163529676e-06, + "loss": 0.6758, + "step": 9618 + }, + { + "epoch": 1.43, + "grad_norm": 1.8380047772462822, + "learning_rate": 1.6733364937728185e-06, + "loss": 0.6882, + "step": 9619 + }, + { + "epoch": 1.43, + "grad_norm": 4.6722788314606785, + "learning_rate": 1.6732650649083599e-06, + "loss": 0.6686, + "step": 9620 + }, + { + "epoch": 1.43, + "grad_norm": 3.9995105601917205, + "learning_rate": 1.6731936297602593e-06, + "loss": 0.6868, + "step": 9621 + }, + { + "epoch": 1.44, + "grad_norm": 2.197650016937239, + "learning_rate": 1.673122188329183e-06, + "loss": 0.6829, + "step": 9622 + }, + { + "epoch": 1.44, + "grad_norm": 1.34885559222513, + "learning_rate": 1.6730507406157976e-06, + "loss": 0.6758, + "step": 9623 + }, + { + "epoch": 1.44, + "grad_norm": 3.106855153171697, + "learning_rate": 1.6729792866207703e-06, + "loss": 0.6908, + "step": 9624 + }, + { + "epoch": 1.44, + "grad_norm": 3.0138300644384106, + "learning_rate": 1.6729078263447675e-06, + "loss": 0.6621, + "step": 9625 + }, + { + "epoch": 1.44, + "grad_norm": 2.1569891601378757, + "learning_rate": 1.672836359788457e-06, + "loss": 0.6628, + "step": 9626 + }, + { + "epoch": 1.44, + "grad_norm": 1.3199601596592592, + "learning_rate": 1.6727648869525046e-06, + "loss": 0.6986, + "step": 9627 + }, + { + "epoch": 1.44, + "grad_norm": 2.1064007874353297, + "learning_rate": 1.6726934078375786e-06, + "loss": 0.6497, + "step": 9628 + }, + { + "epoch": 1.44, + "grad_norm": 0.8419547809298458, + "learning_rate": 1.6726219224443452e-06, + "loss": 0.6497, + "step": 9629 + }, + { + "epoch": 1.44, + "grad_norm": 4.1751370569248705, + "learning_rate": 1.672550430773472e-06, + "loss": 0.6823, + "step": 9630 + }, + { + "epoch": 1.44, + "grad_norm": 2.5194894220311874, + "learning_rate": 1.672478932825626e-06, + "loss": 0.6361, + "step": 9631 + }, + { + "epoch": 1.44, + "grad_norm": 4.5417294987000405, + "learning_rate": 1.6724074286014748e-06, + "loss": 0.6562, + "step": 9632 + }, + { + "epoch": 1.44, + "grad_norm": 0.9743426347578505, + "learning_rate": 1.6723359181016856e-06, + "loss": 0.6452, + "step": 9633 + }, + { + "epoch": 1.44, + "grad_norm": 0.8832296641210833, + "learning_rate": 1.672264401326926e-06, + "loss": 0.6947, + "step": 9634 + }, + { + "epoch": 1.44, + "grad_norm": 0.8648227017232651, + "learning_rate": 1.672192878277863e-06, + "loss": 0.696, + "step": 9635 + }, + { + "epoch": 1.44, + "grad_norm": 1.3387160380603584, + "learning_rate": 1.6721213489551645e-06, + "loss": 0.6641, + "step": 9636 + }, + { + "epoch": 1.44, + "grad_norm": 3.1410197158347626, + "learning_rate": 1.6720498133594977e-06, + "loss": 0.6797, + "step": 9637 + }, + { + "epoch": 1.44, + "grad_norm": 0.8910753650028101, + "learning_rate": 1.6719782714915312e-06, + "loss": 0.6732, + "step": 9638 + }, + { + "epoch": 1.44, + "grad_norm": 1.3793697895166697, + "learning_rate": 1.671906723351932e-06, + "loss": 0.6719, + "step": 9639 + }, + { + "epoch": 1.44, + "grad_norm": 2.6085415332503556, + "learning_rate": 1.6718351689413676e-06, + "loss": 0.6621, + "step": 9640 + }, + { + "epoch": 1.44, + "grad_norm": 1.5305712384094143, + "learning_rate": 1.6717636082605062e-06, + "loss": 0.6634, + "step": 9641 + }, + { + "epoch": 1.44, + "grad_norm": 1.5750329521977326, + "learning_rate": 1.6716920413100156e-06, + "loss": 0.6738, + "step": 9642 + }, + { + "epoch": 1.44, + "grad_norm": 1.5051648999715816, + "learning_rate": 1.6716204680905638e-06, + "loss": 0.6771, + "step": 9643 + }, + { + "epoch": 1.44, + "grad_norm": 1.0030414537916683, + "learning_rate": 1.6715488886028188e-06, + "loss": 0.6979, + "step": 9644 + }, + { + "epoch": 1.44, + "grad_norm": 1.6811455042521513, + "learning_rate": 1.6714773028474487e-06, + "loss": 0.6719, + "step": 9645 + }, + { + "epoch": 1.44, + "grad_norm": 5.564446335632076, + "learning_rate": 1.6714057108251214e-06, + "loss": 0.6927, + "step": 9646 + }, + { + "epoch": 1.44, + "grad_norm": 1.2006433670350414, + "learning_rate": 1.6713341125365054e-06, + "loss": 0.651, + "step": 9647 + }, + { + "epoch": 1.44, + "grad_norm": 3.2188325001227733, + "learning_rate": 1.6712625079822686e-06, + "loss": 0.6387, + "step": 9648 + }, + { + "epoch": 1.44, + "grad_norm": 0.8511725164602723, + "learning_rate": 1.6711908971630798e-06, + "loss": 0.6758, + "step": 9649 + }, + { + "epoch": 1.44, + "grad_norm": 2.6221917510378905, + "learning_rate": 1.6711192800796065e-06, + "loss": 0.7025, + "step": 9650 + }, + { + "epoch": 1.44, + "grad_norm": 2.3503540489236965, + "learning_rate": 1.6710476567325177e-06, + "loss": 0.681, + "step": 9651 + }, + { + "epoch": 1.44, + "grad_norm": 0.9768459997001198, + "learning_rate": 1.6709760271224824e-06, + "loss": 0.6842, + "step": 9652 + }, + { + "epoch": 1.44, + "grad_norm": 0.7466352274070805, + "learning_rate": 1.670904391250168e-06, + "loss": 0.6706, + "step": 9653 + }, + { + "epoch": 1.44, + "grad_norm": 3.1187038399670226, + "learning_rate": 1.6708327491162435e-06, + "loss": 0.6628, + "step": 9654 + }, + { + "epoch": 1.44, + "grad_norm": 2.702639473272661, + "learning_rate": 1.6707611007213778e-06, + "loss": 0.6576, + "step": 9655 + }, + { + "epoch": 1.44, + "grad_norm": 1.3748746727861492, + "learning_rate": 1.6706894460662392e-06, + "loss": 0.6842, + "step": 9656 + }, + { + "epoch": 1.44, + "grad_norm": 1.4939498880561393, + "learning_rate": 1.6706177851514971e-06, + "loss": 0.6784, + "step": 9657 + }, + { + "epoch": 1.44, + "grad_norm": 6.581790806568183, + "learning_rate": 1.6705461179778195e-06, + "loss": 0.6751, + "step": 9658 + }, + { + "epoch": 1.44, + "grad_norm": 1.8399195827266945, + "learning_rate": 1.6704744445458759e-06, + "loss": 0.6745, + "step": 9659 + }, + { + "epoch": 1.44, + "grad_norm": 1.1846807233389074, + "learning_rate": 1.6704027648563348e-06, + "loss": 0.6771, + "step": 9660 + }, + { + "epoch": 1.44, + "grad_norm": 0.8724194163547382, + "learning_rate": 1.6703310789098656e-06, + "loss": 0.6784, + "step": 9661 + }, + { + "epoch": 1.44, + "grad_norm": 5.703113338853615, + "learning_rate": 1.6702593867071372e-06, + "loss": 0.6797, + "step": 9662 + }, + { + "epoch": 1.44, + "grad_norm": 2.014899046744529, + "learning_rate": 1.6701876882488186e-06, + "loss": 0.6771, + "step": 9663 + }, + { + "epoch": 1.44, + "grad_norm": 3.5310672770706124, + "learning_rate": 1.670115983535579e-06, + "loss": 0.6771, + "step": 9664 + }, + { + "epoch": 1.44, + "grad_norm": 2.430963628822894, + "learning_rate": 1.6700442725680876e-06, + "loss": 0.6562, + "step": 9665 + }, + { + "epoch": 1.44, + "grad_norm": 2.7877270897711375, + "learning_rate": 1.6699725553470141e-06, + "loss": 0.679, + "step": 9666 + }, + { + "epoch": 1.44, + "grad_norm": 4.321511239910974, + "learning_rate": 1.669900831873027e-06, + "loss": 0.6699, + "step": 9667 + }, + { + "epoch": 1.44, + "grad_norm": 1.4257416858208407, + "learning_rate": 1.6698291021467967e-06, + "loss": 0.6908, + "step": 9668 + }, + { + "epoch": 1.44, + "grad_norm": 2.2701348782697925, + "learning_rate": 1.6697573661689916e-06, + "loss": 0.694, + "step": 9669 + }, + { + "epoch": 1.44, + "grad_norm": 1.2134053399041311, + "learning_rate": 1.669685623940282e-06, + "loss": 0.6478, + "step": 9670 + }, + { + "epoch": 1.44, + "grad_norm": 1.0728262720315482, + "learning_rate": 1.6696138754613375e-06, + "loss": 0.7051, + "step": 9671 + }, + { + "epoch": 1.44, + "grad_norm": 2.7813672192025423, + "learning_rate": 1.6695421207328273e-06, + "loss": 0.668, + "step": 9672 + }, + { + "epoch": 1.44, + "grad_norm": 1.3095903628187533, + "learning_rate": 1.6694703597554213e-06, + "loss": 0.6816, + "step": 9673 + }, + { + "epoch": 1.44, + "grad_norm": 1.7277598376162095, + "learning_rate": 1.669398592529789e-06, + "loss": 0.6699, + "step": 9674 + }, + { + "epoch": 1.44, + "grad_norm": 1.3496002993294751, + "learning_rate": 1.6693268190566007e-06, + "loss": 0.6875, + "step": 9675 + }, + { + "epoch": 1.44, + "grad_norm": 6.072181522590317, + "learning_rate": 1.6692550393365259e-06, + "loss": 0.6855, + "step": 9676 + }, + { + "epoch": 1.44, + "grad_norm": 1.3630186541198854, + "learning_rate": 1.6691832533702349e-06, + "loss": 0.6914, + "step": 9677 + }, + { + "epoch": 1.44, + "grad_norm": 2.3958573958675493, + "learning_rate": 1.6691114611583969e-06, + "loss": 0.6706, + "step": 9678 + }, + { + "epoch": 1.44, + "grad_norm": 1.3720163486162358, + "learning_rate": 1.6690396627016828e-06, + "loss": 0.653, + "step": 9679 + }, + { + "epoch": 1.44, + "grad_norm": 3.999575453309043, + "learning_rate": 1.6689678580007622e-06, + "loss": 0.6562, + "step": 9680 + }, + { + "epoch": 1.44, + "grad_norm": 2.975215997286789, + "learning_rate": 1.6688960470563056e-06, + "loss": 0.696, + "step": 9681 + }, + { + "epoch": 1.44, + "grad_norm": 1.3277966540874255, + "learning_rate": 1.6688242298689829e-06, + "loss": 0.6699, + "step": 9682 + }, + { + "epoch": 1.44, + "grad_norm": 3.034929807189619, + "learning_rate": 1.6687524064394648e-06, + "loss": 0.6667, + "step": 9683 + }, + { + "epoch": 1.44, + "grad_norm": 1.6120732587569009, + "learning_rate": 1.668680576768421e-06, + "loss": 0.7096, + "step": 9684 + }, + { + "epoch": 1.44, + "grad_norm": 2.4762152495721406, + "learning_rate": 1.6686087408565224e-06, + "loss": 0.6667, + "step": 9685 + }, + { + "epoch": 1.44, + "grad_norm": 1.147645433677212, + "learning_rate": 1.6685368987044392e-06, + "loss": 0.6927, + "step": 9686 + }, + { + "epoch": 1.44, + "grad_norm": 0.822945191312777, + "learning_rate": 1.6684650503128421e-06, + "loss": 0.6667, + "step": 9687 + }, + { + "epoch": 1.44, + "grad_norm": 4.637141422157643, + "learning_rate": 1.6683931956824015e-06, + "loss": 0.6836, + "step": 9688 + }, + { + "epoch": 1.45, + "grad_norm": 1.8327189887284647, + "learning_rate": 1.668321334813788e-06, + "loss": 0.6517, + "step": 9689 + }, + { + "epoch": 1.45, + "grad_norm": 1.1655777979161333, + "learning_rate": 1.6682494677076727e-06, + "loss": 0.6628, + "step": 9690 + }, + { + "epoch": 1.45, + "grad_norm": 3.9745096422524133, + "learning_rate": 1.6681775943647258e-06, + "loss": 0.6725, + "step": 9691 + }, + { + "epoch": 1.45, + "grad_norm": 2.4839107895559898, + "learning_rate": 1.6681057147856183e-06, + "loss": 0.6758, + "step": 9692 + }, + { + "epoch": 1.45, + "grad_norm": 4.255368826700643, + "learning_rate": 1.668033828971021e-06, + "loss": 0.6602, + "step": 9693 + }, + { + "epoch": 1.45, + "grad_norm": 2.202548805042927, + "learning_rate": 1.667961936921605e-06, + "loss": 0.6641, + "step": 9694 + }, + { + "epoch": 1.45, + "grad_norm": 2.3362022676111964, + "learning_rate": 1.6678900386380414e-06, + "loss": 0.6615, + "step": 9695 + }, + { + "epoch": 1.45, + "grad_norm": 1.301692288876765, + "learning_rate": 1.6678181341210006e-06, + "loss": 0.6855, + "step": 9696 + }, + { + "epoch": 1.45, + "grad_norm": 1.587575565235512, + "learning_rate": 1.6677462233711544e-06, + "loss": 0.6484, + "step": 9697 + }, + { + "epoch": 1.45, + "grad_norm": 3.8330753834865057, + "learning_rate": 1.6676743063891736e-06, + "loss": 0.7142, + "step": 9698 + }, + { + "epoch": 1.45, + "grad_norm": 1.8806365219363756, + "learning_rate": 1.6676023831757293e-06, + "loss": 0.6816, + "step": 9699 + }, + { + "epoch": 1.45, + "grad_norm": 0.8732887130169137, + "learning_rate": 1.667530453731493e-06, + "loss": 0.668, + "step": 9700 + }, + { + "epoch": 1.45, + "grad_norm": 5.068346304168517, + "learning_rate": 1.6674585180571362e-06, + "loss": 0.6803, + "step": 9701 + }, + { + "epoch": 1.45, + "grad_norm": 1.126979944432927, + "learning_rate": 1.6673865761533296e-06, + "loss": 0.6693, + "step": 9702 + }, + { + "epoch": 1.45, + "grad_norm": 1.0921392012856914, + "learning_rate": 1.6673146280207453e-06, + "loss": 0.6543, + "step": 9703 + }, + { + "epoch": 1.45, + "grad_norm": 1.9359091954711463, + "learning_rate": 1.6672426736600542e-06, + "loss": 0.6699, + "step": 9704 + }, + { + "epoch": 1.45, + "grad_norm": 0.9518910079676866, + "learning_rate": 1.6671707130719288e-06, + "loss": 0.651, + "step": 9705 + }, + { + "epoch": 1.45, + "grad_norm": 1.2425547176816032, + "learning_rate": 1.6670987462570398e-06, + "loss": 0.6868, + "step": 9706 + }, + { + "epoch": 1.45, + "grad_norm": 1.247302545809432, + "learning_rate": 1.6670267732160592e-06, + "loss": 0.6797, + "step": 9707 + }, + { + "epoch": 1.45, + "grad_norm": 1.844535076921108, + "learning_rate": 1.666954793949659e-06, + "loss": 0.7103, + "step": 9708 + }, + { + "epoch": 1.45, + "grad_norm": 5.670003665248686, + "learning_rate": 1.6668828084585106e-06, + "loss": 0.6621, + "step": 9709 + }, + { + "epoch": 1.45, + "grad_norm": 1.1066171458201315, + "learning_rate": 1.6668108167432857e-06, + "loss": 0.6309, + "step": 9710 + }, + { + "epoch": 1.45, + "grad_norm": 6.103652096362747, + "learning_rate": 1.6667388188046566e-06, + "loss": 0.6849, + "step": 9711 + }, + { + "epoch": 1.45, + "grad_norm": 1.4395702790341298, + "learning_rate": 1.6666668146432952e-06, + "loss": 0.6992, + "step": 9712 + }, + { + "epoch": 1.45, + "grad_norm": 2.628926661943596, + "learning_rate": 1.6665948042598734e-06, + "loss": 0.7174, + "step": 9713 + }, + { + "epoch": 1.45, + "grad_norm": 3.7294803332650512, + "learning_rate": 1.6665227876550633e-06, + "loss": 0.679, + "step": 9714 + }, + { + "epoch": 1.45, + "grad_norm": 2.4831239927296727, + "learning_rate": 1.6664507648295374e-06, + "loss": 0.6589, + "step": 9715 + }, + { + "epoch": 1.45, + "grad_norm": 1.811136377416195, + "learning_rate": 1.6663787357839674e-06, + "loss": 0.6882, + "step": 9716 + }, + { + "epoch": 1.45, + "grad_norm": 1.070162829310352, + "learning_rate": 1.6663067005190254e-06, + "loss": 0.6589, + "step": 9717 + }, + { + "epoch": 1.45, + "grad_norm": 3.127900796891792, + "learning_rate": 1.6662346590353847e-06, + "loss": 0.6836, + "step": 9718 + }, + { + "epoch": 1.45, + "grad_norm": 4.028414382375511, + "learning_rate": 1.6661626113337164e-06, + "loss": 0.6699, + "step": 9719 + }, + { + "epoch": 1.45, + "grad_norm": 1.1186275730037307, + "learning_rate": 1.666090557414694e-06, + "loss": 0.6582, + "step": 9720 + }, + { + "epoch": 1.45, + "grad_norm": 0.8100493946962505, + "learning_rate": 1.6660184972789892e-06, + "loss": 0.6953, + "step": 9721 + }, + { + "epoch": 1.45, + "grad_norm": 2.508338191705204, + "learning_rate": 1.665946430927275e-06, + "loss": 0.6517, + "step": 9722 + }, + { + "epoch": 1.45, + "grad_norm": 2.923315843937856, + "learning_rate": 1.6658743583602238e-06, + "loss": 0.6751, + "step": 9723 + }, + { + "epoch": 1.45, + "grad_norm": 0.7655147113632967, + "learning_rate": 1.6658022795785086e-06, + "loss": 0.666, + "step": 9724 + }, + { + "epoch": 1.45, + "grad_norm": 2.9870452894289063, + "learning_rate": 1.6657301945828013e-06, + "loss": 0.6686, + "step": 9725 + }, + { + "epoch": 1.45, + "grad_norm": 1.872222606378239, + "learning_rate": 1.6656581033737755e-06, + "loss": 0.6471, + "step": 9726 + }, + { + "epoch": 1.45, + "grad_norm": 0.9418020036085006, + "learning_rate": 1.665586005952104e-06, + "loss": 0.7044, + "step": 9727 + }, + { + "epoch": 1.45, + "grad_norm": 2.2798697756572857, + "learning_rate": 1.6655139023184592e-06, + "loss": 0.668, + "step": 9728 + }, + { + "epoch": 1.45, + "grad_norm": 1.2683535029402613, + "learning_rate": 1.6654417924735142e-06, + "loss": 0.7129, + "step": 9729 + }, + { + "epoch": 1.45, + "grad_norm": 0.8225226793740442, + "learning_rate": 1.665369676417942e-06, + "loss": 0.6875, + "step": 9730 + }, + { + "epoch": 1.45, + "grad_norm": 1.4816016402818848, + "learning_rate": 1.6652975541524162e-06, + "loss": 0.6784, + "step": 9731 + }, + { + "epoch": 1.45, + "grad_norm": 2.2042561017312865, + "learning_rate": 1.6652254256776092e-06, + "loss": 0.6901, + "step": 9732 + }, + { + "epoch": 1.45, + "grad_norm": 1.6045966222998318, + "learning_rate": 1.6651532909941945e-06, + "loss": 0.651, + "step": 9733 + }, + { + "epoch": 1.45, + "grad_norm": 0.8934264942445005, + "learning_rate": 1.6650811501028454e-06, + "loss": 0.6784, + "step": 9734 + }, + { + "epoch": 1.45, + "grad_norm": 1.862258918482296, + "learning_rate": 1.6650090030042353e-06, + "loss": 0.6797, + "step": 9735 + }, + { + "epoch": 1.45, + "grad_norm": 1.632831610960232, + "learning_rate": 1.664936849699037e-06, + "loss": 0.6914, + "step": 9736 + }, + { + "epoch": 1.45, + "grad_norm": 5.022620421420958, + "learning_rate": 1.6648646901879242e-06, + "loss": 0.681, + "step": 9737 + }, + { + "epoch": 1.45, + "grad_norm": 0.9508298690090705, + "learning_rate": 1.6647925244715709e-06, + "loss": 0.6777, + "step": 9738 + }, + { + "epoch": 1.45, + "grad_norm": 1.630931686515698, + "learning_rate": 1.6647203525506498e-06, + "loss": 0.7005, + "step": 9739 + }, + { + "epoch": 1.45, + "grad_norm": 7.338958859621994, + "learning_rate": 1.6646481744258352e-06, + "loss": 0.7109, + "step": 9740 + }, + { + "epoch": 1.45, + "grad_norm": 4.445622919826445, + "learning_rate": 1.6645759900978002e-06, + "loss": 0.6979, + "step": 9741 + }, + { + "epoch": 1.45, + "grad_norm": 2.4545403913671824, + "learning_rate": 1.6645037995672186e-06, + "loss": 0.6673, + "step": 9742 + }, + { + "epoch": 1.45, + "grad_norm": 2.4950360031213865, + "learning_rate": 1.6644316028347645e-06, + "loss": 0.6823, + "step": 9743 + }, + { + "epoch": 1.45, + "grad_norm": 1.060917012928369, + "learning_rate": 1.6643593999011114e-06, + "loss": 0.6849, + "step": 9744 + }, + { + "epoch": 1.45, + "grad_norm": 2.3862232824384333, + "learning_rate": 1.664287190766933e-06, + "loss": 0.694, + "step": 9745 + }, + { + "epoch": 1.45, + "grad_norm": 0.8552087013477551, + "learning_rate": 1.664214975432904e-06, + "loss": 0.666, + "step": 9746 + }, + { + "epoch": 1.45, + "grad_norm": 2.49292427998881, + "learning_rate": 1.6641427538996977e-06, + "loss": 0.6582, + "step": 9747 + }, + { + "epoch": 1.45, + "grad_norm": 3.4886485421590443, + "learning_rate": 1.6640705261679883e-06, + "loss": 0.6764, + "step": 9748 + }, + { + "epoch": 1.45, + "grad_norm": 3.3668286106525467, + "learning_rate": 1.6639982922384497e-06, + "loss": 0.6829, + "step": 9749 + }, + { + "epoch": 1.45, + "grad_norm": 1.4584286440700827, + "learning_rate": 1.6639260521117566e-06, + "loss": 0.6751, + "step": 9750 + }, + { + "epoch": 1.45, + "grad_norm": 4.521497243643481, + "learning_rate": 1.6638538057885831e-06, + "loss": 0.6706, + "step": 9751 + }, + { + "epoch": 1.45, + "grad_norm": 3.092617250602266, + "learning_rate": 1.6637815532696031e-06, + "loss": 0.6868, + "step": 9752 + }, + { + "epoch": 1.45, + "grad_norm": 3.589137793935203, + "learning_rate": 1.663709294555491e-06, + "loss": 0.6816, + "step": 9753 + }, + { + "epoch": 1.45, + "grad_norm": 2.6540304088528517, + "learning_rate": 1.6636370296469218e-06, + "loss": 0.6738, + "step": 9754 + }, + { + "epoch": 1.45, + "grad_norm": 3.433939872536329, + "learning_rate": 1.6635647585445694e-06, + "loss": 0.6712, + "step": 9755 + }, + { + "epoch": 1.46, + "grad_norm": 2.4721720743051914, + "learning_rate": 1.6634924812491083e-06, + "loss": 0.6673, + "step": 9756 + }, + { + "epoch": 1.46, + "grad_norm": 1.9610845680779063, + "learning_rate": 1.663420197761213e-06, + "loss": 0.6328, + "step": 9757 + }, + { + "epoch": 1.46, + "grad_norm": 1.3426400746442064, + "learning_rate": 1.6633479080815586e-06, + "loss": 0.6569, + "step": 9758 + }, + { + "epoch": 1.46, + "grad_norm": 2.8442052693277415, + "learning_rate": 1.6632756122108194e-06, + "loss": 0.6465, + "step": 9759 + }, + { + "epoch": 1.46, + "grad_norm": 1.0111606275071159, + "learning_rate": 1.66320331014967e-06, + "loss": 0.6517, + "step": 9760 + }, + { + "epoch": 1.46, + "grad_norm": 2.900961578489697, + "learning_rate": 1.663131001898786e-06, + "loss": 0.6908, + "step": 9761 + }, + { + "epoch": 1.46, + "grad_norm": 1.1527698410462595, + "learning_rate": 1.6630586874588413e-06, + "loss": 0.6419, + "step": 9762 + }, + { + "epoch": 1.46, + "grad_norm": 5.595408651288162, + "learning_rate": 1.6629863668305112e-06, + "loss": 0.6725, + "step": 9763 + }, + { + "epoch": 1.46, + "grad_norm": 1.4917457039857178, + "learning_rate": 1.662914040014471e-06, + "loss": 0.6309, + "step": 9764 + }, + { + "epoch": 1.46, + "grad_norm": 3.7956494108541556, + "learning_rate": 1.6628417070113953e-06, + "loss": 0.6621, + "step": 9765 + }, + { + "epoch": 1.46, + "grad_norm": 1.4219412268472278, + "learning_rate": 1.6627693678219594e-06, + "loss": 0.6836, + "step": 9766 + }, + { + "epoch": 1.46, + "grad_norm": 1.6256713855011813, + "learning_rate": 1.6626970224468387e-06, + "loss": 0.7064, + "step": 9767 + }, + { + "epoch": 1.46, + "grad_norm": 2.4115057913548923, + "learning_rate": 1.6626246708867076e-06, + "loss": 0.6673, + "step": 9768 + }, + { + "epoch": 1.46, + "grad_norm": 3.4928122185175834, + "learning_rate": 1.6625523131422417e-06, + "loss": 0.694, + "step": 9769 + }, + { + "epoch": 1.46, + "grad_norm": 1.4832188413049459, + "learning_rate": 1.6624799492141168e-06, + "loss": 0.6927, + "step": 9770 + }, + { + "epoch": 1.46, + "grad_norm": 1.0898047812555751, + "learning_rate": 1.6624075791030081e-06, + "loss": 0.6738, + "step": 9771 + }, + { + "epoch": 1.46, + "grad_norm": 1.0935544220894187, + "learning_rate": 1.6623352028095904e-06, + "loss": 0.6484, + "step": 9772 + }, + { + "epoch": 1.46, + "grad_norm": 2.2414110521814226, + "learning_rate": 1.6622628203345399e-06, + "loss": 0.6986, + "step": 9773 + }, + { + "epoch": 1.46, + "grad_norm": 0.9436869089966068, + "learning_rate": 1.6621904316785323e-06, + "loss": 0.6699, + "step": 9774 + }, + { + "epoch": 1.46, + "grad_norm": 5.9764007524960485, + "learning_rate": 1.6621180368422423e-06, + "loss": 0.6986, + "step": 9775 + }, + { + "epoch": 1.46, + "grad_norm": 1.0421123288871688, + "learning_rate": 1.6620456358263463e-06, + "loss": 0.6361, + "step": 9776 + }, + { + "epoch": 1.46, + "grad_norm": 2.708008561247752, + "learning_rate": 1.66197322863152e-06, + "loss": 0.7038, + "step": 9777 + }, + { + "epoch": 1.46, + "grad_norm": 1.1160123159416082, + "learning_rate": 1.6619008152584387e-06, + "loss": 0.6673, + "step": 9778 + }, + { + "epoch": 1.46, + "grad_norm": 1.122894280507208, + "learning_rate": 1.6618283957077787e-06, + "loss": 0.6543, + "step": 9779 + }, + { + "epoch": 1.46, + "grad_norm": 1.2639825148663444, + "learning_rate": 1.661755969980216e-06, + "loss": 0.7142, + "step": 9780 + }, + { + "epoch": 1.46, + "grad_norm": 0.8968389193266446, + "learning_rate": 1.661683538076426e-06, + "loss": 0.6647, + "step": 9781 + }, + { + "epoch": 1.46, + "grad_norm": 1.4334992922354743, + "learning_rate": 1.6616110999970849e-06, + "loss": 0.6797, + "step": 9782 + }, + { + "epoch": 1.46, + "grad_norm": 1.8293062730660452, + "learning_rate": 1.6615386557428693e-06, + "loss": 0.6947, + "step": 9783 + }, + { + "epoch": 1.46, + "grad_norm": 4.4134118537157425, + "learning_rate": 1.6614662053144546e-06, + "loss": 0.6797, + "step": 9784 + }, + { + "epoch": 1.46, + "grad_norm": 2.9819117817149463, + "learning_rate": 1.6613937487125176e-06, + "loss": 0.6693, + "step": 9785 + }, + { + "epoch": 1.46, + "grad_norm": 2.687479821979684, + "learning_rate": 1.661321285937734e-06, + "loss": 0.6777, + "step": 9786 + }, + { + "epoch": 1.46, + "grad_norm": 1.549526723829375, + "learning_rate": 1.6612488169907803e-06, + "loss": 0.6751, + "step": 9787 + }, + { + "epoch": 1.46, + "grad_norm": 2.059026373136371, + "learning_rate": 1.6611763418723333e-06, + "loss": 0.6875, + "step": 9788 + }, + { + "epoch": 1.46, + "grad_norm": 3.0635332246506946, + "learning_rate": 1.6611038605830686e-06, + "loss": 0.6686, + "step": 9789 + }, + { + "epoch": 1.46, + "grad_norm": 1.0411742030185271, + "learning_rate": 1.6610313731236635e-06, + "loss": 0.6556, + "step": 9790 + }, + { + "epoch": 1.46, + "grad_norm": 2.108268372762885, + "learning_rate": 1.6609588794947937e-06, + "loss": 0.7155, + "step": 9791 + }, + { + "epoch": 1.46, + "grad_norm": 4.300516292962294, + "learning_rate": 1.6608863796971364e-06, + "loss": 0.681, + "step": 9792 + }, + { + "epoch": 1.46, + "grad_norm": 1.7469426722475678, + "learning_rate": 1.6608138737313681e-06, + "loss": 0.681, + "step": 9793 + }, + { + "epoch": 1.46, + "grad_norm": 1.7909849205170547, + "learning_rate": 1.6607413615981658e-06, + "loss": 0.6816, + "step": 9794 + }, + { + "epoch": 1.46, + "grad_norm": 0.9229890456113778, + "learning_rate": 1.6606688432982054e-06, + "loss": 0.6836, + "step": 9795 + }, + { + "epoch": 1.46, + "grad_norm": 2.8294300378244888, + "learning_rate": 1.6605963188321646e-06, + "loss": 0.6849, + "step": 9796 + }, + { + "epoch": 1.46, + "grad_norm": 1.2886518756758847, + "learning_rate": 1.6605237882007196e-06, + "loss": 0.651, + "step": 9797 + }, + { + "epoch": 1.46, + "grad_norm": 2.312948770034611, + "learning_rate": 1.6604512514045481e-06, + "loss": 0.6582, + "step": 9798 + }, + { + "epoch": 1.46, + "grad_norm": 0.8412145554445429, + "learning_rate": 1.6603787084443265e-06, + "loss": 0.6628, + "step": 9799 + }, + { + "epoch": 1.46, + "grad_norm": 4.76515660078146, + "learning_rate": 1.660306159320732e-06, + "loss": 0.6855, + "step": 9800 + }, + { + "epoch": 1.46, + "grad_norm": 5.721959848765309, + "learning_rate": 1.6602336040344418e-06, + "loss": 0.6836, + "step": 9801 + }, + { + "epoch": 1.46, + "grad_norm": 2.818785489983603, + "learning_rate": 1.6601610425861327e-06, + "loss": 0.6784, + "step": 9802 + }, + { + "epoch": 1.46, + "grad_norm": 1.1550146305719124, + "learning_rate": 1.6600884749764824e-06, + "loss": 0.681, + "step": 9803 + }, + { + "epoch": 1.46, + "grad_norm": 2.5428615451109757, + "learning_rate": 1.660015901206168e-06, + "loss": 0.6615, + "step": 9804 + }, + { + "epoch": 1.46, + "grad_norm": 1.935713691475513, + "learning_rate": 1.6599433212758669e-06, + "loss": 0.679, + "step": 9805 + }, + { + "epoch": 1.46, + "grad_norm": 3.2926517887057205, + "learning_rate": 1.6598707351862562e-06, + "loss": 0.6803, + "step": 9806 + }, + { + "epoch": 1.46, + "grad_norm": 1.056385892331831, + "learning_rate": 1.6597981429380136e-06, + "loss": 0.6947, + "step": 9807 + }, + { + "epoch": 1.46, + "grad_norm": 0.9991339031936356, + "learning_rate": 1.6597255445318167e-06, + "loss": 0.6868, + "step": 9808 + }, + { + "epoch": 1.46, + "grad_norm": 3.407317626178942, + "learning_rate": 1.659652939968343e-06, + "loss": 0.6556, + "step": 9809 + }, + { + "epoch": 1.46, + "grad_norm": 3.272493147860428, + "learning_rate": 1.6595803292482699e-06, + "loss": 0.6712, + "step": 9810 + }, + { + "epoch": 1.46, + "grad_norm": 1.7958060109711418, + "learning_rate": 1.6595077123722754e-06, + "loss": 0.6758, + "step": 9811 + }, + { + "epoch": 1.46, + "grad_norm": 0.7251922036259588, + "learning_rate": 1.6594350893410371e-06, + "loss": 0.6751, + "step": 9812 + }, + { + "epoch": 1.46, + "grad_norm": 2.009177822464526, + "learning_rate": 1.6593624601552327e-06, + "loss": 0.7077, + "step": 9813 + }, + { + "epoch": 1.46, + "grad_norm": 6.232954564613191, + "learning_rate": 1.6592898248155406e-06, + "loss": 0.6901, + "step": 9814 + }, + { + "epoch": 1.46, + "grad_norm": 5.773028733985202, + "learning_rate": 1.6592171833226375e-06, + "loss": 0.6693, + "step": 9815 + }, + { + "epoch": 1.46, + "grad_norm": 2.6372812385090216, + "learning_rate": 1.6591445356772024e-06, + "loss": 0.6803, + "step": 9816 + }, + { + "epoch": 1.46, + "grad_norm": 2.4128250382131875, + "learning_rate": 1.6590718818799134e-06, + "loss": 0.6745, + "step": 9817 + }, + { + "epoch": 1.46, + "grad_norm": 3.6094847791902662, + "learning_rate": 1.658999221931448e-06, + "loss": 0.6654, + "step": 9818 + }, + { + "epoch": 1.46, + "grad_norm": 1.1863051559330382, + "learning_rate": 1.6589265558324845e-06, + "loss": 0.625, + "step": 9819 + }, + { + "epoch": 1.46, + "grad_norm": 1.6887388272900787, + "learning_rate": 1.6588538835837014e-06, + "loss": 0.679, + "step": 9820 + }, + { + "epoch": 1.46, + "grad_norm": 2.0280499180670137, + "learning_rate": 1.6587812051857766e-06, + "loss": 0.6745, + "step": 9821 + }, + { + "epoch": 1.46, + "grad_norm": 1.6340914339715475, + "learning_rate": 1.6587085206393882e-06, + "loss": 0.6803, + "step": 9822 + }, + { + "epoch": 1.47, + "grad_norm": 0.8572458477567645, + "learning_rate": 1.6586358299452154e-06, + "loss": 0.6673, + "step": 9823 + }, + { + "epoch": 1.47, + "grad_norm": 1.9431014724381395, + "learning_rate": 1.6585631331039363e-06, + "loss": 0.6888, + "step": 9824 + }, + { + "epoch": 1.47, + "grad_norm": 4.097119985369713, + "learning_rate": 1.658490430116229e-06, + "loss": 0.668, + "step": 9825 + }, + { + "epoch": 1.47, + "grad_norm": 1.6467018113645682, + "learning_rate": 1.658417720982772e-06, + "loss": 0.7031, + "step": 9826 + }, + { + "epoch": 1.47, + "grad_norm": 4.318628537004638, + "learning_rate": 1.6583450057042443e-06, + "loss": 0.7064, + "step": 9827 + }, + { + "epoch": 1.47, + "grad_norm": 1.0230295806026461, + "learning_rate": 1.6582722842813247e-06, + "loss": 0.6927, + "step": 9828 + }, + { + "epoch": 1.47, + "grad_norm": 1.5652656597226042, + "learning_rate": 1.6581995567146914e-06, + "loss": 0.6895, + "step": 9829 + }, + { + "epoch": 1.47, + "grad_norm": 1.6784488571165563, + "learning_rate": 1.6581268230050238e-06, + "loss": 0.6641, + "step": 9830 + }, + { + "epoch": 1.47, + "grad_norm": 0.9129400983884061, + "learning_rate": 1.6580540831529998e-06, + "loss": 0.6836, + "step": 9831 + }, + { + "epoch": 1.47, + "grad_norm": 1.510801044985163, + "learning_rate": 1.6579813371592994e-06, + "loss": 0.6764, + "step": 9832 + }, + { + "epoch": 1.47, + "grad_norm": 0.9120823818752061, + "learning_rate": 1.6579085850246005e-06, + "loss": 0.679, + "step": 9833 + }, + { + "epoch": 1.47, + "grad_norm": 0.7305943356863195, + "learning_rate": 1.657835826749583e-06, + "loss": 0.6745, + "step": 9834 + }, + { + "epoch": 1.47, + "grad_norm": 1.2243408226157664, + "learning_rate": 1.6577630623349252e-06, + "loss": 0.6875, + "step": 9835 + }, + { + "epoch": 1.47, + "grad_norm": 0.7810101553386706, + "learning_rate": 1.6576902917813069e-06, + "loss": 0.6797, + "step": 9836 + }, + { + "epoch": 1.47, + "grad_norm": 1.6374878710578666, + "learning_rate": 1.657617515089407e-06, + "loss": 0.6947, + "step": 9837 + }, + { + "epoch": 1.47, + "grad_norm": 2.9329461796345964, + "learning_rate": 1.6575447322599043e-06, + "loss": 0.6719, + "step": 9838 + }, + { + "epoch": 1.47, + "grad_norm": 1.5044560547857644, + "learning_rate": 1.6574719432934787e-06, + "loss": 0.6914, + "step": 9839 + }, + { + "epoch": 1.47, + "grad_norm": 3.412584157791351, + "learning_rate": 1.657399148190809e-06, + "loss": 0.6699, + "step": 9840 + }, + { + "epoch": 1.47, + "grad_norm": 1.3819741201165197, + "learning_rate": 1.6573263469525754e-06, + "loss": 0.6895, + "step": 9841 + }, + { + "epoch": 1.47, + "grad_norm": 2.3638637328602843, + "learning_rate": 1.6572535395794566e-06, + "loss": 0.6758, + "step": 9842 + }, + { + "epoch": 1.47, + "grad_norm": 0.9693963761677733, + "learning_rate": 1.6571807260721324e-06, + "loss": 0.6543, + "step": 9843 + }, + { + "epoch": 1.47, + "grad_norm": 0.9476116931549834, + "learning_rate": 1.6571079064312826e-06, + "loss": 0.653, + "step": 9844 + }, + { + "epoch": 1.47, + "grad_norm": 1.1106753372242069, + "learning_rate": 1.6570350806575864e-06, + "loss": 0.6738, + "step": 9845 + }, + { + "epoch": 1.47, + "grad_norm": 0.6881844021402357, + "learning_rate": 1.6569622487517236e-06, + "loss": 0.6445, + "step": 9846 + }, + { + "epoch": 1.47, + "grad_norm": 1.677795538091941, + "learning_rate": 1.6568894107143741e-06, + "loss": 0.6895, + "step": 9847 + }, + { + "epoch": 1.47, + "grad_norm": 0.994761970390672, + "learning_rate": 1.6568165665462176e-06, + "loss": 0.666, + "step": 9848 + }, + { + "epoch": 1.47, + "grad_norm": 2.213324164379337, + "learning_rate": 1.656743716247934e-06, + "loss": 0.6706, + "step": 9849 + }, + { + "epoch": 1.47, + "grad_norm": 6.151848591766622, + "learning_rate": 1.6566708598202034e-06, + "loss": 0.679, + "step": 9850 + }, + { + "epoch": 1.47, + "grad_norm": 2.431650300702672, + "learning_rate": 1.6565979972637055e-06, + "loss": 0.6641, + "step": 9851 + }, + { + "epoch": 1.47, + "grad_norm": 7.876689842037546, + "learning_rate": 1.6565251285791205e-06, + "loss": 0.7018, + "step": 9852 + }, + { + "epoch": 1.47, + "grad_norm": 0.772023018433608, + "learning_rate": 1.6564522537671282e-06, + "loss": 0.6719, + "step": 9853 + }, + { + "epoch": 1.47, + "grad_norm": 0.6769198096291962, + "learning_rate": 1.656379372828409e-06, + "loss": 0.679, + "step": 9854 + }, + { + "epoch": 1.47, + "grad_norm": 3.1544063784835514, + "learning_rate": 1.656306485763643e-06, + "loss": 0.6719, + "step": 9855 + }, + { + "epoch": 1.47, + "grad_norm": 2.0626458482572727, + "learning_rate": 1.6562335925735108e-06, + "loss": 0.679, + "step": 9856 + }, + { + "epoch": 1.47, + "grad_norm": 6.4171779476564605, + "learning_rate": 1.6561606932586925e-06, + "loss": 0.6569, + "step": 9857 + }, + { + "epoch": 1.47, + "grad_norm": 0.8426440919988678, + "learning_rate": 1.656087787819868e-06, + "loss": 0.6549, + "step": 9858 + }, + { + "epoch": 1.47, + "grad_norm": 2.729220339787862, + "learning_rate": 1.6560148762577181e-06, + "loss": 0.6914, + "step": 9859 + }, + { + "epoch": 1.47, + "grad_norm": 2.586584854497965, + "learning_rate": 1.655941958572924e-06, + "loss": 0.6706, + "step": 9860 + }, + { + "epoch": 1.47, + "grad_norm": 2.543836915159266, + "learning_rate": 1.655869034766165e-06, + "loss": 0.6647, + "step": 9861 + }, + { + "epoch": 1.47, + "grad_norm": 4.733243275553741, + "learning_rate": 1.6557961048381225e-06, + "loss": 0.6621, + "step": 9862 + }, + { + "epoch": 1.47, + "grad_norm": 2.8011314364180078, + "learning_rate": 1.6557231687894771e-06, + "loss": 0.6641, + "step": 9863 + }, + { + "epoch": 1.47, + "grad_norm": 1.1800549939419775, + "learning_rate": 1.6556502266209092e-06, + "loss": 0.6693, + "step": 9864 + }, + { + "epoch": 1.47, + "grad_norm": 3.328023102258037, + "learning_rate": 1.6555772783330998e-06, + "loss": 0.7311, + "step": 9865 + }, + { + "epoch": 1.47, + "grad_norm": 2.2598830977171613, + "learning_rate": 1.6555043239267298e-06, + "loss": 0.6849, + "step": 9866 + }, + { + "epoch": 1.47, + "grad_norm": 1.359746152122676, + "learning_rate": 1.6554313634024798e-06, + "loss": 0.694, + "step": 9867 + }, + { + "epoch": 1.47, + "grad_norm": 4.317029029167903, + "learning_rate": 1.6553583967610309e-06, + "loss": 0.6478, + "step": 9868 + }, + { + "epoch": 1.47, + "grad_norm": 2.040390999198379, + "learning_rate": 1.6552854240030643e-06, + "loss": 0.6549, + "step": 9869 + }, + { + "epoch": 1.47, + "grad_norm": 0.9681573107406458, + "learning_rate": 1.6552124451292608e-06, + "loss": 0.6966, + "step": 9870 + }, + { + "epoch": 1.47, + "grad_norm": 1.8413450400178544, + "learning_rate": 1.6551394601403015e-06, + "loss": 0.6836, + "step": 9871 + }, + { + "epoch": 1.47, + "grad_norm": 0.9686397598214506, + "learning_rate": 1.6550664690368678e-06, + "loss": 0.6973, + "step": 9872 + }, + { + "epoch": 1.47, + "grad_norm": 1.7960481332728135, + "learning_rate": 1.6549934718196407e-06, + "loss": 0.64, + "step": 9873 + }, + { + "epoch": 1.47, + "grad_norm": 1.260838867142914, + "learning_rate": 1.6549204684893019e-06, + "loss": 0.6634, + "step": 9874 + }, + { + "epoch": 1.47, + "grad_norm": 3.319906004149914, + "learning_rate": 1.6548474590465323e-06, + "loss": 0.7077, + "step": 9875 + }, + { + "epoch": 1.47, + "grad_norm": 2.958525477853591, + "learning_rate": 1.6547744434920134e-06, + "loss": 0.681, + "step": 9876 + }, + { + "epoch": 1.47, + "grad_norm": 1.3444243138149365, + "learning_rate": 1.6547014218264265e-06, + "loss": 0.6497, + "step": 9877 + }, + { + "epoch": 1.47, + "grad_norm": 5.468791324941726, + "learning_rate": 1.6546283940504538e-06, + "loss": 0.6862, + "step": 9878 + }, + { + "epoch": 1.47, + "grad_norm": 4.573313468985222, + "learning_rate": 1.654555360164776e-06, + "loss": 0.6628, + "step": 9879 + }, + { + "epoch": 1.47, + "grad_norm": 8.922031088090549, + "learning_rate": 1.6544823201700753e-06, + "loss": 0.6641, + "step": 9880 + }, + { + "epoch": 1.47, + "grad_norm": 2.08899204304058, + "learning_rate": 1.6544092740670333e-06, + "loss": 0.6719, + "step": 9881 + }, + { + "epoch": 1.47, + "grad_norm": 1.5198970624583983, + "learning_rate": 1.6543362218563314e-06, + "loss": 0.6875, + "step": 9882 + }, + { + "epoch": 1.47, + "grad_norm": 1.7933324011825298, + "learning_rate": 1.654263163538652e-06, + "loss": 0.6784, + "step": 9883 + }, + { + "epoch": 1.47, + "grad_norm": 1.004600049751718, + "learning_rate": 1.6541900991146764e-06, + "loss": 0.6576, + "step": 9884 + }, + { + "epoch": 1.47, + "grad_norm": 1.8735131876181963, + "learning_rate": 1.6541170285850869e-06, + "loss": 0.6693, + "step": 9885 + }, + { + "epoch": 1.47, + "grad_norm": 2.351812946094553, + "learning_rate": 1.6540439519505654e-06, + "loss": 0.6875, + "step": 9886 + }, + { + "epoch": 1.47, + "grad_norm": 3.944223494603865, + "learning_rate": 1.6539708692117937e-06, + "loss": 0.6803, + "step": 9887 + }, + { + "epoch": 1.47, + "grad_norm": 4.486377271610382, + "learning_rate": 1.6538977803694541e-06, + "loss": 0.7025, + "step": 9888 + }, + { + "epoch": 1.47, + "grad_norm": 3.156924591873219, + "learning_rate": 1.6538246854242284e-06, + "loss": 0.6549, + "step": 9889 + }, + { + "epoch": 1.48, + "grad_norm": 1.0708877411193474, + "learning_rate": 1.6537515843767995e-06, + "loss": 0.6823, + "step": 9890 + }, + { + "epoch": 1.48, + "grad_norm": 1.113195834709318, + "learning_rate": 1.653678477227849e-06, + "loss": 0.679, + "step": 9891 + }, + { + "epoch": 1.48, + "grad_norm": 4.842462057708399, + "learning_rate": 1.6536053639780596e-06, + "loss": 0.7181, + "step": 9892 + }, + { + "epoch": 1.48, + "grad_norm": 1.31666780644333, + "learning_rate": 1.6535322446281135e-06, + "loss": 0.6986, + "step": 9893 + }, + { + "epoch": 1.48, + "grad_norm": 1.3497186731817528, + "learning_rate": 1.6534591191786932e-06, + "loss": 0.6973, + "step": 9894 + }, + { + "epoch": 1.48, + "grad_norm": 0.7918966070161075, + "learning_rate": 1.653385987630481e-06, + "loss": 0.6934, + "step": 9895 + }, + { + "epoch": 1.48, + "grad_norm": 1.4286206345549182, + "learning_rate": 1.65331284998416e-06, + "loss": 0.6992, + "step": 9896 + }, + { + "epoch": 1.48, + "grad_norm": 2.2135903429058477, + "learning_rate": 1.653239706240412e-06, + "loss": 0.681, + "step": 9897 + }, + { + "epoch": 1.48, + "grad_norm": 1.316903854663833, + "learning_rate": 1.65316655639992e-06, + "loss": 0.681, + "step": 9898 + }, + { + "epoch": 1.48, + "grad_norm": 2.4931124603792463, + "learning_rate": 1.653093400463367e-06, + "loss": 0.679, + "step": 9899 + }, + { + "epoch": 1.48, + "grad_norm": 0.7284430656698797, + "learning_rate": 1.6530202384314356e-06, + "loss": 0.6706, + "step": 9900 + }, + { + "epoch": 1.48, + "grad_norm": 0.6657576336003742, + "learning_rate": 1.6529470703048084e-06, + "loss": 0.6836, + "step": 9901 + }, + { + "epoch": 1.48, + "grad_norm": 3.603917312095757, + "learning_rate": 1.6528738960841682e-06, + "loss": 0.6816, + "step": 9902 + }, + { + "epoch": 1.48, + "grad_norm": 1.0329223371212657, + "learning_rate": 1.6528007157701986e-06, + "loss": 0.6771, + "step": 9903 + }, + { + "epoch": 1.48, + "grad_norm": 1.8323641916797815, + "learning_rate": 1.652727529363582e-06, + "loss": 0.6745, + "step": 9904 + }, + { + "epoch": 1.48, + "grad_norm": 3.206245772824931, + "learning_rate": 1.6526543368650017e-06, + "loss": 0.6758, + "step": 9905 + }, + { + "epoch": 1.48, + "grad_norm": 0.5350552572166755, + "learning_rate": 1.652581138275141e-06, + "loss": 0.6895, + "step": 9906 + }, + { + "epoch": 1.48, + "grad_norm": 2.2167629922241874, + "learning_rate": 1.6525079335946823e-06, + "loss": 0.6908, + "step": 9907 + }, + { + "epoch": 1.48, + "grad_norm": 3.3924977992347682, + "learning_rate": 1.6524347228243096e-06, + "loss": 0.6751, + "step": 9908 + }, + { + "epoch": 1.48, + "grad_norm": 0.7471420753179014, + "learning_rate": 1.652361505964706e-06, + "loss": 0.6914, + "step": 9909 + }, + { + "epoch": 1.48, + "grad_norm": 5.555616987141426, + "learning_rate": 1.652288283016555e-06, + "loss": 0.681, + "step": 9910 + }, + { + "epoch": 1.48, + "grad_norm": 3.6314292956600354, + "learning_rate": 1.6522150539805392e-06, + "loss": 0.6758, + "step": 9911 + }, + { + "epoch": 1.48, + "grad_norm": 2.636869225829931, + "learning_rate": 1.6521418188573431e-06, + "loss": 0.6895, + "step": 9912 + }, + { + "epoch": 1.48, + "grad_norm": 0.6521618316540639, + "learning_rate": 1.6520685776476495e-06, + "loss": 0.6725, + "step": 9913 + }, + { + "epoch": 1.48, + "grad_norm": 2.5605299245445297, + "learning_rate": 1.6519953303521423e-06, + "loss": 0.6842, + "step": 9914 + }, + { + "epoch": 1.48, + "grad_norm": 6.024815640352903, + "learning_rate": 1.651922076971505e-06, + "loss": 0.6758, + "step": 9915 + }, + { + "epoch": 1.48, + "grad_norm": 0.6095034089301409, + "learning_rate": 1.6518488175064213e-06, + "loss": 0.6706, + "step": 9916 + }, + { + "epoch": 1.48, + "grad_norm": 2.418683053308214, + "learning_rate": 1.651775551957575e-06, + "loss": 0.681, + "step": 9917 + }, + { + "epoch": 1.48, + "grad_norm": 0.5795962204571, + "learning_rate": 1.6517022803256495e-06, + "loss": 0.6771, + "step": 9918 + }, + { + "epoch": 1.48, + "grad_norm": 1.3064639931515403, + "learning_rate": 1.6516290026113295e-06, + "loss": 0.681, + "step": 9919 + }, + { + "epoch": 1.48, + "grad_norm": 0.625374913063576, + "learning_rate": 1.6515557188152979e-06, + "loss": 0.679, + "step": 9920 + }, + { + "epoch": 1.48, + "grad_norm": 0.6141781287872546, + "learning_rate": 1.6514824289382395e-06, + "loss": 0.6803, + "step": 9921 + }, + { + "epoch": 1.48, + "grad_norm": 3.247251926634323, + "learning_rate": 1.6514091329808378e-06, + "loss": 0.6882, + "step": 9922 + }, + { + "epoch": 1.48, + "grad_norm": 0.8795045115200982, + "learning_rate": 1.6513358309437772e-06, + "loss": 0.6719, + "step": 9923 + }, + { + "epoch": 1.48, + "grad_norm": 1.928579701688441, + "learning_rate": 1.6512625228277415e-06, + "loss": 0.6562, + "step": 9924 + }, + { + "epoch": 1.48, + "grad_norm": 2.7578256748138177, + "learning_rate": 1.6511892086334155e-06, + "loss": 0.7031, + "step": 9925 + }, + { + "epoch": 1.48, + "grad_norm": 1.0991464784939484, + "learning_rate": 1.6511158883614823e-06, + "loss": 0.6836, + "step": 9926 + }, + { + "epoch": 1.48, + "grad_norm": 2.2664765643153544, + "learning_rate": 1.6510425620126275e-06, + "loss": 0.6543, + "step": 9927 + }, + { + "epoch": 1.48, + "grad_norm": 1.4932884269908633, + "learning_rate": 1.6509692295875347e-06, + "loss": 0.668, + "step": 9928 + }, + { + "epoch": 1.48, + "grad_norm": 0.6098757625246827, + "learning_rate": 1.6508958910868886e-06, + "loss": 0.6589, + "step": 9929 + }, + { + "epoch": 1.48, + "grad_norm": 3.103215373239368, + "learning_rate": 1.6508225465113737e-06, + "loss": 0.694, + "step": 9930 + }, + { + "epoch": 1.48, + "grad_norm": 0.7107622881144262, + "learning_rate": 1.650749195861674e-06, + "loss": 0.6862, + "step": 9931 + }, + { + "epoch": 1.48, + "grad_norm": 2.656338189121676, + "learning_rate": 1.6506758391384747e-06, + "loss": 0.6712, + "step": 9932 + }, + { + "epoch": 1.48, + "grad_norm": 2.122563161303288, + "learning_rate": 1.6506024763424607e-06, + "loss": 0.6901, + "step": 9933 + }, + { + "epoch": 1.48, + "grad_norm": 0.674901837334565, + "learning_rate": 1.6505291074743157e-06, + "loss": 0.6667, + "step": 9934 + }, + { + "epoch": 1.48, + "grad_norm": 1.2183901991678245, + "learning_rate": 1.650455732534725e-06, + "loss": 0.6992, + "step": 9935 + }, + { + "epoch": 1.48, + "grad_norm": 4.775127007209287, + "learning_rate": 1.6503823515243737e-06, + "loss": 0.6576, + "step": 9936 + }, + { + "epoch": 1.48, + "grad_norm": 2.771424177565759, + "learning_rate": 1.6503089644439463e-06, + "loss": 0.6667, + "step": 9937 + }, + { + "epoch": 1.48, + "grad_norm": 2.1410985672506837, + "learning_rate": 1.6502355712941278e-06, + "loss": 0.6947, + "step": 9938 + }, + { + "epoch": 1.48, + "grad_norm": 0.7927106300957308, + "learning_rate": 1.6501621720756033e-06, + "loss": 0.6517, + "step": 9939 + }, + { + "epoch": 1.48, + "grad_norm": 4.956358119415231, + "learning_rate": 1.6500887667890578e-06, + "loss": 0.6693, + "step": 9940 + }, + { + "epoch": 1.48, + "grad_norm": 2.402063066913063, + "learning_rate": 1.650015355435176e-06, + "loss": 0.6953, + "step": 9941 + }, + { + "epoch": 1.48, + "grad_norm": 1.879040991610949, + "learning_rate": 1.6499419380146437e-06, + "loss": 0.6712, + "step": 9942 + }, + { + "epoch": 1.48, + "grad_norm": 3.7678752535632025, + "learning_rate": 1.649868514528146e-06, + "loss": 0.7096, + "step": 9943 + }, + { + "epoch": 1.48, + "grad_norm": 4.135139001882899, + "learning_rate": 1.6497950849763675e-06, + "loss": 0.6862, + "step": 9944 + }, + { + "epoch": 1.48, + "grad_norm": 8.03701122104132, + "learning_rate": 1.6497216493599942e-06, + "loss": 0.6608, + "step": 9945 + }, + { + "epoch": 1.48, + "grad_norm": 2.325119180204224, + "learning_rate": 1.6496482076797114e-06, + "loss": 0.6589, + "step": 9946 + }, + { + "epoch": 1.48, + "grad_norm": 3.617221420712736, + "learning_rate": 1.6495747599362044e-06, + "loss": 0.6628, + "step": 9947 + }, + { + "epoch": 1.48, + "grad_norm": 0.9472259810965098, + "learning_rate": 1.6495013061301586e-06, + "loss": 0.6673, + "step": 9948 + }, + { + "epoch": 1.48, + "grad_norm": 2.693456960603709, + "learning_rate": 1.6494278462622599e-06, + "loss": 0.6784, + "step": 9949 + }, + { + "epoch": 1.48, + "grad_norm": 2.4792967609242966, + "learning_rate": 1.6493543803331933e-06, + "loss": 0.6797, + "step": 9950 + }, + { + "epoch": 1.48, + "grad_norm": 1.8862477843142669, + "learning_rate": 1.649280908343645e-06, + "loss": 0.6667, + "step": 9951 + }, + { + "epoch": 1.48, + "grad_norm": 4.482084092124688, + "learning_rate": 1.6492074302943006e-06, + "loss": 0.6797, + "step": 9952 + }, + { + "epoch": 1.48, + "grad_norm": 1.592123881696566, + "learning_rate": 1.649133946185846e-06, + "loss": 0.6901, + "step": 9953 + }, + { + "epoch": 1.48, + "grad_norm": 2.3000686582365106, + "learning_rate": 1.6490604560189667e-06, + "loss": 0.6784, + "step": 9954 + }, + { + "epoch": 1.48, + "grad_norm": 3.40554690450808, + "learning_rate": 1.6489869597943486e-06, + "loss": 0.7044, + "step": 9955 + }, + { + "epoch": 1.48, + "grad_norm": 1.9080005400283278, + "learning_rate": 1.6489134575126781e-06, + "loss": 0.6764, + "step": 9956 + }, + { + "epoch": 1.49, + "grad_norm": 3.8666511750151473, + "learning_rate": 1.648839949174641e-06, + "loss": 0.7025, + "step": 9957 + }, + { + "epoch": 1.49, + "grad_norm": 5.226683752515855, + "learning_rate": 1.6487664347809229e-06, + "loss": 0.6934, + "step": 9958 + }, + { + "epoch": 1.49, + "grad_norm": 1.3454855822448273, + "learning_rate": 1.6486929143322109e-06, + "loss": 0.6465, + "step": 9959 + }, + { + "epoch": 1.49, + "grad_norm": 4.939556644956294, + "learning_rate": 1.64861938782919e-06, + "loss": 0.6797, + "step": 9960 + }, + { + "epoch": 1.49, + "grad_norm": 0.726738417011594, + "learning_rate": 1.648545855272547e-06, + "loss": 0.679, + "step": 9961 + }, + { + "epoch": 1.49, + "grad_norm": 1.3468676696892572, + "learning_rate": 1.6484723166629684e-06, + "loss": 0.6562, + "step": 9962 + }, + { + "epoch": 1.49, + "grad_norm": 1.8794287875959326, + "learning_rate": 1.64839877200114e-06, + "loss": 0.6471, + "step": 9963 + }, + { + "epoch": 1.49, + "grad_norm": 0.7096241873656556, + "learning_rate": 1.648325221287749e-06, + "loss": 0.6771, + "step": 9964 + }, + { + "epoch": 1.49, + "grad_norm": 4.527265138430421, + "learning_rate": 1.6482516645234811e-06, + "loss": 0.6888, + "step": 9965 + }, + { + "epoch": 1.49, + "grad_norm": 3.8248149089016414, + "learning_rate": 1.6481781017090234e-06, + "loss": 0.6628, + "step": 9966 + }, + { + "epoch": 1.49, + "grad_norm": 2.684900575171601, + "learning_rate": 1.6481045328450616e-06, + "loss": 0.6829, + "step": 9967 + }, + { + "epoch": 1.49, + "grad_norm": 1.292888961210333, + "learning_rate": 1.6480309579322831e-06, + "loss": 0.6849, + "step": 9968 + }, + { + "epoch": 1.49, + "grad_norm": 2.827013086784271, + "learning_rate": 1.6479573769713745e-06, + "loss": 0.6706, + "step": 9969 + }, + { + "epoch": 1.49, + "grad_norm": 0.7623454604462017, + "learning_rate": 1.6478837899630223e-06, + "loss": 0.6797, + "step": 9970 + }, + { + "epoch": 1.49, + "grad_norm": 1.0522098944438925, + "learning_rate": 1.6478101969079135e-06, + "loss": 0.6777, + "step": 9971 + }, + { + "epoch": 1.49, + "grad_norm": 0.7822033766690107, + "learning_rate": 1.6477365978067347e-06, + "loss": 0.6764, + "step": 9972 + }, + { + "epoch": 1.49, + "grad_norm": 4.276645194078977, + "learning_rate": 1.647662992660173e-06, + "loss": 0.6816, + "step": 9973 + }, + { + "epoch": 1.49, + "grad_norm": 2.448456818354612, + "learning_rate": 1.6475893814689156e-06, + "loss": 0.6751, + "step": 9974 + }, + { + "epoch": 1.49, + "grad_norm": 1.2967554484691157, + "learning_rate": 1.6475157642336489e-06, + "loss": 0.6673, + "step": 9975 + }, + { + "epoch": 1.49, + "grad_norm": 2.830765368045768, + "learning_rate": 1.6474421409550602e-06, + "loss": 0.6849, + "step": 9976 + }, + { + "epoch": 1.49, + "grad_norm": 1.4111959926118993, + "learning_rate": 1.6473685116338368e-06, + "loss": 0.6777, + "step": 9977 + }, + { + "epoch": 1.49, + "grad_norm": 0.8366615505089715, + "learning_rate": 1.647294876270666e-06, + "loss": 0.653, + "step": 9978 + }, + { + "epoch": 1.49, + "grad_norm": 1.8834181216650654, + "learning_rate": 1.6472212348662348e-06, + "loss": 0.6628, + "step": 9979 + }, + { + "epoch": 1.49, + "grad_norm": 3.3715062743769715, + "learning_rate": 1.6471475874212306e-06, + "loss": 0.6986, + "step": 9980 + }, + { + "epoch": 1.49, + "grad_norm": 1.7074706882837152, + "learning_rate": 1.6470739339363406e-06, + "loss": 0.6471, + "step": 9981 + }, + { + "epoch": 1.49, + "grad_norm": 1.006183832735476, + "learning_rate": 1.6470002744122526e-06, + "loss": 0.6589, + "step": 9982 + }, + { + "epoch": 1.49, + "grad_norm": 4.35964339250405, + "learning_rate": 1.6469266088496537e-06, + "loss": 0.6621, + "step": 9983 + }, + { + "epoch": 1.49, + "grad_norm": 3.2144134580224017, + "learning_rate": 1.6468529372492315e-06, + "loss": 0.6751, + "step": 9984 + }, + { + "epoch": 1.49, + "grad_norm": 4.5184373433269736, + "learning_rate": 1.646779259611674e-06, + "loss": 0.6634, + "step": 9985 + }, + { + "epoch": 1.49, + "grad_norm": 2.4946966305629332, + "learning_rate": 1.646705575937668e-06, + "loss": 0.6699, + "step": 9986 + }, + { + "epoch": 1.49, + "grad_norm": 0.921605648806424, + "learning_rate": 1.646631886227902e-06, + "loss": 0.6927, + "step": 9987 + }, + { + "epoch": 1.49, + "grad_norm": 0.9563212590951395, + "learning_rate": 1.6465581904830627e-06, + "loss": 0.6947, + "step": 9988 + }, + { + "epoch": 1.49, + "grad_norm": 6.363289718233698, + "learning_rate": 1.6464844887038394e-06, + "loss": 0.6816, + "step": 9989 + }, + { + "epoch": 1.49, + "grad_norm": 4.6834021581060155, + "learning_rate": 1.646410780890919e-06, + "loss": 0.6667, + "step": 9990 + }, + { + "epoch": 1.49, + "grad_norm": 3.285496453921681, + "learning_rate": 1.6463370670449894e-06, + "loss": 0.6901, + "step": 9991 + }, + { + "epoch": 1.49, + "grad_norm": 1.9144105929026172, + "learning_rate": 1.646263347166739e-06, + "loss": 0.6738, + "step": 9992 + }, + { + "epoch": 1.49, + "grad_norm": 2.0826417989560624, + "learning_rate": 1.6461896212568557e-06, + "loss": 0.6497, + "step": 9993 + }, + { + "epoch": 1.49, + "grad_norm": 2.05497370429454, + "learning_rate": 1.6461158893160275e-06, + "loss": 0.6667, + "step": 9994 + }, + { + "epoch": 1.49, + "grad_norm": 2.4865225041151064, + "learning_rate": 1.6460421513449422e-06, + "loss": 0.6621, + "step": 9995 + }, + { + "epoch": 1.49, + "grad_norm": 1.8441192050696686, + "learning_rate": 1.6459684073442887e-06, + "loss": 0.6829, + "step": 9996 + }, + { + "epoch": 1.49, + "grad_norm": 1.6691485289232972, + "learning_rate": 1.6458946573147549e-06, + "loss": 0.6771, + "step": 9997 + }, + { + "epoch": 1.49, + "grad_norm": 3.4942345175125595, + "learning_rate": 1.645820901257029e-06, + "loss": 0.6608, + "step": 9998 + }, + { + "epoch": 1.49, + "grad_norm": 1.7330203369105919, + "learning_rate": 1.6457471391717999e-06, + "loss": 0.6888, + "step": 9999 + }, + { + "epoch": 1.49, + "grad_norm": 1.180051857444326, + "learning_rate": 1.6456733710597553e-06, + "loss": 0.7155, + "step": 10000 + }, + { + "epoch": 1.49, + "grad_norm": 2.5754689181450026, + "learning_rate": 1.645599596921584e-06, + "loss": 0.6895, + "step": 10001 + }, + { + "epoch": 1.49, + "grad_norm": 6.719709913597436, + "learning_rate": 1.6455258167579749e-06, + "loss": 0.707, + "step": 10002 + }, + { + "epoch": 1.49, + "grad_norm": 0.8227450681596211, + "learning_rate": 1.6454520305696161e-06, + "loss": 0.6771, + "step": 10003 + }, + { + "epoch": 1.49, + "grad_norm": 0.7899071314459465, + "learning_rate": 1.6453782383571961e-06, + "loss": 0.6608, + "step": 10004 + }, + { + "epoch": 1.49, + "grad_norm": 1.5950688709998109, + "learning_rate": 1.6453044401214046e-06, + "loss": 0.6764, + "step": 10005 + }, + { + "epoch": 1.49, + "grad_norm": 2.0574134250647447, + "learning_rate": 1.645230635862929e-06, + "loss": 0.6771, + "step": 10006 + }, + { + "epoch": 1.49, + "grad_norm": 1.4218660439905266, + "learning_rate": 1.6451568255824592e-06, + "loss": 0.668, + "step": 10007 + }, + { + "epoch": 1.49, + "grad_norm": 0.8232592648361359, + "learning_rate": 1.6450830092806832e-06, + "loss": 0.6628, + "step": 10008 + }, + { + "epoch": 1.49, + "grad_norm": 1.4736661645844606, + "learning_rate": 1.645009186958291e-06, + "loss": 0.6706, + "step": 10009 + }, + { + "epoch": 1.49, + "grad_norm": 1.8600865022452353, + "learning_rate": 1.6449353586159709e-06, + "loss": 0.6803, + "step": 10010 + }, + { + "epoch": 1.49, + "grad_norm": 1.3022630605018486, + "learning_rate": 1.644861524254412e-06, + "loss": 0.6803, + "step": 10011 + }, + { + "epoch": 1.49, + "grad_norm": 0.8118546706850466, + "learning_rate": 1.6447876838743034e-06, + "loss": 0.6732, + "step": 10012 + }, + { + "epoch": 1.49, + "grad_norm": 1.2426370023676856, + "learning_rate": 1.6447138374763343e-06, + "loss": 0.6452, + "step": 10013 + }, + { + "epoch": 1.49, + "grad_norm": 0.9489654785476652, + "learning_rate": 1.644639985061194e-06, + "loss": 0.6725, + "step": 10014 + }, + { + "epoch": 1.49, + "grad_norm": 0.7672036043656284, + "learning_rate": 1.6445661266295719e-06, + "loss": 0.6875, + "step": 10015 + }, + { + "epoch": 1.49, + "grad_norm": 3.9774213104405796, + "learning_rate": 1.644492262182157e-06, + "loss": 0.6868, + "step": 10016 + }, + { + "epoch": 1.49, + "grad_norm": 1.4971003239095444, + "learning_rate": 1.6444183917196388e-06, + "loss": 0.6556, + "step": 10017 + }, + { + "epoch": 1.49, + "grad_norm": 3.136063171919282, + "learning_rate": 1.6443445152427068e-06, + "loss": 0.6868, + "step": 10018 + }, + { + "epoch": 1.49, + "grad_norm": 2.526107132749558, + "learning_rate": 1.6442706327520503e-06, + "loss": 0.6589, + "step": 10019 + }, + { + "epoch": 1.49, + "grad_norm": 0.9196094808843973, + "learning_rate": 1.644196744248359e-06, + "loss": 0.6725, + "step": 10020 + }, + { + "epoch": 1.49, + "grad_norm": 1.9637141800315325, + "learning_rate": 1.6441228497323227e-06, + "loss": 0.6901, + "step": 10021 + }, + { + "epoch": 1.49, + "grad_norm": 0.9633403067275358, + "learning_rate": 1.644048949204631e-06, + "loss": 0.6901, + "step": 10022 + }, + { + "epoch": 1.49, + "grad_norm": 3.509248435909666, + "learning_rate": 1.643975042665973e-06, + "loss": 0.6882, + "step": 10023 + }, + { + "epoch": 1.5, + "grad_norm": 2.8453863580551237, + "learning_rate": 1.6439011301170393e-06, + "loss": 0.6549, + "step": 10024 + }, + { + "epoch": 1.5, + "grad_norm": 5.33638815327061, + "learning_rate": 1.6438272115585195e-06, + "loss": 0.6628, + "step": 10025 + }, + { + "epoch": 1.5, + "grad_norm": 1.436197485807936, + "learning_rate": 1.6437532869911033e-06, + "loss": 0.6549, + "step": 10026 + }, + { + "epoch": 1.5, + "grad_norm": 2.006482610256991, + "learning_rate": 1.6436793564154808e-06, + "loss": 0.6712, + "step": 10027 + }, + { + "epoch": 1.5, + "grad_norm": 1.7923271002944339, + "learning_rate": 1.6436054198323417e-06, + "loss": 0.7031, + "step": 10028 + }, + { + "epoch": 1.5, + "grad_norm": 3.2136392351116996, + "learning_rate": 1.6435314772423765e-06, + "loss": 0.6797, + "step": 10029 + }, + { + "epoch": 1.5, + "grad_norm": 4.39066684730858, + "learning_rate": 1.643457528646275e-06, + "loss": 0.6602, + "step": 10030 + }, + { + "epoch": 1.5, + "grad_norm": 4.185218772224239, + "learning_rate": 1.6433835740447277e-06, + "loss": 0.7005, + "step": 10031 + }, + { + "epoch": 1.5, + "grad_norm": 5.30551345051245, + "learning_rate": 1.6433096134384245e-06, + "loss": 0.6927, + "step": 10032 + }, + { + "epoch": 1.5, + "grad_norm": 5.309976160620673, + "learning_rate": 1.6432356468280557e-06, + "loss": 0.6589, + "step": 10033 + }, + { + "epoch": 1.5, + "grad_norm": 1.4535421285553503, + "learning_rate": 1.6431616742143117e-06, + "loss": 0.6608, + "step": 10034 + }, + { + "epoch": 1.5, + "grad_norm": 0.810293185735206, + "learning_rate": 1.6430876955978832e-06, + "loss": 0.6973, + "step": 10035 + }, + { + "epoch": 1.5, + "grad_norm": 1.1280773260600077, + "learning_rate": 1.6430137109794601e-06, + "loss": 0.6628, + "step": 10036 + }, + { + "epoch": 1.5, + "grad_norm": 2.2919601527852067, + "learning_rate": 1.6429397203597333e-06, + "loss": 0.6855, + "step": 10037 + }, + { + "epoch": 1.5, + "grad_norm": 2.1971019552033257, + "learning_rate": 1.6428657237393933e-06, + "loss": 0.6497, + "step": 10038 + }, + { + "epoch": 1.5, + "grad_norm": 3.2538367795125285, + "learning_rate": 1.6427917211191304e-06, + "loss": 0.6654, + "step": 10039 + }, + { + "epoch": 1.5, + "grad_norm": 2.2170286076344574, + "learning_rate": 1.6427177124996353e-06, + "loss": 0.6901, + "step": 10040 + }, + { + "epoch": 1.5, + "grad_norm": 4.3629576565614965, + "learning_rate": 1.6426436978815997e-06, + "loss": 0.6745, + "step": 10041 + }, + { + "epoch": 1.5, + "grad_norm": 6.383037132339359, + "learning_rate": 1.6425696772657131e-06, + "loss": 0.6855, + "step": 10042 + }, + { + "epoch": 1.5, + "grad_norm": 1.8363338943626104, + "learning_rate": 1.642495650652667e-06, + "loss": 0.6641, + "step": 10043 + }, + { + "epoch": 1.5, + "grad_norm": 0.7715424454085996, + "learning_rate": 1.642421618043152e-06, + "loss": 0.6823, + "step": 10044 + }, + { + "epoch": 1.5, + "grad_norm": 1.4720760281670646, + "learning_rate": 1.6423475794378593e-06, + "loss": 0.6784, + "step": 10045 + }, + { + "epoch": 1.5, + "grad_norm": 4.321998821471092, + "learning_rate": 1.6422735348374802e-06, + "loss": 0.6829, + "step": 10046 + }, + { + "epoch": 1.5, + "grad_norm": 1.3264765373966032, + "learning_rate": 1.6421994842427052e-06, + "loss": 0.7135, + "step": 10047 + }, + { + "epoch": 1.5, + "grad_norm": 1.0447975938330636, + "learning_rate": 1.6421254276542255e-06, + "loss": 0.6836, + "step": 10048 + }, + { + "epoch": 1.5, + "grad_norm": 1.40589979951859, + "learning_rate": 1.6420513650727324e-06, + "loss": 0.6777, + "step": 10049 + }, + { + "epoch": 1.5, + "grad_norm": 0.716699374532945, + "learning_rate": 1.6419772964989172e-06, + "loss": 0.6751, + "step": 10050 + }, + { + "epoch": 1.5, + "grad_norm": 5.627287290569854, + "learning_rate": 1.6419032219334712e-06, + "loss": 0.6947, + "step": 10051 + }, + { + "epoch": 1.5, + "grad_norm": 1.2388253448792164, + "learning_rate": 1.6418291413770856e-06, + "loss": 0.7044, + "step": 10052 + }, + { + "epoch": 1.5, + "grad_norm": 1.479099720915079, + "learning_rate": 1.6417550548304515e-06, + "loss": 0.6595, + "step": 10053 + }, + { + "epoch": 1.5, + "grad_norm": 2.982423700019169, + "learning_rate": 1.6416809622942612e-06, + "loss": 0.6491, + "step": 10054 + }, + { + "epoch": 1.5, + "grad_norm": 1.0974516225801076, + "learning_rate": 1.6416068637692058e-06, + "loss": 0.6751, + "step": 10055 + }, + { + "epoch": 1.5, + "grad_norm": 0.8904539272982976, + "learning_rate": 1.6415327592559765e-06, + "loss": 0.6732, + "step": 10056 + }, + { + "epoch": 1.5, + "grad_norm": 2.3472401765800934, + "learning_rate": 1.641458648755265e-06, + "loss": 0.6582, + "step": 10057 + }, + { + "epoch": 1.5, + "grad_norm": 2.5365743482298653, + "learning_rate": 1.6413845322677635e-06, + "loss": 0.6882, + "step": 10058 + }, + { + "epoch": 1.5, + "grad_norm": 0.7991107803163578, + "learning_rate": 1.6413104097941637e-06, + "loss": 0.6439, + "step": 10059 + }, + { + "epoch": 1.5, + "grad_norm": 2.962610781081156, + "learning_rate": 1.6412362813351566e-06, + "loss": 0.6973, + "step": 10060 + }, + { + "epoch": 1.5, + "grad_norm": 1.5904427816096207, + "learning_rate": 1.641162146891435e-06, + "loss": 0.679, + "step": 10061 + }, + { + "epoch": 1.5, + "grad_norm": 1.9199057114835714, + "learning_rate": 1.6410880064636904e-06, + "loss": 0.6829, + "step": 10062 + }, + { + "epoch": 1.5, + "grad_norm": 3.4536112249463886, + "learning_rate": 1.6410138600526144e-06, + "loss": 0.6986, + "step": 10063 + }, + { + "epoch": 1.5, + "grad_norm": 1.4282300678268347, + "learning_rate": 1.6409397076588993e-06, + "loss": 0.6719, + "step": 10064 + }, + { + "epoch": 1.5, + "grad_norm": 1.296620215139268, + "learning_rate": 1.6408655492832378e-06, + "loss": 0.6914, + "step": 10065 + }, + { + "epoch": 1.5, + "grad_norm": 0.971540902464812, + "learning_rate": 1.640791384926321e-06, + "loss": 0.6979, + "step": 10066 + }, + { + "epoch": 1.5, + "grad_norm": 7.032783747167393, + "learning_rate": 1.6407172145888414e-06, + "loss": 0.6719, + "step": 10067 + }, + { + "epoch": 1.5, + "grad_norm": 0.6468483667010119, + "learning_rate": 1.640643038271492e-06, + "loss": 0.6803, + "step": 10068 + }, + { + "epoch": 1.5, + "grad_norm": 4.251977832249665, + "learning_rate": 1.640568855974964e-06, + "loss": 0.6875, + "step": 10069 + }, + { + "epoch": 1.5, + "grad_norm": 3.037096745237455, + "learning_rate": 1.6404946676999502e-06, + "loss": 0.6719, + "step": 10070 + }, + { + "epoch": 1.5, + "grad_norm": 0.7584541786601974, + "learning_rate": 1.6404204734471432e-06, + "loss": 0.6712, + "step": 10071 + }, + { + "epoch": 1.5, + "grad_norm": 1.979719975736982, + "learning_rate": 1.640346273217235e-06, + "loss": 0.6712, + "step": 10072 + }, + { + "epoch": 1.5, + "grad_norm": 0.7421505319894611, + "learning_rate": 1.6402720670109189e-06, + "loss": 0.6654, + "step": 10073 + }, + { + "epoch": 1.5, + "grad_norm": 1.7687563677521991, + "learning_rate": 1.6401978548288864e-06, + "loss": 0.6849, + "step": 10074 + }, + { + "epoch": 1.5, + "grad_norm": 3.097166661572905, + "learning_rate": 1.6401236366718311e-06, + "loss": 0.6517, + "step": 10075 + }, + { + "epoch": 1.5, + "grad_norm": 1.8262110669750455, + "learning_rate": 1.6400494125404452e-06, + "loss": 0.6901, + "step": 10076 + }, + { + "epoch": 1.5, + "grad_norm": 0.7908367471956809, + "learning_rate": 1.6399751824354215e-06, + "loss": 0.6732, + "step": 10077 + }, + { + "epoch": 1.5, + "grad_norm": 1.1130818154575521, + "learning_rate": 1.639900946357453e-06, + "loss": 0.6745, + "step": 10078 + }, + { + "epoch": 1.5, + "grad_norm": 2.7736055528082635, + "learning_rate": 1.6398267043072322e-06, + "loss": 0.681, + "step": 10079 + }, + { + "epoch": 1.5, + "grad_norm": 4.599054147338766, + "learning_rate": 1.639752456285452e-06, + "loss": 0.694, + "step": 10080 + }, + { + "epoch": 1.5, + "grad_norm": 1.6607182288898472, + "learning_rate": 1.639678202292806e-06, + "loss": 0.6641, + "step": 10081 + }, + { + "epoch": 1.5, + "grad_norm": 0.7879646240307386, + "learning_rate": 1.6396039423299863e-06, + "loss": 0.6836, + "step": 10082 + }, + { + "epoch": 1.5, + "grad_norm": 0.845942548443652, + "learning_rate": 1.6395296763976866e-06, + "loss": 0.6608, + "step": 10083 + }, + { + "epoch": 1.5, + "grad_norm": 2.268874323932314, + "learning_rate": 1.6394554044966001e-06, + "loss": 0.6758, + "step": 10084 + }, + { + "epoch": 1.5, + "grad_norm": 2.535517933697156, + "learning_rate": 1.6393811266274196e-06, + "loss": 0.6758, + "step": 10085 + }, + { + "epoch": 1.5, + "grad_norm": 2.1063614438503, + "learning_rate": 1.6393068427908386e-06, + "loss": 0.6706, + "step": 10086 + }, + { + "epoch": 1.5, + "grad_norm": 0.7837066318283684, + "learning_rate": 1.6392325529875502e-06, + "loss": 0.6699, + "step": 10087 + }, + { + "epoch": 1.5, + "grad_norm": 3.1548663166252537, + "learning_rate": 1.6391582572182476e-06, + "loss": 0.6849, + "step": 10088 + }, + { + "epoch": 1.5, + "grad_norm": 2.671247082524675, + "learning_rate": 1.639083955483625e-06, + "loss": 0.6921, + "step": 10089 + }, + { + "epoch": 1.5, + "grad_norm": 2.7654103599532784, + "learning_rate": 1.6390096477843746e-06, + "loss": 0.6914, + "step": 10090 + }, + { + "epoch": 1.5, + "grad_norm": 2.0769563657372365, + "learning_rate": 1.6389353341211913e-06, + "loss": 0.6882, + "step": 10091 + }, + { + "epoch": 1.51, + "grad_norm": 4.347365172382965, + "learning_rate": 1.638861014494768e-06, + "loss": 0.6771, + "step": 10092 + }, + { + "epoch": 1.51, + "grad_norm": 1.1596798275971207, + "learning_rate": 1.638786688905798e-06, + "loss": 0.6732, + "step": 10093 + }, + { + "epoch": 1.51, + "grad_norm": 2.6014933709828543, + "learning_rate": 1.6387123573549755e-06, + "loss": 0.6973, + "step": 10094 + }, + { + "epoch": 1.51, + "grad_norm": 0.9847970477914642, + "learning_rate": 1.6386380198429941e-06, + "loss": 0.653, + "step": 10095 + }, + { + "epoch": 1.51, + "grad_norm": 0.8043871075806164, + "learning_rate": 1.6385636763705475e-06, + "loss": 0.6589, + "step": 10096 + }, + { + "epoch": 1.51, + "grad_norm": 1.8109676358289992, + "learning_rate": 1.6384893269383294e-06, + "loss": 0.7148, + "step": 10097 + }, + { + "epoch": 1.51, + "grad_norm": 2.0215467735122177, + "learning_rate": 1.6384149715470347e-06, + "loss": 0.6921, + "step": 10098 + }, + { + "epoch": 1.51, + "grad_norm": 3.040336323102228, + "learning_rate": 1.6383406101973561e-06, + "loss": 0.6784, + "step": 10099 + }, + { + "epoch": 1.51, + "grad_norm": 1.014433289397713, + "learning_rate": 1.638266242889988e-06, + "loss": 0.7018, + "step": 10100 + }, + { + "epoch": 1.51, + "grad_norm": 4.781177746666437, + "learning_rate": 1.638191869625625e-06, + "loss": 0.6732, + "step": 10101 + }, + { + "epoch": 1.51, + "grad_norm": 0.8733977716848446, + "learning_rate": 1.6381174904049607e-06, + "loss": 0.6647, + "step": 10102 + }, + { + "epoch": 1.51, + "grad_norm": 1.5264441624701548, + "learning_rate": 1.6380431052286894e-06, + "loss": 0.696, + "step": 10103 + }, + { + "epoch": 1.51, + "grad_norm": 1.3508130755595977, + "learning_rate": 1.6379687140975054e-06, + "loss": 0.653, + "step": 10104 + }, + { + "epoch": 1.51, + "grad_norm": 1.639809336726616, + "learning_rate": 1.6378943170121027e-06, + "loss": 0.6888, + "step": 10105 + }, + { + "epoch": 1.51, + "grad_norm": 2.8663493031781435, + "learning_rate": 1.637819913973176e-06, + "loss": 0.6986, + "step": 10106 + }, + { + "epoch": 1.51, + "grad_norm": 0.6596002172206183, + "learning_rate": 1.63774550498142e-06, + "loss": 0.6836, + "step": 10107 + }, + { + "epoch": 1.51, + "grad_norm": 1.581845764461697, + "learning_rate": 1.6376710900375286e-06, + "loss": 0.6647, + "step": 10108 + }, + { + "epoch": 1.51, + "grad_norm": 4.176172332087184, + "learning_rate": 1.6375966691421966e-06, + "loss": 0.6829, + "step": 10109 + }, + { + "epoch": 1.51, + "grad_norm": 0.6789340835719438, + "learning_rate": 1.637522242296118e-06, + "loss": 0.6849, + "step": 10110 + }, + { + "epoch": 1.51, + "grad_norm": 2.5223715617572435, + "learning_rate": 1.6374478094999886e-06, + "loss": 0.6706, + "step": 10111 + }, + { + "epoch": 1.51, + "grad_norm": 0.7750950535212838, + "learning_rate": 1.6373733707545022e-06, + "loss": 0.6732, + "step": 10112 + }, + { + "epoch": 1.51, + "grad_norm": 0.6832449039725715, + "learning_rate": 1.6372989260603535e-06, + "loss": 0.6732, + "step": 10113 + }, + { + "epoch": 1.51, + "grad_norm": 1.4253886399196152, + "learning_rate": 1.6372244754182381e-06, + "loss": 0.679, + "step": 10114 + }, + { + "epoch": 1.51, + "grad_norm": 2.480245927437989, + "learning_rate": 1.6371500188288498e-06, + "loss": 0.6927, + "step": 10115 + }, + { + "epoch": 1.51, + "grad_norm": 1.466479191525556, + "learning_rate": 1.6370755562928841e-06, + "loss": 0.6536, + "step": 10116 + }, + { + "epoch": 1.51, + "grad_norm": 0.7190816948484166, + "learning_rate": 1.6370010878110364e-06, + "loss": 0.7018, + "step": 10117 + }, + { + "epoch": 1.51, + "grad_norm": 1.72477046966782, + "learning_rate": 1.6369266133840007e-06, + "loss": 0.6849, + "step": 10118 + }, + { + "epoch": 1.51, + "grad_norm": 3.5574491173398703, + "learning_rate": 1.6368521330124726e-06, + "loss": 0.6667, + "step": 10119 + }, + { + "epoch": 1.51, + "grad_norm": 3.0578772228369018, + "learning_rate": 1.6367776466971475e-06, + "loss": 0.6764, + "step": 10120 + }, + { + "epoch": 1.51, + "grad_norm": 3.816104043851193, + "learning_rate": 1.6367031544387202e-06, + "loss": 0.6764, + "step": 10121 + }, + { + "epoch": 1.51, + "grad_norm": 5.087994031616014, + "learning_rate": 1.636628656237886e-06, + "loss": 0.6758, + "step": 10122 + }, + { + "epoch": 1.51, + "grad_norm": 1.7635054481009311, + "learning_rate": 1.6365541520953402e-06, + "loss": 0.6667, + "step": 10123 + }, + { + "epoch": 1.51, + "grad_norm": 0.8296801488639648, + "learning_rate": 1.6364796420117784e-06, + "loss": 0.6953, + "step": 10124 + }, + { + "epoch": 1.51, + "grad_norm": 0.991915572044242, + "learning_rate": 1.6364051259878956e-06, + "loss": 0.6732, + "step": 10125 + }, + { + "epoch": 1.51, + "grad_norm": 5.667795456664484, + "learning_rate": 1.6363306040243873e-06, + "loss": 0.6868, + "step": 10126 + }, + { + "epoch": 1.51, + "grad_norm": 1.3776165322164236, + "learning_rate": 1.6362560761219497e-06, + "loss": 0.6816, + "step": 10127 + }, + { + "epoch": 1.51, + "grad_norm": 1.7019908845973406, + "learning_rate": 1.6361815422812777e-06, + "loss": 0.6934, + "step": 10128 + }, + { + "epoch": 1.51, + "grad_norm": 3.5593749591665698, + "learning_rate": 1.636107002503067e-06, + "loss": 0.6667, + "step": 10129 + }, + { + "epoch": 1.51, + "grad_norm": 0.8027352966350878, + "learning_rate": 1.6360324567880133e-06, + "loss": 0.6914, + "step": 10130 + }, + { + "epoch": 1.51, + "grad_norm": 3.920752979814105, + "learning_rate": 1.635957905136813e-06, + "loss": 0.6979, + "step": 10131 + }, + { + "epoch": 1.51, + "grad_norm": 0.8087994504539686, + "learning_rate": 1.6358833475501605e-06, + "loss": 0.6882, + "step": 10132 + }, + { + "epoch": 1.51, + "grad_norm": 2.703692391021306, + "learning_rate": 1.635808784028753e-06, + "loss": 0.6784, + "step": 10133 + }, + { + "epoch": 1.51, + "grad_norm": 0.6539948368190013, + "learning_rate": 1.6357342145732858e-06, + "loss": 0.6536, + "step": 10134 + }, + { + "epoch": 1.51, + "grad_norm": 1.8343285379500907, + "learning_rate": 1.635659639184455e-06, + "loss": 0.6719, + "step": 10135 + }, + { + "epoch": 1.51, + "grad_norm": 4.258557928040255, + "learning_rate": 1.6355850578629565e-06, + "loss": 0.6589, + "step": 10136 + }, + { + "epoch": 1.51, + "grad_norm": 2.245022044801823, + "learning_rate": 1.6355104706094865e-06, + "loss": 0.6693, + "step": 10137 + }, + { + "epoch": 1.51, + "grad_norm": 1.8986973764700903, + "learning_rate": 1.635435877424741e-06, + "loss": 0.6875, + "step": 10138 + }, + { + "epoch": 1.51, + "grad_norm": 0.6622693848536387, + "learning_rate": 1.635361278309416e-06, + "loss": 0.666, + "step": 10139 + }, + { + "epoch": 1.51, + "grad_norm": 0.9968342198827762, + "learning_rate": 1.6352866732642083e-06, + "loss": 0.6471, + "step": 10140 + }, + { + "epoch": 1.51, + "grad_norm": 2.562092781032756, + "learning_rate": 1.635212062289814e-06, + "loss": 0.6868, + "step": 10141 + }, + { + "epoch": 1.51, + "grad_norm": 0.6900232073483618, + "learning_rate": 1.6351374453869293e-06, + "loss": 0.6803, + "step": 10142 + }, + { + "epoch": 1.51, + "grad_norm": 2.657407062126761, + "learning_rate": 1.6350628225562504e-06, + "loss": 0.6934, + "step": 10143 + }, + { + "epoch": 1.51, + "grad_norm": 2.6101743170838416, + "learning_rate": 1.6349881937984743e-06, + "loss": 0.6549, + "step": 10144 + }, + { + "epoch": 1.51, + "grad_norm": 3.405400034768449, + "learning_rate": 1.634913559114297e-06, + "loss": 0.6549, + "step": 10145 + }, + { + "epoch": 1.51, + "grad_norm": 3.245549562189062, + "learning_rate": 1.6348389185044153e-06, + "loss": 0.6823, + "step": 10146 + }, + { + "epoch": 1.51, + "grad_norm": 2.9402360527455706, + "learning_rate": 1.6347642719695263e-06, + "loss": 0.668, + "step": 10147 + }, + { + "epoch": 1.51, + "grad_norm": 4.2455841531889265, + "learning_rate": 1.6346896195103256e-06, + "loss": 0.6738, + "step": 10148 + }, + { + "epoch": 1.51, + "grad_norm": 4.805486460137932, + "learning_rate": 1.6346149611275108e-06, + "loss": 0.6699, + "step": 10149 + }, + { + "epoch": 1.51, + "grad_norm": 0.8653157159292948, + "learning_rate": 1.6345402968217785e-06, + "loss": 0.6823, + "step": 10150 + }, + { + "epoch": 1.51, + "grad_norm": 1.2604685800643636, + "learning_rate": 1.6344656265938258e-06, + "loss": 0.7031, + "step": 10151 + }, + { + "epoch": 1.51, + "grad_norm": 1.91381360066182, + "learning_rate": 1.6343909504443488e-06, + "loss": 0.6823, + "step": 10152 + }, + { + "epoch": 1.51, + "grad_norm": 1.1935482621625224, + "learning_rate": 1.6343162683740452e-06, + "loss": 0.6647, + "step": 10153 + }, + { + "epoch": 1.51, + "grad_norm": 1.056364066680561, + "learning_rate": 1.6342415803836115e-06, + "loss": 0.681, + "step": 10154 + }, + { + "epoch": 1.51, + "grad_norm": 3.5386584322584778, + "learning_rate": 1.6341668864737454e-06, + "loss": 0.6732, + "step": 10155 + }, + { + "epoch": 1.51, + "grad_norm": 1.6829998032825328, + "learning_rate": 1.6340921866451435e-06, + "loss": 0.6628, + "step": 10156 + }, + { + "epoch": 1.51, + "grad_norm": 2.347483403602712, + "learning_rate": 1.6340174808985031e-06, + "loss": 0.6882, + "step": 10157 + }, + { + "epoch": 1.51, + "grad_norm": 1.0834124578010924, + "learning_rate": 1.6339427692345216e-06, + "loss": 0.6725, + "step": 10158 + }, + { + "epoch": 1.52, + "grad_norm": 2.1403597280166458, + "learning_rate": 1.6338680516538959e-06, + "loss": 0.6875, + "step": 10159 + }, + { + "epoch": 1.52, + "grad_norm": 3.6654887463487547, + "learning_rate": 1.6337933281573237e-06, + "loss": 0.6582, + "step": 10160 + }, + { + "epoch": 1.52, + "grad_norm": 4.230748631514442, + "learning_rate": 1.6337185987455028e-06, + "loss": 0.6771, + "step": 10161 + }, + { + "epoch": 1.52, + "grad_norm": 3.326215881440545, + "learning_rate": 1.6336438634191296e-06, + "loss": 0.6966, + "step": 10162 + }, + { + "epoch": 1.52, + "grad_norm": 2.19701067119091, + "learning_rate": 1.6335691221789026e-06, + "loss": 0.6829, + "step": 10163 + }, + { + "epoch": 1.52, + "grad_norm": 3.3342579577739264, + "learning_rate": 1.6334943750255187e-06, + "loss": 0.6471, + "step": 10164 + }, + { + "epoch": 1.52, + "grad_norm": 4.642670366293431, + "learning_rate": 1.6334196219596756e-06, + "loss": 0.6816, + "step": 10165 + }, + { + "epoch": 1.52, + "grad_norm": 0.9358583803554025, + "learning_rate": 1.6333448629820714e-06, + "loss": 0.6602, + "step": 10166 + }, + { + "epoch": 1.52, + "grad_norm": 2.469903810800908, + "learning_rate": 1.6332700980934039e-06, + "loss": 0.6927, + "step": 10167 + }, + { + "epoch": 1.52, + "grad_norm": 1.4981181192704194, + "learning_rate": 1.63319532729437e-06, + "loss": 0.6654, + "step": 10168 + }, + { + "epoch": 1.52, + "grad_norm": 1.8977779831329404, + "learning_rate": 1.6331205505856685e-06, + "loss": 0.6764, + "step": 10169 + }, + { + "epoch": 1.52, + "grad_norm": 0.7831993874528095, + "learning_rate": 1.6330457679679968e-06, + "loss": 0.651, + "step": 10170 + }, + { + "epoch": 1.52, + "grad_norm": 1.1483964579367503, + "learning_rate": 1.6329709794420527e-06, + "loss": 0.6992, + "step": 10171 + }, + { + "epoch": 1.52, + "grad_norm": 3.4936508060349545, + "learning_rate": 1.6328961850085348e-06, + "loss": 0.679, + "step": 10172 + }, + { + "epoch": 1.52, + "grad_norm": 1.2545513931674224, + "learning_rate": 1.6328213846681408e-06, + "loss": 0.6895, + "step": 10173 + }, + { + "epoch": 1.52, + "grad_norm": 4.133046721875583, + "learning_rate": 1.6327465784215685e-06, + "loss": 0.6602, + "step": 10174 + }, + { + "epoch": 1.52, + "grad_norm": 3.2539818861505325, + "learning_rate": 1.632671766269517e-06, + "loss": 0.6569, + "step": 10175 + }, + { + "epoch": 1.52, + "grad_norm": 0.9064549916681074, + "learning_rate": 1.6325969482126834e-06, + "loss": 0.6901, + "step": 10176 + }, + { + "epoch": 1.52, + "grad_norm": 2.5717407957664746, + "learning_rate": 1.6325221242517668e-06, + "loss": 0.6947, + "step": 10177 + }, + { + "epoch": 1.52, + "grad_norm": 0.8082567738896592, + "learning_rate": 1.6324472943874653e-06, + "loss": 0.6764, + "step": 10178 + }, + { + "epoch": 1.52, + "grad_norm": 2.61283141286511, + "learning_rate": 1.6323724586204773e-06, + "loss": 0.6543, + "step": 10179 + }, + { + "epoch": 1.52, + "grad_norm": 0.8816004523360768, + "learning_rate": 1.6322976169515012e-06, + "loss": 0.679, + "step": 10180 + }, + { + "epoch": 1.52, + "grad_norm": 1.52387466362683, + "learning_rate": 1.6322227693812355e-06, + "loss": 0.6908, + "step": 10181 + }, + { + "epoch": 1.52, + "grad_norm": 1.1459706374655456, + "learning_rate": 1.6321479159103786e-06, + "loss": 0.6829, + "step": 10182 + }, + { + "epoch": 1.52, + "grad_norm": 5.754960502830345, + "learning_rate": 1.6320730565396295e-06, + "loss": 0.6973, + "step": 10183 + }, + { + "epoch": 1.52, + "grad_norm": 2.8752969844217806, + "learning_rate": 1.6319981912696867e-06, + "loss": 0.6719, + "step": 10184 + }, + { + "epoch": 1.52, + "grad_norm": 1.9849654555690641, + "learning_rate": 1.6319233201012487e-06, + "loss": 0.6953, + "step": 10185 + }, + { + "epoch": 1.52, + "grad_norm": 2.824950459430911, + "learning_rate": 1.6318484430350145e-06, + "loss": 0.6953, + "step": 10186 + }, + { + "epoch": 1.52, + "grad_norm": 1.02955346632862, + "learning_rate": 1.6317735600716831e-06, + "loss": 0.679, + "step": 10187 + }, + { + "epoch": 1.52, + "grad_norm": 3.535491523559198, + "learning_rate": 1.6316986712119532e-06, + "loss": 0.6882, + "step": 10188 + }, + { + "epoch": 1.52, + "grad_norm": 0.722841135211233, + "learning_rate": 1.6316237764565237e-06, + "loss": 0.6927, + "step": 10189 + }, + { + "epoch": 1.52, + "grad_norm": 0.7416073064439076, + "learning_rate": 1.6315488758060934e-06, + "loss": 0.6758, + "step": 10190 + }, + { + "epoch": 1.52, + "grad_norm": 3.3282814774312617, + "learning_rate": 1.6314739692613618e-06, + "loss": 0.6784, + "step": 10191 + }, + { + "epoch": 1.52, + "grad_norm": 0.6436925990803667, + "learning_rate": 1.6313990568230275e-06, + "loss": 0.6628, + "step": 10192 + }, + { + "epoch": 1.52, + "grad_norm": 0.936867254264143, + "learning_rate": 1.6313241384917903e-06, + "loss": 0.6673, + "step": 10193 + }, + { + "epoch": 1.52, + "grad_norm": 1.4963431331490813, + "learning_rate": 1.631249214268349e-06, + "loss": 0.6738, + "step": 10194 + }, + { + "epoch": 1.52, + "grad_norm": 1.2126435386532717, + "learning_rate": 1.631174284153403e-06, + "loss": 0.6862, + "step": 10195 + }, + { + "epoch": 1.52, + "grad_norm": 2.8099546592695273, + "learning_rate": 1.6310993481476515e-06, + "loss": 0.6706, + "step": 10196 + }, + { + "epoch": 1.52, + "grad_norm": 0.7826347127334707, + "learning_rate": 1.6310244062517945e-06, + "loss": 0.6673, + "step": 10197 + }, + { + "epoch": 1.52, + "grad_norm": 0.74578378026671, + "learning_rate": 1.6309494584665303e-06, + "loss": 0.6934, + "step": 10198 + }, + { + "epoch": 1.52, + "grad_norm": 2.425658762701524, + "learning_rate": 1.6308745047925587e-06, + "loss": 0.6712, + "step": 10199 + }, + { + "epoch": 1.52, + "grad_norm": 1.7237724959516856, + "learning_rate": 1.6307995452305802e-06, + "loss": 0.6673, + "step": 10200 + }, + { + "epoch": 1.52, + "grad_norm": 0.5236453477809615, + "learning_rate": 1.6307245797812937e-06, + "loss": 0.6712, + "step": 10201 + }, + { + "epoch": 1.52, + "grad_norm": 1.6154711134080868, + "learning_rate": 1.6306496084453986e-06, + "loss": 0.7031, + "step": 10202 + }, + { + "epoch": 1.52, + "grad_norm": 0.665408669274428, + "learning_rate": 1.6305746312235952e-06, + "loss": 0.6842, + "step": 10203 + }, + { + "epoch": 1.52, + "grad_norm": 1.878489482167477, + "learning_rate": 1.630499648116583e-06, + "loss": 0.6738, + "step": 10204 + }, + { + "epoch": 1.52, + "grad_norm": 1.2721275679175412, + "learning_rate": 1.6304246591250617e-06, + "loss": 0.6895, + "step": 10205 + }, + { + "epoch": 1.52, + "grad_norm": 0.6836061066473168, + "learning_rate": 1.6303496642497312e-06, + "loss": 0.6725, + "step": 10206 + }, + { + "epoch": 1.52, + "grad_norm": 0.6507515114941774, + "learning_rate": 1.6302746634912916e-06, + "loss": 0.6628, + "step": 10207 + }, + { + "epoch": 1.52, + "grad_norm": 4.859361787743219, + "learning_rate": 1.630199656850443e-06, + "loss": 0.6829, + "step": 10208 + }, + { + "epoch": 1.52, + "grad_norm": 3.5987164002225263, + "learning_rate": 1.630124644327885e-06, + "loss": 0.6764, + "step": 10209 + }, + { + "epoch": 1.52, + "grad_norm": 1.6694358461119305, + "learning_rate": 1.6300496259243181e-06, + "loss": 0.6855, + "step": 10210 + }, + { + "epoch": 1.52, + "grad_norm": 1.1996335294224652, + "learning_rate": 1.6299746016404426e-06, + "loss": 0.6771, + "step": 10211 + }, + { + "epoch": 1.52, + "grad_norm": 2.6404601380253476, + "learning_rate": 1.6298995714769582e-06, + "loss": 0.668, + "step": 10212 + }, + { + "epoch": 1.52, + "grad_norm": 6.8769489161103845, + "learning_rate": 1.6298245354345654e-06, + "loss": 0.6999, + "step": 10213 + }, + { + "epoch": 1.52, + "grad_norm": 2.2371408091309215, + "learning_rate": 1.6297494935139645e-06, + "loss": 0.7116, + "step": 10214 + }, + { + "epoch": 1.52, + "grad_norm": 3.0834647452488033, + "learning_rate": 1.6296744457158557e-06, + "loss": 0.6686, + "step": 10215 + }, + { + "epoch": 1.52, + "grad_norm": 5.160371435697365, + "learning_rate": 1.6295993920409398e-06, + "loss": 0.6576, + "step": 10216 + }, + { + "epoch": 1.52, + "grad_norm": 0.620089602980297, + "learning_rate": 1.6295243324899173e-06, + "loss": 0.6699, + "step": 10217 + }, + { + "epoch": 1.52, + "grad_norm": 4.227889594259028, + "learning_rate": 1.6294492670634882e-06, + "loss": 0.6706, + "step": 10218 + }, + { + "epoch": 1.52, + "grad_norm": 2.324766123449667, + "learning_rate": 1.6293741957623537e-06, + "loss": 0.6797, + "step": 10219 + }, + { + "epoch": 1.52, + "grad_norm": 2.713342542915228, + "learning_rate": 1.6292991185872138e-06, + "loss": 0.6647, + "step": 10220 + }, + { + "epoch": 1.52, + "grad_norm": 2.942652588558279, + "learning_rate": 1.6292240355387698e-06, + "loss": 0.6862, + "step": 10221 + }, + { + "epoch": 1.52, + "grad_norm": 3.9525528950705553, + "learning_rate": 1.6291489466177222e-06, + "loss": 0.6673, + "step": 10222 + }, + { + "epoch": 1.52, + "grad_norm": 0.744284789644142, + "learning_rate": 1.629073851824772e-06, + "loss": 0.668, + "step": 10223 + }, + { + "epoch": 1.52, + "grad_norm": 2.1758313500768423, + "learning_rate": 1.6289987511606195e-06, + "loss": 0.6758, + "step": 10224 + }, + { + "epoch": 1.52, + "grad_norm": 3.667445527121641, + "learning_rate": 1.6289236446259667e-06, + "loss": 0.6582, + "step": 10225 + }, + { + "epoch": 1.53, + "grad_norm": 0.8082587166088344, + "learning_rate": 1.6288485322215134e-06, + "loss": 0.6862, + "step": 10226 + }, + { + "epoch": 1.53, + "grad_norm": 1.9862166431497015, + "learning_rate": 1.6287734139479611e-06, + "loss": 0.6751, + "step": 10227 + }, + { + "epoch": 1.53, + "grad_norm": 2.2718635105011007, + "learning_rate": 1.6286982898060111e-06, + "loss": 0.6771, + "step": 10228 + }, + { + "epoch": 1.53, + "grad_norm": 3.962802085706511, + "learning_rate": 1.6286231597963643e-06, + "loss": 0.7129, + "step": 10229 + }, + { + "epoch": 1.53, + "grad_norm": 1.9680763093820512, + "learning_rate": 1.628548023919722e-06, + "loss": 0.6751, + "step": 10230 + }, + { + "epoch": 1.53, + "grad_norm": 5.3150360441942235, + "learning_rate": 1.6284728821767851e-06, + "loss": 0.681, + "step": 10231 + }, + { + "epoch": 1.53, + "grad_norm": 0.9218702634692069, + "learning_rate": 1.6283977345682558e-06, + "loss": 0.6693, + "step": 10232 + }, + { + "epoch": 1.53, + "grad_norm": 0.8337874253079368, + "learning_rate": 1.6283225810948347e-06, + "loss": 0.6836, + "step": 10233 + }, + { + "epoch": 1.53, + "grad_norm": 3.0916383511693777, + "learning_rate": 1.6282474217572232e-06, + "loss": 0.6504, + "step": 10234 + }, + { + "epoch": 1.53, + "grad_norm": 0.715499050650289, + "learning_rate": 1.6281722565561232e-06, + "loss": 0.6621, + "step": 10235 + }, + { + "epoch": 1.53, + "grad_norm": 2.318202975018482, + "learning_rate": 1.6280970854922355e-06, + "loss": 0.679, + "step": 10236 + }, + { + "epoch": 1.53, + "grad_norm": 1.4426900915138532, + "learning_rate": 1.6280219085662625e-06, + "loss": 0.6777, + "step": 10237 + }, + { + "epoch": 1.53, + "grad_norm": 1.199743093987751, + "learning_rate": 1.6279467257789056e-06, + "loss": 0.7135, + "step": 10238 + }, + { + "epoch": 1.53, + "grad_norm": 1.015986455541469, + "learning_rate": 1.627871537130866e-06, + "loss": 0.6732, + "step": 10239 + }, + { + "epoch": 1.53, + "grad_norm": 3.8683534167201508, + "learning_rate": 1.627796342622846e-06, + "loss": 0.6849, + "step": 10240 + }, + { + "epoch": 1.53, + "grad_norm": 1.3983919213151395, + "learning_rate": 1.6277211422555474e-06, + "loss": 0.6797, + "step": 10241 + }, + { + "epoch": 1.53, + "grad_norm": 1.5341524821475971, + "learning_rate": 1.6276459360296716e-06, + "loss": 0.6569, + "step": 10242 + }, + { + "epoch": 1.53, + "grad_norm": 1.551259473386159, + "learning_rate": 1.627570723945921e-06, + "loss": 0.6816, + "step": 10243 + }, + { + "epoch": 1.53, + "grad_norm": 4.59239142106943, + "learning_rate": 1.6274955060049972e-06, + "loss": 0.6849, + "step": 10244 + }, + { + "epoch": 1.53, + "grad_norm": 4.678634048143368, + "learning_rate": 1.6274202822076024e-06, + "loss": 0.6725, + "step": 10245 + }, + { + "epoch": 1.53, + "grad_norm": 2.5182998445144045, + "learning_rate": 1.6273450525544386e-06, + "loss": 0.6556, + "step": 10246 + }, + { + "epoch": 1.53, + "grad_norm": 0.7173293665131881, + "learning_rate": 1.6272698170462077e-06, + "loss": 0.6745, + "step": 10247 + }, + { + "epoch": 1.53, + "grad_norm": 1.7211050351314325, + "learning_rate": 1.6271945756836125e-06, + "loss": 0.6758, + "step": 10248 + }, + { + "epoch": 1.53, + "grad_norm": 2.4628521836165356, + "learning_rate": 1.6271193284673545e-06, + "loss": 0.7116, + "step": 10249 + }, + { + "epoch": 1.53, + "grad_norm": 4.954219297355041, + "learning_rate": 1.6270440753981368e-06, + "loss": 0.6868, + "step": 10250 + }, + { + "epoch": 1.53, + "grad_norm": 1.0968153447687723, + "learning_rate": 1.626968816476661e-06, + "loss": 0.6803, + "step": 10251 + }, + { + "epoch": 1.53, + "grad_norm": 0.7529045146585827, + "learning_rate": 1.6268935517036297e-06, + "loss": 0.6517, + "step": 10252 + }, + { + "epoch": 1.53, + "grad_norm": 2.8802577719851397, + "learning_rate": 1.6268182810797458e-06, + "loss": 0.6803, + "step": 10253 + }, + { + "epoch": 1.53, + "grad_norm": 2.810232527936292, + "learning_rate": 1.626743004605711e-06, + "loss": 0.666, + "step": 10254 + }, + { + "epoch": 1.53, + "grad_norm": 3.750125218198303, + "learning_rate": 1.6266677222822287e-06, + "loss": 0.7064, + "step": 10255 + }, + { + "epoch": 1.53, + "grad_norm": 1.1385962040382558, + "learning_rate": 1.626592434110001e-06, + "loss": 0.6634, + "step": 10256 + }, + { + "epoch": 1.53, + "grad_norm": 2.3698431921831062, + "learning_rate": 1.6265171400897306e-06, + "loss": 0.6556, + "step": 10257 + }, + { + "epoch": 1.53, + "grad_norm": 5.142701844875042, + "learning_rate": 1.6264418402221206e-06, + "loss": 0.6836, + "step": 10258 + }, + { + "epoch": 1.53, + "grad_norm": 1.643140119537721, + "learning_rate": 1.626366534507873e-06, + "loss": 0.668, + "step": 10259 + }, + { + "epoch": 1.53, + "grad_norm": 2.3189399298953113, + "learning_rate": 1.6262912229476915e-06, + "loss": 0.6523, + "step": 10260 + }, + { + "epoch": 1.53, + "grad_norm": 0.7935042729905852, + "learning_rate": 1.6262159055422785e-06, + "loss": 0.6576, + "step": 10261 + }, + { + "epoch": 1.53, + "grad_norm": 0.8061731753004324, + "learning_rate": 1.6261405822923373e-06, + "loss": 0.6784, + "step": 10262 + }, + { + "epoch": 1.53, + "grad_norm": 0.8363536723769096, + "learning_rate": 1.6260652531985706e-06, + "loss": 0.6562, + "step": 10263 + }, + { + "epoch": 1.53, + "grad_norm": 1.4502414668921044, + "learning_rate": 1.6259899182616813e-06, + "loss": 0.6875, + "step": 10264 + }, + { + "epoch": 1.53, + "grad_norm": 0.7463163408981366, + "learning_rate": 1.6259145774823728e-06, + "loss": 0.6706, + "step": 10265 + }, + { + "epoch": 1.53, + "grad_norm": 2.764122146655114, + "learning_rate": 1.6258392308613484e-06, + "loss": 0.6888, + "step": 10266 + }, + { + "epoch": 1.53, + "grad_norm": 5.0865046850817155, + "learning_rate": 1.625763878399311e-06, + "loss": 0.6706, + "step": 10267 + }, + { + "epoch": 1.53, + "grad_norm": 1.4621657263317278, + "learning_rate": 1.625688520096964e-06, + "loss": 0.7005, + "step": 10268 + }, + { + "epoch": 1.53, + "grad_norm": 2.2590185105464897, + "learning_rate": 1.6256131559550105e-06, + "loss": 0.6882, + "step": 10269 + }, + { + "epoch": 1.53, + "grad_norm": 0.9253857550091246, + "learning_rate": 1.6255377859741545e-06, + "loss": 0.6751, + "step": 10270 + }, + { + "epoch": 1.53, + "grad_norm": 3.9315871619080514, + "learning_rate": 1.6254624101550987e-06, + "loss": 0.6777, + "step": 10271 + }, + { + "epoch": 1.53, + "grad_norm": 1.2262371059121546, + "learning_rate": 1.625387028498547e-06, + "loss": 0.6816, + "step": 10272 + }, + { + "epoch": 1.53, + "grad_norm": 1.7818965720107574, + "learning_rate": 1.625311641005203e-06, + "loss": 0.6849, + "step": 10273 + }, + { + "epoch": 1.53, + "grad_norm": 3.3743926992465485, + "learning_rate": 1.6252362476757697e-06, + "loss": 0.6562, + "step": 10274 + }, + { + "epoch": 1.53, + "grad_norm": 1.8283077736629882, + "learning_rate": 1.6251608485109519e-06, + "loss": 0.6999, + "step": 10275 + }, + { + "epoch": 1.53, + "grad_norm": 1.2735100480789725, + "learning_rate": 1.625085443511452e-06, + "loss": 0.6725, + "step": 10276 + }, + { + "epoch": 1.53, + "grad_norm": 1.378212324952378, + "learning_rate": 1.6250100326779746e-06, + "loss": 0.7018, + "step": 10277 + }, + { + "epoch": 1.53, + "grad_norm": 1.9177241480908265, + "learning_rate": 1.6249346160112233e-06, + "loss": 0.6654, + "step": 10278 + }, + { + "epoch": 1.53, + "grad_norm": 0.9846389662203885, + "learning_rate": 1.6248591935119022e-06, + "loss": 0.6556, + "step": 10279 + }, + { + "epoch": 1.53, + "grad_norm": 2.317685696027806, + "learning_rate": 1.6247837651807147e-06, + "loss": 0.6842, + "step": 10280 + }, + { + "epoch": 1.53, + "grad_norm": 4.406709687639959, + "learning_rate": 1.6247083310183649e-06, + "loss": 0.7031, + "step": 10281 + }, + { + "epoch": 1.53, + "grad_norm": 4.057843366874547, + "learning_rate": 1.6246328910255574e-06, + "loss": 0.6719, + "step": 10282 + }, + { + "epoch": 1.53, + "grad_norm": 3.2834437230059264, + "learning_rate": 1.6245574452029954e-06, + "loss": 0.6921, + "step": 10283 + }, + { + "epoch": 1.53, + "grad_norm": 5.140388752226303, + "learning_rate": 1.624481993551384e-06, + "loss": 0.6829, + "step": 10284 + }, + { + "epoch": 1.53, + "grad_norm": 2.836295121526413, + "learning_rate": 1.6244065360714267e-06, + "loss": 0.7012, + "step": 10285 + }, + { + "epoch": 1.53, + "grad_norm": 4.785551326687787, + "learning_rate": 1.6243310727638282e-06, + "loss": 0.6738, + "step": 10286 + }, + { + "epoch": 1.53, + "grad_norm": 2.442611807000822, + "learning_rate": 1.6242556036292924e-06, + "loss": 0.6367, + "step": 10287 + }, + { + "epoch": 1.53, + "grad_norm": 0.7092779912331559, + "learning_rate": 1.6241801286685239e-06, + "loss": 0.6797, + "step": 10288 + }, + { + "epoch": 1.53, + "grad_norm": 0.6807523592277771, + "learning_rate": 1.624104647882227e-06, + "loss": 0.6589, + "step": 10289 + }, + { + "epoch": 1.53, + "grad_norm": 1.9640673804602893, + "learning_rate": 1.6240291612711063e-06, + "loss": 0.6549, + "step": 10290 + }, + { + "epoch": 1.53, + "grad_norm": 0.768386547971458, + "learning_rate": 1.6239536688358662e-06, + "loss": 0.6686, + "step": 10291 + }, + { + "epoch": 1.53, + "grad_norm": 1.1268106982083896, + "learning_rate": 1.6238781705772113e-06, + "loss": 0.6862, + "step": 10292 + }, + { + "epoch": 1.54, + "grad_norm": 0.8007950386447416, + "learning_rate": 1.6238026664958465e-06, + "loss": 0.6445, + "step": 10293 + }, + { + "epoch": 1.54, + "grad_norm": 0.7380814479108405, + "learning_rate": 1.6237271565924759e-06, + "loss": 0.6895, + "step": 10294 + }, + { + "epoch": 1.54, + "grad_norm": 3.332159870948773, + "learning_rate": 1.6236516408678047e-06, + "loss": 0.7064, + "step": 10295 + }, + { + "epoch": 1.54, + "grad_norm": 1.9272970314236602, + "learning_rate": 1.623576119322538e-06, + "loss": 0.681, + "step": 10296 + }, + { + "epoch": 1.54, + "grad_norm": 2.4212373615672518, + "learning_rate": 1.6235005919573794e-06, + "loss": 0.6784, + "step": 10297 + }, + { + "epoch": 1.54, + "grad_norm": 1.1946902643028976, + "learning_rate": 1.6234250587730351e-06, + "loss": 0.6654, + "step": 10298 + }, + { + "epoch": 1.54, + "grad_norm": 1.4818673091315688, + "learning_rate": 1.6233495197702097e-06, + "loss": 0.6706, + "step": 10299 + }, + { + "epoch": 1.54, + "grad_norm": 1.0568815813564705, + "learning_rate": 1.623273974949608e-06, + "loss": 0.681, + "step": 10300 + }, + { + "epoch": 1.54, + "grad_norm": 3.1749948570188, + "learning_rate": 1.6231984243119351e-06, + "loss": 0.6849, + "step": 10301 + }, + { + "epoch": 1.54, + "grad_norm": 0.8902852780322059, + "learning_rate": 1.6231228678578962e-06, + "loss": 0.6595, + "step": 10302 + }, + { + "epoch": 1.54, + "grad_norm": 2.413186285748763, + "learning_rate": 1.6230473055881965e-06, + "loss": 0.6712, + "step": 10303 + }, + { + "epoch": 1.54, + "grad_norm": 1.5654114174447125, + "learning_rate": 1.6229717375035414e-06, + "loss": 0.6855, + "step": 10304 + }, + { + "epoch": 1.54, + "grad_norm": 6.260165424807542, + "learning_rate": 1.622896163604636e-06, + "loss": 0.668, + "step": 10305 + }, + { + "epoch": 1.54, + "grad_norm": 0.7561792209704052, + "learning_rate": 1.622820583892185e-06, + "loss": 0.6712, + "step": 10306 + }, + { + "epoch": 1.54, + "grad_norm": 0.9566194838320212, + "learning_rate": 1.622744998366895e-06, + "loss": 0.6595, + "step": 10307 + }, + { + "epoch": 1.54, + "grad_norm": 0.8920073461346159, + "learning_rate": 1.6226694070294706e-06, + "loss": 0.6855, + "step": 10308 + }, + { + "epoch": 1.54, + "grad_norm": 1.3236681698489543, + "learning_rate": 1.6225938098806178e-06, + "loss": 0.6654, + "step": 10309 + }, + { + "epoch": 1.54, + "grad_norm": 1.8041724735418005, + "learning_rate": 1.622518206921042e-06, + "loss": 0.681, + "step": 10310 + }, + { + "epoch": 1.54, + "grad_norm": 0.8624253069370428, + "learning_rate": 1.6224425981514483e-06, + "loss": 0.696, + "step": 10311 + }, + { + "epoch": 1.54, + "grad_norm": 1.5655836959200857, + "learning_rate": 1.622366983572543e-06, + "loss": 0.6582, + "step": 10312 + }, + { + "epoch": 1.54, + "grad_norm": 1.3023426686781667, + "learning_rate": 1.6222913631850314e-06, + "loss": 0.653, + "step": 10313 + }, + { + "epoch": 1.54, + "grad_norm": 2.1456280570902195, + "learning_rate": 1.6222157369896197e-06, + "loss": 0.6367, + "step": 10314 + }, + { + "epoch": 1.54, + "grad_norm": 2.668001587279381, + "learning_rate": 1.6221401049870133e-06, + "loss": 0.6628, + "step": 10315 + }, + { + "epoch": 1.54, + "grad_norm": 1.7439507443593811, + "learning_rate": 1.6220644671779187e-06, + "loss": 0.6413, + "step": 10316 + }, + { + "epoch": 1.54, + "grad_norm": 1.327873032148467, + "learning_rate": 1.6219888235630413e-06, + "loss": 0.7018, + "step": 10317 + }, + { + "epoch": 1.54, + "grad_norm": 0.9829974326608202, + "learning_rate": 1.6219131741430868e-06, + "loss": 0.6628, + "step": 10318 + }, + { + "epoch": 1.54, + "grad_norm": 1.1637074003834662, + "learning_rate": 1.6218375189187618e-06, + "loss": 0.6901, + "step": 10319 + }, + { + "epoch": 1.54, + "grad_norm": 3.05436967926895, + "learning_rate": 1.6217618578907723e-06, + "loss": 0.6732, + "step": 10320 + }, + { + "epoch": 1.54, + "grad_norm": 1.069736830476203, + "learning_rate": 1.6216861910598247e-06, + "loss": 0.696, + "step": 10321 + }, + { + "epoch": 1.54, + "grad_norm": 1.6785577335906408, + "learning_rate": 1.6216105184266248e-06, + "loss": 0.653, + "step": 10322 + }, + { + "epoch": 1.54, + "grad_norm": 2.164265326103103, + "learning_rate": 1.6215348399918787e-06, + "loss": 0.6868, + "step": 10323 + }, + { + "epoch": 1.54, + "grad_norm": 1.943248145300726, + "learning_rate": 1.6214591557562929e-06, + "loss": 0.7005, + "step": 10324 + }, + { + "epoch": 1.54, + "grad_norm": 2.8102200847432615, + "learning_rate": 1.6213834657205738e-06, + "loss": 0.6602, + "step": 10325 + }, + { + "epoch": 1.54, + "grad_norm": 0.9982958292995333, + "learning_rate": 1.6213077698854283e-06, + "loss": 0.6862, + "step": 10326 + }, + { + "epoch": 1.54, + "grad_norm": 4.016349713808062, + "learning_rate": 1.6212320682515621e-06, + "loss": 0.6543, + "step": 10327 + }, + { + "epoch": 1.54, + "grad_norm": 2.019242970468254, + "learning_rate": 1.6211563608196822e-06, + "loss": 0.6738, + "step": 10328 + }, + { + "epoch": 1.54, + "grad_norm": 2.58131986323554, + "learning_rate": 1.621080647590495e-06, + "loss": 0.6934, + "step": 10329 + }, + { + "epoch": 1.54, + "grad_norm": 1.8016794903308844, + "learning_rate": 1.6210049285647074e-06, + "loss": 0.6842, + "step": 10330 + }, + { + "epoch": 1.54, + "grad_norm": 2.3104563031084524, + "learning_rate": 1.6209292037430255e-06, + "loss": 0.6602, + "step": 10331 + }, + { + "epoch": 1.54, + "grad_norm": 3.9500694719970144, + "learning_rate": 1.6208534731261568e-06, + "loss": 0.6628, + "step": 10332 + }, + { + "epoch": 1.54, + "grad_norm": 5.815335827733258, + "learning_rate": 1.6207777367148076e-06, + "loss": 0.6901, + "step": 10333 + }, + { + "epoch": 1.54, + "grad_norm": 1.6106602085952437, + "learning_rate": 1.6207019945096847e-06, + "loss": 0.6641, + "step": 10334 + }, + { + "epoch": 1.54, + "grad_norm": 0.9335654738618404, + "learning_rate": 1.6206262465114954e-06, + "loss": 0.668, + "step": 10335 + }, + { + "epoch": 1.54, + "grad_norm": 3.9871359402541584, + "learning_rate": 1.6205504927209463e-06, + "loss": 0.6875, + "step": 10336 + }, + { + "epoch": 1.54, + "grad_norm": 2.4486935626367714, + "learning_rate": 1.6204747331387448e-06, + "loss": 0.6628, + "step": 10337 + }, + { + "epoch": 1.54, + "grad_norm": 2.746217134066712, + "learning_rate": 1.6203989677655977e-06, + "loss": 0.6673, + "step": 10338 + }, + { + "epoch": 1.54, + "grad_norm": 3.5443834761754296, + "learning_rate": 1.6203231966022119e-06, + "loss": 0.6953, + "step": 10339 + }, + { + "epoch": 1.54, + "grad_norm": 3.7000358389088794, + "learning_rate": 1.620247419649295e-06, + "loss": 0.6693, + "step": 10340 + }, + { + "epoch": 1.54, + "grad_norm": 4.04412248752864, + "learning_rate": 1.6201716369075543e-06, + "loss": 0.6816, + "step": 10341 + }, + { + "epoch": 1.54, + "grad_norm": 2.2586514283544883, + "learning_rate": 1.6200958483776966e-06, + "loss": 0.6595, + "step": 10342 + }, + { + "epoch": 1.54, + "grad_norm": 2.889012669517161, + "learning_rate": 1.6200200540604295e-06, + "loss": 0.6595, + "step": 10343 + }, + { + "epoch": 1.54, + "grad_norm": 2.8976146040744, + "learning_rate": 1.6199442539564606e-06, + "loss": 0.681, + "step": 10344 + }, + { + "epoch": 1.54, + "grad_norm": 3.5545503481266008, + "learning_rate": 1.6198684480664968e-06, + "loss": 0.6719, + "step": 10345 + }, + { + "epoch": 1.54, + "grad_norm": 2.540908788680958, + "learning_rate": 1.6197926363912461e-06, + "loss": 0.6855, + "step": 10346 + }, + { + "epoch": 1.54, + "grad_norm": 5.95318830408814, + "learning_rate": 1.619716818931416e-06, + "loss": 0.6628, + "step": 10347 + }, + { + "epoch": 1.54, + "grad_norm": 4.981416715286128, + "learning_rate": 1.6196409956877142e-06, + "loss": 0.6621, + "step": 10348 + }, + { + "epoch": 1.54, + "grad_norm": 1.73716482164375, + "learning_rate": 1.619565166660848e-06, + "loss": 0.6673, + "step": 10349 + }, + { + "epoch": 1.54, + "grad_norm": 1.122998877901755, + "learning_rate": 1.6194893318515255e-06, + "loss": 0.6654, + "step": 10350 + }, + { + "epoch": 1.54, + "grad_norm": 5.198227964604503, + "learning_rate": 1.619413491260454e-06, + "loss": 0.6797, + "step": 10351 + }, + { + "epoch": 1.54, + "grad_norm": 1.750103007225608, + "learning_rate": 1.6193376448883418e-06, + "loss": 0.6797, + "step": 10352 + }, + { + "epoch": 1.54, + "grad_norm": 1.1045226776613553, + "learning_rate": 1.6192617927358963e-06, + "loss": 0.6608, + "step": 10353 + }, + { + "epoch": 1.54, + "grad_norm": 3.2814242860591682, + "learning_rate": 1.6191859348038261e-06, + "loss": 0.6562, + "step": 10354 + }, + { + "epoch": 1.54, + "grad_norm": 2.14377832476718, + "learning_rate": 1.6191100710928387e-06, + "loss": 0.6615, + "step": 10355 + }, + { + "epoch": 1.54, + "grad_norm": 2.6870690612340367, + "learning_rate": 1.6190342016036422e-06, + "loss": 0.6797, + "step": 10356 + }, + { + "epoch": 1.54, + "grad_norm": 2.3209144662399384, + "learning_rate": 1.6189583263369445e-06, + "loss": 0.6517, + "step": 10357 + }, + { + "epoch": 1.54, + "grad_norm": 3.5603025809848408, + "learning_rate": 1.6188824452934542e-06, + "loss": 0.6862, + "step": 10358 + }, + { + "epoch": 1.54, + "grad_norm": 4.351726551970594, + "learning_rate": 1.6188065584738795e-06, + "loss": 0.6849, + "step": 10359 + }, + { + "epoch": 1.55, + "grad_norm": 2.192022690367436, + "learning_rate": 1.6187306658789282e-06, + "loss": 0.6458, + "step": 10360 + }, + { + "epoch": 1.55, + "grad_norm": 5.788007555369339, + "learning_rate": 1.6186547675093093e-06, + "loss": 0.7051, + "step": 10361 + }, + { + "epoch": 1.55, + "grad_norm": 4.504266123160117, + "learning_rate": 1.6185788633657303e-06, + "loss": 0.6673, + "step": 10362 + }, + { + "epoch": 1.55, + "grad_norm": 1.6865134965610802, + "learning_rate": 1.6185029534489e-06, + "loss": 0.6797, + "step": 10363 + }, + { + "epoch": 1.55, + "grad_norm": 0.9341424785068296, + "learning_rate": 1.6184270377595272e-06, + "loss": 0.6647, + "step": 10364 + }, + { + "epoch": 1.55, + "grad_norm": 2.0815860615065533, + "learning_rate": 1.6183511162983203e-06, + "loss": 0.6647, + "step": 10365 + }, + { + "epoch": 1.55, + "grad_norm": 7.5945081871429245, + "learning_rate": 1.6182751890659876e-06, + "loss": 0.6908, + "step": 10366 + }, + { + "epoch": 1.55, + "grad_norm": 0.8763319451401799, + "learning_rate": 1.6181992560632379e-06, + "loss": 0.6901, + "step": 10367 + }, + { + "epoch": 1.55, + "grad_norm": 4.741498208980225, + "learning_rate": 1.6181233172907796e-06, + "loss": 0.6888, + "step": 10368 + }, + { + "epoch": 1.55, + "grad_norm": 1.8143414862425047, + "learning_rate": 1.618047372749322e-06, + "loss": 0.6621, + "step": 10369 + }, + { + "epoch": 1.55, + "grad_norm": 6.329865665458528, + "learning_rate": 1.6179714224395736e-06, + "loss": 0.7077, + "step": 10370 + }, + { + "epoch": 1.55, + "grad_norm": 3.5759771294002705, + "learning_rate": 1.6178954663622432e-06, + "loss": 0.6725, + "step": 10371 + }, + { + "epoch": 1.55, + "grad_norm": 2.214199554003566, + "learning_rate": 1.6178195045180397e-06, + "loss": 0.6764, + "step": 10372 + }, + { + "epoch": 1.55, + "grad_norm": 1.1235622170957353, + "learning_rate": 1.6177435369076722e-06, + "loss": 0.7038, + "step": 10373 + }, + { + "epoch": 1.55, + "grad_norm": 2.5525366613637863, + "learning_rate": 1.6176675635318496e-06, + "loss": 0.6523, + "step": 10374 + }, + { + "epoch": 1.55, + "grad_norm": 2.581232288898547, + "learning_rate": 1.617591584391281e-06, + "loss": 0.6706, + "step": 10375 + }, + { + "epoch": 1.55, + "grad_norm": 3.3223048462399847, + "learning_rate": 1.6175155994866758e-06, + "loss": 0.6589, + "step": 10376 + }, + { + "epoch": 1.55, + "grad_norm": 3.72471007887352, + "learning_rate": 1.6174396088187425e-06, + "loss": 0.6849, + "step": 10377 + }, + { + "epoch": 1.55, + "grad_norm": 2.330658386130743, + "learning_rate": 1.6173636123881909e-06, + "loss": 0.6686, + "step": 10378 + }, + { + "epoch": 1.55, + "grad_norm": 0.8001712346950672, + "learning_rate": 1.6172876101957302e-06, + "loss": 0.6615, + "step": 10379 + }, + { + "epoch": 1.55, + "grad_norm": 5.383317877560884, + "learning_rate": 1.6172116022420696e-06, + "loss": 0.6712, + "step": 10380 + }, + { + "epoch": 1.55, + "grad_norm": 1.5328668184902223, + "learning_rate": 1.6171355885279182e-06, + "loss": 0.6673, + "step": 10381 + }, + { + "epoch": 1.55, + "grad_norm": 3.174624306618927, + "learning_rate": 1.6170595690539864e-06, + "loss": 0.6634, + "step": 10382 + }, + { + "epoch": 1.55, + "grad_norm": 3.2616399382115104, + "learning_rate": 1.6169835438209827e-06, + "loss": 0.6836, + "step": 10383 + }, + { + "epoch": 1.55, + "grad_norm": 2.4267215406453206, + "learning_rate": 1.616907512829617e-06, + "loss": 0.6895, + "step": 10384 + }, + { + "epoch": 1.55, + "grad_norm": 3.131815108711594, + "learning_rate": 1.616831476080599e-06, + "loss": 0.6589, + "step": 10385 + }, + { + "epoch": 1.55, + "grad_norm": 3.1992642015711996, + "learning_rate": 1.6167554335746383e-06, + "loss": 0.6829, + "step": 10386 + }, + { + "epoch": 1.55, + "grad_norm": 2.069198644927804, + "learning_rate": 1.6166793853124446e-06, + "loss": 0.6608, + "step": 10387 + }, + { + "epoch": 1.55, + "grad_norm": 1.8047024208391222, + "learning_rate": 1.6166033312947277e-06, + "loss": 0.6582, + "step": 10388 + }, + { + "epoch": 1.55, + "grad_norm": 4.896511442813455, + "learning_rate": 1.6165272715221971e-06, + "loss": 0.6895, + "step": 10389 + }, + { + "epoch": 1.55, + "grad_norm": 2.0310259108127324, + "learning_rate": 1.6164512059955634e-06, + "loss": 0.6908, + "step": 10390 + }, + { + "epoch": 1.55, + "grad_norm": 0.7994083102015301, + "learning_rate": 1.6163751347155356e-06, + "loss": 0.6784, + "step": 10391 + }, + { + "epoch": 1.55, + "grad_norm": 1.9506858265245428, + "learning_rate": 1.6162990576828243e-06, + "loss": 0.651, + "step": 10392 + }, + { + "epoch": 1.55, + "grad_norm": 3.4007130857528476, + "learning_rate": 1.6162229748981395e-06, + "loss": 0.6803, + "step": 10393 + }, + { + "epoch": 1.55, + "grad_norm": 2.798553516880961, + "learning_rate": 1.6161468863621912e-06, + "loss": 0.6758, + "step": 10394 + }, + { + "epoch": 1.55, + "grad_norm": 4.206464170282407, + "learning_rate": 1.6160707920756896e-06, + "loss": 0.6764, + "step": 10395 + }, + { + "epoch": 1.55, + "grad_norm": 3.9388289846270292, + "learning_rate": 1.6159946920393446e-06, + "loss": 0.6764, + "step": 10396 + }, + { + "epoch": 1.55, + "grad_norm": 0.8489396704147095, + "learning_rate": 1.6159185862538667e-06, + "loss": 0.6706, + "step": 10397 + }, + { + "epoch": 1.55, + "grad_norm": 2.1757773705151244, + "learning_rate": 1.6158424747199663e-06, + "loss": 0.7038, + "step": 10398 + }, + { + "epoch": 1.55, + "grad_norm": 2.2249919493466384, + "learning_rate": 1.6157663574383538e-06, + "loss": 0.6562, + "step": 10399 + }, + { + "epoch": 1.55, + "grad_norm": 7.365167283942752, + "learning_rate": 1.615690234409739e-06, + "loss": 0.6569, + "step": 10400 + }, + { + "epoch": 1.55, + "grad_norm": 1.369963269935487, + "learning_rate": 1.6156141056348326e-06, + "loss": 0.681, + "step": 10401 + }, + { + "epoch": 1.55, + "grad_norm": 3.650694683702199, + "learning_rate": 1.6155379711143459e-06, + "loss": 0.6738, + "step": 10402 + }, + { + "epoch": 1.55, + "grad_norm": 0.8692712755574945, + "learning_rate": 1.6154618308489888e-06, + "loss": 0.6699, + "step": 10403 + }, + { + "epoch": 1.55, + "grad_norm": 1.5486848130848325, + "learning_rate": 1.6153856848394719e-06, + "loss": 0.6426, + "step": 10404 + }, + { + "epoch": 1.55, + "grad_norm": 2.3769222281434983, + "learning_rate": 1.6153095330865058e-06, + "loss": 0.6784, + "step": 10405 + }, + { + "epoch": 1.55, + "grad_norm": 1.3887591117591862, + "learning_rate": 1.6152333755908017e-06, + "loss": 0.6725, + "step": 10406 + }, + { + "epoch": 1.55, + "grad_norm": 1.410583961766482, + "learning_rate": 1.6151572123530698e-06, + "loss": 0.6641, + "step": 10407 + }, + { + "epoch": 1.55, + "grad_norm": 1.176482983639999, + "learning_rate": 1.6150810433740215e-06, + "loss": 0.6706, + "step": 10408 + }, + { + "epoch": 1.55, + "grad_norm": 3.2539466590024766, + "learning_rate": 1.6150048686543673e-06, + "loss": 0.6777, + "step": 10409 + }, + { + "epoch": 1.55, + "grad_norm": 1.9741635487264337, + "learning_rate": 1.6149286881948183e-06, + "loss": 0.6888, + "step": 10410 + }, + { + "epoch": 1.55, + "grad_norm": 2.8209258110351887, + "learning_rate": 1.6148525019960855e-06, + "loss": 0.6784, + "step": 10411 + }, + { + "epoch": 1.55, + "grad_norm": 1.1262730757025194, + "learning_rate": 1.6147763100588798e-06, + "loss": 0.6536, + "step": 10412 + }, + { + "epoch": 1.55, + "grad_norm": 1.229446980983648, + "learning_rate": 1.6147001123839123e-06, + "loss": 0.6419, + "step": 10413 + }, + { + "epoch": 1.55, + "grad_norm": 2.1248102650360874, + "learning_rate": 1.6146239089718948e-06, + "loss": 0.6712, + "step": 10414 + }, + { + "epoch": 1.55, + "grad_norm": 0.883114823114334, + "learning_rate": 1.6145476998235376e-06, + "loss": 0.6816, + "step": 10415 + }, + { + "epoch": 1.55, + "grad_norm": 6.456584065868253, + "learning_rate": 1.6144714849395525e-06, + "loss": 0.6868, + "step": 10416 + }, + { + "epoch": 1.55, + "grad_norm": 1.349353327782616, + "learning_rate": 1.6143952643206506e-06, + "loss": 0.666, + "step": 10417 + }, + { + "epoch": 1.55, + "grad_norm": 2.2332579356050517, + "learning_rate": 1.6143190379675438e-06, + "loss": 0.6816, + "step": 10418 + }, + { + "epoch": 1.55, + "grad_norm": 1.0158465766087084, + "learning_rate": 1.6142428058809425e-06, + "loss": 0.6706, + "step": 10419 + }, + { + "epoch": 1.55, + "grad_norm": 1.0892002413418176, + "learning_rate": 1.614166568061559e-06, + "loss": 0.6764, + "step": 10420 + }, + { + "epoch": 1.55, + "grad_norm": 1.4195318732827586, + "learning_rate": 1.6140903245101046e-06, + "loss": 0.6764, + "step": 10421 + }, + { + "epoch": 1.55, + "grad_norm": 1.061846028194124, + "learning_rate": 1.614014075227291e-06, + "loss": 0.6758, + "step": 10422 + }, + { + "epoch": 1.55, + "grad_norm": 0.9899584425524364, + "learning_rate": 1.6139378202138295e-06, + "loss": 0.6634, + "step": 10423 + }, + { + "epoch": 1.55, + "grad_norm": 0.955512872978637, + "learning_rate": 1.613861559470432e-06, + "loss": 0.6855, + "step": 10424 + }, + { + "epoch": 1.55, + "grad_norm": 1.7157983782957074, + "learning_rate": 1.6137852929978107e-06, + "loss": 0.7077, + "step": 10425 + }, + { + "epoch": 1.55, + "grad_norm": 1.1569380509452167, + "learning_rate": 1.6137090207966766e-06, + "loss": 0.6895, + "step": 10426 + }, + { + "epoch": 1.56, + "grad_norm": 1.0958081136112015, + "learning_rate": 1.6136327428677421e-06, + "loss": 0.6432, + "step": 10427 + }, + { + "epoch": 1.56, + "grad_norm": 2.8908274638195035, + "learning_rate": 1.613556459211719e-06, + "loss": 0.6432, + "step": 10428 + }, + { + "epoch": 1.56, + "grad_norm": 0.9807690588725883, + "learning_rate": 1.613480169829319e-06, + "loss": 0.6849, + "step": 10429 + }, + { + "epoch": 1.56, + "grad_norm": 3.7634343014871123, + "learning_rate": 1.6134038747212544e-06, + "loss": 0.6504, + "step": 10430 + }, + { + "epoch": 1.56, + "grad_norm": 0.8422792012989204, + "learning_rate": 1.6133275738882373e-06, + "loss": 0.651, + "step": 10431 + }, + { + "epoch": 1.56, + "grad_norm": 0.9802672663108236, + "learning_rate": 1.6132512673309794e-06, + "loss": 0.6634, + "step": 10432 + }, + { + "epoch": 1.56, + "grad_norm": 5.769275077767195, + "learning_rate": 1.6131749550501937e-06, + "loss": 0.6973, + "step": 10433 + }, + { + "epoch": 1.56, + "grad_norm": 2.618417782396874, + "learning_rate": 1.6130986370465914e-06, + "loss": 0.6771, + "step": 10434 + }, + { + "epoch": 1.56, + "grad_norm": 1.2837721955170236, + "learning_rate": 1.6130223133208857e-06, + "loss": 0.6484, + "step": 10435 + }, + { + "epoch": 1.56, + "grad_norm": 0.8728846283375432, + "learning_rate": 1.6129459838737885e-06, + "loss": 0.6719, + "step": 10436 + }, + { + "epoch": 1.56, + "grad_norm": 1.6443744587216584, + "learning_rate": 1.6128696487060118e-06, + "loss": 0.6686, + "step": 10437 + }, + { + "epoch": 1.56, + "grad_norm": 1.414006712068593, + "learning_rate": 1.612793307818269e-06, + "loss": 0.6842, + "step": 10438 + }, + { + "epoch": 1.56, + "grad_norm": 2.880968944746577, + "learning_rate": 1.6127169612112717e-06, + "loss": 0.6868, + "step": 10439 + }, + { + "epoch": 1.56, + "grad_norm": 1.924138063683255, + "learning_rate": 1.6126406088857332e-06, + "loss": 0.6647, + "step": 10440 + }, + { + "epoch": 1.56, + "grad_norm": 3.8105705087558817, + "learning_rate": 1.6125642508423653e-06, + "loss": 0.7051, + "step": 10441 + }, + { + "epoch": 1.56, + "grad_norm": 2.8269868290764024, + "learning_rate": 1.6124878870818812e-06, + "loss": 0.6764, + "step": 10442 + }, + { + "epoch": 1.56, + "grad_norm": 0.8412735988853681, + "learning_rate": 1.6124115176049935e-06, + "loss": 0.6751, + "step": 10443 + }, + { + "epoch": 1.56, + "grad_norm": 4.29874572931978, + "learning_rate": 1.612335142412415e-06, + "loss": 0.7038, + "step": 10444 + }, + { + "epoch": 1.56, + "grad_norm": 3.2465152967572943, + "learning_rate": 1.6122587615048585e-06, + "loss": 0.666, + "step": 10445 + }, + { + "epoch": 1.56, + "grad_norm": 4.673608730443005, + "learning_rate": 1.6121823748830367e-06, + "loss": 0.6725, + "step": 10446 + }, + { + "epoch": 1.56, + "grad_norm": 3.003444823455845, + "learning_rate": 1.6121059825476628e-06, + "loss": 0.6641, + "step": 10447 + }, + { + "epoch": 1.56, + "grad_norm": 5.278584169801551, + "learning_rate": 1.6120295844994495e-06, + "loss": 0.6712, + "step": 10448 + }, + { + "epoch": 1.56, + "grad_norm": 2.701231225212683, + "learning_rate": 1.6119531807391101e-06, + "loss": 0.6855, + "step": 10449 + }, + { + "epoch": 1.56, + "grad_norm": 3.5803940278464905, + "learning_rate": 1.6118767712673576e-06, + "loss": 0.6855, + "step": 10450 + }, + { + "epoch": 1.56, + "grad_norm": 1.0860018128550684, + "learning_rate": 1.6118003560849047e-06, + "loss": 0.6901, + "step": 10451 + }, + { + "epoch": 1.56, + "grad_norm": 3.753730623367058, + "learning_rate": 1.6117239351924654e-06, + "loss": 0.6497, + "step": 10452 + }, + { + "epoch": 1.56, + "grad_norm": 2.3078542400341875, + "learning_rate": 1.6116475085907522e-06, + "loss": 0.6947, + "step": 10453 + }, + { + "epoch": 1.56, + "grad_norm": 1.9325891366307528, + "learning_rate": 1.611571076280479e-06, + "loss": 0.6927, + "step": 10454 + }, + { + "epoch": 1.56, + "grad_norm": 4.343129013444942, + "learning_rate": 1.6114946382623587e-06, + "loss": 0.6849, + "step": 10455 + }, + { + "epoch": 1.56, + "grad_norm": 0.7728169738890908, + "learning_rate": 1.6114181945371047e-06, + "loss": 0.6439, + "step": 10456 + }, + { + "epoch": 1.56, + "grad_norm": 1.387393004323564, + "learning_rate": 1.6113417451054309e-06, + "loss": 0.6699, + "step": 10457 + }, + { + "epoch": 1.56, + "grad_norm": 3.5437382887293705, + "learning_rate": 1.6112652899680504e-06, + "loss": 0.679, + "step": 10458 + }, + { + "epoch": 1.56, + "grad_norm": 1.1233492622932046, + "learning_rate": 1.6111888291256768e-06, + "loss": 0.6784, + "step": 10459 + }, + { + "epoch": 1.56, + "grad_norm": 4.490913850229282, + "learning_rate": 1.611112362579024e-06, + "loss": 0.6582, + "step": 10460 + }, + { + "epoch": 1.56, + "grad_norm": 0.6004067589826256, + "learning_rate": 1.6110358903288056e-06, + "loss": 0.6751, + "step": 10461 + }, + { + "epoch": 1.56, + "grad_norm": 0.678892270608283, + "learning_rate": 1.6109594123757348e-06, + "loss": 0.6784, + "step": 10462 + }, + { + "epoch": 1.56, + "grad_norm": 2.76935470413852, + "learning_rate": 1.6108829287205258e-06, + "loss": 0.6582, + "step": 10463 + }, + { + "epoch": 1.56, + "grad_norm": 3.372485025747459, + "learning_rate": 1.6108064393638926e-06, + "loss": 0.6641, + "step": 10464 + }, + { + "epoch": 1.56, + "grad_norm": 0.7081553294186177, + "learning_rate": 1.6107299443065491e-06, + "loss": 0.6582, + "step": 10465 + }, + { + "epoch": 1.56, + "grad_norm": 5.314668946029745, + "learning_rate": 1.6106534435492088e-06, + "loss": 0.6699, + "step": 10466 + }, + { + "epoch": 1.56, + "grad_norm": 1.3340896894664265, + "learning_rate": 1.6105769370925857e-06, + "loss": 0.6719, + "step": 10467 + }, + { + "epoch": 1.56, + "grad_norm": 2.9546058309904315, + "learning_rate": 1.6105004249373946e-06, + "loss": 0.7064, + "step": 10468 + }, + { + "epoch": 1.56, + "grad_norm": 1.569812357202883, + "learning_rate": 1.6104239070843485e-06, + "loss": 0.681, + "step": 10469 + }, + { + "epoch": 1.56, + "grad_norm": 0.8295144847152031, + "learning_rate": 1.6103473835341622e-06, + "loss": 0.681, + "step": 10470 + }, + { + "epoch": 1.56, + "grad_norm": 1.3793225498629087, + "learning_rate": 1.61027085428755e-06, + "loss": 0.6621, + "step": 10471 + }, + { + "epoch": 1.56, + "grad_norm": 2.5435876234694472, + "learning_rate": 1.6101943193452257e-06, + "loss": 0.6927, + "step": 10472 + }, + { + "epoch": 1.56, + "grad_norm": 1.7012742021446208, + "learning_rate": 1.610117778707904e-06, + "loss": 0.6608, + "step": 10473 + }, + { + "epoch": 1.56, + "grad_norm": 1.9449859337811843, + "learning_rate": 1.6100412323762992e-06, + "loss": 0.6829, + "step": 10474 + }, + { + "epoch": 1.56, + "grad_norm": 3.1959659774631164, + "learning_rate": 1.6099646803511253e-06, + "loss": 0.6764, + "step": 10475 + }, + { + "epoch": 1.56, + "grad_norm": 0.7992540534575984, + "learning_rate": 1.6098881226330973e-06, + "loss": 0.6602, + "step": 10476 + }, + { + "epoch": 1.56, + "grad_norm": 2.6723373752643966, + "learning_rate": 1.6098115592229295e-06, + "loss": 0.6712, + "step": 10477 + }, + { + "epoch": 1.56, + "grad_norm": 0.7289105751564607, + "learning_rate": 1.6097349901213367e-06, + "loss": 0.6686, + "step": 10478 + }, + { + "epoch": 1.56, + "grad_norm": 4.0699256618476465, + "learning_rate": 1.609658415329033e-06, + "loss": 0.6719, + "step": 10479 + }, + { + "epoch": 1.56, + "grad_norm": 0.7942858259121693, + "learning_rate": 1.6095818348467333e-06, + "loss": 0.6517, + "step": 10480 + }, + { + "epoch": 1.56, + "grad_norm": 2.6811791266427902, + "learning_rate": 1.6095052486751528e-06, + "loss": 0.6882, + "step": 10481 + }, + { + "epoch": 1.56, + "grad_norm": 2.840642315135, + "learning_rate": 1.6094286568150058e-06, + "loss": 0.6302, + "step": 10482 + }, + { + "epoch": 1.56, + "grad_norm": 0.9069955900969143, + "learning_rate": 1.609352059267007e-06, + "loss": 0.6862, + "step": 10483 + }, + { + "epoch": 1.56, + "grad_norm": 2.5729942039322156, + "learning_rate": 1.609275456031872e-06, + "loss": 0.6966, + "step": 10484 + }, + { + "epoch": 1.56, + "grad_norm": 5.059352259643909, + "learning_rate": 1.609198847110315e-06, + "loss": 0.6888, + "step": 10485 + }, + { + "epoch": 1.56, + "grad_norm": 1.0541187815473374, + "learning_rate": 1.6091222325030513e-06, + "loss": 0.6934, + "step": 10486 + }, + { + "epoch": 1.56, + "grad_norm": 2.9819551252196392, + "learning_rate": 1.6090456122107958e-06, + "loss": 0.6999, + "step": 10487 + }, + { + "epoch": 1.56, + "grad_norm": 1.0696898906949532, + "learning_rate": 1.6089689862342642e-06, + "loss": 0.6257, + "step": 10488 + }, + { + "epoch": 1.56, + "grad_norm": 0.9021072518924748, + "learning_rate": 1.6088923545741708e-06, + "loss": 0.6953, + "step": 10489 + }, + { + "epoch": 1.56, + "grad_norm": 1.2898816622882154, + "learning_rate": 1.6088157172312314e-06, + "loss": 0.6784, + "step": 10490 + }, + { + "epoch": 1.56, + "grad_norm": 1.8671226129773821, + "learning_rate": 1.608739074206161e-06, + "loss": 0.6836, + "step": 10491 + }, + { + "epoch": 1.56, + "grad_norm": 0.9630630986460217, + "learning_rate": 1.6086624254996748e-06, + "loss": 0.6673, + "step": 10492 + }, + { + "epoch": 1.56, + "grad_norm": 0.9534212601201989, + "learning_rate": 1.6085857711124886e-06, + "loss": 0.6751, + "step": 10493 + }, + { + "epoch": 1.57, + "grad_norm": 0.8063841834616705, + "learning_rate": 1.6085091110453176e-06, + "loss": 0.653, + "step": 10494 + }, + { + "epoch": 1.57, + "grad_norm": 0.9418307657614774, + "learning_rate": 1.6084324452988768e-06, + "loss": 0.6589, + "step": 10495 + }, + { + "epoch": 1.57, + "grad_norm": 0.966916627813492, + "learning_rate": 1.6083557738738827e-06, + "loss": 0.6276, + "step": 10496 + }, + { + "epoch": 1.57, + "grad_norm": 1.0025156114418055, + "learning_rate": 1.6082790967710502e-06, + "loss": 0.6641, + "step": 10497 + }, + { + "epoch": 1.57, + "grad_norm": 1.1493269929461927, + "learning_rate": 1.608202413991095e-06, + "loss": 0.6497, + "step": 10498 + }, + { + "epoch": 1.57, + "grad_norm": 0.8445852254329119, + "learning_rate": 1.6081257255347331e-06, + "loss": 0.668, + "step": 10499 + }, + { + "epoch": 1.57, + "grad_norm": 1.8219434825447665, + "learning_rate": 1.6080490314026799e-06, + "loss": 0.6641, + "step": 10500 + }, + { + "epoch": 1.57, + "grad_norm": 2.056938920929201, + "learning_rate": 1.6079723315956516e-06, + "loss": 0.6654, + "step": 10501 + }, + { + "epoch": 1.57, + "grad_norm": 1.7341374336036666, + "learning_rate": 1.6078956261143631e-06, + "loss": 0.64, + "step": 10502 + }, + { + "epoch": 1.57, + "grad_norm": 3.5803581192531526, + "learning_rate": 1.6078189149595313e-06, + "loss": 0.6589, + "step": 10503 + }, + { + "epoch": 1.57, + "grad_norm": 4.44271908965502, + "learning_rate": 1.6077421981318719e-06, + "loss": 0.6719, + "step": 10504 + }, + { + "epoch": 1.57, + "grad_norm": 2.4986974029924798, + "learning_rate": 1.6076654756321008e-06, + "loss": 0.6576, + "step": 10505 + }, + { + "epoch": 1.57, + "grad_norm": 3.6179058766408483, + "learning_rate": 1.6075887474609342e-06, + "loss": 0.679, + "step": 10506 + }, + { + "epoch": 1.57, + "grad_norm": 2.761890268668514, + "learning_rate": 1.6075120136190878e-06, + "loss": 0.6732, + "step": 10507 + }, + { + "epoch": 1.57, + "grad_norm": 1.0889639481834938, + "learning_rate": 1.6074352741072783e-06, + "loss": 0.6641, + "step": 10508 + }, + { + "epoch": 1.57, + "grad_norm": 1.0308724621585843, + "learning_rate": 1.6073585289262214e-06, + "loss": 0.6185, + "step": 10509 + }, + { + "epoch": 1.57, + "grad_norm": 5.3667034155828235, + "learning_rate": 1.6072817780766336e-06, + "loss": 0.6862, + "step": 10510 + }, + { + "epoch": 1.57, + "grad_norm": 2.4876183823846394, + "learning_rate": 1.6072050215592316e-06, + "loss": 0.6576, + "step": 10511 + }, + { + "epoch": 1.57, + "grad_norm": 6.562099297514402, + "learning_rate": 1.6071282593747313e-06, + "loss": 0.7012, + "step": 10512 + }, + { + "epoch": 1.57, + "grad_norm": 3.1425871169952027, + "learning_rate": 1.6070514915238494e-06, + "loss": 0.6361, + "step": 10513 + }, + { + "epoch": 1.57, + "grad_norm": 2.568115755048413, + "learning_rate": 1.606974718007302e-06, + "loss": 0.6621, + "step": 10514 + }, + { + "epoch": 1.57, + "grad_norm": 2.268286407262168, + "learning_rate": 1.6068979388258058e-06, + "loss": 0.6667, + "step": 10515 + }, + { + "epoch": 1.57, + "grad_norm": 4.230876487428838, + "learning_rate": 1.6068211539800779e-06, + "loss": 0.7012, + "step": 10516 + }, + { + "epoch": 1.57, + "grad_norm": 2.5971326087346056, + "learning_rate": 1.6067443634708342e-06, + "loss": 0.6673, + "step": 10517 + }, + { + "epoch": 1.57, + "grad_norm": 3.8659130673162037, + "learning_rate": 1.6066675672987917e-06, + "loss": 0.694, + "step": 10518 + }, + { + "epoch": 1.57, + "grad_norm": 1.4778930943929722, + "learning_rate": 1.6065907654646673e-06, + "loss": 0.6608, + "step": 10519 + }, + { + "epoch": 1.57, + "grad_norm": 2.1795504921000597, + "learning_rate": 1.6065139579691773e-06, + "loss": 0.6738, + "step": 10520 + }, + { + "epoch": 1.57, + "grad_norm": 3.2139336278060138, + "learning_rate": 1.6064371448130394e-06, + "loss": 0.6895, + "step": 10521 + }, + { + "epoch": 1.57, + "grad_norm": 5.688882433914158, + "learning_rate": 1.6063603259969696e-06, + "loss": 0.6647, + "step": 10522 + }, + { + "epoch": 1.57, + "grad_norm": 5.571281083275598, + "learning_rate": 1.6062835015216854e-06, + "loss": 0.6654, + "step": 10523 + }, + { + "epoch": 1.57, + "grad_norm": 2.347959191372006, + "learning_rate": 1.6062066713879037e-06, + "loss": 0.6725, + "step": 10524 + }, + { + "epoch": 1.57, + "grad_norm": 4.991300195623274, + "learning_rate": 1.6061298355963414e-06, + "loss": 0.6849, + "step": 10525 + }, + { + "epoch": 1.57, + "grad_norm": 1.7359229731184926, + "learning_rate": 1.6060529941477157e-06, + "loss": 0.7012, + "step": 10526 + }, + { + "epoch": 1.57, + "grad_norm": 2.9510153699113486, + "learning_rate": 1.605976147042744e-06, + "loss": 0.6777, + "step": 10527 + }, + { + "epoch": 1.57, + "grad_norm": 3.497766326398461, + "learning_rate": 1.605899294282143e-06, + "loss": 0.6751, + "step": 10528 + }, + { + "epoch": 1.57, + "grad_norm": 0.8911976721575352, + "learning_rate": 1.6058224358666307e-06, + "loss": 0.6413, + "step": 10529 + }, + { + "epoch": 1.57, + "grad_norm": 0.8074764355196067, + "learning_rate": 1.6057455717969237e-06, + "loss": 0.6725, + "step": 10530 + }, + { + "epoch": 1.57, + "grad_norm": 2.232085591843132, + "learning_rate": 1.60566870207374e-06, + "loss": 0.6816, + "step": 10531 + }, + { + "epoch": 1.57, + "grad_norm": 1.2507801545933415, + "learning_rate": 1.6055918266977969e-06, + "loss": 0.6595, + "step": 10532 + }, + { + "epoch": 1.57, + "grad_norm": 1.1183630046251014, + "learning_rate": 1.6055149456698112e-06, + "loss": 0.7292, + "step": 10533 + }, + { + "epoch": 1.57, + "grad_norm": 4.111647265753686, + "learning_rate": 1.6054380589905012e-06, + "loss": 0.6673, + "step": 10534 + }, + { + "epoch": 1.57, + "grad_norm": 2.39721277268262, + "learning_rate": 1.6053611666605841e-06, + "loss": 0.6667, + "step": 10535 + }, + { + "epoch": 1.57, + "grad_norm": 1.8732491309080523, + "learning_rate": 1.6052842686807782e-06, + "loss": 0.6465, + "step": 10536 + }, + { + "epoch": 1.57, + "grad_norm": 0.8850797768324148, + "learning_rate": 1.6052073650518003e-06, + "loss": 0.6901, + "step": 10537 + }, + { + "epoch": 1.57, + "grad_norm": 0.7271359044288235, + "learning_rate": 1.6051304557743683e-06, + "loss": 0.6673, + "step": 10538 + }, + { + "epoch": 1.57, + "grad_norm": 5.17085331479119, + "learning_rate": 1.6050535408492008e-06, + "loss": 0.6842, + "step": 10539 + }, + { + "epoch": 1.57, + "grad_norm": 1.5805172998029935, + "learning_rate": 1.6049766202770147e-06, + "loss": 0.6901, + "step": 10540 + }, + { + "epoch": 1.57, + "grad_norm": 1.0262321421689766, + "learning_rate": 1.6048996940585285e-06, + "loss": 0.6764, + "step": 10541 + }, + { + "epoch": 1.57, + "grad_norm": 2.994189389193963, + "learning_rate": 1.60482276219446e-06, + "loss": 0.6419, + "step": 10542 + }, + { + "epoch": 1.57, + "grad_norm": 2.2393789170433736, + "learning_rate": 1.6047458246855267e-06, + "loss": 0.6582, + "step": 10543 + }, + { + "epoch": 1.57, + "grad_norm": 3.662197849008676, + "learning_rate": 1.6046688815324478e-06, + "loss": 0.6823, + "step": 10544 + }, + { + "epoch": 1.57, + "grad_norm": 1.2414422723840777, + "learning_rate": 1.6045919327359406e-06, + "loss": 0.6862, + "step": 10545 + }, + { + "epoch": 1.57, + "grad_norm": 3.345804822999088, + "learning_rate": 1.604514978296723e-06, + "loss": 0.6914, + "step": 10546 + }, + { + "epoch": 1.57, + "grad_norm": 1.6791962668558984, + "learning_rate": 1.6044380182155139e-06, + "loss": 0.6634, + "step": 10547 + }, + { + "epoch": 1.57, + "grad_norm": 0.6795543597212046, + "learning_rate": 1.6043610524930316e-06, + "loss": 0.6823, + "step": 10548 + }, + { + "epoch": 1.57, + "grad_norm": 2.925246111511015, + "learning_rate": 1.6042840811299937e-06, + "loss": 0.6803, + "step": 10549 + }, + { + "epoch": 1.57, + "grad_norm": 2.869999670301417, + "learning_rate": 1.6042071041271195e-06, + "loss": 0.6882, + "step": 10550 + }, + { + "epoch": 1.57, + "grad_norm": 3.6950603424681123, + "learning_rate": 1.6041301214851264e-06, + "loss": 0.7077, + "step": 10551 + }, + { + "epoch": 1.57, + "grad_norm": 0.9226251768241063, + "learning_rate": 1.6040531332047337e-06, + "loss": 0.6523, + "step": 10552 + }, + { + "epoch": 1.57, + "grad_norm": 2.328540241169398, + "learning_rate": 1.6039761392866598e-06, + "loss": 0.7018, + "step": 10553 + }, + { + "epoch": 1.57, + "grad_norm": 6.422729052141753, + "learning_rate": 1.6038991397316232e-06, + "loss": 0.7077, + "step": 10554 + }, + { + "epoch": 1.57, + "grad_norm": 1.3660643045467609, + "learning_rate": 1.6038221345403425e-06, + "loss": 0.6784, + "step": 10555 + }, + { + "epoch": 1.57, + "grad_norm": 0.6911833034906448, + "learning_rate": 1.6037451237135362e-06, + "loss": 0.666, + "step": 10556 + }, + { + "epoch": 1.57, + "grad_norm": 2.0362565916142636, + "learning_rate": 1.6036681072519235e-06, + "loss": 0.6751, + "step": 10557 + }, + { + "epoch": 1.57, + "grad_norm": 3.480279198683516, + "learning_rate": 1.6035910851562229e-06, + "loss": 0.6953, + "step": 10558 + }, + { + "epoch": 1.57, + "grad_norm": 1.3578972504825313, + "learning_rate": 1.6035140574271533e-06, + "loss": 0.6673, + "step": 10559 + }, + { + "epoch": 1.57, + "grad_norm": 1.9030141254474753, + "learning_rate": 1.6034370240654335e-06, + "loss": 0.6842, + "step": 10560 + }, + { + "epoch": 1.58, + "grad_norm": 1.5051463536826981, + "learning_rate": 1.6033599850717828e-06, + "loss": 0.6523, + "step": 10561 + }, + { + "epoch": 1.58, + "grad_norm": 2.7094218003369117, + "learning_rate": 1.60328294044692e-06, + "loss": 0.6738, + "step": 10562 + }, + { + "epoch": 1.58, + "grad_norm": 0.6958792909678274, + "learning_rate": 1.6032058901915638e-06, + "loss": 0.6706, + "step": 10563 + }, + { + "epoch": 1.58, + "grad_norm": 4.029556403930683, + "learning_rate": 1.6031288343064344e-06, + "loss": 0.6589, + "step": 10564 + }, + { + "epoch": 1.58, + "grad_norm": 2.0649091403655686, + "learning_rate": 1.6030517727922497e-06, + "loss": 0.6738, + "step": 10565 + }, + { + "epoch": 1.58, + "grad_norm": 6.288391926346313, + "learning_rate": 1.6029747056497293e-06, + "loss": 0.6862, + "step": 10566 + }, + { + "epoch": 1.58, + "grad_norm": 2.134857556255149, + "learning_rate": 1.6028976328795933e-06, + "loss": 0.6914, + "step": 10567 + }, + { + "epoch": 1.58, + "grad_norm": 1.7599892704219504, + "learning_rate": 1.6028205544825601e-06, + "loss": 0.6829, + "step": 10568 + }, + { + "epoch": 1.58, + "grad_norm": 1.769851129747811, + "learning_rate": 1.602743470459349e-06, + "loss": 0.6712, + "step": 10569 + }, + { + "epoch": 1.58, + "grad_norm": 0.6890045963794594, + "learning_rate": 1.6026663808106803e-06, + "loss": 0.6523, + "step": 10570 + }, + { + "epoch": 1.58, + "grad_norm": 1.1401793961356061, + "learning_rate": 1.6025892855372727e-06, + "loss": 0.6745, + "step": 10571 + }, + { + "epoch": 1.58, + "grad_norm": 3.0200274592024288, + "learning_rate": 1.602512184639846e-06, + "loss": 0.6387, + "step": 10572 + }, + { + "epoch": 1.58, + "grad_norm": 3.5301738127992706, + "learning_rate": 1.6024350781191198e-06, + "loss": 0.6712, + "step": 10573 + }, + { + "epoch": 1.58, + "grad_norm": 2.573280576143697, + "learning_rate": 1.602357965975814e-06, + "loss": 0.7012, + "step": 10574 + }, + { + "epoch": 1.58, + "grad_norm": 2.1351852365840585, + "learning_rate": 1.6022808482106476e-06, + "loss": 0.6458, + "step": 10575 + }, + { + "epoch": 1.58, + "grad_norm": 1.7801267586196359, + "learning_rate": 1.602203724824341e-06, + "loss": 0.681, + "step": 10576 + }, + { + "epoch": 1.58, + "grad_norm": 0.9120230164791362, + "learning_rate": 1.602126595817614e-06, + "loss": 0.6947, + "step": 10577 + }, + { + "epoch": 1.58, + "grad_norm": 4.011129982992828, + "learning_rate": 1.6020494611911857e-06, + "loss": 0.6764, + "step": 10578 + }, + { + "epoch": 1.58, + "grad_norm": 1.3188877881084622, + "learning_rate": 1.601972320945777e-06, + "loss": 0.681, + "step": 10579 + }, + { + "epoch": 1.58, + "grad_norm": 0.7891343293961847, + "learning_rate": 1.6018951750821072e-06, + "loss": 0.6693, + "step": 10580 + }, + { + "epoch": 1.58, + "grad_norm": 0.7677286112890448, + "learning_rate": 1.6018180236008965e-06, + "loss": 0.666, + "step": 10581 + }, + { + "epoch": 1.58, + "grad_norm": 5.793699944805356, + "learning_rate": 1.601740866502865e-06, + "loss": 0.6797, + "step": 10582 + }, + { + "epoch": 1.58, + "grad_norm": 1.5913012597313587, + "learning_rate": 1.601663703788733e-06, + "loss": 0.696, + "step": 10583 + }, + { + "epoch": 1.58, + "grad_norm": 0.810590512131866, + "learning_rate": 1.6015865354592202e-06, + "loss": 0.6615, + "step": 10584 + }, + { + "epoch": 1.58, + "grad_norm": 0.980934617927779, + "learning_rate": 1.601509361515047e-06, + "loss": 0.6289, + "step": 10585 + }, + { + "epoch": 1.58, + "grad_norm": 2.9154921264179414, + "learning_rate": 1.6014321819569338e-06, + "loss": 0.6484, + "step": 10586 + }, + { + "epoch": 1.58, + "grad_norm": 1.4783851137350486, + "learning_rate": 1.6013549967856009e-06, + "loss": 0.6699, + "step": 10587 + }, + { + "epoch": 1.58, + "grad_norm": 2.0182967173849495, + "learning_rate": 1.6012778060017684e-06, + "loss": 0.6719, + "step": 10588 + }, + { + "epoch": 1.58, + "grad_norm": 3.7791925551726, + "learning_rate": 1.6012006096061575e-06, + "loss": 0.6979, + "step": 10589 + }, + { + "epoch": 1.58, + "grad_norm": 1.0593376391074572, + "learning_rate": 1.6011234075994877e-06, + "loss": 0.6647, + "step": 10590 + }, + { + "epoch": 1.58, + "grad_norm": 1.222938338093076, + "learning_rate": 1.60104619998248e-06, + "loss": 0.6257, + "step": 10591 + }, + { + "epoch": 1.58, + "grad_norm": 1.1081541329480593, + "learning_rate": 1.600968986755855e-06, + "loss": 0.6849, + "step": 10592 + }, + { + "epoch": 1.58, + "grad_norm": 2.829697359572362, + "learning_rate": 1.6008917679203338e-06, + "loss": 0.6549, + "step": 10593 + }, + { + "epoch": 1.58, + "grad_norm": 2.776584122502331, + "learning_rate": 1.6008145434766357e-06, + "loss": 0.6712, + "step": 10594 + }, + { + "epoch": 1.58, + "grad_norm": 1.7777194891030494, + "learning_rate": 1.600737313425483e-06, + "loss": 0.6562, + "step": 10595 + }, + { + "epoch": 1.58, + "grad_norm": 2.059008514204396, + "learning_rate": 1.6006600777675953e-06, + "loss": 0.6868, + "step": 10596 + }, + { + "epoch": 1.58, + "grad_norm": 1.0984679018450267, + "learning_rate": 1.6005828365036947e-06, + "loss": 0.6497, + "step": 10597 + }, + { + "epoch": 1.58, + "grad_norm": 4.423789444159164, + "learning_rate": 1.6005055896345008e-06, + "loss": 0.6699, + "step": 10598 + }, + { + "epoch": 1.58, + "grad_norm": 1.011301914623093, + "learning_rate": 1.6004283371607352e-06, + "loss": 0.6432, + "step": 10599 + }, + { + "epoch": 1.58, + "grad_norm": 1.4182904796778697, + "learning_rate": 1.600351079083119e-06, + "loss": 0.6471, + "step": 10600 + }, + { + "epoch": 1.58, + "grad_norm": 1.7262612373146613, + "learning_rate": 1.600273815402373e-06, + "loss": 0.6738, + "step": 10601 + }, + { + "epoch": 1.58, + "grad_norm": 1.6397212747798364, + "learning_rate": 1.6001965461192183e-06, + "loss": 0.6439, + "step": 10602 + }, + { + "epoch": 1.58, + "grad_norm": 1.9626154413982173, + "learning_rate": 1.6001192712343764e-06, + "loss": 0.7005, + "step": 10603 + }, + { + "epoch": 1.58, + "grad_norm": 3.2771636034846354, + "learning_rate": 1.6000419907485681e-06, + "loss": 0.7174, + "step": 10604 + }, + { + "epoch": 1.58, + "grad_norm": 1.657170592516209, + "learning_rate": 1.5999647046625149e-06, + "loss": 0.6549, + "step": 10605 + }, + { + "epoch": 1.58, + "grad_norm": 2.9985807661005817, + "learning_rate": 1.5998874129769377e-06, + "loss": 0.6797, + "step": 10606 + }, + { + "epoch": 1.58, + "grad_norm": 3.494396830085671, + "learning_rate": 1.5998101156925588e-06, + "loss": 0.6283, + "step": 10607 + }, + { + "epoch": 1.58, + "grad_norm": 3.3853377739016746, + "learning_rate": 1.5997328128100986e-06, + "loss": 0.7174, + "step": 10608 + }, + { + "epoch": 1.58, + "grad_norm": 4.967399143866023, + "learning_rate": 1.5996555043302794e-06, + "loss": 0.6882, + "step": 10609 + }, + { + "epoch": 1.58, + "grad_norm": 1.595210671247009, + "learning_rate": 1.599578190253822e-06, + "loss": 0.6693, + "step": 10610 + }, + { + "epoch": 1.58, + "grad_norm": 3.9700704433892473, + "learning_rate": 1.5995008705814486e-06, + "loss": 0.6836, + "step": 10611 + }, + { + "epoch": 1.58, + "grad_norm": 3.685540473503203, + "learning_rate": 1.5994235453138804e-06, + "loss": 0.666, + "step": 10612 + }, + { + "epoch": 1.58, + "grad_norm": 6.404822826004761, + "learning_rate": 1.5993462144518396e-06, + "loss": 0.6816, + "step": 10613 + }, + { + "epoch": 1.58, + "grad_norm": 1.1426517716359084, + "learning_rate": 1.5992688779960467e-06, + "loss": 0.6699, + "step": 10614 + }, + { + "epoch": 1.58, + "grad_norm": 1.1688884010138954, + "learning_rate": 1.599191535947225e-06, + "loss": 0.653, + "step": 10615 + }, + { + "epoch": 1.58, + "grad_norm": 2.1527693762638274, + "learning_rate": 1.5991141883060958e-06, + "loss": 0.6829, + "step": 10616 + }, + { + "epoch": 1.58, + "grad_norm": 3.426717734221776, + "learning_rate": 1.5990368350733808e-06, + "loss": 0.7298, + "step": 10617 + }, + { + "epoch": 1.58, + "grad_norm": 1.021500784005617, + "learning_rate": 1.5989594762498017e-06, + "loss": 0.6803, + "step": 10618 + }, + { + "epoch": 1.58, + "grad_norm": 0.9417019948029158, + "learning_rate": 1.598882111836081e-06, + "loss": 0.6536, + "step": 10619 + }, + { + "epoch": 1.58, + "grad_norm": 4.3413359916624525, + "learning_rate": 1.5988047418329408e-06, + "loss": 0.6699, + "step": 10620 + }, + { + "epoch": 1.58, + "grad_norm": 4.937748628096285, + "learning_rate": 1.5987273662411028e-06, + "loss": 0.6901, + "step": 10621 + }, + { + "epoch": 1.58, + "grad_norm": 0.7889605400185694, + "learning_rate": 1.5986499850612896e-06, + "loss": 0.6406, + "step": 10622 + }, + { + "epoch": 1.58, + "grad_norm": 3.1073054624065533, + "learning_rate": 1.5985725982942228e-06, + "loss": 0.6706, + "step": 10623 + }, + { + "epoch": 1.58, + "grad_norm": 2.3816757168926435, + "learning_rate": 1.5984952059406251e-06, + "loss": 0.6764, + "step": 10624 + }, + { + "epoch": 1.58, + "grad_norm": 3.2824976092225695, + "learning_rate": 1.598417808001219e-06, + "loss": 0.6693, + "step": 10625 + }, + { + "epoch": 1.58, + "grad_norm": 0.7026981551250129, + "learning_rate": 1.5983404044767265e-06, + "loss": 0.668, + "step": 10626 + }, + { + "epoch": 1.58, + "grad_norm": 0.8771504860245654, + "learning_rate": 1.59826299536787e-06, + "loss": 0.6582, + "step": 10627 + }, + { + "epoch": 1.59, + "grad_norm": 0.7108004689071977, + "learning_rate": 1.5981855806753718e-06, + "loss": 0.6608, + "step": 10628 + }, + { + "epoch": 1.59, + "grad_norm": 1.7714838366315084, + "learning_rate": 1.598108160399955e-06, + "loss": 0.6712, + "step": 10629 + }, + { + "epoch": 1.59, + "grad_norm": 0.8367612259249367, + "learning_rate": 1.5980307345423415e-06, + "loss": 0.6641, + "step": 10630 + }, + { + "epoch": 1.59, + "grad_norm": 0.8966157648471349, + "learning_rate": 1.5979533031032547e-06, + "loss": 0.6497, + "step": 10631 + }, + { + "epoch": 1.59, + "grad_norm": 0.8331620198486638, + "learning_rate": 1.5978758660834165e-06, + "loss": 0.6712, + "step": 10632 + }, + { + "epoch": 1.59, + "grad_norm": 4.771180123256769, + "learning_rate": 1.5977984234835502e-06, + "loss": 0.6608, + "step": 10633 + }, + { + "epoch": 1.59, + "grad_norm": 1.5516637302637508, + "learning_rate": 1.5977209753043783e-06, + "loss": 0.6784, + "step": 10634 + }, + { + "epoch": 1.59, + "grad_norm": 1.3017273692137188, + "learning_rate": 1.5976435215466236e-06, + "loss": 0.6738, + "step": 10635 + }, + { + "epoch": 1.59, + "grad_norm": 3.403537993157435, + "learning_rate": 1.5975660622110091e-06, + "loss": 0.681, + "step": 10636 + }, + { + "epoch": 1.59, + "grad_norm": 2.9574350781853074, + "learning_rate": 1.5974885972982577e-06, + "loss": 0.6849, + "step": 10637 + }, + { + "epoch": 1.59, + "grad_norm": 1.9292851676999485, + "learning_rate": 1.597411126809092e-06, + "loss": 0.6699, + "step": 10638 + }, + { + "epoch": 1.59, + "grad_norm": 1.8730967398121867, + "learning_rate": 1.597333650744236e-06, + "loss": 0.6803, + "step": 10639 + }, + { + "epoch": 1.59, + "grad_norm": 4.456041282839581, + "learning_rate": 1.597256169104412e-06, + "loss": 0.6719, + "step": 10640 + }, + { + "epoch": 1.59, + "grad_norm": 1.281112859466946, + "learning_rate": 1.5971786818903434e-06, + "loss": 0.6536, + "step": 10641 + }, + { + "epoch": 1.59, + "grad_norm": 1.0074577082982474, + "learning_rate": 1.5971011891027532e-06, + "loss": 0.6771, + "step": 10642 + }, + { + "epoch": 1.59, + "grad_norm": 1.8523547660667197, + "learning_rate": 1.5970236907423652e-06, + "loss": 0.6562, + "step": 10643 + }, + { + "epoch": 1.59, + "grad_norm": 1.033143600878901, + "learning_rate": 1.5969461868099017e-06, + "loss": 0.6576, + "step": 10644 + }, + { + "epoch": 1.59, + "grad_norm": 1.409255167182134, + "learning_rate": 1.5968686773060869e-06, + "loss": 0.6654, + "step": 10645 + }, + { + "epoch": 1.59, + "grad_norm": 1.622157846534333, + "learning_rate": 1.596791162231644e-06, + "loss": 0.7253, + "step": 10646 + }, + { + "epoch": 1.59, + "grad_norm": 0.9096765650283816, + "learning_rate": 1.5967136415872966e-06, + "loss": 0.6895, + "step": 10647 + }, + { + "epoch": 1.59, + "grad_norm": 1.5532586403075905, + "learning_rate": 1.596636115373768e-06, + "loss": 0.6426, + "step": 10648 + }, + { + "epoch": 1.59, + "grad_norm": 0.7730724549930181, + "learning_rate": 1.5965585835917814e-06, + "loss": 0.6673, + "step": 10649 + }, + { + "epoch": 1.59, + "grad_norm": 2.1498719377281543, + "learning_rate": 1.596481046242061e-06, + "loss": 0.6882, + "step": 10650 + }, + { + "epoch": 1.59, + "grad_norm": 2.6424530245592868, + "learning_rate": 1.5964035033253303e-06, + "loss": 0.679, + "step": 10651 + }, + { + "epoch": 1.59, + "grad_norm": 1.467615551990316, + "learning_rate": 1.5963259548423126e-06, + "loss": 0.6465, + "step": 10652 + }, + { + "epoch": 1.59, + "grad_norm": 2.6528650991326326, + "learning_rate": 1.5962484007937324e-06, + "loss": 0.6621, + "step": 10653 + }, + { + "epoch": 1.59, + "grad_norm": 1.6246412450045586, + "learning_rate": 1.5961708411803133e-06, + "loss": 0.6751, + "step": 10654 + }, + { + "epoch": 1.59, + "grad_norm": 4.267480013579725, + "learning_rate": 1.5960932760027785e-06, + "loss": 0.7083, + "step": 10655 + }, + { + "epoch": 1.59, + "grad_norm": 2.9951273662726403, + "learning_rate": 1.5960157052618528e-06, + "loss": 0.6934, + "step": 10656 + }, + { + "epoch": 1.59, + "grad_norm": 3.030775258616441, + "learning_rate": 1.5959381289582597e-06, + "loss": 0.6706, + "step": 10657 + }, + { + "epoch": 1.59, + "grad_norm": 1.665651312529856, + "learning_rate": 1.5958605470927234e-06, + "loss": 0.6686, + "step": 10658 + }, + { + "epoch": 1.59, + "grad_norm": 4.313112068091202, + "learning_rate": 1.5957829596659677e-06, + "loss": 0.6992, + "step": 10659 + }, + { + "epoch": 1.59, + "grad_norm": 2.033066050832694, + "learning_rate": 1.5957053666787173e-06, + "loss": 0.6888, + "step": 10660 + }, + { + "epoch": 1.59, + "grad_norm": 0.9297313249052698, + "learning_rate": 1.5956277681316958e-06, + "loss": 0.6764, + "step": 10661 + }, + { + "epoch": 1.59, + "grad_norm": 1.0708667391061506, + "learning_rate": 1.5955501640256277e-06, + "loss": 0.6549, + "step": 10662 + }, + { + "epoch": 1.59, + "grad_norm": 4.570193347516926, + "learning_rate": 1.5954725543612373e-06, + "loss": 0.7259, + "step": 10663 + }, + { + "epoch": 1.59, + "grad_norm": 3.680448370569042, + "learning_rate": 1.595394939139249e-06, + "loss": 0.6842, + "step": 10664 + }, + { + "epoch": 1.59, + "grad_norm": 2.4437611236176404, + "learning_rate": 1.5953173183603868e-06, + "loss": 0.6673, + "step": 10665 + }, + { + "epoch": 1.59, + "grad_norm": 1.318647775293041, + "learning_rate": 1.5952396920253756e-06, + "loss": 0.6764, + "step": 10666 + }, + { + "epoch": 1.59, + "grad_norm": 3.306790190559772, + "learning_rate": 1.5951620601349398e-06, + "loss": 0.6855, + "step": 10667 + }, + { + "epoch": 1.59, + "grad_norm": 0.6867292376386129, + "learning_rate": 1.5950844226898035e-06, + "loss": 0.6849, + "step": 10668 + }, + { + "epoch": 1.59, + "grad_norm": 2.0049917527696812, + "learning_rate": 1.5950067796906922e-06, + "loss": 0.6901, + "step": 10669 + }, + { + "epoch": 1.59, + "grad_norm": 2.0026955809263685, + "learning_rate": 1.5949291311383295e-06, + "loss": 0.6569, + "step": 10670 + }, + { + "epoch": 1.59, + "grad_norm": 0.6503777245222949, + "learning_rate": 1.5948514770334409e-06, + "loss": 0.6654, + "step": 10671 + }, + { + "epoch": 1.59, + "grad_norm": 1.507385231468381, + "learning_rate": 1.5947738173767507e-06, + "loss": 0.6862, + "step": 10672 + }, + { + "epoch": 1.59, + "grad_norm": 5.124630378002624, + "learning_rate": 1.5946961521689839e-06, + "loss": 0.7168, + "step": 10673 + }, + { + "epoch": 1.59, + "grad_norm": 1.5247507404687974, + "learning_rate": 1.5946184814108654e-06, + "loss": 0.6803, + "step": 10674 + }, + { + "epoch": 1.59, + "grad_norm": 1.0422876594629236, + "learning_rate": 1.59454080510312e-06, + "loss": 0.6738, + "step": 10675 + }, + { + "epoch": 1.59, + "grad_norm": 3.453949133921399, + "learning_rate": 1.5944631232464727e-06, + "loss": 0.681, + "step": 10676 + }, + { + "epoch": 1.59, + "grad_norm": 0.8134794439606867, + "learning_rate": 1.5943854358416485e-06, + "loss": 0.6829, + "step": 10677 + }, + { + "epoch": 1.59, + "grad_norm": 3.6545994086370523, + "learning_rate": 1.5943077428893724e-06, + "loss": 0.6673, + "step": 10678 + }, + { + "epoch": 1.59, + "grad_norm": 0.9192731800212324, + "learning_rate": 1.5942300443903696e-06, + "loss": 0.6895, + "step": 10679 + }, + { + "epoch": 1.59, + "grad_norm": 2.4495670578153312, + "learning_rate": 1.5941523403453652e-06, + "loss": 0.6719, + "step": 10680 + }, + { + "epoch": 1.59, + "grad_norm": 0.8132524103752882, + "learning_rate": 1.5940746307550845e-06, + "loss": 0.6927, + "step": 10681 + }, + { + "epoch": 1.59, + "grad_norm": 4.94279390186323, + "learning_rate": 1.5939969156202526e-06, + "loss": 0.6862, + "step": 10682 + }, + { + "epoch": 1.59, + "grad_norm": 1.2486886393640533, + "learning_rate": 1.5939191949415953e-06, + "loss": 0.6667, + "step": 10683 + }, + { + "epoch": 1.59, + "grad_norm": 1.301575910197921, + "learning_rate": 1.5938414687198374e-06, + "loss": 0.681, + "step": 10684 + }, + { + "epoch": 1.59, + "grad_norm": 1.3174056340125417, + "learning_rate": 1.5937637369557044e-06, + "loss": 0.6712, + "step": 10685 + }, + { + "epoch": 1.59, + "grad_norm": 1.9660387727673612, + "learning_rate": 1.593685999649922e-06, + "loss": 0.6966, + "step": 10686 + }, + { + "epoch": 1.59, + "grad_norm": 4.340156427698749, + "learning_rate": 1.593608256803216e-06, + "loss": 0.6842, + "step": 10687 + }, + { + "epoch": 1.59, + "grad_norm": 4.608520264510315, + "learning_rate": 1.5935305084163114e-06, + "loss": 0.6973, + "step": 10688 + }, + { + "epoch": 1.59, + "grad_norm": 1.6728076513367613, + "learning_rate": 1.593452754489934e-06, + "loss": 0.6777, + "step": 10689 + }, + { + "epoch": 1.59, + "grad_norm": 1.4438963766426107, + "learning_rate": 1.59337499502481e-06, + "loss": 0.6686, + "step": 10690 + }, + { + "epoch": 1.59, + "grad_norm": 2.7284689620129163, + "learning_rate": 1.593297230021664e-06, + "loss": 0.6686, + "step": 10691 + }, + { + "epoch": 1.59, + "grad_norm": 2.924935696238743, + "learning_rate": 1.5932194594812232e-06, + "loss": 0.6751, + "step": 10692 + }, + { + "epoch": 1.59, + "grad_norm": 4.691111700898623, + "learning_rate": 1.593141683404212e-06, + "loss": 0.6803, + "step": 10693 + }, + { + "epoch": 1.59, + "grad_norm": 0.593891751759368, + "learning_rate": 1.5930639017913573e-06, + "loss": 0.6712, + "step": 10694 + }, + { + "epoch": 1.6, + "grad_norm": 1.7253660964438966, + "learning_rate": 1.592986114643385e-06, + "loss": 0.681, + "step": 10695 + }, + { + "epoch": 1.6, + "grad_norm": 0.6097844679411142, + "learning_rate": 1.592908321961021e-06, + "loss": 0.6751, + "step": 10696 + }, + { + "epoch": 1.6, + "grad_norm": 2.6158702998651684, + "learning_rate": 1.5928305237449906e-06, + "loss": 0.696, + "step": 10697 + }, + { + "epoch": 1.6, + "grad_norm": 3.745575638458016, + "learning_rate": 1.5927527199960207e-06, + "loss": 0.6615, + "step": 10698 + }, + { + "epoch": 1.6, + "grad_norm": 1.1314246081152448, + "learning_rate": 1.5926749107148374e-06, + "loss": 0.6901, + "step": 10699 + }, + { + "epoch": 1.6, + "grad_norm": 2.0707266450240085, + "learning_rate": 1.5925970959021667e-06, + "loss": 0.694, + "step": 10700 + }, + { + "epoch": 1.6, + "grad_norm": 6.788200541215602, + "learning_rate": 1.5925192755587346e-06, + "loss": 0.6934, + "step": 10701 + }, + { + "epoch": 1.6, + "grad_norm": 5.006777770856037, + "learning_rate": 1.592441449685268e-06, + "loss": 0.6602, + "step": 10702 + }, + { + "epoch": 1.6, + "grad_norm": 0.9666629648458682, + "learning_rate": 1.592363618282493e-06, + "loss": 0.6751, + "step": 10703 + }, + { + "epoch": 1.6, + "grad_norm": 5.43487497401501, + "learning_rate": 1.592285781351136e-06, + "loss": 0.6849, + "step": 10704 + }, + { + "epoch": 1.6, + "grad_norm": 0.9746821281149624, + "learning_rate": 1.5922079388919231e-06, + "loss": 0.6725, + "step": 10705 + }, + { + "epoch": 1.6, + "grad_norm": 2.554987932295611, + "learning_rate": 1.5921300909055814e-06, + "loss": 0.6576, + "step": 10706 + }, + { + "epoch": 1.6, + "grad_norm": 5.720227953642612, + "learning_rate": 1.5920522373928372e-06, + "loss": 0.6862, + "step": 10707 + }, + { + "epoch": 1.6, + "grad_norm": 1.67744336402437, + "learning_rate": 1.591974378354417e-06, + "loss": 0.6751, + "step": 10708 + }, + { + "epoch": 1.6, + "grad_norm": 4.328357537237208, + "learning_rate": 1.5918965137910478e-06, + "loss": 0.6895, + "step": 10709 + }, + { + "epoch": 1.6, + "grad_norm": 3.088685418858503, + "learning_rate": 1.5918186437034557e-06, + "loss": 0.7012, + "step": 10710 + }, + { + "epoch": 1.6, + "grad_norm": 3.74788435758057, + "learning_rate": 1.5917407680923681e-06, + "loss": 0.6901, + "step": 10711 + }, + { + "epoch": 1.6, + "grad_norm": 4.02055177013486, + "learning_rate": 1.5916628869585118e-06, + "loss": 0.6771, + "step": 10712 + }, + { + "epoch": 1.6, + "grad_norm": 2.3472880575100725, + "learning_rate": 1.5915850003026133e-06, + "loss": 0.6686, + "step": 10713 + }, + { + "epoch": 1.6, + "grad_norm": 2.0629192619985117, + "learning_rate": 1.5915071081253993e-06, + "loss": 0.6895, + "step": 10714 + }, + { + "epoch": 1.6, + "grad_norm": 3.1463311461267063, + "learning_rate": 1.5914292104275973e-06, + "loss": 0.6999, + "step": 10715 + }, + { + "epoch": 1.6, + "grad_norm": 1.563892733234567, + "learning_rate": 1.5913513072099348e-06, + "loss": 0.6771, + "step": 10716 + }, + { + "epoch": 1.6, + "grad_norm": 1.1228198770737108, + "learning_rate": 1.5912733984731375e-06, + "loss": 0.6849, + "step": 10717 + }, + { + "epoch": 1.6, + "grad_norm": 2.390781599933482, + "learning_rate": 1.5911954842179336e-06, + "loss": 0.6641, + "step": 10718 + }, + { + "epoch": 1.6, + "grad_norm": 0.9398167749610081, + "learning_rate": 1.5911175644450498e-06, + "loss": 0.6595, + "step": 10719 + }, + { + "epoch": 1.6, + "grad_norm": 1.9282566888158597, + "learning_rate": 1.5910396391552137e-06, + "loss": 0.6621, + "step": 10720 + }, + { + "epoch": 1.6, + "grad_norm": 1.422812918261563, + "learning_rate": 1.5909617083491522e-06, + "loss": 0.6732, + "step": 10721 + }, + { + "epoch": 1.6, + "grad_norm": 0.9727448175650432, + "learning_rate": 1.590883772027593e-06, + "loss": 0.6673, + "step": 10722 + }, + { + "epoch": 1.6, + "grad_norm": 2.84949478348144, + "learning_rate": 1.5908058301912633e-06, + "loss": 0.6706, + "step": 10723 + }, + { + "epoch": 1.6, + "grad_norm": 4.2801541163470995, + "learning_rate": 1.5907278828408903e-06, + "loss": 0.6595, + "step": 10724 + }, + { + "epoch": 1.6, + "grad_norm": 2.2282261751697985, + "learning_rate": 1.5906499299772018e-06, + "loss": 0.6875, + "step": 10725 + }, + { + "epoch": 1.6, + "grad_norm": 0.7894510526407118, + "learning_rate": 1.5905719716009253e-06, + "loss": 0.6901, + "step": 10726 + }, + { + "epoch": 1.6, + "grad_norm": 0.7106367162166289, + "learning_rate": 1.5904940077127884e-06, + "loss": 0.694, + "step": 10727 + }, + { + "epoch": 1.6, + "grad_norm": 1.2551500201796133, + "learning_rate": 1.5904160383135189e-06, + "loss": 0.6504, + "step": 10728 + }, + { + "epoch": 1.6, + "grad_norm": 0.7432296116665978, + "learning_rate": 1.5903380634038439e-06, + "loss": 0.6836, + "step": 10729 + }, + { + "epoch": 1.6, + "grad_norm": 3.9873200881372637, + "learning_rate": 1.5902600829844919e-06, + "loss": 0.6777, + "step": 10730 + }, + { + "epoch": 1.6, + "grad_norm": 3.268509414764705, + "learning_rate": 1.59018209705619e-06, + "loss": 0.6556, + "step": 10731 + }, + { + "epoch": 1.6, + "grad_norm": 2.888306897847804, + "learning_rate": 1.5901041056196669e-06, + "loss": 0.7064, + "step": 10732 + }, + { + "epoch": 1.6, + "grad_norm": 1.0660517219801737, + "learning_rate": 1.5900261086756495e-06, + "loss": 0.6784, + "step": 10733 + }, + { + "epoch": 1.6, + "grad_norm": 4.937547280617474, + "learning_rate": 1.5899481062248663e-06, + "loss": 0.6654, + "step": 10734 + }, + { + "epoch": 1.6, + "grad_norm": 1.2588070975410581, + "learning_rate": 1.5898700982680454e-06, + "loss": 0.6888, + "step": 10735 + }, + { + "epoch": 1.6, + "grad_norm": 2.5352139096429003, + "learning_rate": 1.5897920848059147e-06, + "loss": 0.6654, + "step": 10736 + }, + { + "epoch": 1.6, + "grad_norm": 6.116683518067418, + "learning_rate": 1.5897140658392022e-06, + "loss": 0.6699, + "step": 10737 + }, + { + "epoch": 1.6, + "grad_norm": 0.7095881932046281, + "learning_rate": 1.5896360413686365e-06, + "loss": 0.6569, + "step": 10738 + }, + { + "epoch": 1.6, + "grad_norm": 1.8883852315106506, + "learning_rate": 1.5895580113949452e-06, + "loss": 0.681, + "step": 10739 + }, + { + "epoch": 1.6, + "grad_norm": 1.1007755771851004, + "learning_rate": 1.589479975918857e-06, + "loss": 0.6712, + "step": 10740 + }, + { + "epoch": 1.6, + "grad_norm": 4.61274264136697, + "learning_rate": 1.5894019349410998e-06, + "loss": 0.6543, + "step": 10741 + }, + { + "epoch": 1.6, + "grad_norm": 0.8317078484030825, + "learning_rate": 1.5893238884624025e-06, + "loss": 0.6641, + "step": 10742 + }, + { + "epoch": 1.6, + "grad_norm": 4.560262629643608, + "learning_rate": 1.5892458364834932e-06, + "loss": 0.6771, + "step": 10743 + }, + { + "epoch": 1.6, + "grad_norm": 1.3555370783367344, + "learning_rate": 1.5891677790051e-06, + "loss": 0.6602, + "step": 10744 + }, + { + "epoch": 1.6, + "grad_norm": 2.4929734821333795, + "learning_rate": 1.5890897160279524e-06, + "loss": 0.6582, + "step": 10745 + }, + { + "epoch": 1.6, + "grad_norm": 1.5992400376050881, + "learning_rate": 1.589011647552778e-06, + "loss": 0.6725, + "step": 10746 + }, + { + "epoch": 1.6, + "grad_norm": 2.4744135664287725, + "learning_rate": 1.5889335735803057e-06, + "loss": 0.6608, + "step": 10747 + }, + { + "epoch": 1.6, + "grad_norm": 3.3110259343483635, + "learning_rate": 1.5888554941112646e-06, + "loss": 0.6686, + "step": 10748 + }, + { + "epoch": 1.6, + "grad_norm": 2.070564526440586, + "learning_rate": 1.5887774091463832e-06, + "loss": 0.6745, + "step": 10749 + }, + { + "epoch": 1.6, + "grad_norm": 2.9745304571720066, + "learning_rate": 1.5886993186863898e-06, + "loss": 0.6523, + "step": 10750 + }, + { + "epoch": 1.6, + "grad_norm": 0.8897892429336022, + "learning_rate": 1.5886212227320137e-06, + "loss": 0.6628, + "step": 10751 + }, + { + "epoch": 1.6, + "grad_norm": 1.4851566701575993, + "learning_rate": 1.5885431212839836e-06, + "loss": 0.6758, + "step": 10752 + }, + { + "epoch": 1.6, + "grad_norm": 4.460264021534012, + "learning_rate": 1.5884650143430287e-06, + "loss": 0.696, + "step": 10753 + }, + { + "epoch": 1.6, + "grad_norm": 1.008518071483874, + "learning_rate": 1.5883869019098776e-06, + "loss": 0.6751, + "step": 10754 + }, + { + "epoch": 1.6, + "grad_norm": 2.320455374493063, + "learning_rate": 1.5883087839852597e-06, + "loss": 0.6797, + "step": 10755 + }, + { + "epoch": 1.6, + "grad_norm": 6.263873494971331, + "learning_rate": 1.5882306605699038e-06, + "loss": 0.6751, + "step": 10756 + }, + { + "epoch": 1.6, + "grad_norm": 1.7519917864491772, + "learning_rate": 1.5881525316645392e-06, + "loss": 0.6484, + "step": 10757 + }, + { + "epoch": 1.6, + "grad_norm": 2.804009674418962, + "learning_rate": 1.5880743972698948e-06, + "loss": 0.679, + "step": 10758 + }, + { + "epoch": 1.6, + "grad_norm": 0.9464000078837473, + "learning_rate": 1.5879962573867004e-06, + "loss": 0.6634, + "step": 10759 + }, + { + "epoch": 1.6, + "grad_norm": 2.989751078430376, + "learning_rate": 1.5879181120156849e-06, + "loss": 0.6803, + "step": 10760 + }, + { + "epoch": 1.6, + "grad_norm": 0.984977823520119, + "learning_rate": 1.5878399611575772e-06, + "loss": 0.6771, + "step": 10761 + }, + { + "epoch": 1.61, + "grad_norm": 1.8424366550717994, + "learning_rate": 1.5877618048131076e-06, + "loss": 0.6725, + "step": 10762 + }, + { + "epoch": 1.61, + "grad_norm": 1.4436700013960118, + "learning_rate": 1.5876836429830046e-06, + "loss": 0.6732, + "step": 10763 + }, + { + "epoch": 1.61, + "grad_norm": 3.510815761183359, + "learning_rate": 1.5876054756679987e-06, + "loss": 0.6504, + "step": 10764 + }, + { + "epoch": 1.61, + "grad_norm": 2.4326630572519288, + "learning_rate": 1.587527302868819e-06, + "loss": 0.6829, + "step": 10765 + }, + { + "epoch": 1.61, + "grad_norm": 2.0053335511658967, + "learning_rate": 1.5874491245861946e-06, + "loss": 0.6888, + "step": 10766 + }, + { + "epoch": 1.61, + "grad_norm": 4.987335684911339, + "learning_rate": 1.5873709408208557e-06, + "loss": 0.7168, + "step": 10767 + }, + { + "epoch": 1.61, + "grad_norm": 5.8842837370971095, + "learning_rate": 1.5872927515735318e-06, + "loss": 0.6999, + "step": 10768 + }, + { + "epoch": 1.61, + "grad_norm": 4.826600319711791, + "learning_rate": 1.5872145568449532e-06, + "loss": 0.6829, + "step": 10769 + }, + { + "epoch": 1.61, + "grad_norm": 4.44416003557016, + "learning_rate": 1.5871363566358483e-06, + "loss": 0.6673, + "step": 10770 + }, + { + "epoch": 1.61, + "grad_norm": 0.7770266677490889, + "learning_rate": 1.5870581509469486e-06, + "loss": 0.6667, + "step": 10771 + }, + { + "epoch": 1.61, + "grad_norm": 1.5102882188211637, + "learning_rate": 1.5869799397789831e-06, + "loss": 0.6647, + "step": 10772 + }, + { + "epoch": 1.61, + "grad_norm": 2.975335184941419, + "learning_rate": 1.5869017231326818e-06, + "loss": 0.6504, + "step": 10773 + }, + { + "epoch": 1.61, + "grad_norm": 1.5435798002658043, + "learning_rate": 1.5868235010087748e-06, + "loss": 0.6699, + "step": 10774 + }, + { + "epoch": 1.61, + "grad_norm": 4.935605089526534, + "learning_rate": 1.5867452734079925e-06, + "loss": 0.6595, + "step": 10775 + }, + { + "epoch": 1.61, + "grad_norm": 5.339411442489267, + "learning_rate": 1.5866670403310644e-06, + "loss": 0.6641, + "step": 10776 + }, + { + "epoch": 1.61, + "grad_norm": 4.329686457943396, + "learning_rate": 1.5865888017787207e-06, + "loss": 0.6641, + "step": 10777 + }, + { + "epoch": 1.61, + "grad_norm": 3.498784453455429, + "learning_rate": 1.5865105577516924e-06, + "loss": 0.7129, + "step": 10778 + }, + { + "epoch": 1.61, + "grad_norm": 0.7789620399115162, + "learning_rate": 1.5864323082507087e-06, + "loss": 0.6758, + "step": 10779 + }, + { + "epoch": 1.61, + "grad_norm": 0.9356948104704966, + "learning_rate": 1.5863540532765008e-06, + "loss": 0.6868, + "step": 10780 + }, + { + "epoch": 1.61, + "grad_norm": 1.9203659334465277, + "learning_rate": 1.5862757928297984e-06, + "loss": 0.6504, + "step": 10781 + }, + { + "epoch": 1.61, + "grad_norm": 0.8643464585613446, + "learning_rate": 1.5861975269113321e-06, + "loss": 0.6628, + "step": 10782 + }, + { + "epoch": 1.61, + "grad_norm": 0.8539930337497562, + "learning_rate": 1.5861192555218327e-06, + "loss": 0.6602, + "step": 10783 + }, + { + "epoch": 1.61, + "grad_norm": 4.1983933400382965, + "learning_rate": 1.58604097866203e-06, + "loss": 0.679, + "step": 10784 + }, + { + "epoch": 1.61, + "grad_norm": 0.8606396505430961, + "learning_rate": 1.5859626963326558e-06, + "loss": 0.6693, + "step": 10785 + }, + { + "epoch": 1.61, + "grad_norm": 1.415935133179074, + "learning_rate": 1.5858844085344392e-06, + "loss": 0.696, + "step": 10786 + }, + { + "epoch": 1.61, + "grad_norm": 2.0670825538669866, + "learning_rate": 1.585806115268112e-06, + "loss": 0.6608, + "step": 10787 + }, + { + "epoch": 1.61, + "grad_norm": 3.5796433344825997, + "learning_rate": 1.5857278165344043e-06, + "loss": 0.6712, + "step": 10788 + }, + { + "epoch": 1.61, + "grad_norm": 0.8560322375196245, + "learning_rate": 1.5856495123340473e-06, + "loss": 0.6706, + "step": 10789 + }, + { + "epoch": 1.61, + "grad_norm": 3.355464693331478, + "learning_rate": 1.5855712026677714e-06, + "loss": 0.6901, + "step": 10790 + }, + { + "epoch": 1.61, + "grad_norm": 1.9036202297966078, + "learning_rate": 1.5854928875363078e-06, + "loss": 0.6686, + "step": 10791 + }, + { + "epoch": 1.61, + "grad_norm": 1.3082789436045046, + "learning_rate": 1.5854145669403874e-06, + "loss": 0.6608, + "step": 10792 + }, + { + "epoch": 1.61, + "grad_norm": 0.8185196913128634, + "learning_rate": 1.5853362408807409e-06, + "loss": 0.681, + "step": 10793 + }, + { + "epoch": 1.61, + "grad_norm": 2.0814875935971164, + "learning_rate": 1.5852579093580995e-06, + "loss": 0.6647, + "step": 10794 + }, + { + "epoch": 1.61, + "grad_norm": 2.5072810013207376, + "learning_rate": 1.5851795723731946e-06, + "loss": 0.6673, + "step": 10795 + }, + { + "epoch": 1.61, + "grad_norm": 1.8338129255290356, + "learning_rate": 1.5851012299267565e-06, + "loss": 0.6738, + "step": 10796 + }, + { + "epoch": 1.61, + "grad_norm": 3.5785368659353956, + "learning_rate": 1.5850228820195172e-06, + "loss": 0.6764, + "step": 10797 + }, + { + "epoch": 1.61, + "grad_norm": 4.922504041496011, + "learning_rate": 1.5849445286522076e-06, + "loss": 0.6445, + "step": 10798 + }, + { + "epoch": 1.61, + "grad_norm": 3.2077574509629128, + "learning_rate": 1.5848661698255588e-06, + "loss": 0.6999, + "step": 10799 + }, + { + "epoch": 1.61, + "grad_norm": 2.181838320551803, + "learning_rate": 1.5847878055403024e-06, + "loss": 0.6634, + "step": 10800 + }, + { + "epoch": 1.61, + "grad_norm": 1.9090246468365284, + "learning_rate": 1.5847094357971697e-06, + "loss": 0.6927, + "step": 10801 + }, + { + "epoch": 1.61, + "grad_norm": 2.160670703723923, + "learning_rate": 1.5846310605968923e-06, + "loss": 0.6829, + "step": 10802 + }, + { + "epoch": 1.61, + "grad_norm": 2.5521866857602546, + "learning_rate": 1.5845526799402013e-06, + "loss": 0.6582, + "step": 10803 + }, + { + "epoch": 1.61, + "grad_norm": 1.1524373554235046, + "learning_rate": 1.5844742938278284e-06, + "loss": 0.7025, + "step": 10804 + }, + { + "epoch": 1.61, + "grad_norm": 2.459830967953604, + "learning_rate": 1.5843959022605054e-06, + "loss": 0.6751, + "step": 10805 + }, + { + "epoch": 1.61, + "grad_norm": 0.9204885100904291, + "learning_rate": 1.5843175052389638e-06, + "loss": 0.6816, + "step": 10806 + }, + { + "epoch": 1.61, + "grad_norm": 1.9154824729524065, + "learning_rate": 1.5842391027639348e-06, + "loss": 0.6764, + "step": 10807 + }, + { + "epoch": 1.61, + "grad_norm": 1.0530523004041314, + "learning_rate": 1.5841606948361513e-06, + "loss": 0.6654, + "step": 10808 + }, + { + "epoch": 1.61, + "grad_norm": 1.399015165149739, + "learning_rate": 1.5840822814563439e-06, + "loss": 0.6855, + "step": 10809 + }, + { + "epoch": 1.61, + "grad_norm": 0.9700485861753707, + "learning_rate": 1.584003862625245e-06, + "loss": 0.6576, + "step": 10810 + }, + { + "epoch": 1.61, + "grad_norm": 4.157575594843804, + "learning_rate": 1.5839254383435866e-06, + "loss": 0.6751, + "step": 10811 + }, + { + "epoch": 1.61, + "grad_norm": 0.8208198363976994, + "learning_rate": 1.5838470086121001e-06, + "loss": 0.6504, + "step": 10812 + }, + { + "epoch": 1.61, + "grad_norm": 3.701890409286022, + "learning_rate": 1.5837685734315178e-06, + "loss": 0.6361, + "step": 10813 + }, + { + "epoch": 1.61, + "grad_norm": 1.318702008932996, + "learning_rate": 1.5836901328025717e-06, + "loss": 0.627, + "step": 10814 + }, + { + "epoch": 1.61, + "grad_norm": 0.8840069938854406, + "learning_rate": 1.5836116867259945e-06, + "loss": 0.6797, + "step": 10815 + }, + { + "epoch": 1.61, + "grad_norm": 0.9457181562743554, + "learning_rate": 1.5835332352025177e-06, + "loss": 0.6589, + "step": 10816 + }, + { + "epoch": 1.61, + "grad_norm": 0.9511584922581308, + "learning_rate": 1.5834547782328733e-06, + "loss": 0.6569, + "step": 10817 + }, + { + "epoch": 1.61, + "grad_norm": 2.838789389001184, + "learning_rate": 1.5833763158177941e-06, + "loss": 0.6855, + "step": 10818 + }, + { + "epoch": 1.61, + "grad_norm": 2.2298633252801436, + "learning_rate": 1.5832978479580121e-06, + "loss": 0.6641, + "step": 10819 + }, + { + "epoch": 1.61, + "grad_norm": 2.0685196309837743, + "learning_rate": 1.5832193746542596e-06, + "loss": 0.6706, + "step": 10820 + }, + { + "epoch": 1.61, + "grad_norm": 1.6796460528194717, + "learning_rate": 1.5831408959072694e-06, + "loss": 0.6699, + "step": 10821 + }, + { + "epoch": 1.61, + "grad_norm": 3.0022230524563915, + "learning_rate": 1.5830624117177732e-06, + "loss": 0.6823, + "step": 10822 + }, + { + "epoch": 1.61, + "grad_norm": 1.7144892447726907, + "learning_rate": 1.5829839220865041e-06, + "loss": 0.6608, + "step": 10823 + }, + { + "epoch": 1.61, + "grad_norm": 1.1293658431918006, + "learning_rate": 1.5829054270141948e-06, + "loss": 0.6602, + "step": 10824 + }, + { + "epoch": 1.61, + "grad_norm": 3.9747387722455483, + "learning_rate": 1.5828269265015775e-06, + "loss": 0.6823, + "step": 10825 + }, + { + "epoch": 1.61, + "grad_norm": 1.0157743125964502, + "learning_rate": 1.5827484205493847e-06, + "loss": 0.6543, + "step": 10826 + }, + { + "epoch": 1.61, + "grad_norm": 2.669339999355637, + "learning_rate": 1.5826699091583493e-06, + "loss": 0.6849, + "step": 10827 + }, + { + "epoch": 1.61, + "grad_norm": 2.1894414658972168, + "learning_rate": 1.5825913923292044e-06, + "loss": 0.7025, + "step": 10828 + }, + { + "epoch": 1.62, + "grad_norm": 2.0801933180660597, + "learning_rate": 1.5825128700626823e-06, + "loss": 0.6803, + "step": 10829 + }, + { + "epoch": 1.62, + "grad_norm": 2.322973495327259, + "learning_rate": 1.5824343423595163e-06, + "loss": 0.6836, + "step": 10830 + }, + { + "epoch": 1.62, + "grad_norm": 1.8848285910613736, + "learning_rate": 1.582355809220439e-06, + "loss": 0.6686, + "step": 10831 + }, + { + "epoch": 1.62, + "grad_norm": 2.3239464082898382, + "learning_rate": 1.5822772706461832e-06, + "loss": 0.6569, + "step": 10832 + }, + { + "epoch": 1.62, + "grad_norm": 0.9466892314774642, + "learning_rate": 1.5821987266374826e-06, + "loss": 0.6686, + "step": 10833 + }, + { + "epoch": 1.62, + "grad_norm": 1.0459272382200813, + "learning_rate": 1.5821201771950695e-06, + "loss": 0.6615, + "step": 10834 + }, + { + "epoch": 1.62, + "grad_norm": 0.9967695945441474, + "learning_rate": 1.5820416223196773e-06, + "loss": 0.6836, + "step": 10835 + }, + { + "epoch": 1.62, + "grad_norm": 0.8770053486898904, + "learning_rate": 1.581963062012039e-06, + "loss": 0.6595, + "step": 10836 + }, + { + "epoch": 1.62, + "grad_norm": 0.910773594725433, + "learning_rate": 1.5818844962728881e-06, + "loss": 0.6536, + "step": 10837 + }, + { + "epoch": 1.62, + "grad_norm": 2.5074522117771987, + "learning_rate": 1.581805925102958e-06, + "loss": 0.6875, + "step": 10838 + }, + { + "epoch": 1.62, + "grad_norm": 2.9596419648770977, + "learning_rate": 1.5817273485029814e-06, + "loss": 0.6641, + "step": 10839 + }, + { + "epoch": 1.62, + "grad_norm": 1.0510076392205372, + "learning_rate": 1.581648766473692e-06, + "loss": 0.6523, + "step": 10840 + }, + { + "epoch": 1.62, + "grad_norm": 0.9817154594639532, + "learning_rate": 1.5815701790158236e-06, + "loss": 0.6751, + "step": 10841 + }, + { + "epoch": 1.62, + "grad_norm": 1.301259661785676, + "learning_rate": 1.5814915861301091e-06, + "loss": 0.6595, + "step": 10842 + }, + { + "epoch": 1.62, + "grad_norm": 2.0114616294354986, + "learning_rate": 1.581412987817282e-06, + "loss": 0.679, + "step": 10843 + }, + { + "epoch": 1.62, + "grad_norm": 1.572221454907409, + "learning_rate": 1.5813343840780763e-06, + "loss": 0.6491, + "step": 10844 + }, + { + "epoch": 1.62, + "grad_norm": 0.845910476239507, + "learning_rate": 1.5812557749132257e-06, + "loss": 0.6777, + "step": 10845 + }, + { + "epoch": 1.62, + "grad_norm": 0.956480241470625, + "learning_rate": 1.581177160323463e-06, + "loss": 0.679, + "step": 10846 + }, + { + "epoch": 1.62, + "grad_norm": 3.4073459833751856, + "learning_rate": 1.5810985403095223e-06, + "loss": 0.6777, + "step": 10847 + }, + { + "epoch": 1.62, + "grad_norm": 1.3430517690590422, + "learning_rate": 1.5810199148721382e-06, + "loss": 0.6836, + "step": 10848 + }, + { + "epoch": 1.62, + "grad_norm": 2.9588820669757094, + "learning_rate": 1.5809412840120433e-06, + "loss": 0.6914, + "step": 10849 + }, + { + "epoch": 1.62, + "grad_norm": 1.1009034027596245, + "learning_rate": 1.5808626477299725e-06, + "loss": 0.6556, + "step": 10850 + }, + { + "epoch": 1.62, + "grad_norm": 1.0830545971914356, + "learning_rate": 1.580784006026659e-06, + "loss": 0.6868, + "step": 10851 + }, + { + "epoch": 1.62, + "grad_norm": 1.0278537749453733, + "learning_rate": 1.5807053589028367e-06, + "loss": 0.6602, + "step": 10852 + }, + { + "epoch": 1.62, + "grad_norm": 0.9222050747071919, + "learning_rate": 1.5806267063592405e-06, + "loss": 0.6823, + "step": 10853 + }, + { + "epoch": 1.62, + "grad_norm": 4.186732253476257, + "learning_rate": 1.5805480483966034e-06, + "loss": 0.6641, + "step": 10854 + }, + { + "epoch": 1.62, + "grad_norm": 0.9094514025724776, + "learning_rate": 1.5804693850156605e-06, + "loss": 0.6562, + "step": 10855 + }, + { + "epoch": 1.62, + "grad_norm": 2.6634288653517064, + "learning_rate": 1.580390716217145e-06, + "loss": 0.6335, + "step": 10856 + }, + { + "epoch": 1.62, + "grad_norm": 2.0525260277492583, + "learning_rate": 1.5803120420017919e-06, + "loss": 0.6732, + "step": 10857 + }, + { + "epoch": 1.62, + "grad_norm": 1.8136244536584318, + "learning_rate": 1.5802333623703351e-06, + "loss": 0.6745, + "step": 10858 + }, + { + "epoch": 1.62, + "grad_norm": 3.8010558419343305, + "learning_rate": 1.5801546773235091e-06, + "loss": 0.6556, + "step": 10859 + }, + { + "epoch": 1.62, + "grad_norm": 1.629618943054998, + "learning_rate": 1.5800759868620483e-06, + "loss": 0.6465, + "step": 10860 + }, + { + "epoch": 1.62, + "grad_norm": 1.4128627436945658, + "learning_rate": 1.5799972909866867e-06, + "loss": 0.6478, + "step": 10861 + }, + { + "epoch": 1.62, + "grad_norm": 1.689569587139332, + "learning_rate": 1.5799185896981596e-06, + "loss": 0.7064, + "step": 10862 + }, + { + "epoch": 1.62, + "grad_norm": 2.7000922590271332, + "learning_rate": 1.5798398829972004e-06, + "loss": 0.6556, + "step": 10863 + }, + { + "epoch": 1.62, + "grad_norm": 0.9423234281970452, + "learning_rate": 1.5797611708845447e-06, + "loss": 0.6615, + "step": 10864 + }, + { + "epoch": 1.62, + "grad_norm": 5.25358674047133, + "learning_rate": 1.5796824533609265e-06, + "loss": 0.6732, + "step": 10865 + }, + { + "epoch": 1.62, + "grad_norm": 3.506514262054172, + "learning_rate": 1.5796037304270809e-06, + "loss": 0.6934, + "step": 10866 + }, + { + "epoch": 1.62, + "grad_norm": 1.4669980573473138, + "learning_rate": 1.5795250020837423e-06, + "loss": 0.6914, + "step": 10867 + }, + { + "epoch": 1.62, + "grad_norm": 1.4573506142577413, + "learning_rate": 1.5794462683316456e-06, + "loss": 0.6699, + "step": 10868 + }, + { + "epoch": 1.62, + "grad_norm": 2.797288241815941, + "learning_rate": 1.5793675291715256e-06, + "loss": 0.6432, + "step": 10869 + }, + { + "epoch": 1.62, + "grad_norm": 1.3393245995853789, + "learning_rate": 1.5792887846041173e-06, + "loss": 0.7018, + "step": 10870 + }, + { + "epoch": 1.62, + "grad_norm": 2.905258121767591, + "learning_rate": 1.5792100346301555e-06, + "loss": 0.6745, + "step": 10871 + }, + { + "epoch": 1.62, + "grad_norm": 2.2455463319774505, + "learning_rate": 1.5791312792503754e-06, + "loss": 0.6751, + "step": 10872 + }, + { + "epoch": 1.62, + "grad_norm": 2.099197399590721, + "learning_rate": 1.5790525184655117e-06, + "loss": 0.6927, + "step": 10873 + }, + { + "epoch": 1.62, + "grad_norm": 3.4544582753929207, + "learning_rate": 1.5789737522762997e-06, + "loss": 0.6673, + "step": 10874 + }, + { + "epoch": 1.62, + "grad_norm": 0.9706640604385935, + "learning_rate": 1.5788949806834743e-06, + "loss": 0.6901, + "step": 10875 + }, + { + "epoch": 1.62, + "grad_norm": 1.5160975865259367, + "learning_rate": 1.5788162036877709e-06, + "loss": 0.7135, + "step": 10876 + }, + { + "epoch": 1.62, + "grad_norm": 4.69208507040437, + "learning_rate": 1.5787374212899247e-06, + "loss": 0.6921, + "step": 10877 + }, + { + "epoch": 1.62, + "grad_norm": 1.8855290135376344, + "learning_rate": 1.5786586334906711e-06, + "loss": 0.6823, + "step": 10878 + }, + { + "epoch": 1.62, + "grad_norm": 2.679696473697665, + "learning_rate": 1.5785798402907453e-06, + "loss": 0.6693, + "step": 10879 + }, + { + "epoch": 1.62, + "grad_norm": 8.204966584821817, + "learning_rate": 1.5785010416908823e-06, + "loss": 0.7025, + "step": 10880 + }, + { + "epoch": 1.62, + "grad_norm": 2.136604135695541, + "learning_rate": 1.5784222376918183e-06, + "loss": 0.653, + "step": 10881 + }, + { + "epoch": 1.62, + "grad_norm": 1.369216482913039, + "learning_rate": 1.5783434282942882e-06, + "loss": 0.6712, + "step": 10882 + }, + { + "epoch": 1.62, + "grad_norm": 1.1298008047460024, + "learning_rate": 1.5782646134990278e-06, + "loss": 0.6738, + "step": 10883 + }, + { + "epoch": 1.62, + "grad_norm": 1.816092444896434, + "learning_rate": 1.5781857933067731e-06, + "loss": 0.7012, + "step": 10884 + }, + { + "epoch": 1.62, + "grad_norm": 3.444683204721409, + "learning_rate": 1.5781069677182585e-06, + "loss": 0.6842, + "step": 10885 + }, + { + "epoch": 1.62, + "grad_norm": 6.632631276018449, + "learning_rate": 1.578028136734221e-06, + "loss": 0.7109, + "step": 10886 + }, + { + "epoch": 1.62, + "grad_norm": 5.219793411889018, + "learning_rate": 1.5779493003553954e-06, + "loss": 0.6986, + "step": 10887 + }, + { + "epoch": 1.62, + "grad_norm": 0.7814650339840221, + "learning_rate": 1.577870458582518e-06, + "loss": 0.7025, + "step": 10888 + }, + { + "epoch": 1.62, + "grad_norm": 1.220889433551564, + "learning_rate": 1.5777916114163247e-06, + "loss": 0.6908, + "step": 10889 + }, + { + "epoch": 1.62, + "grad_norm": 4.5407129468413965, + "learning_rate": 1.577712758857551e-06, + "loss": 0.6908, + "step": 10890 + }, + { + "epoch": 1.62, + "grad_norm": 4.090404316393218, + "learning_rate": 1.577633900906933e-06, + "loss": 0.6784, + "step": 10891 + }, + { + "epoch": 1.62, + "grad_norm": 1.8495097335816986, + "learning_rate": 1.5775550375652067e-06, + "loss": 0.6784, + "step": 10892 + }, + { + "epoch": 1.62, + "grad_norm": 1.8969914788994002, + "learning_rate": 1.5774761688331082e-06, + "loss": 0.6784, + "step": 10893 + }, + { + "epoch": 1.62, + "grad_norm": 2.042289938802985, + "learning_rate": 1.577397294711374e-06, + "loss": 0.6478, + "step": 10894 + }, + { + "epoch": 1.62, + "grad_norm": 1.872418807566418, + "learning_rate": 1.577318415200739e-06, + "loss": 0.6699, + "step": 10895 + }, + { + "epoch": 1.63, + "grad_norm": 0.6379590771404006, + "learning_rate": 1.5772395303019405e-06, + "loss": 0.6602, + "step": 10896 + }, + { + "epoch": 1.63, + "grad_norm": 2.0122191409371633, + "learning_rate": 1.5771606400157146e-06, + "loss": 0.6836, + "step": 10897 + }, + { + "epoch": 1.63, + "grad_norm": 1.0859701688342407, + "learning_rate": 1.5770817443427974e-06, + "loss": 0.6706, + "step": 10898 + }, + { + "epoch": 1.63, + "grad_norm": 4.405799645696073, + "learning_rate": 1.577002843283925e-06, + "loss": 0.696, + "step": 10899 + }, + { + "epoch": 1.63, + "grad_norm": 2.5968189217224458, + "learning_rate": 1.576923936839834e-06, + "loss": 0.6549, + "step": 10900 + }, + { + "epoch": 1.63, + "grad_norm": 5.303799362696809, + "learning_rate": 1.5768450250112611e-06, + "loss": 0.6908, + "step": 10901 + }, + { + "epoch": 1.63, + "grad_norm": 8.031690518433326, + "learning_rate": 1.5767661077989423e-06, + "loss": 0.6888, + "step": 10902 + }, + { + "epoch": 1.63, + "grad_norm": 2.111060782740673, + "learning_rate": 1.5766871852036147e-06, + "loss": 0.6732, + "step": 10903 + }, + { + "epoch": 1.63, + "grad_norm": 0.7046592465576236, + "learning_rate": 1.5766082572260145e-06, + "loss": 0.6751, + "step": 10904 + }, + { + "epoch": 1.63, + "grad_norm": 0.5682048330347854, + "learning_rate": 1.5765293238668783e-06, + "loss": 0.6745, + "step": 10905 + }, + { + "epoch": 1.63, + "grad_norm": 3.196006997753249, + "learning_rate": 1.5764503851269431e-06, + "loss": 0.6536, + "step": 10906 + }, + { + "epoch": 1.63, + "grad_norm": 3.4240489425377034, + "learning_rate": 1.5763714410069456e-06, + "loss": 0.6712, + "step": 10907 + }, + { + "epoch": 1.63, + "grad_norm": 0.7190492386183969, + "learning_rate": 1.5762924915076224e-06, + "loss": 0.6803, + "step": 10908 + }, + { + "epoch": 1.63, + "grad_norm": 3.5589222745559774, + "learning_rate": 1.5762135366297101e-06, + "loss": 0.6836, + "step": 10909 + }, + { + "epoch": 1.63, + "grad_norm": 2.753186264839048, + "learning_rate": 1.5761345763739463e-06, + "loss": 0.7142, + "step": 10910 + }, + { + "epoch": 1.63, + "grad_norm": 2.950976156585306, + "learning_rate": 1.5760556107410674e-06, + "loss": 0.6764, + "step": 10911 + }, + { + "epoch": 1.63, + "grad_norm": 2.8750083653396317, + "learning_rate": 1.5759766397318103e-06, + "loss": 0.6986, + "step": 10912 + }, + { + "epoch": 1.63, + "grad_norm": 0.7156017502585692, + "learning_rate": 1.5758976633469128e-06, + "loss": 0.6829, + "step": 10913 + }, + { + "epoch": 1.63, + "grad_norm": 5.116291418460826, + "learning_rate": 1.575818681587111e-06, + "loss": 0.6699, + "step": 10914 + }, + { + "epoch": 1.63, + "grad_norm": 7.034432795264781, + "learning_rate": 1.575739694453143e-06, + "loss": 0.6823, + "step": 10915 + }, + { + "epoch": 1.63, + "grad_norm": 1.6924086205399211, + "learning_rate": 1.5756607019457452e-06, + "loss": 0.6895, + "step": 10916 + }, + { + "epoch": 1.63, + "grad_norm": 0.846144029242367, + "learning_rate": 1.5755817040656551e-06, + "loss": 0.6797, + "step": 10917 + }, + { + "epoch": 1.63, + "grad_norm": 2.8709558181365127, + "learning_rate": 1.57550270081361e-06, + "loss": 0.6725, + "step": 10918 + }, + { + "epoch": 1.63, + "grad_norm": 0.9721618694161916, + "learning_rate": 1.5754236921903475e-06, + "loss": 0.6921, + "step": 10919 + }, + { + "epoch": 1.63, + "grad_norm": 0.6979067835663101, + "learning_rate": 1.5753446781966049e-06, + "loss": 0.6751, + "step": 10920 + }, + { + "epoch": 1.63, + "grad_norm": 0.872306573131073, + "learning_rate": 1.5752656588331193e-06, + "loss": 0.6784, + "step": 10921 + }, + { + "epoch": 1.63, + "grad_norm": 1.6956777739750108, + "learning_rate": 1.5751866341006283e-06, + "loss": 0.6855, + "step": 10922 + }, + { + "epoch": 1.63, + "grad_norm": 1.1897887993478684, + "learning_rate": 1.5751076039998698e-06, + "loss": 0.6693, + "step": 10923 + }, + { + "epoch": 1.63, + "grad_norm": 1.34467964753188, + "learning_rate": 1.5750285685315814e-06, + "loss": 0.6484, + "step": 10924 + }, + { + "epoch": 1.63, + "grad_norm": 0.7549339994270977, + "learning_rate": 1.5749495276964999e-06, + "loss": 0.6725, + "step": 10925 + }, + { + "epoch": 1.63, + "grad_norm": 1.075183602347167, + "learning_rate": 1.5748704814953643e-06, + "loss": 0.6725, + "step": 10926 + }, + { + "epoch": 1.63, + "grad_norm": 1.6118381754313562, + "learning_rate": 1.5747914299289113e-06, + "loss": 0.6706, + "step": 10927 + }, + { + "epoch": 1.63, + "grad_norm": 7.348179859933883, + "learning_rate": 1.574712372997879e-06, + "loss": 0.6771, + "step": 10928 + }, + { + "epoch": 1.63, + "grad_norm": 2.550081507486053, + "learning_rate": 1.5746333107030054e-06, + "loss": 0.666, + "step": 10929 + }, + { + "epoch": 1.63, + "grad_norm": 0.9218505298261529, + "learning_rate": 1.5745542430450285e-06, + "loss": 0.681, + "step": 10930 + }, + { + "epoch": 1.63, + "grad_norm": 0.9136566605186969, + "learning_rate": 1.5744751700246856e-06, + "loss": 0.6875, + "step": 10931 + }, + { + "epoch": 1.63, + "grad_norm": 1.8948483462206522, + "learning_rate": 1.5743960916427153e-06, + "loss": 0.6836, + "step": 10932 + }, + { + "epoch": 1.63, + "grad_norm": 1.0234674194939835, + "learning_rate": 1.5743170078998555e-06, + "loss": 0.6615, + "step": 10933 + }, + { + "epoch": 1.63, + "grad_norm": 0.9943207425774195, + "learning_rate": 1.5742379187968444e-06, + "loss": 0.6777, + "step": 10934 + }, + { + "epoch": 1.63, + "grad_norm": 1.3740746814502571, + "learning_rate": 1.5741588243344199e-06, + "loss": 0.6693, + "step": 10935 + }, + { + "epoch": 1.63, + "grad_norm": 2.0456323142121864, + "learning_rate": 1.5740797245133204e-06, + "loss": 0.6361, + "step": 10936 + }, + { + "epoch": 1.63, + "grad_norm": 1.570121096311766, + "learning_rate": 1.574000619334284e-06, + "loss": 0.6868, + "step": 10937 + }, + { + "epoch": 1.63, + "grad_norm": 2.109992810350273, + "learning_rate": 1.573921508798049e-06, + "loss": 0.6654, + "step": 10938 + }, + { + "epoch": 1.63, + "grad_norm": 3.724521926961035, + "learning_rate": 1.5738423929053538e-06, + "loss": 0.6934, + "step": 10939 + }, + { + "epoch": 1.63, + "grad_norm": 5.213900177645598, + "learning_rate": 1.573763271656937e-06, + "loss": 0.6868, + "step": 10940 + }, + { + "epoch": 1.63, + "grad_norm": 0.814279246307477, + "learning_rate": 1.5736841450535368e-06, + "loss": 0.6758, + "step": 10941 + }, + { + "epoch": 1.63, + "grad_norm": 1.1725572902559482, + "learning_rate": 1.5736050130958915e-06, + "loss": 0.6426, + "step": 10942 + }, + { + "epoch": 1.63, + "grad_norm": 1.974960599269594, + "learning_rate": 1.57352587578474e-06, + "loss": 0.696, + "step": 10943 + }, + { + "epoch": 1.63, + "grad_norm": 0.8639996485870586, + "learning_rate": 1.573446733120821e-06, + "loss": 0.6816, + "step": 10944 + }, + { + "epoch": 1.63, + "grad_norm": 1.3657074449299011, + "learning_rate": 1.5733675851048727e-06, + "loss": 0.6621, + "step": 10945 + }, + { + "epoch": 1.63, + "grad_norm": 1.1340741836927337, + "learning_rate": 1.5732884317376339e-06, + "loss": 0.6829, + "step": 10946 + }, + { + "epoch": 1.63, + "grad_norm": 2.3840299963219964, + "learning_rate": 1.5732092730198438e-06, + "loss": 0.6686, + "step": 10947 + }, + { + "epoch": 1.63, + "grad_norm": 0.9149744869998353, + "learning_rate": 1.5731301089522408e-06, + "loss": 0.6771, + "step": 10948 + }, + { + "epoch": 1.63, + "grad_norm": 1.3611126839719732, + "learning_rate": 1.5730509395355635e-06, + "loss": 0.6719, + "step": 10949 + }, + { + "epoch": 1.63, + "grad_norm": 1.0077093364365355, + "learning_rate": 1.5729717647705517e-06, + "loss": 0.6855, + "step": 10950 + }, + { + "epoch": 1.63, + "grad_norm": 1.5681658388419473, + "learning_rate": 1.572892584657943e-06, + "loss": 0.6934, + "step": 10951 + }, + { + "epoch": 1.63, + "grad_norm": 1.230216131839852, + "learning_rate": 1.5728133991984776e-06, + "loss": 0.6706, + "step": 10952 + }, + { + "epoch": 1.63, + "grad_norm": 2.5449229934003084, + "learning_rate": 1.5727342083928941e-06, + "loss": 0.6576, + "step": 10953 + }, + { + "epoch": 1.63, + "grad_norm": 4.437802055893595, + "learning_rate": 1.5726550122419313e-06, + "loss": 0.6725, + "step": 10954 + }, + { + "epoch": 1.63, + "grad_norm": 3.082945808799576, + "learning_rate": 1.5725758107463287e-06, + "loss": 0.6621, + "step": 10955 + }, + { + "epoch": 1.63, + "grad_norm": 0.9964887351024893, + "learning_rate": 1.5724966039068255e-06, + "loss": 0.6855, + "step": 10956 + }, + { + "epoch": 1.63, + "grad_norm": 3.0159438346265537, + "learning_rate": 1.5724173917241611e-06, + "loss": 0.6882, + "step": 10957 + }, + { + "epoch": 1.63, + "grad_norm": 2.1281041897269204, + "learning_rate": 1.5723381741990745e-06, + "loss": 0.6803, + "step": 10958 + }, + { + "epoch": 1.63, + "grad_norm": 2.5770437752823474, + "learning_rate": 1.5722589513323046e-06, + "loss": 0.6823, + "step": 10959 + }, + { + "epoch": 1.63, + "grad_norm": 0.9219254068289353, + "learning_rate": 1.572179723124592e-06, + "loss": 0.653, + "step": 10960 + }, + { + "epoch": 1.63, + "grad_norm": 0.7983515852505811, + "learning_rate": 1.572100489576675e-06, + "loss": 0.6784, + "step": 10961 + }, + { + "epoch": 1.63, + "grad_norm": 1.831478458549315, + "learning_rate": 1.5720212506892935e-06, + "loss": 0.6842, + "step": 10962 + }, + { + "epoch": 1.64, + "grad_norm": 1.528216391443293, + "learning_rate": 1.5719420064631871e-06, + "loss": 0.6855, + "step": 10963 + }, + { + "epoch": 1.64, + "grad_norm": 1.9589306300799871, + "learning_rate": 1.5718627568990955e-06, + "loss": 0.7051, + "step": 10964 + }, + { + "epoch": 1.64, + "grad_norm": 2.2410105370776954, + "learning_rate": 1.5717835019977583e-06, + "loss": 0.6667, + "step": 10965 + }, + { + "epoch": 1.64, + "grad_norm": 3.3701702737774113, + "learning_rate": 1.5717042417599151e-06, + "loss": 0.6576, + "step": 10966 + }, + { + "epoch": 1.64, + "grad_norm": 0.7117827525348709, + "learning_rate": 1.5716249761863056e-06, + "loss": 0.6784, + "step": 10967 + }, + { + "epoch": 1.64, + "grad_norm": 2.541445167817156, + "learning_rate": 1.5715457052776695e-06, + "loss": 0.6576, + "step": 10968 + }, + { + "epoch": 1.64, + "grad_norm": 1.128723950159252, + "learning_rate": 1.5714664290347468e-06, + "loss": 0.6842, + "step": 10969 + }, + { + "epoch": 1.64, + "grad_norm": 1.1238091725276853, + "learning_rate": 1.5713871474582776e-06, + "loss": 0.6634, + "step": 10970 + }, + { + "epoch": 1.64, + "grad_norm": 3.927287928866984, + "learning_rate": 1.5713078605490015e-06, + "loss": 0.6836, + "step": 10971 + }, + { + "epoch": 1.64, + "grad_norm": 3.0782073517569946, + "learning_rate": 1.5712285683076586e-06, + "loss": 0.7129, + "step": 10972 + }, + { + "epoch": 1.64, + "grad_norm": 2.6506574136045127, + "learning_rate": 1.5711492707349892e-06, + "loss": 0.6712, + "step": 10973 + }, + { + "epoch": 1.64, + "grad_norm": 0.7899508058710776, + "learning_rate": 1.5710699678317327e-06, + "loss": 0.6836, + "step": 10974 + }, + { + "epoch": 1.64, + "grad_norm": 1.0889368380581284, + "learning_rate": 1.57099065959863e-06, + "loss": 0.6725, + "step": 10975 + }, + { + "epoch": 1.64, + "grad_norm": 1.4821343769582394, + "learning_rate": 1.5709113460364209e-06, + "loss": 0.6465, + "step": 10976 + }, + { + "epoch": 1.64, + "grad_norm": 1.1628228811368406, + "learning_rate": 1.5708320271458458e-06, + "loss": 0.6849, + "step": 10977 + }, + { + "epoch": 1.64, + "grad_norm": 3.376097379563005, + "learning_rate": 1.5707527029276447e-06, + "loss": 0.6641, + "step": 10978 + }, + { + "epoch": 1.64, + "grad_norm": 1.1831951499265478, + "learning_rate": 1.5706733733825583e-06, + "loss": 0.6921, + "step": 10979 + }, + { + "epoch": 1.64, + "grad_norm": 2.4223063622582006, + "learning_rate": 1.5705940385113267e-06, + "loss": 0.6966, + "step": 10980 + }, + { + "epoch": 1.64, + "grad_norm": 1.1467797957680779, + "learning_rate": 1.5705146983146907e-06, + "loss": 0.6842, + "step": 10981 + }, + { + "epoch": 1.64, + "grad_norm": 2.499514511518596, + "learning_rate": 1.5704353527933903e-06, + "loss": 0.6608, + "step": 10982 + }, + { + "epoch": 1.64, + "grad_norm": 2.193462364197924, + "learning_rate": 1.5703560019481668e-06, + "loss": 0.6589, + "step": 10983 + }, + { + "epoch": 1.64, + "grad_norm": 1.0985943897379518, + "learning_rate": 1.5702766457797596e-06, + "loss": 0.6771, + "step": 10984 + }, + { + "epoch": 1.64, + "grad_norm": 2.926762124295069, + "learning_rate": 1.5701972842889104e-06, + "loss": 0.6465, + "step": 10985 + }, + { + "epoch": 1.64, + "grad_norm": 3.068431986437767, + "learning_rate": 1.5701179174763596e-06, + "loss": 0.6654, + "step": 10986 + }, + { + "epoch": 1.64, + "grad_norm": 3.0228241407132734, + "learning_rate": 1.5700385453428478e-06, + "loss": 0.6504, + "step": 10987 + }, + { + "epoch": 1.64, + "grad_norm": 1.7526396309636663, + "learning_rate": 1.5699591678891157e-06, + "loss": 0.6862, + "step": 10988 + }, + { + "epoch": 1.64, + "grad_norm": 0.8663143342100569, + "learning_rate": 1.5698797851159044e-06, + "loss": 0.6803, + "step": 10989 + }, + { + "epoch": 1.64, + "grad_norm": 0.8010232033255492, + "learning_rate": 1.5698003970239546e-06, + "loss": 0.6836, + "step": 10990 + }, + { + "epoch": 1.64, + "grad_norm": 1.5314561438062217, + "learning_rate": 1.5697210036140074e-06, + "loss": 0.6849, + "step": 10991 + }, + { + "epoch": 1.64, + "grad_norm": 4.452700519343828, + "learning_rate": 1.5696416048868036e-06, + "loss": 0.6732, + "step": 10992 + }, + { + "epoch": 1.64, + "grad_norm": 1.2005256212753086, + "learning_rate": 1.5695622008430846e-06, + "loss": 0.6439, + "step": 10993 + }, + { + "epoch": 1.64, + "grad_norm": 2.836220200999536, + "learning_rate": 1.569482791483591e-06, + "loss": 0.6882, + "step": 10994 + }, + { + "epoch": 1.64, + "grad_norm": 0.9426487619067886, + "learning_rate": 1.5694033768090638e-06, + "loss": 0.638, + "step": 10995 + }, + { + "epoch": 1.64, + "grad_norm": 2.853220524231923, + "learning_rate": 1.569323956820245e-06, + "loss": 0.6882, + "step": 10996 + }, + { + "epoch": 1.64, + "grad_norm": 1.5894680463171693, + "learning_rate": 1.5692445315178753e-06, + "loss": 0.6836, + "step": 10997 + }, + { + "epoch": 1.64, + "grad_norm": 2.6641968795992494, + "learning_rate": 1.569165100902696e-06, + "loss": 0.6589, + "step": 10998 + }, + { + "epoch": 1.64, + "grad_norm": 1.3024425999022622, + "learning_rate": 1.5690856649754482e-06, + "loss": 0.6823, + "step": 10999 + }, + { + "epoch": 1.64, + "grad_norm": 2.4018028229974293, + "learning_rate": 1.569006223736874e-06, + "loss": 0.6797, + "step": 11000 + }, + { + "epoch": 1.64, + "grad_norm": 1.3558012450356627, + "learning_rate": 1.568926777187714e-06, + "loss": 0.666, + "step": 11001 + }, + { + "epoch": 1.64, + "grad_norm": 1.5966894607211775, + "learning_rate": 1.5688473253287104e-06, + "loss": 0.6543, + "step": 11002 + }, + { + "epoch": 1.64, + "grad_norm": 2.7342646754833875, + "learning_rate": 1.5687678681606043e-06, + "loss": 0.6549, + "step": 11003 + }, + { + "epoch": 1.64, + "grad_norm": 0.8996206334838268, + "learning_rate": 1.5686884056841372e-06, + "loss": 0.6712, + "step": 11004 + }, + { + "epoch": 1.64, + "grad_norm": 1.5978410243115855, + "learning_rate": 1.568608937900051e-06, + "loss": 0.6745, + "step": 11005 + }, + { + "epoch": 1.64, + "grad_norm": 0.9627013908844201, + "learning_rate": 1.5685294648090876e-06, + "loss": 0.6732, + "step": 11006 + }, + { + "epoch": 1.64, + "grad_norm": 2.0233960594705365, + "learning_rate": 1.5684499864119883e-06, + "loss": 0.6829, + "step": 11007 + }, + { + "epoch": 1.64, + "grad_norm": 3.60832674034636, + "learning_rate": 1.5683705027094947e-06, + "loss": 0.6868, + "step": 11008 + }, + { + "epoch": 1.64, + "grad_norm": 0.9133052609936653, + "learning_rate": 1.5682910137023493e-06, + "loss": 0.6471, + "step": 11009 + }, + { + "epoch": 1.64, + "grad_norm": 2.929251407178253, + "learning_rate": 1.5682115193912935e-06, + "loss": 0.6458, + "step": 11010 + }, + { + "epoch": 1.64, + "grad_norm": 2.920223959074569, + "learning_rate": 1.5681320197770692e-06, + "loss": 0.6465, + "step": 11011 + }, + { + "epoch": 1.64, + "grad_norm": 2.2506515716931528, + "learning_rate": 1.5680525148604184e-06, + "loss": 0.6784, + "step": 11012 + }, + { + "epoch": 1.64, + "grad_norm": 2.19978538271458, + "learning_rate": 1.5679730046420838e-06, + "loss": 0.6823, + "step": 11013 + }, + { + "epoch": 1.64, + "grad_norm": 1.1315186159684851, + "learning_rate": 1.5678934891228065e-06, + "loss": 0.6582, + "step": 11014 + }, + { + "epoch": 1.64, + "grad_norm": 3.0583909406202303, + "learning_rate": 1.5678139683033292e-06, + "loss": 0.6712, + "step": 11015 + }, + { + "epoch": 1.64, + "grad_norm": 0.8821155846606155, + "learning_rate": 1.5677344421843936e-06, + "loss": 0.6549, + "step": 11016 + }, + { + "epoch": 1.64, + "grad_norm": 1.2782099014572355, + "learning_rate": 1.5676549107667424e-06, + "loss": 0.6751, + "step": 11017 + }, + { + "epoch": 1.64, + "grad_norm": 1.8713202447661705, + "learning_rate": 1.5675753740511179e-06, + "loss": 0.7161, + "step": 11018 + }, + { + "epoch": 1.64, + "grad_norm": 3.8669465406688337, + "learning_rate": 1.5674958320382623e-06, + "loss": 0.6634, + "step": 11019 + }, + { + "epoch": 1.64, + "grad_norm": 3.071454471754388, + "learning_rate": 1.5674162847289176e-06, + "loss": 0.6751, + "step": 11020 + }, + { + "epoch": 1.64, + "grad_norm": 4.651945493672988, + "learning_rate": 1.5673367321238267e-06, + "loss": 0.6738, + "step": 11021 + }, + { + "epoch": 1.64, + "grad_norm": 1.5139825774792492, + "learning_rate": 1.567257174223732e-06, + "loss": 0.6719, + "step": 11022 + }, + { + "epoch": 1.64, + "grad_norm": 1.1338159157343408, + "learning_rate": 1.5671776110293757e-06, + "loss": 0.6315, + "step": 11023 + }, + { + "epoch": 1.64, + "grad_norm": 1.2665135610964597, + "learning_rate": 1.5670980425415006e-06, + "loss": 0.6901, + "step": 11024 + }, + { + "epoch": 1.64, + "grad_norm": 2.326544767457685, + "learning_rate": 1.5670184687608497e-06, + "loss": 0.6966, + "step": 11025 + }, + { + "epoch": 1.64, + "grad_norm": 4.856435482137852, + "learning_rate": 1.566938889688165e-06, + "loss": 0.6322, + "step": 11026 + }, + { + "epoch": 1.64, + "grad_norm": 3.3812227492927738, + "learning_rate": 1.5668593053241894e-06, + "loss": 0.679, + "step": 11027 + }, + { + "epoch": 1.64, + "grad_norm": 0.8116644450458828, + "learning_rate": 1.566779715669666e-06, + "loss": 0.6797, + "step": 11028 + }, + { + "epoch": 1.64, + "grad_norm": 2.024558192933994, + "learning_rate": 1.5667001207253372e-06, + "loss": 0.6725, + "step": 11029 + }, + { + "epoch": 1.65, + "grad_norm": 1.7460470047728045, + "learning_rate": 1.5666205204919462e-06, + "loss": 0.6771, + "step": 11030 + }, + { + "epoch": 1.65, + "grad_norm": 1.1559757087039901, + "learning_rate": 1.5665409149702357e-06, + "loss": 0.6927, + "step": 11031 + }, + { + "epoch": 1.65, + "grad_norm": 1.3104958745537196, + "learning_rate": 1.5664613041609487e-06, + "loss": 0.651, + "step": 11032 + }, + { + "epoch": 1.65, + "grad_norm": 2.7116229562079615, + "learning_rate": 1.5663816880648283e-06, + "loss": 0.6764, + "step": 11033 + }, + { + "epoch": 1.65, + "grad_norm": 3.0851606118462764, + "learning_rate": 1.5663020666826175e-06, + "loss": 0.6842, + "step": 11034 + }, + { + "epoch": 1.65, + "grad_norm": 2.6236287283413167, + "learning_rate": 1.5662224400150596e-06, + "loss": 0.6855, + "step": 11035 + }, + { + "epoch": 1.65, + "grad_norm": 1.752063287548609, + "learning_rate": 1.5661428080628975e-06, + "loss": 0.6823, + "step": 11036 + }, + { + "epoch": 1.65, + "grad_norm": 0.842951190285467, + "learning_rate": 1.5660631708268744e-06, + "loss": 0.6829, + "step": 11037 + }, + { + "epoch": 1.65, + "grad_norm": 1.1116644534545757, + "learning_rate": 1.5659835283077335e-06, + "loss": 0.6875, + "step": 11038 + }, + { + "epoch": 1.65, + "grad_norm": 3.251929075048706, + "learning_rate": 1.5659038805062189e-06, + "loss": 0.6719, + "step": 11039 + }, + { + "epoch": 1.65, + "grad_norm": 1.8644715149599564, + "learning_rate": 1.5658242274230728e-06, + "loss": 0.6777, + "step": 11040 + }, + { + "epoch": 1.65, + "grad_norm": 0.7924832400664795, + "learning_rate": 1.5657445690590391e-06, + "loss": 0.6751, + "step": 11041 + }, + { + "epoch": 1.65, + "grad_norm": 0.8258657749569108, + "learning_rate": 1.5656649054148617e-06, + "loss": 0.681, + "step": 11042 + }, + { + "epoch": 1.65, + "grad_norm": 1.0103920740215293, + "learning_rate": 1.5655852364912835e-06, + "loss": 0.6712, + "step": 11043 + }, + { + "epoch": 1.65, + "grad_norm": 1.1591239295166853, + "learning_rate": 1.565505562289048e-06, + "loss": 0.6439, + "step": 11044 + }, + { + "epoch": 1.65, + "grad_norm": 2.4546565091174037, + "learning_rate": 1.5654258828088995e-06, + "loss": 0.6543, + "step": 11045 + }, + { + "epoch": 1.65, + "grad_norm": 2.2132332252496725, + "learning_rate": 1.5653461980515813e-06, + "loss": 0.6979, + "step": 11046 + }, + { + "epoch": 1.65, + "grad_norm": 0.8351220578941677, + "learning_rate": 1.5652665080178367e-06, + "loss": 0.6764, + "step": 11047 + }, + { + "epoch": 1.65, + "grad_norm": 1.1169301677372152, + "learning_rate": 1.5651868127084097e-06, + "loss": 0.6829, + "step": 11048 + }, + { + "epoch": 1.65, + "grad_norm": 4.499217816718475, + "learning_rate": 1.5651071121240444e-06, + "loss": 0.6406, + "step": 11049 + }, + { + "epoch": 1.65, + "grad_norm": 1.3506902595446095, + "learning_rate": 1.5650274062654844e-06, + "loss": 0.666, + "step": 11050 + }, + { + "epoch": 1.65, + "grad_norm": 3.063937252141609, + "learning_rate": 1.564947695133474e-06, + "loss": 0.6803, + "step": 11051 + }, + { + "epoch": 1.65, + "grad_norm": 0.9412156869288277, + "learning_rate": 1.5648679787287562e-06, + "loss": 0.6875, + "step": 11052 + }, + { + "epoch": 1.65, + "grad_norm": 2.5726772251929617, + "learning_rate": 1.5647882570520761e-06, + "loss": 0.6868, + "step": 11053 + }, + { + "epoch": 1.65, + "grad_norm": 1.2362289612781736, + "learning_rate": 1.5647085301041768e-06, + "loss": 0.653, + "step": 11054 + }, + { + "epoch": 1.65, + "grad_norm": 1.311084409902412, + "learning_rate": 1.5646287978858033e-06, + "loss": 0.6497, + "step": 11055 + }, + { + "epoch": 1.65, + "grad_norm": 3.150765599392005, + "learning_rate": 1.5645490603976992e-06, + "loss": 0.668, + "step": 11056 + }, + { + "epoch": 1.65, + "grad_norm": 1.7884654060186447, + "learning_rate": 1.5644693176406085e-06, + "loss": 0.6562, + "step": 11057 + }, + { + "epoch": 1.65, + "grad_norm": 0.9591915050447417, + "learning_rate": 1.564389569615276e-06, + "loss": 0.6576, + "step": 11058 + }, + { + "epoch": 1.65, + "grad_norm": 4.6783593336631135, + "learning_rate": 1.5643098163224458e-06, + "loss": 0.6693, + "step": 11059 + }, + { + "epoch": 1.65, + "grad_norm": 0.8203033351723013, + "learning_rate": 1.564230057762862e-06, + "loss": 0.6426, + "step": 11060 + }, + { + "epoch": 1.65, + "grad_norm": 0.9620683386555, + "learning_rate": 1.564150293937269e-06, + "loss": 0.6693, + "step": 11061 + }, + { + "epoch": 1.65, + "grad_norm": 1.5890499889393723, + "learning_rate": 1.5640705248464116e-06, + "loss": 0.6751, + "step": 11062 + }, + { + "epoch": 1.65, + "grad_norm": 1.7196768432680012, + "learning_rate": 1.5639907504910344e-06, + "loss": 0.6686, + "step": 11063 + }, + { + "epoch": 1.65, + "grad_norm": 3.1058862697909686, + "learning_rate": 1.5639109708718815e-06, + "loss": 0.6758, + "step": 11064 + }, + { + "epoch": 1.65, + "grad_norm": 3.427161859140091, + "learning_rate": 1.5638311859896973e-06, + "loss": 0.6712, + "step": 11065 + }, + { + "epoch": 1.65, + "grad_norm": 2.9360148694600396, + "learning_rate": 1.563751395845227e-06, + "loss": 0.6771, + "step": 11066 + }, + { + "epoch": 1.65, + "grad_norm": 1.4477488741804418, + "learning_rate": 1.563671600439215e-06, + "loss": 0.6738, + "step": 11067 + }, + { + "epoch": 1.65, + "grad_norm": 3.861333825873844, + "learning_rate": 1.5635917997724061e-06, + "loss": 0.6823, + "step": 11068 + }, + { + "epoch": 1.65, + "grad_norm": 2.536272174530692, + "learning_rate": 1.5635119938455455e-06, + "loss": 0.6706, + "step": 11069 + }, + { + "epoch": 1.65, + "grad_norm": 1.203602998783429, + "learning_rate": 1.563432182659377e-06, + "loss": 0.7018, + "step": 11070 + }, + { + "epoch": 1.65, + "grad_norm": 4.077625181998982, + "learning_rate": 1.5633523662146466e-06, + "loss": 0.6523, + "step": 11071 + }, + { + "epoch": 1.65, + "grad_norm": 1.131325894859596, + "learning_rate": 1.5632725445120984e-06, + "loss": 0.6823, + "step": 11072 + }, + { + "epoch": 1.65, + "grad_norm": 5.45626780470299, + "learning_rate": 1.563192717552478e-06, + "loss": 0.7083, + "step": 11073 + }, + { + "epoch": 1.65, + "grad_norm": 2.1524919513253162, + "learning_rate": 1.5631128853365302e-06, + "loss": 0.6491, + "step": 11074 + }, + { + "epoch": 1.65, + "grad_norm": 1.8078167857482328, + "learning_rate": 1.5630330478649996e-06, + "loss": 0.6654, + "step": 11075 + }, + { + "epoch": 1.65, + "grad_norm": 2.8282487770443367, + "learning_rate": 1.562953205138632e-06, + "loss": 0.679, + "step": 11076 + }, + { + "epoch": 1.65, + "grad_norm": 1.023022217354098, + "learning_rate": 1.5628733571581725e-06, + "loss": 0.6667, + "step": 11077 + }, + { + "epoch": 1.65, + "grad_norm": 2.2012741142241046, + "learning_rate": 1.5627935039243661e-06, + "loss": 0.6465, + "step": 11078 + }, + { + "epoch": 1.65, + "grad_norm": 3.030772911437229, + "learning_rate": 1.562713645437958e-06, + "loss": 0.6426, + "step": 11079 + }, + { + "epoch": 1.65, + "grad_norm": 3.6750047031520667, + "learning_rate": 1.562633781699694e-06, + "loss": 0.6751, + "step": 11080 + }, + { + "epoch": 1.65, + "grad_norm": 5.41012661519084, + "learning_rate": 1.5625539127103188e-06, + "loss": 0.7064, + "step": 11081 + }, + { + "epoch": 1.65, + "grad_norm": 1.3003711784676077, + "learning_rate": 1.5624740384705786e-06, + "loss": 0.6589, + "step": 11082 + }, + { + "epoch": 1.65, + "grad_norm": 2.932268940614518, + "learning_rate": 1.5623941589812183e-06, + "loss": 0.666, + "step": 11083 + }, + { + "epoch": 1.65, + "grad_norm": 2.65945318973393, + "learning_rate": 1.5623142742429834e-06, + "loss": 0.6536, + "step": 11084 + }, + { + "epoch": 1.65, + "grad_norm": 3.3785038805379024, + "learning_rate": 1.5622343842566197e-06, + "loss": 0.6895, + "step": 11085 + }, + { + "epoch": 1.65, + "grad_norm": 1.9878834860819865, + "learning_rate": 1.5621544890228729e-06, + "loss": 0.709, + "step": 11086 + }, + { + "epoch": 1.65, + "grad_norm": 1.004694455330313, + "learning_rate": 1.5620745885424884e-06, + "loss": 0.681, + "step": 11087 + }, + { + "epoch": 1.65, + "grad_norm": 1.732089184788964, + "learning_rate": 1.561994682816212e-06, + "loss": 0.6699, + "step": 11088 + }, + { + "epoch": 1.65, + "grad_norm": 1.8114382488991914, + "learning_rate": 1.5619147718447895e-06, + "loss": 0.6816, + "step": 11089 + }, + { + "epoch": 1.65, + "grad_norm": 1.0372131310317352, + "learning_rate": 1.5618348556289672e-06, + "loss": 0.7109, + "step": 11090 + }, + { + "epoch": 1.65, + "grad_norm": 1.6027335244257528, + "learning_rate": 1.56175493416949e-06, + "loss": 0.6745, + "step": 11091 + }, + { + "epoch": 1.65, + "grad_norm": 6.275199910327199, + "learning_rate": 1.5616750074671049e-06, + "loss": 0.6771, + "step": 11092 + }, + { + "epoch": 1.65, + "grad_norm": 1.9261426125492687, + "learning_rate": 1.5615950755225567e-06, + "loss": 0.681, + "step": 11093 + }, + { + "epoch": 1.65, + "grad_norm": 0.8316102335710438, + "learning_rate": 1.561515138336592e-06, + "loss": 0.6686, + "step": 11094 + }, + { + "epoch": 1.65, + "grad_norm": 1.9435841722912126, + "learning_rate": 1.561435195909957e-06, + "loss": 0.6582, + "step": 11095 + }, + { + "epoch": 1.65, + "grad_norm": 0.9223064363168515, + "learning_rate": 1.561355248243398e-06, + "loss": 0.6719, + "step": 11096 + }, + { + "epoch": 1.66, + "grad_norm": 3.122132726388421, + "learning_rate": 1.5612752953376604e-06, + "loss": 0.6849, + "step": 11097 + }, + { + "epoch": 1.66, + "grad_norm": 5.001133518578907, + "learning_rate": 1.561195337193491e-06, + "loss": 0.6504, + "step": 11098 + }, + { + "epoch": 1.66, + "grad_norm": 1.4183217726393211, + "learning_rate": 1.5611153738116358e-06, + "loss": 0.6764, + "step": 11099 + }, + { + "epoch": 1.66, + "grad_norm": 2.7425669600752007, + "learning_rate": 1.5610354051928414e-06, + "loss": 0.6602, + "step": 11100 + }, + { + "epoch": 1.66, + "grad_norm": 2.3608260319266887, + "learning_rate": 1.5609554313378537e-06, + "loss": 0.6686, + "step": 11101 + }, + { + "epoch": 1.66, + "grad_norm": 0.8470761079870077, + "learning_rate": 1.5608754522474194e-06, + "loss": 0.6517, + "step": 11102 + }, + { + "epoch": 1.66, + "grad_norm": 2.5083174480901356, + "learning_rate": 1.560795467922285e-06, + "loss": 0.6777, + "step": 11103 + }, + { + "epoch": 1.66, + "grad_norm": 4.71007261260778, + "learning_rate": 1.560715478363197e-06, + "loss": 0.6849, + "step": 11104 + }, + { + "epoch": 1.66, + "grad_norm": 3.751906016717194, + "learning_rate": 1.5606354835709017e-06, + "loss": 0.6621, + "step": 11105 + }, + { + "epoch": 1.66, + "grad_norm": 0.8791067113128842, + "learning_rate": 1.5605554835461456e-06, + "loss": 0.6719, + "step": 11106 + }, + { + "epoch": 1.66, + "grad_norm": 3.9857240836415744, + "learning_rate": 1.560475478289676e-06, + "loss": 0.6842, + "step": 11107 + }, + { + "epoch": 1.66, + "grad_norm": 5.597611651840696, + "learning_rate": 1.5603954678022388e-06, + "loss": 0.7194, + "step": 11108 + }, + { + "epoch": 1.66, + "grad_norm": 6.063099189850777, + "learning_rate": 1.5603154520845818e-06, + "loss": 0.6999, + "step": 11109 + }, + { + "epoch": 1.66, + "grad_norm": 2.7325004768469654, + "learning_rate": 1.5602354311374506e-06, + "loss": 0.6901, + "step": 11110 + }, + { + "epoch": 1.66, + "grad_norm": 0.911838305781167, + "learning_rate": 1.5601554049615926e-06, + "loss": 0.6758, + "step": 11111 + }, + { + "epoch": 1.66, + "grad_norm": 1.9741657679860791, + "learning_rate": 1.5600753735577547e-06, + "loss": 0.666, + "step": 11112 + }, + { + "epoch": 1.66, + "grad_norm": 1.0125759982496674, + "learning_rate": 1.559995336926684e-06, + "loss": 0.6927, + "step": 11113 + }, + { + "epoch": 1.66, + "grad_norm": 0.8795404938851586, + "learning_rate": 1.5599152950691272e-06, + "loss": 0.6732, + "step": 11114 + }, + { + "epoch": 1.66, + "grad_norm": 1.2697395108479121, + "learning_rate": 1.5598352479858315e-06, + "loss": 0.6855, + "step": 11115 + }, + { + "epoch": 1.66, + "grad_norm": 1.8334039664760695, + "learning_rate": 1.5597551956775439e-06, + "loss": 0.6732, + "step": 11116 + }, + { + "epoch": 1.66, + "grad_norm": 1.6602132150451303, + "learning_rate": 1.5596751381450113e-06, + "loss": 0.6771, + "step": 11117 + }, + { + "epoch": 1.66, + "grad_norm": 1.5852376725644413, + "learning_rate": 1.5595950753889813e-06, + "loss": 0.6445, + "step": 11118 + }, + { + "epoch": 1.66, + "grad_norm": 2.321383805495227, + "learning_rate": 1.559515007410201e-06, + "loss": 0.6797, + "step": 11119 + }, + { + "epoch": 1.66, + "grad_norm": 4.5302405894117355, + "learning_rate": 1.5594349342094178e-06, + "loss": 0.7057, + "step": 11120 + }, + { + "epoch": 1.66, + "grad_norm": 3.4270004202918782, + "learning_rate": 1.5593548557873786e-06, + "loss": 0.6719, + "step": 11121 + }, + { + "epoch": 1.66, + "grad_norm": 2.068045237997582, + "learning_rate": 1.5592747721448314e-06, + "loss": 0.6569, + "step": 11122 + }, + { + "epoch": 1.66, + "grad_norm": 2.063520420142707, + "learning_rate": 1.5591946832825229e-06, + "loss": 0.6908, + "step": 11123 + }, + { + "epoch": 1.66, + "grad_norm": 1.438264524722734, + "learning_rate": 1.5591145892012011e-06, + "loss": 0.6589, + "step": 11124 + }, + { + "epoch": 1.66, + "grad_norm": 1.618236437279899, + "learning_rate": 1.5590344899016137e-06, + "loss": 0.6654, + "step": 11125 + }, + { + "epoch": 1.66, + "grad_norm": 2.779222525978523, + "learning_rate": 1.5589543853845077e-06, + "loss": 0.6751, + "step": 11126 + }, + { + "epoch": 1.66, + "grad_norm": 1.7173139365475074, + "learning_rate": 1.558874275650631e-06, + "loss": 0.6842, + "step": 11127 + }, + { + "epoch": 1.66, + "grad_norm": 1.2776254548913242, + "learning_rate": 1.558794160700731e-06, + "loss": 0.696, + "step": 11128 + }, + { + "epoch": 1.66, + "grad_norm": 2.905946180288359, + "learning_rate": 1.5587140405355564e-06, + "loss": 0.6947, + "step": 11129 + }, + { + "epoch": 1.66, + "grad_norm": 3.353843384014234, + "learning_rate": 1.5586339151558537e-06, + "loss": 0.6986, + "step": 11130 + }, + { + "epoch": 1.66, + "grad_norm": 1.3058526087639761, + "learning_rate": 1.5585537845623713e-06, + "loss": 0.6764, + "step": 11131 + }, + { + "epoch": 1.66, + "grad_norm": 1.9818970223889079, + "learning_rate": 1.5584736487558572e-06, + "loss": 0.681, + "step": 11132 + }, + { + "epoch": 1.66, + "grad_norm": 0.8782354708294189, + "learning_rate": 1.558393507737059e-06, + "loss": 0.6719, + "step": 11133 + }, + { + "epoch": 1.66, + "grad_norm": 2.3926594740751774, + "learning_rate": 1.5583133615067248e-06, + "loss": 0.6699, + "step": 11134 + }, + { + "epoch": 1.66, + "grad_norm": 2.9831800708714202, + "learning_rate": 1.5582332100656029e-06, + "loss": 0.6836, + "step": 11135 + }, + { + "epoch": 1.66, + "grad_norm": 3.242828900256094, + "learning_rate": 1.5581530534144408e-06, + "loss": 0.6699, + "step": 11136 + }, + { + "epoch": 1.66, + "grad_norm": 1.0960550586108269, + "learning_rate": 1.5580728915539866e-06, + "loss": 0.6738, + "step": 11137 + }, + { + "epoch": 1.66, + "grad_norm": 0.9043122303698935, + "learning_rate": 1.5579927244849893e-06, + "loss": 0.6803, + "step": 11138 + }, + { + "epoch": 1.66, + "grad_norm": 3.9757104192032737, + "learning_rate": 1.5579125522081964e-06, + "loss": 0.6706, + "step": 11139 + }, + { + "epoch": 1.66, + "grad_norm": 0.7337610447817694, + "learning_rate": 1.5578323747243562e-06, + "loss": 0.679, + "step": 11140 + }, + { + "epoch": 1.66, + "grad_norm": 4.599755047666309, + "learning_rate": 1.5577521920342171e-06, + "loss": 0.6868, + "step": 11141 + }, + { + "epoch": 1.66, + "grad_norm": 6.631720114669484, + "learning_rate": 1.5576720041385278e-06, + "loss": 0.6882, + "step": 11142 + }, + { + "epoch": 1.66, + "grad_norm": 5.658903085481322, + "learning_rate": 1.5575918110380362e-06, + "loss": 0.7103, + "step": 11143 + }, + { + "epoch": 1.66, + "grad_norm": 1.5919183143270976, + "learning_rate": 1.5575116127334908e-06, + "loss": 0.6745, + "step": 11144 + }, + { + "epoch": 1.66, + "grad_norm": 2.511230867470934, + "learning_rate": 1.5574314092256403e-06, + "loss": 0.7031, + "step": 11145 + }, + { + "epoch": 1.66, + "grad_norm": 3.855088150747014, + "learning_rate": 1.5573512005152331e-06, + "loss": 0.651, + "step": 11146 + }, + { + "epoch": 1.66, + "grad_norm": 5.226299711028827, + "learning_rate": 1.5572709866030179e-06, + "loss": 0.6947, + "step": 11147 + }, + { + "epoch": 1.66, + "grad_norm": 1.3924223996856537, + "learning_rate": 1.5571907674897434e-06, + "loss": 0.6673, + "step": 11148 + }, + { + "epoch": 1.66, + "grad_norm": 1.0814797437997457, + "learning_rate": 1.557110543176158e-06, + "loss": 0.6719, + "step": 11149 + }, + { + "epoch": 1.66, + "grad_norm": 0.7429360919593815, + "learning_rate": 1.5570303136630108e-06, + "loss": 0.681, + "step": 11150 + }, + { + "epoch": 1.66, + "grad_norm": 4.2572656932113135, + "learning_rate": 1.5569500789510501e-06, + "loss": 0.6973, + "step": 11151 + }, + { + "epoch": 1.66, + "grad_norm": 1.41948355950848, + "learning_rate": 1.5568698390410252e-06, + "loss": 0.6862, + "step": 11152 + }, + { + "epoch": 1.66, + "grad_norm": 2.178215831361386, + "learning_rate": 1.556789593933685e-06, + "loss": 0.6751, + "step": 11153 + }, + { + "epoch": 1.66, + "grad_norm": 1.7355337428854247, + "learning_rate": 1.5567093436297783e-06, + "loss": 0.6608, + "step": 11154 + }, + { + "epoch": 1.66, + "grad_norm": 1.667750129575242, + "learning_rate": 1.556629088130054e-06, + "loss": 0.651, + "step": 11155 + }, + { + "epoch": 1.66, + "grad_norm": 1.4081301076731987, + "learning_rate": 1.556548827435261e-06, + "loss": 0.6712, + "step": 11156 + }, + { + "epoch": 1.66, + "grad_norm": 2.454432380138162, + "learning_rate": 1.5564685615461489e-06, + "loss": 0.6934, + "step": 11157 + }, + { + "epoch": 1.66, + "grad_norm": 0.8797490240989851, + "learning_rate": 1.5563882904634665e-06, + "loss": 0.6771, + "step": 11158 + }, + { + "epoch": 1.66, + "grad_norm": 1.1305676976726942, + "learning_rate": 1.5563080141879628e-06, + "loss": 0.6628, + "step": 11159 + }, + { + "epoch": 1.66, + "grad_norm": 1.7389754162693332, + "learning_rate": 1.556227732720387e-06, + "loss": 0.6758, + "step": 11160 + }, + { + "epoch": 1.66, + "grad_norm": 0.7448466918886252, + "learning_rate": 1.5561474460614886e-06, + "loss": 0.6875, + "step": 11161 + }, + { + "epoch": 1.66, + "grad_norm": 0.9028245922372903, + "learning_rate": 1.5560671542120173e-06, + "loss": 0.6647, + "step": 11162 + }, + { + "epoch": 1.66, + "grad_norm": 1.1372294506350304, + "learning_rate": 1.555986857172722e-06, + "loss": 0.6823, + "step": 11163 + }, + { + "epoch": 1.67, + "grad_norm": 2.949842386813308, + "learning_rate": 1.555906554944352e-06, + "loss": 0.6927, + "step": 11164 + }, + { + "epoch": 1.67, + "grad_norm": 5.756256500394645, + "learning_rate": 1.555826247527657e-06, + "loss": 0.6953, + "step": 11165 + }, + { + "epoch": 1.67, + "grad_norm": 3.5817706477439955, + "learning_rate": 1.5557459349233866e-06, + "loss": 0.6745, + "step": 11166 + }, + { + "epoch": 1.67, + "grad_norm": 0.7544729902383263, + "learning_rate": 1.5556656171322901e-06, + "loss": 0.6738, + "step": 11167 + }, + { + "epoch": 1.67, + "grad_norm": 2.106231058219863, + "learning_rate": 1.5555852941551174e-06, + "loss": 0.6966, + "step": 11168 + }, + { + "epoch": 1.67, + "grad_norm": 2.389920412067286, + "learning_rate": 1.5555049659926179e-06, + "loss": 0.6465, + "step": 11169 + }, + { + "epoch": 1.67, + "grad_norm": 1.4299782045115614, + "learning_rate": 1.5554246326455416e-06, + "loss": 0.6686, + "step": 11170 + }, + { + "epoch": 1.67, + "grad_norm": 1.4409682833028752, + "learning_rate": 1.5553442941146378e-06, + "loss": 0.6777, + "step": 11171 + }, + { + "epoch": 1.67, + "grad_norm": 2.6825286652887876, + "learning_rate": 1.5552639504006568e-06, + "loss": 0.7031, + "step": 11172 + }, + { + "epoch": 1.67, + "grad_norm": 0.777780014755392, + "learning_rate": 1.555183601504348e-06, + "loss": 0.6921, + "step": 11173 + }, + { + "epoch": 1.67, + "grad_norm": 3.1906987549718724, + "learning_rate": 1.5551032474264618e-06, + "loss": 0.7233, + "step": 11174 + }, + { + "epoch": 1.67, + "grad_norm": 2.2706835592684076, + "learning_rate": 1.5550228881677478e-06, + "loss": 0.6647, + "step": 11175 + }, + { + "epoch": 1.67, + "grad_norm": 1.3454484119802808, + "learning_rate": 1.5549425237289563e-06, + "loss": 0.6719, + "step": 11176 + }, + { + "epoch": 1.67, + "grad_norm": 1.362782424238008, + "learning_rate": 1.5548621541108369e-06, + "loss": 0.7025, + "step": 11177 + }, + { + "epoch": 1.67, + "grad_norm": 2.00587245419975, + "learning_rate": 1.55478177931414e-06, + "loss": 0.6602, + "step": 11178 + }, + { + "epoch": 1.67, + "grad_norm": 3.723942776585052, + "learning_rate": 1.5547013993396155e-06, + "loss": 0.6816, + "step": 11179 + }, + { + "epoch": 1.67, + "grad_norm": 3.73219934688831, + "learning_rate": 1.5546210141880143e-06, + "loss": 0.7012, + "step": 11180 + }, + { + "epoch": 1.67, + "grad_norm": 3.2882157101118974, + "learning_rate": 1.554540623860086e-06, + "loss": 0.6745, + "step": 11181 + }, + { + "epoch": 1.67, + "grad_norm": 0.9918882762294894, + "learning_rate": 1.5544602283565806e-06, + "loss": 0.6966, + "step": 11182 + }, + { + "epoch": 1.67, + "grad_norm": 1.3017436079918494, + "learning_rate": 1.5543798276782492e-06, + "loss": 0.6647, + "step": 11183 + }, + { + "epoch": 1.67, + "grad_norm": 0.7594423413902487, + "learning_rate": 1.5542994218258417e-06, + "loss": 0.6868, + "step": 11184 + }, + { + "epoch": 1.67, + "grad_norm": 1.6346779758366456, + "learning_rate": 1.5542190108001088e-06, + "loss": 0.6895, + "step": 11185 + }, + { + "epoch": 1.67, + "grad_norm": 2.1158435064111574, + "learning_rate": 1.5541385946018007e-06, + "loss": 0.668, + "step": 11186 + }, + { + "epoch": 1.67, + "grad_norm": 0.6937310033872162, + "learning_rate": 1.554058173231668e-06, + "loss": 0.6458, + "step": 11187 + }, + { + "epoch": 1.67, + "grad_norm": 0.8162248047886786, + "learning_rate": 1.5539777466904615e-06, + "loss": 0.6615, + "step": 11188 + }, + { + "epoch": 1.67, + "grad_norm": 4.703276509718176, + "learning_rate": 1.553897314978932e-06, + "loss": 0.681, + "step": 11189 + }, + { + "epoch": 1.67, + "grad_norm": 4.14217425879038, + "learning_rate": 1.5538168780978294e-06, + "loss": 0.6654, + "step": 11190 + }, + { + "epoch": 1.67, + "grad_norm": 1.0104738564360718, + "learning_rate": 1.5537364360479053e-06, + "loss": 0.666, + "step": 11191 + }, + { + "epoch": 1.67, + "grad_norm": 1.3280552532481134, + "learning_rate": 1.5536559888299098e-06, + "loss": 0.6829, + "step": 11192 + }, + { + "epoch": 1.67, + "grad_norm": 0.746705666943857, + "learning_rate": 1.5535755364445939e-06, + "loss": 0.6882, + "step": 11193 + }, + { + "epoch": 1.67, + "grad_norm": 2.783906904236911, + "learning_rate": 1.5534950788927087e-06, + "loss": 0.6589, + "step": 11194 + }, + { + "epoch": 1.67, + "grad_norm": 0.8240558385780643, + "learning_rate": 1.5534146161750052e-06, + "loss": 0.6602, + "step": 11195 + }, + { + "epoch": 1.67, + "grad_norm": 1.37758349891066, + "learning_rate": 1.553334148292234e-06, + "loss": 0.6764, + "step": 11196 + }, + { + "epoch": 1.67, + "grad_norm": 2.1454286089043038, + "learning_rate": 1.5532536752451462e-06, + "loss": 0.6816, + "step": 11197 + }, + { + "epoch": 1.67, + "grad_norm": 2.498592492754459, + "learning_rate": 1.553173197034493e-06, + "loss": 0.6771, + "step": 11198 + }, + { + "epoch": 1.67, + "grad_norm": 1.7825582758505125, + "learning_rate": 1.5530927136610253e-06, + "loss": 0.6576, + "step": 11199 + }, + { + "epoch": 1.67, + "grad_norm": 3.701342383112746, + "learning_rate": 1.5530122251254945e-06, + "loss": 0.6595, + "step": 11200 + }, + { + "epoch": 1.67, + "grad_norm": 1.3304031332093138, + "learning_rate": 1.5529317314286518e-06, + "loss": 0.6549, + "step": 11201 + }, + { + "epoch": 1.67, + "grad_norm": 1.3615885620381958, + "learning_rate": 1.5528512325712482e-06, + "loss": 0.6582, + "step": 11202 + }, + { + "epoch": 1.67, + "grad_norm": 2.5262101662590513, + "learning_rate": 1.552770728554035e-06, + "loss": 0.6589, + "step": 11203 + }, + { + "epoch": 1.67, + "grad_norm": 1.662180176417295, + "learning_rate": 1.552690219377764e-06, + "loss": 0.6699, + "step": 11204 + }, + { + "epoch": 1.67, + "grad_norm": 1.0303152162735767, + "learning_rate": 1.5526097050431863e-06, + "loss": 0.6745, + "step": 11205 + }, + { + "epoch": 1.67, + "grad_norm": 4.9759255556346815, + "learning_rate": 1.552529185551053e-06, + "loss": 0.666, + "step": 11206 + }, + { + "epoch": 1.67, + "grad_norm": 0.9499883212624459, + "learning_rate": 1.5524486609021162e-06, + "loss": 0.6875, + "step": 11207 + }, + { + "epoch": 1.67, + "grad_norm": 1.3732557426224352, + "learning_rate": 1.5523681310971272e-06, + "loss": 0.6517, + "step": 11208 + }, + { + "epoch": 1.67, + "grad_norm": 3.448777726036567, + "learning_rate": 1.5522875961368378e-06, + "loss": 0.6615, + "step": 11209 + }, + { + "epoch": 1.67, + "grad_norm": 4.635503730021206, + "learning_rate": 1.552207056021999e-06, + "loss": 0.6484, + "step": 11210 + }, + { + "epoch": 1.67, + "grad_norm": 1.5501810754757017, + "learning_rate": 1.5521265107533631e-06, + "loss": 0.7005, + "step": 11211 + }, + { + "epoch": 1.67, + "grad_norm": 1.6524542531999384, + "learning_rate": 1.5520459603316818e-06, + "loss": 0.6654, + "step": 11212 + }, + { + "epoch": 1.67, + "grad_norm": 2.020323785288818, + "learning_rate": 1.5519654047577063e-06, + "loss": 0.6836, + "step": 11213 + }, + { + "epoch": 1.67, + "grad_norm": 4.642929297029692, + "learning_rate": 1.5518848440321892e-06, + "loss": 0.707, + "step": 11214 + }, + { + "epoch": 1.67, + "grad_norm": 1.9316985462653227, + "learning_rate": 1.551804278155882e-06, + "loss": 0.6589, + "step": 11215 + }, + { + "epoch": 1.67, + "grad_norm": 0.9713724280667181, + "learning_rate": 1.5517237071295366e-06, + "loss": 0.6829, + "step": 11216 + }, + { + "epoch": 1.67, + "grad_norm": 1.212491612108119, + "learning_rate": 1.5516431309539053e-06, + "loss": 0.6803, + "step": 11217 + }, + { + "epoch": 1.67, + "grad_norm": 2.03688897214889, + "learning_rate": 1.5515625496297394e-06, + "loss": 0.6387, + "step": 11218 + }, + { + "epoch": 1.67, + "grad_norm": 5.937279063129707, + "learning_rate": 1.551481963157792e-06, + "loss": 0.6803, + "step": 11219 + }, + { + "epoch": 1.67, + "grad_norm": 1.4043822217593456, + "learning_rate": 1.5514013715388143e-06, + "loss": 0.6849, + "step": 11220 + }, + { + "epoch": 1.67, + "grad_norm": 0.8440437585067654, + "learning_rate": 1.551320774773559e-06, + "loss": 0.6562, + "step": 11221 + }, + { + "epoch": 1.67, + "grad_norm": 1.5157679037488527, + "learning_rate": 1.5512401728627781e-06, + "loss": 0.6654, + "step": 11222 + }, + { + "epoch": 1.67, + "grad_norm": 2.6129741990851256, + "learning_rate": 1.551159565807224e-06, + "loss": 0.6953, + "step": 11223 + }, + { + "epoch": 1.67, + "grad_norm": 2.788784880054805, + "learning_rate": 1.5510789536076488e-06, + "loss": 0.6908, + "step": 11224 + }, + { + "epoch": 1.67, + "grad_norm": 1.8622546749617819, + "learning_rate": 1.5509983362648051e-06, + "loss": 0.6634, + "step": 11225 + }, + { + "epoch": 1.67, + "grad_norm": 2.9265252550635537, + "learning_rate": 1.5509177137794448e-06, + "loss": 0.6595, + "step": 11226 + }, + { + "epoch": 1.67, + "grad_norm": 1.3029899052927767, + "learning_rate": 1.5508370861523212e-06, + "loss": 0.6562, + "step": 11227 + }, + { + "epoch": 1.67, + "grad_norm": 1.8110218676719474, + "learning_rate": 1.5507564533841863e-06, + "loss": 0.6452, + "step": 11228 + }, + { + "epoch": 1.67, + "grad_norm": 1.1633586318780214, + "learning_rate": 1.5506758154757927e-06, + "loss": 0.6895, + "step": 11229 + }, + { + "epoch": 1.67, + "grad_norm": 1.626131361225816, + "learning_rate": 1.5505951724278928e-06, + "loss": 0.6895, + "step": 11230 + }, + { + "epoch": 1.68, + "grad_norm": 2.2987824628603146, + "learning_rate": 1.5505145242412398e-06, + "loss": 0.6712, + "step": 11231 + }, + { + "epoch": 1.68, + "grad_norm": 2.8406325353072934, + "learning_rate": 1.5504338709165858e-06, + "loss": 0.6549, + "step": 11232 + }, + { + "epoch": 1.68, + "grad_norm": 1.1239900192931511, + "learning_rate": 1.5503532124546838e-06, + "loss": 0.6927, + "step": 11233 + }, + { + "epoch": 1.68, + "grad_norm": 4.464375720220149, + "learning_rate": 1.550272548856287e-06, + "loss": 0.6797, + "step": 11234 + }, + { + "epoch": 1.68, + "grad_norm": 5.891191271639314, + "learning_rate": 1.5501918801221474e-06, + "loss": 0.6986, + "step": 11235 + }, + { + "epoch": 1.68, + "grad_norm": 0.8546637103594538, + "learning_rate": 1.5501112062530185e-06, + "loss": 0.6849, + "step": 11236 + }, + { + "epoch": 1.68, + "grad_norm": 0.8654736632383196, + "learning_rate": 1.5500305272496529e-06, + "loss": 0.6751, + "step": 11237 + }, + { + "epoch": 1.68, + "grad_norm": 3.7104699710423326, + "learning_rate": 1.5499498431128037e-06, + "loss": 0.6634, + "step": 11238 + }, + { + "epoch": 1.68, + "grad_norm": 0.8786368048872732, + "learning_rate": 1.5498691538432243e-06, + "loss": 0.7018, + "step": 11239 + }, + { + "epoch": 1.68, + "grad_norm": 1.112706948718815, + "learning_rate": 1.5497884594416673e-06, + "loss": 0.6458, + "step": 11240 + }, + { + "epoch": 1.68, + "grad_norm": 1.0653847210376266, + "learning_rate": 1.549707759908886e-06, + "loss": 0.6602, + "step": 11241 + }, + { + "epoch": 1.68, + "grad_norm": 2.137347084059323, + "learning_rate": 1.5496270552456336e-06, + "loss": 0.6536, + "step": 11242 + }, + { + "epoch": 1.68, + "grad_norm": 1.4124486886880492, + "learning_rate": 1.5495463454526631e-06, + "loss": 0.6693, + "step": 11243 + }, + { + "epoch": 1.68, + "grad_norm": 1.4009008931408218, + "learning_rate": 1.5494656305307284e-06, + "loss": 0.6862, + "step": 11244 + }, + { + "epoch": 1.68, + "grad_norm": 3.835543916638712, + "learning_rate": 1.5493849104805818e-06, + "loss": 0.681, + "step": 11245 + }, + { + "epoch": 1.68, + "grad_norm": 0.8796240089325402, + "learning_rate": 1.5493041853029775e-06, + "loss": 0.6699, + "step": 11246 + }, + { + "epoch": 1.68, + "grad_norm": 1.0021590954882085, + "learning_rate": 1.5492234549986689e-06, + "loss": 0.6335, + "step": 11247 + }, + { + "epoch": 1.68, + "grad_norm": 2.0454330948967248, + "learning_rate": 1.549142719568409e-06, + "loss": 0.6842, + "step": 11248 + }, + { + "epoch": 1.68, + "grad_norm": 3.194481220383077, + "learning_rate": 1.5490619790129515e-06, + "loss": 0.6628, + "step": 11249 + }, + { + "epoch": 1.68, + "grad_norm": 5.316169570737417, + "learning_rate": 1.54898123333305e-06, + "loss": 0.6784, + "step": 11250 + }, + { + "epoch": 1.68, + "grad_norm": 2.653225764014253, + "learning_rate": 1.5489004825294582e-06, + "loss": 0.6908, + "step": 11251 + }, + { + "epoch": 1.68, + "grad_norm": 1.4713555062178114, + "learning_rate": 1.5488197266029298e-06, + "loss": 0.6862, + "step": 11252 + }, + { + "epoch": 1.68, + "grad_norm": 0.9123517240286612, + "learning_rate": 1.548738965554218e-06, + "loss": 0.6628, + "step": 11253 + }, + { + "epoch": 1.68, + "grad_norm": 3.090250049411171, + "learning_rate": 1.5486581993840771e-06, + "loss": 0.6719, + "step": 11254 + }, + { + "epoch": 1.68, + "grad_norm": 3.030720473826189, + "learning_rate": 1.5485774280932606e-06, + "loss": 0.7018, + "step": 11255 + }, + { + "epoch": 1.68, + "grad_norm": 1.9374253206031111, + "learning_rate": 1.5484966516825223e-06, + "loss": 0.6654, + "step": 11256 + }, + { + "epoch": 1.68, + "grad_norm": 1.8237772383640707, + "learning_rate": 1.5484158701526167e-06, + "loss": 0.653, + "step": 11257 + }, + { + "epoch": 1.68, + "grad_norm": 1.064423066856631, + "learning_rate": 1.548335083504297e-06, + "loss": 0.6686, + "step": 11258 + }, + { + "epoch": 1.68, + "grad_norm": 5.419814444524726, + "learning_rate": 1.5482542917383175e-06, + "loss": 0.6816, + "step": 11259 + }, + { + "epoch": 1.68, + "grad_norm": 2.2705233544584664, + "learning_rate": 1.548173494855432e-06, + "loss": 0.6712, + "step": 11260 + }, + { + "epoch": 1.68, + "grad_norm": 2.812481355520745, + "learning_rate": 1.5480926928563954e-06, + "loss": 0.6582, + "step": 11261 + }, + { + "epoch": 1.68, + "grad_norm": 3.383451544551159, + "learning_rate": 1.5480118857419607e-06, + "loss": 0.6634, + "step": 11262 + }, + { + "epoch": 1.68, + "grad_norm": 2.660456058432206, + "learning_rate": 1.5479310735128825e-06, + "loss": 0.6934, + "step": 11263 + }, + { + "epoch": 1.68, + "grad_norm": 1.1764403639587622, + "learning_rate": 1.5478502561699157e-06, + "loss": 0.6686, + "step": 11264 + }, + { + "epoch": 1.68, + "grad_norm": 4.134016113491482, + "learning_rate": 1.5477694337138135e-06, + "loss": 0.6667, + "step": 11265 + }, + { + "epoch": 1.68, + "grad_norm": 2.6609323690856286, + "learning_rate": 1.547688606145331e-06, + "loss": 0.6882, + "step": 11266 + }, + { + "epoch": 1.68, + "grad_norm": 1.4684936803424862, + "learning_rate": 1.5476077734652222e-06, + "loss": 0.6667, + "step": 11267 + }, + { + "epoch": 1.68, + "grad_norm": 4.405350591833407, + "learning_rate": 1.5475269356742416e-06, + "loss": 0.6803, + "step": 11268 + }, + { + "epoch": 1.68, + "grad_norm": 1.5469746117383931, + "learning_rate": 1.5474460927731436e-06, + "loss": 0.6842, + "step": 11269 + }, + { + "epoch": 1.68, + "grad_norm": 1.0408798723061385, + "learning_rate": 1.547365244762683e-06, + "loss": 0.6445, + "step": 11270 + }, + { + "epoch": 1.68, + "grad_norm": 1.715647558182814, + "learning_rate": 1.5472843916436142e-06, + "loss": 0.6543, + "step": 11271 + }, + { + "epoch": 1.68, + "grad_norm": 2.0091174632463797, + "learning_rate": 1.5472035334166917e-06, + "loss": 0.6771, + "step": 11272 + }, + { + "epoch": 1.68, + "grad_norm": 1.155576590543522, + "learning_rate": 1.54712267008267e-06, + "loss": 0.6243, + "step": 11273 + }, + { + "epoch": 1.68, + "grad_norm": 1.4293189664559534, + "learning_rate": 1.5470418016423045e-06, + "loss": 0.6615, + "step": 11274 + }, + { + "epoch": 1.68, + "grad_norm": 4.186449272996043, + "learning_rate": 1.5469609280963495e-06, + "loss": 0.6621, + "step": 11275 + }, + { + "epoch": 1.68, + "grad_norm": 2.837489970902029, + "learning_rate": 1.5468800494455591e-06, + "loss": 0.6432, + "step": 11276 + }, + { + "epoch": 1.68, + "grad_norm": 1.9032470181912642, + "learning_rate": 1.5467991656906895e-06, + "loss": 0.6491, + "step": 11277 + }, + { + "epoch": 1.68, + "grad_norm": 1.0418803347339793, + "learning_rate": 1.5467182768324946e-06, + "loss": 0.6914, + "step": 11278 + }, + { + "epoch": 1.68, + "grad_norm": 1.6548792851534657, + "learning_rate": 1.54663738287173e-06, + "loss": 0.6882, + "step": 11279 + }, + { + "epoch": 1.68, + "grad_norm": 1.0452618571509031, + "learning_rate": 1.5465564838091498e-06, + "loss": 0.6719, + "step": 11280 + }, + { + "epoch": 1.68, + "grad_norm": 2.951539210697477, + "learning_rate": 1.54647557964551e-06, + "loss": 0.6621, + "step": 11281 + }, + { + "epoch": 1.68, + "grad_norm": 2.704095187061839, + "learning_rate": 1.5463946703815651e-06, + "loss": 0.6471, + "step": 11282 + }, + { + "epoch": 1.68, + "grad_norm": 3.0746511354889523, + "learning_rate": 1.5463137560180709e-06, + "loss": 0.7201, + "step": 11283 + }, + { + "epoch": 1.68, + "grad_norm": 3.5797007847621907, + "learning_rate": 1.5462328365557815e-06, + "loss": 0.6953, + "step": 11284 + }, + { + "epoch": 1.68, + "grad_norm": 2.2300206725101535, + "learning_rate": 1.5461519119954529e-06, + "loss": 0.653, + "step": 11285 + }, + { + "epoch": 1.68, + "grad_norm": 2.771209644424251, + "learning_rate": 1.5460709823378402e-06, + "loss": 0.6641, + "step": 11286 + }, + { + "epoch": 1.68, + "grad_norm": 2.579381017045527, + "learning_rate": 1.545990047583699e-06, + "loss": 0.6556, + "step": 11287 + }, + { + "epoch": 1.68, + "grad_norm": 2.9760420434121126, + "learning_rate": 1.545909107733784e-06, + "loss": 0.6725, + "step": 11288 + }, + { + "epoch": 1.68, + "grad_norm": 1.3684206125210907, + "learning_rate": 1.5458281627888515e-06, + "loss": 0.6686, + "step": 11289 + }, + { + "epoch": 1.68, + "grad_norm": 0.9310820835639162, + "learning_rate": 1.545747212749656e-06, + "loss": 0.6374, + "step": 11290 + }, + { + "epoch": 1.68, + "grad_norm": 1.352642451559995, + "learning_rate": 1.5456662576169538e-06, + "loss": 0.6556, + "step": 11291 + }, + { + "epoch": 1.68, + "grad_norm": 2.08857080724838, + "learning_rate": 1.5455852973915001e-06, + "loss": 0.7103, + "step": 11292 + }, + { + "epoch": 1.68, + "grad_norm": 4.582657216868768, + "learning_rate": 1.5455043320740505e-06, + "loss": 0.6706, + "step": 11293 + }, + { + "epoch": 1.68, + "grad_norm": 8.034694436120494, + "learning_rate": 1.5454233616653607e-06, + "loss": 0.6803, + "step": 11294 + }, + { + "epoch": 1.68, + "grad_norm": 1.2327009067842445, + "learning_rate": 1.5453423861661867e-06, + "loss": 0.6777, + "step": 11295 + }, + { + "epoch": 1.68, + "grad_norm": 1.705485474650443, + "learning_rate": 1.5452614055772838e-06, + "loss": 0.6777, + "step": 11296 + }, + { + "epoch": 1.68, + "grad_norm": 1.0794755691117914, + "learning_rate": 1.5451804198994078e-06, + "loss": 0.6947, + "step": 11297 + }, + { + "epoch": 1.69, + "grad_norm": 0.8915000800003741, + "learning_rate": 1.5450994291333151e-06, + "loss": 0.6667, + "step": 11298 + }, + { + "epoch": 1.69, + "grad_norm": 0.983015185724108, + "learning_rate": 1.5450184332797607e-06, + "loss": 0.6777, + "step": 11299 + }, + { + "epoch": 1.69, + "grad_norm": 1.4982249777581058, + "learning_rate": 1.5449374323395015e-06, + "loss": 0.653, + "step": 11300 + }, + { + "epoch": 1.69, + "grad_norm": 3.963645389433203, + "learning_rate": 1.5448564263132929e-06, + "loss": 0.6882, + "step": 11301 + }, + { + "epoch": 1.69, + "grad_norm": 0.8765478923855543, + "learning_rate": 1.544775415201891e-06, + "loss": 0.679, + "step": 11302 + }, + { + "epoch": 1.69, + "grad_norm": 4.471109141613477, + "learning_rate": 1.5446943990060522e-06, + "loss": 0.6667, + "step": 11303 + }, + { + "epoch": 1.69, + "grad_norm": 0.8044445212580242, + "learning_rate": 1.5446133777265323e-06, + "loss": 0.666, + "step": 11304 + }, + { + "epoch": 1.69, + "grad_norm": 1.5037424880235595, + "learning_rate": 1.5445323513640875e-06, + "loss": 0.6654, + "step": 11305 + }, + { + "epoch": 1.69, + "grad_norm": 2.7794801218487497, + "learning_rate": 1.5444513199194743e-06, + "loss": 0.6654, + "step": 11306 + }, + { + "epoch": 1.69, + "grad_norm": 3.997109070315956, + "learning_rate": 1.5443702833934486e-06, + "loss": 0.694, + "step": 11307 + }, + { + "epoch": 1.69, + "grad_norm": 2.1509118118828296, + "learning_rate": 1.544289241786767e-06, + "loss": 0.6673, + "step": 11308 + }, + { + "epoch": 1.69, + "grad_norm": 2.459004880230745, + "learning_rate": 1.5442081951001855e-06, + "loss": 0.6465, + "step": 11309 + }, + { + "epoch": 1.69, + "grad_norm": 4.327210729484986, + "learning_rate": 1.5441271433344612e-06, + "loss": 0.6576, + "step": 11310 + }, + { + "epoch": 1.69, + "grad_norm": 0.9584109291822471, + "learning_rate": 1.5440460864903502e-06, + "loss": 0.6686, + "step": 11311 + }, + { + "epoch": 1.69, + "grad_norm": 4.9037343653377965, + "learning_rate": 1.5439650245686085e-06, + "loss": 0.679, + "step": 11312 + }, + { + "epoch": 1.69, + "grad_norm": 0.7785853177309929, + "learning_rate": 1.5438839575699933e-06, + "loss": 0.666, + "step": 11313 + }, + { + "epoch": 1.69, + "grad_norm": 1.9987285827825527, + "learning_rate": 1.5438028854952614e-06, + "loss": 0.681, + "step": 11314 + }, + { + "epoch": 1.69, + "grad_norm": 0.8963111520126065, + "learning_rate": 1.5437218083451687e-06, + "loss": 0.7083, + "step": 11315 + }, + { + "epoch": 1.69, + "grad_norm": 2.407704659950976, + "learning_rate": 1.543640726120472e-06, + "loss": 0.6706, + "step": 11316 + }, + { + "epoch": 1.69, + "grad_norm": 1.7155554033062366, + "learning_rate": 1.5435596388219288e-06, + "loss": 0.6699, + "step": 11317 + }, + { + "epoch": 1.69, + "grad_norm": 8.501391891220823, + "learning_rate": 1.5434785464502952e-06, + "loss": 0.7012, + "step": 11318 + }, + { + "epoch": 1.69, + "grad_norm": 2.7899643588268295, + "learning_rate": 1.5433974490063287e-06, + "loss": 0.6602, + "step": 11319 + }, + { + "epoch": 1.69, + "grad_norm": 0.9800574858201209, + "learning_rate": 1.5433163464907852e-06, + "loss": 0.6367, + "step": 11320 + }, + { + "epoch": 1.69, + "grad_norm": 3.365003059808672, + "learning_rate": 1.5432352389044226e-06, + "loss": 0.6328, + "step": 11321 + }, + { + "epoch": 1.69, + "grad_norm": 0.7933114555221286, + "learning_rate": 1.543154126247997e-06, + "loss": 0.6504, + "step": 11322 + }, + { + "epoch": 1.69, + "grad_norm": 0.8509903964906169, + "learning_rate": 1.5430730085222661e-06, + "loss": 0.651, + "step": 11323 + }, + { + "epoch": 1.69, + "grad_norm": 4.423108666081131, + "learning_rate": 1.5429918857279871e-06, + "loss": 0.6829, + "step": 11324 + }, + { + "epoch": 1.69, + "grad_norm": 2.991520536736925, + "learning_rate": 1.5429107578659163e-06, + "loss": 0.6699, + "step": 11325 + }, + { + "epoch": 1.69, + "grad_norm": 2.27064474980368, + "learning_rate": 1.5428296249368118e-06, + "loss": 0.6777, + "step": 11326 + }, + { + "epoch": 1.69, + "grad_norm": 5.692779716752442, + "learning_rate": 1.5427484869414303e-06, + "loss": 0.6322, + "step": 11327 + }, + { + "epoch": 1.69, + "grad_norm": 0.9598256756506396, + "learning_rate": 1.5426673438805292e-06, + "loss": 0.6621, + "step": 11328 + }, + { + "epoch": 1.69, + "grad_norm": 2.341847383437105, + "learning_rate": 1.5425861957548657e-06, + "loss": 0.6738, + "step": 11329 + }, + { + "epoch": 1.69, + "grad_norm": 3.053845612643833, + "learning_rate": 1.5425050425651973e-06, + "loss": 0.6914, + "step": 11330 + }, + { + "epoch": 1.69, + "grad_norm": 3.6859556606721897, + "learning_rate": 1.5424238843122814e-06, + "loss": 0.696, + "step": 11331 + }, + { + "epoch": 1.69, + "grad_norm": 6.062907077288447, + "learning_rate": 1.5423427209968752e-06, + "loss": 0.6647, + "step": 11332 + }, + { + "epoch": 1.69, + "grad_norm": 4.0310133684033636, + "learning_rate": 1.5422615526197366e-06, + "loss": 0.6947, + "step": 11333 + }, + { + "epoch": 1.69, + "grad_norm": 2.8450975845588578, + "learning_rate": 1.542180379181623e-06, + "loss": 0.7233, + "step": 11334 + }, + { + "epoch": 1.69, + "grad_norm": 1.3725480455418841, + "learning_rate": 1.5420992006832918e-06, + "loss": 0.6953, + "step": 11335 + }, + { + "epoch": 1.69, + "grad_norm": 1.0808836374354696, + "learning_rate": 1.5420180171255011e-06, + "loss": 0.6491, + "step": 11336 + }, + { + "epoch": 1.69, + "grad_norm": 1.2749635370840353, + "learning_rate": 1.5419368285090081e-06, + "loss": 0.6868, + "step": 11337 + }, + { + "epoch": 1.69, + "grad_norm": 1.627223133676176, + "learning_rate": 1.5418556348345706e-06, + "loss": 0.6354, + "step": 11338 + }, + { + "epoch": 1.69, + "grad_norm": 3.0373001990058945, + "learning_rate": 1.5417744361029467e-06, + "loss": 0.6999, + "step": 11339 + }, + { + "epoch": 1.69, + "grad_norm": 1.4505359282290686, + "learning_rate": 1.5416932323148942e-06, + "loss": 0.6751, + "step": 11340 + }, + { + "epoch": 1.69, + "grad_norm": 1.9851505041718704, + "learning_rate": 1.5416120234711708e-06, + "loss": 0.6803, + "step": 11341 + }, + { + "epoch": 1.69, + "grad_norm": 1.6432473005771593, + "learning_rate": 1.5415308095725346e-06, + "loss": 0.6641, + "step": 11342 + }, + { + "epoch": 1.69, + "grad_norm": 6.012054917768955, + "learning_rate": 1.5414495906197434e-06, + "loss": 0.6634, + "step": 11343 + }, + { + "epoch": 1.69, + "grad_norm": 4.100511809320942, + "learning_rate": 1.5413683666135553e-06, + "loss": 0.7266, + "step": 11344 + }, + { + "epoch": 1.69, + "grad_norm": 1.9018241901249615, + "learning_rate": 1.541287137554728e-06, + "loss": 0.6569, + "step": 11345 + }, + { + "epoch": 1.69, + "grad_norm": 2.743705175797526, + "learning_rate": 1.5412059034440203e-06, + "loss": 0.6927, + "step": 11346 + }, + { + "epoch": 1.69, + "grad_norm": 1.6704043730612945, + "learning_rate": 1.54112466428219e-06, + "loss": 0.6595, + "step": 11347 + }, + { + "epoch": 1.69, + "grad_norm": 2.167721030653473, + "learning_rate": 1.5410434200699953e-06, + "loss": 0.681, + "step": 11348 + }, + { + "epoch": 1.69, + "grad_norm": 1.4847142391894248, + "learning_rate": 1.5409621708081945e-06, + "loss": 0.6732, + "step": 11349 + }, + { + "epoch": 1.69, + "grad_norm": 1.275888891366519, + "learning_rate": 1.5408809164975463e-06, + "loss": 0.7031, + "step": 11350 + }, + { + "epoch": 1.69, + "grad_norm": 2.2470156651361965, + "learning_rate": 1.5407996571388082e-06, + "loss": 0.6947, + "step": 11351 + }, + { + "epoch": 1.69, + "grad_norm": 3.027309122590078, + "learning_rate": 1.5407183927327391e-06, + "loss": 0.6849, + "step": 11352 + }, + { + "epoch": 1.69, + "grad_norm": 1.0230874292019843, + "learning_rate": 1.5406371232800977e-06, + "loss": 0.6895, + "step": 11353 + }, + { + "epoch": 1.69, + "grad_norm": 4.053723298801417, + "learning_rate": 1.540555848781642e-06, + "loss": 0.6549, + "step": 11354 + }, + { + "epoch": 1.69, + "grad_norm": 1.0390140678852415, + "learning_rate": 1.540474569238131e-06, + "loss": 0.6523, + "step": 11355 + }, + { + "epoch": 1.69, + "grad_norm": 2.458888974358896, + "learning_rate": 1.5403932846503228e-06, + "loss": 0.6458, + "step": 11356 + }, + { + "epoch": 1.69, + "grad_norm": 2.9803766979722295, + "learning_rate": 1.5403119950189764e-06, + "loss": 0.6667, + "step": 11357 + }, + { + "epoch": 1.69, + "grad_norm": 2.1218547660589415, + "learning_rate": 1.5402307003448503e-06, + "loss": 0.6882, + "step": 11358 + }, + { + "epoch": 1.69, + "grad_norm": 1.9706750253407144, + "learning_rate": 1.5401494006287031e-06, + "loss": 0.651, + "step": 11359 + }, + { + "epoch": 1.69, + "grad_norm": 0.8447784819645549, + "learning_rate": 1.5400680958712942e-06, + "loss": 0.7083, + "step": 11360 + }, + { + "epoch": 1.69, + "grad_norm": 2.1199126640918595, + "learning_rate": 1.5399867860733816e-06, + "loss": 0.7018, + "step": 11361 + }, + { + "epoch": 1.69, + "grad_norm": 1.608948963219059, + "learning_rate": 1.539905471235725e-06, + "loss": 0.6615, + "step": 11362 + }, + { + "epoch": 1.69, + "grad_norm": 0.8491034409152949, + "learning_rate": 1.5398241513590826e-06, + "loss": 0.6497, + "step": 11363 + }, + { + "epoch": 1.69, + "grad_norm": 0.7816283651743097, + "learning_rate": 1.5397428264442137e-06, + "loss": 0.6576, + "step": 11364 + }, + { + "epoch": 1.7, + "grad_norm": 1.0451897200132, + "learning_rate": 1.5396614964918773e-06, + "loss": 0.6797, + "step": 11365 + }, + { + "epoch": 1.7, + "grad_norm": 4.777576196186719, + "learning_rate": 1.5395801615028324e-06, + "loss": 0.6979, + "step": 11366 + }, + { + "epoch": 1.7, + "grad_norm": 1.0677974404762325, + "learning_rate": 1.539498821477838e-06, + "loss": 0.681, + "step": 11367 + }, + { + "epoch": 1.7, + "grad_norm": 0.9618170680778946, + "learning_rate": 1.5394174764176534e-06, + "loss": 0.6504, + "step": 11368 + }, + { + "epoch": 1.7, + "grad_norm": 2.045711365334792, + "learning_rate": 1.5393361263230382e-06, + "loss": 0.6797, + "step": 11369 + }, + { + "epoch": 1.7, + "grad_norm": 0.9837123093844516, + "learning_rate": 1.5392547711947513e-06, + "loss": 0.6784, + "step": 11370 + }, + { + "epoch": 1.7, + "grad_norm": 3.3680770992880302, + "learning_rate": 1.5391734110335512e-06, + "loss": 0.6673, + "step": 11371 + }, + { + "epoch": 1.7, + "grad_norm": 2.1151117104447144, + "learning_rate": 1.5390920458401983e-06, + "loss": 0.6849, + "step": 11372 + }, + { + "epoch": 1.7, + "grad_norm": 1.8590358070470037, + "learning_rate": 1.5390106756154519e-06, + "loss": 0.6621, + "step": 11373 + }, + { + "epoch": 1.7, + "grad_norm": 7.816570726211608, + "learning_rate": 1.538929300360071e-06, + "loss": 0.681, + "step": 11374 + }, + { + "epoch": 1.7, + "grad_norm": 1.0850259394114408, + "learning_rate": 1.5388479200748154e-06, + "loss": 0.6758, + "step": 11375 + }, + { + "epoch": 1.7, + "grad_norm": 4.859913015470543, + "learning_rate": 1.5387665347604443e-06, + "loss": 0.694, + "step": 11376 + }, + { + "epoch": 1.7, + "grad_norm": 1.1495377174470636, + "learning_rate": 1.5386851444177176e-06, + "loss": 0.6628, + "step": 11377 + }, + { + "epoch": 1.7, + "grad_norm": 2.475884730737466, + "learning_rate": 1.5386037490473948e-06, + "loss": 0.6895, + "step": 11378 + }, + { + "epoch": 1.7, + "grad_norm": 2.375312653559208, + "learning_rate": 1.5385223486502354e-06, + "loss": 0.6829, + "step": 11379 + }, + { + "epoch": 1.7, + "grad_norm": 0.7994531102219278, + "learning_rate": 1.5384409432269999e-06, + "loss": 0.6842, + "step": 11380 + }, + { + "epoch": 1.7, + "grad_norm": 1.101460006577832, + "learning_rate": 1.538359532778447e-06, + "loss": 0.6829, + "step": 11381 + }, + { + "epoch": 1.7, + "grad_norm": 5.714637833671125, + "learning_rate": 1.5382781173053368e-06, + "loss": 0.6908, + "step": 11382 + }, + { + "epoch": 1.7, + "grad_norm": 1.8691100593281973, + "learning_rate": 1.5381966968084297e-06, + "loss": 0.6523, + "step": 11383 + }, + { + "epoch": 1.7, + "grad_norm": 1.6583209246782091, + "learning_rate": 1.5381152712884848e-06, + "loss": 0.6921, + "step": 11384 + }, + { + "epoch": 1.7, + "grad_norm": 3.3845344866483744, + "learning_rate": 1.538033840746263e-06, + "loss": 0.6693, + "step": 11385 + }, + { + "epoch": 1.7, + "grad_norm": 3.2188796210627664, + "learning_rate": 1.5379524051825237e-06, + "loss": 0.681, + "step": 11386 + }, + { + "epoch": 1.7, + "grad_norm": 4.6247618764041345, + "learning_rate": 1.5378709645980265e-06, + "loss": 0.6836, + "step": 11387 + }, + { + "epoch": 1.7, + "grad_norm": 2.4276062640750147, + "learning_rate": 1.5377895189935326e-06, + "loss": 0.6855, + "step": 11388 + }, + { + "epoch": 1.7, + "grad_norm": 3.9485673693746617, + "learning_rate": 1.5377080683698012e-06, + "loss": 0.6556, + "step": 11389 + }, + { + "epoch": 1.7, + "grad_norm": 2.4578961681074065, + "learning_rate": 1.537626612727593e-06, + "loss": 0.6784, + "step": 11390 + }, + { + "epoch": 1.7, + "grad_norm": 1.1018381711886223, + "learning_rate": 1.5375451520676684e-06, + "loss": 0.6751, + "step": 11391 + }, + { + "epoch": 1.7, + "grad_norm": 2.046423694722265, + "learning_rate": 1.5374636863907866e-06, + "loss": 0.6712, + "step": 11392 + }, + { + "epoch": 1.7, + "grad_norm": 2.3650593066452625, + "learning_rate": 1.5373822156977091e-06, + "loss": 0.6562, + "step": 11393 + }, + { + "epoch": 1.7, + "grad_norm": 1.584154407424906, + "learning_rate": 1.5373007399891961e-06, + "loss": 0.6758, + "step": 11394 + }, + { + "epoch": 1.7, + "grad_norm": 1.6148091661615163, + "learning_rate": 1.5372192592660075e-06, + "loss": 0.6478, + "step": 11395 + }, + { + "epoch": 1.7, + "grad_norm": 2.1684253503152457, + "learning_rate": 1.5371377735289043e-06, + "loss": 0.679, + "step": 11396 + }, + { + "epoch": 1.7, + "grad_norm": 6.398820365292819, + "learning_rate": 1.5370562827786464e-06, + "loss": 0.6895, + "step": 11397 + }, + { + "epoch": 1.7, + "grad_norm": 1.629063397751374, + "learning_rate": 1.5369747870159947e-06, + "loss": 0.6777, + "step": 11398 + }, + { + "epoch": 1.7, + "grad_norm": 2.1961799893775464, + "learning_rate": 1.5368932862417098e-06, + "loss": 0.6725, + "step": 11399 + }, + { + "epoch": 1.7, + "grad_norm": 1.3610088717152777, + "learning_rate": 1.5368117804565529e-06, + "loss": 0.64, + "step": 11400 + }, + { + "epoch": 1.7, + "grad_norm": 0.845246557267634, + "learning_rate": 1.5367302696612838e-06, + "loss": 0.7005, + "step": 11401 + }, + { + "epoch": 1.7, + "grad_norm": 2.3114434862107434, + "learning_rate": 1.5366487538566637e-06, + "loss": 0.6699, + "step": 11402 + }, + { + "epoch": 1.7, + "grad_norm": 4.282611504453395, + "learning_rate": 1.536567233043453e-06, + "loss": 0.6784, + "step": 11403 + }, + { + "epoch": 1.7, + "grad_norm": 0.9623115339863134, + "learning_rate": 1.5364857072224137e-06, + "loss": 0.6445, + "step": 11404 + }, + { + "epoch": 1.7, + "grad_norm": 1.7684628828455622, + "learning_rate": 1.536404176394305e-06, + "loss": 0.6504, + "step": 11405 + }, + { + "epoch": 1.7, + "grad_norm": 1.1958553625007986, + "learning_rate": 1.536322640559889e-06, + "loss": 0.6973, + "step": 11406 + }, + { + "epoch": 1.7, + "grad_norm": 2.0117156692592495, + "learning_rate": 1.5362410997199265e-06, + "loss": 0.6784, + "step": 11407 + }, + { + "epoch": 1.7, + "grad_norm": 2.027850448747667, + "learning_rate": 1.536159553875178e-06, + "loss": 0.6497, + "step": 11408 + }, + { + "epoch": 1.7, + "grad_norm": 3.1466018135579863, + "learning_rate": 1.5360780030264055e-06, + "loss": 0.7096, + "step": 11409 + }, + { + "epoch": 1.7, + "grad_norm": 1.3859707061338744, + "learning_rate": 1.5359964471743695e-06, + "loss": 0.6712, + "step": 11410 + }, + { + "epoch": 1.7, + "grad_norm": 0.9900256676547536, + "learning_rate": 1.5359148863198307e-06, + "loss": 0.6758, + "step": 11411 + }, + { + "epoch": 1.7, + "grad_norm": 0.964517994342586, + "learning_rate": 1.5358333204635514e-06, + "loss": 0.6667, + "step": 11412 + }, + { + "epoch": 1.7, + "grad_norm": 4.104979377334326, + "learning_rate": 1.5357517496062922e-06, + "loss": 0.6764, + "step": 11413 + }, + { + "epoch": 1.7, + "grad_norm": 1.9854846979464653, + "learning_rate": 1.5356701737488145e-06, + "loss": 0.6641, + "step": 11414 + }, + { + "epoch": 1.7, + "grad_norm": 1.8268068699629156, + "learning_rate": 1.5355885928918796e-06, + "loss": 0.651, + "step": 11415 + }, + { + "epoch": 1.7, + "grad_norm": 0.9398670033601974, + "learning_rate": 1.5355070070362493e-06, + "loss": 0.6803, + "step": 11416 + }, + { + "epoch": 1.7, + "grad_norm": 1.1154322753364858, + "learning_rate": 1.5354254161826845e-06, + "loss": 0.6706, + "step": 11417 + }, + { + "epoch": 1.7, + "grad_norm": 2.387410120027395, + "learning_rate": 1.5353438203319468e-06, + "loss": 0.6738, + "step": 11418 + }, + { + "epoch": 1.7, + "grad_norm": 1.0118329552413687, + "learning_rate": 1.5352622194847982e-06, + "loss": 0.7116, + "step": 11419 + }, + { + "epoch": 1.7, + "grad_norm": 1.0187445241798279, + "learning_rate": 1.5351806136419997e-06, + "loss": 0.6647, + "step": 11420 + }, + { + "epoch": 1.7, + "grad_norm": 3.081193874472834, + "learning_rate": 1.5350990028043134e-06, + "loss": 0.6868, + "step": 11421 + }, + { + "epoch": 1.7, + "grad_norm": 1.873703436376526, + "learning_rate": 1.5350173869725009e-06, + "loss": 0.6641, + "step": 11422 + }, + { + "epoch": 1.7, + "grad_norm": 1.5410507606135453, + "learning_rate": 1.5349357661473236e-06, + "loss": 0.6641, + "step": 11423 + }, + { + "epoch": 1.7, + "grad_norm": 0.9048672490073183, + "learning_rate": 1.5348541403295437e-06, + "loss": 0.653, + "step": 11424 + }, + { + "epoch": 1.7, + "grad_norm": 1.547601303010298, + "learning_rate": 1.5347725095199224e-06, + "loss": 0.6654, + "step": 11425 + }, + { + "epoch": 1.7, + "grad_norm": 3.147600974256876, + "learning_rate": 1.5346908737192224e-06, + "loss": 0.6543, + "step": 11426 + }, + { + "epoch": 1.7, + "grad_norm": 0.9398433574644719, + "learning_rate": 1.5346092329282047e-06, + "loss": 0.6699, + "step": 11427 + }, + { + "epoch": 1.7, + "grad_norm": 3.318675853424503, + "learning_rate": 1.5345275871476324e-06, + "loss": 0.7057, + "step": 11428 + }, + { + "epoch": 1.7, + "grad_norm": 1.1124358153428304, + "learning_rate": 1.5344459363782666e-06, + "loss": 0.7044, + "step": 11429 + }, + { + "epoch": 1.7, + "grad_norm": 0.9602810504173785, + "learning_rate": 1.5343642806208691e-06, + "loss": 0.6654, + "step": 11430 + }, + { + "epoch": 1.7, + "grad_norm": 1.471980470201261, + "learning_rate": 1.5342826198762033e-06, + "loss": 0.6667, + "step": 11431 + }, + { + "epoch": 1.7, + "grad_norm": 5.579599053598115, + "learning_rate": 1.5342009541450298e-06, + "loss": 0.7116, + "step": 11432 + }, + { + "epoch": 1.71, + "grad_norm": 1.3407061896450891, + "learning_rate": 1.5341192834281125e-06, + "loss": 0.6517, + "step": 11433 + }, + { + "epoch": 1.71, + "grad_norm": 1.1369377868853032, + "learning_rate": 1.534037607726212e-06, + "loss": 0.6745, + "step": 11434 + }, + { + "epoch": 1.71, + "grad_norm": 1.1504474686840287, + "learning_rate": 1.533955927040091e-06, + "loss": 0.6504, + "step": 11435 + }, + { + "epoch": 1.71, + "grad_norm": 0.8387564375102324, + "learning_rate": 1.533874241370513e-06, + "loss": 0.6523, + "step": 11436 + }, + { + "epoch": 1.71, + "grad_norm": 3.6561067959161653, + "learning_rate": 1.533792550718239e-06, + "loss": 0.6882, + "step": 11437 + }, + { + "epoch": 1.71, + "grad_norm": 1.6609975220956306, + "learning_rate": 1.533710855084032e-06, + "loss": 0.6719, + "step": 11438 + }, + { + "epoch": 1.71, + "grad_norm": 1.3840268381794052, + "learning_rate": 1.5336291544686541e-06, + "loss": 0.6549, + "step": 11439 + }, + { + "epoch": 1.71, + "grad_norm": 1.311509529553836, + "learning_rate": 1.5335474488728685e-06, + "loss": 0.6953, + "step": 11440 + }, + { + "epoch": 1.71, + "grad_norm": 1.680169732680658, + "learning_rate": 1.5334657382974371e-06, + "loss": 0.6667, + "step": 11441 + }, + { + "epoch": 1.71, + "grad_norm": 1.064664933553885, + "learning_rate": 1.5333840227431228e-06, + "loss": 0.6836, + "step": 11442 + }, + { + "epoch": 1.71, + "grad_norm": 2.185613249524422, + "learning_rate": 1.5333023022106883e-06, + "loss": 0.6882, + "step": 11443 + }, + { + "epoch": 1.71, + "grad_norm": 2.331456772144496, + "learning_rate": 1.5332205767008966e-06, + "loss": 0.6484, + "step": 11444 + }, + { + "epoch": 1.71, + "grad_norm": 3.66111508651182, + "learning_rate": 1.5331388462145097e-06, + "loss": 0.6934, + "step": 11445 + }, + { + "epoch": 1.71, + "grad_norm": 3.474631327227673, + "learning_rate": 1.533057110752291e-06, + "loss": 0.6823, + "step": 11446 + }, + { + "epoch": 1.71, + "grad_norm": 0.863875252857407, + "learning_rate": 1.532975370315003e-06, + "loss": 0.6686, + "step": 11447 + }, + { + "epoch": 1.71, + "grad_norm": 2.3024271047808993, + "learning_rate": 1.5328936249034086e-06, + "loss": 0.6615, + "step": 11448 + }, + { + "epoch": 1.71, + "grad_norm": 1.7123612633406033, + "learning_rate": 1.5328118745182713e-06, + "loss": 0.6901, + "step": 11449 + }, + { + "epoch": 1.71, + "grad_norm": 3.0424551542609777, + "learning_rate": 1.5327301191603533e-06, + "loss": 0.7038, + "step": 11450 + }, + { + "epoch": 1.71, + "grad_norm": 1.047243753424626, + "learning_rate": 1.5326483588304182e-06, + "loss": 0.6543, + "step": 11451 + }, + { + "epoch": 1.71, + "grad_norm": 1.975807550474671, + "learning_rate": 1.532566593529229e-06, + "loss": 0.6686, + "step": 11452 + }, + { + "epoch": 1.71, + "grad_norm": 1.002418416130879, + "learning_rate": 1.5324848232575482e-06, + "loss": 0.6602, + "step": 11453 + }, + { + "epoch": 1.71, + "grad_norm": 4.767661190035643, + "learning_rate": 1.5324030480161398e-06, + "loss": 0.6654, + "step": 11454 + }, + { + "epoch": 1.71, + "grad_norm": 4.366206635343584, + "learning_rate": 1.5323212678057664e-06, + "loss": 0.6556, + "step": 11455 + }, + { + "epoch": 1.71, + "grad_norm": 0.9974021481801986, + "learning_rate": 1.5322394826271919e-06, + "loss": 0.6868, + "step": 11456 + }, + { + "epoch": 1.71, + "grad_norm": 0.8884514117753523, + "learning_rate": 1.532157692481179e-06, + "loss": 0.6465, + "step": 11457 + }, + { + "epoch": 1.71, + "grad_norm": 2.6687308908623497, + "learning_rate": 1.5320758973684913e-06, + "loss": 0.6738, + "step": 11458 + }, + { + "epoch": 1.71, + "grad_norm": 1.1997620189593705, + "learning_rate": 1.5319940972898921e-06, + "loss": 0.6458, + "step": 11459 + }, + { + "epoch": 1.71, + "grad_norm": 3.1964648417588277, + "learning_rate": 1.5319122922461452e-06, + "loss": 0.6582, + "step": 11460 + }, + { + "epoch": 1.71, + "grad_norm": 0.8669426614462568, + "learning_rate": 1.5318304822380136e-06, + "loss": 0.6862, + "step": 11461 + }, + { + "epoch": 1.71, + "grad_norm": 2.3256166192769836, + "learning_rate": 1.5317486672662615e-06, + "loss": 0.6803, + "step": 11462 + }, + { + "epoch": 1.71, + "grad_norm": 3.88182124372709, + "learning_rate": 1.5316668473316517e-06, + "loss": 0.7207, + "step": 11463 + }, + { + "epoch": 1.71, + "grad_norm": 1.334631295017903, + "learning_rate": 1.531585022434948e-06, + "loss": 0.7018, + "step": 11464 + }, + { + "epoch": 1.71, + "grad_norm": 1.7270526357154794, + "learning_rate": 1.5315031925769146e-06, + "loss": 0.6953, + "step": 11465 + }, + { + "epoch": 1.71, + "grad_norm": 1.3488509783022633, + "learning_rate": 1.5314213577583149e-06, + "loss": 0.6543, + "step": 11466 + }, + { + "epoch": 1.71, + "grad_norm": 1.4768040178398525, + "learning_rate": 1.5313395179799127e-06, + "loss": 0.6543, + "step": 11467 + }, + { + "epoch": 1.71, + "grad_norm": 4.776314009714664, + "learning_rate": 1.5312576732424716e-06, + "loss": 0.6667, + "step": 11468 + }, + { + "epoch": 1.71, + "grad_norm": 2.5206484193664638, + "learning_rate": 1.5311758235467559e-06, + "loss": 0.6966, + "step": 11469 + }, + { + "epoch": 1.71, + "grad_norm": 2.2287876009149676, + "learning_rate": 1.5310939688935289e-06, + "loss": 0.651, + "step": 11470 + }, + { + "epoch": 1.71, + "grad_norm": 0.8771289508932841, + "learning_rate": 1.531012109283555e-06, + "loss": 0.6536, + "step": 11471 + }, + { + "epoch": 1.71, + "grad_norm": 3.0468882364446648, + "learning_rate": 1.5309302447175983e-06, + "loss": 0.681, + "step": 11472 + }, + { + "epoch": 1.71, + "grad_norm": 1.2621108883406331, + "learning_rate": 1.5308483751964228e-06, + "loss": 0.6589, + "step": 11473 + }, + { + "epoch": 1.71, + "grad_norm": 0.8433338204304525, + "learning_rate": 1.5307665007207923e-06, + "loss": 0.6621, + "step": 11474 + }, + { + "epoch": 1.71, + "grad_norm": 3.9047871971496346, + "learning_rate": 1.530684621291471e-06, + "loss": 0.6764, + "step": 11475 + }, + { + "epoch": 1.71, + "grad_norm": 2.720679134319679, + "learning_rate": 1.5306027369092234e-06, + "loss": 0.6849, + "step": 11476 + }, + { + "epoch": 1.71, + "grad_norm": 0.9221373226732609, + "learning_rate": 1.5305208475748132e-06, + "loss": 0.6732, + "step": 11477 + }, + { + "epoch": 1.71, + "grad_norm": 0.9602985082524109, + "learning_rate": 1.5304389532890055e-06, + "loss": 0.6419, + "step": 11478 + }, + { + "epoch": 1.71, + "grad_norm": 2.4092210094630935, + "learning_rate": 1.5303570540525637e-06, + "loss": 0.666, + "step": 11479 + }, + { + "epoch": 1.71, + "grad_norm": 3.327330519181849, + "learning_rate": 1.5302751498662527e-06, + "loss": 0.696, + "step": 11480 + }, + { + "epoch": 1.71, + "grad_norm": 1.2026757597840179, + "learning_rate": 1.5301932407308372e-06, + "loss": 0.6966, + "step": 11481 + }, + { + "epoch": 1.71, + "grad_norm": 2.3098703438694783, + "learning_rate": 1.530111326647081e-06, + "loss": 0.6667, + "step": 11482 + }, + { + "epoch": 1.71, + "grad_norm": 5.58466421352664, + "learning_rate": 1.530029407615749e-06, + "loss": 0.6719, + "step": 11483 + }, + { + "epoch": 1.71, + "grad_norm": 1.8999739990618183, + "learning_rate": 1.5299474836376055e-06, + "loss": 0.6595, + "step": 11484 + }, + { + "epoch": 1.71, + "grad_norm": 2.173676847950758, + "learning_rate": 1.5298655547134151e-06, + "loss": 0.6647, + "step": 11485 + }, + { + "epoch": 1.71, + "grad_norm": 5.012409366197969, + "learning_rate": 1.5297836208439432e-06, + "loss": 0.6543, + "step": 11486 + }, + { + "epoch": 1.71, + "grad_norm": 2.4202006969302725, + "learning_rate": 1.5297016820299535e-06, + "loss": 0.668, + "step": 11487 + }, + { + "epoch": 1.71, + "grad_norm": 2.4555468700866956, + "learning_rate": 1.5296197382722111e-06, + "loss": 0.6784, + "step": 11488 + }, + { + "epoch": 1.71, + "grad_norm": 1.8455076334877527, + "learning_rate": 1.5295377895714812e-06, + "loss": 0.6634, + "step": 11489 + }, + { + "epoch": 1.71, + "grad_norm": 3.297455136615023, + "learning_rate": 1.529455835928528e-06, + "loss": 0.6914, + "step": 11490 + }, + { + "epoch": 1.71, + "grad_norm": 2.8148464926304158, + "learning_rate": 1.5293738773441168e-06, + "loss": 0.6628, + "step": 11491 + }, + { + "epoch": 1.71, + "grad_norm": 1.2359938150402179, + "learning_rate": 1.5292919138190124e-06, + "loss": 0.6706, + "step": 11492 + }, + { + "epoch": 1.71, + "grad_norm": 2.9154114390176344, + "learning_rate": 1.5292099453539796e-06, + "loss": 0.6758, + "step": 11493 + }, + { + "epoch": 1.71, + "grad_norm": 4.258893337458056, + "learning_rate": 1.5291279719497836e-06, + "loss": 0.6992, + "step": 11494 + }, + { + "epoch": 1.71, + "grad_norm": 6.559363678137722, + "learning_rate": 1.5290459936071898e-06, + "loss": 0.707, + "step": 11495 + }, + { + "epoch": 1.71, + "grad_norm": 4.643891220486227, + "learning_rate": 1.5289640103269623e-06, + "loss": 0.6562, + "step": 11496 + }, + { + "epoch": 1.71, + "grad_norm": 1.8736400985051713, + "learning_rate": 1.5288820221098675e-06, + "loss": 0.679, + "step": 11497 + }, + { + "epoch": 1.71, + "grad_norm": 3.1140829373384817, + "learning_rate": 1.5288000289566697e-06, + "loss": 0.7012, + "step": 11498 + }, + { + "epoch": 1.71, + "grad_norm": 1.2873171559170815, + "learning_rate": 1.5287180308681347e-06, + "loss": 0.6719, + "step": 11499 + }, + { + "epoch": 1.72, + "grad_norm": 1.225033352239221, + "learning_rate": 1.5286360278450272e-06, + "loss": 0.6764, + "step": 11500 + }, + { + "epoch": 1.72, + "grad_norm": 2.787640481425196, + "learning_rate": 1.528554019888113e-06, + "loss": 0.6966, + "step": 11501 + }, + { + "epoch": 1.72, + "grad_norm": 2.6840679425058362, + "learning_rate": 1.5284720069981578e-06, + "loss": 0.6641, + "step": 11502 + }, + { + "epoch": 1.72, + "grad_norm": 4.843423212809562, + "learning_rate": 1.5283899891759263e-06, + "loss": 0.6836, + "step": 11503 + }, + { + "epoch": 1.72, + "grad_norm": 0.9723516461622862, + "learning_rate": 1.5283079664221842e-06, + "loss": 0.6953, + "step": 11504 + }, + { + "epoch": 1.72, + "grad_norm": 3.1809509352633007, + "learning_rate": 1.5282259387376974e-06, + "loss": 0.6934, + "step": 11505 + }, + { + "epoch": 1.72, + "grad_norm": 4.4997349985195285, + "learning_rate": 1.528143906123231e-06, + "loss": 0.6628, + "step": 11506 + }, + { + "epoch": 1.72, + "grad_norm": 3.8413255828883, + "learning_rate": 1.528061868579551e-06, + "loss": 0.6478, + "step": 11507 + }, + { + "epoch": 1.72, + "grad_norm": 3.2035092397153395, + "learning_rate": 1.5279798261074225e-06, + "loss": 0.6803, + "step": 11508 + }, + { + "epoch": 1.72, + "grad_norm": 5.721390392327863, + "learning_rate": 1.527897778707612e-06, + "loss": 0.6829, + "step": 11509 + }, + { + "epoch": 1.72, + "grad_norm": 2.159561171716866, + "learning_rate": 1.5278157263808846e-06, + "loss": 0.6927, + "step": 11510 + }, + { + "epoch": 1.72, + "grad_norm": 0.8184733457015813, + "learning_rate": 1.5277336691280064e-06, + "loss": 0.6751, + "step": 11511 + }, + { + "epoch": 1.72, + "grad_norm": 3.264140928457679, + "learning_rate": 1.5276516069497435e-06, + "loss": 0.6647, + "step": 11512 + }, + { + "epoch": 1.72, + "grad_norm": 0.6940433394726021, + "learning_rate": 1.5275695398468611e-06, + "loss": 0.6862, + "step": 11513 + }, + { + "epoch": 1.72, + "grad_norm": 1.7766680109416464, + "learning_rate": 1.5274874678201254e-06, + "loss": 0.6725, + "step": 11514 + }, + { + "epoch": 1.72, + "grad_norm": 1.3594121081005044, + "learning_rate": 1.5274053908703033e-06, + "loss": 0.6647, + "step": 11515 + }, + { + "epoch": 1.72, + "grad_norm": 1.0532721629855162, + "learning_rate": 1.5273233089981593e-06, + "loss": 0.6602, + "step": 11516 + }, + { + "epoch": 1.72, + "grad_norm": 1.5887161415626203, + "learning_rate": 1.5272412222044606e-06, + "loss": 0.6758, + "step": 11517 + }, + { + "epoch": 1.72, + "grad_norm": 1.0288319090190476, + "learning_rate": 1.5271591304899727e-06, + "loss": 0.668, + "step": 11518 + }, + { + "epoch": 1.72, + "grad_norm": 1.8389809756854498, + "learning_rate": 1.5270770338554622e-06, + "loss": 0.6725, + "step": 11519 + }, + { + "epoch": 1.72, + "grad_norm": 1.993579845984833, + "learning_rate": 1.5269949323016951e-06, + "loss": 0.6908, + "step": 11520 + }, + { + "epoch": 1.72, + "grad_norm": 2.6534945297771086, + "learning_rate": 1.5269128258294375e-06, + "loss": 0.6673, + "step": 11521 + }, + { + "epoch": 1.72, + "grad_norm": 0.8851601505627456, + "learning_rate": 1.526830714439456e-06, + "loss": 0.6634, + "step": 11522 + }, + { + "epoch": 1.72, + "grad_norm": 1.6165733959729487, + "learning_rate": 1.5267485981325169e-06, + "loss": 0.6712, + "step": 11523 + }, + { + "epoch": 1.72, + "grad_norm": 1.2407716553316825, + "learning_rate": 1.5266664769093865e-06, + "loss": 0.6836, + "step": 11524 + }, + { + "epoch": 1.72, + "grad_norm": 1.1359782295825271, + "learning_rate": 1.5265843507708314e-06, + "loss": 0.6562, + "step": 11525 + }, + { + "epoch": 1.72, + "grad_norm": 2.823461378386351, + "learning_rate": 1.5265022197176178e-06, + "loss": 0.6647, + "step": 11526 + }, + { + "epoch": 1.72, + "grad_norm": 1.795326432441512, + "learning_rate": 1.5264200837505125e-06, + "loss": 0.6595, + "step": 11527 + }, + { + "epoch": 1.72, + "grad_norm": 3.7243942192831616, + "learning_rate": 1.5263379428702822e-06, + "loss": 0.6908, + "step": 11528 + }, + { + "epoch": 1.72, + "grad_norm": 2.7178726440038345, + "learning_rate": 1.526255797077693e-06, + "loss": 0.6654, + "step": 11529 + }, + { + "epoch": 1.72, + "grad_norm": 3.640359819172765, + "learning_rate": 1.5261736463735119e-06, + "loss": 0.6901, + "step": 11530 + }, + { + "epoch": 1.72, + "grad_norm": 2.46441937152636, + "learning_rate": 1.526091490758506e-06, + "loss": 0.6426, + "step": 11531 + }, + { + "epoch": 1.72, + "grad_norm": 0.9625234293609433, + "learning_rate": 1.5260093302334415e-06, + "loss": 0.6953, + "step": 11532 + }, + { + "epoch": 1.72, + "grad_norm": 1.1522603940980256, + "learning_rate": 1.5259271647990857e-06, + "loss": 0.6712, + "step": 11533 + }, + { + "epoch": 1.72, + "grad_norm": 4.825528634606967, + "learning_rate": 1.5258449944562046e-06, + "loss": 0.6771, + "step": 11534 + }, + { + "epoch": 1.72, + "grad_norm": 3.0755662892487736, + "learning_rate": 1.5257628192055658e-06, + "loss": 0.6556, + "step": 11535 + }, + { + "epoch": 1.72, + "grad_norm": 0.9721265607543156, + "learning_rate": 1.5256806390479364e-06, + "loss": 0.6576, + "step": 11536 + }, + { + "epoch": 1.72, + "grad_norm": 2.234082042987946, + "learning_rate": 1.5255984539840827e-06, + "loss": 0.6751, + "step": 11537 + }, + { + "epoch": 1.72, + "grad_norm": 4.405182931164534, + "learning_rate": 1.5255162640147726e-06, + "loss": 0.6823, + "step": 11538 + }, + { + "epoch": 1.72, + "grad_norm": 3.55610003390821, + "learning_rate": 1.5254340691407725e-06, + "loss": 0.6855, + "step": 11539 + }, + { + "epoch": 1.72, + "grad_norm": 4.0863282999185495, + "learning_rate": 1.5253518693628497e-06, + "loss": 0.7025, + "step": 11540 + }, + { + "epoch": 1.72, + "grad_norm": 2.225132543648698, + "learning_rate": 1.5252696646817716e-06, + "loss": 0.6569, + "step": 11541 + }, + { + "epoch": 1.72, + "grad_norm": 2.0596821418595606, + "learning_rate": 1.5251874550983053e-06, + "loss": 0.6895, + "step": 11542 + }, + { + "epoch": 1.72, + "grad_norm": 1.7173364952028267, + "learning_rate": 1.5251052406132177e-06, + "loss": 0.6758, + "step": 11543 + }, + { + "epoch": 1.72, + "grad_norm": 1.0601139788425298, + "learning_rate": 1.5250230212272767e-06, + "loss": 0.679, + "step": 11544 + }, + { + "epoch": 1.72, + "grad_norm": 2.578498645671937, + "learning_rate": 1.5249407969412499e-06, + "loss": 0.6732, + "step": 11545 + }, + { + "epoch": 1.72, + "grad_norm": 3.5884692739635544, + "learning_rate": 1.5248585677559032e-06, + "loss": 0.6875, + "step": 11546 + }, + { + "epoch": 1.72, + "grad_norm": 0.9211365974546788, + "learning_rate": 1.5247763336720057e-06, + "loss": 0.6836, + "step": 11547 + }, + { + "epoch": 1.72, + "grad_norm": 0.8705697555393813, + "learning_rate": 1.5246940946903242e-06, + "loss": 0.6725, + "step": 11548 + }, + { + "epoch": 1.72, + "grad_norm": 2.001256513189764, + "learning_rate": 1.5246118508116263e-06, + "loss": 0.6562, + "step": 11549 + }, + { + "epoch": 1.72, + "grad_norm": 3.2517210659199414, + "learning_rate": 1.5245296020366795e-06, + "loss": 0.6921, + "step": 11550 + }, + { + "epoch": 1.72, + "grad_norm": 1.1652624977340456, + "learning_rate": 1.5244473483662517e-06, + "loss": 0.6686, + "step": 11551 + }, + { + "epoch": 1.72, + "grad_norm": 2.0617956631762056, + "learning_rate": 1.5243650898011105e-06, + "loss": 0.6647, + "step": 11552 + }, + { + "epoch": 1.72, + "grad_norm": 3.5138729086952383, + "learning_rate": 1.5242828263420231e-06, + "loss": 0.6738, + "step": 11553 + }, + { + "epoch": 1.72, + "grad_norm": 0.9763973219088, + "learning_rate": 1.5242005579897582e-06, + "loss": 0.6667, + "step": 11554 + }, + { + "epoch": 1.72, + "grad_norm": 0.7950439098109021, + "learning_rate": 1.5241182847450828e-06, + "loss": 0.6751, + "step": 11555 + }, + { + "epoch": 1.72, + "grad_norm": 2.5545326066505156, + "learning_rate": 1.5240360066087652e-06, + "loss": 0.6543, + "step": 11556 + }, + { + "epoch": 1.72, + "grad_norm": 1.712131246247518, + "learning_rate": 1.5239537235815733e-06, + "loss": 0.6823, + "step": 11557 + }, + { + "epoch": 1.72, + "grad_norm": 0.8436324167312812, + "learning_rate": 1.523871435664275e-06, + "loss": 0.679, + "step": 11558 + }, + { + "epoch": 1.72, + "grad_norm": 1.547067308639291, + "learning_rate": 1.523789142857638e-06, + "loss": 0.6927, + "step": 11559 + }, + { + "epoch": 1.72, + "grad_norm": 6.282559267879227, + "learning_rate": 1.523706845162431e-06, + "loss": 0.6953, + "step": 11560 + }, + { + "epoch": 1.72, + "grad_norm": 3.8268317708739295, + "learning_rate": 1.5236245425794211e-06, + "loss": 0.6888, + "step": 11561 + }, + { + "epoch": 1.72, + "grad_norm": 1.834019569154629, + "learning_rate": 1.5235422351093775e-06, + "loss": 0.666, + "step": 11562 + }, + { + "epoch": 1.72, + "grad_norm": 1.9898444084273643, + "learning_rate": 1.5234599227530678e-06, + "loss": 0.6576, + "step": 11563 + }, + { + "epoch": 1.72, + "grad_norm": 2.029967106432987, + "learning_rate": 1.52337760551126e-06, + "loss": 0.6595, + "step": 11564 + }, + { + "epoch": 1.72, + "grad_norm": 1.648024925383263, + "learning_rate": 1.5232952833847237e-06, + "loss": 0.6751, + "step": 11565 + }, + { + "epoch": 1.72, + "grad_norm": 0.9316147656609647, + "learning_rate": 1.5232129563742256e-06, + "loss": 0.6803, + "step": 11566 + }, + { + "epoch": 1.73, + "grad_norm": 3.0215615920315675, + "learning_rate": 1.5231306244805345e-06, + "loss": 0.6641, + "step": 11567 + }, + { + "epoch": 1.73, + "grad_norm": 1.5871326680364233, + "learning_rate": 1.5230482877044195e-06, + "loss": 0.6901, + "step": 11568 + }, + { + "epoch": 1.73, + "grad_norm": 1.3178526910816928, + "learning_rate": 1.5229659460466482e-06, + "loss": 0.6608, + "step": 11569 + }, + { + "epoch": 1.73, + "grad_norm": 0.8308826039558111, + "learning_rate": 1.5228835995079896e-06, + "loss": 0.6621, + "step": 11570 + }, + { + "epoch": 1.73, + "grad_norm": 5.397895244788803, + "learning_rate": 1.522801248089212e-06, + "loss": 0.6914, + "step": 11571 + }, + { + "epoch": 1.73, + "grad_norm": 1.0882705767391025, + "learning_rate": 1.5227188917910842e-06, + "loss": 0.6823, + "step": 11572 + }, + { + "epoch": 1.73, + "grad_norm": 1.5058803095394655, + "learning_rate": 1.522636530614375e-06, + "loss": 0.6465, + "step": 11573 + }, + { + "epoch": 1.73, + "grad_norm": 3.1363946447048785, + "learning_rate": 1.5225541645598524e-06, + "loss": 0.6895, + "step": 11574 + }, + { + "epoch": 1.73, + "grad_norm": 1.1944721711300355, + "learning_rate": 1.522471793628286e-06, + "loss": 0.6953, + "step": 11575 + }, + { + "epoch": 1.73, + "grad_norm": 0.8110336339702339, + "learning_rate": 1.522389417820444e-06, + "loss": 0.6517, + "step": 11576 + }, + { + "epoch": 1.73, + "grad_norm": 1.588500399779806, + "learning_rate": 1.5223070371370953e-06, + "loss": 0.6719, + "step": 11577 + }, + { + "epoch": 1.73, + "grad_norm": 3.3921723111493836, + "learning_rate": 1.5222246515790088e-06, + "loss": 0.7142, + "step": 11578 + }, + { + "epoch": 1.73, + "grad_norm": 5.1171214123031845, + "learning_rate": 1.5221422611469533e-06, + "loss": 0.6745, + "step": 11579 + }, + { + "epoch": 1.73, + "grad_norm": 2.3396402673710743, + "learning_rate": 1.5220598658416983e-06, + "loss": 0.6771, + "step": 11580 + }, + { + "epoch": 1.73, + "grad_norm": 4.697104120544942, + "learning_rate": 1.5219774656640123e-06, + "loss": 0.696, + "step": 11581 + }, + { + "epoch": 1.73, + "grad_norm": 0.7781985169403002, + "learning_rate": 1.5218950606146644e-06, + "loss": 0.6608, + "step": 11582 + }, + { + "epoch": 1.73, + "grad_norm": 1.7566417626600026, + "learning_rate": 1.5218126506944238e-06, + "loss": 0.6914, + "step": 11583 + }, + { + "epoch": 1.73, + "grad_norm": 2.2830568251260455, + "learning_rate": 1.521730235904059e-06, + "loss": 0.6803, + "step": 11584 + }, + { + "epoch": 1.73, + "grad_norm": 1.4539331009798455, + "learning_rate": 1.5216478162443406e-06, + "loss": 0.668, + "step": 11585 + }, + { + "epoch": 1.73, + "grad_norm": 6.9534531607532, + "learning_rate": 1.5215653917160367e-06, + "loss": 0.6758, + "step": 11586 + }, + { + "epoch": 1.73, + "grad_norm": 3.8038603171777474, + "learning_rate": 1.5214829623199167e-06, + "loss": 0.6634, + "step": 11587 + }, + { + "epoch": 1.73, + "grad_norm": 3.691953235362084, + "learning_rate": 1.5214005280567501e-06, + "loss": 0.668, + "step": 11588 + }, + { + "epoch": 1.73, + "grad_norm": 0.7498415002097164, + "learning_rate": 1.5213180889273068e-06, + "loss": 0.6608, + "step": 11589 + }, + { + "epoch": 1.73, + "grad_norm": 0.71542207741776, + "learning_rate": 1.5212356449323549e-06, + "loss": 0.6732, + "step": 11590 + }, + { + "epoch": 1.73, + "grad_norm": 0.9196561802623485, + "learning_rate": 1.521153196072665e-06, + "loss": 0.6732, + "step": 11591 + }, + { + "epoch": 1.73, + "grad_norm": 0.7710991431737699, + "learning_rate": 1.5210707423490063e-06, + "loss": 0.6934, + "step": 11592 + }, + { + "epoch": 1.73, + "grad_norm": 3.962186235562875, + "learning_rate": 1.5209882837621483e-06, + "loss": 0.6816, + "step": 11593 + }, + { + "epoch": 1.73, + "grad_norm": 1.678815193929617, + "learning_rate": 1.5209058203128604e-06, + "loss": 0.6829, + "step": 11594 + }, + { + "epoch": 1.73, + "grad_norm": 1.3330679312002494, + "learning_rate": 1.5208233520019125e-06, + "loss": 0.6602, + "step": 11595 + }, + { + "epoch": 1.73, + "grad_norm": 0.8984089826381332, + "learning_rate": 1.520740878830074e-06, + "loss": 0.6719, + "step": 11596 + }, + { + "epoch": 1.73, + "grad_norm": 2.8343623229536266, + "learning_rate": 1.520658400798115e-06, + "loss": 0.6712, + "step": 11597 + }, + { + "epoch": 1.73, + "grad_norm": 2.3834998225452395, + "learning_rate": 1.5205759179068052e-06, + "loss": 0.6934, + "step": 11598 + }, + { + "epoch": 1.73, + "grad_norm": 0.8105577233361673, + "learning_rate": 1.5204934301569145e-06, + "loss": 0.6439, + "step": 11599 + }, + { + "epoch": 1.73, + "grad_norm": 2.048531568203376, + "learning_rate": 1.520410937549212e-06, + "loss": 0.6758, + "step": 11600 + }, + { + "epoch": 1.73, + "grad_norm": 0.9296005252046927, + "learning_rate": 1.5203284400844688e-06, + "loss": 0.6953, + "step": 11601 + }, + { + "epoch": 1.73, + "grad_norm": 0.7920750781823325, + "learning_rate": 1.520245937763454e-06, + "loss": 0.6562, + "step": 11602 + }, + { + "epoch": 1.73, + "grad_norm": 2.755575377984757, + "learning_rate": 1.5201634305869377e-06, + "loss": 0.6868, + "step": 11603 + }, + { + "epoch": 1.73, + "grad_norm": 2.293168813708451, + "learning_rate": 1.5200809185556901e-06, + "loss": 0.6823, + "step": 11604 + }, + { + "epoch": 1.73, + "grad_norm": 1.152477707174796, + "learning_rate": 1.5199984016704815e-06, + "loss": 0.7103, + "step": 11605 + }, + { + "epoch": 1.73, + "grad_norm": 1.812932331061227, + "learning_rate": 1.5199158799320818e-06, + "loss": 0.6829, + "step": 11606 + }, + { + "epoch": 1.73, + "grad_norm": 2.689979610829265, + "learning_rate": 1.5198333533412611e-06, + "loss": 0.6667, + "step": 11607 + }, + { + "epoch": 1.73, + "grad_norm": 1.947381732256308, + "learning_rate": 1.51975082189879e-06, + "loss": 0.6725, + "step": 11608 + }, + { + "epoch": 1.73, + "grad_norm": 0.8493489373623441, + "learning_rate": 1.519668285605438e-06, + "loss": 0.6693, + "step": 11609 + }, + { + "epoch": 1.73, + "grad_norm": 1.923026684726481, + "learning_rate": 1.5195857444619764e-06, + "loss": 0.6706, + "step": 11610 + }, + { + "epoch": 1.73, + "grad_norm": 1.9706312979209764, + "learning_rate": 1.5195031984691751e-06, + "loss": 0.6842, + "step": 11611 + }, + { + "epoch": 1.73, + "grad_norm": 4.116667782073485, + "learning_rate": 1.519420647627804e-06, + "loss": 0.6393, + "step": 11612 + }, + { + "epoch": 1.73, + "grad_norm": 1.7841469498733964, + "learning_rate": 1.5193380919386347e-06, + "loss": 0.6855, + "step": 11613 + }, + { + "epoch": 1.73, + "grad_norm": 1.4075788508962361, + "learning_rate": 1.5192555314024367e-06, + "loss": 0.6947, + "step": 11614 + }, + { + "epoch": 1.73, + "grad_norm": 1.8347634397416688, + "learning_rate": 1.519172966019981e-06, + "loss": 0.6797, + "step": 11615 + }, + { + "epoch": 1.73, + "grad_norm": 2.1362814766622633, + "learning_rate": 1.519090395792038e-06, + "loss": 0.6341, + "step": 11616 + }, + { + "epoch": 1.73, + "grad_norm": 1.596588025691175, + "learning_rate": 1.5190078207193783e-06, + "loss": 0.6862, + "step": 11617 + }, + { + "epoch": 1.73, + "grad_norm": 2.5529075220931947, + "learning_rate": 1.5189252408027732e-06, + "loss": 0.6374, + "step": 11618 + }, + { + "epoch": 1.73, + "grad_norm": 1.8573236431435471, + "learning_rate": 1.5188426560429925e-06, + "loss": 0.6549, + "step": 11619 + }, + { + "epoch": 1.73, + "grad_norm": 0.7438893202869956, + "learning_rate": 1.5187600664408078e-06, + "loss": 0.6673, + "step": 11620 + }, + { + "epoch": 1.73, + "grad_norm": 3.3847075786487735, + "learning_rate": 1.5186774719969892e-06, + "loss": 0.6784, + "step": 11621 + }, + { + "epoch": 1.73, + "grad_norm": 1.8787568012608378, + "learning_rate": 1.518594872712308e-06, + "loss": 0.6777, + "step": 11622 + }, + { + "epoch": 1.73, + "grad_norm": 1.583205494389302, + "learning_rate": 1.5185122685875349e-06, + "loss": 0.6576, + "step": 11623 + }, + { + "epoch": 1.73, + "grad_norm": 3.8319387431716527, + "learning_rate": 1.5184296596234409e-06, + "loss": 0.6458, + "step": 11624 + }, + { + "epoch": 1.73, + "grad_norm": 1.0310397087664336, + "learning_rate": 1.5183470458207972e-06, + "loss": 0.6738, + "step": 11625 + }, + { + "epoch": 1.73, + "grad_norm": 3.048240224710574, + "learning_rate": 1.5182644271803746e-06, + "loss": 0.6934, + "step": 11626 + }, + { + "epoch": 1.73, + "grad_norm": 2.3226785533415217, + "learning_rate": 1.5181818037029443e-06, + "loss": 0.6732, + "step": 11627 + }, + { + "epoch": 1.73, + "grad_norm": 1.1248389000887298, + "learning_rate": 1.5180991753892773e-06, + "loss": 0.6374, + "step": 11628 + }, + { + "epoch": 1.73, + "grad_norm": 1.9169367184289778, + "learning_rate": 1.518016542240145e-06, + "loss": 0.6582, + "step": 11629 + }, + { + "epoch": 1.73, + "grad_norm": 1.6834015868563286, + "learning_rate": 1.5179339042563184e-06, + "loss": 0.6784, + "step": 11630 + }, + { + "epoch": 1.73, + "grad_norm": 0.980353420324588, + "learning_rate": 1.5178512614385693e-06, + "loss": 0.668, + "step": 11631 + }, + { + "epoch": 1.73, + "grad_norm": 1.0836534779786946, + "learning_rate": 1.5177686137876679e-06, + "loss": 0.696, + "step": 11632 + }, + { + "epoch": 1.73, + "grad_norm": 3.1520682866616996, + "learning_rate": 1.5176859613043865e-06, + "loss": 0.7109, + "step": 11633 + }, + { + "epoch": 1.74, + "grad_norm": 2.578042247536175, + "learning_rate": 1.5176033039894964e-06, + "loss": 0.6667, + "step": 11634 + }, + { + "epoch": 1.74, + "grad_norm": 1.1648769501707026, + "learning_rate": 1.517520641843769e-06, + "loss": 0.6953, + "step": 11635 + }, + { + "epoch": 1.74, + "grad_norm": 2.0690594805974056, + "learning_rate": 1.5174379748679754e-06, + "loss": 0.6647, + "step": 11636 + }, + { + "epoch": 1.74, + "grad_norm": 2.734042493306766, + "learning_rate": 1.5173553030628874e-06, + "loss": 0.6777, + "step": 11637 + }, + { + "epoch": 1.74, + "grad_norm": 4.674909294002542, + "learning_rate": 1.5172726264292766e-06, + "loss": 0.625, + "step": 11638 + }, + { + "epoch": 1.74, + "grad_norm": 1.5597498693590062, + "learning_rate": 1.517189944967915e-06, + "loss": 0.6602, + "step": 11639 + }, + { + "epoch": 1.74, + "grad_norm": 2.534674558937592, + "learning_rate": 1.5171072586795734e-06, + "loss": 0.6875, + "step": 11640 + }, + { + "epoch": 1.74, + "grad_norm": 2.3925490427918277, + "learning_rate": 1.5170245675650245e-06, + "loss": 0.6725, + "step": 11641 + }, + { + "epoch": 1.74, + "grad_norm": 1.7830981449797252, + "learning_rate": 1.5169418716250394e-06, + "loss": 0.6875, + "step": 11642 + }, + { + "epoch": 1.74, + "grad_norm": 1.6718027732936311, + "learning_rate": 1.5168591708603898e-06, + "loss": 0.7174, + "step": 11643 + }, + { + "epoch": 1.74, + "grad_norm": 3.3655796330956846, + "learning_rate": 1.5167764652718481e-06, + "loss": 0.6426, + "step": 11644 + }, + { + "epoch": 1.74, + "grad_norm": 1.366375865099568, + "learning_rate": 1.516693754860186e-06, + "loss": 0.6654, + "step": 11645 + }, + { + "epoch": 1.74, + "grad_norm": 1.6845297689640262, + "learning_rate": 1.5166110396261752e-06, + "loss": 0.6673, + "step": 11646 + }, + { + "epoch": 1.74, + "grad_norm": 2.6131129807422697, + "learning_rate": 1.5165283195705882e-06, + "loss": 0.6497, + "step": 11647 + }, + { + "epoch": 1.74, + "grad_norm": 1.5480615405526605, + "learning_rate": 1.5164455946941964e-06, + "loss": 0.6914, + "step": 11648 + }, + { + "epoch": 1.74, + "grad_norm": 3.290725911115285, + "learning_rate": 1.5163628649977724e-06, + "loss": 0.6686, + "step": 11649 + }, + { + "epoch": 1.74, + "grad_norm": 2.7056990708665234, + "learning_rate": 1.5162801304820879e-06, + "loss": 0.6829, + "step": 11650 + }, + { + "epoch": 1.74, + "grad_norm": 2.086941574720548, + "learning_rate": 1.5161973911479154e-06, + "loss": 0.668, + "step": 11651 + }, + { + "epoch": 1.74, + "grad_norm": 2.5073850971692573, + "learning_rate": 1.516114646996027e-06, + "loss": 0.6621, + "step": 11652 + }, + { + "epoch": 1.74, + "grad_norm": 2.408000172865694, + "learning_rate": 1.516031898027195e-06, + "loss": 0.6855, + "step": 11653 + }, + { + "epoch": 1.74, + "grad_norm": 0.8671338941878278, + "learning_rate": 1.5159491442421916e-06, + "loss": 0.6549, + "step": 11654 + }, + { + "epoch": 1.74, + "grad_norm": 4.324143360298283, + "learning_rate": 1.5158663856417896e-06, + "loss": 0.694, + "step": 11655 + }, + { + "epoch": 1.74, + "grad_norm": 5.1036436938391665, + "learning_rate": 1.51578362222676e-06, + "loss": 0.6888, + "step": 11656 + }, + { + "epoch": 1.74, + "grad_norm": 1.0384618015227467, + "learning_rate": 1.515700853997877e-06, + "loss": 0.6751, + "step": 11657 + }, + { + "epoch": 1.74, + "grad_norm": 3.6521469710078587, + "learning_rate": 1.5156180809559126e-06, + "loss": 0.6908, + "step": 11658 + }, + { + "epoch": 1.74, + "grad_norm": 0.9122986460173937, + "learning_rate": 1.5155353031016382e-06, + "loss": 0.6745, + "step": 11659 + }, + { + "epoch": 1.74, + "grad_norm": 0.8215996448005556, + "learning_rate": 1.515452520435828e-06, + "loss": 0.6934, + "step": 11660 + }, + { + "epoch": 1.74, + "grad_norm": 3.8730755115475497, + "learning_rate": 1.5153697329592534e-06, + "loss": 0.6641, + "step": 11661 + }, + { + "epoch": 1.74, + "grad_norm": 1.7957477053966993, + "learning_rate": 1.5152869406726875e-06, + "loss": 0.6973, + "step": 11662 + }, + { + "epoch": 1.74, + "grad_norm": 2.705606701711637, + "learning_rate": 1.5152041435769033e-06, + "loss": 0.6777, + "step": 11663 + }, + { + "epoch": 1.74, + "grad_norm": 1.3599023865150701, + "learning_rate": 1.515121341672673e-06, + "loss": 0.6797, + "step": 11664 + }, + { + "epoch": 1.74, + "grad_norm": 3.1937504044767993, + "learning_rate": 1.51503853496077e-06, + "loss": 0.6536, + "step": 11665 + }, + { + "epoch": 1.74, + "grad_norm": 1.2271377330301583, + "learning_rate": 1.5149557234419662e-06, + "loss": 0.6934, + "step": 11666 + }, + { + "epoch": 1.74, + "grad_norm": 1.5345637101273377, + "learning_rate": 1.5148729071170356e-06, + "loss": 0.7038, + "step": 11667 + }, + { + "epoch": 1.74, + "grad_norm": 1.56054385704243, + "learning_rate": 1.5147900859867506e-06, + "loss": 0.6797, + "step": 11668 + }, + { + "epoch": 1.74, + "grad_norm": 0.9796679633809237, + "learning_rate": 1.5147072600518839e-06, + "loss": 0.6452, + "step": 11669 + }, + { + "epoch": 1.74, + "grad_norm": 6.189061867063472, + "learning_rate": 1.5146244293132094e-06, + "loss": 0.6608, + "step": 11670 + }, + { + "epoch": 1.74, + "grad_norm": 1.0175976025365658, + "learning_rate": 1.514541593771499e-06, + "loss": 0.6621, + "step": 11671 + }, + { + "epoch": 1.74, + "grad_norm": 1.3684597700371743, + "learning_rate": 1.5144587534275266e-06, + "loss": 0.6953, + "step": 11672 + }, + { + "epoch": 1.74, + "grad_norm": 0.8238939219503716, + "learning_rate": 1.5143759082820653e-06, + "loss": 0.6576, + "step": 11673 + }, + { + "epoch": 1.74, + "grad_norm": 2.128895102397412, + "learning_rate": 1.514293058335888e-06, + "loss": 0.6719, + "step": 11674 + }, + { + "epoch": 1.74, + "grad_norm": 2.7589103092609037, + "learning_rate": 1.514210203589768e-06, + "loss": 0.6784, + "step": 11675 + }, + { + "epoch": 1.74, + "grad_norm": 0.7630656788660078, + "learning_rate": 1.5141273440444789e-06, + "loss": 0.6634, + "step": 11676 + }, + { + "epoch": 1.74, + "grad_norm": 0.8370886725229035, + "learning_rate": 1.5140444797007939e-06, + "loss": 0.6706, + "step": 11677 + }, + { + "epoch": 1.74, + "grad_norm": 3.1732053176297605, + "learning_rate": 1.5139616105594861e-06, + "loss": 0.6686, + "step": 11678 + }, + { + "epoch": 1.74, + "grad_norm": 2.066842656356591, + "learning_rate": 1.5138787366213293e-06, + "loss": 0.6654, + "step": 11679 + }, + { + "epoch": 1.74, + "grad_norm": 1.958821708229822, + "learning_rate": 1.5137958578870967e-06, + "loss": 0.6771, + "step": 11680 + }, + { + "epoch": 1.74, + "grad_norm": 2.7209718174774036, + "learning_rate": 1.513712974357562e-06, + "loss": 0.6732, + "step": 11681 + }, + { + "epoch": 1.74, + "grad_norm": 2.1888983458149633, + "learning_rate": 1.5136300860334985e-06, + "loss": 0.6621, + "step": 11682 + }, + { + "epoch": 1.74, + "grad_norm": 1.4225821767614661, + "learning_rate": 1.5135471929156798e-06, + "loss": 0.651, + "step": 11683 + }, + { + "epoch": 1.74, + "grad_norm": 1.0175234328720022, + "learning_rate": 1.5134642950048804e-06, + "loss": 0.6615, + "step": 11684 + }, + { + "epoch": 1.74, + "grad_norm": 2.639854662788865, + "learning_rate": 1.5133813923018728e-06, + "loss": 0.6673, + "step": 11685 + }, + { + "epoch": 1.74, + "grad_norm": 1.820807585151161, + "learning_rate": 1.5132984848074316e-06, + "loss": 0.696, + "step": 11686 + }, + { + "epoch": 1.74, + "grad_norm": 2.5068967910038906, + "learning_rate": 1.5132155725223302e-06, + "loss": 0.6836, + "step": 11687 + }, + { + "epoch": 1.74, + "grad_norm": 1.442580002273388, + "learning_rate": 1.513132655447342e-06, + "loss": 0.7057, + "step": 11688 + }, + { + "epoch": 1.74, + "grad_norm": 1.099993433891396, + "learning_rate": 1.5130497335832417e-06, + "loss": 0.6719, + "step": 11689 + }, + { + "epoch": 1.74, + "grad_norm": 2.888585341331257, + "learning_rate": 1.512966806930803e-06, + "loss": 0.7077, + "step": 11690 + }, + { + "epoch": 1.74, + "grad_norm": 1.2196715299738263, + "learning_rate": 1.5128838754907993e-06, + "loss": 0.6562, + "step": 11691 + }, + { + "epoch": 1.74, + "grad_norm": 1.1530351908870633, + "learning_rate": 1.5128009392640056e-06, + "loss": 0.6764, + "step": 11692 + }, + { + "epoch": 1.74, + "grad_norm": 2.6675962397131356, + "learning_rate": 1.5127179982511948e-06, + "loss": 0.6797, + "step": 11693 + }, + { + "epoch": 1.74, + "grad_norm": 2.9382717209708455, + "learning_rate": 1.5126350524531421e-06, + "loss": 0.6458, + "step": 11694 + }, + { + "epoch": 1.74, + "grad_norm": 1.9544699931989893, + "learning_rate": 1.5125521018706208e-06, + "loss": 0.6693, + "step": 11695 + }, + { + "epoch": 1.74, + "grad_norm": 2.8733528750059127, + "learning_rate": 1.5124691465044056e-06, + "loss": 0.6608, + "step": 11696 + }, + { + "epoch": 1.74, + "grad_norm": 3.3477874018045912, + "learning_rate": 1.5123861863552706e-06, + "loss": 0.6751, + "step": 11697 + }, + { + "epoch": 1.74, + "grad_norm": 4.522340410197998, + "learning_rate": 1.5123032214239898e-06, + "loss": 0.6719, + "step": 11698 + }, + { + "epoch": 1.74, + "grad_norm": 1.7754910642119817, + "learning_rate": 1.5122202517113376e-06, + "loss": 0.6895, + "step": 11699 + }, + { + "epoch": 1.74, + "grad_norm": 2.668492754257447, + "learning_rate": 1.5121372772180889e-06, + "loss": 0.6901, + "step": 11700 + }, + { + "epoch": 1.75, + "grad_norm": 0.9808868201252721, + "learning_rate": 1.5120542979450173e-06, + "loss": 0.6719, + "step": 11701 + }, + { + "epoch": 1.75, + "grad_norm": 1.1852698068562173, + "learning_rate": 1.5119713138928978e-06, + "loss": 0.6868, + "step": 11702 + }, + { + "epoch": 1.75, + "grad_norm": 4.985251422585913, + "learning_rate": 1.5118883250625047e-06, + "loss": 0.6699, + "step": 11703 + }, + { + "epoch": 1.75, + "grad_norm": 0.845808520644887, + "learning_rate": 1.5118053314546128e-06, + "loss": 0.6686, + "step": 11704 + }, + { + "epoch": 1.75, + "grad_norm": 3.670887622681206, + "learning_rate": 1.5117223330699963e-06, + "loss": 0.6882, + "step": 11705 + }, + { + "epoch": 1.75, + "grad_norm": 1.663667101670586, + "learning_rate": 1.5116393299094302e-06, + "loss": 0.7148, + "step": 11706 + }, + { + "epoch": 1.75, + "grad_norm": 0.8474500720777891, + "learning_rate": 1.5115563219736886e-06, + "loss": 0.6608, + "step": 11707 + }, + { + "epoch": 1.75, + "grad_norm": 2.9685927150975844, + "learning_rate": 1.511473309263547e-06, + "loss": 0.6888, + "step": 11708 + }, + { + "epoch": 1.75, + "grad_norm": 0.9589983700045894, + "learning_rate": 1.5113902917797795e-06, + "loss": 0.6647, + "step": 11709 + }, + { + "epoch": 1.75, + "grad_norm": 2.8002578771345643, + "learning_rate": 1.5113072695231615e-06, + "loss": 0.666, + "step": 11710 + }, + { + "epoch": 1.75, + "grad_norm": 2.000152634315775, + "learning_rate": 1.5112242424944674e-06, + "loss": 0.6797, + "step": 11711 + }, + { + "epoch": 1.75, + "grad_norm": 2.9322798172933306, + "learning_rate": 1.511141210694472e-06, + "loss": 0.6777, + "step": 11712 + }, + { + "epoch": 1.75, + "grad_norm": 1.5522351026181698, + "learning_rate": 1.511058174123951e-06, + "loss": 0.6771, + "step": 11713 + }, + { + "epoch": 1.75, + "grad_norm": 2.4376500280099753, + "learning_rate": 1.5109751327836783e-06, + "loss": 0.6823, + "step": 11714 + }, + { + "epoch": 1.75, + "grad_norm": 3.133744393444653, + "learning_rate": 1.5108920866744298e-06, + "loss": 0.6608, + "step": 11715 + }, + { + "epoch": 1.75, + "grad_norm": 2.546706879089998, + "learning_rate": 1.5108090357969801e-06, + "loss": 0.6706, + "step": 11716 + }, + { + "epoch": 1.75, + "grad_norm": 1.731104980194241, + "learning_rate": 1.5107259801521047e-06, + "loss": 0.7064, + "step": 11717 + }, + { + "epoch": 1.75, + "grad_norm": 0.7583058862422003, + "learning_rate": 1.5106429197405784e-06, + "loss": 0.6667, + "step": 11718 + }, + { + "epoch": 1.75, + "grad_norm": 0.8482528283940698, + "learning_rate": 1.5105598545631764e-06, + "loss": 0.6745, + "step": 11719 + }, + { + "epoch": 1.75, + "grad_norm": 1.0380172378742518, + "learning_rate": 1.5104767846206742e-06, + "loss": 0.6712, + "step": 11720 + }, + { + "epoch": 1.75, + "grad_norm": 1.7738137506707194, + "learning_rate": 1.5103937099138475e-06, + "loss": 0.6628, + "step": 11721 + }, + { + "epoch": 1.75, + "grad_norm": 1.3619134645527178, + "learning_rate": 1.5103106304434706e-06, + "loss": 0.6829, + "step": 11722 + }, + { + "epoch": 1.75, + "grad_norm": 3.8115747149917643, + "learning_rate": 1.51022754621032e-06, + "loss": 0.6576, + "step": 11723 + }, + { + "epoch": 1.75, + "grad_norm": 0.8911371882306691, + "learning_rate": 1.51014445721517e-06, + "loss": 0.6836, + "step": 11724 + }, + { + "epoch": 1.75, + "grad_norm": 0.9441731241047443, + "learning_rate": 1.5100613634587972e-06, + "loss": 0.6732, + "step": 11725 + }, + { + "epoch": 1.75, + "grad_norm": 5.124621832842882, + "learning_rate": 1.509978264941976e-06, + "loss": 0.6921, + "step": 11726 + }, + { + "epoch": 1.75, + "grad_norm": 0.9042321596353384, + "learning_rate": 1.5098951616654831e-06, + "loss": 0.6348, + "step": 11727 + }, + { + "epoch": 1.75, + "grad_norm": 0.8568073939113832, + "learning_rate": 1.5098120536300933e-06, + "loss": 0.6725, + "step": 11728 + }, + { + "epoch": 1.75, + "grad_norm": 2.007098443783929, + "learning_rate": 1.5097289408365828e-06, + "loss": 0.6719, + "step": 11729 + }, + { + "epoch": 1.75, + "grad_norm": 5.742681432105714, + "learning_rate": 1.5096458232857265e-06, + "loss": 0.6654, + "step": 11730 + }, + { + "epoch": 1.75, + "grad_norm": 1.7390802220852914, + "learning_rate": 1.5095627009783012e-06, + "loss": 0.6784, + "step": 11731 + }, + { + "epoch": 1.75, + "grad_norm": 5.5044016492412275, + "learning_rate": 1.509479573915082e-06, + "loss": 0.6797, + "step": 11732 + }, + { + "epoch": 1.75, + "grad_norm": 2.420161221106136, + "learning_rate": 1.5093964420968446e-06, + "loss": 0.6842, + "step": 11733 + }, + { + "epoch": 1.75, + "grad_norm": 0.8716573670366264, + "learning_rate": 1.5093133055243655e-06, + "loss": 0.668, + "step": 11734 + }, + { + "epoch": 1.75, + "grad_norm": 1.0926197160540283, + "learning_rate": 1.5092301641984198e-06, + "loss": 0.6764, + "step": 11735 + }, + { + "epoch": 1.75, + "grad_norm": 2.124886753499349, + "learning_rate": 1.5091470181197842e-06, + "loss": 0.681, + "step": 11736 + }, + { + "epoch": 1.75, + "grad_norm": 3.6996541516420773, + "learning_rate": 1.5090638672892348e-06, + "loss": 0.6576, + "step": 11737 + }, + { + "epoch": 1.75, + "grad_norm": 4.079867183281646, + "learning_rate": 1.5089807117075468e-06, + "loss": 0.6569, + "step": 11738 + }, + { + "epoch": 1.75, + "grad_norm": 1.8626901102552127, + "learning_rate": 1.508897551375497e-06, + "loss": 0.6602, + "step": 11739 + }, + { + "epoch": 1.75, + "grad_norm": 2.668982829161387, + "learning_rate": 1.5088143862938615e-06, + "loss": 0.681, + "step": 11740 + }, + { + "epoch": 1.75, + "grad_norm": 0.8485745049430399, + "learning_rate": 1.508731216463416e-06, + "loss": 0.6738, + "step": 11741 + }, + { + "epoch": 1.75, + "grad_norm": 2.2750799942495576, + "learning_rate": 1.5086480418849374e-06, + "loss": 0.6602, + "step": 11742 + }, + { + "epoch": 1.75, + "grad_norm": 5.518797548600508, + "learning_rate": 1.5085648625592015e-06, + "loss": 0.6615, + "step": 11743 + }, + { + "epoch": 1.75, + "grad_norm": 1.246773864064693, + "learning_rate": 1.508481678486985e-06, + "loss": 0.6634, + "step": 11744 + }, + { + "epoch": 1.75, + "grad_norm": 2.2517240488636565, + "learning_rate": 1.5083984896690637e-06, + "loss": 0.6615, + "step": 11745 + }, + { + "epoch": 1.75, + "grad_norm": 2.912652900589948, + "learning_rate": 1.5083152961062144e-06, + "loss": 0.6914, + "step": 11746 + }, + { + "epoch": 1.75, + "grad_norm": 1.5736414098559168, + "learning_rate": 1.5082320977992136e-06, + "loss": 0.6478, + "step": 11747 + }, + { + "epoch": 1.75, + "grad_norm": 2.9195811523248505, + "learning_rate": 1.5081488947488374e-06, + "loss": 0.6836, + "step": 11748 + }, + { + "epoch": 1.75, + "grad_norm": 0.8881323627140221, + "learning_rate": 1.508065686955863e-06, + "loss": 0.6888, + "step": 11749 + }, + { + "epoch": 1.75, + "grad_norm": 1.55157200683781, + "learning_rate": 1.5079824744210666e-06, + "loss": 0.7201, + "step": 11750 + }, + { + "epoch": 1.75, + "grad_norm": 1.78725460763632, + "learning_rate": 1.5078992571452248e-06, + "loss": 0.7129, + "step": 11751 + }, + { + "epoch": 1.75, + "grad_norm": 2.097324154866867, + "learning_rate": 1.5078160351291141e-06, + "loss": 0.6589, + "step": 11752 + }, + { + "epoch": 1.75, + "grad_norm": 1.3551762123864088, + "learning_rate": 1.5077328083735114e-06, + "loss": 0.6608, + "step": 11753 + }, + { + "epoch": 1.75, + "grad_norm": 0.9158405835821134, + "learning_rate": 1.5076495768791938e-06, + "loss": 0.6725, + "step": 11754 + }, + { + "epoch": 1.75, + "grad_norm": 2.854191454704687, + "learning_rate": 1.5075663406469374e-06, + "loss": 0.6784, + "step": 11755 + }, + { + "epoch": 1.75, + "grad_norm": 1.8042457062349715, + "learning_rate": 1.50748309967752e-06, + "loss": 0.6927, + "step": 11756 + }, + { + "epoch": 1.75, + "grad_norm": 1.216094859454101, + "learning_rate": 1.5073998539717174e-06, + "loss": 0.679, + "step": 11757 + }, + { + "epoch": 1.75, + "grad_norm": 3.445695995665239, + "learning_rate": 1.5073166035303074e-06, + "loss": 0.6647, + "step": 11758 + }, + { + "epoch": 1.75, + "grad_norm": 0.8292577800199691, + "learning_rate": 1.5072333483540668e-06, + "loss": 0.6849, + "step": 11759 + }, + { + "epoch": 1.75, + "grad_norm": 4.35317902254148, + "learning_rate": 1.507150088443772e-06, + "loss": 0.6712, + "step": 11760 + }, + { + "epoch": 1.75, + "grad_norm": 4.729325958170044, + "learning_rate": 1.507066823800201e-06, + "loss": 0.6908, + "step": 11761 + }, + { + "epoch": 1.75, + "grad_norm": 1.2129496355750091, + "learning_rate": 1.5069835544241306e-06, + "loss": 0.6504, + "step": 11762 + }, + { + "epoch": 1.75, + "grad_norm": 0.8359255765640746, + "learning_rate": 1.5069002803163375e-06, + "loss": 0.6719, + "step": 11763 + }, + { + "epoch": 1.75, + "grad_norm": 2.4293475387923937, + "learning_rate": 1.5068170014775996e-06, + "loss": 0.6934, + "step": 11764 + }, + { + "epoch": 1.75, + "grad_norm": 1.6840015164287596, + "learning_rate": 1.5067337179086935e-06, + "loss": 0.6914, + "step": 11765 + }, + { + "epoch": 1.75, + "grad_norm": 1.0304237853686806, + "learning_rate": 1.506650429610397e-06, + "loss": 0.6764, + "step": 11766 + }, + { + "epoch": 1.75, + "grad_norm": 1.9625838750111442, + "learning_rate": 1.506567136583487e-06, + "loss": 0.6562, + "step": 11767 + }, + { + "epoch": 1.76, + "grad_norm": 1.2260194575504286, + "learning_rate": 1.5064838388287414e-06, + "loss": 0.6719, + "step": 11768 + }, + { + "epoch": 1.76, + "grad_norm": 2.457817352460932, + "learning_rate": 1.506400536346937e-06, + "loss": 0.7122, + "step": 11769 + }, + { + "epoch": 1.76, + "grad_norm": 0.8617622502390823, + "learning_rate": 1.506317229138852e-06, + "loss": 0.6803, + "step": 11770 + }, + { + "epoch": 1.76, + "grad_norm": 0.8030583791832338, + "learning_rate": 1.5062339172052632e-06, + "loss": 0.6738, + "step": 11771 + }, + { + "epoch": 1.76, + "grad_norm": 1.5428288515733437, + "learning_rate": 1.5061506005469485e-06, + "loss": 0.666, + "step": 11772 + }, + { + "epoch": 1.76, + "grad_norm": 0.7538789451176319, + "learning_rate": 1.5060672791646857e-06, + "loss": 0.6888, + "step": 11773 + }, + { + "epoch": 1.76, + "grad_norm": 5.8512013183999585, + "learning_rate": 1.5059839530592523e-06, + "loss": 0.6816, + "step": 11774 + }, + { + "epoch": 1.76, + "grad_norm": 1.8577341025300944, + "learning_rate": 1.5059006222314255e-06, + "loss": 0.6589, + "step": 11775 + }, + { + "epoch": 1.76, + "grad_norm": 1.3893111473171877, + "learning_rate": 1.5058172866819838e-06, + "loss": 0.681, + "step": 11776 + }, + { + "epoch": 1.76, + "grad_norm": 1.0051430494885982, + "learning_rate": 1.5057339464117045e-06, + "loss": 0.679, + "step": 11777 + }, + { + "epoch": 1.76, + "grad_norm": 4.370607030106942, + "learning_rate": 1.5056506014213655e-06, + "loss": 0.6947, + "step": 11778 + }, + { + "epoch": 1.76, + "grad_norm": 3.2944088098218303, + "learning_rate": 1.5055672517117448e-06, + "loss": 0.6517, + "step": 11779 + }, + { + "epoch": 1.76, + "grad_norm": 1.5889702407908473, + "learning_rate": 1.5054838972836203e-06, + "loss": 0.6602, + "step": 11780 + }, + { + "epoch": 1.76, + "grad_norm": 2.06425270626814, + "learning_rate": 1.50540053813777e-06, + "loss": 0.6797, + "step": 11781 + }, + { + "epoch": 1.76, + "grad_norm": 3.513488673548674, + "learning_rate": 1.5053171742749713e-06, + "loss": 0.6836, + "step": 11782 + }, + { + "epoch": 1.76, + "grad_norm": 0.9612609255488433, + "learning_rate": 1.5052338056960033e-06, + "loss": 0.6777, + "step": 11783 + }, + { + "epoch": 1.76, + "grad_norm": 1.2982710414217042, + "learning_rate": 1.505150432401643e-06, + "loss": 0.6452, + "step": 11784 + }, + { + "epoch": 1.76, + "grad_norm": 4.001752747085555, + "learning_rate": 1.5050670543926694e-06, + "loss": 0.6673, + "step": 11785 + }, + { + "epoch": 1.76, + "grad_norm": 0.8386356885927152, + "learning_rate": 1.50498367166986e-06, + "loss": 0.6569, + "step": 11786 + }, + { + "epoch": 1.76, + "grad_norm": 4.216404248177488, + "learning_rate": 1.5049002842339937e-06, + "loss": 0.6901, + "step": 11787 + }, + { + "epoch": 1.76, + "grad_norm": 1.8746050647518806, + "learning_rate": 1.5048168920858481e-06, + "loss": 0.6641, + "step": 11788 + }, + { + "epoch": 1.76, + "grad_norm": 5.007936045132978, + "learning_rate": 1.504733495226202e-06, + "loss": 0.6732, + "step": 11789 + }, + { + "epoch": 1.76, + "grad_norm": 0.9940780530170762, + "learning_rate": 1.5046500936558336e-06, + "loss": 0.6621, + "step": 11790 + }, + { + "epoch": 1.76, + "grad_norm": 2.286668490600123, + "learning_rate": 1.504566687375521e-06, + "loss": 0.6615, + "step": 11791 + }, + { + "epoch": 1.76, + "grad_norm": 1.113884433397335, + "learning_rate": 1.504483276386043e-06, + "loss": 0.694, + "step": 11792 + }, + { + "epoch": 1.76, + "grad_norm": 6.130701131838655, + "learning_rate": 1.5043998606881777e-06, + "loss": 0.6927, + "step": 11793 + }, + { + "epoch": 1.76, + "grad_norm": 1.9707057589661299, + "learning_rate": 1.5043164402827043e-06, + "loss": 0.707, + "step": 11794 + }, + { + "epoch": 1.76, + "grad_norm": 1.1333850275071067, + "learning_rate": 1.504233015170401e-06, + "loss": 0.6458, + "step": 11795 + }, + { + "epoch": 1.76, + "grad_norm": 6.967345358930288, + "learning_rate": 1.5041495853520463e-06, + "loss": 0.6751, + "step": 11796 + }, + { + "epoch": 1.76, + "grad_norm": 2.8766178748133813, + "learning_rate": 1.504066150828419e-06, + "loss": 0.6628, + "step": 11797 + }, + { + "epoch": 1.76, + "grad_norm": 1.0868168440137946, + "learning_rate": 1.5039827116002973e-06, + "loss": 0.6738, + "step": 11798 + }, + { + "epoch": 1.76, + "grad_norm": 2.4361660013999082, + "learning_rate": 1.5038992676684607e-06, + "loss": 0.6992, + "step": 11799 + }, + { + "epoch": 1.76, + "grad_norm": 3.874610968906433, + "learning_rate": 1.5038158190336876e-06, + "loss": 0.6517, + "step": 11800 + }, + { + "epoch": 1.76, + "grad_norm": 1.2347193810875197, + "learning_rate": 1.503732365696757e-06, + "loss": 0.6543, + "step": 11801 + }, + { + "epoch": 1.76, + "grad_norm": 2.7807097968519403, + "learning_rate": 1.5036489076584476e-06, + "loss": 0.6634, + "step": 11802 + }, + { + "epoch": 1.76, + "grad_norm": 3.2052829909748612, + "learning_rate": 1.5035654449195384e-06, + "loss": 0.6966, + "step": 11803 + }, + { + "epoch": 1.76, + "grad_norm": 1.0407520402530044, + "learning_rate": 1.5034819774808083e-06, + "loss": 0.6654, + "step": 11804 + }, + { + "epoch": 1.76, + "grad_norm": 0.9516562565618926, + "learning_rate": 1.5033985053430365e-06, + "loss": 0.6842, + "step": 11805 + }, + { + "epoch": 1.76, + "grad_norm": 4.303113726662779, + "learning_rate": 1.503315028507002e-06, + "loss": 0.696, + "step": 11806 + }, + { + "epoch": 1.76, + "grad_norm": 1.5516482023384734, + "learning_rate": 1.5032315469734835e-06, + "loss": 0.6693, + "step": 11807 + }, + { + "epoch": 1.76, + "grad_norm": 2.24024794211164, + "learning_rate": 1.503148060743261e-06, + "loss": 0.6921, + "step": 11808 + }, + { + "epoch": 1.76, + "grad_norm": 1.2816281408427155, + "learning_rate": 1.5030645698171127e-06, + "loss": 0.6406, + "step": 11809 + }, + { + "epoch": 1.76, + "grad_norm": 4.382884878161706, + "learning_rate": 1.5029810741958184e-06, + "loss": 0.6751, + "step": 11810 + }, + { + "epoch": 1.76, + "grad_norm": 2.5239743386000164, + "learning_rate": 1.5028975738801575e-06, + "loss": 0.6556, + "step": 11811 + }, + { + "epoch": 1.76, + "grad_norm": 7.2634624871023075, + "learning_rate": 1.5028140688709085e-06, + "loss": 0.6471, + "step": 11812 + }, + { + "epoch": 1.76, + "grad_norm": 0.9986330481248755, + "learning_rate": 1.502730559168852e-06, + "loss": 0.6699, + "step": 11813 + }, + { + "epoch": 1.76, + "grad_norm": 4.667338882905726, + "learning_rate": 1.5026470447747662e-06, + "loss": 0.6862, + "step": 11814 + }, + { + "epoch": 1.76, + "grad_norm": 0.9467785987362792, + "learning_rate": 1.5025635256894314e-06, + "loss": 0.681, + "step": 11815 + }, + { + "epoch": 1.76, + "grad_norm": 6.003128370304303, + "learning_rate": 1.5024800019136269e-06, + "loss": 0.653, + "step": 11816 + }, + { + "epoch": 1.76, + "grad_norm": 5.753263194373789, + "learning_rate": 1.5023964734481315e-06, + "loss": 0.6706, + "step": 11817 + }, + { + "epoch": 1.76, + "grad_norm": 0.8972675126174536, + "learning_rate": 1.5023129402937258e-06, + "loss": 0.6589, + "step": 11818 + }, + { + "epoch": 1.76, + "grad_norm": 1.0723374399305439, + "learning_rate": 1.502229402451189e-06, + "loss": 0.6803, + "step": 11819 + }, + { + "epoch": 1.76, + "grad_norm": 1.0091499648867628, + "learning_rate": 1.5021458599213008e-06, + "loss": 0.7142, + "step": 11820 + }, + { + "epoch": 1.76, + "grad_norm": 2.6102205784224637, + "learning_rate": 1.5020623127048407e-06, + "loss": 0.6908, + "step": 11821 + }, + { + "epoch": 1.76, + "grad_norm": 4.819287759029297, + "learning_rate": 1.501978760802589e-06, + "loss": 0.6549, + "step": 11822 + }, + { + "epoch": 1.76, + "grad_norm": 1.0554777611044828, + "learning_rate": 1.5018952042153248e-06, + "loss": 0.6842, + "step": 11823 + }, + { + "epoch": 1.76, + "grad_norm": 0.8227552776926362, + "learning_rate": 1.5018116429438283e-06, + "loss": 0.6641, + "step": 11824 + }, + { + "epoch": 1.76, + "grad_norm": 2.05556486331212, + "learning_rate": 1.5017280769888791e-06, + "loss": 0.6621, + "step": 11825 + }, + { + "epoch": 1.76, + "grad_norm": 2.8905808026755597, + "learning_rate": 1.501644506351258e-06, + "loss": 0.6855, + "step": 11826 + }, + { + "epoch": 1.76, + "grad_norm": 1.960532632624404, + "learning_rate": 1.501560931031744e-06, + "loss": 0.6803, + "step": 11827 + }, + { + "epoch": 1.76, + "grad_norm": 2.9311620083048266, + "learning_rate": 1.5014773510311173e-06, + "loss": 0.6628, + "step": 11828 + }, + { + "epoch": 1.76, + "grad_norm": 1.2672474016022472, + "learning_rate": 1.5013937663501587e-06, + "loss": 0.6296, + "step": 11829 + }, + { + "epoch": 1.76, + "grad_norm": 2.861423708593997, + "learning_rate": 1.5013101769896475e-06, + "loss": 0.6816, + "step": 11830 + }, + { + "epoch": 1.76, + "grad_norm": 5.031430030185488, + "learning_rate": 1.5012265829503638e-06, + "loss": 0.679, + "step": 11831 + }, + { + "epoch": 1.76, + "grad_norm": 0.8450543670503132, + "learning_rate": 1.5011429842330884e-06, + "loss": 0.6589, + "step": 11832 + }, + { + "epoch": 1.76, + "grad_norm": 1.916012516038645, + "learning_rate": 1.501059380838601e-06, + "loss": 0.638, + "step": 11833 + }, + { + "epoch": 1.76, + "grad_norm": 0.761877612027408, + "learning_rate": 1.5009757727676822e-06, + "loss": 0.6504, + "step": 11834 + }, + { + "epoch": 1.77, + "grad_norm": 0.8950531470791189, + "learning_rate": 1.5008921600211121e-06, + "loss": 0.6836, + "step": 11835 + }, + { + "epoch": 1.77, + "grad_norm": 7.92493753689475, + "learning_rate": 1.5008085425996715e-06, + "loss": 0.6855, + "step": 11836 + }, + { + "epoch": 1.77, + "grad_norm": 3.796187927828091, + "learning_rate": 1.50072492050414e-06, + "loss": 0.6647, + "step": 11837 + }, + { + "epoch": 1.77, + "grad_norm": 2.4514569406666022, + "learning_rate": 1.5006412937352988e-06, + "loss": 0.7051, + "step": 11838 + }, + { + "epoch": 1.77, + "grad_norm": 2.4705795237423653, + "learning_rate": 1.5005576622939283e-06, + "loss": 0.6413, + "step": 11839 + }, + { + "epoch": 1.77, + "grad_norm": 0.9116800195713789, + "learning_rate": 1.5004740261808086e-06, + "loss": 0.6693, + "step": 11840 + }, + { + "epoch": 1.77, + "grad_norm": 0.9496101375052376, + "learning_rate": 1.5003903853967206e-06, + "loss": 0.6712, + "step": 11841 + }, + { + "epoch": 1.77, + "grad_norm": 5.92020781511227, + "learning_rate": 1.5003067399424451e-06, + "loss": 0.7129, + "step": 11842 + }, + { + "epoch": 1.77, + "grad_norm": 4.188144097008329, + "learning_rate": 1.5002230898187622e-06, + "loss": 0.6621, + "step": 11843 + }, + { + "epoch": 1.77, + "grad_norm": 1.4918613041038322, + "learning_rate": 1.5001394350264529e-06, + "loss": 0.6979, + "step": 11844 + }, + { + "epoch": 1.77, + "grad_norm": 0.8203228877493811, + "learning_rate": 1.5000557755662983e-06, + "loss": 0.6706, + "step": 11845 + }, + { + "epoch": 1.77, + "grad_norm": 5.041312645270769, + "learning_rate": 1.4999721114390788e-06, + "loss": 0.7077, + "step": 11846 + }, + { + "epoch": 1.77, + "grad_norm": 6.964176428886507, + "learning_rate": 1.4998884426455755e-06, + "loss": 0.7122, + "step": 11847 + }, + { + "epoch": 1.77, + "grad_norm": 1.5640747797333203, + "learning_rate": 1.4998047691865692e-06, + "loss": 0.6634, + "step": 11848 + }, + { + "epoch": 1.77, + "grad_norm": 0.9422494964405078, + "learning_rate": 1.4997210910628407e-06, + "loss": 0.6536, + "step": 11849 + }, + { + "epoch": 1.77, + "grad_norm": 1.0585500664188214, + "learning_rate": 1.4996374082751711e-06, + "loss": 0.6751, + "step": 11850 + }, + { + "epoch": 1.77, + "grad_norm": 1.6681037504993805, + "learning_rate": 1.4995537208243412e-06, + "loss": 0.6348, + "step": 11851 + }, + { + "epoch": 1.77, + "grad_norm": 0.8274509530317121, + "learning_rate": 1.4994700287111323e-06, + "loss": 0.6686, + "step": 11852 + }, + { + "epoch": 1.77, + "grad_norm": 2.4504090668719085, + "learning_rate": 1.4993863319363257e-06, + "loss": 0.6686, + "step": 11853 + }, + { + "epoch": 1.77, + "grad_norm": 1.8132152108299262, + "learning_rate": 1.499302630500702e-06, + "loss": 0.6816, + "step": 11854 + }, + { + "epoch": 1.77, + "grad_norm": 6.301834957961106, + "learning_rate": 1.499218924405043e-06, + "loss": 0.679, + "step": 11855 + }, + { + "epoch": 1.77, + "grad_norm": 3.309230422224279, + "learning_rate": 1.4991352136501295e-06, + "loss": 0.6576, + "step": 11856 + }, + { + "epoch": 1.77, + "grad_norm": 0.9867877826442824, + "learning_rate": 1.4990514982367425e-06, + "loss": 0.6602, + "step": 11857 + }, + { + "epoch": 1.77, + "grad_norm": 1.8019476117946285, + "learning_rate": 1.4989677781656642e-06, + "loss": 0.6634, + "step": 11858 + }, + { + "epoch": 1.77, + "grad_norm": 1.0308371064191941, + "learning_rate": 1.4988840534376755e-06, + "loss": 0.6908, + "step": 11859 + }, + { + "epoch": 1.77, + "grad_norm": 1.357422136554526, + "learning_rate": 1.4988003240535575e-06, + "loss": 0.6673, + "step": 11860 + }, + { + "epoch": 1.77, + "grad_norm": 1.5799936965226196, + "learning_rate": 1.4987165900140923e-06, + "loss": 0.6875, + "step": 11861 + }, + { + "epoch": 1.77, + "grad_norm": 4.222815955533011, + "learning_rate": 1.498632851320061e-06, + "loss": 0.6803, + "step": 11862 + }, + { + "epoch": 1.77, + "grad_norm": 2.1426809405156746, + "learning_rate": 1.4985491079722452e-06, + "loss": 0.6771, + "step": 11863 + }, + { + "epoch": 1.77, + "grad_norm": 0.8467305029649089, + "learning_rate": 1.4984653599714262e-06, + "loss": 0.6628, + "step": 11864 + }, + { + "epoch": 1.77, + "grad_norm": 1.2409581988026055, + "learning_rate": 1.4983816073183864e-06, + "loss": 0.6641, + "step": 11865 + }, + { + "epoch": 1.77, + "grad_norm": 2.1139425088957298, + "learning_rate": 1.4982978500139067e-06, + "loss": 0.6823, + "step": 11866 + }, + { + "epoch": 1.77, + "grad_norm": 2.7658589013112707, + "learning_rate": 1.498214088058769e-06, + "loss": 0.6868, + "step": 11867 + }, + { + "epoch": 1.77, + "grad_norm": 1.110973695374311, + "learning_rate": 1.4981303214537552e-06, + "loss": 0.6829, + "step": 11868 + }, + { + "epoch": 1.77, + "grad_norm": 1.5515386446411894, + "learning_rate": 1.4980465501996471e-06, + "loss": 0.6901, + "step": 11869 + }, + { + "epoch": 1.77, + "grad_norm": 5.634978959427809, + "learning_rate": 1.4979627742972266e-06, + "loss": 0.7214, + "step": 11870 + }, + { + "epoch": 1.77, + "grad_norm": 1.3836400575376067, + "learning_rate": 1.4978789937472754e-06, + "loss": 0.6556, + "step": 11871 + }, + { + "epoch": 1.77, + "grad_norm": 3.0147205892281166, + "learning_rate": 1.4977952085505757e-06, + "loss": 0.6849, + "step": 11872 + }, + { + "epoch": 1.77, + "grad_norm": 3.344861692173813, + "learning_rate": 1.497711418707909e-06, + "loss": 0.6621, + "step": 11873 + }, + { + "epoch": 1.77, + "grad_norm": 2.0205742766844867, + "learning_rate": 1.4976276242200578e-06, + "loss": 0.6816, + "step": 11874 + }, + { + "epoch": 1.77, + "grad_norm": 1.338425760108114, + "learning_rate": 1.4975438250878038e-06, + "loss": 0.7044, + "step": 11875 + }, + { + "epoch": 1.77, + "grad_norm": 3.082614463799419, + "learning_rate": 1.4974600213119292e-06, + "loss": 0.6615, + "step": 11876 + }, + { + "epoch": 1.77, + "grad_norm": 4.616517977386176, + "learning_rate": 1.4973762128932167e-06, + "loss": 0.7031, + "step": 11877 + }, + { + "epoch": 1.77, + "grad_norm": 0.9376554526609723, + "learning_rate": 1.4972923998324476e-06, + "loss": 0.6784, + "step": 11878 + }, + { + "epoch": 1.77, + "grad_norm": 4.008562640359669, + "learning_rate": 1.497208582130405e-06, + "loss": 0.6862, + "step": 11879 + }, + { + "epoch": 1.77, + "grad_norm": 0.8244749610448849, + "learning_rate": 1.4971247597878702e-06, + "loss": 0.6823, + "step": 11880 + }, + { + "epoch": 1.77, + "grad_norm": 0.8341742137596365, + "learning_rate": 1.4970409328056262e-06, + "loss": 0.6803, + "step": 11881 + }, + { + "epoch": 1.77, + "grad_norm": 3.9826178979596656, + "learning_rate": 1.4969571011844556e-06, + "loss": 0.6491, + "step": 11882 + }, + { + "epoch": 1.77, + "grad_norm": 2.1533546988486982, + "learning_rate": 1.4968732649251398e-06, + "loss": 0.679, + "step": 11883 + }, + { + "epoch": 1.77, + "grad_norm": 3.048673033854098, + "learning_rate": 1.4967894240284624e-06, + "loss": 0.6719, + "step": 11884 + }, + { + "epoch": 1.77, + "grad_norm": 1.7110782278830865, + "learning_rate": 1.4967055784952054e-06, + "loss": 0.6888, + "step": 11885 + }, + { + "epoch": 1.77, + "grad_norm": 0.7204755348090529, + "learning_rate": 1.496621728326151e-06, + "loss": 0.6673, + "step": 11886 + }, + { + "epoch": 1.77, + "grad_norm": 0.7252211892485284, + "learning_rate": 1.4965378735220821e-06, + "loss": 0.679, + "step": 11887 + }, + { + "epoch": 1.77, + "grad_norm": 1.3396140316993785, + "learning_rate": 1.4964540140837811e-06, + "loss": 0.651, + "step": 11888 + }, + { + "epoch": 1.77, + "grad_norm": 0.7460820223769195, + "learning_rate": 1.4963701500120314e-06, + "loss": 0.6862, + "step": 11889 + }, + { + "epoch": 1.77, + "grad_norm": 1.3870264996725012, + "learning_rate": 1.496286281307615e-06, + "loss": 0.679, + "step": 11890 + }, + { + "epoch": 1.77, + "grad_norm": 1.7052917494123963, + "learning_rate": 1.4962024079713146e-06, + "loss": 0.6732, + "step": 11891 + }, + { + "epoch": 1.77, + "grad_norm": 0.8996906448173868, + "learning_rate": 1.4961185300039136e-06, + "loss": 0.6868, + "step": 11892 + }, + { + "epoch": 1.77, + "grad_norm": 0.7861706217275792, + "learning_rate": 1.4960346474061944e-06, + "loss": 0.6576, + "step": 11893 + }, + { + "epoch": 1.77, + "grad_norm": 1.9507765812620215, + "learning_rate": 1.4959507601789398e-06, + "loss": 0.6816, + "step": 11894 + }, + { + "epoch": 1.77, + "grad_norm": 1.8651622062190325, + "learning_rate": 1.4958668683229335e-06, + "loss": 0.6667, + "step": 11895 + }, + { + "epoch": 1.77, + "grad_norm": 0.7214303148764029, + "learning_rate": 1.495782971838957e-06, + "loss": 0.6686, + "step": 11896 + }, + { + "epoch": 1.77, + "grad_norm": 0.8333290010918981, + "learning_rate": 1.4956990707277946e-06, + "loss": 0.6699, + "step": 11897 + }, + { + "epoch": 1.77, + "grad_norm": 2.6835061481677327, + "learning_rate": 1.495615164990229e-06, + "loss": 0.6556, + "step": 11898 + }, + { + "epoch": 1.77, + "grad_norm": 1.121039729635472, + "learning_rate": 1.495531254627043e-06, + "loss": 0.6549, + "step": 11899 + }, + { + "epoch": 1.77, + "grad_norm": 0.9036911073309608, + "learning_rate": 1.4954473396390203e-06, + "loss": 0.696, + "step": 11900 + }, + { + "epoch": 1.77, + "grad_norm": 0.9561516795920127, + "learning_rate": 1.4953634200269435e-06, + "loss": 0.6947, + "step": 11901 + }, + { + "epoch": 1.78, + "grad_norm": 4.144152904754944, + "learning_rate": 1.4952794957915963e-06, + "loss": 0.6719, + "step": 11902 + }, + { + "epoch": 1.78, + "grad_norm": 2.097225228244218, + "learning_rate": 1.4951955669337614e-06, + "loss": 0.638, + "step": 11903 + }, + { + "epoch": 1.78, + "grad_norm": 1.2829292091950437, + "learning_rate": 1.4951116334542227e-06, + "loss": 0.6602, + "step": 11904 + }, + { + "epoch": 1.78, + "grad_norm": 1.1816851366846484, + "learning_rate": 1.4950276953537632e-06, + "loss": 0.6868, + "step": 11905 + }, + { + "epoch": 1.78, + "grad_norm": 5.29633935943123, + "learning_rate": 1.494943752633167e-06, + "loss": 0.6523, + "step": 11906 + }, + { + "epoch": 1.78, + "grad_norm": 1.1111959131688873, + "learning_rate": 1.4948598052932164e-06, + "loss": 0.6667, + "step": 11907 + }, + { + "epoch": 1.78, + "grad_norm": 1.3737695267751824, + "learning_rate": 1.494775853334696e-06, + "loss": 0.6556, + "step": 11908 + }, + { + "epoch": 1.78, + "grad_norm": 1.2762668783396212, + "learning_rate": 1.494691896758388e-06, + "loss": 0.6432, + "step": 11909 + }, + { + "epoch": 1.78, + "grad_norm": 5.18497274254999, + "learning_rate": 1.4946079355650772e-06, + "loss": 0.7077, + "step": 11910 + }, + { + "epoch": 1.78, + "grad_norm": 3.737302863331133, + "learning_rate": 1.4945239697555469e-06, + "loss": 0.6628, + "step": 11911 + }, + { + "epoch": 1.78, + "grad_norm": 1.1368534735248739, + "learning_rate": 1.4944399993305806e-06, + "loss": 0.6556, + "step": 11912 + }, + { + "epoch": 1.78, + "grad_norm": 1.4329242832253484, + "learning_rate": 1.4943560242909624e-06, + "loss": 0.6823, + "step": 11913 + }, + { + "epoch": 1.78, + "grad_norm": 1.2850862382346653, + "learning_rate": 1.494272044637475e-06, + "loss": 0.6296, + "step": 11914 + }, + { + "epoch": 1.78, + "grad_norm": 1.7403928642933437, + "learning_rate": 1.4941880603709034e-06, + "loss": 0.6497, + "step": 11915 + }, + { + "epoch": 1.78, + "grad_norm": 1.9612363915610491, + "learning_rate": 1.4941040714920307e-06, + "loss": 0.7142, + "step": 11916 + }, + { + "epoch": 1.78, + "grad_norm": 1.1317689795769958, + "learning_rate": 1.494020078001641e-06, + "loss": 0.6621, + "step": 11917 + }, + { + "epoch": 1.78, + "grad_norm": 1.8427468115715566, + "learning_rate": 1.4939360799005183e-06, + "loss": 0.6771, + "step": 11918 + }, + { + "epoch": 1.78, + "grad_norm": 1.2947167097242702, + "learning_rate": 1.4938520771894467e-06, + "loss": 0.6771, + "step": 11919 + }, + { + "epoch": 1.78, + "grad_norm": 1.327916624652714, + "learning_rate": 1.4937680698692096e-06, + "loss": 0.6823, + "step": 11920 + }, + { + "epoch": 1.78, + "grad_norm": 1.7230629017586712, + "learning_rate": 1.4936840579405918e-06, + "loss": 0.6589, + "step": 11921 + }, + { + "epoch": 1.78, + "grad_norm": 1.689463641058938, + "learning_rate": 1.4936000414043767e-06, + "loss": 0.6569, + "step": 11922 + }, + { + "epoch": 1.78, + "grad_norm": 0.9120818416043547, + "learning_rate": 1.493516020261349e-06, + "loss": 0.6699, + "step": 11923 + }, + { + "epoch": 1.78, + "grad_norm": 1.064874820586603, + "learning_rate": 1.4934319945122927e-06, + "loss": 0.6777, + "step": 11924 + }, + { + "epoch": 1.78, + "grad_norm": 1.685247495060427, + "learning_rate": 1.4933479641579916e-06, + "loss": 0.6211, + "step": 11925 + }, + { + "epoch": 1.78, + "grad_norm": 1.876406389124897, + "learning_rate": 1.4932639291992305e-06, + "loss": 0.6816, + "step": 11926 + }, + { + "epoch": 1.78, + "grad_norm": 1.1102815904510832, + "learning_rate": 1.4931798896367936e-06, + "loss": 0.6562, + "step": 11927 + }, + { + "epoch": 1.78, + "grad_norm": 2.6394869033668757, + "learning_rate": 1.4930958454714651e-06, + "loss": 0.6576, + "step": 11928 + }, + { + "epoch": 1.78, + "grad_norm": 4.780731970463914, + "learning_rate": 1.4930117967040296e-06, + "loss": 0.6758, + "step": 11929 + }, + { + "epoch": 1.78, + "grad_norm": 1.4974105950965768, + "learning_rate": 1.4929277433352712e-06, + "loss": 0.6706, + "step": 11930 + }, + { + "epoch": 1.78, + "grad_norm": 2.049732351195819, + "learning_rate": 1.4928436853659744e-06, + "loss": 0.6849, + "step": 11931 + }, + { + "epoch": 1.78, + "grad_norm": 1.4108486554292097, + "learning_rate": 1.4927596227969246e-06, + "loss": 0.6895, + "step": 11932 + }, + { + "epoch": 1.78, + "grad_norm": 4.240513847990413, + "learning_rate": 1.4926755556289053e-06, + "loss": 0.653, + "step": 11933 + }, + { + "epoch": 1.78, + "grad_norm": 2.1511509515329594, + "learning_rate": 1.4925914838627014e-06, + "loss": 0.6602, + "step": 11934 + }, + { + "epoch": 1.78, + "grad_norm": 1.822153785667129, + "learning_rate": 1.4925074074990976e-06, + "loss": 0.6719, + "step": 11935 + }, + { + "epoch": 1.78, + "grad_norm": 2.7276137127640174, + "learning_rate": 1.4924233265388786e-06, + "loss": 0.6667, + "step": 11936 + }, + { + "epoch": 1.78, + "grad_norm": 3.462806184688508, + "learning_rate": 1.492339240982829e-06, + "loss": 0.6719, + "step": 11937 + }, + { + "epoch": 1.78, + "grad_norm": 2.8226597773353523, + "learning_rate": 1.492255150831734e-06, + "loss": 0.6367, + "step": 11938 + }, + { + "epoch": 1.78, + "grad_norm": 2.171563141680207, + "learning_rate": 1.492171056086378e-06, + "loss": 0.6725, + "step": 11939 + }, + { + "epoch": 1.78, + "grad_norm": 1.0817821219180772, + "learning_rate": 1.492086956747546e-06, + "loss": 0.6484, + "step": 11940 + }, + { + "epoch": 1.78, + "grad_norm": 1.585320023523208, + "learning_rate": 1.4920028528160231e-06, + "loss": 0.6882, + "step": 11941 + }, + { + "epoch": 1.78, + "grad_norm": 1.6672328469142215, + "learning_rate": 1.4919187442925939e-06, + "loss": 0.6562, + "step": 11942 + }, + { + "epoch": 1.78, + "grad_norm": 1.111536722724045, + "learning_rate": 1.4918346311780436e-06, + "loss": 0.6673, + "step": 11943 + }, + { + "epoch": 1.78, + "grad_norm": 3.0750881211127, + "learning_rate": 1.4917505134731573e-06, + "loss": 0.7148, + "step": 11944 + }, + { + "epoch": 1.78, + "grad_norm": 7.902852149065372, + "learning_rate": 1.49166639117872e-06, + "loss": 0.7012, + "step": 11945 + }, + { + "epoch": 1.78, + "grad_norm": 1.2506703394202714, + "learning_rate": 1.4915822642955165e-06, + "loss": 0.6914, + "step": 11946 + }, + { + "epoch": 1.78, + "grad_norm": 1.363325083034512, + "learning_rate": 1.4914981328243323e-06, + "loss": 0.6901, + "step": 11947 + }, + { + "epoch": 1.78, + "grad_norm": 4.5135204240044855, + "learning_rate": 1.491413996765953e-06, + "loss": 0.6595, + "step": 11948 + }, + { + "epoch": 1.78, + "grad_norm": 2.205101993979222, + "learning_rate": 1.4913298561211627e-06, + "loss": 0.6693, + "step": 11949 + }, + { + "epoch": 1.78, + "grad_norm": 1.2298943758375105, + "learning_rate": 1.491245710890748e-06, + "loss": 0.6934, + "step": 11950 + }, + { + "epoch": 1.78, + "grad_norm": 4.093379173846382, + "learning_rate": 1.4911615610754935e-06, + "loss": 0.6882, + "step": 11951 + }, + { + "epoch": 1.78, + "grad_norm": 0.9823267525787449, + "learning_rate": 1.4910774066761845e-06, + "loss": 0.6387, + "step": 11952 + }, + { + "epoch": 1.78, + "grad_norm": 4.559858576314931, + "learning_rate": 1.4909932476936067e-06, + "loss": 0.7012, + "step": 11953 + }, + { + "epoch": 1.78, + "grad_norm": 2.44828583728488, + "learning_rate": 1.4909090841285454e-06, + "loss": 0.6784, + "step": 11954 + }, + { + "epoch": 1.78, + "grad_norm": 3.4025181082084, + "learning_rate": 1.4908249159817864e-06, + "loss": 0.6823, + "step": 11955 + }, + { + "epoch": 1.78, + "grad_norm": 1.441849385399167, + "learning_rate": 1.4907407432541152e-06, + "loss": 0.6497, + "step": 11956 + }, + { + "epoch": 1.78, + "grad_norm": 3.471638948273353, + "learning_rate": 1.490656565946317e-06, + "loss": 0.6387, + "step": 11957 + }, + { + "epoch": 1.78, + "grad_norm": 4.2725654746421124, + "learning_rate": 1.4905723840591777e-06, + "loss": 0.6823, + "step": 11958 + }, + { + "epoch": 1.78, + "grad_norm": 2.495426642285769, + "learning_rate": 1.4904881975934832e-06, + "loss": 0.6608, + "step": 11959 + }, + { + "epoch": 1.78, + "grad_norm": 0.9534433377114144, + "learning_rate": 1.4904040065500186e-06, + "loss": 0.6595, + "step": 11960 + }, + { + "epoch": 1.78, + "grad_norm": 1.8248139529308183, + "learning_rate": 1.4903198109295702e-06, + "loss": 0.6562, + "step": 11961 + }, + { + "epoch": 1.78, + "grad_norm": 5.138833185792895, + "learning_rate": 1.4902356107329238e-06, + "loss": 0.6732, + "step": 11962 + }, + { + "epoch": 1.78, + "grad_norm": 1.7561814256683792, + "learning_rate": 1.4901514059608646e-06, + "loss": 0.6589, + "step": 11963 + }, + { + "epoch": 1.78, + "grad_norm": 2.191512519726874, + "learning_rate": 1.4900671966141793e-06, + "loss": 0.6732, + "step": 11964 + }, + { + "epoch": 1.78, + "grad_norm": 1.5824661279980774, + "learning_rate": 1.489982982693654e-06, + "loss": 0.668, + "step": 11965 + }, + { + "epoch": 1.78, + "grad_norm": 1.429381395038681, + "learning_rate": 1.4898987642000735e-06, + "loss": 0.6986, + "step": 11966 + }, + { + "epoch": 1.78, + "grad_norm": 1.0805610330141326, + "learning_rate": 1.4898145411342247e-06, + "loss": 0.6562, + "step": 11967 + }, + { + "epoch": 1.78, + "grad_norm": 2.86143670949728, + "learning_rate": 1.4897303134968935e-06, + "loss": 0.6862, + "step": 11968 + }, + { + "epoch": 1.79, + "grad_norm": 1.3826229267521497, + "learning_rate": 1.489646081288866e-06, + "loss": 0.6934, + "step": 11969 + }, + { + "epoch": 1.79, + "grad_norm": 0.8718093169285492, + "learning_rate": 1.4895618445109283e-06, + "loss": 0.6725, + "step": 11970 + }, + { + "epoch": 1.79, + "grad_norm": 1.9806540532126193, + "learning_rate": 1.4894776031638666e-06, + "loss": 0.6589, + "step": 11971 + }, + { + "epoch": 1.79, + "grad_norm": 0.874363518201063, + "learning_rate": 1.4893933572484674e-06, + "loss": 0.6615, + "step": 11972 + }, + { + "epoch": 1.79, + "grad_norm": 1.9758600283997585, + "learning_rate": 1.489309106765516e-06, + "loss": 0.6693, + "step": 11973 + }, + { + "epoch": 1.79, + "grad_norm": 2.327691358223645, + "learning_rate": 1.4892248517158003e-06, + "loss": 0.6914, + "step": 11974 + }, + { + "epoch": 1.79, + "grad_norm": 0.8747140248117506, + "learning_rate": 1.4891405921001052e-06, + "loss": 0.6595, + "step": 11975 + }, + { + "epoch": 1.79, + "grad_norm": 0.9919055912523779, + "learning_rate": 1.4890563279192179e-06, + "loss": 0.6803, + "step": 11976 + }, + { + "epoch": 1.79, + "grad_norm": 2.0072851791503985, + "learning_rate": 1.4889720591739247e-06, + "loss": 0.6823, + "step": 11977 + }, + { + "epoch": 1.79, + "grad_norm": 1.275412461617239, + "learning_rate": 1.488887785865012e-06, + "loss": 0.6934, + "step": 11978 + }, + { + "epoch": 1.79, + "grad_norm": 1.568234150153362, + "learning_rate": 1.4888035079932664e-06, + "loss": 0.6478, + "step": 11979 + }, + { + "epoch": 1.79, + "grad_norm": 1.936525567173268, + "learning_rate": 1.4887192255594744e-06, + "loss": 0.6823, + "step": 11980 + }, + { + "epoch": 1.79, + "grad_norm": 1.4300932612329162, + "learning_rate": 1.4886349385644227e-06, + "loss": 0.6719, + "step": 11981 + }, + { + "epoch": 1.79, + "grad_norm": 1.1260829054688455, + "learning_rate": 1.4885506470088977e-06, + "loss": 0.6621, + "step": 11982 + }, + { + "epoch": 1.79, + "grad_norm": 0.9508405700665794, + "learning_rate": 1.4884663508936864e-06, + "loss": 0.6745, + "step": 11983 + }, + { + "epoch": 1.79, + "grad_norm": 1.8519669406682906, + "learning_rate": 1.4883820502195754e-06, + "loss": 0.6654, + "step": 11984 + }, + { + "epoch": 1.79, + "grad_norm": 1.4938402470553358, + "learning_rate": 1.4882977449873517e-06, + "loss": 0.6608, + "step": 11985 + }, + { + "epoch": 1.79, + "grad_norm": 0.8335153563970676, + "learning_rate": 1.4882134351978018e-06, + "loss": 0.6589, + "step": 11986 + }, + { + "epoch": 1.79, + "grad_norm": 0.912249496261774, + "learning_rate": 1.4881291208517128e-06, + "loss": 0.6953, + "step": 11987 + }, + { + "epoch": 1.79, + "grad_norm": 3.2411814978501865, + "learning_rate": 1.4880448019498716e-06, + "loss": 0.6699, + "step": 11988 + }, + { + "epoch": 1.79, + "grad_norm": 0.9393804363021436, + "learning_rate": 1.4879604784930648e-06, + "loss": 0.6589, + "step": 11989 + }, + { + "epoch": 1.79, + "grad_norm": 2.33063837628838, + "learning_rate": 1.48787615048208e-06, + "loss": 0.6322, + "step": 11990 + }, + { + "epoch": 1.79, + "grad_norm": 2.6698813749368258, + "learning_rate": 1.4877918179177043e-06, + "loss": 0.6608, + "step": 11991 + }, + { + "epoch": 1.79, + "grad_norm": 1.007833272815319, + "learning_rate": 1.4877074808007237e-06, + "loss": 0.6725, + "step": 11992 + }, + { + "epoch": 1.79, + "grad_norm": 2.079409185802692, + "learning_rate": 1.4876231391319266e-06, + "loss": 0.6862, + "step": 11993 + }, + { + "epoch": 1.79, + "grad_norm": 5.522861984935972, + "learning_rate": 1.4875387929120993e-06, + "loss": 0.6966, + "step": 11994 + }, + { + "epoch": 1.79, + "grad_norm": 1.665493787381403, + "learning_rate": 1.4874544421420295e-06, + "loss": 0.6758, + "step": 11995 + }, + { + "epoch": 1.79, + "grad_norm": 0.9832037110236159, + "learning_rate": 1.487370086822504e-06, + "loss": 0.6719, + "step": 11996 + }, + { + "epoch": 1.79, + "grad_norm": 3.745175891071176, + "learning_rate": 1.4872857269543103e-06, + "loss": 0.6504, + "step": 11997 + }, + { + "epoch": 1.79, + "grad_norm": 0.9569971894953369, + "learning_rate": 1.4872013625382365e-06, + "loss": 0.6536, + "step": 11998 + }, + { + "epoch": 1.79, + "grad_norm": 6.091938850330173, + "learning_rate": 1.4871169935750687e-06, + "loss": 0.6738, + "step": 11999 + }, + { + "epoch": 1.79, + "grad_norm": 2.373537300571694, + "learning_rate": 1.4870326200655954e-06, + "loss": 0.653, + "step": 12000 + }, + { + "epoch": 1.79, + "grad_norm": 2.240364764317561, + "learning_rate": 1.4869482420106031e-06, + "loss": 0.6497, + "step": 12001 + }, + { + "epoch": 1.79, + "grad_norm": 2.4210250605384394, + "learning_rate": 1.48686385941088e-06, + "loss": 0.6589, + "step": 12002 + }, + { + "epoch": 1.79, + "grad_norm": 2.674576143264329, + "learning_rate": 1.4867794722672137e-06, + "loss": 0.6211, + "step": 12003 + }, + { + "epoch": 1.79, + "grad_norm": 5.028672537297378, + "learning_rate": 1.4866950805803912e-06, + "loss": 0.6927, + "step": 12004 + }, + { + "epoch": 1.79, + "grad_norm": 1.240410854456695, + "learning_rate": 1.4866106843512003e-06, + "loss": 0.6491, + "step": 12005 + }, + { + "epoch": 1.79, + "grad_norm": 2.129549207176684, + "learning_rate": 1.4865262835804294e-06, + "loss": 0.6439, + "step": 12006 + }, + { + "epoch": 1.79, + "grad_norm": 2.201149173830624, + "learning_rate": 1.4864418782688651e-06, + "loss": 0.6875, + "step": 12007 + }, + { + "epoch": 1.79, + "grad_norm": 3.234432726050111, + "learning_rate": 1.4863574684172962e-06, + "loss": 0.6777, + "step": 12008 + }, + { + "epoch": 1.79, + "grad_norm": 2.5295374231769885, + "learning_rate": 1.4862730540265097e-06, + "loss": 0.6934, + "step": 12009 + }, + { + "epoch": 1.79, + "grad_norm": 1.136019167186263, + "learning_rate": 1.4861886350972938e-06, + "loss": 0.6458, + "step": 12010 + }, + { + "epoch": 1.79, + "grad_norm": 2.494628698553357, + "learning_rate": 1.4861042116304369e-06, + "loss": 0.7057, + "step": 12011 + }, + { + "epoch": 1.79, + "grad_norm": 1.2393610969109363, + "learning_rate": 1.486019783626726e-06, + "loss": 0.6543, + "step": 12012 + }, + { + "epoch": 1.79, + "grad_norm": 2.4196667334079556, + "learning_rate": 1.4859353510869493e-06, + "loss": 0.6497, + "step": 12013 + }, + { + "epoch": 1.79, + "grad_norm": 2.367693736681806, + "learning_rate": 1.4858509140118954e-06, + "loss": 0.668, + "step": 12014 + }, + { + "epoch": 1.79, + "grad_norm": 1.160136743689239, + "learning_rate": 1.4857664724023516e-06, + "loss": 0.6816, + "step": 12015 + }, + { + "epoch": 1.79, + "grad_norm": 2.6613262618526314, + "learning_rate": 1.4856820262591064e-06, + "loss": 0.6556, + "step": 12016 + }, + { + "epoch": 1.79, + "grad_norm": 1.185994143700659, + "learning_rate": 1.485597575582948e-06, + "loss": 0.6764, + "step": 12017 + }, + { + "epoch": 1.79, + "grad_norm": 1.0800600741311772, + "learning_rate": 1.4855131203746644e-06, + "loss": 0.6725, + "step": 12018 + }, + { + "epoch": 1.79, + "grad_norm": 1.0278047482699126, + "learning_rate": 1.485428660635044e-06, + "loss": 0.6745, + "step": 12019 + }, + { + "epoch": 1.79, + "grad_norm": 2.3849303469875056, + "learning_rate": 1.4853441963648747e-06, + "loss": 0.6699, + "step": 12020 + }, + { + "epoch": 1.79, + "grad_norm": 3.387428588705207, + "learning_rate": 1.4852597275649455e-06, + "loss": 0.6576, + "step": 12021 + }, + { + "epoch": 1.79, + "grad_norm": 3.9308607650350296, + "learning_rate": 1.4851752542360444e-06, + "loss": 0.6966, + "step": 12022 + }, + { + "epoch": 1.79, + "grad_norm": 6.225661596601007, + "learning_rate": 1.4850907763789593e-06, + "loss": 0.6491, + "step": 12023 + }, + { + "epoch": 1.79, + "grad_norm": 1.2976335390351637, + "learning_rate": 1.4850062939944794e-06, + "loss": 0.6654, + "step": 12024 + }, + { + "epoch": 1.79, + "grad_norm": 2.4377949146033235, + "learning_rate": 1.484921807083393e-06, + "loss": 0.6582, + "step": 12025 + }, + { + "epoch": 1.79, + "grad_norm": 1.7870910314415351, + "learning_rate": 1.484837315646488e-06, + "loss": 0.6875, + "step": 12026 + }, + { + "epoch": 1.79, + "grad_norm": 2.4966881962542473, + "learning_rate": 1.484752819684554e-06, + "loss": 0.6719, + "step": 12027 + }, + { + "epoch": 1.79, + "grad_norm": 1.3562010912854268, + "learning_rate": 1.4846683191983786e-06, + "loss": 0.6823, + "step": 12028 + }, + { + "epoch": 1.79, + "grad_norm": 3.2469826365851087, + "learning_rate": 1.484583814188751e-06, + "loss": 0.6953, + "step": 12029 + }, + { + "epoch": 1.79, + "grad_norm": 3.057389720129214, + "learning_rate": 1.48449930465646e-06, + "loss": 0.6465, + "step": 12030 + }, + { + "epoch": 1.79, + "grad_norm": 4.3605248765154725, + "learning_rate": 1.4844147906022942e-06, + "loss": 0.6504, + "step": 12031 + }, + { + "epoch": 1.79, + "grad_norm": 1.1283479630993243, + "learning_rate": 1.484330272027042e-06, + "loss": 0.7005, + "step": 12032 + }, + { + "epoch": 1.79, + "grad_norm": 4.198378825586257, + "learning_rate": 1.4842457489314928e-06, + "loss": 0.6641, + "step": 12033 + }, + { + "epoch": 1.79, + "grad_norm": 1.535630108056997, + "learning_rate": 1.4841612213164352e-06, + "loss": 0.6771, + "step": 12034 + }, + { + "epoch": 1.79, + "grad_norm": 3.1918674633595243, + "learning_rate": 1.484076689182658e-06, + "loss": 0.6882, + "step": 12035 + }, + { + "epoch": 1.8, + "grad_norm": 1.6923248449950044, + "learning_rate": 1.4839921525309501e-06, + "loss": 0.6569, + "step": 12036 + }, + { + "epoch": 1.8, + "grad_norm": 1.7263510436763516, + "learning_rate": 1.483907611362101e-06, + "loss": 0.6647, + "step": 12037 + }, + { + "epoch": 1.8, + "grad_norm": 5.527591517269369, + "learning_rate": 1.483823065676899e-06, + "loss": 0.6901, + "step": 12038 + }, + { + "epoch": 1.8, + "grad_norm": 1.1748680736652997, + "learning_rate": 1.4837385154761339e-06, + "loss": 0.6309, + "step": 12039 + }, + { + "epoch": 1.8, + "grad_norm": 1.211742313624715, + "learning_rate": 1.4836539607605942e-06, + "loss": 0.7161, + "step": 12040 + }, + { + "epoch": 1.8, + "grad_norm": 1.2586307455049432, + "learning_rate": 1.4835694015310695e-06, + "loss": 0.6654, + "step": 12041 + }, + { + "epoch": 1.8, + "grad_norm": 1.1560704310785046, + "learning_rate": 1.4834848377883486e-06, + "loss": 0.6862, + "step": 12042 + }, + { + "epoch": 1.8, + "grad_norm": 1.090365496487219, + "learning_rate": 1.483400269533221e-06, + "loss": 0.6927, + "step": 12043 + }, + { + "epoch": 1.8, + "grad_norm": 4.769488322022434, + "learning_rate": 1.4833156967664761e-06, + "loss": 0.7031, + "step": 12044 + }, + { + "epoch": 1.8, + "grad_norm": 3.4768180923360186, + "learning_rate": 1.4832311194889031e-06, + "loss": 0.6901, + "step": 12045 + }, + { + "epoch": 1.8, + "grad_norm": 4.344699181391958, + "learning_rate": 1.483146537701291e-06, + "loss": 0.6471, + "step": 12046 + }, + { + "epoch": 1.8, + "grad_norm": 0.9860854267011161, + "learning_rate": 1.4830619514044298e-06, + "loss": 0.6751, + "step": 12047 + }, + { + "epoch": 1.8, + "grad_norm": 1.7822320815597124, + "learning_rate": 1.4829773605991088e-06, + "loss": 0.6816, + "step": 12048 + }, + { + "epoch": 1.8, + "grad_norm": 0.862195790069949, + "learning_rate": 1.4828927652861169e-06, + "loss": 0.6523, + "step": 12049 + }, + { + "epoch": 1.8, + "grad_norm": 1.3209114882052007, + "learning_rate": 1.4828081654662445e-06, + "loss": 0.6797, + "step": 12050 + }, + { + "epoch": 1.8, + "grad_norm": 1.6449368236549404, + "learning_rate": 1.4827235611402808e-06, + "loss": 0.6497, + "step": 12051 + }, + { + "epoch": 1.8, + "grad_norm": 2.21767672022918, + "learning_rate": 1.4826389523090153e-06, + "loss": 0.6602, + "step": 12052 + }, + { + "epoch": 1.8, + "grad_norm": 2.2069514686835694, + "learning_rate": 1.4825543389732378e-06, + "loss": 0.6764, + "step": 12053 + }, + { + "epoch": 1.8, + "grad_norm": 1.3727050722865604, + "learning_rate": 1.482469721133738e-06, + "loss": 0.6283, + "step": 12054 + }, + { + "epoch": 1.8, + "grad_norm": 2.5999451010325103, + "learning_rate": 1.4823850987913053e-06, + "loss": 0.6836, + "step": 12055 + }, + { + "epoch": 1.8, + "grad_norm": 4.21899868413518, + "learning_rate": 1.4823004719467303e-06, + "loss": 0.6471, + "step": 12056 + }, + { + "epoch": 1.8, + "grad_norm": 1.4413765907761096, + "learning_rate": 1.482215840600802e-06, + "loss": 0.6556, + "step": 12057 + }, + { + "epoch": 1.8, + "grad_norm": 2.0481492200495865, + "learning_rate": 1.4821312047543107e-06, + "loss": 0.6745, + "step": 12058 + }, + { + "epoch": 1.8, + "grad_norm": 3.0714746383232523, + "learning_rate": 1.4820465644080464e-06, + "loss": 0.6882, + "step": 12059 + }, + { + "epoch": 1.8, + "grad_norm": 3.379094100126444, + "learning_rate": 1.4819619195627987e-06, + "loss": 0.7298, + "step": 12060 + }, + { + "epoch": 1.8, + "grad_norm": 3.975051649888666, + "learning_rate": 1.4818772702193581e-06, + "loss": 0.6393, + "step": 12061 + }, + { + "epoch": 1.8, + "grad_norm": 3.147403876144402, + "learning_rate": 1.4817926163785139e-06, + "loss": 0.7005, + "step": 12062 + }, + { + "epoch": 1.8, + "grad_norm": 3.624780684416257, + "learning_rate": 1.4817079580410567e-06, + "loss": 0.6413, + "step": 12063 + }, + { + "epoch": 1.8, + "grad_norm": 2.2053974305261383, + "learning_rate": 1.4816232952077768e-06, + "loss": 0.6868, + "step": 12064 + }, + { + "epoch": 1.8, + "grad_norm": 2.3298791901292235, + "learning_rate": 1.4815386278794639e-06, + "loss": 0.6641, + "step": 12065 + }, + { + "epoch": 1.8, + "grad_norm": 1.0322979137397708, + "learning_rate": 1.4814539560569083e-06, + "loss": 0.6927, + "step": 12066 + }, + { + "epoch": 1.8, + "grad_norm": 2.1750476045358127, + "learning_rate": 1.4813692797409005e-06, + "loss": 0.6654, + "step": 12067 + }, + { + "epoch": 1.8, + "grad_norm": 2.4525021185984284, + "learning_rate": 1.4812845989322304e-06, + "loss": 0.6738, + "step": 12068 + }, + { + "epoch": 1.8, + "grad_norm": 2.079207544803609, + "learning_rate": 1.4811999136316891e-06, + "loss": 0.6901, + "step": 12069 + }, + { + "epoch": 1.8, + "grad_norm": 2.74949297989608, + "learning_rate": 1.4811152238400658e-06, + "loss": 0.696, + "step": 12070 + }, + { + "epoch": 1.8, + "grad_norm": 1.6383091072598408, + "learning_rate": 1.481030529558152e-06, + "loss": 0.6947, + "step": 12071 + }, + { + "epoch": 1.8, + "grad_norm": 1.0153805356595655, + "learning_rate": 1.4809458307867376e-06, + "loss": 0.6849, + "step": 12072 + }, + { + "epoch": 1.8, + "grad_norm": 1.0240033174352305, + "learning_rate": 1.480861127526613e-06, + "loss": 0.6751, + "step": 12073 + }, + { + "epoch": 1.8, + "grad_norm": 1.7785613836231444, + "learning_rate": 1.4807764197785696e-06, + "loss": 0.6647, + "step": 12074 + }, + { + "epoch": 1.8, + "grad_norm": 1.4210120584879078, + "learning_rate": 1.480691707543397e-06, + "loss": 0.6641, + "step": 12075 + }, + { + "epoch": 1.8, + "grad_norm": 4.254986962675533, + "learning_rate": 1.4806069908218858e-06, + "loss": 0.6465, + "step": 12076 + }, + { + "epoch": 1.8, + "grad_norm": 3.734806698445326, + "learning_rate": 1.4805222696148278e-06, + "loss": 0.6413, + "step": 12077 + }, + { + "epoch": 1.8, + "grad_norm": 7.128206295877615, + "learning_rate": 1.4804375439230126e-06, + "loss": 0.709, + "step": 12078 + }, + { + "epoch": 1.8, + "grad_norm": 0.8618183135244816, + "learning_rate": 1.480352813747231e-06, + "loss": 0.6589, + "step": 12079 + }, + { + "epoch": 1.8, + "grad_norm": 3.073158137652573, + "learning_rate": 1.4802680790882746e-06, + "loss": 0.6914, + "step": 12080 + }, + { + "epoch": 1.8, + "grad_norm": 5.782532061526765, + "learning_rate": 1.4801833399469335e-06, + "loss": 0.6556, + "step": 12081 + }, + { + "epoch": 1.8, + "grad_norm": 1.2069460998011066, + "learning_rate": 1.480098596323999e-06, + "loss": 0.6829, + "step": 12082 + }, + { + "epoch": 1.8, + "grad_norm": 8.135164883892065, + "learning_rate": 1.4800138482202615e-06, + "loss": 0.6693, + "step": 12083 + }, + { + "epoch": 1.8, + "grad_norm": 2.7114235366014676, + "learning_rate": 1.4799290956365126e-06, + "loss": 0.6712, + "step": 12084 + }, + { + "epoch": 1.8, + "grad_norm": 1.6565517245006345, + "learning_rate": 1.4798443385735429e-06, + "loss": 0.6699, + "step": 12085 + }, + { + "epoch": 1.8, + "grad_norm": 0.8630527915489836, + "learning_rate": 1.4797595770321434e-06, + "loss": 0.666, + "step": 12086 + }, + { + "epoch": 1.8, + "grad_norm": 1.0392027668948636, + "learning_rate": 1.4796748110131057e-06, + "loss": 0.6699, + "step": 12087 + }, + { + "epoch": 1.8, + "grad_norm": 3.841498083050827, + "learning_rate": 1.4795900405172204e-06, + "loss": 0.6491, + "step": 12088 + }, + { + "epoch": 1.8, + "grad_norm": 1.0568916610905608, + "learning_rate": 1.4795052655452786e-06, + "loss": 0.6589, + "step": 12089 + }, + { + "epoch": 1.8, + "grad_norm": 1.0238291206712873, + "learning_rate": 1.4794204860980722e-06, + "loss": 0.6771, + "step": 12090 + }, + { + "epoch": 1.8, + "grad_norm": 1.0105076454482964, + "learning_rate": 1.4793357021763917e-06, + "loss": 0.6517, + "step": 12091 + }, + { + "epoch": 1.8, + "grad_norm": 1.9866690989813218, + "learning_rate": 1.4792509137810286e-06, + "loss": 0.6602, + "step": 12092 + }, + { + "epoch": 1.8, + "grad_norm": 3.671781150564738, + "learning_rate": 1.4791661209127749e-06, + "loss": 0.6712, + "step": 12093 + }, + { + "epoch": 1.8, + "grad_norm": 4.357963706075764, + "learning_rate": 1.479081323572421e-06, + "loss": 0.6829, + "step": 12094 + }, + { + "epoch": 1.8, + "grad_norm": 5.73719391401932, + "learning_rate": 1.4789965217607585e-06, + "loss": 0.6953, + "step": 12095 + }, + { + "epoch": 1.8, + "grad_norm": 5.786371495110824, + "learning_rate": 1.4789117154785795e-06, + "loss": 0.6589, + "step": 12096 + }, + { + "epoch": 1.8, + "grad_norm": 3.514759137973322, + "learning_rate": 1.478826904726675e-06, + "loss": 0.6549, + "step": 12097 + }, + { + "epoch": 1.8, + "grad_norm": 2.5282746260113123, + "learning_rate": 1.4787420895058367e-06, + "loss": 0.6589, + "step": 12098 + }, + { + "epoch": 1.8, + "grad_norm": 1.3984319703028918, + "learning_rate": 1.478657269816856e-06, + "loss": 0.6413, + "step": 12099 + }, + { + "epoch": 1.8, + "grad_norm": 2.4458211801538416, + "learning_rate": 1.4785724456605246e-06, + "loss": 0.6647, + "step": 12100 + }, + { + "epoch": 1.8, + "grad_norm": 1.9183751410198273, + "learning_rate": 1.4784876170376345e-06, + "loss": 0.6699, + "step": 12101 + }, + { + "epoch": 1.8, + "grad_norm": 2.0312018201784707, + "learning_rate": 1.478402783948977e-06, + "loss": 0.6634, + "step": 12102 + }, + { + "epoch": 1.81, + "grad_norm": 3.0991990357266115, + "learning_rate": 1.4783179463953441e-06, + "loss": 0.6647, + "step": 12103 + }, + { + "epoch": 1.81, + "grad_norm": 7.253145165706257, + "learning_rate": 1.4782331043775276e-06, + "loss": 0.7005, + "step": 12104 + }, + { + "epoch": 1.81, + "grad_norm": 4.20128048698092, + "learning_rate": 1.478148257896319e-06, + "loss": 0.6849, + "step": 12105 + }, + { + "epoch": 1.81, + "grad_norm": 3.7260406005823885, + "learning_rate": 1.4780634069525105e-06, + "loss": 0.6465, + "step": 12106 + }, + { + "epoch": 1.81, + "grad_norm": 3.436791249687259, + "learning_rate": 1.4779785515468941e-06, + "loss": 0.707, + "step": 12107 + }, + { + "epoch": 1.81, + "grad_norm": 3.454326363683667, + "learning_rate": 1.4778936916802614e-06, + "loss": 0.6934, + "step": 12108 + }, + { + "epoch": 1.81, + "grad_norm": 2.255444693185455, + "learning_rate": 1.4778088273534046e-06, + "loss": 0.694, + "step": 12109 + }, + { + "epoch": 1.81, + "grad_norm": 2.2973159525892775, + "learning_rate": 1.477723958567116e-06, + "loss": 0.709, + "step": 12110 + }, + { + "epoch": 1.81, + "grad_norm": 3.921759707065492, + "learning_rate": 1.4776390853221875e-06, + "loss": 0.6569, + "step": 12111 + }, + { + "epoch": 1.81, + "grad_norm": 1.5795433265886647, + "learning_rate": 1.4775542076194107e-06, + "loss": 0.7077, + "step": 12112 + }, + { + "epoch": 1.81, + "grad_norm": 1.4418085001371916, + "learning_rate": 1.4774693254595785e-06, + "loss": 0.6901, + "step": 12113 + }, + { + "epoch": 1.81, + "grad_norm": 4.863198017597598, + "learning_rate": 1.4773844388434832e-06, + "loss": 0.6777, + "step": 12114 + }, + { + "epoch": 1.81, + "grad_norm": 0.945930710722776, + "learning_rate": 1.4772995477719161e-06, + "loss": 0.6569, + "step": 12115 + }, + { + "epoch": 1.81, + "grad_norm": 4.006512964537014, + "learning_rate": 1.4772146522456705e-06, + "loss": 0.7096, + "step": 12116 + }, + { + "epoch": 1.81, + "grad_norm": 3.0270345653627104, + "learning_rate": 1.4771297522655384e-06, + "loss": 0.6315, + "step": 12117 + }, + { + "epoch": 1.81, + "grad_norm": 1.7565337862653394, + "learning_rate": 1.4770448478323117e-06, + "loss": 0.6855, + "step": 12118 + }, + { + "epoch": 1.81, + "grad_norm": 2.1650199410588633, + "learning_rate": 1.4769599389467837e-06, + "loss": 0.6836, + "step": 12119 + }, + { + "epoch": 1.81, + "grad_norm": 3.389694949563129, + "learning_rate": 1.4768750256097461e-06, + "loss": 0.6908, + "step": 12120 + }, + { + "epoch": 1.81, + "grad_norm": 4.9768923282600275, + "learning_rate": 1.476790107821992e-06, + "loss": 0.696, + "step": 12121 + }, + { + "epoch": 1.81, + "grad_norm": 2.365769173749654, + "learning_rate": 1.4767051855843137e-06, + "loss": 0.6921, + "step": 12122 + }, + { + "epoch": 1.81, + "grad_norm": 0.9609206914394944, + "learning_rate": 1.4766202588975034e-06, + "loss": 0.6797, + "step": 12123 + }, + { + "epoch": 1.81, + "grad_norm": 1.9211501134061997, + "learning_rate": 1.4765353277623541e-06, + "loss": 0.6764, + "step": 12124 + }, + { + "epoch": 1.81, + "grad_norm": 0.9073307480461267, + "learning_rate": 1.4764503921796587e-06, + "loss": 0.666, + "step": 12125 + }, + { + "epoch": 1.81, + "grad_norm": 1.3909902388208037, + "learning_rate": 1.4763654521502096e-06, + "loss": 0.6882, + "step": 12126 + }, + { + "epoch": 1.81, + "grad_norm": 1.5797220300510944, + "learning_rate": 1.4762805076747995e-06, + "loss": 0.6953, + "step": 12127 + }, + { + "epoch": 1.81, + "grad_norm": 1.417801973574752, + "learning_rate": 1.4761955587542211e-06, + "loss": 0.6595, + "step": 12128 + }, + { + "epoch": 1.81, + "grad_norm": 3.799895791340954, + "learning_rate": 1.4761106053892675e-06, + "loss": 0.6764, + "step": 12129 + }, + { + "epoch": 1.81, + "grad_norm": 1.697266161024543, + "learning_rate": 1.476025647580732e-06, + "loss": 0.6647, + "step": 12130 + }, + { + "epoch": 1.81, + "grad_norm": 1.78821732819301, + "learning_rate": 1.4759406853294067e-06, + "loss": 0.6556, + "step": 12131 + }, + { + "epoch": 1.81, + "grad_norm": 5.397227737310146, + "learning_rate": 1.475855718636085e-06, + "loss": 0.6784, + "step": 12132 + }, + { + "epoch": 1.81, + "grad_norm": 3.390438664213853, + "learning_rate": 1.4757707475015598e-06, + "loss": 0.6758, + "step": 12133 + }, + { + "epoch": 1.81, + "grad_norm": 3.904876288192505, + "learning_rate": 1.4756857719266238e-06, + "loss": 0.6725, + "step": 12134 + }, + { + "epoch": 1.81, + "grad_norm": 0.7405178273408124, + "learning_rate": 1.4756007919120708e-06, + "loss": 0.6628, + "step": 12135 + }, + { + "epoch": 1.81, + "grad_norm": 1.718702543480057, + "learning_rate": 1.4755158074586935e-06, + "loss": 0.6641, + "step": 12136 + }, + { + "epoch": 1.81, + "grad_norm": 1.260811776941801, + "learning_rate": 1.4754308185672847e-06, + "loss": 0.6914, + "step": 12137 + }, + { + "epoch": 1.81, + "grad_norm": 3.3877057513181206, + "learning_rate": 1.4753458252386386e-06, + "loss": 0.6771, + "step": 12138 + }, + { + "epoch": 1.81, + "grad_norm": 1.2036072382111562, + "learning_rate": 1.4752608274735475e-06, + "loss": 0.6543, + "step": 12139 + }, + { + "epoch": 1.81, + "grad_norm": 1.1774586495621202, + "learning_rate": 1.4751758252728053e-06, + "loss": 0.666, + "step": 12140 + }, + { + "epoch": 1.81, + "grad_norm": 0.7635173276552129, + "learning_rate": 1.4750908186372051e-06, + "loss": 0.6732, + "step": 12141 + }, + { + "epoch": 1.81, + "grad_norm": 3.044479602441556, + "learning_rate": 1.47500580756754e-06, + "loss": 0.6751, + "step": 12142 + }, + { + "epoch": 1.81, + "grad_norm": 2.871094779639037, + "learning_rate": 1.4749207920646041e-06, + "loss": 0.6914, + "step": 12143 + }, + { + "epoch": 1.81, + "grad_norm": 4.125580134644548, + "learning_rate": 1.47483577212919e-06, + "loss": 0.6602, + "step": 12144 + }, + { + "epoch": 1.81, + "grad_norm": 0.839178627595194, + "learning_rate": 1.474750747762092e-06, + "loss": 0.666, + "step": 12145 + }, + { + "epoch": 1.81, + "grad_norm": 2.549270324803005, + "learning_rate": 1.4746657189641033e-06, + "loss": 0.6849, + "step": 12146 + }, + { + "epoch": 1.81, + "grad_norm": 3.348687802285327, + "learning_rate": 1.4745806857360172e-06, + "loss": 0.6947, + "step": 12147 + }, + { + "epoch": 1.81, + "grad_norm": 0.9772660671053742, + "learning_rate": 1.4744956480786277e-06, + "loss": 0.6556, + "step": 12148 + }, + { + "epoch": 1.81, + "grad_norm": 0.860417906176917, + "learning_rate": 1.4744106059927284e-06, + "loss": 0.6745, + "step": 12149 + }, + { + "epoch": 1.81, + "grad_norm": 1.9137155883275994, + "learning_rate": 1.4743255594791128e-06, + "loss": 0.6732, + "step": 12150 + }, + { + "epoch": 1.81, + "grad_norm": 2.3580256870255814, + "learning_rate": 1.474240508538575e-06, + "loss": 0.6426, + "step": 12151 + }, + { + "epoch": 1.81, + "grad_norm": 5.912440459094422, + "learning_rate": 1.4741554531719084e-06, + "loss": 0.6914, + "step": 12152 + }, + { + "epoch": 1.81, + "grad_norm": 3.2345847423108234, + "learning_rate": 1.4740703933799073e-06, + "loss": 0.6452, + "step": 12153 + }, + { + "epoch": 1.81, + "grad_norm": 2.3879155747501093, + "learning_rate": 1.4739853291633654e-06, + "loss": 0.6556, + "step": 12154 + }, + { + "epoch": 1.81, + "grad_norm": 2.7484793168821167, + "learning_rate": 1.473900260523076e-06, + "loss": 0.6816, + "step": 12155 + }, + { + "epoch": 1.81, + "grad_norm": 1.4095066919642023, + "learning_rate": 1.473815187459834e-06, + "loss": 0.6738, + "step": 12156 + }, + { + "epoch": 1.81, + "grad_norm": 1.059152299539841, + "learning_rate": 1.4737301099744327e-06, + "loss": 0.6569, + "step": 12157 + }, + { + "epoch": 1.81, + "grad_norm": 1.6900554464715765, + "learning_rate": 1.4736450280676663e-06, + "loss": 0.6628, + "step": 12158 + }, + { + "epoch": 1.81, + "grad_norm": 1.1089321977601845, + "learning_rate": 1.473559941740329e-06, + "loss": 0.6777, + "step": 12159 + }, + { + "epoch": 1.81, + "grad_norm": 2.8425282223862016, + "learning_rate": 1.4734748509932154e-06, + "loss": 0.6771, + "step": 12160 + }, + { + "epoch": 1.81, + "grad_norm": 3.5144857769478457, + "learning_rate": 1.4733897558271185e-06, + "loss": 0.6615, + "step": 12161 + }, + { + "epoch": 1.81, + "grad_norm": 4.789872250728702, + "learning_rate": 1.4733046562428334e-06, + "loss": 0.6895, + "step": 12162 + }, + { + "epoch": 1.81, + "grad_norm": 5.801981314157978, + "learning_rate": 1.4732195522411542e-06, + "loss": 0.6953, + "step": 12163 + }, + { + "epoch": 1.81, + "grad_norm": 4.081616219948454, + "learning_rate": 1.473134443822875e-06, + "loss": 0.6725, + "step": 12164 + }, + { + "epoch": 1.81, + "grad_norm": 1.0539212426823057, + "learning_rate": 1.47304933098879e-06, + "loss": 0.6452, + "step": 12165 + }, + { + "epoch": 1.81, + "grad_norm": 1.1309284702340994, + "learning_rate": 1.472964213739694e-06, + "loss": 0.6738, + "step": 12166 + }, + { + "epoch": 1.81, + "grad_norm": 2.8918748861272716, + "learning_rate": 1.4728790920763809e-06, + "loss": 0.6738, + "step": 12167 + }, + { + "epoch": 1.81, + "grad_norm": 1.027236049848507, + "learning_rate": 1.4727939659996456e-06, + "loss": 0.6419, + "step": 12168 + }, + { + "epoch": 1.81, + "grad_norm": 2.6322579716937002, + "learning_rate": 1.4727088355102824e-06, + "loss": 0.6712, + "step": 12169 + }, + { + "epoch": 1.82, + "grad_norm": 1.172044023820262, + "learning_rate": 1.4726237006090858e-06, + "loss": 0.6602, + "step": 12170 + }, + { + "epoch": 1.82, + "grad_norm": 1.1857720999610357, + "learning_rate": 1.4725385612968504e-06, + "loss": 0.6387, + "step": 12171 + }, + { + "epoch": 1.82, + "grad_norm": 3.910893105954159, + "learning_rate": 1.4724534175743707e-06, + "loss": 0.6973, + "step": 12172 + }, + { + "epoch": 1.82, + "grad_norm": 1.0302188312419693, + "learning_rate": 1.472368269442442e-06, + "loss": 0.6602, + "step": 12173 + }, + { + "epoch": 1.82, + "grad_norm": 3.3499746861111688, + "learning_rate": 1.4722831169018578e-06, + "loss": 0.6771, + "step": 12174 + }, + { + "epoch": 1.82, + "grad_norm": 2.228667406531583, + "learning_rate": 1.472197959953414e-06, + "loss": 0.651, + "step": 12175 + }, + { + "epoch": 1.82, + "grad_norm": 3.2494666653183666, + "learning_rate": 1.4721127985979048e-06, + "loss": 0.6855, + "step": 12176 + }, + { + "epoch": 1.82, + "grad_norm": 1.0256695609052624, + "learning_rate": 1.472027632836125e-06, + "loss": 0.6751, + "step": 12177 + }, + { + "epoch": 1.82, + "grad_norm": 3.2686818902262575, + "learning_rate": 1.4719424626688692e-06, + "loss": 0.6901, + "step": 12178 + }, + { + "epoch": 1.82, + "grad_norm": 1.7638420662760224, + "learning_rate": 1.471857288096933e-06, + "loss": 0.6484, + "step": 12179 + }, + { + "epoch": 1.82, + "grad_norm": 2.8902161731991245, + "learning_rate": 1.471772109121111e-06, + "loss": 0.6517, + "step": 12180 + }, + { + "epoch": 1.82, + "grad_norm": 1.0971918533759857, + "learning_rate": 1.4716869257421981e-06, + "loss": 0.6771, + "step": 12181 + }, + { + "epoch": 1.82, + "grad_norm": 1.2641619467238285, + "learning_rate": 1.4716017379609897e-06, + "loss": 0.6589, + "step": 12182 + }, + { + "epoch": 1.82, + "grad_norm": 2.617203794901538, + "learning_rate": 1.4715165457782803e-06, + "loss": 0.625, + "step": 12183 + }, + { + "epoch": 1.82, + "grad_norm": 4.855183586402882, + "learning_rate": 1.4714313491948653e-06, + "loss": 0.6849, + "step": 12184 + }, + { + "epoch": 1.82, + "grad_norm": 2.176997201596278, + "learning_rate": 1.4713461482115398e-06, + "loss": 0.6582, + "step": 12185 + }, + { + "epoch": 1.82, + "grad_norm": 1.8390437711344654, + "learning_rate": 1.4712609428290991e-06, + "loss": 0.6654, + "step": 12186 + }, + { + "epoch": 1.82, + "grad_norm": 2.4989655468828778, + "learning_rate": 1.4711757330483382e-06, + "loss": 0.6549, + "step": 12187 + }, + { + "epoch": 1.82, + "grad_norm": 1.4311979515035345, + "learning_rate": 1.4710905188700529e-06, + "loss": 0.6745, + "step": 12188 + }, + { + "epoch": 1.82, + "grad_norm": 2.853219259148362, + "learning_rate": 1.4710053002950378e-06, + "loss": 0.6283, + "step": 12189 + }, + { + "epoch": 1.82, + "grad_norm": 2.9794625230196337, + "learning_rate": 1.4709200773240884e-06, + "loss": 0.6836, + "step": 12190 + }, + { + "epoch": 1.82, + "grad_norm": 1.159720281121457, + "learning_rate": 1.4708348499580007e-06, + "loss": 0.696, + "step": 12191 + }, + { + "epoch": 1.82, + "grad_norm": 1.5940219423748354, + "learning_rate": 1.4707496181975693e-06, + "loss": 0.668, + "step": 12192 + }, + { + "epoch": 1.82, + "grad_norm": 1.130476090101031, + "learning_rate": 1.4706643820435906e-06, + "loss": 0.6921, + "step": 12193 + }, + { + "epoch": 1.82, + "grad_norm": 1.3911841794676711, + "learning_rate": 1.4705791414968591e-06, + "loss": 0.7044, + "step": 12194 + }, + { + "epoch": 1.82, + "grad_norm": 1.8973876494332793, + "learning_rate": 1.470493896558171e-06, + "loss": 0.6309, + "step": 12195 + }, + { + "epoch": 1.82, + "grad_norm": 1.0362471854955593, + "learning_rate": 1.4704086472283218e-06, + "loss": 0.6712, + "step": 12196 + }, + { + "epoch": 1.82, + "grad_norm": 4.3840373410360005, + "learning_rate": 1.470323393508107e-06, + "loss": 0.709, + "step": 12197 + }, + { + "epoch": 1.82, + "grad_norm": 0.9443160364712853, + "learning_rate": 1.4702381353983228e-06, + "loss": 0.6549, + "step": 12198 + }, + { + "epoch": 1.82, + "grad_norm": 1.1367359528477212, + "learning_rate": 1.470152872899764e-06, + "loss": 0.6986, + "step": 12199 + }, + { + "epoch": 1.82, + "grad_norm": 1.9084199961328143, + "learning_rate": 1.4700676060132272e-06, + "loss": 0.7044, + "step": 12200 + }, + { + "epoch": 1.82, + "grad_norm": 3.66145692107494, + "learning_rate": 1.4699823347395078e-06, + "loss": 0.6582, + "step": 12201 + }, + { + "epoch": 1.82, + "grad_norm": 0.9148642627878733, + "learning_rate": 1.4698970590794016e-06, + "loss": 0.679, + "step": 12202 + }, + { + "epoch": 1.82, + "grad_norm": 1.6634795325880574, + "learning_rate": 1.4698117790337044e-06, + "loss": 0.6536, + "step": 12203 + }, + { + "epoch": 1.82, + "grad_norm": 1.9173252246704615, + "learning_rate": 1.4697264946032127e-06, + "loss": 0.6771, + "step": 12204 + }, + { + "epoch": 1.82, + "grad_norm": 2.3895436310578657, + "learning_rate": 1.4696412057887217e-06, + "loss": 0.6771, + "step": 12205 + }, + { + "epoch": 1.82, + "grad_norm": 1.1138592939688303, + "learning_rate": 1.4695559125910279e-06, + "loss": 0.6855, + "step": 12206 + }, + { + "epoch": 1.82, + "grad_norm": 1.334869881853277, + "learning_rate": 1.4694706150109275e-06, + "loss": 0.6979, + "step": 12207 + }, + { + "epoch": 1.82, + "grad_norm": 0.907297350920222, + "learning_rate": 1.469385313049216e-06, + "loss": 0.653, + "step": 12208 + }, + { + "epoch": 1.82, + "grad_norm": 6.221501268253955, + "learning_rate": 1.4693000067066903e-06, + "loss": 0.679, + "step": 12209 + }, + { + "epoch": 1.82, + "grad_norm": 0.8822565425947434, + "learning_rate": 1.4692146959841459e-06, + "loss": 0.6595, + "step": 12210 + }, + { + "epoch": 1.82, + "grad_norm": 1.3576131880279723, + "learning_rate": 1.469129380882379e-06, + "loss": 0.6738, + "step": 12211 + }, + { + "epoch": 1.82, + "grad_norm": 2.525304043654753, + "learning_rate": 1.4690440614021864e-06, + "loss": 0.6445, + "step": 12212 + }, + { + "epoch": 1.82, + "grad_norm": 0.941410628340744, + "learning_rate": 1.468958737544364e-06, + "loss": 0.6504, + "step": 12213 + }, + { + "epoch": 1.82, + "grad_norm": 0.9968948514294721, + "learning_rate": 1.468873409309708e-06, + "loss": 0.6829, + "step": 12214 + }, + { + "epoch": 1.82, + "grad_norm": 1.1332532367814068, + "learning_rate": 1.4687880766990152e-06, + "loss": 0.6816, + "step": 12215 + }, + { + "epoch": 1.82, + "grad_norm": 2.407609600299468, + "learning_rate": 1.468702739713082e-06, + "loss": 0.6634, + "step": 12216 + }, + { + "epoch": 1.82, + "grad_norm": 2.3662197913481284, + "learning_rate": 1.4686173983527045e-06, + "loss": 0.6953, + "step": 12217 + }, + { + "epoch": 1.82, + "grad_norm": 5.959049000203143, + "learning_rate": 1.4685320526186794e-06, + "loss": 0.6706, + "step": 12218 + }, + { + "epoch": 1.82, + "grad_norm": 1.2558622971623978, + "learning_rate": 1.4684467025118036e-06, + "loss": 0.6784, + "step": 12219 + }, + { + "epoch": 1.82, + "grad_norm": 2.314623053860325, + "learning_rate": 1.4683613480328729e-06, + "loss": 0.6816, + "step": 12220 + }, + { + "epoch": 1.82, + "grad_norm": 1.1621680219816615, + "learning_rate": 1.468275989182684e-06, + "loss": 0.6751, + "step": 12221 + }, + { + "epoch": 1.82, + "grad_norm": 1.909326765735582, + "learning_rate": 1.4681906259620346e-06, + "loss": 0.6667, + "step": 12222 + }, + { + "epoch": 1.82, + "grad_norm": 1.829617225154419, + "learning_rate": 1.4681052583717202e-06, + "loss": 0.6888, + "step": 12223 + }, + { + "epoch": 1.82, + "grad_norm": 2.9059988359683264, + "learning_rate": 1.468019886412538e-06, + "loss": 0.6816, + "step": 12224 + }, + { + "epoch": 1.82, + "grad_norm": 3.7397905442518535, + "learning_rate": 1.4679345100852852e-06, + "loss": 0.6882, + "step": 12225 + }, + { + "epoch": 1.82, + "grad_norm": 1.2366698028507426, + "learning_rate": 1.4678491293907579e-06, + "loss": 0.6517, + "step": 12226 + }, + { + "epoch": 1.82, + "grad_norm": 1.8534758527083415, + "learning_rate": 1.4677637443297532e-06, + "loss": 0.6693, + "step": 12227 + }, + { + "epoch": 1.82, + "grad_norm": 1.9668255120387312, + "learning_rate": 1.4676783549030684e-06, + "loss": 0.653, + "step": 12228 + }, + { + "epoch": 1.82, + "grad_norm": 2.071400541038119, + "learning_rate": 1.4675929611115002e-06, + "loss": 0.6693, + "step": 12229 + }, + { + "epoch": 1.82, + "grad_norm": 1.11342802259629, + "learning_rate": 1.4675075629558454e-06, + "loss": 0.6712, + "step": 12230 + }, + { + "epoch": 1.82, + "grad_norm": 1.1589209291258762, + "learning_rate": 1.4674221604369011e-06, + "loss": 0.6784, + "step": 12231 + }, + { + "epoch": 1.82, + "grad_norm": 1.009006944092179, + "learning_rate": 1.4673367535554647e-06, + "loss": 0.6491, + "step": 12232 + }, + { + "epoch": 1.82, + "grad_norm": 2.633927288012724, + "learning_rate": 1.4672513423123328e-06, + "loss": 0.6641, + "step": 12233 + }, + { + "epoch": 1.82, + "grad_norm": 4.254568011408006, + "learning_rate": 1.4671659267083027e-06, + "loss": 0.6771, + "step": 12234 + }, + { + "epoch": 1.82, + "grad_norm": 2.2954071766926245, + "learning_rate": 1.467080506744172e-06, + "loss": 0.6908, + "step": 12235 + }, + { + "epoch": 1.82, + "grad_norm": 2.9167464565078394, + "learning_rate": 1.4669950824207376e-06, + "loss": 0.6758, + "step": 12236 + }, + { + "epoch": 1.83, + "grad_norm": 2.6500327236021017, + "learning_rate": 1.4669096537387964e-06, + "loss": 0.6706, + "step": 12237 + }, + { + "epoch": 1.83, + "grad_norm": 2.198619315240072, + "learning_rate": 1.4668242206991464e-06, + "loss": 0.6283, + "step": 12238 + }, + { + "epoch": 1.83, + "grad_norm": 2.9789931771083307, + "learning_rate": 1.4667387833025848e-06, + "loss": 0.653, + "step": 12239 + }, + { + "epoch": 1.83, + "grad_norm": 2.0864853020216017, + "learning_rate": 1.4666533415499085e-06, + "loss": 0.6667, + "step": 12240 + }, + { + "epoch": 1.83, + "grad_norm": 1.0736077071093415, + "learning_rate": 1.4665678954419157e-06, + "loss": 0.6751, + "step": 12241 + }, + { + "epoch": 1.83, + "grad_norm": 5.831118674920704, + "learning_rate": 1.4664824449794031e-06, + "loss": 0.7103, + "step": 12242 + }, + { + "epoch": 1.83, + "grad_norm": 1.2753920768910434, + "learning_rate": 1.466396990163169e-06, + "loss": 0.6855, + "step": 12243 + }, + { + "epoch": 1.83, + "grad_norm": 5.8362627683924595, + "learning_rate": 1.4663115309940103e-06, + "loss": 0.6582, + "step": 12244 + }, + { + "epoch": 1.83, + "grad_norm": 1.2584900440195037, + "learning_rate": 1.4662260674727247e-06, + "loss": 0.6654, + "step": 12245 + }, + { + "epoch": 1.83, + "grad_norm": 1.2374137859759051, + "learning_rate": 1.4661405996001102e-06, + "loss": 0.6771, + "step": 12246 + }, + { + "epoch": 1.83, + "grad_norm": 3.1004868496447404, + "learning_rate": 1.4660551273769642e-06, + "loss": 0.6751, + "step": 12247 + }, + { + "epoch": 1.83, + "grad_norm": 2.675647795311902, + "learning_rate": 1.4659696508040844e-06, + "loss": 0.6484, + "step": 12248 + }, + { + "epoch": 1.83, + "grad_norm": 2.925490344192749, + "learning_rate": 1.4658841698822688e-06, + "loss": 0.6686, + "step": 12249 + }, + { + "epoch": 1.83, + "grad_norm": 2.4638509049356916, + "learning_rate": 1.4657986846123147e-06, + "loss": 0.7116, + "step": 12250 + }, + { + "epoch": 1.83, + "grad_norm": 1.4608492902062735, + "learning_rate": 1.4657131949950208e-06, + "loss": 0.668, + "step": 12251 + }, + { + "epoch": 1.83, + "grad_norm": 5.236589086925721, + "learning_rate": 1.4656277010311842e-06, + "loss": 0.7005, + "step": 12252 + }, + { + "epoch": 1.83, + "grad_norm": 2.9295723547627053, + "learning_rate": 1.4655422027216028e-06, + "loss": 0.6719, + "step": 12253 + }, + { + "epoch": 1.83, + "grad_norm": 4.548901952606786, + "learning_rate": 1.4654567000670755e-06, + "loss": 0.6758, + "step": 12254 + }, + { + "epoch": 1.83, + "grad_norm": 2.126201609893276, + "learning_rate": 1.4653711930683992e-06, + "loss": 0.6823, + "step": 12255 + }, + { + "epoch": 1.83, + "grad_norm": 3.7035623455242157, + "learning_rate": 1.4652856817263723e-06, + "loss": 0.6745, + "step": 12256 + }, + { + "epoch": 1.83, + "grad_norm": 0.9090749107489317, + "learning_rate": 1.4652001660417934e-06, + "loss": 0.6901, + "step": 12257 + }, + { + "epoch": 1.83, + "grad_norm": 0.8784216109637932, + "learning_rate": 1.4651146460154598e-06, + "loss": 0.64, + "step": 12258 + }, + { + "epoch": 1.83, + "grad_norm": 2.6829338435562287, + "learning_rate": 1.4650291216481706e-06, + "loss": 0.6699, + "step": 12259 + }, + { + "epoch": 1.83, + "grad_norm": 2.1140652024051234, + "learning_rate": 1.464943592940723e-06, + "loss": 0.6484, + "step": 12260 + }, + { + "epoch": 1.83, + "grad_norm": 1.0845389910890069, + "learning_rate": 1.4648580598939157e-06, + "loss": 0.6855, + "step": 12261 + }, + { + "epoch": 1.83, + "grad_norm": 1.9531714655348753, + "learning_rate": 1.4647725225085474e-06, + "loss": 0.6777, + "step": 12262 + }, + { + "epoch": 1.83, + "grad_norm": 2.839548262876379, + "learning_rate": 1.4646869807854159e-06, + "loss": 0.6816, + "step": 12263 + }, + { + "epoch": 1.83, + "grad_norm": 2.607495674852962, + "learning_rate": 1.4646014347253194e-06, + "loss": 0.6621, + "step": 12264 + }, + { + "epoch": 1.83, + "grad_norm": 0.9102826487911456, + "learning_rate": 1.464515884329057e-06, + "loss": 0.6901, + "step": 12265 + }, + { + "epoch": 1.83, + "grad_norm": 4.718181247025702, + "learning_rate": 1.4644303295974267e-06, + "loss": 0.6895, + "step": 12266 + }, + { + "epoch": 1.83, + "grad_norm": 4.783370641702589, + "learning_rate": 1.464344770531227e-06, + "loss": 0.6719, + "step": 12267 + }, + { + "epoch": 1.83, + "grad_norm": 1.1337175361217111, + "learning_rate": 1.4642592071312563e-06, + "loss": 0.6484, + "step": 12268 + }, + { + "epoch": 1.83, + "grad_norm": 1.6965784994479907, + "learning_rate": 1.4641736393983137e-06, + "loss": 0.6549, + "step": 12269 + }, + { + "epoch": 1.83, + "grad_norm": 3.5307486254807428, + "learning_rate": 1.4640880673331975e-06, + "loss": 0.6608, + "step": 12270 + }, + { + "epoch": 1.83, + "grad_norm": 2.385572256152192, + "learning_rate": 1.464002490936706e-06, + "loss": 0.6608, + "step": 12271 + }, + { + "epoch": 1.83, + "grad_norm": 3.5345397072655897, + "learning_rate": 1.4639169102096385e-06, + "loss": 0.7142, + "step": 12272 + }, + { + "epoch": 1.83, + "grad_norm": 3.4013148321292728, + "learning_rate": 1.4638313251527934e-06, + "loss": 0.6686, + "step": 12273 + }, + { + "epoch": 1.83, + "grad_norm": 2.9540194161266564, + "learning_rate": 1.4637457357669693e-06, + "loss": 0.694, + "step": 12274 + }, + { + "epoch": 1.83, + "grad_norm": 2.5538915953240795, + "learning_rate": 1.4636601420529655e-06, + "loss": 0.6719, + "step": 12275 + }, + { + "epoch": 1.83, + "grad_norm": 3.030981407556664, + "learning_rate": 1.4635745440115805e-06, + "loss": 0.6543, + "step": 12276 + }, + { + "epoch": 1.83, + "grad_norm": 2.0259675044491035, + "learning_rate": 1.4634889416436132e-06, + "loss": 0.6595, + "step": 12277 + }, + { + "epoch": 1.83, + "grad_norm": 1.447418166667284, + "learning_rate": 1.4634033349498628e-06, + "loss": 0.6673, + "step": 12278 + }, + { + "epoch": 1.83, + "grad_norm": 1.58422669431328, + "learning_rate": 1.463317723931128e-06, + "loss": 0.6875, + "step": 12279 + }, + { + "epoch": 1.83, + "grad_norm": 1.656233789357628, + "learning_rate": 1.4632321085882079e-06, + "loss": 0.6725, + "step": 12280 + }, + { + "epoch": 1.83, + "grad_norm": 3.818181431534818, + "learning_rate": 1.4631464889219014e-06, + "loss": 0.6693, + "step": 12281 + }, + { + "epoch": 1.83, + "grad_norm": 1.0617684316645033, + "learning_rate": 1.4630608649330078e-06, + "loss": 0.6862, + "step": 12282 + }, + { + "epoch": 1.83, + "grad_norm": 2.2336850506361867, + "learning_rate": 1.4629752366223265e-06, + "loss": 0.6523, + "step": 12283 + }, + { + "epoch": 1.83, + "grad_norm": 3.88554347829679, + "learning_rate": 1.462889603990656e-06, + "loss": 0.6764, + "step": 12284 + }, + { + "epoch": 1.83, + "grad_norm": 1.9712316489654642, + "learning_rate": 1.4628039670387961e-06, + "loss": 0.6862, + "step": 12285 + }, + { + "epoch": 1.83, + "grad_norm": 2.035024065188161, + "learning_rate": 1.462718325767546e-06, + "loss": 0.6458, + "step": 12286 + }, + { + "epoch": 1.83, + "grad_norm": 1.1532072740614168, + "learning_rate": 1.4626326801777045e-06, + "loss": 0.6562, + "step": 12287 + }, + { + "epoch": 1.83, + "grad_norm": 1.4141536231436191, + "learning_rate": 1.4625470302700715e-06, + "loss": 0.6634, + "step": 12288 + }, + { + "epoch": 1.83, + "grad_norm": 1.1188219970536133, + "learning_rate": 1.4624613760454463e-06, + "loss": 0.6484, + "step": 12289 + }, + { + "epoch": 1.83, + "grad_norm": 1.0425376396936408, + "learning_rate": 1.4623757175046278e-06, + "loss": 0.6602, + "step": 12290 + }, + { + "epoch": 1.83, + "grad_norm": 2.6391487122239554, + "learning_rate": 1.462290054648416e-06, + "loss": 0.6836, + "step": 12291 + }, + { + "epoch": 1.83, + "grad_norm": 3.5901841904649814, + "learning_rate": 1.4622043874776104e-06, + "loss": 0.6576, + "step": 12292 + }, + { + "epoch": 1.83, + "grad_norm": 1.0072619709212776, + "learning_rate": 1.46211871599301e-06, + "loss": 0.6673, + "step": 12293 + }, + { + "epoch": 1.83, + "grad_norm": 1.3541586463947233, + "learning_rate": 1.4620330401954153e-06, + "loss": 0.6641, + "step": 12294 + }, + { + "epoch": 1.83, + "grad_norm": 1.8596208830471894, + "learning_rate": 1.4619473600856249e-06, + "loss": 0.6816, + "step": 12295 + }, + { + "epoch": 1.83, + "grad_norm": 1.1982281591773456, + "learning_rate": 1.4618616756644392e-06, + "loss": 0.6621, + "step": 12296 + }, + { + "epoch": 1.83, + "grad_norm": 1.428965359617639, + "learning_rate": 1.4617759869326577e-06, + "loss": 0.6452, + "step": 12297 + }, + { + "epoch": 1.83, + "grad_norm": 1.3058418820214253, + "learning_rate": 1.4616902938910797e-06, + "loss": 0.6842, + "step": 12298 + }, + { + "epoch": 1.83, + "grad_norm": 2.763162300228691, + "learning_rate": 1.4616045965405058e-06, + "loss": 0.7018, + "step": 12299 + }, + { + "epoch": 1.83, + "grad_norm": 3.2950347824529973, + "learning_rate": 1.4615188948817349e-06, + "loss": 0.6589, + "step": 12300 + }, + { + "epoch": 1.83, + "grad_norm": 2.312429170685652, + "learning_rate": 1.4614331889155679e-06, + "loss": 0.679, + "step": 12301 + }, + { + "epoch": 1.83, + "grad_norm": 3.4277439289045653, + "learning_rate": 1.4613474786428037e-06, + "loss": 0.6986, + "step": 12302 + }, + { + "epoch": 1.83, + "grad_norm": 4.201563274647204, + "learning_rate": 1.4612617640642427e-06, + "loss": 0.6562, + "step": 12303 + }, + { + "epoch": 1.84, + "grad_norm": 1.697960303165918, + "learning_rate": 1.461176045180685e-06, + "loss": 0.6406, + "step": 12304 + }, + { + "epoch": 1.84, + "grad_norm": 1.1254844161233222, + "learning_rate": 1.4610903219929306e-06, + "loss": 0.6732, + "step": 12305 + }, + { + "epoch": 1.84, + "grad_norm": 3.218461842239054, + "learning_rate": 1.4610045945017794e-06, + "loss": 0.651, + "step": 12306 + }, + { + "epoch": 1.84, + "grad_norm": 1.362126564908107, + "learning_rate": 1.4609188627080313e-06, + "loss": 0.6738, + "step": 12307 + }, + { + "epoch": 1.84, + "grad_norm": 4.647285563179238, + "learning_rate": 1.460833126612487e-06, + "loss": 0.6628, + "step": 12308 + }, + { + "epoch": 1.84, + "grad_norm": 1.1296657324083497, + "learning_rate": 1.4607473862159464e-06, + "loss": 0.6934, + "step": 12309 + }, + { + "epoch": 1.84, + "grad_norm": 2.3908690375270725, + "learning_rate": 1.4606616415192093e-06, + "loss": 0.6719, + "step": 12310 + }, + { + "epoch": 1.84, + "grad_norm": 1.547802478657936, + "learning_rate": 1.4605758925230768e-06, + "loss": 0.6569, + "step": 12311 + }, + { + "epoch": 1.84, + "grad_norm": 1.6995316640214142, + "learning_rate": 1.4604901392283484e-06, + "loss": 0.666, + "step": 12312 + }, + { + "epoch": 1.84, + "grad_norm": 0.9177561426223245, + "learning_rate": 1.460404381635825e-06, + "loss": 0.6966, + "step": 12313 + }, + { + "epoch": 1.84, + "grad_norm": 1.0456373101499186, + "learning_rate": 1.4603186197463067e-06, + "loss": 0.6628, + "step": 12314 + }, + { + "epoch": 1.84, + "grad_norm": 1.0682570371122386, + "learning_rate": 1.4602328535605942e-06, + "loss": 0.651, + "step": 12315 + }, + { + "epoch": 1.84, + "grad_norm": 1.4323205212131374, + "learning_rate": 1.4601470830794873e-06, + "loss": 0.6797, + "step": 12316 + }, + { + "epoch": 1.84, + "grad_norm": 4.5614638203942555, + "learning_rate": 1.4600613083037874e-06, + "loss": 0.6243, + "step": 12317 + }, + { + "epoch": 1.84, + "grad_norm": 4.251124243359059, + "learning_rate": 1.4599755292342949e-06, + "loss": 0.6576, + "step": 12318 + }, + { + "epoch": 1.84, + "grad_norm": 1.0475711028883206, + "learning_rate": 1.4598897458718097e-06, + "loss": 0.6797, + "step": 12319 + }, + { + "epoch": 1.84, + "grad_norm": 2.9275563789284362, + "learning_rate": 1.4598039582171326e-06, + "loss": 0.6432, + "step": 12320 + }, + { + "epoch": 1.84, + "grad_norm": 1.898369043340807, + "learning_rate": 1.459718166271065e-06, + "loss": 0.6543, + "step": 12321 + }, + { + "epoch": 1.84, + "grad_norm": 1.2272741175726067, + "learning_rate": 1.4596323700344066e-06, + "loss": 0.6543, + "step": 12322 + }, + { + "epoch": 1.84, + "grad_norm": 3.0126871980693988, + "learning_rate": 1.4595465695079589e-06, + "loss": 0.6979, + "step": 12323 + }, + { + "epoch": 1.84, + "grad_norm": 3.2011333407352227, + "learning_rate": 1.4594607646925226e-06, + "loss": 0.6862, + "step": 12324 + }, + { + "epoch": 1.84, + "grad_norm": 1.6132565971470834, + "learning_rate": 1.4593749555888984e-06, + "loss": 0.6699, + "step": 12325 + }, + { + "epoch": 1.84, + "grad_norm": 4.086127426322782, + "learning_rate": 1.4592891421978867e-06, + "loss": 0.6634, + "step": 12326 + }, + { + "epoch": 1.84, + "grad_norm": 1.2200696642400461, + "learning_rate": 1.4592033245202889e-06, + "loss": 0.6536, + "step": 12327 + }, + { + "epoch": 1.84, + "grad_norm": 1.0066207730990318, + "learning_rate": 1.4591175025569063e-06, + "loss": 0.666, + "step": 12328 + }, + { + "epoch": 1.84, + "grad_norm": 2.1365620199172, + "learning_rate": 1.459031676308539e-06, + "loss": 0.6842, + "step": 12329 + }, + { + "epoch": 1.84, + "grad_norm": 2.3832639755199048, + "learning_rate": 1.4589458457759886e-06, + "loss": 0.6895, + "step": 12330 + }, + { + "epoch": 1.84, + "grad_norm": 1.393335813800698, + "learning_rate": 1.4588600109600563e-06, + "loss": 0.6693, + "step": 12331 + }, + { + "epoch": 1.84, + "grad_norm": 6.606156635189357, + "learning_rate": 1.4587741718615425e-06, + "loss": 0.6966, + "step": 12332 + }, + { + "epoch": 1.84, + "grad_norm": 6.645119196469157, + "learning_rate": 1.458688328481249e-06, + "loss": 0.6725, + "step": 12333 + }, + { + "epoch": 1.84, + "grad_norm": 1.3340969656665214, + "learning_rate": 1.4586024808199766e-06, + "loss": 0.6667, + "step": 12334 + }, + { + "epoch": 1.84, + "grad_norm": 2.091680068802187, + "learning_rate": 1.458516628878527e-06, + "loss": 0.6797, + "step": 12335 + }, + { + "epoch": 1.84, + "grad_norm": 1.5535338376449659, + "learning_rate": 1.458430772657701e-06, + "loss": 0.6836, + "step": 12336 + }, + { + "epoch": 1.84, + "grad_norm": 2.5767259251974135, + "learning_rate": 1.4583449121582998e-06, + "loss": 0.6634, + "step": 12337 + }, + { + "epoch": 1.84, + "grad_norm": 2.386220919017554, + "learning_rate": 1.4582590473811253e-06, + "loss": 0.6712, + "step": 12338 + }, + { + "epoch": 1.84, + "grad_norm": 2.1494769227814907, + "learning_rate": 1.4581731783269786e-06, + "loss": 0.6289, + "step": 12339 + }, + { + "epoch": 1.84, + "grad_norm": 1.5678335719004501, + "learning_rate": 1.4580873049966609e-06, + "loss": 0.6777, + "step": 12340 + }, + { + "epoch": 1.84, + "grad_norm": 2.613936727181965, + "learning_rate": 1.4580014273909744e-06, + "loss": 0.7012, + "step": 12341 + }, + { + "epoch": 1.84, + "grad_norm": 5.355254274831479, + "learning_rate": 1.4579155455107196e-06, + "loss": 0.6888, + "step": 12342 + }, + { + "epoch": 1.84, + "grad_norm": 1.8729346309110053, + "learning_rate": 1.4578296593566982e-06, + "loss": 0.6497, + "step": 12343 + }, + { + "epoch": 1.84, + "grad_norm": 2.0725620151061532, + "learning_rate": 1.4577437689297127e-06, + "loss": 0.6771, + "step": 12344 + }, + { + "epoch": 1.84, + "grad_norm": 2.0023685514120753, + "learning_rate": 1.4576578742305642e-06, + "loss": 0.6823, + "step": 12345 + }, + { + "epoch": 1.84, + "grad_norm": 2.8318585517327284, + "learning_rate": 1.4575719752600536e-06, + "loss": 0.6667, + "step": 12346 + }, + { + "epoch": 1.84, + "grad_norm": 0.8746218086921653, + "learning_rate": 1.4574860720189838e-06, + "loss": 0.696, + "step": 12347 + }, + { + "epoch": 1.84, + "grad_norm": 0.9178024385810218, + "learning_rate": 1.457400164508156e-06, + "loss": 0.653, + "step": 12348 + }, + { + "epoch": 1.84, + "grad_norm": 0.9681020906389491, + "learning_rate": 1.457314252728372e-06, + "loss": 0.666, + "step": 12349 + }, + { + "epoch": 1.84, + "grad_norm": 1.7536934820314978, + "learning_rate": 1.4572283366804335e-06, + "loss": 0.6829, + "step": 12350 + }, + { + "epoch": 1.84, + "grad_norm": 2.892422773944393, + "learning_rate": 1.4571424163651425e-06, + "loss": 0.679, + "step": 12351 + }, + { + "epoch": 1.84, + "grad_norm": 1.303243052175317, + "learning_rate": 1.457056491783301e-06, + "loss": 0.6569, + "step": 12352 + }, + { + "epoch": 1.84, + "grad_norm": 1.616104963120338, + "learning_rate": 1.4569705629357107e-06, + "loss": 0.6751, + "step": 12353 + }, + { + "epoch": 1.84, + "grad_norm": 1.1742226927596253, + "learning_rate": 1.4568846298231742e-06, + "loss": 0.6882, + "step": 12354 + }, + { + "epoch": 1.84, + "grad_norm": 1.4168245859600526, + "learning_rate": 1.4567986924464925e-06, + "loss": 0.6934, + "step": 12355 + }, + { + "epoch": 1.84, + "grad_norm": 1.2655810833351862, + "learning_rate": 1.4567127508064684e-06, + "loss": 0.6712, + "step": 12356 + }, + { + "epoch": 1.84, + "grad_norm": 2.3664419155898795, + "learning_rate": 1.4566268049039038e-06, + "loss": 0.6667, + "step": 12357 + }, + { + "epoch": 1.84, + "grad_norm": 0.9648417689894088, + "learning_rate": 1.4565408547396007e-06, + "loss": 0.6641, + "step": 12358 + }, + { + "epoch": 1.84, + "grad_norm": 4.369490038360759, + "learning_rate": 1.4564549003143617e-06, + "loss": 0.6777, + "step": 12359 + }, + { + "epoch": 1.84, + "grad_norm": 3.8145846877002754, + "learning_rate": 1.4563689416289887e-06, + "loss": 0.6862, + "step": 12360 + }, + { + "epoch": 1.84, + "grad_norm": 1.2775625350252207, + "learning_rate": 1.4562829786842838e-06, + "loss": 0.6784, + "step": 12361 + }, + { + "epoch": 1.84, + "grad_norm": 2.6239690637634294, + "learning_rate": 1.4561970114810497e-06, + "loss": 0.6693, + "step": 12362 + }, + { + "epoch": 1.84, + "grad_norm": 2.1593929295021512, + "learning_rate": 1.4561110400200884e-06, + "loss": 0.6797, + "step": 12363 + }, + { + "epoch": 1.84, + "grad_norm": 1.1491249250045785, + "learning_rate": 1.4560250643022027e-06, + "loss": 0.6497, + "step": 12364 + }, + { + "epoch": 1.84, + "grad_norm": 3.6183071740515564, + "learning_rate": 1.4559390843281944e-06, + "loss": 0.6686, + "step": 12365 + }, + { + "epoch": 1.84, + "grad_norm": 2.6534247920776286, + "learning_rate": 1.4558531000988666e-06, + "loss": 0.6647, + "step": 12366 + }, + { + "epoch": 1.84, + "grad_norm": 4.510353228967995, + "learning_rate": 1.4557671116150214e-06, + "loss": 0.6699, + "step": 12367 + }, + { + "epoch": 1.84, + "grad_norm": 0.9284576944822343, + "learning_rate": 1.4556811188774613e-06, + "loss": 0.6478, + "step": 12368 + }, + { + "epoch": 1.84, + "grad_norm": 0.9473501568395873, + "learning_rate": 1.455595121886989e-06, + "loss": 0.6732, + "step": 12369 + }, + { + "epoch": 1.84, + "grad_norm": 1.0672813527426297, + "learning_rate": 1.4555091206444076e-06, + "loss": 0.6829, + "step": 12370 + }, + { + "epoch": 1.85, + "grad_norm": 7.368321984083155, + "learning_rate": 1.455423115150519e-06, + "loss": 0.6868, + "step": 12371 + }, + { + "epoch": 1.85, + "grad_norm": 1.2772983723382383, + "learning_rate": 1.455337105406126e-06, + "loss": 0.6947, + "step": 12372 + }, + { + "epoch": 1.85, + "grad_norm": 2.509040955558913, + "learning_rate": 1.4552510914120318e-06, + "loss": 0.6426, + "step": 12373 + }, + { + "epoch": 1.85, + "grad_norm": 2.1856723912044402, + "learning_rate": 1.4551650731690389e-06, + "loss": 0.7031, + "step": 12374 + }, + { + "epoch": 1.85, + "grad_norm": 3.759403776170905, + "learning_rate": 1.4550790506779498e-06, + "loss": 0.6836, + "step": 12375 + }, + { + "epoch": 1.85, + "grad_norm": 1.9235037183863368, + "learning_rate": 1.4549930239395677e-06, + "loss": 0.6641, + "step": 12376 + }, + { + "epoch": 1.85, + "grad_norm": 1.230663736675649, + "learning_rate": 1.4549069929546955e-06, + "loss": 0.7051, + "step": 12377 + }, + { + "epoch": 1.85, + "grad_norm": 2.7807591375201155, + "learning_rate": 1.4548209577241366e-06, + "loss": 0.6654, + "step": 12378 + }, + { + "epoch": 1.85, + "grad_norm": 1.2488084152359842, + "learning_rate": 1.454734918248693e-06, + "loss": 0.6999, + "step": 12379 + }, + { + "epoch": 1.85, + "grad_norm": 2.2465282063509613, + "learning_rate": 1.4546488745291682e-06, + "loss": 0.6823, + "step": 12380 + }, + { + "epoch": 1.85, + "grad_norm": 1.670908978238409, + "learning_rate": 1.4545628265663657e-06, + "loss": 0.6484, + "step": 12381 + }, + { + "epoch": 1.85, + "grad_norm": 0.9778504386806836, + "learning_rate": 1.4544767743610876e-06, + "loss": 0.6628, + "step": 12382 + }, + { + "epoch": 1.85, + "grad_norm": 2.4069285331804187, + "learning_rate": 1.454390717914138e-06, + "loss": 0.6784, + "step": 12383 + }, + { + "epoch": 1.85, + "grad_norm": 3.8188331938888775, + "learning_rate": 1.4543046572263194e-06, + "loss": 0.6628, + "step": 12384 + }, + { + "epoch": 1.85, + "grad_norm": 4.185761871521142, + "learning_rate": 1.4542185922984352e-06, + "loss": 0.6725, + "step": 12385 + }, + { + "epoch": 1.85, + "grad_norm": 1.2288706982516169, + "learning_rate": 1.454132523131289e-06, + "loss": 0.6341, + "step": 12386 + }, + { + "epoch": 1.85, + "grad_norm": 3.886212120417973, + "learning_rate": 1.4540464497256834e-06, + "loss": 0.6712, + "step": 12387 + }, + { + "epoch": 1.85, + "grad_norm": 3.457020588711908, + "learning_rate": 1.4539603720824222e-06, + "loss": 0.6595, + "step": 12388 + }, + { + "epoch": 1.85, + "grad_norm": 2.002037621406297, + "learning_rate": 1.4538742902023088e-06, + "loss": 0.6582, + "step": 12389 + }, + { + "epoch": 1.85, + "grad_norm": 1.7558344966994766, + "learning_rate": 1.4537882040861464e-06, + "loss": 0.6562, + "step": 12390 + }, + { + "epoch": 1.85, + "grad_norm": 1.025307522416319, + "learning_rate": 1.4537021137347391e-06, + "loss": 0.6478, + "step": 12391 + }, + { + "epoch": 1.85, + "grad_norm": 2.5560308624186145, + "learning_rate": 1.4536160191488893e-06, + "loss": 0.6478, + "step": 12392 + }, + { + "epoch": 1.85, + "grad_norm": 8.612290902423858, + "learning_rate": 1.4535299203294012e-06, + "loss": 0.6914, + "step": 12393 + }, + { + "epoch": 1.85, + "grad_norm": 1.4377149750572964, + "learning_rate": 1.4534438172770784e-06, + "loss": 0.6667, + "step": 12394 + }, + { + "epoch": 1.85, + "grad_norm": 3.243596055372939, + "learning_rate": 1.453357709992724e-06, + "loss": 0.6595, + "step": 12395 + }, + { + "epoch": 1.85, + "grad_norm": 3.8785697854188608, + "learning_rate": 1.4532715984771424e-06, + "loss": 0.6816, + "step": 12396 + }, + { + "epoch": 1.85, + "grad_norm": 1.845802347553367, + "learning_rate": 1.4531854827311368e-06, + "loss": 0.7005, + "step": 12397 + }, + { + "epoch": 1.85, + "grad_norm": 2.4644227875313476, + "learning_rate": 1.4530993627555107e-06, + "loss": 0.6908, + "step": 12398 + }, + { + "epoch": 1.85, + "grad_norm": 5.2776396885277395, + "learning_rate": 1.4530132385510685e-06, + "loss": 0.64, + "step": 12399 + }, + { + "epoch": 1.85, + "grad_norm": 1.4019412318461644, + "learning_rate": 1.4529271101186136e-06, + "loss": 0.6608, + "step": 12400 + }, + { + "epoch": 1.85, + "grad_norm": 0.9840790326260761, + "learning_rate": 1.4528409774589497e-06, + "loss": 0.6745, + "step": 12401 + }, + { + "epoch": 1.85, + "grad_norm": 1.3592035988588818, + "learning_rate": 1.4527548405728814e-06, + "loss": 0.666, + "step": 12402 + }, + { + "epoch": 1.85, + "grad_norm": 4.127300089812723, + "learning_rate": 1.4526686994612117e-06, + "loss": 0.64, + "step": 12403 + }, + { + "epoch": 1.85, + "grad_norm": 1.022560936055211, + "learning_rate": 1.4525825541247452e-06, + "loss": 0.6667, + "step": 12404 + }, + { + "epoch": 1.85, + "grad_norm": 4.540494362532085, + "learning_rate": 1.4524964045642859e-06, + "loss": 0.7018, + "step": 12405 + }, + { + "epoch": 1.85, + "grad_norm": 2.319405772991775, + "learning_rate": 1.452410250780637e-06, + "loss": 0.6908, + "step": 12406 + }, + { + "epoch": 1.85, + "grad_norm": 1.9692966087952928, + "learning_rate": 1.4523240927746038e-06, + "loss": 0.6432, + "step": 12407 + }, + { + "epoch": 1.85, + "grad_norm": 2.1347802290800355, + "learning_rate": 1.4522379305469895e-06, + "loss": 0.653, + "step": 12408 + }, + { + "epoch": 1.85, + "grad_norm": 1.4284465412891802, + "learning_rate": 1.4521517640985986e-06, + "loss": 0.6725, + "step": 12409 + }, + { + "epoch": 1.85, + "grad_norm": 1.0774468126662733, + "learning_rate": 1.4520655934302356e-06, + "loss": 0.6979, + "step": 12410 + }, + { + "epoch": 1.85, + "grad_norm": 2.893985733540972, + "learning_rate": 1.4519794185427045e-06, + "loss": 0.6595, + "step": 12411 + }, + { + "epoch": 1.85, + "grad_norm": 1.6757189997159703, + "learning_rate": 1.4518932394368093e-06, + "loss": 0.6686, + "step": 12412 + }, + { + "epoch": 1.85, + "grad_norm": 1.8730216349640032, + "learning_rate": 1.4518070561133544e-06, + "loss": 0.6771, + "step": 12413 + }, + { + "epoch": 1.85, + "grad_norm": 2.2843981144343752, + "learning_rate": 1.4517208685731445e-06, + "loss": 0.6771, + "step": 12414 + }, + { + "epoch": 1.85, + "grad_norm": 1.8392610981143396, + "learning_rate": 1.4516346768169838e-06, + "loss": 0.6719, + "step": 12415 + }, + { + "epoch": 1.85, + "grad_norm": 1.33434607769667, + "learning_rate": 1.4515484808456763e-06, + "loss": 0.7044, + "step": 12416 + }, + { + "epoch": 1.85, + "grad_norm": 1.2460805388358238, + "learning_rate": 1.4514622806600273e-06, + "loss": 0.6608, + "step": 12417 + }, + { + "epoch": 1.85, + "grad_norm": 1.3899524836977712, + "learning_rate": 1.451376076260841e-06, + "loss": 0.6439, + "step": 12418 + }, + { + "epoch": 1.85, + "grad_norm": 2.1069707589165976, + "learning_rate": 1.4512898676489216e-06, + "loss": 0.6712, + "step": 12419 + }, + { + "epoch": 1.85, + "grad_norm": 1.8324222276089908, + "learning_rate": 1.4512036548250741e-06, + "loss": 0.6523, + "step": 12420 + }, + { + "epoch": 1.85, + "grad_norm": 1.687047909781031, + "learning_rate": 1.451117437790103e-06, + "loss": 0.724, + "step": 12421 + }, + { + "epoch": 1.85, + "grad_norm": 2.0656555916535084, + "learning_rate": 1.4510312165448128e-06, + "loss": 0.707, + "step": 12422 + }, + { + "epoch": 1.85, + "grad_norm": 2.1614303410097544, + "learning_rate": 1.4509449910900085e-06, + "loss": 0.6634, + "step": 12423 + }, + { + "epoch": 1.85, + "grad_norm": 1.7017602453527292, + "learning_rate": 1.4508587614264948e-06, + "loss": 0.6556, + "step": 12424 + }, + { + "epoch": 1.85, + "grad_norm": 3.6764208861376595, + "learning_rate": 1.4507725275550765e-06, + "loss": 0.6842, + "step": 12425 + }, + { + "epoch": 1.85, + "grad_norm": 0.9112740464087864, + "learning_rate": 1.450686289476558e-06, + "loss": 0.6576, + "step": 12426 + }, + { + "epoch": 1.85, + "grad_norm": 1.175734763963531, + "learning_rate": 1.4506000471917445e-06, + "loss": 0.6296, + "step": 12427 + }, + { + "epoch": 1.85, + "grad_norm": 0.8772470026223999, + "learning_rate": 1.4505138007014411e-06, + "loss": 0.6875, + "step": 12428 + }, + { + "epoch": 1.85, + "grad_norm": 4.127107555250363, + "learning_rate": 1.4504275500064524e-06, + "loss": 0.6589, + "step": 12429 + }, + { + "epoch": 1.85, + "grad_norm": 1.3101168402356096, + "learning_rate": 1.4503412951075837e-06, + "loss": 0.6556, + "step": 12430 + }, + { + "epoch": 1.85, + "grad_norm": 3.205948714967068, + "learning_rate": 1.45025503600564e-06, + "loss": 0.6523, + "step": 12431 + }, + { + "epoch": 1.85, + "grad_norm": 1.0490133174389114, + "learning_rate": 1.4501687727014258e-06, + "loss": 0.6432, + "step": 12432 + }, + { + "epoch": 1.85, + "grad_norm": 5.586129818393517, + "learning_rate": 1.450082505195747e-06, + "loss": 0.6823, + "step": 12433 + }, + { + "epoch": 1.85, + "grad_norm": 1.945690660268915, + "learning_rate": 1.4499962334894083e-06, + "loss": 0.6432, + "step": 12434 + }, + { + "epoch": 1.85, + "grad_norm": 1.2530325609265869, + "learning_rate": 1.4499099575832147e-06, + "loss": 0.7005, + "step": 12435 + }, + { + "epoch": 1.85, + "grad_norm": 5.422082795971429, + "learning_rate": 1.449823677477972e-06, + "loss": 0.6842, + "step": 12436 + }, + { + "epoch": 1.85, + "grad_norm": 1.80849807221026, + "learning_rate": 1.4497373931744853e-06, + "loss": 0.6986, + "step": 12437 + }, + { + "epoch": 1.86, + "grad_norm": 2.4275115248838937, + "learning_rate": 1.449651104673559e-06, + "loss": 0.6764, + "step": 12438 + }, + { + "epoch": 1.86, + "grad_norm": 3.500027093935637, + "learning_rate": 1.4495648119759999e-06, + "loss": 0.666, + "step": 12439 + }, + { + "epoch": 1.86, + "grad_norm": 1.6808298560027195, + "learning_rate": 1.4494785150826123e-06, + "loss": 0.651, + "step": 12440 + }, + { + "epoch": 1.86, + "grad_norm": 1.4615505183194404, + "learning_rate": 1.449392213994202e-06, + "loss": 0.6764, + "step": 12441 + }, + { + "epoch": 1.86, + "grad_norm": 1.5637310832739368, + "learning_rate": 1.4493059087115745e-06, + "loss": 0.6758, + "step": 12442 + }, + { + "epoch": 1.86, + "grad_norm": 1.2933772903627727, + "learning_rate": 1.4492195992355352e-06, + "loss": 0.6458, + "step": 12443 + }, + { + "epoch": 1.86, + "grad_norm": 1.543161717183826, + "learning_rate": 1.4491332855668899e-06, + "loss": 0.6484, + "step": 12444 + }, + { + "epoch": 1.86, + "grad_norm": 1.767386580533365, + "learning_rate": 1.4490469677064435e-06, + "loss": 0.6523, + "step": 12445 + }, + { + "epoch": 1.86, + "grad_norm": 2.778888567784385, + "learning_rate": 1.4489606456550024e-06, + "loss": 0.6647, + "step": 12446 + }, + { + "epoch": 1.86, + "grad_norm": 4.050593781943326, + "learning_rate": 1.4488743194133717e-06, + "loss": 0.6855, + "step": 12447 + }, + { + "epoch": 1.86, + "grad_norm": 1.518506598391988, + "learning_rate": 1.448787988982357e-06, + "loss": 0.6693, + "step": 12448 + }, + { + "epoch": 1.86, + "grad_norm": 5.085230341601038, + "learning_rate": 1.4487016543627646e-06, + "loss": 0.6797, + "step": 12449 + }, + { + "epoch": 1.86, + "grad_norm": 1.1971794330071301, + "learning_rate": 1.4486153155554e-06, + "loss": 0.6855, + "step": 12450 + }, + { + "epoch": 1.86, + "grad_norm": 2.9853979700482363, + "learning_rate": 1.448528972561069e-06, + "loss": 0.6641, + "step": 12451 + }, + { + "epoch": 1.86, + "grad_norm": 1.2559583884596903, + "learning_rate": 1.4484426253805775e-06, + "loss": 0.6725, + "step": 12452 + }, + { + "epoch": 1.86, + "grad_norm": 3.88532485520193, + "learning_rate": 1.4483562740147311e-06, + "loss": 0.6549, + "step": 12453 + }, + { + "epoch": 1.86, + "grad_norm": 2.031953888931476, + "learning_rate": 1.448269918464336e-06, + "loss": 0.6901, + "step": 12454 + }, + { + "epoch": 1.86, + "grad_norm": 0.9797599345950729, + "learning_rate": 1.4481835587301983e-06, + "loss": 0.653, + "step": 12455 + }, + { + "epoch": 1.86, + "grad_norm": 1.5967065793071815, + "learning_rate": 1.4480971948131236e-06, + "loss": 0.6758, + "step": 12456 + }, + { + "epoch": 1.86, + "grad_norm": 1.9691948570329385, + "learning_rate": 1.4480108267139184e-06, + "loss": 0.6927, + "step": 12457 + }, + { + "epoch": 1.86, + "grad_norm": 1.098600720430582, + "learning_rate": 1.4479244544333881e-06, + "loss": 0.6751, + "step": 12458 + }, + { + "epoch": 1.86, + "grad_norm": 1.1814409692782009, + "learning_rate": 1.4478380779723395e-06, + "loss": 0.638, + "step": 12459 + }, + { + "epoch": 1.86, + "grad_norm": 1.3670916142439984, + "learning_rate": 1.4477516973315788e-06, + "loss": 0.6419, + "step": 12460 + }, + { + "epoch": 1.86, + "grad_norm": 1.717567004848262, + "learning_rate": 1.4476653125119116e-06, + "loss": 0.6667, + "step": 12461 + }, + { + "epoch": 1.86, + "grad_norm": 1.056241514876347, + "learning_rate": 1.4475789235141446e-06, + "loss": 0.653, + "step": 12462 + }, + { + "epoch": 1.86, + "grad_norm": 1.3800354846280465, + "learning_rate": 1.4474925303390837e-06, + "loss": 0.6764, + "step": 12463 + }, + { + "epoch": 1.86, + "grad_norm": 2.6132226206271527, + "learning_rate": 1.4474061329875355e-06, + "loss": 0.6517, + "step": 12464 + }, + { + "epoch": 1.86, + "grad_norm": 2.1592046782102425, + "learning_rate": 1.4473197314603063e-06, + "loss": 0.6693, + "step": 12465 + }, + { + "epoch": 1.86, + "grad_norm": 2.2016720790677744, + "learning_rate": 1.4472333257582027e-06, + "loss": 0.6823, + "step": 12466 + }, + { + "epoch": 1.86, + "grad_norm": 1.320916702462576, + "learning_rate": 1.4471469158820306e-06, + "loss": 0.7005, + "step": 12467 + }, + { + "epoch": 1.86, + "grad_norm": 1.5045564406476266, + "learning_rate": 1.447060501832597e-06, + "loss": 0.6549, + "step": 12468 + }, + { + "epoch": 1.86, + "grad_norm": 2.6884655938963564, + "learning_rate": 1.446974083610708e-06, + "loss": 0.6842, + "step": 12469 + }, + { + "epoch": 1.86, + "grad_norm": 2.223549647286686, + "learning_rate": 1.4468876612171705e-06, + "loss": 0.6491, + "step": 12470 + }, + { + "epoch": 1.86, + "grad_norm": 2.5638453466268443, + "learning_rate": 1.4468012346527912e-06, + "loss": 0.6582, + "step": 12471 + }, + { + "epoch": 1.86, + "grad_norm": 1.505166932268932, + "learning_rate": 1.4467148039183759e-06, + "loss": 0.6921, + "step": 12472 + }, + { + "epoch": 1.86, + "grad_norm": 1.9709796576124676, + "learning_rate": 1.4466283690147326e-06, + "loss": 0.6855, + "step": 12473 + }, + { + "epoch": 1.86, + "grad_norm": 2.407872282270559, + "learning_rate": 1.4465419299426663e-06, + "loss": 0.653, + "step": 12474 + }, + { + "epoch": 1.86, + "grad_norm": 1.3408356629723104, + "learning_rate": 1.4464554867029852e-06, + "loss": 0.6745, + "step": 12475 + }, + { + "epoch": 1.86, + "grad_norm": 3.320691854643131, + "learning_rate": 1.4463690392964955e-06, + "loss": 0.6348, + "step": 12476 + }, + { + "epoch": 1.86, + "grad_norm": 1.4587467684599067, + "learning_rate": 1.4462825877240042e-06, + "loss": 0.6576, + "step": 12477 + }, + { + "epoch": 1.86, + "grad_norm": 3.7606234737790856, + "learning_rate": 1.446196131986318e-06, + "loss": 0.6797, + "step": 12478 + }, + { + "epoch": 1.86, + "grad_norm": 0.9828835507504571, + "learning_rate": 1.4461096720842435e-06, + "loss": 0.6875, + "step": 12479 + }, + { + "epoch": 1.86, + "grad_norm": 1.2720832561270508, + "learning_rate": 1.4460232080185883e-06, + "loss": 0.6322, + "step": 12480 + }, + { + "epoch": 1.86, + "grad_norm": 1.5026604544128987, + "learning_rate": 1.445936739790159e-06, + "loss": 0.6602, + "step": 12481 + }, + { + "epoch": 1.86, + "grad_norm": 1.1640786844216429, + "learning_rate": 1.4458502673997628e-06, + "loss": 0.6484, + "step": 12482 + }, + { + "epoch": 1.86, + "grad_norm": 1.4262943005058357, + "learning_rate": 1.4457637908482064e-06, + "loss": 0.651, + "step": 12483 + }, + { + "epoch": 1.86, + "grad_norm": 2.2433972821897883, + "learning_rate": 1.4456773101362974e-06, + "loss": 0.6725, + "step": 12484 + }, + { + "epoch": 1.86, + "grad_norm": 1.055048474791283, + "learning_rate": 1.4455908252648423e-06, + "loss": 0.696, + "step": 12485 + }, + { + "epoch": 1.86, + "grad_norm": 3.410698143550614, + "learning_rate": 1.445504336234649e-06, + "loss": 0.6641, + "step": 12486 + }, + { + "epoch": 1.86, + "grad_norm": 2.2926866094633973, + "learning_rate": 1.4454178430465241e-06, + "loss": 0.6751, + "step": 12487 + }, + { + "epoch": 1.86, + "grad_norm": 6.859866655055252, + "learning_rate": 1.445331345701275e-06, + "loss": 0.6921, + "step": 12488 + }, + { + "epoch": 1.86, + "grad_norm": 3.355775926795653, + "learning_rate": 1.4452448441997092e-06, + "loss": 0.6732, + "step": 12489 + }, + { + "epoch": 1.86, + "grad_norm": 3.486622519457837, + "learning_rate": 1.445158338542634e-06, + "loss": 0.6829, + "step": 12490 + }, + { + "epoch": 1.86, + "grad_norm": 5.747909772207964, + "learning_rate": 1.4450718287308565e-06, + "loss": 0.6888, + "step": 12491 + }, + { + "epoch": 1.86, + "grad_norm": 4.2278229816738655, + "learning_rate": 1.444985314765184e-06, + "loss": 0.6452, + "step": 12492 + }, + { + "epoch": 1.86, + "grad_norm": 1.2750037732010486, + "learning_rate": 1.4448987966464246e-06, + "loss": 0.6667, + "step": 12493 + }, + { + "epoch": 1.86, + "grad_norm": 2.2442961304924807, + "learning_rate": 1.4448122743753855e-06, + "loss": 0.6732, + "step": 12494 + }, + { + "epoch": 1.86, + "grad_norm": 1.824040588578854, + "learning_rate": 1.4447257479528736e-06, + "loss": 0.6816, + "step": 12495 + }, + { + "epoch": 1.86, + "grad_norm": 3.23549804726312, + "learning_rate": 1.4446392173796973e-06, + "loss": 0.668, + "step": 12496 + }, + { + "epoch": 1.86, + "grad_norm": 1.254407223657605, + "learning_rate": 1.4445526826566638e-06, + "loss": 0.6406, + "step": 12497 + }, + { + "epoch": 1.86, + "grad_norm": 1.0033299421343163, + "learning_rate": 1.4444661437845808e-06, + "loss": 0.6706, + "step": 12498 + }, + { + "epoch": 1.86, + "grad_norm": 0.953563963352119, + "learning_rate": 1.4443796007642562e-06, + "loss": 0.668, + "step": 12499 + }, + { + "epoch": 1.86, + "grad_norm": 1.6479677951073497, + "learning_rate": 1.4442930535964973e-06, + "loss": 0.6829, + "step": 12500 + }, + { + "epoch": 1.86, + "grad_norm": 1.9420785849144322, + "learning_rate": 1.4442065022821117e-06, + "loss": 0.6764, + "step": 12501 + }, + { + "epoch": 1.86, + "grad_norm": 4.075371943928709, + "learning_rate": 1.444119946821908e-06, + "loss": 0.6576, + "step": 12502 + }, + { + "epoch": 1.86, + "grad_norm": 2.321831288802021, + "learning_rate": 1.4440333872166934e-06, + "loss": 0.6348, + "step": 12503 + }, + { + "epoch": 1.86, + "grad_norm": 1.756047798931149, + "learning_rate": 1.4439468234672759e-06, + "loss": 0.6667, + "step": 12504 + }, + { + "epoch": 1.87, + "grad_norm": 2.7852553505378075, + "learning_rate": 1.4438602555744634e-06, + "loss": 0.6908, + "step": 12505 + }, + { + "epoch": 1.87, + "grad_norm": 2.613444780140548, + "learning_rate": 1.4437736835390642e-06, + "loss": 0.6504, + "step": 12506 + }, + { + "epoch": 1.87, + "grad_norm": 4.95838953484507, + "learning_rate": 1.4436871073618857e-06, + "loss": 0.6706, + "step": 12507 + }, + { + "epoch": 1.87, + "grad_norm": 1.3610608115848963, + "learning_rate": 1.4436005270437358e-06, + "loss": 0.6745, + "step": 12508 + }, + { + "epoch": 1.87, + "grad_norm": 3.480928643468722, + "learning_rate": 1.4435139425854232e-06, + "loss": 0.6569, + "step": 12509 + }, + { + "epoch": 1.87, + "grad_norm": 3.313870274202174, + "learning_rate": 1.4434273539877565e-06, + "loss": 0.6712, + "step": 12510 + }, + { + "epoch": 1.87, + "grad_norm": 2.507763129447576, + "learning_rate": 1.4433407612515423e-06, + "loss": 0.6764, + "step": 12511 + }, + { + "epoch": 1.87, + "grad_norm": 5.373338184911481, + "learning_rate": 1.4432541643775897e-06, + "loss": 0.6882, + "step": 12512 + }, + { + "epoch": 1.87, + "grad_norm": 1.248556582236853, + "learning_rate": 1.4431675633667068e-06, + "loss": 0.6673, + "step": 12513 + }, + { + "epoch": 1.87, + "grad_norm": 1.5692631267319064, + "learning_rate": 1.4430809582197019e-06, + "loss": 0.6699, + "step": 12514 + }, + { + "epoch": 1.87, + "grad_norm": 1.8962456775121872, + "learning_rate": 1.4429943489373832e-06, + "loss": 0.6628, + "step": 12515 + }, + { + "epoch": 1.87, + "grad_norm": 3.4639377742338495, + "learning_rate": 1.4429077355205591e-06, + "loss": 0.6816, + "step": 12516 + }, + { + "epoch": 1.87, + "grad_norm": 2.362575275971991, + "learning_rate": 1.442821117970038e-06, + "loss": 0.6654, + "step": 12517 + }, + { + "epoch": 1.87, + "grad_norm": 3.5771171370844232, + "learning_rate": 1.442734496286628e-06, + "loss": 0.6589, + "step": 12518 + }, + { + "epoch": 1.87, + "grad_norm": 3.225307619925999, + "learning_rate": 1.442647870471138e-06, + "loss": 0.6654, + "step": 12519 + }, + { + "epoch": 1.87, + "grad_norm": 1.755354409032059, + "learning_rate": 1.4425612405243763e-06, + "loss": 0.6517, + "step": 12520 + }, + { + "epoch": 1.87, + "grad_norm": 1.0400912075690814, + "learning_rate": 1.4424746064471514e-06, + "loss": 0.6882, + "step": 12521 + }, + { + "epoch": 1.87, + "grad_norm": 2.73668467052346, + "learning_rate": 1.4423879682402716e-06, + "loss": 0.6576, + "step": 12522 + }, + { + "epoch": 1.87, + "grad_norm": 3.0733388669867727, + "learning_rate": 1.4423013259045463e-06, + "loss": 0.6452, + "step": 12523 + }, + { + "epoch": 1.87, + "grad_norm": 1.8663234939849576, + "learning_rate": 1.442214679440783e-06, + "loss": 0.694, + "step": 12524 + }, + { + "epoch": 1.87, + "grad_norm": 1.3064750947580546, + "learning_rate": 1.4421280288497913e-06, + "loss": 0.6855, + "step": 12525 + }, + { + "epoch": 1.87, + "grad_norm": 1.770743498090016, + "learning_rate": 1.4420413741323799e-06, + "loss": 0.6745, + "step": 12526 + }, + { + "epoch": 1.87, + "grad_norm": 1.8015049463573158, + "learning_rate": 1.4419547152893567e-06, + "loss": 0.6712, + "step": 12527 + }, + { + "epoch": 1.87, + "grad_norm": 2.9939878794859203, + "learning_rate": 1.4418680523215314e-06, + "loss": 0.681, + "step": 12528 + }, + { + "epoch": 1.87, + "grad_norm": 1.2243556718038562, + "learning_rate": 1.4417813852297122e-06, + "loss": 0.694, + "step": 12529 + }, + { + "epoch": 1.87, + "grad_norm": 1.5489897589035702, + "learning_rate": 1.4416947140147087e-06, + "loss": 0.6771, + "step": 12530 + }, + { + "epoch": 1.87, + "grad_norm": 2.30028689172695, + "learning_rate": 1.441608038677329e-06, + "loss": 0.6745, + "step": 12531 + }, + { + "epoch": 1.87, + "grad_norm": 1.8067880361236945, + "learning_rate": 1.4415213592183824e-06, + "loss": 0.6667, + "step": 12532 + }, + { + "epoch": 1.87, + "grad_norm": 1.8994451315898802, + "learning_rate": 1.441434675638678e-06, + "loss": 0.6712, + "step": 12533 + }, + { + "epoch": 1.87, + "grad_norm": 2.3540290288516683, + "learning_rate": 1.4413479879390251e-06, + "loss": 0.6777, + "step": 12534 + }, + { + "epoch": 1.87, + "grad_norm": 0.8332594031185058, + "learning_rate": 1.4412612961202317e-06, + "loss": 0.6823, + "step": 12535 + }, + { + "epoch": 1.87, + "grad_norm": 1.5052437168458745, + "learning_rate": 1.4411746001831082e-06, + "loss": 0.6406, + "step": 12536 + }, + { + "epoch": 1.87, + "grad_norm": 1.0290301325447977, + "learning_rate": 1.4410879001284631e-06, + "loss": 0.6771, + "step": 12537 + }, + { + "epoch": 1.87, + "grad_norm": 1.425723402232241, + "learning_rate": 1.4410011959571051e-06, + "loss": 0.6732, + "step": 12538 + }, + { + "epoch": 1.87, + "grad_norm": 3.5022097439387947, + "learning_rate": 1.4409144876698446e-06, + "loss": 0.6582, + "step": 12539 + }, + { + "epoch": 1.87, + "grad_norm": 3.9001943939992163, + "learning_rate": 1.4408277752674897e-06, + "loss": 0.6758, + "step": 12540 + }, + { + "epoch": 1.87, + "grad_norm": 1.607133407347825, + "learning_rate": 1.4407410587508503e-06, + "loss": 0.679, + "step": 12541 + }, + { + "epoch": 1.87, + "grad_norm": 0.8834046526063377, + "learning_rate": 1.4406543381207359e-06, + "loss": 0.6595, + "step": 12542 + }, + { + "epoch": 1.87, + "grad_norm": 1.9857111745590363, + "learning_rate": 1.4405676133779554e-06, + "loss": 0.7025, + "step": 12543 + }, + { + "epoch": 1.87, + "grad_norm": 3.0653242603713333, + "learning_rate": 1.4404808845233185e-06, + "loss": 0.6615, + "step": 12544 + }, + { + "epoch": 1.87, + "grad_norm": 1.736288276301793, + "learning_rate": 1.4403941515576343e-06, + "loss": 0.6686, + "step": 12545 + }, + { + "epoch": 1.87, + "grad_norm": 1.9380361311398617, + "learning_rate": 1.4403074144817127e-06, + "loss": 0.6888, + "step": 12546 + }, + { + "epoch": 1.87, + "grad_norm": 2.9746262561283436, + "learning_rate": 1.4402206732963632e-06, + "loss": 0.6484, + "step": 12547 + }, + { + "epoch": 1.87, + "grad_norm": 2.3166050841700847, + "learning_rate": 1.4401339280023948e-06, + "loss": 0.709, + "step": 12548 + }, + { + "epoch": 1.87, + "grad_norm": 1.0628061988923159, + "learning_rate": 1.440047178600618e-06, + "loss": 0.6784, + "step": 12549 + }, + { + "epoch": 1.87, + "grad_norm": 1.7678560400564869, + "learning_rate": 1.4399604250918418e-06, + "loss": 0.651, + "step": 12550 + }, + { + "epoch": 1.87, + "grad_norm": 3.0139795638065414, + "learning_rate": 1.4398736674768756e-06, + "loss": 0.6699, + "step": 12551 + }, + { + "epoch": 1.87, + "grad_norm": 5.866755954269344, + "learning_rate": 1.4397869057565302e-06, + "loss": 0.6803, + "step": 12552 + }, + { + "epoch": 1.87, + "grad_norm": 1.1029200307477667, + "learning_rate": 1.4397001399316144e-06, + "loss": 0.6771, + "step": 12553 + }, + { + "epoch": 1.87, + "grad_norm": 2.846048764038593, + "learning_rate": 1.4396133700029385e-06, + "loss": 0.6797, + "step": 12554 + }, + { + "epoch": 1.87, + "grad_norm": 0.8940650822515483, + "learning_rate": 1.439526595971312e-06, + "loss": 0.668, + "step": 12555 + }, + { + "epoch": 1.87, + "grad_norm": 1.1804573115115742, + "learning_rate": 1.439439817837545e-06, + "loss": 0.6595, + "step": 12556 + }, + { + "epoch": 1.87, + "grad_norm": 2.0972383743064658, + "learning_rate": 1.4393530356024472e-06, + "loss": 0.6784, + "step": 12557 + }, + { + "epoch": 1.87, + "grad_norm": 1.945125785065361, + "learning_rate": 1.4392662492668285e-06, + "loss": 0.6823, + "step": 12558 + }, + { + "epoch": 1.87, + "grad_norm": 2.395266403452487, + "learning_rate": 1.4391794588314996e-06, + "loss": 0.668, + "step": 12559 + }, + { + "epoch": 1.87, + "grad_norm": 0.8682515601843726, + "learning_rate": 1.4390926642972696e-06, + "loss": 0.6536, + "step": 12560 + }, + { + "epoch": 1.87, + "grad_norm": 1.6883434453818402, + "learning_rate": 1.439005865664949e-06, + "loss": 0.6836, + "step": 12561 + }, + { + "epoch": 1.87, + "grad_norm": 2.6909191445261555, + "learning_rate": 1.4389190629353478e-06, + "loss": 0.6732, + "step": 12562 + }, + { + "epoch": 1.87, + "grad_norm": 1.2360019709216006, + "learning_rate": 1.4388322561092761e-06, + "loss": 0.6354, + "step": 12563 + }, + { + "epoch": 1.87, + "grad_norm": 2.205154622961903, + "learning_rate": 1.438745445187544e-06, + "loss": 0.6628, + "step": 12564 + }, + { + "epoch": 1.87, + "grad_norm": 3.81651945657055, + "learning_rate": 1.4386586301709623e-06, + "loss": 0.6882, + "step": 12565 + }, + { + "epoch": 1.87, + "grad_norm": 0.9702141356636155, + "learning_rate": 1.4385718110603407e-06, + "loss": 0.6589, + "step": 12566 + }, + { + "epoch": 1.87, + "grad_norm": 3.904590547007792, + "learning_rate": 1.4384849878564894e-06, + "loss": 0.6497, + "step": 12567 + }, + { + "epoch": 1.87, + "grad_norm": 1.8173225434463613, + "learning_rate": 1.4383981605602189e-06, + "loss": 0.6602, + "step": 12568 + }, + { + "epoch": 1.87, + "grad_norm": 4.060077097518184, + "learning_rate": 1.4383113291723398e-06, + "loss": 0.6771, + "step": 12569 + }, + { + "epoch": 1.87, + "grad_norm": 3.176950142417727, + "learning_rate": 1.438224493693662e-06, + "loss": 0.6628, + "step": 12570 + }, + { + "epoch": 1.87, + "grad_norm": 2.4030800945773376, + "learning_rate": 1.4381376541249965e-06, + "loss": 0.6426, + "step": 12571 + }, + { + "epoch": 1.88, + "grad_norm": 1.5847654826207236, + "learning_rate": 1.4380508104671534e-06, + "loss": 0.6458, + "step": 12572 + }, + { + "epoch": 1.88, + "grad_norm": 2.3421766916479956, + "learning_rate": 1.4379639627209438e-06, + "loss": 0.6888, + "step": 12573 + }, + { + "epoch": 1.88, + "grad_norm": 1.5767375041503078, + "learning_rate": 1.4378771108871774e-06, + "loss": 0.6628, + "step": 12574 + }, + { + "epoch": 1.88, + "grad_norm": 1.9343908446753237, + "learning_rate": 1.4377902549666652e-06, + "loss": 0.6888, + "step": 12575 + }, + { + "epoch": 1.88, + "grad_norm": 4.894321344675919, + "learning_rate": 1.437703394960218e-06, + "loss": 0.7337, + "step": 12576 + }, + { + "epoch": 1.88, + "grad_norm": 0.93607782331217, + "learning_rate": 1.437616530868646e-06, + "loss": 0.6745, + "step": 12577 + }, + { + "epoch": 1.88, + "grad_norm": 1.98934243922494, + "learning_rate": 1.4375296626927605e-06, + "loss": 0.681, + "step": 12578 + }, + { + "epoch": 1.88, + "grad_norm": 1.9774088427992862, + "learning_rate": 1.437442790433372e-06, + "loss": 0.6862, + "step": 12579 + }, + { + "epoch": 1.88, + "grad_norm": 2.4370661984135573, + "learning_rate": 1.437355914091291e-06, + "loss": 0.6712, + "step": 12580 + }, + { + "epoch": 1.88, + "grad_norm": 1.0741700765178777, + "learning_rate": 1.4372690336673285e-06, + "loss": 0.6686, + "step": 12581 + }, + { + "epoch": 1.88, + "grad_norm": 2.7842791418172035, + "learning_rate": 1.4371821491622959e-06, + "loss": 0.6771, + "step": 12582 + }, + { + "epoch": 1.88, + "grad_norm": 1.2060927857913637, + "learning_rate": 1.4370952605770032e-06, + "loss": 0.6784, + "step": 12583 + }, + { + "epoch": 1.88, + "grad_norm": 1.1447949331256906, + "learning_rate": 1.437008367912262e-06, + "loss": 0.7018, + "step": 12584 + }, + { + "epoch": 1.88, + "grad_norm": 3.4070511315325773, + "learning_rate": 1.436921471168883e-06, + "loss": 0.6549, + "step": 12585 + }, + { + "epoch": 1.88, + "grad_norm": 1.0235124259179167, + "learning_rate": 1.4368345703476772e-06, + "loss": 0.627, + "step": 12586 + }, + { + "epoch": 1.88, + "grad_norm": 5.5648860449565225, + "learning_rate": 1.436747665449456e-06, + "loss": 0.6888, + "step": 12587 + }, + { + "epoch": 1.88, + "grad_norm": 1.4051306201384222, + "learning_rate": 1.43666075647503e-06, + "loss": 0.6823, + "step": 12588 + }, + { + "epoch": 1.88, + "grad_norm": 1.5646017471125926, + "learning_rate": 1.4365738434252106e-06, + "loss": 0.6966, + "step": 12589 + }, + { + "epoch": 1.88, + "grad_norm": 1.1643267284195604, + "learning_rate": 1.4364869263008089e-06, + "loss": 0.6654, + "step": 12590 + }, + { + "epoch": 1.88, + "grad_norm": 1.2573731929949188, + "learning_rate": 1.4364000051026359e-06, + "loss": 0.6816, + "step": 12591 + }, + { + "epoch": 1.88, + "grad_norm": 3.0132044391681387, + "learning_rate": 1.4363130798315033e-06, + "loss": 0.6738, + "step": 12592 + }, + { + "epoch": 1.88, + "grad_norm": 1.432131208925529, + "learning_rate": 1.436226150488222e-06, + "loss": 0.6634, + "step": 12593 + }, + { + "epoch": 1.88, + "grad_norm": 0.9017062960095459, + "learning_rate": 1.4361392170736037e-06, + "loss": 0.6673, + "step": 12594 + }, + { + "epoch": 1.88, + "grad_norm": 0.7588905108065105, + "learning_rate": 1.436052279588459e-06, + "loss": 0.6602, + "step": 12595 + }, + { + "epoch": 1.88, + "grad_norm": 2.8441746950053384, + "learning_rate": 1.4359653380336005e-06, + "loss": 0.6615, + "step": 12596 + }, + { + "epoch": 1.88, + "grad_norm": 1.604944936418334, + "learning_rate": 1.4358783924098386e-06, + "loss": 0.6921, + "step": 12597 + }, + { + "epoch": 1.88, + "grad_norm": 0.8402019010379713, + "learning_rate": 1.435791442717985e-06, + "loss": 0.6589, + "step": 12598 + }, + { + "epoch": 1.88, + "grad_norm": 2.440828039375082, + "learning_rate": 1.4357044889588515e-06, + "loss": 0.6953, + "step": 12599 + }, + { + "epoch": 1.88, + "grad_norm": 0.7937164199949109, + "learning_rate": 1.4356175311332495e-06, + "loss": 0.6647, + "step": 12600 + }, + { + "epoch": 1.88, + "grad_norm": 2.5939882606722144, + "learning_rate": 1.4355305692419904e-06, + "loss": 0.6647, + "step": 12601 + }, + { + "epoch": 1.88, + "grad_norm": 0.9692868769244196, + "learning_rate": 1.435443603285886e-06, + "loss": 0.6823, + "step": 12602 + }, + { + "epoch": 1.88, + "grad_norm": 0.9742249723404245, + "learning_rate": 1.4353566332657483e-06, + "loss": 0.6491, + "step": 12603 + }, + { + "epoch": 1.88, + "grad_norm": 1.167606941999491, + "learning_rate": 1.435269659182388e-06, + "loss": 0.666, + "step": 12604 + }, + { + "epoch": 1.88, + "grad_norm": 0.8530220976872066, + "learning_rate": 1.435182681036618e-06, + "loss": 0.6693, + "step": 12605 + }, + { + "epoch": 1.88, + "grad_norm": 4.0296751222755525, + "learning_rate": 1.4350956988292496e-06, + "loss": 0.6771, + "step": 12606 + }, + { + "epoch": 1.88, + "grad_norm": 1.2764540807141975, + "learning_rate": 1.4350087125610942e-06, + "loss": 0.6562, + "step": 12607 + }, + { + "epoch": 1.88, + "grad_norm": 1.772671837516691, + "learning_rate": 1.4349217222329642e-06, + "loss": 0.666, + "step": 12608 + }, + { + "epoch": 1.88, + "grad_norm": 1.0127849106665296, + "learning_rate": 1.4348347278456712e-06, + "loss": 0.6712, + "step": 12609 + }, + { + "epoch": 1.88, + "grad_norm": 4.492410083852582, + "learning_rate": 1.4347477294000274e-06, + "loss": 0.6764, + "step": 12610 + }, + { + "epoch": 1.88, + "grad_norm": 3.3626487553790483, + "learning_rate": 1.4346607268968445e-06, + "loss": 0.6842, + "step": 12611 + }, + { + "epoch": 1.88, + "grad_norm": 0.8642246906751831, + "learning_rate": 1.4345737203369346e-06, + "loss": 0.638, + "step": 12612 + }, + { + "epoch": 1.88, + "grad_norm": 4.490397738364105, + "learning_rate": 1.4344867097211098e-06, + "loss": 0.6927, + "step": 12613 + }, + { + "epoch": 1.88, + "grad_norm": 7.546005722286114, + "learning_rate": 1.4343996950501819e-06, + "loss": 0.6562, + "step": 12614 + }, + { + "epoch": 1.88, + "grad_norm": 2.368452050983621, + "learning_rate": 1.4343126763249633e-06, + "loss": 0.6589, + "step": 12615 + }, + { + "epoch": 1.88, + "grad_norm": 1.8337737934513196, + "learning_rate": 1.4342256535462665e-06, + "loss": 0.6947, + "step": 12616 + }, + { + "epoch": 1.88, + "grad_norm": 2.1324444992368994, + "learning_rate": 1.4341386267149025e-06, + "loss": 0.7018, + "step": 12617 + }, + { + "epoch": 1.88, + "grad_norm": 3.007331160080461, + "learning_rate": 1.434051595831685e-06, + "loss": 0.6549, + "step": 12618 + }, + { + "epoch": 1.88, + "grad_norm": 2.0228052731166524, + "learning_rate": 1.4339645608974254e-06, + "loss": 0.6621, + "step": 12619 + }, + { + "epoch": 1.88, + "grad_norm": 2.1896158957821275, + "learning_rate": 1.4338775219129357e-06, + "loss": 0.6777, + "step": 12620 + }, + { + "epoch": 1.88, + "grad_norm": 1.9354930720411854, + "learning_rate": 1.4337904788790293e-06, + "loss": 0.6673, + "step": 12621 + }, + { + "epoch": 1.88, + "grad_norm": 2.743948656475399, + "learning_rate": 1.433703431796518e-06, + "loss": 0.6576, + "step": 12622 + }, + { + "epoch": 1.88, + "grad_norm": 4.624733412396031, + "learning_rate": 1.433616380666214e-06, + "loss": 0.6934, + "step": 12623 + }, + { + "epoch": 1.88, + "grad_norm": 4.092091295383152, + "learning_rate": 1.4335293254889297e-06, + "loss": 0.6862, + "step": 12624 + }, + { + "epoch": 1.88, + "grad_norm": 3.038065145504709, + "learning_rate": 1.4334422662654785e-06, + "loss": 0.6816, + "step": 12625 + }, + { + "epoch": 1.88, + "grad_norm": 1.8980795838497362, + "learning_rate": 1.4333552029966717e-06, + "loss": 0.6921, + "step": 12626 + }, + { + "epoch": 1.88, + "grad_norm": 2.3583505078417613, + "learning_rate": 1.4332681356833227e-06, + "loss": 0.6823, + "step": 12627 + }, + { + "epoch": 1.88, + "grad_norm": 5.942289375741922, + "learning_rate": 1.4331810643262439e-06, + "loss": 0.6647, + "step": 12628 + }, + { + "epoch": 1.88, + "grad_norm": 1.0047696772232269, + "learning_rate": 1.433093988926248e-06, + "loss": 0.6784, + "step": 12629 + }, + { + "epoch": 1.88, + "grad_norm": 3.520065852133935, + "learning_rate": 1.4330069094841474e-06, + "loss": 0.7109, + "step": 12630 + }, + { + "epoch": 1.88, + "grad_norm": 0.9226486952216164, + "learning_rate": 1.4329198260007551e-06, + "loss": 0.6621, + "step": 12631 + }, + { + "epoch": 1.88, + "grad_norm": 2.1898386172065347, + "learning_rate": 1.4328327384768838e-06, + "loss": 0.6816, + "step": 12632 + }, + { + "epoch": 1.88, + "grad_norm": 1.4192513130506603, + "learning_rate": 1.4327456469133464e-06, + "loss": 0.7018, + "step": 12633 + }, + { + "epoch": 1.88, + "grad_norm": 3.460812550146282, + "learning_rate": 1.4326585513109555e-06, + "loss": 0.679, + "step": 12634 + }, + { + "epoch": 1.88, + "grad_norm": 1.9624934202149165, + "learning_rate": 1.4325714516705241e-06, + "loss": 0.6829, + "step": 12635 + }, + { + "epoch": 1.88, + "grad_norm": 2.230577221953142, + "learning_rate": 1.4324843479928648e-06, + "loss": 0.6888, + "step": 12636 + }, + { + "epoch": 1.88, + "grad_norm": 1.5077756016926258, + "learning_rate": 1.4323972402787911e-06, + "loss": 0.6986, + "step": 12637 + }, + { + "epoch": 1.88, + "grad_norm": 1.4105561433903921, + "learning_rate": 1.4323101285291158e-06, + "loss": 0.6849, + "step": 12638 + }, + { + "epoch": 1.89, + "grad_norm": 2.614899394375778, + "learning_rate": 1.432223012744652e-06, + "loss": 0.6536, + "step": 12639 + }, + { + "epoch": 1.89, + "grad_norm": 3.1884186835246617, + "learning_rate": 1.432135892926212e-06, + "loss": 0.6836, + "step": 12640 + }, + { + "epoch": 1.89, + "grad_norm": 0.7217469712589699, + "learning_rate": 1.4320487690746096e-06, + "loss": 0.6647, + "step": 12641 + }, + { + "epoch": 1.89, + "grad_norm": 0.7215483421138434, + "learning_rate": 1.4319616411906585e-06, + "loss": 0.6803, + "step": 12642 + }, + { + "epoch": 1.89, + "grad_norm": 2.81429302586655, + "learning_rate": 1.4318745092751705e-06, + "loss": 0.6634, + "step": 12643 + }, + { + "epoch": 1.89, + "grad_norm": 1.6481250316517788, + "learning_rate": 1.43178737332896e-06, + "loss": 0.6686, + "step": 12644 + }, + { + "epoch": 1.89, + "grad_norm": 0.7182377105832082, + "learning_rate": 1.4317002333528397e-06, + "loss": 0.6758, + "step": 12645 + }, + { + "epoch": 1.89, + "grad_norm": 1.1984175059368667, + "learning_rate": 1.4316130893476227e-06, + "loss": 0.6693, + "step": 12646 + }, + { + "epoch": 1.89, + "grad_norm": 2.6966179017134224, + "learning_rate": 1.4315259413141225e-06, + "loss": 0.707, + "step": 12647 + }, + { + "epoch": 1.89, + "grad_norm": 1.145689089751425, + "learning_rate": 1.4314387892531525e-06, + "loss": 0.6992, + "step": 12648 + }, + { + "epoch": 1.89, + "grad_norm": 1.326709459687918, + "learning_rate": 1.4313516331655265e-06, + "loss": 0.666, + "step": 12649 + }, + { + "epoch": 1.89, + "grad_norm": 2.62562079422434, + "learning_rate": 1.4312644730520573e-06, + "loss": 0.6797, + "step": 12650 + }, + { + "epoch": 1.89, + "grad_norm": 1.9027058495265996, + "learning_rate": 1.4311773089135585e-06, + "loss": 0.6667, + "step": 12651 + }, + { + "epoch": 1.89, + "grad_norm": 1.1479355794465336, + "learning_rate": 1.431090140750844e-06, + "loss": 0.6595, + "step": 12652 + }, + { + "epoch": 1.89, + "grad_norm": 2.5330969181385963, + "learning_rate": 1.4310029685647274e-06, + "loss": 0.681, + "step": 12653 + }, + { + "epoch": 1.89, + "grad_norm": 2.7669010644112366, + "learning_rate": 1.4309157923560213e-06, + "loss": 0.6797, + "step": 12654 + }, + { + "epoch": 1.89, + "grad_norm": 2.6802848507932517, + "learning_rate": 1.4308286121255406e-06, + "loss": 0.6484, + "step": 12655 + }, + { + "epoch": 1.89, + "grad_norm": 1.5289490910784596, + "learning_rate": 1.430741427874098e-06, + "loss": 0.6751, + "step": 12656 + }, + { + "epoch": 1.89, + "grad_norm": 0.7992649854487311, + "learning_rate": 1.4306542396025075e-06, + "loss": 0.6836, + "step": 12657 + }, + { + "epoch": 1.89, + "grad_norm": 1.078577453545664, + "learning_rate": 1.430567047311583e-06, + "loss": 0.6725, + "step": 12658 + }, + { + "epoch": 1.89, + "grad_norm": 1.2384106113655997, + "learning_rate": 1.4304798510021383e-06, + "loss": 0.6615, + "step": 12659 + }, + { + "epoch": 1.89, + "grad_norm": 2.4360712614299245, + "learning_rate": 1.4303926506749871e-06, + "loss": 0.6712, + "step": 12660 + }, + { + "epoch": 1.89, + "grad_norm": 3.4335587629153648, + "learning_rate": 1.430305446330943e-06, + "loss": 0.6615, + "step": 12661 + }, + { + "epoch": 1.89, + "grad_norm": 2.371377673763259, + "learning_rate": 1.4302182379708203e-06, + "loss": 0.6777, + "step": 12662 + }, + { + "epoch": 1.89, + "grad_norm": 1.9957099285368467, + "learning_rate": 1.4301310255954327e-06, + "loss": 0.681, + "step": 12663 + }, + { + "epoch": 1.89, + "grad_norm": 1.7356259628415585, + "learning_rate": 1.430043809205594e-06, + "loss": 0.6641, + "step": 12664 + }, + { + "epoch": 1.89, + "grad_norm": 1.5034781144660627, + "learning_rate": 1.4299565888021185e-06, + "loss": 0.6777, + "step": 12665 + }, + { + "epoch": 1.89, + "grad_norm": 2.6702859724737253, + "learning_rate": 1.4298693643858204e-06, + "loss": 0.6602, + "step": 12666 + }, + { + "epoch": 1.89, + "grad_norm": 0.9413133512991246, + "learning_rate": 1.429782135957513e-06, + "loss": 0.668, + "step": 12667 + }, + { + "epoch": 1.89, + "grad_norm": 0.8643791679577477, + "learning_rate": 1.4296949035180116e-06, + "loss": 0.6699, + "step": 12668 + }, + { + "epoch": 1.89, + "grad_norm": 1.1807945880777444, + "learning_rate": 1.4296076670681292e-06, + "loss": 0.6992, + "step": 12669 + }, + { + "epoch": 1.89, + "grad_norm": 3.1899190545259675, + "learning_rate": 1.4295204266086804e-06, + "loss": 0.6589, + "step": 12670 + }, + { + "epoch": 1.89, + "grad_norm": 2.710313190566166, + "learning_rate": 1.4294331821404797e-06, + "loss": 0.6758, + "step": 12671 + }, + { + "epoch": 1.89, + "grad_norm": 1.0341826571891455, + "learning_rate": 1.429345933664341e-06, + "loss": 0.6693, + "step": 12672 + }, + { + "epoch": 1.89, + "grad_norm": 1.2619013184730898, + "learning_rate": 1.4292586811810787e-06, + "loss": 0.6628, + "step": 12673 + }, + { + "epoch": 1.89, + "grad_norm": 4.822532571369567, + "learning_rate": 1.4291714246915074e-06, + "loss": 0.6849, + "step": 12674 + }, + { + "epoch": 1.89, + "grad_norm": 1.0143199551857882, + "learning_rate": 1.429084164196441e-06, + "loss": 0.6335, + "step": 12675 + }, + { + "epoch": 1.89, + "grad_norm": 2.5900012744377325, + "learning_rate": 1.4289968996966944e-06, + "loss": 0.6602, + "step": 12676 + }, + { + "epoch": 1.89, + "grad_norm": 3.3742022269836434, + "learning_rate": 1.4289096311930815e-06, + "loss": 0.6178, + "step": 12677 + }, + { + "epoch": 1.89, + "grad_norm": 1.9924404672782514, + "learning_rate": 1.4288223586864173e-06, + "loss": 0.6875, + "step": 12678 + }, + { + "epoch": 1.89, + "grad_norm": 1.4241098032902069, + "learning_rate": 1.4287350821775158e-06, + "loss": 0.651, + "step": 12679 + }, + { + "epoch": 1.89, + "grad_norm": 3.609135397296484, + "learning_rate": 1.4286478016671922e-06, + "loss": 0.696, + "step": 12680 + }, + { + "epoch": 1.89, + "grad_norm": 1.6304452040659572, + "learning_rate": 1.4285605171562606e-06, + "loss": 0.6888, + "step": 12681 + }, + { + "epoch": 1.89, + "grad_norm": 1.1566376205986806, + "learning_rate": 1.428473228645536e-06, + "loss": 0.6576, + "step": 12682 + }, + { + "epoch": 1.89, + "grad_norm": 1.9673779351331424, + "learning_rate": 1.4283859361358323e-06, + "loss": 0.6986, + "step": 12683 + }, + { + "epoch": 1.89, + "grad_norm": 1.0890378937705893, + "learning_rate": 1.4282986396279654e-06, + "loss": 0.6465, + "step": 12684 + }, + { + "epoch": 1.89, + "grad_norm": 1.478699758828585, + "learning_rate": 1.4282113391227494e-06, + "loss": 0.6732, + "step": 12685 + }, + { + "epoch": 1.89, + "grad_norm": 1.2125562285909288, + "learning_rate": 1.4281240346209987e-06, + "loss": 0.6784, + "step": 12686 + }, + { + "epoch": 1.89, + "grad_norm": 1.2774046139894424, + "learning_rate": 1.4280367261235287e-06, + "loss": 0.6745, + "step": 12687 + }, + { + "epoch": 1.89, + "grad_norm": 1.1867956319906159, + "learning_rate": 1.427949413631154e-06, + "loss": 0.7005, + "step": 12688 + }, + { + "epoch": 1.89, + "grad_norm": 5.965173704914945, + "learning_rate": 1.42786209714469e-06, + "loss": 0.6699, + "step": 12689 + }, + { + "epoch": 1.89, + "grad_norm": 1.6019287004848963, + "learning_rate": 1.4277747766649506e-06, + "loss": 0.6829, + "step": 12690 + }, + { + "epoch": 1.89, + "grad_norm": 1.702922387550265, + "learning_rate": 1.4276874521927518e-06, + "loss": 0.6849, + "step": 12691 + }, + { + "epoch": 1.89, + "grad_norm": 1.8924461620052029, + "learning_rate": 1.427600123728908e-06, + "loss": 0.6641, + "step": 12692 + }, + { + "epoch": 1.89, + "grad_norm": 1.101597986356412, + "learning_rate": 1.4275127912742343e-06, + "loss": 0.6667, + "step": 12693 + }, + { + "epoch": 1.89, + "grad_norm": 2.473736188570242, + "learning_rate": 1.4274254548295463e-06, + "loss": 0.6517, + "step": 12694 + }, + { + "epoch": 1.89, + "grad_norm": 1.990996177310839, + "learning_rate": 1.4273381143956585e-06, + "loss": 0.666, + "step": 12695 + }, + { + "epoch": 1.89, + "grad_norm": 1.9214614387080842, + "learning_rate": 1.4272507699733862e-06, + "loss": 0.666, + "step": 12696 + }, + { + "epoch": 1.89, + "grad_norm": 5.4693210677234, + "learning_rate": 1.4271634215635448e-06, + "loss": 0.6777, + "step": 12697 + }, + { + "epoch": 1.89, + "grad_norm": 1.809803772218447, + "learning_rate": 1.4270760691669495e-06, + "loss": 0.6641, + "step": 12698 + }, + { + "epoch": 1.89, + "grad_norm": 2.256121715189284, + "learning_rate": 1.4269887127844152e-06, + "loss": 0.6758, + "step": 12699 + }, + { + "epoch": 1.89, + "grad_norm": 5.018544180879667, + "learning_rate": 1.4269013524167579e-06, + "loss": 0.7161, + "step": 12700 + }, + { + "epoch": 1.89, + "grad_norm": 3.283227748356394, + "learning_rate": 1.4268139880647923e-06, + "loss": 0.7051, + "step": 12701 + }, + { + "epoch": 1.89, + "grad_norm": 1.5524200477270582, + "learning_rate": 1.4267266197293336e-06, + "loss": 0.6634, + "step": 12702 + }, + { + "epoch": 1.89, + "grad_norm": 2.380626646596437, + "learning_rate": 1.4266392474111983e-06, + "loss": 0.6654, + "step": 12703 + }, + { + "epoch": 1.89, + "grad_norm": 1.0402069140252834, + "learning_rate": 1.4265518711112009e-06, + "loss": 0.6589, + "step": 12704 + }, + { + "epoch": 1.89, + "grad_norm": 6.067516708503343, + "learning_rate": 1.426464490830157e-06, + "loss": 0.6882, + "step": 12705 + }, + { + "epoch": 1.9, + "grad_norm": 2.212501726235734, + "learning_rate": 1.4263771065688825e-06, + "loss": 0.6504, + "step": 12706 + }, + { + "epoch": 1.9, + "grad_norm": 0.9428620711370828, + "learning_rate": 1.4262897183281926e-06, + "loss": 0.6738, + "step": 12707 + }, + { + "epoch": 1.9, + "grad_norm": 0.8820354869494984, + "learning_rate": 1.4262023261089034e-06, + "loss": 0.6855, + "step": 12708 + }, + { + "epoch": 1.9, + "grad_norm": 4.308027430023218, + "learning_rate": 1.4261149299118297e-06, + "loss": 0.6751, + "step": 12709 + }, + { + "epoch": 1.9, + "grad_norm": 3.8669784950856747, + "learning_rate": 1.4260275297377881e-06, + "loss": 0.6953, + "step": 12710 + }, + { + "epoch": 1.9, + "grad_norm": 3.4459517815961083, + "learning_rate": 1.425940125587594e-06, + "loss": 0.6615, + "step": 12711 + }, + { + "epoch": 1.9, + "grad_norm": 2.6792662476479117, + "learning_rate": 1.4258527174620625e-06, + "loss": 0.6882, + "step": 12712 + }, + { + "epoch": 1.9, + "grad_norm": 3.1831735637288827, + "learning_rate": 1.42576530536201e-06, + "loss": 0.6699, + "step": 12713 + }, + { + "epoch": 1.9, + "grad_norm": 2.6387269783345153, + "learning_rate": 1.4256778892882524e-06, + "loss": 0.6634, + "step": 12714 + }, + { + "epoch": 1.9, + "grad_norm": 4.018773192636667, + "learning_rate": 1.4255904692416056e-06, + "loss": 0.7018, + "step": 12715 + }, + { + "epoch": 1.9, + "grad_norm": 8.451516978362141, + "learning_rate": 1.4255030452228852e-06, + "loss": 0.7051, + "step": 12716 + }, + { + "epoch": 1.9, + "grad_norm": 1.3907951904895464, + "learning_rate": 1.4254156172329071e-06, + "loss": 0.666, + "step": 12717 + }, + { + "epoch": 1.9, + "grad_norm": 0.7791976449715916, + "learning_rate": 1.4253281852724875e-06, + "loss": 0.6673, + "step": 12718 + }, + { + "epoch": 1.9, + "grad_norm": 1.5997027665253245, + "learning_rate": 1.4252407493424423e-06, + "loss": 0.6816, + "step": 12719 + }, + { + "epoch": 1.9, + "grad_norm": 2.9879839465390745, + "learning_rate": 1.4251533094435876e-06, + "loss": 0.6549, + "step": 12720 + }, + { + "epoch": 1.9, + "grad_norm": 1.1356776321615174, + "learning_rate": 1.4250658655767397e-06, + "loss": 0.6693, + "step": 12721 + }, + { + "epoch": 1.9, + "grad_norm": 1.6647905830394203, + "learning_rate": 1.4249784177427143e-06, + "loss": 0.6921, + "step": 12722 + }, + { + "epoch": 1.9, + "grad_norm": 0.8150325827735764, + "learning_rate": 1.4248909659423276e-06, + "loss": 0.679, + "step": 12723 + }, + { + "epoch": 1.9, + "grad_norm": 2.140826224035399, + "learning_rate": 1.4248035101763962e-06, + "loss": 0.6719, + "step": 12724 + }, + { + "epoch": 1.9, + "grad_norm": 2.409775501761359, + "learning_rate": 1.4247160504457361e-06, + "loss": 0.709, + "step": 12725 + }, + { + "epoch": 1.9, + "grad_norm": 5.613073084397558, + "learning_rate": 1.4246285867511635e-06, + "loss": 0.6647, + "step": 12726 + }, + { + "epoch": 1.9, + "grad_norm": 1.3553974144954153, + "learning_rate": 1.4245411190934945e-06, + "loss": 0.6562, + "step": 12727 + }, + { + "epoch": 1.9, + "grad_norm": 1.4875458525697474, + "learning_rate": 1.4244536474735458e-06, + "loss": 0.6738, + "step": 12728 + }, + { + "epoch": 1.9, + "grad_norm": 2.4608520277306667, + "learning_rate": 1.4243661718921337e-06, + "loss": 0.679, + "step": 12729 + }, + { + "epoch": 1.9, + "grad_norm": 2.068085797169004, + "learning_rate": 1.4242786923500745e-06, + "loss": 0.666, + "step": 12730 + }, + { + "epoch": 1.9, + "grad_norm": 2.898976796491099, + "learning_rate": 1.4241912088481848e-06, + "loss": 0.7051, + "step": 12731 + }, + { + "epoch": 1.9, + "grad_norm": 3.5487879773036877, + "learning_rate": 1.4241037213872811e-06, + "loss": 0.6641, + "step": 12732 + }, + { + "epoch": 1.9, + "grad_norm": 1.9555766755679003, + "learning_rate": 1.4240162299681798e-06, + "loss": 0.6719, + "step": 12733 + }, + { + "epoch": 1.9, + "grad_norm": 3.2734963962361614, + "learning_rate": 1.4239287345916977e-06, + "loss": 0.694, + "step": 12734 + }, + { + "epoch": 1.9, + "grad_norm": 0.8083589455554778, + "learning_rate": 1.423841235258651e-06, + "loss": 0.6895, + "step": 12735 + }, + { + "epoch": 1.9, + "grad_norm": 0.9387652395665186, + "learning_rate": 1.4237537319698568e-06, + "loss": 0.6784, + "step": 12736 + }, + { + "epoch": 1.9, + "grad_norm": 1.6903170439586284, + "learning_rate": 1.4236662247261314e-06, + "loss": 0.6868, + "step": 12737 + }, + { + "epoch": 1.9, + "grad_norm": 0.7826734340825222, + "learning_rate": 1.4235787135282915e-06, + "loss": 0.679, + "step": 12738 + }, + { + "epoch": 1.9, + "grad_norm": 2.241993478189037, + "learning_rate": 1.423491198377154e-06, + "loss": 0.6738, + "step": 12739 + }, + { + "epoch": 1.9, + "grad_norm": 1.5986956771191057, + "learning_rate": 1.423403679273536e-06, + "loss": 0.6576, + "step": 12740 + }, + { + "epoch": 1.9, + "grad_norm": 0.7526370645555619, + "learning_rate": 1.423316156218254e-06, + "loss": 0.6738, + "step": 12741 + }, + { + "epoch": 1.9, + "grad_norm": 3.936583332830843, + "learning_rate": 1.4232286292121248e-06, + "loss": 0.6973, + "step": 12742 + }, + { + "epoch": 1.9, + "grad_norm": 0.764801304978914, + "learning_rate": 1.4231410982559653e-06, + "loss": 0.6901, + "step": 12743 + }, + { + "epoch": 1.9, + "grad_norm": 3.8252619747658465, + "learning_rate": 1.4230535633505924e-06, + "loss": 0.6797, + "step": 12744 + }, + { + "epoch": 1.9, + "grad_norm": 3.6664565348784355, + "learning_rate": 1.4229660244968233e-06, + "loss": 0.6693, + "step": 12745 + }, + { + "epoch": 1.9, + "grad_norm": 2.543722878153807, + "learning_rate": 1.4228784816954748e-06, + "loss": 0.6836, + "step": 12746 + }, + { + "epoch": 1.9, + "grad_norm": 3.263278176007791, + "learning_rate": 1.4227909349473643e-06, + "loss": 0.6751, + "step": 12747 + }, + { + "epoch": 1.9, + "grad_norm": 2.6953176981659257, + "learning_rate": 1.4227033842533085e-06, + "loss": 0.6875, + "step": 12748 + }, + { + "epoch": 1.9, + "grad_norm": 0.9460256808336505, + "learning_rate": 1.4226158296141244e-06, + "loss": 0.6888, + "step": 12749 + }, + { + "epoch": 1.9, + "grad_norm": 4.248479467949668, + "learning_rate": 1.4225282710306297e-06, + "loss": 0.6569, + "step": 12750 + }, + { + "epoch": 1.9, + "grad_norm": 3.926793026965161, + "learning_rate": 1.4224407085036413e-06, + "loss": 0.6914, + "step": 12751 + }, + { + "epoch": 1.9, + "grad_norm": 2.1135652796665, + "learning_rate": 1.422353142033976e-06, + "loss": 0.6693, + "step": 12752 + }, + { + "epoch": 1.9, + "grad_norm": 3.285448919385564, + "learning_rate": 1.4222655716224518e-06, + "loss": 0.6712, + "step": 12753 + }, + { + "epoch": 1.9, + "grad_norm": 2.5140820492280502, + "learning_rate": 1.4221779972698857e-06, + "loss": 0.6536, + "step": 12754 + }, + { + "epoch": 1.9, + "grad_norm": 0.699768815261376, + "learning_rate": 1.422090418977095e-06, + "loss": 0.6712, + "step": 12755 + }, + { + "epoch": 1.9, + "grad_norm": 2.0929759798951935, + "learning_rate": 1.422002836744897e-06, + "loss": 0.666, + "step": 12756 + }, + { + "epoch": 1.9, + "grad_norm": 5.880090106128239, + "learning_rate": 1.4219152505741091e-06, + "loss": 0.6823, + "step": 12757 + }, + { + "epoch": 1.9, + "grad_norm": 2.4202476048058577, + "learning_rate": 1.4218276604655493e-06, + "loss": 0.6745, + "step": 12758 + }, + { + "epoch": 1.9, + "grad_norm": 6.0021838187678975, + "learning_rate": 1.421740066420034e-06, + "loss": 0.6465, + "step": 12759 + }, + { + "epoch": 1.9, + "grad_norm": 2.220890857200297, + "learning_rate": 1.421652468438382e-06, + "loss": 0.6608, + "step": 12760 + }, + { + "epoch": 1.9, + "grad_norm": 1.4320712525761068, + "learning_rate": 1.4215648665214099e-06, + "loss": 0.6816, + "step": 12761 + }, + { + "epoch": 1.9, + "grad_norm": 1.4441200143845128, + "learning_rate": 1.4214772606699355e-06, + "loss": 0.6771, + "step": 12762 + }, + { + "epoch": 1.9, + "grad_norm": 1.2074169609215877, + "learning_rate": 1.4213896508847767e-06, + "loss": 0.6797, + "step": 12763 + }, + { + "epoch": 1.9, + "grad_norm": 3.0639142496986387, + "learning_rate": 1.421302037166751e-06, + "loss": 0.668, + "step": 12764 + }, + { + "epoch": 1.9, + "grad_norm": 2.7634009576313927, + "learning_rate": 1.421214419516676e-06, + "loss": 0.6751, + "step": 12765 + }, + { + "epoch": 1.9, + "grad_norm": 1.357626045206837, + "learning_rate": 1.4211267979353697e-06, + "loss": 0.6634, + "step": 12766 + }, + { + "epoch": 1.9, + "grad_norm": 3.2535089010633653, + "learning_rate": 1.42103917242365e-06, + "loss": 0.6745, + "step": 12767 + }, + { + "epoch": 1.9, + "grad_norm": 2.551705141482578, + "learning_rate": 1.420951542982334e-06, + "loss": 0.6647, + "step": 12768 + }, + { + "epoch": 1.9, + "grad_norm": 5.9693119594959, + "learning_rate": 1.4208639096122404e-06, + "loss": 0.6406, + "step": 12769 + }, + { + "epoch": 1.9, + "grad_norm": 3.489743734972277, + "learning_rate": 1.4207762723141862e-06, + "loss": 0.6406, + "step": 12770 + }, + { + "epoch": 1.9, + "grad_norm": 2.315395606411171, + "learning_rate": 1.4206886310889907e-06, + "loss": 0.7331, + "step": 12771 + }, + { + "epoch": 1.9, + "grad_norm": 3.513914468461589, + "learning_rate": 1.4206009859374703e-06, + "loss": 0.6406, + "step": 12772 + }, + { + "epoch": 1.9, + "grad_norm": 1.0243000108802602, + "learning_rate": 1.420513336860444e-06, + "loss": 0.6569, + "step": 12773 + }, + { + "epoch": 1.91, + "grad_norm": 1.6905272811814687, + "learning_rate": 1.4204256838587295e-06, + "loss": 0.6647, + "step": 12774 + }, + { + "epoch": 1.91, + "grad_norm": 3.148245886143196, + "learning_rate": 1.4203380269331447e-06, + "loss": 0.653, + "step": 12775 + }, + { + "epoch": 1.91, + "grad_norm": 3.938812601227677, + "learning_rate": 1.4202503660845082e-06, + "loss": 0.6556, + "step": 12776 + }, + { + "epoch": 1.91, + "grad_norm": 3.0432524661689, + "learning_rate": 1.4201627013136378e-06, + "loss": 0.6803, + "step": 12777 + }, + { + "epoch": 1.91, + "grad_norm": 1.3317426626234679, + "learning_rate": 1.4200750326213516e-06, + "loss": 0.6471, + "step": 12778 + }, + { + "epoch": 1.91, + "grad_norm": 1.1866035732723301, + "learning_rate": 1.4199873600084683e-06, + "loss": 0.6784, + "step": 12779 + }, + { + "epoch": 1.91, + "grad_norm": 2.6930581728903316, + "learning_rate": 1.4198996834758053e-06, + "loss": 0.6836, + "step": 12780 + }, + { + "epoch": 1.91, + "grad_norm": 1.040030080700013, + "learning_rate": 1.4198120030241817e-06, + "loss": 0.6914, + "step": 12781 + }, + { + "epoch": 1.91, + "grad_norm": 1.2775139468735266, + "learning_rate": 1.4197243186544156e-06, + "loss": 0.6335, + "step": 12782 + }, + { + "epoch": 1.91, + "grad_norm": 2.0237889647815557, + "learning_rate": 1.4196366303673252e-06, + "loss": 0.6576, + "step": 12783 + }, + { + "epoch": 1.91, + "grad_norm": 6.122202798556475, + "learning_rate": 1.4195489381637291e-06, + "loss": 0.6536, + "step": 12784 + }, + { + "epoch": 1.91, + "grad_norm": 3.6235425374943415, + "learning_rate": 1.4194612420444459e-06, + "loss": 0.6615, + "step": 12785 + }, + { + "epoch": 1.91, + "grad_norm": 2.223915184201507, + "learning_rate": 1.4193735420102932e-06, + "loss": 0.6745, + "step": 12786 + }, + { + "epoch": 1.91, + "grad_norm": 4.7784223683139535, + "learning_rate": 1.4192858380620909e-06, + "loss": 0.6283, + "step": 12787 + }, + { + "epoch": 1.91, + "grad_norm": 4.005670763683182, + "learning_rate": 1.4191981302006563e-06, + "loss": 0.6849, + "step": 12788 + }, + { + "epoch": 1.91, + "grad_norm": 1.036029416424164, + "learning_rate": 1.4191104184268087e-06, + "loss": 0.6543, + "step": 12789 + }, + { + "epoch": 1.91, + "grad_norm": 4.797979299755235, + "learning_rate": 1.4190227027413663e-06, + "loss": 0.7188, + "step": 12790 + }, + { + "epoch": 1.91, + "grad_norm": 1.3210712541555576, + "learning_rate": 1.418934983145148e-06, + "loss": 0.6413, + "step": 12791 + }, + { + "epoch": 1.91, + "grad_norm": 1.6966839911345606, + "learning_rate": 1.4188472596389727e-06, + "loss": 0.7044, + "step": 12792 + }, + { + "epoch": 1.91, + "grad_norm": 2.223707150222009, + "learning_rate": 1.4187595322236584e-06, + "loss": 0.6641, + "step": 12793 + }, + { + "epoch": 1.91, + "grad_norm": 5.003132416703462, + "learning_rate": 1.4186718009000249e-06, + "loss": 0.6829, + "step": 12794 + }, + { + "epoch": 1.91, + "grad_norm": 4.100443520206562, + "learning_rate": 1.4185840656688902e-06, + "loss": 0.6829, + "step": 12795 + }, + { + "epoch": 1.91, + "grad_norm": 4.122683266659331, + "learning_rate": 1.4184963265310734e-06, + "loss": 0.6465, + "step": 12796 + }, + { + "epoch": 1.91, + "grad_norm": 2.1329489000970607, + "learning_rate": 1.4184085834873933e-06, + "loss": 0.6921, + "step": 12797 + }, + { + "epoch": 1.91, + "grad_norm": 1.2130310509818363, + "learning_rate": 1.4183208365386691e-06, + "loss": 0.6536, + "step": 12798 + }, + { + "epoch": 1.91, + "grad_norm": 5.526173007930855, + "learning_rate": 1.4182330856857192e-06, + "loss": 0.6758, + "step": 12799 + }, + { + "epoch": 1.91, + "grad_norm": 3.0862990799623717, + "learning_rate": 1.4181453309293634e-06, + "loss": 0.6829, + "step": 12800 + }, + { + "epoch": 1.91, + "grad_norm": 1.3588527404862987, + "learning_rate": 1.41805757227042e-06, + "loss": 0.6725, + "step": 12801 + }, + { + "epoch": 1.91, + "grad_norm": 3.1053358021847703, + "learning_rate": 1.4179698097097082e-06, + "loss": 0.6725, + "step": 12802 + }, + { + "epoch": 1.91, + "grad_norm": 2.4249152663141573, + "learning_rate": 1.4178820432480474e-06, + "loss": 0.6914, + "step": 12803 + }, + { + "epoch": 1.91, + "grad_norm": 1.2123743402477163, + "learning_rate": 1.4177942728862565e-06, + "loss": 0.681, + "step": 12804 + }, + { + "epoch": 1.91, + "grad_norm": 2.2578381340597096, + "learning_rate": 1.4177064986251544e-06, + "loss": 0.6725, + "step": 12805 + }, + { + "epoch": 1.91, + "grad_norm": 3.075482908936755, + "learning_rate": 1.417618720465561e-06, + "loss": 0.7057, + "step": 12806 + }, + { + "epoch": 1.91, + "grad_norm": 1.0314485990593532, + "learning_rate": 1.417530938408295e-06, + "loss": 0.6803, + "step": 12807 + }, + { + "epoch": 1.91, + "grad_norm": 2.598870063716184, + "learning_rate": 1.417443152454176e-06, + "loss": 0.653, + "step": 12808 + }, + { + "epoch": 1.91, + "grad_norm": 1.6529315316335704, + "learning_rate": 1.4173553626040226e-06, + "loss": 0.6634, + "step": 12809 + }, + { + "epoch": 1.91, + "grad_norm": 3.5055810502735896, + "learning_rate": 1.4172675688586551e-06, + "loss": 0.6836, + "step": 12810 + }, + { + "epoch": 1.91, + "grad_norm": 2.6123977152678988, + "learning_rate": 1.4171797712188927e-06, + "loss": 0.6582, + "step": 12811 + }, + { + "epoch": 1.91, + "grad_norm": 4.132333056721575, + "learning_rate": 1.417091969685554e-06, + "loss": 0.6842, + "step": 12812 + }, + { + "epoch": 1.91, + "grad_norm": 1.0028698697017726, + "learning_rate": 1.4170041642594595e-06, + "loss": 0.6497, + "step": 12813 + }, + { + "epoch": 1.91, + "grad_norm": 1.4729649558904385, + "learning_rate": 1.4169163549414282e-06, + "loss": 0.6816, + "step": 12814 + }, + { + "epoch": 1.91, + "grad_norm": 5.3770967237542, + "learning_rate": 1.4168285417322797e-06, + "loss": 0.6901, + "step": 12815 + }, + { + "epoch": 1.91, + "grad_norm": 0.8332202824314414, + "learning_rate": 1.4167407246328334e-06, + "loss": 0.6647, + "step": 12816 + }, + { + "epoch": 1.91, + "grad_norm": 1.5531462931256732, + "learning_rate": 1.4166529036439092e-06, + "loss": 0.6582, + "step": 12817 + }, + { + "epoch": 1.91, + "grad_norm": 0.9643552618447554, + "learning_rate": 1.4165650787663265e-06, + "loss": 0.6738, + "step": 12818 + }, + { + "epoch": 1.91, + "grad_norm": 2.0278036777324715, + "learning_rate": 1.4164772500009052e-06, + "loss": 0.6738, + "step": 12819 + }, + { + "epoch": 1.91, + "grad_norm": 3.3128279954682527, + "learning_rate": 1.4163894173484646e-06, + "loss": 0.6829, + "step": 12820 + }, + { + "epoch": 1.91, + "grad_norm": 2.3340747681425427, + "learning_rate": 1.4163015808098252e-06, + "loss": 0.6582, + "step": 12821 + }, + { + "epoch": 1.91, + "grad_norm": 1.1719114589162707, + "learning_rate": 1.416213740385806e-06, + "loss": 0.6745, + "step": 12822 + }, + { + "epoch": 1.91, + "grad_norm": 1.1174859070991194, + "learning_rate": 1.4161258960772272e-06, + "loss": 0.6654, + "step": 12823 + }, + { + "epoch": 1.91, + "grad_norm": 2.1924237224818084, + "learning_rate": 1.416038047884909e-06, + "loss": 0.6999, + "step": 12824 + }, + { + "epoch": 1.91, + "grad_norm": 2.655616635525228, + "learning_rate": 1.4159501958096705e-06, + "loss": 0.6862, + "step": 12825 + }, + { + "epoch": 1.91, + "grad_norm": 2.998199281477778, + "learning_rate": 1.4158623398523321e-06, + "loss": 0.7051, + "step": 12826 + }, + { + "epoch": 1.91, + "grad_norm": 2.167181590111423, + "learning_rate": 1.4157744800137137e-06, + "loss": 0.6621, + "step": 12827 + }, + { + "epoch": 1.91, + "grad_norm": 0.8909382256695663, + "learning_rate": 1.4156866162946353e-06, + "loss": 0.6803, + "step": 12828 + }, + { + "epoch": 1.91, + "grad_norm": 1.6075231180899392, + "learning_rate": 1.415598748695917e-06, + "loss": 0.6823, + "step": 12829 + }, + { + "epoch": 1.91, + "grad_norm": 0.8145782356447447, + "learning_rate": 1.4155108772183789e-06, + "loss": 0.6621, + "step": 12830 + }, + { + "epoch": 1.91, + "grad_norm": 1.9332498596677217, + "learning_rate": 1.4154230018628408e-06, + "loss": 0.6725, + "step": 12831 + }, + { + "epoch": 1.91, + "grad_norm": 1.064503614994999, + "learning_rate": 1.4153351226301234e-06, + "loss": 0.6641, + "step": 12832 + }, + { + "epoch": 1.91, + "grad_norm": 0.7823019366358687, + "learning_rate": 1.4152472395210461e-06, + "loss": 0.6647, + "step": 12833 + }, + { + "epoch": 1.91, + "grad_norm": 1.3206736326995125, + "learning_rate": 1.41515935253643e-06, + "loss": 0.6719, + "step": 12834 + }, + { + "epoch": 1.91, + "grad_norm": 0.8204597575098326, + "learning_rate": 1.415071461677095e-06, + "loss": 0.6895, + "step": 12835 + }, + { + "epoch": 1.91, + "grad_norm": 3.730970696853117, + "learning_rate": 1.414983566943861e-06, + "loss": 0.6647, + "step": 12836 + }, + { + "epoch": 1.91, + "grad_norm": 1.2174835178238852, + "learning_rate": 1.414895668337549e-06, + "loss": 0.6764, + "step": 12837 + }, + { + "epoch": 1.91, + "grad_norm": 4.550310293747304, + "learning_rate": 1.4148077658589788e-06, + "loss": 0.6393, + "step": 12838 + }, + { + "epoch": 1.91, + "grad_norm": 4.008411015393533, + "learning_rate": 1.4147198595089708e-06, + "loss": 0.6712, + "step": 12839 + }, + { + "epoch": 1.91, + "grad_norm": 2.819962634928022, + "learning_rate": 1.4146319492883465e-06, + "loss": 0.6699, + "step": 12840 + }, + { + "epoch": 1.92, + "grad_norm": 2.3571819828946903, + "learning_rate": 1.414544035197925e-06, + "loss": 0.6647, + "step": 12841 + }, + { + "epoch": 1.92, + "grad_norm": 1.22479939515595, + "learning_rate": 1.4144561172385274e-06, + "loss": 0.6947, + "step": 12842 + }, + { + "epoch": 1.92, + "grad_norm": 0.9650257867846695, + "learning_rate": 1.4143681954109743e-06, + "loss": 0.6712, + "step": 12843 + }, + { + "epoch": 1.92, + "grad_norm": 2.2730346073129004, + "learning_rate": 1.414280269716086e-06, + "loss": 0.653, + "step": 12844 + }, + { + "epoch": 1.92, + "grad_norm": 2.4331414847104473, + "learning_rate": 1.4141923401546834e-06, + "loss": 0.6875, + "step": 12845 + }, + { + "epoch": 1.92, + "grad_norm": 1.0043417730133763, + "learning_rate": 1.414104406727587e-06, + "loss": 0.6992, + "step": 12846 + }, + { + "epoch": 1.92, + "grad_norm": 6.0591226138249645, + "learning_rate": 1.4140164694356177e-06, + "loss": 0.6536, + "step": 12847 + }, + { + "epoch": 1.92, + "grad_norm": 0.9342736579633029, + "learning_rate": 1.413928528279596e-06, + "loss": 0.6608, + "step": 12848 + }, + { + "epoch": 1.92, + "grad_norm": 3.462075955160419, + "learning_rate": 1.4138405832603426e-06, + "loss": 0.681, + "step": 12849 + }, + { + "epoch": 1.92, + "grad_norm": 1.6472135320647452, + "learning_rate": 1.4137526343786786e-06, + "loss": 0.666, + "step": 12850 + }, + { + "epoch": 1.92, + "grad_norm": 1.3013694368811815, + "learning_rate": 1.4136646816354246e-06, + "loss": 0.6706, + "step": 12851 + }, + { + "epoch": 1.92, + "grad_norm": 3.044690145817431, + "learning_rate": 1.4135767250314015e-06, + "loss": 0.6628, + "step": 12852 + }, + { + "epoch": 1.92, + "grad_norm": 1.1990118591686472, + "learning_rate": 1.4134887645674308e-06, + "loss": 0.6836, + "step": 12853 + }, + { + "epoch": 1.92, + "grad_norm": 1.0986219459123128, + "learning_rate": 1.4134008002443322e-06, + "loss": 0.6621, + "step": 12854 + }, + { + "epoch": 1.92, + "grad_norm": 1.05927310124199, + "learning_rate": 1.4133128320629272e-06, + "loss": 0.6654, + "step": 12855 + }, + { + "epoch": 1.92, + "grad_norm": 2.3873157022074376, + "learning_rate": 1.4132248600240378e-06, + "loss": 0.6882, + "step": 12856 + }, + { + "epoch": 1.92, + "grad_norm": 2.237008892052023, + "learning_rate": 1.413136884128484e-06, + "loss": 0.6634, + "step": 12857 + }, + { + "epoch": 1.92, + "grad_norm": 2.8517635012471803, + "learning_rate": 1.4130489043770868e-06, + "loss": 0.7038, + "step": 12858 + }, + { + "epoch": 1.92, + "grad_norm": 4.101315478313295, + "learning_rate": 1.4129609207706676e-06, + "loss": 0.666, + "step": 12859 + }, + { + "epoch": 1.92, + "grad_norm": 2.2127645355203613, + "learning_rate": 1.412872933310048e-06, + "loss": 0.6732, + "step": 12860 + }, + { + "epoch": 1.92, + "grad_norm": 1.0698964675030822, + "learning_rate": 1.4127849419960484e-06, + "loss": 0.6621, + "step": 12861 + }, + { + "epoch": 1.92, + "grad_norm": 2.0245245096816675, + "learning_rate": 1.4126969468294907e-06, + "loss": 0.7155, + "step": 12862 + }, + { + "epoch": 1.92, + "grad_norm": 3.615125776529782, + "learning_rate": 1.4126089478111958e-06, + "loss": 0.6784, + "step": 12863 + }, + { + "epoch": 1.92, + "grad_norm": 2.961480876184843, + "learning_rate": 1.4125209449419853e-06, + "loss": 0.6543, + "step": 12864 + }, + { + "epoch": 1.92, + "grad_norm": 1.2600715043981305, + "learning_rate": 1.4124329382226802e-06, + "loss": 0.6882, + "step": 12865 + }, + { + "epoch": 1.92, + "grad_norm": 1.3809724563953707, + "learning_rate": 1.412344927654102e-06, + "loss": 0.6895, + "step": 12866 + }, + { + "epoch": 1.92, + "grad_norm": 3.914697069775161, + "learning_rate": 1.4122569132370721e-06, + "loss": 0.6764, + "step": 12867 + }, + { + "epoch": 1.92, + "grad_norm": 3.341231478099604, + "learning_rate": 1.4121688949724119e-06, + "loss": 0.7005, + "step": 12868 + }, + { + "epoch": 1.92, + "grad_norm": 2.7018001302534818, + "learning_rate": 1.412080872860943e-06, + "loss": 0.7064, + "step": 12869 + }, + { + "epoch": 1.92, + "grad_norm": 2.287566811220623, + "learning_rate": 1.411992846903487e-06, + "loss": 0.6992, + "step": 12870 + }, + { + "epoch": 1.92, + "grad_norm": 1.2292971035048894, + "learning_rate": 1.411904817100865e-06, + "loss": 0.6732, + "step": 12871 + }, + { + "epoch": 1.92, + "grad_norm": 3.334554331780839, + "learning_rate": 1.4118167834538995e-06, + "loss": 0.6686, + "step": 12872 + }, + { + "epoch": 1.92, + "grad_norm": 0.7928656233812511, + "learning_rate": 1.411728745963411e-06, + "loss": 0.6686, + "step": 12873 + }, + { + "epoch": 1.92, + "grad_norm": 0.823522285369333, + "learning_rate": 1.4116407046302222e-06, + "loss": 0.6745, + "step": 12874 + }, + { + "epoch": 1.92, + "grad_norm": 1.9204302544142324, + "learning_rate": 1.4115526594551537e-06, + "loss": 0.6582, + "step": 12875 + }, + { + "epoch": 1.92, + "grad_norm": 3.37381560617873, + "learning_rate": 1.4114646104390283e-06, + "loss": 0.6432, + "step": 12876 + }, + { + "epoch": 1.92, + "grad_norm": 0.8279753927100064, + "learning_rate": 1.4113765575826671e-06, + "loss": 0.6699, + "step": 12877 + }, + { + "epoch": 1.92, + "grad_norm": 0.8477913008783866, + "learning_rate": 1.411288500886892e-06, + "loss": 0.6829, + "step": 12878 + }, + { + "epoch": 1.92, + "grad_norm": 2.0663699515885163, + "learning_rate": 1.411200440352525e-06, + "loss": 0.6738, + "step": 12879 + }, + { + "epoch": 1.92, + "grad_norm": 1.9853245211016814, + "learning_rate": 1.4111123759803881e-06, + "loss": 0.6849, + "step": 12880 + }, + { + "epoch": 1.92, + "grad_norm": 1.8503554192209817, + "learning_rate": 1.4110243077713028e-06, + "loss": 0.668, + "step": 12881 + }, + { + "epoch": 1.92, + "grad_norm": 1.9145972571625973, + "learning_rate": 1.4109362357260916e-06, + "loss": 0.6654, + "step": 12882 + }, + { + "epoch": 1.92, + "grad_norm": 3.85663776928085, + "learning_rate": 1.410848159845576e-06, + "loss": 0.653, + "step": 12883 + }, + { + "epoch": 1.92, + "grad_norm": 3.766156145194664, + "learning_rate": 1.4107600801305781e-06, + "loss": 0.6497, + "step": 12884 + }, + { + "epoch": 1.92, + "grad_norm": 1.8200247909413705, + "learning_rate": 1.4106719965819203e-06, + "loss": 0.6829, + "step": 12885 + }, + { + "epoch": 1.92, + "grad_norm": 6.221062866506149, + "learning_rate": 1.4105839092004244e-06, + "loss": 0.6738, + "step": 12886 + }, + { + "epoch": 1.92, + "grad_norm": 1.6127121222340897, + "learning_rate": 1.4104958179869124e-06, + "loss": 0.6855, + "step": 12887 + }, + { + "epoch": 1.92, + "grad_norm": 3.732377403608726, + "learning_rate": 1.4104077229422068e-06, + "loss": 0.6556, + "step": 12888 + }, + { + "epoch": 1.92, + "grad_norm": 1.042267278025713, + "learning_rate": 1.4103196240671292e-06, + "loss": 0.6816, + "step": 12889 + }, + { + "epoch": 1.92, + "grad_norm": 1.7676605747048009, + "learning_rate": 1.410231521362503e-06, + "loss": 0.6523, + "step": 12890 + }, + { + "epoch": 1.92, + "grad_norm": 0.9687530656254271, + "learning_rate": 1.410143414829149e-06, + "loss": 0.6686, + "step": 12891 + }, + { + "epoch": 1.92, + "grad_norm": 2.42470392888078, + "learning_rate": 1.4100553044678905e-06, + "loss": 0.679, + "step": 12892 + }, + { + "epoch": 1.92, + "grad_norm": 2.837885231750733, + "learning_rate": 1.4099671902795497e-06, + "loss": 0.6478, + "step": 12893 + }, + { + "epoch": 1.92, + "grad_norm": 2.251078782206496, + "learning_rate": 1.4098790722649487e-06, + "loss": 0.6647, + "step": 12894 + }, + { + "epoch": 1.92, + "grad_norm": 1.4179307024056789, + "learning_rate": 1.40979095042491e-06, + "loss": 0.6667, + "step": 12895 + }, + { + "epoch": 1.92, + "grad_norm": 1.1391668040495924, + "learning_rate": 1.4097028247602564e-06, + "loss": 0.6667, + "step": 12896 + }, + { + "epoch": 1.92, + "grad_norm": 3.757791684892955, + "learning_rate": 1.40961469527181e-06, + "loss": 0.6764, + "step": 12897 + }, + { + "epoch": 1.92, + "grad_norm": 5.198972752996393, + "learning_rate": 1.4095265619603934e-06, + "loss": 0.6556, + "step": 12898 + }, + { + "epoch": 1.92, + "grad_norm": 1.181530786616308, + "learning_rate": 1.409438424826829e-06, + "loss": 0.6628, + "step": 12899 + }, + { + "epoch": 1.92, + "grad_norm": 3.300637119618101, + "learning_rate": 1.4093502838719397e-06, + "loss": 0.6465, + "step": 12900 + }, + { + "epoch": 1.92, + "grad_norm": 1.3791507735309054, + "learning_rate": 1.4092621390965482e-06, + "loss": 0.6966, + "step": 12901 + }, + { + "epoch": 1.92, + "grad_norm": 1.7854005524549554, + "learning_rate": 1.4091739905014768e-06, + "loss": 0.6634, + "step": 12902 + }, + { + "epoch": 1.92, + "grad_norm": 2.4909999305237913, + "learning_rate": 1.4090858380875485e-06, + "loss": 0.627, + "step": 12903 + }, + { + "epoch": 1.92, + "grad_norm": 3.6774974249949293, + "learning_rate": 1.4089976818555856e-06, + "loss": 0.6328, + "step": 12904 + }, + { + "epoch": 1.92, + "grad_norm": 4.612259098741676, + "learning_rate": 1.4089095218064112e-06, + "loss": 0.668, + "step": 12905 + }, + { + "epoch": 1.92, + "grad_norm": 2.1935824165342845, + "learning_rate": 1.4088213579408487e-06, + "loss": 0.6745, + "step": 12906 + }, + { + "epoch": 1.92, + "grad_norm": 2.6945984563008722, + "learning_rate": 1.4087331902597194e-06, + "loss": 0.6406, + "step": 12907 + }, + { + "epoch": 1.93, + "grad_norm": 3.2283066976881316, + "learning_rate": 1.4086450187638477e-06, + "loss": 0.7096, + "step": 12908 + }, + { + "epoch": 1.93, + "grad_norm": 1.5290579629443508, + "learning_rate": 1.4085568434540556e-06, + "loss": 0.6719, + "step": 12909 + }, + { + "epoch": 1.93, + "grad_norm": 2.2807302060445385, + "learning_rate": 1.4084686643311666e-06, + "loss": 0.694, + "step": 12910 + }, + { + "epoch": 1.93, + "grad_norm": 2.562921725962534, + "learning_rate": 1.4083804813960031e-06, + "loss": 0.6875, + "step": 12911 + }, + { + "epoch": 1.93, + "grad_norm": 1.3633999926594944, + "learning_rate": 1.4082922946493887e-06, + "loss": 0.666, + "step": 12912 + }, + { + "epoch": 1.93, + "grad_norm": 1.675531277705713, + "learning_rate": 1.408204104092146e-06, + "loss": 0.6829, + "step": 12913 + }, + { + "epoch": 1.93, + "grad_norm": 1.859949010856096, + "learning_rate": 1.4081159097250984e-06, + "loss": 0.6406, + "step": 12914 + }, + { + "epoch": 1.93, + "grad_norm": 5.933057954094087, + "learning_rate": 1.4080277115490691e-06, + "loss": 0.6777, + "step": 12915 + }, + { + "epoch": 1.93, + "grad_norm": 2.4741039960710514, + "learning_rate": 1.407939509564881e-06, + "loss": 0.6764, + "step": 12916 + }, + { + "epoch": 1.93, + "grad_norm": 1.1129720583027036, + "learning_rate": 1.4078513037733573e-06, + "loss": 0.6862, + "step": 12917 + }, + { + "epoch": 1.93, + "grad_norm": 5.606389899155841, + "learning_rate": 1.407763094175321e-06, + "loss": 0.6738, + "step": 12918 + }, + { + "epoch": 1.93, + "grad_norm": 1.1909920244091108, + "learning_rate": 1.4076748807715966e-06, + "loss": 0.6628, + "step": 12919 + }, + { + "epoch": 1.93, + "grad_norm": 1.0837231991234668, + "learning_rate": 1.4075866635630055e-06, + "loss": 0.6361, + "step": 12920 + }, + { + "epoch": 1.93, + "grad_norm": 2.8043390509718726, + "learning_rate": 1.4074984425503725e-06, + "loss": 0.6608, + "step": 12921 + }, + { + "epoch": 1.93, + "grad_norm": 3.120562407144169, + "learning_rate": 1.4074102177345205e-06, + "loss": 0.6641, + "step": 12922 + }, + { + "epoch": 1.93, + "grad_norm": 3.513662488142481, + "learning_rate": 1.407321989116273e-06, + "loss": 0.6628, + "step": 12923 + }, + { + "epoch": 1.93, + "grad_norm": 3.5659922662661594, + "learning_rate": 1.4072337566964533e-06, + "loss": 0.6517, + "step": 12924 + }, + { + "epoch": 1.93, + "grad_norm": 2.468196060997647, + "learning_rate": 1.4071455204758845e-06, + "loss": 0.6855, + "step": 12925 + }, + { + "epoch": 1.93, + "grad_norm": 1.2931550142613448, + "learning_rate": 1.4070572804553908e-06, + "loss": 0.6719, + "step": 12926 + }, + { + "epoch": 1.93, + "grad_norm": 3.1938515045200218, + "learning_rate": 1.4069690366357958e-06, + "loss": 0.6556, + "step": 12927 + }, + { + "epoch": 1.93, + "grad_norm": 2.153564590367651, + "learning_rate": 1.4068807890179225e-06, + "loss": 0.6973, + "step": 12928 + }, + { + "epoch": 1.93, + "grad_norm": 2.089524777761413, + "learning_rate": 1.4067925376025949e-06, + "loss": 0.6745, + "step": 12929 + }, + { + "epoch": 1.93, + "grad_norm": 2.7027666680720435, + "learning_rate": 1.4067042823906364e-06, + "loss": 0.6921, + "step": 12930 + }, + { + "epoch": 1.93, + "grad_norm": 3.3198041203747306, + "learning_rate": 1.406616023382871e-06, + "loss": 0.6693, + "step": 12931 + }, + { + "epoch": 1.93, + "grad_norm": 1.3375118359636917, + "learning_rate": 1.4065277605801224e-06, + "loss": 0.6426, + "step": 12932 + }, + { + "epoch": 1.93, + "grad_norm": 1.7991693204871708, + "learning_rate": 1.4064394939832142e-06, + "loss": 0.653, + "step": 12933 + }, + { + "epoch": 1.93, + "grad_norm": 2.4204940914895943, + "learning_rate": 1.4063512235929703e-06, + "loss": 0.6654, + "step": 12934 + }, + { + "epoch": 1.93, + "grad_norm": 1.0726416915806214, + "learning_rate": 1.4062629494102142e-06, + "loss": 0.668, + "step": 12935 + }, + { + "epoch": 1.93, + "grad_norm": 1.1390988874169017, + "learning_rate": 1.4061746714357706e-06, + "loss": 0.6497, + "step": 12936 + }, + { + "epoch": 1.93, + "grad_norm": 0.912124290578713, + "learning_rate": 1.4060863896704622e-06, + "loss": 0.6706, + "step": 12937 + }, + { + "epoch": 1.93, + "grad_norm": 1.83689640047716, + "learning_rate": 1.4059981041151142e-06, + "loss": 0.6517, + "step": 12938 + }, + { + "epoch": 1.93, + "grad_norm": 1.5775828480944358, + "learning_rate": 1.4059098147705499e-06, + "loss": 0.6921, + "step": 12939 + }, + { + "epoch": 1.93, + "grad_norm": 1.8116653915536467, + "learning_rate": 1.4058215216375932e-06, + "loss": 0.6465, + "step": 12940 + }, + { + "epoch": 1.93, + "grad_norm": 4.861114840372033, + "learning_rate": 1.4057332247170684e-06, + "loss": 0.6458, + "step": 12941 + }, + { + "epoch": 1.93, + "grad_norm": 1.7743631419151435, + "learning_rate": 1.4056449240097994e-06, + "loss": 0.6569, + "step": 12942 + }, + { + "epoch": 1.93, + "grad_norm": 4.6151783281245145, + "learning_rate": 1.405556619516611e-06, + "loss": 0.7064, + "step": 12943 + }, + { + "epoch": 1.93, + "grad_norm": 0.9770039889617208, + "learning_rate": 1.4054683112383261e-06, + "loss": 0.6452, + "step": 12944 + }, + { + "epoch": 1.93, + "grad_norm": 4.213411949919454, + "learning_rate": 1.40537999917577e-06, + "loss": 0.6673, + "step": 12945 + }, + { + "epoch": 1.93, + "grad_norm": 2.0124629760695494, + "learning_rate": 1.4052916833297666e-06, + "loss": 0.6979, + "step": 12946 + }, + { + "epoch": 1.93, + "grad_norm": 3.814460883813714, + "learning_rate": 1.4052033637011395e-06, + "loss": 0.6908, + "step": 12947 + }, + { + "epoch": 1.93, + "grad_norm": 1.054591646304856, + "learning_rate": 1.4051150402907142e-06, + "loss": 0.6595, + "step": 12948 + }, + { + "epoch": 1.93, + "grad_norm": 1.1597905891884412, + "learning_rate": 1.405026713099314e-06, + "loss": 0.6543, + "step": 12949 + }, + { + "epoch": 1.93, + "grad_norm": 1.8573500206671358, + "learning_rate": 1.404938382127764e-06, + "loss": 0.6523, + "step": 12950 + }, + { + "epoch": 1.93, + "grad_norm": 1.9791796231654553, + "learning_rate": 1.404850047376888e-06, + "loss": 0.668, + "step": 12951 + }, + { + "epoch": 1.93, + "grad_norm": 2.375457972653346, + "learning_rate": 1.404761708847511e-06, + "loss": 0.7188, + "step": 12952 + }, + { + "epoch": 1.93, + "grad_norm": 3.133665185235993, + "learning_rate": 1.4046733665404568e-06, + "loss": 0.6178, + "step": 12953 + }, + { + "epoch": 1.93, + "grad_norm": 1.9647435860642644, + "learning_rate": 1.4045850204565505e-06, + "loss": 0.6615, + "step": 12954 + }, + { + "epoch": 1.93, + "grad_norm": 1.7788507786010523, + "learning_rate": 1.4044966705966164e-06, + "loss": 0.6875, + "step": 12955 + }, + { + "epoch": 1.93, + "grad_norm": 3.060748404922829, + "learning_rate": 1.4044083169614793e-06, + "loss": 0.6413, + "step": 12956 + }, + { + "epoch": 1.93, + "grad_norm": 1.0790896448478269, + "learning_rate": 1.4043199595519632e-06, + "loss": 0.6602, + "step": 12957 + }, + { + "epoch": 1.93, + "grad_norm": 1.1388737651521157, + "learning_rate": 1.4042315983688933e-06, + "loss": 0.6725, + "step": 12958 + }, + { + "epoch": 1.93, + "grad_norm": 0.9893272889693202, + "learning_rate": 1.4041432334130944e-06, + "loss": 0.6589, + "step": 12959 + }, + { + "epoch": 1.93, + "grad_norm": 1.804782873637079, + "learning_rate": 1.4040548646853907e-06, + "loss": 0.6901, + "step": 12960 + }, + { + "epoch": 1.93, + "grad_norm": 1.279653938813796, + "learning_rate": 1.4039664921866073e-06, + "loss": 0.6523, + "step": 12961 + }, + { + "epoch": 1.93, + "grad_norm": 1.9700494256811287, + "learning_rate": 1.403878115917569e-06, + "loss": 0.7044, + "step": 12962 + }, + { + "epoch": 1.93, + "grad_norm": 1.1083376894018637, + "learning_rate": 1.4037897358791003e-06, + "loss": 0.6758, + "step": 12963 + }, + { + "epoch": 1.93, + "grad_norm": 3.4326179670383925, + "learning_rate": 1.4037013520720264e-06, + "loss": 0.7253, + "step": 12964 + }, + { + "epoch": 1.93, + "grad_norm": 4.549961608925722, + "learning_rate": 1.403612964497172e-06, + "loss": 0.6784, + "step": 12965 + }, + { + "epoch": 1.93, + "grad_norm": 3.527402710336413, + "learning_rate": 1.4035245731553622e-06, + "loss": 0.6823, + "step": 12966 + }, + { + "epoch": 1.93, + "grad_norm": 1.0567569261194847, + "learning_rate": 1.403436178047422e-06, + "loss": 0.6523, + "step": 12967 + }, + { + "epoch": 1.93, + "grad_norm": 2.4717973074802444, + "learning_rate": 1.403347779174176e-06, + "loss": 0.6836, + "step": 12968 + }, + { + "epoch": 1.93, + "grad_norm": 2.577390632978276, + "learning_rate": 1.4032593765364497e-06, + "loss": 0.6777, + "step": 12969 + }, + { + "epoch": 1.93, + "grad_norm": 5.716679138820634, + "learning_rate": 1.403170970135068e-06, + "loss": 0.6895, + "step": 12970 + }, + { + "epoch": 1.93, + "grad_norm": 1.5320530849235738, + "learning_rate": 1.4030825599708557e-06, + "loss": 0.6803, + "step": 12971 + }, + { + "epoch": 1.93, + "grad_norm": 1.3360911186466393, + "learning_rate": 1.4029941460446385e-06, + "loss": 0.6595, + "step": 12972 + }, + { + "epoch": 1.93, + "grad_norm": 1.9061917119765863, + "learning_rate": 1.4029057283572412e-06, + "loss": 0.6517, + "step": 12973 + }, + { + "epoch": 1.93, + "grad_norm": 4.294093206649457, + "learning_rate": 1.4028173069094894e-06, + "loss": 0.6686, + "step": 12974 + }, + { + "epoch": 1.94, + "grad_norm": 1.130839699090959, + "learning_rate": 1.4027288817022074e-06, + "loss": 0.6484, + "step": 12975 + }, + { + "epoch": 1.94, + "grad_norm": 1.3606001508811214, + "learning_rate": 1.4026404527362217e-06, + "loss": 0.6465, + "step": 12976 + }, + { + "epoch": 1.94, + "grad_norm": 3.007869332511742, + "learning_rate": 1.4025520200123569e-06, + "loss": 0.6517, + "step": 12977 + }, + { + "epoch": 1.94, + "grad_norm": 1.680112088534043, + "learning_rate": 1.4024635835314383e-06, + "loss": 0.6738, + "step": 12978 + }, + { + "epoch": 1.94, + "grad_norm": 2.4838448230457835, + "learning_rate": 1.4023751432942917e-06, + "loss": 0.6523, + "step": 12979 + }, + { + "epoch": 1.94, + "grad_norm": 0.9250743742039668, + "learning_rate": 1.4022866993017424e-06, + "loss": 0.6719, + "step": 12980 + }, + { + "epoch": 1.94, + "grad_norm": 0.9871500624402798, + "learning_rate": 1.4021982515546154e-06, + "loss": 0.6784, + "step": 12981 + }, + { + "epoch": 1.94, + "grad_norm": 1.9700511344183806, + "learning_rate": 1.402109800053737e-06, + "loss": 0.694, + "step": 12982 + }, + { + "epoch": 1.94, + "grad_norm": 2.7781085955182716, + "learning_rate": 1.4020213447999318e-06, + "loss": 0.7057, + "step": 12983 + }, + { + "epoch": 1.94, + "grad_norm": 1.3076299645107081, + "learning_rate": 1.401932885794026e-06, + "loss": 0.6927, + "step": 12984 + }, + { + "epoch": 1.94, + "grad_norm": 1.4825276980398217, + "learning_rate": 1.401844423036845e-06, + "loss": 0.6335, + "step": 12985 + }, + { + "epoch": 1.94, + "grad_norm": 2.882649728652866, + "learning_rate": 1.4017559565292147e-06, + "loss": 0.6862, + "step": 12986 + }, + { + "epoch": 1.94, + "grad_norm": 2.6843360979354545, + "learning_rate": 1.4016674862719603e-06, + "loss": 0.6921, + "step": 12987 + }, + { + "epoch": 1.94, + "grad_norm": 4.115115484543992, + "learning_rate": 1.401579012265908e-06, + "loss": 0.6706, + "step": 12988 + }, + { + "epoch": 1.94, + "grad_norm": 1.8322551333400028, + "learning_rate": 1.401490534511883e-06, + "loss": 0.6979, + "step": 12989 + }, + { + "epoch": 1.94, + "grad_norm": 1.7319100197810424, + "learning_rate": 1.4014020530107116e-06, + "loss": 0.6628, + "step": 12990 + }, + { + "epoch": 1.94, + "grad_norm": 1.5307410278525715, + "learning_rate": 1.401313567763219e-06, + "loss": 0.6693, + "step": 12991 + }, + { + "epoch": 1.94, + "grad_norm": 0.9073259085412586, + "learning_rate": 1.4012250787702316e-06, + "loss": 0.6973, + "step": 12992 + }, + { + "epoch": 1.94, + "grad_norm": 3.3005241580722213, + "learning_rate": 1.401136586032575e-06, + "loss": 0.6413, + "step": 12993 + }, + { + "epoch": 1.94, + "grad_norm": 2.4706502333614084, + "learning_rate": 1.401048089551075e-06, + "loss": 0.6882, + "step": 12994 + }, + { + "epoch": 1.94, + "grad_norm": 0.8382078504064175, + "learning_rate": 1.400959589326558e-06, + "loss": 0.6732, + "step": 12995 + }, + { + "epoch": 1.94, + "grad_norm": 2.5986676067567465, + "learning_rate": 1.4008710853598495e-06, + "loss": 0.6777, + "step": 12996 + }, + { + "epoch": 1.94, + "grad_norm": 3.007064566821334, + "learning_rate": 1.4007825776517756e-06, + "loss": 0.6576, + "step": 12997 + }, + { + "epoch": 1.94, + "grad_norm": 3.4382264318520352, + "learning_rate": 1.4006940662031628e-06, + "loss": 0.6895, + "step": 12998 + }, + { + "epoch": 1.94, + "grad_norm": 1.618156518032833, + "learning_rate": 1.400605551014837e-06, + "loss": 0.6777, + "step": 12999 + }, + { + "epoch": 1.94, + "grad_norm": 4.004898749359389, + "learning_rate": 1.4005170320876234e-06, + "loss": 0.6784, + "step": 13000 + }, + { + "epoch": 1.94, + "grad_norm": 1.0499232472263913, + "learning_rate": 1.4004285094223496e-06, + "loss": 0.6569, + "step": 13001 + }, + { + "epoch": 1.94, + "grad_norm": 3.8426641609247096, + "learning_rate": 1.4003399830198407e-06, + "loss": 0.6784, + "step": 13002 + }, + { + "epoch": 1.94, + "grad_norm": 2.1201565253939374, + "learning_rate": 1.4002514528809234e-06, + "loss": 0.6745, + "step": 13003 + }, + { + "epoch": 1.94, + "grad_norm": 0.8895878746251799, + "learning_rate": 1.4001629190064241e-06, + "loss": 0.6829, + "step": 13004 + }, + { + "epoch": 1.94, + "grad_norm": 5.3717456429961645, + "learning_rate": 1.400074381397169e-06, + "loss": 0.6816, + "step": 13005 + }, + { + "epoch": 1.94, + "grad_norm": 5.519106208365375, + "learning_rate": 1.3999858400539842e-06, + "loss": 0.6947, + "step": 13006 + }, + { + "epoch": 1.94, + "grad_norm": 1.6335650651043576, + "learning_rate": 1.3998972949776959e-06, + "loss": 0.6751, + "step": 13007 + }, + { + "epoch": 1.94, + "grad_norm": 4.176157581354294, + "learning_rate": 1.3998087461691308e-06, + "loss": 0.6947, + "step": 13008 + }, + { + "epoch": 1.94, + "grad_norm": 1.7433732386565102, + "learning_rate": 1.3997201936291157e-06, + "loss": 0.6634, + "step": 13009 + }, + { + "epoch": 1.94, + "grad_norm": 0.9388220081738089, + "learning_rate": 1.3996316373584764e-06, + "loss": 0.6849, + "step": 13010 + }, + { + "epoch": 1.94, + "grad_norm": 2.208537379164724, + "learning_rate": 1.39954307735804e-06, + "loss": 0.6647, + "step": 13011 + }, + { + "epoch": 1.94, + "grad_norm": 1.5988103687199673, + "learning_rate": 1.3994545136286325e-06, + "loss": 0.681, + "step": 13012 + }, + { + "epoch": 1.94, + "grad_norm": 1.8733823380783254, + "learning_rate": 1.3993659461710806e-06, + "loss": 0.6738, + "step": 13013 + }, + { + "epoch": 1.94, + "grad_norm": 4.054657689386541, + "learning_rate": 1.399277374986211e-06, + "loss": 0.6628, + "step": 13014 + }, + { + "epoch": 1.94, + "grad_norm": 1.8481767913505336, + "learning_rate": 1.3991888000748503e-06, + "loss": 0.6497, + "step": 13015 + }, + { + "epoch": 1.94, + "grad_norm": 3.5160707374221145, + "learning_rate": 1.3991002214378253e-06, + "loss": 0.6927, + "step": 13016 + }, + { + "epoch": 1.94, + "grad_norm": 2.5268229650791163, + "learning_rate": 1.399011639075963e-06, + "loss": 0.6979, + "step": 13017 + }, + { + "epoch": 1.94, + "grad_norm": 2.309707657598006, + "learning_rate": 1.3989230529900892e-06, + "loss": 0.6745, + "step": 13018 + }, + { + "epoch": 1.94, + "grad_norm": 2.679439791284444, + "learning_rate": 1.3988344631810317e-06, + "loss": 0.6693, + "step": 13019 + }, + { + "epoch": 1.94, + "grad_norm": 2.2073480615895846, + "learning_rate": 1.3987458696496166e-06, + "loss": 0.6582, + "step": 13020 + }, + { + "epoch": 1.94, + "grad_norm": 3.2081267374945885, + "learning_rate": 1.398657272396671e-06, + "loss": 0.6576, + "step": 13021 + }, + { + "epoch": 1.94, + "grad_norm": 4.022079936230449, + "learning_rate": 1.3985686714230224e-06, + "loss": 0.679, + "step": 13022 + }, + { + "epoch": 1.94, + "grad_norm": 1.9160815211878806, + "learning_rate": 1.3984800667294967e-06, + "loss": 0.6816, + "step": 13023 + }, + { + "epoch": 1.94, + "grad_norm": 2.2946104780944943, + "learning_rate": 1.3983914583169214e-06, + "loss": 0.6569, + "step": 13024 + }, + { + "epoch": 1.94, + "grad_norm": 2.373213272268625, + "learning_rate": 1.3983028461861232e-06, + "loss": 0.6647, + "step": 13025 + }, + { + "epoch": 1.94, + "grad_norm": 1.8708884974728588, + "learning_rate": 1.3982142303379295e-06, + "loss": 0.6621, + "step": 13026 + }, + { + "epoch": 1.94, + "grad_norm": 3.1847308598845583, + "learning_rate": 1.398125610773167e-06, + "loss": 0.6803, + "step": 13027 + }, + { + "epoch": 1.94, + "grad_norm": 1.0822332260290375, + "learning_rate": 1.3980369874926632e-06, + "loss": 0.6471, + "step": 13028 + }, + { + "epoch": 1.94, + "grad_norm": 1.646547094555071, + "learning_rate": 1.3979483604972448e-06, + "loss": 0.6686, + "step": 13029 + }, + { + "epoch": 1.94, + "grad_norm": 2.688953708640687, + "learning_rate": 1.3978597297877393e-06, + "loss": 0.6777, + "step": 13030 + }, + { + "epoch": 1.94, + "grad_norm": 1.0232760253975535, + "learning_rate": 1.3977710953649734e-06, + "loss": 0.6556, + "step": 13031 + }, + { + "epoch": 1.94, + "grad_norm": 1.7486162464727228, + "learning_rate": 1.3976824572297752e-06, + "loss": 0.6725, + "step": 13032 + }, + { + "epoch": 1.94, + "grad_norm": 1.7157962853955957, + "learning_rate": 1.3975938153829712e-06, + "loss": 0.6771, + "step": 13033 + }, + { + "epoch": 1.94, + "grad_norm": 1.2108884511592215, + "learning_rate": 1.397505169825389e-06, + "loss": 0.6517, + "step": 13034 + }, + { + "epoch": 1.94, + "grad_norm": 2.833517210487151, + "learning_rate": 1.397416520557856e-06, + "loss": 0.6667, + "step": 13035 + }, + { + "epoch": 1.94, + "grad_norm": 1.0509131802620912, + "learning_rate": 1.3973278675811992e-06, + "loss": 0.6536, + "step": 13036 + }, + { + "epoch": 1.94, + "grad_norm": 1.5672019231656136, + "learning_rate": 1.3972392108962463e-06, + "loss": 0.6953, + "step": 13037 + }, + { + "epoch": 1.94, + "grad_norm": 1.0102873526197675, + "learning_rate": 1.397150550503825e-06, + "loss": 0.6589, + "step": 13038 + }, + { + "epoch": 1.94, + "grad_norm": 3.4165500699195763, + "learning_rate": 1.3970618864047623e-06, + "loss": 0.668, + "step": 13039 + }, + { + "epoch": 1.94, + "grad_norm": 1.3413084775334365, + "learning_rate": 1.396973218599886e-06, + "loss": 0.6667, + "step": 13040 + }, + { + "epoch": 1.94, + "grad_norm": 1.3711380218950424, + "learning_rate": 1.3968845470900234e-06, + "loss": 0.666, + "step": 13041 + }, + { + "epoch": 1.95, + "grad_norm": 3.7089897460489842, + "learning_rate": 1.3967958718760024e-06, + "loss": 0.6673, + "step": 13042 + }, + { + "epoch": 1.95, + "grad_norm": 2.280695729365668, + "learning_rate": 1.3967071929586502e-06, + "loss": 0.6732, + "step": 13043 + }, + { + "epoch": 1.95, + "grad_norm": 1.9949438645596862, + "learning_rate": 1.3966185103387949e-06, + "loss": 0.6602, + "step": 13044 + }, + { + "epoch": 1.95, + "grad_norm": 1.8783663103416446, + "learning_rate": 1.3965298240172639e-06, + "loss": 0.6647, + "step": 13045 + }, + { + "epoch": 1.95, + "grad_norm": 2.446974862494971, + "learning_rate": 1.396441133994885e-06, + "loss": 0.6953, + "step": 13046 + }, + { + "epoch": 1.95, + "grad_norm": 1.1010098251538805, + "learning_rate": 1.3963524402724858e-06, + "loss": 0.6745, + "step": 13047 + }, + { + "epoch": 1.95, + "grad_norm": 2.1651497188246407, + "learning_rate": 1.3962637428508944e-06, + "loss": 0.6634, + "step": 13048 + }, + { + "epoch": 1.95, + "grad_norm": 3.2143425374059116, + "learning_rate": 1.3961750417309385e-06, + "loss": 0.6608, + "step": 13049 + }, + { + "epoch": 1.95, + "grad_norm": 5.0568914488888295, + "learning_rate": 1.3960863369134455e-06, + "loss": 0.7168, + "step": 13050 + }, + { + "epoch": 1.95, + "grad_norm": 1.2885133260773307, + "learning_rate": 1.395997628399244e-06, + "loss": 0.7012, + "step": 13051 + }, + { + "epoch": 1.95, + "grad_norm": 1.2896791254118218, + "learning_rate": 1.3959089161891617e-06, + "loss": 0.6367, + "step": 13052 + }, + { + "epoch": 1.95, + "grad_norm": 1.051908601790371, + "learning_rate": 1.3958202002840263e-06, + "loss": 0.6803, + "step": 13053 + }, + { + "epoch": 1.95, + "grad_norm": 2.9873603722836064, + "learning_rate": 1.3957314806846661e-06, + "loss": 0.6608, + "step": 13054 + }, + { + "epoch": 1.95, + "grad_norm": 0.9760100013890756, + "learning_rate": 1.395642757391909e-06, + "loss": 0.6562, + "step": 13055 + }, + { + "epoch": 1.95, + "grad_norm": 3.8317980113382655, + "learning_rate": 1.395554030406583e-06, + "loss": 0.6849, + "step": 13056 + }, + { + "epoch": 1.95, + "grad_norm": 2.8945998118287872, + "learning_rate": 1.395465299729516e-06, + "loss": 0.6771, + "step": 13057 + }, + { + "epoch": 1.95, + "grad_norm": 2.2662912855430304, + "learning_rate": 1.3953765653615368e-06, + "loss": 0.6784, + "step": 13058 + }, + { + "epoch": 1.95, + "grad_norm": 1.6826006105588256, + "learning_rate": 1.3952878273034728e-06, + "loss": 0.6673, + "step": 13059 + }, + { + "epoch": 1.95, + "grad_norm": 6.696906772554908, + "learning_rate": 1.3951990855561527e-06, + "loss": 0.6719, + "step": 13060 + }, + { + "epoch": 1.95, + "grad_norm": 1.2750814783292144, + "learning_rate": 1.3951103401204047e-06, + "loss": 0.6576, + "step": 13061 + }, + { + "epoch": 1.95, + "grad_norm": 2.902225770075876, + "learning_rate": 1.3950215909970568e-06, + "loss": 0.6986, + "step": 13062 + }, + { + "epoch": 1.95, + "grad_norm": 3.0064871111464058, + "learning_rate": 1.3949328381869375e-06, + "loss": 0.6634, + "step": 13063 + }, + { + "epoch": 1.95, + "grad_norm": 2.1457112451343336, + "learning_rate": 1.3948440816908751e-06, + "loss": 0.7057, + "step": 13064 + }, + { + "epoch": 1.95, + "grad_norm": 1.7333004143801638, + "learning_rate": 1.394755321509698e-06, + "loss": 0.6816, + "step": 13065 + }, + { + "epoch": 1.95, + "grad_norm": 1.5229215328055532, + "learning_rate": 1.3946665576442345e-06, + "loss": 0.6901, + "step": 13066 + }, + { + "epoch": 1.95, + "grad_norm": 0.9535683409885687, + "learning_rate": 1.394577790095313e-06, + "loss": 0.6576, + "step": 13067 + }, + { + "epoch": 1.95, + "grad_norm": 3.330870794994635, + "learning_rate": 1.3944890188637626e-06, + "loss": 0.6536, + "step": 13068 + }, + { + "epoch": 1.95, + "grad_norm": 1.9571913183692617, + "learning_rate": 1.3944002439504109e-06, + "loss": 0.6699, + "step": 13069 + }, + { + "epoch": 1.95, + "grad_norm": 1.4119003786679363, + "learning_rate": 1.3943114653560868e-06, + "loss": 0.6719, + "step": 13070 + }, + { + "epoch": 1.95, + "grad_norm": 2.0163179258725976, + "learning_rate": 1.394222683081619e-06, + "loss": 0.6895, + "step": 13071 + }, + { + "epoch": 1.95, + "grad_norm": 1.2855142167763955, + "learning_rate": 1.3941338971278363e-06, + "loss": 0.6908, + "step": 13072 + }, + { + "epoch": 1.95, + "grad_norm": 4.546058476378657, + "learning_rate": 1.3940451074955665e-06, + "loss": 0.6719, + "step": 13073 + }, + { + "epoch": 1.95, + "grad_norm": 2.6357448835039996, + "learning_rate": 1.3939563141856392e-06, + "loss": 0.679, + "step": 13074 + }, + { + "epoch": 1.95, + "grad_norm": 0.8564914232953225, + "learning_rate": 1.393867517198883e-06, + "loss": 0.6576, + "step": 13075 + }, + { + "epoch": 1.95, + "grad_norm": 0.9071785373658633, + "learning_rate": 1.393778716536126e-06, + "loss": 0.6732, + "step": 13076 + }, + { + "epoch": 1.95, + "grad_norm": 4.300317376278429, + "learning_rate": 1.3936899121981978e-06, + "loss": 0.6732, + "step": 13077 + }, + { + "epoch": 1.95, + "grad_norm": 1.1988623480537373, + "learning_rate": 1.3936011041859266e-06, + "loss": 0.6745, + "step": 13078 + }, + { + "epoch": 1.95, + "grad_norm": 2.3798099795015593, + "learning_rate": 1.3935122925001414e-06, + "loss": 0.6745, + "step": 13079 + }, + { + "epoch": 1.95, + "grad_norm": 1.0058123472701588, + "learning_rate": 1.3934234771416713e-06, + "loss": 0.6758, + "step": 13080 + }, + { + "epoch": 1.95, + "grad_norm": 0.8129554475061969, + "learning_rate": 1.3933346581113452e-06, + "loss": 0.6829, + "step": 13081 + }, + { + "epoch": 1.95, + "grad_norm": 1.7484809883221795, + "learning_rate": 1.3932458354099919e-06, + "loss": 0.6875, + "step": 13082 + }, + { + "epoch": 1.95, + "grad_norm": 2.519004972379638, + "learning_rate": 1.3931570090384403e-06, + "loss": 0.7064, + "step": 13083 + }, + { + "epoch": 1.95, + "grad_norm": 1.160026838056003, + "learning_rate": 1.3930681789975196e-06, + "loss": 0.6842, + "step": 13084 + }, + { + "epoch": 1.95, + "grad_norm": 1.8859898723258035, + "learning_rate": 1.3929793452880592e-06, + "loss": 0.6921, + "step": 13085 + }, + { + "epoch": 1.95, + "grad_norm": 1.078322265771461, + "learning_rate": 1.3928905079108874e-06, + "loss": 0.6608, + "step": 13086 + }, + { + "epoch": 1.95, + "grad_norm": 0.8314592901630763, + "learning_rate": 1.3928016668668335e-06, + "loss": 0.6868, + "step": 13087 + }, + { + "epoch": 1.95, + "grad_norm": 4.011609729059014, + "learning_rate": 1.3927128221567277e-06, + "loss": 0.6829, + "step": 13088 + }, + { + "epoch": 1.95, + "grad_norm": 2.1792506195218064, + "learning_rate": 1.392623973781398e-06, + "loss": 0.6738, + "step": 13089 + }, + { + "epoch": 1.95, + "grad_norm": 2.3903704741312586, + "learning_rate": 1.392535121741674e-06, + "loss": 0.6615, + "step": 13090 + }, + { + "epoch": 1.95, + "grad_norm": 1.1796127725228467, + "learning_rate": 1.392446266038385e-06, + "loss": 0.6419, + "step": 13091 + }, + { + "epoch": 1.95, + "grad_norm": 0.8068751588803452, + "learning_rate": 1.39235740667236e-06, + "loss": 0.6562, + "step": 13092 + }, + { + "epoch": 1.95, + "grad_norm": 2.036172919737143, + "learning_rate": 1.392268543644429e-06, + "loss": 0.681, + "step": 13093 + }, + { + "epoch": 1.95, + "grad_norm": 1.2745799955675845, + "learning_rate": 1.3921796769554207e-06, + "loss": 0.6823, + "step": 13094 + }, + { + "epoch": 1.95, + "grad_norm": 4.143881210542724, + "learning_rate": 1.3920908066061652e-06, + "loss": 0.7005, + "step": 13095 + }, + { + "epoch": 1.95, + "grad_norm": 4.38398899856708, + "learning_rate": 1.3920019325974915e-06, + "loss": 0.6764, + "step": 13096 + }, + { + "epoch": 1.95, + "grad_norm": 2.0902923372854936, + "learning_rate": 1.3919130549302286e-06, + "loss": 0.6745, + "step": 13097 + }, + { + "epoch": 1.95, + "grad_norm": 0.9406895781887081, + "learning_rate": 1.391824173605207e-06, + "loss": 0.6751, + "step": 13098 + }, + { + "epoch": 1.95, + "grad_norm": 2.2753990452346144, + "learning_rate": 1.3917352886232556e-06, + "loss": 0.668, + "step": 13099 + }, + { + "epoch": 1.95, + "grad_norm": 3.284880165958848, + "learning_rate": 1.3916463999852037e-06, + "loss": 0.7096, + "step": 13100 + }, + { + "epoch": 1.95, + "grad_norm": 0.795728699650396, + "learning_rate": 1.391557507691882e-06, + "loss": 0.6647, + "step": 13101 + }, + { + "epoch": 1.95, + "grad_norm": 2.6248243021559343, + "learning_rate": 1.391468611744119e-06, + "loss": 0.6849, + "step": 13102 + }, + { + "epoch": 1.95, + "grad_norm": 1.0637146523898546, + "learning_rate": 1.3913797121427448e-06, + "loss": 0.6771, + "step": 13103 + }, + { + "epoch": 1.95, + "grad_norm": 1.1357029981223377, + "learning_rate": 1.3912908088885895e-06, + "loss": 0.6823, + "step": 13104 + }, + { + "epoch": 1.95, + "grad_norm": 1.5272284356302166, + "learning_rate": 1.3912019019824823e-06, + "loss": 0.681, + "step": 13105 + }, + { + "epoch": 1.95, + "grad_norm": 2.2746619645100434, + "learning_rate": 1.391112991425253e-06, + "loss": 0.6647, + "step": 13106 + }, + { + "epoch": 1.95, + "grad_norm": 2.6063988878298967, + "learning_rate": 1.3910240772177318e-06, + "loss": 0.6699, + "step": 13107 + }, + { + "epoch": 1.95, + "grad_norm": 4.1265737781436345, + "learning_rate": 1.3909351593607482e-06, + "loss": 0.6816, + "step": 13108 + }, + { + "epoch": 1.96, + "grad_norm": 1.83113343939977, + "learning_rate": 1.3908462378551323e-06, + "loss": 0.681, + "step": 13109 + }, + { + "epoch": 1.96, + "grad_norm": 1.515268103637715, + "learning_rate": 1.3907573127017137e-06, + "loss": 0.6777, + "step": 13110 + }, + { + "epoch": 1.96, + "grad_norm": 1.5252399816598072, + "learning_rate": 1.3906683839013225e-06, + "loss": 0.6432, + "step": 13111 + }, + { + "epoch": 1.96, + "grad_norm": 1.5511555494471279, + "learning_rate": 1.390579451454789e-06, + "loss": 0.668, + "step": 13112 + }, + { + "epoch": 1.96, + "grad_norm": 0.8325315577252432, + "learning_rate": 1.3904905153629425e-06, + "loss": 0.6706, + "step": 13113 + }, + { + "epoch": 1.96, + "grad_norm": 0.8408072520673661, + "learning_rate": 1.390401575626614e-06, + "loss": 0.6517, + "step": 13114 + }, + { + "epoch": 1.96, + "grad_norm": 1.511899073547349, + "learning_rate": 1.3903126322466328e-06, + "loss": 0.6556, + "step": 13115 + }, + { + "epoch": 1.96, + "grad_norm": 0.8922270045263905, + "learning_rate": 1.3902236852238292e-06, + "loss": 0.6634, + "step": 13116 + }, + { + "epoch": 1.96, + "grad_norm": 1.76920131473609, + "learning_rate": 1.3901347345590335e-06, + "loss": 0.6777, + "step": 13117 + }, + { + "epoch": 1.96, + "grad_norm": 4.2878478321892874, + "learning_rate": 1.3900457802530762e-06, + "loss": 0.6706, + "step": 13118 + }, + { + "epoch": 1.96, + "grad_norm": 3.892206592002078, + "learning_rate": 1.3899568223067863e-06, + "loss": 0.694, + "step": 13119 + }, + { + "epoch": 1.96, + "grad_norm": 2.5890120773584577, + "learning_rate": 1.3898678607209954e-06, + "loss": 0.6595, + "step": 13120 + }, + { + "epoch": 1.96, + "grad_norm": 3.3933638462878735, + "learning_rate": 1.3897788954965335e-06, + "loss": 0.653, + "step": 13121 + }, + { + "epoch": 1.96, + "grad_norm": 5.459006509337071, + "learning_rate": 1.3896899266342306e-06, + "loss": 0.6953, + "step": 13122 + }, + { + "epoch": 1.96, + "grad_norm": 4.735114632504842, + "learning_rate": 1.389600954134917e-06, + "loss": 0.6628, + "step": 13123 + }, + { + "epoch": 1.96, + "grad_norm": 1.75920770254098, + "learning_rate": 1.3895119779994234e-06, + "loss": 0.6556, + "step": 13124 + }, + { + "epoch": 1.96, + "grad_norm": 2.2474646574131643, + "learning_rate": 1.3894229982285799e-06, + "loss": 0.668, + "step": 13125 + }, + { + "epoch": 1.96, + "grad_norm": 3.8630650397340593, + "learning_rate": 1.3893340148232168e-06, + "loss": 0.7116, + "step": 13126 + }, + { + "epoch": 1.96, + "grad_norm": 1.1045986978929963, + "learning_rate": 1.3892450277841655e-06, + "loss": 0.6816, + "step": 13127 + }, + { + "epoch": 1.96, + "grad_norm": 1.9421509254326832, + "learning_rate": 1.3891560371122556e-06, + "loss": 0.6484, + "step": 13128 + }, + { + "epoch": 1.96, + "grad_norm": 2.7513313619330004, + "learning_rate": 1.389067042808318e-06, + "loss": 0.6849, + "step": 13129 + }, + { + "epoch": 1.96, + "grad_norm": 3.105890477144193, + "learning_rate": 1.3889780448731835e-06, + "loss": 0.6875, + "step": 13130 + }, + { + "epoch": 1.96, + "grad_norm": 1.4650765368949865, + "learning_rate": 1.3888890433076825e-06, + "loss": 0.6784, + "step": 13131 + }, + { + "epoch": 1.96, + "grad_norm": 1.7582644012913577, + "learning_rate": 1.3888000381126453e-06, + "loss": 0.6836, + "step": 13132 + }, + { + "epoch": 1.96, + "grad_norm": 4.0050203163696985, + "learning_rate": 1.3887110292889033e-06, + "loss": 0.6719, + "step": 13133 + }, + { + "epoch": 1.96, + "grad_norm": 2.872927287679603, + "learning_rate": 1.388622016837287e-06, + "loss": 0.6602, + "step": 13134 + }, + { + "epoch": 1.96, + "grad_norm": 1.6479763643269787, + "learning_rate": 1.3885330007586264e-06, + "loss": 0.6732, + "step": 13135 + }, + { + "epoch": 1.96, + "grad_norm": 3.701994989382979, + "learning_rate": 1.3884439810537537e-06, + "loss": 0.6862, + "step": 13136 + }, + { + "epoch": 1.96, + "grad_norm": 1.7879132325794633, + "learning_rate": 1.3883549577234984e-06, + "loss": 0.638, + "step": 13137 + }, + { + "epoch": 1.96, + "grad_norm": 1.9656468801344567, + "learning_rate": 1.3882659307686923e-06, + "loss": 0.6908, + "step": 13138 + }, + { + "epoch": 1.96, + "grad_norm": 2.3783591551739733, + "learning_rate": 1.3881769001901658e-06, + "loss": 0.6543, + "step": 13139 + }, + { + "epoch": 1.96, + "grad_norm": 3.5278843684696577, + "learning_rate": 1.3880878659887498e-06, + "loss": 0.6829, + "step": 13140 + }, + { + "epoch": 1.96, + "grad_norm": 1.1453765078810896, + "learning_rate": 1.387998828165276e-06, + "loss": 0.6576, + "step": 13141 + }, + { + "epoch": 1.96, + "grad_norm": 1.5579320663803673, + "learning_rate": 1.3879097867205743e-06, + "loss": 0.6478, + "step": 13142 + }, + { + "epoch": 1.96, + "grad_norm": 0.99353246488745, + "learning_rate": 1.3878207416554763e-06, + "loss": 0.6439, + "step": 13143 + }, + { + "epoch": 1.96, + "grad_norm": 0.9580310242903272, + "learning_rate": 1.387731692970813e-06, + "loss": 0.6621, + "step": 13144 + }, + { + "epoch": 1.96, + "grad_norm": 1.2975781717233958, + "learning_rate": 1.3876426406674157e-06, + "loss": 0.679, + "step": 13145 + }, + { + "epoch": 1.96, + "grad_norm": 1.9398149725905756, + "learning_rate": 1.3875535847461156e-06, + "loss": 0.623, + "step": 13146 + }, + { + "epoch": 1.96, + "grad_norm": 1.9265996880535987, + "learning_rate": 1.3874645252077433e-06, + "loss": 0.6908, + "step": 13147 + }, + { + "epoch": 1.96, + "grad_norm": 1.1245037137277158, + "learning_rate": 1.38737546205313e-06, + "loss": 0.6849, + "step": 13148 + }, + { + "epoch": 1.96, + "grad_norm": 1.0208729632213278, + "learning_rate": 1.3872863952831078e-06, + "loss": 0.6693, + "step": 13149 + }, + { + "epoch": 1.96, + "grad_norm": 1.195400548741567, + "learning_rate": 1.387197324898507e-06, + "loss": 0.6478, + "step": 13150 + }, + { + "epoch": 1.96, + "grad_norm": 1.6188282028523768, + "learning_rate": 1.3871082509001598e-06, + "loss": 0.6927, + "step": 13151 + }, + { + "epoch": 1.96, + "grad_norm": 3.893684783272169, + "learning_rate": 1.3870191732888965e-06, + "loss": 0.6621, + "step": 13152 + }, + { + "epoch": 1.96, + "grad_norm": 5.502459587131113, + "learning_rate": 1.3869300920655492e-06, + "loss": 0.6745, + "step": 13153 + }, + { + "epoch": 1.96, + "grad_norm": 1.2050482330638326, + "learning_rate": 1.3868410072309495e-06, + "loss": 0.6725, + "step": 13154 + }, + { + "epoch": 1.96, + "grad_norm": 3.2608567759179077, + "learning_rate": 1.3867519187859283e-06, + "loss": 0.6673, + "step": 13155 + }, + { + "epoch": 1.96, + "grad_norm": 1.4409139276091314, + "learning_rate": 1.3866628267313173e-06, + "loss": 0.696, + "step": 13156 + }, + { + "epoch": 1.96, + "grad_norm": 2.059443438028437, + "learning_rate": 1.3865737310679478e-06, + "loss": 0.6862, + "step": 13157 + }, + { + "epoch": 1.96, + "grad_norm": 2.255717521821227, + "learning_rate": 1.3864846317966512e-06, + "loss": 0.6686, + "step": 13158 + }, + { + "epoch": 1.96, + "grad_norm": 2.050919089923011, + "learning_rate": 1.3863955289182596e-06, + "loss": 0.6745, + "step": 13159 + }, + { + "epoch": 1.96, + "grad_norm": 1.1311902968362824, + "learning_rate": 1.3863064224336044e-06, + "loss": 0.6517, + "step": 13160 + }, + { + "epoch": 1.96, + "grad_norm": 1.1562801470574693, + "learning_rate": 1.3862173123435173e-06, + "loss": 0.6999, + "step": 13161 + }, + { + "epoch": 1.96, + "grad_norm": 1.1731222596664979, + "learning_rate": 1.3861281986488297e-06, + "loss": 0.6751, + "step": 13162 + }, + { + "epoch": 1.96, + "grad_norm": 1.1856178564877975, + "learning_rate": 1.3860390813503734e-06, + "loss": 0.6517, + "step": 13163 + }, + { + "epoch": 1.96, + "grad_norm": 1.642262272088208, + "learning_rate": 1.3859499604489803e-06, + "loss": 0.6504, + "step": 13164 + }, + { + "epoch": 1.96, + "grad_norm": 5.810885422932143, + "learning_rate": 1.3858608359454824e-06, + "loss": 0.6608, + "step": 13165 + }, + { + "epoch": 1.96, + "grad_norm": 1.1638360631666165, + "learning_rate": 1.3857717078407108e-06, + "loss": 0.6829, + "step": 13166 + }, + { + "epoch": 1.96, + "grad_norm": 2.5395766179572554, + "learning_rate": 1.385682576135498e-06, + "loss": 0.6882, + "step": 13167 + }, + { + "epoch": 1.96, + "grad_norm": 1.777646845296438, + "learning_rate": 1.3855934408306755e-06, + "loss": 0.7051, + "step": 13168 + }, + { + "epoch": 1.96, + "grad_norm": 2.011235879218841, + "learning_rate": 1.3855043019270752e-06, + "loss": 0.6986, + "step": 13169 + }, + { + "epoch": 1.96, + "grad_norm": 2.33711726509822, + "learning_rate": 1.3854151594255292e-06, + "loss": 0.6654, + "step": 13170 + }, + { + "epoch": 1.96, + "grad_norm": 1.7830676772235372, + "learning_rate": 1.3853260133268695e-06, + "loss": 0.6758, + "step": 13171 + }, + { + "epoch": 1.96, + "grad_norm": 1.9769644179965717, + "learning_rate": 1.385236863631928e-06, + "loss": 0.6777, + "step": 13172 + }, + { + "epoch": 1.96, + "grad_norm": 3.616049647981659, + "learning_rate": 1.3851477103415368e-06, + "loss": 0.6829, + "step": 13173 + }, + { + "epoch": 1.96, + "grad_norm": 1.8542830946942923, + "learning_rate": 1.385058553456528e-06, + "loss": 0.6738, + "step": 13174 + }, + { + "epoch": 1.96, + "grad_norm": 0.9778737772585646, + "learning_rate": 1.3849693929777337e-06, + "loss": 0.679, + "step": 13175 + }, + { + "epoch": 1.97, + "grad_norm": 0.9761992751399015, + "learning_rate": 1.3848802289059857e-06, + "loss": 0.6582, + "step": 13176 + }, + { + "epoch": 1.97, + "grad_norm": 2.307165336539664, + "learning_rate": 1.3847910612421169e-06, + "loss": 0.6901, + "step": 13177 + }, + { + "epoch": 1.97, + "grad_norm": 3.2284093407386933, + "learning_rate": 1.384701889986959e-06, + "loss": 0.6725, + "step": 13178 + }, + { + "epoch": 1.97, + "grad_norm": 3.386481402686784, + "learning_rate": 1.384612715141344e-06, + "loss": 0.6576, + "step": 13179 + }, + { + "epoch": 1.97, + "grad_norm": 3.558277434142353, + "learning_rate": 1.384523536706105e-06, + "loss": 0.6823, + "step": 13180 + }, + { + "epoch": 1.97, + "grad_norm": 3.021679971990669, + "learning_rate": 1.3844343546820736e-06, + "loss": 0.6986, + "step": 13181 + }, + { + "epoch": 1.97, + "grad_norm": 3.2828272483272602, + "learning_rate": 1.3843451690700823e-06, + "loss": 0.6589, + "step": 13182 + }, + { + "epoch": 1.97, + "grad_norm": 1.938842276573048, + "learning_rate": 1.3842559798709636e-06, + "loss": 0.6738, + "step": 13183 + }, + { + "epoch": 1.97, + "grad_norm": 1.3382242041803145, + "learning_rate": 1.38416678708555e-06, + "loss": 0.6758, + "step": 13184 + }, + { + "epoch": 1.97, + "grad_norm": 2.2621561651880437, + "learning_rate": 1.3840775907146735e-06, + "loss": 0.6901, + "step": 13185 + }, + { + "epoch": 1.97, + "grad_norm": 1.415910599888298, + "learning_rate": 1.3839883907591673e-06, + "loss": 0.7057, + "step": 13186 + }, + { + "epoch": 1.97, + "grad_norm": 1.683747381356965, + "learning_rate": 1.3838991872198632e-06, + "loss": 0.6719, + "step": 13187 + }, + { + "epoch": 1.97, + "grad_norm": 2.1429588189113264, + "learning_rate": 1.3838099800975943e-06, + "loss": 0.6452, + "step": 13188 + }, + { + "epoch": 1.97, + "grad_norm": 0.9169902145502226, + "learning_rate": 1.3837207693931925e-06, + "loss": 0.6875, + "step": 13189 + }, + { + "epoch": 1.97, + "grad_norm": 1.5902472384227149, + "learning_rate": 1.3836315551074908e-06, + "loss": 0.6758, + "step": 13190 + }, + { + "epoch": 1.97, + "grad_norm": 1.5066903903787097, + "learning_rate": 1.3835423372413224e-06, + "loss": 0.6458, + "step": 13191 + }, + { + "epoch": 1.97, + "grad_norm": 1.9072444809831086, + "learning_rate": 1.383453115795519e-06, + "loss": 0.6751, + "step": 13192 + }, + { + "epoch": 1.97, + "grad_norm": 1.8288474631435048, + "learning_rate": 1.383363890770914e-06, + "loss": 0.6732, + "step": 13193 + }, + { + "epoch": 1.97, + "grad_norm": 0.9207657035623377, + "learning_rate": 1.3832746621683397e-06, + "loss": 0.6823, + "step": 13194 + }, + { + "epoch": 1.97, + "grad_norm": 0.8593329290881322, + "learning_rate": 1.3831854299886292e-06, + "loss": 0.6647, + "step": 13195 + }, + { + "epoch": 1.97, + "grad_norm": 0.8849002397650583, + "learning_rate": 1.383096194232615e-06, + "loss": 0.679, + "step": 13196 + }, + { + "epoch": 1.97, + "grad_norm": 1.8262313012117473, + "learning_rate": 1.3830069549011306e-06, + "loss": 0.6576, + "step": 13197 + }, + { + "epoch": 1.97, + "grad_norm": 1.8321438931415024, + "learning_rate": 1.382917711995008e-06, + "loss": 0.6582, + "step": 13198 + }, + { + "epoch": 1.97, + "grad_norm": 1.2861276828948836, + "learning_rate": 1.3828284655150807e-06, + "loss": 0.6634, + "step": 13199 + }, + { + "epoch": 1.97, + "grad_norm": 1.0593119209990085, + "learning_rate": 1.3827392154621816e-06, + "loss": 0.627, + "step": 13200 + }, + { + "epoch": 1.97, + "grad_norm": 1.6799424806487475, + "learning_rate": 1.3826499618371432e-06, + "loss": 0.6732, + "step": 13201 + }, + { + "epoch": 1.97, + "grad_norm": 1.0958256786057488, + "learning_rate": 1.3825607046407991e-06, + "loss": 0.7057, + "step": 13202 + }, + { + "epoch": 1.97, + "grad_norm": 2.4683261633534404, + "learning_rate": 1.3824714438739819e-06, + "loss": 0.7005, + "step": 13203 + }, + { + "epoch": 1.97, + "grad_norm": 1.6675271842554675, + "learning_rate": 1.3823821795375253e-06, + "loss": 0.6361, + "step": 13204 + }, + { + "epoch": 1.97, + "grad_norm": 3.104977478502781, + "learning_rate": 1.3822929116322617e-06, + "loss": 0.7103, + "step": 13205 + }, + { + "epoch": 1.97, + "grad_norm": 1.8580870766708084, + "learning_rate": 1.3822036401590245e-06, + "loss": 0.6836, + "step": 13206 + }, + { + "epoch": 1.97, + "grad_norm": 2.3191930401877574, + "learning_rate": 1.3821143651186473e-06, + "loss": 0.6517, + "step": 13207 + }, + { + "epoch": 1.97, + "grad_norm": 3.7072252179864393, + "learning_rate": 1.3820250865119624e-06, + "loss": 0.668, + "step": 13208 + }, + { + "epoch": 1.97, + "grad_norm": 1.801533170507398, + "learning_rate": 1.3819358043398041e-06, + "loss": 0.6686, + "step": 13209 + }, + { + "epoch": 1.97, + "grad_norm": 2.600072449429403, + "learning_rate": 1.3818465186030051e-06, + "loss": 0.6836, + "step": 13210 + }, + { + "epoch": 1.97, + "grad_norm": 1.9973352192429075, + "learning_rate": 1.3817572293023984e-06, + "loss": 0.6732, + "step": 13211 + }, + { + "epoch": 1.97, + "grad_norm": 4.102436434116146, + "learning_rate": 1.381667936438818e-06, + "loss": 0.6634, + "step": 13212 + }, + { + "epoch": 1.97, + "grad_norm": 1.138691923276484, + "learning_rate": 1.3815786400130969e-06, + "loss": 0.679, + "step": 13213 + }, + { + "epoch": 1.97, + "grad_norm": 4.355499941238161, + "learning_rate": 1.3814893400260686e-06, + "loss": 0.6615, + "step": 13214 + }, + { + "epoch": 1.97, + "grad_norm": 1.7803387635489443, + "learning_rate": 1.3814000364785666e-06, + "loss": 0.6862, + "step": 13215 + }, + { + "epoch": 1.97, + "grad_norm": 1.8595632769294927, + "learning_rate": 1.381310729371424e-06, + "loss": 0.7012, + "step": 13216 + }, + { + "epoch": 1.97, + "grad_norm": 0.9312728380072386, + "learning_rate": 1.3812214187054753e-06, + "loss": 0.6647, + "step": 13217 + }, + { + "epoch": 1.97, + "grad_norm": 2.3746614296055113, + "learning_rate": 1.3811321044815528e-06, + "loss": 0.6387, + "step": 13218 + }, + { + "epoch": 1.97, + "grad_norm": 5.069975599103567, + "learning_rate": 1.381042786700491e-06, + "loss": 0.6908, + "step": 13219 + }, + { + "epoch": 1.97, + "grad_norm": 1.626390451918204, + "learning_rate": 1.3809534653631233e-06, + "loss": 0.6868, + "step": 13220 + }, + { + "epoch": 1.97, + "grad_norm": 2.2954322121679716, + "learning_rate": 1.3808641404702828e-06, + "loss": 0.6908, + "step": 13221 + }, + { + "epoch": 1.97, + "grad_norm": 2.01057032131821, + "learning_rate": 1.3807748120228037e-06, + "loss": 0.6725, + "step": 13222 + }, + { + "epoch": 1.97, + "grad_norm": 1.1227540416570734, + "learning_rate": 1.38068548002152e-06, + "loss": 0.653, + "step": 13223 + }, + { + "epoch": 1.97, + "grad_norm": 2.390908874139006, + "learning_rate": 1.3805961444672644e-06, + "loss": 0.6764, + "step": 13224 + }, + { + "epoch": 1.97, + "grad_norm": 1.9616962287143727, + "learning_rate": 1.380506805360872e-06, + "loss": 0.651, + "step": 13225 + }, + { + "epoch": 1.97, + "grad_norm": 1.8888456168572931, + "learning_rate": 1.3804174627031752e-06, + "loss": 0.6719, + "step": 13226 + }, + { + "epoch": 1.97, + "grad_norm": 2.909777968783425, + "learning_rate": 1.380328116495009e-06, + "loss": 0.6576, + "step": 13227 + }, + { + "epoch": 1.97, + "grad_norm": 2.134587105690488, + "learning_rate": 1.380238766737207e-06, + "loss": 0.6595, + "step": 13228 + }, + { + "epoch": 1.97, + "grad_norm": 0.9374478901656317, + "learning_rate": 1.3801494134306027e-06, + "loss": 0.6445, + "step": 13229 + }, + { + "epoch": 1.97, + "grad_norm": 1.509260690381801, + "learning_rate": 1.3800600565760306e-06, + "loss": 0.6615, + "step": 13230 + }, + { + "epoch": 1.97, + "grad_norm": 5.582166487221471, + "learning_rate": 1.379970696174324e-06, + "loss": 0.6764, + "step": 13231 + }, + { + "epoch": 1.97, + "grad_norm": 1.4523925444344588, + "learning_rate": 1.3798813322263174e-06, + "loss": 0.6628, + "step": 13232 + }, + { + "epoch": 1.97, + "grad_norm": 1.3488575819484434, + "learning_rate": 1.379791964732845e-06, + "loss": 0.6934, + "step": 13233 + }, + { + "epoch": 1.97, + "grad_norm": 2.7373157307341387, + "learning_rate": 1.3797025936947403e-06, + "loss": 0.625, + "step": 13234 + }, + { + "epoch": 1.97, + "grad_norm": 1.589474380546973, + "learning_rate": 1.3796132191128378e-06, + "loss": 0.6777, + "step": 13235 + }, + { + "epoch": 1.97, + "grad_norm": 1.6291456711514318, + "learning_rate": 1.3795238409879718e-06, + "loss": 0.6693, + "step": 13236 + }, + { + "epoch": 1.97, + "grad_norm": 1.8459192483856257, + "learning_rate": 1.379434459320976e-06, + "loss": 0.668, + "step": 13237 + }, + { + "epoch": 1.97, + "grad_norm": 1.6896872640313128, + "learning_rate": 1.379345074112685e-06, + "loss": 0.651, + "step": 13238 + }, + { + "epoch": 1.97, + "grad_norm": 1.1165031960664347, + "learning_rate": 1.3792556853639323e-06, + "loss": 0.6634, + "step": 13239 + }, + { + "epoch": 1.97, + "grad_norm": 6.7326071732991615, + "learning_rate": 1.3791662930755535e-06, + "loss": 0.6764, + "step": 13240 + }, + { + "epoch": 1.97, + "grad_norm": 1.8039944212146068, + "learning_rate": 1.3790768972483816e-06, + "loss": 0.6569, + "step": 13241 + }, + { + "epoch": 1.97, + "grad_norm": 4.998151718522531, + "learning_rate": 1.3789874978832518e-06, + "loss": 0.6484, + "step": 13242 + }, + { + "epoch": 1.98, + "grad_norm": 1.2699263832191847, + "learning_rate": 1.378898094980998e-06, + "loss": 0.6452, + "step": 13243 + }, + { + "epoch": 1.98, + "grad_norm": 1.1257531021236558, + "learning_rate": 1.378808688542455e-06, + "loss": 0.6543, + "step": 13244 + }, + { + "epoch": 1.98, + "grad_norm": 1.8488761017809914, + "learning_rate": 1.3787192785684568e-06, + "loss": 0.6641, + "step": 13245 + }, + { + "epoch": 1.98, + "grad_norm": 2.7355229766545706, + "learning_rate": 1.378629865059838e-06, + "loss": 0.653, + "step": 13246 + }, + { + "epoch": 1.98, + "grad_norm": 2.021425019059304, + "learning_rate": 1.3785404480174334e-06, + "loss": 0.6927, + "step": 13247 + }, + { + "epoch": 1.98, + "grad_norm": 2.070262046879961, + "learning_rate": 1.3784510274420772e-06, + "loss": 0.7148, + "step": 13248 + }, + { + "epoch": 1.98, + "grad_norm": 2.1041741496967825, + "learning_rate": 1.3783616033346044e-06, + "loss": 0.6842, + "step": 13249 + }, + { + "epoch": 1.98, + "grad_norm": 1.6015826401050397, + "learning_rate": 1.3782721756958488e-06, + "loss": 0.7122, + "step": 13250 + }, + { + "epoch": 1.98, + "grad_norm": 2.6188704274664474, + "learning_rate": 1.3781827445266458e-06, + "loss": 0.6803, + "step": 13251 + }, + { + "epoch": 1.98, + "grad_norm": 4.659733327596371, + "learning_rate": 1.3780933098278298e-06, + "loss": 0.696, + "step": 13252 + }, + { + "epoch": 1.98, + "grad_norm": 3.399589102844746, + "learning_rate": 1.3780038716002356e-06, + "loss": 0.6712, + "step": 13253 + }, + { + "epoch": 1.98, + "grad_norm": 5.618261303007728, + "learning_rate": 1.3779144298446976e-06, + "loss": 0.694, + "step": 13254 + }, + { + "epoch": 1.98, + "grad_norm": 5.102374742242893, + "learning_rate": 1.3778249845620508e-06, + "loss": 0.6797, + "step": 13255 + }, + { + "epoch": 1.98, + "grad_norm": 0.9937846902012626, + "learning_rate": 1.3777355357531305e-06, + "loss": 0.6536, + "step": 13256 + }, + { + "epoch": 1.98, + "grad_norm": 2.035406199203448, + "learning_rate": 1.3776460834187705e-06, + "loss": 0.694, + "step": 13257 + }, + { + "epoch": 1.98, + "grad_norm": 2.1731304764743404, + "learning_rate": 1.3775566275598062e-06, + "loss": 0.6836, + "step": 13258 + }, + { + "epoch": 1.98, + "grad_norm": 4.058191174411447, + "learning_rate": 1.377467168177073e-06, + "loss": 0.6582, + "step": 13259 + }, + { + "epoch": 1.98, + "grad_norm": 0.9263133254462386, + "learning_rate": 1.3773777052714049e-06, + "loss": 0.6947, + "step": 13260 + }, + { + "epoch": 1.98, + "grad_norm": 1.3832537570799932, + "learning_rate": 1.3772882388436372e-06, + "loss": 0.6784, + "step": 13261 + }, + { + "epoch": 1.98, + "grad_norm": 2.9243734346148775, + "learning_rate": 1.3771987688946054e-06, + "loss": 0.6667, + "step": 13262 + }, + { + "epoch": 1.98, + "grad_norm": 1.5609477456954146, + "learning_rate": 1.377109295425144e-06, + "loss": 0.6432, + "step": 13263 + }, + { + "epoch": 1.98, + "grad_norm": 1.2768954469290736, + "learning_rate": 1.377019818436088e-06, + "loss": 0.6823, + "step": 13264 + }, + { + "epoch": 1.98, + "grad_norm": 1.357458466186097, + "learning_rate": 1.3769303379282729e-06, + "loss": 0.6615, + "step": 13265 + }, + { + "epoch": 1.98, + "grad_norm": 0.9566169395869201, + "learning_rate": 1.3768408539025335e-06, + "loss": 0.6576, + "step": 13266 + }, + { + "epoch": 1.98, + "grad_norm": 1.0763298479764194, + "learning_rate": 1.3767513663597053e-06, + "loss": 0.6777, + "step": 13267 + }, + { + "epoch": 1.98, + "grad_norm": 1.2481481873562215, + "learning_rate": 1.376661875300623e-06, + "loss": 0.6927, + "step": 13268 + }, + { + "epoch": 1.98, + "grad_norm": 1.2038459544430773, + "learning_rate": 1.376572380726122e-06, + "loss": 0.6966, + "step": 13269 + }, + { + "epoch": 1.98, + "grad_norm": 1.2275084763437998, + "learning_rate": 1.3764828826370383e-06, + "loss": 0.668, + "step": 13270 + }, + { + "epoch": 1.98, + "grad_norm": 2.7620992518429532, + "learning_rate": 1.376393381034206e-06, + "loss": 0.6699, + "step": 13271 + }, + { + "epoch": 1.98, + "grad_norm": 0.757648119250422, + "learning_rate": 1.3763038759184612e-06, + "loss": 0.666, + "step": 13272 + }, + { + "epoch": 1.98, + "grad_norm": 1.4978808083360518, + "learning_rate": 1.376214367290639e-06, + "loss": 0.6654, + "step": 13273 + }, + { + "epoch": 1.98, + "grad_norm": 3.710096718105727, + "learning_rate": 1.3761248551515747e-06, + "loss": 0.6758, + "step": 13274 + }, + { + "epoch": 1.98, + "grad_norm": 5.874266718725227, + "learning_rate": 1.376035339502104e-06, + "loss": 0.6895, + "step": 13275 + }, + { + "epoch": 1.98, + "grad_norm": 8.802380678391776, + "learning_rate": 1.3759458203430625e-06, + "loss": 0.681, + "step": 13276 + }, + { + "epoch": 1.98, + "grad_norm": 1.0881647222985695, + "learning_rate": 1.375856297675285e-06, + "loss": 0.6875, + "step": 13277 + }, + { + "epoch": 1.98, + "grad_norm": 3.699570107710257, + "learning_rate": 1.3757667714996074e-06, + "loss": 0.6582, + "step": 13278 + }, + { + "epoch": 1.98, + "grad_norm": 1.089729899036906, + "learning_rate": 1.3756772418168656e-06, + "loss": 0.653, + "step": 13279 + }, + { + "epoch": 1.98, + "grad_norm": 5.033867755203355, + "learning_rate": 1.3755877086278946e-06, + "loss": 0.6686, + "step": 13280 + }, + { + "epoch": 1.98, + "grad_norm": 1.631992229564578, + "learning_rate": 1.3754981719335305e-06, + "loss": 0.6406, + "step": 13281 + }, + { + "epoch": 1.98, + "grad_norm": 1.3273298580707524, + "learning_rate": 1.3754086317346087e-06, + "loss": 0.6725, + "step": 13282 + }, + { + "epoch": 1.98, + "grad_norm": 4.817658332164304, + "learning_rate": 1.375319088031965e-06, + "loss": 0.6888, + "step": 13283 + }, + { + "epoch": 1.98, + "grad_norm": 1.7770073396707415, + "learning_rate": 1.375229540826435e-06, + "loss": 0.6641, + "step": 13284 + }, + { + "epoch": 1.98, + "grad_norm": 2.0716009919769895, + "learning_rate": 1.3751399901188544e-06, + "loss": 0.6484, + "step": 13285 + }, + { + "epoch": 1.98, + "grad_norm": 2.43780828162356, + "learning_rate": 1.3750504359100595e-06, + "loss": 0.6497, + "step": 13286 + }, + { + "epoch": 1.98, + "grad_norm": 1.8650094260022902, + "learning_rate": 1.3749608782008852e-06, + "loss": 0.6888, + "step": 13287 + }, + { + "epoch": 1.98, + "grad_norm": 0.9644277255485094, + "learning_rate": 1.3748713169921681e-06, + "loss": 0.6647, + "step": 13288 + }, + { + "epoch": 1.98, + "grad_norm": 2.842778221065664, + "learning_rate": 1.3747817522847437e-06, + "loss": 0.6908, + "step": 13289 + }, + { + "epoch": 1.98, + "grad_norm": 1.789481057226158, + "learning_rate": 1.3746921840794483e-06, + "loss": 0.6751, + "step": 13290 + }, + { + "epoch": 1.98, + "grad_norm": 1.5488648129120421, + "learning_rate": 1.3746026123771176e-06, + "loss": 0.6868, + "step": 13291 + }, + { + "epoch": 1.98, + "grad_norm": 3.785253129590963, + "learning_rate": 1.3745130371785873e-06, + "loss": 0.6751, + "step": 13292 + }, + { + "epoch": 1.98, + "grad_norm": 4.5832963219647125, + "learning_rate": 1.374423458484694e-06, + "loss": 0.6966, + "step": 13293 + }, + { + "epoch": 1.98, + "grad_norm": 2.354381722712488, + "learning_rate": 1.3743338762962733e-06, + "loss": 0.6354, + "step": 13294 + }, + { + "epoch": 1.98, + "grad_norm": 2.1021972470310977, + "learning_rate": 1.3742442906141614e-06, + "loss": 0.6966, + "step": 13295 + }, + { + "epoch": 1.98, + "grad_norm": 3.0614199037667507, + "learning_rate": 1.3741547014391946e-06, + "loss": 0.679, + "step": 13296 + }, + { + "epoch": 1.98, + "grad_norm": 3.056095575632454, + "learning_rate": 1.3740651087722088e-06, + "loss": 0.6536, + "step": 13297 + }, + { + "epoch": 1.98, + "grad_norm": 2.3070367543700327, + "learning_rate": 1.3739755126140402e-06, + "loss": 0.6576, + "step": 13298 + }, + { + "epoch": 1.98, + "grad_norm": 1.0152699929450704, + "learning_rate": 1.3738859129655255e-06, + "loss": 0.6445, + "step": 13299 + }, + { + "epoch": 1.98, + "grad_norm": 1.1163610901901246, + "learning_rate": 1.3737963098274999e-06, + "loss": 0.6654, + "step": 13300 + }, + { + "epoch": 1.98, + "grad_norm": 1.0499746661614564, + "learning_rate": 1.3737067032008003e-06, + "loss": 0.6654, + "step": 13301 + }, + { + "epoch": 1.98, + "grad_norm": 2.9353606879792813, + "learning_rate": 1.3736170930862635e-06, + "loss": 0.6875, + "step": 13302 + }, + { + "epoch": 1.98, + "grad_norm": 2.003301557320665, + "learning_rate": 1.3735274794847249e-06, + "loss": 0.6816, + "step": 13303 + }, + { + "epoch": 1.98, + "grad_norm": 1.23036483228985, + "learning_rate": 1.3734378623970215e-06, + "loss": 0.6523, + "step": 13304 + }, + { + "epoch": 1.98, + "grad_norm": 1.3899511646650842, + "learning_rate": 1.3733482418239892e-06, + "loss": 0.6712, + "step": 13305 + }, + { + "epoch": 1.98, + "grad_norm": 1.064867458512888, + "learning_rate": 1.373258617766465e-06, + "loss": 0.6543, + "step": 13306 + }, + { + "epoch": 1.98, + "grad_norm": 1.858328235066125, + "learning_rate": 1.3731689902252853e-06, + "loss": 0.6667, + "step": 13307 + }, + { + "epoch": 1.98, + "grad_norm": 1.8996613315131026, + "learning_rate": 1.3730793592012862e-06, + "loss": 0.6576, + "step": 13308 + }, + { + "epoch": 1.98, + "grad_norm": 4.678003663147057, + "learning_rate": 1.3729897246953044e-06, + "loss": 0.696, + "step": 13309 + }, + { + "epoch": 1.99, + "grad_norm": 4.305548481810089, + "learning_rate": 1.3729000867081766e-06, + "loss": 0.6465, + "step": 13310 + }, + { + "epoch": 1.99, + "grad_norm": 3.5522399547204957, + "learning_rate": 1.3728104452407391e-06, + "loss": 0.7116, + "step": 13311 + }, + { + "epoch": 1.99, + "grad_norm": 3.489158571895533, + "learning_rate": 1.372720800293829e-06, + "loss": 0.6816, + "step": 13312 + }, + { + "epoch": 1.99, + "grad_norm": 1.0093891550719245, + "learning_rate": 1.3726311518682827e-06, + "loss": 0.6771, + "step": 13313 + }, + { + "epoch": 1.99, + "grad_norm": 1.886891855366701, + "learning_rate": 1.3725414999649368e-06, + "loss": 0.6543, + "step": 13314 + }, + { + "epoch": 1.99, + "grad_norm": 3.0813051924326076, + "learning_rate": 1.3724518445846283e-06, + "loss": 0.6471, + "step": 13315 + }, + { + "epoch": 1.99, + "grad_norm": 5.582863008794248, + "learning_rate": 1.3723621857281935e-06, + "loss": 0.6406, + "step": 13316 + }, + { + "epoch": 1.99, + "grad_norm": 1.0340821514673189, + "learning_rate": 1.3722725233964696e-06, + "loss": 0.6738, + "step": 13317 + }, + { + "epoch": 1.99, + "grad_norm": 4.591077042940746, + "learning_rate": 1.3721828575902934e-06, + "loss": 0.6706, + "step": 13318 + }, + { + "epoch": 1.99, + "grad_norm": 2.059074701342683, + "learning_rate": 1.3720931883105018e-06, + "loss": 0.681, + "step": 13319 + }, + { + "epoch": 1.99, + "grad_norm": 1.6838743365793072, + "learning_rate": 1.3720035155579314e-06, + "loss": 0.6641, + "step": 13320 + }, + { + "epoch": 1.99, + "grad_norm": 1.6731511728244466, + "learning_rate": 1.3719138393334193e-06, + "loss": 0.6615, + "step": 13321 + }, + { + "epoch": 1.99, + "grad_norm": 1.0563700090164885, + "learning_rate": 1.3718241596378023e-06, + "loss": 0.6784, + "step": 13322 + }, + { + "epoch": 1.99, + "grad_norm": 1.1991153790518887, + "learning_rate": 1.371734476471918e-06, + "loss": 0.6621, + "step": 13323 + }, + { + "epoch": 1.99, + "grad_norm": 2.718640907902632, + "learning_rate": 1.3716447898366025e-06, + "loss": 0.6797, + "step": 13324 + }, + { + "epoch": 1.99, + "grad_norm": 4.835088149494721, + "learning_rate": 1.3715550997326934e-06, + "loss": 0.7051, + "step": 13325 + }, + { + "epoch": 1.99, + "grad_norm": 2.2761772954355273, + "learning_rate": 1.371465406161028e-06, + "loss": 0.694, + "step": 13326 + }, + { + "epoch": 1.99, + "grad_norm": 1.07643460516648, + "learning_rate": 1.3713757091224424e-06, + "loss": 0.6699, + "step": 13327 + }, + { + "epoch": 1.99, + "grad_norm": 1.1962835950519988, + "learning_rate": 1.3712860086177754e-06, + "loss": 0.6497, + "step": 13328 + }, + { + "epoch": 1.99, + "grad_norm": 1.2613731809682571, + "learning_rate": 1.3711963046478627e-06, + "loss": 0.6829, + "step": 13329 + }, + { + "epoch": 1.99, + "grad_norm": 2.6921291457833245, + "learning_rate": 1.371106597213542e-06, + "loss": 0.6738, + "step": 13330 + }, + { + "epoch": 1.99, + "grad_norm": 4.720586016430287, + "learning_rate": 1.3710168863156508e-06, + "loss": 0.6953, + "step": 13331 + }, + { + "epoch": 1.99, + "grad_norm": 1.0675449927796752, + "learning_rate": 1.3709271719550261e-06, + "loss": 0.6745, + "step": 13332 + }, + { + "epoch": 1.99, + "grad_norm": 1.0302533126270066, + "learning_rate": 1.3708374541325052e-06, + "loss": 0.6966, + "step": 13333 + }, + { + "epoch": 1.99, + "grad_norm": 1.6473826418684943, + "learning_rate": 1.3707477328489258e-06, + "loss": 0.6706, + "step": 13334 + }, + { + "epoch": 1.99, + "grad_norm": 1.1563281180216032, + "learning_rate": 1.3706580081051248e-06, + "loss": 0.6829, + "step": 13335 + }, + { + "epoch": 1.99, + "grad_norm": 1.0477044387291232, + "learning_rate": 1.3705682799019402e-06, + "loss": 0.6842, + "step": 13336 + }, + { + "epoch": 1.99, + "grad_norm": 2.884627313059669, + "learning_rate": 1.3704785482402085e-06, + "loss": 0.653, + "step": 13337 + }, + { + "epoch": 1.99, + "grad_norm": 2.2096985696856613, + "learning_rate": 1.370388813120768e-06, + "loss": 0.6725, + "step": 13338 + }, + { + "epoch": 1.99, + "grad_norm": 1.097450134258271, + "learning_rate": 1.3702990745444565e-06, + "loss": 0.6855, + "step": 13339 + }, + { + "epoch": 1.99, + "grad_norm": 3.605796077392864, + "learning_rate": 1.3702093325121102e-06, + "loss": 0.6849, + "step": 13340 + }, + { + "epoch": 1.99, + "grad_norm": 0.897061892899434, + "learning_rate": 1.3701195870245678e-06, + "loss": 0.6621, + "step": 13341 + }, + { + "epoch": 1.99, + "grad_norm": 4.181825934164225, + "learning_rate": 1.3700298380826666e-06, + "loss": 0.6875, + "step": 13342 + }, + { + "epoch": 1.99, + "grad_norm": 4.905343796200434, + "learning_rate": 1.369940085687244e-06, + "loss": 0.6836, + "step": 13343 + }, + { + "epoch": 1.99, + "grad_norm": 0.9240881910645153, + "learning_rate": 1.369850329839138e-06, + "loss": 0.6823, + "step": 13344 + }, + { + "epoch": 1.99, + "grad_norm": 2.514690309054045, + "learning_rate": 1.3697605705391861e-06, + "loss": 0.6764, + "step": 13345 + }, + { + "epoch": 1.99, + "grad_norm": 2.6297466953694904, + "learning_rate": 1.3696708077882263e-06, + "loss": 0.6803, + "step": 13346 + }, + { + "epoch": 1.99, + "grad_norm": 0.8218421503855059, + "learning_rate": 1.369581041587096e-06, + "loss": 0.6901, + "step": 13347 + }, + { + "epoch": 1.99, + "grad_norm": 0.8825880652501386, + "learning_rate": 1.369491271936633e-06, + "loss": 0.6901, + "step": 13348 + }, + { + "epoch": 1.99, + "grad_norm": 1.4939776151787463, + "learning_rate": 1.3694014988376759e-06, + "loss": 0.6602, + "step": 13349 + }, + { + "epoch": 1.99, + "grad_norm": 0.9144035641625052, + "learning_rate": 1.3693117222910612e-06, + "loss": 0.6549, + "step": 13350 + }, + { + "epoch": 1.99, + "grad_norm": 3.570049009202544, + "learning_rate": 1.3692219422976276e-06, + "loss": 0.6615, + "step": 13351 + }, + { + "epoch": 1.99, + "grad_norm": 3.2684007597761346, + "learning_rate": 1.3691321588582135e-06, + "loss": 0.6686, + "step": 13352 + }, + { + "epoch": 1.99, + "grad_norm": 4.0266285680708025, + "learning_rate": 1.369042371973656e-06, + "loss": 0.6999, + "step": 13353 + }, + { + "epoch": 1.99, + "grad_norm": 1.866712352377824, + "learning_rate": 1.3689525816447932e-06, + "loss": 0.6777, + "step": 13354 + }, + { + "epoch": 1.99, + "grad_norm": 2.8640627217734163, + "learning_rate": 1.3688627878724634e-06, + "loss": 0.6784, + "step": 13355 + }, + { + "epoch": 1.99, + "grad_norm": 0.8754017238451621, + "learning_rate": 1.3687729906575049e-06, + "loss": 0.6693, + "step": 13356 + }, + { + "epoch": 1.99, + "grad_norm": 2.0858277914218544, + "learning_rate": 1.3686831900007555e-06, + "loss": 0.666, + "step": 13357 + }, + { + "epoch": 1.99, + "grad_norm": 2.640494937004661, + "learning_rate": 1.3685933859030527e-06, + "loss": 0.6523, + "step": 13358 + }, + { + "epoch": 1.99, + "grad_norm": 1.6811650214130116, + "learning_rate": 1.3685035783652357e-06, + "loss": 0.6647, + "step": 13359 + }, + { + "epoch": 1.99, + "grad_norm": 2.587422106713824, + "learning_rate": 1.3684137673881422e-06, + "loss": 0.6797, + "step": 13360 + }, + { + "epoch": 1.99, + "grad_norm": 2.39731253451383, + "learning_rate": 1.3683239529726102e-06, + "loss": 0.666, + "step": 13361 + }, + { + "epoch": 1.99, + "grad_norm": 3.1901316770757666, + "learning_rate": 1.3682341351194784e-06, + "loss": 0.7005, + "step": 13362 + }, + { + "epoch": 1.99, + "grad_norm": 5.7875737451126295, + "learning_rate": 1.3681443138295847e-06, + "loss": 0.6667, + "step": 13363 + }, + { + "epoch": 1.99, + "grad_norm": 2.9375115697339558, + "learning_rate": 1.3680544891037675e-06, + "loss": 0.6615, + "step": 13364 + }, + { + "epoch": 1.99, + "grad_norm": 2.396071146978566, + "learning_rate": 1.3679646609428654e-06, + "loss": 0.6777, + "step": 13365 + }, + { + "epoch": 1.99, + "grad_norm": 0.7652370658712304, + "learning_rate": 1.3678748293477167e-06, + "loss": 0.6693, + "step": 13366 + }, + { + "epoch": 1.99, + "grad_norm": 2.58327577805397, + "learning_rate": 1.3677849943191593e-06, + "loss": 0.681, + "step": 13367 + }, + { + "epoch": 1.99, + "grad_norm": 2.600959676445185, + "learning_rate": 1.3676951558580323e-06, + "loss": 0.6621, + "step": 13368 + }, + { + "epoch": 1.99, + "grad_norm": 1.186983465733605, + "learning_rate": 1.3676053139651737e-06, + "loss": 0.6504, + "step": 13369 + }, + { + "epoch": 1.99, + "grad_norm": 1.7735822015108633, + "learning_rate": 1.3675154686414225e-06, + "loss": 0.7012, + "step": 13370 + }, + { + "epoch": 1.99, + "grad_norm": 0.9061463660838851, + "learning_rate": 1.3674256198876165e-06, + "loss": 0.6862, + "step": 13371 + }, + { + "epoch": 1.99, + "grad_norm": 1.8657058905155948, + "learning_rate": 1.367335767704595e-06, + "loss": 0.6699, + "step": 13372 + }, + { + "epoch": 1.99, + "grad_norm": 0.9725580143207441, + "learning_rate": 1.3672459120931962e-06, + "loss": 0.6719, + "step": 13373 + }, + { + "epoch": 1.99, + "grad_norm": 6.551824194807773, + "learning_rate": 1.3671560530542588e-06, + "loss": 0.6979, + "step": 13374 + }, + { + "epoch": 1.99, + "grad_norm": 5.282033879657559, + "learning_rate": 1.3670661905886216e-06, + "loss": 0.6842, + "step": 13375 + }, + { + "epoch": 1.99, + "grad_norm": 1.2781966863413294, + "learning_rate": 1.366976324697123e-06, + "loss": 0.6478, + "step": 13376 + }, + { + "epoch": 2.0, + "grad_norm": 4.414941476746597, + "learning_rate": 1.366886455380602e-06, + "loss": 0.6836, + "step": 13377 + }, + { + "epoch": 2.0, + "grad_norm": 5.199621770820516, + "learning_rate": 1.3667965826398975e-06, + "loss": 0.6862, + "step": 13378 + }, + { + "epoch": 2.0, + "grad_norm": 1.8464577866235123, + "learning_rate": 1.3667067064758478e-06, + "loss": 0.6543, + "step": 13379 + }, + { + "epoch": 2.0, + "grad_norm": 3.8211721480751106, + "learning_rate": 1.3666168268892919e-06, + "loss": 0.6602, + "step": 13380 + }, + { + "epoch": 2.0, + "grad_norm": 5.243672650725814, + "learning_rate": 1.366526943881069e-06, + "loss": 0.6842, + "step": 13381 + }, + { + "epoch": 2.0, + "grad_norm": 1.155969946173069, + "learning_rate": 1.3664370574520177e-06, + "loss": 0.6452, + "step": 13382 + }, + { + "epoch": 2.0, + "grad_norm": 6.607868425745373, + "learning_rate": 1.3663471676029765e-06, + "loss": 0.7116, + "step": 13383 + }, + { + "epoch": 2.0, + "grad_norm": 6.2485148801931, + "learning_rate": 1.366257274334785e-06, + "loss": 0.6732, + "step": 13384 + }, + { + "epoch": 2.0, + "grad_norm": 2.2165569383817414, + "learning_rate": 1.3661673776482822e-06, + "loss": 0.6986, + "step": 13385 + }, + { + "epoch": 2.0, + "grad_norm": 6.650310739425231, + "learning_rate": 1.3660774775443068e-06, + "loss": 0.6849, + "step": 13386 + }, + { + "epoch": 2.0, + "grad_norm": 4.414167357097979, + "learning_rate": 1.3659875740236978e-06, + "loss": 0.6732, + "step": 13387 + }, + { + "epoch": 2.0, + "grad_norm": 0.9012788372904439, + "learning_rate": 1.3658976670872944e-06, + "loss": 0.6543, + "step": 13388 + }, + { + "epoch": 2.0, + "grad_norm": 1.1033193309081588, + "learning_rate": 1.365807756735936e-06, + "loss": 0.6875, + "step": 13389 + }, + { + "epoch": 2.0, + "grad_norm": 1.694457143387877, + "learning_rate": 1.3657178429704611e-06, + "loss": 0.6784, + "step": 13390 + }, + { + "epoch": 2.0, + "grad_norm": 4.773032758725273, + "learning_rate": 1.3656279257917097e-06, + "loss": 0.6771, + "step": 13391 + }, + { + "epoch": 2.0, + "grad_norm": 2.5411449345100476, + "learning_rate": 1.36553800520052e-06, + "loss": 0.7122, + "step": 13392 + }, + { + "epoch": 2.0, + "grad_norm": 2.5527849039399553, + "learning_rate": 1.3654480811977321e-06, + "loss": 0.6628, + "step": 13393 + }, + { + "epoch": 2.0, + "grad_norm": 3.808326163653883, + "learning_rate": 1.3653581537841848e-06, + "loss": 0.6719, + "step": 13394 + }, + { + "epoch": 2.0, + "grad_norm": 0.8729978611972425, + "learning_rate": 1.3652682229607177e-06, + "loss": 0.6647, + "step": 13395 + }, + { + "epoch": 2.0, + "grad_norm": 3.6744640250792675, + "learning_rate": 1.3651782887281695e-06, + "loss": 0.6934, + "step": 13396 + }, + { + "epoch": 2.0, + "grad_norm": 6.322234669837583, + "learning_rate": 1.3650883510873806e-06, + "loss": 0.6823, + "step": 13397 + }, + { + "epoch": 2.0, + "grad_norm": 2.1033614974321817, + "learning_rate": 1.3649984100391893e-06, + "loss": 0.668, + "step": 13398 + }, + { + "epoch": 2.0, + "grad_norm": 3.5002626100104335, + "learning_rate": 1.3649084655844362e-06, + "loss": 0.6738, + "step": 13399 + }, + { + "epoch": 2.0, + "grad_norm": 3.5369907874539597, + "learning_rate": 1.3648185177239597e-06, + "loss": 0.7018, + "step": 13400 + }, + { + "epoch": 2.0, + "grad_norm": 5.992853586195895, + "learning_rate": 1.3647285664585996e-06, + "loss": 0.6862, + "step": 13401 + }, + { + "epoch": 2.0, + "grad_norm": 4.414146407717461, + "learning_rate": 1.364638611789196e-06, + "loss": 0.6771, + "step": 13402 + }, + { + "epoch": 2.0, + "grad_norm": 5.442659437170646, + "learning_rate": 1.3645486537165876e-06, + "loss": 0.6465, + "step": 13403 + }, + { + "epoch": 2.0, + "grad_norm": 3.336888397494825, + "learning_rate": 1.3644586922416143e-06, + "loss": 0.6973, + "step": 13404 + }, + { + "epoch": 2.0, + "grad_norm": 1.4464509134913854, + "learning_rate": 1.3643687273651163e-06, + "loss": 0.6673, + "step": 13405 + }, + { + "epoch": 2.0, + "grad_norm": 0.9202480502794188, + "learning_rate": 1.3642787590879323e-06, + "loss": 0.6751, + "step": 13406 + }, + { + "epoch": 2.0, + "grad_norm": 3.8892149041164337, + "learning_rate": 1.3641887874109025e-06, + "loss": 0.6882, + "step": 13407 + }, + { + "epoch": 2.0, + "grad_norm": 5.45240322177311, + "learning_rate": 1.364098812334867e-06, + "loss": 0.6888, + "step": 13408 + }, + { + "epoch": 2.0, + "grad_norm": 2.4194919127395194, + "learning_rate": 1.3640088338606646e-06, + "loss": 0.6348, + "step": 13409 + }, + { + "epoch": 2.0, + "grad_norm": 5.739544894447946, + "learning_rate": 1.3639188519891355e-06, + "loss": 0.666, + "step": 13410 + }, + { + "epoch": 2.0, + "grad_norm": 1.0815525556602728, + "learning_rate": 1.3638288667211195e-06, + "loss": 0.6556, + "step": 13411 + }, + { + "epoch": 2.0, + "grad_norm": 0.7520992652987862, + "learning_rate": 1.363738878057457e-06, + "loss": 0.6602, + "step": 13412 + }, + { + "epoch": 2.0, + "grad_norm": 2.5533341776805285, + "learning_rate": 1.363648885998987e-06, + "loss": 0.6452, + "step": 13413 + }, + { + "epoch": 2.0, + "grad_norm": 5.04805144618195, + "learning_rate": 1.3635588905465498e-06, + "loss": 0.6823, + "step": 13414 + }, + { + "epoch": 2.0, + "grad_norm": 6.195126617433276, + "learning_rate": 1.3634688917009857e-06, + "loss": 0.6745, + "step": 13415 + }, + { + "epoch": 2.0, + "grad_norm": 2.3045118637573614, + "learning_rate": 1.363378889463134e-06, + "loss": 0.6758, + "step": 13416 + }, + { + "epoch": 2.0, + "grad_norm": 1.5894611901585247, + "learning_rate": 1.3632888838338348e-06, + "loss": 0.6595, + "step": 13417 + }, + { + "epoch": 2.0, + "grad_norm": 0.9940049821484593, + "learning_rate": 1.3631988748139289e-06, + "loss": 0.6556, + "step": 13418 + }, + { + "epoch": 2.0, + "grad_norm": 0.853336487761782, + "learning_rate": 1.363108862404255e-06, + "loss": 0.6582, + "step": 13419 + }, + { + "epoch": 2.0, + "grad_norm": 1.221589295582053, + "learning_rate": 1.3630188466056547e-06, + "loss": 0.6296, + "step": 13420 + }, + { + "epoch": 2.0, + "grad_norm": 1.1312783527648897, + "learning_rate": 1.3629288274189668e-06, + "loss": 0.6667, + "step": 13421 + }, + { + "epoch": 2.0, + "grad_norm": 1.097484221321375, + "learning_rate": 1.3628388048450325e-06, + "loss": 0.6569, + "step": 13422 + }, + { + "epoch": 2.0, + "grad_norm": 1.1270642906670085, + "learning_rate": 1.3627487788846914e-06, + "loss": 0.6393, + "step": 13423 + }, + { + "epoch": 2.0, + "grad_norm": 2.6992002997625466, + "learning_rate": 1.3626587495387838e-06, + "loss": 0.6764, + "step": 13424 + }, + { + "epoch": 2.0, + "grad_norm": 1.8017140930994748, + "learning_rate": 1.36256871680815e-06, + "loss": 0.6478, + "step": 13425 + }, + { + "epoch": 2.0, + "grad_norm": 1.1541448997313373, + "learning_rate": 1.3624786806936307e-06, + "loss": 0.6497, + "step": 13426 + }, + { + "epoch": 2.0, + "grad_norm": 1.3722441122906721, + "learning_rate": 1.3623886411960653e-06, + "loss": 0.6725, + "step": 13427 + }, + { + "epoch": 2.0, + "grad_norm": 3.3283604592918463, + "learning_rate": 1.3622985983162947e-06, + "loss": 0.6868, + "step": 13428 + }, + { + "epoch": 2.0, + "grad_norm": 1.8582732152476003, + "learning_rate": 1.3622085520551596e-06, + "loss": 0.6543, + "step": 13429 + }, + { + "epoch": 2.0, + "grad_norm": 2.001036374145405, + "learning_rate": 1.3621185024134997e-06, + "loss": 0.6732, + "step": 13430 + }, + { + "epoch": 2.0, + "grad_norm": 3.351069610145807, + "learning_rate": 1.362028449392156e-06, + "loss": 0.6354, + "step": 13431 + }, + { + "epoch": 2.0, + "grad_norm": 1.4913614646109223, + "learning_rate": 1.3619383929919687e-06, + "loss": 0.6484, + "step": 13432 + }, + { + "epoch": 2.0, + "grad_norm": 1.6375565313602714, + "learning_rate": 1.3618483332137783e-06, + "loss": 0.6569, + "step": 13433 + }, + { + "epoch": 2.0, + "grad_norm": 2.5946337671342525, + "learning_rate": 1.3617582700584256e-06, + "loss": 0.6387, + "step": 13434 + }, + { + "epoch": 2.0, + "grad_norm": 2.9616074742914806, + "learning_rate": 1.361668203526751e-06, + "loss": 0.6536, + "step": 13435 + }, + { + "epoch": 2.0, + "grad_norm": 1.3348811244568353, + "learning_rate": 1.3615781336195949e-06, + "loss": 0.6797, + "step": 13436 + }, + { + "epoch": 2.0, + "grad_norm": 1.156104207635896, + "learning_rate": 1.361488060337798e-06, + "loss": 0.6198, + "step": 13437 + }, + { + "epoch": 2.0, + "grad_norm": 1.873429853183892, + "learning_rate": 1.3613979836822012e-06, + "loss": 0.6204, + "step": 13438 + }, + { + "epoch": 2.0, + "grad_norm": 2.263062646500634, + "learning_rate": 1.3613079036536451e-06, + "loss": 0.6751, + "step": 13439 + }, + { + "epoch": 2.0, + "grad_norm": 1.7759578756169916, + "learning_rate": 1.3612178202529702e-06, + "loss": 0.6654, + "step": 13440 + }, + { + "epoch": 2.0, + "grad_norm": 1.7180515752559813, + "learning_rate": 1.3611277334810176e-06, + "loss": 0.6439, + "step": 13441 + }, + { + "epoch": 2.0, + "grad_norm": 1.6700757479218697, + "learning_rate": 1.3610376433386281e-06, + "loss": 0.6341, + "step": 13442 + }, + { + "epoch": 2.0, + "grad_norm": 1.4536343049884402, + "learning_rate": 1.3609475498266419e-06, + "loss": 0.6445, + "step": 13443 + }, + { + "epoch": 2.01, + "grad_norm": 3.6178177837838845, + "learning_rate": 1.3608574529459007e-06, + "loss": 0.7005, + "step": 13444 + }, + { + "epoch": 2.01, + "grad_norm": 4.9887823695770415, + "learning_rate": 1.360767352697245e-06, + "loss": 0.6641, + "step": 13445 + }, + { + "epoch": 2.01, + "grad_norm": 2.1973958273094327, + "learning_rate": 1.3606772490815152e-06, + "loss": 0.6849, + "step": 13446 + }, + { + "epoch": 2.01, + "grad_norm": 1.5943815588031862, + "learning_rate": 1.3605871420995532e-06, + "loss": 0.64, + "step": 13447 + }, + { + "epoch": 2.01, + "grad_norm": 1.4298661551836425, + "learning_rate": 1.3604970317521996e-06, + "loss": 0.6309, + "step": 13448 + }, + { + "epoch": 2.01, + "grad_norm": 1.4618494905422166, + "learning_rate": 1.3604069180402949e-06, + "loss": 0.6497, + "step": 13449 + }, + { + "epoch": 2.01, + "grad_norm": 1.603148842087315, + "learning_rate": 1.3603168009646808e-06, + "loss": 0.6764, + "step": 13450 + }, + { + "epoch": 2.01, + "grad_norm": 2.208394333409434, + "learning_rate": 1.360226680526198e-06, + "loss": 0.7012, + "step": 13451 + }, + { + "epoch": 2.01, + "grad_norm": 1.525988780284267, + "learning_rate": 1.360136556725688e-06, + "loss": 0.6725, + "step": 13452 + }, + { + "epoch": 2.01, + "grad_norm": 1.482289715468606, + "learning_rate": 1.3600464295639912e-06, + "loss": 0.6374, + "step": 13453 + }, + { + "epoch": 2.01, + "grad_norm": 4.48783799937286, + "learning_rate": 1.3599562990419496e-06, + "loss": 0.6283, + "step": 13454 + }, + { + "epoch": 2.01, + "grad_norm": 1.468512963462477, + "learning_rate": 1.3598661651604042e-06, + "loss": 0.6667, + "step": 13455 + }, + { + "epoch": 2.01, + "grad_norm": 3.4949948757751295, + "learning_rate": 1.3597760279201955e-06, + "loss": 0.6354, + "step": 13456 + }, + { + "epoch": 2.01, + "grad_norm": 4.053857077033095, + "learning_rate": 1.3596858873221656e-06, + "loss": 0.6471, + "step": 13457 + }, + { + "epoch": 2.01, + "grad_norm": 3.457955699122341, + "learning_rate": 1.3595957433671557e-06, + "loss": 0.6777, + "step": 13458 + }, + { + "epoch": 2.01, + "grad_norm": 4.542726860227372, + "learning_rate": 1.3595055960560067e-06, + "loss": 0.64, + "step": 13459 + }, + { + "epoch": 2.01, + "grad_norm": 1.3579458898256087, + "learning_rate": 1.35941544538956e-06, + "loss": 0.6302, + "step": 13460 + }, + { + "epoch": 2.01, + "grad_norm": 3.813516704623326, + "learning_rate": 1.3593252913686578e-06, + "loss": 0.6738, + "step": 13461 + }, + { + "epoch": 2.01, + "grad_norm": 3.284799478668429, + "learning_rate": 1.35923513399414e-06, + "loss": 0.638, + "step": 13462 + }, + { + "epoch": 2.01, + "grad_norm": 1.761669209633376, + "learning_rate": 1.3591449732668497e-06, + "loss": 0.6478, + "step": 13463 + }, + { + "epoch": 2.01, + "grad_norm": 2.6937769442263333, + "learning_rate": 1.359054809187627e-06, + "loss": 0.623, + "step": 13464 + }, + { + "epoch": 2.01, + "grad_norm": 1.466975290226957, + "learning_rate": 1.3589646417573144e-06, + "loss": 0.6217, + "step": 13465 + }, + { + "epoch": 2.01, + "grad_norm": 4.645386352396363, + "learning_rate": 1.3588744709767532e-06, + "loss": 0.6602, + "step": 13466 + }, + { + "epoch": 2.01, + "grad_norm": 5.900885818514905, + "learning_rate": 1.3587842968467844e-06, + "loss": 0.6992, + "step": 13467 + }, + { + "epoch": 2.01, + "grad_norm": 1.6918653626571019, + "learning_rate": 1.3586941193682505e-06, + "loss": 0.6302, + "step": 13468 + }, + { + "epoch": 2.01, + "grad_norm": 3.00267798958568, + "learning_rate": 1.3586039385419921e-06, + "loss": 0.6426, + "step": 13469 + }, + { + "epoch": 2.01, + "grad_norm": 3.701333600554813, + "learning_rate": 1.3585137543688516e-06, + "loss": 0.6641, + "step": 13470 + }, + { + "epoch": 2.01, + "grad_norm": 4.716804802580609, + "learning_rate": 1.358423566849671e-06, + "loss": 0.6732, + "step": 13471 + }, + { + "epoch": 2.01, + "grad_norm": 2.351858070845399, + "learning_rate": 1.358333375985291e-06, + "loss": 0.653, + "step": 13472 + }, + { + "epoch": 2.01, + "grad_norm": 1.4998803702801038, + "learning_rate": 1.358243181776554e-06, + "loss": 0.666, + "step": 13473 + }, + { + "epoch": 2.01, + "grad_norm": 5.579321672317965, + "learning_rate": 1.3581529842243018e-06, + "loss": 0.6172, + "step": 13474 + }, + { + "epoch": 2.01, + "grad_norm": 2.998884281541847, + "learning_rate": 1.3580627833293763e-06, + "loss": 0.6172, + "step": 13475 + }, + { + "epoch": 2.01, + "grad_norm": 2.6212234222910067, + "learning_rate": 1.357972579092619e-06, + "loss": 0.6445, + "step": 13476 + }, + { + "epoch": 2.01, + "grad_norm": 4.404092806333264, + "learning_rate": 1.3578823715148718e-06, + "loss": 0.6478, + "step": 13477 + }, + { + "epoch": 2.01, + "grad_norm": 4.124140734836512, + "learning_rate": 1.357792160596977e-06, + "loss": 0.6484, + "step": 13478 + }, + { + "epoch": 2.01, + "grad_norm": 4.189092964042026, + "learning_rate": 1.3577019463397763e-06, + "loss": 0.6751, + "step": 13479 + }, + { + "epoch": 2.01, + "grad_norm": 5.3496016861893105, + "learning_rate": 1.3576117287441116e-06, + "loss": 0.7018, + "step": 13480 + }, + { + "epoch": 2.01, + "grad_norm": 4.153376981597459, + "learning_rate": 1.3575215078108254e-06, + "loss": 0.584, + "step": 13481 + }, + { + "epoch": 2.01, + "grad_norm": 4.913446407711714, + "learning_rate": 1.357431283540759e-06, + "loss": 0.6549, + "step": 13482 + }, + { + "epoch": 2.01, + "grad_norm": 1.7080358514362428, + "learning_rate": 1.3573410559347545e-06, + "loss": 0.6543, + "step": 13483 + }, + { + "epoch": 2.01, + "grad_norm": 2.444049578678295, + "learning_rate": 1.357250824993655e-06, + "loss": 0.6764, + "step": 13484 + }, + { + "epoch": 2.01, + "grad_norm": 1.8145553316947172, + "learning_rate": 1.3571605907183014e-06, + "loss": 0.6888, + "step": 13485 + }, + { + "epoch": 2.01, + "grad_norm": 1.8347032489401274, + "learning_rate": 1.3570703531095368e-06, + "loss": 0.6296, + "step": 13486 + }, + { + "epoch": 2.01, + "grad_norm": 2.181386785584049, + "learning_rate": 1.3569801121682027e-06, + "loss": 0.627, + "step": 13487 + }, + { + "epoch": 2.01, + "grad_norm": 6.293098376986487, + "learning_rate": 1.356889867895142e-06, + "loss": 0.6576, + "step": 13488 + }, + { + "epoch": 2.01, + "grad_norm": 4.90058951054711, + "learning_rate": 1.3567996202911963e-06, + "loss": 0.6374, + "step": 13489 + }, + { + "epoch": 2.01, + "grad_norm": 1.9557798169317724, + "learning_rate": 1.3567093693572083e-06, + "loss": 0.6615, + "step": 13490 + }, + { + "epoch": 2.01, + "grad_norm": 5.381423929586669, + "learning_rate": 1.3566191150940203e-06, + "loss": 0.6536, + "step": 13491 + }, + { + "epoch": 2.01, + "grad_norm": 2.2022510330470526, + "learning_rate": 1.3565288575024746e-06, + "loss": 0.6452, + "step": 13492 + }, + { + "epoch": 2.01, + "grad_norm": 2.008155853838501, + "learning_rate": 1.356438596583413e-06, + "loss": 0.6497, + "step": 13493 + }, + { + "epoch": 2.01, + "grad_norm": 2.535238195586314, + "learning_rate": 1.356348332337679e-06, + "loss": 0.6621, + "step": 13494 + }, + { + "epoch": 2.01, + "grad_norm": 1.6652024456762562, + "learning_rate": 1.3562580647661146e-06, + "loss": 0.6348, + "step": 13495 + }, + { + "epoch": 2.01, + "grad_norm": 3.415134047253328, + "learning_rate": 1.3561677938695617e-06, + "loss": 0.6927, + "step": 13496 + }, + { + "epoch": 2.01, + "grad_norm": 2.4848864412027747, + "learning_rate": 1.3560775196488636e-06, + "loss": 0.651, + "step": 13497 + }, + { + "epoch": 2.01, + "grad_norm": 1.9746988999192507, + "learning_rate": 1.3559872421048626e-06, + "loss": 0.6361, + "step": 13498 + }, + { + "epoch": 2.01, + "grad_norm": 2.0332403039536393, + "learning_rate": 1.3558969612384007e-06, + "loss": 0.6387, + "step": 13499 + }, + { + "epoch": 2.01, + "grad_norm": 1.506641171226551, + "learning_rate": 1.3558066770503214e-06, + "loss": 0.6667, + "step": 13500 + }, + { + "epoch": 2.01, + "grad_norm": 2.4972403864361676, + "learning_rate": 1.355716389541467e-06, + "loss": 0.6641, + "step": 13501 + }, + { + "epoch": 2.01, + "grad_norm": 1.6877139430194366, + "learning_rate": 1.35562609871268e-06, + "loss": 0.6517, + "step": 13502 + }, + { + "epoch": 2.01, + "grad_norm": 4.567589004524895, + "learning_rate": 1.3555358045648029e-06, + "loss": 0.6712, + "step": 13503 + }, + { + "epoch": 2.01, + "grad_norm": 1.6016923631571005, + "learning_rate": 1.355445507098679e-06, + "loss": 0.6641, + "step": 13504 + }, + { + "epoch": 2.01, + "grad_norm": 6.834955302751593, + "learning_rate": 1.3553552063151508e-06, + "loss": 0.6484, + "step": 13505 + }, + { + "epoch": 2.01, + "grad_norm": 4.8770183394677105, + "learning_rate": 1.3552649022150605e-06, + "loss": 0.6595, + "step": 13506 + }, + { + "epoch": 2.01, + "grad_norm": 2.8991308816990733, + "learning_rate": 1.355174594799252e-06, + "loss": 0.6445, + "step": 13507 + }, + { + "epoch": 2.01, + "grad_norm": 1.871245494043392, + "learning_rate": 1.3550842840685674e-06, + "loss": 0.6276, + "step": 13508 + }, + { + "epoch": 2.01, + "grad_norm": 2.4088083234612143, + "learning_rate": 1.3549939700238498e-06, + "loss": 0.6504, + "step": 13509 + }, + { + "epoch": 2.01, + "grad_norm": 1.6116216229587577, + "learning_rate": 1.354903652665942e-06, + "loss": 0.6628, + "step": 13510 + }, + { + "epoch": 2.02, + "grad_norm": 3.470062384602289, + "learning_rate": 1.3548133319956871e-06, + "loss": 0.6465, + "step": 13511 + }, + { + "epoch": 2.02, + "grad_norm": 9.899797391267114, + "learning_rate": 1.354723008013928e-06, + "loss": 0.6973, + "step": 13512 + }, + { + "epoch": 2.02, + "grad_norm": 4.485916192562916, + "learning_rate": 1.3546326807215076e-06, + "loss": 0.6432, + "step": 13513 + }, + { + "epoch": 2.02, + "grad_norm": 2.348616511416443, + "learning_rate": 1.3545423501192694e-06, + "loss": 0.6908, + "step": 13514 + }, + { + "epoch": 2.02, + "grad_norm": 3.6914406595098184, + "learning_rate": 1.3544520162080556e-06, + "loss": 0.6497, + "step": 13515 + }, + { + "epoch": 2.02, + "grad_norm": 2.961816630297597, + "learning_rate": 1.35436167898871e-06, + "loss": 0.6465, + "step": 13516 + }, + { + "epoch": 2.02, + "grad_norm": 1.8148352163827774, + "learning_rate": 1.3542713384620756e-06, + "loss": 0.6113, + "step": 13517 + }, + { + "epoch": 2.02, + "grad_norm": 2.878010554969716, + "learning_rate": 1.3541809946289954e-06, + "loss": 0.6608, + "step": 13518 + }, + { + "epoch": 2.02, + "grad_norm": 1.8320779856914853, + "learning_rate": 1.3540906474903126e-06, + "loss": 0.6432, + "step": 13519 + }, + { + "epoch": 2.02, + "grad_norm": 1.6325591003479345, + "learning_rate": 1.3540002970468701e-06, + "loss": 0.61, + "step": 13520 + }, + { + "epoch": 2.02, + "grad_norm": 1.395115822787504, + "learning_rate": 1.3539099432995124e-06, + "loss": 0.6445, + "step": 13521 + }, + { + "epoch": 2.02, + "grad_norm": 2.2119266928441554, + "learning_rate": 1.3538195862490812e-06, + "loss": 0.6764, + "step": 13522 + }, + { + "epoch": 2.02, + "grad_norm": 1.4314813174083487, + "learning_rate": 1.3537292258964208e-06, + "loss": 0.6549, + "step": 13523 + }, + { + "epoch": 2.02, + "grad_norm": 1.6673938556401158, + "learning_rate": 1.3536388622423744e-06, + "loss": 0.6816, + "step": 13524 + }, + { + "epoch": 2.02, + "grad_norm": 2.2036352670885604, + "learning_rate": 1.3535484952877848e-06, + "loss": 0.6816, + "step": 13525 + }, + { + "epoch": 2.02, + "grad_norm": 3.0219324008900528, + "learning_rate": 1.3534581250334962e-06, + "loss": 0.6452, + "step": 13526 + }, + { + "epoch": 2.02, + "grad_norm": 3.420789763809331, + "learning_rate": 1.3533677514803514e-06, + "loss": 0.6341, + "step": 13527 + }, + { + "epoch": 2.02, + "grad_norm": 3.923023180020789, + "learning_rate": 1.3532773746291942e-06, + "loss": 0.6569, + "step": 13528 + }, + { + "epoch": 2.02, + "grad_norm": 2.15607778572406, + "learning_rate": 1.353186994480868e-06, + "loss": 0.6797, + "step": 13529 + }, + { + "epoch": 2.02, + "grad_norm": 2.656820424317417, + "learning_rate": 1.3530966110362163e-06, + "loss": 0.7031, + "step": 13530 + }, + { + "epoch": 2.02, + "grad_norm": 1.667808680148717, + "learning_rate": 1.3530062242960828e-06, + "loss": 0.6354, + "step": 13531 + }, + { + "epoch": 2.02, + "grad_norm": 2.1455163712556833, + "learning_rate": 1.3529158342613112e-06, + "loss": 0.6328, + "step": 13532 + }, + { + "epoch": 2.02, + "grad_norm": 2.6623518126746126, + "learning_rate": 1.3528254409327444e-06, + "loss": 0.6452, + "step": 13533 + }, + { + "epoch": 2.02, + "grad_norm": 1.6919186001500208, + "learning_rate": 1.352735044311227e-06, + "loss": 0.625, + "step": 13534 + }, + { + "epoch": 2.02, + "grad_norm": 3.3832320805921294, + "learning_rate": 1.352644644397602e-06, + "loss": 0.6777, + "step": 13535 + }, + { + "epoch": 2.02, + "grad_norm": 2.412585328361205, + "learning_rate": 1.3525542411927132e-06, + "loss": 0.6296, + "step": 13536 + }, + { + "epoch": 2.02, + "grad_norm": 2.0863770609466235, + "learning_rate": 1.352463834697405e-06, + "loss": 0.6484, + "step": 13537 + }, + { + "epoch": 2.02, + "grad_norm": 1.5949618099548464, + "learning_rate": 1.3523734249125206e-06, + "loss": 0.6484, + "step": 13538 + }, + { + "epoch": 2.02, + "grad_norm": 1.696104528354575, + "learning_rate": 1.3522830118389034e-06, + "loss": 0.6536, + "step": 13539 + }, + { + "epoch": 2.02, + "grad_norm": 3.1606102793197763, + "learning_rate": 1.352192595477398e-06, + "loss": 0.6354, + "step": 13540 + }, + { + "epoch": 2.02, + "grad_norm": 2.31761412399972, + "learning_rate": 1.352102175828848e-06, + "loss": 0.6992, + "step": 13541 + }, + { + "epoch": 2.02, + "grad_norm": 1.6913871974204782, + "learning_rate": 1.3520117528940973e-06, + "loss": 0.6882, + "step": 13542 + }, + { + "epoch": 2.02, + "grad_norm": 1.9172939135700018, + "learning_rate": 1.3519213266739894e-06, + "loss": 0.6458, + "step": 13543 + }, + { + "epoch": 2.02, + "grad_norm": 3.596482882453033, + "learning_rate": 1.351830897169369e-06, + "loss": 0.6413, + "step": 13544 + }, + { + "epoch": 2.02, + "grad_norm": 1.536351704371488, + "learning_rate": 1.3517404643810796e-06, + "loss": 0.6576, + "step": 13545 + }, + { + "epoch": 2.02, + "grad_norm": 1.9676366365553908, + "learning_rate": 1.3516500283099653e-06, + "loss": 0.6191, + "step": 13546 + }, + { + "epoch": 2.02, + "grad_norm": 3.8318532965934393, + "learning_rate": 1.3515595889568703e-06, + "loss": 0.6322, + "step": 13547 + }, + { + "epoch": 2.02, + "grad_norm": 1.7023185194398178, + "learning_rate": 1.3514691463226384e-06, + "loss": 0.6283, + "step": 13548 + }, + { + "epoch": 2.02, + "grad_norm": 3.333206234125659, + "learning_rate": 1.351378700408114e-06, + "loss": 0.6478, + "step": 13549 + }, + { + "epoch": 2.02, + "grad_norm": 5.822913247212875, + "learning_rate": 1.351288251214141e-06, + "loss": 0.653, + "step": 13550 + }, + { + "epoch": 2.02, + "grad_norm": 1.7661261334332425, + "learning_rate": 1.351197798741564e-06, + "loss": 0.7096, + "step": 13551 + }, + { + "epoch": 2.02, + "grad_norm": 2.3648055881822705, + "learning_rate": 1.3511073429912266e-06, + "loss": 0.6602, + "step": 13552 + }, + { + "epoch": 2.02, + "grad_norm": 2.77264072688375, + "learning_rate": 1.351016883963973e-06, + "loss": 0.638, + "step": 13553 + }, + { + "epoch": 2.02, + "grad_norm": 1.9843479601974854, + "learning_rate": 1.3509264216606485e-06, + "loss": 0.7051, + "step": 13554 + }, + { + "epoch": 2.02, + "grad_norm": 3.5101560789948794, + "learning_rate": 1.3508359560820964e-06, + "loss": 0.6478, + "step": 13555 + }, + { + "epoch": 2.02, + "grad_norm": 2.6920940122703017, + "learning_rate": 1.3507454872291607e-06, + "loss": 0.6478, + "step": 13556 + }, + { + "epoch": 2.02, + "grad_norm": 1.753377499151465, + "learning_rate": 1.350655015102687e-06, + "loss": 0.6523, + "step": 13557 + }, + { + "epoch": 2.02, + "grad_norm": 2.5131123582362753, + "learning_rate": 1.350564539703519e-06, + "loss": 0.6725, + "step": 13558 + }, + { + "epoch": 2.02, + "grad_norm": 4.718029479530143, + "learning_rate": 1.350474061032501e-06, + "loss": 0.6198, + "step": 13559 + }, + { + "epoch": 2.02, + "grad_norm": 3.0172188373229107, + "learning_rate": 1.3503835790904776e-06, + "loss": 0.6289, + "step": 13560 + }, + { + "epoch": 2.02, + "grad_norm": 1.9829611466534387, + "learning_rate": 1.3502930938782934e-06, + "loss": 0.6335, + "step": 13561 + }, + { + "epoch": 2.02, + "grad_norm": 5.674264100843808, + "learning_rate": 1.3502026053967926e-06, + "loss": 0.5918, + "step": 13562 + }, + { + "epoch": 2.02, + "grad_norm": 2.7536962457054264, + "learning_rate": 1.3501121136468202e-06, + "loss": 0.6322, + "step": 13563 + }, + { + "epoch": 2.02, + "grad_norm": 1.8212517511147444, + "learning_rate": 1.3500216186292202e-06, + "loss": 0.666, + "step": 13564 + }, + { + "epoch": 2.02, + "grad_norm": 4.599450351244365, + "learning_rate": 1.3499311203448375e-06, + "loss": 0.6517, + "step": 13565 + }, + { + "epoch": 2.02, + "grad_norm": 2.215882231960931, + "learning_rate": 1.3498406187945168e-06, + "loss": 0.6706, + "step": 13566 + }, + { + "epoch": 2.02, + "grad_norm": 1.6099484126608719, + "learning_rate": 1.3497501139791027e-06, + "loss": 0.627, + "step": 13567 + }, + { + "epoch": 2.02, + "grad_norm": 1.7576085838337492, + "learning_rate": 1.3496596058994398e-06, + "loss": 0.6139, + "step": 13568 + }, + { + "epoch": 2.02, + "grad_norm": 5.0097210584009195, + "learning_rate": 1.3495690945563726e-06, + "loss": 0.6471, + "step": 13569 + }, + { + "epoch": 2.02, + "grad_norm": 2.9947511070036303, + "learning_rate": 1.3494785799507467e-06, + "loss": 0.6289, + "step": 13570 + }, + { + "epoch": 2.02, + "grad_norm": 4.8817279784436804, + "learning_rate": 1.3493880620834061e-06, + "loss": 0.6543, + "step": 13571 + }, + { + "epoch": 2.02, + "grad_norm": 2.0044256894700783, + "learning_rate": 1.3492975409551956e-06, + "loss": 0.6276, + "step": 13572 + }, + { + "epoch": 2.02, + "grad_norm": 5.6411080657927775, + "learning_rate": 1.3492070165669604e-06, + "loss": 0.6517, + "step": 13573 + }, + { + "epoch": 2.02, + "grad_norm": 2.231286019274484, + "learning_rate": 1.3491164889195451e-06, + "loss": 0.6491, + "step": 13574 + }, + { + "epoch": 2.02, + "grad_norm": 3.873614672036474, + "learning_rate": 1.3490259580137948e-06, + "loss": 0.6699, + "step": 13575 + }, + { + "epoch": 2.02, + "grad_norm": 1.9128377018143128, + "learning_rate": 1.3489354238505547e-06, + "loss": 0.6413, + "step": 13576 + }, + { + "epoch": 2.02, + "grad_norm": 3.3109924083436613, + "learning_rate": 1.348844886430669e-06, + "loss": 0.6745, + "step": 13577 + }, + { + "epoch": 2.03, + "grad_norm": 2.9073320173440975, + "learning_rate": 1.3487543457549833e-06, + "loss": 0.6634, + "step": 13578 + }, + { + "epoch": 2.03, + "grad_norm": 2.6304559531358516, + "learning_rate": 1.3486638018243424e-06, + "loss": 0.6706, + "step": 13579 + }, + { + "epoch": 2.03, + "grad_norm": 2.767241266447287, + "learning_rate": 1.3485732546395915e-06, + "loss": 0.6589, + "step": 13580 + }, + { + "epoch": 2.03, + "grad_norm": 2.625602580507604, + "learning_rate": 1.3484827042015754e-06, + "loss": 0.6322, + "step": 13581 + }, + { + "epoch": 2.03, + "grad_norm": 2.5915828723948473, + "learning_rate": 1.3483921505111395e-06, + "loss": 0.6178, + "step": 13582 + }, + { + "epoch": 2.03, + "grad_norm": 2.6792960070698864, + "learning_rate": 1.3483015935691287e-06, + "loss": 0.6491, + "step": 13583 + }, + { + "epoch": 2.03, + "grad_norm": 2.995016376397095, + "learning_rate": 1.3482110333763888e-06, + "loss": 0.6562, + "step": 13584 + }, + { + "epoch": 2.03, + "grad_norm": 2.0402994212966927, + "learning_rate": 1.3481204699337641e-06, + "loss": 0.6328, + "step": 13585 + }, + { + "epoch": 2.03, + "grad_norm": 4.769781314878707, + "learning_rate": 1.3480299032421e-06, + "loss": 0.6393, + "step": 13586 + }, + { + "epoch": 2.03, + "grad_norm": 4.1327600908769595, + "learning_rate": 1.3479393333022429e-06, + "loss": 0.6361, + "step": 13587 + }, + { + "epoch": 2.03, + "grad_norm": 2.18135785523164, + "learning_rate": 1.3478487601150365e-06, + "loss": 0.6699, + "step": 13588 + }, + { + "epoch": 2.03, + "grad_norm": 1.9593203740492222, + "learning_rate": 1.3477581836813271e-06, + "loss": 0.6758, + "step": 13589 + }, + { + "epoch": 2.03, + "grad_norm": 1.6749204663981274, + "learning_rate": 1.34766760400196e-06, + "loss": 0.6354, + "step": 13590 + }, + { + "epoch": 2.03, + "grad_norm": 1.7933463190029055, + "learning_rate": 1.34757702107778e-06, + "loss": 0.6419, + "step": 13591 + }, + { + "epoch": 2.03, + "grad_norm": 2.316523585567213, + "learning_rate": 1.3474864349096333e-06, + "loss": 0.6237, + "step": 13592 + }, + { + "epoch": 2.03, + "grad_norm": 2.137338069847435, + "learning_rate": 1.347395845498365e-06, + "loss": 0.612, + "step": 13593 + }, + { + "epoch": 2.03, + "grad_norm": 4.961744294315645, + "learning_rate": 1.34730525284482e-06, + "loss": 0.5977, + "step": 13594 + }, + { + "epoch": 2.03, + "grad_norm": 3.252924629266121, + "learning_rate": 1.347214656949845e-06, + "loss": 0.6901, + "step": 13595 + }, + { + "epoch": 2.03, + "grad_norm": 2.120440525250097, + "learning_rate": 1.3471240578142846e-06, + "loss": 0.6445, + "step": 13596 + }, + { + "epoch": 2.03, + "grad_norm": 2.445534157069433, + "learning_rate": 1.3470334554389849e-06, + "loss": 0.6686, + "step": 13597 + }, + { + "epoch": 2.03, + "grad_norm": 4.177427939044865, + "learning_rate": 1.346942849824791e-06, + "loss": 0.6641, + "step": 13598 + }, + { + "epoch": 2.03, + "grad_norm": 2.330516817307844, + "learning_rate": 1.3468522409725488e-06, + "loss": 0.6367, + "step": 13599 + }, + { + "epoch": 2.03, + "grad_norm": 2.027652814952941, + "learning_rate": 1.3467616288831044e-06, + "loss": 0.6589, + "step": 13600 + }, + { + "epoch": 2.03, + "grad_norm": 4.421718713819332, + "learning_rate": 1.3466710135573025e-06, + "loss": 0.6667, + "step": 13601 + }, + { + "epoch": 2.03, + "grad_norm": 2.448471129858495, + "learning_rate": 1.3465803949959894e-06, + "loss": 0.6413, + "step": 13602 + }, + { + "epoch": 2.03, + "grad_norm": 4.479960480332824, + "learning_rate": 1.3464897732000112e-06, + "loss": 0.6465, + "step": 13603 + }, + { + "epoch": 2.03, + "grad_norm": 2.0688288303690117, + "learning_rate": 1.3463991481702128e-06, + "loss": 0.6608, + "step": 13604 + }, + { + "epoch": 2.03, + "grad_norm": 2.351059720801898, + "learning_rate": 1.346308519907441e-06, + "loss": 0.6126, + "step": 13605 + }, + { + "epoch": 2.03, + "grad_norm": 2.507303487185487, + "learning_rate": 1.3462178884125408e-06, + "loss": 0.6113, + "step": 13606 + }, + { + "epoch": 2.03, + "grad_norm": 1.9230231764373185, + "learning_rate": 1.3461272536863588e-06, + "loss": 0.6738, + "step": 13607 + }, + { + "epoch": 2.03, + "grad_norm": 5.455222887898202, + "learning_rate": 1.3460366157297403e-06, + "loss": 0.6419, + "step": 13608 + }, + { + "epoch": 2.03, + "grad_norm": 3.9258340712697852, + "learning_rate": 1.3459459745435315e-06, + "loss": 0.6784, + "step": 13609 + }, + { + "epoch": 2.03, + "grad_norm": 1.8887076713585744, + "learning_rate": 1.3458553301285782e-06, + "loss": 0.668, + "step": 13610 + }, + { + "epoch": 2.03, + "grad_norm": 2.017466925915035, + "learning_rate": 1.3457646824857267e-06, + "loss": 0.6445, + "step": 13611 + }, + { + "epoch": 2.03, + "grad_norm": 3.2301850329724036, + "learning_rate": 1.3456740316158228e-06, + "loss": 0.6068, + "step": 13612 + }, + { + "epoch": 2.03, + "grad_norm": 1.8020039106776704, + "learning_rate": 1.3455833775197128e-06, + "loss": 0.6185, + "step": 13613 + }, + { + "epoch": 2.03, + "grad_norm": 2.1008264190828134, + "learning_rate": 1.3454927201982423e-06, + "loss": 0.6771, + "step": 13614 + }, + { + "epoch": 2.03, + "grad_norm": 2.451574464134144, + "learning_rate": 1.3454020596522575e-06, + "loss": 0.6602, + "step": 13615 + }, + { + "epoch": 2.03, + "grad_norm": 1.8902364534892222, + "learning_rate": 1.345311395882605e-06, + "loss": 0.6016, + "step": 13616 + }, + { + "epoch": 2.03, + "grad_norm": 2.0422697774503167, + "learning_rate": 1.3452207288901307e-06, + "loss": 0.6458, + "step": 13617 + }, + { + "epoch": 2.03, + "grad_norm": 2.1073769204835697, + "learning_rate": 1.345130058675681e-06, + "loss": 0.6738, + "step": 13618 + }, + { + "epoch": 2.03, + "grad_norm": 2.079491570253361, + "learning_rate": 1.3450393852401013e-06, + "loss": 0.6751, + "step": 13619 + }, + { + "epoch": 2.03, + "grad_norm": 2.0311845629097256, + "learning_rate": 1.3449487085842392e-06, + "loss": 0.653, + "step": 13620 + }, + { + "epoch": 2.03, + "grad_norm": 3.586192308123405, + "learning_rate": 1.3448580287089401e-06, + "loss": 0.6621, + "step": 13621 + }, + { + "epoch": 2.03, + "grad_norm": 6.881761665360499, + "learning_rate": 1.3447673456150504e-06, + "loss": 0.6309, + "step": 13622 + }, + { + "epoch": 2.03, + "grad_norm": 3.2744506419122095, + "learning_rate": 1.3446766593034167e-06, + "loss": 0.6758, + "step": 13623 + }, + { + "epoch": 2.03, + "grad_norm": 2.9405985274598723, + "learning_rate": 1.344585969774885e-06, + "loss": 0.6465, + "step": 13624 + }, + { + "epoch": 2.03, + "grad_norm": 2.5976155032370283, + "learning_rate": 1.344495277030302e-06, + "loss": 0.6582, + "step": 13625 + }, + { + "epoch": 2.03, + "grad_norm": 2.0308963675281597, + "learning_rate": 1.3444045810705144e-06, + "loss": 0.6458, + "step": 13626 + }, + { + "epoch": 2.03, + "grad_norm": 2.441859652990548, + "learning_rate": 1.3443138818963684e-06, + "loss": 0.6504, + "step": 13627 + }, + { + "epoch": 2.03, + "grad_norm": 6.391849976567222, + "learning_rate": 1.3442231795087103e-06, + "loss": 0.6413, + "step": 13628 + }, + { + "epoch": 2.03, + "grad_norm": 2.057611458100045, + "learning_rate": 1.3441324739083869e-06, + "loss": 0.6523, + "step": 13629 + }, + { + "epoch": 2.03, + "grad_norm": 3.4155478536062445, + "learning_rate": 1.3440417650962445e-06, + "loss": 0.6283, + "step": 13630 + }, + { + "epoch": 2.03, + "grad_norm": 1.7396859650212455, + "learning_rate": 1.34395105307313e-06, + "loss": 0.653, + "step": 13631 + }, + { + "epoch": 2.03, + "grad_norm": 2.1255784916056295, + "learning_rate": 1.34386033783989e-06, + "loss": 0.6764, + "step": 13632 + }, + { + "epoch": 2.03, + "grad_norm": 1.65322170336649, + "learning_rate": 1.343769619397371e-06, + "loss": 0.638, + "step": 13633 + }, + { + "epoch": 2.03, + "grad_norm": 6.560587317709281, + "learning_rate": 1.34367889774642e-06, + "loss": 0.7005, + "step": 13634 + }, + { + "epoch": 2.03, + "grad_norm": 2.0804470243444064, + "learning_rate": 1.3435881728878828e-06, + "loss": 0.6159, + "step": 13635 + }, + { + "epoch": 2.03, + "grad_norm": 2.8392653457238732, + "learning_rate": 1.3434974448226072e-06, + "loss": 0.6771, + "step": 13636 + }, + { + "epoch": 2.03, + "grad_norm": 2.2044550314198474, + "learning_rate": 1.3434067135514396e-06, + "loss": 0.6374, + "step": 13637 + }, + { + "epoch": 2.03, + "grad_norm": 4.100648670417481, + "learning_rate": 1.3433159790752266e-06, + "loss": 0.6191, + "step": 13638 + }, + { + "epoch": 2.03, + "grad_norm": 1.8905548363971671, + "learning_rate": 1.3432252413948155e-06, + "loss": 0.6504, + "step": 13639 + }, + { + "epoch": 2.03, + "grad_norm": 1.91542200156108, + "learning_rate": 1.3431345005110528e-06, + "loss": 0.6673, + "step": 13640 + }, + { + "epoch": 2.03, + "grad_norm": 2.6574434980740245, + "learning_rate": 1.3430437564247851e-06, + "loss": 0.6634, + "step": 13641 + }, + { + "epoch": 2.03, + "grad_norm": 3.3568785464030166, + "learning_rate": 1.3429530091368602e-06, + "loss": 0.6758, + "step": 13642 + }, + { + "epoch": 2.03, + "grad_norm": 1.6161190135481849, + "learning_rate": 1.3428622586481244e-06, + "loss": 0.6719, + "step": 13643 + }, + { + "epoch": 2.03, + "grad_norm": 2.1029842160331103, + "learning_rate": 1.3427715049594243e-06, + "loss": 0.6667, + "step": 13644 + }, + { + "epoch": 2.04, + "grad_norm": 2.4592413690753654, + "learning_rate": 1.342680748071608e-06, + "loss": 0.6322, + "step": 13645 + }, + { + "epoch": 2.04, + "grad_norm": 1.6849659595866038, + "learning_rate": 1.342589987985522e-06, + "loss": 0.6432, + "step": 13646 + }, + { + "epoch": 2.04, + "grad_norm": 2.3421926749496986, + "learning_rate": 1.342499224702013e-06, + "loss": 0.6582, + "step": 13647 + }, + { + "epoch": 2.04, + "grad_norm": 3.944160164736662, + "learning_rate": 1.3424084582219286e-06, + "loss": 0.6107, + "step": 13648 + }, + { + "epoch": 2.04, + "grad_norm": 2.4641776760617957, + "learning_rate": 1.3423176885461156e-06, + "loss": 0.6354, + "step": 13649 + }, + { + "epoch": 2.04, + "grad_norm": 4.918197188711974, + "learning_rate": 1.3422269156754217e-06, + "loss": 0.6862, + "step": 13650 + }, + { + "epoch": 2.04, + "grad_norm": 4.56564979061439, + "learning_rate": 1.3421361396106935e-06, + "loss": 0.6914, + "step": 13651 + }, + { + "epoch": 2.04, + "grad_norm": 5.031921699328621, + "learning_rate": 1.3420453603527785e-06, + "loss": 0.6387, + "step": 13652 + }, + { + "epoch": 2.04, + "grad_norm": 1.7010822218741508, + "learning_rate": 1.341954577902524e-06, + "loss": 0.6413, + "step": 13653 + }, + { + "epoch": 2.04, + "grad_norm": 3.567985459280802, + "learning_rate": 1.3418637922607768e-06, + "loss": 0.6257, + "step": 13654 + }, + { + "epoch": 2.04, + "grad_norm": 3.7976663865085367, + "learning_rate": 1.3417730034283852e-06, + "loss": 0.6478, + "step": 13655 + }, + { + "epoch": 2.04, + "grad_norm": 2.069008763696545, + "learning_rate": 1.3416822114061957e-06, + "loss": 0.625, + "step": 13656 + }, + { + "epoch": 2.04, + "grad_norm": 3.799907505463558, + "learning_rate": 1.3415914161950556e-06, + "loss": 0.6191, + "step": 13657 + }, + { + "epoch": 2.04, + "grad_norm": 2.337906445823828, + "learning_rate": 1.341500617795813e-06, + "loss": 0.6445, + "step": 13658 + }, + { + "epoch": 2.04, + "grad_norm": 1.962731822503073, + "learning_rate": 1.3414098162093146e-06, + "loss": 0.6406, + "step": 13659 + }, + { + "epoch": 2.04, + "grad_norm": 3.1621500775730764, + "learning_rate": 1.3413190114364083e-06, + "loss": 0.6406, + "step": 13660 + }, + { + "epoch": 2.04, + "grad_norm": 1.8812615057699749, + "learning_rate": 1.3412282034779416e-06, + "loss": 0.6712, + "step": 13661 + }, + { + "epoch": 2.04, + "grad_norm": 2.5023556443106263, + "learning_rate": 1.3411373923347617e-06, + "loss": 0.6699, + "step": 13662 + }, + { + "epoch": 2.04, + "grad_norm": 6.576934915486853, + "learning_rate": 1.3410465780077166e-06, + "loss": 0.627, + "step": 13663 + }, + { + "epoch": 2.04, + "grad_norm": 5.345652434664925, + "learning_rate": 1.3409557604976537e-06, + "loss": 0.6478, + "step": 13664 + }, + { + "epoch": 2.04, + "grad_norm": 3.3704047805758393, + "learning_rate": 1.34086493980542e-06, + "loss": 0.6458, + "step": 13665 + }, + { + "epoch": 2.04, + "grad_norm": 3.900817671629744, + "learning_rate": 1.3407741159318643e-06, + "loss": 0.6439, + "step": 13666 + }, + { + "epoch": 2.04, + "grad_norm": 3.2289949747386424, + "learning_rate": 1.3406832888778335e-06, + "loss": 0.666, + "step": 13667 + }, + { + "epoch": 2.04, + "grad_norm": 2.861322690971462, + "learning_rate": 1.340592458644175e-06, + "loss": 0.6361, + "step": 13668 + }, + { + "epoch": 2.04, + "grad_norm": 2.990015800287051, + "learning_rate": 1.3405016252317376e-06, + "loss": 0.6361, + "step": 13669 + }, + { + "epoch": 2.04, + "grad_norm": 2.043840025673521, + "learning_rate": 1.3404107886413677e-06, + "loss": 0.6406, + "step": 13670 + }, + { + "epoch": 2.04, + "grad_norm": 2.405602057709659, + "learning_rate": 1.3403199488739142e-06, + "loss": 0.6478, + "step": 13671 + }, + { + "epoch": 2.04, + "grad_norm": 4.173715584358305, + "learning_rate": 1.3402291059302244e-06, + "loss": 0.6367, + "step": 13672 + }, + { + "epoch": 2.04, + "grad_norm": 2.354034848662325, + "learning_rate": 1.3401382598111466e-06, + "loss": 0.6126, + "step": 13673 + }, + { + "epoch": 2.04, + "grad_norm": 5.094412621867122, + "learning_rate": 1.3400474105175281e-06, + "loss": 0.6634, + "step": 13674 + }, + { + "epoch": 2.04, + "grad_norm": 4.677459319634354, + "learning_rate": 1.339956558050217e-06, + "loss": 0.6354, + "step": 13675 + }, + { + "epoch": 2.04, + "grad_norm": 4.4370683281202945, + "learning_rate": 1.3398657024100613e-06, + "loss": 0.6549, + "step": 13676 + }, + { + "epoch": 2.04, + "grad_norm": 4.169722701655665, + "learning_rate": 1.339774843597909e-06, + "loss": 0.696, + "step": 13677 + }, + { + "epoch": 2.04, + "grad_norm": 3.1538596480795595, + "learning_rate": 1.339683981614608e-06, + "loss": 0.6107, + "step": 13678 + }, + { + "epoch": 2.04, + "grad_norm": 3.762284906617368, + "learning_rate": 1.3395931164610066e-06, + "loss": 0.6276, + "step": 13679 + }, + { + "epoch": 2.04, + "grad_norm": 5.2322311426843, + "learning_rate": 1.3395022481379524e-06, + "loss": 0.6458, + "step": 13680 + }, + { + "epoch": 2.04, + "grad_norm": 2.2280066433612373, + "learning_rate": 1.3394113766462934e-06, + "loss": 0.6253, + "step": 13681 + }, + { + "epoch": 2.04, + "grad_norm": 6.303022305255659, + "learning_rate": 1.3393205019868783e-06, + "loss": 0.6478, + "step": 13682 + }, + { + "epoch": 2.04, + "grad_norm": 2.420730449125169, + "learning_rate": 1.339229624160555e-06, + "loss": 0.6159, + "step": 13683 + }, + { + "epoch": 2.04, + "grad_norm": 5.164787163900372, + "learning_rate": 1.3391387431681715e-06, + "loss": 0.6217, + "step": 13684 + }, + { + "epoch": 2.04, + "grad_norm": 5.305873909570769, + "learning_rate": 1.3390478590105761e-06, + "loss": 0.6836, + "step": 13685 + }, + { + "epoch": 2.04, + "grad_norm": 2.89937764416694, + "learning_rate": 1.3389569716886171e-06, + "loss": 0.6491, + "step": 13686 + }, + { + "epoch": 2.04, + "grad_norm": 2.1489332874788114, + "learning_rate": 1.338866081203143e-06, + "loss": 0.6484, + "step": 13687 + }, + { + "epoch": 2.04, + "grad_norm": 3.2196457048383675, + "learning_rate": 1.3387751875550013e-06, + "loss": 0.6693, + "step": 13688 + }, + { + "epoch": 2.04, + "grad_norm": 5.455308178235422, + "learning_rate": 1.338684290745041e-06, + "loss": 0.679, + "step": 13689 + }, + { + "epoch": 2.04, + "grad_norm": 2.3837874377704975, + "learning_rate": 1.3385933907741105e-06, + "loss": 0.6341, + "step": 13690 + }, + { + "epoch": 2.04, + "grad_norm": 3.0196031397927143, + "learning_rate": 1.3385024876430575e-06, + "loss": 0.6165, + "step": 13691 + }, + { + "epoch": 2.04, + "grad_norm": 2.112708374550765, + "learning_rate": 1.338411581352731e-06, + "loss": 0.6263, + "step": 13692 + }, + { + "epoch": 2.04, + "grad_norm": 2.1740802120894536, + "learning_rate": 1.3383206719039795e-06, + "loss": 0.6419, + "step": 13693 + }, + { + "epoch": 2.04, + "grad_norm": 2.638155809568693, + "learning_rate": 1.338229759297651e-06, + "loss": 0.6484, + "step": 13694 + }, + { + "epoch": 2.04, + "grad_norm": 3.1336769174905106, + "learning_rate": 1.3381388435345944e-06, + "loss": 0.6673, + "step": 13695 + }, + { + "epoch": 2.04, + "grad_norm": 3.8082614207142194, + "learning_rate": 1.3380479246156582e-06, + "loss": 0.6165, + "step": 13696 + }, + { + "epoch": 2.04, + "grad_norm": 2.2067790786976147, + "learning_rate": 1.3379570025416905e-06, + "loss": 0.6582, + "step": 13697 + }, + { + "epoch": 2.04, + "grad_norm": 2.2619857989557794, + "learning_rate": 1.3378660773135405e-06, + "loss": 0.6283, + "step": 13698 + }, + { + "epoch": 2.04, + "grad_norm": 2.001803797347964, + "learning_rate": 1.3377751489320564e-06, + "loss": 0.6725, + "step": 13699 + }, + { + "epoch": 2.04, + "grad_norm": 5.018416256047323, + "learning_rate": 1.337684217398087e-06, + "loss": 0.64, + "step": 13700 + }, + { + "epoch": 2.04, + "grad_norm": 3.114418913011732, + "learning_rate": 1.3375932827124808e-06, + "loss": 0.6257, + "step": 13701 + }, + { + "epoch": 2.04, + "grad_norm": 2.060203008605947, + "learning_rate": 1.3375023448760865e-06, + "loss": 0.6445, + "step": 13702 + }, + { + "epoch": 2.04, + "grad_norm": 2.8497979736026418, + "learning_rate": 1.3374114038897535e-06, + "loss": 0.679, + "step": 13703 + }, + { + "epoch": 2.04, + "grad_norm": 2.4705868782351446, + "learning_rate": 1.3373204597543295e-06, + "loss": 0.653, + "step": 13704 + }, + { + "epoch": 2.04, + "grad_norm": 2.2822749590883564, + "learning_rate": 1.337229512470664e-06, + "loss": 0.6445, + "step": 13705 + }, + { + "epoch": 2.04, + "grad_norm": 1.8487676814131548, + "learning_rate": 1.3371385620396058e-06, + "loss": 0.6328, + "step": 13706 + }, + { + "epoch": 2.04, + "grad_norm": 3.0550929687949018, + "learning_rate": 1.3370476084620035e-06, + "loss": 0.6178, + "step": 13707 + }, + { + "epoch": 2.04, + "grad_norm": 4.550968643117786, + "learning_rate": 1.336956651738706e-06, + "loss": 0.6576, + "step": 13708 + }, + { + "epoch": 2.04, + "grad_norm": 2.932576745887366, + "learning_rate": 1.3368656918705623e-06, + "loss": 0.6224, + "step": 13709 + }, + { + "epoch": 2.04, + "grad_norm": 2.0940887483484136, + "learning_rate": 1.3367747288584212e-06, + "loss": 0.6068, + "step": 13710 + }, + { + "epoch": 2.04, + "grad_norm": 3.4040239465018023, + "learning_rate": 1.336683762703132e-06, + "loss": 0.6361, + "step": 13711 + }, + { + "epoch": 2.05, + "grad_norm": 2.569205565207677, + "learning_rate": 1.3365927934055433e-06, + "loss": 0.6738, + "step": 13712 + }, + { + "epoch": 2.05, + "grad_norm": 2.487051757377246, + "learning_rate": 1.3365018209665043e-06, + "loss": 0.6582, + "step": 13713 + }, + { + "epoch": 2.05, + "grad_norm": 4.020977465726065, + "learning_rate": 1.3364108453868642e-06, + "loss": 0.6452, + "step": 13714 + }, + { + "epoch": 2.05, + "grad_norm": 3.650372780829897, + "learning_rate": 1.3363198666674716e-06, + "loss": 0.6211, + "step": 13715 + }, + { + "epoch": 2.05, + "grad_norm": 2.4311410873760955, + "learning_rate": 1.3362288848091763e-06, + "loss": 0.6712, + "step": 13716 + }, + { + "epoch": 2.05, + "grad_norm": 2.6980299377670907, + "learning_rate": 1.3361378998128267e-06, + "loss": 0.6146, + "step": 13717 + }, + { + "epoch": 2.05, + "grad_norm": 2.292034140353089, + "learning_rate": 1.3360469116792726e-06, + "loss": 0.6191, + "step": 13718 + }, + { + "epoch": 2.05, + "grad_norm": 4.552265316475421, + "learning_rate": 1.3359559204093632e-06, + "loss": 0.6725, + "step": 13719 + }, + { + "epoch": 2.05, + "grad_norm": 3.139734184674025, + "learning_rate": 1.3358649260039472e-06, + "loss": 0.6523, + "step": 13720 + }, + { + "epoch": 2.05, + "grad_norm": 2.4379516441129487, + "learning_rate": 1.335773928463874e-06, + "loss": 0.681, + "step": 13721 + }, + { + "epoch": 2.05, + "grad_norm": 3.0039240088647436, + "learning_rate": 1.3356829277899935e-06, + "loss": 0.6875, + "step": 13722 + }, + { + "epoch": 2.05, + "grad_norm": 4.373176455996195, + "learning_rate": 1.3355919239831539e-06, + "loss": 0.6478, + "step": 13723 + }, + { + "epoch": 2.05, + "grad_norm": 2.696592267570243, + "learning_rate": 1.3355009170442056e-06, + "loss": 0.6016, + "step": 13724 + }, + { + "epoch": 2.05, + "grad_norm": 2.5001481231023477, + "learning_rate": 1.3354099069739976e-06, + "loss": 0.6432, + "step": 13725 + }, + { + "epoch": 2.05, + "grad_norm": 2.035917399997758, + "learning_rate": 1.335318893773379e-06, + "loss": 0.6354, + "step": 13726 + }, + { + "epoch": 2.05, + "grad_norm": 1.8958680579087837, + "learning_rate": 1.3352278774432e-06, + "loss": 0.651, + "step": 13727 + }, + { + "epoch": 2.05, + "grad_norm": 4.025354131648789, + "learning_rate": 1.335136857984309e-06, + "loss": 0.6497, + "step": 13728 + }, + { + "epoch": 2.05, + "grad_norm": 3.0701323691593596, + "learning_rate": 1.3350458353975565e-06, + "loss": 0.6576, + "step": 13729 + }, + { + "epoch": 2.05, + "grad_norm": 2.064590581063908, + "learning_rate": 1.3349548096837913e-06, + "loss": 0.6608, + "step": 13730 + }, + { + "epoch": 2.05, + "grad_norm": 6.342009152350213, + "learning_rate": 1.3348637808438632e-06, + "loss": 0.6491, + "step": 13731 + }, + { + "epoch": 2.05, + "grad_norm": 3.209710456678855, + "learning_rate": 1.3347727488786226e-06, + "loss": 0.6126, + "step": 13732 + }, + { + "epoch": 2.05, + "grad_norm": 2.53618282028034, + "learning_rate": 1.3346817137889175e-06, + "loss": 0.6393, + "step": 13733 + }, + { + "epoch": 2.05, + "grad_norm": 2.1480934021159332, + "learning_rate": 1.3345906755755985e-06, + "loss": 0.6426, + "step": 13734 + }, + { + "epoch": 2.05, + "grad_norm": 6.12436155481708, + "learning_rate": 1.3344996342395151e-06, + "loss": 0.6621, + "step": 13735 + }, + { + "epoch": 2.05, + "grad_norm": 2.000173997238445, + "learning_rate": 1.334408589781517e-06, + "loss": 0.6562, + "step": 13736 + }, + { + "epoch": 2.05, + "grad_norm": 2.9458230127065925, + "learning_rate": 1.3343175422024544e-06, + "loss": 0.623, + "step": 13737 + }, + { + "epoch": 2.05, + "grad_norm": 2.198038763606718, + "learning_rate": 1.3342264915031762e-06, + "loss": 0.6855, + "step": 13738 + }, + { + "epoch": 2.05, + "grad_norm": 2.0759357452308325, + "learning_rate": 1.3341354376845328e-06, + "loss": 0.6517, + "step": 13739 + }, + { + "epoch": 2.05, + "grad_norm": 4.999288399322119, + "learning_rate": 1.3340443807473737e-06, + "loss": 0.6393, + "step": 13740 + }, + { + "epoch": 2.05, + "grad_norm": 4.7318654630655805, + "learning_rate": 1.3339533206925489e-06, + "loss": 0.6009, + "step": 13741 + }, + { + "epoch": 2.05, + "grad_norm": 1.9947101039645376, + "learning_rate": 1.3338622575209082e-06, + "loss": 0.6602, + "step": 13742 + }, + { + "epoch": 2.05, + "grad_norm": 2.717738855698955, + "learning_rate": 1.3337711912333017e-06, + "loss": 0.6908, + "step": 13743 + }, + { + "epoch": 2.05, + "grad_norm": 2.0163456164846005, + "learning_rate": 1.3336801218305787e-06, + "loss": 0.6432, + "step": 13744 + }, + { + "epoch": 2.05, + "grad_norm": 2.604799771906471, + "learning_rate": 1.3335890493135902e-06, + "loss": 0.61, + "step": 13745 + }, + { + "epoch": 2.05, + "grad_norm": 2.4692135617741404, + "learning_rate": 1.333497973683185e-06, + "loss": 0.6406, + "step": 13746 + }, + { + "epoch": 2.05, + "grad_norm": 2.698273425177567, + "learning_rate": 1.333406894940214e-06, + "loss": 0.6094, + "step": 13747 + }, + { + "epoch": 2.05, + "grad_norm": 3.1415963633373587, + "learning_rate": 1.3333158130855271e-06, + "loss": 0.6302, + "step": 13748 + }, + { + "epoch": 2.05, + "grad_norm": 2.423675392963359, + "learning_rate": 1.3332247281199743e-06, + "loss": 0.5863, + "step": 13749 + }, + { + "epoch": 2.05, + "grad_norm": 3.568879387224677, + "learning_rate": 1.3331336400444053e-06, + "loss": 0.6484, + "step": 13750 + }, + { + "epoch": 2.05, + "grad_norm": 1.950694392643686, + "learning_rate": 1.3330425488596708e-06, + "loss": 0.623, + "step": 13751 + }, + { + "epoch": 2.05, + "grad_norm": 2.2266230498857733, + "learning_rate": 1.3329514545666206e-06, + "loss": 0.6471, + "step": 13752 + }, + { + "epoch": 2.05, + "grad_norm": 3.1411565087142237, + "learning_rate": 1.3328603571661051e-06, + "loss": 0.6087, + "step": 13753 + }, + { + "epoch": 2.05, + "grad_norm": 2.360469994863931, + "learning_rate": 1.3327692566589743e-06, + "loss": 0.6296, + "step": 13754 + }, + { + "epoch": 2.05, + "grad_norm": 3.80461448795364, + "learning_rate": 1.3326781530460788e-06, + "loss": 0.6758, + "step": 13755 + }, + { + "epoch": 2.05, + "grad_norm": 2.872506177856161, + "learning_rate": 1.3325870463282689e-06, + "loss": 0.638, + "step": 13756 + }, + { + "epoch": 2.05, + "grad_norm": 2.7822645971020954, + "learning_rate": 1.3324959365063944e-06, + "loss": 0.6217, + "step": 13757 + }, + { + "epoch": 2.05, + "grad_norm": 3.503527699843004, + "learning_rate": 1.332404823581306e-06, + "loss": 0.6771, + "step": 13758 + }, + { + "epoch": 2.05, + "grad_norm": 2.9922114615363715, + "learning_rate": 1.3323137075538537e-06, + "loss": 0.6478, + "step": 13759 + }, + { + "epoch": 2.05, + "grad_norm": 5.285791822841517, + "learning_rate": 1.3322225884248886e-06, + "loss": 0.6335, + "step": 13760 + }, + { + "epoch": 2.05, + "grad_norm": 4.4668785695824695, + "learning_rate": 1.3321314661952605e-06, + "loss": 0.6491, + "step": 13761 + }, + { + "epoch": 2.05, + "grad_norm": 2.4484359478441897, + "learning_rate": 1.33204034086582e-06, + "loss": 0.6849, + "step": 13762 + }, + { + "epoch": 2.05, + "grad_norm": 2.7656980804181335, + "learning_rate": 1.3319492124374177e-06, + "loss": 0.5957, + "step": 13763 + }, + { + "epoch": 2.05, + "grad_norm": 4.167070430655269, + "learning_rate": 1.3318580809109042e-06, + "loss": 0.7031, + "step": 13764 + }, + { + "epoch": 2.05, + "grad_norm": 2.8492835597132156, + "learning_rate": 1.3317669462871294e-06, + "loss": 0.6504, + "step": 13765 + }, + { + "epoch": 2.05, + "grad_norm": 2.0882802553706648, + "learning_rate": 1.331675808566945e-06, + "loss": 0.6152, + "step": 13766 + }, + { + "epoch": 2.05, + "grad_norm": 5.845562998256177, + "learning_rate": 1.3315846677512003e-06, + "loss": 0.7246, + "step": 13767 + }, + { + "epoch": 2.05, + "grad_norm": 2.2132766847120373, + "learning_rate": 1.3314935238407469e-06, + "loss": 0.6615, + "step": 13768 + }, + { + "epoch": 2.05, + "grad_norm": 2.2189066436950204, + "learning_rate": 1.3314023768364355e-06, + "loss": 0.64, + "step": 13769 + }, + { + "epoch": 2.05, + "grad_norm": 3.9481078446221174, + "learning_rate": 1.3313112267391158e-06, + "loss": 0.6191, + "step": 13770 + }, + { + "epoch": 2.05, + "grad_norm": 2.0892762477784763, + "learning_rate": 1.3312200735496392e-06, + "loss": 0.6374, + "step": 13771 + }, + { + "epoch": 2.05, + "grad_norm": 4.290247508662385, + "learning_rate": 1.331128917268857e-06, + "loss": 0.6868, + "step": 13772 + }, + { + "epoch": 2.05, + "grad_norm": 3.0829706474604954, + "learning_rate": 1.3310377578976185e-06, + "loss": 0.627, + "step": 13773 + }, + { + "epoch": 2.05, + "grad_norm": 1.9799261330856857, + "learning_rate": 1.3309465954367757e-06, + "loss": 0.6738, + "step": 13774 + }, + { + "epoch": 2.05, + "grad_norm": 2.121909938646436, + "learning_rate": 1.3308554298871792e-06, + "loss": 0.6328, + "step": 13775 + }, + { + "epoch": 2.05, + "grad_norm": 2.399422132512429, + "learning_rate": 1.3307642612496794e-06, + "loss": 0.6732, + "step": 13776 + }, + { + "epoch": 2.05, + "grad_norm": 2.299841216914005, + "learning_rate": 1.3306730895251277e-06, + "loss": 0.6842, + "step": 13777 + }, + { + "epoch": 2.05, + "grad_norm": 2.513476286927116, + "learning_rate": 1.3305819147143747e-06, + "loss": 0.6751, + "step": 13778 + }, + { + "epoch": 2.06, + "grad_norm": 5.240644648679385, + "learning_rate": 1.3304907368182714e-06, + "loss": 0.6895, + "step": 13779 + }, + { + "epoch": 2.06, + "grad_norm": 1.697503004098666, + "learning_rate": 1.330399555837669e-06, + "loss": 0.6667, + "step": 13780 + }, + { + "epoch": 2.06, + "grad_norm": 1.731051738076965, + "learning_rate": 1.3303083717734183e-06, + "loss": 0.6576, + "step": 13781 + }, + { + "epoch": 2.06, + "grad_norm": 1.917324024311845, + "learning_rate": 1.3302171846263703e-06, + "loss": 0.6595, + "step": 13782 + }, + { + "epoch": 2.06, + "grad_norm": 3.499722602297964, + "learning_rate": 1.3301259943973762e-06, + "loss": 0.6536, + "step": 13783 + }, + { + "epoch": 2.06, + "grad_norm": 1.9405181293829366, + "learning_rate": 1.3300348010872869e-06, + "loss": 0.6387, + "step": 13784 + }, + { + "epoch": 2.06, + "grad_norm": 3.071425028495375, + "learning_rate": 1.329943604696954e-06, + "loss": 0.64, + "step": 13785 + }, + { + "epoch": 2.06, + "grad_norm": 1.8312392969378817, + "learning_rate": 1.3298524052272276e-06, + "loss": 0.6868, + "step": 13786 + }, + { + "epoch": 2.06, + "grad_norm": 2.476623426436841, + "learning_rate": 1.3297612026789598e-06, + "loss": 0.6309, + "step": 13787 + }, + { + "epoch": 2.06, + "grad_norm": 5.100767497063008, + "learning_rate": 1.3296699970530017e-06, + "loss": 0.6478, + "step": 13788 + }, + { + "epoch": 2.06, + "grad_norm": 2.0264435621436054, + "learning_rate": 1.3295787883502042e-06, + "loss": 0.6673, + "step": 13789 + }, + { + "epoch": 2.06, + "grad_norm": 2.0299925017443097, + "learning_rate": 1.3294875765714186e-06, + "loss": 0.6445, + "step": 13790 + }, + { + "epoch": 2.06, + "grad_norm": 1.588238879369781, + "learning_rate": 1.3293963617174964e-06, + "loss": 0.653, + "step": 13791 + }, + { + "epoch": 2.06, + "grad_norm": 1.703441890570828, + "learning_rate": 1.329305143789289e-06, + "loss": 0.6699, + "step": 13792 + }, + { + "epoch": 2.06, + "grad_norm": 1.8636878278256956, + "learning_rate": 1.3292139227876475e-06, + "loss": 0.6634, + "step": 13793 + }, + { + "epoch": 2.06, + "grad_norm": 4.041864388888658, + "learning_rate": 1.3291226987134231e-06, + "loss": 0.6517, + "step": 13794 + }, + { + "epoch": 2.06, + "grad_norm": 3.823710325451892, + "learning_rate": 1.3290314715674678e-06, + "loss": 0.6927, + "step": 13795 + }, + { + "epoch": 2.06, + "grad_norm": 5.026709805854554, + "learning_rate": 1.3289402413506325e-06, + "loss": 0.6549, + "step": 13796 + }, + { + "epoch": 2.06, + "grad_norm": 1.9846001960809962, + "learning_rate": 1.3288490080637689e-06, + "loss": 0.6562, + "step": 13797 + }, + { + "epoch": 2.06, + "grad_norm": 2.2265946752661625, + "learning_rate": 1.3287577717077285e-06, + "loss": 0.6497, + "step": 13798 + }, + { + "epoch": 2.06, + "grad_norm": 1.8196726306778648, + "learning_rate": 1.3286665322833626e-06, + "loss": 0.6419, + "step": 13799 + }, + { + "epoch": 2.06, + "grad_norm": 2.1367386191806674, + "learning_rate": 1.328575289791523e-06, + "loss": 0.6113, + "step": 13800 + }, + { + "epoch": 2.06, + "grad_norm": 3.8280847176913406, + "learning_rate": 1.328484044233061e-06, + "loss": 0.6764, + "step": 13801 + }, + { + "epoch": 2.06, + "grad_norm": 1.9686094220077714, + "learning_rate": 1.3283927956088285e-06, + "loss": 0.6641, + "step": 13802 + }, + { + "epoch": 2.06, + "grad_norm": 1.8738439977031984, + "learning_rate": 1.328301543919677e-06, + "loss": 0.6406, + "step": 13803 + }, + { + "epoch": 2.06, + "grad_norm": 1.826021459095901, + "learning_rate": 1.3282102891664579e-06, + "loss": 0.627, + "step": 13804 + }, + { + "epoch": 2.06, + "grad_norm": 2.643802070019372, + "learning_rate": 1.3281190313500234e-06, + "loss": 0.6243, + "step": 13805 + }, + { + "epoch": 2.06, + "grad_norm": 2.2474507843715577, + "learning_rate": 1.3280277704712252e-06, + "loss": 0.6569, + "step": 13806 + }, + { + "epoch": 2.06, + "grad_norm": 2.245922710266816, + "learning_rate": 1.3279365065309142e-06, + "loss": 0.6276, + "step": 13807 + }, + { + "epoch": 2.06, + "grad_norm": 5.820161772138568, + "learning_rate": 1.327845239529943e-06, + "loss": 0.6217, + "step": 13808 + }, + { + "epoch": 2.06, + "grad_norm": 3.422170655922268, + "learning_rate": 1.3277539694691635e-06, + "loss": 0.6478, + "step": 13809 + }, + { + "epoch": 2.06, + "grad_norm": 2.1640488414145733, + "learning_rate": 1.327662696349427e-06, + "loss": 0.6504, + "step": 13810 + }, + { + "epoch": 2.06, + "grad_norm": 3.1313321753720014, + "learning_rate": 1.3275714201715855e-06, + "loss": 0.6602, + "step": 13811 + }, + { + "epoch": 2.06, + "grad_norm": 2.10703351999231, + "learning_rate": 1.3274801409364912e-06, + "loss": 0.6224, + "step": 13812 + }, + { + "epoch": 2.06, + "grad_norm": 5.619281030079863, + "learning_rate": 1.3273888586449955e-06, + "loss": 0.6842, + "step": 13813 + }, + { + "epoch": 2.06, + "grad_norm": 2.5609035145044676, + "learning_rate": 1.3272975732979508e-06, + "loss": 0.6628, + "step": 13814 + }, + { + "epoch": 2.06, + "grad_norm": 2.95612504897589, + "learning_rate": 1.3272062848962088e-06, + "loss": 0.6393, + "step": 13815 + }, + { + "epoch": 2.06, + "grad_norm": 3.086118918188606, + "learning_rate": 1.327114993440622e-06, + "loss": 0.6777, + "step": 13816 + }, + { + "epoch": 2.06, + "grad_norm": 2.2984152208613704, + "learning_rate": 1.3270236989320415e-06, + "loss": 0.6263, + "step": 13817 + }, + { + "epoch": 2.06, + "grad_norm": 3.9660819045383415, + "learning_rate": 1.32693240137132e-06, + "loss": 0.6569, + "step": 13818 + }, + { + "epoch": 2.06, + "grad_norm": 4.470414167954572, + "learning_rate": 1.32684110075931e-06, + "loss": 0.6081, + "step": 13819 + }, + { + "epoch": 2.06, + "grad_norm": 4.712125890770199, + "learning_rate": 1.3267497970968625e-06, + "loss": 0.6602, + "step": 13820 + }, + { + "epoch": 2.06, + "grad_norm": 2.3387909158417144, + "learning_rate": 1.3266584903848303e-06, + "loss": 0.668, + "step": 13821 + }, + { + "epoch": 2.06, + "grad_norm": 2.023951173266732, + "learning_rate": 1.326567180624066e-06, + "loss": 0.6322, + "step": 13822 + }, + { + "epoch": 2.06, + "grad_norm": 2.3835255837565756, + "learning_rate": 1.3264758678154206e-06, + "loss": 0.6908, + "step": 13823 + }, + { + "epoch": 2.06, + "grad_norm": 3.003749118454782, + "learning_rate": 1.3263845519597477e-06, + "loss": 0.6348, + "step": 13824 + }, + { + "epoch": 2.06, + "grad_norm": 1.8514406330546804, + "learning_rate": 1.3262932330578985e-06, + "loss": 0.6315, + "step": 13825 + }, + { + "epoch": 2.06, + "grad_norm": 3.0638575644414043, + "learning_rate": 1.3262019111107258e-06, + "loss": 0.6628, + "step": 13826 + }, + { + "epoch": 2.06, + "grad_norm": 4.9840694893571795, + "learning_rate": 1.326110586119082e-06, + "loss": 0.6497, + "step": 13827 + }, + { + "epoch": 2.06, + "grad_norm": 2.363503061613652, + "learning_rate": 1.3260192580838193e-06, + "loss": 0.6654, + "step": 13828 + }, + { + "epoch": 2.06, + "grad_norm": 3.657327481040882, + "learning_rate": 1.3259279270057897e-06, + "loss": 0.6126, + "step": 13829 + }, + { + "epoch": 2.06, + "grad_norm": 1.8626317356683109, + "learning_rate": 1.3258365928858462e-06, + "loss": 0.6113, + "step": 13830 + }, + { + "epoch": 2.06, + "grad_norm": 2.8281914095946488, + "learning_rate": 1.3257452557248408e-06, + "loss": 0.6283, + "step": 13831 + }, + { + "epoch": 2.06, + "grad_norm": 2.571132568312137, + "learning_rate": 1.3256539155236261e-06, + "loss": 0.6686, + "step": 13832 + }, + { + "epoch": 2.06, + "grad_norm": 1.9430685978792759, + "learning_rate": 1.3255625722830547e-06, + "loss": 0.6393, + "step": 13833 + }, + { + "epoch": 2.06, + "grad_norm": 1.8486465898207265, + "learning_rate": 1.3254712260039788e-06, + "loss": 0.6484, + "step": 13834 + }, + { + "epoch": 2.06, + "grad_norm": 3.610684571891913, + "learning_rate": 1.3253798766872517e-06, + "loss": 0.6315, + "step": 13835 + }, + { + "epoch": 2.06, + "grad_norm": 3.355367263697948, + "learning_rate": 1.325288524333725e-06, + "loss": 0.6491, + "step": 13836 + }, + { + "epoch": 2.06, + "grad_norm": 2.1121258125572333, + "learning_rate": 1.3251971689442521e-06, + "loss": 0.6328, + "step": 13837 + }, + { + "epoch": 2.06, + "grad_norm": 3.173705341364245, + "learning_rate": 1.325105810519685e-06, + "loss": 0.6673, + "step": 13838 + }, + { + "epoch": 2.06, + "grad_norm": 2.1333270549368355, + "learning_rate": 1.3250144490608763e-06, + "loss": 0.6699, + "step": 13839 + }, + { + "epoch": 2.06, + "grad_norm": 3.240077174160661, + "learning_rate": 1.3249230845686796e-06, + "loss": 0.6673, + "step": 13840 + }, + { + "epoch": 2.06, + "grad_norm": 2.586420504256988, + "learning_rate": 1.3248317170439468e-06, + "loss": 0.6198, + "step": 13841 + }, + { + "epoch": 2.06, + "grad_norm": 2.6809821466293284, + "learning_rate": 1.3247403464875306e-06, + "loss": 0.6322, + "step": 13842 + }, + { + "epoch": 2.06, + "grad_norm": 2.2675152468217723, + "learning_rate": 1.3246489729002844e-06, + "loss": 0.6465, + "step": 13843 + }, + { + "epoch": 2.06, + "grad_norm": 5.845433471158114, + "learning_rate": 1.32455759628306e-06, + "loss": 0.6914, + "step": 13844 + }, + { + "epoch": 2.06, + "grad_norm": 4.22733796167694, + "learning_rate": 1.3244662166367117e-06, + "loss": 0.6855, + "step": 13845 + }, + { + "epoch": 2.07, + "grad_norm": 2.164664102187886, + "learning_rate": 1.324374833962091e-06, + "loss": 0.6751, + "step": 13846 + }, + { + "epoch": 2.07, + "grad_norm": 2.786891213948318, + "learning_rate": 1.3242834482600512e-06, + "loss": 0.6517, + "step": 13847 + }, + { + "epoch": 2.07, + "grad_norm": 2.1724321227355157, + "learning_rate": 1.3241920595314459e-06, + "loss": 0.5944, + "step": 13848 + }, + { + "epoch": 2.07, + "grad_norm": 6.186871853334639, + "learning_rate": 1.3241006677771268e-06, + "loss": 0.7227, + "step": 13849 + }, + { + "epoch": 2.07, + "grad_norm": 2.1905249019928967, + "learning_rate": 1.3240092729979477e-06, + "loss": 0.6628, + "step": 13850 + }, + { + "epoch": 2.07, + "grad_norm": 2.2729972750758956, + "learning_rate": 1.3239178751947618e-06, + "loss": 0.6113, + "step": 13851 + }, + { + "epoch": 2.07, + "grad_norm": 2.1122073263967884, + "learning_rate": 1.3238264743684213e-06, + "loss": 0.6452, + "step": 13852 + }, + { + "epoch": 2.07, + "grad_norm": 1.8233345559264589, + "learning_rate": 1.32373507051978e-06, + "loss": 0.6628, + "step": 13853 + }, + { + "epoch": 2.07, + "grad_norm": 2.5854310078797895, + "learning_rate": 1.3236436636496902e-06, + "loss": 0.6006, + "step": 13854 + }, + { + "epoch": 2.07, + "grad_norm": 2.7619698046934853, + "learning_rate": 1.3235522537590055e-06, + "loss": 0.6074, + "step": 13855 + }, + { + "epoch": 2.07, + "grad_norm": 3.0360769978781943, + "learning_rate": 1.3234608408485794e-06, + "loss": 0.6159, + "step": 13856 + }, + { + "epoch": 2.07, + "grad_norm": 1.986830065416827, + "learning_rate": 1.3233694249192643e-06, + "loss": 0.6211, + "step": 13857 + }, + { + "epoch": 2.07, + "grad_norm": 2.2110357326881602, + "learning_rate": 1.323278005971914e-06, + "loss": 0.6556, + "step": 13858 + }, + { + "epoch": 2.07, + "grad_norm": 2.1406999014450605, + "learning_rate": 1.3231865840073814e-06, + "loss": 0.6237, + "step": 13859 + }, + { + "epoch": 2.07, + "grad_norm": 2.292904872988112, + "learning_rate": 1.3230951590265198e-06, + "loss": 0.6335, + "step": 13860 + }, + { + "epoch": 2.07, + "grad_norm": 2.263632119819142, + "learning_rate": 1.3230037310301825e-06, + "loss": 0.6413, + "step": 13861 + }, + { + "epoch": 2.07, + "grad_norm": 2.2332337218400644, + "learning_rate": 1.3229123000192228e-06, + "loss": 0.6914, + "step": 13862 + }, + { + "epoch": 2.07, + "grad_norm": 2.5803020119125497, + "learning_rate": 1.322820865994494e-06, + "loss": 0.6556, + "step": 13863 + }, + { + "epoch": 2.07, + "grad_norm": 2.996497331736874, + "learning_rate": 1.3227294289568498e-06, + "loss": 0.6549, + "step": 13864 + }, + { + "epoch": 2.07, + "grad_norm": 3.0012789493108905, + "learning_rate": 1.322637988907143e-06, + "loss": 0.61, + "step": 13865 + }, + { + "epoch": 2.07, + "grad_norm": 3.0906364514905875, + "learning_rate": 1.3225465458462274e-06, + "loss": 0.6361, + "step": 13866 + }, + { + "epoch": 2.07, + "grad_norm": 4.739086695282214, + "learning_rate": 1.3224550997749565e-06, + "loss": 0.6549, + "step": 13867 + }, + { + "epoch": 2.07, + "grad_norm": 2.426177200473305, + "learning_rate": 1.3223636506941836e-06, + "loss": 0.6497, + "step": 13868 + }, + { + "epoch": 2.07, + "grad_norm": 3.4478709541698125, + "learning_rate": 1.3222721986047622e-06, + "loss": 0.6328, + "step": 13869 + }, + { + "epoch": 2.07, + "grad_norm": 2.6194812652253647, + "learning_rate": 1.3221807435075457e-06, + "loss": 0.6029, + "step": 13870 + }, + { + "epoch": 2.07, + "grad_norm": 4.731631464422215, + "learning_rate": 1.322089285403388e-06, + "loss": 0.6146, + "step": 13871 + }, + { + "epoch": 2.07, + "grad_norm": 2.4621160817179275, + "learning_rate": 1.3219978242931424e-06, + "loss": 0.6243, + "step": 13872 + }, + { + "epoch": 2.07, + "grad_norm": 4.679492820142655, + "learning_rate": 1.3219063601776626e-06, + "loss": 0.6745, + "step": 13873 + }, + { + "epoch": 2.07, + "grad_norm": 2.5820715261904446, + "learning_rate": 1.3218148930578023e-06, + "loss": 0.6751, + "step": 13874 + }, + { + "epoch": 2.07, + "grad_norm": 3.2263263089993197, + "learning_rate": 1.3217234229344154e-06, + "loss": 0.6322, + "step": 13875 + }, + { + "epoch": 2.07, + "grad_norm": 2.6796972363542233, + "learning_rate": 1.321631949808355e-06, + "loss": 0.6589, + "step": 13876 + }, + { + "epoch": 2.07, + "grad_norm": 2.5800673949869823, + "learning_rate": 1.3215404736804753e-06, + "loss": 0.653, + "step": 13877 + }, + { + "epoch": 2.07, + "grad_norm": 4.094769735470674, + "learning_rate": 1.32144899455163e-06, + "loss": 0.6393, + "step": 13878 + }, + { + "epoch": 2.07, + "grad_norm": 2.5231326724629315, + "learning_rate": 1.3213575124226728e-06, + "loss": 0.6406, + "step": 13879 + }, + { + "epoch": 2.07, + "grad_norm": 2.6479425738599125, + "learning_rate": 1.3212660272944575e-06, + "loss": 0.6374, + "step": 13880 + }, + { + "epoch": 2.07, + "grad_norm": 2.35470676407272, + "learning_rate": 1.321174539167838e-06, + "loss": 0.6608, + "step": 13881 + }, + { + "epoch": 2.07, + "grad_norm": 2.442125200242429, + "learning_rate": 1.321083048043668e-06, + "loss": 0.6361, + "step": 13882 + }, + { + "epoch": 2.07, + "grad_norm": 3.2987295523058604, + "learning_rate": 1.3209915539228016e-06, + "loss": 0.6686, + "step": 13883 + }, + { + "epoch": 2.07, + "grad_norm": 2.624878385789162, + "learning_rate": 1.3209000568060926e-06, + "loss": 0.679, + "step": 13884 + }, + { + "epoch": 2.07, + "grad_norm": 2.246032293775682, + "learning_rate": 1.3208085566943949e-06, + "loss": 0.6393, + "step": 13885 + }, + { + "epoch": 2.07, + "grad_norm": 2.0829292861205677, + "learning_rate": 1.3207170535885628e-06, + "loss": 0.6055, + "step": 13886 + }, + { + "epoch": 2.07, + "grad_norm": 2.2878372459866747, + "learning_rate": 1.3206255474894499e-06, + "loss": 0.6471, + "step": 13887 + }, + { + "epoch": 2.07, + "grad_norm": 2.1211230745805563, + "learning_rate": 1.3205340383979104e-06, + "loss": 0.6146, + "step": 13888 + }, + { + "epoch": 2.07, + "grad_norm": 3.8117874055028693, + "learning_rate": 1.3204425263147985e-06, + "loss": 0.6211, + "step": 13889 + }, + { + "epoch": 2.07, + "grad_norm": 2.095051740819382, + "learning_rate": 1.3203510112409681e-06, + "loss": 0.6341, + "step": 13890 + }, + { + "epoch": 2.07, + "grad_norm": 3.9934125003734837, + "learning_rate": 1.3202594931772734e-06, + "loss": 0.6686, + "step": 13891 + }, + { + "epoch": 2.07, + "grad_norm": 2.260796962801286, + "learning_rate": 1.3201679721245684e-06, + "loss": 0.6107, + "step": 13892 + }, + { + "epoch": 2.07, + "grad_norm": 2.426600773537462, + "learning_rate": 1.3200764480837076e-06, + "loss": 0.6914, + "step": 13893 + }, + { + "epoch": 2.07, + "grad_norm": 3.1580491585744586, + "learning_rate": 1.319984921055545e-06, + "loss": 0.6589, + "step": 13894 + }, + { + "epoch": 2.07, + "grad_norm": 3.19596255621131, + "learning_rate": 1.3198933910409345e-06, + "loss": 0.6289, + "step": 13895 + }, + { + "epoch": 2.07, + "grad_norm": 3.698883363107001, + "learning_rate": 1.319801858040731e-06, + "loss": 0.6445, + "step": 13896 + }, + { + "epoch": 2.07, + "grad_norm": 2.4031100083153887, + "learning_rate": 1.3197103220557883e-06, + "loss": 0.6641, + "step": 13897 + }, + { + "epoch": 2.07, + "grad_norm": 5.121335801927704, + "learning_rate": 1.3196187830869612e-06, + "loss": 0.6504, + "step": 13898 + }, + { + "epoch": 2.07, + "grad_norm": 2.5598934779303635, + "learning_rate": 1.3195272411351035e-06, + "loss": 0.6419, + "step": 13899 + }, + { + "epoch": 2.07, + "grad_norm": 2.2581619127664823, + "learning_rate": 1.3194356962010694e-06, + "loss": 0.6491, + "step": 13900 + }, + { + "epoch": 2.07, + "grad_norm": 4.720261223657902, + "learning_rate": 1.3193441482857147e-06, + "loss": 0.6536, + "step": 13901 + }, + { + "epoch": 2.07, + "grad_norm": 2.7549660760210055, + "learning_rate": 1.3192525973898921e-06, + "loss": 0.6823, + "step": 13902 + }, + { + "epoch": 2.07, + "grad_norm": 2.268244135170035, + "learning_rate": 1.319161043514457e-06, + "loss": 0.6595, + "step": 13903 + }, + { + "epoch": 2.07, + "grad_norm": 2.177535122617673, + "learning_rate": 1.3190694866602637e-06, + "loss": 0.6471, + "step": 13904 + }, + { + "epoch": 2.07, + "grad_norm": 2.0521607067018968, + "learning_rate": 1.3189779268281665e-06, + "loss": 0.6113, + "step": 13905 + }, + { + "epoch": 2.07, + "grad_norm": 2.2824886108480213, + "learning_rate": 1.31888636401902e-06, + "loss": 0.6602, + "step": 13906 + }, + { + "epoch": 2.07, + "grad_norm": 2.7716311200717945, + "learning_rate": 1.3187947982336792e-06, + "loss": 0.6263, + "step": 13907 + }, + { + "epoch": 2.07, + "grad_norm": 2.03745012928237, + "learning_rate": 1.3187032294729982e-06, + "loss": 0.6367, + "step": 13908 + }, + { + "epoch": 2.07, + "grad_norm": 3.142427461400779, + "learning_rate": 1.3186116577378316e-06, + "loss": 0.6562, + "step": 13909 + }, + { + "epoch": 2.07, + "grad_norm": 3.7666614387672155, + "learning_rate": 1.3185200830290342e-06, + "loss": 0.6465, + "step": 13910 + }, + { + "epoch": 2.07, + "grad_norm": 2.156466970367543, + "learning_rate": 1.3184285053474609e-06, + "loss": 0.6562, + "step": 13911 + }, + { + "epoch": 2.07, + "grad_norm": 2.7310848185976595, + "learning_rate": 1.318336924693966e-06, + "loss": 0.6536, + "step": 13912 + }, + { + "epoch": 2.08, + "grad_norm": 2.139611215105596, + "learning_rate": 1.3182453410694048e-06, + "loss": 0.5964, + "step": 13913 + }, + { + "epoch": 2.08, + "grad_norm": 3.2537591895778766, + "learning_rate": 1.3181537544746318e-06, + "loss": 0.6458, + "step": 13914 + }, + { + "epoch": 2.08, + "grad_norm": 2.4418218719351286, + "learning_rate": 1.318062164910501e-06, + "loss": 0.6504, + "step": 13915 + }, + { + "epoch": 2.08, + "grad_norm": 2.5172476201770313, + "learning_rate": 1.317970572377868e-06, + "loss": 0.6536, + "step": 13916 + }, + { + "epoch": 2.08, + "grad_norm": 3.829840242069281, + "learning_rate": 1.3178789768775883e-06, + "loss": 0.6458, + "step": 13917 + }, + { + "epoch": 2.08, + "grad_norm": 2.0497025476715973, + "learning_rate": 1.3177873784105152e-06, + "loss": 0.6602, + "step": 13918 + }, + { + "epoch": 2.08, + "grad_norm": 2.3468141656046346, + "learning_rate": 1.3176957769775049e-06, + "loss": 0.653, + "step": 13919 + }, + { + "epoch": 2.08, + "grad_norm": 2.225643438837969, + "learning_rate": 1.3176041725794114e-06, + "loss": 0.625, + "step": 13920 + }, + { + "epoch": 2.08, + "grad_norm": 3.3815780893743366, + "learning_rate": 1.3175125652170901e-06, + "loss": 0.666, + "step": 13921 + }, + { + "epoch": 2.08, + "grad_norm": 4.634369280778054, + "learning_rate": 1.3174209548913962e-06, + "loss": 0.6582, + "step": 13922 + }, + { + "epoch": 2.08, + "grad_norm": 2.320234496456539, + "learning_rate": 1.3173293416031843e-06, + "loss": 0.6771, + "step": 13923 + }, + { + "epoch": 2.08, + "grad_norm": 1.9353761168153298, + "learning_rate": 1.3172377253533095e-06, + "loss": 0.6322, + "step": 13924 + }, + { + "epoch": 2.08, + "grad_norm": 6.007887430121914, + "learning_rate": 1.3171461061426272e-06, + "loss": 0.6309, + "step": 13925 + }, + { + "epoch": 2.08, + "grad_norm": 3.124295761312196, + "learning_rate": 1.3170544839719921e-06, + "loss": 0.6641, + "step": 13926 + }, + { + "epoch": 2.08, + "grad_norm": 2.5213831473688635, + "learning_rate": 1.3169628588422597e-06, + "loss": 0.6745, + "step": 13927 + }, + { + "epoch": 2.08, + "grad_norm": 3.1697825532683384, + "learning_rate": 1.3168712307542846e-06, + "loss": 0.623, + "step": 13928 + }, + { + "epoch": 2.08, + "grad_norm": 2.8805287009753586, + "learning_rate": 1.3167795997089224e-06, + "loss": 0.651, + "step": 13929 + }, + { + "epoch": 2.08, + "grad_norm": 4.499934393889355, + "learning_rate": 1.316687965707028e-06, + "loss": 0.6523, + "step": 13930 + }, + { + "epoch": 2.08, + "grad_norm": 2.5411511974214225, + "learning_rate": 1.3165963287494573e-06, + "loss": 0.6016, + "step": 13931 + }, + { + "epoch": 2.08, + "grad_norm": 2.523079308569891, + "learning_rate": 1.3165046888370647e-06, + "loss": 0.6517, + "step": 13932 + }, + { + "epoch": 2.08, + "grad_norm": 5.299822112265228, + "learning_rate": 1.3164130459707057e-06, + "loss": 0.651, + "step": 13933 + }, + { + "epoch": 2.08, + "grad_norm": 4.171393713373506, + "learning_rate": 1.316321400151236e-06, + "loss": 0.6738, + "step": 13934 + }, + { + "epoch": 2.08, + "grad_norm": 2.961354734417333, + "learning_rate": 1.3162297513795108e-06, + "loss": 0.6204, + "step": 13935 + }, + { + "epoch": 2.08, + "grad_norm": 2.2773391466726953, + "learning_rate": 1.3161380996563851e-06, + "loss": 0.6139, + "step": 13936 + }, + { + "epoch": 2.08, + "grad_norm": 3.137281913783716, + "learning_rate": 1.3160464449827146e-06, + "loss": 0.6478, + "step": 13937 + }, + { + "epoch": 2.08, + "grad_norm": 2.6262694954579078, + "learning_rate": 1.315954787359355e-06, + "loss": 0.612, + "step": 13938 + }, + { + "epoch": 2.08, + "grad_norm": 2.5868698034784128, + "learning_rate": 1.3158631267871609e-06, + "loss": 0.6523, + "step": 13939 + }, + { + "epoch": 2.08, + "grad_norm": 2.3835969247841953, + "learning_rate": 1.3157714632669887e-06, + "loss": 0.6042, + "step": 13940 + }, + { + "epoch": 2.08, + "grad_norm": 2.398281167267324, + "learning_rate": 1.3156797967996935e-06, + "loss": 0.6289, + "step": 13941 + }, + { + "epoch": 2.08, + "grad_norm": 4.653170762884248, + "learning_rate": 1.3155881273861305e-06, + "loss": 0.6387, + "step": 13942 + }, + { + "epoch": 2.08, + "grad_norm": 2.871150616556487, + "learning_rate": 1.315496455027156e-06, + "loss": 0.6764, + "step": 13943 + }, + { + "epoch": 2.08, + "grad_norm": 3.228802812608975, + "learning_rate": 1.3154047797236251e-06, + "loss": 0.6647, + "step": 13944 + }, + { + "epoch": 2.08, + "grad_norm": 4.0499368753435085, + "learning_rate": 1.3153131014763933e-06, + "loss": 0.6159, + "step": 13945 + }, + { + "epoch": 2.08, + "grad_norm": 4.805727978725046, + "learning_rate": 1.3152214202863165e-06, + "loss": 0.6217, + "step": 13946 + }, + { + "epoch": 2.08, + "grad_norm": 2.875521331133765, + "learning_rate": 1.3151297361542506e-06, + "loss": 0.6289, + "step": 13947 + }, + { + "epoch": 2.08, + "grad_norm": 3.0271440610482916, + "learning_rate": 1.3150380490810506e-06, + "loss": 0.6413, + "step": 13948 + }, + { + "epoch": 2.08, + "grad_norm": 3.261271972695732, + "learning_rate": 1.314946359067573e-06, + "loss": 0.6758, + "step": 13949 + }, + { + "epoch": 2.08, + "grad_norm": 2.6163689365003804, + "learning_rate": 1.3148546661146728e-06, + "loss": 0.6517, + "step": 13950 + }, + { + "epoch": 2.08, + "grad_norm": 3.034742978888434, + "learning_rate": 1.3147629702232063e-06, + "loss": 0.6335, + "step": 13951 + }, + { + "epoch": 2.08, + "grad_norm": 3.4329399284284823, + "learning_rate": 1.314671271394029e-06, + "loss": 0.6784, + "step": 13952 + }, + { + "epoch": 2.08, + "grad_norm": 4.329848691009587, + "learning_rate": 1.3145795696279974e-06, + "loss": 0.6719, + "step": 13953 + }, + { + "epoch": 2.08, + "grad_norm": 2.7476492854396612, + "learning_rate": 1.3144878649259666e-06, + "loss": 0.6908, + "step": 13954 + }, + { + "epoch": 2.08, + "grad_norm": 2.259250225462121, + "learning_rate": 1.3143961572887925e-06, + "loss": 0.6227, + "step": 13955 + }, + { + "epoch": 2.08, + "grad_norm": 3.50963790125373, + "learning_rate": 1.3143044467173313e-06, + "loss": 0.5921, + "step": 13956 + }, + { + "epoch": 2.08, + "grad_norm": 3.1735037641973376, + "learning_rate": 1.3142127332124392e-06, + "loss": 0.6016, + "step": 13957 + }, + { + "epoch": 2.08, + "grad_norm": 3.3951246323329234, + "learning_rate": 1.3141210167749717e-06, + "loss": 0.6908, + "step": 13958 + }, + { + "epoch": 2.08, + "grad_norm": 2.750884509441376, + "learning_rate": 1.314029297405785e-06, + "loss": 0.6048, + "step": 13959 + }, + { + "epoch": 2.08, + "grad_norm": 2.5473447494460344, + "learning_rate": 1.3139375751057352e-06, + "loss": 0.6536, + "step": 13960 + }, + { + "epoch": 2.08, + "grad_norm": 4.183841627563956, + "learning_rate": 1.313845849875678e-06, + "loss": 0.6302, + "step": 13961 + }, + { + "epoch": 2.08, + "grad_norm": 5.698450138481281, + "learning_rate": 1.31375412171647e-06, + "loss": 0.6452, + "step": 13962 + }, + { + "epoch": 2.08, + "grad_norm": 4.74981487023842, + "learning_rate": 1.3136623906289668e-06, + "loss": 0.6283, + "step": 13963 + }, + { + "epoch": 2.08, + "grad_norm": 3.7214759727135776, + "learning_rate": 1.313570656614025e-06, + "loss": 0.6667, + "step": 13964 + }, + { + "epoch": 2.08, + "grad_norm": 2.190454415569204, + "learning_rate": 1.3134789196725002e-06, + "loss": 0.6458, + "step": 13965 + }, + { + "epoch": 2.08, + "grad_norm": 4.01937385682406, + "learning_rate": 1.3133871798052492e-06, + "loss": 0.6484, + "step": 13966 + }, + { + "epoch": 2.08, + "grad_norm": 3.3382815276736872, + "learning_rate": 1.313295437013128e-06, + "loss": 0.627, + "step": 13967 + }, + { + "epoch": 2.08, + "grad_norm": 2.2419294894325996, + "learning_rate": 1.3132036912969925e-06, + "loss": 0.6387, + "step": 13968 + }, + { + "epoch": 2.08, + "grad_norm": 3.7603171262712136, + "learning_rate": 1.3131119426576996e-06, + "loss": 0.707, + "step": 13969 + }, + { + "epoch": 2.08, + "grad_norm": 2.2932547189267396, + "learning_rate": 1.313020191096105e-06, + "loss": 0.6836, + "step": 13970 + }, + { + "epoch": 2.08, + "grad_norm": 3.388985931429113, + "learning_rate": 1.312928436613065e-06, + "loss": 0.6097, + "step": 13971 + }, + { + "epoch": 2.08, + "grad_norm": 4.094740509499872, + "learning_rate": 1.3128366792094366e-06, + "loss": 0.6309, + "step": 13972 + }, + { + "epoch": 2.08, + "grad_norm": 2.9447041292129965, + "learning_rate": 1.312744918886076e-06, + "loss": 0.6257, + "step": 13973 + }, + { + "epoch": 2.08, + "grad_norm": 2.268061025788881, + "learning_rate": 1.312653155643839e-06, + "loss": 0.6556, + "step": 13974 + }, + { + "epoch": 2.08, + "grad_norm": 3.417093074979135, + "learning_rate": 1.3125613894835827e-06, + "loss": 0.6556, + "step": 13975 + }, + { + "epoch": 2.08, + "grad_norm": 3.0539128828025426, + "learning_rate": 1.3124696204061631e-06, + "loss": 0.6243, + "step": 13976 + }, + { + "epoch": 2.08, + "grad_norm": 2.4243004868794578, + "learning_rate": 1.312377848412437e-06, + "loss": 0.6576, + "step": 13977 + }, + { + "epoch": 2.08, + "grad_norm": 2.359424640918365, + "learning_rate": 1.3122860735032611e-06, + "loss": 0.6771, + "step": 13978 + }, + { + "epoch": 2.08, + "grad_norm": 5.183208264040546, + "learning_rate": 1.3121942956794912e-06, + "loss": 0.6426, + "step": 13979 + }, + { + "epoch": 2.09, + "grad_norm": 2.073700310893715, + "learning_rate": 1.3121025149419846e-06, + "loss": 0.6523, + "step": 13980 + }, + { + "epoch": 2.09, + "grad_norm": 3.7735648857073465, + "learning_rate": 1.3120107312915976e-06, + "loss": 0.6797, + "step": 13981 + }, + { + "epoch": 2.09, + "grad_norm": 2.5401276831429036, + "learning_rate": 1.3119189447291867e-06, + "loss": 0.6751, + "step": 13982 + }, + { + "epoch": 2.09, + "grad_norm": 2.0568415214916542, + "learning_rate": 1.3118271552556092e-06, + "loss": 0.6204, + "step": 13983 + }, + { + "epoch": 2.09, + "grad_norm": 2.2405336210666316, + "learning_rate": 1.3117353628717204e-06, + "loss": 0.6341, + "step": 13984 + }, + { + "epoch": 2.09, + "grad_norm": 4.037078253561951, + "learning_rate": 1.3116435675783785e-06, + "loss": 0.6478, + "step": 13985 + }, + { + "epoch": 2.09, + "grad_norm": 3.040137436267887, + "learning_rate": 1.3115517693764396e-06, + "loss": 0.6484, + "step": 13986 + }, + { + "epoch": 2.09, + "grad_norm": 3.8886062097477883, + "learning_rate": 1.3114599682667602e-06, + "loss": 0.6413, + "step": 13987 + }, + { + "epoch": 2.09, + "grad_norm": 2.121228160712867, + "learning_rate": 1.3113681642501976e-06, + "loss": 0.6185, + "step": 13988 + }, + { + "epoch": 2.09, + "grad_norm": 5.2170440507638, + "learning_rate": 1.3112763573276081e-06, + "loss": 0.6126, + "step": 13989 + }, + { + "epoch": 2.09, + "grad_norm": 5.1047758622877755, + "learning_rate": 1.3111845474998494e-06, + "loss": 0.6309, + "step": 13990 + }, + { + "epoch": 2.09, + "grad_norm": 2.262592833096236, + "learning_rate": 1.3110927347677777e-06, + "loss": 0.6686, + "step": 13991 + }, + { + "epoch": 2.09, + "grad_norm": 2.739262101419198, + "learning_rate": 1.3110009191322494e-06, + "loss": 0.6016, + "step": 13992 + }, + { + "epoch": 2.09, + "grad_norm": 3.2922299033587943, + "learning_rate": 1.3109091005941229e-06, + "loss": 0.6328, + "step": 13993 + }, + { + "epoch": 2.09, + "grad_norm": 2.7248524919905464, + "learning_rate": 1.3108172791542535e-06, + "loss": 0.6673, + "step": 13994 + }, + { + "epoch": 2.09, + "grad_norm": 2.469443790019274, + "learning_rate": 1.310725454813499e-06, + "loss": 0.6673, + "step": 13995 + }, + { + "epoch": 2.09, + "grad_norm": 2.693595459040533, + "learning_rate": 1.3106336275727166e-06, + "loss": 0.6302, + "step": 13996 + }, + { + "epoch": 2.09, + "grad_norm": 3.1442582709440288, + "learning_rate": 1.310541797432763e-06, + "loss": 0.6126, + "step": 13997 + }, + { + "epoch": 2.09, + "grad_norm": 4.416849083846049, + "learning_rate": 1.3104499643944956e-06, + "loss": 0.6634, + "step": 13998 + }, + { + "epoch": 2.09, + "grad_norm": 4.464838855124471, + "learning_rate": 1.3103581284587708e-06, + "loss": 0.6152, + "step": 13999 + }, + { + "epoch": 2.09, + "grad_norm": 5.375699596506862, + "learning_rate": 1.3102662896264465e-06, + "loss": 0.6699, + "step": 14000 + }, + { + "epoch": 2.09, + "grad_norm": 2.4822844126912362, + "learning_rate": 1.3101744478983794e-06, + "loss": 0.694, + "step": 14001 + }, + { + "epoch": 2.09, + "grad_norm": 2.691394611113899, + "learning_rate": 1.3100826032754264e-06, + "loss": 0.653, + "step": 14002 + }, + { + "epoch": 2.09, + "grad_norm": 4.488353626601446, + "learning_rate": 1.3099907557584456e-06, + "loss": 0.6367, + "step": 14003 + }, + { + "epoch": 2.09, + "grad_norm": 2.5978734354816955, + "learning_rate": 1.3098989053482935e-06, + "loss": 0.64, + "step": 14004 + }, + { + "epoch": 2.09, + "grad_norm": 2.500591430135355, + "learning_rate": 1.3098070520458271e-06, + "loss": 0.6882, + "step": 14005 + }, + { + "epoch": 2.09, + "grad_norm": 4.822545291144264, + "learning_rate": 1.3097151958519043e-06, + "loss": 0.6986, + "step": 14006 + }, + { + "epoch": 2.09, + "grad_norm": 3.4063583061105915, + "learning_rate": 1.3096233367673825e-06, + "loss": 0.681, + "step": 14007 + }, + { + "epoch": 2.09, + "grad_norm": 2.4655855936286444, + "learning_rate": 1.3095314747931184e-06, + "loss": 0.6504, + "step": 14008 + }, + { + "epoch": 2.09, + "grad_norm": 5.80885185826291, + "learning_rate": 1.3094396099299699e-06, + "loss": 0.6257, + "step": 14009 + }, + { + "epoch": 2.09, + "grad_norm": 2.022848192232941, + "learning_rate": 1.3093477421787937e-06, + "loss": 0.6315, + "step": 14010 + }, + { + "epoch": 2.09, + "grad_norm": 2.116968481379542, + "learning_rate": 1.3092558715404481e-06, + "loss": 0.6432, + "step": 14011 + }, + { + "epoch": 2.09, + "grad_norm": 2.323287659845757, + "learning_rate": 1.3091639980157897e-06, + "loss": 0.623, + "step": 14012 + }, + { + "epoch": 2.09, + "grad_norm": 5.154082940421053, + "learning_rate": 1.3090721216056771e-06, + "loss": 0.6172, + "step": 14013 + }, + { + "epoch": 2.09, + "grad_norm": 2.5110652017014554, + "learning_rate": 1.3089802423109663e-06, + "loss": 0.6647, + "step": 14014 + }, + { + "epoch": 2.09, + "grad_norm": 1.8974814878584845, + "learning_rate": 1.3088883601325158e-06, + "loss": 0.64, + "step": 14015 + }, + { + "epoch": 2.09, + "grad_norm": 5.084452429268707, + "learning_rate": 1.3087964750711833e-06, + "loss": 0.6367, + "step": 14016 + }, + { + "epoch": 2.09, + "grad_norm": 1.9195282091532198, + "learning_rate": 1.3087045871278254e-06, + "loss": 0.6367, + "step": 14017 + }, + { + "epoch": 2.09, + "grad_norm": 2.808441817813912, + "learning_rate": 1.3086126963033006e-06, + "loss": 0.6413, + "step": 14018 + }, + { + "epoch": 2.09, + "grad_norm": 2.4319982602123904, + "learning_rate": 1.3085208025984664e-06, + "loss": 0.6732, + "step": 14019 + }, + { + "epoch": 2.09, + "grad_norm": 1.9675965615831925, + "learning_rate": 1.30842890601418e-06, + "loss": 0.6178, + "step": 14020 + }, + { + "epoch": 2.09, + "grad_norm": 2.3824254164431946, + "learning_rate": 1.3083370065512992e-06, + "loss": 0.6536, + "step": 14021 + }, + { + "epoch": 2.09, + "grad_norm": 4.167775184016157, + "learning_rate": 1.308245104210682e-06, + "loss": 0.6178, + "step": 14022 + }, + { + "epoch": 2.09, + "grad_norm": 3.1109796967201757, + "learning_rate": 1.3081531989931862e-06, + "loss": 0.6465, + "step": 14023 + }, + { + "epoch": 2.09, + "grad_norm": 2.275058682693567, + "learning_rate": 1.308061290899669e-06, + "loss": 0.6654, + "step": 14024 + }, + { + "epoch": 2.09, + "grad_norm": 2.0556929338961005, + "learning_rate": 1.3079693799309887e-06, + "loss": 0.6328, + "step": 14025 + }, + { + "epoch": 2.09, + "grad_norm": 3.131959977707898, + "learning_rate": 1.3078774660880031e-06, + "loss": 0.6263, + "step": 14026 + }, + { + "epoch": 2.09, + "grad_norm": 3.526636201691364, + "learning_rate": 1.3077855493715694e-06, + "loss": 0.6764, + "step": 14027 + }, + { + "epoch": 2.09, + "grad_norm": 3.2251185585977, + "learning_rate": 1.3076936297825465e-06, + "loss": 0.638, + "step": 14028 + }, + { + "epoch": 2.09, + "grad_norm": 2.2495833482747134, + "learning_rate": 1.3076017073217913e-06, + "loss": 0.6725, + "step": 14029 + }, + { + "epoch": 2.09, + "grad_norm": 2.5547437453635577, + "learning_rate": 1.3075097819901629e-06, + "loss": 0.6549, + "step": 14030 + }, + { + "epoch": 2.09, + "grad_norm": 2.486338022853924, + "learning_rate": 1.3074178537885176e-06, + "loss": 0.6156, + "step": 14031 + }, + { + "epoch": 2.09, + "grad_norm": 3.0827616210177915, + "learning_rate": 1.3073259227177147e-06, + "loss": 0.6296, + "step": 14032 + }, + { + "epoch": 2.09, + "grad_norm": 2.306761871870497, + "learning_rate": 1.307233988778612e-06, + "loss": 0.6562, + "step": 14033 + }, + { + "epoch": 2.09, + "grad_norm": 4.206319354328709, + "learning_rate": 1.3071420519720668e-06, + "loss": 0.5938, + "step": 14034 + }, + { + "epoch": 2.09, + "grad_norm": 2.417430392572754, + "learning_rate": 1.3070501122989379e-06, + "loss": 0.6562, + "step": 14035 + }, + { + "epoch": 2.09, + "grad_norm": 2.443353306584782, + "learning_rate": 1.3069581697600832e-06, + "loss": 0.6556, + "step": 14036 + }, + { + "epoch": 2.09, + "grad_norm": 3.0303924226315146, + "learning_rate": 1.3068662243563608e-06, + "loss": 0.668, + "step": 14037 + }, + { + "epoch": 2.09, + "grad_norm": 4.600325101860032, + "learning_rate": 1.3067742760886287e-06, + "loss": 0.6569, + "step": 14038 + }, + { + "epoch": 2.09, + "grad_norm": 3.7301262016565557, + "learning_rate": 1.306682324957745e-06, + "loss": 0.6191, + "step": 14039 + }, + { + "epoch": 2.09, + "grad_norm": 5.176902282727186, + "learning_rate": 1.3065903709645682e-06, + "loss": 0.6445, + "step": 14040 + }, + { + "epoch": 2.09, + "grad_norm": 5.6854976763506855, + "learning_rate": 1.3064984141099564e-06, + "loss": 0.6354, + "step": 14041 + }, + { + "epoch": 2.09, + "grad_norm": 4.156481923885818, + "learning_rate": 1.3064064543947678e-06, + "loss": 0.6146, + "step": 14042 + }, + { + "epoch": 2.09, + "grad_norm": 2.71060233768252, + "learning_rate": 1.3063144918198608e-06, + "loss": 0.651, + "step": 14043 + }, + { + "epoch": 2.09, + "grad_norm": 2.4686811992623516, + "learning_rate": 1.3062225263860934e-06, + "loss": 0.6335, + "step": 14044 + }, + { + "epoch": 2.09, + "grad_norm": 3.337651366334313, + "learning_rate": 1.3061305580943239e-06, + "loss": 0.6315, + "step": 14045 + }, + { + "epoch": 2.09, + "grad_norm": 3.2718384355066297, + "learning_rate": 1.3060385869454112e-06, + "loss": 0.6309, + "step": 14046 + }, + { + "epoch": 2.1, + "grad_norm": 7.044308490110256, + "learning_rate": 1.3059466129402129e-06, + "loss": 0.6758, + "step": 14047 + }, + { + "epoch": 2.1, + "grad_norm": 6.971691230329399, + "learning_rate": 1.305854636079588e-06, + "loss": 0.6706, + "step": 14048 + }, + { + "epoch": 2.1, + "grad_norm": 2.94117300107784, + "learning_rate": 1.305762656364395e-06, + "loss": 0.6536, + "step": 14049 + }, + { + "epoch": 2.1, + "grad_norm": 2.774729111432184, + "learning_rate": 1.3056706737954915e-06, + "loss": 0.6855, + "step": 14050 + }, + { + "epoch": 2.1, + "grad_norm": 2.737299109206128, + "learning_rate": 1.305578688373737e-06, + "loss": 0.6393, + "step": 14051 + }, + { + "epoch": 2.1, + "grad_norm": 2.7250640922589744, + "learning_rate": 1.3054867000999893e-06, + "loss": 0.61, + "step": 14052 + }, + { + "epoch": 2.1, + "grad_norm": 2.8417168571003937, + "learning_rate": 1.3053947089751076e-06, + "loss": 0.5983, + "step": 14053 + }, + { + "epoch": 2.1, + "grad_norm": 5.255387159100226, + "learning_rate": 1.3053027149999498e-06, + "loss": 0.6868, + "step": 14054 + }, + { + "epoch": 2.1, + "grad_norm": 3.249993399561968, + "learning_rate": 1.3052107181753747e-06, + "loss": 0.61, + "step": 14055 + }, + { + "epoch": 2.1, + "grad_norm": 2.856388160987972, + "learning_rate": 1.3051187185022411e-06, + "loss": 0.6654, + "step": 14056 + }, + { + "epoch": 2.1, + "grad_norm": 2.8090768885626876, + "learning_rate": 1.3050267159814078e-06, + "loss": 0.6491, + "step": 14057 + }, + { + "epoch": 2.1, + "grad_norm": 3.9344212358749284, + "learning_rate": 1.3049347106137325e-06, + "loss": 0.638, + "step": 14058 + }, + { + "epoch": 2.1, + "grad_norm": 3.951433676570609, + "learning_rate": 1.3048427024000751e-06, + "loss": 0.5892, + "step": 14059 + }, + { + "epoch": 2.1, + "grad_norm": 3.2385815924866503, + "learning_rate": 1.3047506913412939e-06, + "loss": 0.6862, + "step": 14060 + }, + { + "epoch": 2.1, + "grad_norm": 3.910010095312655, + "learning_rate": 1.3046586774382471e-06, + "loss": 0.638, + "step": 14061 + }, + { + "epoch": 2.1, + "grad_norm": 3.5112305039958107, + "learning_rate": 1.304566660691794e-06, + "loss": 0.6693, + "step": 14062 + }, + { + "epoch": 2.1, + "grad_norm": 2.3597517513737114, + "learning_rate": 1.3044746411027936e-06, + "loss": 0.6732, + "step": 14063 + }, + { + "epoch": 2.1, + "grad_norm": 2.2122101410538795, + "learning_rate": 1.3043826186721046e-06, + "loss": 0.6003, + "step": 14064 + }, + { + "epoch": 2.1, + "grad_norm": 2.092791303375667, + "learning_rate": 1.304290593400585e-06, + "loss": 0.6087, + "step": 14065 + }, + { + "epoch": 2.1, + "grad_norm": 2.2381723088342453, + "learning_rate": 1.304198565289095e-06, + "loss": 0.6341, + "step": 14066 + }, + { + "epoch": 2.1, + "grad_norm": 4.070267983657381, + "learning_rate": 1.3041065343384927e-06, + "loss": 0.6745, + "step": 14067 + }, + { + "epoch": 2.1, + "grad_norm": 2.771548117302386, + "learning_rate": 1.304014500549637e-06, + "loss": 0.6621, + "step": 14068 + }, + { + "epoch": 2.1, + "grad_norm": 2.5159612280532517, + "learning_rate": 1.3039224639233874e-06, + "loss": 0.6393, + "step": 14069 + }, + { + "epoch": 2.1, + "grad_norm": 1.9132652450151522, + "learning_rate": 1.3038304244606025e-06, + "loss": 0.626, + "step": 14070 + }, + { + "epoch": 2.1, + "grad_norm": 4.586694195338884, + "learning_rate": 1.303738382162141e-06, + "loss": 0.6315, + "step": 14071 + }, + { + "epoch": 2.1, + "grad_norm": 2.5757391657987405, + "learning_rate": 1.3036463370288628e-06, + "loss": 0.6602, + "step": 14072 + }, + { + "epoch": 2.1, + "grad_norm": 3.9385034943765898, + "learning_rate": 1.3035542890616265e-06, + "loss": 0.5938, + "step": 14073 + }, + { + "epoch": 2.1, + "grad_norm": 2.7053528879559665, + "learning_rate": 1.3034622382612907e-06, + "loss": 0.6315, + "step": 14074 + }, + { + "epoch": 2.1, + "grad_norm": 2.795385600553459, + "learning_rate": 1.3033701846287154e-06, + "loss": 0.6882, + "step": 14075 + }, + { + "epoch": 2.1, + "grad_norm": 2.501130722329823, + "learning_rate": 1.3032781281647593e-06, + "loss": 0.6374, + "step": 14076 + }, + { + "epoch": 2.1, + "grad_norm": 4.187685699420716, + "learning_rate": 1.3031860688702816e-06, + "loss": 0.6517, + "step": 14077 + }, + { + "epoch": 2.1, + "grad_norm": 3.8744000499192777, + "learning_rate": 1.3030940067461415e-06, + "loss": 0.6465, + "step": 14078 + }, + { + "epoch": 2.1, + "grad_norm": 2.557733867006154, + "learning_rate": 1.303001941793198e-06, + "loss": 0.6628, + "step": 14079 + }, + { + "epoch": 2.1, + "grad_norm": 2.604929412950054, + "learning_rate": 1.302909874012311e-06, + "loss": 0.6667, + "step": 14080 + }, + { + "epoch": 2.1, + "grad_norm": 2.3708179818901396, + "learning_rate": 1.3028178034043392e-06, + "loss": 0.6367, + "step": 14081 + }, + { + "epoch": 2.1, + "grad_norm": 3.1394646197510756, + "learning_rate": 1.3027257299701417e-06, + "loss": 0.6113, + "step": 14082 + }, + { + "epoch": 2.1, + "grad_norm": 4.7190368606398545, + "learning_rate": 1.3026336537105787e-06, + "loss": 0.6224, + "step": 14083 + }, + { + "epoch": 2.1, + "grad_norm": 2.9685606006715632, + "learning_rate": 1.3025415746265086e-06, + "loss": 0.6745, + "step": 14084 + }, + { + "epoch": 2.1, + "grad_norm": 3.203667737948935, + "learning_rate": 1.3024494927187917e-06, + "loss": 0.6719, + "step": 14085 + }, + { + "epoch": 2.1, + "grad_norm": 3.8492192658552753, + "learning_rate": 1.3023574079882867e-06, + "loss": 0.6484, + "step": 14086 + }, + { + "epoch": 2.1, + "grad_norm": 3.5768891337493143, + "learning_rate": 1.302265320435853e-06, + "loss": 0.6719, + "step": 14087 + }, + { + "epoch": 2.1, + "grad_norm": 2.6683041071269606, + "learning_rate": 1.3021732300623506e-06, + "loss": 0.6439, + "step": 14088 + }, + { + "epoch": 2.1, + "grad_norm": 4.820167355749166, + "learning_rate": 1.3020811368686388e-06, + "loss": 0.61, + "step": 14089 + }, + { + "epoch": 2.1, + "grad_norm": 2.470225836298866, + "learning_rate": 1.301989040855577e-06, + "loss": 0.6732, + "step": 14090 + }, + { + "epoch": 2.1, + "grad_norm": 2.378496110739665, + "learning_rate": 1.3018969420240247e-06, + "loss": 0.6081, + "step": 14091 + }, + { + "epoch": 2.1, + "grad_norm": 2.2692245109534652, + "learning_rate": 1.3018048403748417e-06, + "loss": 0.6419, + "step": 14092 + }, + { + "epoch": 2.1, + "grad_norm": 7.006027562161865, + "learning_rate": 1.301712735908887e-06, + "loss": 0.6803, + "step": 14093 + }, + { + "epoch": 2.1, + "grad_norm": 3.8360887231726086, + "learning_rate": 1.301620628627021e-06, + "loss": 0.6224, + "step": 14094 + }, + { + "epoch": 2.1, + "grad_norm": 6.093665365850763, + "learning_rate": 1.3015285185301027e-06, + "loss": 0.6882, + "step": 14095 + }, + { + "epoch": 2.1, + "grad_norm": 2.539727768168298, + "learning_rate": 1.3014364056189928e-06, + "loss": 0.6523, + "step": 14096 + }, + { + "epoch": 2.1, + "grad_norm": 3.4654370336483176, + "learning_rate": 1.3013442898945496e-06, + "loss": 0.612, + "step": 14097 + }, + { + "epoch": 2.1, + "grad_norm": 3.7465308263291215, + "learning_rate": 1.3012521713576336e-06, + "loss": 0.6517, + "step": 14098 + }, + { + "epoch": 2.1, + "grad_norm": 4.620337385560622, + "learning_rate": 1.3011600500091046e-06, + "loss": 0.6732, + "step": 14099 + }, + { + "epoch": 2.1, + "grad_norm": 3.914166636144451, + "learning_rate": 1.3010679258498222e-06, + "loss": 0.6185, + "step": 14100 + }, + { + "epoch": 2.1, + "grad_norm": 2.703713925853438, + "learning_rate": 1.3009757988806462e-06, + "loss": 0.679, + "step": 14101 + }, + { + "epoch": 2.1, + "grad_norm": 3.3833504759478026, + "learning_rate": 1.3008836691024366e-06, + "loss": 0.6341, + "step": 14102 + }, + { + "epoch": 2.1, + "grad_norm": 2.8632875020444097, + "learning_rate": 1.3007915365160526e-06, + "loss": 0.6361, + "step": 14103 + }, + { + "epoch": 2.1, + "grad_norm": 2.5687863348768656, + "learning_rate": 1.3006994011223551e-06, + "loss": 0.6628, + "step": 14104 + }, + { + "epoch": 2.1, + "grad_norm": 2.4083656437542245, + "learning_rate": 1.300607262922203e-06, + "loss": 0.6618, + "step": 14105 + }, + { + "epoch": 2.1, + "grad_norm": 2.1201571393189735, + "learning_rate": 1.3005151219164572e-06, + "loss": 0.6803, + "step": 14106 + }, + { + "epoch": 2.1, + "grad_norm": 4.22237632338309, + "learning_rate": 1.3004229781059772e-06, + "loss": 0.6556, + "step": 14107 + }, + { + "epoch": 2.1, + "grad_norm": 2.276319301582747, + "learning_rate": 1.3003308314916226e-06, + "loss": 0.6556, + "step": 14108 + }, + { + "epoch": 2.1, + "grad_norm": 4.423779224030957, + "learning_rate": 1.300238682074254e-06, + "loss": 0.6322, + "step": 14109 + }, + { + "epoch": 2.1, + "grad_norm": 3.466807904842791, + "learning_rate": 1.3001465298547314e-06, + "loss": 0.6992, + "step": 14110 + }, + { + "epoch": 2.1, + "grad_norm": 2.9482735873453, + "learning_rate": 1.3000543748339142e-06, + "loss": 0.6191, + "step": 14111 + }, + { + "epoch": 2.1, + "grad_norm": 2.628426146281082, + "learning_rate": 1.2999622170126637e-06, + "loss": 0.627, + "step": 14112 + }, + { + "epoch": 2.1, + "grad_norm": 2.4339106750187347, + "learning_rate": 1.2998700563918388e-06, + "loss": 0.6328, + "step": 14113 + }, + { + "epoch": 2.1, + "grad_norm": 2.2663672721891306, + "learning_rate": 1.2997778929723001e-06, + "loss": 0.6263, + "step": 14114 + }, + { + "epoch": 2.11, + "grad_norm": 2.611042518793047, + "learning_rate": 1.2996857267549079e-06, + "loss": 0.6517, + "step": 14115 + }, + { + "epoch": 2.11, + "grad_norm": 2.9037147137041246, + "learning_rate": 1.2995935577405226e-06, + "loss": 0.6667, + "step": 14116 + }, + { + "epoch": 2.11, + "grad_norm": 2.9664680705476814, + "learning_rate": 1.299501385930004e-06, + "loss": 0.6582, + "step": 14117 + }, + { + "epoch": 2.11, + "grad_norm": 2.3334257069849236, + "learning_rate": 1.299409211324212e-06, + "loss": 0.6393, + "step": 14118 + }, + { + "epoch": 2.11, + "grad_norm": 3.4894538862619955, + "learning_rate": 1.299317033924008e-06, + "loss": 0.6628, + "step": 14119 + }, + { + "epoch": 2.11, + "grad_norm": 2.178699479924641, + "learning_rate": 1.2992248537302515e-06, + "loss": 0.6771, + "step": 14120 + }, + { + "epoch": 2.11, + "grad_norm": 3.385723397043175, + "learning_rate": 1.2991326707438026e-06, + "loss": 0.653, + "step": 14121 + }, + { + "epoch": 2.11, + "grad_norm": 2.128421486785411, + "learning_rate": 1.2990404849655226e-06, + "loss": 0.627, + "step": 14122 + }, + { + "epoch": 2.11, + "grad_norm": 2.9743708127595503, + "learning_rate": 1.2989482963962708e-06, + "loss": 0.6309, + "step": 14123 + }, + { + "epoch": 2.11, + "grad_norm": 3.5848445045858304, + "learning_rate": 1.2988561050369083e-06, + "loss": 0.612, + "step": 14124 + }, + { + "epoch": 2.11, + "grad_norm": 2.600555033201738, + "learning_rate": 1.2987639108882954e-06, + "loss": 0.6634, + "step": 14125 + }, + { + "epoch": 2.11, + "grad_norm": 3.811280089407511, + "learning_rate": 1.2986717139512927e-06, + "loss": 0.6615, + "step": 14126 + }, + { + "epoch": 2.11, + "grad_norm": 3.106999222367392, + "learning_rate": 1.29857951422676e-06, + "loss": 0.6556, + "step": 14127 + }, + { + "epoch": 2.11, + "grad_norm": 4.449405506058259, + "learning_rate": 1.2984873117155586e-06, + "loss": 0.6452, + "step": 14128 + }, + { + "epoch": 2.11, + "grad_norm": 2.2958834442817753, + "learning_rate": 1.2983951064185488e-06, + "loss": 0.6595, + "step": 14129 + }, + { + "epoch": 2.11, + "grad_norm": 2.8194586750902495, + "learning_rate": 1.2983028983365908e-06, + "loss": 0.6465, + "step": 14130 + }, + { + "epoch": 2.11, + "grad_norm": 3.063885449609387, + "learning_rate": 1.2982106874705455e-06, + "loss": 0.6855, + "step": 14131 + }, + { + "epoch": 2.11, + "grad_norm": 2.2930392660471535, + "learning_rate": 1.2981184738212737e-06, + "loss": 0.6497, + "step": 14132 + }, + { + "epoch": 2.11, + "grad_norm": 3.319144119959071, + "learning_rate": 1.298026257389636e-06, + "loss": 0.6276, + "step": 14133 + }, + { + "epoch": 2.11, + "grad_norm": 2.799210475473489, + "learning_rate": 1.2979340381764924e-06, + "loss": 0.6621, + "step": 14134 + }, + { + "epoch": 2.11, + "grad_norm": 3.3140307313196025, + "learning_rate": 1.2978418161827041e-06, + "loss": 0.6393, + "step": 14135 + }, + { + "epoch": 2.11, + "grad_norm": 2.29522423140808, + "learning_rate": 1.297749591409132e-06, + "loss": 0.6862, + "step": 14136 + }, + { + "epoch": 2.11, + "grad_norm": 2.6231189302161573, + "learning_rate": 1.2976573638566363e-06, + "loss": 0.6478, + "step": 14137 + }, + { + "epoch": 2.11, + "grad_norm": 2.2387995956487283, + "learning_rate": 1.2975651335260787e-06, + "loss": 0.6758, + "step": 14138 + }, + { + "epoch": 2.11, + "grad_norm": 2.11049505201342, + "learning_rate": 1.297472900418319e-06, + "loss": 0.6816, + "step": 14139 + }, + { + "epoch": 2.11, + "grad_norm": 5.154490572615039, + "learning_rate": 1.2973806645342184e-06, + "loss": 0.6549, + "step": 14140 + }, + { + "epoch": 2.11, + "grad_norm": 2.0700131611069863, + "learning_rate": 1.2972884258746379e-06, + "loss": 0.6426, + "step": 14141 + }, + { + "epoch": 2.11, + "grad_norm": 4.9402659034724845, + "learning_rate": 1.2971961844404385e-06, + "loss": 0.6452, + "step": 14142 + }, + { + "epoch": 2.11, + "grad_norm": 6.579711600120295, + "learning_rate": 1.2971039402324802e-06, + "loss": 0.6979, + "step": 14143 + }, + { + "epoch": 2.11, + "grad_norm": 6.365519061547131, + "learning_rate": 1.2970116932516249e-06, + "loss": 0.638, + "step": 14144 + }, + { + "epoch": 2.11, + "grad_norm": 2.3469763580551004, + "learning_rate": 1.2969194434987331e-06, + "loss": 0.6165, + "step": 14145 + }, + { + "epoch": 2.11, + "grad_norm": 2.251678334221605, + "learning_rate": 1.2968271909746662e-06, + "loss": 0.6341, + "step": 14146 + }, + { + "epoch": 2.11, + "grad_norm": 2.530739074266839, + "learning_rate": 1.2967349356802843e-06, + "loss": 0.6081, + "step": 14147 + }, + { + "epoch": 2.11, + "grad_norm": 3.5859497474541078, + "learning_rate": 1.296642677616449e-06, + "loss": 0.6517, + "step": 14148 + }, + { + "epoch": 2.11, + "grad_norm": 2.8956210026771703, + "learning_rate": 1.2965504167840223e-06, + "loss": 0.6452, + "step": 14149 + }, + { + "epoch": 2.11, + "grad_norm": 3.5483358936311893, + "learning_rate": 1.2964581531838635e-06, + "loss": 0.6243, + "step": 14150 + }, + { + "epoch": 2.11, + "grad_norm": 2.1869443433983866, + "learning_rate": 1.2963658868168349e-06, + "loss": 0.6289, + "step": 14151 + }, + { + "epoch": 2.11, + "grad_norm": 2.287760929624699, + "learning_rate": 1.296273617683797e-06, + "loss": 0.6758, + "step": 14152 + }, + { + "epoch": 2.11, + "grad_norm": 2.870035833824922, + "learning_rate": 1.2961813457856112e-06, + "loss": 0.6693, + "step": 14153 + }, + { + "epoch": 2.11, + "grad_norm": 3.2550391506576615, + "learning_rate": 1.296089071123139e-06, + "loss": 0.668, + "step": 14154 + }, + { + "epoch": 2.11, + "grad_norm": 2.709832969136977, + "learning_rate": 1.2959967936972412e-06, + "loss": 0.6647, + "step": 14155 + }, + { + "epoch": 2.11, + "grad_norm": 2.8292315684874003, + "learning_rate": 1.295904513508779e-06, + "loss": 0.6257, + "step": 14156 + }, + { + "epoch": 2.11, + "grad_norm": 2.269125529686266, + "learning_rate": 1.2958122305586138e-06, + "loss": 0.6452, + "step": 14157 + }, + { + "epoch": 2.11, + "grad_norm": 2.4841219430862505, + "learning_rate": 1.2957199448476073e-06, + "loss": 0.6771, + "step": 14158 + }, + { + "epoch": 2.11, + "grad_norm": 7.544661988243387, + "learning_rate": 1.29562765637662e-06, + "loss": 0.6556, + "step": 14159 + }, + { + "epoch": 2.11, + "grad_norm": 3.3121561511369406, + "learning_rate": 1.2955353651465138e-06, + "loss": 0.5931, + "step": 14160 + }, + { + "epoch": 2.11, + "grad_norm": 2.1336300997227102, + "learning_rate": 1.2954430711581497e-06, + "loss": 0.6133, + "step": 14161 + }, + { + "epoch": 2.11, + "grad_norm": 2.288089332354023, + "learning_rate": 1.2953507744123897e-06, + "loss": 0.6738, + "step": 14162 + }, + { + "epoch": 2.11, + "grad_norm": 5.114090132392826, + "learning_rate": 1.2952584749100944e-06, + "loss": 0.6908, + "step": 14163 + }, + { + "epoch": 2.11, + "grad_norm": 2.173109856215186, + "learning_rate": 1.2951661726521254e-06, + "loss": 0.6035, + "step": 14164 + }, + { + "epoch": 2.11, + "grad_norm": 2.190770960843817, + "learning_rate": 1.2950738676393452e-06, + "loss": 0.6074, + "step": 14165 + }, + { + "epoch": 2.11, + "grad_norm": 2.4607397654961156, + "learning_rate": 1.294981559872614e-06, + "loss": 0.6712, + "step": 14166 + }, + { + "epoch": 2.11, + "grad_norm": 4.263420246330259, + "learning_rate": 1.2948892493527938e-06, + "loss": 0.6523, + "step": 14167 + }, + { + "epoch": 2.11, + "grad_norm": 2.9453013101818044, + "learning_rate": 1.2947969360807466e-06, + "loss": 0.7057, + "step": 14168 + }, + { + "epoch": 2.11, + "grad_norm": 4.299932922842319, + "learning_rate": 1.2947046200573328e-06, + "loss": 0.651, + "step": 14169 + }, + { + "epoch": 2.11, + "grad_norm": 6.973879415199715, + "learning_rate": 1.2946123012834154e-06, + "loss": 0.6999, + "step": 14170 + }, + { + "epoch": 2.11, + "grad_norm": 2.512649430870439, + "learning_rate": 1.2945199797598546e-06, + "loss": 0.582, + "step": 14171 + }, + { + "epoch": 2.11, + "grad_norm": 2.344328374636671, + "learning_rate": 1.2944276554875135e-06, + "loss": 0.6569, + "step": 14172 + }, + { + "epoch": 2.11, + "grad_norm": 2.2717060174148482, + "learning_rate": 1.2943353284672529e-06, + "loss": 0.6842, + "step": 14173 + }, + { + "epoch": 2.11, + "grad_norm": 2.645514328740593, + "learning_rate": 1.2942429986999344e-06, + "loss": 0.6348, + "step": 14174 + }, + { + "epoch": 2.11, + "grad_norm": 2.898737669057107, + "learning_rate": 1.29415066618642e-06, + "loss": 0.5938, + "step": 14175 + }, + { + "epoch": 2.11, + "grad_norm": 2.3881782004755845, + "learning_rate": 1.2940583309275717e-06, + "loss": 0.6139, + "step": 14176 + }, + { + "epoch": 2.11, + "grad_norm": 2.6452918881899383, + "learning_rate": 1.2939659929242507e-06, + "loss": 0.6035, + "step": 14177 + }, + { + "epoch": 2.11, + "grad_norm": 3.8292026298984094, + "learning_rate": 1.2938736521773195e-06, + "loss": 0.6413, + "step": 14178 + }, + { + "epoch": 2.11, + "grad_norm": 1.9872427926155305, + "learning_rate": 1.293781308687639e-06, + "loss": 0.64, + "step": 14179 + }, + { + "epoch": 2.11, + "grad_norm": 2.76703983762408, + "learning_rate": 1.2936889624560715e-06, + "loss": 0.6237, + "step": 14180 + }, + { + "epoch": 2.11, + "grad_norm": 2.154027078920593, + "learning_rate": 1.2935966134834795e-06, + "loss": 0.6211, + "step": 14181 + }, + { + "epoch": 2.12, + "grad_norm": 2.5613064202913836, + "learning_rate": 1.293504261770724e-06, + "loss": 0.6432, + "step": 14182 + }, + { + "epoch": 2.12, + "grad_norm": 2.6726155176976185, + "learning_rate": 1.2934119073186676e-06, + "loss": 0.6549, + "step": 14183 + }, + { + "epoch": 2.12, + "grad_norm": 3.1969458536606976, + "learning_rate": 1.2933195501281715e-06, + "loss": 0.6549, + "step": 14184 + }, + { + "epoch": 2.12, + "grad_norm": 2.861665870480649, + "learning_rate": 1.293227190200098e-06, + "loss": 0.6777, + "step": 14185 + }, + { + "epoch": 2.12, + "grad_norm": 4.6691881576076595, + "learning_rate": 1.2931348275353098e-06, + "loss": 0.5749, + "step": 14186 + }, + { + "epoch": 2.12, + "grad_norm": 2.316532576764467, + "learning_rate": 1.2930424621346679e-06, + "loss": 0.6198, + "step": 14187 + }, + { + "epoch": 2.12, + "grad_norm": 3.9873436551332455, + "learning_rate": 1.292950093999035e-06, + "loss": 0.6842, + "step": 14188 + }, + { + "epoch": 2.12, + "grad_norm": 3.013300316069063, + "learning_rate": 1.2928577231292727e-06, + "loss": 0.6068, + "step": 14189 + }, + { + "epoch": 2.12, + "grad_norm": 6.06598914481342, + "learning_rate": 1.2927653495262435e-06, + "loss": 0.6914, + "step": 14190 + }, + { + "epoch": 2.12, + "grad_norm": 4.303683302837477, + "learning_rate": 1.2926729731908094e-06, + "loss": 0.6113, + "step": 14191 + }, + { + "epoch": 2.12, + "grad_norm": 3.007297435125526, + "learning_rate": 1.2925805941238329e-06, + "loss": 0.5931, + "step": 14192 + }, + { + "epoch": 2.12, + "grad_norm": 3.7600639051327462, + "learning_rate": 1.2924882123261752e-06, + "loss": 0.6406, + "step": 14193 + }, + { + "epoch": 2.12, + "grad_norm": 2.5194556413804783, + "learning_rate": 1.2923958277986995e-06, + "loss": 0.6419, + "step": 14194 + }, + { + "epoch": 2.12, + "grad_norm": 2.721017061128091, + "learning_rate": 1.2923034405422676e-06, + "loss": 0.6504, + "step": 14195 + }, + { + "epoch": 2.12, + "grad_norm": 2.843090211072589, + "learning_rate": 1.292211050557742e-06, + "loss": 0.6484, + "step": 14196 + }, + { + "epoch": 2.12, + "grad_norm": 3.6405173673094224, + "learning_rate": 1.2921186578459846e-06, + "loss": 0.6439, + "step": 14197 + }, + { + "epoch": 2.12, + "grad_norm": 4.459100637853179, + "learning_rate": 1.292026262407858e-06, + "loss": 0.651, + "step": 14198 + }, + { + "epoch": 2.12, + "grad_norm": 2.5806381937404703, + "learning_rate": 1.2919338642442249e-06, + "loss": 0.6491, + "step": 14199 + }, + { + "epoch": 2.12, + "grad_norm": 2.8877044305816457, + "learning_rate": 1.2918414633559466e-06, + "loss": 0.6523, + "step": 14200 + }, + { + "epoch": 2.12, + "grad_norm": 3.632024311093966, + "learning_rate": 1.2917490597438862e-06, + "loss": 0.612, + "step": 14201 + }, + { + "epoch": 2.12, + "grad_norm": 2.381741443242542, + "learning_rate": 1.2916566534089063e-06, + "loss": 0.6146, + "step": 14202 + }, + { + "epoch": 2.12, + "grad_norm": 4.486213447857525, + "learning_rate": 1.2915642443518689e-06, + "loss": 0.6634, + "step": 14203 + }, + { + "epoch": 2.12, + "grad_norm": 2.8531931606461636, + "learning_rate": 1.2914718325736367e-06, + "loss": 0.5983, + "step": 14204 + }, + { + "epoch": 2.12, + "grad_norm": 3.781318769990147, + "learning_rate": 1.291379418075072e-06, + "loss": 0.6315, + "step": 14205 + }, + { + "epoch": 2.12, + "grad_norm": 2.9641048060511532, + "learning_rate": 1.2912870008570373e-06, + "loss": 0.64, + "step": 14206 + }, + { + "epoch": 2.12, + "grad_norm": 3.545763389979362, + "learning_rate": 1.2911945809203956e-06, + "loss": 0.6523, + "step": 14207 + }, + { + "epoch": 2.12, + "grad_norm": 3.4752746493342883, + "learning_rate": 1.291102158266009e-06, + "loss": 0.6471, + "step": 14208 + }, + { + "epoch": 2.12, + "grad_norm": 2.7098848843739973, + "learning_rate": 1.2910097328947401e-06, + "loss": 0.6582, + "step": 14209 + }, + { + "epoch": 2.12, + "grad_norm": 3.3501934418377624, + "learning_rate": 1.2909173048074518e-06, + "loss": 0.6289, + "step": 14210 + }, + { + "epoch": 2.12, + "grad_norm": 2.948924904755176, + "learning_rate": 1.2908248740050064e-06, + "loss": 0.6107, + "step": 14211 + }, + { + "epoch": 2.12, + "grad_norm": 2.736144310745111, + "learning_rate": 1.290732440488267e-06, + "loss": 0.6419, + "step": 14212 + }, + { + "epoch": 2.12, + "grad_norm": 4.498231726121505, + "learning_rate": 1.2906400042580956e-06, + "loss": 0.6374, + "step": 14213 + }, + { + "epoch": 2.12, + "grad_norm": 2.5212761917497883, + "learning_rate": 1.2905475653153555e-06, + "loss": 0.6452, + "step": 14214 + }, + { + "epoch": 2.12, + "grad_norm": 2.8445995816505056, + "learning_rate": 1.2904551236609097e-06, + "loss": 0.6191, + "step": 14215 + }, + { + "epoch": 2.12, + "grad_norm": 4.822887155951058, + "learning_rate": 1.29036267929562e-06, + "loss": 0.6693, + "step": 14216 + }, + { + "epoch": 2.12, + "grad_norm": 4.165324356500572, + "learning_rate": 1.29027023222035e-06, + "loss": 0.6126, + "step": 14217 + }, + { + "epoch": 2.12, + "grad_norm": 2.372909854863169, + "learning_rate": 1.2901777824359624e-06, + "loss": 0.623, + "step": 14218 + }, + { + "epoch": 2.12, + "grad_norm": 2.406781662752044, + "learning_rate": 1.2900853299433194e-06, + "loss": 0.6003, + "step": 14219 + }, + { + "epoch": 2.12, + "grad_norm": 4.5002492603683, + "learning_rate": 1.2899928747432847e-06, + "loss": 0.6999, + "step": 14220 + }, + { + "epoch": 2.12, + "grad_norm": 2.7907898038855277, + "learning_rate": 1.289900416836721e-06, + "loss": 0.6562, + "step": 14221 + }, + { + "epoch": 2.12, + "grad_norm": 3.569029733907951, + "learning_rate": 1.2898079562244906e-06, + "loss": 0.6328, + "step": 14222 + }, + { + "epoch": 2.12, + "grad_norm": 2.5432863679096265, + "learning_rate": 1.289715492907457e-06, + "loss": 0.651, + "step": 14223 + }, + { + "epoch": 2.12, + "grad_norm": 2.7127503692187585, + "learning_rate": 1.2896230268864833e-06, + "loss": 0.6107, + "step": 14224 + }, + { + "epoch": 2.12, + "grad_norm": 2.1953941947588236, + "learning_rate": 1.2895305581624323e-06, + "loss": 0.6107, + "step": 14225 + }, + { + "epoch": 2.12, + "grad_norm": 2.9450372374245024, + "learning_rate": 1.2894380867361671e-06, + "loss": 0.6439, + "step": 14226 + }, + { + "epoch": 2.12, + "grad_norm": 6.516629433760076, + "learning_rate": 1.2893456126085501e-06, + "loss": 0.6126, + "step": 14227 + }, + { + "epoch": 2.12, + "grad_norm": 2.4285842288597173, + "learning_rate": 1.2892531357804458e-06, + "loss": 0.6322, + "step": 14228 + }, + { + "epoch": 2.12, + "grad_norm": 2.3066988472832364, + "learning_rate": 1.2891606562527158e-06, + "loss": 0.612, + "step": 14229 + }, + { + "epoch": 2.12, + "grad_norm": 2.609044468965294, + "learning_rate": 1.2890681740262238e-06, + "loss": 0.6022, + "step": 14230 + }, + { + "epoch": 2.12, + "grad_norm": 2.4759232555753816, + "learning_rate": 1.2889756891018331e-06, + "loss": 0.599, + "step": 14231 + }, + { + "epoch": 2.12, + "grad_norm": 2.422722603652132, + "learning_rate": 1.2888832014804068e-06, + "loss": 0.6797, + "step": 14232 + }, + { + "epoch": 2.12, + "grad_norm": 2.5543668344124426, + "learning_rate": 1.2887907111628077e-06, + "loss": 0.6901, + "step": 14233 + }, + { + "epoch": 2.12, + "grad_norm": 2.3013853421568156, + "learning_rate": 1.2886982181499e-06, + "loss": 0.6576, + "step": 14234 + }, + { + "epoch": 2.12, + "grad_norm": 2.424036085599298, + "learning_rate": 1.2886057224425457e-06, + "loss": 0.6302, + "step": 14235 + }, + { + "epoch": 2.12, + "grad_norm": 2.4464148699978128, + "learning_rate": 1.2885132240416087e-06, + "loss": 0.6452, + "step": 14236 + }, + { + "epoch": 2.12, + "grad_norm": 2.6404311913313347, + "learning_rate": 1.2884207229479525e-06, + "loss": 0.6237, + "step": 14237 + }, + { + "epoch": 2.12, + "grad_norm": 4.477922447035, + "learning_rate": 1.2883282191624402e-06, + "loss": 0.6211, + "step": 14238 + }, + { + "epoch": 2.12, + "grad_norm": 2.6578360460769446, + "learning_rate": 1.288235712685935e-06, + "loss": 0.6556, + "step": 14239 + }, + { + "epoch": 2.12, + "grad_norm": 3.540707871565778, + "learning_rate": 1.2881432035193004e-06, + "loss": 0.6302, + "step": 14240 + }, + { + "epoch": 2.12, + "grad_norm": 4.996362633030808, + "learning_rate": 1.2880506916634e-06, + "loss": 0.6387, + "step": 14241 + }, + { + "epoch": 2.12, + "grad_norm": 3.1215405223165233, + "learning_rate": 1.2879581771190969e-06, + "loss": 0.6387, + "step": 14242 + }, + { + "epoch": 2.12, + "grad_norm": 2.332994571008652, + "learning_rate": 1.2878656598872546e-06, + "loss": 0.6204, + "step": 14243 + }, + { + "epoch": 2.12, + "grad_norm": 2.4254568659712925, + "learning_rate": 1.2877731399687368e-06, + "loss": 0.6374, + "step": 14244 + }, + { + "epoch": 2.12, + "grad_norm": 3.497190711584487, + "learning_rate": 1.2876806173644066e-06, + "loss": 0.6322, + "step": 14245 + }, + { + "epoch": 2.12, + "grad_norm": 2.889793440456418, + "learning_rate": 1.2875880920751278e-06, + "loss": 0.6595, + "step": 14246 + }, + { + "epoch": 2.12, + "grad_norm": 5.638137896224925, + "learning_rate": 1.287495564101764e-06, + "loss": 0.6263, + "step": 14247 + }, + { + "epoch": 2.12, + "grad_norm": 2.7169274993799535, + "learning_rate": 1.2874030334451785e-06, + "loss": 0.6576, + "step": 14248 + }, + { + "epoch": 2.13, + "grad_norm": 2.1942860381299503, + "learning_rate": 1.2873105001062356e-06, + "loss": 0.6217, + "step": 14249 + }, + { + "epoch": 2.13, + "grad_norm": 2.814712346551674, + "learning_rate": 1.287217964085798e-06, + "loss": 0.5931, + "step": 14250 + }, + { + "epoch": 2.13, + "grad_norm": 2.4002932004562116, + "learning_rate": 1.2871254253847297e-06, + "loss": 0.6706, + "step": 14251 + }, + { + "epoch": 2.13, + "grad_norm": 2.499048063815453, + "learning_rate": 1.2870328840038944e-06, + "loss": 0.6328, + "step": 14252 + }, + { + "epoch": 2.13, + "grad_norm": 3.595477252909791, + "learning_rate": 1.2869403399441559e-06, + "loss": 0.6647, + "step": 14253 + }, + { + "epoch": 2.13, + "grad_norm": 3.233040992603795, + "learning_rate": 1.2868477932063776e-06, + "loss": 0.6471, + "step": 14254 + }, + { + "epoch": 2.13, + "grad_norm": 3.0531559316593126, + "learning_rate": 1.286755243791424e-06, + "loss": 0.6634, + "step": 14255 + }, + { + "epoch": 2.13, + "grad_norm": 2.2949213151850705, + "learning_rate": 1.2866626917001578e-06, + "loss": 0.6562, + "step": 14256 + }, + { + "epoch": 2.13, + "grad_norm": 6.010484363737371, + "learning_rate": 1.2865701369334438e-06, + "loss": 0.6641, + "step": 14257 + }, + { + "epoch": 2.13, + "grad_norm": 2.6827157845380496, + "learning_rate": 1.2864775794921452e-06, + "loss": 0.6367, + "step": 14258 + }, + { + "epoch": 2.13, + "grad_norm": 2.7653155787772454, + "learning_rate": 1.2863850193771258e-06, + "loss": 0.6823, + "step": 14259 + }, + { + "epoch": 2.13, + "grad_norm": 2.386076277297341, + "learning_rate": 1.28629245658925e-06, + "loss": 0.6823, + "step": 14260 + }, + { + "epoch": 2.13, + "grad_norm": 5.8781943189564645, + "learning_rate": 1.2861998911293813e-06, + "loss": 0.6895, + "step": 14261 + }, + { + "epoch": 2.13, + "grad_norm": 2.7578657001424207, + "learning_rate": 1.2861073229983837e-06, + "loss": 0.6706, + "step": 14262 + }, + { + "epoch": 2.13, + "grad_norm": 3.0070267931272494, + "learning_rate": 1.2860147521971209e-06, + "loss": 0.6595, + "step": 14263 + }, + { + "epoch": 2.13, + "grad_norm": 2.276155493323584, + "learning_rate": 1.2859221787264574e-06, + "loss": 0.6725, + "step": 14264 + }, + { + "epoch": 2.13, + "grad_norm": 2.0670065242420548, + "learning_rate": 1.2858296025872571e-06, + "loss": 0.6699, + "step": 14265 + }, + { + "epoch": 2.13, + "grad_norm": 2.5307223277848236, + "learning_rate": 1.2857370237803833e-06, + "loss": 0.6341, + "step": 14266 + }, + { + "epoch": 2.13, + "grad_norm": 2.039436515518532, + "learning_rate": 1.285644442306701e-06, + "loss": 0.6602, + "step": 14267 + }, + { + "epoch": 2.13, + "grad_norm": 4.669518602234941, + "learning_rate": 1.2855518581670737e-06, + "loss": 0.6602, + "step": 14268 + }, + { + "epoch": 2.13, + "grad_norm": 4.623271422215557, + "learning_rate": 1.2854592713623658e-06, + "loss": 0.6465, + "step": 14269 + }, + { + "epoch": 2.13, + "grad_norm": 3.8803644781489632, + "learning_rate": 1.285366681893441e-06, + "loss": 0.6224, + "step": 14270 + }, + { + "epoch": 2.13, + "grad_norm": 4.617719435607542, + "learning_rate": 1.285274089761164e-06, + "loss": 0.6361, + "step": 14271 + }, + { + "epoch": 2.13, + "grad_norm": 3.4910442785080247, + "learning_rate": 1.2851814949663985e-06, + "loss": 0.6543, + "step": 14272 + }, + { + "epoch": 2.13, + "grad_norm": 3.678281625371637, + "learning_rate": 1.285088897510009e-06, + "loss": 0.6686, + "step": 14273 + }, + { + "epoch": 2.13, + "grad_norm": 3.8369771651619597, + "learning_rate": 1.2849962973928596e-06, + "loss": 0.6797, + "step": 14274 + }, + { + "epoch": 2.13, + "grad_norm": 1.8408461909355114, + "learning_rate": 1.2849036946158146e-06, + "loss": 0.64, + "step": 14275 + }, + { + "epoch": 2.13, + "grad_norm": 2.1551327810082053, + "learning_rate": 1.2848110891797381e-06, + "loss": 0.6654, + "step": 14276 + }, + { + "epoch": 2.13, + "grad_norm": 1.8257579840828795, + "learning_rate": 1.2847184810854948e-06, + "loss": 0.6751, + "step": 14277 + }, + { + "epoch": 2.13, + "grad_norm": 2.0637996934821228, + "learning_rate": 1.2846258703339482e-06, + "loss": 0.6322, + "step": 14278 + }, + { + "epoch": 2.13, + "grad_norm": 3.9832806225663684, + "learning_rate": 1.2845332569259636e-06, + "loss": 0.6784, + "step": 14279 + }, + { + "epoch": 2.13, + "grad_norm": 2.3855638370068135, + "learning_rate": 1.2844406408624047e-06, + "loss": 0.6374, + "step": 14280 + }, + { + "epoch": 2.13, + "grad_norm": 2.7922970367700075, + "learning_rate": 1.2843480221441365e-06, + "loss": 0.653, + "step": 14281 + }, + { + "epoch": 2.13, + "grad_norm": 2.006888172654611, + "learning_rate": 1.2842554007720228e-06, + "loss": 0.6478, + "step": 14282 + }, + { + "epoch": 2.13, + "grad_norm": 2.026645154923924, + "learning_rate": 1.2841627767469283e-06, + "loss": 0.6777, + "step": 14283 + }, + { + "epoch": 2.13, + "grad_norm": 2.025777475349999, + "learning_rate": 1.2840701500697179e-06, + "loss": 0.6439, + "step": 14284 + }, + { + "epoch": 2.13, + "grad_norm": 1.6929191376105182, + "learning_rate": 1.283977520741255e-06, + "loss": 0.6517, + "step": 14285 + }, + { + "epoch": 2.13, + "grad_norm": 2.234800299314572, + "learning_rate": 1.283884888762405e-06, + "loss": 0.6289, + "step": 14286 + }, + { + "epoch": 2.13, + "grad_norm": 2.4549225937194357, + "learning_rate": 1.2837922541340325e-06, + "loss": 0.6569, + "step": 14287 + }, + { + "epoch": 2.13, + "grad_norm": 2.0494549540886258, + "learning_rate": 1.2836996168570014e-06, + "loss": 0.6921, + "step": 14288 + }, + { + "epoch": 2.13, + "grad_norm": 1.9421055233707631, + "learning_rate": 1.2836069769321769e-06, + "loss": 0.638, + "step": 14289 + }, + { + "epoch": 2.13, + "grad_norm": 1.9844820657753575, + "learning_rate": 1.2835143343604233e-06, + "loss": 0.6576, + "step": 14290 + }, + { + "epoch": 2.13, + "grad_norm": 1.984119118711319, + "learning_rate": 1.2834216891426055e-06, + "loss": 0.6842, + "step": 14291 + }, + { + "epoch": 2.13, + "grad_norm": 3.2825198409061582, + "learning_rate": 1.283329041279588e-06, + "loss": 0.6543, + "step": 14292 + }, + { + "epoch": 2.13, + "grad_norm": 3.1868631937260066, + "learning_rate": 1.2832363907722352e-06, + "loss": 0.6445, + "step": 14293 + }, + { + "epoch": 2.13, + "grad_norm": 3.6831025075360704, + "learning_rate": 1.2831437376214128e-06, + "loss": 0.6484, + "step": 14294 + }, + { + "epoch": 2.13, + "grad_norm": 2.020379261891402, + "learning_rate": 1.2830510818279842e-06, + "loss": 0.6732, + "step": 14295 + }, + { + "epoch": 2.13, + "grad_norm": 2.3608749981122834, + "learning_rate": 1.282958423392815e-06, + "loss": 0.6523, + "step": 14296 + }, + { + "epoch": 2.13, + "grad_norm": 2.405406161712065, + "learning_rate": 1.28286576231677e-06, + "loss": 0.6771, + "step": 14297 + }, + { + "epoch": 2.13, + "grad_norm": 6.172391307465908, + "learning_rate": 1.2827730986007135e-06, + "loss": 0.6777, + "step": 14298 + }, + { + "epoch": 2.13, + "grad_norm": 2.499650469583569, + "learning_rate": 1.282680432245511e-06, + "loss": 0.6569, + "step": 14299 + }, + { + "epoch": 2.13, + "grad_norm": 2.998835586604439, + "learning_rate": 1.2825877632520268e-06, + "loss": 0.6348, + "step": 14300 + }, + { + "epoch": 2.13, + "grad_norm": 1.9788671263122677, + "learning_rate": 1.2824950916211262e-06, + "loss": 0.7044, + "step": 14301 + }, + { + "epoch": 2.13, + "grad_norm": 2.626719543601096, + "learning_rate": 1.282402417353674e-06, + "loss": 0.6133, + "step": 14302 + }, + { + "epoch": 2.13, + "grad_norm": 1.9175972043008314, + "learning_rate": 1.2823097404505345e-06, + "loss": 0.6406, + "step": 14303 + }, + { + "epoch": 2.13, + "grad_norm": 5.019320880609217, + "learning_rate": 1.2822170609125738e-06, + "loss": 0.6367, + "step": 14304 + }, + { + "epoch": 2.13, + "grad_norm": 2.2001161074841797, + "learning_rate": 1.282124378740656e-06, + "loss": 0.6452, + "step": 14305 + }, + { + "epoch": 2.13, + "grad_norm": 2.072416876831584, + "learning_rate": 1.2820316939356466e-06, + "loss": 0.6842, + "step": 14306 + }, + { + "epoch": 2.13, + "grad_norm": 2.494274285247469, + "learning_rate": 1.2819390064984103e-06, + "loss": 0.6354, + "step": 14307 + }, + { + "epoch": 2.13, + "grad_norm": 4.340434477748056, + "learning_rate": 1.2818463164298125e-06, + "loss": 0.6465, + "step": 14308 + }, + { + "epoch": 2.13, + "grad_norm": 2.1939010364296494, + "learning_rate": 1.281753623730718e-06, + "loss": 0.6309, + "step": 14309 + }, + { + "epoch": 2.13, + "grad_norm": 2.402360289214628, + "learning_rate": 1.2816609284019922e-06, + "loss": 0.6777, + "step": 14310 + }, + { + "epoch": 2.13, + "grad_norm": 2.2340465976949515, + "learning_rate": 1.2815682304445e-06, + "loss": 0.6094, + "step": 14311 + }, + { + "epoch": 2.13, + "grad_norm": 2.221595434777645, + "learning_rate": 1.2814755298591063e-06, + "loss": 0.6562, + "step": 14312 + }, + { + "epoch": 2.13, + "grad_norm": 2.0932173424131295, + "learning_rate": 1.2813828266466767e-06, + "loss": 0.6276, + "step": 14313 + }, + { + "epoch": 2.13, + "grad_norm": 3.593299486411927, + "learning_rate": 1.2812901208080765e-06, + "loss": 0.6159, + "step": 14314 + }, + { + "epoch": 2.13, + "grad_norm": 3.393834064830222, + "learning_rate": 1.2811974123441705e-06, + "loss": 0.638, + "step": 14315 + }, + { + "epoch": 2.14, + "grad_norm": 2.5714438294728437, + "learning_rate": 1.2811047012558242e-06, + "loss": 0.6003, + "step": 14316 + }, + { + "epoch": 2.14, + "grad_norm": 2.289644798164945, + "learning_rate": 1.281011987543903e-06, + "loss": 0.6087, + "step": 14317 + }, + { + "epoch": 2.14, + "grad_norm": 3.802676995689498, + "learning_rate": 1.280919271209272e-06, + "loss": 0.6673, + "step": 14318 + }, + { + "epoch": 2.14, + "grad_norm": 3.488485525546758, + "learning_rate": 1.2808265522527966e-06, + "loss": 0.6367, + "step": 14319 + }, + { + "epoch": 2.14, + "grad_norm": 3.705454420558888, + "learning_rate": 1.280733830675342e-06, + "loss": 0.6816, + "step": 14320 + }, + { + "epoch": 2.14, + "grad_norm": 3.6789555061308685, + "learning_rate": 1.2806411064777743e-06, + "loss": 0.6602, + "step": 14321 + }, + { + "epoch": 2.14, + "grad_norm": 2.5663880453318777, + "learning_rate": 1.2805483796609577e-06, + "loss": 0.623, + "step": 14322 + }, + { + "epoch": 2.14, + "grad_norm": 2.4674946626370664, + "learning_rate": 1.2804556502257583e-06, + "loss": 0.6823, + "step": 14323 + }, + { + "epoch": 2.14, + "grad_norm": 3.470307992325265, + "learning_rate": 1.280362918173042e-06, + "loss": 0.6556, + "step": 14324 + }, + { + "epoch": 2.14, + "grad_norm": 3.9172016111525236, + "learning_rate": 1.2802701835036733e-06, + "loss": 0.6764, + "step": 14325 + }, + { + "epoch": 2.14, + "grad_norm": 2.793268690498671, + "learning_rate": 1.2801774462185184e-06, + "loss": 0.6283, + "step": 14326 + }, + { + "epoch": 2.14, + "grad_norm": 2.307567009692997, + "learning_rate": 1.2800847063184427e-06, + "loss": 0.6361, + "step": 14327 + }, + { + "epoch": 2.14, + "grad_norm": 2.960820689362724, + "learning_rate": 1.2799919638043118e-06, + "loss": 0.6335, + "step": 14328 + }, + { + "epoch": 2.14, + "grad_norm": 3.7078754545633577, + "learning_rate": 1.2798992186769906e-06, + "loss": 0.6133, + "step": 14329 + }, + { + "epoch": 2.14, + "grad_norm": 3.263709850727529, + "learning_rate": 1.2798064709373457e-06, + "loss": 0.6289, + "step": 14330 + }, + { + "epoch": 2.14, + "grad_norm": 3.7858303880939297, + "learning_rate": 1.279713720586242e-06, + "loss": 0.6322, + "step": 14331 + }, + { + "epoch": 2.14, + "grad_norm": 3.031680381571265, + "learning_rate": 1.2796209676245455e-06, + "loss": 0.6289, + "step": 14332 + }, + { + "epoch": 2.14, + "grad_norm": 2.4534847809535987, + "learning_rate": 1.279528212053122e-06, + "loss": 0.651, + "step": 14333 + }, + { + "epoch": 2.14, + "grad_norm": 7.112505322801568, + "learning_rate": 1.2794354538728367e-06, + "loss": 0.6576, + "step": 14334 + }, + { + "epoch": 2.14, + "grad_norm": 3.3089221367182007, + "learning_rate": 1.2793426930845556e-06, + "loss": 0.6569, + "step": 14335 + }, + { + "epoch": 2.14, + "grad_norm": 2.461935751213244, + "learning_rate": 1.2792499296891447e-06, + "loss": 0.6426, + "step": 14336 + }, + { + "epoch": 2.14, + "grad_norm": 2.912685517050995, + "learning_rate": 1.2791571636874692e-06, + "loss": 0.6475, + "step": 14337 + }, + { + "epoch": 2.14, + "grad_norm": 2.5157231982517976, + "learning_rate": 1.2790643950803952e-06, + "loss": 0.6126, + "step": 14338 + }, + { + "epoch": 2.14, + "grad_norm": 2.8064392758964507, + "learning_rate": 1.2789716238687888e-06, + "loss": 0.6393, + "step": 14339 + }, + { + "epoch": 2.14, + "grad_norm": 2.7813202587127854, + "learning_rate": 1.2788788500535154e-06, + "loss": 0.6263, + "step": 14340 + }, + { + "epoch": 2.14, + "grad_norm": 2.6623238105691276, + "learning_rate": 1.278786073635441e-06, + "loss": 0.6497, + "step": 14341 + }, + { + "epoch": 2.14, + "grad_norm": 3.899417201225184, + "learning_rate": 1.2786932946154316e-06, + "loss": 0.6204, + "step": 14342 + }, + { + "epoch": 2.14, + "grad_norm": 2.3542829662186495, + "learning_rate": 1.2786005129943528e-06, + "loss": 0.6315, + "step": 14343 + }, + { + "epoch": 2.14, + "grad_norm": 5.424016077603569, + "learning_rate": 1.2785077287730713e-06, + "loss": 0.6999, + "step": 14344 + }, + { + "epoch": 2.14, + "grad_norm": 2.7892051878276924, + "learning_rate": 1.2784149419524521e-06, + "loss": 0.6582, + "step": 14345 + }, + { + "epoch": 2.14, + "grad_norm": 2.7233057763425847, + "learning_rate": 1.2783221525333616e-06, + "loss": 0.6595, + "step": 14346 + }, + { + "epoch": 2.14, + "grad_norm": 2.3153040746332842, + "learning_rate": 1.2782293605166663e-06, + "loss": 0.5996, + "step": 14347 + }, + { + "epoch": 2.14, + "grad_norm": 3.6179941095525687, + "learning_rate": 1.2781365659032313e-06, + "loss": 0.6419, + "step": 14348 + }, + { + "epoch": 2.14, + "grad_norm": 6.501462159709191, + "learning_rate": 1.278043768693923e-06, + "loss": 0.64, + "step": 14349 + }, + { + "epoch": 2.14, + "grad_norm": 2.6329805341081, + "learning_rate": 1.277950968889608e-06, + "loss": 0.6725, + "step": 14350 + }, + { + "epoch": 2.14, + "grad_norm": 4.712645441570771, + "learning_rate": 1.277858166491152e-06, + "loss": 0.6738, + "step": 14351 + }, + { + "epoch": 2.14, + "grad_norm": 4.074296320236339, + "learning_rate": 1.277765361499421e-06, + "loss": 0.64, + "step": 14352 + }, + { + "epoch": 2.14, + "grad_norm": 2.9324032721471185, + "learning_rate": 1.2776725539152815e-06, + "loss": 0.6361, + "step": 14353 + }, + { + "epoch": 2.14, + "grad_norm": 2.1611995074447448, + "learning_rate": 1.2775797437395995e-06, + "loss": 0.6094, + "step": 14354 + }, + { + "epoch": 2.14, + "grad_norm": 2.9260297102470934, + "learning_rate": 1.277486930973241e-06, + "loss": 0.6999, + "step": 14355 + }, + { + "epoch": 2.14, + "grad_norm": 2.085119094622811, + "learning_rate": 1.2773941156170724e-06, + "loss": 0.6328, + "step": 14356 + }, + { + "epoch": 2.14, + "grad_norm": 3.158862401662736, + "learning_rate": 1.2773012976719601e-06, + "loss": 0.681, + "step": 14357 + }, + { + "epoch": 2.14, + "grad_norm": 2.8925685686149105, + "learning_rate": 1.2772084771387706e-06, + "loss": 0.6589, + "step": 14358 + }, + { + "epoch": 2.14, + "grad_norm": 3.284447914515811, + "learning_rate": 1.2771156540183694e-06, + "loss": 0.6914, + "step": 14359 + }, + { + "epoch": 2.14, + "grad_norm": 5.821471959172181, + "learning_rate": 1.2770228283116237e-06, + "loss": 0.6693, + "step": 14360 + }, + { + "epoch": 2.14, + "grad_norm": 1.9839987442460854, + "learning_rate": 1.276930000019399e-06, + "loss": 0.6243, + "step": 14361 + }, + { + "epoch": 2.14, + "grad_norm": 2.4903849818090724, + "learning_rate": 1.276837169142562e-06, + "loss": 0.61, + "step": 14362 + }, + { + "epoch": 2.14, + "grad_norm": 5.835560270918959, + "learning_rate": 1.27674433568198e-06, + "loss": 0.6257, + "step": 14363 + }, + { + "epoch": 2.14, + "grad_norm": 2.094130972801763, + "learning_rate": 1.2766514996385181e-06, + "loss": 0.6367, + "step": 14364 + }, + { + "epoch": 2.14, + "grad_norm": 4.394987424844345, + "learning_rate": 1.2765586610130435e-06, + "loss": 0.6784, + "step": 14365 + }, + { + "epoch": 2.14, + "grad_norm": 2.537434250716388, + "learning_rate": 1.276465819806422e-06, + "loss": 0.6224, + "step": 14366 + }, + { + "epoch": 2.14, + "grad_norm": 1.9923943985317631, + "learning_rate": 1.276372976019521e-06, + "loss": 0.6055, + "step": 14367 + }, + { + "epoch": 2.14, + "grad_norm": 2.657861690783009, + "learning_rate": 1.2762801296532064e-06, + "loss": 0.6641, + "step": 14368 + }, + { + "epoch": 2.14, + "grad_norm": 5.700944465535807, + "learning_rate": 1.2761872807083448e-06, + "loss": 0.6458, + "step": 14369 + }, + { + "epoch": 2.14, + "grad_norm": 2.1955545928657707, + "learning_rate": 1.2760944291858034e-06, + "loss": 0.6383, + "step": 14370 + }, + { + "epoch": 2.14, + "grad_norm": 4.593337073744588, + "learning_rate": 1.2760015750864476e-06, + "loss": 0.6504, + "step": 14371 + }, + { + "epoch": 2.14, + "grad_norm": 2.8906843705986134, + "learning_rate": 1.275908718411145e-06, + "loss": 0.6432, + "step": 14372 + }, + { + "epoch": 2.14, + "grad_norm": 3.601587415580353, + "learning_rate": 1.2758158591607622e-06, + "loss": 0.6621, + "step": 14373 + }, + { + "epoch": 2.14, + "grad_norm": 2.258031416253772, + "learning_rate": 1.2757229973361654e-06, + "loss": 0.6322, + "step": 14374 + }, + { + "epoch": 2.14, + "grad_norm": 2.450460395290824, + "learning_rate": 1.2756301329382213e-06, + "loss": 0.6523, + "step": 14375 + }, + { + "epoch": 2.14, + "grad_norm": 2.4686313162256295, + "learning_rate": 1.275537265967797e-06, + "loss": 0.6569, + "step": 14376 + }, + { + "epoch": 2.14, + "grad_norm": 2.093842948327486, + "learning_rate": 1.2754443964257588e-06, + "loss": 0.6289, + "step": 14377 + }, + { + "epoch": 2.14, + "grad_norm": 2.510602367215658, + "learning_rate": 1.2753515243129738e-06, + "loss": 0.6582, + "step": 14378 + }, + { + "epoch": 2.14, + "grad_norm": 3.0817118768381233, + "learning_rate": 1.275258649630309e-06, + "loss": 0.6139, + "step": 14379 + }, + { + "epoch": 2.14, + "grad_norm": 3.5130947520151334, + "learning_rate": 1.2751657723786305e-06, + "loss": 0.6562, + "step": 14380 + }, + { + "epoch": 2.14, + "grad_norm": 2.5641773682579507, + "learning_rate": 1.2750728925588056e-06, + "loss": 0.6491, + "step": 14381 + }, + { + "epoch": 2.14, + "grad_norm": 4.193486192375764, + "learning_rate": 1.274980010171701e-06, + "loss": 0.6615, + "step": 14382 + }, + { + "epoch": 2.15, + "grad_norm": 4.647080078004876, + "learning_rate": 1.2748871252181839e-06, + "loss": 0.6582, + "step": 14383 + }, + { + "epoch": 2.15, + "grad_norm": 4.954678888875232, + "learning_rate": 1.2747942376991207e-06, + "loss": 0.64, + "step": 14384 + }, + { + "epoch": 2.15, + "grad_norm": 3.6508762270224975, + "learning_rate": 1.2747013476153785e-06, + "loss": 0.5964, + "step": 14385 + }, + { + "epoch": 2.15, + "grad_norm": 2.4649682713536794, + "learning_rate": 1.2746084549678245e-06, + "loss": 0.6224, + "step": 14386 + }, + { + "epoch": 2.15, + "grad_norm": 3.1033686456659857, + "learning_rate": 1.2745155597573256e-06, + "loss": 0.6452, + "step": 14387 + }, + { + "epoch": 2.15, + "grad_norm": 5.639178012896704, + "learning_rate": 1.2744226619847484e-06, + "loss": 0.6634, + "step": 14388 + }, + { + "epoch": 2.15, + "grad_norm": 2.9772914282375433, + "learning_rate": 1.2743297616509606e-06, + "loss": 0.6745, + "step": 14389 + }, + { + "epoch": 2.15, + "grad_norm": 4.821960758209532, + "learning_rate": 1.2742368587568287e-06, + "loss": 0.6094, + "step": 14390 + }, + { + "epoch": 2.15, + "grad_norm": 3.730566824450621, + "learning_rate": 1.2741439533032197e-06, + "loss": 0.6315, + "step": 14391 + }, + { + "epoch": 2.15, + "grad_norm": 4.11575495239181, + "learning_rate": 1.2740510452910011e-06, + "loss": 0.6842, + "step": 14392 + }, + { + "epoch": 2.15, + "grad_norm": 3.0142758231327185, + "learning_rate": 1.2739581347210401e-06, + "loss": 0.6315, + "step": 14393 + }, + { + "epoch": 2.15, + "grad_norm": 7.648492209854851, + "learning_rate": 1.2738652215942037e-06, + "loss": 0.679, + "step": 14394 + }, + { + "epoch": 2.15, + "grad_norm": 2.621379422590176, + "learning_rate": 1.2737723059113585e-06, + "loss": 0.651, + "step": 14395 + }, + { + "epoch": 2.15, + "grad_norm": 3.9291080816226547, + "learning_rate": 1.2736793876733724e-06, + "loss": 0.6374, + "step": 14396 + }, + { + "epoch": 2.15, + "grad_norm": 3.1364650544627133, + "learning_rate": 1.2735864668811123e-06, + "loss": 0.611, + "step": 14397 + }, + { + "epoch": 2.15, + "grad_norm": 2.615692533352473, + "learning_rate": 1.2734935435354455e-06, + "loss": 0.6217, + "step": 14398 + }, + { + "epoch": 2.15, + "grad_norm": 3.3649202363850907, + "learning_rate": 1.2734006176372394e-06, + "loss": 0.6621, + "step": 14399 + }, + { + "epoch": 2.15, + "grad_norm": 4.707241234235636, + "learning_rate": 1.2733076891873611e-06, + "loss": 0.6914, + "step": 14400 + }, + { + "epoch": 2.15, + "grad_norm": 2.4714930846212275, + "learning_rate": 1.2732147581866777e-06, + "loss": 0.6042, + "step": 14401 + }, + { + "epoch": 2.15, + "grad_norm": 3.3757434045235866, + "learning_rate": 1.273121824636057e-06, + "loss": 0.6426, + "step": 14402 + }, + { + "epoch": 2.15, + "grad_norm": 2.550017694337806, + "learning_rate": 1.2730288885363666e-06, + "loss": 0.6055, + "step": 14403 + }, + { + "epoch": 2.15, + "grad_norm": 4.266206507222662, + "learning_rate": 1.2729359498884725e-06, + "loss": 0.6452, + "step": 14404 + }, + { + "epoch": 2.15, + "grad_norm": 6.476219437175364, + "learning_rate": 1.2728430086932438e-06, + "loss": 0.735, + "step": 14405 + }, + { + "epoch": 2.15, + "grad_norm": 3.062788548505642, + "learning_rate": 1.272750064951547e-06, + "loss": 0.6673, + "step": 14406 + }, + { + "epoch": 2.15, + "grad_norm": 2.595104521344208, + "learning_rate": 1.2726571186642496e-06, + "loss": 0.6387, + "step": 14407 + }, + { + "epoch": 2.15, + "grad_norm": 2.8994452548909324, + "learning_rate": 1.2725641698322194e-06, + "loss": 0.6621, + "step": 14408 + }, + { + "epoch": 2.15, + "grad_norm": 2.3741363252483665, + "learning_rate": 1.2724712184563234e-06, + "loss": 0.6432, + "step": 14409 + }, + { + "epoch": 2.15, + "grad_norm": 2.9171320374930754, + "learning_rate": 1.2723782645374297e-06, + "loss": 0.6576, + "step": 14410 + }, + { + "epoch": 2.15, + "grad_norm": 5.457912589496075, + "learning_rate": 1.2722853080764053e-06, + "loss": 0.6523, + "step": 14411 + }, + { + "epoch": 2.15, + "grad_norm": 6.560604298965864, + "learning_rate": 1.2721923490741183e-06, + "loss": 0.6419, + "step": 14412 + }, + { + "epoch": 2.15, + "grad_norm": 2.2654571459035227, + "learning_rate": 1.2720993875314362e-06, + "loss": 0.6354, + "step": 14413 + }, + { + "epoch": 2.15, + "grad_norm": 2.59633079265889, + "learning_rate": 1.2720064234492258e-06, + "loss": 0.6348, + "step": 14414 + }, + { + "epoch": 2.15, + "grad_norm": 5.2927384491091125, + "learning_rate": 1.271913456828356e-06, + "loss": 0.6908, + "step": 14415 + }, + { + "epoch": 2.15, + "grad_norm": 2.3687353475372874, + "learning_rate": 1.2718204876696936e-06, + "loss": 0.6133, + "step": 14416 + }, + { + "epoch": 2.15, + "grad_norm": 2.3393939258442886, + "learning_rate": 1.2717275159741065e-06, + "loss": 0.6217, + "step": 14417 + }, + { + "epoch": 2.15, + "grad_norm": 2.4143154080653613, + "learning_rate": 1.2716345417424626e-06, + "loss": 0.6152, + "step": 14418 + }, + { + "epoch": 2.15, + "grad_norm": 2.5625752648916786, + "learning_rate": 1.2715415649756293e-06, + "loss": 0.6595, + "step": 14419 + }, + { + "epoch": 2.15, + "grad_norm": 4.404956669186006, + "learning_rate": 1.2714485856744747e-06, + "loss": 0.6797, + "step": 14420 + }, + { + "epoch": 2.15, + "grad_norm": 4.914165886686815, + "learning_rate": 1.2713556038398664e-06, + "loss": 0.6465, + "step": 14421 + }, + { + "epoch": 2.15, + "grad_norm": 3.2889820360867863, + "learning_rate": 1.2712626194726724e-06, + "loss": 0.6211, + "step": 14422 + }, + { + "epoch": 2.15, + "grad_norm": 3.3911438870459385, + "learning_rate": 1.2711696325737604e-06, + "loss": 0.6393, + "step": 14423 + }, + { + "epoch": 2.15, + "grad_norm": 2.131110842740263, + "learning_rate": 1.271076643143998e-06, + "loss": 0.6224, + "step": 14424 + }, + { + "epoch": 2.15, + "grad_norm": 2.6866194940910786, + "learning_rate": 1.2709836511842532e-06, + "loss": 0.6296, + "step": 14425 + }, + { + "epoch": 2.15, + "grad_norm": 2.8148885776111596, + "learning_rate": 1.2708906566953947e-06, + "loss": 0.5736, + "step": 14426 + }, + { + "epoch": 2.15, + "grad_norm": 3.7668089709143127, + "learning_rate": 1.2707976596782892e-06, + "loss": 0.6335, + "step": 14427 + }, + { + "epoch": 2.15, + "grad_norm": 2.5786962853744866, + "learning_rate": 1.270704660133805e-06, + "loss": 0.6289, + "step": 14428 + }, + { + "epoch": 2.15, + "grad_norm": 3.9787995316608042, + "learning_rate": 1.270611658062811e-06, + "loss": 0.6706, + "step": 14429 + }, + { + "epoch": 2.15, + "grad_norm": 3.069642190483013, + "learning_rate": 1.270518653466174e-06, + "loss": 0.6367, + "step": 14430 + }, + { + "epoch": 2.15, + "grad_norm": 2.7050478577073878, + "learning_rate": 1.2704256463447625e-06, + "loss": 0.6374, + "step": 14431 + }, + { + "epoch": 2.15, + "grad_norm": 3.2952539013150837, + "learning_rate": 1.2703326366994444e-06, + "loss": 0.6452, + "step": 14432 + }, + { + "epoch": 2.15, + "grad_norm": 3.4640732897530104, + "learning_rate": 1.270239624531088e-06, + "loss": 0.6296, + "step": 14433 + }, + { + "epoch": 2.15, + "grad_norm": 4.129986741701044, + "learning_rate": 1.2701466098405615e-06, + "loss": 0.6784, + "step": 14434 + }, + { + "epoch": 2.15, + "grad_norm": 4.830750926101436, + "learning_rate": 1.2700535926287326e-06, + "loss": 0.6523, + "step": 14435 + }, + { + "epoch": 2.15, + "grad_norm": 4.714835502998659, + "learning_rate": 1.2699605728964694e-06, + "loss": 0.6465, + "step": 14436 + }, + { + "epoch": 2.15, + "grad_norm": 3.172082500070012, + "learning_rate": 1.2698675506446407e-06, + "loss": 0.638, + "step": 14437 + }, + { + "epoch": 2.15, + "grad_norm": 3.1554492061731487, + "learning_rate": 1.2697745258741142e-06, + "loss": 0.6491, + "step": 14438 + }, + { + "epoch": 2.15, + "grad_norm": 3.222977408635115, + "learning_rate": 1.2696814985857583e-06, + "loss": 0.5905, + "step": 14439 + }, + { + "epoch": 2.15, + "grad_norm": 2.7597916122370045, + "learning_rate": 1.2695884687804409e-06, + "loss": 0.6419, + "step": 14440 + }, + { + "epoch": 2.15, + "grad_norm": 3.403424634638367, + "learning_rate": 1.2694954364590303e-06, + "loss": 0.6237, + "step": 14441 + }, + { + "epoch": 2.15, + "grad_norm": 2.6970650666333884, + "learning_rate": 1.2694024016223952e-06, + "loss": 0.6706, + "step": 14442 + }, + { + "epoch": 2.15, + "grad_norm": 4.742674173456901, + "learning_rate": 1.269309364271404e-06, + "loss": 0.6406, + "step": 14443 + }, + { + "epoch": 2.15, + "grad_norm": 2.406069818907037, + "learning_rate": 1.2692163244069241e-06, + "loss": 0.6439, + "step": 14444 + }, + { + "epoch": 2.15, + "grad_norm": 2.543322116355035, + "learning_rate": 1.2691232820298247e-06, + "loss": 0.5749, + "step": 14445 + }, + { + "epoch": 2.15, + "grad_norm": 3.211133659820332, + "learning_rate": 1.269030237140974e-06, + "loss": 0.6816, + "step": 14446 + }, + { + "epoch": 2.15, + "grad_norm": 5.269369256220564, + "learning_rate": 1.2689371897412402e-06, + "loss": 0.6549, + "step": 14447 + }, + { + "epoch": 2.15, + "grad_norm": 4.565025406643107, + "learning_rate": 1.2688441398314917e-06, + "loss": 0.6654, + "step": 14448 + }, + { + "epoch": 2.15, + "grad_norm": 4.100804352126534, + "learning_rate": 1.2687510874125973e-06, + "loss": 0.6191, + "step": 14449 + }, + { + "epoch": 2.16, + "grad_norm": 3.5254833494352797, + "learning_rate": 1.2686580324854253e-06, + "loss": 0.6458, + "step": 14450 + }, + { + "epoch": 2.16, + "grad_norm": 2.280874958637613, + "learning_rate": 1.2685649750508435e-06, + "loss": 0.6406, + "step": 14451 + }, + { + "epoch": 2.16, + "grad_norm": 4.597100240295592, + "learning_rate": 1.2684719151097216e-06, + "loss": 0.6354, + "step": 14452 + }, + { + "epoch": 2.16, + "grad_norm": 2.4290086838347986, + "learning_rate": 1.2683788526629275e-06, + "loss": 0.6217, + "step": 14453 + }, + { + "epoch": 2.16, + "grad_norm": 2.568690761624818, + "learning_rate": 1.2682857877113298e-06, + "loss": 0.6471, + "step": 14454 + }, + { + "epoch": 2.16, + "grad_norm": 3.041428782628014, + "learning_rate": 1.2681927202557972e-06, + "loss": 0.6217, + "step": 14455 + }, + { + "epoch": 2.16, + "grad_norm": 5.2858229794898675, + "learning_rate": 1.2680996502971982e-06, + "loss": 0.6224, + "step": 14456 + }, + { + "epoch": 2.16, + "grad_norm": 3.1141700056602963, + "learning_rate": 1.268006577836401e-06, + "loss": 0.6296, + "step": 14457 + }, + { + "epoch": 2.16, + "grad_norm": 2.5204941302195083, + "learning_rate": 1.2679135028742752e-06, + "loss": 0.6432, + "step": 14458 + }, + { + "epoch": 2.16, + "grad_norm": 2.601079799855409, + "learning_rate": 1.2678204254116892e-06, + "loss": 0.5866, + "step": 14459 + }, + { + "epoch": 2.16, + "grad_norm": 2.709101582757441, + "learning_rate": 1.2677273454495112e-06, + "loss": 0.6634, + "step": 14460 + }, + { + "epoch": 2.16, + "grad_norm": 4.859956969336383, + "learning_rate": 1.2676342629886102e-06, + "loss": 0.6914, + "step": 14461 + }, + { + "epoch": 2.16, + "grad_norm": 7.06459072095258, + "learning_rate": 1.2675411780298548e-06, + "loss": 0.6204, + "step": 14462 + }, + { + "epoch": 2.16, + "grad_norm": 2.6902061818020337, + "learning_rate": 1.2674480905741142e-06, + "loss": 0.6367, + "step": 14463 + }, + { + "epoch": 2.16, + "grad_norm": 3.3431014545800637, + "learning_rate": 1.2673550006222566e-06, + "loss": 0.6133, + "step": 14464 + }, + { + "epoch": 2.16, + "grad_norm": 4.5520782768679595, + "learning_rate": 1.2672619081751514e-06, + "loss": 0.5898, + "step": 14465 + }, + { + "epoch": 2.16, + "grad_norm": 4.9816436962633235, + "learning_rate": 1.2671688132336671e-06, + "loss": 0.6348, + "step": 14466 + }, + { + "epoch": 2.16, + "grad_norm": 3.581020798668728, + "learning_rate": 1.2670757157986725e-06, + "loss": 0.6139, + "step": 14467 + }, + { + "epoch": 2.16, + "grad_norm": 4.216797118385344, + "learning_rate": 1.2669826158710368e-06, + "loss": 0.6139, + "step": 14468 + }, + { + "epoch": 2.16, + "grad_norm": 3.7908854441613373, + "learning_rate": 1.2668895134516287e-06, + "loss": 0.6647, + "step": 14469 + }, + { + "epoch": 2.16, + "grad_norm": 2.731089144510179, + "learning_rate": 1.266796408541317e-06, + "loss": 0.6198, + "step": 14470 + }, + { + "epoch": 2.16, + "grad_norm": 2.7325316666295563, + "learning_rate": 1.2667033011409712e-06, + "loss": 0.612, + "step": 14471 + }, + { + "epoch": 2.16, + "grad_norm": 4.060595743391856, + "learning_rate": 1.2666101912514596e-06, + "loss": 0.6647, + "step": 14472 + }, + { + "epoch": 2.16, + "grad_norm": 4.8604561183000845, + "learning_rate": 1.2665170788736514e-06, + "loss": 0.6484, + "step": 14473 + }, + { + "epoch": 2.16, + "grad_norm": 2.864300900504003, + "learning_rate": 1.266423964008416e-06, + "loss": 0.6348, + "step": 14474 + }, + { + "epoch": 2.16, + "grad_norm": 4.719892112545786, + "learning_rate": 1.266330846656622e-06, + "loss": 0.7077, + "step": 14475 + }, + { + "epoch": 2.16, + "grad_norm": 3.1684530794123753, + "learning_rate": 1.2662377268191392e-06, + "loss": 0.6146, + "step": 14476 + }, + { + "epoch": 2.16, + "grad_norm": 3.8116449940842534, + "learning_rate": 1.2661446044968353e-06, + "loss": 0.6471, + "step": 14477 + }, + { + "epoch": 2.16, + "grad_norm": 3.259382118786745, + "learning_rate": 1.2660514796905806e-06, + "loss": 0.6445, + "step": 14478 + }, + { + "epoch": 2.16, + "grad_norm": 3.9010711192691923, + "learning_rate": 1.2659583524012443e-06, + "loss": 0.64, + "step": 14479 + }, + { + "epoch": 2.16, + "grad_norm": 3.258239978402526, + "learning_rate": 1.265865222629695e-06, + "loss": 0.6986, + "step": 14480 + }, + { + "epoch": 2.16, + "grad_norm": 5.003017973722865, + "learning_rate": 1.2657720903768018e-06, + "loss": 0.6576, + "step": 14481 + }, + { + "epoch": 2.16, + "grad_norm": 3.016927435053214, + "learning_rate": 1.2656789556434344e-06, + "loss": 0.6159, + "step": 14482 + }, + { + "epoch": 2.16, + "grad_norm": 3.988906147768949, + "learning_rate": 1.2655858184304615e-06, + "loss": 0.6471, + "step": 14483 + }, + { + "epoch": 2.16, + "grad_norm": 3.7560144425257036, + "learning_rate": 1.265492678738753e-06, + "loss": 0.6458, + "step": 14484 + }, + { + "epoch": 2.16, + "grad_norm": 2.767478158866083, + "learning_rate": 1.2653995365691773e-06, + "loss": 0.7005, + "step": 14485 + }, + { + "epoch": 2.16, + "grad_norm": 2.3421273639230007, + "learning_rate": 1.265306391922605e-06, + "loss": 0.6686, + "step": 14486 + }, + { + "epoch": 2.16, + "grad_norm": 3.825319510339591, + "learning_rate": 1.2652132447999044e-06, + "loss": 0.6439, + "step": 14487 + }, + { + "epoch": 2.16, + "grad_norm": 4.879262210178548, + "learning_rate": 1.265120095201945e-06, + "loss": 0.6562, + "step": 14488 + }, + { + "epoch": 2.16, + "grad_norm": 3.221670088735159, + "learning_rate": 1.2650269431295966e-06, + "loss": 0.651, + "step": 14489 + }, + { + "epoch": 2.16, + "grad_norm": 2.1578355850904423, + "learning_rate": 1.2649337885837282e-06, + "loss": 0.696, + "step": 14490 + }, + { + "epoch": 2.16, + "grad_norm": 2.016580826400459, + "learning_rate": 1.2648406315652088e-06, + "loss": 0.6458, + "step": 14491 + }, + { + "epoch": 2.16, + "grad_norm": 3.8669520683008813, + "learning_rate": 1.2647474720749092e-06, + "loss": 0.6562, + "step": 14492 + }, + { + "epoch": 2.16, + "grad_norm": 4.287115737806639, + "learning_rate": 1.2646543101136974e-06, + "loss": 0.6458, + "step": 14493 + }, + { + "epoch": 2.16, + "grad_norm": 2.1810277826121123, + "learning_rate": 1.264561145682444e-06, + "loss": 0.6484, + "step": 14494 + }, + { + "epoch": 2.16, + "grad_norm": 2.1816191457572813, + "learning_rate": 1.2644679787820177e-06, + "loss": 0.6335, + "step": 14495 + }, + { + "epoch": 2.16, + "grad_norm": 2.0109490560717376, + "learning_rate": 1.2643748094132885e-06, + "loss": 0.651, + "step": 14496 + }, + { + "epoch": 2.16, + "grad_norm": 2.277472537921852, + "learning_rate": 1.2642816375771258e-06, + "loss": 0.6569, + "step": 14497 + }, + { + "epoch": 2.16, + "grad_norm": 4.120056008189543, + "learning_rate": 1.2641884632743992e-06, + "loss": 0.6374, + "step": 14498 + }, + { + "epoch": 2.16, + "grad_norm": 3.919581930650713, + "learning_rate": 1.2640952865059783e-06, + "loss": 0.6647, + "step": 14499 + }, + { + "epoch": 2.16, + "grad_norm": 2.2263323786125544, + "learning_rate": 1.2640021072727328e-06, + "loss": 0.6204, + "step": 14500 + }, + { + "epoch": 2.16, + "grad_norm": 2.284057434751612, + "learning_rate": 1.263908925575532e-06, + "loss": 0.6296, + "step": 14501 + }, + { + "epoch": 2.16, + "grad_norm": 2.478605838019839, + "learning_rate": 1.2638157414152462e-06, + "loss": 0.6764, + "step": 14502 + }, + { + "epoch": 2.16, + "grad_norm": 3.6398134916668994, + "learning_rate": 1.2637225547927446e-06, + "loss": 0.7031, + "step": 14503 + }, + { + "epoch": 2.16, + "grad_norm": 2.176241861548069, + "learning_rate": 1.2636293657088969e-06, + "loss": 0.6491, + "step": 14504 + }, + { + "epoch": 2.16, + "grad_norm": 2.356650420015605, + "learning_rate": 1.2635361741645732e-06, + "loss": 0.668, + "step": 14505 + }, + { + "epoch": 2.16, + "grad_norm": 1.9587402116109762, + "learning_rate": 1.2634429801606433e-06, + "loss": 0.6465, + "step": 14506 + }, + { + "epoch": 2.16, + "grad_norm": 2.6957132981359404, + "learning_rate": 1.2633497836979762e-06, + "loss": 0.6426, + "step": 14507 + }, + { + "epoch": 2.16, + "grad_norm": 2.142334001384778, + "learning_rate": 1.2632565847774427e-06, + "loss": 0.666, + "step": 14508 + }, + { + "epoch": 2.16, + "grad_norm": 3.2215271822612195, + "learning_rate": 1.2631633833999123e-06, + "loss": 0.6439, + "step": 14509 + }, + { + "epoch": 2.16, + "grad_norm": 5.177123675263066, + "learning_rate": 1.2630701795662546e-06, + "loss": 0.6543, + "step": 14510 + }, + { + "epoch": 2.16, + "grad_norm": 4.011321859478851, + "learning_rate": 1.2629769732773396e-06, + "loss": 0.6497, + "step": 14511 + }, + { + "epoch": 2.16, + "grad_norm": 5.353041814346228, + "learning_rate": 1.2628837645340373e-06, + "loss": 0.6458, + "step": 14512 + }, + { + "epoch": 2.16, + "grad_norm": 4.338584747943553, + "learning_rate": 1.262790553337218e-06, + "loss": 0.6589, + "step": 14513 + }, + { + "epoch": 2.16, + "grad_norm": 2.308227353754241, + "learning_rate": 1.2626973396877505e-06, + "loss": 0.6719, + "step": 14514 + }, + { + "epoch": 2.16, + "grad_norm": 2.273624704337359, + "learning_rate": 1.262604123586506e-06, + "loss": 0.6387, + "step": 14515 + }, + { + "epoch": 2.16, + "grad_norm": 2.6894128738156433, + "learning_rate": 1.2625109050343538e-06, + "loss": 0.679, + "step": 14516 + }, + { + "epoch": 2.17, + "grad_norm": 3.4721233907982905, + "learning_rate": 1.262417684032164e-06, + "loss": 0.6471, + "step": 14517 + }, + { + "epoch": 2.17, + "grad_norm": 2.112770796741894, + "learning_rate": 1.2623244605808069e-06, + "loss": 0.6452, + "step": 14518 + }, + { + "epoch": 2.17, + "grad_norm": 2.4730432613846065, + "learning_rate": 1.2622312346811525e-06, + "loss": 0.6341, + "step": 14519 + }, + { + "epoch": 2.17, + "grad_norm": 2.014010869772636, + "learning_rate": 1.2621380063340706e-06, + "loss": 0.623, + "step": 14520 + }, + { + "epoch": 2.17, + "grad_norm": 3.7473389318574934, + "learning_rate": 1.2620447755404316e-06, + "loss": 0.6497, + "step": 14521 + }, + { + "epoch": 2.17, + "grad_norm": 3.344040158254235, + "learning_rate": 1.2619515423011055e-06, + "loss": 0.6406, + "step": 14522 + }, + { + "epoch": 2.17, + "grad_norm": 2.2272084480232235, + "learning_rate": 1.2618583066169624e-06, + "loss": 0.6667, + "step": 14523 + }, + { + "epoch": 2.17, + "grad_norm": 2.7797863329992984, + "learning_rate": 1.2617650684888727e-06, + "loss": 0.6191, + "step": 14524 + }, + { + "epoch": 2.17, + "grad_norm": 3.515614713036336, + "learning_rate": 1.2616718279177065e-06, + "loss": 0.6549, + "step": 14525 + }, + { + "epoch": 2.17, + "grad_norm": 2.042485813110608, + "learning_rate": 1.2615785849043338e-06, + "loss": 0.6419, + "step": 14526 + }, + { + "epoch": 2.17, + "grad_norm": 4.0231458132105, + "learning_rate": 1.2614853394496252e-06, + "loss": 0.6569, + "step": 14527 + }, + { + "epoch": 2.17, + "grad_norm": 2.5157461206543923, + "learning_rate": 1.2613920915544507e-06, + "loss": 0.6348, + "step": 14528 + }, + { + "epoch": 2.17, + "grad_norm": 2.132689407819257, + "learning_rate": 1.261298841219681e-06, + "loss": 0.6536, + "step": 14529 + }, + { + "epoch": 2.17, + "grad_norm": 2.5693073545419587, + "learning_rate": 1.2612055884461857e-06, + "loss": 0.6706, + "step": 14530 + }, + { + "epoch": 2.17, + "grad_norm": 2.203635362786755, + "learning_rate": 1.261112333234836e-06, + "loss": 0.5983, + "step": 14531 + }, + { + "epoch": 2.17, + "grad_norm": 5.048477988186719, + "learning_rate": 1.2610190755865014e-06, + "loss": 0.6647, + "step": 14532 + }, + { + "epoch": 2.17, + "grad_norm": 3.076702619007778, + "learning_rate": 1.2609258155020525e-06, + "loss": 0.6543, + "step": 14533 + }, + { + "epoch": 2.17, + "grad_norm": 3.6323020329490063, + "learning_rate": 1.2608325529823604e-06, + "loss": 0.6478, + "step": 14534 + }, + { + "epoch": 2.17, + "grad_norm": 2.6078735406023945, + "learning_rate": 1.2607392880282948e-06, + "loss": 0.61, + "step": 14535 + }, + { + "epoch": 2.17, + "grad_norm": 2.518486780900882, + "learning_rate": 1.2606460206407263e-06, + "loss": 0.6322, + "step": 14536 + }, + { + "epoch": 2.17, + "grad_norm": 2.833706273283816, + "learning_rate": 1.2605527508205257e-06, + "loss": 0.6823, + "step": 14537 + }, + { + "epoch": 2.17, + "grad_norm": 3.6793307501800325, + "learning_rate": 1.260459478568563e-06, + "loss": 0.6146, + "step": 14538 + }, + { + "epoch": 2.17, + "grad_norm": 2.7222719319706234, + "learning_rate": 1.2603662038857088e-06, + "loss": 0.6335, + "step": 14539 + }, + { + "epoch": 2.17, + "grad_norm": 2.7545775865957487, + "learning_rate": 1.260272926772834e-06, + "loss": 0.6341, + "step": 14540 + }, + { + "epoch": 2.17, + "grad_norm": 2.8254422099017997, + "learning_rate": 1.2601796472308088e-06, + "loss": 0.6719, + "step": 14541 + }, + { + "epoch": 2.17, + "grad_norm": 3.8541435605704666, + "learning_rate": 1.2600863652605043e-06, + "loss": 0.6322, + "step": 14542 + }, + { + "epoch": 2.17, + "grad_norm": 3.29314151082272, + "learning_rate": 1.2599930808627906e-06, + "loss": 0.6243, + "step": 14543 + }, + { + "epoch": 2.17, + "grad_norm": 3.868447585531573, + "learning_rate": 1.2598997940385384e-06, + "loss": 0.6074, + "step": 14544 + }, + { + "epoch": 2.17, + "grad_norm": 2.8981291681427126, + "learning_rate": 1.2598065047886187e-06, + "loss": 0.6348, + "step": 14545 + }, + { + "epoch": 2.17, + "grad_norm": 3.269635996557185, + "learning_rate": 1.2597132131139013e-06, + "loss": 0.6543, + "step": 14546 + }, + { + "epoch": 2.17, + "grad_norm": 2.7142968082098373, + "learning_rate": 1.259619919015258e-06, + "loss": 0.6309, + "step": 14547 + }, + { + "epoch": 2.17, + "grad_norm": 2.759400511940406, + "learning_rate": 1.259526622493559e-06, + "loss": 0.6139, + "step": 14548 + }, + { + "epoch": 2.17, + "grad_norm": 3.503872018852784, + "learning_rate": 1.2594333235496749e-06, + "loss": 0.6224, + "step": 14549 + }, + { + "epoch": 2.17, + "grad_norm": 3.5712471672228663, + "learning_rate": 1.2593400221844769e-06, + "loss": 0.6764, + "step": 14550 + }, + { + "epoch": 2.17, + "grad_norm": 3.0182980354770077, + "learning_rate": 1.2592467183988352e-06, + "loss": 0.6628, + "step": 14551 + }, + { + "epoch": 2.17, + "grad_norm": 2.786350096329054, + "learning_rate": 1.259153412193621e-06, + "loss": 0.6615, + "step": 14552 + }, + { + "epoch": 2.17, + "grad_norm": 2.9798793062278355, + "learning_rate": 1.2590601035697054e-06, + "loss": 0.6641, + "step": 14553 + }, + { + "epoch": 2.17, + "grad_norm": 4.630609588919435, + "learning_rate": 1.2589667925279588e-06, + "loss": 0.6188, + "step": 14554 + }, + { + "epoch": 2.17, + "grad_norm": 3.308108597640222, + "learning_rate": 1.2588734790692521e-06, + "loss": 0.6094, + "step": 14555 + }, + { + "epoch": 2.17, + "grad_norm": 9.639249588744361, + "learning_rate": 1.2587801631944566e-06, + "loss": 0.6699, + "step": 14556 + }, + { + "epoch": 2.17, + "grad_norm": 2.964704889616016, + "learning_rate": 1.2586868449044426e-06, + "loss": 0.6686, + "step": 14557 + }, + { + "epoch": 2.17, + "grad_norm": 2.5794570891025215, + "learning_rate": 1.2585935242000818e-06, + "loss": 0.6432, + "step": 14558 + }, + { + "epoch": 2.17, + "grad_norm": 2.338536335358407, + "learning_rate": 1.2585002010822442e-06, + "loss": 0.6354, + "step": 14559 + }, + { + "epoch": 2.17, + "grad_norm": 3.0971322240679116, + "learning_rate": 1.2584068755518019e-06, + "loss": 0.6322, + "step": 14560 + }, + { + "epoch": 2.17, + "grad_norm": 3.3326740222087063, + "learning_rate": 1.258313547609625e-06, + "loss": 0.623, + "step": 14561 + }, + { + "epoch": 2.17, + "grad_norm": 4.845494048309921, + "learning_rate": 1.2582202172565852e-06, + "loss": 0.6419, + "step": 14562 + }, + { + "epoch": 2.17, + "grad_norm": 3.520682117302017, + "learning_rate": 1.2581268844935534e-06, + "loss": 0.6315, + "step": 14563 + }, + { + "epoch": 2.17, + "grad_norm": 2.7841404492849215, + "learning_rate": 1.2580335493214e-06, + "loss": 0.6452, + "step": 14564 + }, + { + "epoch": 2.17, + "grad_norm": 2.3936689346323496, + "learning_rate": 1.2579402117409973e-06, + "loss": 0.597, + "step": 14565 + }, + { + "epoch": 2.17, + "grad_norm": 5.794125518775692, + "learning_rate": 1.2578468717532153e-06, + "loss": 0.6458, + "step": 14566 + }, + { + "epoch": 2.17, + "grad_norm": 4.135329857233327, + "learning_rate": 1.257753529358926e-06, + "loss": 0.6582, + "step": 14567 + }, + { + "epoch": 2.17, + "grad_norm": 4.197817248713362, + "learning_rate": 1.257660184559e-06, + "loss": 0.6751, + "step": 14568 + }, + { + "epoch": 2.17, + "grad_norm": 2.7773563444805474, + "learning_rate": 1.2575668373543087e-06, + "loss": 0.6413, + "step": 14569 + }, + { + "epoch": 2.17, + "grad_norm": 5.319798076191076, + "learning_rate": 1.2574734877457235e-06, + "loss": 0.6484, + "step": 14570 + }, + { + "epoch": 2.17, + "grad_norm": 2.8569268724685863, + "learning_rate": 1.2573801357341153e-06, + "loss": 0.6068, + "step": 14571 + }, + { + "epoch": 2.17, + "grad_norm": 6.469928324341321, + "learning_rate": 1.2572867813203555e-06, + "loss": 0.6706, + "step": 14572 + }, + { + "epoch": 2.17, + "grad_norm": 2.7514497174934034, + "learning_rate": 1.2571934245053156e-06, + "loss": 0.6803, + "step": 14573 + }, + { + "epoch": 2.17, + "grad_norm": 2.8091171424595935, + "learning_rate": 1.2571000652898667e-06, + "loss": 0.6419, + "step": 14574 + }, + { + "epoch": 2.17, + "grad_norm": 2.9591351134065214, + "learning_rate": 1.2570067036748801e-06, + "loss": 0.6491, + "step": 14575 + }, + { + "epoch": 2.17, + "grad_norm": 2.8523186679441794, + "learning_rate": 1.256913339661227e-06, + "loss": 0.6413, + "step": 14576 + }, + { + "epoch": 2.17, + "grad_norm": 2.4223269863097134, + "learning_rate": 1.2568199732497793e-06, + "loss": 0.6549, + "step": 14577 + }, + { + "epoch": 2.17, + "grad_norm": 4.023907464062921, + "learning_rate": 1.256726604441408e-06, + "loss": 0.6693, + "step": 14578 + }, + { + "epoch": 2.17, + "grad_norm": 3.649868445734953, + "learning_rate": 1.2566332332369847e-06, + "loss": 0.6432, + "step": 14579 + }, + { + "epoch": 2.17, + "grad_norm": 2.7545168923880694, + "learning_rate": 1.2565398596373805e-06, + "loss": 0.6361, + "step": 14580 + }, + { + "epoch": 2.17, + "grad_norm": 2.911444894034895, + "learning_rate": 1.256446483643467e-06, + "loss": 0.6263, + "step": 14581 + }, + { + "epoch": 2.17, + "grad_norm": 2.371808384242374, + "learning_rate": 1.256353105256116e-06, + "loss": 0.6126, + "step": 14582 + }, + { + "epoch": 2.17, + "grad_norm": 3.9264271337501553, + "learning_rate": 1.2562597244761987e-06, + "loss": 0.6497, + "step": 14583 + }, + { + "epoch": 2.18, + "grad_norm": 2.5643461589246557, + "learning_rate": 1.2561663413045868e-06, + "loss": 0.651, + "step": 14584 + }, + { + "epoch": 2.18, + "grad_norm": 6.214547147731528, + "learning_rate": 1.2560729557421518e-06, + "loss": 0.6712, + "step": 14585 + }, + { + "epoch": 2.18, + "grad_norm": 2.4923566164699773, + "learning_rate": 1.2559795677897653e-06, + "loss": 0.6296, + "step": 14586 + }, + { + "epoch": 2.18, + "grad_norm": 3.2426427562513442, + "learning_rate": 1.255886177448299e-06, + "loss": 0.597, + "step": 14587 + }, + { + "epoch": 2.18, + "grad_norm": 2.856288166849874, + "learning_rate": 1.255792784718624e-06, + "loss": 0.6673, + "step": 14588 + }, + { + "epoch": 2.18, + "grad_norm": 3.9386484459066704, + "learning_rate": 1.2556993896016125e-06, + "loss": 0.6979, + "step": 14589 + }, + { + "epoch": 2.18, + "grad_norm": 3.2470756283027034, + "learning_rate": 1.2556059920981362e-06, + "loss": 0.6706, + "step": 14590 + }, + { + "epoch": 2.18, + "grad_norm": 3.8644965084792324, + "learning_rate": 1.2555125922090662e-06, + "loss": 0.6367, + "step": 14591 + }, + { + "epoch": 2.18, + "grad_norm": 3.5911271255615134, + "learning_rate": 1.2554191899352749e-06, + "loss": 0.6966, + "step": 14592 + }, + { + "epoch": 2.18, + "grad_norm": 2.3363668196658147, + "learning_rate": 1.2553257852776334e-06, + "loss": 0.6348, + "step": 14593 + }, + { + "epoch": 2.18, + "grad_norm": 2.431099418293222, + "learning_rate": 1.2552323782370136e-06, + "loss": 0.6419, + "step": 14594 + }, + { + "epoch": 2.18, + "grad_norm": 4.299832268004123, + "learning_rate": 1.255138968814288e-06, + "loss": 0.696, + "step": 14595 + }, + { + "epoch": 2.18, + "grad_norm": 3.3353919694392165, + "learning_rate": 1.2550455570103275e-06, + "loss": 0.6628, + "step": 14596 + }, + { + "epoch": 2.18, + "grad_norm": 2.3553417597803357, + "learning_rate": 1.2549521428260046e-06, + "loss": 0.638, + "step": 14597 + }, + { + "epoch": 2.18, + "grad_norm": 3.1853300194436414, + "learning_rate": 1.2548587262621906e-06, + "loss": 0.6536, + "step": 14598 + }, + { + "epoch": 2.18, + "grad_norm": 2.336300132271448, + "learning_rate": 1.2547653073197573e-06, + "loss": 0.6296, + "step": 14599 + }, + { + "epoch": 2.18, + "grad_norm": 5.880594938325326, + "learning_rate": 1.2546718859995772e-06, + "loss": 0.638, + "step": 14600 + }, + { + "epoch": 2.18, + "grad_norm": 3.41822312758735, + "learning_rate": 1.2545784623025217e-06, + "loss": 0.6328, + "step": 14601 + }, + { + "epoch": 2.18, + "grad_norm": 5.799964168484688, + "learning_rate": 1.2544850362294629e-06, + "loss": 0.6315, + "step": 14602 + }, + { + "epoch": 2.18, + "grad_norm": 2.7077027258564734, + "learning_rate": 1.2543916077812729e-06, + "loss": 0.6335, + "step": 14603 + }, + { + "epoch": 2.18, + "grad_norm": 2.843176456450894, + "learning_rate": 1.2542981769588232e-06, + "loss": 0.6211, + "step": 14604 + }, + { + "epoch": 2.18, + "grad_norm": 2.471131521938686, + "learning_rate": 1.2542047437629863e-06, + "loss": 0.6758, + "step": 14605 + }, + { + "epoch": 2.18, + "grad_norm": 2.4457284396174366, + "learning_rate": 1.2541113081946343e-06, + "loss": 0.627, + "step": 14606 + }, + { + "epoch": 2.18, + "grad_norm": 3.3165601666790057, + "learning_rate": 1.2540178702546386e-06, + "loss": 0.6074, + "step": 14607 + }, + { + "epoch": 2.18, + "grad_norm": 3.0723660844167715, + "learning_rate": 1.2539244299438723e-06, + "loss": 0.61, + "step": 14608 + }, + { + "epoch": 2.18, + "grad_norm": 3.1837114403688336, + "learning_rate": 1.253830987263206e-06, + "loss": 0.6289, + "step": 14609 + }, + { + "epoch": 2.18, + "grad_norm": 2.5349721048863096, + "learning_rate": 1.253737542213513e-06, + "loss": 0.6478, + "step": 14610 + }, + { + "epoch": 2.18, + "grad_norm": 2.692753150916202, + "learning_rate": 1.2536440947956655e-06, + "loss": 0.6315, + "step": 14611 + }, + { + "epoch": 2.18, + "grad_norm": 3.951354894136671, + "learning_rate": 1.2535506450105347e-06, + "loss": 0.6556, + "step": 14612 + }, + { + "epoch": 2.18, + "grad_norm": 5.165333250186095, + "learning_rate": 1.2534571928589934e-06, + "loss": 0.651, + "step": 14613 + }, + { + "epoch": 2.18, + "grad_norm": 3.7917696941613643, + "learning_rate": 1.2533637383419139e-06, + "loss": 0.6517, + "step": 14614 + }, + { + "epoch": 2.18, + "grad_norm": 2.4291150256909386, + "learning_rate": 1.2532702814601678e-06, + "loss": 0.6556, + "step": 14615 + }, + { + "epoch": 2.18, + "grad_norm": 3.352227834801144, + "learning_rate": 1.253176822214628e-06, + "loss": 0.6198, + "step": 14616 + }, + { + "epoch": 2.18, + "grad_norm": 4.528940403400527, + "learning_rate": 1.2530833606061665e-06, + "loss": 0.6908, + "step": 14617 + }, + { + "epoch": 2.18, + "grad_norm": 2.5074853009977005, + "learning_rate": 1.2529898966356558e-06, + "loss": 0.6602, + "step": 14618 + }, + { + "epoch": 2.18, + "grad_norm": 5.3951608355712635, + "learning_rate": 1.252896430303968e-06, + "loss": 0.6556, + "step": 14619 + }, + { + "epoch": 2.18, + "grad_norm": 2.388416381509622, + "learning_rate": 1.2528029616119753e-06, + "loss": 0.6087, + "step": 14620 + }, + { + "epoch": 2.18, + "grad_norm": 4.169312903696001, + "learning_rate": 1.25270949056055e-06, + "loss": 0.6536, + "step": 14621 + }, + { + "epoch": 2.18, + "grad_norm": 2.6056455556474476, + "learning_rate": 1.2526160171505652e-06, + "loss": 0.6576, + "step": 14622 + }, + { + "epoch": 2.18, + "grad_norm": 3.7127279150765924, + "learning_rate": 1.2525225413828924e-06, + "loss": 0.6263, + "step": 14623 + }, + { + "epoch": 2.18, + "grad_norm": 2.7436536521580352, + "learning_rate": 1.252429063258405e-06, + "loss": 0.6589, + "step": 14624 + }, + { + "epoch": 2.18, + "grad_norm": 6.689216017235354, + "learning_rate": 1.252335582777974e-06, + "loss": 0.6556, + "step": 14625 + }, + { + "epoch": 2.18, + "grad_norm": 2.229573516969056, + "learning_rate": 1.2522420999424728e-06, + "loss": 0.5905, + "step": 14626 + }, + { + "epoch": 2.18, + "grad_norm": 2.503612380415172, + "learning_rate": 1.252148614752774e-06, + "loss": 0.6185, + "step": 14627 + }, + { + "epoch": 2.18, + "grad_norm": 3.302786611302436, + "learning_rate": 1.2520551272097501e-06, + "loss": 0.6771, + "step": 14628 + }, + { + "epoch": 2.18, + "grad_norm": 2.7587563480510204, + "learning_rate": 1.2519616373142733e-06, + "loss": 0.6751, + "step": 14629 + }, + { + "epoch": 2.18, + "grad_norm": 4.136179110277189, + "learning_rate": 1.2518681450672159e-06, + "loss": 0.6315, + "step": 14630 + }, + { + "epoch": 2.18, + "grad_norm": 2.9278830816557075, + "learning_rate": 1.251774650469451e-06, + "loss": 0.6719, + "step": 14631 + }, + { + "epoch": 2.18, + "grad_norm": 4.937453408590218, + "learning_rate": 1.2516811535218512e-06, + "loss": 0.6569, + "step": 14632 + }, + { + "epoch": 2.18, + "grad_norm": 3.759597490278777, + "learning_rate": 1.2515876542252889e-06, + "loss": 0.627, + "step": 14633 + }, + { + "epoch": 2.18, + "grad_norm": 2.760331692006826, + "learning_rate": 1.2514941525806366e-06, + "loss": 0.6901, + "step": 14634 + }, + { + "epoch": 2.18, + "grad_norm": 2.2546604141825703, + "learning_rate": 1.2514006485887673e-06, + "loss": 0.6348, + "step": 14635 + }, + { + "epoch": 2.18, + "grad_norm": 3.451880368453681, + "learning_rate": 1.2513071422505533e-06, + "loss": 0.64, + "step": 14636 + }, + { + "epoch": 2.18, + "grad_norm": 3.5930482161919532, + "learning_rate": 1.2512136335668677e-06, + "loss": 0.6504, + "step": 14637 + }, + { + "epoch": 2.18, + "grad_norm": 2.4940940024511327, + "learning_rate": 1.251120122538583e-06, + "loss": 0.6621, + "step": 14638 + }, + { + "epoch": 2.18, + "grad_norm": 2.5297059910459776, + "learning_rate": 1.251026609166572e-06, + "loss": 0.6816, + "step": 14639 + }, + { + "epoch": 2.18, + "grad_norm": 3.11210716471469, + "learning_rate": 1.2509330934517075e-06, + "loss": 0.6296, + "step": 14640 + }, + { + "epoch": 2.18, + "grad_norm": 3.184086864698617, + "learning_rate": 1.2508395753948622e-06, + "loss": 0.6738, + "step": 14641 + }, + { + "epoch": 2.18, + "grad_norm": 2.221171097438364, + "learning_rate": 1.2507460549969086e-06, + "loss": 0.6589, + "step": 14642 + }, + { + "epoch": 2.18, + "grad_norm": 2.2854126569059936, + "learning_rate": 1.2506525322587204e-06, + "loss": 0.6302, + "step": 14643 + }, + { + "epoch": 2.18, + "grad_norm": 5.112814690562303, + "learning_rate": 1.25055900718117e-06, + "loss": 0.6719, + "step": 14644 + }, + { + "epoch": 2.18, + "grad_norm": 4.587917208557352, + "learning_rate": 1.25046547976513e-06, + "loss": 0.6634, + "step": 14645 + }, + { + "epoch": 2.18, + "grad_norm": 2.2226896628729786, + "learning_rate": 1.2503719500114733e-06, + "loss": 0.6107, + "step": 14646 + }, + { + "epoch": 2.18, + "grad_norm": 2.2572762076658996, + "learning_rate": 1.2502784179210734e-06, + "loss": 0.6393, + "step": 14647 + }, + { + "epoch": 2.18, + "grad_norm": 2.350974690717858, + "learning_rate": 1.250184883494803e-06, + "loss": 0.6582, + "step": 14648 + }, + { + "epoch": 2.18, + "grad_norm": 2.232343327159912, + "learning_rate": 1.2500913467335346e-06, + "loss": 0.6217, + "step": 14649 + }, + { + "epoch": 2.18, + "grad_norm": 2.9395367706616784, + "learning_rate": 1.2499978076381416e-06, + "loss": 0.6348, + "step": 14650 + }, + { + "epoch": 2.19, + "grad_norm": 2.564379565911669, + "learning_rate": 1.2499042662094972e-06, + "loss": 0.6335, + "step": 14651 + }, + { + "epoch": 2.19, + "grad_norm": 5.8167435505991785, + "learning_rate": 1.2498107224484739e-06, + "loss": 0.6621, + "step": 14652 + }, + { + "epoch": 2.19, + "grad_norm": 2.7776599445138364, + "learning_rate": 1.2497171763559454e-06, + "loss": 0.6296, + "step": 14653 + }, + { + "epoch": 2.19, + "grad_norm": 2.19837682001825, + "learning_rate": 1.2496236279327841e-06, + "loss": 0.6107, + "step": 14654 + }, + { + "epoch": 2.19, + "grad_norm": 5.107777715599288, + "learning_rate": 1.2495300771798634e-06, + "loss": 0.6549, + "step": 14655 + }, + { + "epoch": 2.19, + "grad_norm": 2.1503420432138523, + "learning_rate": 1.2494365240980565e-06, + "loss": 0.6523, + "step": 14656 + }, + { + "epoch": 2.19, + "grad_norm": 2.7356279536314925, + "learning_rate": 1.2493429686882366e-06, + "loss": 0.627, + "step": 14657 + }, + { + "epoch": 2.19, + "grad_norm": 3.6752695208027464, + "learning_rate": 1.2492494109512766e-06, + "loss": 0.7246, + "step": 14658 + }, + { + "epoch": 2.19, + "grad_norm": 2.359622986391555, + "learning_rate": 1.2491558508880494e-06, + "loss": 0.6139, + "step": 14659 + }, + { + "epoch": 2.19, + "grad_norm": 2.5634969375508208, + "learning_rate": 1.249062288499429e-06, + "loss": 0.6706, + "step": 14660 + }, + { + "epoch": 2.19, + "grad_norm": 2.202708167612195, + "learning_rate": 1.2489687237862888e-06, + "loss": 0.6178, + "step": 14661 + }, + { + "epoch": 2.19, + "grad_norm": 2.9225731863131816, + "learning_rate": 1.2488751567495006e-06, + "loss": 0.6419, + "step": 14662 + }, + { + "epoch": 2.19, + "grad_norm": 5.111798023384258, + "learning_rate": 1.248781587389939e-06, + "loss": 0.6458, + "step": 14663 + }, + { + "epoch": 2.19, + "grad_norm": 4.682422899978719, + "learning_rate": 1.2486880157084766e-06, + "loss": 0.6471, + "step": 14664 + }, + { + "epoch": 2.19, + "grad_norm": 3.7331953107953564, + "learning_rate": 1.248594441705987e-06, + "loss": 0.6413, + "step": 14665 + }, + { + "epoch": 2.19, + "grad_norm": 2.611901823531557, + "learning_rate": 1.2485008653833435e-06, + "loss": 0.6543, + "step": 14666 + }, + { + "epoch": 2.19, + "grad_norm": 3.21010753134833, + "learning_rate": 1.2484072867414197e-06, + "loss": 0.6621, + "step": 14667 + }, + { + "epoch": 2.19, + "grad_norm": 3.312758570666702, + "learning_rate": 1.2483137057810885e-06, + "loss": 0.651, + "step": 14668 + }, + { + "epoch": 2.19, + "grad_norm": 4.42522761890329, + "learning_rate": 1.2482201225032234e-06, + "loss": 0.638, + "step": 14669 + }, + { + "epoch": 2.19, + "grad_norm": 2.504476195483101, + "learning_rate": 1.2481265369086979e-06, + "loss": 0.6484, + "step": 14670 + }, + { + "epoch": 2.19, + "grad_norm": 3.4655470242208946, + "learning_rate": 1.248032948998386e-06, + "loss": 0.6595, + "step": 14671 + }, + { + "epoch": 2.19, + "grad_norm": 2.8734851270958095, + "learning_rate": 1.2479393587731604e-06, + "loss": 0.6009, + "step": 14672 + }, + { + "epoch": 2.19, + "grad_norm": 2.7139608098169594, + "learning_rate": 1.2478457662338945e-06, + "loss": 0.6751, + "step": 14673 + }, + { + "epoch": 2.19, + "grad_norm": 2.4269437901248905, + "learning_rate": 1.2477521713814627e-06, + "loss": 0.6322, + "step": 14674 + }, + { + "epoch": 2.19, + "grad_norm": 5.8722685893191215, + "learning_rate": 1.2476585742167375e-06, + "loss": 0.6361, + "step": 14675 + }, + { + "epoch": 2.19, + "grad_norm": 2.4324407314069107, + "learning_rate": 1.2475649747405932e-06, + "loss": 0.6569, + "step": 14676 + }, + { + "epoch": 2.19, + "grad_norm": 4.066282701007285, + "learning_rate": 1.2474713729539033e-06, + "loss": 0.6178, + "step": 14677 + }, + { + "epoch": 2.19, + "grad_norm": 5.998330758483331, + "learning_rate": 1.2473777688575405e-06, + "loss": 0.6504, + "step": 14678 + }, + { + "epoch": 2.19, + "grad_norm": 3.8541500152241435, + "learning_rate": 1.2472841624523798e-06, + "loss": 0.6393, + "step": 14679 + }, + { + "epoch": 2.19, + "grad_norm": 3.0913355504190227, + "learning_rate": 1.2471905537392938e-06, + "loss": 0.6478, + "step": 14680 + }, + { + "epoch": 2.19, + "grad_norm": 2.3814173242758225, + "learning_rate": 1.2470969427191564e-06, + "loss": 0.6296, + "step": 14681 + }, + { + "epoch": 2.19, + "grad_norm": 2.902816549233596, + "learning_rate": 1.2470033293928418e-06, + "loss": 0.6367, + "step": 14682 + }, + { + "epoch": 2.19, + "grad_norm": 4.803616533084308, + "learning_rate": 1.2469097137612228e-06, + "loss": 0.6204, + "step": 14683 + }, + { + "epoch": 2.19, + "grad_norm": 5.379392815147804, + "learning_rate": 1.246816095825174e-06, + "loss": 0.6803, + "step": 14684 + }, + { + "epoch": 2.19, + "grad_norm": 4.1041559695454435, + "learning_rate": 1.2467224755855684e-06, + "loss": 0.6426, + "step": 14685 + }, + { + "epoch": 2.19, + "grad_norm": 3.816739285757766, + "learning_rate": 1.2466288530432802e-06, + "loss": 0.6608, + "step": 14686 + }, + { + "epoch": 2.19, + "grad_norm": 2.4535542451272145, + "learning_rate": 1.2465352281991835e-06, + "loss": 0.6217, + "step": 14687 + }, + { + "epoch": 2.19, + "grad_norm": 2.849249757859827, + "learning_rate": 1.2464416010541513e-06, + "loss": 0.6536, + "step": 14688 + }, + { + "epoch": 2.19, + "grad_norm": 2.520691598253275, + "learning_rate": 1.2463479716090579e-06, + "loss": 0.6178, + "step": 14689 + }, + { + "epoch": 2.19, + "grad_norm": 3.321419839887923, + "learning_rate": 1.2462543398647773e-06, + "loss": 0.6803, + "step": 14690 + }, + { + "epoch": 2.19, + "grad_norm": 2.3988052993939655, + "learning_rate": 1.246160705822183e-06, + "loss": 0.6686, + "step": 14691 + }, + { + "epoch": 2.19, + "grad_norm": 2.7877558379746143, + "learning_rate": 1.2460670694821488e-06, + "loss": 0.6647, + "step": 14692 + }, + { + "epoch": 2.19, + "grad_norm": 2.2743605971954066, + "learning_rate": 1.2459734308455494e-06, + "loss": 0.6361, + "step": 14693 + }, + { + "epoch": 2.19, + "grad_norm": 2.633594533935641, + "learning_rate": 1.245879789913258e-06, + "loss": 0.6237, + "step": 14694 + }, + { + "epoch": 2.19, + "grad_norm": 3.736365215424283, + "learning_rate": 1.2457861466861487e-06, + "loss": 0.6426, + "step": 14695 + }, + { + "epoch": 2.19, + "grad_norm": 2.4290630649729503, + "learning_rate": 1.2456925011650956e-06, + "loss": 0.6139, + "step": 14696 + }, + { + "epoch": 2.19, + "grad_norm": 3.0708634298294792, + "learning_rate": 1.2455988533509727e-06, + "loss": 0.6432, + "step": 14697 + }, + { + "epoch": 2.19, + "grad_norm": 3.6313574720246846, + "learning_rate": 1.245505203244654e-06, + "loss": 0.6146, + "step": 14698 + }, + { + "epoch": 2.19, + "grad_norm": 2.8367838900048667, + "learning_rate": 1.2454115508470134e-06, + "loss": 0.6224, + "step": 14699 + }, + { + "epoch": 2.19, + "grad_norm": 4.554370736361144, + "learning_rate": 1.2453178961589253e-06, + "loss": 0.6426, + "step": 14700 + }, + { + "epoch": 2.19, + "grad_norm": 2.457866552882189, + "learning_rate": 1.2452242391812635e-06, + "loss": 0.6439, + "step": 14701 + }, + { + "epoch": 2.19, + "grad_norm": 2.3365222074714382, + "learning_rate": 1.245130579914902e-06, + "loss": 0.6478, + "step": 14702 + }, + { + "epoch": 2.19, + "grad_norm": 2.9262336354177596, + "learning_rate": 1.2450369183607151e-06, + "loss": 0.6523, + "step": 14703 + }, + { + "epoch": 2.19, + "grad_norm": 3.8864998287619086, + "learning_rate": 1.2449432545195776e-06, + "loss": 0.6504, + "step": 14704 + }, + { + "epoch": 2.19, + "grad_norm": 2.4217379021163166, + "learning_rate": 1.2448495883923622e-06, + "loss": 0.6449, + "step": 14705 + }, + { + "epoch": 2.19, + "grad_norm": 2.7652360895415726, + "learning_rate": 1.2447559199799445e-06, + "loss": 0.6908, + "step": 14706 + }, + { + "epoch": 2.19, + "grad_norm": 4.389021390756673, + "learning_rate": 1.244662249283198e-06, + "loss": 0.6413, + "step": 14707 + }, + { + "epoch": 2.19, + "grad_norm": 4.794135913927578, + "learning_rate": 1.2445685763029969e-06, + "loss": 0.625, + "step": 14708 + }, + { + "epoch": 2.19, + "grad_norm": 3.573859606662216, + "learning_rate": 1.2444749010402159e-06, + "loss": 0.6133, + "step": 14709 + }, + { + "epoch": 2.19, + "grad_norm": 2.780435328625068, + "learning_rate": 1.2443812234957289e-06, + "loss": 0.6608, + "step": 14710 + }, + { + "epoch": 2.19, + "grad_norm": 2.8849967158301344, + "learning_rate": 1.2442875436704106e-06, + "loss": 0.6139, + "step": 14711 + }, + { + "epoch": 2.19, + "grad_norm": 4.4455759558085495, + "learning_rate": 1.2441938615651346e-06, + "loss": 0.6283, + "step": 14712 + }, + { + "epoch": 2.19, + "grad_norm": 3.510391507132702, + "learning_rate": 1.2441001771807757e-06, + "loss": 0.6771, + "step": 14713 + }, + { + "epoch": 2.19, + "grad_norm": 3.3365341892730354, + "learning_rate": 1.2440064905182087e-06, + "loss": 0.5885, + "step": 14714 + }, + { + "epoch": 2.19, + "grad_norm": 3.458591662256535, + "learning_rate": 1.2439128015783073e-06, + "loss": 0.6621, + "step": 14715 + }, + { + "epoch": 2.19, + "grad_norm": 2.8166402315542842, + "learning_rate": 1.2438191103619462e-06, + "loss": 0.6589, + "step": 14716 + }, + { + "epoch": 2.19, + "grad_norm": 2.2878889102735798, + "learning_rate": 1.2437254168699997e-06, + "loss": 0.6315, + "step": 14717 + }, + { + "epoch": 2.2, + "grad_norm": 2.9377541099462547, + "learning_rate": 1.2436317211033425e-06, + "loss": 0.6374, + "step": 14718 + }, + { + "epoch": 2.2, + "grad_norm": 3.4894959958351017, + "learning_rate": 1.2435380230628488e-06, + "loss": 0.6628, + "step": 14719 + }, + { + "epoch": 2.2, + "grad_norm": 2.8847467939333895, + "learning_rate": 1.2434443227493932e-06, + "loss": 0.6048, + "step": 14720 + }, + { + "epoch": 2.2, + "grad_norm": 2.6043526188821375, + "learning_rate": 1.2433506201638499e-06, + "loss": 0.6361, + "step": 14721 + }, + { + "epoch": 2.2, + "grad_norm": 5.6777449735935, + "learning_rate": 1.2432569153070942e-06, + "loss": 0.6426, + "step": 14722 + }, + { + "epoch": 2.2, + "grad_norm": 3.709302886595163, + "learning_rate": 1.2431632081800002e-06, + "loss": 0.6012, + "step": 14723 + }, + { + "epoch": 2.2, + "grad_norm": 3.452569445101003, + "learning_rate": 1.2430694987834423e-06, + "loss": 0.6589, + "step": 14724 + }, + { + "epoch": 2.2, + "grad_norm": 3.487191092441426, + "learning_rate": 1.242975787118295e-06, + "loss": 0.6562, + "step": 14725 + }, + { + "epoch": 2.2, + "grad_norm": 3.377635671123433, + "learning_rate": 1.2428820731854334e-06, + "loss": 0.6061, + "step": 14726 + }, + { + "epoch": 2.2, + "grad_norm": 6.588884428865809, + "learning_rate": 1.2427883569857324e-06, + "loss": 0.666, + "step": 14727 + }, + { + "epoch": 2.2, + "grad_norm": 2.8390366686781627, + "learning_rate": 1.2426946385200655e-06, + "loss": 0.6751, + "step": 14728 + }, + { + "epoch": 2.2, + "grad_norm": 5.332145768957236, + "learning_rate": 1.2426009177893085e-06, + "loss": 0.6302, + "step": 14729 + }, + { + "epoch": 2.2, + "grad_norm": 3.3588595165747144, + "learning_rate": 1.2425071947943354e-06, + "loss": 0.666, + "step": 14730 + }, + { + "epoch": 2.2, + "grad_norm": 3.5884214924807982, + "learning_rate": 1.2424134695360213e-06, + "loss": 0.638, + "step": 14731 + }, + { + "epoch": 2.2, + "grad_norm": 3.3232764389162304, + "learning_rate": 1.2423197420152409e-06, + "loss": 0.6042, + "step": 14732 + }, + { + "epoch": 2.2, + "grad_norm": 2.743762453960156, + "learning_rate": 1.2422260122328691e-06, + "loss": 0.6009, + "step": 14733 + }, + { + "epoch": 2.2, + "grad_norm": 3.1106063811786497, + "learning_rate": 1.2421322801897802e-06, + "loss": 0.6986, + "step": 14734 + }, + { + "epoch": 2.2, + "grad_norm": 4.299120801009346, + "learning_rate": 1.2420385458868495e-06, + "loss": 0.6452, + "step": 14735 + }, + { + "epoch": 2.2, + "grad_norm": 3.5863045082107727, + "learning_rate": 1.2419448093249514e-06, + "loss": 0.6504, + "step": 14736 + }, + { + "epoch": 2.2, + "grad_norm": 3.072587279682176, + "learning_rate": 1.2418510705049611e-06, + "loss": 0.6562, + "step": 14737 + }, + { + "epoch": 2.2, + "grad_norm": 4.604733425006412, + "learning_rate": 1.2417573294277538e-06, + "loss": 0.6367, + "step": 14738 + }, + { + "epoch": 2.2, + "grad_norm": 4.885852368397556, + "learning_rate": 1.2416635860942034e-06, + "loss": 0.6914, + "step": 14739 + }, + { + "epoch": 2.2, + "grad_norm": 6.159893497211562, + "learning_rate": 1.2415698405051857e-06, + "loss": 0.6296, + "step": 14740 + }, + { + "epoch": 2.2, + "grad_norm": 3.013986188465755, + "learning_rate": 1.2414760926615753e-06, + "loss": 0.5918, + "step": 14741 + }, + { + "epoch": 2.2, + "grad_norm": 3.2156240240649443, + "learning_rate": 1.241382342564247e-06, + "loss": 0.6803, + "step": 14742 + }, + { + "epoch": 2.2, + "grad_norm": 2.472248953810297, + "learning_rate": 1.2412885902140761e-06, + "loss": 0.6003, + "step": 14743 + }, + { + "epoch": 2.2, + "grad_norm": 3.103928229333382, + "learning_rate": 1.2411948356119373e-06, + "loss": 0.597, + "step": 14744 + }, + { + "epoch": 2.2, + "grad_norm": 2.847831338147369, + "learning_rate": 1.241101078758706e-06, + "loss": 0.6543, + "step": 14745 + }, + { + "epoch": 2.2, + "grad_norm": 2.5385649213528834, + "learning_rate": 1.2410073196552567e-06, + "loss": 0.6315, + "step": 14746 + }, + { + "epoch": 2.2, + "grad_norm": 2.9784197805706163, + "learning_rate": 1.240913558302465e-06, + "loss": 0.6504, + "step": 14747 + }, + { + "epoch": 2.2, + "grad_norm": 3.2580358602483765, + "learning_rate": 1.2408197947012057e-06, + "loss": 0.6634, + "step": 14748 + }, + { + "epoch": 2.2, + "grad_norm": 4.956629512043636, + "learning_rate": 1.2407260288523538e-06, + "loss": 0.6712, + "step": 14749 + }, + { + "epoch": 2.2, + "grad_norm": 2.689894718185501, + "learning_rate": 1.2406322607567847e-06, + "loss": 0.6263, + "step": 14750 + }, + { + "epoch": 2.2, + "grad_norm": 3.0008793302385453, + "learning_rate": 1.2405384904153734e-06, + "loss": 0.6979, + "step": 14751 + }, + { + "epoch": 2.2, + "grad_norm": 2.979517997308809, + "learning_rate": 1.240444717828995e-06, + "loss": 0.6602, + "step": 14752 + }, + { + "epoch": 2.2, + "grad_norm": 2.740260573350636, + "learning_rate": 1.2403509429985254e-06, + "loss": 0.653, + "step": 14753 + }, + { + "epoch": 2.2, + "grad_norm": 2.6439374793196917, + "learning_rate": 1.2402571659248386e-06, + "loss": 0.61, + "step": 14754 + }, + { + "epoch": 2.2, + "grad_norm": 2.4531797578617613, + "learning_rate": 1.2401633866088104e-06, + "loss": 0.6361, + "step": 14755 + }, + { + "epoch": 2.2, + "grad_norm": 2.787904358794559, + "learning_rate": 1.2400696050513164e-06, + "loss": 0.6198, + "step": 14756 + }, + { + "epoch": 2.2, + "grad_norm": 4.974870520465567, + "learning_rate": 1.2399758212532314e-06, + "loss": 0.6953, + "step": 14757 + }, + { + "epoch": 2.2, + "grad_norm": 6.109050023111121, + "learning_rate": 1.2398820352154307e-06, + "loss": 0.6569, + "step": 14758 + }, + { + "epoch": 2.2, + "grad_norm": 2.6289292609840293, + "learning_rate": 1.23978824693879e-06, + "loss": 0.6341, + "step": 14759 + }, + { + "epoch": 2.2, + "grad_norm": 2.451226012434306, + "learning_rate": 1.2396944564241843e-06, + "loss": 0.6432, + "step": 14760 + }, + { + "epoch": 2.2, + "grad_norm": 3.092808442103815, + "learning_rate": 1.239600663672489e-06, + "loss": 0.6576, + "step": 14761 + }, + { + "epoch": 2.2, + "grad_norm": 2.6135795195356546, + "learning_rate": 1.2395068686845793e-06, + "loss": 0.6367, + "step": 14762 + }, + { + "epoch": 2.2, + "grad_norm": 2.662308994868519, + "learning_rate": 1.239413071461331e-06, + "loss": 0.6445, + "step": 14763 + }, + { + "epoch": 2.2, + "grad_norm": 2.961363668558205, + "learning_rate": 1.2393192720036195e-06, + "loss": 0.6191, + "step": 14764 + }, + { + "epoch": 2.2, + "grad_norm": 2.5838260744026327, + "learning_rate": 1.2392254703123199e-06, + "loss": 0.6165, + "step": 14765 + }, + { + "epoch": 2.2, + "grad_norm": 2.882577160194331, + "learning_rate": 1.2391316663883079e-06, + "loss": 0.6628, + "step": 14766 + }, + { + "epoch": 2.2, + "grad_norm": 2.976856289688177, + "learning_rate": 1.239037860232459e-06, + "loss": 0.6074, + "step": 14767 + }, + { + "epoch": 2.2, + "grad_norm": 3.076721820937446, + "learning_rate": 1.2389440518456482e-06, + "loss": 0.6725, + "step": 14768 + }, + { + "epoch": 2.2, + "grad_norm": 2.593635464313873, + "learning_rate": 1.2388502412287518e-06, + "loss": 0.6348, + "step": 14769 + }, + { + "epoch": 2.2, + "grad_norm": 2.478413813947112, + "learning_rate": 1.238756428382645e-06, + "loss": 0.6745, + "step": 14770 + }, + { + "epoch": 2.2, + "grad_norm": 5.461980124696165, + "learning_rate": 1.238662613308203e-06, + "loss": 0.6361, + "step": 14771 + }, + { + "epoch": 2.2, + "grad_norm": 2.4719066405254524, + "learning_rate": 1.2385687960063021e-06, + "loss": 0.6745, + "step": 14772 + }, + { + "epoch": 2.2, + "grad_norm": 3.0188974094749694, + "learning_rate": 1.2384749764778174e-06, + "loss": 0.6302, + "step": 14773 + }, + { + "epoch": 2.2, + "grad_norm": 2.4706611818783886, + "learning_rate": 1.2383811547236244e-06, + "loss": 0.6393, + "step": 14774 + }, + { + "epoch": 2.2, + "grad_norm": 3.68575083286521, + "learning_rate": 1.2382873307445994e-06, + "loss": 0.6081, + "step": 14775 + }, + { + "epoch": 2.2, + "grad_norm": 3.0487996224705243, + "learning_rate": 1.2381935045416176e-06, + "loss": 0.6732, + "step": 14776 + }, + { + "epoch": 2.2, + "grad_norm": 2.658606452413135, + "learning_rate": 1.2380996761155547e-06, + "loss": 0.6413, + "step": 14777 + }, + { + "epoch": 2.2, + "grad_norm": 2.7820217284006232, + "learning_rate": 1.2380058454672862e-06, + "loss": 0.6458, + "step": 14778 + }, + { + "epoch": 2.2, + "grad_norm": 3.594375981731638, + "learning_rate": 1.2379120125976883e-06, + "loss": 0.6289, + "step": 14779 + }, + { + "epoch": 2.2, + "grad_norm": 2.3407505565120017, + "learning_rate": 1.2378181775076368e-06, + "loss": 0.6445, + "step": 14780 + }, + { + "epoch": 2.2, + "grad_norm": 6.726943744047224, + "learning_rate": 1.2377243401980069e-06, + "loss": 0.6901, + "step": 14781 + }, + { + "epoch": 2.2, + "grad_norm": 2.960526394337498, + "learning_rate": 1.2376305006696747e-06, + "loss": 0.6257, + "step": 14782 + }, + { + "epoch": 2.2, + "grad_norm": 2.7339232434185767, + "learning_rate": 1.2375366589235164e-06, + "loss": 0.6224, + "step": 14783 + }, + { + "epoch": 2.2, + "grad_norm": 2.413240897083263, + "learning_rate": 1.237442814960407e-06, + "loss": 0.6393, + "step": 14784 + }, + { + "epoch": 2.21, + "grad_norm": 2.7207783689327183, + "learning_rate": 1.2373489687812231e-06, + "loss": 0.6152, + "step": 14785 + }, + { + "epoch": 2.21, + "grad_norm": 3.3665250753538642, + "learning_rate": 1.2372551203868402e-06, + "loss": 0.6328, + "step": 14786 + }, + { + "epoch": 2.21, + "grad_norm": 2.8568302509737236, + "learning_rate": 1.237161269778134e-06, + "loss": 0.6257, + "step": 14787 + }, + { + "epoch": 2.21, + "grad_norm": 2.4463838250896246, + "learning_rate": 1.237067416955981e-06, + "loss": 0.6374, + "step": 14788 + }, + { + "epoch": 2.21, + "grad_norm": 4.208359644764564, + "learning_rate": 1.2369735619212566e-06, + "loss": 0.6686, + "step": 14789 + }, + { + "epoch": 2.21, + "grad_norm": 5.996356960015916, + "learning_rate": 1.2368797046748375e-06, + "loss": 0.6771, + "step": 14790 + }, + { + "epoch": 2.21, + "grad_norm": 4.823979717800155, + "learning_rate": 1.2367858452175987e-06, + "loss": 0.627, + "step": 14791 + }, + { + "epoch": 2.21, + "grad_norm": 2.4938357800831237, + "learning_rate": 1.2366919835504166e-06, + "loss": 0.6283, + "step": 14792 + }, + { + "epoch": 2.21, + "grad_norm": 7.030114281023856, + "learning_rate": 1.2365981196741677e-06, + "loss": 0.6471, + "step": 14793 + }, + { + "epoch": 2.21, + "grad_norm": 2.736929676538827, + "learning_rate": 1.2365042535897274e-06, + "loss": 0.6634, + "step": 14794 + }, + { + "epoch": 2.21, + "grad_norm": 3.7351079879295814, + "learning_rate": 1.236410385297972e-06, + "loss": 0.638, + "step": 14795 + }, + { + "epoch": 2.21, + "grad_norm": 2.900997154367461, + "learning_rate": 1.2363165147997775e-06, + "loss": 0.6536, + "step": 14796 + }, + { + "epoch": 2.21, + "grad_norm": 4.31783540552272, + "learning_rate": 1.2362226420960202e-06, + "loss": 0.6393, + "step": 14797 + }, + { + "epoch": 2.21, + "grad_norm": 3.2111053582699056, + "learning_rate": 1.236128767187576e-06, + "loss": 0.6289, + "step": 14798 + }, + { + "epoch": 2.21, + "grad_norm": 2.582089960481317, + "learning_rate": 1.2360348900753212e-06, + "loss": 0.6576, + "step": 14799 + }, + { + "epoch": 2.21, + "grad_norm": 3.5291896561414458, + "learning_rate": 1.2359410107601318e-06, + "loss": 0.64, + "step": 14800 + }, + { + "epoch": 2.21, + "grad_norm": 2.4526284951376214, + "learning_rate": 1.2358471292428842e-06, + "loss": 0.5846, + "step": 14801 + }, + { + "epoch": 2.21, + "grad_norm": 3.072752847299142, + "learning_rate": 1.2357532455244542e-06, + "loss": 0.6012, + "step": 14802 + }, + { + "epoch": 2.21, + "grad_norm": 2.6557484217857406, + "learning_rate": 1.2356593596057185e-06, + "loss": 0.651, + "step": 14803 + }, + { + "epoch": 2.21, + "grad_norm": 2.3271912755740196, + "learning_rate": 1.2355654714875535e-06, + "loss": 0.6198, + "step": 14804 + }, + { + "epoch": 2.21, + "grad_norm": 3.6936992015699435, + "learning_rate": 1.2354715811708345e-06, + "loss": 0.6895, + "step": 14805 + }, + { + "epoch": 2.21, + "grad_norm": 2.877768840597768, + "learning_rate": 1.2353776886564391e-06, + "loss": 0.6283, + "step": 14806 + }, + { + "epoch": 2.21, + "grad_norm": 4.541046510028128, + "learning_rate": 1.2352837939452422e-06, + "loss": 0.6406, + "step": 14807 + }, + { + "epoch": 2.21, + "grad_norm": 2.622660629356291, + "learning_rate": 1.235189897038121e-06, + "loss": 0.6641, + "step": 14808 + }, + { + "epoch": 2.21, + "grad_norm": 3.841491098654051, + "learning_rate": 1.2350959979359524e-06, + "loss": 0.6654, + "step": 14809 + }, + { + "epoch": 2.21, + "grad_norm": 3.693098633278268, + "learning_rate": 1.2350020966396113e-06, + "loss": 0.6615, + "step": 14810 + }, + { + "epoch": 2.21, + "grad_norm": 2.823927934351698, + "learning_rate": 1.2349081931499752e-06, + "loss": 0.6068, + "step": 14811 + }, + { + "epoch": 2.21, + "grad_norm": 2.562723309788467, + "learning_rate": 1.23481428746792e-06, + "loss": 0.6237, + "step": 14812 + }, + { + "epoch": 2.21, + "grad_norm": 3.605832299036598, + "learning_rate": 1.234720379594322e-06, + "loss": 0.6322, + "step": 14813 + }, + { + "epoch": 2.21, + "grad_norm": 2.5461699826522763, + "learning_rate": 1.2346264695300586e-06, + "loss": 0.6348, + "step": 14814 + }, + { + "epoch": 2.21, + "grad_norm": 3.5813146651440033, + "learning_rate": 1.234532557276005e-06, + "loss": 0.6504, + "step": 14815 + }, + { + "epoch": 2.21, + "grad_norm": 3.0826744084380855, + "learning_rate": 1.2344386428330387e-06, + "loss": 0.6458, + "step": 14816 + }, + { + "epoch": 2.21, + "grad_norm": 2.7731174058009977, + "learning_rate": 1.2343447262020358e-06, + "loss": 0.582, + "step": 14817 + }, + { + "epoch": 2.21, + "grad_norm": 2.7819992280576855, + "learning_rate": 1.2342508073838727e-06, + "loss": 0.651, + "step": 14818 + }, + { + "epoch": 2.21, + "grad_norm": 3.013918809893278, + "learning_rate": 1.2341568863794261e-06, + "loss": 0.6204, + "step": 14819 + }, + { + "epoch": 2.21, + "grad_norm": 4.738801467432079, + "learning_rate": 1.2340629631895727e-06, + "loss": 0.653, + "step": 14820 + }, + { + "epoch": 2.21, + "grad_norm": 3.308449070283841, + "learning_rate": 1.2339690378151887e-06, + "loss": 0.6243, + "step": 14821 + }, + { + "epoch": 2.21, + "grad_norm": 3.6624027273223665, + "learning_rate": 1.2338751102571512e-06, + "loss": 0.627, + "step": 14822 + }, + { + "epoch": 2.21, + "grad_norm": 4.519826510986403, + "learning_rate": 1.2337811805163367e-06, + "loss": 0.5996, + "step": 14823 + }, + { + "epoch": 2.21, + "grad_norm": 3.1054751957245403, + "learning_rate": 1.2336872485936213e-06, + "loss": 0.6283, + "step": 14824 + }, + { + "epoch": 2.21, + "grad_norm": 4.753850603055007, + "learning_rate": 1.2335933144898826e-06, + "loss": 0.6439, + "step": 14825 + }, + { + "epoch": 2.21, + "grad_norm": 3.164202757722794, + "learning_rate": 1.2334993782059967e-06, + "loss": 0.6966, + "step": 14826 + }, + { + "epoch": 2.21, + "grad_norm": 3.981315660371182, + "learning_rate": 1.2334054397428408e-06, + "loss": 0.6374, + "step": 14827 + }, + { + "epoch": 2.21, + "grad_norm": 3.2097403360572856, + "learning_rate": 1.2333114991012907e-06, + "loss": 0.6719, + "step": 14828 + }, + { + "epoch": 2.21, + "grad_norm": 4.292577672159467, + "learning_rate": 1.233217556282224e-06, + "loss": 0.6484, + "step": 14829 + }, + { + "epoch": 2.21, + "grad_norm": 2.6494334091228446, + "learning_rate": 1.2331236112865172e-06, + "loss": 0.5924, + "step": 14830 + }, + { + "epoch": 2.21, + "grad_norm": 4.308286544939772, + "learning_rate": 1.2330296641150473e-06, + "loss": 0.6491, + "step": 14831 + }, + { + "epoch": 2.21, + "grad_norm": 4.106058137482465, + "learning_rate": 1.2329357147686907e-06, + "loss": 0.5905, + "step": 14832 + }, + { + "epoch": 2.21, + "grad_norm": 3.01613127263927, + "learning_rate": 1.2328417632483247e-06, + "loss": 0.6406, + "step": 14833 + }, + { + "epoch": 2.21, + "grad_norm": 2.8023610577514018, + "learning_rate": 1.2327478095548256e-06, + "loss": 0.6419, + "step": 14834 + }, + { + "epoch": 2.21, + "grad_norm": 3.144593604731493, + "learning_rate": 1.232653853689071e-06, + "loss": 0.6764, + "step": 14835 + }, + { + "epoch": 2.21, + "grad_norm": 3.5922845387629248, + "learning_rate": 1.2325598956519373e-06, + "loss": 0.6152, + "step": 14836 + }, + { + "epoch": 2.21, + "grad_norm": 2.9750722736724797, + "learning_rate": 1.2324659354443012e-06, + "loss": 0.6693, + "step": 14837 + }, + { + "epoch": 2.21, + "grad_norm": 2.9916342504003155, + "learning_rate": 1.2323719730670402e-06, + "loss": 0.6504, + "step": 14838 + }, + { + "epoch": 2.21, + "grad_norm": 3.6166650345255507, + "learning_rate": 1.2322780085210312e-06, + "loss": 0.6198, + "step": 14839 + }, + { + "epoch": 2.21, + "grad_norm": 2.7277831528119125, + "learning_rate": 1.2321840418071507e-06, + "loss": 0.6803, + "step": 14840 + }, + { + "epoch": 2.21, + "grad_norm": 3.0697596794634454, + "learning_rate": 1.2320900729262763e-06, + "loss": 0.64, + "step": 14841 + }, + { + "epoch": 2.21, + "grad_norm": 4.265729850087329, + "learning_rate": 1.2319961018792843e-06, + "loss": 0.6478, + "step": 14842 + }, + { + "epoch": 2.21, + "grad_norm": 2.7430929320664013, + "learning_rate": 1.2319021286670525e-06, + "loss": 0.6582, + "step": 14843 + }, + { + "epoch": 2.21, + "grad_norm": 2.9651861325575575, + "learning_rate": 1.2318081532904576e-06, + "loss": 0.6751, + "step": 14844 + }, + { + "epoch": 2.21, + "grad_norm": 4.430620740314163, + "learning_rate": 1.2317141757503764e-06, + "loss": 0.6439, + "step": 14845 + }, + { + "epoch": 2.21, + "grad_norm": 3.3819521406122304, + "learning_rate": 1.2316201960476869e-06, + "loss": 0.666, + "step": 14846 + }, + { + "epoch": 2.21, + "grad_norm": 4.090212104553938, + "learning_rate": 1.231526214183265e-06, + "loss": 0.6921, + "step": 14847 + }, + { + "epoch": 2.21, + "grad_norm": 4.599457884910559, + "learning_rate": 1.231432230157989e-06, + "loss": 0.638, + "step": 14848 + }, + { + "epoch": 2.21, + "grad_norm": 3.3530235877272507, + "learning_rate": 1.2313382439727353e-06, + "loss": 0.6602, + "step": 14849 + }, + { + "epoch": 2.21, + "grad_norm": 2.4412711006640615, + "learning_rate": 1.2312442556283812e-06, + "loss": 0.6439, + "step": 14850 + }, + { + "epoch": 2.21, + "grad_norm": 4.312765496758351, + "learning_rate": 1.2311502651258042e-06, + "loss": 0.6243, + "step": 14851 + }, + { + "epoch": 2.22, + "grad_norm": 4.515934661630445, + "learning_rate": 1.2310562724658812e-06, + "loss": 0.6406, + "step": 14852 + }, + { + "epoch": 2.22, + "grad_norm": 3.3233082939797147, + "learning_rate": 1.2309622776494894e-06, + "loss": 0.6204, + "step": 14853 + }, + { + "epoch": 2.22, + "grad_norm": 2.456097350573825, + "learning_rate": 1.2308682806775066e-06, + "loss": 0.6608, + "step": 14854 + }, + { + "epoch": 2.22, + "grad_norm": 3.274975439594404, + "learning_rate": 1.2307742815508096e-06, + "loss": 0.6641, + "step": 14855 + }, + { + "epoch": 2.22, + "grad_norm": 3.3031433274100146, + "learning_rate": 1.2306802802702761e-06, + "loss": 0.6764, + "step": 14856 + }, + { + "epoch": 2.22, + "grad_norm": 4.088899302239602, + "learning_rate": 1.2305862768367825e-06, + "loss": 0.6419, + "step": 14857 + }, + { + "epoch": 2.22, + "grad_norm": 3.0208671871928323, + "learning_rate": 1.230492271251207e-06, + "loss": 0.6602, + "step": 14858 + }, + { + "epoch": 2.22, + "grad_norm": 4.0694375717846825, + "learning_rate": 1.2303982635144273e-06, + "loss": 0.6693, + "step": 14859 + }, + { + "epoch": 2.22, + "grad_norm": 2.5838115236571224, + "learning_rate": 1.2303042536273196e-06, + "loss": 0.6517, + "step": 14860 + }, + { + "epoch": 2.22, + "grad_norm": 2.075132142652196, + "learning_rate": 1.2302102415907622e-06, + "loss": 0.6146, + "step": 14861 + }, + { + "epoch": 2.22, + "grad_norm": 2.345147673171704, + "learning_rate": 1.2301162274056322e-06, + "loss": 0.681, + "step": 14862 + }, + { + "epoch": 2.22, + "grad_norm": 2.493609918073523, + "learning_rate": 1.230022211072807e-06, + "loss": 0.6302, + "step": 14863 + }, + { + "epoch": 2.22, + "grad_norm": 4.180331236032236, + "learning_rate": 1.2299281925931643e-06, + "loss": 0.6888, + "step": 14864 + }, + { + "epoch": 2.22, + "grad_norm": 2.570147226418743, + "learning_rate": 1.2298341719675812e-06, + "loss": 0.6354, + "step": 14865 + }, + { + "epoch": 2.22, + "grad_norm": 4.413159904516105, + "learning_rate": 1.2297401491969357e-06, + "loss": 0.6152, + "step": 14866 + }, + { + "epoch": 2.22, + "grad_norm": 3.503808785046484, + "learning_rate": 1.2296461242821052e-06, + "loss": 0.6673, + "step": 14867 + }, + { + "epoch": 2.22, + "grad_norm": 3.096795527644905, + "learning_rate": 1.2295520972239668e-06, + "loss": 0.6426, + "step": 14868 + }, + { + "epoch": 2.22, + "grad_norm": 3.013433398498144, + "learning_rate": 1.2294580680233986e-06, + "loss": 0.6439, + "step": 14869 + }, + { + "epoch": 2.22, + "grad_norm": 2.646327776712175, + "learning_rate": 1.229364036681278e-06, + "loss": 0.6491, + "step": 14870 + }, + { + "epoch": 2.22, + "grad_norm": 2.175248865550721, + "learning_rate": 1.2292700031984822e-06, + "loss": 0.6628, + "step": 14871 + }, + { + "epoch": 2.22, + "grad_norm": 3.2321038679773157, + "learning_rate": 1.2291759675758896e-06, + "loss": 0.6445, + "step": 14872 + }, + { + "epoch": 2.22, + "grad_norm": 2.2803920530034665, + "learning_rate": 1.229081929814377e-06, + "loss": 0.6309, + "step": 14873 + }, + { + "epoch": 2.22, + "grad_norm": 2.531280975075025, + "learning_rate": 1.2289878899148227e-06, + "loss": 0.6875, + "step": 14874 + }, + { + "epoch": 2.22, + "grad_norm": 4.294244646093791, + "learning_rate": 1.2288938478781047e-06, + "loss": 0.6263, + "step": 14875 + }, + { + "epoch": 2.22, + "grad_norm": 2.455831984070948, + "learning_rate": 1.2287998037050993e-06, + "loss": 0.6517, + "step": 14876 + }, + { + "epoch": 2.22, + "grad_norm": 2.5022888977262827, + "learning_rate": 1.2287057573966857e-06, + "loss": 0.6348, + "step": 14877 + }, + { + "epoch": 2.22, + "grad_norm": 3.6177095800203323, + "learning_rate": 1.2286117089537408e-06, + "loss": 0.6595, + "step": 14878 + }, + { + "epoch": 2.22, + "grad_norm": 3.0769677484291593, + "learning_rate": 1.2285176583771423e-06, + "loss": 0.7025, + "step": 14879 + }, + { + "epoch": 2.22, + "grad_norm": 3.8522028310047416, + "learning_rate": 1.2284236056677687e-06, + "loss": 0.6615, + "step": 14880 + }, + { + "epoch": 2.22, + "grad_norm": 3.848876919097087, + "learning_rate": 1.2283295508264973e-06, + "loss": 0.6576, + "step": 14881 + }, + { + "epoch": 2.22, + "grad_norm": 3.43828338751755, + "learning_rate": 1.228235493854206e-06, + "loss": 0.6458, + "step": 14882 + }, + { + "epoch": 2.22, + "grad_norm": 4.8538484193100135, + "learning_rate": 1.2281414347517728e-06, + "loss": 0.6745, + "step": 14883 + }, + { + "epoch": 2.22, + "grad_norm": 3.1384431597981566, + "learning_rate": 1.2280473735200752e-06, + "loss": 0.6634, + "step": 14884 + }, + { + "epoch": 2.22, + "grad_norm": 2.98300003134596, + "learning_rate": 1.2279533101599914e-06, + "loss": 0.6439, + "step": 14885 + }, + { + "epoch": 2.22, + "grad_norm": 2.323841448161686, + "learning_rate": 1.2278592446723992e-06, + "loss": 0.6621, + "step": 14886 + }, + { + "epoch": 2.22, + "grad_norm": 3.1685562515543615, + "learning_rate": 1.2277651770581763e-06, + "loss": 0.6471, + "step": 14887 + }, + { + "epoch": 2.22, + "grad_norm": 2.3217006867579033, + "learning_rate": 1.227671107318201e-06, + "loss": 0.625, + "step": 14888 + }, + { + "epoch": 2.22, + "grad_norm": 2.7711284102937617, + "learning_rate": 1.2275770354533514e-06, + "loss": 0.6693, + "step": 14889 + }, + { + "epoch": 2.22, + "grad_norm": 2.213852650089714, + "learning_rate": 1.2274829614645048e-06, + "loss": 0.6198, + "step": 14890 + }, + { + "epoch": 2.22, + "grad_norm": 4.869401874099204, + "learning_rate": 1.2273888853525399e-06, + "loss": 0.7031, + "step": 14891 + }, + { + "epoch": 2.22, + "grad_norm": 2.856540106657301, + "learning_rate": 1.2272948071183343e-06, + "loss": 0.6283, + "step": 14892 + }, + { + "epoch": 2.22, + "grad_norm": 2.4894298566449455, + "learning_rate": 1.2272007267627663e-06, + "loss": 0.6452, + "step": 14893 + }, + { + "epoch": 2.22, + "grad_norm": 4.991350565724439, + "learning_rate": 1.2271066442867135e-06, + "loss": 0.6224, + "step": 14894 + }, + { + "epoch": 2.22, + "grad_norm": 5.1380605296612325, + "learning_rate": 1.2270125596910546e-06, + "loss": 0.6764, + "step": 14895 + }, + { + "epoch": 2.22, + "grad_norm": 2.6698565170813993, + "learning_rate": 1.2269184729766673e-06, + "loss": 0.6458, + "step": 14896 + }, + { + "epoch": 2.22, + "grad_norm": 3.9263413756641183, + "learning_rate": 1.2268243841444297e-06, + "loss": 0.666, + "step": 14897 + }, + { + "epoch": 2.22, + "grad_norm": 4.155871220180058, + "learning_rate": 1.22673029319522e-06, + "loss": 0.6048, + "step": 14898 + }, + { + "epoch": 2.22, + "grad_norm": 2.510760038976961, + "learning_rate": 1.226636200129917e-06, + "loss": 0.653, + "step": 14899 + }, + { + "epoch": 2.22, + "grad_norm": 2.9540869318581455, + "learning_rate": 1.2265421049493976e-06, + "loss": 0.6491, + "step": 14900 + }, + { + "epoch": 2.22, + "grad_norm": 3.7250633855498694, + "learning_rate": 1.226448007654541e-06, + "loss": 0.6725, + "step": 14901 + }, + { + "epoch": 2.22, + "grad_norm": 2.5473033160154563, + "learning_rate": 1.2263539082462254e-06, + "loss": 0.6214, + "step": 14902 + }, + { + "epoch": 2.22, + "grad_norm": 2.778867707562717, + "learning_rate": 1.226259806725328e-06, + "loss": 0.6406, + "step": 14903 + }, + { + "epoch": 2.22, + "grad_norm": 3.246283289064335, + "learning_rate": 1.2261657030927285e-06, + "loss": 0.6335, + "step": 14904 + }, + { + "epoch": 2.22, + "grad_norm": 3.2668131600026054, + "learning_rate": 1.2260715973493043e-06, + "loss": 0.6484, + "step": 14905 + }, + { + "epoch": 2.22, + "grad_norm": 4.714356091875881, + "learning_rate": 1.2259774894959337e-06, + "loss": 0.6322, + "step": 14906 + }, + { + "epoch": 2.22, + "grad_norm": 3.534243776794797, + "learning_rate": 1.2258833795334956e-06, + "loss": 0.6732, + "step": 14907 + }, + { + "epoch": 2.22, + "grad_norm": 2.7684426282374814, + "learning_rate": 1.2257892674628676e-06, + "loss": 0.6224, + "step": 14908 + }, + { + "epoch": 2.22, + "grad_norm": 2.610251339114198, + "learning_rate": 1.2256951532849286e-06, + "loss": 0.6758, + "step": 14909 + }, + { + "epoch": 2.22, + "grad_norm": 4.172692988674572, + "learning_rate": 1.2256010370005567e-06, + "loss": 0.6419, + "step": 14910 + }, + { + "epoch": 2.22, + "grad_norm": 2.677320735609074, + "learning_rate": 1.2255069186106301e-06, + "loss": 0.6543, + "step": 14911 + }, + { + "epoch": 2.22, + "grad_norm": 2.684540955842827, + "learning_rate": 1.225412798116028e-06, + "loss": 0.653, + "step": 14912 + }, + { + "epoch": 2.22, + "grad_norm": 3.587167399768182, + "learning_rate": 1.2253186755176282e-06, + "loss": 0.6341, + "step": 14913 + }, + { + "epoch": 2.22, + "grad_norm": 3.124523532697952, + "learning_rate": 1.225224550816309e-06, + "loss": 0.6302, + "step": 14914 + }, + { + "epoch": 2.22, + "grad_norm": 3.1508168961145455, + "learning_rate": 1.2251304240129494e-06, + "loss": 0.6243, + "step": 14915 + }, + { + "epoch": 2.22, + "grad_norm": 3.053458626520905, + "learning_rate": 1.2250362951084274e-06, + "loss": 0.6901, + "step": 14916 + }, + { + "epoch": 2.22, + "grad_norm": 4.055842363481635, + "learning_rate": 1.224942164103622e-06, + "loss": 0.6549, + "step": 14917 + }, + { + "epoch": 2.22, + "grad_norm": 2.7516058228412934, + "learning_rate": 1.2248480309994113e-06, + "loss": 0.6439, + "step": 14918 + }, + { + "epoch": 2.23, + "grad_norm": 3.289368819467012, + "learning_rate": 1.2247538957966741e-06, + "loss": 0.7064, + "step": 14919 + }, + { + "epoch": 2.23, + "grad_norm": 3.1426867878235774, + "learning_rate": 1.224659758496289e-06, + "loss": 0.6146, + "step": 14920 + }, + { + "epoch": 2.23, + "grad_norm": 3.3626716976959874, + "learning_rate": 1.2245656190991342e-06, + "loss": 0.6413, + "step": 14921 + }, + { + "epoch": 2.23, + "grad_norm": 4.95641206486636, + "learning_rate": 1.2244714776060892e-06, + "loss": 0.6582, + "step": 14922 + }, + { + "epoch": 2.23, + "grad_norm": 2.607742843067064, + "learning_rate": 1.2243773340180317e-06, + "loss": 0.6003, + "step": 14923 + }, + { + "epoch": 2.23, + "grad_norm": 2.8435956784432843, + "learning_rate": 1.2242831883358404e-06, + "loss": 0.6445, + "step": 14924 + }, + { + "epoch": 2.23, + "grad_norm": 2.543851144382488, + "learning_rate": 1.224189040560395e-06, + "loss": 0.6634, + "step": 14925 + }, + { + "epoch": 2.23, + "grad_norm": 2.483720972547578, + "learning_rate": 1.2240948906925728e-06, + "loss": 0.5892, + "step": 14926 + }, + { + "epoch": 2.23, + "grad_norm": 2.8795077345881475, + "learning_rate": 1.2240007387332535e-06, + "loss": 0.6133, + "step": 14927 + }, + { + "epoch": 2.23, + "grad_norm": 3.3220531316406703, + "learning_rate": 1.2239065846833155e-06, + "loss": 0.5951, + "step": 14928 + }, + { + "epoch": 2.23, + "grad_norm": 3.172593049376459, + "learning_rate": 1.2238124285436374e-06, + "loss": 0.6361, + "step": 14929 + }, + { + "epoch": 2.23, + "grad_norm": 3.647991696928895, + "learning_rate": 1.2237182703150982e-06, + "loss": 0.6309, + "step": 14930 + }, + { + "epoch": 2.23, + "grad_norm": 4.993210288698324, + "learning_rate": 1.2236241099985762e-06, + "loss": 0.6491, + "step": 14931 + }, + { + "epoch": 2.23, + "grad_norm": 2.7252812742997676, + "learning_rate": 1.2235299475949512e-06, + "loss": 0.6745, + "step": 14932 + }, + { + "epoch": 2.23, + "grad_norm": 2.6342025689664204, + "learning_rate": 1.2234357831051012e-06, + "loss": 0.6455, + "step": 14933 + }, + { + "epoch": 2.23, + "grad_norm": 2.824185006894598, + "learning_rate": 1.223341616529905e-06, + "loss": 0.6419, + "step": 14934 + }, + { + "epoch": 2.23, + "grad_norm": 2.837842234573174, + "learning_rate": 1.2232474478702421e-06, + "loss": 0.638, + "step": 14935 + }, + { + "epoch": 2.23, + "grad_norm": 6.323179965534723, + "learning_rate": 1.223153277126991e-06, + "loss": 0.6374, + "step": 14936 + }, + { + "epoch": 2.23, + "grad_norm": 3.7729106928715757, + "learning_rate": 1.2230591043010305e-06, + "loss": 0.6803, + "step": 14937 + }, + { + "epoch": 2.23, + "grad_norm": 2.6106876092854376, + "learning_rate": 1.22296492939324e-06, + "loss": 0.6439, + "step": 14938 + }, + { + "epoch": 2.23, + "grad_norm": 2.510332468272036, + "learning_rate": 1.2228707524044976e-06, + "loss": 0.6296, + "step": 14939 + }, + { + "epoch": 2.23, + "grad_norm": 3.773020544297159, + "learning_rate": 1.222776573335683e-06, + "loss": 0.6738, + "step": 14940 + }, + { + "epoch": 2.23, + "grad_norm": 3.1652797704998936, + "learning_rate": 1.222682392187675e-06, + "loss": 0.6947, + "step": 14941 + }, + { + "epoch": 2.23, + "grad_norm": 2.9317107553164434, + "learning_rate": 1.2225882089613525e-06, + "loss": 0.6517, + "step": 14942 + }, + { + "epoch": 2.23, + "grad_norm": 2.8328082198448095, + "learning_rate": 1.2224940236575944e-06, + "loss": 0.6471, + "step": 14943 + }, + { + "epoch": 2.23, + "grad_norm": 2.3897386014666124, + "learning_rate": 1.22239983627728e-06, + "loss": 0.6016, + "step": 14944 + }, + { + "epoch": 2.23, + "grad_norm": 3.4774104379214066, + "learning_rate": 1.2223056468212885e-06, + "loss": 0.6758, + "step": 14945 + }, + { + "epoch": 2.23, + "grad_norm": 3.664944867372338, + "learning_rate": 1.2222114552904985e-06, + "loss": 0.6732, + "step": 14946 + }, + { + "epoch": 2.23, + "grad_norm": 2.3164986910823915, + "learning_rate": 1.2221172616857892e-06, + "loss": 0.6341, + "step": 14947 + }, + { + "epoch": 2.23, + "grad_norm": 2.4254899606884726, + "learning_rate": 1.22202306600804e-06, + "loss": 0.6419, + "step": 14948 + }, + { + "epoch": 2.23, + "grad_norm": 2.5633120394412354, + "learning_rate": 1.22192886825813e-06, + "loss": 0.6556, + "step": 14949 + }, + { + "epoch": 2.23, + "grad_norm": 6.21862264091901, + "learning_rate": 1.221834668436938e-06, + "loss": 0.6445, + "step": 14950 + }, + { + "epoch": 2.23, + "grad_norm": 2.531473709553242, + "learning_rate": 1.2217404665453437e-06, + "loss": 0.6354, + "step": 14951 + }, + { + "epoch": 2.23, + "grad_norm": 2.6803760463346173, + "learning_rate": 1.221646262584226e-06, + "loss": 0.6491, + "step": 14952 + }, + { + "epoch": 2.23, + "grad_norm": 2.5633049511178525, + "learning_rate": 1.2215520565544638e-06, + "loss": 0.6641, + "step": 14953 + }, + { + "epoch": 2.23, + "grad_norm": 3.497661977545487, + "learning_rate": 1.2214578484569369e-06, + "loss": 0.666, + "step": 14954 + }, + { + "epoch": 2.23, + "grad_norm": 2.264893433678513, + "learning_rate": 1.2213636382925244e-06, + "loss": 0.6458, + "step": 14955 + }, + { + "epoch": 2.23, + "grad_norm": 2.0599054643543706, + "learning_rate": 1.221269426062105e-06, + "loss": 0.6113, + "step": 14956 + }, + { + "epoch": 2.23, + "grad_norm": 6.355018986244844, + "learning_rate": 1.2211752117665591e-06, + "loss": 0.6257, + "step": 14957 + }, + { + "epoch": 2.23, + "grad_norm": 4.374034231123579, + "learning_rate": 1.2210809954067652e-06, + "loss": 0.6432, + "step": 14958 + }, + { + "epoch": 2.23, + "grad_norm": 2.1554283268316823, + "learning_rate": 1.220986776983603e-06, + "loss": 0.6139, + "step": 14959 + }, + { + "epoch": 2.23, + "grad_norm": 2.12471833072646, + "learning_rate": 1.2208925564979511e-06, + "loss": 0.6419, + "step": 14960 + }, + { + "epoch": 2.23, + "grad_norm": 4.795324425519524, + "learning_rate": 1.2207983339506897e-06, + "loss": 0.6628, + "step": 14961 + }, + { + "epoch": 2.23, + "grad_norm": 3.4362275418740866, + "learning_rate": 1.220704109342698e-06, + "loss": 0.6445, + "step": 14962 + }, + { + "epoch": 2.23, + "grad_norm": 3.149436097997367, + "learning_rate": 1.2206098826748554e-06, + "loss": 0.653, + "step": 14963 + }, + { + "epoch": 2.23, + "grad_norm": 2.83277888811337, + "learning_rate": 1.2205156539480411e-06, + "loss": 0.6504, + "step": 14964 + }, + { + "epoch": 2.23, + "grad_norm": 2.5502171479313014, + "learning_rate": 1.2204214231631348e-06, + "loss": 0.6367, + "step": 14965 + }, + { + "epoch": 2.23, + "grad_norm": 4.675823510031592, + "learning_rate": 1.2203271903210157e-06, + "loss": 0.6387, + "step": 14966 + }, + { + "epoch": 2.23, + "grad_norm": 2.3904408514651063, + "learning_rate": 1.2202329554225638e-06, + "loss": 0.6198, + "step": 14967 + }, + { + "epoch": 2.23, + "grad_norm": 3.4920719919736274, + "learning_rate": 1.220138718468658e-06, + "loss": 0.6419, + "step": 14968 + }, + { + "epoch": 2.23, + "grad_norm": 2.530068143195515, + "learning_rate": 1.220044479460178e-06, + "loss": 0.6445, + "step": 14969 + }, + { + "epoch": 2.23, + "grad_norm": 3.0963784266639167, + "learning_rate": 1.2199502383980035e-06, + "loss": 0.6641, + "step": 14970 + }, + { + "epoch": 2.23, + "grad_norm": 2.649001340568826, + "learning_rate": 1.2198559952830141e-06, + "loss": 0.6211, + "step": 14971 + }, + { + "epoch": 2.23, + "grad_norm": 5.447840727029424, + "learning_rate": 1.2197617501160892e-06, + "loss": 0.6836, + "step": 14972 + }, + { + "epoch": 2.23, + "grad_norm": 3.316059849865935, + "learning_rate": 1.2196675028981082e-06, + "loss": 0.6634, + "step": 14973 + }, + { + "epoch": 2.23, + "grad_norm": 2.869552214653787, + "learning_rate": 1.2195732536299513e-06, + "loss": 0.6296, + "step": 14974 + }, + { + "epoch": 2.23, + "grad_norm": 2.580098592464276, + "learning_rate": 1.219479002312498e-06, + "loss": 0.6523, + "step": 14975 + }, + { + "epoch": 2.23, + "grad_norm": 2.4182591593321416, + "learning_rate": 1.2193847489466272e-06, + "loss": 0.6159, + "step": 14976 + }, + { + "epoch": 2.23, + "grad_norm": 3.5161664991709123, + "learning_rate": 1.2192904935332191e-06, + "loss": 0.6426, + "step": 14977 + }, + { + "epoch": 2.23, + "grad_norm": 5.877126831836021, + "learning_rate": 1.219196236073154e-06, + "loss": 0.6315, + "step": 14978 + }, + { + "epoch": 2.23, + "grad_norm": 2.9453893885318174, + "learning_rate": 1.2191019765673106e-06, + "loss": 0.6452, + "step": 14979 + }, + { + "epoch": 2.23, + "grad_norm": 2.7614027663970897, + "learning_rate": 1.2190077150165695e-06, + "loss": 0.6966, + "step": 14980 + }, + { + "epoch": 2.23, + "grad_norm": 3.001378536077536, + "learning_rate": 1.2189134514218096e-06, + "loss": 0.624, + "step": 14981 + }, + { + "epoch": 2.23, + "grad_norm": 3.9931089189550066, + "learning_rate": 1.2188191857839111e-06, + "loss": 0.6999, + "step": 14982 + }, + { + "epoch": 2.23, + "grad_norm": 2.5448994535146676, + "learning_rate": 1.218724918103754e-06, + "loss": 0.6413, + "step": 14983 + }, + { + "epoch": 2.23, + "grad_norm": 4.607153672166351, + "learning_rate": 1.2186306483822178e-06, + "loss": 0.6901, + "step": 14984 + }, + { + "epoch": 2.23, + "grad_norm": 2.7585352468960913, + "learning_rate": 1.2185363766201824e-06, + "loss": 0.6921, + "step": 14985 + }, + { + "epoch": 2.24, + "grad_norm": 2.3367307116628933, + "learning_rate": 1.2184421028185279e-06, + "loss": 0.6481, + "step": 14986 + }, + { + "epoch": 2.24, + "grad_norm": 2.708620407341763, + "learning_rate": 1.2183478269781336e-06, + "loss": 0.6204, + "step": 14987 + }, + { + "epoch": 2.24, + "grad_norm": 2.728547828368543, + "learning_rate": 1.21825354909988e-06, + "loss": 0.6621, + "step": 14988 + }, + { + "epoch": 2.24, + "grad_norm": 2.6624579379113342, + "learning_rate": 1.2181592691846466e-06, + "loss": 0.6302, + "step": 14989 + }, + { + "epoch": 2.24, + "grad_norm": 3.0657032102416255, + "learning_rate": 1.2180649872333133e-06, + "loss": 0.6523, + "step": 14990 + }, + { + "epoch": 2.24, + "grad_norm": 2.7035802503469695, + "learning_rate": 1.2179707032467606e-06, + "loss": 0.6497, + "step": 14991 + }, + { + "epoch": 2.24, + "grad_norm": 2.321554282542098, + "learning_rate": 1.2178764172258676e-06, + "loss": 0.6471, + "step": 14992 + }, + { + "epoch": 2.24, + "grad_norm": 2.79410583250377, + "learning_rate": 1.2177821291715152e-06, + "loss": 0.5957, + "step": 14993 + }, + { + "epoch": 2.24, + "grad_norm": 5.760228973143944, + "learning_rate": 1.2176878390845826e-06, + "loss": 0.6523, + "step": 14994 + }, + { + "epoch": 2.24, + "grad_norm": 2.432944746515558, + "learning_rate": 1.21759354696595e-06, + "loss": 0.6439, + "step": 14995 + }, + { + "epoch": 2.24, + "grad_norm": 5.8848740252334455, + "learning_rate": 1.2174992528164979e-06, + "loss": 0.6458, + "step": 14996 + }, + { + "epoch": 2.24, + "grad_norm": 2.3680375322692377, + "learning_rate": 1.217404956637106e-06, + "loss": 0.6107, + "step": 14997 + }, + { + "epoch": 2.24, + "grad_norm": 4.3640605017818785, + "learning_rate": 1.217310658428654e-06, + "loss": 0.6224, + "step": 14998 + }, + { + "epoch": 2.24, + "grad_norm": 4.525248347534493, + "learning_rate": 1.217216358192023e-06, + "loss": 0.653, + "step": 14999 + }, + { + "epoch": 2.24, + "grad_norm": 2.8875420722615726, + "learning_rate": 1.2171220559280921e-06, + "loss": 0.6497, + "step": 15000 + }, + { + "epoch": 2.24, + "grad_norm": 3.03616289860513, + "learning_rate": 1.217027751637742e-06, + "loss": 0.5762, + "step": 15001 + }, + { + "epoch": 2.24, + "grad_norm": 6.102572751788112, + "learning_rate": 1.2169334453218529e-06, + "loss": 0.651, + "step": 15002 + }, + { + "epoch": 2.24, + "grad_norm": 2.518145156927488, + "learning_rate": 1.2168391369813044e-06, + "loss": 0.64, + "step": 15003 + }, + { + "epoch": 2.24, + "grad_norm": 4.093670368646742, + "learning_rate": 1.2167448266169774e-06, + "loss": 0.6777, + "step": 15004 + }, + { + "epoch": 2.24, + "grad_norm": 2.8596982515376883, + "learning_rate": 1.2166505142297515e-06, + "loss": 0.6361, + "step": 15005 + }, + { + "epoch": 2.24, + "grad_norm": 5.063526027147953, + "learning_rate": 1.2165561998205072e-06, + "loss": 0.6348, + "step": 15006 + }, + { + "epoch": 2.24, + "grad_norm": 2.6595119604960433, + "learning_rate": 1.216461883390125e-06, + "loss": 0.6191, + "step": 15007 + }, + { + "epoch": 2.24, + "grad_norm": 4.343829012477665, + "learning_rate": 1.2163675649394847e-06, + "loss": 0.7012, + "step": 15008 + }, + { + "epoch": 2.24, + "grad_norm": 6.246109810999246, + "learning_rate": 1.216273244469467e-06, + "loss": 0.612, + "step": 15009 + }, + { + "epoch": 2.24, + "grad_norm": 4.363215338696499, + "learning_rate": 1.2161789219809516e-06, + "loss": 0.6178, + "step": 15010 + }, + { + "epoch": 2.24, + "grad_norm": 2.2951142196895864, + "learning_rate": 1.2160845974748196e-06, + "loss": 0.5947, + "step": 15011 + }, + { + "epoch": 2.24, + "grad_norm": 3.3618862627780937, + "learning_rate": 1.2159902709519506e-06, + "loss": 0.6439, + "step": 15012 + }, + { + "epoch": 2.24, + "grad_norm": 5.118430282521023, + "learning_rate": 1.2158959424132254e-06, + "loss": 0.6641, + "step": 15013 + }, + { + "epoch": 2.24, + "grad_norm": 4.529673637924808, + "learning_rate": 1.2158016118595244e-06, + "loss": 0.6699, + "step": 15014 + }, + { + "epoch": 2.24, + "grad_norm": 3.2205623175682225, + "learning_rate": 1.2157072792917278e-06, + "loss": 0.6836, + "step": 15015 + }, + { + "epoch": 2.24, + "grad_norm": 10.23939689993248, + "learning_rate": 1.215612944710716e-06, + "loss": 0.6576, + "step": 15016 + }, + { + "epoch": 2.24, + "grad_norm": 2.857040956093604, + "learning_rate": 1.2155186081173695e-06, + "loss": 0.6387, + "step": 15017 + }, + { + "epoch": 2.24, + "grad_norm": 3.604307041734988, + "learning_rate": 1.2154242695125692e-06, + "loss": 0.6465, + "step": 15018 + }, + { + "epoch": 2.24, + "grad_norm": 3.255893456449706, + "learning_rate": 1.2153299288971946e-06, + "loss": 0.6484, + "step": 15019 + }, + { + "epoch": 2.24, + "grad_norm": 2.6727235952049186, + "learning_rate": 1.2152355862721269e-06, + "loss": 0.6341, + "step": 15020 + }, + { + "epoch": 2.24, + "grad_norm": 3.5896473005311016, + "learning_rate": 1.2151412416382468e-06, + "loss": 0.6419, + "step": 15021 + }, + { + "epoch": 2.24, + "grad_norm": 2.72416030142104, + "learning_rate": 1.215046894996434e-06, + "loss": 0.6413, + "step": 15022 + }, + { + "epoch": 2.24, + "grad_norm": 3.9051715289127675, + "learning_rate": 1.2149525463475696e-06, + "loss": 0.666, + "step": 15023 + }, + { + "epoch": 2.24, + "grad_norm": 2.3913526392672417, + "learning_rate": 1.2148581956925342e-06, + "loss": 0.6569, + "step": 15024 + }, + { + "epoch": 2.24, + "grad_norm": 2.8184617728691013, + "learning_rate": 1.2147638430322082e-06, + "loss": 0.623, + "step": 15025 + }, + { + "epoch": 2.24, + "grad_norm": 2.60749768572689, + "learning_rate": 1.2146694883674722e-06, + "loss": 0.6504, + "step": 15026 + }, + { + "epoch": 2.24, + "grad_norm": 3.0690636579991097, + "learning_rate": 1.214575131699207e-06, + "loss": 0.6751, + "step": 15027 + }, + { + "epoch": 2.24, + "grad_norm": 3.29910572576604, + "learning_rate": 1.214480773028293e-06, + "loss": 0.6576, + "step": 15028 + }, + { + "epoch": 2.24, + "grad_norm": 2.608633893889012, + "learning_rate": 1.214386412355611e-06, + "loss": 0.6745, + "step": 15029 + }, + { + "epoch": 2.24, + "grad_norm": 2.1127673160513174, + "learning_rate": 1.2142920496820418e-06, + "loss": 0.6003, + "step": 15030 + }, + { + "epoch": 2.24, + "grad_norm": 3.1758989972116005, + "learning_rate": 1.2141976850084658e-06, + "loss": 0.6608, + "step": 15031 + }, + { + "epoch": 2.24, + "grad_norm": 2.731216472850241, + "learning_rate": 1.2141033183357636e-06, + "loss": 0.7038, + "step": 15032 + }, + { + "epoch": 2.24, + "grad_norm": 3.256449953309859, + "learning_rate": 1.2140089496648168e-06, + "loss": 0.6146, + "step": 15033 + }, + { + "epoch": 2.24, + "grad_norm": 2.7360217253552213, + "learning_rate": 1.2139145789965053e-06, + "loss": 0.6029, + "step": 15034 + }, + { + "epoch": 2.24, + "grad_norm": 2.4094210641449703, + "learning_rate": 1.2138202063317097e-06, + "loss": 0.6393, + "step": 15035 + }, + { + "epoch": 2.24, + "grad_norm": 3.264536405618316, + "learning_rate": 1.2137258316713116e-06, + "loss": 0.668, + "step": 15036 + }, + { + "epoch": 2.24, + "grad_norm": 3.1497696821511036, + "learning_rate": 1.2136314550161916e-06, + "loss": 0.6266, + "step": 15037 + }, + { + "epoch": 2.24, + "grad_norm": 3.6370606489808033, + "learning_rate": 1.21353707636723e-06, + "loss": 0.5931, + "step": 15038 + }, + { + "epoch": 2.24, + "grad_norm": 2.1966777168120992, + "learning_rate": 1.213442695725308e-06, + "loss": 0.625, + "step": 15039 + }, + { + "epoch": 2.24, + "grad_norm": 3.520722819473551, + "learning_rate": 1.2133483130913065e-06, + "loss": 0.6204, + "step": 15040 + }, + { + "epoch": 2.24, + "grad_norm": 2.475536002947585, + "learning_rate": 1.2132539284661066e-06, + "loss": 0.6582, + "step": 15041 + }, + { + "epoch": 2.24, + "grad_norm": 3.0621103388771393, + "learning_rate": 1.2131595418505888e-06, + "loss": 0.6549, + "step": 15042 + }, + { + "epoch": 2.24, + "grad_norm": 3.6106867528996367, + "learning_rate": 1.213065153245634e-06, + "loss": 0.6471, + "step": 15043 + }, + { + "epoch": 2.24, + "grad_norm": 3.7338902617244307, + "learning_rate": 1.2129707626521234e-06, + "loss": 0.6276, + "step": 15044 + }, + { + "epoch": 2.24, + "grad_norm": 2.5474802957420213, + "learning_rate": 1.2128763700709374e-06, + "loss": 0.6458, + "step": 15045 + }, + { + "epoch": 2.24, + "grad_norm": 2.5230745558124283, + "learning_rate": 1.212781975502958e-06, + "loss": 0.6257, + "step": 15046 + }, + { + "epoch": 2.24, + "grad_norm": 2.7422119740197313, + "learning_rate": 1.2126875789490654e-06, + "loss": 0.6491, + "step": 15047 + }, + { + "epoch": 2.24, + "grad_norm": 2.423415466690318, + "learning_rate": 1.2125931804101406e-06, + "loss": 0.6055, + "step": 15048 + }, + { + "epoch": 2.24, + "grad_norm": 2.2381158305589106, + "learning_rate": 1.2124987798870652e-06, + "loss": 0.6257, + "step": 15049 + }, + { + "epoch": 2.24, + "grad_norm": 7.699796551411144, + "learning_rate": 1.2124043773807196e-06, + "loss": 0.6335, + "step": 15050 + }, + { + "epoch": 2.24, + "grad_norm": 2.544073588982971, + "learning_rate": 1.2123099728919852e-06, + "loss": 0.6335, + "step": 15051 + }, + { + "epoch": 2.24, + "grad_norm": 4.0365466833429675, + "learning_rate": 1.2122155664217434e-06, + "loss": 0.6126, + "step": 15052 + }, + { + "epoch": 2.25, + "grad_norm": 2.537505181023195, + "learning_rate": 1.2121211579708742e-06, + "loss": 0.6419, + "step": 15053 + }, + { + "epoch": 2.25, + "grad_norm": 2.547026386290593, + "learning_rate": 1.2120267475402606e-06, + "loss": 0.6025, + "step": 15054 + }, + { + "epoch": 2.25, + "grad_norm": 2.722324600625174, + "learning_rate": 1.2119323351307818e-06, + "loss": 0.6367, + "step": 15055 + }, + { + "epoch": 2.25, + "grad_norm": 2.8611535191027, + "learning_rate": 1.2118379207433198e-06, + "loss": 0.6608, + "step": 15056 + }, + { + "epoch": 2.25, + "grad_norm": 4.479727212343739, + "learning_rate": 1.211743504378756e-06, + "loss": 0.6328, + "step": 15057 + }, + { + "epoch": 2.25, + "grad_norm": 3.061803401763941, + "learning_rate": 1.2116490860379712e-06, + "loss": 0.6497, + "step": 15058 + }, + { + "epoch": 2.25, + "grad_norm": 4.512784468948868, + "learning_rate": 1.211554665721847e-06, + "loss": 0.6621, + "step": 15059 + }, + { + "epoch": 2.25, + "grad_norm": 3.7782489283976917, + "learning_rate": 1.211460243431264e-06, + "loss": 0.6641, + "step": 15060 + }, + { + "epoch": 2.25, + "grad_norm": 3.735835953791786, + "learning_rate": 1.211365819167104e-06, + "loss": 0.6608, + "step": 15061 + }, + { + "epoch": 2.25, + "grad_norm": 5.35304992055977, + "learning_rate": 1.2112713929302484e-06, + "loss": 0.6556, + "step": 15062 + }, + { + "epoch": 2.25, + "grad_norm": 3.3414170957899096, + "learning_rate": 1.2111769647215776e-06, + "loss": 0.6471, + "step": 15063 + }, + { + "epoch": 2.25, + "grad_norm": 3.528642240150095, + "learning_rate": 1.2110825345419741e-06, + "loss": 0.6706, + "step": 15064 + }, + { + "epoch": 2.25, + "grad_norm": 2.803666811359874, + "learning_rate": 1.2109881023923185e-06, + "loss": 0.5931, + "step": 15065 + }, + { + "epoch": 2.25, + "grad_norm": 3.1713731309368858, + "learning_rate": 1.2108936682734921e-06, + "loss": 0.6413, + "step": 15066 + }, + { + "epoch": 2.25, + "grad_norm": 2.9596762083419645, + "learning_rate": 1.2107992321863768e-06, + "loss": 0.6165, + "step": 15067 + }, + { + "epoch": 2.25, + "grad_norm": 3.0899678470749627, + "learning_rate": 1.2107047941318533e-06, + "loss": 0.6191, + "step": 15068 + }, + { + "epoch": 2.25, + "grad_norm": 3.0685181224407745, + "learning_rate": 1.2106103541108033e-06, + "loss": 0.6217, + "step": 15069 + }, + { + "epoch": 2.25, + "grad_norm": 2.585081409390481, + "learning_rate": 1.2105159121241085e-06, + "loss": 0.6003, + "step": 15070 + }, + { + "epoch": 2.25, + "grad_norm": 3.4200041025332815, + "learning_rate": 1.21042146817265e-06, + "loss": 0.6452, + "step": 15071 + }, + { + "epoch": 2.25, + "grad_norm": 3.5216861570850573, + "learning_rate": 1.210327022257309e-06, + "loss": 0.6589, + "step": 15072 + }, + { + "epoch": 2.25, + "grad_norm": 3.777960295058669, + "learning_rate": 1.2102325743789677e-06, + "loss": 0.6777, + "step": 15073 + }, + { + "epoch": 2.25, + "grad_norm": 3.2475203956530305, + "learning_rate": 1.210138124538507e-06, + "loss": 0.6595, + "step": 15074 + }, + { + "epoch": 2.25, + "grad_norm": 3.1232894406592804, + "learning_rate": 1.2100436727368086e-06, + "loss": 0.5938, + "step": 15075 + }, + { + "epoch": 2.25, + "grad_norm": 3.0015511787064795, + "learning_rate": 1.209949218974754e-06, + "loss": 0.5983, + "step": 15076 + }, + { + "epoch": 2.25, + "grad_norm": 3.7237979303136726, + "learning_rate": 1.209854763253225e-06, + "loss": 0.627, + "step": 15077 + }, + { + "epoch": 2.25, + "grad_norm": 2.823351636568076, + "learning_rate": 1.2097603055731026e-06, + "loss": 0.5964, + "step": 15078 + }, + { + "epoch": 2.25, + "grad_norm": 4.9598444373059785, + "learning_rate": 1.209665845935269e-06, + "loss": 0.6953, + "step": 15079 + }, + { + "epoch": 2.25, + "grad_norm": 3.6722904146969704, + "learning_rate": 1.2095713843406055e-06, + "loss": 0.6452, + "step": 15080 + }, + { + "epoch": 2.25, + "grad_norm": 2.8875882345861346, + "learning_rate": 1.2094769207899935e-06, + "loss": 0.6374, + "step": 15081 + }, + { + "epoch": 2.25, + "grad_norm": 2.523861831495266, + "learning_rate": 1.209382455284315e-06, + "loss": 0.6182, + "step": 15082 + }, + { + "epoch": 2.25, + "grad_norm": 3.0769881698506403, + "learning_rate": 1.2092879878244514e-06, + "loss": 0.612, + "step": 15083 + }, + { + "epoch": 2.25, + "grad_norm": 3.1712611656905687, + "learning_rate": 1.2091935184112849e-06, + "loss": 0.6328, + "step": 15084 + }, + { + "epoch": 2.25, + "grad_norm": 5.507145712798861, + "learning_rate": 1.2090990470456964e-06, + "loss": 0.6335, + "step": 15085 + }, + { + "epoch": 2.25, + "grad_norm": 3.49799498547592, + "learning_rate": 1.2090045737285684e-06, + "loss": 0.6517, + "step": 15086 + }, + { + "epoch": 2.25, + "grad_norm": 5.448965582536913, + "learning_rate": 1.2089100984607819e-06, + "loss": 0.6315, + "step": 15087 + }, + { + "epoch": 2.25, + "grad_norm": 3.672911594131762, + "learning_rate": 1.208815621243219e-06, + "loss": 0.6927, + "step": 15088 + }, + { + "epoch": 2.25, + "grad_norm": 4.922601877340478, + "learning_rate": 1.2087211420767615e-06, + "loss": 0.6979, + "step": 15089 + }, + { + "epoch": 2.25, + "grad_norm": 7.108385608399602, + "learning_rate": 1.208626660962291e-06, + "loss": 0.6087, + "step": 15090 + }, + { + "epoch": 2.25, + "grad_norm": 2.6617348925826225, + "learning_rate": 1.2085321779006898e-06, + "loss": 0.6569, + "step": 15091 + }, + { + "epoch": 2.25, + "grad_norm": 2.4721828454098094, + "learning_rate": 1.2084376928928389e-06, + "loss": 0.6608, + "step": 15092 + }, + { + "epoch": 2.25, + "grad_norm": 3.63303006716086, + "learning_rate": 1.2083432059396205e-06, + "loss": 0.6159, + "step": 15093 + }, + { + "epoch": 2.25, + "grad_norm": 4.148290368562641, + "learning_rate": 1.208248717041917e-06, + "loss": 0.638, + "step": 15094 + }, + { + "epoch": 2.25, + "grad_norm": 5.849541786378522, + "learning_rate": 1.2081542262006097e-06, + "loss": 0.6276, + "step": 15095 + }, + { + "epoch": 2.25, + "grad_norm": 2.8888342139101506, + "learning_rate": 1.2080597334165806e-06, + "loss": 0.6406, + "step": 15096 + }, + { + "epoch": 2.25, + "grad_norm": 2.58373012682329, + "learning_rate": 1.2079652386907116e-06, + "loss": 0.6517, + "step": 15097 + }, + { + "epoch": 2.25, + "grad_norm": 2.8559842937201396, + "learning_rate": 1.2078707420238843e-06, + "loss": 0.6393, + "step": 15098 + }, + { + "epoch": 2.25, + "grad_norm": 2.9873154121088126, + "learning_rate": 1.2077762434169812e-06, + "loss": 0.7012, + "step": 15099 + }, + { + "epoch": 2.25, + "grad_norm": 3.600160841799293, + "learning_rate": 1.2076817428708843e-06, + "loss": 0.6582, + "step": 15100 + }, + { + "epoch": 2.25, + "grad_norm": 2.509358379731099, + "learning_rate": 1.2075872403864748e-06, + "loss": 0.6497, + "step": 15101 + }, + { + "epoch": 2.25, + "grad_norm": 3.4333546378933466, + "learning_rate": 1.2074927359646357e-06, + "loss": 0.6816, + "step": 15102 + }, + { + "epoch": 2.25, + "grad_norm": 2.3388580166444086, + "learning_rate": 1.2073982296062483e-06, + "loss": 0.6185, + "step": 15103 + }, + { + "epoch": 2.25, + "grad_norm": 4.336551505698871, + "learning_rate": 1.2073037213121949e-06, + "loss": 0.6009, + "step": 15104 + }, + { + "epoch": 2.25, + "grad_norm": 2.5504695410776907, + "learning_rate": 1.2072092110833577e-06, + "loss": 0.6628, + "step": 15105 + }, + { + "epoch": 2.25, + "grad_norm": 6.032671048424238, + "learning_rate": 1.2071146989206183e-06, + "loss": 0.6673, + "step": 15106 + }, + { + "epoch": 2.25, + "grad_norm": 2.944895019273526, + "learning_rate": 1.2070201848248595e-06, + "loss": 0.6367, + "step": 15107 + }, + { + "epoch": 2.25, + "grad_norm": 2.3584723579269147, + "learning_rate": 1.2069256687969627e-06, + "loss": 0.5872, + "step": 15108 + }, + { + "epoch": 2.25, + "grad_norm": 2.59033337056881, + "learning_rate": 1.2068311508378106e-06, + "loss": 0.7057, + "step": 15109 + }, + { + "epoch": 2.25, + "grad_norm": 2.4889789164594425, + "learning_rate": 1.2067366309482849e-06, + "loss": 0.6543, + "step": 15110 + }, + { + "epoch": 2.25, + "grad_norm": 6.302509877913746, + "learning_rate": 1.2066421091292678e-06, + "loss": 0.6849, + "step": 15111 + }, + { + "epoch": 2.25, + "grad_norm": 4.67338875794966, + "learning_rate": 1.2065475853816419e-06, + "loss": 0.6159, + "step": 15112 + }, + { + "epoch": 2.25, + "grad_norm": 2.347277271143919, + "learning_rate": 1.206453059706289e-06, + "loss": 0.6602, + "step": 15113 + }, + { + "epoch": 2.25, + "grad_norm": 2.7989030840733515, + "learning_rate": 1.2063585321040912e-06, + "loss": 0.6328, + "step": 15114 + }, + { + "epoch": 2.25, + "grad_norm": 2.5576213702762836, + "learning_rate": 1.2062640025759314e-06, + "loss": 0.6387, + "step": 15115 + }, + { + "epoch": 2.25, + "grad_norm": 3.218218080224127, + "learning_rate": 1.2061694711226911e-06, + "loss": 0.668, + "step": 15116 + }, + { + "epoch": 2.25, + "grad_norm": 3.250028664265338, + "learning_rate": 1.206074937745253e-06, + "loss": 0.6654, + "step": 15117 + }, + { + "epoch": 2.25, + "grad_norm": 3.556951177552612, + "learning_rate": 1.2059804024444994e-06, + "loss": 0.6445, + "step": 15118 + }, + { + "epoch": 2.25, + "grad_norm": 2.5672000250052824, + "learning_rate": 1.2058858652213122e-06, + "loss": 0.6706, + "step": 15119 + }, + { + "epoch": 2.26, + "grad_norm": 2.4906204009002777, + "learning_rate": 1.2057913260765746e-06, + "loss": 0.6393, + "step": 15120 + }, + { + "epoch": 2.26, + "grad_norm": 2.5000364893373255, + "learning_rate": 1.2056967850111677e-06, + "loss": 0.6341, + "step": 15121 + }, + { + "epoch": 2.26, + "grad_norm": 2.342898420813274, + "learning_rate": 1.2056022420259745e-06, + "loss": 0.6224, + "step": 15122 + }, + { + "epoch": 2.26, + "grad_norm": 2.9706535827929224, + "learning_rate": 1.205507697121878e-06, + "loss": 0.6152, + "step": 15123 + }, + { + "epoch": 2.26, + "grad_norm": 3.2011328268790575, + "learning_rate": 1.2054131502997594e-06, + "loss": 0.6302, + "step": 15124 + }, + { + "epoch": 2.26, + "grad_norm": 2.3700089992531224, + "learning_rate": 1.2053186015605022e-06, + "loss": 0.6445, + "step": 15125 + }, + { + "epoch": 2.26, + "grad_norm": 2.2934850168867618, + "learning_rate": 1.2052240509049881e-06, + "loss": 0.6497, + "step": 15126 + }, + { + "epoch": 2.26, + "grad_norm": 2.8687652652694746, + "learning_rate": 1.2051294983340996e-06, + "loss": 0.7038, + "step": 15127 + }, + { + "epoch": 2.26, + "grad_norm": 2.4494667654233857, + "learning_rate": 1.2050349438487197e-06, + "loss": 0.6296, + "step": 15128 + }, + { + "epoch": 2.26, + "grad_norm": 3.0371616048299526, + "learning_rate": 1.2049403874497302e-06, + "loss": 0.6393, + "step": 15129 + }, + { + "epoch": 2.26, + "grad_norm": 3.3851142047639686, + "learning_rate": 1.204845829138014e-06, + "loss": 0.6198, + "step": 15130 + }, + { + "epoch": 2.26, + "grad_norm": 4.191245658249985, + "learning_rate": 1.204751268914454e-06, + "loss": 0.6491, + "step": 15131 + }, + { + "epoch": 2.26, + "grad_norm": 3.777278005899018, + "learning_rate": 1.2046567067799318e-06, + "loss": 0.6699, + "step": 15132 + }, + { + "epoch": 2.26, + "grad_norm": 3.2347708928449515, + "learning_rate": 1.2045621427353304e-06, + "loss": 0.6562, + "step": 15133 + }, + { + "epoch": 2.26, + "grad_norm": 2.416769624750579, + "learning_rate": 1.204467576781533e-06, + "loss": 0.6523, + "step": 15134 + }, + { + "epoch": 2.26, + "grad_norm": 2.533225379488638, + "learning_rate": 1.204373008919421e-06, + "loss": 0.6367, + "step": 15135 + }, + { + "epoch": 2.26, + "grad_norm": 2.774542803338995, + "learning_rate": 1.204278439149878e-06, + "loss": 0.6048, + "step": 15136 + }, + { + "epoch": 2.26, + "grad_norm": 2.816185493407762, + "learning_rate": 1.2041838674737863e-06, + "loss": 0.6667, + "step": 15137 + }, + { + "epoch": 2.26, + "grad_norm": 5.955298537853003, + "learning_rate": 1.2040892938920282e-06, + "loss": 0.6621, + "step": 15138 + }, + { + "epoch": 2.26, + "grad_norm": 5.668101987462419, + "learning_rate": 1.2039947184054869e-06, + "loss": 0.7025, + "step": 15139 + }, + { + "epoch": 2.26, + "grad_norm": 3.7770436319663503, + "learning_rate": 1.2039001410150448e-06, + "loss": 0.638, + "step": 15140 + }, + { + "epoch": 2.26, + "grad_norm": 3.9881887600324584, + "learning_rate": 1.2038055617215848e-06, + "loss": 0.5684, + "step": 15141 + }, + { + "epoch": 2.26, + "grad_norm": 3.183360883977672, + "learning_rate": 1.203710980525989e-06, + "loss": 0.7129, + "step": 15142 + }, + { + "epoch": 2.26, + "grad_norm": 3.6669415121890663, + "learning_rate": 1.203616397429141e-06, + "loss": 0.6504, + "step": 15143 + }, + { + "epoch": 2.26, + "grad_norm": 2.580710219442675, + "learning_rate": 1.2035218124319232e-06, + "loss": 0.5999, + "step": 15144 + }, + { + "epoch": 2.26, + "grad_norm": 4.989388249141368, + "learning_rate": 1.203427225535218e-06, + "loss": 0.6569, + "step": 15145 + }, + { + "epoch": 2.26, + "grad_norm": 2.6657226902934403, + "learning_rate": 1.2033326367399091e-06, + "loss": 0.6296, + "step": 15146 + }, + { + "epoch": 2.26, + "grad_norm": 2.5345888972480615, + "learning_rate": 1.2032380460468783e-06, + "loss": 0.6094, + "step": 15147 + }, + { + "epoch": 2.26, + "grad_norm": 2.971758221440026, + "learning_rate": 1.2031434534570087e-06, + "loss": 0.6374, + "step": 15148 + }, + { + "epoch": 2.26, + "grad_norm": 4.667938181558141, + "learning_rate": 1.2030488589711836e-06, + "loss": 0.5996, + "step": 15149 + }, + { + "epoch": 2.26, + "grad_norm": 2.6796399201350534, + "learning_rate": 1.2029542625902853e-06, + "loss": 0.6465, + "step": 15150 + }, + { + "epoch": 2.26, + "grad_norm": 2.9427865648481615, + "learning_rate": 1.202859664315197e-06, + "loss": 0.6211, + "step": 15151 + }, + { + "epoch": 2.26, + "grad_norm": 4.201029261700578, + "learning_rate": 1.2027650641468015e-06, + "loss": 0.6686, + "step": 15152 + }, + { + "epoch": 2.26, + "grad_norm": 4.248434669256806, + "learning_rate": 1.202670462085982e-06, + "loss": 0.6322, + "step": 15153 + }, + { + "epoch": 2.26, + "grad_norm": 3.865049141399605, + "learning_rate": 1.2025758581336207e-06, + "loss": 0.696, + "step": 15154 + }, + { + "epoch": 2.26, + "grad_norm": 2.6018616166421764, + "learning_rate": 1.2024812522906014e-06, + "loss": 0.6562, + "step": 15155 + }, + { + "epoch": 2.26, + "grad_norm": 3.5372697222374803, + "learning_rate": 1.2023866445578063e-06, + "loss": 0.6497, + "step": 15156 + }, + { + "epoch": 2.26, + "grad_norm": 2.6494908802933503, + "learning_rate": 1.2022920349361192e-06, + "loss": 0.6322, + "step": 15157 + }, + { + "epoch": 2.26, + "grad_norm": 3.8532895470162365, + "learning_rate": 1.2021974234264222e-06, + "loss": 0.6367, + "step": 15158 + }, + { + "epoch": 2.26, + "grad_norm": 3.191991024240357, + "learning_rate": 1.2021028100295986e-06, + "loss": 0.681, + "step": 15159 + }, + { + "epoch": 2.26, + "grad_norm": 2.739667840381369, + "learning_rate": 1.2020081947465324e-06, + "loss": 0.6419, + "step": 15160 + }, + { + "epoch": 2.26, + "grad_norm": 3.447999529817779, + "learning_rate": 1.2019135775781054e-06, + "loss": 0.6673, + "step": 15161 + }, + { + "epoch": 2.26, + "grad_norm": 2.5425809092687053, + "learning_rate": 1.201818958525201e-06, + "loss": 0.6263, + "step": 15162 + }, + { + "epoch": 2.26, + "grad_norm": 4.267043324280155, + "learning_rate": 1.2017243375887027e-06, + "loss": 0.7031, + "step": 15163 + }, + { + "epoch": 2.26, + "grad_norm": 3.0674214197403358, + "learning_rate": 1.2016297147694929e-06, + "loss": 0.6471, + "step": 15164 + }, + { + "epoch": 2.26, + "grad_norm": 2.3601632237901673, + "learning_rate": 1.2015350900684556e-06, + "loss": 0.6087, + "step": 15165 + }, + { + "epoch": 2.26, + "grad_norm": 3.4387887529311, + "learning_rate": 1.2014404634864732e-06, + "loss": 0.6016, + "step": 15166 + }, + { + "epoch": 2.26, + "grad_norm": 2.5912735531603746, + "learning_rate": 1.2013458350244293e-06, + "loss": 0.6452, + "step": 15167 + }, + { + "epoch": 2.26, + "grad_norm": 2.3120231009802854, + "learning_rate": 1.2012512046832069e-06, + "loss": 0.5999, + "step": 15168 + }, + { + "epoch": 2.26, + "grad_norm": 2.406743235835118, + "learning_rate": 1.201156572463689e-06, + "loss": 0.6712, + "step": 15169 + }, + { + "epoch": 2.26, + "grad_norm": 3.6288820498464758, + "learning_rate": 1.2010619383667592e-06, + "loss": 0.668, + "step": 15170 + }, + { + "epoch": 2.26, + "grad_norm": 3.334192757431038, + "learning_rate": 1.2009673023933005e-06, + "loss": 0.6061, + "step": 15171 + }, + { + "epoch": 2.26, + "grad_norm": 3.039355686377249, + "learning_rate": 1.200872664544196e-06, + "loss": 0.6341, + "step": 15172 + }, + { + "epoch": 2.26, + "grad_norm": 2.4156435642093097, + "learning_rate": 1.2007780248203297e-06, + "loss": 0.6071, + "step": 15173 + }, + { + "epoch": 2.26, + "grad_norm": 3.99330179413182, + "learning_rate": 1.2006833832225837e-06, + "loss": 0.623, + "step": 15174 + }, + { + "epoch": 2.26, + "grad_norm": 2.878689940139553, + "learning_rate": 1.2005887397518424e-06, + "loss": 0.6712, + "step": 15175 + }, + { + "epoch": 2.26, + "grad_norm": 3.754331800687567, + "learning_rate": 1.2004940944089885e-06, + "loss": 0.6315, + "step": 15176 + }, + { + "epoch": 2.26, + "grad_norm": 3.1218672576263473, + "learning_rate": 1.2003994471949054e-06, + "loss": 0.6478, + "step": 15177 + }, + { + "epoch": 2.26, + "grad_norm": 2.8929386755860835, + "learning_rate": 1.2003047981104768e-06, + "loss": 0.5983, + "step": 15178 + }, + { + "epoch": 2.26, + "grad_norm": 2.6739256572096357, + "learning_rate": 1.2002101471565855e-06, + "loss": 0.6165, + "step": 15179 + }, + { + "epoch": 2.26, + "grad_norm": 3.8695644981873687, + "learning_rate": 1.2001154943341152e-06, + "loss": 0.6217, + "step": 15180 + }, + { + "epoch": 2.26, + "grad_norm": 2.557747435575275, + "learning_rate": 1.2000208396439494e-06, + "loss": 0.6178, + "step": 15181 + }, + { + "epoch": 2.26, + "grad_norm": 2.77615279879227, + "learning_rate": 1.1999261830869714e-06, + "loss": 0.6615, + "step": 15182 + }, + { + "epoch": 2.26, + "grad_norm": 3.218522623064391, + "learning_rate": 1.1998315246640647e-06, + "loss": 0.6758, + "step": 15183 + }, + { + "epoch": 2.26, + "grad_norm": 3.7314414476839524, + "learning_rate": 1.1997368643761127e-06, + "loss": 0.6576, + "step": 15184 + }, + { + "epoch": 2.26, + "grad_norm": 2.9492866564321627, + "learning_rate": 1.1996422022239986e-06, + "loss": 0.5977, + "step": 15185 + }, + { + "epoch": 2.26, + "grad_norm": 3.67490697887356, + "learning_rate": 1.1995475382086068e-06, + "loss": 0.64, + "step": 15186 + }, + { + "epoch": 2.27, + "grad_norm": 2.758097519343222, + "learning_rate": 1.1994528723308198e-06, + "loss": 0.668, + "step": 15187 + }, + { + "epoch": 2.27, + "grad_norm": 5.849846123074867, + "learning_rate": 1.1993582045915213e-06, + "loss": 0.6517, + "step": 15188 + }, + { + "epoch": 2.27, + "grad_norm": 2.747647280513092, + "learning_rate": 1.1992635349915956e-06, + "loss": 0.5996, + "step": 15189 + }, + { + "epoch": 2.27, + "grad_norm": 4.347355382547056, + "learning_rate": 1.1991688635319252e-06, + "loss": 0.651, + "step": 15190 + }, + { + "epoch": 2.27, + "grad_norm": 2.5564635257832604, + "learning_rate": 1.1990741902133943e-06, + "loss": 0.6159, + "step": 15191 + }, + { + "epoch": 2.27, + "grad_norm": 3.5349967398406514, + "learning_rate": 1.1989795150368864e-06, + "loss": 0.6432, + "step": 15192 + }, + { + "epoch": 2.27, + "grad_norm": 2.8759686541462277, + "learning_rate": 1.1988848380032851e-06, + "loss": 0.5853, + "step": 15193 + }, + { + "epoch": 2.27, + "grad_norm": 3.335255340362606, + "learning_rate": 1.198790159113474e-06, + "loss": 0.735, + "step": 15194 + }, + { + "epoch": 2.27, + "grad_norm": 2.7226565320405225, + "learning_rate": 1.1986954783683367e-06, + "loss": 0.6354, + "step": 15195 + }, + { + "epoch": 2.27, + "grad_norm": 3.1221246396999387, + "learning_rate": 1.1986007957687572e-06, + "loss": 0.651, + "step": 15196 + }, + { + "epoch": 2.27, + "grad_norm": 4.799865752252739, + "learning_rate": 1.1985061113156187e-06, + "loss": 0.6341, + "step": 15197 + }, + { + "epoch": 2.27, + "grad_norm": 4.437636435289581, + "learning_rate": 1.1984114250098051e-06, + "loss": 0.6777, + "step": 15198 + }, + { + "epoch": 2.27, + "grad_norm": 2.6693213378768874, + "learning_rate": 1.1983167368522002e-06, + "loss": 0.599, + "step": 15199 + }, + { + "epoch": 2.27, + "grad_norm": 5.1553038252943795, + "learning_rate": 1.1982220468436878e-06, + "loss": 0.6341, + "step": 15200 + }, + { + "epoch": 2.27, + "grad_norm": 2.948260688208051, + "learning_rate": 1.1981273549851511e-06, + "loss": 0.6361, + "step": 15201 + }, + { + "epoch": 2.27, + "grad_norm": 5.814395484904959, + "learning_rate": 1.1980326612774744e-06, + "loss": 0.6361, + "step": 15202 + }, + { + "epoch": 2.27, + "grad_norm": 4.8769763184434165, + "learning_rate": 1.1979379657215418e-06, + "loss": 0.5918, + "step": 15203 + }, + { + "epoch": 2.27, + "grad_norm": 4.308587168365515, + "learning_rate": 1.1978432683182362e-06, + "loss": 0.6133, + "step": 15204 + }, + { + "epoch": 2.27, + "grad_norm": 5.813579721128315, + "learning_rate": 1.1977485690684418e-06, + "loss": 0.6289, + "step": 15205 + }, + { + "epoch": 2.27, + "grad_norm": 3.1152054670884555, + "learning_rate": 1.1976538679730431e-06, + "loss": 0.6393, + "step": 15206 + }, + { + "epoch": 2.27, + "grad_norm": 2.6563358111565294, + "learning_rate": 1.1975591650329229e-06, + "loss": 0.6224, + "step": 15207 + }, + { + "epoch": 2.27, + "grad_norm": 3.2346904886142185, + "learning_rate": 1.1974644602489657e-06, + "loss": 0.6413, + "step": 15208 + }, + { + "epoch": 2.27, + "grad_norm": 2.858387967029551, + "learning_rate": 1.197369753622055e-06, + "loss": 0.6328, + "step": 15209 + }, + { + "epoch": 2.27, + "grad_norm": 3.2263241602528554, + "learning_rate": 1.1972750451530753e-06, + "loss": 0.6354, + "step": 15210 + }, + { + "epoch": 2.27, + "grad_norm": 3.3673174274595947, + "learning_rate": 1.1971803348429098e-06, + "loss": 0.6654, + "step": 15211 + }, + { + "epoch": 2.27, + "grad_norm": 2.7815535828192104, + "learning_rate": 1.1970856226924432e-06, + "loss": 0.666, + "step": 15212 + }, + { + "epoch": 2.27, + "grad_norm": 4.303695407467361, + "learning_rate": 1.1969909087025588e-06, + "loss": 0.6556, + "step": 15213 + }, + { + "epoch": 2.27, + "grad_norm": 2.992518105907452, + "learning_rate": 1.1968961928741406e-06, + "loss": 0.6289, + "step": 15214 + }, + { + "epoch": 2.27, + "grad_norm": 4.182609953750445, + "learning_rate": 1.196801475208073e-06, + "loss": 0.6471, + "step": 15215 + }, + { + "epoch": 2.27, + "grad_norm": 2.7821587663170173, + "learning_rate": 1.19670675570524e-06, + "loss": 0.6543, + "step": 15216 + }, + { + "epoch": 2.27, + "grad_norm": 4.8068582347477165, + "learning_rate": 1.196612034366525e-06, + "loss": 0.5964, + "step": 15217 + }, + { + "epoch": 2.27, + "grad_norm": 4.073448620640498, + "learning_rate": 1.196517311192813e-06, + "loss": 0.6341, + "step": 15218 + }, + { + "epoch": 2.27, + "grad_norm": 3.175520329184716, + "learning_rate": 1.1964225861849876e-06, + "loss": 0.6751, + "step": 15219 + }, + { + "epoch": 2.27, + "grad_norm": 4.962204940353816, + "learning_rate": 1.1963278593439325e-06, + "loss": 0.694, + "step": 15220 + }, + { + "epoch": 2.27, + "grad_norm": 2.9797780553637847, + "learning_rate": 1.1962331306705322e-06, + "loss": 0.6426, + "step": 15221 + }, + { + "epoch": 2.27, + "grad_norm": 7.00014784390881, + "learning_rate": 1.196138400165671e-06, + "loss": 0.6159, + "step": 15222 + }, + { + "epoch": 2.27, + "grad_norm": 2.972839734395516, + "learning_rate": 1.1960436678302327e-06, + "loss": 0.6797, + "step": 15223 + }, + { + "epoch": 2.27, + "grad_norm": 4.9757010340516565, + "learning_rate": 1.1959489336651013e-06, + "loss": 0.625, + "step": 15224 + }, + { + "epoch": 2.27, + "grad_norm": 4.366549185718354, + "learning_rate": 1.195854197671161e-06, + "loss": 0.6315, + "step": 15225 + }, + { + "epoch": 2.27, + "grad_norm": 3.255186792161503, + "learning_rate": 1.1957594598492967e-06, + "loss": 0.6217, + "step": 15226 + }, + { + "epoch": 2.27, + "grad_norm": 3.7101067043232336, + "learning_rate": 1.1956647202003914e-06, + "loss": 0.6348, + "step": 15227 + }, + { + "epoch": 2.27, + "grad_norm": 5.4214442282388555, + "learning_rate": 1.1955699787253304e-06, + "loss": 0.6797, + "step": 15228 + }, + { + "epoch": 2.27, + "grad_norm": 3.8076369555606413, + "learning_rate": 1.1954752354249976e-06, + "loss": 0.6335, + "step": 15229 + }, + { + "epoch": 2.27, + "grad_norm": 7.341640349540955, + "learning_rate": 1.1953804903002768e-06, + "loss": 0.6478, + "step": 15230 + }, + { + "epoch": 2.27, + "grad_norm": 3.0629830592459926, + "learning_rate": 1.1952857433520526e-06, + "loss": 0.6536, + "step": 15231 + }, + { + "epoch": 2.27, + "grad_norm": 3.6037242433332084, + "learning_rate": 1.1951909945812096e-06, + "loss": 0.6426, + "step": 15232 + }, + { + "epoch": 2.27, + "grad_norm": 2.766913213247672, + "learning_rate": 1.1950962439886315e-06, + "loss": 0.6699, + "step": 15233 + }, + { + "epoch": 2.27, + "grad_norm": 3.399900999984255, + "learning_rate": 1.1950014915752031e-06, + "loss": 0.6328, + "step": 15234 + }, + { + "epoch": 2.27, + "grad_norm": 4.086097865543194, + "learning_rate": 1.1949067373418083e-06, + "loss": 0.6191, + "step": 15235 + }, + { + "epoch": 2.27, + "grad_norm": 2.3375481277348444, + "learning_rate": 1.1948119812893318e-06, + "loss": 0.6263, + "step": 15236 + }, + { + "epoch": 2.27, + "grad_norm": 3.201622806870857, + "learning_rate": 1.194717223418658e-06, + "loss": 0.6257, + "step": 15237 + }, + { + "epoch": 2.27, + "grad_norm": 3.3548750538927927, + "learning_rate": 1.1946224637306708e-06, + "loss": 0.623, + "step": 15238 + }, + { + "epoch": 2.27, + "grad_norm": 3.2883317647606303, + "learning_rate": 1.1945277022262555e-06, + "loss": 0.6895, + "step": 15239 + }, + { + "epoch": 2.27, + "grad_norm": 3.178215119965613, + "learning_rate": 1.1944329389062954e-06, + "loss": 0.6497, + "step": 15240 + }, + { + "epoch": 2.27, + "grad_norm": 2.746648146495417, + "learning_rate": 1.1943381737716758e-06, + "loss": 0.7005, + "step": 15241 + }, + { + "epoch": 2.27, + "grad_norm": 4.735903972122468, + "learning_rate": 1.1942434068232807e-06, + "loss": 0.6048, + "step": 15242 + }, + { + "epoch": 2.27, + "grad_norm": 2.8724466454654713, + "learning_rate": 1.1941486380619947e-06, + "loss": 0.6204, + "step": 15243 + }, + { + "epoch": 2.27, + "grad_norm": 2.834583132813355, + "learning_rate": 1.1940538674887024e-06, + "loss": 0.6198, + "step": 15244 + }, + { + "epoch": 2.27, + "grad_norm": 2.655772992538555, + "learning_rate": 1.1939590951042878e-06, + "loss": 0.6484, + "step": 15245 + }, + { + "epoch": 2.27, + "grad_norm": 2.911540428355448, + "learning_rate": 1.1938643209096364e-06, + "loss": 0.6484, + "step": 15246 + }, + { + "epoch": 2.27, + "grad_norm": 2.630733391525835, + "learning_rate": 1.193769544905632e-06, + "loss": 0.6172, + "step": 15247 + }, + { + "epoch": 2.27, + "grad_norm": 4.799190010819467, + "learning_rate": 1.1936747670931587e-06, + "loss": 0.6439, + "step": 15248 + }, + { + "epoch": 2.27, + "grad_norm": 2.8300254831350737, + "learning_rate": 1.1935799874731024e-06, + "loss": 0.6413, + "step": 15249 + }, + { + "epoch": 2.27, + "grad_norm": 2.3940247489742164, + "learning_rate": 1.1934852060463467e-06, + "loss": 0.6341, + "step": 15250 + }, + { + "epoch": 2.27, + "grad_norm": 2.704972666189697, + "learning_rate": 1.1933904228137764e-06, + "loss": 0.6029, + "step": 15251 + }, + { + "epoch": 2.27, + "grad_norm": 2.7197058144880097, + "learning_rate": 1.1932956377762767e-06, + "loss": 0.707, + "step": 15252 + }, + { + "epoch": 2.27, + "grad_norm": 3.4373079073071806, + "learning_rate": 1.1932008509347311e-06, + "loss": 0.6855, + "step": 15253 + }, + { + "epoch": 2.28, + "grad_norm": 7.091177659742945, + "learning_rate": 1.193106062290025e-06, + "loss": 0.6406, + "step": 15254 + }, + { + "epoch": 2.28, + "grad_norm": 5.1661378215919775, + "learning_rate": 1.193011271843043e-06, + "loss": 0.6927, + "step": 15255 + }, + { + "epoch": 2.28, + "grad_norm": 3.6305578442318422, + "learning_rate": 1.1929164795946696e-06, + "loss": 0.6146, + "step": 15256 + }, + { + "epoch": 2.28, + "grad_norm": 2.581230184004895, + "learning_rate": 1.1928216855457897e-06, + "loss": 0.6055, + "step": 15257 + }, + { + "epoch": 2.28, + "grad_norm": 3.1441006793821016, + "learning_rate": 1.192726889697288e-06, + "loss": 0.6296, + "step": 15258 + }, + { + "epoch": 2.28, + "grad_norm": 3.815135157113227, + "learning_rate": 1.1926320920500492e-06, + "loss": 0.6432, + "step": 15259 + }, + { + "epoch": 2.28, + "grad_norm": 4.125397619948988, + "learning_rate": 1.1925372926049581e-06, + "loss": 0.6738, + "step": 15260 + }, + { + "epoch": 2.28, + "grad_norm": 6.004850708962287, + "learning_rate": 1.1924424913628995e-06, + "loss": 0.6419, + "step": 15261 + }, + { + "epoch": 2.28, + "grad_norm": 2.64253137449058, + "learning_rate": 1.192347688324758e-06, + "loss": 0.6706, + "step": 15262 + }, + { + "epoch": 2.28, + "grad_norm": 2.97798429032833, + "learning_rate": 1.1922528834914185e-06, + "loss": 0.6165, + "step": 15263 + }, + { + "epoch": 2.28, + "grad_norm": 4.872111831993428, + "learning_rate": 1.1921580768637656e-06, + "loss": 0.6348, + "step": 15264 + }, + { + "epoch": 2.28, + "grad_norm": 3.126925818790896, + "learning_rate": 1.1920632684426848e-06, + "loss": 0.6426, + "step": 15265 + }, + { + "epoch": 2.28, + "grad_norm": 5.909633347444344, + "learning_rate": 1.1919684582290603e-06, + "loss": 0.6953, + "step": 15266 + }, + { + "epoch": 2.28, + "grad_norm": 8.960953336937344, + "learning_rate": 1.191873646223777e-06, + "loss": 0.668, + "step": 15267 + }, + { + "epoch": 2.28, + "grad_norm": 3.0755361168377133, + "learning_rate": 1.1917788324277206e-06, + "loss": 0.6484, + "step": 15268 + }, + { + "epoch": 2.28, + "grad_norm": 3.159398379102945, + "learning_rate": 1.191684016841775e-06, + "loss": 0.6055, + "step": 15269 + }, + { + "epoch": 2.28, + "grad_norm": 2.9964780369184143, + "learning_rate": 1.1915891994668253e-06, + "loss": 0.6745, + "step": 15270 + }, + { + "epoch": 2.28, + "grad_norm": 4.652000565185574, + "learning_rate": 1.191494380303757e-06, + "loss": 0.6426, + "step": 15271 + }, + { + "epoch": 2.28, + "grad_norm": 2.5305625747138367, + "learning_rate": 1.1913995593534547e-06, + "loss": 0.64, + "step": 15272 + }, + { + "epoch": 2.28, + "grad_norm": 3.695738099020112, + "learning_rate": 1.1913047366168034e-06, + "loss": 0.6862, + "step": 15273 + }, + { + "epoch": 2.28, + "grad_norm": 2.5487817117636196, + "learning_rate": 1.191209912094688e-06, + "loss": 0.6283, + "step": 15274 + }, + { + "epoch": 2.28, + "grad_norm": 2.9334252410378925, + "learning_rate": 1.1911150857879934e-06, + "loss": 0.6387, + "step": 15275 + }, + { + "epoch": 2.28, + "grad_norm": 3.673309010108496, + "learning_rate": 1.191020257697605e-06, + "loss": 0.6667, + "step": 15276 + }, + { + "epoch": 2.28, + "grad_norm": 3.7478693429637437, + "learning_rate": 1.1909254278244073e-06, + "loss": 0.6387, + "step": 15277 + }, + { + "epoch": 2.28, + "grad_norm": 3.289894156366346, + "learning_rate": 1.1908305961692864e-06, + "loss": 0.6322, + "step": 15278 + }, + { + "epoch": 2.28, + "grad_norm": 3.43957211925086, + "learning_rate": 1.1907357627331263e-06, + "loss": 0.6725, + "step": 15279 + }, + { + "epoch": 2.28, + "grad_norm": 2.4532447407706854, + "learning_rate": 1.1906409275168123e-06, + "loss": 0.6198, + "step": 15280 + }, + { + "epoch": 2.28, + "grad_norm": 3.465377683445044, + "learning_rate": 1.19054609052123e-06, + "loss": 0.6354, + "step": 15281 + }, + { + "epoch": 2.28, + "grad_norm": 4.659678742690491, + "learning_rate": 1.1904512517472639e-06, + "loss": 0.6315, + "step": 15282 + }, + { + "epoch": 2.28, + "grad_norm": 2.746331127689491, + "learning_rate": 1.1903564111957993e-06, + "loss": 0.6777, + "step": 15283 + }, + { + "epoch": 2.28, + "grad_norm": 3.4636574797522166, + "learning_rate": 1.1902615688677217e-06, + "loss": 0.6445, + "step": 15284 + }, + { + "epoch": 2.28, + "grad_norm": 2.8521624473800657, + "learning_rate": 1.1901667247639162e-06, + "loss": 0.6354, + "step": 15285 + }, + { + "epoch": 2.28, + "grad_norm": 3.5066822034664535, + "learning_rate": 1.1900718788852675e-06, + "loss": 0.6803, + "step": 15286 + }, + { + "epoch": 2.28, + "grad_norm": 3.4697157968031074, + "learning_rate": 1.1899770312326614e-06, + "loss": 0.6471, + "step": 15287 + }, + { + "epoch": 2.28, + "grad_norm": 3.475270817403986, + "learning_rate": 1.1898821818069828e-06, + "loss": 0.5898, + "step": 15288 + }, + { + "epoch": 2.28, + "grad_norm": 3.3241181604763574, + "learning_rate": 1.189787330609117e-06, + "loss": 0.5768, + "step": 15289 + }, + { + "epoch": 2.28, + "grad_norm": 2.3684493830404487, + "learning_rate": 1.1896924776399492e-06, + "loss": 0.6393, + "step": 15290 + }, + { + "epoch": 2.28, + "grad_norm": 2.8066298649517223, + "learning_rate": 1.1895976229003645e-06, + "loss": 0.6243, + "step": 15291 + }, + { + "epoch": 2.28, + "grad_norm": 3.559674144949094, + "learning_rate": 1.189502766391249e-06, + "loss": 0.6647, + "step": 15292 + }, + { + "epoch": 2.28, + "grad_norm": 4.365362093310124, + "learning_rate": 1.189407908113487e-06, + "loss": 0.6686, + "step": 15293 + }, + { + "epoch": 2.28, + "grad_norm": 2.6982166246342776, + "learning_rate": 1.1893130480679645e-06, + "loss": 0.6217, + "step": 15294 + }, + { + "epoch": 2.28, + "grad_norm": 3.9791011633822997, + "learning_rate": 1.1892181862555667e-06, + "loss": 0.7012, + "step": 15295 + }, + { + "epoch": 2.28, + "grad_norm": 3.1946986487529907, + "learning_rate": 1.1891233226771784e-06, + "loss": 0.6426, + "step": 15296 + }, + { + "epoch": 2.28, + "grad_norm": 3.0115486791851573, + "learning_rate": 1.1890284573336854e-06, + "loss": 0.6829, + "step": 15297 + }, + { + "epoch": 2.28, + "grad_norm": 3.084448131754785, + "learning_rate": 1.1889335902259736e-06, + "loss": 0.6641, + "step": 15298 + }, + { + "epoch": 2.28, + "grad_norm": 5.040639015060798, + "learning_rate": 1.1888387213549273e-06, + "loss": 0.6484, + "step": 15299 + }, + { + "epoch": 2.28, + "grad_norm": 2.5224942832341597, + "learning_rate": 1.188743850721433e-06, + "loss": 0.6094, + "step": 15300 + }, + { + "epoch": 2.28, + "grad_norm": 3.0174482495537673, + "learning_rate": 1.188648978326375e-06, + "loss": 0.6107, + "step": 15301 + }, + { + "epoch": 2.28, + "grad_norm": 2.8006417404912347, + "learning_rate": 1.1885541041706404e-06, + "loss": 0.6387, + "step": 15302 + }, + { + "epoch": 2.28, + "grad_norm": 2.7561716066605735, + "learning_rate": 1.1884592282551128e-06, + "loss": 0.6595, + "step": 15303 + }, + { + "epoch": 2.28, + "grad_norm": 4.018750692387599, + "learning_rate": 1.1883643505806788e-06, + "loss": 0.6732, + "step": 15304 + }, + { + "epoch": 2.28, + "grad_norm": 3.1715276920801037, + "learning_rate": 1.188269471148224e-06, + "loss": 0.6335, + "step": 15305 + }, + { + "epoch": 2.28, + "grad_norm": 3.8860244284324486, + "learning_rate": 1.1881745899586333e-06, + "loss": 0.6602, + "step": 15306 + }, + { + "epoch": 2.28, + "grad_norm": 3.8092488272112908, + "learning_rate": 1.1880797070127926e-06, + "loss": 0.6673, + "step": 15307 + }, + { + "epoch": 2.28, + "grad_norm": 2.613616563619354, + "learning_rate": 1.187984822311587e-06, + "loss": 0.6357, + "step": 15308 + }, + { + "epoch": 2.28, + "grad_norm": 2.6306369681799926, + "learning_rate": 1.1878899358559028e-06, + "loss": 0.6569, + "step": 15309 + }, + { + "epoch": 2.28, + "grad_norm": 2.5924581064073133, + "learning_rate": 1.1877950476466252e-06, + "loss": 0.6139, + "step": 15310 + }, + { + "epoch": 2.28, + "grad_norm": 3.1924852864930364, + "learning_rate": 1.1877001576846395e-06, + "loss": 0.6393, + "step": 15311 + }, + { + "epoch": 2.28, + "grad_norm": 3.0554842365802544, + "learning_rate": 1.187605265970832e-06, + "loss": 0.6556, + "step": 15312 + }, + { + "epoch": 2.28, + "grad_norm": 2.4436107239282117, + "learning_rate": 1.1875103725060878e-06, + "loss": 0.6217, + "step": 15313 + }, + { + "epoch": 2.28, + "grad_norm": 2.6811459397760706, + "learning_rate": 1.1874154772912927e-06, + "loss": 0.6556, + "step": 15314 + }, + { + "epoch": 2.28, + "grad_norm": 2.8654668499969915, + "learning_rate": 1.1873205803273324e-06, + "loss": 0.6348, + "step": 15315 + }, + { + "epoch": 2.28, + "grad_norm": 2.646247282372318, + "learning_rate": 1.1872256816150928e-06, + "loss": 0.6387, + "step": 15316 + }, + { + "epoch": 2.28, + "grad_norm": 2.8249902320708435, + "learning_rate": 1.1871307811554587e-06, + "loss": 0.6465, + "step": 15317 + }, + { + "epoch": 2.28, + "grad_norm": 2.530946961572825, + "learning_rate": 1.1870358789493174e-06, + "loss": 0.6426, + "step": 15318 + }, + { + "epoch": 2.28, + "grad_norm": 3.292022148634445, + "learning_rate": 1.186940974997553e-06, + "loss": 0.6328, + "step": 15319 + }, + { + "epoch": 2.28, + "grad_norm": 3.3856658504233303, + "learning_rate": 1.186846069301052e-06, + "loss": 0.6038, + "step": 15320 + }, + { + "epoch": 2.29, + "grad_norm": 3.114017307336941, + "learning_rate": 1.1867511618607002e-06, + "loss": 0.6641, + "step": 15321 + }, + { + "epoch": 2.29, + "grad_norm": 3.6919861347519367, + "learning_rate": 1.1866562526773836e-06, + "loss": 0.6868, + "step": 15322 + }, + { + "epoch": 2.29, + "grad_norm": 3.7150638982505697, + "learning_rate": 1.1865613417519872e-06, + "loss": 0.6471, + "step": 15323 + }, + { + "epoch": 2.29, + "grad_norm": 4.135790671411815, + "learning_rate": 1.1864664290853977e-06, + "loss": 0.6198, + "step": 15324 + }, + { + "epoch": 2.29, + "grad_norm": 3.924749911334911, + "learning_rate": 1.1863715146785003e-06, + "loss": 0.6471, + "step": 15325 + }, + { + "epoch": 2.29, + "grad_norm": 3.194978402250479, + "learning_rate": 1.1862765985321813e-06, + "loss": 0.6517, + "step": 15326 + }, + { + "epoch": 2.29, + "grad_norm": 2.5468015321582675, + "learning_rate": 1.1861816806473259e-06, + "loss": 0.6315, + "step": 15327 + }, + { + "epoch": 2.29, + "grad_norm": 5.333241547070015, + "learning_rate": 1.1860867610248207e-06, + "loss": 0.6227, + "step": 15328 + }, + { + "epoch": 2.29, + "grad_norm": 5.223531866688676, + "learning_rate": 1.1859918396655514e-06, + "loss": 0.612, + "step": 15329 + }, + { + "epoch": 2.29, + "grad_norm": 4.014945103757636, + "learning_rate": 1.1858969165704036e-06, + "loss": 0.6328, + "step": 15330 + }, + { + "epoch": 2.29, + "grad_norm": 2.7858884916516793, + "learning_rate": 1.1858019917402635e-06, + "loss": 0.6211, + "step": 15331 + }, + { + "epoch": 2.29, + "grad_norm": 2.6918263171431818, + "learning_rate": 1.185707065176017e-06, + "loss": 0.6237, + "step": 15332 + }, + { + "epoch": 2.29, + "grad_norm": 3.1612155414833367, + "learning_rate": 1.1856121368785499e-06, + "loss": 0.6445, + "step": 15333 + }, + { + "epoch": 2.29, + "grad_norm": 3.5009692060980484, + "learning_rate": 1.1855172068487487e-06, + "loss": 0.6465, + "step": 15334 + }, + { + "epoch": 2.29, + "grad_norm": 3.258195465194532, + "learning_rate": 1.1854222750874986e-06, + "loss": 0.6777, + "step": 15335 + }, + { + "epoch": 2.29, + "grad_norm": 2.912982876162719, + "learning_rate": 1.1853273415956862e-06, + "loss": 0.6595, + "step": 15336 + }, + { + "epoch": 2.29, + "grad_norm": 3.282978408926545, + "learning_rate": 1.1852324063741973e-06, + "loss": 0.6413, + "step": 15337 + }, + { + "epoch": 2.29, + "grad_norm": 3.6002560372724703, + "learning_rate": 1.185137469423918e-06, + "loss": 0.6074, + "step": 15338 + }, + { + "epoch": 2.29, + "grad_norm": 3.2943744999203624, + "learning_rate": 1.1850425307457344e-06, + "loss": 0.6217, + "step": 15339 + }, + { + "epoch": 2.29, + "grad_norm": 3.146761422414615, + "learning_rate": 1.1849475903405322e-06, + "loss": 0.5885, + "step": 15340 + }, + { + "epoch": 2.29, + "grad_norm": 4.246137536774571, + "learning_rate": 1.1848526482091981e-06, + "loss": 0.5951, + "step": 15341 + }, + { + "epoch": 2.29, + "grad_norm": 5.68543344424926, + "learning_rate": 1.1847577043526179e-06, + "loss": 0.666, + "step": 15342 + }, + { + "epoch": 2.29, + "grad_norm": 3.8776134381632126, + "learning_rate": 1.1846627587716774e-06, + "loss": 0.7142, + "step": 15343 + }, + { + "epoch": 2.29, + "grad_norm": 3.6641026895826414, + "learning_rate": 1.184567811467263e-06, + "loss": 0.6868, + "step": 15344 + }, + { + "epoch": 2.29, + "grad_norm": 8.55902418471175, + "learning_rate": 1.1844728624402613e-06, + "loss": 0.6543, + "step": 15345 + }, + { + "epoch": 2.29, + "grad_norm": 4.414726489290211, + "learning_rate": 1.1843779116915578e-06, + "loss": 0.7018, + "step": 15346 + }, + { + "epoch": 2.29, + "grad_norm": 3.019146428733983, + "learning_rate": 1.1842829592220388e-06, + "loss": 0.6992, + "step": 15347 + }, + { + "epoch": 2.29, + "grad_norm": 2.716288820714218, + "learning_rate": 1.184188005032591e-06, + "loss": 0.6393, + "step": 15348 + }, + { + "epoch": 2.29, + "grad_norm": 2.7091253404153557, + "learning_rate": 1.1840930491241002e-06, + "loss": 0.6029, + "step": 15349 + }, + { + "epoch": 2.29, + "grad_norm": 3.0263634614648782, + "learning_rate": 1.1839980914974525e-06, + "loss": 0.6615, + "step": 15350 + }, + { + "epoch": 2.29, + "grad_norm": 3.1177137471636707, + "learning_rate": 1.1839031321535344e-06, + "loss": 0.6829, + "step": 15351 + }, + { + "epoch": 2.29, + "grad_norm": 2.710087097613653, + "learning_rate": 1.1838081710932322e-06, + "loss": 0.6302, + "step": 15352 + }, + { + "epoch": 2.29, + "grad_norm": 2.781504549307216, + "learning_rate": 1.183713208317432e-06, + "loss": 0.6439, + "step": 15353 + }, + { + "epoch": 2.29, + "grad_norm": 4.282174041144426, + "learning_rate": 1.18361824382702e-06, + "loss": 0.6615, + "step": 15354 + }, + { + "epoch": 2.29, + "grad_norm": 3.0499957529420643, + "learning_rate": 1.1835232776228833e-06, + "loss": 0.6452, + "step": 15355 + }, + { + "epoch": 2.29, + "grad_norm": 4.867569584947434, + "learning_rate": 1.183428309705907e-06, + "loss": 0.6888, + "step": 15356 + }, + { + "epoch": 2.29, + "grad_norm": 3.015541822897121, + "learning_rate": 1.1833333400769781e-06, + "loss": 0.5837, + "step": 15357 + }, + { + "epoch": 2.29, + "grad_norm": 3.0214348055265057, + "learning_rate": 1.1832383687369833e-06, + "loss": 0.6296, + "step": 15358 + }, + { + "epoch": 2.29, + "grad_norm": 4.661427006823941, + "learning_rate": 1.1831433956868085e-06, + "loss": 0.6439, + "step": 15359 + }, + { + "epoch": 2.29, + "grad_norm": 4.818658151249231, + "learning_rate": 1.1830484209273402e-06, + "loss": 0.6549, + "step": 15360 + }, + { + "epoch": 2.29, + "grad_norm": 2.4477702485054103, + "learning_rate": 1.1829534444594646e-06, + "loss": 0.6354, + "step": 15361 + }, + { + "epoch": 2.29, + "grad_norm": 6.309964886226435, + "learning_rate": 1.1828584662840684e-06, + "loss": 0.6432, + "step": 15362 + }, + { + "epoch": 2.29, + "grad_norm": 3.570371419861426, + "learning_rate": 1.1827634864020379e-06, + "loss": 0.5977, + "step": 15363 + }, + { + "epoch": 2.29, + "grad_norm": 5.997444859568011, + "learning_rate": 1.1826685048142598e-06, + "loss": 0.6289, + "step": 15364 + }, + { + "epoch": 2.29, + "grad_norm": 3.29931093401615, + "learning_rate": 1.1825735215216201e-06, + "loss": 0.6602, + "step": 15365 + }, + { + "epoch": 2.29, + "grad_norm": 2.836716885895707, + "learning_rate": 1.1824785365250058e-06, + "loss": 0.6465, + "step": 15366 + }, + { + "epoch": 2.29, + "grad_norm": 3.1218756070407374, + "learning_rate": 1.1823835498253032e-06, + "loss": 0.6686, + "step": 15367 + }, + { + "epoch": 2.29, + "grad_norm": 5.874276464453335, + "learning_rate": 1.1822885614233987e-06, + "loss": 0.6816, + "step": 15368 + }, + { + "epoch": 2.29, + "grad_norm": 6.902337748410941, + "learning_rate": 1.1821935713201788e-06, + "loss": 0.7174, + "step": 15369 + }, + { + "epoch": 2.29, + "grad_norm": 3.0105352399090113, + "learning_rate": 1.1820985795165305e-06, + "loss": 0.6628, + "step": 15370 + }, + { + "epoch": 2.29, + "grad_norm": 6.799473507269287, + "learning_rate": 1.18200358601334e-06, + "loss": 0.6387, + "step": 15371 + }, + { + "epoch": 2.29, + "grad_norm": 5.186913563838606, + "learning_rate": 1.1819085908114939e-06, + "loss": 0.6986, + "step": 15372 + }, + { + "epoch": 2.29, + "grad_norm": 3.0357671733449334, + "learning_rate": 1.181813593911879e-06, + "loss": 0.6188, + "step": 15373 + }, + { + "epoch": 2.29, + "grad_norm": 2.8754977182359007, + "learning_rate": 1.1817185953153813e-06, + "loss": 0.6302, + "step": 15374 + }, + { + "epoch": 2.29, + "grad_norm": 2.9055020298622396, + "learning_rate": 1.181623595022888e-06, + "loss": 0.6686, + "step": 15375 + }, + { + "epoch": 2.29, + "grad_norm": 3.315294194602545, + "learning_rate": 1.181528593035286e-06, + "loss": 0.6484, + "step": 15376 + }, + { + "epoch": 2.29, + "grad_norm": 2.454353245784931, + "learning_rate": 1.1814335893534614e-06, + "loss": 0.639, + "step": 15377 + }, + { + "epoch": 2.29, + "grad_norm": 2.9333042172137476, + "learning_rate": 1.1813385839783007e-06, + "loss": 0.6536, + "step": 15378 + }, + { + "epoch": 2.29, + "grad_norm": 3.832360709348862, + "learning_rate": 1.1812435769106914e-06, + "loss": 0.6211, + "step": 15379 + }, + { + "epoch": 2.29, + "grad_norm": 2.574916911103472, + "learning_rate": 1.1811485681515196e-06, + "loss": 0.6816, + "step": 15380 + }, + { + "epoch": 2.29, + "grad_norm": 2.7446872752612337, + "learning_rate": 1.1810535577016723e-06, + "loss": 0.6777, + "step": 15381 + }, + { + "epoch": 2.29, + "grad_norm": 3.0083065596611562, + "learning_rate": 1.180958545562036e-06, + "loss": 0.6654, + "step": 15382 + }, + { + "epoch": 2.29, + "grad_norm": 2.8683058216562713, + "learning_rate": 1.1808635317334977e-06, + "loss": 0.653, + "step": 15383 + }, + { + "epoch": 2.29, + "grad_norm": 2.8424009070649974, + "learning_rate": 1.1807685162169443e-06, + "loss": 0.6738, + "step": 15384 + }, + { + "epoch": 2.29, + "grad_norm": 3.0372284978722615, + "learning_rate": 1.1806734990132619e-06, + "loss": 0.6855, + "step": 15385 + }, + { + "epoch": 2.29, + "grad_norm": 2.2786440424281036, + "learning_rate": 1.1805784801233376e-06, + "loss": 0.6471, + "step": 15386 + }, + { + "epoch": 2.29, + "grad_norm": 5.2932168318700095, + "learning_rate": 1.180483459548059e-06, + "loss": 0.6191, + "step": 15387 + }, + { + "epoch": 2.3, + "grad_norm": 2.464488084255283, + "learning_rate": 1.1803884372883118e-06, + "loss": 0.6523, + "step": 15388 + }, + { + "epoch": 2.3, + "grad_norm": 2.9137432748834056, + "learning_rate": 1.1802934133449837e-06, + "loss": 0.627, + "step": 15389 + }, + { + "epoch": 2.3, + "grad_norm": 2.782125624349598, + "learning_rate": 1.180198387718961e-06, + "loss": 0.638, + "step": 15390 + }, + { + "epoch": 2.3, + "grad_norm": 3.8080451065056216, + "learning_rate": 1.180103360411131e-06, + "loss": 0.6263, + "step": 15391 + }, + { + "epoch": 2.3, + "grad_norm": 4.437352617599294, + "learning_rate": 1.1800083314223804e-06, + "loss": 0.6849, + "step": 15392 + }, + { + "epoch": 2.3, + "grad_norm": 4.205143590614912, + "learning_rate": 1.179913300753596e-06, + "loss": 0.6387, + "step": 15393 + }, + { + "epoch": 2.3, + "grad_norm": 2.916474168807981, + "learning_rate": 1.1798182684056647e-06, + "loss": 0.668, + "step": 15394 + }, + { + "epoch": 2.3, + "grad_norm": 2.2307830232053036, + "learning_rate": 1.179723234379474e-06, + "loss": 0.5996, + "step": 15395 + }, + { + "epoch": 2.3, + "grad_norm": 5.367972674590326, + "learning_rate": 1.17962819867591e-06, + "loss": 0.6973, + "step": 15396 + }, + { + "epoch": 2.3, + "grad_norm": 3.5595770284115105, + "learning_rate": 1.1795331612958606e-06, + "loss": 0.6309, + "step": 15397 + }, + { + "epoch": 2.3, + "grad_norm": 3.6950754499905885, + "learning_rate": 1.179438122240212e-06, + "loss": 0.6602, + "step": 15398 + }, + { + "epoch": 2.3, + "grad_norm": 2.3828844266695404, + "learning_rate": 1.1793430815098518e-06, + "loss": 0.6602, + "step": 15399 + }, + { + "epoch": 2.3, + "grad_norm": 4.364764235659605, + "learning_rate": 1.1792480391056665e-06, + "loss": 0.6439, + "step": 15400 + }, + { + "epoch": 2.3, + "grad_norm": 3.002878668054556, + "learning_rate": 1.1791529950285437e-06, + "loss": 0.6471, + "step": 15401 + }, + { + "epoch": 2.3, + "grad_norm": 2.23352796075945, + "learning_rate": 1.1790579492793698e-06, + "loss": 0.6243, + "step": 15402 + }, + { + "epoch": 2.3, + "grad_norm": 2.3025937402971444, + "learning_rate": 1.1789629018590323e-06, + "loss": 0.6569, + "step": 15403 + }, + { + "epoch": 2.3, + "grad_norm": 2.84328866884314, + "learning_rate": 1.1788678527684187e-06, + "loss": 0.6699, + "step": 15404 + }, + { + "epoch": 2.3, + "grad_norm": 2.687537262945748, + "learning_rate": 1.1787728020084151e-06, + "loss": 0.6374, + "step": 15405 + }, + { + "epoch": 2.3, + "grad_norm": 3.1824994912515465, + "learning_rate": 1.1786777495799091e-06, + "loss": 0.668, + "step": 15406 + }, + { + "epoch": 2.3, + "grad_norm": 2.513836333850433, + "learning_rate": 1.178582695483788e-06, + "loss": 0.6465, + "step": 15407 + }, + { + "epoch": 2.3, + "grad_norm": 2.988531636818401, + "learning_rate": 1.178487639720939e-06, + "loss": 0.6165, + "step": 15408 + }, + { + "epoch": 2.3, + "grad_norm": 3.403542112654442, + "learning_rate": 1.1783925822922486e-06, + "loss": 0.6771, + "step": 15409 + }, + { + "epoch": 2.3, + "grad_norm": 3.01178822151872, + "learning_rate": 1.178297523198605e-06, + "loss": 0.6504, + "step": 15410 + }, + { + "epoch": 2.3, + "grad_norm": 3.1033063046621012, + "learning_rate": 1.1782024624408947e-06, + "loss": 0.6126, + "step": 15411 + }, + { + "epoch": 2.3, + "grad_norm": 2.9298033042823177, + "learning_rate": 1.1781074000200045e-06, + "loss": 0.6263, + "step": 15412 + }, + { + "epoch": 2.3, + "grad_norm": 3.753801257536937, + "learning_rate": 1.1780123359368228e-06, + "loss": 0.6589, + "step": 15413 + }, + { + "epoch": 2.3, + "grad_norm": 4.285497506059499, + "learning_rate": 1.1779172701922362e-06, + "loss": 0.6419, + "step": 15414 + }, + { + "epoch": 2.3, + "grad_norm": 3.04635384870484, + "learning_rate": 1.1778222027871317e-06, + "loss": 0.6641, + "step": 15415 + }, + { + "epoch": 2.3, + "grad_norm": 2.8725919902389436, + "learning_rate": 1.1777271337223972e-06, + "loss": 0.6361, + "step": 15416 + }, + { + "epoch": 2.3, + "grad_norm": 2.7073717401935706, + "learning_rate": 1.1776320629989193e-06, + "loss": 0.6133, + "step": 15417 + }, + { + "epoch": 2.3, + "grad_norm": 3.8855323651212457, + "learning_rate": 1.1775369906175858e-06, + "loss": 0.6849, + "step": 15418 + }, + { + "epoch": 2.3, + "grad_norm": 4.952697693072487, + "learning_rate": 1.1774419165792839e-06, + "loss": 0.6634, + "step": 15419 + }, + { + "epoch": 2.3, + "grad_norm": 2.993016688956522, + "learning_rate": 1.1773468408849008e-06, + "loss": 0.6562, + "step": 15420 + }, + { + "epoch": 2.3, + "grad_norm": 3.5774171985089676, + "learning_rate": 1.1772517635353242e-06, + "loss": 0.6478, + "step": 15421 + }, + { + "epoch": 2.3, + "grad_norm": 5.294351929014379, + "learning_rate": 1.1771566845314409e-06, + "loss": 0.6426, + "step": 15422 + }, + { + "epoch": 2.3, + "grad_norm": 3.5999585534949476, + "learning_rate": 1.1770616038741385e-06, + "loss": 0.6471, + "step": 15423 + }, + { + "epoch": 2.3, + "grad_norm": 3.5877800973572, + "learning_rate": 1.176966521564305e-06, + "loss": 0.5944, + "step": 15424 + }, + { + "epoch": 2.3, + "grad_norm": 4.0322794995064415, + "learning_rate": 1.1768714376028267e-06, + "loss": 0.6517, + "step": 15425 + }, + { + "epoch": 2.3, + "grad_norm": 2.907867429032107, + "learning_rate": 1.1767763519905923e-06, + "loss": 0.6217, + "step": 15426 + }, + { + "epoch": 2.3, + "grad_norm": 2.54199149027121, + "learning_rate": 1.1766812647284883e-06, + "loss": 0.6328, + "step": 15427 + }, + { + "epoch": 2.3, + "grad_norm": 2.9065285286687748, + "learning_rate": 1.176586175817402e-06, + "loss": 0.6523, + "step": 15428 + }, + { + "epoch": 2.3, + "grad_norm": 5.7857983718681005, + "learning_rate": 1.1764910852582216e-06, + "loss": 0.6725, + "step": 15429 + }, + { + "epoch": 2.3, + "grad_norm": 2.8814630141314286, + "learning_rate": 1.1763959930518344e-06, + "loss": 0.6628, + "step": 15430 + }, + { + "epoch": 2.3, + "grad_norm": 2.8826969138044194, + "learning_rate": 1.176300899199128e-06, + "loss": 0.6536, + "step": 15431 + }, + { + "epoch": 2.3, + "grad_norm": 6.671325362038667, + "learning_rate": 1.1762058037009895e-06, + "loss": 0.6393, + "step": 15432 + }, + { + "epoch": 2.3, + "grad_norm": 3.832396643313422, + "learning_rate": 1.1761107065583062e-06, + "loss": 0.6393, + "step": 15433 + }, + { + "epoch": 2.3, + "grad_norm": 3.2347380690248917, + "learning_rate": 1.176015607771967e-06, + "loss": 0.6165, + "step": 15434 + }, + { + "epoch": 2.3, + "grad_norm": 2.8746566808696707, + "learning_rate": 1.175920507342858e-06, + "loss": 0.5885, + "step": 15435 + }, + { + "epoch": 2.3, + "grad_norm": 2.7039424680148922, + "learning_rate": 1.1758254052718673e-06, + "loss": 0.597, + "step": 15436 + }, + { + "epoch": 2.3, + "grad_norm": 3.0287702727245787, + "learning_rate": 1.1757303015598828e-06, + "loss": 0.5892, + "step": 15437 + }, + { + "epoch": 2.3, + "grad_norm": 4.1159395395947005, + "learning_rate": 1.1756351962077916e-06, + "loss": 0.6328, + "step": 15438 + }, + { + "epoch": 2.3, + "grad_norm": 4.042057127649672, + "learning_rate": 1.1755400892164818e-06, + "loss": 0.6367, + "step": 15439 + }, + { + "epoch": 2.3, + "grad_norm": 3.91018962026587, + "learning_rate": 1.1754449805868406e-06, + "loss": 0.6348, + "step": 15440 + }, + { + "epoch": 2.3, + "grad_norm": 3.7631203924966545, + "learning_rate": 1.175349870319756e-06, + "loss": 0.6159, + "step": 15441 + }, + { + "epoch": 2.3, + "grad_norm": 2.760893129383927, + "learning_rate": 1.1752547584161157e-06, + "loss": 0.6328, + "step": 15442 + }, + { + "epoch": 2.3, + "grad_norm": 4.0022891084486565, + "learning_rate": 1.175159644876807e-06, + "loss": 0.6595, + "step": 15443 + }, + { + "epoch": 2.3, + "grad_norm": 2.8217458880196804, + "learning_rate": 1.175064529702718e-06, + "loss": 0.6048, + "step": 15444 + }, + { + "epoch": 2.3, + "grad_norm": 3.132607355803799, + "learning_rate": 1.1749694128947363e-06, + "loss": 0.6243, + "step": 15445 + }, + { + "epoch": 2.3, + "grad_norm": 3.375846434958951, + "learning_rate": 1.1748742944537493e-06, + "loss": 0.6647, + "step": 15446 + }, + { + "epoch": 2.3, + "grad_norm": 3.0564460702549834, + "learning_rate": 1.1747791743806454e-06, + "loss": 0.6367, + "step": 15447 + }, + { + "epoch": 2.3, + "grad_norm": 3.2609243865723143, + "learning_rate": 1.1746840526763117e-06, + "loss": 0.6201, + "step": 15448 + }, + { + "epoch": 2.3, + "grad_norm": 4.258018316293525, + "learning_rate": 1.1745889293416365e-06, + "loss": 0.6497, + "step": 15449 + }, + { + "epoch": 2.3, + "grad_norm": 7.3384193611253945, + "learning_rate": 1.1744938043775077e-06, + "loss": 0.6543, + "step": 15450 + }, + { + "epoch": 2.3, + "grad_norm": 3.321514430176881, + "learning_rate": 1.1743986777848123e-06, + "loss": 0.6276, + "step": 15451 + }, + { + "epoch": 2.3, + "grad_norm": 3.4428253017393633, + "learning_rate": 1.1743035495644384e-06, + "loss": 0.6562, + "step": 15452 + }, + { + "epoch": 2.3, + "grad_norm": 2.9541424558552647, + "learning_rate": 1.1742084197172746e-06, + "loss": 0.6647, + "step": 15453 + }, + { + "epoch": 2.3, + "grad_norm": 5.196268565928766, + "learning_rate": 1.174113288244208e-06, + "loss": 0.6947, + "step": 15454 + }, + { + "epoch": 2.3, + "grad_norm": 3.484118370589236, + "learning_rate": 1.1740181551461267e-06, + "loss": 0.6328, + "step": 15455 + }, + { + "epoch": 2.31, + "grad_norm": 4.230788896436972, + "learning_rate": 1.1739230204239186e-06, + "loss": 0.6432, + "step": 15456 + }, + { + "epoch": 2.31, + "grad_norm": 4.106460364882983, + "learning_rate": 1.1738278840784715e-06, + "loss": 0.6973, + "step": 15457 + }, + { + "epoch": 2.31, + "grad_norm": 2.743346797243414, + "learning_rate": 1.1737327461106734e-06, + "loss": 0.61, + "step": 15458 + }, + { + "epoch": 2.31, + "grad_norm": 2.724314403310257, + "learning_rate": 1.1736376065214122e-06, + "loss": 0.6426, + "step": 15459 + }, + { + "epoch": 2.31, + "grad_norm": 2.882968850265827, + "learning_rate": 1.173542465311576e-06, + "loss": 0.6113, + "step": 15460 + }, + { + "epoch": 2.31, + "grad_norm": 2.927350889840967, + "learning_rate": 1.1734473224820524e-06, + "loss": 0.6732, + "step": 15461 + }, + { + "epoch": 2.31, + "grad_norm": 3.176851103848874, + "learning_rate": 1.1733521780337297e-06, + "loss": 0.6484, + "step": 15462 + }, + { + "epoch": 2.31, + "grad_norm": 4.693521513836052, + "learning_rate": 1.173257031967496e-06, + "loss": 0.6335, + "step": 15463 + }, + { + "epoch": 2.31, + "grad_norm": 2.9590005769383567, + "learning_rate": 1.1731618842842387e-06, + "loss": 0.6338, + "step": 15464 + }, + { + "epoch": 2.31, + "grad_norm": 2.8308126867074184, + "learning_rate": 1.1730667349848463e-06, + "loss": 0.6901, + "step": 15465 + }, + { + "epoch": 2.31, + "grad_norm": 4.48393011368664, + "learning_rate": 1.1729715840702073e-06, + "loss": 0.6257, + "step": 15466 + }, + { + "epoch": 2.31, + "grad_norm": 2.9455679699142525, + "learning_rate": 1.1728764315412087e-06, + "loss": 0.6406, + "step": 15467 + }, + { + "epoch": 2.31, + "grad_norm": 5.161026882708681, + "learning_rate": 1.172781277398739e-06, + "loss": 0.6165, + "step": 15468 + }, + { + "epoch": 2.31, + "grad_norm": 2.56991419860545, + "learning_rate": 1.1726861216436868e-06, + "loss": 0.6309, + "step": 15469 + }, + { + "epoch": 2.31, + "grad_norm": 5.389968274317099, + "learning_rate": 1.1725909642769394e-06, + "loss": 0.6751, + "step": 15470 + }, + { + "epoch": 2.31, + "grad_norm": 3.0006586290534876, + "learning_rate": 1.1724958052993854e-06, + "loss": 0.653, + "step": 15471 + }, + { + "epoch": 2.31, + "grad_norm": 4.0287740796383815, + "learning_rate": 1.1724006447119126e-06, + "loss": 0.64, + "step": 15472 + }, + { + "epoch": 2.31, + "grad_norm": 3.4435234233445846, + "learning_rate": 1.1723054825154093e-06, + "loss": 0.6523, + "step": 15473 + }, + { + "epoch": 2.31, + "grad_norm": 4.751887971973845, + "learning_rate": 1.1722103187107641e-06, + "loss": 0.6296, + "step": 15474 + }, + { + "epoch": 2.31, + "grad_norm": 3.0090017902107324, + "learning_rate": 1.1721151532988645e-06, + "loss": 0.6224, + "step": 15475 + }, + { + "epoch": 2.31, + "grad_norm": 2.8806217625694495, + "learning_rate": 1.172019986280599e-06, + "loss": 0.6074, + "step": 15476 + }, + { + "epoch": 2.31, + "grad_norm": 3.414870966663561, + "learning_rate": 1.171924817656856e-06, + "loss": 0.6081, + "step": 15477 + }, + { + "epoch": 2.31, + "grad_norm": 4.3629909844766335, + "learning_rate": 1.1718296474285229e-06, + "loss": 0.6309, + "step": 15478 + }, + { + "epoch": 2.31, + "grad_norm": 2.8924512222840546, + "learning_rate": 1.171734475596489e-06, + "loss": 0.6556, + "step": 15479 + }, + { + "epoch": 2.31, + "grad_norm": 2.6919058259567072, + "learning_rate": 1.171639302161642e-06, + "loss": 0.6439, + "step": 15480 + }, + { + "epoch": 2.31, + "grad_norm": 2.7715152631485616, + "learning_rate": 1.1715441271248698e-06, + "loss": 0.651, + "step": 15481 + }, + { + "epoch": 2.31, + "grad_norm": 3.049036182715633, + "learning_rate": 1.1714489504870615e-06, + "loss": 0.6647, + "step": 15482 + }, + { + "epoch": 2.31, + "grad_norm": 3.472514428800762, + "learning_rate": 1.171353772249105e-06, + "loss": 0.6712, + "step": 15483 + }, + { + "epoch": 2.31, + "grad_norm": 3.2189506839286777, + "learning_rate": 1.1712585924118882e-06, + "loss": 0.6393, + "step": 15484 + }, + { + "epoch": 2.31, + "grad_norm": 2.9738768047646933, + "learning_rate": 1.1711634109763003e-06, + "loss": 0.651, + "step": 15485 + }, + { + "epoch": 2.31, + "grad_norm": 3.573603000966073, + "learning_rate": 1.1710682279432287e-06, + "loss": 0.6354, + "step": 15486 + }, + { + "epoch": 2.31, + "grad_norm": 3.8074454794794246, + "learning_rate": 1.1709730433135627e-06, + "loss": 0.6517, + "step": 15487 + }, + { + "epoch": 2.31, + "grad_norm": 2.922672808804402, + "learning_rate": 1.1708778570881898e-06, + "loss": 0.5794, + "step": 15488 + }, + { + "epoch": 2.31, + "grad_norm": 6.3444267193281645, + "learning_rate": 1.1707826692679987e-06, + "loss": 0.5918, + "step": 15489 + }, + { + "epoch": 2.31, + "grad_norm": 3.9984370756799565, + "learning_rate": 1.1706874798538783e-06, + "loss": 0.6152, + "step": 15490 + }, + { + "epoch": 2.31, + "grad_norm": 3.4442963800713957, + "learning_rate": 1.1705922888467162e-06, + "loss": 0.5999, + "step": 15491 + }, + { + "epoch": 2.31, + "grad_norm": 3.2310296751556047, + "learning_rate": 1.1704970962474013e-06, + "loss": 0.5931, + "step": 15492 + }, + { + "epoch": 2.31, + "grad_norm": 3.2367750855569475, + "learning_rate": 1.1704019020568217e-06, + "loss": 0.6693, + "step": 15493 + }, + { + "epoch": 2.31, + "grad_norm": 3.0775494656014493, + "learning_rate": 1.1703067062758662e-06, + "loss": 0.6855, + "step": 15494 + }, + { + "epoch": 2.31, + "grad_norm": 4.031575081423354, + "learning_rate": 1.1702115089054232e-06, + "loss": 0.6589, + "step": 15495 + }, + { + "epoch": 2.31, + "grad_norm": 4.045830479421231, + "learning_rate": 1.1701163099463808e-06, + "loss": 0.6257, + "step": 15496 + }, + { + "epoch": 2.31, + "grad_norm": 4.726510828556007, + "learning_rate": 1.1700211093996285e-06, + "loss": 0.6536, + "step": 15497 + }, + { + "epoch": 2.31, + "grad_norm": 3.140346782384717, + "learning_rate": 1.1699259072660538e-06, + "loss": 0.6257, + "step": 15498 + }, + { + "epoch": 2.31, + "grad_norm": 5.337720149252819, + "learning_rate": 1.1698307035465454e-06, + "loss": 0.6693, + "step": 15499 + }, + { + "epoch": 2.31, + "grad_norm": 3.246048175439891, + "learning_rate": 1.1697354982419925e-06, + "loss": 0.6693, + "step": 15500 + }, + { + "epoch": 2.31, + "grad_norm": 3.6059651655418135, + "learning_rate": 1.1696402913532825e-06, + "loss": 0.6621, + "step": 15501 + }, + { + "epoch": 2.31, + "grad_norm": 3.675183040373272, + "learning_rate": 1.169545082881305e-06, + "loss": 0.5951, + "step": 15502 + }, + { + "epoch": 2.31, + "grad_norm": 3.6616752976925437, + "learning_rate": 1.1694498728269486e-06, + "loss": 0.7025, + "step": 15503 + }, + { + "epoch": 2.31, + "grad_norm": 3.501290624343809, + "learning_rate": 1.1693546611911012e-06, + "loss": 0.64, + "step": 15504 + }, + { + "epoch": 2.31, + "grad_norm": 3.7217990003250576, + "learning_rate": 1.1692594479746517e-06, + "loss": 0.6191, + "step": 15505 + }, + { + "epoch": 2.31, + "grad_norm": 3.4298494583370593, + "learning_rate": 1.1691642331784888e-06, + "loss": 0.638, + "step": 15506 + }, + { + "epoch": 2.31, + "grad_norm": 5.822439304104232, + "learning_rate": 1.1690690168035014e-06, + "loss": 0.6895, + "step": 15507 + }, + { + "epoch": 2.31, + "grad_norm": 3.9421283983760054, + "learning_rate": 1.168973798850578e-06, + "loss": 0.6751, + "step": 15508 + }, + { + "epoch": 2.31, + "grad_norm": 2.665911864137881, + "learning_rate": 1.1688785793206067e-06, + "loss": 0.6126, + "step": 15509 + }, + { + "epoch": 2.31, + "grad_norm": 3.1192013257297804, + "learning_rate": 1.1687833582144771e-06, + "loss": 0.6172, + "step": 15510 + }, + { + "epoch": 2.31, + "grad_norm": 2.938957873641972, + "learning_rate": 1.1686881355330774e-06, + "loss": 0.6829, + "step": 15511 + }, + { + "epoch": 2.31, + "grad_norm": 3.560910993971844, + "learning_rate": 1.168592911277296e-06, + "loss": 0.627, + "step": 15512 + }, + { + "epoch": 2.31, + "grad_norm": 3.943589296764702, + "learning_rate": 1.1684976854480227e-06, + "loss": 0.6204, + "step": 15513 + }, + { + "epoch": 2.31, + "grad_norm": 3.8073656152004163, + "learning_rate": 1.1684024580461454e-06, + "loss": 0.638, + "step": 15514 + }, + { + "epoch": 2.31, + "grad_norm": 4.649642800447774, + "learning_rate": 1.1683072290725526e-06, + "loss": 0.6562, + "step": 15515 + }, + { + "epoch": 2.31, + "grad_norm": 2.9396998391144638, + "learning_rate": 1.1682119985281341e-06, + "loss": 0.6432, + "step": 15516 + }, + { + "epoch": 2.31, + "grad_norm": 3.679680255480015, + "learning_rate": 1.168116766413778e-06, + "loss": 0.6543, + "step": 15517 + }, + { + "epoch": 2.31, + "grad_norm": 3.2811097490483783, + "learning_rate": 1.168021532730373e-06, + "loss": 0.6185, + "step": 15518 + }, + { + "epoch": 2.31, + "grad_norm": 2.8066066141267507, + "learning_rate": 1.1679262974788085e-06, + "loss": 0.6426, + "step": 15519 + }, + { + "epoch": 2.31, + "grad_norm": 3.1350179320591036, + "learning_rate": 1.167831060659973e-06, + "loss": 0.625, + "step": 15520 + }, + { + "epoch": 2.31, + "grad_norm": 3.247844943453554, + "learning_rate": 1.1677358222747552e-06, + "loss": 0.6211, + "step": 15521 + }, + { + "epoch": 2.31, + "grad_norm": 2.5085084540237856, + "learning_rate": 1.1676405823240442e-06, + "loss": 0.5804, + "step": 15522 + }, + { + "epoch": 2.32, + "grad_norm": 3.258447201046247, + "learning_rate": 1.1675453408087288e-06, + "loss": 0.6641, + "step": 15523 + }, + { + "epoch": 2.32, + "grad_norm": 3.1444944790007865, + "learning_rate": 1.167450097729698e-06, + "loss": 0.6133, + "step": 15524 + }, + { + "epoch": 2.32, + "grad_norm": 3.088613821124058, + "learning_rate": 1.1673548530878406e-06, + "loss": 0.6484, + "step": 15525 + }, + { + "epoch": 2.32, + "grad_norm": 2.8666406753812836, + "learning_rate": 1.1672596068840454e-06, + "loss": 0.6523, + "step": 15526 + }, + { + "epoch": 2.32, + "grad_norm": 4.751503879782037, + "learning_rate": 1.1671643591192017e-06, + "loss": 0.6641, + "step": 15527 + }, + { + "epoch": 2.32, + "grad_norm": 5.151236289057366, + "learning_rate": 1.1670691097941983e-06, + "loss": 0.6862, + "step": 15528 + }, + { + "epoch": 2.32, + "grad_norm": 3.1323269451381655, + "learning_rate": 1.1669738589099241e-06, + "loss": 0.6608, + "step": 15529 + }, + { + "epoch": 2.32, + "grad_norm": 3.4014426526516273, + "learning_rate": 1.166878606467268e-06, + "loss": 0.6074, + "step": 15530 + }, + { + "epoch": 2.32, + "grad_norm": 4.123862170943287, + "learning_rate": 1.166783352467119e-06, + "loss": 0.6243, + "step": 15531 + }, + { + "epoch": 2.32, + "grad_norm": 5.517404568168167, + "learning_rate": 1.1666880969103662e-06, + "loss": 0.6309, + "step": 15532 + }, + { + "epoch": 2.32, + "grad_norm": 4.675660656259796, + "learning_rate": 1.166592839797899e-06, + "loss": 0.5788, + "step": 15533 + }, + { + "epoch": 2.32, + "grad_norm": 3.3638448668149223, + "learning_rate": 1.1664975811306057e-06, + "loss": 0.6699, + "step": 15534 + }, + { + "epoch": 2.32, + "grad_norm": 3.2382629343139366, + "learning_rate": 1.1664023209093762e-06, + "loss": 0.6979, + "step": 15535 + }, + { + "epoch": 2.32, + "grad_norm": 2.9252928467745845, + "learning_rate": 1.1663070591350988e-06, + "loss": 0.6328, + "step": 15536 + }, + { + "epoch": 2.32, + "grad_norm": 6.80395563727962, + "learning_rate": 1.166211795808663e-06, + "loss": 0.6719, + "step": 15537 + }, + { + "epoch": 2.32, + "grad_norm": 4.2938944414883435, + "learning_rate": 1.1661165309309576e-06, + "loss": 0.6745, + "step": 15538 + }, + { + "epoch": 2.32, + "grad_norm": 3.1276305658881394, + "learning_rate": 1.166021264502872e-06, + "loss": 0.5996, + "step": 15539 + }, + { + "epoch": 2.32, + "grad_norm": 4.0648572318353615, + "learning_rate": 1.1659259965252957e-06, + "loss": 0.6113, + "step": 15540 + }, + { + "epoch": 2.32, + "grad_norm": 6.3983097620726745, + "learning_rate": 1.165830726999117e-06, + "loss": 0.6654, + "step": 15541 + }, + { + "epoch": 2.32, + "grad_norm": 2.7023817191536565, + "learning_rate": 1.1657354559252254e-06, + "loss": 0.6094, + "step": 15542 + }, + { + "epoch": 2.32, + "grad_norm": 3.1002199208157166, + "learning_rate": 1.1656401833045103e-06, + "loss": 0.6146, + "step": 15543 + }, + { + "epoch": 2.32, + "grad_norm": 4.295983172922668, + "learning_rate": 1.1655449091378606e-06, + "loss": 0.6536, + "step": 15544 + }, + { + "epoch": 2.32, + "grad_norm": 3.4651236446571114, + "learning_rate": 1.1654496334261658e-06, + "loss": 0.6191, + "step": 15545 + }, + { + "epoch": 2.32, + "grad_norm": 3.2483468223097707, + "learning_rate": 1.165354356170315e-06, + "loss": 0.6484, + "step": 15546 + }, + { + "epoch": 2.32, + "grad_norm": 3.6740345142175337, + "learning_rate": 1.165259077371197e-06, + "loss": 0.6504, + "step": 15547 + }, + { + "epoch": 2.32, + "grad_norm": 3.295642863772294, + "learning_rate": 1.1651637970297018e-06, + "loss": 0.6276, + "step": 15548 + }, + { + "epoch": 2.32, + "grad_norm": 5.1943809181182035, + "learning_rate": 1.1650685151467179e-06, + "loss": 0.6217, + "step": 15549 + }, + { + "epoch": 2.32, + "grad_norm": 3.318611490173517, + "learning_rate": 1.1649732317231352e-06, + "loss": 0.6094, + "step": 15550 + }, + { + "epoch": 2.32, + "grad_norm": 2.729747485937904, + "learning_rate": 1.1648779467598427e-06, + "loss": 0.6413, + "step": 15551 + }, + { + "epoch": 2.32, + "grad_norm": 3.6563331232219802, + "learning_rate": 1.1647826602577295e-06, + "loss": 0.6283, + "step": 15552 + }, + { + "epoch": 2.32, + "grad_norm": 4.446404825702435, + "learning_rate": 1.1646873722176858e-06, + "loss": 0.6133, + "step": 15553 + }, + { + "epoch": 2.32, + "grad_norm": 3.139237756373081, + "learning_rate": 1.1645920826405997e-06, + "loss": 0.6283, + "step": 15554 + }, + { + "epoch": 2.32, + "grad_norm": 3.749708012756754, + "learning_rate": 1.1644967915273612e-06, + "loss": 0.6497, + "step": 15555 + }, + { + "epoch": 2.32, + "grad_norm": 3.817155826265496, + "learning_rate": 1.16440149887886e-06, + "loss": 0.6113, + "step": 15556 + }, + { + "epoch": 2.32, + "grad_norm": 4.260749637770168, + "learning_rate": 1.1643062046959846e-06, + "loss": 0.5977, + "step": 15557 + }, + { + "epoch": 2.32, + "grad_norm": 3.2976235814094954, + "learning_rate": 1.1642109089796253e-06, + "loss": 0.6536, + "step": 15558 + }, + { + "epoch": 2.32, + "grad_norm": 3.8065925816809925, + "learning_rate": 1.1641156117306707e-06, + "loss": 0.6322, + "step": 15559 + }, + { + "epoch": 2.32, + "grad_norm": 4.71955516292312, + "learning_rate": 1.1640203129500105e-06, + "loss": 0.6562, + "step": 15560 + }, + { + "epoch": 2.32, + "grad_norm": 2.983268399630613, + "learning_rate": 1.1639250126385344e-06, + "loss": 0.6491, + "step": 15561 + }, + { + "epoch": 2.32, + "grad_norm": 3.285562912202388, + "learning_rate": 1.1638297107971319e-06, + "loss": 0.6289, + "step": 15562 + }, + { + "epoch": 2.32, + "grad_norm": 3.0983879041379407, + "learning_rate": 1.163734407426692e-06, + "loss": 0.6491, + "step": 15563 + }, + { + "epoch": 2.32, + "grad_norm": 3.505653376066035, + "learning_rate": 1.1636391025281042e-06, + "loss": 0.6393, + "step": 15564 + }, + { + "epoch": 2.32, + "grad_norm": 4.846386766454327, + "learning_rate": 1.1635437961022585e-06, + "loss": 0.625, + "step": 15565 + }, + { + "epoch": 2.32, + "grad_norm": 4.119367151518711, + "learning_rate": 1.1634484881500441e-06, + "loss": 0.6419, + "step": 15566 + }, + { + "epoch": 2.32, + "grad_norm": 4.219266549804639, + "learning_rate": 1.1633531786723503e-06, + "loss": 0.6299, + "step": 15567 + }, + { + "epoch": 2.32, + "grad_norm": 5.631615346310551, + "learning_rate": 1.1632578676700667e-06, + "loss": 0.6315, + "step": 15568 + }, + { + "epoch": 2.32, + "grad_norm": 3.0080077523796347, + "learning_rate": 1.1631625551440836e-06, + "loss": 0.6058, + "step": 15569 + }, + { + "epoch": 2.32, + "grad_norm": 3.4678097398228678, + "learning_rate": 1.1630672410952893e-06, + "loss": 0.6243, + "step": 15570 + }, + { + "epoch": 2.32, + "grad_norm": 3.2316637504737775, + "learning_rate": 1.1629719255245743e-06, + "loss": 0.6562, + "step": 15571 + }, + { + "epoch": 2.32, + "grad_norm": 3.141131394938036, + "learning_rate": 1.1628766084328277e-06, + "loss": 0.6966, + "step": 15572 + }, + { + "epoch": 2.32, + "grad_norm": 3.7661529759220107, + "learning_rate": 1.1627812898209396e-06, + "loss": 0.6348, + "step": 15573 + }, + { + "epoch": 2.32, + "grad_norm": 3.3430642367417764, + "learning_rate": 1.1626859696897992e-06, + "loss": 0.6452, + "step": 15574 + }, + { + "epoch": 2.32, + "grad_norm": 4.465673458185593, + "learning_rate": 1.162590648040296e-06, + "loss": 0.6471, + "step": 15575 + }, + { + "epoch": 2.32, + "grad_norm": 2.871871662272028, + "learning_rate": 1.1624953248733203e-06, + "loss": 0.6107, + "step": 15576 + }, + { + "epoch": 2.32, + "grad_norm": 3.1392768405231335, + "learning_rate": 1.1624000001897613e-06, + "loss": 0.6146, + "step": 15577 + }, + { + "epoch": 2.32, + "grad_norm": 3.2470991214754803, + "learning_rate": 1.1623046739905085e-06, + "loss": 0.6484, + "step": 15578 + }, + { + "epoch": 2.32, + "grad_norm": 3.1513065543568706, + "learning_rate": 1.1622093462764522e-06, + "loss": 0.6315, + "step": 15579 + }, + { + "epoch": 2.32, + "grad_norm": 10.570859081691436, + "learning_rate": 1.1621140170484816e-06, + "loss": 0.653, + "step": 15580 + }, + { + "epoch": 2.32, + "grad_norm": 3.66029543468407, + "learning_rate": 1.1620186863074862e-06, + "loss": 0.6484, + "step": 15581 + }, + { + "epoch": 2.32, + "grad_norm": 2.9689065347664765, + "learning_rate": 1.1619233540543563e-06, + "loss": 0.6647, + "step": 15582 + }, + { + "epoch": 2.32, + "grad_norm": 3.689593929012432, + "learning_rate": 1.1618280202899816e-06, + "loss": 0.6523, + "step": 15583 + }, + { + "epoch": 2.32, + "grad_norm": 5.233298754882079, + "learning_rate": 1.1617326850152514e-06, + "loss": 0.6406, + "step": 15584 + }, + { + "epoch": 2.32, + "grad_norm": 4.349632281746875, + "learning_rate": 1.1616373482310557e-06, + "loss": 0.6289, + "step": 15585 + }, + { + "epoch": 2.32, + "grad_norm": 3.3705331817813007, + "learning_rate": 1.1615420099382847e-06, + "loss": 0.6719, + "step": 15586 + }, + { + "epoch": 2.32, + "grad_norm": 2.845510109952966, + "learning_rate": 1.1614466701378276e-06, + "loss": 0.6484, + "step": 15587 + }, + { + "epoch": 2.32, + "grad_norm": 3.107643943422341, + "learning_rate": 1.1613513288305745e-06, + "loss": 0.5947, + "step": 15588 + }, + { + "epoch": 2.32, + "grad_norm": 2.600539535306128, + "learning_rate": 1.1612559860174152e-06, + "loss": 0.6335, + "step": 15589 + }, + { + "epoch": 2.33, + "grad_norm": 3.371476311086528, + "learning_rate": 1.1611606416992395e-06, + "loss": 0.6576, + "step": 15590 + }, + { + "epoch": 2.33, + "grad_norm": 3.769747930814936, + "learning_rate": 1.1610652958769372e-06, + "loss": 0.6621, + "step": 15591 + }, + { + "epoch": 2.33, + "grad_norm": 2.6376827644893406, + "learning_rate": 1.1609699485513981e-06, + "loss": 0.5807, + "step": 15592 + }, + { + "epoch": 2.33, + "grad_norm": 4.336419585603276, + "learning_rate": 1.1608745997235127e-06, + "loss": 0.6497, + "step": 15593 + }, + { + "epoch": 2.33, + "grad_norm": 3.5382980237935566, + "learning_rate": 1.16077924939417e-06, + "loss": 0.6504, + "step": 15594 + }, + { + "epoch": 2.33, + "grad_norm": 2.631646277246079, + "learning_rate": 1.1606838975642606e-06, + "loss": 0.6139, + "step": 15595 + }, + { + "epoch": 2.33, + "grad_norm": 3.067400721662484, + "learning_rate": 1.1605885442346742e-06, + "loss": 0.6276, + "step": 15596 + }, + { + "epoch": 2.33, + "grad_norm": 2.7316451112522113, + "learning_rate": 1.1604931894063004e-06, + "loss": 0.6699, + "step": 15597 + }, + { + "epoch": 2.33, + "grad_norm": 3.00377917373943, + "learning_rate": 1.1603978330800296e-06, + "loss": 0.7057, + "step": 15598 + }, + { + "epoch": 2.33, + "grad_norm": 2.730310796217601, + "learning_rate": 1.160302475256752e-06, + "loss": 0.6341, + "step": 15599 + }, + { + "epoch": 2.33, + "grad_norm": 3.1020737462068233, + "learning_rate": 1.1602071159373567e-06, + "loss": 0.6185, + "step": 15600 + }, + { + "epoch": 2.33, + "grad_norm": 4.075294713224141, + "learning_rate": 1.1601117551227345e-06, + "loss": 0.6484, + "step": 15601 + }, + { + "epoch": 2.33, + "grad_norm": 3.3934602368782367, + "learning_rate": 1.1600163928137751e-06, + "loss": 0.6589, + "step": 15602 + }, + { + "epoch": 2.33, + "grad_norm": 4.203728038372765, + "learning_rate": 1.1599210290113687e-06, + "loss": 0.6882, + "step": 15603 + }, + { + "epoch": 2.33, + "grad_norm": 2.7223471556472805, + "learning_rate": 1.159825663716405e-06, + "loss": 0.6452, + "step": 15604 + }, + { + "epoch": 2.33, + "grad_norm": 2.7603236683477403, + "learning_rate": 1.1597302969297742e-06, + "loss": 0.6341, + "step": 15605 + }, + { + "epoch": 2.33, + "grad_norm": 4.788121241915303, + "learning_rate": 1.1596349286523665e-06, + "loss": 0.638, + "step": 15606 + }, + { + "epoch": 2.33, + "grad_norm": 3.245162280728777, + "learning_rate": 1.1595395588850717e-06, + "loss": 0.6628, + "step": 15607 + }, + { + "epoch": 2.33, + "grad_norm": 3.3133875075271324, + "learning_rate": 1.1594441876287807e-06, + "loss": 0.6696, + "step": 15608 + }, + { + "epoch": 2.33, + "grad_norm": 4.974308015473831, + "learning_rate": 1.1593488148843825e-06, + "loss": 0.6966, + "step": 15609 + }, + { + "epoch": 2.33, + "grad_norm": 8.584846437903323, + "learning_rate": 1.1592534406527676e-06, + "loss": 0.6621, + "step": 15610 + }, + { + "epoch": 2.33, + "grad_norm": 2.5501152317537805, + "learning_rate": 1.1591580649348264e-06, + "loss": 0.6699, + "step": 15611 + }, + { + "epoch": 2.33, + "grad_norm": 2.6981541508733464, + "learning_rate": 1.159062687731449e-06, + "loss": 0.6159, + "step": 15612 + }, + { + "epoch": 2.33, + "grad_norm": 2.4257401497322677, + "learning_rate": 1.1589673090435256e-06, + "loss": 0.6361, + "step": 15613 + }, + { + "epoch": 2.33, + "grad_norm": 5.967355384353739, + "learning_rate": 1.158871928871946e-06, + "loss": 0.6237, + "step": 15614 + }, + { + "epoch": 2.33, + "grad_norm": 2.4415263001969016, + "learning_rate": 1.1587765472176006e-06, + "loss": 0.6185, + "step": 15615 + }, + { + "epoch": 2.33, + "grad_norm": 3.8595726547687574, + "learning_rate": 1.1586811640813798e-06, + "loss": 0.6693, + "step": 15616 + }, + { + "epoch": 2.33, + "grad_norm": 3.4679009405122, + "learning_rate": 1.1585857794641736e-06, + "loss": 0.7253, + "step": 15617 + }, + { + "epoch": 2.33, + "grad_norm": 2.8811675496205575, + "learning_rate": 1.1584903933668722e-06, + "loss": 0.6302, + "step": 15618 + }, + { + "epoch": 2.33, + "grad_norm": 2.7362368927815135, + "learning_rate": 1.1583950057903664e-06, + "loss": 0.6328, + "step": 15619 + }, + { + "epoch": 2.33, + "grad_norm": 2.3370949878375744, + "learning_rate": 1.1582996167355455e-06, + "loss": 0.6217, + "step": 15620 + }, + { + "epoch": 2.33, + "grad_norm": 3.4475800636037865, + "learning_rate": 1.1582042262033003e-06, + "loss": 0.6341, + "step": 15621 + }, + { + "epoch": 2.33, + "grad_norm": 2.6174637070290188, + "learning_rate": 1.1581088341945214e-06, + "loss": 0.6777, + "step": 15622 + }, + { + "epoch": 2.33, + "grad_norm": 3.2578410064514003, + "learning_rate": 1.1580134407100984e-06, + "loss": 0.6595, + "step": 15623 + }, + { + "epoch": 2.33, + "grad_norm": 2.9716912787785574, + "learning_rate": 1.1579180457509223e-06, + "loss": 0.6263, + "step": 15624 + }, + { + "epoch": 2.33, + "grad_norm": 2.653961442170425, + "learning_rate": 1.1578226493178828e-06, + "loss": 0.6452, + "step": 15625 + }, + { + "epoch": 2.33, + "grad_norm": 2.759931779833622, + "learning_rate": 1.1577272514118708e-06, + "loss": 0.6595, + "step": 15626 + }, + { + "epoch": 2.33, + "grad_norm": 2.451458423763547, + "learning_rate": 1.1576318520337764e-06, + "loss": 0.6107, + "step": 15627 + }, + { + "epoch": 2.33, + "grad_norm": 3.206193281676228, + "learning_rate": 1.1575364511844898e-06, + "loss": 0.6497, + "step": 15628 + }, + { + "epoch": 2.33, + "grad_norm": 3.0345320646376077, + "learning_rate": 1.1574410488649017e-06, + "loss": 0.638, + "step": 15629 + }, + { + "epoch": 2.33, + "grad_norm": 2.497763921651016, + "learning_rate": 1.1573456450759024e-06, + "loss": 0.6289, + "step": 15630 + }, + { + "epoch": 2.33, + "grad_norm": 2.7062004271982967, + "learning_rate": 1.1572502398183822e-06, + "loss": 0.6686, + "step": 15631 + }, + { + "epoch": 2.33, + "grad_norm": 2.5192676974779418, + "learning_rate": 1.157154833093232e-06, + "loss": 0.6543, + "step": 15632 + }, + { + "epoch": 2.33, + "grad_norm": 2.9850753587083965, + "learning_rate": 1.1570594249013412e-06, + "loss": 0.6882, + "step": 15633 + }, + { + "epoch": 2.33, + "grad_norm": 3.9983276751449255, + "learning_rate": 1.156964015243601e-06, + "loss": 0.6354, + "step": 15634 + }, + { + "epoch": 2.33, + "grad_norm": 3.6448652632290317, + "learning_rate": 1.1568686041209024e-06, + "loss": 0.625, + "step": 15635 + }, + { + "epoch": 2.33, + "grad_norm": 3.051479581226954, + "learning_rate": 1.1567731915341345e-06, + "loss": 0.6243, + "step": 15636 + }, + { + "epoch": 2.33, + "grad_norm": 2.6207315568241056, + "learning_rate": 1.1566777774841888e-06, + "loss": 0.6406, + "step": 15637 + }, + { + "epoch": 2.33, + "grad_norm": 3.248444891576933, + "learning_rate": 1.1565823619719554e-06, + "loss": 0.6426, + "step": 15638 + }, + { + "epoch": 2.33, + "grad_norm": 2.705995412343959, + "learning_rate": 1.1564869449983255e-06, + "loss": 0.6354, + "step": 15639 + }, + { + "epoch": 2.33, + "grad_norm": 2.708449692597205, + "learning_rate": 1.1563915265641886e-06, + "loss": 0.6536, + "step": 15640 + }, + { + "epoch": 2.33, + "grad_norm": 3.642969487717022, + "learning_rate": 1.1562961066704358e-06, + "loss": 0.6517, + "step": 15641 + }, + { + "epoch": 2.33, + "grad_norm": 2.582060617689002, + "learning_rate": 1.1562006853179578e-06, + "loss": 0.6029, + "step": 15642 + }, + { + "epoch": 2.33, + "grad_norm": 2.5542440686431758, + "learning_rate": 1.1561052625076451e-06, + "loss": 0.6406, + "step": 15643 + }, + { + "epoch": 2.33, + "grad_norm": 2.920405351416478, + "learning_rate": 1.1560098382403879e-06, + "loss": 0.6159, + "step": 15644 + }, + { + "epoch": 2.33, + "grad_norm": 3.995526949334935, + "learning_rate": 1.155914412517077e-06, + "loss": 0.668, + "step": 15645 + }, + { + "epoch": 2.33, + "grad_norm": 2.6541117291788017, + "learning_rate": 1.1558189853386033e-06, + "loss": 0.6322, + "step": 15646 + }, + { + "epoch": 2.33, + "grad_norm": 7.68065223401273, + "learning_rate": 1.1557235567058572e-06, + "loss": 0.6354, + "step": 15647 + }, + { + "epoch": 2.33, + "grad_norm": 3.2987030323354283, + "learning_rate": 1.1556281266197295e-06, + "loss": 0.6758, + "step": 15648 + }, + { + "epoch": 2.33, + "grad_norm": 2.810645808808605, + "learning_rate": 1.1555326950811107e-06, + "loss": 0.5905, + "step": 15649 + }, + { + "epoch": 2.33, + "grad_norm": 3.506281007939244, + "learning_rate": 1.1554372620908911e-06, + "loss": 0.6602, + "step": 15650 + }, + { + "epoch": 2.33, + "grad_norm": 4.337397709285941, + "learning_rate": 1.1553418276499622e-06, + "loss": 0.6312, + "step": 15651 + }, + { + "epoch": 2.33, + "grad_norm": 3.1026690917465958, + "learning_rate": 1.1552463917592143e-06, + "loss": 0.6611, + "step": 15652 + }, + { + "epoch": 2.33, + "grad_norm": 3.1482640763510736, + "learning_rate": 1.1551509544195379e-06, + "loss": 0.6361, + "step": 15653 + }, + { + "epoch": 2.33, + "grad_norm": 2.9648057578641693, + "learning_rate": 1.155055515631824e-06, + "loss": 0.679, + "step": 15654 + }, + { + "epoch": 2.33, + "grad_norm": 3.125424422118839, + "learning_rate": 1.1549600753969633e-06, + "loss": 0.597, + "step": 15655 + }, + { + "epoch": 2.33, + "grad_norm": 4.416872396725043, + "learning_rate": 1.1548646337158465e-06, + "loss": 0.6914, + "step": 15656 + }, + { + "epoch": 2.34, + "grad_norm": 2.731814983181606, + "learning_rate": 1.1547691905893642e-06, + "loss": 0.6413, + "step": 15657 + }, + { + "epoch": 2.34, + "grad_norm": 4.667004047656661, + "learning_rate": 1.1546737460184075e-06, + "loss": 0.6966, + "step": 15658 + }, + { + "epoch": 2.34, + "grad_norm": 3.1084580879960244, + "learning_rate": 1.1545783000038671e-06, + "loss": 0.6068, + "step": 15659 + }, + { + "epoch": 2.34, + "grad_norm": 3.448179718986582, + "learning_rate": 1.1544828525466337e-06, + "loss": 0.638, + "step": 15660 + }, + { + "epoch": 2.34, + "grad_norm": 2.7820270984941575, + "learning_rate": 1.1543874036475983e-06, + "loss": 0.627, + "step": 15661 + }, + { + "epoch": 2.34, + "grad_norm": 8.515203991664343, + "learning_rate": 1.1542919533076513e-06, + "loss": 0.6816, + "step": 15662 + }, + { + "epoch": 2.34, + "grad_norm": 5.338987374519392, + "learning_rate": 1.154196501527684e-06, + "loss": 0.6855, + "step": 15663 + }, + { + "epoch": 2.34, + "grad_norm": 2.4597504000458303, + "learning_rate": 1.1541010483085873e-06, + "loss": 0.6224, + "step": 15664 + }, + { + "epoch": 2.34, + "grad_norm": 2.8587129575264316, + "learning_rate": 1.1540055936512518e-06, + "loss": 0.6185, + "step": 15665 + }, + { + "epoch": 2.34, + "grad_norm": 3.2497843426636828, + "learning_rate": 1.1539101375565683e-06, + "loss": 0.6595, + "step": 15666 + }, + { + "epoch": 2.34, + "grad_norm": 5.754718187596582, + "learning_rate": 1.153814680025428e-06, + "loss": 0.6276, + "step": 15667 + }, + { + "epoch": 2.34, + "grad_norm": 3.447118504370182, + "learning_rate": 1.1537192210587218e-06, + "loss": 0.6185, + "step": 15668 + }, + { + "epoch": 2.34, + "grad_norm": 3.328710757410439, + "learning_rate": 1.1536237606573404e-06, + "loss": 0.6413, + "step": 15669 + }, + { + "epoch": 2.34, + "grad_norm": 2.6661153897159395, + "learning_rate": 1.1535282988221749e-06, + "loss": 0.6185, + "step": 15670 + }, + { + "epoch": 2.34, + "grad_norm": 3.7405932491888594, + "learning_rate": 1.1534328355541159e-06, + "loss": 0.6439, + "step": 15671 + }, + { + "epoch": 2.34, + "grad_norm": 3.4621293812382055, + "learning_rate": 1.1533373708540553e-06, + "loss": 0.5898, + "step": 15672 + }, + { + "epoch": 2.34, + "grad_norm": 3.022729422097801, + "learning_rate": 1.153241904722883e-06, + "loss": 0.6374, + "step": 15673 + }, + { + "epoch": 2.34, + "grad_norm": 4.84084717925837, + "learning_rate": 1.1531464371614904e-06, + "loss": 0.61, + "step": 15674 + }, + { + "epoch": 2.34, + "grad_norm": 2.8161507912385177, + "learning_rate": 1.1530509681707689e-06, + "loss": 0.6549, + "step": 15675 + }, + { + "epoch": 2.34, + "grad_norm": 4.987189004999574, + "learning_rate": 1.152955497751609e-06, + "loss": 0.6654, + "step": 15676 + }, + { + "epoch": 2.34, + "grad_norm": 3.88356479831321, + "learning_rate": 1.1528600259049018e-06, + "loss": 0.6615, + "step": 15677 + }, + { + "epoch": 2.34, + "grad_norm": 3.9346950892720485, + "learning_rate": 1.1527645526315386e-06, + "loss": 0.6732, + "step": 15678 + }, + { + "epoch": 2.34, + "grad_norm": 3.2141904745029244, + "learning_rate": 1.1526690779324102e-06, + "loss": 0.694, + "step": 15679 + }, + { + "epoch": 2.34, + "grad_norm": 2.897935208374883, + "learning_rate": 1.152573601808408e-06, + "loss": 0.6224, + "step": 15680 + }, + { + "epoch": 2.34, + "grad_norm": 2.9372173983179355, + "learning_rate": 1.1524781242604226e-06, + "loss": 0.6432, + "step": 15681 + }, + { + "epoch": 2.34, + "grad_norm": 2.925257485355486, + "learning_rate": 1.1523826452893456e-06, + "loss": 0.6582, + "step": 15682 + }, + { + "epoch": 2.34, + "grad_norm": 3.1952486376687053, + "learning_rate": 1.1522871648960679e-06, + "loss": 0.6406, + "step": 15683 + }, + { + "epoch": 2.34, + "grad_norm": 2.886616846939566, + "learning_rate": 1.1521916830814805e-06, + "loss": 0.6986, + "step": 15684 + }, + { + "epoch": 2.34, + "grad_norm": 3.4615798638249675, + "learning_rate": 1.1520961998464748e-06, + "loss": 0.6497, + "step": 15685 + }, + { + "epoch": 2.34, + "grad_norm": 4.129965558469427, + "learning_rate": 1.1520007151919418e-06, + "loss": 0.6309, + "step": 15686 + }, + { + "epoch": 2.34, + "grad_norm": 3.080105933625945, + "learning_rate": 1.1519052291187724e-06, + "loss": 0.6615, + "step": 15687 + }, + { + "epoch": 2.34, + "grad_norm": 3.3539778567397156, + "learning_rate": 1.1518097416278585e-06, + "loss": 0.6185, + "step": 15688 + }, + { + "epoch": 2.34, + "grad_norm": 3.064220468699017, + "learning_rate": 1.1517142527200904e-06, + "loss": 0.6296, + "step": 15689 + }, + { + "epoch": 2.34, + "grad_norm": 2.7321470141482638, + "learning_rate": 1.1516187623963602e-06, + "loss": 0.6309, + "step": 15690 + }, + { + "epoch": 2.34, + "grad_norm": 2.578949423135296, + "learning_rate": 1.1515232706575585e-06, + "loss": 0.6484, + "step": 15691 + }, + { + "epoch": 2.34, + "grad_norm": 2.633675003007874, + "learning_rate": 1.1514277775045766e-06, + "loss": 0.6263, + "step": 15692 + }, + { + "epoch": 2.34, + "grad_norm": 2.885838949841728, + "learning_rate": 1.1513322829383061e-06, + "loss": 0.6478, + "step": 15693 + }, + { + "epoch": 2.34, + "grad_norm": 3.343078053993023, + "learning_rate": 1.1512367869596378e-06, + "loss": 0.6191, + "step": 15694 + }, + { + "epoch": 2.34, + "grad_norm": 2.9842074149548963, + "learning_rate": 1.1511412895694633e-06, + "loss": 0.6126, + "step": 15695 + }, + { + "epoch": 2.34, + "grad_norm": 4.521089335142912, + "learning_rate": 1.1510457907686737e-06, + "loss": 0.6478, + "step": 15696 + }, + { + "epoch": 2.34, + "grad_norm": 4.482890300966469, + "learning_rate": 1.1509502905581604e-06, + "loss": 0.6484, + "step": 15697 + }, + { + "epoch": 2.34, + "grad_norm": 2.9094262528278803, + "learning_rate": 1.1508547889388149e-06, + "loss": 0.6367, + "step": 15698 + }, + { + "epoch": 2.34, + "grad_norm": 2.9982784180826307, + "learning_rate": 1.150759285911528e-06, + "loss": 0.64, + "step": 15699 + }, + { + "epoch": 2.34, + "grad_norm": 3.1275255547958087, + "learning_rate": 1.1506637814771913e-06, + "loss": 0.6693, + "step": 15700 + }, + { + "epoch": 2.34, + "grad_norm": 2.685926229939487, + "learning_rate": 1.1505682756366965e-06, + "loss": 0.6458, + "step": 15701 + }, + { + "epoch": 2.34, + "grad_norm": 3.0941180310864707, + "learning_rate": 1.1504727683909349e-06, + "loss": 0.627, + "step": 15702 + }, + { + "epoch": 2.34, + "grad_norm": 2.6306704361551527, + "learning_rate": 1.1503772597407972e-06, + "loss": 0.6055, + "step": 15703 + }, + { + "epoch": 2.34, + "grad_norm": 3.52695286094051, + "learning_rate": 1.1502817496871752e-06, + "loss": 0.6536, + "step": 15704 + }, + { + "epoch": 2.34, + "grad_norm": 2.825363117542417, + "learning_rate": 1.1501862382309603e-06, + "loss": 0.6582, + "step": 15705 + }, + { + "epoch": 2.34, + "grad_norm": 4.949685422712726, + "learning_rate": 1.1500907253730442e-06, + "loss": 0.6185, + "step": 15706 + }, + { + "epoch": 2.34, + "grad_norm": 4.382802791397897, + "learning_rate": 1.149995211114318e-06, + "loss": 0.6797, + "step": 15707 + }, + { + "epoch": 2.34, + "grad_norm": 3.0945034830664593, + "learning_rate": 1.1498996954556735e-06, + "loss": 0.6797, + "step": 15708 + }, + { + "epoch": 2.34, + "grad_norm": 2.6186114990174603, + "learning_rate": 1.1498041783980016e-06, + "loss": 0.6836, + "step": 15709 + }, + { + "epoch": 2.34, + "grad_norm": 4.051023623626522, + "learning_rate": 1.1497086599421939e-06, + "loss": 0.6445, + "step": 15710 + }, + { + "epoch": 2.34, + "grad_norm": 4.78068803607225, + "learning_rate": 1.1496131400891424e-06, + "loss": 0.6732, + "step": 15711 + }, + { + "epoch": 2.34, + "grad_norm": 2.6375284002017274, + "learning_rate": 1.149517618839738e-06, + "loss": 0.6517, + "step": 15712 + }, + { + "epoch": 2.34, + "grad_norm": 4.372721010788345, + "learning_rate": 1.1494220961948725e-06, + "loss": 0.6237, + "step": 15713 + }, + { + "epoch": 2.34, + "grad_norm": 3.152446427751764, + "learning_rate": 1.1493265721554373e-06, + "loss": 0.6175, + "step": 15714 + }, + { + "epoch": 2.34, + "grad_norm": 3.8948802829983626, + "learning_rate": 1.149231046722324e-06, + "loss": 0.6432, + "step": 15715 + }, + { + "epoch": 2.34, + "grad_norm": 4.283449830443574, + "learning_rate": 1.1491355198964244e-06, + "loss": 0.6725, + "step": 15716 + }, + { + "epoch": 2.34, + "grad_norm": 3.851442016057324, + "learning_rate": 1.1490399916786295e-06, + "loss": 0.6777, + "step": 15717 + }, + { + "epoch": 2.34, + "grad_norm": 2.6102574492582638, + "learning_rate": 1.1489444620698314e-06, + "loss": 0.6647, + "step": 15718 + }, + { + "epoch": 2.34, + "grad_norm": 2.4700270412713095, + "learning_rate": 1.1488489310709213e-06, + "loss": 0.6204, + "step": 15719 + }, + { + "epoch": 2.34, + "grad_norm": 3.062447643924152, + "learning_rate": 1.1487533986827908e-06, + "loss": 0.6764, + "step": 15720 + }, + { + "epoch": 2.34, + "grad_norm": 3.720075936422059, + "learning_rate": 1.1486578649063322e-06, + "loss": 0.6549, + "step": 15721 + }, + { + "epoch": 2.34, + "grad_norm": 4.010379650965508, + "learning_rate": 1.1485623297424362e-06, + "loss": 0.6413, + "step": 15722 + }, + { + "epoch": 2.34, + "grad_norm": 2.5289676563978225, + "learning_rate": 1.1484667931919949e-06, + "loss": 0.6354, + "step": 15723 + }, + { + "epoch": 2.35, + "grad_norm": 5.052904186351656, + "learning_rate": 1.1483712552559e-06, + "loss": 0.6237, + "step": 15724 + }, + { + "epoch": 2.35, + "grad_norm": 2.657520446853786, + "learning_rate": 1.1482757159350428e-06, + "loss": 0.6146, + "step": 15725 + }, + { + "epoch": 2.35, + "grad_norm": 2.5690870424979657, + "learning_rate": 1.1481801752303155e-06, + "loss": 0.6338, + "step": 15726 + }, + { + "epoch": 2.35, + "grad_norm": 3.0889984342859775, + "learning_rate": 1.1480846331426096e-06, + "loss": 0.6237, + "step": 15727 + }, + { + "epoch": 2.35, + "grad_norm": 2.5660000761573403, + "learning_rate": 1.1479890896728165e-06, + "loss": 0.6582, + "step": 15728 + }, + { + "epoch": 2.35, + "grad_norm": 3.195935981915697, + "learning_rate": 1.147893544821828e-06, + "loss": 0.6569, + "step": 15729 + }, + { + "epoch": 2.35, + "grad_norm": 3.3670132970366704, + "learning_rate": 1.1477979985905361e-06, + "loss": 0.6497, + "step": 15730 + }, + { + "epoch": 2.35, + "grad_norm": 3.5789485293404772, + "learning_rate": 1.1477024509798325e-06, + "loss": 0.625, + "step": 15731 + }, + { + "epoch": 2.35, + "grad_norm": 3.1871775960534143, + "learning_rate": 1.1476069019906088e-06, + "loss": 0.6367, + "step": 15732 + }, + { + "epoch": 2.35, + "grad_norm": 2.504116314003527, + "learning_rate": 1.147511351623757e-06, + "loss": 0.6452, + "step": 15733 + }, + { + "epoch": 2.35, + "grad_norm": 3.532051763299711, + "learning_rate": 1.1474157998801683e-06, + "loss": 0.6152, + "step": 15734 + }, + { + "epoch": 2.35, + "grad_norm": 3.3703383446860156, + "learning_rate": 1.1473202467607355e-06, + "loss": 0.6178, + "step": 15735 + }, + { + "epoch": 2.35, + "grad_norm": 3.0636318313708455, + "learning_rate": 1.1472246922663493e-06, + "loss": 0.6465, + "step": 15736 + }, + { + "epoch": 2.35, + "grad_norm": 2.8414192162872847, + "learning_rate": 1.147129136397902e-06, + "loss": 0.6172, + "step": 15737 + }, + { + "epoch": 2.35, + "grad_norm": 3.0009366635561077, + "learning_rate": 1.147033579156286e-06, + "loss": 0.6211, + "step": 15738 + }, + { + "epoch": 2.35, + "grad_norm": 2.6734974543068795, + "learning_rate": 1.1469380205423923e-06, + "loss": 0.6341, + "step": 15739 + }, + { + "epoch": 2.35, + "grad_norm": 2.756253153236887, + "learning_rate": 1.1468424605571129e-06, + "loss": 0.6237, + "step": 15740 + }, + { + "epoch": 2.35, + "grad_norm": 3.130925840119248, + "learning_rate": 1.14674689920134e-06, + "loss": 0.6458, + "step": 15741 + }, + { + "epoch": 2.35, + "grad_norm": 2.803978799320831, + "learning_rate": 1.1466513364759652e-06, + "loss": 0.6022, + "step": 15742 + }, + { + "epoch": 2.35, + "grad_norm": 2.7517971539431136, + "learning_rate": 1.1465557723818806e-06, + "loss": 0.64, + "step": 15743 + }, + { + "epoch": 2.35, + "grad_norm": 5.330957923988224, + "learning_rate": 1.146460206919978e-06, + "loss": 0.653, + "step": 15744 + }, + { + "epoch": 2.35, + "grad_norm": 3.307130169235897, + "learning_rate": 1.146364640091149e-06, + "loss": 0.6283, + "step": 15745 + }, + { + "epoch": 2.35, + "grad_norm": 3.5268654147622023, + "learning_rate": 1.1462690718962865e-06, + "loss": 0.6445, + "step": 15746 + }, + { + "epoch": 2.35, + "grad_norm": 3.8505924695752096, + "learning_rate": 1.1461735023362813e-06, + "loss": 0.6725, + "step": 15747 + }, + { + "epoch": 2.35, + "grad_norm": 2.945243620127022, + "learning_rate": 1.146077931412026e-06, + "loss": 0.6582, + "step": 15748 + }, + { + "epoch": 2.35, + "grad_norm": 2.7938599494167944, + "learning_rate": 1.1459823591244128e-06, + "loss": 0.6185, + "step": 15749 + }, + { + "epoch": 2.35, + "grad_norm": 2.872090063427313, + "learning_rate": 1.1458867854743329e-06, + "loss": 0.6445, + "step": 15750 + }, + { + "epoch": 2.35, + "grad_norm": 3.405852772573199, + "learning_rate": 1.1457912104626792e-06, + "loss": 0.6543, + "step": 15751 + }, + { + "epoch": 2.35, + "grad_norm": 4.635019368354188, + "learning_rate": 1.1456956340903428e-06, + "loss": 0.6445, + "step": 15752 + }, + { + "epoch": 2.35, + "grad_norm": 3.457014732519546, + "learning_rate": 1.145600056358216e-06, + "loss": 0.6673, + "step": 15753 + }, + { + "epoch": 2.35, + "grad_norm": 3.9777934003752535, + "learning_rate": 1.1455044772671916e-06, + "loss": 0.6465, + "step": 15754 + }, + { + "epoch": 2.35, + "grad_norm": 2.856658816385258, + "learning_rate": 1.1454088968181607e-06, + "loss": 0.6517, + "step": 15755 + }, + { + "epoch": 2.35, + "grad_norm": 4.13808181392674, + "learning_rate": 1.1453133150120158e-06, + "loss": 0.6628, + "step": 15756 + }, + { + "epoch": 2.35, + "grad_norm": 3.7631023556847514, + "learning_rate": 1.1452177318496485e-06, + "loss": 0.6484, + "step": 15757 + }, + { + "epoch": 2.35, + "grad_norm": 3.282009130556564, + "learning_rate": 1.1451221473319518e-06, + "loss": 0.6693, + "step": 15758 + }, + { + "epoch": 2.35, + "grad_norm": 3.9920338650546516, + "learning_rate": 1.145026561459817e-06, + "loss": 0.6686, + "step": 15759 + }, + { + "epoch": 2.35, + "grad_norm": 3.1509874262076183, + "learning_rate": 1.1449309742341364e-06, + "loss": 0.6413, + "step": 15760 + }, + { + "epoch": 2.35, + "grad_norm": 2.8018267059457003, + "learning_rate": 1.1448353856558023e-06, + "loss": 0.6439, + "step": 15761 + }, + { + "epoch": 2.35, + "grad_norm": 8.28803853535034, + "learning_rate": 1.144739795725707e-06, + "loss": 0.7057, + "step": 15762 + }, + { + "epoch": 2.35, + "grad_norm": 4.73680842260369, + "learning_rate": 1.144644204444742e-06, + "loss": 0.6647, + "step": 15763 + }, + { + "epoch": 2.35, + "grad_norm": 3.166416304849486, + "learning_rate": 1.1445486118138003e-06, + "loss": 0.6458, + "step": 15764 + }, + { + "epoch": 2.35, + "grad_norm": 2.703528405233583, + "learning_rate": 1.1444530178337732e-06, + "loss": 0.6621, + "step": 15765 + }, + { + "epoch": 2.35, + "grad_norm": 4.640757906154732, + "learning_rate": 1.1443574225055534e-06, + "loss": 0.7018, + "step": 15766 + }, + { + "epoch": 2.35, + "grad_norm": 2.9754659224642324, + "learning_rate": 1.144261825830033e-06, + "loss": 0.6348, + "step": 15767 + }, + { + "epoch": 2.35, + "grad_norm": 2.804486890726017, + "learning_rate": 1.1441662278081044e-06, + "loss": 0.6543, + "step": 15768 + }, + { + "epoch": 2.35, + "grad_norm": 3.942736255632947, + "learning_rate": 1.1440706284406595e-06, + "loss": 0.6719, + "step": 15769 + }, + { + "epoch": 2.35, + "grad_norm": 3.2033760045583057, + "learning_rate": 1.1439750277285908e-06, + "loss": 0.6361, + "step": 15770 + }, + { + "epoch": 2.35, + "grad_norm": 2.8871671658111158, + "learning_rate": 1.1438794256727902e-06, + "loss": 0.6751, + "step": 15771 + }, + { + "epoch": 2.35, + "grad_norm": 2.5580586561029928, + "learning_rate": 1.1437838222741504e-06, + "loss": 0.6413, + "step": 15772 + }, + { + "epoch": 2.35, + "grad_norm": 2.3123556844308912, + "learning_rate": 1.1436882175335633e-06, + "loss": 0.6504, + "step": 15773 + }, + { + "epoch": 2.35, + "grad_norm": 3.063707503027957, + "learning_rate": 1.1435926114519214e-06, + "loss": 0.6328, + "step": 15774 + }, + { + "epoch": 2.35, + "grad_norm": 3.1435534451664644, + "learning_rate": 1.1434970040301172e-06, + "loss": 0.6589, + "step": 15775 + }, + { + "epoch": 2.35, + "grad_norm": 5.241252064928334, + "learning_rate": 1.1434013952690425e-06, + "loss": 0.6257, + "step": 15776 + }, + { + "epoch": 2.35, + "grad_norm": 3.4057395788172693, + "learning_rate": 1.14330578516959e-06, + "loss": 0.6615, + "step": 15777 + }, + { + "epoch": 2.35, + "grad_norm": 5.299938838644335, + "learning_rate": 1.143210173732652e-06, + "loss": 0.6836, + "step": 15778 + }, + { + "epoch": 2.35, + "grad_norm": 2.787373826183395, + "learning_rate": 1.1431145609591206e-06, + "loss": 0.6387, + "step": 15779 + }, + { + "epoch": 2.35, + "grad_norm": 2.8994612934185193, + "learning_rate": 1.1430189468498885e-06, + "loss": 0.6296, + "step": 15780 + }, + { + "epoch": 2.35, + "grad_norm": 2.9130000465147714, + "learning_rate": 1.142923331405848e-06, + "loss": 0.6367, + "step": 15781 + }, + { + "epoch": 2.35, + "grad_norm": 3.725431092429716, + "learning_rate": 1.1428277146278912e-06, + "loss": 0.6211, + "step": 15782 + }, + { + "epoch": 2.35, + "grad_norm": 2.5534234391494612, + "learning_rate": 1.1427320965169108e-06, + "loss": 0.6725, + "step": 15783 + }, + { + "epoch": 2.35, + "grad_norm": 3.2480678173886406, + "learning_rate": 1.142636477073799e-06, + "loss": 0.666, + "step": 15784 + }, + { + "epoch": 2.35, + "grad_norm": 2.581879546519884, + "learning_rate": 1.1425408562994484e-06, + "loss": 0.6191, + "step": 15785 + }, + { + "epoch": 2.35, + "grad_norm": 4.317185136606181, + "learning_rate": 1.1424452341947515e-06, + "loss": 0.6439, + "step": 15786 + }, + { + "epoch": 2.35, + "grad_norm": 2.964601335554055, + "learning_rate": 1.1423496107606005e-06, + "loss": 0.6465, + "step": 15787 + }, + { + "epoch": 2.35, + "grad_norm": 3.201083412378467, + "learning_rate": 1.1422539859978881e-06, + "loss": 0.6517, + "step": 15788 + }, + { + "epoch": 2.35, + "grad_norm": 3.557976696816228, + "learning_rate": 1.1421583599075067e-06, + "loss": 0.625, + "step": 15789 + }, + { + "epoch": 2.35, + "grad_norm": 3.290349810740807, + "learning_rate": 1.1420627324903486e-06, + "loss": 0.6367, + "step": 15790 + }, + { + "epoch": 2.36, + "grad_norm": 3.100428662102095, + "learning_rate": 1.1419671037473067e-06, + "loss": 0.6569, + "step": 15791 + }, + { + "epoch": 2.36, + "grad_norm": 2.666687669845471, + "learning_rate": 1.141871473679273e-06, + "loss": 0.6335, + "step": 15792 + }, + { + "epoch": 2.36, + "grad_norm": 4.20502657655601, + "learning_rate": 1.1417758422871404e-06, + "loss": 0.6634, + "step": 15793 + }, + { + "epoch": 2.36, + "grad_norm": 2.808172981889703, + "learning_rate": 1.1416802095718014e-06, + "loss": 0.6602, + "step": 15794 + }, + { + "epoch": 2.36, + "grad_norm": 2.5929471755788915, + "learning_rate": 1.1415845755341481e-06, + "loss": 0.6068, + "step": 15795 + }, + { + "epoch": 2.36, + "grad_norm": 2.688515094969571, + "learning_rate": 1.141488940175074e-06, + "loss": 0.6595, + "step": 15796 + }, + { + "epoch": 2.36, + "grad_norm": 2.887162031681495, + "learning_rate": 1.1413933034954708e-06, + "loss": 0.6491, + "step": 15797 + }, + { + "epoch": 2.36, + "grad_norm": 2.703942140329897, + "learning_rate": 1.1412976654962314e-06, + "loss": 0.6491, + "step": 15798 + }, + { + "epoch": 2.36, + "grad_norm": 3.8281227006674934, + "learning_rate": 1.1412020261782485e-06, + "loss": 0.6322, + "step": 15799 + }, + { + "epoch": 2.36, + "grad_norm": 2.306939327565783, + "learning_rate": 1.1411063855424144e-06, + "loss": 0.6029, + "step": 15800 + }, + { + "epoch": 2.36, + "grad_norm": 2.3705056623371785, + "learning_rate": 1.1410107435896222e-06, + "loss": 0.6471, + "step": 15801 + }, + { + "epoch": 2.36, + "grad_norm": 2.6121370281135845, + "learning_rate": 1.140915100320764e-06, + "loss": 0.6465, + "step": 15802 + }, + { + "epoch": 2.36, + "grad_norm": 3.276677718013321, + "learning_rate": 1.1408194557367328e-06, + "loss": 0.6719, + "step": 15803 + }, + { + "epoch": 2.36, + "grad_norm": 3.4974152409941497, + "learning_rate": 1.1407238098384214e-06, + "loss": 0.6113, + "step": 15804 + }, + { + "epoch": 2.36, + "grad_norm": 2.8520717013289967, + "learning_rate": 1.1406281626267218e-06, + "loss": 0.6296, + "step": 15805 + }, + { + "epoch": 2.36, + "grad_norm": 3.5512460417856415, + "learning_rate": 1.1405325141025275e-06, + "loss": 0.6413, + "step": 15806 + }, + { + "epoch": 2.36, + "grad_norm": 2.8520375539935556, + "learning_rate": 1.1404368642667305e-06, + "loss": 0.6302, + "step": 15807 + }, + { + "epoch": 2.36, + "grad_norm": 3.573950188715342, + "learning_rate": 1.1403412131202238e-06, + "loss": 0.6673, + "step": 15808 + }, + { + "epoch": 2.36, + "grad_norm": 4.046882024338023, + "learning_rate": 1.1402455606639006e-06, + "loss": 0.6003, + "step": 15809 + }, + { + "epoch": 2.36, + "grad_norm": 3.2947965362506473, + "learning_rate": 1.140149906898653e-06, + "loss": 0.6393, + "step": 15810 + }, + { + "epoch": 2.36, + "grad_norm": 4.179838462470135, + "learning_rate": 1.1400542518253737e-06, + "loss": 0.6146, + "step": 15811 + }, + { + "epoch": 2.36, + "grad_norm": 3.7140892324830417, + "learning_rate": 1.139958595444956e-06, + "loss": 0.6413, + "step": 15812 + }, + { + "epoch": 2.36, + "grad_norm": 3.5935490671847536, + "learning_rate": 1.1398629377582919e-06, + "loss": 0.5908, + "step": 15813 + }, + { + "epoch": 2.36, + "grad_norm": 4.926009256755029, + "learning_rate": 1.1397672787662749e-06, + "loss": 0.6803, + "step": 15814 + }, + { + "epoch": 2.36, + "grad_norm": 3.172859224099184, + "learning_rate": 1.1396716184697978e-06, + "loss": 0.651, + "step": 15815 + }, + { + "epoch": 2.36, + "grad_norm": 3.1458438290852886, + "learning_rate": 1.1395759568697526e-06, + "loss": 0.6432, + "step": 15816 + }, + { + "epoch": 2.36, + "grad_norm": 4.0628061464334415, + "learning_rate": 1.1394802939670332e-06, + "loss": 0.6289, + "step": 15817 + }, + { + "epoch": 2.36, + "grad_norm": 2.8751970948541596, + "learning_rate": 1.1393846297625315e-06, + "loss": 0.6237, + "step": 15818 + }, + { + "epoch": 2.36, + "grad_norm": 3.354192310415196, + "learning_rate": 1.1392889642571406e-06, + "loss": 0.6003, + "step": 15819 + }, + { + "epoch": 2.36, + "grad_norm": 3.869629816777073, + "learning_rate": 1.139193297451754e-06, + "loss": 0.6335, + "step": 15820 + }, + { + "epoch": 2.36, + "grad_norm": 3.4859566088744685, + "learning_rate": 1.1390976293472636e-06, + "loss": 0.6504, + "step": 15821 + }, + { + "epoch": 2.36, + "grad_norm": 4.157257447486696, + "learning_rate": 1.1390019599445631e-06, + "loss": 0.6921, + "step": 15822 + }, + { + "epoch": 2.36, + "grad_norm": 3.894028944016534, + "learning_rate": 1.1389062892445444e-06, + "loss": 0.6589, + "step": 15823 + }, + { + "epoch": 2.36, + "grad_norm": 4.3692942838173545, + "learning_rate": 1.1388106172481015e-06, + "loss": 0.6576, + "step": 15824 + }, + { + "epoch": 2.36, + "grad_norm": 3.4315443975371744, + "learning_rate": 1.1387149439561269e-06, + "loss": 0.6523, + "step": 15825 + }, + { + "epoch": 2.36, + "grad_norm": 3.135536008329789, + "learning_rate": 1.1386192693695132e-06, + "loss": 0.6335, + "step": 15826 + }, + { + "epoch": 2.36, + "grad_norm": 2.8581802280731794, + "learning_rate": 1.1385235934891538e-06, + "loss": 0.6263, + "step": 15827 + }, + { + "epoch": 2.36, + "grad_norm": 3.9941127203817457, + "learning_rate": 1.1384279163159414e-06, + "loss": 0.6178, + "step": 15828 + }, + { + "epoch": 2.36, + "grad_norm": 4.0551097855204254, + "learning_rate": 1.1383322378507687e-06, + "loss": 0.6654, + "step": 15829 + }, + { + "epoch": 2.36, + "grad_norm": 3.481017136001444, + "learning_rate": 1.1382365580945297e-06, + "loss": 0.6647, + "step": 15830 + }, + { + "epoch": 2.36, + "grad_norm": 3.163907449752612, + "learning_rate": 1.138140877048116e-06, + "loss": 0.6901, + "step": 15831 + }, + { + "epoch": 2.36, + "grad_norm": 5.0783049740645065, + "learning_rate": 1.1380451947124212e-06, + "loss": 0.6719, + "step": 15832 + }, + { + "epoch": 2.36, + "grad_norm": 2.7912017597442156, + "learning_rate": 1.1379495110883388e-06, + "loss": 0.638, + "step": 15833 + }, + { + "epoch": 2.36, + "grad_norm": 4.305357684386184, + "learning_rate": 1.1378538261767613e-06, + "loss": 0.6354, + "step": 15834 + }, + { + "epoch": 2.36, + "grad_norm": 2.8745130786079143, + "learning_rate": 1.137758139978582e-06, + "loss": 0.6419, + "step": 15835 + }, + { + "epoch": 2.36, + "grad_norm": 4.212757782947894, + "learning_rate": 1.1376624524946936e-06, + "loss": 0.6107, + "step": 15836 + }, + { + "epoch": 2.36, + "grad_norm": 2.7874742298146407, + "learning_rate": 1.1375667637259896e-06, + "loss": 0.6022, + "step": 15837 + }, + { + "epoch": 2.36, + "grad_norm": 3.0945619462919427, + "learning_rate": 1.1374710736733625e-06, + "loss": 0.6647, + "step": 15838 + }, + { + "epoch": 2.36, + "grad_norm": 3.9595730830672546, + "learning_rate": 1.1373753823377059e-06, + "loss": 0.6165, + "step": 15839 + }, + { + "epoch": 2.36, + "grad_norm": 5.146828433752816, + "learning_rate": 1.1372796897199126e-06, + "loss": 0.6693, + "step": 15840 + }, + { + "epoch": 2.36, + "grad_norm": 2.5734080731958113, + "learning_rate": 1.1371839958208759e-06, + "loss": 0.6133, + "step": 15841 + }, + { + "epoch": 2.36, + "grad_norm": 3.10739823715781, + "learning_rate": 1.137088300641489e-06, + "loss": 0.6315, + "step": 15842 + }, + { + "epoch": 2.36, + "grad_norm": 2.7811276497030812, + "learning_rate": 1.1369926041826446e-06, + "loss": 0.6458, + "step": 15843 + }, + { + "epoch": 2.36, + "grad_norm": 2.804147747330815, + "learning_rate": 1.1368969064452364e-06, + "loss": 0.6491, + "step": 15844 + }, + { + "epoch": 2.36, + "grad_norm": 2.9322221472465175, + "learning_rate": 1.1368012074301567e-06, + "loss": 0.6439, + "step": 15845 + }, + { + "epoch": 2.36, + "grad_norm": 4.327389978169689, + "learning_rate": 1.1367055071382995e-06, + "loss": 0.6517, + "step": 15846 + }, + { + "epoch": 2.36, + "grad_norm": 2.7897626530579944, + "learning_rate": 1.136609805570558e-06, + "loss": 0.6335, + "step": 15847 + }, + { + "epoch": 2.36, + "grad_norm": 3.743807995404757, + "learning_rate": 1.1365141027278249e-06, + "loss": 0.6322, + "step": 15848 + }, + { + "epoch": 2.36, + "grad_norm": 3.7473845884742656, + "learning_rate": 1.1364183986109937e-06, + "loss": 0.6341, + "step": 15849 + }, + { + "epoch": 2.36, + "grad_norm": 2.523371044455536, + "learning_rate": 1.1363226932209573e-06, + "loss": 0.668, + "step": 15850 + }, + { + "epoch": 2.36, + "grad_norm": 3.034440062312992, + "learning_rate": 1.1362269865586094e-06, + "loss": 0.6536, + "step": 15851 + }, + { + "epoch": 2.36, + "grad_norm": 3.4252794925782224, + "learning_rate": 1.1361312786248426e-06, + "loss": 0.6309, + "step": 15852 + }, + { + "epoch": 2.36, + "grad_norm": 4.188381073125677, + "learning_rate": 1.136035569420551e-06, + "loss": 0.6615, + "step": 15853 + }, + { + "epoch": 2.36, + "grad_norm": 5.362495473130766, + "learning_rate": 1.1359398589466275e-06, + "loss": 0.6452, + "step": 15854 + }, + { + "epoch": 2.36, + "grad_norm": 4.5611368692369165, + "learning_rate": 1.1358441472039646e-06, + "loss": 0.6335, + "step": 15855 + }, + { + "epoch": 2.36, + "grad_norm": 4.7911299203316124, + "learning_rate": 1.1357484341934567e-06, + "loss": 0.6393, + "step": 15856 + }, + { + "epoch": 2.36, + "grad_norm": 2.838695112652654, + "learning_rate": 1.1356527199159968e-06, + "loss": 0.6348, + "step": 15857 + }, + { + "epoch": 2.37, + "grad_norm": 6.535957255986538, + "learning_rate": 1.1355570043724775e-06, + "loss": 0.6432, + "step": 15858 + }, + { + "epoch": 2.37, + "grad_norm": 3.3936142952134225, + "learning_rate": 1.1354612875637932e-06, + "loss": 0.7207, + "step": 15859 + }, + { + "epoch": 2.37, + "grad_norm": 3.78743223454934, + "learning_rate": 1.1353655694908369e-06, + "loss": 0.6471, + "step": 15860 + }, + { + "epoch": 2.37, + "grad_norm": 5.69397850037158, + "learning_rate": 1.135269850154501e-06, + "loss": 0.6504, + "step": 15861 + }, + { + "epoch": 2.37, + "grad_norm": 3.3889493402897677, + "learning_rate": 1.1351741295556803e-06, + "loss": 0.7096, + "step": 15862 + }, + { + "epoch": 2.37, + "grad_norm": 3.182448437659767, + "learning_rate": 1.1350784076952674e-06, + "loss": 0.6139, + "step": 15863 + }, + { + "epoch": 2.37, + "grad_norm": 5.025287179788519, + "learning_rate": 1.1349826845741554e-06, + "loss": 0.6471, + "step": 15864 + }, + { + "epoch": 2.37, + "grad_norm": 3.569760339880629, + "learning_rate": 1.1348869601932386e-06, + "loss": 0.6413, + "step": 15865 + }, + { + "epoch": 2.37, + "grad_norm": 3.6045568196374487, + "learning_rate": 1.1347912345534093e-06, + "loss": 0.64, + "step": 15866 + }, + { + "epoch": 2.37, + "grad_norm": 2.736131967806333, + "learning_rate": 1.134695507655562e-06, + "loss": 0.6055, + "step": 15867 + }, + { + "epoch": 2.37, + "grad_norm": 3.99468582354883, + "learning_rate": 1.1345997795005893e-06, + "loss": 0.6042, + "step": 15868 + }, + { + "epoch": 2.37, + "grad_norm": 2.7640156324677063, + "learning_rate": 1.134504050089385e-06, + "loss": 0.6224, + "step": 15869 + }, + { + "epoch": 2.37, + "grad_norm": 3.0797146889636147, + "learning_rate": 1.1344083194228425e-06, + "loss": 0.651, + "step": 15870 + }, + { + "epoch": 2.37, + "grad_norm": 2.777392407644592, + "learning_rate": 1.1343125875018553e-06, + "loss": 0.6302, + "step": 15871 + }, + { + "epoch": 2.37, + "grad_norm": 4.239444688259647, + "learning_rate": 1.1342168543273168e-06, + "loss": 0.6361, + "step": 15872 + }, + { + "epoch": 2.37, + "grad_norm": 2.784180383397691, + "learning_rate": 1.1341211199001209e-06, + "loss": 0.6341, + "step": 15873 + }, + { + "epoch": 2.37, + "grad_norm": 3.310249779437611, + "learning_rate": 1.1340253842211601e-06, + "loss": 0.6458, + "step": 15874 + }, + { + "epoch": 2.37, + "grad_norm": 4.293694521614699, + "learning_rate": 1.133929647291329e-06, + "loss": 0.6074, + "step": 15875 + }, + { + "epoch": 2.37, + "grad_norm": 2.9632932281772253, + "learning_rate": 1.1338339091115205e-06, + "loss": 0.6165, + "step": 15876 + }, + { + "epoch": 2.37, + "grad_norm": 2.911498963109746, + "learning_rate": 1.1337381696826282e-06, + "loss": 0.5967, + "step": 15877 + }, + { + "epoch": 2.37, + "grad_norm": 3.6289929855047682, + "learning_rate": 1.133642429005546e-06, + "loss": 0.6875, + "step": 15878 + }, + { + "epoch": 2.37, + "grad_norm": 4.1489766152677765, + "learning_rate": 1.133546687081167e-06, + "loss": 0.6367, + "step": 15879 + }, + { + "epoch": 2.37, + "grad_norm": 3.8468239618161966, + "learning_rate": 1.133450943910385e-06, + "loss": 0.6849, + "step": 15880 + }, + { + "epoch": 2.37, + "grad_norm": 3.534982243184555, + "learning_rate": 1.1333551994940936e-06, + "loss": 0.6302, + "step": 15881 + }, + { + "epoch": 2.37, + "grad_norm": 3.078178378655002, + "learning_rate": 1.133259453833186e-06, + "loss": 0.6419, + "step": 15882 + }, + { + "epoch": 2.37, + "grad_norm": 3.8773638768870575, + "learning_rate": 1.1331637069285568e-06, + "loss": 0.6419, + "step": 15883 + }, + { + "epoch": 2.37, + "grad_norm": 3.7620293993691156, + "learning_rate": 1.1330679587810982e-06, + "loss": 0.6784, + "step": 15884 + }, + { + "epoch": 2.37, + "grad_norm": 4.555117029698259, + "learning_rate": 1.132972209391705e-06, + "loss": 0.6055, + "step": 15885 + }, + { + "epoch": 2.37, + "grad_norm": 3.033466737531785, + "learning_rate": 1.1328764587612702e-06, + "loss": 0.6097, + "step": 15886 + }, + { + "epoch": 2.37, + "grad_norm": 4.042219328337598, + "learning_rate": 1.132780706890688e-06, + "loss": 0.6608, + "step": 15887 + }, + { + "epoch": 2.37, + "grad_norm": 4.392516632518395, + "learning_rate": 1.1326849537808516e-06, + "loss": 0.6706, + "step": 15888 + }, + { + "epoch": 2.37, + "grad_norm": 4.961284250711164, + "learning_rate": 1.1325891994326543e-06, + "loss": 0.6543, + "step": 15889 + }, + { + "epoch": 2.37, + "grad_norm": 3.8681460287775384, + "learning_rate": 1.1324934438469908e-06, + "loss": 0.6074, + "step": 15890 + }, + { + "epoch": 2.37, + "grad_norm": 3.0494756028003267, + "learning_rate": 1.1323976870247542e-06, + "loss": 0.6654, + "step": 15891 + }, + { + "epoch": 2.37, + "grad_norm": 5.529816430758364, + "learning_rate": 1.132301928966838e-06, + "loss": 0.6953, + "step": 15892 + }, + { + "epoch": 2.37, + "grad_norm": 2.8862844058575616, + "learning_rate": 1.1322061696741368e-06, + "loss": 0.6237, + "step": 15893 + }, + { + "epoch": 2.37, + "grad_norm": 3.2209552667348422, + "learning_rate": 1.1321104091475433e-06, + "loss": 0.6862, + "step": 15894 + }, + { + "epoch": 2.37, + "grad_norm": 3.72895880041363, + "learning_rate": 1.1320146473879515e-06, + "loss": 0.6608, + "step": 15895 + }, + { + "epoch": 2.37, + "grad_norm": 2.9396478320682125, + "learning_rate": 1.131918884396256e-06, + "loss": 0.6732, + "step": 15896 + }, + { + "epoch": 2.37, + "grad_norm": 3.148769384043705, + "learning_rate": 1.1318231201733492e-06, + "loss": 0.653, + "step": 15897 + }, + { + "epoch": 2.37, + "grad_norm": 3.2962009771422327, + "learning_rate": 1.1317273547201256e-06, + "loss": 0.6471, + "step": 15898 + }, + { + "epoch": 2.37, + "grad_norm": 4.520487743568668, + "learning_rate": 1.1316315880374792e-06, + "loss": 0.6393, + "step": 15899 + }, + { + "epoch": 2.37, + "grad_norm": 3.278244560681304, + "learning_rate": 1.1315358201263034e-06, + "loss": 0.6315, + "step": 15900 + }, + { + "epoch": 2.37, + "grad_norm": 4.269449686826547, + "learning_rate": 1.1314400509874925e-06, + "loss": 0.6074, + "step": 15901 + }, + { + "epoch": 2.37, + "grad_norm": 2.6007146760572435, + "learning_rate": 1.1313442806219396e-06, + "loss": 0.627, + "step": 15902 + }, + { + "epoch": 2.37, + "grad_norm": 3.6962125797987073, + "learning_rate": 1.131248509030539e-06, + "loss": 0.6243, + "step": 15903 + }, + { + "epoch": 2.37, + "grad_norm": 3.2157323529812376, + "learning_rate": 1.1311527362141847e-06, + "loss": 0.6243, + "step": 15904 + }, + { + "epoch": 2.37, + "grad_norm": 2.9193321465564037, + "learning_rate": 1.1310569621737699e-06, + "loss": 0.6966, + "step": 15905 + }, + { + "epoch": 2.37, + "grad_norm": 3.304618995516055, + "learning_rate": 1.130961186910189e-06, + "loss": 0.6322, + "step": 15906 + }, + { + "epoch": 2.37, + "grad_norm": 3.7437406784059326, + "learning_rate": 1.1308654104243361e-06, + "loss": 0.696, + "step": 15907 + }, + { + "epoch": 2.37, + "grad_norm": 4.221913687051314, + "learning_rate": 1.1307696327171042e-06, + "loss": 0.6413, + "step": 15908 + }, + { + "epoch": 2.37, + "grad_norm": 3.8362109930634443, + "learning_rate": 1.130673853789388e-06, + "loss": 0.653, + "step": 15909 + }, + { + "epoch": 2.37, + "grad_norm": 4.4415557697326244, + "learning_rate": 1.1305780736420813e-06, + "loss": 0.651, + "step": 15910 + }, + { + "epoch": 2.37, + "grad_norm": 3.0959303538892415, + "learning_rate": 1.1304822922760776e-06, + "loss": 0.6999, + "step": 15911 + }, + { + "epoch": 2.37, + "grad_norm": 3.579657961169723, + "learning_rate": 1.1303865096922717e-06, + "loss": 0.6139, + "step": 15912 + }, + { + "epoch": 2.37, + "grad_norm": 3.6999772839685936, + "learning_rate": 1.1302907258915565e-06, + "loss": 0.6445, + "step": 15913 + }, + { + "epoch": 2.37, + "grad_norm": 2.984905840534368, + "learning_rate": 1.1301949408748263e-06, + "loss": 0.6576, + "step": 15914 + }, + { + "epoch": 2.37, + "grad_norm": 2.68412456516553, + "learning_rate": 1.1300991546429756e-06, + "loss": 0.6882, + "step": 15915 + }, + { + "epoch": 2.37, + "grad_norm": 3.2862465981914606, + "learning_rate": 1.1300033671968981e-06, + "loss": 0.5827, + "step": 15916 + }, + { + "epoch": 2.37, + "grad_norm": 2.8980314052684344, + "learning_rate": 1.1299075785374874e-06, + "loss": 0.5983, + "step": 15917 + }, + { + "epoch": 2.37, + "grad_norm": 5.420323316350745, + "learning_rate": 1.1298117886656376e-06, + "loss": 0.6263, + "step": 15918 + }, + { + "epoch": 2.37, + "grad_norm": 3.3264476586519303, + "learning_rate": 1.1297159975822427e-06, + "loss": 0.6146, + "step": 15919 + }, + { + "epoch": 2.37, + "grad_norm": 3.586910110136496, + "learning_rate": 1.1296202052881977e-06, + "loss": 0.6296, + "step": 15920 + }, + { + "epoch": 2.37, + "grad_norm": 3.1691436034217277, + "learning_rate": 1.1295244117843954e-06, + "loss": 0.6686, + "step": 15921 + }, + { + "epoch": 2.37, + "grad_norm": 3.1756549485068892, + "learning_rate": 1.1294286170717304e-06, + "loss": 0.6449, + "step": 15922 + }, + { + "epoch": 2.37, + "grad_norm": 3.7703726995010727, + "learning_rate": 1.1293328211510966e-06, + "loss": 0.6322, + "step": 15923 + }, + { + "epoch": 2.37, + "grad_norm": 2.610535435982195, + "learning_rate": 1.1292370240233882e-06, + "loss": 0.6471, + "step": 15924 + }, + { + "epoch": 2.38, + "grad_norm": 2.7102247833893967, + "learning_rate": 1.1291412256894993e-06, + "loss": 0.6302, + "step": 15925 + }, + { + "epoch": 2.38, + "grad_norm": 3.683518268704734, + "learning_rate": 1.129045426150324e-06, + "loss": 0.6484, + "step": 15926 + }, + { + "epoch": 2.38, + "grad_norm": 2.85041928147152, + "learning_rate": 1.1289496254067558e-06, + "loss": 0.6589, + "step": 15927 + }, + { + "epoch": 2.38, + "grad_norm": 6.060261737069827, + "learning_rate": 1.1288538234596899e-06, + "loss": 0.7057, + "step": 15928 + }, + { + "epoch": 2.38, + "grad_norm": 3.151229271133701, + "learning_rate": 1.1287580203100199e-06, + "loss": 0.5846, + "step": 15929 + }, + { + "epoch": 2.38, + "grad_norm": 3.347921214316046, + "learning_rate": 1.1286622159586392e-06, + "loss": 0.6549, + "step": 15930 + }, + { + "epoch": 2.38, + "grad_norm": 2.818109952572884, + "learning_rate": 1.1285664104064435e-06, + "loss": 0.6354, + "step": 15931 + }, + { + "epoch": 2.38, + "grad_norm": 3.516593196405455, + "learning_rate": 1.1284706036543254e-06, + "loss": 0.6497, + "step": 15932 + }, + { + "epoch": 2.38, + "grad_norm": 3.648431812257964, + "learning_rate": 1.1283747957031805e-06, + "loss": 0.6536, + "step": 15933 + }, + { + "epoch": 2.38, + "grad_norm": 2.7030268458017934, + "learning_rate": 1.1282789865539018e-06, + "loss": 0.5983, + "step": 15934 + }, + { + "epoch": 2.38, + "grad_norm": 3.1146956595623947, + "learning_rate": 1.128183176207384e-06, + "loss": 0.6152, + "step": 15935 + }, + { + "epoch": 2.38, + "grad_norm": 3.3537400021975876, + "learning_rate": 1.1280873646645217e-06, + "loss": 0.6497, + "step": 15936 + }, + { + "epoch": 2.38, + "grad_norm": 4.199663106743153, + "learning_rate": 1.1279915519262083e-06, + "loss": 0.6862, + "step": 15937 + }, + { + "epoch": 2.38, + "grad_norm": 2.72473371856909, + "learning_rate": 1.1278957379933385e-06, + "loss": 0.6367, + "step": 15938 + }, + { + "epoch": 2.38, + "grad_norm": 3.0411363010506207, + "learning_rate": 1.1277999228668065e-06, + "loss": 0.6087, + "step": 15939 + }, + { + "epoch": 2.38, + "grad_norm": 6.17421930294263, + "learning_rate": 1.1277041065475063e-06, + "loss": 0.6927, + "step": 15940 + }, + { + "epoch": 2.38, + "grad_norm": 5.390496714832426, + "learning_rate": 1.1276082890363327e-06, + "loss": 0.5957, + "step": 15941 + }, + { + "epoch": 2.38, + "grad_norm": 3.295812400248977, + "learning_rate": 1.1275124703341794e-06, + "loss": 0.694, + "step": 15942 + }, + { + "epoch": 2.38, + "grad_norm": 3.0083738480547475, + "learning_rate": 1.1274166504419412e-06, + "loss": 0.6608, + "step": 15943 + }, + { + "epoch": 2.38, + "grad_norm": 4.0318904459347955, + "learning_rate": 1.127320829360512e-06, + "loss": 0.6465, + "step": 15944 + }, + { + "epoch": 2.38, + "grad_norm": 2.9782931183573833, + "learning_rate": 1.127225007090786e-06, + "loss": 0.6628, + "step": 15945 + }, + { + "epoch": 2.38, + "grad_norm": 4.0025654423306944, + "learning_rate": 1.1271291836336582e-06, + "loss": 0.6211, + "step": 15946 + }, + { + "epoch": 2.38, + "grad_norm": 6.267129140141695, + "learning_rate": 1.1270333589900223e-06, + "loss": 0.7005, + "step": 15947 + }, + { + "epoch": 2.38, + "grad_norm": 9.473915089093387, + "learning_rate": 1.1269375331607726e-06, + "loss": 0.7376, + "step": 15948 + }, + { + "epoch": 2.38, + "grad_norm": 3.0353836659594706, + "learning_rate": 1.1268417061468042e-06, + "loss": 0.5801, + "step": 15949 + }, + { + "epoch": 2.38, + "grad_norm": 2.814409653992196, + "learning_rate": 1.1267458779490107e-06, + "loss": 0.6374, + "step": 15950 + }, + { + "epoch": 2.38, + "grad_norm": 3.73925987622777, + "learning_rate": 1.1266500485682865e-06, + "loss": 0.6445, + "step": 15951 + }, + { + "epoch": 2.38, + "grad_norm": 3.1118327128741576, + "learning_rate": 1.1265542180055263e-06, + "loss": 0.6693, + "step": 15952 + }, + { + "epoch": 2.38, + "grad_norm": 2.8620688949316597, + "learning_rate": 1.1264583862616243e-06, + "loss": 0.6374, + "step": 15953 + }, + { + "epoch": 2.38, + "grad_norm": 3.714676099440628, + "learning_rate": 1.1263625533374754e-06, + "loss": 0.6634, + "step": 15954 + }, + { + "epoch": 2.38, + "grad_norm": 4.491328275974797, + "learning_rate": 1.1262667192339732e-06, + "loss": 0.6152, + "step": 15955 + }, + { + "epoch": 2.38, + "grad_norm": 2.557864691331143, + "learning_rate": 1.1261708839520128e-06, + "loss": 0.5977, + "step": 15956 + }, + { + "epoch": 2.38, + "grad_norm": 2.984145693881013, + "learning_rate": 1.1260750474924884e-06, + "loss": 0.6302, + "step": 15957 + }, + { + "epoch": 2.38, + "grad_norm": 2.97179597802095, + "learning_rate": 1.1259792098562942e-06, + "loss": 0.6745, + "step": 15958 + }, + { + "epoch": 2.38, + "grad_norm": 3.9074186933901665, + "learning_rate": 1.125883371044325e-06, + "loss": 0.6322, + "step": 15959 + }, + { + "epoch": 2.38, + "grad_norm": 6.771173283201224, + "learning_rate": 1.125787531057475e-06, + "loss": 0.6536, + "step": 15960 + }, + { + "epoch": 2.38, + "grad_norm": 5.913632850965144, + "learning_rate": 1.1256916898966391e-06, + "loss": 0.5872, + "step": 15961 + }, + { + "epoch": 2.38, + "grad_norm": 2.5265707299778337, + "learning_rate": 1.1255958475627115e-06, + "loss": 0.6068, + "step": 15962 + }, + { + "epoch": 2.38, + "grad_norm": 2.668743129347392, + "learning_rate": 1.1255000040565868e-06, + "loss": 0.6185, + "step": 15963 + }, + { + "epoch": 2.38, + "grad_norm": 3.6847375622058953, + "learning_rate": 1.1254041593791593e-06, + "loss": 0.6693, + "step": 15964 + }, + { + "epoch": 2.38, + "grad_norm": 2.852563075570596, + "learning_rate": 1.1253083135313237e-06, + "loss": 0.6257, + "step": 15965 + }, + { + "epoch": 2.38, + "grad_norm": 5.144809142907136, + "learning_rate": 1.1252124665139747e-06, + "loss": 0.6738, + "step": 15966 + }, + { + "epoch": 2.38, + "grad_norm": 3.0574801540987244, + "learning_rate": 1.1251166183280065e-06, + "loss": 0.6497, + "step": 15967 + }, + { + "epoch": 2.38, + "grad_norm": 3.3330607053067824, + "learning_rate": 1.1250207689743136e-06, + "loss": 0.6523, + "step": 15968 + }, + { + "epoch": 2.38, + "grad_norm": 4.096843486567493, + "learning_rate": 1.1249249184537913e-06, + "loss": 0.5879, + "step": 15969 + }, + { + "epoch": 2.38, + "grad_norm": 4.222385414843731, + "learning_rate": 1.1248290667673335e-06, + "loss": 0.6953, + "step": 15970 + }, + { + "epoch": 2.38, + "grad_norm": 2.9042043471230543, + "learning_rate": 1.124733213915835e-06, + "loss": 0.64, + "step": 15971 + }, + { + "epoch": 2.38, + "grad_norm": 3.361953733917618, + "learning_rate": 1.1246373599001903e-06, + "loss": 0.6549, + "step": 15972 + }, + { + "epoch": 2.38, + "grad_norm": 4.308312301632412, + "learning_rate": 1.1245415047212943e-06, + "loss": 0.6374, + "step": 15973 + }, + { + "epoch": 2.38, + "grad_norm": 3.4630636186468324, + "learning_rate": 1.124445648380041e-06, + "loss": 0.6302, + "step": 15974 + }, + { + "epoch": 2.38, + "grad_norm": 3.7827649962197976, + "learning_rate": 1.1243497908773258e-06, + "loss": 0.6003, + "step": 15975 + }, + { + "epoch": 2.38, + "grad_norm": 3.0841089733464364, + "learning_rate": 1.1242539322140433e-06, + "loss": 0.5924, + "step": 15976 + }, + { + "epoch": 2.38, + "grad_norm": 3.832639113902774, + "learning_rate": 1.124158072391087e-06, + "loss": 0.6562, + "step": 15977 + }, + { + "epoch": 2.38, + "grad_norm": 3.116939860749533, + "learning_rate": 1.1240622114093532e-06, + "loss": 0.6273, + "step": 15978 + }, + { + "epoch": 2.38, + "grad_norm": 4.229429171491874, + "learning_rate": 1.1239663492697355e-06, + "loss": 0.6393, + "step": 15979 + }, + { + "epoch": 2.38, + "grad_norm": 3.4140929168467435, + "learning_rate": 1.123870485973129e-06, + "loss": 0.6035, + "step": 15980 + }, + { + "epoch": 2.38, + "grad_norm": 3.0849633651683477, + "learning_rate": 1.1237746215204282e-06, + "loss": 0.638, + "step": 15981 + }, + { + "epoch": 2.38, + "grad_norm": 2.949226397470719, + "learning_rate": 1.1236787559125281e-06, + "loss": 0.5892, + "step": 15982 + }, + { + "epoch": 2.38, + "grad_norm": 4.493257444750188, + "learning_rate": 1.123582889150323e-06, + "loss": 0.5846, + "step": 15983 + }, + { + "epoch": 2.38, + "grad_norm": 3.346193044721229, + "learning_rate": 1.123487021234708e-06, + "loss": 0.64, + "step": 15984 + }, + { + "epoch": 2.38, + "grad_norm": 4.959407806404358, + "learning_rate": 1.1233911521665773e-06, + "loss": 0.6914, + "step": 15985 + }, + { + "epoch": 2.38, + "grad_norm": 3.2826494646437285, + "learning_rate": 1.1232952819468266e-06, + "loss": 0.6803, + "step": 15986 + }, + { + "epoch": 2.38, + "grad_norm": 5.547748330237594, + "learning_rate": 1.12319941057635e-06, + "loss": 0.6289, + "step": 15987 + }, + { + "epoch": 2.38, + "grad_norm": 4.751843283729108, + "learning_rate": 1.1231035380560423e-06, + "loss": 0.6647, + "step": 15988 + }, + { + "epoch": 2.38, + "grad_norm": 3.5135935520778547, + "learning_rate": 1.1230076643867988e-06, + "loss": 0.625, + "step": 15989 + }, + { + "epoch": 2.38, + "grad_norm": 5.19786064295236, + "learning_rate": 1.1229117895695134e-06, + "loss": 0.6257, + "step": 15990 + }, + { + "epoch": 2.38, + "grad_norm": 3.163834061291581, + "learning_rate": 1.1228159136050815e-06, + "loss": 0.6484, + "step": 15991 + }, + { + "epoch": 2.39, + "grad_norm": 3.7905035612720064, + "learning_rate": 1.1227200364943978e-06, + "loss": 0.6283, + "step": 15992 + }, + { + "epoch": 2.39, + "grad_norm": 3.2534703283875044, + "learning_rate": 1.122624158238357e-06, + "loss": 0.638, + "step": 15993 + }, + { + "epoch": 2.39, + "grad_norm": 3.7566997319918616, + "learning_rate": 1.1225282788378544e-06, + "loss": 0.679, + "step": 15994 + }, + { + "epoch": 2.39, + "grad_norm": 3.5881351070549714, + "learning_rate": 1.1224323982937845e-06, + "loss": 0.6263, + "step": 15995 + }, + { + "epoch": 2.39, + "grad_norm": 7.046149292564972, + "learning_rate": 1.122336516607042e-06, + "loss": 0.6777, + "step": 15996 + }, + { + "epoch": 2.39, + "grad_norm": 4.191495725158638, + "learning_rate": 1.122240633778522e-06, + "loss": 0.6543, + "step": 15997 + }, + { + "epoch": 2.39, + "grad_norm": 6.02553166109281, + "learning_rate": 1.1221447498091194e-06, + "loss": 0.6543, + "step": 15998 + }, + { + "epoch": 2.39, + "grad_norm": 5.477925237376793, + "learning_rate": 1.1220488646997295e-06, + "loss": 0.625, + "step": 15999 + }, + { + "epoch": 2.39, + "grad_norm": 3.884157513761074, + "learning_rate": 1.1219529784512463e-06, + "loss": 0.6562, + "step": 16000 + }, + { + "epoch": 2.39, + "grad_norm": 3.0065588730448707, + "learning_rate": 1.121857091064565e-06, + "loss": 0.6276, + "step": 16001 + }, + { + "epoch": 2.39, + "grad_norm": 5.23295493786255, + "learning_rate": 1.1217612025405812e-06, + "loss": 0.6771, + "step": 16002 + }, + { + "epoch": 2.39, + "grad_norm": 5.086968375910149, + "learning_rate": 1.121665312880189e-06, + "loss": 0.5827, + "step": 16003 + }, + { + "epoch": 2.39, + "grad_norm": 5.043155370643646, + "learning_rate": 1.1215694220842838e-06, + "loss": 0.6523, + "step": 16004 + }, + { + "epoch": 2.39, + "grad_norm": 2.6448588514813585, + "learning_rate": 1.1214735301537603e-06, + "loss": 0.6348, + "step": 16005 + }, + { + "epoch": 2.39, + "grad_norm": 3.213952559827501, + "learning_rate": 1.1213776370895137e-06, + "loss": 0.6419, + "step": 16006 + }, + { + "epoch": 2.39, + "grad_norm": 3.650437505480731, + "learning_rate": 1.1212817428924388e-06, + "loss": 0.6237, + "step": 16007 + }, + { + "epoch": 2.39, + "grad_norm": 3.0581947966255765, + "learning_rate": 1.1211858475634304e-06, + "loss": 0.6823, + "step": 16008 + }, + { + "epoch": 2.39, + "grad_norm": 3.5155968354425697, + "learning_rate": 1.1210899511033842e-06, + "loss": 0.6875, + "step": 16009 + }, + { + "epoch": 2.39, + "grad_norm": 3.386549504661448, + "learning_rate": 1.1209940535131947e-06, + "loss": 0.6315, + "step": 16010 + }, + { + "epoch": 2.39, + "grad_norm": 2.915655688357336, + "learning_rate": 1.120898154793757e-06, + "loss": 0.6576, + "step": 16011 + }, + { + "epoch": 2.39, + "grad_norm": 3.220314836028879, + "learning_rate": 1.1208022549459658e-06, + "loss": 0.6159, + "step": 16012 + }, + { + "epoch": 2.39, + "grad_norm": 4.099946020111482, + "learning_rate": 1.120706353970717e-06, + "loss": 0.6458, + "step": 16013 + }, + { + "epoch": 2.39, + "grad_norm": 2.734100900502355, + "learning_rate": 1.1206104518689048e-06, + "loss": 0.6432, + "step": 16014 + }, + { + "epoch": 2.39, + "grad_norm": 3.3274394004280685, + "learning_rate": 1.120514548641425e-06, + "loss": 0.6523, + "step": 16015 + }, + { + "epoch": 2.39, + "grad_norm": 5.370699317034133, + "learning_rate": 1.1204186442891717e-06, + "loss": 0.6641, + "step": 16016 + }, + { + "epoch": 2.39, + "grad_norm": 3.206739447317396, + "learning_rate": 1.1203227388130405e-06, + "loss": 0.6178, + "step": 16017 + }, + { + "epoch": 2.39, + "grad_norm": 3.0331001285566295, + "learning_rate": 1.1202268322139269e-06, + "loss": 0.6348, + "step": 16018 + }, + { + "epoch": 2.39, + "grad_norm": 2.9724805114777535, + "learning_rate": 1.1201309244927254e-06, + "loss": 0.6126, + "step": 16019 + }, + { + "epoch": 2.39, + "grad_norm": 4.27581916328456, + "learning_rate": 1.1200350156503317e-06, + "loss": 0.6803, + "step": 16020 + }, + { + "epoch": 2.39, + "grad_norm": 2.88957525263457, + "learning_rate": 1.1199391056876403e-06, + "loss": 0.6387, + "step": 16021 + }, + { + "epoch": 2.39, + "grad_norm": 2.869931868642469, + "learning_rate": 1.1198431946055467e-06, + "loss": 0.6074, + "step": 16022 + }, + { + "epoch": 2.39, + "grad_norm": 6.045725759492785, + "learning_rate": 1.1197472824049462e-06, + "loss": 0.6458, + "step": 16023 + }, + { + "epoch": 2.39, + "grad_norm": 2.88935164115915, + "learning_rate": 1.1196513690867334e-06, + "loss": 0.6009, + "step": 16024 + }, + { + "epoch": 2.39, + "grad_norm": 2.7606487753342623, + "learning_rate": 1.1195554546518037e-06, + "loss": 0.6543, + "step": 16025 + }, + { + "epoch": 2.39, + "grad_norm": 5.560158258541504, + "learning_rate": 1.1194595391010526e-06, + "loss": 0.6764, + "step": 16026 + }, + { + "epoch": 2.39, + "grad_norm": 5.088141965698427, + "learning_rate": 1.119363622435375e-06, + "loss": 0.6361, + "step": 16027 + }, + { + "epoch": 2.39, + "grad_norm": 3.7752546848311375, + "learning_rate": 1.1192677046556662e-06, + "loss": 0.6595, + "step": 16028 + }, + { + "epoch": 2.39, + "grad_norm": 4.41125274391606, + "learning_rate": 1.1191717857628214e-06, + "loss": 0.61, + "step": 16029 + }, + { + "epoch": 2.39, + "grad_norm": 3.1631689351239682, + "learning_rate": 1.1190758657577356e-06, + "loss": 0.6204, + "step": 16030 + }, + { + "epoch": 2.39, + "grad_norm": 3.443853966430382, + "learning_rate": 1.1189799446413044e-06, + "loss": 0.6615, + "step": 16031 + }, + { + "epoch": 2.39, + "grad_norm": 4.608314763563522, + "learning_rate": 1.1188840224144228e-06, + "loss": 0.6178, + "step": 16032 + }, + { + "epoch": 2.39, + "grad_norm": 3.5646241895787294, + "learning_rate": 1.118788099077986e-06, + "loss": 0.6771, + "step": 16033 + }, + { + "epoch": 2.39, + "grad_norm": 2.9799535688595595, + "learning_rate": 1.1186921746328894e-06, + "loss": 0.6497, + "step": 16034 + }, + { + "epoch": 2.39, + "grad_norm": 4.000872368681424, + "learning_rate": 1.1185962490800285e-06, + "loss": 0.64, + "step": 16035 + }, + { + "epoch": 2.39, + "grad_norm": 3.4286238794489177, + "learning_rate": 1.118500322420298e-06, + "loss": 0.6641, + "step": 16036 + }, + { + "epoch": 2.39, + "grad_norm": 3.4438528339846624, + "learning_rate": 1.1184043946545936e-06, + "loss": 0.6296, + "step": 16037 + }, + { + "epoch": 2.39, + "grad_norm": 3.4083722784136445, + "learning_rate": 1.1183084657838107e-06, + "loss": 0.696, + "step": 16038 + }, + { + "epoch": 2.39, + "grad_norm": 3.4318393104438485, + "learning_rate": 1.118212535808844e-06, + "loss": 0.6602, + "step": 16039 + }, + { + "epoch": 2.39, + "grad_norm": 5.322258070430384, + "learning_rate": 1.1181166047305897e-06, + "loss": 0.6191, + "step": 16040 + }, + { + "epoch": 2.39, + "grad_norm": 3.309777537975936, + "learning_rate": 1.1180206725499424e-06, + "loss": 0.6611, + "step": 16041 + }, + { + "epoch": 2.39, + "grad_norm": 3.3367862938638617, + "learning_rate": 1.117924739267798e-06, + "loss": 0.6322, + "step": 16042 + }, + { + "epoch": 2.39, + "grad_norm": 3.8888467385844456, + "learning_rate": 1.117828804885051e-06, + "loss": 0.6445, + "step": 16043 + }, + { + "epoch": 2.39, + "grad_norm": 2.9183995338337607, + "learning_rate": 1.117732869402598e-06, + "loss": 0.6283, + "step": 16044 + }, + { + "epoch": 2.39, + "grad_norm": 4.5974500131339155, + "learning_rate": 1.1176369328213336e-06, + "loss": 0.6426, + "step": 16045 + }, + { + "epoch": 2.39, + "grad_norm": 4.130826455416171, + "learning_rate": 1.117540995142153e-06, + "loss": 0.6615, + "step": 16046 + }, + { + "epoch": 2.39, + "grad_norm": 5.332951879514919, + "learning_rate": 1.1174450563659522e-06, + "loss": 0.6257, + "step": 16047 + }, + { + "epoch": 2.39, + "grad_norm": 2.582548716346851, + "learning_rate": 1.1173491164936262e-06, + "loss": 0.6172, + "step": 16048 + }, + { + "epoch": 2.39, + "grad_norm": 5.491006833165741, + "learning_rate": 1.1172531755260703e-06, + "loss": 0.696, + "step": 16049 + }, + { + "epoch": 2.39, + "grad_norm": 2.9653946871430894, + "learning_rate": 1.1171572334641802e-06, + "loss": 0.6367, + "step": 16050 + }, + { + "epoch": 2.39, + "grad_norm": 3.573852627950711, + "learning_rate": 1.1170612903088513e-06, + "loss": 0.6829, + "step": 16051 + }, + { + "epoch": 2.39, + "grad_norm": 3.6273692195634855, + "learning_rate": 1.1169653460609793e-06, + "loss": 0.6569, + "step": 16052 + }, + { + "epoch": 2.39, + "grad_norm": 4.302592504127437, + "learning_rate": 1.1168694007214593e-06, + "loss": 0.623, + "step": 16053 + }, + { + "epoch": 2.39, + "grad_norm": 4.544258025597939, + "learning_rate": 1.1167734542911867e-06, + "loss": 0.6289, + "step": 16054 + }, + { + "epoch": 2.39, + "grad_norm": 2.8682192691422275, + "learning_rate": 1.1166775067710572e-06, + "loss": 0.6556, + "step": 16055 + }, + { + "epoch": 2.39, + "grad_norm": 2.620803959607594, + "learning_rate": 1.1165815581619663e-06, + "loss": 0.6556, + "step": 16056 + }, + { + "epoch": 2.39, + "grad_norm": 3.6310827364992715, + "learning_rate": 1.116485608464809e-06, + "loss": 0.653, + "step": 16057 + }, + { + "epoch": 2.39, + "grad_norm": 2.6722177643468394, + "learning_rate": 1.1163896576804818e-06, + "loss": 0.666, + "step": 16058 + }, + { + "epoch": 2.4, + "grad_norm": 2.628701003502692, + "learning_rate": 1.116293705809879e-06, + "loss": 0.6608, + "step": 16059 + }, + { + "epoch": 2.4, + "grad_norm": 2.4792867342717377, + "learning_rate": 1.1161977528538972e-06, + "loss": 0.6302, + "step": 16060 + }, + { + "epoch": 2.4, + "grad_norm": 2.457442902616609, + "learning_rate": 1.1161017988134313e-06, + "loss": 0.584, + "step": 16061 + }, + { + "epoch": 2.4, + "grad_norm": 3.319661045176083, + "learning_rate": 1.1160058436893774e-06, + "loss": 0.651, + "step": 16062 + }, + { + "epoch": 2.4, + "grad_norm": 5.7762898642342115, + "learning_rate": 1.1159098874826304e-06, + "loss": 0.6719, + "step": 16063 + }, + { + "epoch": 2.4, + "grad_norm": 2.925049043107803, + "learning_rate": 1.115813930194086e-06, + "loss": 0.6842, + "step": 16064 + }, + { + "epoch": 2.4, + "grad_norm": 2.9731440515808654, + "learning_rate": 1.1157179718246406e-06, + "loss": 0.6191, + "step": 16065 + }, + { + "epoch": 2.4, + "grad_norm": 2.489855041459923, + "learning_rate": 1.1156220123751886e-06, + "loss": 0.623, + "step": 16066 + }, + { + "epoch": 2.4, + "grad_norm": 3.4978096043321, + "learning_rate": 1.115526051846626e-06, + "loss": 0.6204, + "step": 16067 + }, + { + "epoch": 2.4, + "grad_norm": 2.7531943832651327, + "learning_rate": 1.115430090239849e-06, + "loss": 0.6458, + "step": 16068 + }, + { + "epoch": 2.4, + "grad_norm": 2.7074860618754304, + "learning_rate": 1.1153341275557525e-06, + "loss": 0.5788, + "step": 16069 + }, + { + "epoch": 2.4, + "grad_norm": 5.109046785666444, + "learning_rate": 1.1152381637952324e-06, + "loss": 0.6289, + "step": 16070 + }, + { + "epoch": 2.4, + "grad_norm": 3.892918346553909, + "learning_rate": 1.1151421989591842e-06, + "loss": 0.6751, + "step": 16071 + }, + { + "epoch": 2.4, + "grad_norm": 3.0026069920595737, + "learning_rate": 1.115046233048504e-06, + "loss": 0.6686, + "step": 16072 + }, + { + "epoch": 2.4, + "grad_norm": 3.8462673658940525, + "learning_rate": 1.114950266064087e-06, + "loss": 0.6074, + "step": 16073 + }, + { + "epoch": 2.4, + "grad_norm": 2.6181908238906124, + "learning_rate": 1.1148542980068287e-06, + "loss": 0.6188, + "step": 16074 + }, + { + "epoch": 2.4, + "grad_norm": 5.570680211376234, + "learning_rate": 1.1147583288776254e-06, + "loss": 0.623, + "step": 16075 + }, + { + "epoch": 2.4, + "grad_norm": 6.109927371591423, + "learning_rate": 1.1146623586773724e-06, + "loss": 0.6341, + "step": 16076 + }, + { + "epoch": 2.4, + "grad_norm": 3.809847481773316, + "learning_rate": 1.1145663874069655e-06, + "loss": 0.6764, + "step": 16077 + }, + { + "epoch": 2.4, + "grad_norm": 5.25937442640351, + "learning_rate": 1.1144704150673004e-06, + "loss": 0.6016, + "step": 16078 + }, + { + "epoch": 2.4, + "grad_norm": 4.177219241621853, + "learning_rate": 1.1143744416592729e-06, + "loss": 0.653, + "step": 16079 + }, + { + "epoch": 2.4, + "grad_norm": 2.8442636019896956, + "learning_rate": 1.1142784671837782e-06, + "loss": 0.6335, + "step": 16080 + }, + { + "epoch": 2.4, + "grad_norm": 2.7151273901560367, + "learning_rate": 1.114182491641713e-06, + "loss": 0.6523, + "step": 16081 + }, + { + "epoch": 2.4, + "grad_norm": 3.0221392589290828, + "learning_rate": 1.114086515033972e-06, + "loss": 0.6712, + "step": 16082 + }, + { + "epoch": 2.4, + "grad_norm": 3.0492546095904043, + "learning_rate": 1.1139905373614515e-06, + "loss": 0.6283, + "step": 16083 + }, + { + "epoch": 2.4, + "grad_norm": 4.608583685624029, + "learning_rate": 1.1138945586250476e-06, + "loss": 0.6569, + "step": 16084 + }, + { + "epoch": 2.4, + "grad_norm": 3.696945914182056, + "learning_rate": 1.1137985788256557e-06, + "loss": 0.6758, + "step": 16085 + }, + { + "epoch": 2.4, + "grad_norm": 2.9129281218208187, + "learning_rate": 1.1137025979641715e-06, + "loss": 0.6576, + "step": 16086 + }, + { + "epoch": 2.4, + "grad_norm": 5.772586364488629, + "learning_rate": 1.1136066160414905e-06, + "loss": 0.6165, + "step": 16087 + }, + { + "epoch": 2.4, + "grad_norm": 2.970791043798506, + "learning_rate": 1.1135106330585094e-06, + "loss": 0.6536, + "step": 16088 + }, + { + "epoch": 2.4, + "grad_norm": 2.8888371102544594, + "learning_rate": 1.1134146490161236e-06, + "loss": 0.6146, + "step": 16089 + }, + { + "epoch": 2.4, + "grad_norm": 3.1587352835903366, + "learning_rate": 1.1133186639152282e-06, + "loss": 0.6328, + "step": 16090 + }, + { + "epoch": 2.4, + "grad_norm": 3.1056294860043985, + "learning_rate": 1.1132226777567204e-06, + "loss": 0.6771, + "step": 16091 + }, + { + "epoch": 2.4, + "grad_norm": 3.037284597149811, + "learning_rate": 1.113126690541495e-06, + "loss": 0.6348, + "step": 16092 + }, + { + "epoch": 2.4, + "grad_norm": 2.792537725471638, + "learning_rate": 1.113030702270448e-06, + "loss": 0.6325, + "step": 16093 + }, + { + "epoch": 2.4, + "grad_norm": 3.131600525668189, + "learning_rate": 1.112934712944476e-06, + "loss": 0.6159, + "step": 16094 + }, + { + "epoch": 2.4, + "grad_norm": 3.1567915833800346, + "learning_rate": 1.112838722564474e-06, + "loss": 0.6107, + "step": 16095 + }, + { + "epoch": 2.4, + "grad_norm": 3.457384362619689, + "learning_rate": 1.112742731131338e-06, + "loss": 0.6263, + "step": 16096 + }, + { + "epoch": 2.4, + "grad_norm": 5.024693205730018, + "learning_rate": 1.1126467386459646e-06, + "loss": 0.6465, + "step": 16097 + }, + { + "epoch": 2.4, + "grad_norm": 4.533287469315511, + "learning_rate": 1.1125507451092491e-06, + "loss": 0.6243, + "step": 16098 + }, + { + "epoch": 2.4, + "grad_norm": 3.2348750096225474, + "learning_rate": 1.1124547505220873e-06, + "loss": 0.6419, + "step": 16099 + }, + { + "epoch": 2.4, + "grad_norm": 2.9108850745921164, + "learning_rate": 1.1123587548853754e-06, + "loss": 0.6751, + "step": 16100 + }, + { + "epoch": 2.4, + "grad_norm": 4.346490961588511, + "learning_rate": 1.1122627582000094e-06, + "loss": 0.5853, + "step": 16101 + }, + { + "epoch": 2.4, + "grad_norm": 3.021360763139583, + "learning_rate": 1.1121667604668854e-06, + "loss": 0.6432, + "step": 16102 + }, + { + "epoch": 2.4, + "grad_norm": 3.4183334169343387, + "learning_rate": 1.1120707616868987e-06, + "loss": 0.6523, + "step": 16103 + }, + { + "epoch": 2.4, + "grad_norm": 3.2289521170082773, + "learning_rate": 1.1119747618609458e-06, + "loss": 0.6693, + "step": 16104 + }, + { + "epoch": 2.4, + "grad_norm": 3.111777379747224, + "learning_rate": 1.1118787609899227e-06, + "loss": 0.651, + "step": 16105 + }, + { + "epoch": 2.4, + "grad_norm": 6.268333703581339, + "learning_rate": 1.1117827590747252e-06, + "loss": 0.6217, + "step": 16106 + }, + { + "epoch": 2.4, + "grad_norm": 3.478071334037668, + "learning_rate": 1.111686756116249e-06, + "loss": 0.5863, + "step": 16107 + }, + { + "epoch": 2.4, + "grad_norm": 4.445358166183908, + "learning_rate": 1.1115907521153909e-06, + "loss": 0.64, + "step": 16108 + }, + { + "epoch": 2.4, + "grad_norm": 3.1160696466125186, + "learning_rate": 1.111494747073046e-06, + "loss": 0.6035, + "step": 16109 + }, + { + "epoch": 2.4, + "grad_norm": 4.098171483568357, + "learning_rate": 1.1113987409901112e-06, + "loss": 0.651, + "step": 16110 + }, + { + "epoch": 2.4, + "grad_norm": 3.2549449723257218, + "learning_rate": 1.111302733867482e-06, + "loss": 0.6576, + "step": 16111 + }, + { + "epoch": 2.4, + "grad_norm": 3.1866334996270003, + "learning_rate": 1.111206725706054e-06, + "loss": 0.6875, + "step": 16112 + }, + { + "epoch": 2.4, + "grad_norm": 3.2963936269469873, + "learning_rate": 1.1111107165067243e-06, + "loss": 0.6406, + "step": 16113 + }, + { + "epoch": 2.4, + "grad_norm": 4.156253028028251, + "learning_rate": 1.1110147062703884e-06, + "loss": 0.6055, + "step": 16114 + }, + { + "epoch": 2.4, + "grad_norm": 3.0983786821156243, + "learning_rate": 1.1109186949979425e-06, + "loss": 0.6615, + "step": 16115 + }, + { + "epoch": 2.4, + "grad_norm": 5.109273505085939, + "learning_rate": 1.1108226826902824e-06, + "loss": 0.7044, + "step": 16116 + }, + { + "epoch": 2.4, + "grad_norm": 3.4205672944452794, + "learning_rate": 1.1107266693483045e-06, + "loss": 0.6621, + "step": 16117 + }, + { + "epoch": 2.4, + "grad_norm": 2.93497811568179, + "learning_rate": 1.110630654972905e-06, + "loss": 0.599, + "step": 16118 + }, + { + "epoch": 2.4, + "grad_norm": 3.721290824161096, + "learning_rate": 1.1105346395649794e-06, + "loss": 0.653, + "step": 16119 + }, + { + "epoch": 2.4, + "grad_norm": 3.6098812727512843, + "learning_rate": 1.1104386231254246e-06, + "loss": 0.6126, + "step": 16120 + }, + { + "epoch": 2.4, + "grad_norm": 6.470414672864971, + "learning_rate": 1.1103426056551362e-06, + "loss": 0.6289, + "step": 16121 + }, + { + "epoch": 2.4, + "grad_norm": 7.76254598681364, + "learning_rate": 1.1102465871550105e-06, + "loss": 0.6491, + "step": 16122 + }, + { + "epoch": 2.4, + "grad_norm": 3.2776089126451877, + "learning_rate": 1.1101505676259437e-06, + "loss": 0.6237, + "step": 16123 + }, + { + "epoch": 2.4, + "grad_norm": 3.145882994185805, + "learning_rate": 1.1100545470688317e-06, + "loss": 0.6484, + "step": 16124 + }, + { + "epoch": 2.4, + "grad_norm": 5.104109444245109, + "learning_rate": 1.1099585254845712e-06, + "loss": 0.6367, + "step": 16125 + }, + { + "epoch": 2.41, + "grad_norm": 2.784042391310777, + "learning_rate": 1.109862502874058e-06, + "loss": 0.6387, + "step": 16126 + }, + { + "epoch": 2.41, + "grad_norm": 3.1805952272187836, + "learning_rate": 1.109766479238188e-06, + "loss": 0.5833, + "step": 16127 + }, + { + "epoch": 2.41, + "grad_norm": 3.419156216826375, + "learning_rate": 1.109670454577858e-06, + "loss": 0.597, + "step": 16128 + }, + { + "epoch": 2.41, + "grad_norm": 3.694927049689944, + "learning_rate": 1.1095744288939642e-06, + "loss": 0.6322, + "step": 16129 + }, + { + "epoch": 2.41, + "grad_norm": 2.8149108670565597, + "learning_rate": 1.1094784021874022e-06, + "loss": 0.623, + "step": 16130 + }, + { + "epoch": 2.41, + "grad_norm": 3.5465843319570896, + "learning_rate": 1.109382374459069e-06, + "loss": 0.6413, + "step": 16131 + }, + { + "epoch": 2.41, + "grad_norm": 3.206310401785116, + "learning_rate": 1.10928634570986e-06, + "loss": 0.6243, + "step": 16132 + }, + { + "epoch": 2.41, + "grad_norm": 5.523610159236279, + "learning_rate": 1.1091903159406716e-06, + "loss": 0.5931, + "step": 16133 + }, + { + "epoch": 2.41, + "grad_norm": 4.426689057464205, + "learning_rate": 1.1090942851524012e-06, + "loss": 0.6719, + "step": 16134 + }, + { + "epoch": 2.41, + "grad_norm": 3.550485808956085, + "learning_rate": 1.1089982533459437e-06, + "loss": 0.6341, + "step": 16135 + }, + { + "epoch": 2.41, + "grad_norm": 5.22077015244825, + "learning_rate": 1.108902220522196e-06, + "loss": 0.611, + "step": 16136 + }, + { + "epoch": 2.41, + "grad_norm": 3.1212141190392693, + "learning_rate": 1.1088061866820537e-06, + "loss": 0.6517, + "step": 16137 + }, + { + "epoch": 2.41, + "grad_norm": 3.6932200526493277, + "learning_rate": 1.1087101518264144e-06, + "loss": 0.6738, + "step": 16138 + }, + { + "epoch": 2.41, + "grad_norm": 3.3678866623471526, + "learning_rate": 1.1086141159561733e-06, + "loss": 0.6406, + "step": 16139 + }, + { + "epoch": 2.41, + "grad_norm": 3.471805719338849, + "learning_rate": 1.1085180790722268e-06, + "loss": 0.6699, + "step": 16140 + }, + { + "epoch": 2.41, + "grad_norm": 5.878813440108217, + "learning_rate": 1.1084220411754718e-06, + "loss": 0.707, + "step": 16141 + }, + { + "epoch": 2.41, + "grad_norm": 4.8771446254875785, + "learning_rate": 1.1083260022668043e-06, + "loss": 0.6478, + "step": 16142 + }, + { + "epoch": 2.41, + "grad_norm": 3.1423557783566345, + "learning_rate": 1.1082299623471205e-06, + "loss": 0.6257, + "step": 16143 + }, + { + "epoch": 2.41, + "grad_norm": 3.6192449345676176, + "learning_rate": 1.1081339214173172e-06, + "loss": 0.6634, + "step": 16144 + }, + { + "epoch": 2.41, + "grad_norm": 3.056715115322761, + "learning_rate": 1.1080378794782902e-06, + "loss": 0.6458, + "step": 16145 + }, + { + "epoch": 2.41, + "grad_norm": 2.7501126463989127, + "learning_rate": 1.1079418365309359e-06, + "loss": 0.6335, + "step": 16146 + }, + { + "epoch": 2.41, + "grad_norm": 3.675256133109593, + "learning_rate": 1.107845792576151e-06, + "loss": 0.5794, + "step": 16147 + }, + { + "epoch": 2.41, + "grad_norm": 3.1738217754833395, + "learning_rate": 1.1077497476148321e-06, + "loss": 0.6569, + "step": 16148 + }, + { + "epoch": 2.41, + "grad_norm": 2.8909742493649278, + "learning_rate": 1.1076537016478748e-06, + "loss": 0.6074, + "step": 16149 + }, + { + "epoch": 2.41, + "grad_norm": 2.952110884279853, + "learning_rate": 1.1075576546761764e-06, + "loss": 0.6374, + "step": 16150 + }, + { + "epoch": 2.41, + "grad_norm": 3.0462029502589965, + "learning_rate": 1.1074616067006329e-06, + "loss": 0.6048, + "step": 16151 + }, + { + "epoch": 2.41, + "grad_norm": 3.146374743840393, + "learning_rate": 1.1073655577221406e-06, + "loss": 0.6094, + "step": 16152 + }, + { + "epoch": 2.41, + "grad_norm": 3.6056987756239787, + "learning_rate": 1.107269507741596e-06, + "loss": 0.6673, + "step": 16153 + }, + { + "epoch": 2.41, + "grad_norm": 3.586788531421628, + "learning_rate": 1.1071734567598955e-06, + "loss": 0.6953, + "step": 16154 + }, + { + "epoch": 2.41, + "grad_norm": 3.4878436269578197, + "learning_rate": 1.1070774047779362e-06, + "loss": 0.6523, + "step": 16155 + }, + { + "epoch": 2.41, + "grad_norm": 5.443551079600942, + "learning_rate": 1.1069813517966133e-06, + "loss": 0.612, + "step": 16156 + }, + { + "epoch": 2.41, + "grad_norm": 2.839815272856742, + "learning_rate": 1.1068852978168244e-06, + "loss": 0.5729, + "step": 16157 + }, + { + "epoch": 2.41, + "grad_norm": 3.214225851403352, + "learning_rate": 1.1067892428394654e-06, + "loss": 0.6165, + "step": 16158 + }, + { + "epoch": 2.41, + "grad_norm": 3.2265590549103798, + "learning_rate": 1.1066931868654328e-06, + "loss": 0.6165, + "step": 16159 + }, + { + "epoch": 2.41, + "grad_norm": 3.5306549531350315, + "learning_rate": 1.1065971298956236e-06, + "loss": 0.6465, + "step": 16160 + }, + { + "epoch": 2.41, + "grad_norm": 3.7115687824432744, + "learning_rate": 1.1065010719309338e-06, + "loss": 0.6823, + "step": 16161 + }, + { + "epoch": 2.41, + "grad_norm": 3.285568869994458, + "learning_rate": 1.1064050129722599e-06, + "loss": 0.6725, + "step": 16162 + }, + { + "epoch": 2.41, + "grad_norm": 3.721479183307361, + "learning_rate": 1.106308953020499e-06, + "loss": 0.653, + "step": 16163 + }, + { + "epoch": 2.41, + "grad_norm": 3.122746997596649, + "learning_rate": 1.106212892076547e-06, + "loss": 0.6549, + "step": 16164 + }, + { + "epoch": 2.41, + "grad_norm": 3.2867862077414447, + "learning_rate": 1.106116830141301e-06, + "loss": 0.6478, + "step": 16165 + }, + { + "epoch": 2.41, + "grad_norm": 4.02516919687932, + "learning_rate": 1.1060207672156564e-06, + "loss": 0.6081, + "step": 16166 + }, + { + "epoch": 2.41, + "grad_norm": 4.065917041858896, + "learning_rate": 1.1059247033005114e-06, + "loss": 0.6302, + "step": 16167 + }, + { + "epoch": 2.41, + "grad_norm": 4.769705302767921, + "learning_rate": 1.1058286383967616e-06, + "loss": 0.6667, + "step": 16168 + }, + { + "epoch": 2.41, + "grad_norm": 3.3652364991495336, + "learning_rate": 1.1057325725053035e-06, + "loss": 0.6901, + "step": 16169 + }, + { + "epoch": 2.41, + "grad_norm": 3.322921039444759, + "learning_rate": 1.1056365056270342e-06, + "loss": 0.6185, + "step": 16170 + }, + { + "epoch": 2.41, + "grad_norm": 3.9182572649189167, + "learning_rate": 1.10554043776285e-06, + "loss": 0.707, + "step": 16171 + }, + { + "epoch": 2.41, + "grad_norm": 3.5405346450966335, + "learning_rate": 1.1054443689136475e-06, + "loss": 0.6745, + "step": 16172 + }, + { + "epoch": 2.41, + "grad_norm": 5.2621690774010625, + "learning_rate": 1.1053482990803235e-06, + "loss": 0.6973, + "step": 16173 + }, + { + "epoch": 2.41, + "grad_norm": 2.771418625633773, + "learning_rate": 1.1052522282637747e-06, + "loss": 0.6471, + "step": 16174 + }, + { + "epoch": 2.41, + "grad_norm": 3.271025790962827, + "learning_rate": 1.105156156464897e-06, + "loss": 0.6842, + "step": 16175 + }, + { + "epoch": 2.41, + "grad_norm": 2.518470712156387, + "learning_rate": 1.1050600836845879e-06, + "loss": 0.6335, + "step": 16176 + }, + { + "epoch": 2.41, + "grad_norm": 3.4817669410357404, + "learning_rate": 1.1049640099237439e-06, + "loss": 0.5768, + "step": 16177 + }, + { + "epoch": 2.41, + "grad_norm": 2.6494382550073055, + "learning_rate": 1.1048679351832611e-06, + "loss": 0.666, + "step": 16178 + }, + { + "epoch": 2.41, + "grad_norm": 2.5002146257825992, + "learning_rate": 1.104771859464037e-06, + "loss": 0.6608, + "step": 16179 + }, + { + "epoch": 2.41, + "grad_norm": 3.119049057913742, + "learning_rate": 1.1046757827669674e-06, + "loss": 0.6849, + "step": 16180 + }, + { + "epoch": 2.41, + "grad_norm": 2.7584972848147795, + "learning_rate": 1.10457970509295e-06, + "loss": 0.6257, + "step": 16181 + }, + { + "epoch": 2.41, + "grad_norm": 2.7850878289544325, + "learning_rate": 1.1044836264428808e-06, + "loss": 0.666, + "step": 16182 + }, + { + "epoch": 2.41, + "grad_norm": 3.3843156169647566, + "learning_rate": 1.1043875468176563e-06, + "loss": 0.6165, + "step": 16183 + }, + { + "epoch": 2.41, + "grad_norm": 3.094403122957257, + "learning_rate": 1.1042914662181742e-06, + "loss": 0.6048, + "step": 16184 + }, + { + "epoch": 2.41, + "grad_norm": 4.013973769845632, + "learning_rate": 1.10419538464533e-06, + "loss": 0.666, + "step": 16185 + }, + { + "epoch": 2.41, + "grad_norm": 2.3796108324547363, + "learning_rate": 1.1040993021000216e-06, + "loss": 0.6178, + "step": 16186 + }, + { + "epoch": 2.41, + "grad_norm": 2.540014950275787, + "learning_rate": 1.1040032185831449e-06, + "loss": 0.6374, + "step": 16187 + }, + { + "epoch": 2.41, + "grad_norm": 3.7771911670116323, + "learning_rate": 1.103907134095597e-06, + "loss": 0.6855, + "step": 16188 + }, + { + "epoch": 2.41, + "grad_norm": 4.0460338720699065, + "learning_rate": 1.1038110486382749e-06, + "loss": 0.6947, + "step": 16189 + }, + { + "epoch": 2.41, + "grad_norm": 2.677394153356149, + "learning_rate": 1.1037149622120746e-06, + "loss": 0.6022, + "step": 16190 + }, + { + "epoch": 2.41, + "grad_norm": 4.031349011947018, + "learning_rate": 1.1036188748178937e-06, + "loss": 0.6224, + "step": 16191 + }, + { + "epoch": 2.41, + "grad_norm": 2.513396735277156, + "learning_rate": 1.1035227864566289e-06, + "loss": 0.6029, + "step": 16192 + }, + { + "epoch": 2.42, + "grad_norm": 6.648816118159312, + "learning_rate": 1.1034266971291762e-06, + "loss": 0.6289, + "step": 16193 + }, + { + "epoch": 2.42, + "grad_norm": 4.005271997663111, + "learning_rate": 1.1033306068364333e-06, + "loss": 0.6361, + "step": 16194 + }, + { + "epoch": 2.42, + "grad_norm": 2.755911477149863, + "learning_rate": 1.1032345155792969e-06, + "loss": 0.6341, + "step": 16195 + }, + { + "epoch": 2.42, + "grad_norm": 2.8557220187699777, + "learning_rate": 1.1031384233586632e-06, + "loss": 0.6699, + "step": 16196 + }, + { + "epoch": 2.42, + "grad_norm": 3.116158875397881, + "learning_rate": 1.1030423301754301e-06, + "loss": 0.6549, + "step": 16197 + }, + { + "epoch": 2.42, + "grad_norm": 3.384526077141439, + "learning_rate": 1.1029462360304933e-06, + "loss": 0.6152, + "step": 16198 + }, + { + "epoch": 2.42, + "grad_norm": 2.846512272221521, + "learning_rate": 1.1028501409247502e-06, + "loss": 0.6686, + "step": 16199 + }, + { + "epoch": 2.42, + "grad_norm": 4.541131141217938, + "learning_rate": 1.1027540448590979e-06, + "loss": 0.6556, + "step": 16200 + }, + { + "epoch": 2.42, + "grad_norm": 2.633037803364924, + "learning_rate": 1.1026579478344328e-06, + "loss": 0.6784, + "step": 16201 + }, + { + "epoch": 2.42, + "grad_norm": 2.720346961225457, + "learning_rate": 1.102561849851652e-06, + "loss": 0.6595, + "step": 16202 + }, + { + "epoch": 2.42, + "grad_norm": 3.534629859162402, + "learning_rate": 1.1024657509116523e-06, + "loss": 0.6048, + "step": 16203 + }, + { + "epoch": 2.42, + "grad_norm": 3.959878757358566, + "learning_rate": 1.102369651015331e-06, + "loss": 0.6123, + "step": 16204 + }, + { + "epoch": 2.42, + "grad_norm": 2.953136894234399, + "learning_rate": 1.1022735501635847e-06, + "loss": 0.6699, + "step": 16205 + }, + { + "epoch": 2.42, + "grad_norm": 3.360232134382735, + "learning_rate": 1.1021774483573098e-06, + "loss": 0.6523, + "step": 16206 + }, + { + "epoch": 2.42, + "grad_norm": 3.870207450309303, + "learning_rate": 1.1020813455974043e-06, + "loss": 0.6172, + "step": 16207 + }, + { + "epoch": 2.42, + "grad_norm": 4.136728706131551, + "learning_rate": 1.1019852418847644e-06, + "loss": 0.681, + "step": 16208 + }, + { + "epoch": 2.42, + "grad_norm": 3.431088926542497, + "learning_rate": 1.1018891372202869e-06, + "loss": 0.6517, + "step": 16209 + }, + { + "epoch": 2.42, + "grad_norm": 3.4640254963097767, + "learning_rate": 1.1017930316048696e-06, + "loss": 0.6621, + "step": 16210 + }, + { + "epoch": 2.42, + "grad_norm": 2.8826023370729996, + "learning_rate": 1.1016969250394086e-06, + "loss": 0.6289, + "step": 16211 + }, + { + "epoch": 2.42, + "grad_norm": 3.9251996836481444, + "learning_rate": 1.1016008175248011e-06, + "loss": 0.6302, + "step": 16212 + }, + { + "epoch": 2.42, + "grad_norm": 4.249268248990979, + "learning_rate": 1.1015047090619446e-06, + "loss": 0.5931, + "step": 16213 + }, + { + "epoch": 2.42, + "grad_norm": 3.083647012558321, + "learning_rate": 1.1014085996517354e-06, + "loss": 0.6198, + "step": 16214 + }, + { + "epoch": 2.42, + "grad_norm": 2.9029387173816765, + "learning_rate": 1.1013124892950707e-06, + "loss": 0.6536, + "step": 16215 + }, + { + "epoch": 2.42, + "grad_norm": 3.0703260603389686, + "learning_rate": 1.101216377992848e-06, + "loss": 0.6211, + "step": 16216 + }, + { + "epoch": 2.42, + "grad_norm": 4.619636424248097, + "learning_rate": 1.1011202657459633e-06, + "loss": 0.6497, + "step": 16217 + }, + { + "epoch": 2.42, + "grad_norm": 3.127611482869359, + "learning_rate": 1.1010241525553145e-06, + "loss": 0.5957, + "step": 16218 + }, + { + "epoch": 2.42, + "grad_norm": 3.3068200118206774, + "learning_rate": 1.1009280384217982e-06, + "loss": 0.6908, + "step": 16219 + }, + { + "epoch": 2.42, + "grad_norm": 5.557049396578043, + "learning_rate": 1.100831923346312e-06, + "loss": 0.6921, + "step": 16220 + }, + { + "epoch": 2.42, + "grad_norm": 4.680039186971334, + "learning_rate": 1.1007358073297523e-06, + "loss": 0.6387, + "step": 16221 + }, + { + "epoch": 2.42, + "grad_norm": 2.900498216604118, + "learning_rate": 1.1006396903730162e-06, + "loss": 0.6016, + "step": 16222 + }, + { + "epoch": 2.42, + "grad_norm": 3.232324685298537, + "learning_rate": 1.1005435724770013e-06, + "loss": 0.6198, + "step": 16223 + }, + { + "epoch": 2.42, + "grad_norm": 2.9400945855978566, + "learning_rate": 1.1004474536426043e-06, + "loss": 0.6198, + "step": 16224 + }, + { + "epoch": 2.42, + "grad_norm": 4.619092659014568, + "learning_rate": 1.100351333870722e-06, + "loss": 0.6621, + "step": 16225 + }, + { + "epoch": 2.42, + "grad_norm": 3.7310670760841207, + "learning_rate": 1.1002552131622523e-06, + "loss": 0.6152, + "step": 16226 + }, + { + "epoch": 2.42, + "grad_norm": 3.710773906966693, + "learning_rate": 1.1001590915180915e-06, + "loss": 0.6986, + "step": 16227 + }, + { + "epoch": 2.42, + "grad_norm": 2.820845447331387, + "learning_rate": 1.100062968939137e-06, + "loss": 0.6289, + "step": 16228 + }, + { + "epoch": 2.42, + "grad_norm": 3.698851721903911, + "learning_rate": 1.0999668454262862e-06, + "loss": 0.6406, + "step": 16229 + }, + { + "epoch": 2.42, + "grad_norm": 3.003222252348611, + "learning_rate": 1.099870720980436e-06, + "loss": 0.6123, + "step": 16230 + }, + { + "epoch": 2.42, + "grad_norm": 3.218076808759493, + "learning_rate": 1.0997745956024834e-06, + "loss": 0.6152, + "step": 16231 + }, + { + "epoch": 2.42, + "grad_norm": 3.2063671029342853, + "learning_rate": 1.0996784692933254e-06, + "loss": 0.5944, + "step": 16232 + }, + { + "epoch": 2.42, + "grad_norm": 3.863605541177366, + "learning_rate": 1.09958234205386e-06, + "loss": 0.6458, + "step": 16233 + }, + { + "epoch": 2.42, + "grad_norm": 3.181892653573195, + "learning_rate": 1.0994862138849835e-06, + "loss": 0.623, + "step": 16234 + }, + { + "epoch": 2.42, + "grad_norm": 3.279731258041385, + "learning_rate": 1.0993900847875933e-06, + "loss": 0.6029, + "step": 16235 + }, + { + "epoch": 2.42, + "grad_norm": 3.9841900299123894, + "learning_rate": 1.0992939547625868e-06, + "loss": 0.6556, + "step": 16236 + }, + { + "epoch": 2.42, + "grad_norm": 3.537177919730715, + "learning_rate": 1.099197823810861e-06, + "loss": 0.6582, + "step": 16237 + }, + { + "epoch": 2.42, + "grad_norm": 3.3181160922505333, + "learning_rate": 1.0991016919333129e-06, + "loss": 0.6289, + "step": 16238 + }, + { + "epoch": 2.42, + "grad_norm": 3.7979001713987475, + "learning_rate": 1.0990055591308403e-06, + "loss": 0.5771, + "step": 16239 + }, + { + "epoch": 2.42, + "grad_norm": 3.3973225495514705, + "learning_rate": 1.0989094254043398e-06, + "loss": 0.6452, + "step": 16240 + }, + { + "epoch": 2.42, + "grad_norm": 3.516983623006026, + "learning_rate": 1.098813290754709e-06, + "loss": 0.6113, + "step": 16241 + }, + { + "epoch": 2.42, + "grad_norm": 4.171010165900602, + "learning_rate": 1.0987171551828452e-06, + "loss": 0.6022, + "step": 16242 + }, + { + "epoch": 2.42, + "grad_norm": 4.418820654271641, + "learning_rate": 1.098621018689645e-06, + "loss": 0.6875, + "step": 16243 + }, + { + "epoch": 2.42, + "grad_norm": 3.723969779387697, + "learning_rate": 1.0985248812760064e-06, + "loss": 0.6621, + "step": 16244 + }, + { + "epoch": 2.42, + "grad_norm": 6.134400999533632, + "learning_rate": 1.0984287429428265e-06, + "loss": 0.6582, + "step": 16245 + }, + { + "epoch": 2.42, + "grad_norm": 3.6276698639047447, + "learning_rate": 1.0983326036910022e-06, + "loss": 0.6576, + "step": 16246 + }, + { + "epoch": 2.42, + "grad_norm": 3.4965730024125925, + "learning_rate": 1.0982364635214313e-06, + "loss": 0.679, + "step": 16247 + }, + { + "epoch": 2.42, + "grad_norm": 3.5778063544134273, + "learning_rate": 1.0981403224350106e-06, + "loss": 0.666, + "step": 16248 + }, + { + "epoch": 2.42, + "grad_norm": 5.520560714689388, + "learning_rate": 1.0980441804326372e-06, + "loss": 0.7507, + "step": 16249 + }, + { + "epoch": 2.42, + "grad_norm": 3.2343033005989503, + "learning_rate": 1.0979480375152097e-06, + "loss": 0.6458, + "step": 16250 + }, + { + "epoch": 2.42, + "grad_norm": 3.2960960117162195, + "learning_rate": 1.0978518936836238e-06, + "loss": 0.6237, + "step": 16251 + }, + { + "epoch": 2.42, + "grad_norm": 3.8330284184704366, + "learning_rate": 1.097755748938778e-06, + "loss": 0.6445, + "step": 16252 + }, + { + "epoch": 2.42, + "grad_norm": 4.737859188421782, + "learning_rate": 1.0976596032815688e-06, + "loss": 0.6022, + "step": 16253 + }, + { + "epoch": 2.42, + "grad_norm": 5.166403881364413, + "learning_rate": 1.0975634567128937e-06, + "loss": 0.6276, + "step": 16254 + }, + { + "epoch": 2.42, + "grad_norm": 6.1172586798019015, + "learning_rate": 1.0974673092336506e-06, + "loss": 0.7253, + "step": 16255 + }, + { + "epoch": 2.42, + "grad_norm": 2.9622524891070445, + "learning_rate": 1.0973711608447363e-06, + "loss": 0.6302, + "step": 16256 + }, + { + "epoch": 2.42, + "grad_norm": 3.99547947162132, + "learning_rate": 1.0972750115470483e-06, + "loss": 0.6908, + "step": 16257 + }, + { + "epoch": 2.42, + "grad_norm": 4.310000470408356, + "learning_rate": 1.0971788613414842e-06, + "loss": 0.6484, + "step": 16258 + }, + { + "epoch": 2.42, + "grad_norm": 3.048441843511526, + "learning_rate": 1.0970827102289412e-06, + "loss": 0.6556, + "step": 16259 + }, + { + "epoch": 2.43, + "grad_norm": 3.7160967308869717, + "learning_rate": 1.0969865582103166e-06, + "loss": 0.6426, + "step": 16260 + }, + { + "epoch": 2.43, + "grad_norm": 3.4742486554210217, + "learning_rate": 1.0968904052865077e-06, + "loss": 0.6185, + "step": 16261 + }, + { + "epoch": 2.43, + "grad_norm": 2.590785839126898, + "learning_rate": 1.0967942514584124e-06, + "loss": 0.6458, + "step": 16262 + }, + { + "epoch": 2.43, + "grad_norm": 2.637189318598999, + "learning_rate": 1.0966980967269278e-06, + "loss": 0.5814, + "step": 16263 + }, + { + "epoch": 2.43, + "grad_norm": 4.684753932982316, + "learning_rate": 1.096601941092951e-06, + "loss": 0.651, + "step": 16264 + }, + { + "epoch": 2.43, + "grad_norm": 4.5645232523278185, + "learning_rate": 1.0965057845573797e-06, + "loss": 0.668, + "step": 16265 + }, + { + "epoch": 2.43, + "grad_norm": 3.146638056992002, + "learning_rate": 1.0964096271211116e-06, + "loss": 0.6309, + "step": 16266 + }, + { + "epoch": 2.43, + "grad_norm": 4.054995515632718, + "learning_rate": 1.0963134687850443e-06, + "loss": 0.6328, + "step": 16267 + }, + { + "epoch": 2.43, + "grad_norm": 5.0758390342298405, + "learning_rate": 1.0962173095500744e-06, + "loss": 0.6621, + "step": 16268 + }, + { + "epoch": 2.43, + "grad_norm": 6.015995893488659, + "learning_rate": 1.0961211494170997e-06, + "loss": 0.6706, + "step": 16269 + }, + { + "epoch": 2.43, + "grad_norm": 2.8492153178923134, + "learning_rate": 1.096024988387018e-06, + "loss": 0.6335, + "step": 16270 + }, + { + "epoch": 2.43, + "grad_norm": 3.224736512370168, + "learning_rate": 1.0959288264607268e-06, + "loss": 0.6074, + "step": 16271 + }, + { + "epoch": 2.43, + "grad_norm": 3.149821148493789, + "learning_rate": 1.0958326636391232e-06, + "loss": 0.6406, + "step": 16272 + }, + { + "epoch": 2.43, + "grad_norm": 3.9663861415839796, + "learning_rate": 1.0957364999231049e-06, + "loss": 0.6367, + "step": 16273 + }, + { + "epoch": 2.43, + "grad_norm": 2.779477897805329, + "learning_rate": 1.0956403353135693e-06, + "loss": 0.6107, + "step": 16274 + }, + { + "epoch": 2.43, + "grad_norm": 3.2337194390336306, + "learning_rate": 1.095544169811414e-06, + "loss": 0.6914, + "step": 16275 + }, + { + "epoch": 2.43, + "grad_norm": 3.6755465926264943, + "learning_rate": 1.0954480034175367e-06, + "loss": 0.6654, + "step": 16276 + }, + { + "epoch": 2.43, + "grad_norm": 3.430746067746198, + "learning_rate": 1.0953518361328344e-06, + "loss": 0.681, + "step": 16277 + }, + { + "epoch": 2.43, + "grad_norm": 3.839158563562915, + "learning_rate": 1.0952556679582053e-06, + "loss": 0.6354, + "step": 16278 + }, + { + "epoch": 2.43, + "grad_norm": 3.445400867019921, + "learning_rate": 1.0951594988945466e-06, + "loss": 0.5742, + "step": 16279 + }, + { + "epoch": 2.43, + "grad_norm": 3.5925337807861126, + "learning_rate": 1.0950633289427558e-06, + "loss": 0.6263, + "step": 16280 + }, + { + "epoch": 2.43, + "grad_norm": 3.10190441777177, + "learning_rate": 1.0949671581037304e-06, + "loss": 0.6693, + "step": 16281 + }, + { + "epoch": 2.43, + "grad_norm": 2.9018236894506018, + "learning_rate": 1.0948709863783682e-06, + "loss": 0.6706, + "step": 16282 + }, + { + "epoch": 2.43, + "grad_norm": 3.108010632442052, + "learning_rate": 1.094774813767567e-06, + "loss": 0.6055, + "step": 16283 + }, + { + "epoch": 2.43, + "grad_norm": 2.874359275801509, + "learning_rate": 1.0946786402722236e-06, + "loss": 0.6257, + "step": 16284 + }, + { + "epoch": 2.43, + "grad_norm": 3.749806580189776, + "learning_rate": 1.0945824658932364e-06, + "loss": 0.6615, + "step": 16285 + }, + { + "epoch": 2.43, + "grad_norm": 3.3172097505160716, + "learning_rate": 1.0944862906315028e-06, + "loss": 0.6458, + "step": 16286 + }, + { + "epoch": 2.43, + "grad_norm": 3.1069815100283065, + "learning_rate": 1.09439011448792e-06, + "loss": 0.666, + "step": 16287 + }, + { + "epoch": 2.43, + "grad_norm": 4.117939136843455, + "learning_rate": 1.0942939374633857e-06, + "loss": 0.6478, + "step": 16288 + }, + { + "epoch": 2.43, + "grad_norm": 5.773664191045305, + "learning_rate": 1.0941977595587983e-06, + "loss": 0.651, + "step": 16289 + }, + { + "epoch": 2.43, + "grad_norm": 3.4909785588902986, + "learning_rate": 1.0941015807750547e-06, + "loss": 0.6712, + "step": 16290 + }, + { + "epoch": 2.43, + "grad_norm": 3.270424012178166, + "learning_rate": 1.0940054011130525e-06, + "loss": 0.6602, + "step": 16291 + }, + { + "epoch": 2.43, + "grad_norm": 3.020142900387604, + "learning_rate": 1.0939092205736897e-06, + "loss": 0.6582, + "step": 16292 + }, + { + "epoch": 2.43, + "grad_norm": 2.7958589722438787, + "learning_rate": 1.0938130391578638e-06, + "loss": 0.6491, + "step": 16293 + }, + { + "epoch": 2.43, + "grad_norm": 2.7427214741959434, + "learning_rate": 1.0937168568664725e-06, + "loss": 0.6693, + "step": 16294 + }, + { + "epoch": 2.43, + "grad_norm": 2.9709493071947772, + "learning_rate": 1.0936206737004136e-06, + "loss": 0.5944, + "step": 16295 + }, + { + "epoch": 2.43, + "grad_norm": 3.685297698589531, + "learning_rate": 1.093524489660585e-06, + "loss": 0.6504, + "step": 16296 + }, + { + "epoch": 2.43, + "grad_norm": 3.32228871693566, + "learning_rate": 1.0934283047478837e-06, + "loss": 0.6478, + "step": 16297 + }, + { + "epoch": 2.43, + "grad_norm": 2.992250198129525, + "learning_rate": 1.0933321189632073e-06, + "loss": 0.6497, + "step": 16298 + }, + { + "epoch": 2.43, + "grad_norm": 3.7333144274467576, + "learning_rate": 1.0932359323074544e-06, + "loss": 0.6921, + "step": 16299 + }, + { + "epoch": 2.43, + "grad_norm": 2.8383191135999626, + "learning_rate": 1.0931397447815227e-06, + "loss": 0.6081, + "step": 16300 + }, + { + "epoch": 2.43, + "grad_norm": 2.6598641959003406, + "learning_rate": 1.093043556386309e-06, + "loss": 0.6087, + "step": 16301 + }, + { + "epoch": 2.43, + "grad_norm": 3.000603901105713, + "learning_rate": 1.092947367122712e-06, + "loss": 0.6243, + "step": 16302 + }, + { + "epoch": 2.43, + "grad_norm": 4.075104597478175, + "learning_rate": 1.0928511769916288e-06, + "loss": 0.5996, + "step": 16303 + }, + { + "epoch": 2.43, + "grad_norm": 3.051662844363592, + "learning_rate": 1.092754985993957e-06, + "loss": 0.6426, + "step": 16304 + }, + { + "epoch": 2.43, + "grad_norm": 3.110144956732667, + "learning_rate": 1.092658794130595e-06, + "loss": 0.6198, + "step": 16305 + }, + { + "epoch": 2.43, + "grad_norm": 3.251930090849647, + "learning_rate": 1.0925626014024405e-06, + "loss": 0.668, + "step": 16306 + }, + { + "epoch": 2.43, + "grad_norm": 3.325094902419621, + "learning_rate": 1.0924664078103907e-06, + "loss": 0.6328, + "step": 16307 + }, + { + "epoch": 2.43, + "grad_norm": 3.8812876184946674, + "learning_rate": 1.0923702133553438e-06, + "loss": 0.6016, + "step": 16308 + }, + { + "epoch": 2.43, + "grad_norm": 2.740947567560114, + "learning_rate": 1.0922740180381977e-06, + "loss": 0.6113, + "step": 16309 + }, + { + "epoch": 2.43, + "grad_norm": 3.603343751898623, + "learning_rate": 1.0921778218598498e-06, + "loss": 0.6732, + "step": 16310 + }, + { + "epoch": 2.43, + "grad_norm": 5.157405236508481, + "learning_rate": 1.0920816248211984e-06, + "loss": 0.6126, + "step": 16311 + }, + { + "epoch": 2.43, + "grad_norm": 3.7526992484302557, + "learning_rate": 1.0919854269231407e-06, + "loss": 0.6634, + "step": 16312 + }, + { + "epoch": 2.43, + "grad_norm": 4.336500327621149, + "learning_rate": 1.0918892281665756e-06, + "loss": 0.6107, + "step": 16313 + }, + { + "epoch": 2.43, + "grad_norm": 3.750431572880712, + "learning_rate": 1.0917930285523993e-06, + "loss": 0.5977, + "step": 16314 + }, + { + "epoch": 2.43, + "grad_norm": 3.1967485667564994, + "learning_rate": 1.0916968280815108e-06, + "loss": 0.6309, + "step": 16315 + }, + { + "epoch": 2.43, + "grad_norm": 4.340962817892974, + "learning_rate": 1.0916006267548082e-06, + "loss": 0.6263, + "step": 16316 + }, + { + "epoch": 2.43, + "grad_norm": 3.6315349286721648, + "learning_rate": 1.0915044245731883e-06, + "loss": 0.6445, + "step": 16317 + }, + { + "epoch": 2.43, + "grad_norm": 4.529691816420575, + "learning_rate": 1.0914082215375497e-06, + "loss": 0.668, + "step": 16318 + }, + { + "epoch": 2.43, + "grad_norm": 3.1039263296531603, + "learning_rate": 1.0913120176487903e-06, + "loss": 0.6348, + "step": 16319 + }, + { + "epoch": 2.43, + "grad_norm": 3.109587591698981, + "learning_rate": 1.0912158129078074e-06, + "loss": 0.6191, + "step": 16320 + }, + { + "epoch": 2.43, + "grad_norm": 3.5489291834942165, + "learning_rate": 1.0911196073154995e-06, + "loss": 0.6973, + "step": 16321 + }, + { + "epoch": 2.43, + "grad_norm": 3.0507792494955948, + "learning_rate": 1.0910234008727639e-06, + "loss": 0.61, + "step": 16322 + }, + { + "epoch": 2.43, + "grad_norm": 7.227747831310424, + "learning_rate": 1.0909271935804993e-06, + "loss": 0.6348, + "step": 16323 + }, + { + "epoch": 2.43, + "grad_norm": 3.504351599181026, + "learning_rate": 1.090830985439603e-06, + "loss": 0.6758, + "step": 16324 + }, + { + "epoch": 2.43, + "grad_norm": 3.580627731678024, + "learning_rate": 1.090734776450973e-06, + "loss": 0.7005, + "step": 16325 + }, + { + "epoch": 2.43, + "grad_norm": 4.120880154798373, + "learning_rate": 1.0906385666155074e-06, + "loss": 0.6582, + "step": 16326 + }, + { + "epoch": 2.44, + "grad_norm": 3.620476988232404, + "learning_rate": 1.0905423559341041e-06, + "loss": 0.6784, + "step": 16327 + }, + { + "epoch": 2.44, + "grad_norm": 6.848483543778698, + "learning_rate": 1.0904461444076607e-06, + "loss": 0.6419, + "step": 16328 + }, + { + "epoch": 2.44, + "grad_norm": 4.088885249639158, + "learning_rate": 1.090349932037076e-06, + "loss": 0.6784, + "step": 16329 + }, + { + "epoch": 2.44, + "grad_norm": 4.402394284707644, + "learning_rate": 1.0902537188232468e-06, + "loss": 0.625, + "step": 16330 + }, + { + "epoch": 2.44, + "grad_norm": 3.881104112893777, + "learning_rate": 1.0901575047670717e-06, + "loss": 0.6562, + "step": 16331 + }, + { + "epoch": 2.44, + "grad_norm": 3.443533750411454, + "learning_rate": 1.0900612898694489e-06, + "loss": 0.6348, + "step": 16332 + }, + { + "epoch": 2.44, + "grad_norm": 3.8089016533068816, + "learning_rate": 1.089965074131276e-06, + "loss": 0.6022, + "step": 16333 + }, + { + "epoch": 2.44, + "grad_norm": 3.1785308219446646, + "learning_rate": 1.089868857553451e-06, + "loss": 0.6107, + "step": 16334 + }, + { + "epoch": 2.44, + "grad_norm": 2.9650354772437635, + "learning_rate": 1.089772640136872e-06, + "loss": 0.6126, + "step": 16335 + }, + { + "epoch": 2.44, + "grad_norm": 3.5930329838409616, + "learning_rate": 1.0896764218824375e-06, + "loss": 0.6198, + "step": 16336 + }, + { + "epoch": 2.44, + "grad_norm": 2.670980069448908, + "learning_rate": 1.0895802027910446e-06, + "loss": 0.6387, + "step": 16337 + }, + { + "epoch": 2.44, + "grad_norm": 3.4896656528191055, + "learning_rate": 1.0894839828635918e-06, + "loss": 0.6276, + "step": 16338 + }, + { + "epoch": 2.44, + "grad_norm": 4.749159603587681, + "learning_rate": 1.089387762100977e-06, + "loss": 0.7031, + "step": 16339 + }, + { + "epoch": 2.44, + "grad_norm": 2.663866642277847, + "learning_rate": 1.0892915405040986e-06, + "loss": 0.6426, + "step": 16340 + }, + { + "epoch": 2.44, + "grad_norm": 4.2338177832931985, + "learning_rate": 1.089195318073854e-06, + "loss": 0.638, + "step": 16341 + }, + { + "epoch": 2.44, + "grad_norm": 2.903731577258681, + "learning_rate": 1.0890990948111418e-06, + "loss": 0.6367, + "step": 16342 + }, + { + "epoch": 2.44, + "grad_norm": 2.9466740576310397, + "learning_rate": 1.08900287071686e-06, + "loss": 0.6367, + "step": 16343 + }, + { + "epoch": 2.44, + "grad_norm": 3.544644613320119, + "learning_rate": 1.088906645791906e-06, + "loss": 0.5977, + "step": 16344 + }, + { + "epoch": 2.44, + "grad_norm": 4.395557033231281, + "learning_rate": 1.088810420037179e-06, + "loss": 0.6413, + "step": 16345 + }, + { + "epoch": 2.44, + "grad_norm": 4.119441593292491, + "learning_rate": 1.0887141934535764e-06, + "loss": 0.6465, + "step": 16346 + }, + { + "epoch": 2.44, + "grad_norm": 2.9204486402562164, + "learning_rate": 1.088617966041996e-06, + "loss": 0.6367, + "step": 16347 + }, + { + "epoch": 2.44, + "grad_norm": 2.774764103208645, + "learning_rate": 1.0885217378033366e-06, + "loss": 0.6667, + "step": 16348 + }, + { + "epoch": 2.44, + "grad_norm": 2.980087151098614, + "learning_rate": 1.088425508738496e-06, + "loss": 0.6621, + "step": 16349 + }, + { + "epoch": 2.44, + "grad_norm": 3.333088633727813, + "learning_rate": 1.0883292788483725e-06, + "loss": 0.7122, + "step": 16350 + }, + { + "epoch": 2.44, + "grad_norm": 3.518022949333238, + "learning_rate": 1.0882330481338634e-06, + "loss": 0.6426, + "step": 16351 + }, + { + "epoch": 2.44, + "grad_norm": 4.770125769673651, + "learning_rate": 1.088136816595868e-06, + "loss": 0.6237, + "step": 16352 + }, + { + "epoch": 2.44, + "grad_norm": 4.480101021152408, + "learning_rate": 1.0880405842352838e-06, + "loss": 0.6465, + "step": 16353 + }, + { + "epoch": 2.44, + "grad_norm": 3.1065589487172733, + "learning_rate": 1.0879443510530089e-06, + "loss": 0.6816, + "step": 16354 + }, + { + "epoch": 2.44, + "grad_norm": 5.602969362668453, + "learning_rate": 1.0878481170499418e-06, + "loss": 0.6361, + "step": 16355 + }, + { + "epoch": 2.44, + "grad_norm": 3.2533111870136073, + "learning_rate": 1.0877518822269802e-06, + "loss": 0.6654, + "step": 16356 + }, + { + "epoch": 2.44, + "grad_norm": 3.6405603413819683, + "learning_rate": 1.0876556465850226e-06, + "loss": 0.6647, + "step": 16357 + }, + { + "epoch": 2.44, + "grad_norm": 3.3103881059801115, + "learning_rate": 1.0875594101249671e-06, + "loss": 0.6211, + "step": 16358 + }, + { + "epoch": 2.44, + "grad_norm": 4.561621843559702, + "learning_rate": 1.0874631728477123e-06, + "loss": 0.6921, + "step": 16359 + }, + { + "epoch": 2.44, + "grad_norm": 2.685721928018881, + "learning_rate": 1.0873669347541552e-06, + "loss": 0.6289, + "step": 16360 + }, + { + "epoch": 2.44, + "grad_norm": 2.842082089491518, + "learning_rate": 1.0872706958451954e-06, + "loss": 0.627, + "step": 16361 + }, + { + "epoch": 2.44, + "grad_norm": 2.5872330911140997, + "learning_rate": 1.08717445612173e-06, + "loss": 0.6107, + "step": 16362 + }, + { + "epoch": 2.44, + "grad_norm": 4.539938025946263, + "learning_rate": 1.0870782155846584e-06, + "loss": 0.6296, + "step": 16363 + }, + { + "epoch": 2.44, + "grad_norm": 2.7771436643238694, + "learning_rate": 1.0869819742348773e-06, + "loss": 0.612, + "step": 16364 + }, + { + "epoch": 2.44, + "grad_norm": 2.7787578161354567, + "learning_rate": 1.086885732073286e-06, + "loss": 0.6224, + "step": 16365 + }, + { + "epoch": 2.44, + "grad_norm": 3.1256561136378918, + "learning_rate": 1.086789489100783e-06, + "loss": 0.6387, + "step": 16366 + }, + { + "epoch": 2.44, + "grad_norm": 3.115352555066573, + "learning_rate": 1.0866932453182653e-06, + "loss": 0.6484, + "step": 16367 + }, + { + "epoch": 2.44, + "grad_norm": 2.7198784835814043, + "learning_rate": 1.0865970007266323e-06, + "loss": 0.6237, + "step": 16368 + }, + { + "epoch": 2.44, + "grad_norm": 3.7743976183620154, + "learning_rate": 1.0865007553267817e-06, + "loss": 0.6829, + "step": 16369 + }, + { + "epoch": 2.44, + "grad_norm": 3.0404260040569606, + "learning_rate": 1.0864045091196119e-06, + "loss": 0.6914, + "step": 16370 + }, + { + "epoch": 2.44, + "grad_norm": 3.1525230328599654, + "learning_rate": 1.0863082621060212e-06, + "loss": 0.6934, + "step": 16371 + }, + { + "epoch": 2.44, + "grad_norm": 2.8817242559792424, + "learning_rate": 1.086212014286908e-06, + "loss": 0.6328, + "step": 16372 + }, + { + "epoch": 2.44, + "grad_norm": 5.130242512744457, + "learning_rate": 1.08611576566317e-06, + "loss": 0.638, + "step": 16373 + }, + { + "epoch": 2.44, + "grad_norm": 5.559364765328557, + "learning_rate": 1.0860195162357063e-06, + "loss": 0.7057, + "step": 16374 + }, + { + "epoch": 2.44, + "grad_norm": 2.5722261881593487, + "learning_rate": 1.0859232660054148e-06, + "loss": 0.6439, + "step": 16375 + }, + { + "epoch": 2.44, + "grad_norm": 3.1040283480151207, + "learning_rate": 1.0858270149731938e-06, + "loss": 0.6139, + "step": 16376 + }, + { + "epoch": 2.44, + "grad_norm": 3.0572213490063507, + "learning_rate": 1.0857307631399416e-06, + "loss": 0.6165, + "step": 16377 + }, + { + "epoch": 2.44, + "grad_norm": 3.4178591429256073, + "learning_rate": 1.0856345105065567e-06, + "loss": 0.6361, + "step": 16378 + }, + { + "epoch": 2.44, + "grad_norm": 2.8834220980260943, + "learning_rate": 1.0855382570739377e-06, + "loss": 0.5918, + "step": 16379 + }, + { + "epoch": 2.44, + "grad_norm": 3.046350713774598, + "learning_rate": 1.0854420028429821e-06, + "loss": 0.6224, + "step": 16380 + }, + { + "epoch": 2.44, + "grad_norm": 3.9149568792774114, + "learning_rate": 1.0853457478145887e-06, + "loss": 0.6419, + "step": 16381 + }, + { + "epoch": 2.44, + "grad_norm": 2.752907277681982, + "learning_rate": 1.0852494919896564e-06, + "loss": 0.6374, + "step": 16382 + }, + { + "epoch": 2.44, + "grad_norm": 2.8479612804665533, + "learning_rate": 1.0851532353690826e-06, + "loss": 0.6641, + "step": 16383 + }, + { + "epoch": 2.44, + "grad_norm": 3.4928952179764083, + "learning_rate": 1.0850569779537664e-06, + "loss": 0.6257, + "step": 16384 + }, + { + "epoch": 2.44, + "grad_norm": 4.338394248006104, + "learning_rate": 1.0849607197446058e-06, + "loss": 0.6589, + "step": 16385 + }, + { + "epoch": 2.44, + "grad_norm": 2.621596925142939, + "learning_rate": 1.0848644607424991e-06, + "loss": 0.6445, + "step": 16386 + }, + { + "epoch": 2.44, + "grad_norm": 3.03017457969983, + "learning_rate": 1.0847682009483449e-06, + "loss": 0.6406, + "step": 16387 + }, + { + "epoch": 2.44, + "grad_norm": 3.3029231164004322, + "learning_rate": 1.0846719403630415e-06, + "loss": 0.6576, + "step": 16388 + }, + { + "epoch": 2.44, + "grad_norm": 5.365634682493997, + "learning_rate": 1.0845756789874878e-06, + "loss": 0.6172, + "step": 16389 + }, + { + "epoch": 2.44, + "grad_norm": 3.814645647238855, + "learning_rate": 1.0844794168225817e-06, + "loss": 0.6328, + "step": 16390 + }, + { + "epoch": 2.44, + "grad_norm": 2.9653284781155587, + "learning_rate": 1.0843831538692213e-06, + "loss": 0.6172, + "step": 16391 + }, + { + "epoch": 2.44, + "grad_norm": 3.5907357695683593, + "learning_rate": 1.084286890128306e-06, + "loss": 0.6725, + "step": 16392 + }, + { + "epoch": 2.44, + "grad_norm": 5.427323467763186, + "learning_rate": 1.0841906256007337e-06, + "loss": 0.6771, + "step": 16393 + }, + { + "epoch": 2.45, + "grad_norm": 2.9346180670110806, + "learning_rate": 1.0840943602874025e-06, + "loss": 0.6077, + "step": 16394 + }, + { + "epoch": 2.45, + "grad_norm": 4.319481627151488, + "learning_rate": 1.0839980941892115e-06, + "loss": 0.6549, + "step": 16395 + }, + { + "epoch": 2.45, + "grad_norm": 2.959320507756627, + "learning_rate": 1.0839018273070586e-06, + "loss": 0.6497, + "step": 16396 + }, + { + "epoch": 2.45, + "grad_norm": 3.0571140443531744, + "learning_rate": 1.0838055596418424e-06, + "loss": 0.6224, + "step": 16397 + }, + { + "epoch": 2.45, + "grad_norm": 2.960258836414737, + "learning_rate": 1.0837092911944619e-06, + "loss": 0.627, + "step": 16398 + }, + { + "epoch": 2.45, + "grad_norm": 4.315268005106289, + "learning_rate": 1.0836130219658151e-06, + "loss": 0.666, + "step": 16399 + }, + { + "epoch": 2.45, + "grad_norm": 3.134249698197382, + "learning_rate": 1.0835167519568006e-06, + "loss": 0.6396, + "step": 16400 + }, + { + "epoch": 2.45, + "grad_norm": 3.4595581906547377, + "learning_rate": 1.0834204811683166e-06, + "loss": 0.623, + "step": 16401 + }, + { + "epoch": 2.45, + "grad_norm": 3.0105214162570433, + "learning_rate": 1.083324209601262e-06, + "loss": 0.6458, + "step": 16402 + }, + { + "epoch": 2.45, + "grad_norm": 3.20222350878902, + "learning_rate": 1.0832279372565352e-06, + "loss": 0.6504, + "step": 16403 + }, + { + "epoch": 2.45, + "grad_norm": 4.229916748267959, + "learning_rate": 1.0831316641350346e-06, + "loss": 0.6074, + "step": 16404 + }, + { + "epoch": 2.45, + "grad_norm": 2.873645898034827, + "learning_rate": 1.083035390237659e-06, + "loss": 0.6224, + "step": 16405 + }, + { + "epoch": 2.45, + "grad_norm": 3.6511204151152277, + "learning_rate": 1.0829391155653068e-06, + "loss": 0.6621, + "step": 16406 + }, + { + "epoch": 2.45, + "grad_norm": 2.937619763451913, + "learning_rate": 1.0828428401188762e-06, + "loss": 0.5866, + "step": 16407 + }, + { + "epoch": 2.45, + "grad_norm": 4.4610041066455555, + "learning_rate": 1.0827465638992663e-06, + "loss": 0.6048, + "step": 16408 + }, + { + "epoch": 2.45, + "grad_norm": 4.0208663418518, + "learning_rate": 1.0826502869073755e-06, + "loss": 0.668, + "step": 16409 + }, + { + "epoch": 2.45, + "grad_norm": 3.907365221811988, + "learning_rate": 1.0825540091441019e-06, + "loss": 0.6803, + "step": 16410 + }, + { + "epoch": 2.45, + "grad_norm": 3.06953764770174, + "learning_rate": 1.0824577306103446e-06, + "loss": 0.6426, + "step": 16411 + }, + { + "epoch": 2.45, + "grad_norm": 3.155152987671275, + "learning_rate": 1.0823614513070023e-06, + "loss": 0.6758, + "step": 16412 + }, + { + "epoch": 2.45, + "grad_norm": 3.159946956765542, + "learning_rate": 1.0822651712349728e-06, + "loss": 0.7012, + "step": 16413 + }, + { + "epoch": 2.45, + "grad_norm": 3.251484762984483, + "learning_rate": 1.0821688903951555e-06, + "loss": 0.6126, + "step": 16414 + }, + { + "epoch": 2.45, + "grad_norm": 3.5660586316634597, + "learning_rate": 1.0820726087884487e-06, + "loss": 0.6478, + "step": 16415 + }, + { + "epoch": 2.45, + "grad_norm": 3.222180305337158, + "learning_rate": 1.081976326415751e-06, + "loss": 0.6615, + "step": 16416 + }, + { + "epoch": 2.45, + "grad_norm": 2.9814590301329242, + "learning_rate": 1.0818800432779607e-06, + "loss": 0.6341, + "step": 16417 + }, + { + "epoch": 2.45, + "grad_norm": 2.9178391876100966, + "learning_rate": 1.081783759375977e-06, + "loss": 0.6374, + "step": 16418 + }, + { + "epoch": 2.45, + "grad_norm": 4.720578262246573, + "learning_rate": 1.0816874747106982e-06, + "loss": 0.6315, + "step": 16419 + }, + { + "epoch": 2.45, + "grad_norm": 4.641967355337304, + "learning_rate": 1.0815911892830228e-06, + "loss": 0.6081, + "step": 16420 + }, + { + "epoch": 2.45, + "grad_norm": 3.2349580387338936, + "learning_rate": 1.0814949030938495e-06, + "loss": 0.6146, + "step": 16421 + }, + { + "epoch": 2.45, + "grad_norm": 3.0443284044246135, + "learning_rate": 1.0813986161440775e-06, + "loss": 0.6452, + "step": 16422 + }, + { + "epoch": 2.45, + "grad_norm": 6.172267113954929, + "learning_rate": 1.0813023284346045e-06, + "loss": 0.6387, + "step": 16423 + }, + { + "epoch": 2.45, + "grad_norm": 4.686493161938374, + "learning_rate": 1.08120603996633e-06, + "loss": 0.651, + "step": 16424 + }, + { + "epoch": 2.45, + "grad_norm": 4.977900098418853, + "learning_rate": 1.0811097507401525e-06, + "loss": 0.6283, + "step": 16425 + }, + { + "epoch": 2.45, + "grad_norm": 2.95673548993502, + "learning_rate": 1.0810134607569703e-06, + "loss": 0.6654, + "step": 16426 + }, + { + "epoch": 2.45, + "grad_norm": 6.828190608050031, + "learning_rate": 1.0809171700176824e-06, + "loss": 0.6387, + "step": 16427 + }, + { + "epoch": 2.45, + "grad_norm": 4.966618295499133, + "learning_rate": 1.0808208785231873e-06, + "loss": 0.6784, + "step": 16428 + }, + { + "epoch": 2.45, + "grad_norm": 5.031642841957264, + "learning_rate": 1.0807245862743842e-06, + "loss": 0.6784, + "step": 16429 + }, + { + "epoch": 2.45, + "grad_norm": 2.749515839631927, + "learning_rate": 1.0806282932721709e-06, + "loss": 0.6328, + "step": 16430 + }, + { + "epoch": 2.45, + "grad_norm": 4.619559911776305, + "learning_rate": 1.0805319995174465e-06, + "loss": 0.6263, + "step": 16431 + }, + { + "epoch": 2.45, + "grad_norm": 4.162798651697016, + "learning_rate": 1.0804357050111104e-06, + "loss": 0.6732, + "step": 16432 + }, + { + "epoch": 2.45, + "grad_norm": 3.097444800608062, + "learning_rate": 1.08033940975406e-06, + "loss": 0.6315, + "step": 16433 + }, + { + "epoch": 2.45, + "grad_norm": 7.1103972298137945, + "learning_rate": 1.0802431137471954e-06, + "loss": 0.6641, + "step": 16434 + }, + { + "epoch": 2.45, + "grad_norm": 2.780228438666357, + "learning_rate": 1.0801468169914148e-06, + "loss": 0.6374, + "step": 16435 + }, + { + "epoch": 2.45, + "grad_norm": 5.894009255158155, + "learning_rate": 1.0800505194876163e-06, + "loss": 0.6836, + "step": 16436 + }, + { + "epoch": 2.45, + "grad_norm": 2.9905009700183083, + "learning_rate": 1.0799542212366996e-06, + "loss": 0.6517, + "step": 16437 + }, + { + "epoch": 2.45, + "grad_norm": 3.7161365942989573, + "learning_rate": 1.0798579222395632e-06, + "loss": 0.6035, + "step": 16438 + }, + { + "epoch": 2.45, + "grad_norm": 3.1105367501975523, + "learning_rate": 1.0797616224971054e-06, + "loss": 0.6328, + "step": 16439 + }, + { + "epoch": 2.45, + "grad_norm": 2.592379437716966, + "learning_rate": 1.0796653220102254e-06, + "loss": 0.5938, + "step": 16440 + }, + { + "epoch": 2.45, + "grad_norm": 2.890423236315873, + "learning_rate": 1.079569020779822e-06, + "loss": 0.6133, + "step": 16441 + }, + { + "epoch": 2.45, + "grad_norm": 4.194180650445173, + "learning_rate": 1.0794727188067937e-06, + "loss": 0.5872, + "step": 16442 + }, + { + "epoch": 2.45, + "grad_norm": 3.2634018722350433, + "learning_rate": 1.07937641609204e-06, + "loss": 0.6836, + "step": 16443 + }, + { + "epoch": 2.45, + "grad_norm": 4.014230312463819, + "learning_rate": 1.0792801126364585e-06, + "loss": 0.6328, + "step": 16444 + }, + { + "epoch": 2.45, + "grad_norm": 5.043233499379622, + "learning_rate": 1.0791838084409494e-06, + "loss": 0.6562, + "step": 16445 + }, + { + "epoch": 2.45, + "grad_norm": 3.128353066226807, + "learning_rate": 1.0790875035064104e-06, + "loss": 0.6419, + "step": 16446 + }, + { + "epoch": 2.45, + "grad_norm": 3.2563412672890535, + "learning_rate": 1.0789911978337408e-06, + "loss": 0.6237, + "step": 16447 + }, + { + "epoch": 2.45, + "grad_norm": 2.9403530949836574, + "learning_rate": 1.0788948914238396e-06, + "loss": 0.653, + "step": 16448 + }, + { + "epoch": 2.45, + "grad_norm": 3.06980782609871, + "learning_rate": 1.0787985842776048e-06, + "loss": 0.6432, + "step": 16449 + }, + { + "epoch": 2.45, + "grad_norm": 3.2999082256659023, + "learning_rate": 1.0787022763959364e-06, + "loss": 0.6406, + "step": 16450 + }, + { + "epoch": 2.45, + "grad_norm": 3.1468289714846116, + "learning_rate": 1.0786059677797322e-06, + "loss": 0.6198, + "step": 16451 + }, + { + "epoch": 2.45, + "grad_norm": 4.187458230573429, + "learning_rate": 1.0785096584298921e-06, + "loss": 0.6992, + "step": 16452 + }, + { + "epoch": 2.45, + "grad_norm": 3.200318923469718, + "learning_rate": 1.0784133483473143e-06, + "loss": 0.6198, + "step": 16453 + }, + { + "epoch": 2.45, + "grad_norm": 3.0701673905769606, + "learning_rate": 1.0783170375328974e-06, + "loss": 0.6224, + "step": 16454 + }, + { + "epoch": 2.45, + "grad_norm": 3.499155470040558, + "learning_rate": 1.0782207259875411e-06, + "loss": 0.6523, + "step": 16455 + }, + { + "epoch": 2.45, + "grad_norm": 4.053831887423509, + "learning_rate": 1.0781244137121436e-06, + "loss": 0.6328, + "step": 16456 + }, + { + "epoch": 2.45, + "grad_norm": 3.4786737875276526, + "learning_rate": 1.0780281007076039e-06, + "loss": 0.6523, + "step": 16457 + }, + { + "epoch": 2.45, + "grad_norm": 3.1963522721825792, + "learning_rate": 1.0779317869748214e-06, + "loss": 0.6211, + "step": 16458 + }, + { + "epoch": 2.45, + "grad_norm": 3.1086793504698855, + "learning_rate": 1.0778354725146945e-06, + "loss": 0.6341, + "step": 16459 + }, + { + "epoch": 2.45, + "grad_norm": 2.9097850725510788, + "learning_rate": 1.0777391573281223e-06, + "loss": 0.623, + "step": 16460 + }, + { + "epoch": 2.46, + "grad_norm": 2.9478056339022887, + "learning_rate": 1.0776428414160037e-06, + "loss": 0.6452, + "step": 16461 + }, + { + "epoch": 2.46, + "grad_norm": 3.6616999414418974, + "learning_rate": 1.0775465247792375e-06, + "loss": 0.6354, + "step": 16462 + }, + { + "epoch": 2.46, + "grad_norm": 8.993978098101477, + "learning_rate": 1.0774502074187227e-06, + "loss": 0.6361, + "step": 16463 + }, + { + "epoch": 2.46, + "grad_norm": 3.621021062809438, + "learning_rate": 1.077353889335358e-06, + "loss": 0.6081, + "step": 16464 + }, + { + "epoch": 2.46, + "grad_norm": 3.0768704349795377, + "learning_rate": 1.077257570530043e-06, + "loss": 0.6426, + "step": 16465 + }, + { + "epoch": 2.46, + "grad_norm": 3.0359069686867146, + "learning_rate": 1.0771612510036763e-06, + "loss": 0.6621, + "step": 16466 + }, + { + "epoch": 2.46, + "grad_norm": 2.875821343914683, + "learning_rate": 1.0770649307571567e-06, + "loss": 0.638, + "step": 16467 + }, + { + "epoch": 2.46, + "grad_norm": 5.1748041138358785, + "learning_rate": 1.0769686097913833e-06, + "loss": 0.6432, + "step": 16468 + }, + { + "epoch": 2.46, + "grad_norm": 4.552246076707506, + "learning_rate": 1.0768722881072552e-06, + "loss": 0.5928, + "step": 16469 + }, + { + "epoch": 2.46, + "grad_norm": 4.492653999362387, + "learning_rate": 1.0767759657056707e-06, + "loss": 0.5902, + "step": 16470 + }, + { + "epoch": 2.46, + "grad_norm": 4.483038799461084, + "learning_rate": 1.0766796425875298e-06, + "loss": 0.6283, + "step": 16471 + }, + { + "epoch": 2.46, + "grad_norm": 4.108142497093002, + "learning_rate": 1.076583318753731e-06, + "loss": 0.6595, + "step": 16472 + }, + { + "epoch": 2.46, + "grad_norm": 2.9782110149244976, + "learning_rate": 1.076486994205173e-06, + "loss": 0.5739, + "step": 16473 + }, + { + "epoch": 2.46, + "grad_norm": 2.659827633945598, + "learning_rate": 1.0763906689427555e-06, + "loss": 0.6302, + "step": 16474 + }, + { + "epoch": 2.46, + "grad_norm": 7.046791778665622, + "learning_rate": 1.076294342967377e-06, + "loss": 0.6341, + "step": 16475 + }, + { + "epoch": 2.46, + "grad_norm": 3.4488702333030696, + "learning_rate": 1.0761980162799365e-06, + "loss": 0.6178, + "step": 16476 + }, + { + "epoch": 2.46, + "grad_norm": 3.2664096222732213, + "learning_rate": 1.0761016888813333e-06, + "loss": 0.6217, + "step": 16477 + }, + { + "epoch": 2.46, + "grad_norm": 5.107725060422061, + "learning_rate": 1.0760053607724664e-06, + "loss": 0.6921, + "step": 16478 + }, + { + "epoch": 2.46, + "grad_norm": 3.110303727287481, + "learning_rate": 1.0759090319542345e-06, + "loss": 0.6087, + "step": 16479 + }, + { + "epoch": 2.46, + "grad_norm": 4.987254609017298, + "learning_rate": 1.075812702427537e-06, + "loss": 0.6113, + "step": 16480 + }, + { + "epoch": 2.46, + "grad_norm": 3.507102296229874, + "learning_rate": 1.075716372193273e-06, + "loss": 0.6628, + "step": 16481 + }, + { + "epoch": 2.46, + "grad_norm": 7.817969017732919, + "learning_rate": 1.0756200412523414e-06, + "loss": 0.6673, + "step": 16482 + }, + { + "epoch": 2.46, + "grad_norm": 5.179717510551749, + "learning_rate": 1.0755237096056409e-06, + "loss": 0.6758, + "step": 16483 + }, + { + "epoch": 2.46, + "grad_norm": 3.449247266036635, + "learning_rate": 1.0754273772540713e-06, + "loss": 0.6439, + "step": 16484 + }, + { + "epoch": 2.46, + "grad_norm": 3.8525375003601683, + "learning_rate": 1.0753310441985312e-06, + "loss": 0.6536, + "step": 16485 + }, + { + "epoch": 2.46, + "grad_norm": 3.153711485154768, + "learning_rate": 1.0752347104399197e-06, + "loss": 0.6621, + "step": 16486 + }, + { + "epoch": 2.46, + "grad_norm": 4.41112701467824, + "learning_rate": 1.075138375979136e-06, + "loss": 0.6536, + "step": 16487 + }, + { + "epoch": 2.46, + "grad_norm": 3.8229161040466164, + "learning_rate": 1.0750420408170797e-06, + "loss": 0.6458, + "step": 16488 + }, + { + "epoch": 2.46, + "grad_norm": 4.318089385283903, + "learning_rate": 1.0749457049546485e-06, + "loss": 0.6569, + "step": 16489 + }, + { + "epoch": 2.46, + "grad_norm": 3.442860511023687, + "learning_rate": 1.0748493683927433e-06, + "loss": 0.6322, + "step": 16490 + }, + { + "epoch": 2.46, + "grad_norm": 3.579696115661697, + "learning_rate": 1.074753031132262e-06, + "loss": 0.6732, + "step": 16491 + }, + { + "epoch": 2.46, + "grad_norm": 4.263490055218794, + "learning_rate": 1.0746566931741036e-06, + "loss": 0.6107, + "step": 16492 + }, + { + "epoch": 2.46, + "grad_norm": 2.85540961358081, + "learning_rate": 1.0745603545191681e-06, + "loss": 0.5918, + "step": 16493 + }, + { + "epoch": 2.46, + "grad_norm": 3.3006835291878867, + "learning_rate": 1.0744640151683544e-06, + "loss": 0.6227, + "step": 16494 + }, + { + "epoch": 2.46, + "grad_norm": 3.120207122886225, + "learning_rate": 1.0743676751225612e-06, + "loss": 0.6634, + "step": 16495 + }, + { + "epoch": 2.46, + "grad_norm": 4.878513883053347, + "learning_rate": 1.0742713343826878e-06, + "loss": 0.6283, + "step": 16496 + }, + { + "epoch": 2.46, + "grad_norm": 2.963285078443818, + "learning_rate": 1.0741749929496336e-06, + "loss": 0.5983, + "step": 16497 + }, + { + "epoch": 2.46, + "grad_norm": 4.129372827025567, + "learning_rate": 1.0740786508242978e-06, + "loss": 0.653, + "step": 16498 + }, + { + "epoch": 2.46, + "grad_norm": 2.7027195629801626, + "learning_rate": 1.0739823080075792e-06, + "loss": 0.6055, + "step": 16499 + }, + { + "epoch": 2.46, + "grad_norm": 4.46439412690948, + "learning_rate": 1.0738859645003772e-06, + "loss": 0.6139, + "step": 16500 + }, + { + "epoch": 2.46, + "grad_norm": 2.974873617522218, + "learning_rate": 1.0737896203035913e-06, + "loss": 0.6191, + "step": 16501 + }, + { + "epoch": 2.46, + "grad_norm": 4.575636459493583, + "learning_rate": 1.0736932754181199e-06, + "loss": 0.6315, + "step": 16502 + }, + { + "epoch": 2.46, + "grad_norm": 3.2399878836413625, + "learning_rate": 1.0735969298448626e-06, + "loss": 0.6862, + "step": 16503 + }, + { + "epoch": 2.46, + "grad_norm": 3.8557962569757085, + "learning_rate": 1.0735005835847188e-06, + "loss": 0.6406, + "step": 16504 + }, + { + "epoch": 2.46, + "grad_norm": 3.195782026835101, + "learning_rate": 1.0734042366385875e-06, + "loss": 0.6673, + "step": 16505 + }, + { + "epoch": 2.46, + "grad_norm": 3.025545544407441, + "learning_rate": 1.0733078890073682e-06, + "loss": 0.6719, + "step": 16506 + }, + { + "epoch": 2.46, + "grad_norm": 4.399279240275406, + "learning_rate": 1.0732115406919595e-06, + "loss": 0.6693, + "step": 16507 + }, + { + "epoch": 2.46, + "grad_norm": 3.2327925098209094, + "learning_rate": 1.0731151916932612e-06, + "loss": 0.6243, + "step": 16508 + }, + { + "epoch": 2.46, + "grad_norm": 4.663094803660011, + "learning_rate": 1.0730188420121726e-06, + "loss": 0.6315, + "step": 16509 + }, + { + "epoch": 2.46, + "grad_norm": 3.009745464213995, + "learning_rate": 1.0729224916495922e-06, + "loss": 0.6484, + "step": 16510 + }, + { + "epoch": 2.46, + "grad_norm": 4.084937654862896, + "learning_rate": 1.0728261406064206e-06, + "loss": 0.6172, + "step": 16511 + }, + { + "epoch": 2.46, + "grad_norm": 2.878843542158494, + "learning_rate": 1.0727297888835553e-06, + "loss": 0.5866, + "step": 16512 + }, + { + "epoch": 2.46, + "grad_norm": 3.788208054841044, + "learning_rate": 1.0726334364818966e-06, + "loss": 0.6484, + "step": 16513 + }, + { + "epoch": 2.46, + "grad_norm": 3.3024067667441583, + "learning_rate": 1.072537083402344e-06, + "loss": 0.6335, + "step": 16514 + }, + { + "epoch": 2.46, + "grad_norm": 3.788377111497587, + "learning_rate": 1.072440729645796e-06, + "loss": 0.6165, + "step": 16515 + }, + { + "epoch": 2.46, + "grad_norm": 3.671500036378551, + "learning_rate": 1.0723443752131527e-06, + "loss": 0.6224, + "step": 16516 + }, + { + "epoch": 2.46, + "grad_norm": 2.994610962257273, + "learning_rate": 1.0722480201053122e-06, + "loss": 0.6888, + "step": 16517 + }, + { + "epoch": 2.46, + "grad_norm": 5.5796532328566, + "learning_rate": 1.0721516643231753e-06, + "loss": 0.625, + "step": 16518 + }, + { + "epoch": 2.46, + "grad_norm": 3.2067887398195105, + "learning_rate": 1.0720553078676403e-06, + "loss": 0.7324, + "step": 16519 + }, + { + "epoch": 2.46, + "grad_norm": 3.4964624094303445, + "learning_rate": 1.0719589507396065e-06, + "loss": 0.6491, + "step": 16520 + }, + { + "epoch": 2.46, + "grad_norm": 2.7090735253221343, + "learning_rate": 1.0718625929399737e-06, + "loss": 0.6068, + "step": 16521 + }, + { + "epoch": 2.46, + "grad_norm": 6.071473784270163, + "learning_rate": 1.071766234469641e-06, + "loss": 0.6263, + "step": 16522 + }, + { + "epoch": 2.46, + "grad_norm": 3.3427718314103365, + "learning_rate": 1.0716698753295074e-06, + "loss": 0.6517, + "step": 16523 + }, + { + "epoch": 2.46, + "grad_norm": 4.362252454323943, + "learning_rate": 1.0715735155204727e-06, + "loss": 0.6387, + "step": 16524 + }, + { + "epoch": 2.46, + "grad_norm": 3.137991938956795, + "learning_rate": 1.0714771550434363e-06, + "loss": 0.6393, + "step": 16525 + }, + { + "epoch": 2.46, + "grad_norm": 3.6757367837969017, + "learning_rate": 1.071380793899297e-06, + "loss": 0.6296, + "step": 16526 + }, + { + "epoch": 2.46, + "grad_norm": 2.78072307830015, + "learning_rate": 1.0712844320889542e-06, + "loss": 0.6165, + "step": 16527 + }, + { + "epoch": 2.47, + "grad_norm": 3.7729738191111415, + "learning_rate": 1.071188069613308e-06, + "loss": 0.6432, + "step": 16528 + }, + { + "epoch": 2.47, + "grad_norm": 3.2935546550910786, + "learning_rate": 1.071091706473257e-06, + "loss": 0.6986, + "step": 16529 + }, + { + "epoch": 2.47, + "grad_norm": 2.8475311136648576, + "learning_rate": 1.0709953426697007e-06, + "loss": 0.6569, + "step": 16530 + }, + { + "epoch": 2.47, + "grad_norm": 2.7114351949218305, + "learning_rate": 1.070898978203539e-06, + "loss": 0.6328, + "step": 16531 + }, + { + "epoch": 2.47, + "grad_norm": 3.052379883945496, + "learning_rate": 1.0708026130756706e-06, + "loss": 0.6595, + "step": 16532 + }, + { + "epoch": 2.47, + "grad_norm": 3.9601434658404626, + "learning_rate": 1.070706247286995e-06, + "loss": 0.6406, + "step": 16533 + }, + { + "epoch": 2.47, + "grad_norm": 2.7482988143919522, + "learning_rate": 1.0706098808384119e-06, + "loss": 0.6406, + "step": 16534 + }, + { + "epoch": 2.47, + "grad_norm": 3.8694616502788968, + "learning_rate": 1.0705135137308206e-06, + "loss": 0.651, + "step": 16535 + }, + { + "epoch": 2.47, + "grad_norm": 2.995931187779108, + "learning_rate": 1.0704171459651201e-06, + "loss": 0.6335, + "step": 16536 + }, + { + "epoch": 2.47, + "grad_norm": 3.1159305290989208, + "learning_rate": 1.0703207775422106e-06, + "loss": 0.6589, + "step": 16537 + }, + { + "epoch": 2.47, + "grad_norm": 2.887441773441922, + "learning_rate": 1.070224408462991e-06, + "loss": 0.6361, + "step": 16538 + }, + { + "epoch": 2.47, + "grad_norm": 3.1463784636278205, + "learning_rate": 1.0701280387283604e-06, + "loss": 0.6536, + "step": 16539 + }, + { + "epoch": 2.47, + "grad_norm": 3.2206934691324633, + "learning_rate": 1.070031668339219e-06, + "loss": 0.651, + "step": 16540 + }, + { + "epoch": 2.47, + "grad_norm": 3.0075542449671806, + "learning_rate": 1.0699352972964657e-06, + "loss": 0.6263, + "step": 16541 + }, + { + "epoch": 2.47, + "grad_norm": 2.770257001413496, + "learning_rate": 1.0698389256010001e-06, + "loss": 0.6335, + "step": 16542 + }, + { + "epoch": 2.47, + "grad_norm": 2.8315517737791036, + "learning_rate": 1.0697425532537217e-06, + "loss": 0.6354, + "step": 16543 + }, + { + "epoch": 2.47, + "grad_norm": 3.6238783145796982, + "learning_rate": 1.0696461802555298e-06, + "loss": 0.6647, + "step": 16544 + }, + { + "epoch": 2.47, + "grad_norm": 2.48225285088224, + "learning_rate": 1.0695498066073238e-06, + "loss": 0.597, + "step": 16545 + }, + { + "epoch": 2.47, + "grad_norm": 2.892866371631566, + "learning_rate": 1.0694534323100034e-06, + "loss": 0.6374, + "step": 16546 + }, + { + "epoch": 2.47, + "grad_norm": 2.6505883381803237, + "learning_rate": 1.0693570573644684e-06, + "loss": 0.6198, + "step": 16547 + }, + { + "epoch": 2.47, + "grad_norm": 3.1819377895701146, + "learning_rate": 1.0692606817716174e-06, + "loss": 0.6628, + "step": 16548 + }, + { + "epoch": 2.47, + "grad_norm": 2.997316659430023, + "learning_rate": 1.0691643055323503e-06, + "loss": 0.696, + "step": 16549 + }, + { + "epoch": 2.47, + "grad_norm": 2.772234399619201, + "learning_rate": 1.0690679286475668e-06, + "loss": 0.6374, + "step": 16550 + }, + { + "epoch": 2.47, + "grad_norm": 2.859487424470108, + "learning_rate": 1.0689715511181663e-06, + "loss": 0.6387, + "step": 16551 + }, + { + "epoch": 2.47, + "grad_norm": 7.416197683490612, + "learning_rate": 1.0688751729450478e-06, + "loss": 0.6569, + "step": 16552 + }, + { + "epoch": 2.47, + "grad_norm": 2.832276799877045, + "learning_rate": 1.0687787941291115e-06, + "loss": 0.6628, + "step": 16553 + }, + { + "epoch": 2.47, + "grad_norm": 2.468993312765913, + "learning_rate": 1.0686824146712567e-06, + "loss": 0.61, + "step": 16554 + }, + { + "epoch": 2.47, + "grad_norm": 5.852264244765483, + "learning_rate": 1.0685860345723827e-06, + "loss": 0.7038, + "step": 16555 + }, + { + "epoch": 2.47, + "grad_norm": 3.399958776631386, + "learning_rate": 1.0684896538333894e-06, + "loss": 0.6159, + "step": 16556 + }, + { + "epoch": 2.47, + "grad_norm": 3.4985123134512603, + "learning_rate": 1.0683932724551758e-06, + "loss": 0.653, + "step": 16557 + }, + { + "epoch": 2.47, + "grad_norm": 2.9537280156187347, + "learning_rate": 1.0682968904386419e-06, + "loss": 0.6914, + "step": 16558 + }, + { + "epoch": 2.47, + "grad_norm": 3.2784181018342657, + "learning_rate": 1.0682005077846871e-06, + "loss": 0.6914, + "step": 16559 + }, + { + "epoch": 2.47, + "grad_norm": 2.5346543929172975, + "learning_rate": 1.0681041244942108e-06, + "loss": 0.6152, + "step": 16560 + }, + { + "epoch": 2.47, + "grad_norm": 3.7404717708602533, + "learning_rate": 1.0680077405681132e-06, + "loss": 0.6608, + "step": 16561 + }, + { + "epoch": 2.47, + "grad_norm": 3.040054061597467, + "learning_rate": 1.0679113560072926e-06, + "loss": 0.6257, + "step": 16562 + }, + { + "epoch": 2.47, + "grad_norm": 3.0678031587986805, + "learning_rate": 1.0678149708126496e-06, + "loss": 0.6439, + "step": 16563 + }, + { + "epoch": 2.47, + "grad_norm": 2.732235023239454, + "learning_rate": 1.0677185849850837e-06, + "loss": 0.6243, + "step": 16564 + }, + { + "epoch": 2.47, + "grad_norm": 3.267265201973663, + "learning_rate": 1.0676221985254937e-06, + "loss": 0.6576, + "step": 16565 + }, + { + "epoch": 2.47, + "grad_norm": 2.807034434828246, + "learning_rate": 1.0675258114347804e-06, + "loss": 0.6348, + "step": 16566 + }, + { + "epoch": 2.47, + "grad_norm": 3.096826316420475, + "learning_rate": 1.0674294237138426e-06, + "loss": 0.6198, + "step": 16567 + }, + { + "epoch": 2.47, + "grad_norm": 2.883176360802321, + "learning_rate": 1.0673330353635796e-06, + "loss": 0.6348, + "step": 16568 + }, + { + "epoch": 2.47, + "grad_norm": 2.7468394506646105, + "learning_rate": 1.0672366463848919e-06, + "loss": 0.6003, + "step": 16569 + }, + { + "epoch": 2.47, + "grad_norm": 3.2645727429381712, + "learning_rate": 1.0671402567786785e-06, + "loss": 0.627, + "step": 16570 + }, + { + "epoch": 2.47, + "grad_norm": 3.90130216538511, + "learning_rate": 1.0670438665458388e-06, + "loss": 0.6283, + "step": 16571 + }, + { + "epoch": 2.47, + "grad_norm": 2.902373511354042, + "learning_rate": 1.066947475687273e-06, + "loss": 0.6315, + "step": 16572 + }, + { + "epoch": 2.47, + "grad_norm": 5.558463785014613, + "learning_rate": 1.0668510842038805e-06, + "loss": 0.681, + "step": 16573 + }, + { + "epoch": 2.47, + "grad_norm": 3.9749889576090616, + "learning_rate": 1.0667546920965612e-06, + "loss": 0.6354, + "step": 16574 + }, + { + "epoch": 2.47, + "grad_norm": 3.1129595629367985, + "learning_rate": 1.0666582993662142e-06, + "loss": 0.6497, + "step": 16575 + }, + { + "epoch": 2.47, + "grad_norm": 2.687233799991034, + "learning_rate": 1.0665619060137393e-06, + "loss": 0.6243, + "step": 16576 + }, + { + "epoch": 2.47, + "grad_norm": 2.846989246163363, + "learning_rate": 1.0664655120400366e-06, + "loss": 0.6445, + "step": 16577 + }, + { + "epoch": 2.47, + "grad_norm": 5.256628166582968, + "learning_rate": 1.0663691174460047e-06, + "loss": 0.6185, + "step": 16578 + }, + { + "epoch": 2.47, + "grad_norm": 2.9633544575835153, + "learning_rate": 1.0662727222325444e-06, + "loss": 0.6257, + "step": 16579 + }, + { + "epoch": 2.47, + "grad_norm": 3.968845100247705, + "learning_rate": 1.0661763264005553e-06, + "loss": 0.6263, + "step": 16580 + }, + { + "epoch": 2.47, + "grad_norm": 3.4270197132540914, + "learning_rate": 1.066079929950936e-06, + "loss": 0.6328, + "step": 16581 + }, + { + "epoch": 2.47, + "grad_norm": 3.4709335076433003, + "learning_rate": 1.0659835328845874e-06, + "loss": 0.6543, + "step": 16582 + }, + { + "epoch": 2.47, + "grad_norm": 2.8992954683603243, + "learning_rate": 1.0658871352024081e-06, + "loss": 0.6263, + "step": 16583 + }, + { + "epoch": 2.47, + "grad_norm": 3.522212942693985, + "learning_rate": 1.065790736905299e-06, + "loss": 0.7135, + "step": 16584 + }, + { + "epoch": 2.47, + "grad_norm": 6.650033454074483, + "learning_rate": 1.0656943379941587e-06, + "loss": 0.6335, + "step": 16585 + }, + { + "epoch": 2.47, + "grad_norm": 6.880607070577137, + "learning_rate": 1.0655979384698874e-06, + "loss": 0.6686, + "step": 16586 + }, + { + "epoch": 2.47, + "grad_norm": 4.786851010560459, + "learning_rate": 1.0655015383333848e-06, + "loss": 0.6136, + "step": 16587 + }, + { + "epoch": 2.47, + "grad_norm": 2.992745691388848, + "learning_rate": 1.0654051375855506e-06, + "loss": 0.6595, + "step": 16588 + }, + { + "epoch": 2.47, + "grad_norm": 2.8751694170547575, + "learning_rate": 1.0653087362272843e-06, + "loss": 0.6029, + "step": 16589 + }, + { + "epoch": 2.47, + "grad_norm": 4.237057252824357, + "learning_rate": 1.0652123342594858e-06, + "loss": 0.6602, + "step": 16590 + }, + { + "epoch": 2.47, + "grad_norm": 4.1457898226713015, + "learning_rate": 1.065115931683055e-06, + "loss": 0.6217, + "step": 16591 + }, + { + "epoch": 2.47, + "grad_norm": 3.00220083090861, + "learning_rate": 1.0650195284988913e-06, + "loss": 0.6576, + "step": 16592 + }, + { + "epoch": 2.47, + "grad_norm": 2.9667644512757634, + "learning_rate": 1.0649231247078946e-06, + "loss": 0.681, + "step": 16593 + }, + { + "epoch": 2.47, + "grad_norm": 6.513711158711923, + "learning_rate": 1.064826720310965e-06, + "loss": 0.6504, + "step": 16594 + }, + { + "epoch": 2.48, + "grad_norm": 2.826766095319304, + "learning_rate": 1.0647303153090013e-06, + "loss": 0.6061, + "step": 16595 + }, + { + "epoch": 2.48, + "grad_norm": 3.267594991795164, + "learning_rate": 1.0646339097029042e-06, + "loss": 0.653, + "step": 16596 + }, + { + "epoch": 2.48, + "grad_norm": 3.596460502900564, + "learning_rate": 1.064537503493573e-06, + "loss": 0.627, + "step": 16597 + }, + { + "epoch": 2.48, + "grad_norm": 3.6735288070569174, + "learning_rate": 1.0644410966819078e-06, + "loss": 0.6458, + "step": 16598 + }, + { + "epoch": 2.48, + "grad_norm": 2.906309131968693, + "learning_rate": 1.0643446892688077e-06, + "loss": 0.6621, + "step": 16599 + }, + { + "epoch": 2.48, + "grad_norm": 4.191984711962047, + "learning_rate": 1.0642482812551732e-06, + "loss": 0.6315, + "step": 16600 + }, + { + "epoch": 2.48, + "grad_norm": 2.713102040467932, + "learning_rate": 1.0641518726419038e-06, + "loss": 0.6732, + "step": 16601 + }, + { + "epoch": 2.48, + "grad_norm": 4.275603015691537, + "learning_rate": 1.0640554634298993e-06, + "loss": 0.6439, + "step": 16602 + }, + { + "epoch": 2.48, + "grad_norm": 2.880085287487562, + "learning_rate": 1.0639590536200596e-06, + "loss": 0.6432, + "step": 16603 + }, + { + "epoch": 2.48, + "grad_norm": 2.7612486557584357, + "learning_rate": 1.0638626432132842e-06, + "loss": 0.6335, + "step": 16604 + }, + { + "epoch": 2.48, + "grad_norm": 5.53325963251375, + "learning_rate": 1.063766232210473e-06, + "loss": 0.6126, + "step": 16605 + }, + { + "epoch": 2.48, + "grad_norm": 3.49036368003377, + "learning_rate": 1.063669820612526e-06, + "loss": 0.6393, + "step": 16606 + }, + { + "epoch": 2.48, + "grad_norm": 2.75863607803986, + "learning_rate": 1.0635734084203432e-06, + "loss": 0.61, + "step": 16607 + }, + { + "epoch": 2.48, + "grad_norm": 2.8511102008861546, + "learning_rate": 1.0634769956348238e-06, + "loss": 0.6156, + "step": 16608 + }, + { + "epoch": 2.48, + "grad_norm": 4.131394513886257, + "learning_rate": 1.0633805822568682e-06, + "loss": 0.6035, + "step": 16609 + }, + { + "epoch": 2.48, + "grad_norm": 4.928504129976515, + "learning_rate": 1.063284168287376e-06, + "loss": 0.64, + "step": 16610 + }, + { + "epoch": 2.48, + "grad_norm": 3.2348091076638354, + "learning_rate": 1.0631877537272474e-06, + "loss": 0.6341, + "step": 16611 + }, + { + "epoch": 2.48, + "grad_norm": 2.8835181492537374, + "learning_rate": 1.0630913385773812e-06, + "loss": 0.6152, + "step": 16612 + }, + { + "epoch": 2.48, + "grad_norm": 3.25550688093306, + "learning_rate": 1.0629949228386783e-06, + "loss": 0.6406, + "step": 16613 + }, + { + "epoch": 2.48, + "grad_norm": 3.1847354817506597, + "learning_rate": 1.0628985065120383e-06, + "loss": 0.64, + "step": 16614 + }, + { + "epoch": 2.48, + "grad_norm": 4.634429406268665, + "learning_rate": 1.0628020895983607e-06, + "loss": 0.6257, + "step": 16615 + }, + { + "epoch": 2.48, + "grad_norm": 4.582719700770154, + "learning_rate": 1.0627056720985462e-06, + "loss": 0.6693, + "step": 16616 + }, + { + "epoch": 2.48, + "grad_norm": 3.1925782586386706, + "learning_rate": 1.0626092540134936e-06, + "loss": 0.6191, + "step": 16617 + }, + { + "epoch": 2.48, + "grad_norm": 4.782339485103639, + "learning_rate": 1.0625128353441034e-06, + "loss": 0.6745, + "step": 16618 + }, + { + "epoch": 2.48, + "grad_norm": 4.448361470873428, + "learning_rate": 1.0624164160912755e-06, + "loss": 0.6686, + "step": 16619 + }, + { + "epoch": 2.48, + "grad_norm": 3.4962292525528964, + "learning_rate": 1.0623199962559095e-06, + "loss": 0.6198, + "step": 16620 + }, + { + "epoch": 2.48, + "grad_norm": 3.163719522122896, + "learning_rate": 1.0622235758389055e-06, + "loss": 0.6328, + "step": 16621 + }, + { + "epoch": 2.48, + "grad_norm": 3.3788994374167447, + "learning_rate": 1.0621271548411634e-06, + "loss": 0.6257, + "step": 16622 + }, + { + "epoch": 2.48, + "grad_norm": 3.7187350554082523, + "learning_rate": 1.0620307332635833e-06, + "loss": 0.625, + "step": 16623 + }, + { + "epoch": 2.48, + "grad_norm": 3.6322324229673524, + "learning_rate": 1.0619343111070645e-06, + "loss": 0.6149, + "step": 16624 + }, + { + "epoch": 2.48, + "grad_norm": 3.967511963074988, + "learning_rate": 1.0618378883725077e-06, + "loss": 0.6803, + "step": 16625 + }, + { + "epoch": 2.48, + "grad_norm": 3.9576871597061416, + "learning_rate": 1.0617414650608118e-06, + "loss": 0.6406, + "step": 16626 + }, + { + "epoch": 2.48, + "grad_norm": 2.9719710495035265, + "learning_rate": 1.061645041172878e-06, + "loss": 0.6029, + "step": 16627 + }, + { + "epoch": 2.48, + "grad_norm": 3.101069648799259, + "learning_rate": 1.0615486167096053e-06, + "loss": 0.6387, + "step": 16628 + }, + { + "epoch": 2.48, + "grad_norm": 3.3765219487202445, + "learning_rate": 1.0614521916718937e-06, + "loss": 0.6569, + "step": 16629 + }, + { + "epoch": 2.48, + "grad_norm": 3.502456121300937, + "learning_rate": 1.0613557660606441e-06, + "loss": 0.709, + "step": 16630 + }, + { + "epoch": 2.48, + "grad_norm": 3.8884898828503958, + "learning_rate": 1.0612593398767548e-06, + "loss": 0.6777, + "step": 16631 + }, + { + "epoch": 2.48, + "grad_norm": 3.9732434346235674, + "learning_rate": 1.0611629131211273e-06, + "loss": 0.6094, + "step": 16632 + }, + { + "epoch": 2.48, + "grad_norm": 2.877016570371078, + "learning_rate": 1.0610664857946607e-06, + "loss": 0.6426, + "step": 16633 + }, + { + "epoch": 2.48, + "grad_norm": 6.764970424275453, + "learning_rate": 1.060970057898255e-06, + "loss": 0.666, + "step": 16634 + }, + { + "epoch": 2.48, + "grad_norm": 2.830504343679963, + "learning_rate": 1.0608736294328106e-06, + "loss": 0.6217, + "step": 16635 + }, + { + "epoch": 2.48, + "grad_norm": 3.4014547161091624, + "learning_rate": 1.0607772003992271e-06, + "loss": 0.6816, + "step": 16636 + }, + { + "epoch": 2.48, + "grad_norm": 3.4024078809750793, + "learning_rate": 1.0606807707984048e-06, + "loss": 0.6289, + "step": 16637 + }, + { + "epoch": 2.48, + "grad_norm": 3.511539968344674, + "learning_rate": 1.0605843406312433e-06, + "loss": 0.6673, + "step": 16638 + }, + { + "epoch": 2.48, + "grad_norm": 2.938726235943024, + "learning_rate": 1.0604879098986428e-06, + "loss": 0.6374, + "step": 16639 + }, + { + "epoch": 2.48, + "grad_norm": 3.1863374529974138, + "learning_rate": 1.0603914786015033e-06, + "loss": 0.5905, + "step": 16640 + }, + { + "epoch": 2.48, + "grad_norm": 3.3884804385245433, + "learning_rate": 1.060295046740725e-06, + "loss": 0.6673, + "step": 16641 + }, + { + "epoch": 2.48, + "grad_norm": 3.352522654112734, + "learning_rate": 1.0601986143172076e-06, + "loss": 0.6146, + "step": 16642 + }, + { + "epoch": 2.48, + "grad_norm": 3.582584574422062, + "learning_rate": 1.0601021813318514e-06, + "loss": 0.6152, + "step": 16643 + }, + { + "epoch": 2.48, + "grad_norm": 4.03727312199283, + "learning_rate": 1.0600057477855557e-06, + "loss": 0.6094, + "step": 16644 + }, + { + "epoch": 2.48, + "grad_norm": 2.7698265030398677, + "learning_rate": 1.0599093136792211e-06, + "loss": 0.6426, + "step": 16645 + }, + { + "epoch": 2.48, + "grad_norm": 5.815466822053905, + "learning_rate": 1.0598128790137482e-06, + "loss": 0.6693, + "step": 16646 + }, + { + "epoch": 2.48, + "grad_norm": 4.074200274951365, + "learning_rate": 1.059716443790036e-06, + "loss": 0.6042, + "step": 16647 + }, + { + "epoch": 2.48, + "grad_norm": 4.200147246107283, + "learning_rate": 1.059620008008985e-06, + "loss": 0.6341, + "step": 16648 + }, + { + "epoch": 2.48, + "grad_norm": 3.2370592803999894, + "learning_rate": 1.059523571671495e-06, + "loss": 0.6777, + "step": 16649 + }, + { + "epoch": 2.48, + "grad_norm": 2.9271323811889163, + "learning_rate": 1.0594271347784664e-06, + "loss": 0.6117, + "step": 16650 + }, + { + "epoch": 2.48, + "grad_norm": 3.5581785425169463, + "learning_rate": 1.0593306973307992e-06, + "loss": 0.6615, + "step": 16651 + }, + { + "epoch": 2.48, + "grad_norm": 3.281993730173031, + "learning_rate": 1.059234259329393e-06, + "loss": 0.6758, + "step": 16652 + }, + { + "epoch": 2.48, + "grad_norm": 4.560553884726162, + "learning_rate": 1.0591378207751488e-06, + "loss": 0.6465, + "step": 16653 + }, + { + "epoch": 2.48, + "grad_norm": 5.036719903257724, + "learning_rate": 1.0590413816689657e-06, + "loss": 0.6634, + "step": 16654 + }, + { + "epoch": 2.48, + "grad_norm": 3.2330018953085102, + "learning_rate": 1.058944942011744e-06, + "loss": 0.627, + "step": 16655 + }, + { + "epoch": 2.48, + "grad_norm": 3.1181705346646535, + "learning_rate": 1.0588485018043842e-06, + "loss": 0.6595, + "step": 16656 + }, + { + "epoch": 2.48, + "grad_norm": 5.970278844457859, + "learning_rate": 1.0587520610477862e-06, + "loss": 0.6517, + "step": 16657 + }, + { + "epoch": 2.48, + "grad_norm": 4.207339918799984, + "learning_rate": 1.0586556197428498e-06, + "loss": 0.6686, + "step": 16658 + }, + { + "epoch": 2.48, + "grad_norm": 4.599215376080897, + "learning_rate": 1.0585591778904755e-06, + "loss": 0.6139, + "step": 16659 + }, + { + "epoch": 2.48, + "grad_norm": 4.873633616316202, + "learning_rate": 1.0584627354915632e-06, + "loss": 0.6471, + "step": 16660 + }, + { + "epoch": 2.48, + "grad_norm": 2.847387235039533, + "learning_rate": 1.0583662925470126e-06, + "loss": 0.6504, + "step": 16661 + }, + { + "epoch": 2.49, + "grad_norm": 5.489494735697636, + "learning_rate": 1.058269849057725e-06, + "loss": 0.6341, + "step": 16662 + }, + { + "epoch": 2.49, + "grad_norm": 4.7207134074131964, + "learning_rate": 1.0581734050245992e-06, + "loss": 0.679, + "step": 16663 + }, + { + "epoch": 2.49, + "grad_norm": 3.044647723352439, + "learning_rate": 1.0580769604485363e-06, + "loss": 0.5892, + "step": 16664 + }, + { + "epoch": 2.49, + "grad_norm": 4.5388649268154255, + "learning_rate": 1.0579805153304354e-06, + "loss": 0.6758, + "step": 16665 + }, + { + "epoch": 2.49, + "grad_norm": 4.313212804699694, + "learning_rate": 1.0578840696711976e-06, + "loss": 0.6328, + "step": 16666 + }, + { + "epoch": 2.49, + "grad_norm": 3.101853052246686, + "learning_rate": 1.0577876234717228e-06, + "loss": 0.6374, + "step": 16667 + }, + { + "epoch": 2.49, + "grad_norm": 3.7580819807143233, + "learning_rate": 1.0576911767329103e-06, + "loss": 0.6829, + "step": 16668 + }, + { + "epoch": 2.49, + "grad_norm": 2.926695154088618, + "learning_rate": 1.0575947294556617e-06, + "loss": 0.6549, + "step": 16669 + }, + { + "epoch": 2.49, + "grad_norm": 2.7749673586828645, + "learning_rate": 1.0574982816408762e-06, + "loss": 0.6159, + "step": 16670 + }, + { + "epoch": 2.49, + "grad_norm": 4.124525747595442, + "learning_rate": 1.0574018332894538e-06, + "loss": 0.6562, + "step": 16671 + }, + { + "epoch": 2.49, + "grad_norm": 3.2244324434274163, + "learning_rate": 1.0573053844022954e-06, + "loss": 0.6257, + "step": 16672 + }, + { + "epoch": 2.49, + "grad_norm": 5.336998121832415, + "learning_rate": 1.057208934980301e-06, + "loss": 0.6217, + "step": 16673 + }, + { + "epoch": 2.49, + "grad_norm": 4.019082338294373, + "learning_rate": 1.0571124850243699e-06, + "loss": 0.6634, + "step": 16674 + }, + { + "epoch": 2.49, + "grad_norm": 3.4491959148669453, + "learning_rate": 1.0570160345354033e-06, + "loss": 0.6693, + "step": 16675 + }, + { + "epoch": 2.49, + "grad_norm": 2.6043250709275116, + "learning_rate": 1.056919583514301e-06, + "loss": 0.6549, + "step": 16676 + }, + { + "epoch": 2.49, + "grad_norm": 2.868362418224682, + "learning_rate": 1.0568231319619632e-06, + "loss": 0.6426, + "step": 16677 + }, + { + "epoch": 2.49, + "grad_norm": 5.310663782437872, + "learning_rate": 1.0567266798792897e-06, + "loss": 0.6328, + "step": 16678 + }, + { + "epoch": 2.49, + "grad_norm": 2.8598871230571565, + "learning_rate": 1.0566302272671813e-06, + "loss": 0.6393, + "step": 16679 + }, + { + "epoch": 2.49, + "grad_norm": 5.5012911232871575, + "learning_rate": 1.0565337741265385e-06, + "loss": 0.64, + "step": 16680 + }, + { + "epoch": 2.49, + "grad_norm": 2.9277283398713165, + "learning_rate": 1.0564373204582603e-06, + "loss": 0.653, + "step": 16681 + }, + { + "epoch": 2.49, + "grad_norm": 3.039165271995656, + "learning_rate": 1.056340866263248e-06, + "loss": 0.6224, + "step": 16682 + }, + { + "epoch": 2.49, + "grad_norm": 3.244326660419785, + "learning_rate": 1.0562444115424014e-06, + "loss": 0.627, + "step": 16683 + }, + { + "epoch": 2.49, + "grad_norm": 3.4415148151340906, + "learning_rate": 1.0561479562966204e-06, + "loss": 0.6094, + "step": 16684 + }, + { + "epoch": 2.49, + "grad_norm": 4.568072653297996, + "learning_rate": 1.0560515005268058e-06, + "loss": 0.6771, + "step": 16685 + }, + { + "epoch": 2.49, + "grad_norm": 2.9141406179170173, + "learning_rate": 1.0559550442338579e-06, + "loss": 0.6315, + "step": 16686 + }, + { + "epoch": 2.49, + "grad_norm": 3.6346712013069107, + "learning_rate": 1.055858587418676e-06, + "loss": 0.6458, + "step": 16687 + }, + { + "epoch": 2.49, + "grad_norm": 2.965162464929123, + "learning_rate": 1.0557621300821613e-06, + "loss": 0.6178, + "step": 16688 + }, + { + "epoch": 2.49, + "grad_norm": 3.2566510461460276, + "learning_rate": 1.0556656722252137e-06, + "loss": 0.623, + "step": 16689 + }, + { + "epoch": 2.49, + "grad_norm": 4.794409157675358, + "learning_rate": 1.0555692138487333e-06, + "loss": 0.666, + "step": 16690 + }, + { + "epoch": 2.49, + "grad_norm": 3.265021112143821, + "learning_rate": 1.0554727549536206e-06, + "loss": 0.6543, + "step": 16691 + }, + { + "epoch": 2.49, + "grad_norm": 3.032405073184279, + "learning_rate": 1.0553762955407757e-06, + "loss": 0.638, + "step": 16692 + }, + { + "epoch": 2.49, + "grad_norm": 4.337590499103263, + "learning_rate": 1.0552798356110994e-06, + "loss": 0.6465, + "step": 16693 + }, + { + "epoch": 2.49, + "grad_norm": 5.158241458150883, + "learning_rate": 1.055183375165491e-06, + "loss": 0.7031, + "step": 16694 + }, + { + "epoch": 2.49, + "grad_norm": 3.802378820596823, + "learning_rate": 1.0550869142048514e-06, + "loss": 0.6699, + "step": 16695 + }, + { + "epoch": 2.49, + "grad_norm": 3.3010590245009515, + "learning_rate": 1.054990452730081e-06, + "loss": 0.6237, + "step": 16696 + }, + { + "epoch": 2.49, + "grad_norm": 6.312385183750721, + "learning_rate": 1.0548939907420795e-06, + "loss": 0.6569, + "step": 16697 + }, + { + "epoch": 2.49, + "grad_norm": 3.024915219980326, + "learning_rate": 1.0547975282417476e-06, + "loss": 0.6237, + "step": 16698 + }, + { + "epoch": 2.49, + "grad_norm": 3.3885570776936893, + "learning_rate": 1.0547010652299857e-06, + "loss": 0.6068, + "step": 16699 + }, + { + "epoch": 2.49, + "grad_norm": 3.769171024613509, + "learning_rate": 1.0546046017076938e-06, + "loss": 0.6504, + "step": 16700 + }, + { + "epoch": 2.49, + "grad_norm": 3.0133488664557677, + "learning_rate": 1.0545081376757722e-06, + "loss": 0.6061, + "step": 16701 + }, + { + "epoch": 2.49, + "grad_norm": 5.243736316639891, + "learning_rate": 1.0544116731351214e-06, + "loss": 0.6439, + "step": 16702 + }, + { + "epoch": 2.49, + "grad_norm": 4.570173735816213, + "learning_rate": 1.0543152080866419e-06, + "loss": 0.6751, + "step": 16703 + }, + { + "epoch": 2.49, + "grad_norm": 4.279643109820581, + "learning_rate": 1.0542187425312335e-06, + "loss": 0.6211, + "step": 16704 + }, + { + "epoch": 2.49, + "grad_norm": 2.717728366042756, + "learning_rate": 1.0541222764697966e-06, + "loss": 0.6374, + "step": 16705 + }, + { + "epoch": 2.49, + "grad_norm": 3.380698415364068, + "learning_rate": 1.054025809903232e-06, + "loss": 0.6328, + "step": 16706 + }, + { + "epoch": 2.49, + "grad_norm": 4.231476291625808, + "learning_rate": 1.0539293428324397e-06, + "loss": 0.6133, + "step": 16707 + }, + { + "epoch": 2.49, + "grad_norm": 3.616093548162246, + "learning_rate": 1.05383287525832e-06, + "loss": 0.7266, + "step": 16708 + }, + { + "epoch": 2.49, + "grad_norm": 5.005052292412537, + "learning_rate": 1.0537364071817737e-06, + "loss": 0.6497, + "step": 16709 + }, + { + "epoch": 2.49, + "grad_norm": 3.6767961023494746, + "learning_rate": 1.0536399386037003e-06, + "loss": 0.6758, + "step": 16710 + }, + { + "epoch": 2.49, + "grad_norm": 3.5507605047762816, + "learning_rate": 1.0535434695250005e-06, + "loss": 0.6341, + "step": 16711 + }, + { + "epoch": 2.49, + "grad_norm": 3.688375936009811, + "learning_rate": 1.053446999946575e-06, + "loss": 0.6198, + "step": 16712 + }, + { + "epoch": 2.49, + "grad_norm": 6.3035526342155945, + "learning_rate": 1.053350529869324e-06, + "loss": 0.6478, + "step": 16713 + }, + { + "epoch": 2.49, + "grad_norm": 4.376976823930491, + "learning_rate": 1.053254059294148e-06, + "loss": 0.6478, + "step": 16714 + }, + { + "epoch": 2.49, + "grad_norm": 2.914395199084868, + "learning_rate": 1.0531575882219466e-06, + "loss": 0.6634, + "step": 16715 + }, + { + "epoch": 2.49, + "grad_norm": 2.756798056678842, + "learning_rate": 1.053061116653621e-06, + "loss": 0.6471, + "step": 16716 + }, + { + "epoch": 2.49, + "grad_norm": 2.944839166750393, + "learning_rate": 1.0529646445900715e-06, + "loss": 0.64, + "step": 16717 + }, + { + "epoch": 2.49, + "grad_norm": 3.2655853285664604, + "learning_rate": 1.052868172032198e-06, + "loss": 0.6243, + "step": 16718 + }, + { + "epoch": 2.49, + "grad_norm": 3.927548861317072, + "learning_rate": 1.0527716989809011e-06, + "loss": 0.5977, + "step": 16719 + }, + { + "epoch": 2.49, + "grad_norm": 4.3443598274521165, + "learning_rate": 1.0526752254370816e-06, + "loss": 0.6224, + "step": 16720 + }, + { + "epoch": 2.49, + "grad_norm": 3.063780809139767, + "learning_rate": 1.0525787514016393e-06, + "loss": 0.6862, + "step": 16721 + }, + { + "epoch": 2.49, + "grad_norm": 3.7977920798587825, + "learning_rate": 1.0524822768754748e-06, + "loss": 0.582, + "step": 16722 + }, + { + "epoch": 2.49, + "grad_norm": 4.182097409983831, + "learning_rate": 1.052385801859489e-06, + "loss": 0.6673, + "step": 16723 + }, + { + "epoch": 2.49, + "grad_norm": 4.56261501170903, + "learning_rate": 1.0522893263545814e-06, + "loss": 0.6445, + "step": 16724 + }, + { + "epoch": 2.49, + "grad_norm": 5.061784718040665, + "learning_rate": 1.0521928503616534e-06, + "loss": 0.6706, + "step": 16725 + }, + { + "epoch": 2.49, + "grad_norm": 4.016458520348658, + "learning_rate": 1.0520963738816045e-06, + "loss": 0.6803, + "step": 16726 + }, + { + "epoch": 2.49, + "grad_norm": 3.0978559758084847, + "learning_rate": 1.0519998969153355e-06, + "loss": 0.6595, + "step": 16727 + }, + { + "epoch": 2.49, + "grad_norm": 5.608128065479814, + "learning_rate": 1.051903419463747e-06, + "loss": 0.6452, + "step": 16728 + }, + { + "epoch": 2.5, + "grad_norm": 4.769415135650263, + "learning_rate": 1.0518069415277393e-06, + "loss": 0.6387, + "step": 16729 + }, + { + "epoch": 2.5, + "grad_norm": 5.441555535563476, + "learning_rate": 1.051710463108213e-06, + "loss": 0.6595, + "step": 16730 + }, + { + "epoch": 2.5, + "grad_norm": 3.6266426018794027, + "learning_rate": 1.051613984206068e-06, + "loss": 0.6094, + "step": 16731 + }, + { + "epoch": 2.5, + "grad_norm": 3.4514590323750554, + "learning_rate": 1.0515175048222055e-06, + "loss": 0.6517, + "step": 16732 + }, + { + "epoch": 2.5, + "grad_norm": 3.011522106382112, + "learning_rate": 1.0514210249575255e-06, + "loss": 0.6289, + "step": 16733 + }, + { + "epoch": 2.5, + "grad_norm": 2.9578704899716106, + "learning_rate": 1.0513245446129282e-06, + "loss": 0.6361, + "step": 16734 + }, + { + "epoch": 2.5, + "grad_norm": 5.299102980851394, + "learning_rate": 1.0512280637893147e-06, + "loss": 0.6315, + "step": 16735 + }, + { + "epoch": 2.5, + "grad_norm": 2.8931812331011773, + "learning_rate": 1.0511315824875852e-06, + "loss": 0.6556, + "step": 16736 + }, + { + "epoch": 2.5, + "grad_norm": 3.2895052345367954, + "learning_rate": 1.0510351007086399e-06, + "loss": 0.6315, + "step": 16737 + }, + { + "epoch": 2.5, + "grad_norm": 3.033732148528424, + "learning_rate": 1.0509386184533795e-06, + "loss": 0.6745, + "step": 16738 + }, + { + "epoch": 2.5, + "grad_norm": 2.896568761994149, + "learning_rate": 1.0508421357227046e-06, + "loss": 0.6413, + "step": 16739 + }, + { + "epoch": 2.5, + "grad_norm": 3.101849273148128, + "learning_rate": 1.0507456525175153e-06, + "loss": 0.6445, + "step": 16740 + }, + { + "epoch": 2.5, + "grad_norm": 4.418778132126597, + "learning_rate": 1.0506491688387128e-06, + "loss": 0.6764, + "step": 16741 + }, + { + "epoch": 2.5, + "grad_norm": 2.9682339880868005, + "learning_rate": 1.0505526846871968e-06, + "loss": 0.6035, + "step": 16742 + }, + { + "epoch": 2.5, + "grad_norm": 2.895897741437786, + "learning_rate": 1.0504562000638683e-06, + "loss": 0.6842, + "step": 16743 + }, + { + "epoch": 2.5, + "grad_norm": 2.7251127461996227, + "learning_rate": 1.0503597149696273e-06, + "loss": 0.6523, + "step": 16744 + }, + { + "epoch": 2.5, + "grad_norm": 3.4211880032357396, + "learning_rate": 1.0502632294053748e-06, + "loss": 0.6621, + "step": 16745 + }, + { + "epoch": 2.5, + "grad_norm": 4.576379416943239, + "learning_rate": 1.0501667433720114e-06, + "loss": 0.6302, + "step": 16746 + }, + { + "epoch": 2.5, + "grad_norm": 4.452159982104053, + "learning_rate": 1.050070256870437e-06, + "loss": 0.6243, + "step": 16747 + }, + { + "epoch": 2.5, + "grad_norm": 3.268584769125812, + "learning_rate": 1.0499737699015528e-06, + "loss": 0.5866, + "step": 16748 + }, + { + "epoch": 2.5, + "grad_norm": 4.360959484686502, + "learning_rate": 1.0498772824662587e-06, + "loss": 0.6901, + "step": 16749 + }, + { + "epoch": 2.5, + "grad_norm": 4.295397658194169, + "learning_rate": 1.0497807945654554e-06, + "loss": 0.6354, + "step": 16750 + }, + { + "epoch": 2.5, + "grad_norm": 4.029717481595826, + "learning_rate": 1.0496843062000437e-06, + "loss": 0.6602, + "step": 16751 + }, + { + "epoch": 2.5, + "grad_norm": 6.544096062369667, + "learning_rate": 1.049587817370924e-06, + "loss": 0.6816, + "step": 16752 + }, + { + "epoch": 2.5, + "grad_norm": 3.941999247644441, + "learning_rate": 1.0494913280789968e-06, + "loss": 0.6829, + "step": 16753 + }, + { + "epoch": 2.5, + "grad_norm": 3.7908619958665755, + "learning_rate": 1.0493948383251628e-06, + "loss": 0.6328, + "step": 16754 + }, + { + "epoch": 2.5, + "grad_norm": 3.6352911035313995, + "learning_rate": 1.0492983481103222e-06, + "loss": 0.6432, + "step": 16755 + }, + { + "epoch": 2.5, + "grad_norm": 5.567766226807062, + "learning_rate": 1.049201857435376e-06, + "loss": 0.6419, + "step": 16756 + }, + { + "epoch": 2.5, + "grad_norm": 3.022678771846916, + "learning_rate": 1.0491053663012242e-06, + "loss": 0.623, + "step": 16757 + }, + { + "epoch": 2.5, + "grad_norm": 4.111370358056637, + "learning_rate": 1.0490088747087678e-06, + "loss": 0.6523, + "step": 16758 + }, + { + "epoch": 2.5, + "grad_norm": 3.3622661802496845, + "learning_rate": 1.0489123826589074e-06, + "loss": 0.6367, + "step": 16759 + }, + { + "epoch": 2.5, + "grad_norm": 3.9601910504267104, + "learning_rate": 1.0488158901525433e-06, + "loss": 0.6322, + "step": 16760 + }, + { + "epoch": 2.5, + "grad_norm": 2.6160107550964744, + "learning_rate": 1.0487193971905758e-06, + "loss": 0.6152, + "step": 16761 + }, + { + "epoch": 2.5, + "grad_norm": 2.8576447882195306, + "learning_rate": 1.0486229037739064e-06, + "loss": 0.6152, + "step": 16762 + }, + { + "epoch": 2.5, + "grad_norm": 2.7153006546520815, + "learning_rate": 1.048526409903435e-06, + "loss": 0.6367, + "step": 16763 + }, + { + "epoch": 2.5, + "grad_norm": 3.0495556127129495, + "learning_rate": 1.0484299155800621e-06, + "loss": 0.6816, + "step": 16764 + }, + { + "epoch": 2.5, + "grad_norm": 4.718525053602093, + "learning_rate": 1.048333420804689e-06, + "loss": 0.6172, + "step": 16765 + }, + { + "epoch": 2.5, + "grad_norm": 3.3418754184824455, + "learning_rate": 1.0482369255782154e-06, + "loss": 0.6706, + "step": 16766 + }, + { + "epoch": 2.5, + "grad_norm": 3.256156679092653, + "learning_rate": 1.0481404299015425e-06, + "loss": 0.6562, + "step": 16767 + }, + { + "epoch": 2.5, + "grad_norm": 4.334017989046861, + "learning_rate": 1.0480439337755705e-06, + "loss": 0.6367, + "step": 16768 + }, + { + "epoch": 2.5, + "grad_norm": 4.974865920204815, + "learning_rate": 1.0479474372012004e-06, + "loss": 0.6302, + "step": 16769 + }, + { + "epoch": 2.5, + "grad_norm": 3.054422475176884, + "learning_rate": 1.0478509401793324e-06, + "loss": 0.5739, + "step": 16770 + }, + { + "epoch": 2.5, + "grad_norm": 3.476320939669179, + "learning_rate": 1.0477544427108674e-06, + "loss": 0.679, + "step": 16771 + }, + { + "epoch": 2.5, + "grad_norm": 3.496081263166006, + "learning_rate": 1.0476579447967061e-06, + "loss": 0.6009, + "step": 16772 + }, + { + "epoch": 2.5, + "grad_norm": 2.9906720425542486, + "learning_rate": 1.0475614464377492e-06, + "loss": 0.6569, + "step": 16773 + }, + { + "epoch": 2.5, + "grad_norm": 3.3642529787155286, + "learning_rate": 1.0474649476348967e-06, + "loss": 0.6322, + "step": 16774 + }, + { + "epoch": 2.5, + "grad_norm": 3.0537632169519098, + "learning_rate": 1.0473684483890499e-06, + "loss": 0.6543, + "step": 16775 + }, + { + "epoch": 2.5, + "grad_norm": 2.792993748673278, + "learning_rate": 1.047271948701109e-06, + "loss": 0.638, + "step": 16776 + }, + { + "epoch": 2.5, + "grad_norm": 3.294510271892601, + "learning_rate": 1.0471754485719749e-06, + "loss": 0.6048, + "step": 16777 + }, + { + "epoch": 2.5, + "grad_norm": 3.371707552064574, + "learning_rate": 1.047078948002548e-06, + "loss": 0.6536, + "step": 16778 + }, + { + "epoch": 2.5, + "grad_norm": 3.384140837912468, + "learning_rate": 1.0469824469937297e-06, + "loss": 0.6667, + "step": 16779 + }, + { + "epoch": 2.5, + "grad_norm": 3.149059903133516, + "learning_rate": 1.0468859455464197e-06, + "loss": 0.6543, + "step": 16780 + }, + { + "epoch": 2.5, + "grad_norm": 4.295928642888411, + "learning_rate": 1.0467894436615188e-06, + "loss": 0.6478, + "step": 16781 + }, + { + "epoch": 2.5, + "grad_norm": 3.8025843787768348, + "learning_rate": 1.0466929413399279e-06, + "loss": 0.6471, + "step": 16782 + }, + { + "epoch": 2.5, + "grad_norm": 4.366872975144451, + "learning_rate": 1.046596438582548e-06, + "loss": 0.6888, + "step": 16783 + }, + { + "epoch": 2.5, + "grad_norm": 3.389247387421261, + "learning_rate": 1.046499935390279e-06, + "loss": 0.6315, + "step": 16784 + }, + { + "epoch": 2.5, + "grad_norm": 3.1100411519302362, + "learning_rate": 1.0464034317640226e-06, + "loss": 0.6784, + "step": 16785 + }, + { + "epoch": 2.5, + "grad_norm": 3.4815814861314633, + "learning_rate": 1.0463069277046783e-06, + "loss": 0.7305, + "step": 16786 + }, + { + "epoch": 2.5, + "grad_norm": 4.985796946673073, + "learning_rate": 1.0462104232131474e-06, + "loss": 0.6589, + "step": 16787 + }, + { + "epoch": 2.5, + "grad_norm": 2.8183534558678915, + "learning_rate": 1.0461139182903308e-06, + "loss": 0.6615, + "step": 16788 + }, + { + "epoch": 2.5, + "grad_norm": 3.2151204026361437, + "learning_rate": 1.046017412937129e-06, + "loss": 0.6452, + "step": 16789 + }, + { + "epoch": 2.5, + "grad_norm": 3.485129235655617, + "learning_rate": 1.0459209071544422e-06, + "loss": 0.6582, + "step": 16790 + }, + { + "epoch": 2.5, + "grad_norm": 3.003686178808569, + "learning_rate": 1.0458244009431718e-06, + "loss": 0.6595, + "step": 16791 + }, + { + "epoch": 2.5, + "grad_norm": 4.050748923792225, + "learning_rate": 1.0457278943042182e-06, + "loss": 0.6178, + "step": 16792 + }, + { + "epoch": 2.5, + "grad_norm": 4.398489534484192, + "learning_rate": 1.045631387238482e-06, + "loss": 0.6257, + "step": 16793 + }, + { + "epoch": 2.5, + "grad_norm": 3.813938637735066, + "learning_rate": 1.045534879746864e-06, + "loss": 0.6758, + "step": 16794 + }, + { + "epoch": 2.5, + "grad_norm": 2.897223892881052, + "learning_rate": 1.045438371830265e-06, + "loss": 0.6536, + "step": 16795 + }, + { + "epoch": 2.5, + "grad_norm": 3.0085196097650293, + "learning_rate": 1.0453418634895858e-06, + "loss": 0.6777, + "step": 16796 + }, + { + "epoch": 2.51, + "grad_norm": 3.0780471664656255, + "learning_rate": 1.0452453547257268e-06, + "loss": 0.6393, + "step": 16797 + }, + { + "epoch": 2.51, + "grad_norm": 3.517126195748907, + "learning_rate": 1.0451488455395892e-06, + "loss": 0.6536, + "step": 16798 + }, + { + "epoch": 2.51, + "grad_norm": 3.5176745214858913, + "learning_rate": 1.045052335932073e-06, + "loss": 0.6426, + "step": 16799 + }, + { + "epoch": 2.51, + "grad_norm": 3.4632282609999216, + "learning_rate": 1.0449558259040797e-06, + "loss": 0.6868, + "step": 16800 + }, + { + "epoch": 2.51, + "grad_norm": 2.6508244282627706, + "learning_rate": 1.0448593154565095e-06, + "loss": 0.6478, + "step": 16801 + }, + { + "epoch": 2.51, + "grad_norm": 4.769249897437627, + "learning_rate": 1.0447628045902635e-06, + "loss": 0.6309, + "step": 16802 + }, + { + "epoch": 2.51, + "grad_norm": 2.579583371805833, + "learning_rate": 1.044666293306242e-06, + "loss": 0.6309, + "step": 16803 + }, + { + "epoch": 2.51, + "grad_norm": 4.466951277208296, + "learning_rate": 1.0445697816053462e-06, + "loss": 0.6367, + "step": 16804 + }, + { + "epoch": 2.51, + "grad_norm": 2.534608687480673, + "learning_rate": 1.0444732694884767e-06, + "loss": 0.6432, + "step": 16805 + }, + { + "epoch": 2.51, + "grad_norm": 2.7093044289001287, + "learning_rate": 1.0443767569565343e-06, + "loss": 0.6458, + "step": 16806 + }, + { + "epoch": 2.51, + "grad_norm": 3.189676580540413, + "learning_rate": 1.0442802440104198e-06, + "loss": 0.6133, + "step": 16807 + }, + { + "epoch": 2.51, + "grad_norm": 3.1595802581092363, + "learning_rate": 1.0441837306510337e-06, + "loss": 0.6419, + "step": 16808 + }, + { + "epoch": 2.51, + "grad_norm": 5.377334378186979, + "learning_rate": 1.044087216879277e-06, + "loss": 0.6094, + "step": 16809 + }, + { + "epoch": 2.51, + "grad_norm": 2.8307888316350693, + "learning_rate": 1.0439907026960504e-06, + "loss": 0.6276, + "step": 16810 + }, + { + "epoch": 2.51, + "grad_norm": 2.7086016944132236, + "learning_rate": 1.0438941881022544e-06, + "loss": 0.6309, + "step": 16811 + }, + { + "epoch": 2.51, + "grad_norm": 3.496447590868196, + "learning_rate": 1.0437976730987905e-06, + "loss": 0.6497, + "step": 16812 + }, + { + "epoch": 2.51, + "grad_norm": 2.5476043240475006, + "learning_rate": 1.0437011576865589e-06, + "loss": 0.6237, + "step": 16813 + }, + { + "epoch": 2.51, + "grad_norm": 2.8274388750962447, + "learning_rate": 1.0436046418664603e-06, + "loss": 0.6543, + "step": 16814 + }, + { + "epoch": 2.51, + "grad_norm": 3.4005856977557203, + "learning_rate": 1.0435081256393961e-06, + "loss": 0.653, + "step": 16815 + }, + { + "epoch": 2.51, + "grad_norm": 5.85829946354656, + "learning_rate": 1.0434116090062663e-06, + "loss": 0.6491, + "step": 16816 + }, + { + "epoch": 2.51, + "grad_norm": 3.0806542652550637, + "learning_rate": 1.0433150919679726e-06, + "loss": 0.6523, + "step": 16817 + }, + { + "epoch": 2.51, + "grad_norm": 2.8663948378952506, + "learning_rate": 1.043218574525415e-06, + "loss": 0.6087, + "step": 16818 + }, + { + "epoch": 2.51, + "grad_norm": 3.05654041414428, + "learning_rate": 1.0431220566794945e-06, + "loss": 0.6452, + "step": 16819 + }, + { + "epoch": 2.51, + "grad_norm": 3.124031016798399, + "learning_rate": 1.0430255384311124e-06, + "loss": 0.6146, + "step": 16820 + }, + { + "epoch": 2.51, + "grad_norm": 3.6244175208647893, + "learning_rate": 1.0429290197811686e-06, + "loss": 0.5983, + "step": 16821 + }, + { + "epoch": 2.51, + "grad_norm": 2.94898598530027, + "learning_rate": 1.042832500730565e-06, + "loss": 0.6432, + "step": 16822 + }, + { + "epoch": 2.51, + "grad_norm": 4.130353366002865, + "learning_rate": 1.0427359812802016e-06, + "loss": 0.6484, + "step": 16823 + }, + { + "epoch": 2.51, + "grad_norm": 4.266708555630383, + "learning_rate": 1.0426394614309798e-06, + "loss": 0.6458, + "step": 16824 + }, + { + "epoch": 2.51, + "grad_norm": 5.2322756531026196, + "learning_rate": 1.0425429411838e-06, + "loss": 0.6315, + "step": 16825 + }, + { + "epoch": 2.51, + "grad_norm": 3.3430377909551647, + "learning_rate": 1.0424464205395632e-06, + "loss": 0.6289, + "step": 16826 + }, + { + "epoch": 2.51, + "grad_norm": 2.6507643804923835, + "learning_rate": 1.0423498994991697e-06, + "loss": 0.61, + "step": 16827 + }, + { + "epoch": 2.51, + "grad_norm": 3.5917225126610304, + "learning_rate": 1.0422533780635216e-06, + "loss": 0.6367, + "step": 16828 + }, + { + "epoch": 2.51, + "grad_norm": 3.6280323187597783, + "learning_rate": 1.0421568562335187e-06, + "loss": 0.6549, + "step": 16829 + }, + { + "epoch": 2.51, + "grad_norm": 3.0619797283483043, + "learning_rate": 1.042060334010062e-06, + "loss": 0.6387, + "step": 16830 + }, + { + "epoch": 2.51, + "grad_norm": 3.2535396578254043, + "learning_rate": 1.0419638113940523e-06, + "loss": 0.6296, + "step": 16831 + }, + { + "epoch": 2.51, + "grad_norm": 3.375827777913524, + "learning_rate": 1.041867288386391e-06, + "loss": 0.6328, + "step": 16832 + }, + { + "epoch": 2.51, + "grad_norm": 3.652719385605637, + "learning_rate": 1.0417707649879786e-06, + "loss": 0.6283, + "step": 16833 + }, + { + "epoch": 2.51, + "grad_norm": 4.289764564048605, + "learning_rate": 1.0416742411997157e-06, + "loss": 0.6064, + "step": 16834 + }, + { + "epoch": 2.51, + "grad_norm": 3.6811402740105716, + "learning_rate": 1.0415777170225036e-06, + "loss": 0.6732, + "step": 16835 + }, + { + "epoch": 2.51, + "grad_norm": 3.04704352376943, + "learning_rate": 1.041481192457243e-06, + "loss": 0.5729, + "step": 16836 + }, + { + "epoch": 2.51, + "grad_norm": 3.172247387647493, + "learning_rate": 1.0413846675048345e-06, + "loss": 0.6465, + "step": 16837 + }, + { + "epoch": 2.51, + "grad_norm": 3.169652268384149, + "learning_rate": 1.0412881421661797e-06, + "loss": 0.6393, + "step": 16838 + }, + { + "epoch": 2.51, + "grad_norm": 3.2611872674461173, + "learning_rate": 1.0411916164421789e-06, + "loss": 0.6406, + "step": 16839 + }, + { + "epoch": 2.51, + "grad_norm": 3.912198703220977, + "learning_rate": 1.0410950903337325e-06, + "loss": 0.6745, + "step": 16840 + }, + { + "epoch": 2.51, + "grad_norm": 4.90658028254536, + "learning_rate": 1.0409985638417428e-06, + "loss": 0.6107, + "step": 16841 + }, + { + "epoch": 2.51, + "grad_norm": 3.168900241529142, + "learning_rate": 1.0409020369671096e-06, + "loss": 0.6836, + "step": 16842 + }, + { + "epoch": 2.51, + "grad_norm": 5.905280853854452, + "learning_rate": 1.0408055097107337e-06, + "loss": 0.6113, + "step": 16843 + }, + { + "epoch": 2.51, + "grad_norm": 3.480622800575258, + "learning_rate": 1.040708982073517e-06, + "loss": 0.5729, + "step": 16844 + }, + { + "epoch": 2.51, + "grad_norm": 5.715339276559126, + "learning_rate": 1.0406124540563591e-06, + "loss": 0.6576, + "step": 16845 + }, + { + "epoch": 2.51, + "grad_norm": 5.262382940583709, + "learning_rate": 1.0405159256601623e-06, + "loss": 0.6602, + "step": 16846 + }, + { + "epoch": 2.51, + "grad_norm": 3.4040865945744705, + "learning_rate": 1.040419396885826e-06, + "loss": 0.6536, + "step": 16847 + }, + { + "epoch": 2.51, + "grad_norm": 3.8465753603918986, + "learning_rate": 1.0403228677342527e-06, + "loss": 0.6286, + "step": 16848 + }, + { + "epoch": 2.51, + "grad_norm": 3.8638413179273825, + "learning_rate": 1.0402263382063421e-06, + "loss": 0.6628, + "step": 16849 + }, + { + "epoch": 2.51, + "grad_norm": 3.071647709695875, + "learning_rate": 1.0401298083029953e-06, + "loss": 0.6693, + "step": 16850 + }, + { + "epoch": 2.51, + "grad_norm": 6.135104193441353, + "learning_rate": 1.0400332780251137e-06, + "loss": 0.6562, + "step": 16851 + }, + { + "epoch": 2.51, + "grad_norm": 3.9764113992256758, + "learning_rate": 1.039936747373598e-06, + "loss": 0.6081, + "step": 16852 + }, + { + "epoch": 2.51, + "grad_norm": 5.036487214241292, + "learning_rate": 1.039840216349349e-06, + "loss": 0.6022, + "step": 16853 + }, + { + "epoch": 2.51, + "grad_norm": 3.3880312819673533, + "learning_rate": 1.0397436849532679e-06, + "loss": 0.6289, + "step": 16854 + }, + { + "epoch": 2.51, + "grad_norm": 3.395276063035004, + "learning_rate": 1.0396471531862554e-06, + "loss": 0.6107, + "step": 16855 + }, + { + "epoch": 2.51, + "grad_norm": 5.695329658150383, + "learning_rate": 1.0395506210492125e-06, + "loss": 0.5833, + "step": 16856 + }, + { + "epoch": 2.51, + "grad_norm": 2.9039239352067248, + "learning_rate": 1.03945408854304e-06, + "loss": 0.6178, + "step": 16857 + }, + { + "epoch": 2.51, + "grad_norm": 3.3885379673690332, + "learning_rate": 1.0393575556686394e-06, + "loss": 0.5833, + "step": 16858 + }, + { + "epoch": 2.51, + "grad_norm": 3.7203741123696816, + "learning_rate": 1.0392610224269108e-06, + "loss": 0.6439, + "step": 16859 + }, + { + "epoch": 2.51, + "grad_norm": 3.6777559271010483, + "learning_rate": 1.0391644888187559e-06, + "loss": 0.6497, + "step": 16860 + }, + { + "epoch": 2.51, + "grad_norm": 3.1841910598561585, + "learning_rate": 1.0390679548450754e-06, + "loss": 0.61, + "step": 16861 + }, + { + "epoch": 2.51, + "grad_norm": 3.1327452340226136, + "learning_rate": 1.03897142050677e-06, + "loss": 0.5918, + "step": 16862 + }, + { + "epoch": 2.51, + "grad_norm": 6.473037429079463, + "learning_rate": 1.038874885804741e-06, + "loss": 0.5749, + "step": 16863 + }, + { + "epoch": 2.52, + "grad_norm": 3.1180108314384625, + "learning_rate": 1.0387783507398891e-06, + "loss": 0.6133, + "step": 16864 + }, + { + "epoch": 2.52, + "grad_norm": 3.14282703861017, + "learning_rate": 1.0386818153131158e-06, + "loss": 0.6276, + "step": 16865 + }, + { + "epoch": 2.52, + "grad_norm": 3.626832529036941, + "learning_rate": 1.0385852795253211e-06, + "loss": 0.6035, + "step": 16866 + }, + { + "epoch": 2.52, + "grad_norm": 4.468196128621624, + "learning_rate": 1.0384887433774072e-06, + "loss": 0.6165, + "step": 16867 + }, + { + "epoch": 2.52, + "grad_norm": 5.385336341412554, + "learning_rate": 1.0383922068702742e-06, + "loss": 0.6243, + "step": 16868 + }, + { + "epoch": 2.52, + "grad_norm": 6.518745311362683, + "learning_rate": 1.038295670004823e-06, + "loss": 0.7253, + "step": 16869 + }, + { + "epoch": 2.52, + "grad_norm": 3.595794793416118, + "learning_rate": 1.0381991327819555e-06, + "loss": 0.6517, + "step": 16870 + }, + { + "epoch": 2.52, + "grad_norm": 5.327302818124801, + "learning_rate": 1.038102595202572e-06, + "loss": 0.6315, + "step": 16871 + }, + { + "epoch": 2.52, + "grad_norm": 3.864622619111131, + "learning_rate": 1.0380060572675734e-06, + "loss": 0.6465, + "step": 16872 + }, + { + "epoch": 2.52, + "grad_norm": 3.8697149342752524, + "learning_rate": 1.0379095189778608e-06, + "loss": 0.6191, + "step": 16873 + }, + { + "epoch": 2.52, + "grad_norm": 3.935109975975967, + "learning_rate": 1.0378129803343357e-06, + "loss": 0.5951, + "step": 16874 + }, + { + "epoch": 2.52, + "grad_norm": 4.717897118414862, + "learning_rate": 1.0377164413378984e-06, + "loss": 0.61, + "step": 16875 + }, + { + "epoch": 2.52, + "grad_norm": 3.6887003137918732, + "learning_rate": 1.0376199019894502e-06, + "loss": 0.5905, + "step": 16876 + }, + { + "epoch": 2.52, + "grad_norm": 3.3232395517461955, + "learning_rate": 1.0375233622898922e-06, + "loss": 0.5859, + "step": 16877 + }, + { + "epoch": 2.52, + "grad_norm": 3.536613367417289, + "learning_rate": 1.0374268222401257e-06, + "loss": 0.6191, + "step": 16878 + }, + { + "epoch": 2.52, + "grad_norm": 3.4562617557235753, + "learning_rate": 1.0373302818410508e-06, + "loss": 0.6406, + "step": 16879 + }, + { + "epoch": 2.52, + "grad_norm": 3.7614115339185172, + "learning_rate": 1.0372337410935695e-06, + "loss": 0.5625, + "step": 16880 + }, + { + "epoch": 2.52, + "grad_norm": 4.3207548200839225, + "learning_rate": 1.0371371999985823e-06, + "loss": 0.696, + "step": 16881 + }, + { + "epoch": 2.52, + "grad_norm": 6.745829387297003, + "learning_rate": 1.0370406585569899e-06, + "loss": 0.6608, + "step": 16882 + }, + { + "epoch": 2.52, + "grad_norm": 4.459179359641243, + "learning_rate": 1.036944116769694e-06, + "loss": 0.5859, + "step": 16883 + }, + { + "epoch": 2.52, + "grad_norm": 3.7923002913373485, + "learning_rate": 1.0368475746375958e-06, + "loss": 0.6139, + "step": 16884 + }, + { + "epoch": 2.52, + "grad_norm": 3.325384526677914, + "learning_rate": 1.0367510321615955e-06, + "loss": 0.5586, + "step": 16885 + }, + { + "epoch": 2.52, + "grad_norm": 3.5203447966497903, + "learning_rate": 1.036654489342595e-06, + "loss": 0.5898, + "step": 16886 + }, + { + "epoch": 2.52, + "grad_norm": 4.044791252459728, + "learning_rate": 1.0365579461814943e-06, + "loss": 0.6732, + "step": 16887 + }, + { + "epoch": 2.52, + "grad_norm": 3.790810237550608, + "learning_rate": 1.0364614026791954e-06, + "loss": 0.6862, + "step": 16888 + }, + { + "epoch": 2.52, + "grad_norm": 4.254415533328343, + "learning_rate": 1.036364858836599e-06, + "loss": 0.6712, + "step": 16889 + }, + { + "epoch": 2.52, + "grad_norm": 3.6322385147089205, + "learning_rate": 1.0362683146546059e-06, + "loss": 0.6439, + "step": 16890 + }, + { + "epoch": 2.52, + "grad_norm": 3.956128850478375, + "learning_rate": 1.036171770134118e-06, + "loss": 0.627, + "step": 16891 + }, + { + "epoch": 2.52, + "grad_norm": 4.059931893321662, + "learning_rate": 1.0360752252760353e-06, + "loss": 0.694, + "step": 16892 + }, + { + "epoch": 2.52, + "grad_norm": 3.6495638765664395, + "learning_rate": 1.0359786800812591e-06, + "loss": 0.6198, + "step": 16893 + }, + { + "epoch": 2.52, + "grad_norm": 3.7291146911585407, + "learning_rate": 1.0358821345506914e-06, + "loss": 0.6654, + "step": 16894 + }, + { + "epoch": 2.52, + "grad_norm": 3.595924551402929, + "learning_rate": 1.035785588685232e-06, + "loss": 0.6354, + "step": 16895 + }, + { + "epoch": 2.52, + "grad_norm": 4.591699386948632, + "learning_rate": 1.0356890424857829e-06, + "loss": 0.6289, + "step": 16896 + }, + { + "epoch": 2.52, + "grad_norm": 5.574264984428751, + "learning_rate": 1.0355924959532444e-06, + "loss": 0.6263, + "step": 16897 + }, + { + "epoch": 2.52, + "grad_norm": 3.521292964190738, + "learning_rate": 1.0354959490885182e-06, + "loss": 0.6719, + "step": 16898 + }, + { + "epoch": 2.52, + "grad_norm": 3.2317268403116177, + "learning_rate": 1.0353994018925054e-06, + "loss": 0.6875, + "step": 16899 + }, + { + "epoch": 2.52, + "grad_norm": 4.63727486693808, + "learning_rate": 1.0353028543661065e-06, + "loss": 0.6341, + "step": 16900 + }, + { + "epoch": 2.52, + "grad_norm": 3.7849971217273124, + "learning_rate": 1.0352063065102233e-06, + "loss": 0.584, + "step": 16901 + }, + { + "epoch": 2.52, + "grad_norm": 4.375820447758978, + "learning_rate": 1.0351097583257563e-06, + "loss": 0.6289, + "step": 16902 + }, + { + "epoch": 2.52, + "grad_norm": 3.9423738166019895, + "learning_rate": 1.0350132098136067e-06, + "loss": 0.6276, + "step": 16903 + }, + { + "epoch": 2.52, + "grad_norm": 3.587760954863385, + "learning_rate": 1.0349166609746758e-06, + "loss": 0.6016, + "step": 16904 + }, + { + "epoch": 2.52, + "grad_norm": 3.106410507306074, + "learning_rate": 1.0348201118098647e-06, + "loss": 0.6504, + "step": 16905 + }, + { + "epoch": 2.52, + "grad_norm": 3.9787260095441273, + "learning_rate": 1.0347235623200742e-06, + "loss": 0.6621, + "step": 16906 + }, + { + "epoch": 2.52, + "grad_norm": 3.3421683733743284, + "learning_rate": 1.034627012506206e-06, + "loss": 0.6621, + "step": 16907 + }, + { + "epoch": 2.52, + "grad_norm": 5.63148219550678, + "learning_rate": 1.0345304623691606e-06, + "loss": 0.6647, + "step": 16908 + }, + { + "epoch": 2.52, + "grad_norm": 4.227222648296845, + "learning_rate": 1.0344339119098393e-06, + "loss": 0.6296, + "step": 16909 + }, + { + "epoch": 2.52, + "grad_norm": 5.803856408744923, + "learning_rate": 1.0343373611291432e-06, + "loss": 0.6426, + "step": 16910 + }, + { + "epoch": 2.52, + "grad_norm": 3.3252775141379436, + "learning_rate": 1.034240810027974e-06, + "loss": 0.6549, + "step": 16911 + }, + { + "epoch": 2.52, + "grad_norm": 3.1476286315965956, + "learning_rate": 1.0341442586072318e-06, + "loss": 0.6517, + "step": 16912 + }, + { + "epoch": 2.52, + "grad_norm": 3.342360864501622, + "learning_rate": 1.034047706867818e-06, + "loss": 0.6315, + "step": 16913 + }, + { + "epoch": 2.52, + "grad_norm": 3.3686264303463855, + "learning_rate": 1.0339511548106345e-06, + "loss": 0.6569, + "step": 16914 + }, + { + "epoch": 2.52, + "grad_norm": 3.383742015239332, + "learning_rate": 1.0338546024365818e-06, + "loss": 0.6458, + "step": 16915 + }, + { + "epoch": 2.52, + "grad_norm": 3.403902516941529, + "learning_rate": 1.0337580497465606e-06, + "loss": 0.6296, + "step": 16916 + }, + { + "epoch": 2.52, + "grad_norm": 2.928045948441593, + "learning_rate": 1.033661496741473e-06, + "loss": 0.6315, + "step": 16917 + }, + { + "epoch": 2.52, + "grad_norm": 2.956286664282401, + "learning_rate": 1.0335649434222196e-06, + "loss": 0.6549, + "step": 16918 + }, + { + "epoch": 2.52, + "grad_norm": 3.7099256663336013, + "learning_rate": 1.0334683897897015e-06, + "loss": 0.6999, + "step": 16919 + }, + { + "epoch": 2.52, + "grad_norm": 3.5799919071305224, + "learning_rate": 1.03337183584482e-06, + "loss": 0.6302, + "step": 16920 + }, + { + "epoch": 2.52, + "grad_norm": 3.746335990017424, + "learning_rate": 1.0332752815884763e-06, + "loss": 0.6432, + "step": 16921 + }, + { + "epoch": 2.52, + "grad_norm": 3.0203873408817477, + "learning_rate": 1.033178727021571e-06, + "loss": 0.6126, + "step": 16922 + }, + { + "epoch": 2.52, + "grad_norm": 3.276576389073642, + "learning_rate": 1.0330821721450064e-06, + "loss": 0.6084, + "step": 16923 + }, + { + "epoch": 2.52, + "grad_norm": 3.3958092069765593, + "learning_rate": 1.0329856169596826e-06, + "loss": 0.6439, + "step": 16924 + }, + { + "epoch": 2.52, + "grad_norm": 3.2557450818829885, + "learning_rate": 1.032889061466501e-06, + "loss": 0.625, + "step": 16925 + }, + { + "epoch": 2.52, + "grad_norm": 2.8138592441800476, + "learning_rate": 1.032792505666363e-06, + "loss": 0.6413, + "step": 16926 + }, + { + "epoch": 2.52, + "grad_norm": 4.2377670908764, + "learning_rate": 1.0326959495601699e-06, + "loss": 0.6276, + "step": 16927 + }, + { + "epoch": 2.52, + "grad_norm": 4.996683443409343, + "learning_rate": 1.0325993931488223e-06, + "loss": 0.6491, + "step": 16928 + }, + { + "epoch": 2.52, + "grad_norm": 3.033095670463723, + "learning_rate": 1.0325028364332218e-06, + "loss": 0.6348, + "step": 16929 + }, + { + "epoch": 2.52, + "grad_norm": 5.37890379229039, + "learning_rate": 1.032406279414269e-06, + "loss": 0.6393, + "step": 16930 + }, + { + "epoch": 2.53, + "grad_norm": 3.0587714988480887, + "learning_rate": 1.0323097220928665e-06, + "loss": 0.609, + "step": 16931 + }, + { + "epoch": 2.53, + "grad_norm": 3.870848905582098, + "learning_rate": 1.0322131644699137e-06, + "loss": 0.6634, + "step": 16932 + }, + { + "epoch": 2.53, + "grad_norm": 3.2644115480494134, + "learning_rate": 1.0321166065463129e-06, + "loss": 0.6608, + "step": 16933 + }, + { + "epoch": 2.53, + "grad_norm": 3.8621008009384044, + "learning_rate": 1.032020048322965e-06, + "loss": 0.6178, + "step": 16934 + }, + { + "epoch": 2.53, + "grad_norm": 2.631854640501059, + "learning_rate": 1.031923489800771e-06, + "loss": 0.6276, + "step": 16935 + }, + { + "epoch": 2.53, + "grad_norm": 3.7336011665859177, + "learning_rate": 1.0318269309806321e-06, + "loss": 0.6296, + "step": 16936 + }, + { + "epoch": 2.53, + "grad_norm": 4.022436065437599, + "learning_rate": 1.0317303718634503e-06, + "loss": 0.6471, + "step": 16937 + }, + { + "epoch": 2.53, + "grad_norm": 2.576371112325695, + "learning_rate": 1.0316338124501254e-06, + "loss": 0.6263, + "step": 16938 + }, + { + "epoch": 2.53, + "grad_norm": 3.69136043643536, + "learning_rate": 1.0315372527415598e-06, + "loss": 0.6335, + "step": 16939 + }, + { + "epoch": 2.53, + "grad_norm": 2.508859877332182, + "learning_rate": 1.0314406927386538e-06, + "loss": 0.6243, + "step": 16940 + }, + { + "epoch": 2.53, + "grad_norm": 4.141714704748336, + "learning_rate": 1.0313441324423096e-06, + "loss": 0.6204, + "step": 16941 + }, + { + "epoch": 2.53, + "grad_norm": 3.886585082964473, + "learning_rate": 1.0312475718534273e-06, + "loss": 0.6706, + "step": 16942 + }, + { + "epoch": 2.53, + "grad_norm": 3.3342934096397627, + "learning_rate": 1.0311510109729087e-06, + "loss": 0.6296, + "step": 16943 + }, + { + "epoch": 2.53, + "grad_norm": 3.1539401952336776, + "learning_rate": 1.0310544498016555e-06, + "loss": 0.6042, + "step": 16944 + }, + { + "epoch": 2.53, + "grad_norm": 5.2137392285151805, + "learning_rate": 1.0309578883405679e-06, + "loss": 0.6523, + "step": 16945 + }, + { + "epoch": 2.53, + "grad_norm": 3.1430680961427564, + "learning_rate": 1.0308613265905476e-06, + "loss": 0.6315, + "step": 16946 + }, + { + "epoch": 2.53, + "grad_norm": 3.3096091617782952, + "learning_rate": 1.0307647645524961e-06, + "loss": 0.6986, + "step": 16947 + }, + { + "epoch": 2.53, + "grad_norm": 2.8193556925369374, + "learning_rate": 1.0306682022273138e-06, + "loss": 0.5951, + "step": 16948 + }, + { + "epoch": 2.53, + "grad_norm": 6.471368748115347, + "learning_rate": 1.030571639615903e-06, + "loss": 0.6263, + "step": 16949 + }, + { + "epoch": 2.53, + "grad_norm": 4.165546528787982, + "learning_rate": 1.0304750767191644e-06, + "loss": 0.5993, + "step": 16950 + }, + { + "epoch": 2.53, + "grad_norm": 2.7122828631331304, + "learning_rate": 1.0303785135379987e-06, + "loss": 0.6302, + "step": 16951 + }, + { + "epoch": 2.53, + "grad_norm": 2.929622944754282, + "learning_rate": 1.030281950073308e-06, + "loss": 0.668, + "step": 16952 + }, + { + "epoch": 2.53, + "grad_norm": 4.885184290853034, + "learning_rate": 1.0301853863259932e-06, + "loss": 0.6576, + "step": 16953 + }, + { + "epoch": 2.53, + "grad_norm": 3.2622111170047163, + "learning_rate": 1.0300888222969553e-06, + "loss": 0.6413, + "step": 16954 + }, + { + "epoch": 2.53, + "grad_norm": 3.013025497622891, + "learning_rate": 1.029992257987096e-06, + "loss": 0.6686, + "step": 16955 + }, + { + "epoch": 2.53, + "grad_norm": 3.2390687374288483, + "learning_rate": 1.0298956933973159e-06, + "loss": 0.6289, + "step": 16956 + }, + { + "epoch": 2.53, + "grad_norm": 3.571458793732338, + "learning_rate": 1.0297991285285175e-06, + "loss": 0.6712, + "step": 16957 + }, + { + "epoch": 2.53, + "grad_norm": 4.7870080295711706, + "learning_rate": 1.0297025633816004e-06, + "loss": 0.6328, + "step": 16958 + }, + { + "epoch": 2.53, + "grad_norm": 4.269729123902515, + "learning_rate": 1.0296059979574667e-06, + "loss": 0.6784, + "step": 16959 + }, + { + "epoch": 2.53, + "grad_norm": 3.0289114903120464, + "learning_rate": 1.0295094322570181e-06, + "loss": 0.6283, + "step": 16960 + }, + { + "epoch": 2.53, + "grad_norm": 5.155955313285144, + "learning_rate": 1.029412866281155e-06, + "loss": 0.6549, + "step": 16961 + }, + { + "epoch": 2.53, + "grad_norm": 2.748958958076715, + "learning_rate": 1.0293163000307791e-06, + "loss": 0.6536, + "step": 16962 + }, + { + "epoch": 2.53, + "grad_norm": 2.924767533611446, + "learning_rate": 1.0292197335067914e-06, + "loss": 0.6094, + "step": 16963 + }, + { + "epoch": 2.53, + "grad_norm": 3.138244400654443, + "learning_rate": 1.0291231667100938e-06, + "loss": 0.6562, + "step": 16964 + }, + { + "epoch": 2.53, + "grad_norm": 3.0473493295180787, + "learning_rate": 1.0290265996415868e-06, + "loss": 0.6048, + "step": 16965 + }, + { + "epoch": 2.53, + "grad_norm": 7.916954519236495, + "learning_rate": 1.0289300323021717e-06, + "loss": 0.6576, + "step": 16966 + }, + { + "epoch": 2.53, + "grad_norm": 2.748272753118663, + "learning_rate": 1.0288334646927504e-06, + "loss": 0.6055, + "step": 16967 + }, + { + "epoch": 2.53, + "grad_norm": 4.624423801116614, + "learning_rate": 1.0287368968142238e-06, + "loss": 0.6556, + "step": 16968 + }, + { + "epoch": 2.53, + "grad_norm": 5.534704487828142, + "learning_rate": 1.0286403286674933e-06, + "loss": 0.6855, + "step": 16969 + }, + { + "epoch": 2.53, + "grad_norm": 5.913687908816576, + "learning_rate": 1.02854376025346e-06, + "loss": 0.5957, + "step": 16970 + }, + { + "epoch": 2.53, + "grad_norm": 3.493494314884079, + "learning_rate": 1.0284471915730251e-06, + "loss": 0.64, + "step": 16971 + }, + { + "epoch": 2.53, + "grad_norm": 4.030761023811398, + "learning_rate": 1.0283506226270898e-06, + "loss": 0.6712, + "step": 16972 + }, + { + "epoch": 2.53, + "grad_norm": 3.103160413351532, + "learning_rate": 1.0282540534165562e-06, + "loss": 0.6393, + "step": 16973 + }, + { + "epoch": 2.53, + "grad_norm": 2.7894548361082463, + "learning_rate": 1.028157483942325e-06, + "loss": 0.5944, + "step": 16974 + }, + { + "epoch": 2.53, + "grad_norm": 3.704451461619044, + "learning_rate": 1.0280609142052971e-06, + "loss": 0.6237, + "step": 16975 + }, + { + "epoch": 2.53, + "grad_norm": 4.649757037100336, + "learning_rate": 1.0279643442063745e-06, + "loss": 0.6784, + "step": 16976 + }, + { + "epoch": 2.53, + "grad_norm": 3.237382350531514, + "learning_rate": 1.0278677739464583e-06, + "loss": 0.668, + "step": 16977 + }, + { + "epoch": 2.53, + "grad_norm": 2.9009242251303857, + "learning_rate": 1.0277712034264495e-06, + "loss": 0.6393, + "step": 16978 + }, + { + "epoch": 2.53, + "grad_norm": 3.7772656624080567, + "learning_rate": 1.0276746326472494e-06, + "loss": 0.6634, + "step": 16979 + }, + { + "epoch": 2.53, + "grad_norm": 2.557062589984387, + "learning_rate": 1.0275780616097598e-06, + "loss": 0.6126, + "step": 16980 + }, + { + "epoch": 2.53, + "grad_norm": 2.7830310392915756, + "learning_rate": 1.0274814903148818e-06, + "loss": 0.6243, + "step": 16981 + }, + { + "epoch": 2.53, + "grad_norm": 5.308127196888093, + "learning_rate": 1.0273849187635164e-06, + "loss": 0.6087, + "step": 16982 + }, + { + "epoch": 2.53, + "grad_norm": 5.232589411928265, + "learning_rate": 1.0272883469565653e-06, + "loss": 0.6751, + "step": 16983 + }, + { + "epoch": 2.53, + "grad_norm": 3.8114845090372302, + "learning_rate": 1.0271917748949294e-06, + "loss": 0.6582, + "step": 16984 + }, + { + "epoch": 2.53, + "grad_norm": 3.315144970813219, + "learning_rate": 1.0270952025795103e-06, + "loss": 0.6686, + "step": 16985 + }, + { + "epoch": 2.53, + "grad_norm": 3.1228485360197045, + "learning_rate": 1.0269986300112094e-06, + "loss": 0.6992, + "step": 16986 + }, + { + "epoch": 2.53, + "grad_norm": 8.963302604133256, + "learning_rate": 1.0269020571909282e-06, + "loss": 0.668, + "step": 16987 + }, + { + "epoch": 2.53, + "grad_norm": 3.616484628215722, + "learning_rate": 1.0268054841195672e-06, + "loss": 0.6608, + "step": 16988 + }, + { + "epoch": 2.53, + "grad_norm": 2.7266624539087263, + "learning_rate": 1.0267089107980285e-06, + "loss": 0.6517, + "step": 16989 + }, + { + "epoch": 2.53, + "grad_norm": 2.758331338024889, + "learning_rate": 1.026612337227213e-06, + "loss": 0.6445, + "step": 16990 + }, + { + "epoch": 2.53, + "grad_norm": 5.4219060661974305, + "learning_rate": 1.0265157634080223e-06, + "loss": 0.6315, + "step": 16991 + }, + { + "epoch": 2.53, + "grad_norm": 3.866004844797063, + "learning_rate": 1.026419189341358e-06, + "loss": 0.6393, + "step": 16992 + }, + { + "epoch": 2.53, + "grad_norm": 3.353460374987434, + "learning_rate": 1.0263226150281207e-06, + "loss": 0.638, + "step": 16993 + }, + { + "epoch": 2.53, + "grad_norm": 2.984943994454884, + "learning_rate": 1.026226040469212e-06, + "loss": 0.61, + "step": 16994 + }, + { + "epoch": 2.53, + "grad_norm": 2.8298685439601856, + "learning_rate": 1.0261294656655335e-06, + "loss": 0.6452, + "step": 16995 + }, + { + "epoch": 2.53, + "grad_norm": 3.356274012499151, + "learning_rate": 1.026032890617986e-06, + "loss": 0.5749, + "step": 16996 + }, + { + "epoch": 2.53, + "grad_norm": 2.982510016652706, + "learning_rate": 1.025936315327472e-06, + "loss": 0.6406, + "step": 16997 + }, + { + "epoch": 2.54, + "grad_norm": 3.1786592378131657, + "learning_rate": 1.0258397397948915e-06, + "loss": 0.6543, + "step": 16998 + }, + { + "epoch": 2.54, + "grad_norm": 2.7592449686809255, + "learning_rate": 1.0257431640211467e-06, + "loss": 0.6042, + "step": 16999 + }, + { + "epoch": 2.54, + "grad_norm": 3.1912298226303157, + "learning_rate": 1.0256465880071388e-06, + "loss": 0.6543, + "step": 17000 + }, + { + "epoch": 2.54, + "grad_norm": 2.9140604984488236, + "learning_rate": 1.0255500117537685e-06, + "loss": 0.6517, + "step": 17001 + }, + { + "epoch": 2.54, + "grad_norm": 3.4456528726432127, + "learning_rate": 1.0254534352619379e-06, + "loss": 0.6986, + "step": 17002 + }, + { + "epoch": 2.54, + "grad_norm": 2.915716736338164, + "learning_rate": 1.0253568585325484e-06, + "loss": 0.6576, + "step": 17003 + }, + { + "epoch": 2.54, + "grad_norm": 2.8128014172338305, + "learning_rate": 1.025260281566501e-06, + "loss": 0.6224, + "step": 17004 + }, + { + "epoch": 2.54, + "grad_norm": 3.5487706030611808, + "learning_rate": 1.025163704364697e-06, + "loss": 0.5983, + "step": 17005 + }, + { + "epoch": 2.54, + "grad_norm": 2.9334467357329195, + "learning_rate": 1.0250671269280378e-06, + "loss": 0.6725, + "step": 17006 + }, + { + "epoch": 2.54, + "grad_norm": 4.260989913662972, + "learning_rate": 1.0249705492574255e-06, + "loss": 0.6276, + "step": 17007 + }, + { + "epoch": 2.54, + "grad_norm": 2.7675278001033896, + "learning_rate": 1.0248739713537602e-06, + "loss": 0.6322, + "step": 17008 + }, + { + "epoch": 2.54, + "grad_norm": 2.982603285029481, + "learning_rate": 1.024777393217944e-06, + "loss": 0.612, + "step": 17009 + }, + { + "epoch": 2.54, + "grad_norm": 6.6224919731429415, + "learning_rate": 1.0246808148508785e-06, + "loss": 0.6647, + "step": 17010 + }, + { + "epoch": 2.54, + "grad_norm": 2.771904889760953, + "learning_rate": 1.0245842362534648e-06, + "loss": 0.5775, + "step": 17011 + }, + { + "epoch": 2.54, + "grad_norm": 4.669237244123511, + "learning_rate": 1.024487657426604e-06, + "loss": 0.6602, + "step": 17012 + }, + { + "epoch": 2.54, + "grad_norm": 5.403405177120064, + "learning_rate": 1.0243910783711977e-06, + "loss": 0.6797, + "step": 17013 + }, + { + "epoch": 2.54, + "grad_norm": 2.949074749627071, + "learning_rate": 1.0242944990881475e-06, + "loss": 0.6185, + "step": 17014 + }, + { + "epoch": 2.54, + "grad_norm": 5.182273666224407, + "learning_rate": 1.0241979195783546e-06, + "loss": 0.6745, + "step": 17015 + }, + { + "epoch": 2.54, + "grad_norm": 4.784583981028072, + "learning_rate": 1.02410133984272e-06, + "loss": 0.6029, + "step": 17016 + }, + { + "epoch": 2.54, + "grad_norm": 3.1977899170539423, + "learning_rate": 1.0240047598821457e-06, + "loss": 0.6211, + "step": 17017 + }, + { + "epoch": 2.54, + "grad_norm": 3.0891220924701144, + "learning_rate": 1.023908179697533e-06, + "loss": 0.6556, + "step": 17018 + }, + { + "epoch": 2.54, + "grad_norm": 2.8840669075192067, + "learning_rate": 1.023811599289783e-06, + "loss": 0.6146, + "step": 17019 + }, + { + "epoch": 2.54, + "grad_norm": 3.117501217478128, + "learning_rate": 1.0237150186597975e-06, + "loss": 0.6855, + "step": 17020 + }, + { + "epoch": 2.54, + "grad_norm": 5.137367010642698, + "learning_rate": 1.0236184378084774e-06, + "loss": 0.6719, + "step": 17021 + }, + { + "epoch": 2.54, + "grad_norm": 3.6465651143866573, + "learning_rate": 1.0235218567367242e-06, + "loss": 0.6504, + "step": 17022 + }, + { + "epoch": 2.54, + "grad_norm": 3.556426746197336, + "learning_rate": 1.0234252754454397e-06, + "loss": 0.6693, + "step": 17023 + }, + { + "epoch": 2.54, + "grad_norm": 3.072902659543418, + "learning_rate": 1.0233286939355248e-06, + "loss": 0.5999, + "step": 17024 + }, + { + "epoch": 2.54, + "grad_norm": 5.918605703821626, + "learning_rate": 1.0232321122078812e-06, + "loss": 0.6667, + "step": 17025 + }, + { + "epoch": 2.54, + "grad_norm": 4.80876348821034, + "learning_rate": 1.0231355302634106e-06, + "loss": 0.6257, + "step": 17026 + }, + { + "epoch": 2.54, + "grad_norm": 4.26016906599224, + "learning_rate": 1.0230389481030134e-06, + "loss": 0.6497, + "step": 17027 + }, + { + "epoch": 2.54, + "grad_norm": 4.7127502521366775, + "learning_rate": 1.0229423657275922e-06, + "loss": 0.6263, + "step": 17028 + }, + { + "epoch": 2.54, + "grad_norm": 4.488701344900023, + "learning_rate": 1.0228457831380475e-06, + "loss": 0.6478, + "step": 17029 + }, + { + "epoch": 2.54, + "grad_norm": 4.086166692004341, + "learning_rate": 1.0227492003352814e-06, + "loss": 0.6387, + "step": 17030 + }, + { + "epoch": 2.54, + "grad_norm": 3.024867803214779, + "learning_rate": 1.022652617320195e-06, + "loss": 0.6341, + "step": 17031 + }, + { + "epoch": 2.54, + "grad_norm": 3.3816875469333163, + "learning_rate": 1.0225560340936894e-06, + "loss": 0.6224, + "step": 17032 + }, + { + "epoch": 2.54, + "grad_norm": 5.855919086459169, + "learning_rate": 1.0224594506566666e-06, + "loss": 0.7005, + "step": 17033 + }, + { + "epoch": 2.54, + "grad_norm": 4.594411771620379, + "learning_rate": 1.0223628670100276e-06, + "loss": 0.625, + "step": 17034 + }, + { + "epoch": 2.54, + "grad_norm": 3.4281876723783897, + "learning_rate": 1.022266283154674e-06, + "loss": 0.6712, + "step": 17035 + }, + { + "epoch": 2.54, + "grad_norm": 4.719206366561483, + "learning_rate": 1.0221696990915073e-06, + "loss": 0.6478, + "step": 17036 + }, + { + "epoch": 2.54, + "grad_norm": 3.1591180227936553, + "learning_rate": 1.022073114821429e-06, + "loss": 0.6133, + "step": 17037 + }, + { + "epoch": 2.54, + "grad_norm": 2.6828038557292015, + "learning_rate": 1.0219765303453401e-06, + "loss": 0.64, + "step": 17038 + }, + { + "epoch": 2.54, + "grad_norm": 3.558791916104818, + "learning_rate": 1.0218799456641423e-06, + "loss": 0.6413, + "step": 17039 + }, + { + "epoch": 2.54, + "grad_norm": 4.020345921468737, + "learning_rate": 1.0217833607787372e-06, + "loss": 0.6517, + "step": 17040 + }, + { + "epoch": 2.54, + "grad_norm": 2.9682791297367346, + "learning_rate": 1.0216867756900257e-06, + "loss": 0.6374, + "step": 17041 + }, + { + "epoch": 2.54, + "grad_norm": 6.177494134841801, + "learning_rate": 1.0215901903989103e-06, + "loss": 0.7253, + "step": 17042 + }, + { + "epoch": 2.54, + "grad_norm": 3.2275426237907516, + "learning_rate": 1.0214936049062911e-06, + "loss": 0.6536, + "step": 17043 + }, + { + "epoch": 2.54, + "grad_norm": 3.0608746315349475, + "learning_rate": 1.0213970192130705e-06, + "loss": 0.651, + "step": 17044 + }, + { + "epoch": 2.54, + "grad_norm": 3.5266758579117394, + "learning_rate": 1.0213004333201493e-06, + "loss": 0.6582, + "step": 17045 + }, + { + "epoch": 2.54, + "grad_norm": 2.7899543627943686, + "learning_rate": 1.0212038472284297e-06, + "loss": 0.6081, + "step": 17046 + }, + { + "epoch": 2.54, + "grad_norm": 2.9630369287809293, + "learning_rate": 1.0211072609388125e-06, + "loss": 0.6335, + "step": 17047 + }, + { + "epoch": 2.54, + "grad_norm": 3.020237390380146, + "learning_rate": 1.0210106744521994e-06, + "loss": 0.6322, + "step": 17048 + }, + { + "epoch": 2.54, + "grad_norm": 3.7855164839397735, + "learning_rate": 1.0209140877694917e-06, + "loss": 0.6784, + "step": 17049 + }, + { + "epoch": 2.54, + "grad_norm": 2.783287310962729, + "learning_rate": 1.0208175008915913e-06, + "loss": 0.6243, + "step": 17050 + }, + { + "epoch": 2.54, + "grad_norm": 2.925294986147642, + "learning_rate": 1.020720913819399e-06, + "loss": 0.6484, + "step": 17051 + }, + { + "epoch": 2.54, + "grad_norm": 3.1182763698755753, + "learning_rate": 1.0206243265538166e-06, + "loss": 0.6387, + "step": 17052 + }, + { + "epoch": 2.54, + "grad_norm": 2.886102917035699, + "learning_rate": 1.0205277390957458e-06, + "loss": 0.6582, + "step": 17053 + }, + { + "epoch": 2.54, + "grad_norm": 5.254679222971205, + "learning_rate": 1.0204311514460874e-06, + "loss": 0.5931, + "step": 17054 + }, + { + "epoch": 2.54, + "grad_norm": 3.2807271256847574, + "learning_rate": 1.0203345636057437e-06, + "loss": 0.6693, + "step": 17055 + }, + { + "epoch": 2.54, + "grad_norm": 5.688865076968785, + "learning_rate": 1.0202379755756154e-06, + "loss": 0.6413, + "step": 17056 + }, + { + "epoch": 2.54, + "grad_norm": 3.0893955371717334, + "learning_rate": 1.0201413873566043e-06, + "loss": 0.6315, + "step": 17057 + }, + { + "epoch": 2.54, + "grad_norm": 2.9730078280418417, + "learning_rate": 1.0200447989496119e-06, + "loss": 0.6263, + "step": 17058 + }, + { + "epoch": 2.54, + "grad_norm": 3.1437465679856955, + "learning_rate": 1.0199482103555397e-06, + "loss": 0.5876, + "step": 17059 + }, + { + "epoch": 2.54, + "grad_norm": 4.780038368078662, + "learning_rate": 1.0198516215752892e-06, + "loss": 0.6562, + "step": 17060 + }, + { + "epoch": 2.54, + "grad_norm": 3.2497800804935633, + "learning_rate": 1.0197550326097615e-06, + "loss": 0.6335, + "step": 17061 + }, + { + "epoch": 2.54, + "grad_norm": 3.1320814012557627, + "learning_rate": 1.0196584434598583e-06, + "loss": 0.6589, + "step": 17062 + }, + { + "epoch": 2.54, + "grad_norm": 2.802112431985338, + "learning_rate": 1.0195618541264813e-06, + "loss": 0.6217, + "step": 17063 + }, + { + "epoch": 2.54, + "grad_norm": 3.1070905249083682, + "learning_rate": 1.0194652646105317e-06, + "loss": 0.6582, + "step": 17064 + }, + { + "epoch": 2.55, + "grad_norm": 3.1296714809325636, + "learning_rate": 1.0193686749129112e-06, + "loss": 0.6302, + "step": 17065 + }, + { + "epoch": 2.55, + "grad_norm": 2.9996738421921854, + "learning_rate": 1.019272085034521e-06, + "loss": 0.6387, + "step": 17066 + }, + { + "epoch": 2.55, + "grad_norm": 3.9898155852426513, + "learning_rate": 1.0191754949762628e-06, + "loss": 0.6009, + "step": 17067 + }, + { + "epoch": 2.55, + "grad_norm": 3.0816392106357884, + "learning_rate": 1.0190789047390378e-06, + "loss": 0.627, + "step": 17068 + }, + { + "epoch": 2.55, + "grad_norm": 3.7084036782130827, + "learning_rate": 1.0189823143237482e-06, + "loss": 0.6569, + "step": 17069 + }, + { + "epoch": 2.55, + "grad_norm": 3.0792883763558336, + "learning_rate": 1.0188857237312942e-06, + "loss": 0.6087, + "step": 17070 + }, + { + "epoch": 2.55, + "grad_norm": 4.467390438940322, + "learning_rate": 1.0187891329625787e-06, + "loss": 0.6172, + "step": 17071 + }, + { + "epoch": 2.55, + "grad_norm": 2.899305187094525, + "learning_rate": 1.018692542018502e-06, + "loss": 0.6576, + "step": 17072 + }, + { + "epoch": 2.55, + "grad_norm": 4.371011974984921, + "learning_rate": 1.018595950899967e-06, + "loss": 0.6139, + "step": 17073 + }, + { + "epoch": 2.55, + "grad_norm": 3.1107781759268134, + "learning_rate": 1.0184993596078738e-06, + "loss": 0.5895, + "step": 17074 + }, + { + "epoch": 2.55, + "grad_norm": 3.2068301992480412, + "learning_rate": 1.0184027681431244e-06, + "loss": 0.6497, + "step": 17075 + }, + { + "epoch": 2.55, + "grad_norm": 3.141634929362269, + "learning_rate": 1.0183061765066208e-06, + "loss": 0.6732, + "step": 17076 + }, + { + "epoch": 2.55, + "grad_norm": 3.603887621724599, + "learning_rate": 1.0182095846992633e-06, + "loss": 0.6497, + "step": 17077 + }, + { + "epoch": 2.55, + "grad_norm": 2.9353561835905424, + "learning_rate": 1.0181129927219547e-06, + "loss": 0.6139, + "step": 17078 + }, + { + "epoch": 2.55, + "grad_norm": 3.3604935001017, + "learning_rate": 1.0180164005755958e-06, + "loss": 0.6126, + "step": 17079 + }, + { + "epoch": 2.55, + "grad_norm": 3.175051392669207, + "learning_rate": 1.017919808261088e-06, + "loss": 0.6211, + "step": 17080 + }, + { + "epoch": 2.55, + "grad_norm": 3.2173737727810154, + "learning_rate": 1.0178232157793335e-06, + "loss": 0.6797, + "step": 17081 + }, + { + "epoch": 2.55, + "grad_norm": 3.754660082145916, + "learning_rate": 1.0177266231312329e-06, + "loss": 0.6139, + "step": 17082 + }, + { + "epoch": 2.55, + "grad_norm": 3.1224042192297503, + "learning_rate": 1.0176300303176883e-06, + "loss": 0.6725, + "step": 17083 + }, + { + "epoch": 2.55, + "grad_norm": 3.7873014095850466, + "learning_rate": 1.0175334373396012e-06, + "loss": 0.653, + "step": 17084 + }, + { + "epoch": 2.55, + "grad_norm": 5.122407482870709, + "learning_rate": 1.0174368441978728e-06, + "loss": 0.638, + "step": 17085 + }, + { + "epoch": 2.55, + "grad_norm": 3.767909782418691, + "learning_rate": 1.0173402508934051e-06, + "loss": 0.6195, + "step": 17086 + }, + { + "epoch": 2.55, + "grad_norm": 3.2940569140899867, + "learning_rate": 1.017243657427099e-06, + "loss": 0.6478, + "step": 17087 + }, + { + "epoch": 2.55, + "grad_norm": 2.9927909796971313, + "learning_rate": 1.0171470637998563e-06, + "loss": 0.638, + "step": 17088 + }, + { + "epoch": 2.55, + "grad_norm": 3.1398778642809173, + "learning_rate": 1.017050470012579e-06, + "loss": 0.6178, + "step": 17089 + }, + { + "epoch": 2.55, + "grad_norm": 3.1225425373307685, + "learning_rate": 1.0169538760661676e-06, + "loss": 0.5944, + "step": 17090 + }, + { + "epoch": 2.55, + "grad_norm": 3.5140477741205864, + "learning_rate": 1.0168572819615242e-06, + "loss": 0.5781, + "step": 17091 + }, + { + "epoch": 2.55, + "grad_norm": 3.283149582569681, + "learning_rate": 1.0167606876995505e-06, + "loss": 0.6426, + "step": 17092 + }, + { + "epoch": 2.55, + "grad_norm": 5.598922858602128, + "learning_rate": 1.0166640932811478e-06, + "loss": 0.6589, + "step": 17093 + }, + { + "epoch": 2.55, + "grad_norm": 3.3776676578199556, + "learning_rate": 1.0165674987072179e-06, + "loss": 0.666, + "step": 17094 + }, + { + "epoch": 2.55, + "grad_norm": 3.3953947866530334, + "learning_rate": 1.0164709039786616e-06, + "loss": 0.5964, + "step": 17095 + }, + { + "epoch": 2.55, + "grad_norm": 3.2416145827616827, + "learning_rate": 1.0163743090963811e-06, + "loss": 0.6426, + "step": 17096 + }, + { + "epoch": 2.55, + "grad_norm": 3.7680289680606105, + "learning_rate": 1.016277714061278e-06, + "loss": 0.6771, + "step": 17097 + }, + { + "epoch": 2.55, + "grad_norm": 5.5092086211796, + "learning_rate": 1.016181118874253e-06, + "loss": 0.6517, + "step": 17098 + }, + { + "epoch": 2.55, + "grad_norm": 3.2711046803894934, + "learning_rate": 1.0160845235362085e-06, + "loss": 0.6452, + "step": 17099 + }, + { + "epoch": 2.55, + "grad_norm": 3.137505240909917, + "learning_rate": 1.015987928048046e-06, + "loss": 0.6673, + "step": 17100 + }, + { + "epoch": 2.55, + "grad_norm": 2.8804641175940526, + "learning_rate": 1.0158913324106661e-06, + "loss": 0.5833, + "step": 17101 + }, + { + "epoch": 2.55, + "grad_norm": 4.3408417665400165, + "learning_rate": 1.0157947366249714e-06, + "loss": 0.6335, + "step": 17102 + }, + { + "epoch": 2.55, + "grad_norm": 3.4912996736860222, + "learning_rate": 1.0156981406918632e-06, + "loss": 0.6908, + "step": 17103 + }, + { + "epoch": 2.55, + "grad_norm": 3.6266118497427122, + "learning_rate": 1.0156015446122424e-06, + "loss": 0.653, + "step": 17104 + }, + { + "epoch": 2.55, + "grad_norm": 3.769777758366648, + "learning_rate": 1.0155049483870113e-06, + "loss": 0.6777, + "step": 17105 + }, + { + "epoch": 2.55, + "grad_norm": 5.181082323911349, + "learning_rate": 1.0154083520170712e-06, + "loss": 0.6888, + "step": 17106 + }, + { + "epoch": 2.55, + "grad_norm": 3.030459228317862, + "learning_rate": 1.0153117555033233e-06, + "loss": 0.6322, + "step": 17107 + }, + { + "epoch": 2.55, + "grad_norm": 4.240220701185133, + "learning_rate": 1.0152151588466695e-06, + "loss": 0.61, + "step": 17108 + }, + { + "epoch": 2.55, + "grad_norm": 3.088880940772439, + "learning_rate": 1.0151185620480115e-06, + "loss": 0.6712, + "step": 17109 + }, + { + "epoch": 2.55, + "grad_norm": 4.094656947978499, + "learning_rate": 1.0150219651082506e-06, + "loss": 0.6758, + "step": 17110 + }, + { + "epoch": 2.55, + "grad_norm": 3.7458714825625834, + "learning_rate": 1.014925368028288e-06, + "loss": 0.7168, + "step": 17111 + }, + { + "epoch": 2.55, + "grad_norm": 2.873536999147027, + "learning_rate": 1.0148287708090261e-06, + "loss": 0.6198, + "step": 17112 + }, + { + "epoch": 2.55, + "grad_norm": 2.833417382091207, + "learning_rate": 1.0147321734513658e-06, + "loss": 0.6003, + "step": 17113 + }, + { + "epoch": 2.55, + "grad_norm": 2.811097474094833, + "learning_rate": 1.0146355759562087e-06, + "loss": 0.6732, + "step": 17114 + }, + { + "epoch": 2.55, + "grad_norm": 3.2153218903694953, + "learning_rate": 1.0145389783244567e-06, + "loss": 0.7018, + "step": 17115 + }, + { + "epoch": 2.55, + "grad_norm": 3.5279761527808695, + "learning_rate": 1.014442380557011e-06, + "loss": 0.6725, + "step": 17116 + }, + { + "epoch": 2.55, + "grad_norm": 2.6405443223053187, + "learning_rate": 1.014345782654773e-06, + "loss": 0.679, + "step": 17117 + }, + { + "epoch": 2.55, + "grad_norm": 3.1878369265264945, + "learning_rate": 1.014249184618645e-06, + "loss": 0.6478, + "step": 17118 + }, + { + "epoch": 2.55, + "grad_norm": 3.6849572313238697, + "learning_rate": 1.014152586449528e-06, + "loss": 0.6432, + "step": 17119 + }, + { + "epoch": 2.55, + "grad_norm": 8.875611032533346, + "learning_rate": 1.0140559881483235e-06, + "loss": 0.6582, + "step": 17120 + }, + { + "epoch": 2.55, + "grad_norm": 3.532050732774491, + "learning_rate": 1.0139593897159333e-06, + "loss": 0.6097, + "step": 17121 + }, + { + "epoch": 2.55, + "grad_norm": 4.425873976024409, + "learning_rate": 1.013862791153259e-06, + "loss": 0.6628, + "step": 17122 + }, + { + "epoch": 2.55, + "grad_norm": 3.1053470228056295, + "learning_rate": 1.0137661924612017e-06, + "loss": 0.6758, + "step": 17123 + }, + { + "epoch": 2.55, + "grad_norm": 2.5904796542305424, + "learning_rate": 1.0136695936406636e-06, + "loss": 0.6081, + "step": 17124 + }, + { + "epoch": 2.55, + "grad_norm": 4.964485795815236, + "learning_rate": 1.0135729946925458e-06, + "loss": 0.6797, + "step": 17125 + }, + { + "epoch": 2.55, + "grad_norm": 2.388192683331944, + "learning_rate": 1.0134763956177504e-06, + "loss": 0.6296, + "step": 17126 + }, + { + "epoch": 2.55, + "grad_norm": 2.703472685347129, + "learning_rate": 1.0133797964171782e-06, + "loss": 0.6413, + "step": 17127 + }, + { + "epoch": 2.55, + "grad_norm": 3.3535575936438637, + "learning_rate": 1.013283197091731e-06, + "loss": 0.6081, + "step": 17128 + }, + { + "epoch": 2.55, + "grad_norm": 2.5620913083067887, + "learning_rate": 1.0131865976423113e-06, + "loss": 0.6263, + "step": 17129 + }, + { + "epoch": 2.55, + "grad_norm": 2.9626912070472526, + "learning_rate": 1.0130899980698193e-06, + "loss": 0.6243, + "step": 17130 + }, + { + "epoch": 2.55, + "grad_norm": 4.309238761533303, + "learning_rate": 1.0129933983751575e-06, + "loss": 0.6562, + "step": 17131 + }, + { + "epoch": 2.56, + "grad_norm": 3.4009842473399767, + "learning_rate": 1.0128967985592271e-06, + "loss": 0.6139, + "step": 17132 + }, + { + "epoch": 2.56, + "grad_norm": 4.2120952417503785, + "learning_rate": 1.0128001986229296e-06, + "loss": 0.6543, + "step": 17133 + }, + { + "epoch": 2.56, + "grad_norm": 2.691611420198526, + "learning_rate": 1.0127035985671669e-06, + "loss": 0.6543, + "step": 17134 + }, + { + "epoch": 2.56, + "grad_norm": 2.976085675559478, + "learning_rate": 1.0126069983928401e-06, + "loss": 0.6283, + "step": 17135 + }, + { + "epoch": 2.56, + "grad_norm": 2.829652063846915, + "learning_rate": 1.0125103981008514e-06, + "loss": 0.6354, + "step": 17136 + }, + { + "epoch": 2.56, + "grad_norm": 4.971975164035998, + "learning_rate": 1.012413797692102e-06, + "loss": 0.6738, + "step": 17137 + }, + { + "epoch": 2.56, + "grad_norm": 2.7459641470557603, + "learning_rate": 1.012317197167493e-06, + "loss": 0.6348, + "step": 17138 + }, + { + "epoch": 2.56, + "grad_norm": 3.75624871253105, + "learning_rate": 1.0122205965279272e-06, + "loss": 0.651, + "step": 17139 + }, + { + "epoch": 2.56, + "grad_norm": 3.8347652143842104, + "learning_rate": 1.0121239957743053e-06, + "loss": 0.612, + "step": 17140 + }, + { + "epoch": 2.56, + "grad_norm": 4.37632466374376, + "learning_rate": 1.012027394907529e-06, + "loss": 0.6191, + "step": 17141 + }, + { + "epoch": 2.56, + "grad_norm": 3.2496724374097505, + "learning_rate": 1.0119307939285e-06, + "loss": 0.6849, + "step": 17142 + }, + { + "epoch": 2.56, + "grad_norm": 3.109133133568469, + "learning_rate": 1.0118341928381198e-06, + "loss": 0.6732, + "step": 17143 + }, + { + "epoch": 2.56, + "grad_norm": 2.9995532734988353, + "learning_rate": 1.01173759163729e-06, + "loss": 0.5944, + "step": 17144 + }, + { + "epoch": 2.56, + "grad_norm": 3.2505056753308956, + "learning_rate": 1.0116409903269126e-06, + "loss": 0.651, + "step": 17145 + }, + { + "epoch": 2.56, + "grad_norm": 2.823945251834524, + "learning_rate": 1.011544388907888e-06, + "loss": 0.5915, + "step": 17146 + }, + { + "epoch": 2.56, + "grad_norm": 4.390129084027486, + "learning_rate": 1.0114477873811193e-06, + "loss": 0.6348, + "step": 17147 + }, + { + "epoch": 2.56, + "grad_norm": 4.154175525169476, + "learning_rate": 1.011351185747507e-06, + "loss": 0.6361, + "step": 17148 + }, + { + "epoch": 2.56, + "grad_norm": 3.814960769335168, + "learning_rate": 1.0112545840079534e-06, + "loss": 0.6172, + "step": 17149 + }, + { + "epoch": 2.56, + "grad_norm": 5.4168870826076345, + "learning_rate": 1.0111579821633596e-06, + "loss": 0.6198, + "step": 17150 + }, + { + "epoch": 2.56, + "grad_norm": 5.944835799960231, + "learning_rate": 1.0110613802146274e-06, + "loss": 0.6497, + "step": 17151 + }, + { + "epoch": 2.56, + "grad_norm": 3.5055727900749005, + "learning_rate": 1.0109647781626581e-06, + "loss": 0.6325, + "step": 17152 + }, + { + "epoch": 2.56, + "grad_norm": 3.7863529944401684, + "learning_rate": 1.010868176008354e-06, + "loss": 0.5804, + "step": 17153 + }, + { + "epoch": 2.56, + "grad_norm": 3.1332042529974737, + "learning_rate": 1.010771573752616e-06, + "loss": 0.6146, + "step": 17154 + }, + { + "epoch": 2.56, + "grad_norm": 3.163917659376779, + "learning_rate": 1.0106749713963463e-06, + "loss": 0.6035, + "step": 17155 + }, + { + "epoch": 2.56, + "grad_norm": 3.5226096533688493, + "learning_rate": 1.0105783689404454e-06, + "loss": 0.6178, + "step": 17156 + }, + { + "epoch": 2.56, + "grad_norm": 4.3344054650924715, + "learning_rate": 1.0104817663858161e-06, + "loss": 0.6016, + "step": 17157 + }, + { + "epoch": 2.56, + "grad_norm": 3.829197716370603, + "learning_rate": 1.0103851637333596e-06, + "loss": 0.6172, + "step": 17158 + }, + { + "epoch": 2.56, + "grad_norm": 3.430394639073579, + "learning_rate": 1.0102885609839775e-06, + "loss": 0.6159, + "step": 17159 + }, + { + "epoch": 2.56, + "grad_norm": 4.009496565270386, + "learning_rate": 1.0101919581385713e-06, + "loss": 0.6243, + "step": 17160 + }, + { + "epoch": 2.56, + "grad_norm": 3.287616219171595, + "learning_rate": 1.0100953551980425e-06, + "loss": 0.6224, + "step": 17161 + }, + { + "epoch": 2.56, + "grad_norm": 3.8042515711603966, + "learning_rate": 1.009998752163293e-06, + "loss": 0.6322, + "step": 17162 + }, + { + "epoch": 2.56, + "grad_norm": 3.122849195037774, + "learning_rate": 1.0099021490352242e-06, + "loss": 0.6009, + "step": 17163 + }, + { + "epoch": 2.56, + "grad_norm": 5.7647931414111255, + "learning_rate": 1.0098055458147376e-06, + "loss": 0.6562, + "step": 17164 + }, + { + "epoch": 2.56, + "grad_norm": 3.7935937354722036, + "learning_rate": 1.009708942502735e-06, + "loss": 0.6523, + "step": 17165 + }, + { + "epoch": 2.56, + "grad_norm": 3.7066616248324973, + "learning_rate": 1.0096123391001185e-06, + "loss": 0.6582, + "step": 17166 + }, + { + "epoch": 2.56, + "grad_norm": 5.004589714788471, + "learning_rate": 1.0095157356077885e-06, + "loss": 0.6471, + "step": 17167 + }, + { + "epoch": 2.56, + "grad_norm": 3.796904079372568, + "learning_rate": 1.0094191320266478e-06, + "loss": 0.6257, + "step": 17168 + }, + { + "epoch": 2.56, + "grad_norm": 3.9507199345217736, + "learning_rate": 1.0093225283575974e-06, + "loss": 0.6582, + "step": 17169 + }, + { + "epoch": 2.56, + "grad_norm": 3.90293776934589, + "learning_rate": 1.009225924601539e-06, + "loss": 0.6263, + "step": 17170 + }, + { + "epoch": 2.56, + "grad_norm": 3.896620028195126, + "learning_rate": 1.009129320759374e-06, + "loss": 0.6328, + "step": 17171 + }, + { + "epoch": 2.56, + "grad_norm": 4.015511117306315, + "learning_rate": 1.0090327168320046e-06, + "loss": 0.6042, + "step": 17172 + }, + { + "epoch": 2.56, + "grad_norm": 3.5616289279565665, + "learning_rate": 1.0089361128203318e-06, + "loss": 0.6478, + "step": 17173 + }, + { + "epoch": 2.56, + "grad_norm": 3.163651832519545, + "learning_rate": 1.0088395087252575e-06, + "loss": 0.6055, + "step": 17174 + }, + { + "epoch": 2.56, + "grad_norm": 3.709578549889298, + "learning_rate": 1.0087429045476834e-06, + "loss": 0.6211, + "step": 17175 + }, + { + "epoch": 2.56, + "grad_norm": 3.5589182602632965, + "learning_rate": 1.008646300288511e-06, + "loss": 0.6387, + "step": 17176 + }, + { + "epoch": 2.56, + "grad_norm": 4.882221088607357, + "learning_rate": 1.0085496959486417e-06, + "loss": 0.6706, + "step": 17177 + }, + { + "epoch": 2.56, + "grad_norm": 5.249662462719627, + "learning_rate": 1.0084530915289775e-06, + "loss": 0.6322, + "step": 17178 + }, + { + "epoch": 2.56, + "grad_norm": 5.346101960198177, + "learning_rate": 1.0083564870304199e-06, + "loss": 0.7311, + "step": 17179 + }, + { + "epoch": 2.56, + "grad_norm": 3.0853205613070296, + "learning_rate": 1.0082598824538702e-06, + "loss": 0.6139, + "step": 17180 + }, + { + "epoch": 2.56, + "grad_norm": 7.266003475039164, + "learning_rate": 1.0081632778002304e-06, + "loss": 0.6172, + "step": 17181 + }, + { + "epoch": 2.56, + "grad_norm": 3.3540834209146504, + "learning_rate": 1.0080666730704025e-06, + "loss": 0.6719, + "step": 17182 + }, + { + "epoch": 2.56, + "grad_norm": 4.62577084703678, + "learning_rate": 1.0079700682652869e-06, + "loss": 0.6608, + "step": 17183 + }, + { + "epoch": 2.56, + "grad_norm": 5.283132057559506, + "learning_rate": 1.0078734633857863e-06, + "loss": 0.61, + "step": 17184 + }, + { + "epoch": 2.56, + "grad_norm": 3.5829779179400454, + "learning_rate": 1.0077768584328019e-06, + "loss": 0.6348, + "step": 17185 + }, + { + "epoch": 2.56, + "grad_norm": 3.023599968709655, + "learning_rate": 1.0076802534072352e-06, + "loss": 0.6302, + "step": 17186 + }, + { + "epoch": 2.56, + "grad_norm": 2.8130146564594454, + "learning_rate": 1.0075836483099885e-06, + "loss": 0.6602, + "step": 17187 + }, + { + "epoch": 2.56, + "grad_norm": 2.7567467256964466, + "learning_rate": 1.0074870431419627e-06, + "loss": 0.5739, + "step": 17188 + }, + { + "epoch": 2.56, + "grad_norm": 2.6457976450610103, + "learning_rate": 1.0073904379040595e-06, + "loss": 0.6029, + "step": 17189 + }, + { + "epoch": 2.56, + "grad_norm": 2.6535079544982123, + "learning_rate": 1.0072938325971808e-06, + "loss": 0.6289, + "step": 17190 + }, + { + "epoch": 2.56, + "grad_norm": 3.4831086105183915, + "learning_rate": 1.007197227222228e-06, + "loss": 0.6549, + "step": 17191 + }, + { + "epoch": 2.56, + "grad_norm": 2.9577123226867883, + "learning_rate": 1.0071006217801033e-06, + "loss": 0.6706, + "step": 17192 + }, + { + "epoch": 2.56, + "grad_norm": 3.632685275547029, + "learning_rate": 1.0070040162717074e-06, + "loss": 0.6133, + "step": 17193 + }, + { + "epoch": 2.56, + "grad_norm": 3.5346070150997457, + "learning_rate": 1.0069074106979425e-06, + "loss": 0.6589, + "step": 17194 + }, + { + "epoch": 2.56, + "grad_norm": 7.056909915235414, + "learning_rate": 1.0068108050597105e-06, + "loss": 0.6162, + "step": 17195 + }, + { + "epoch": 2.56, + "grad_norm": 3.0620933631216474, + "learning_rate": 1.006714199357912e-06, + "loss": 0.6289, + "step": 17196 + }, + { + "epoch": 2.56, + "grad_norm": 3.3716040564234273, + "learning_rate": 1.0066175935934497e-06, + "loss": 0.6426, + "step": 17197 + }, + { + "epoch": 2.56, + "grad_norm": 4.393890924307363, + "learning_rate": 1.0065209877672249e-06, + "loss": 0.6855, + "step": 17198 + }, + { + "epoch": 2.57, + "grad_norm": 4.197635015351404, + "learning_rate": 1.0064243818801389e-06, + "loss": 0.6784, + "step": 17199 + }, + { + "epoch": 2.57, + "grad_norm": 2.8157857101035355, + "learning_rate": 1.0063277759330937e-06, + "loss": 0.6556, + "step": 17200 + }, + { + "epoch": 2.57, + "grad_norm": 4.462105436389611, + "learning_rate": 1.0062311699269906e-06, + "loss": 0.6432, + "step": 17201 + }, + { + "epoch": 2.57, + "grad_norm": 7.00282053812332, + "learning_rate": 1.0061345638627318e-06, + "loss": 0.6868, + "step": 17202 + }, + { + "epoch": 2.57, + "grad_norm": 2.80960231338598, + "learning_rate": 1.0060379577412181e-06, + "loss": 0.623, + "step": 17203 + }, + { + "epoch": 2.57, + "grad_norm": 3.9419895094848383, + "learning_rate": 1.0059413515633518e-06, + "loss": 0.6146, + "step": 17204 + }, + { + "epoch": 2.57, + "grad_norm": 3.8914758200966233, + "learning_rate": 1.0058447453300347e-06, + "loss": 0.5859, + "step": 17205 + }, + { + "epoch": 2.57, + "grad_norm": 3.104985956869868, + "learning_rate": 1.0057481390421676e-06, + "loss": 0.6504, + "step": 17206 + }, + { + "epoch": 2.57, + "grad_norm": 3.3270699307001745, + "learning_rate": 1.0056515327006526e-06, + "loss": 0.6797, + "step": 17207 + }, + { + "epoch": 2.57, + "grad_norm": 3.6580069371638704, + "learning_rate": 1.005554926306392e-06, + "loss": 0.6602, + "step": 17208 + }, + { + "epoch": 2.57, + "grad_norm": 4.051408161100162, + "learning_rate": 1.005458319860286e-06, + "loss": 0.6198, + "step": 17209 + }, + { + "epoch": 2.57, + "grad_norm": 3.022065701854874, + "learning_rate": 1.0053617133632374e-06, + "loss": 0.679, + "step": 17210 + }, + { + "epoch": 2.57, + "grad_norm": 3.6853661476418025, + "learning_rate": 1.0052651068161476e-06, + "loss": 0.6309, + "step": 17211 + }, + { + "epoch": 2.57, + "grad_norm": 2.843517873088034, + "learning_rate": 1.0051685002199177e-06, + "loss": 0.6523, + "step": 17212 + }, + { + "epoch": 2.57, + "grad_norm": 2.912744327178719, + "learning_rate": 1.0050718935754502e-06, + "loss": 0.6549, + "step": 17213 + }, + { + "epoch": 2.57, + "grad_norm": 2.8250334549595024, + "learning_rate": 1.0049752868836456e-06, + "loss": 0.5885, + "step": 17214 + }, + { + "epoch": 2.57, + "grad_norm": 3.79143804738883, + "learning_rate": 1.0048786801454068e-06, + "loss": 0.6823, + "step": 17215 + }, + { + "epoch": 2.57, + "grad_norm": 2.833485890494847, + "learning_rate": 1.0047820733616346e-06, + "loss": 0.6217, + "step": 17216 + }, + { + "epoch": 2.57, + "grad_norm": 2.8429670256201045, + "learning_rate": 1.0046854665332306e-06, + "loss": 0.6097, + "step": 17217 + }, + { + "epoch": 2.57, + "grad_norm": 3.187661787058169, + "learning_rate": 1.0045888596610974e-06, + "loss": 0.5801, + "step": 17218 + }, + { + "epoch": 2.57, + "grad_norm": 5.346193918754595, + "learning_rate": 1.0044922527461358e-06, + "loss": 0.7201, + "step": 17219 + }, + { + "epoch": 2.57, + "grad_norm": 3.710968718404482, + "learning_rate": 1.0043956457892471e-06, + "loss": 0.623, + "step": 17220 + }, + { + "epoch": 2.57, + "grad_norm": 2.9005235639857356, + "learning_rate": 1.0042990387913341e-06, + "loss": 0.651, + "step": 17221 + }, + { + "epoch": 2.57, + "grad_norm": 2.779536315364121, + "learning_rate": 1.0042024317532974e-06, + "loss": 0.6283, + "step": 17222 + }, + { + "epoch": 2.57, + "grad_norm": 4.452272706654731, + "learning_rate": 1.0041058246760388e-06, + "loss": 0.6426, + "step": 17223 + }, + { + "epoch": 2.57, + "grad_norm": 4.426295710859241, + "learning_rate": 1.0040092175604607e-06, + "loss": 0.6784, + "step": 17224 + }, + { + "epoch": 2.57, + "grad_norm": 2.828666497327293, + "learning_rate": 1.003912610407464e-06, + "loss": 0.6198, + "step": 17225 + }, + { + "epoch": 2.57, + "grad_norm": 3.3921448435749286, + "learning_rate": 1.0038160032179506e-06, + "loss": 0.6042, + "step": 17226 + }, + { + "epoch": 2.57, + "grad_norm": 2.965572957642672, + "learning_rate": 1.003719395992822e-06, + "loss": 0.5703, + "step": 17227 + }, + { + "epoch": 2.57, + "grad_norm": 4.838864022203711, + "learning_rate": 1.00362278873298e-06, + "loss": 0.6406, + "step": 17228 + }, + { + "epoch": 2.57, + "grad_norm": 2.9831270615692285, + "learning_rate": 1.0035261814393263e-06, + "loss": 0.6536, + "step": 17229 + }, + { + "epoch": 2.57, + "grad_norm": 4.067226619250042, + "learning_rate": 1.003429574112762e-06, + "loss": 0.681, + "step": 17230 + }, + { + "epoch": 2.57, + "grad_norm": 3.255524516415049, + "learning_rate": 1.0033329667541896e-06, + "loss": 0.6484, + "step": 17231 + }, + { + "epoch": 2.57, + "grad_norm": 3.537202986015353, + "learning_rate": 1.0032363593645105e-06, + "loss": 0.6315, + "step": 17232 + }, + { + "epoch": 2.57, + "grad_norm": 3.690925357096082, + "learning_rate": 1.0031397519446258e-06, + "loss": 0.6289, + "step": 17233 + }, + { + "epoch": 2.57, + "grad_norm": 2.9321483059685747, + "learning_rate": 1.0030431444954376e-06, + "loss": 0.6009, + "step": 17234 + }, + { + "epoch": 2.57, + "grad_norm": 4.675676040395417, + "learning_rate": 1.0029465370178475e-06, + "loss": 0.6133, + "step": 17235 + }, + { + "epoch": 2.57, + "grad_norm": 3.0695168045877272, + "learning_rate": 1.002849929512757e-06, + "loss": 0.6198, + "step": 17236 + }, + { + "epoch": 2.57, + "grad_norm": 3.204261142618547, + "learning_rate": 1.0027533219810682e-06, + "loss": 0.6348, + "step": 17237 + }, + { + "epoch": 2.57, + "grad_norm": 4.173765110309366, + "learning_rate": 1.0026567144236821e-06, + "loss": 0.6732, + "step": 17238 + }, + { + "epoch": 2.57, + "grad_norm": 4.352899532278141, + "learning_rate": 1.0025601068415006e-06, + "loss": 0.6647, + "step": 17239 + }, + { + "epoch": 2.57, + "grad_norm": 3.5467500477963263, + "learning_rate": 1.0024634992354256e-06, + "loss": 0.6354, + "step": 17240 + }, + { + "epoch": 2.57, + "grad_norm": 3.1473697826401845, + "learning_rate": 1.0023668916063585e-06, + "loss": 0.6582, + "step": 17241 + }, + { + "epoch": 2.57, + "grad_norm": 4.614412872483421, + "learning_rate": 1.0022702839552012e-06, + "loss": 0.6061, + "step": 17242 + }, + { + "epoch": 2.57, + "grad_norm": 5.746336158732258, + "learning_rate": 1.0021736762828545e-06, + "loss": 0.6003, + "step": 17243 + }, + { + "epoch": 2.57, + "grad_norm": 4.536387857070599, + "learning_rate": 1.0020770685902212e-06, + "loss": 0.6283, + "step": 17244 + }, + { + "epoch": 2.57, + "grad_norm": 3.4341644112280436, + "learning_rate": 1.0019804608782022e-06, + "loss": 0.6393, + "step": 17245 + }, + { + "epoch": 2.57, + "grad_norm": 3.1254357194537774, + "learning_rate": 1.0018838531476995e-06, + "loss": 0.6497, + "step": 17246 + }, + { + "epoch": 2.57, + "grad_norm": 3.7048415598324245, + "learning_rate": 1.0017872453996148e-06, + "loss": 0.6387, + "step": 17247 + }, + { + "epoch": 2.57, + "grad_norm": 4.072894228444092, + "learning_rate": 1.0016906376348496e-06, + "loss": 0.6165, + "step": 17248 + }, + { + "epoch": 2.57, + "grad_norm": 3.9491256655283933, + "learning_rate": 1.001594029854305e-06, + "loss": 0.6419, + "step": 17249 + }, + { + "epoch": 2.57, + "grad_norm": 3.083475575094535, + "learning_rate": 1.0014974220588836e-06, + "loss": 0.6048, + "step": 17250 + }, + { + "epoch": 2.57, + "grad_norm": 3.7208805066418638, + "learning_rate": 1.0014008142494866e-06, + "loss": 0.6764, + "step": 17251 + }, + { + "epoch": 2.57, + "grad_norm": 4.32665185856043, + "learning_rate": 1.0013042064270154e-06, + "loss": 0.6784, + "step": 17252 + }, + { + "epoch": 2.57, + "grad_norm": 4.7078796649841514, + "learning_rate": 1.0012075985923721e-06, + "loss": 0.6517, + "step": 17253 + }, + { + "epoch": 2.57, + "grad_norm": 3.6550700423285893, + "learning_rate": 1.0011109907464585e-06, + "loss": 0.6908, + "step": 17254 + }, + { + "epoch": 2.57, + "grad_norm": 3.542784337608103, + "learning_rate": 1.0010143828901756e-06, + "loss": 0.6178, + "step": 17255 + }, + { + "epoch": 2.57, + "grad_norm": 4.201422965846114, + "learning_rate": 1.0009177750244253e-06, + "loss": 0.6406, + "step": 17256 + }, + { + "epoch": 2.57, + "grad_norm": 3.1448644087565683, + "learning_rate": 1.0008211671501095e-06, + "loss": 0.6497, + "step": 17257 + }, + { + "epoch": 2.57, + "grad_norm": 3.137764692585806, + "learning_rate": 1.0007245592681298e-06, + "loss": 0.6842, + "step": 17258 + }, + { + "epoch": 2.57, + "grad_norm": 3.271255251513789, + "learning_rate": 1.0006279513793874e-06, + "loss": 0.6296, + "step": 17259 + }, + { + "epoch": 2.57, + "grad_norm": 2.9589464499138773, + "learning_rate": 1.0005313434847841e-06, + "loss": 0.5918, + "step": 17260 + }, + { + "epoch": 2.57, + "grad_norm": 3.220987686230116, + "learning_rate": 1.0004347355852226e-06, + "loss": 0.6986, + "step": 17261 + }, + { + "epoch": 2.57, + "grad_norm": 3.655961241236927, + "learning_rate": 1.0003381276816027e-06, + "loss": 0.6836, + "step": 17262 + }, + { + "epoch": 2.57, + "grad_norm": 3.364984091016474, + "learning_rate": 1.0002415197748277e-06, + "loss": 0.6224, + "step": 17263 + }, + { + "epoch": 2.57, + "grad_norm": 4.5536218342661465, + "learning_rate": 1.0001449118657983e-06, + "loss": 0.6396, + "step": 17264 + }, + { + "epoch": 2.57, + "grad_norm": 2.8495832364689377, + "learning_rate": 1.0000483039554165e-06, + "loss": 0.6654, + "step": 17265 + }, + { + "epoch": 2.58, + "grad_norm": 4.395417155554365, + "learning_rate": 9.999516960445839e-07, + "loss": 0.6296, + "step": 17266 + }, + { + "epoch": 2.58, + "grad_norm": 3.140224175040271, + "learning_rate": 9.998550881342019e-07, + "loss": 0.6247, + "step": 17267 + }, + { + "epoch": 2.58, + "grad_norm": 2.844456804950263, + "learning_rate": 9.997584802251724e-07, + "loss": 0.651, + "step": 17268 + }, + { + "epoch": 2.58, + "grad_norm": 2.870427840025103, + "learning_rate": 9.996618723183973e-07, + "loss": 0.6139, + "step": 17269 + }, + { + "epoch": 2.58, + "grad_norm": 4.9864718207819205, + "learning_rate": 9.995652644147776e-07, + "loss": 0.6771, + "step": 17270 + }, + { + "epoch": 2.58, + "grad_norm": 3.5457313381252717, + "learning_rate": 9.994686565152156e-07, + "loss": 0.6393, + "step": 17271 + }, + { + "epoch": 2.58, + "grad_norm": 3.315858049502274, + "learning_rate": 9.993720486206125e-07, + "loss": 0.6654, + "step": 17272 + }, + { + "epoch": 2.58, + "grad_norm": 4.760908639459377, + "learning_rate": 9.992754407318705e-07, + "loss": 0.6439, + "step": 17273 + }, + { + "epoch": 2.58, + "grad_norm": 3.747000294950304, + "learning_rate": 9.991788328498906e-07, + "loss": 0.6582, + "step": 17274 + }, + { + "epoch": 2.58, + "grad_norm": 4.259178230187191, + "learning_rate": 9.990822249755746e-07, + "loss": 0.651, + "step": 17275 + }, + { + "epoch": 2.58, + "grad_norm": 2.6637273867415394, + "learning_rate": 9.989856171098245e-07, + "loss": 0.6471, + "step": 17276 + }, + { + "epoch": 2.58, + "grad_norm": 3.3500869254066705, + "learning_rate": 9.988890092535417e-07, + "loss": 0.584, + "step": 17277 + }, + { + "epoch": 2.58, + "grad_norm": 3.2506307532713894, + "learning_rate": 9.987924014076276e-07, + "loss": 0.6667, + "step": 17278 + }, + { + "epoch": 2.58, + "grad_norm": 2.584675672693785, + "learning_rate": 9.986957935729845e-07, + "loss": 0.5911, + "step": 17279 + }, + { + "epoch": 2.58, + "grad_norm": 3.378924222098828, + "learning_rate": 9.985991857505135e-07, + "loss": 0.5931, + "step": 17280 + }, + { + "epoch": 2.58, + "grad_norm": 3.335773321660843, + "learning_rate": 9.985025779411165e-07, + "loss": 0.6836, + "step": 17281 + }, + { + "epoch": 2.58, + "grad_norm": 2.8232171815730447, + "learning_rate": 9.98405970145695e-07, + "loss": 0.6348, + "step": 17282 + }, + { + "epoch": 2.58, + "grad_norm": 2.837921491112488, + "learning_rate": 9.983093623651505e-07, + "loss": 0.6484, + "step": 17283 + }, + { + "epoch": 2.58, + "grad_norm": 3.2610787962928556, + "learning_rate": 9.982127546003851e-07, + "loss": 0.6087, + "step": 17284 + }, + { + "epoch": 2.58, + "grad_norm": 5.32294888534835, + "learning_rate": 9.981161468523004e-07, + "loss": 0.627, + "step": 17285 + }, + { + "epoch": 2.58, + "grad_norm": 3.2079006014972693, + "learning_rate": 9.980195391217979e-07, + "loss": 0.6048, + "step": 17286 + }, + { + "epoch": 2.58, + "grad_norm": 3.007766056423373, + "learning_rate": 9.97922931409779e-07, + "loss": 0.6445, + "step": 17287 + }, + { + "epoch": 2.58, + "grad_norm": 4.3915659409035275, + "learning_rate": 9.978263237171454e-07, + "loss": 0.6387, + "step": 17288 + }, + { + "epoch": 2.58, + "grad_norm": 3.0952651244194054, + "learning_rate": 9.97729716044799e-07, + "loss": 0.6094, + "step": 17289 + }, + { + "epoch": 2.58, + "grad_norm": 3.452417464934383, + "learning_rate": 9.976331083936414e-07, + "loss": 0.6221, + "step": 17290 + }, + { + "epoch": 2.58, + "grad_norm": 3.4781533734600947, + "learning_rate": 9.975365007645741e-07, + "loss": 0.6204, + "step": 17291 + }, + { + "epoch": 2.58, + "grad_norm": 3.338990185146652, + "learning_rate": 9.974398931584995e-07, + "loss": 0.6465, + "step": 17292 + }, + { + "epoch": 2.58, + "grad_norm": 2.9627362629400054, + "learning_rate": 9.97343285576318e-07, + "loss": 0.5755, + "step": 17293 + }, + { + "epoch": 2.58, + "grad_norm": 3.585920512255009, + "learning_rate": 9.97246678018932e-07, + "loss": 0.6191, + "step": 17294 + }, + { + "epoch": 2.58, + "grad_norm": 4.365940267845096, + "learning_rate": 9.971500704872429e-07, + "loss": 0.6458, + "step": 17295 + }, + { + "epoch": 2.58, + "grad_norm": 3.74425879005676, + "learning_rate": 9.970534629821524e-07, + "loss": 0.6536, + "step": 17296 + }, + { + "epoch": 2.58, + "grad_norm": 3.4117056685613085, + "learning_rate": 9.969568555045625e-07, + "loss": 0.6745, + "step": 17297 + }, + { + "epoch": 2.58, + "grad_norm": 5.420030241377768, + "learning_rate": 9.96860248055374e-07, + "loss": 0.6784, + "step": 17298 + }, + { + "epoch": 2.58, + "grad_norm": 3.366403874485703, + "learning_rate": 9.967636406354898e-07, + "loss": 0.6615, + "step": 17299 + }, + { + "epoch": 2.58, + "grad_norm": 4.983330414568246, + "learning_rate": 9.966670332458103e-07, + "loss": 0.6439, + "step": 17300 + }, + { + "epoch": 2.58, + "grad_norm": 3.5102768159091267, + "learning_rate": 9.965704258872379e-07, + "loss": 0.6569, + "step": 17301 + }, + { + "epoch": 2.58, + "grad_norm": 4.417103224663362, + "learning_rate": 9.964738185606738e-07, + "loss": 0.6686, + "step": 17302 + }, + { + "epoch": 2.58, + "grad_norm": 4.759486841281788, + "learning_rate": 9.9637721126702e-07, + "loss": 0.6253, + "step": 17303 + }, + { + "epoch": 2.58, + "grad_norm": 4.179723891537662, + "learning_rate": 9.962806040071778e-07, + "loss": 0.638, + "step": 17304 + }, + { + "epoch": 2.58, + "grad_norm": 3.526938078441234, + "learning_rate": 9.961839967820495e-07, + "loss": 0.6348, + "step": 17305 + }, + { + "epoch": 2.58, + "grad_norm": 3.7194891245516004, + "learning_rate": 9.960873895925362e-07, + "loss": 0.6393, + "step": 17306 + }, + { + "epoch": 2.58, + "grad_norm": 3.7155129351732783, + "learning_rate": 9.959907824395395e-07, + "loss": 0.6712, + "step": 17307 + }, + { + "epoch": 2.58, + "grad_norm": 3.0927739629685895, + "learning_rate": 9.95894175323961e-07, + "loss": 0.6719, + "step": 17308 + }, + { + "epoch": 2.58, + "grad_norm": 3.954406680973517, + "learning_rate": 9.957975682467027e-07, + "loss": 0.651, + "step": 17309 + }, + { + "epoch": 2.58, + "grad_norm": 3.7295363347958213, + "learning_rate": 9.95700961208666e-07, + "loss": 0.6471, + "step": 17310 + }, + { + "epoch": 2.58, + "grad_norm": 3.9665502120271543, + "learning_rate": 9.956043542107528e-07, + "loss": 0.6302, + "step": 17311 + }, + { + "epoch": 2.58, + "grad_norm": 5.233318094015883, + "learning_rate": 9.955077472538648e-07, + "loss": 0.6523, + "step": 17312 + }, + { + "epoch": 2.58, + "grad_norm": 5.7268229232388, + "learning_rate": 9.954111403389027e-07, + "loss": 0.6491, + "step": 17313 + }, + { + "epoch": 2.58, + "grad_norm": 3.03693072331686, + "learning_rate": 9.953145334667693e-07, + "loss": 0.627, + "step": 17314 + }, + { + "epoch": 2.58, + "grad_norm": 3.5406821638668102, + "learning_rate": 9.952179266383654e-07, + "loss": 0.6191, + "step": 17315 + }, + { + "epoch": 2.58, + "grad_norm": 4.951799943041009, + "learning_rate": 9.951213198545933e-07, + "loss": 0.627, + "step": 17316 + }, + { + "epoch": 2.58, + "grad_norm": 3.9179141094673873, + "learning_rate": 9.950247131163541e-07, + "loss": 0.5918, + "step": 17317 + }, + { + "epoch": 2.58, + "grad_norm": 3.0826115388545587, + "learning_rate": 9.9492810642455e-07, + "loss": 0.6556, + "step": 17318 + }, + { + "epoch": 2.58, + "grad_norm": 4.485731187401998, + "learning_rate": 9.948314997800826e-07, + "loss": 0.5996, + "step": 17319 + }, + { + "epoch": 2.58, + "grad_norm": 3.778754452548987, + "learning_rate": 9.947348931838526e-07, + "loss": 0.6647, + "step": 17320 + }, + { + "epoch": 2.58, + "grad_norm": 4.364411036436592, + "learning_rate": 9.946382866367625e-07, + "loss": 0.625, + "step": 17321 + }, + { + "epoch": 2.58, + "grad_norm": 6.261521444256794, + "learning_rate": 9.94541680139714e-07, + "loss": 0.6517, + "step": 17322 + }, + { + "epoch": 2.58, + "grad_norm": 3.4314886122175485, + "learning_rate": 9.94445073693608e-07, + "loss": 0.5957, + "step": 17323 + }, + { + "epoch": 2.58, + "grad_norm": 3.343701379580414, + "learning_rate": 9.94348467299347e-07, + "loss": 0.6445, + "step": 17324 + }, + { + "epoch": 2.58, + "grad_norm": 5.995683694853375, + "learning_rate": 9.942518609578327e-07, + "loss": 0.6745, + "step": 17325 + }, + { + "epoch": 2.58, + "grad_norm": 3.2198777924170026, + "learning_rate": 9.941552546699656e-07, + "loss": 0.6732, + "step": 17326 + }, + { + "epoch": 2.58, + "grad_norm": 4.223291198436136, + "learning_rate": 9.940586484366483e-07, + "loss": 0.5716, + "step": 17327 + }, + { + "epoch": 2.58, + "grad_norm": 3.0440251936781832, + "learning_rate": 9.939620422587818e-07, + "loss": 0.6458, + "step": 17328 + }, + { + "epoch": 2.58, + "grad_norm": 3.7024838025837266, + "learning_rate": 9.938654361372686e-07, + "loss": 0.6374, + "step": 17329 + }, + { + "epoch": 2.58, + "grad_norm": 3.80706855031585, + "learning_rate": 9.937688300730093e-07, + "loss": 0.6439, + "step": 17330 + }, + { + "epoch": 2.58, + "grad_norm": 3.5823484919351065, + "learning_rate": 9.936722240669062e-07, + "loss": 0.6074, + "step": 17331 + }, + { + "epoch": 2.58, + "grad_norm": 3.1673195698692624, + "learning_rate": 9.935756181198615e-07, + "loss": 0.6647, + "step": 17332 + }, + { + "epoch": 2.59, + "grad_norm": 3.246309180670043, + "learning_rate": 9.934790122327754e-07, + "loss": 0.5996, + "step": 17333 + }, + { + "epoch": 2.59, + "grad_norm": 3.7535052714546038, + "learning_rate": 9.933824064065502e-07, + "loss": 0.6419, + "step": 17334 + }, + { + "epoch": 2.59, + "grad_norm": 3.711013765243277, + "learning_rate": 9.93285800642088e-07, + "loss": 0.6276, + "step": 17335 + }, + { + "epoch": 2.59, + "grad_norm": 2.9539014243200934, + "learning_rate": 9.931891949402896e-07, + "loss": 0.6387, + "step": 17336 + }, + { + "epoch": 2.59, + "grad_norm": 5.118526561224825, + "learning_rate": 9.930925893020574e-07, + "loss": 0.6204, + "step": 17337 + }, + { + "epoch": 2.59, + "grad_norm": 4.321337173183871, + "learning_rate": 9.929959837282925e-07, + "loss": 0.6608, + "step": 17338 + }, + { + "epoch": 2.59, + "grad_norm": 4.4952101767623285, + "learning_rate": 9.928993782198969e-07, + "loss": 0.6647, + "step": 17339 + }, + { + "epoch": 2.59, + "grad_norm": 3.585883371721984, + "learning_rate": 9.928027727777719e-07, + "loss": 0.6276, + "step": 17340 + }, + { + "epoch": 2.59, + "grad_norm": 3.3689784639617266, + "learning_rate": 9.927061674028193e-07, + "loss": 0.6185, + "step": 17341 + }, + { + "epoch": 2.59, + "grad_norm": 3.400115534738745, + "learning_rate": 9.926095620959407e-07, + "loss": 0.6341, + "step": 17342 + }, + { + "epoch": 2.59, + "grad_norm": 3.0874339191055786, + "learning_rate": 9.925129568580374e-07, + "loss": 0.5983, + "step": 17343 + }, + { + "epoch": 2.59, + "grad_norm": 3.732669778319713, + "learning_rate": 9.924163516900114e-07, + "loss": 0.666, + "step": 17344 + }, + { + "epoch": 2.59, + "grad_norm": 3.577974265245599, + "learning_rate": 9.923197465927647e-07, + "loss": 0.6549, + "step": 17345 + }, + { + "epoch": 2.59, + "grad_norm": 4.761877609253576, + "learning_rate": 9.922231415671982e-07, + "loss": 0.6569, + "step": 17346 + }, + { + "epoch": 2.59, + "grad_norm": 5.4179118524604135, + "learning_rate": 9.921265366142138e-07, + "loss": 0.6221, + "step": 17347 + }, + { + "epoch": 2.59, + "grad_norm": 4.759159570510402, + "learning_rate": 9.920299317347132e-07, + "loss": 0.6276, + "step": 17348 + }, + { + "epoch": 2.59, + "grad_norm": 5.186806934818881, + "learning_rate": 9.919333269295977e-07, + "loss": 0.6758, + "step": 17349 + }, + { + "epoch": 2.59, + "grad_norm": 3.1093917294953513, + "learning_rate": 9.918367221997695e-07, + "loss": 0.627, + "step": 17350 + }, + { + "epoch": 2.59, + "grad_norm": 4.910742793953512, + "learning_rate": 9.917401175461297e-07, + "loss": 0.6556, + "step": 17351 + }, + { + "epoch": 2.59, + "grad_norm": 3.143119323645674, + "learning_rate": 9.916435129695804e-07, + "loss": 0.6113, + "step": 17352 + }, + { + "epoch": 2.59, + "grad_norm": 5.167836027716115, + "learning_rate": 9.915469084710228e-07, + "loss": 0.6374, + "step": 17353 + }, + { + "epoch": 2.59, + "grad_norm": 5.798266749275704, + "learning_rate": 9.914503040513582e-07, + "loss": 0.666, + "step": 17354 + }, + { + "epoch": 2.59, + "grad_norm": 3.592213376995982, + "learning_rate": 9.91353699711489e-07, + "loss": 0.6758, + "step": 17355 + }, + { + "epoch": 2.59, + "grad_norm": 3.4066304034986823, + "learning_rate": 9.912570954523167e-07, + "loss": 0.6029, + "step": 17356 + }, + { + "epoch": 2.59, + "grad_norm": 4.139385727437995, + "learning_rate": 9.911604912747424e-07, + "loss": 0.6478, + "step": 17357 + }, + { + "epoch": 2.59, + "grad_norm": 4.810242389199682, + "learning_rate": 9.91063887179668e-07, + "loss": 0.6654, + "step": 17358 + }, + { + "epoch": 2.59, + "grad_norm": 3.4660876133125313, + "learning_rate": 9.909672831679958e-07, + "loss": 0.6719, + "step": 17359 + }, + { + "epoch": 2.59, + "grad_norm": 3.7615917087470305, + "learning_rate": 9.90870679240626e-07, + "loss": 0.5911, + "step": 17360 + }, + { + "epoch": 2.59, + "grad_norm": 5.368795537608442, + "learning_rate": 9.907740753984612e-07, + "loss": 0.6868, + "step": 17361 + }, + { + "epoch": 2.59, + "grad_norm": 2.9014104207199742, + "learning_rate": 9.906774716424025e-07, + "loss": 0.6048, + "step": 17362 + }, + { + "epoch": 2.59, + "grad_norm": 3.650978330377212, + "learning_rate": 9.90580867973352e-07, + "loss": 0.6309, + "step": 17363 + }, + { + "epoch": 2.59, + "grad_norm": 4.535797971110275, + "learning_rate": 9.904842643922112e-07, + "loss": 0.681, + "step": 17364 + }, + { + "epoch": 2.59, + "grad_norm": 3.18693768728951, + "learning_rate": 9.903876608998819e-07, + "loss": 0.6178, + "step": 17365 + }, + { + "epoch": 2.59, + "grad_norm": 3.022140240803048, + "learning_rate": 9.902910574972648e-07, + "loss": 0.6048, + "step": 17366 + }, + { + "epoch": 2.59, + "grad_norm": 3.6403833293012138, + "learning_rate": 9.901944541852625e-07, + "loss": 0.6556, + "step": 17367 + }, + { + "epoch": 2.59, + "grad_norm": 3.202022565481712, + "learning_rate": 9.90097850964776e-07, + "loss": 0.6471, + "step": 17368 + }, + { + "epoch": 2.59, + "grad_norm": 5.59650957277905, + "learning_rate": 9.900012478367072e-07, + "loss": 0.6706, + "step": 17369 + }, + { + "epoch": 2.59, + "grad_norm": 3.060045523826881, + "learning_rate": 9.899046448019574e-07, + "loss": 0.627, + "step": 17370 + }, + { + "epoch": 2.59, + "grad_norm": 3.6817634898819485, + "learning_rate": 9.898080418614288e-07, + "loss": 0.6087, + "step": 17371 + }, + { + "epoch": 2.59, + "grad_norm": 3.362347622768725, + "learning_rate": 9.897114390160228e-07, + "loss": 0.681, + "step": 17372 + }, + { + "epoch": 2.59, + "grad_norm": 3.1129461463767583, + "learning_rate": 9.896148362666405e-07, + "loss": 0.6328, + "step": 17373 + }, + { + "epoch": 2.59, + "grad_norm": 3.67221745001429, + "learning_rate": 9.89518233614184e-07, + "loss": 0.6497, + "step": 17374 + }, + { + "epoch": 2.59, + "grad_norm": 2.813164234011203, + "learning_rate": 9.894216310595545e-07, + "loss": 0.5918, + "step": 17375 + }, + { + "epoch": 2.59, + "grad_norm": 3.322570891578026, + "learning_rate": 9.893250286036539e-07, + "loss": 0.6953, + "step": 17376 + }, + { + "epoch": 2.59, + "grad_norm": 3.2598945262600205, + "learning_rate": 9.89228426247384e-07, + "loss": 0.6419, + "step": 17377 + }, + { + "epoch": 2.59, + "grad_norm": 3.35444807093282, + "learning_rate": 9.891318239916464e-07, + "loss": 0.6602, + "step": 17378 + }, + { + "epoch": 2.59, + "grad_norm": 4.251136133897727, + "learning_rate": 9.890352218373418e-07, + "loss": 0.6628, + "step": 17379 + }, + { + "epoch": 2.59, + "grad_norm": 3.1331812715517056, + "learning_rate": 9.88938619785373e-07, + "loss": 0.6491, + "step": 17380 + }, + { + "epoch": 2.59, + "grad_norm": 3.0241978522690993, + "learning_rate": 9.888420178366405e-07, + "loss": 0.5885, + "step": 17381 + }, + { + "epoch": 2.59, + "grad_norm": 3.272807722330589, + "learning_rate": 9.887454159920467e-07, + "loss": 0.6178, + "step": 17382 + }, + { + "epoch": 2.59, + "grad_norm": 3.0450502430397877, + "learning_rate": 9.886488142524929e-07, + "loss": 0.6419, + "step": 17383 + }, + { + "epoch": 2.59, + "grad_norm": 4.726805545377388, + "learning_rate": 9.885522126188808e-07, + "loss": 0.6706, + "step": 17384 + }, + { + "epoch": 2.59, + "grad_norm": 3.072424218523567, + "learning_rate": 9.88455611092112e-07, + "loss": 0.6289, + "step": 17385 + }, + { + "epoch": 2.59, + "grad_norm": 3.060415261736962, + "learning_rate": 9.883590096730877e-07, + "loss": 0.6185, + "step": 17386 + }, + { + "epoch": 2.59, + "grad_norm": 3.777056315671761, + "learning_rate": 9.882624083627098e-07, + "loss": 0.6452, + "step": 17387 + }, + { + "epoch": 2.59, + "grad_norm": 3.719879621056617, + "learning_rate": 9.881658071618804e-07, + "loss": 0.6608, + "step": 17388 + }, + { + "epoch": 2.59, + "grad_norm": 3.294683902619622, + "learning_rate": 9.880692060715e-07, + "loss": 0.6361, + "step": 17389 + }, + { + "epoch": 2.59, + "grad_norm": 3.5055183276236836, + "learning_rate": 9.87972605092471e-07, + "loss": 0.6224, + "step": 17390 + }, + { + "epoch": 2.59, + "grad_norm": 3.456624997204455, + "learning_rate": 9.878760042256946e-07, + "loss": 0.64, + "step": 17391 + }, + { + "epoch": 2.59, + "grad_norm": 2.916627990938017, + "learning_rate": 9.877794034720729e-07, + "loss": 0.64, + "step": 17392 + }, + { + "epoch": 2.59, + "grad_norm": 5.037138289209211, + "learning_rate": 9.87682802832507e-07, + "loss": 0.6471, + "step": 17393 + }, + { + "epoch": 2.59, + "grad_norm": 3.036984189930057, + "learning_rate": 9.875862023078982e-07, + "loss": 0.612, + "step": 17394 + }, + { + "epoch": 2.59, + "grad_norm": 2.906154543405576, + "learning_rate": 9.874896018991487e-07, + "loss": 0.6296, + "step": 17395 + }, + { + "epoch": 2.59, + "grad_norm": 5.065978527139109, + "learning_rate": 9.873930016071598e-07, + "loss": 0.6803, + "step": 17396 + }, + { + "epoch": 2.59, + "grad_norm": 3.3154755109171297, + "learning_rate": 9.87296401432833e-07, + "loss": 0.6784, + "step": 17397 + }, + { + "epoch": 2.59, + "grad_norm": 3.8065582831005633, + "learning_rate": 9.871998013770708e-07, + "loss": 0.6426, + "step": 17398 + }, + { + "epoch": 2.59, + "grad_norm": 2.9251757441934707, + "learning_rate": 9.871032014407732e-07, + "loss": 0.6621, + "step": 17399 + }, + { + "epoch": 2.6, + "grad_norm": 3.2272642614863267, + "learning_rate": 9.870066016248426e-07, + "loss": 0.6107, + "step": 17400 + }, + { + "epoch": 2.6, + "grad_norm": 3.069685598222167, + "learning_rate": 9.869100019301807e-07, + "loss": 0.6185, + "step": 17401 + }, + { + "epoch": 2.6, + "grad_norm": 3.1133565095705613, + "learning_rate": 9.868134023576888e-07, + "loss": 0.6081, + "step": 17402 + }, + { + "epoch": 2.6, + "grad_norm": 3.056725711198001, + "learning_rate": 9.867168029082685e-07, + "loss": 0.5872, + "step": 17403 + }, + { + "epoch": 2.6, + "grad_norm": 6.319940714477502, + "learning_rate": 9.866202035828217e-07, + "loss": 0.6589, + "step": 17404 + }, + { + "epoch": 2.6, + "grad_norm": 3.501406441013278, + "learning_rate": 9.8652360438225e-07, + "loss": 0.6813, + "step": 17405 + }, + { + "epoch": 2.6, + "grad_norm": 3.646011405849257, + "learning_rate": 9.864270053074543e-07, + "loss": 0.6914, + "step": 17406 + }, + { + "epoch": 2.6, + "grad_norm": 3.469524801048799, + "learning_rate": 9.863304063593363e-07, + "loss": 0.6159, + "step": 17407 + }, + { + "epoch": 2.6, + "grad_norm": 3.2844510600710883, + "learning_rate": 9.862338075387984e-07, + "loss": 0.7077, + "step": 17408 + }, + { + "epoch": 2.6, + "grad_norm": 4.1604759421577615, + "learning_rate": 9.861372088467412e-07, + "loss": 0.6243, + "step": 17409 + }, + { + "epoch": 2.6, + "grad_norm": 3.9032547669454143, + "learning_rate": 9.860406102840666e-07, + "loss": 0.6309, + "step": 17410 + }, + { + "epoch": 2.6, + "grad_norm": 6.747720990749747, + "learning_rate": 9.859440118516766e-07, + "loss": 0.6237, + "step": 17411 + }, + { + "epoch": 2.6, + "grad_norm": 4.939242946878792, + "learning_rate": 9.858474135504722e-07, + "loss": 0.6914, + "step": 17412 + }, + { + "epoch": 2.6, + "grad_norm": 3.1944593917429827, + "learning_rate": 9.857508153813551e-07, + "loss": 0.6504, + "step": 17413 + }, + { + "epoch": 2.6, + "grad_norm": 3.421183358838909, + "learning_rate": 9.85654217345227e-07, + "loss": 0.6237, + "step": 17414 + }, + { + "epoch": 2.6, + "grad_norm": 2.811727461132674, + "learning_rate": 9.855576194429892e-07, + "loss": 0.6243, + "step": 17415 + }, + { + "epoch": 2.6, + "grad_norm": 2.930557149096399, + "learning_rate": 9.854610216755434e-07, + "loss": 0.571, + "step": 17416 + }, + { + "epoch": 2.6, + "grad_norm": 3.336102320186337, + "learning_rate": 9.853644240437912e-07, + "loss": 0.6667, + "step": 17417 + }, + { + "epoch": 2.6, + "grad_norm": 2.573679562152484, + "learning_rate": 9.852678265486343e-07, + "loss": 0.6211, + "step": 17418 + }, + { + "epoch": 2.6, + "grad_norm": 2.992051105806668, + "learning_rate": 9.851712291909742e-07, + "loss": 0.6289, + "step": 17419 + }, + { + "epoch": 2.6, + "grad_norm": 3.4459706893323694, + "learning_rate": 9.850746319717118e-07, + "loss": 0.6536, + "step": 17420 + }, + { + "epoch": 2.6, + "grad_norm": 3.930301498059312, + "learning_rate": 9.849780348917493e-07, + "loss": 0.6725, + "step": 17421 + }, + { + "epoch": 2.6, + "grad_norm": 3.1620630005428367, + "learning_rate": 9.848814379519884e-07, + "loss": 0.679, + "step": 17422 + }, + { + "epoch": 2.6, + "grad_norm": 3.9183406048997633, + "learning_rate": 9.847848411533302e-07, + "loss": 0.6341, + "step": 17423 + }, + { + "epoch": 2.6, + "grad_norm": 3.8230394021390253, + "learning_rate": 9.846882444966766e-07, + "loss": 0.6615, + "step": 17424 + }, + { + "epoch": 2.6, + "grad_norm": 2.6413190488377736, + "learning_rate": 9.845916479829292e-07, + "loss": 0.6009, + "step": 17425 + }, + { + "epoch": 2.6, + "grad_norm": 3.362864444556243, + "learning_rate": 9.844950516129886e-07, + "loss": 0.6042, + "step": 17426 + }, + { + "epoch": 2.6, + "grad_norm": 3.9288565413567245, + "learning_rate": 9.843984553877578e-07, + "loss": 0.6445, + "step": 17427 + }, + { + "epoch": 2.6, + "grad_norm": 4.905145185780454, + "learning_rate": 9.84301859308137e-07, + "loss": 0.6107, + "step": 17428 + }, + { + "epoch": 2.6, + "grad_norm": 3.191674898384126, + "learning_rate": 9.842052633750285e-07, + "loss": 0.6003, + "step": 17429 + }, + { + "epoch": 2.6, + "grad_norm": 3.496751673574792, + "learning_rate": 9.841086675893336e-07, + "loss": 0.6146, + "step": 17430 + }, + { + "epoch": 2.6, + "grad_norm": 3.04797733868172, + "learning_rate": 9.840120719519544e-07, + "loss": 0.6406, + "step": 17431 + }, + { + "epoch": 2.6, + "grad_norm": 5.161502511428071, + "learning_rate": 9.839154764637914e-07, + "loss": 0.6328, + "step": 17432 + }, + { + "epoch": 2.6, + "grad_norm": 3.535112045212668, + "learning_rate": 9.83818881125747e-07, + "loss": 0.6341, + "step": 17433 + }, + { + "epoch": 2.6, + "grad_norm": 5.18815066014578, + "learning_rate": 9.83722285938722e-07, + "loss": 0.6673, + "step": 17434 + }, + { + "epoch": 2.6, + "grad_norm": 4.524736672063215, + "learning_rate": 9.836256909036188e-07, + "loss": 0.681, + "step": 17435 + }, + { + "epoch": 2.6, + "grad_norm": 3.4155074443210705, + "learning_rate": 9.835290960213381e-07, + "loss": 0.6497, + "step": 17436 + }, + { + "epoch": 2.6, + "grad_norm": 4.681510995882524, + "learning_rate": 9.834325012927823e-07, + "loss": 0.5833, + "step": 17437 + }, + { + "epoch": 2.6, + "grad_norm": 5.3109167193099225, + "learning_rate": 9.833359067188523e-07, + "loss": 0.6432, + "step": 17438 + }, + { + "epoch": 2.6, + "grad_norm": 3.4167227745592115, + "learning_rate": 9.832393123004494e-07, + "loss": 0.625, + "step": 17439 + }, + { + "epoch": 2.6, + "grad_norm": 3.3446655856888716, + "learning_rate": 9.831427180384757e-07, + "loss": 0.6289, + "step": 17440 + }, + { + "epoch": 2.6, + "grad_norm": 3.5335080999615154, + "learning_rate": 9.830461239338325e-07, + "loss": 0.6449, + "step": 17441 + }, + { + "epoch": 2.6, + "grad_norm": 3.453962231765395, + "learning_rate": 9.829495299874212e-07, + "loss": 0.6335, + "step": 17442 + }, + { + "epoch": 2.6, + "grad_norm": 3.795892134040347, + "learning_rate": 9.828529362001436e-07, + "loss": 0.6367, + "step": 17443 + }, + { + "epoch": 2.6, + "grad_norm": 4.314786260033133, + "learning_rate": 9.827563425729013e-07, + "loss": 0.6406, + "step": 17444 + }, + { + "epoch": 2.6, + "grad_norm": 3.4647075313544984, + "learning_rate": 9.826597491065952e-07, + "loss": 0.6113, + "step": 17445 + }, + { + "epoch": 2.6, + "grad_norm": 3.8900313384525442, + "learning_rate": 9.825631558021273e-07, + "loss": 0.6341, + "step": 17446 + }, + { + "epoch": 2.6, + "grad_norm": 4.974207542059112, + "learning_rate": 9.824665626603987e-07, + "loss": 0.6725, + "step": 17447 + }, + { + "epoch": 2.6, + "grad_norm": 3.429354781923542, + "learning_rate": 9.823699696823116e-07, + "loss": 0.6628, + "step": 17448 + }, + { + "epoch": 2.6, + "grad_norm": 3.412236019985001, + "learning_rate": 9.82273376868767e-07, + "loss": 0.6048, + "step": 17449 + }, + { + "epoch": 2.6, + "grad_norm": 6.059029600999812, + "learning_rate": 9.821767842206666e-07, + "loss": 0.6341, + "step": 17450 + }, + { + "epoch": 2.6, + "grad_norm": 4.408834575747848, + "learning_rate": 9.820801917389123e-07, + "loss": 0.7012, + "step": 17451 + }, + { + "epoch": 2.6, + "grad_norm": 3.6188322413747254, + "learning_rate": 9.819835994244044e-07, + "loss": 0.6549, + "step": 17452 + }, + { + "epoch": 2.6, + "grad_norm": 4.530730342935476, + "learning_rate": 9.818870072780454e-07, + "loss": 0.6562, + "step": 17453 + }, + { + "epoch": 2.6, + "grad_norm": 3.148884704531288, + "learning_rate": 9.817904153007366e-07, + "loss": 0.6276, + "step": 17454 + }, + { + "epoch": 2.6, + "grad_norm": 3.6615171548237018, + "learning_rate": 9.816938234933793e-07, + "loss": 0.6693, + "step": 17455 + }, + { + "epoch": 2.6, + "grad_norm": 3.5499695795889363, + "learning_rate": 9.815972318568755e-07, + "loss": 0.6686, + "step": 17456 + }, + { + "epoch": 2.6, + "grad_norm": 3.5903768106029994, + "learning_rate": 9.81500640392126e-07, + "loss": 0.6165, + "step": 17457 + }, + { + "epoch": 2.6, + "grad_norm": 4.995821197228488, + "learning_rate": 9.814040491000331e-07, + "loss": 0.6302, + "step": 17458 + }, + { + "epoch": 2.6, + "grad_norm": 3.375074766500156, + "learning_rate": 9.813074579814978e-07, + "loss": 0.6673, + "step": 17459 + }, + { + "epoch": 2.6, + "grad_norm": 4.338942793661134, + "learning_rate": 9.812108670374212e-07, + "loss": 0.5788, + "step": 17460 + }, + { + "epoch": 2.6, + "grad_norm": 3.0260062241929595, + "learning_rate": 9.811142762687057e-07, + "loss": 0.6094, + "step": 17461 + }, + { + "epoch": 2.6, + "grad_norm": 4.136933443975528, + "learning_rate": 9.81017685676252e-07, + "loss": 0.6169, + "step": 17462 + }, + { + "epoch": 2.6, + "grad_norm": 3.7580479396573527, + "learning_rate": 9.80921095260962e-07, + "loss": 0.6432, + "step": 17463 + }, + { + "epoch": 2.6, + "grad_norm": 3.4768804185083213, + "learning_rate": 9.808245050237375e-07, + "loss": 0.6393, + "step": 17464 + }, + { + "epoch": 2.6, + "grad_norm": 3.3115228169305424, + "learning_rate": 9.807279149654793e-07, + "loss": 0.6009, + "step": 17465 + }, + { + "epoch": 2.6, + "grad_norm": 4.426736309278423, + "learning_rate": 9.80631325087089e-07, + "loss": 0.6667, + "step": 17466 + }, + { + "epoch": 2.61, + "grad_norm": 3.8870452241539235, + "learning_rate": 9.805347353894684e-07, + "loss": 0.5872, + "step": 17467 + }, + { + "epoch": 2.61, + "grad_norm": 2.8897924760772677, + "learning_rate": 9.804381458735186e-07, + "loss": 0.6042, + "step": 17468 + }, + { + "epoch": 2.61, + "grad_norm": 3.046647819875917, + "learning_rate": 9.803415565401416e-07, + "loss": 0.6536, + "step": 17469 + }, + { + "epoch": 2.61, + "grad_norm": 3.23249185623194, + "learning_rate": 9.802449673902384e-07, + "loss": 0.666, + "step": 17470 + }, + { + "epoch": 2.61, + "grad_norm": 3.359449250217625, + "learning_rate": 9.80148378424711e-07, + "loss": 0.6582, + "step": 17471 + }, + { + "epoch": 2.61, + "grad_norm": 4.452055045636468, + "learning_rate": 9.800517896444604e-07, + "loss": 0.6126, + "step": 17472 + }, + { + "epoch": 2.61, + "grad_norm": 4.410774990895538, + "learning_rate": 9.79955201050388e-07, + "loss": 0.6413, + "step": 17473 + }, + { + "epoch": 2.61, + "grad_norm": 5.442921550199329, + "learning_rate": 9.798586126433958e-07, + "loss": 0.7051, + "step": 17474 + }, + { + "epoch": 2.61, + "grad_norm": 2.8228060815215996, + "learning_rate": 9.797620244243847e-07, + "loss": 0.638, + "step": 17475 + }, + { + "epoch": 2.61, + "grad_norm": 3.7802573618408366, + "learning_rate": 9.796654363942562e-07, + "loss": 0.6107, + "step": 17476 + }, + { + "epoch": 2.61, + "grad_norm": 3.488928971299095, + "learning_rate": 9.795688485539127e-07, + "loss": 0.6227, + "step": 17477 + }, + { + "epoch": 2.61, + "grad_norm": 6.136929266456842, + "learning_rate": 9.794722609042545e-07, + "loss": 0.6126, + "step": 17478 + }, + { + "epoch": 2.61, + "grad_norm": 3.950814181594509, + "learning_rate": 9.793756734461833e-07, + "loss": 0.6348, + "step": 17479 + }, + { + "epoch": 2.61, + "grad_norm": 3.454600600253988, + "learning_rate": 9.79279086180601e-07, + "loss": 0.6536, + "step": 17480 + }, + { + "epoch": 2.61, + "grad_norm": 5.173632657989819, + "learning_rate": 9.791824991084088e-07, + "loss": 0.6107, + "step": 17481 + }, + { + "epoch": 2.61, + "grad_norm": 5.569468477573112, + "learning_rate": 9.790859122305082e-07, + "loss": 0.6465, + "step": 17482 + }, + { + "epoch": 2.61, + "grad_norm": 3.2539369848606636, + "learning_rate": 9.789893255478006e-07, + "loss": 0.6354, + "step": 17483 + }, + { + "epoch": 2.61, + "grad_norm": 6.6919652161257295, + "learning_rate": 9.788927390611876e-07, + "loss": 0.6654, + "step": 17484 + }, + { + "epoch": 2.61, + "grad_norm": 4.103323014150516, + "learning_rate": 9.787961527715706e-07, + "loss": 0.6296, + "step": 17485 + }, + { + "epoch": 2.61, + "grad_norm": 3.7174708403501775, + "learning_rate": 9.786995666798506e-07, + "loss": 0.5967, + "step": 17486 + }, + { + "epoch": 2.61, + "grad_norm": 4.936106147355737, + "learning_rate": 9.786029807869294e-07, + "loss": 0.6042, + "step": 17487 + }, + { + "epoch": 2.61, + "grad_norm": 3.412365351758187, + "learning_rate": 9.78506395093709e-07, + "loss": 0.6445, + "step": 17488 + }, + { + "epoch": 2.61, + "grad_norm": 4.332410241920782, + "learning_rate": 9.784098096010897e-07, + "loss": 0.5723, + "step": 17489 + }, + { + "epoch": 2.61, + "grad_norm": 4.673028478768332, + "learning_rate": 9.78313224309974e-07, + "loss": 0.6152, + "step": 17490 + }, + { + "epoch": 2.61, + "grad_norm": 3.8946880645101944, + "learning_rate": 9.782166392212631e-07, + "loss": 0.6354, + "step": 17491 + }, + { + "epoch": 2.61, + "grad_norm": 3.8525945309320138, + "learning_rate": 9.781200543358578e-07, + "loss": 0.6432, + "step": 17492 + }, + { + "epoch": 2.61, + "grad_norm": 3.451557408655703, + "learning_rate": 9.7802346965466e-07, + "loss": 0.6061, + "step": 17493 + }, + { + "epoch": 2.61, + "grad_norm": 4.089037225593564, + "learning_rate": 9.77926885178571e-07, + "loss": 0.6758, + "step": 17494 + }, + { + "epoch": 2.61, + "grad_norm": 3.61726164314525, + "learning_rate": 9.778303009084928e-07, + "loss": 0.6536, + "step": 17495 + }, + { + "epoch": 2.61, + "grad_norm": 3.776572708877954, + "learning_rate": 9.777337168453258e-07, + "loss": 0.6276, + "step": 17496 + }, + { + "epoch": 2.61, + "grad_norm": 3.98601963204349, + "learning_rate": 9.776371329899727e-07, + "loss": 0.6038, + "step": 17497 + }, + { + "epoch": 2.61, + "grad_norm": 3.8613197974757822, + "learning_rate": 9.775405493433336e-07, + "loss": 0.7207, + "step": 17498 + }, + { + "epoch": 2.61, + "grad_norm": 4.194452760184603, + "learning_rate": 9.774439659063107e-07, + "loss": 0.6517, + "step": 17499 + }, + { + "epoch": 2.61, + "grad_norm": 3.652929186472461, + "learning_rate": 9.77347382679805e-07, + "loss": 0.6533, + "step": 17500 + }, + { + "epoch": 2.61, + "grad_norm": 3.7851048496035746, + "learning_rate": 9.772507996647187e-07, + "loss": 0.6699, + "step": 17501 + }, + { + "epoch": 2.61, + "grad_norm": 3.185015315210823, + "learning_rate": 9.771542168619522e-07, + "loss": 0.612, + "step": 17502 + }, + { + "epoch": 2.61, + "grad_norm": 5.76216690234867, + "learning_rate": 9.77057634272408e-07, + "loss": 0.6693, + "step": 17503 + }, + { + "epoch": 2.61, + "grad_norm": 3.6425544901699687, + "learning_rate": 9.769610518969867e-07, + "loss": 0.612, + "step": 17504 + }, + { + "epoch": 2.61, + "grad_norm": 3.124103032896323, + "learning_rate": 9.768644697365897e-07, + "loss": 0.6328, + "step": 17505 + }, + { + "epoch": 2.61, + "grad_norm": 4.015883861898645, + "learning_rate": 9.767678877921187e-07, + "loss": 0.5951, + "step": 17506 + }, + { + "epoch": 2.61, + "grad_norm": 7.578040756275895, + "learning_rate": 9.76671306064475e-07, + "loss": 0.6074, + "step": 17507 + }, + { + "epoch": 2.61, + "grad_norm": 3.43595895075806, + "learning_rate": 9.765747245545602e-07, + "loss": 0.6608, + "step": 17508 + }, + { + "epoch": 2.61, + "grad_norm": 3.3196087538798142, + "learning_rate": 9.764781432632757e-07, + "loss": 0.6289, + "step": 17509 + }, + { + "epoch": 2.61, + "grad_norm": 2.9999314956254066, + "learning_rate": 9.763815621915225e-07, + "loss": 0.6673, + "step": 17510 + }, + { + "epoch": 2.61, + "grad_norm": 4.240930444252206, + "learning_rate": 9.762849813402027e-07, + "loss": 0.6699, + "step": 17511 + }, + { + "epoch": 2.61, + "grad_norm": 3.1047737568740477, + "learning_rate": 9.76188400710217e-07, + "loss": 0.6328, + "step": 17512 + }, + { + "epoch": 2.61, + "grad_norm": 3.2425767166445034, + "learning_rate": 9.76091820302467e-07, + "loss": 0.6471, + "step": 17513 + }, + { + "epoch": 2.61, + "grad_norm": 3.7825560525025104, + "learning_rate": 9.759952401178542e-07, + "loss": 0.6296, + "step": 17514 + }, + { + "epoch": 2.61, + "grad_norm": 3.3424217832116327, + "learning_rate": 9.758986601572799e-07, + "loss": 0.6328, + "step": 17515 + }, + { + "epoch": 2.61, + "grad_norm": 4.386683678454218, + "learning_rate": 9.758020804216456e-07, + "loss": 0.6315, + "step": 17516 + }, + { + "epoch": 2.61, + "grad_norm": 2.892719224859221, + "learning_rate": 9.757055009118529e-07, + "loss": 0.5898, + "step": 17517 + }, + { + "epoch": 2.61, + "grad_norm": 2.9417648412610937, + "learning_rate": 9.756089216288024e-07, + "loss": 0.6341, + "step": 17518 + }, + { + "epoch": 2.61, + "grad_norm": 3.5621419684391915, + "learning_rate": 9.75512342573396e-07, + "loss": 0.6549, + "step": 17519 + }, + { + "epoch": 2.61, + "grad_norm": 3.368351286481359, + "learning_rate": 9.754157637465354e-07, + "loss": 0.6836, + "step": 17520 + }, + { + "epoch": 2.61, + "grad_norm": 3.4592220685943484, + "learning_rate": 9.753191851491215e-07, + "loss": 0.6393, + "step": 17521 + }, + { + "epoch": 2.61, + "grad_norm": 3.3845155478031215, + "learning_rate": 9.752226067820558e-07, + "loss": 0.6608, + "step": 17522 + }, + { + "epoch": 2.61, + "grad_norm": 4.874136260139411, + "learning_rate": 9.751260286462397e-07, + "loss": 0.6445, + "step": 17523 + }, + { + "epoch": 2.61, + "grad_norm": 4.771340903753417, + "learning_rate": 9.75029450742575e-07, + "loss": 0.6549, + "step": 17524 + }, + { + "epoch": 2.61, + "grad_norm": 7.3312549939969855, + "learning_rate": 9.749328730719623e-07, + "loss": 0.6211, + "step": 17525 + }, + { + "epoch": 2.61, + "grad_norm": 3.634258636449962, + "learning_rate": 9.74836295635303e-07, + "loss": 0.6367, + "step": 17526 + }, + { + "epoch": 2.61, + "grad_norm": 2.9821629066669395, + "learning_rate": 9.747397184334992e-07, + "loss": 0.5983, + "step": 17527 + }, + { + "epoch": 2.61, + "grad_norm": 3.5310493819656363, + "learning_rate": 9.746431414674517e-07, + "loss": 0.6406, + "step": 17528 + }, + { + "epoch": 2.61, + "grad_norm": 7.420864831907942, + "learning_rate": 9.745465647380618e-07, + "loss": 0.6784, + "step": 17529 + }, + { + "epoch": 2.61, + "grad_norm": 3.7535534183379085, + "learning_rate": 9.744499882462316e-07, + "loss": 0.5853, + "step": 17530 + }, + { + "epoch": 2.61, + "grad_norm": 3.234881813846411, + "learning_rate": 9.743534119928616e-07, + "loss": 0.6354, + "step": 17531 + }, + { + "epoch": 2.61, + "grad_norm": 4.342361678753801, + "learning_rate": 9.742568359788532e-07, + "loss": 0.6862, + "step": 17532 + }, + { + "epoch": 2.61, + "grad_norm": 4.304423624746413, + "learning_rate": 9.741602602051084e-07, + "loss": 0.6152, + "step": 17533 + }, + { + "epoch": 2.62, + "grad_norm": 4.258456328262849, + "learning_rate": 9.74063684672528e-07, + "loss": 0.6641, + "step": 17534 + }, + { + "epoch": 2.62, + "grad_norm": 4.177200819377178, + "learning_rate": 9.739671093820137e-07, + "loss": 0.623, + "step": 17535 + }, + { + "epoch": 2.62, + "grad_norm": 5.8901584180917546, + "learning_rate": 9.738705343344664e-07, + "loss": 0.6387, + "step": 17536 + }, + { + "epoch": 2.62, + "grad_norm": 3.299375413135025, + "learning_rate": 9.73773959530788e-07, + "loss": 0.6829, + "step": 17537 + }, + { + "epoch": 2.62, + "grad_norm": 3.070332685104839, + "learning_rate": 9.736773849718797e-07, + "loss": 0.6309, + "step": 17538 + }, + { + "epoch": 2.62, + "grad_norm": 3.485822137436113, + "learning_rate": 9.735808106586424e-07, + "loss": 0.6296, + "step": 17539 + }, + { + "epoch": 2.62, + "grad_norm": 3.14101824701917, + "learning_rate": 9.734842365919776e-07, + "loss": 0.6478, + "step": 17540 + }, + { + "epoch": 2.62, + "grad_norm": 2.9897958972784866, + "learning_rate": 9.73387662772787e-07, + "loss": 0.6361, + "step": 17541 + }, + { + "epoch": 2.62, + "grad_norm": 3.0913743915855707, + "learning_rate": 9.732910892019714e-07, + "loss": 0.6263, + "step": 17542 + }, + { + "epoch": 2.62, + "grad_norm": 4.647058756272464, + "learning_rate": 9.731945158804327e-07, + "loss": 0.6771, + "step": 17543 + }, + { + "epoch": 2.62, + "grad_norm": 2.878671287079614, + "learning_rate": 9.73097942809072e-07, + "loss": 0.6562, + "step": 17544 + }, + { + "epoch": 2.62, + "grad_norm": 2.8984629904643255, + "learning_rate": 9.730013699887905e-07, + "loss": 0.623, + "step": 17545 + }, + { + "epoch": 2.62, + "grad_norm": 3.4585240324672903, + "learning_rate": 9.729047974204898e-07, + "loss": 0.6097, + "step": 17546 + }, + { + "epoch": 2.62, + "grad_norm": 2.9890698925644648, + "learning_rate": 9.728082251050705e-07, + "loss": 0.6312, + "step": 17547 + }, + { + "epoch": 2.62, + "grad_norm": 4.065144338453511, + "learning_rate": 9.727116530434348e-07, + "loss": 0.6348, + "step": 17548 + }, + { + "epoch": 2.62, + "grad_norm": 3.8457048614587994, + "learning_rate": 9.726150812364835e-07, + "loss": 0.6074, + "step": 17549 + }, + { + "epoch": 2.62, + "grad_norm": 3.2849807570022653, + "learning_rate": 9.725185096851185e-07, + "loss": 0.6686, + "step": 17550 + }, + { + "epoch": 2.62, + "grad_norm": 2.939533720890301, + "learning_rate": 9.724219383902403e-07, + "loss": 0.6217, + "step": 17551 + }, + { + "epoch": 2.62, + "grad_norm": 4.121447765002271, + "learning_rate": 9.723253673527505e-07, + "loss": 0.6999, + "step": 17552 + }, + { + "epoch": 2.62, + "grad_norm": 2.7354319195029246, + "learning_rate": 9.722287965735504e-07, + "loss": 0.6074, + "step": 17553 + }, + { + "epoch": 2.62, + "grad_norm": 2.8905572645176716, + "learning_rate": 9.721322260535418e-07, + "loss": 0.6471, + "step": 17554 + }, + { + "epoch": 2.62, + "grad_norm": 3.3641939126086986, + "learning_rate": 9.720356557936254e-07, + "loss": 0.6449, + "step": 17555 + }, + { + "epoch": 2.62, + "grad_norm": 3.9780706495869333, + "learning_rate": 9.719390857947028e-07, + "loss": 0.6178, + "step": 17556 + }, + { + "epoch": 2.62, + "grad_norm": 3.794671023181731, + "learning_rate": 9.718425160576753e-07, + "loss": 0.6185, + "step": 17557 + }, + { + "epoch": 2.62, + "grad_norm": 4.18337956061762, + "learning_rate": 9.717459465834439e-07, + "loss": 0.6387, + "step": 17558 + }, + { + "epoch": 2.62, + "grad_norm": 4.454401670935251, + "learning_rate": 9.7164937737291e-07, + "loss": 0.6732, + "step": 17559 + }, + { + "epoch": 2.62, + "grad_norm": 3.1948680049778844, + "learning_rate": 9.71552808426975e-07, + "loss": 0.623, + "step": 17560 + }, + { + "epoch": 2.62, + "grad_norm": 3.8479177751303086, + "learning_rate": 9.714562397465402e-07, + "loss": 0.6094, + "step": 17561 + }, + { + "epoch": 2.62, + "grad_norm": 5.049574617303882, + "learning_rate": 9.713596713325067e-07, + "loss": 0.6602, + "step": 17562 + }, + { + "epoch": 2.62, + "grad_norm": 6.570112066087917, + "learning_rate": 9.712631031857764e-07, + "loss": 0.6348, + "step": 17563 + }, + { + "epoch": 2.62, + "grad_norm": 5.080158020765884, + "learning_rate": 9.711665353072495e-07, + "loss": 0.6217, + "step": 17564 + }, + { + "epoch": 2.62, + "grad_norm": 3.1795189224467646, + "learning_rate": 9.710699676978284e-07, + "loss": 0.6497, + "step": 17565 + }, + { + "epoch": 2.62, + "grad_norm": 3.7532886380725117, + "learning_rate": 9.709734003584133e-07, + "loss": 0.5885, + "step": 17566 + }, + { + "epoch": 2.62, + "grad_norm": 4.123207873329007, + "learning_rate": 9.708768332899064e-07, + "loss": 0.6224, + "step": 17567 + }, + { + "epoch": 2.62, + "grad_norm": 4.001413380685775, + "learning_rate": 9.707802664932085e-07, + "loss": 0.6829, + "step": 17568 + }, + { + "epoch": 2.62, + "grad_norm": 5.021877181999723, + "learning_rate": 9.70683699969221e-07, + "loss": 0.6243, + "step": 17569 + }, + { + "epoch": 2.62, + "grad_norm": 4.1942754165817036, + "learning_rate": 9.705871337188452e-07, + "loss": 0.5612, + "step": 17570 + }, + { + "epoch": 2.62, + "grad_norm": 3.2304826142743486, + "learning_rate": 9.704905677429822e-07, + "loss": 0.5869, + "step": 17571 + }, + { + "epoch": 2.62, + "grad_norm": 4.074432899410848, + "learning_rate": 9.703940020425332e-07, + "loss": 0.6439, + "step": 17572 + }, + { + "epoch": 2.62, + "grad_norm": 3.4742271641976705, + "learning_rate": 9.702974366183996e-07, + "loss": 0.6784, + "step": 17573 + }, + { + "epoch": 2.62, + "grad_norm": 4.147265738820831, + "learning_rate": 9.702008714714826e-07, + "loss": 0.6224, + "step": 17574 + }, + { + "epoch": 2.62, + "grad_norm": 3.8292319258288785, + "learning_rate": 9.701043066026836e-07, + "loss": 0.6374, + "step": 17575 + }, + { + "epoch": 2.62, + "grad_norm": 3.106026934611492, + "learning_rate": 9.700077420129039e-07, + "loss": 0.6243, + "step": 17576 + }, + { + "epoch": 2.62, + "grad_norm": 5.537721386693679, + "learning_rate": 9.699111777030448e-07, + "loss": 0.6139, + "step": 17577 + }, + { + "epoch": 2.62, + "grad_norm": 3.4791577941632137, + "learning_rate": 9.698146136740072e-07, + "loss": 0.6562, + "step": 17578 + }, + { + "epoch": 2.62, + "grad_norm": 3.1266944855609586, + "learning_rate": 9.69718049926692e-07, + "loss": 0.6243, + "step": 17579 + }, + { + "epoch": 2.62, + "grad_norm": 3.947561635418824, + "learning_rate": 9.696214864620012e-07, + "loss": 0.6862, + "step": 17580 + }, + { + "epoch": 2.62, + "grad_norm": 4.634245656497065, + "learning_rate": 9.695249232808358e-07, + "loss": 0.6621, + "step": 17581 + }, + { + "epoch": 2.62, + "grad_norm": 3.5194405195995713, + "learning_rate": 9.69428360384097e-07, + "loss": 0.612, + "step": 17582 + }, + { + "epoch": 2.62, + "grad_norm": 6.458677002213588, + "learning_rate": 9.693317977726863e-07, + "loss": 0.6641, + "step": 17583 + }, + { + "epoch": 2.62, + "grad_norm": 5.1772197214735645, + "learning_rate": 9.692352354475042e-07, + "loss": 0.6628, + "step": 17584 + }, + { + "epoch": 2.62, + "grad_norm": 3.5781154341706722, + "learning_rate": 9.691386734094523e-07, + "loss": 0.6602, + "step": 17585 + }, + { + "epoch": 2.62, + "grad_norm": 3.5687277949070406, + "learning_rate": 9.690421116594322e-07, + "loss": 0.6582, + "step": 17586 + }, + { + "epoch": 2.62, + "grad_norm": 4.071250940545849, + "learning_rate": 9.689455501983446e-07, + "loss": 0.6647, + "step": 17587 + }, + { + "epoch": 2.62, + "grad_norm": 3.027411510735821, + "learning_rate": 9.688489890270912e-07, + "loss": 0.6009, + "step": 17588 + }, + { + "epoch": 2.62, + "grad_norm": 7.420333039892992, + "learning_rate": 9.687524281465724e-07, + "loss": 0.6302, + "step": 17589 + }, + { + "epoch": 2.62, + "grad_norm": 3.7096818693639184, + "learning_rate": 9.686558675576908e-07, + "loss": 0.6191, + "step": 17590 + }, + { + "epoch": 2.62, + "grad_norm": 3.305961134533792, + "learning_rate": 9.685593072613463e-07, + "loss": 0.6022, + "step": 17591 + }, + { + "epoch": 2.62, + "grad_norm": 2.8966955018129044, + "learning_rate": 9.684627472584403e-07, + "loss": 0.6582, + "step": 17592 + }, + { + "epoch": 2.62, + "grad_norm": 4.498298324541048, + "learning_rate": 9.683661875498747e-07, + "loss": 0.6699, + "step": 17593 + }, + { + "epoch": 2.62, + "grad_norm": 4.6024784893867725, + "learning_rate": 9.682696281365498e-07, + "loss": 0.6608, + "step": 17594 + }, + { + "epoch": 2.62, + "grad_norm": 5.075145772533956, + "learning_rate": 9.681730690193675e-07, + "loss": 0.6654, + "step": 17595 + }, + { + "epoch": 2.62, + "grad_norm": 2.9372989694666587, + "learning_rate": 9.680765101992294e-07, + "loss": 0.5983, + "step": 17596 + }, + { + "epoch": 2.62, + "grad_norm": 3.6105850567763236, + "learning_rate": 9.679799516770352e-07, + "loss": 0.6361, + "step": 17597 + }, + { + "epoch": 2.62, + "grad_norm": 5.6232439678590564, + "learning_rate": 9.67883393453687e-07, + "loss": 0.6247, + "step": 17598 + }, + { + "epoch": 2.62, + "grad_norm": 3.830865742917568, + "learning_rate": 9.677868355300864e-07, + "loss": 0.6087, + "step": 17599 + }, + { + "epoch": 2.62, + "grad_norm": 4.325183982289669, + "learning_rate": 9.676902779071336e-07, + "loss": 0.6022, + "step": 17600 + }, + { + "epoch": 2.63, + "grad_norm": 3.0095322591069866, + "learning_rate": 9.675937205857306e-07, + "loss": 0.5996, + "step": 17601 + }, + { + "epoch": 2.63, + "grad_norm": 2.7810406374048697, + "learning_rate": 9.674971635667782e-07, + "loss": 0.653, + "step": 17602 + }, + { + "epoch": 2.63, + "grad_norm": 4.148379923616934, + "learning_rate": 9.674006068511778e-07, + "loss": 0.625, + "step": 17603 + }, + { + "epoch": 2.63, + "grad_norm": 3.6602236398769543, + "learning_rate": 9.673040504398304e-07, + "loss": 0.6159, + "step": 17604 + }, + { + "epoch": 2.63, + "grad_norm": 3.026646714204084, + "learning_rate": 9.672074943336369e-07, + "loss": 0.6439, + "step": 17605 + }, + { + "epoch": 2.63, + "grad_norm": 4.337397427883007, + "learning_rate": 9.67110938533499e-07, + "loss": 0.6263, + "step": 17606 + }, + { + "epoch": 2.63, + "grad_norm": 3.9757081493013757, + "learning_rate": 9.670143830403175e-07, + "loss": 0.6061, + "step": 17607 + }, + { + "epoch": 2.63, + "grad_norm": 3.78089794237827, + "learning_rate": 9.669178278549936e-07, + "loss": 0.6048, + "step": 17608 + }, + { + "epoch": 2.63, + "grad_norm": 4.158642810363513, + "learning_rate": 9.668212729784286e-07, + "loss": 0.681, + "step": 17609 + }, + { + "epoch": 2.63, + "grad_norm": 3.2269177855312754, + "learning_rate": 9.66724718411524e-07, + "loss": 0.6628, + "step": 17610 + }, + { + "epoch": 2.63, + "grad_norm": 4.142447232723732, + "learning_rate": 9.666281641551801e-07, + "loss": 0.6523, + "step": 17611 + }, + { + "epoch": 2.63, + "grad_norm": 2.8848150317220407, + "learning_rate": 9.665316102102987e-07, + "loss": 0.5742, + "step": 17612 + }, + { + "epoch": 2.63, + "grad_norm": 6.089067915333971, + "learning_rate": 9.664350565777805e-07, + "loss": 0.5898, + "step": 17613 + }, + { + "epoch": 2.63, + "grad_norm": 3.1636488292589733, + "learning_rate": 9.663385032585271e-07, + "loss": 0.6126, + "step": 17614 + }, + { + "epoch": 2.63, + "grad_norm": 4.465127194756448, + "learning_rate": 9.662419502534391e-07, + "loss": 0.6764, + "step": 17615 + }, + { + "epoch": 2.63, + "grad_norm": 3.759702962220651, + "learning_rate": 9.661453975634186e-07, + "loss": 0.5905, + "step": 17616 + }, + { + "epoch": 2.63, + "grad_norm": 4.600304498379421, + "learning_rate": 9.660488451893659e-07, + "loss": 0.6439, + "step": 17617 + }, + { + "epoch": 2.63, + "grad_norm": 4.286387786545267, + "learning_rate": 9.659522931321818e-07, + "loss": 0.6986, + "step": 17618 + }, + { + "epoch": 2.63, + "grad_norm": 6.807232660475792, + "learning_rate": 9.658557413927683e-07, + "loss": 0.6549, + "step": 17619 + }, + { + "epoch": 2.63, + "grad_norm": 3.0369580497876356, + "learning_rate": 9.657591899720262e-07, + "loss": 0.6159, + "step": 17620 + }, + { + "epoch": 2.63, + "grad_norm": 4.393719743060412, + "learning_rate": 9.656626388708565e-07, + "loss": 0.681, + "step": 17621 + }, + { + "epoch": 2.63, + "grad_norm": 4.352345341379561, + "learning_rate": 9.655660880901604e-07, + "loss": 0.6517, + "step": 17622 + }, + { + "epoch": 2.63, + "grad_norm": 3.117841661194097, + "learning_rate": 9.654695376308397e-07, + "loss": 0.6204, + "step": 17623 + }, + { + "epoch": 2.63, + "grad_norm": 4.743244078034846, + "learning_rate": 9.65372987493794e-07, + "loss": 0.6445, + "step": 17624 + }, + { + "epoch": 2.63, + "grad_norm": 3.249499502652806, + "learning_rate": 9.652764376799257e-07, + "loss": 0.6152, + "step": 17625 + }, + { + "epoch": 2.63, + "grad_norm": 4.267725972519825, + "learning_rate": 9.651798881901354e-07, + "loss": 0.6628, + "step": 17626 + }, + { + "epoch": 2.63, + "grad_norm": 3.2900519057589195, + "learning_rate": 9.650833390253243e-07, + "loss": 0.6517, + "step": 17627 + }, + { + "epoch": 2.63, + "grad_norm": 3.2181327990027024, + "learning_rate": 9.649867901863932e-07, + "loss": 0.6172, + "step": 17628 + }, + { + "epoch": 2.63, + "grad_norm": 3.1272524161459163, + "learning_rate": 9.64890241674244e-07, + "loss": 0.623, + "step": 17629 + }, + { + "epoch": 2.63, + "grad_norm": 4.499546474639508, + "learning_rate": 9.647936934897768e-07, + "loss": 0.6732, + "step": 17630 + }, + { + "epoch": 2.63, + "grad_norm": 4.375379021216879, + "learning_rate": 9.646971456338936e-07, + "loss": 0.6328, + "step": 17631 + }, + { + "epoch": 2.63, + "grad_norm": 3.882268852678586, + "learning_rate": 9.646005981074945e-07, + "loss": 0.6478, + "step": 17632 + }, + { + "epoch": 2.63, + "grad_norm": 2.9666417758282004, + "learning_rate": 9.645040509114817e-07, + "loss": 0.5996, + "step": 17633 + }, + { + "epoch": 2.63, + "grad_norm": 4.626683303648528, + "learning_rate": 9.644075040467555e-07, + "loss": 0.6387, + "step": 17634 + }, + { + "epoch": 2.63, + "grad_norm": 4.229233194157166, + "learning_rate": 9.643109575142173e-07, + "loss": 0.6432, + "step": 17635 + }, + { + "epoch": 2.63, + "grad_norm": 3.962659952247064, + "learning_rate": 9.642144113147681e-07, + "loss": 0.653, + "step": 17636 + }, + { + "epoch": 2.63, + "grad_norm": 3.638435434212089, + "learning_rate": 9.641178654493088e-07, + "loss": 0.6556, + "step": 17637 + }, + { + "epoch": 2.63, + "grad_norm": 3.195201942921678, + "learning_rate": 9.640213199187408e-07, + "loss": 0.6478, + "step": 17638 + }, + { + "epoch": 2.63, + "grad_norm": 4.095100188100678, + "learning_rate": 9.639247747239649e-07, + "loss": 0.6562, + "step": 17639 + }, + { + "epoch": 2.63, + "grad_norm": 3.3556807937381357, + "learning_rate": 9.638282298658822e-07, + "loss": 0.7064, + "step": 17640 + }, + { + "epoch": 2.63, + "grad_norm": 3.6214648298235192, + "learning_rate": 9.637316853453938e-07, + "loss": 0.6335, + "step": 17641 + }, + { + "epoch": 2.63, + "grad_norm": 4.313779631851879, + "learning_rate": 9.636351411634008e-07, + "loss": 0.6178, + "step": 17642 + }, + { + "epoch": 2.63, + "grad_norm": 3.918935104568814, + "learning_rate": 9.635385973208047e-07, + "loss": 0.7135, + "step": 17643 + }, + { + "epoch": 2.63, + "grad_norm": 3.0606800663716776, + "learning_rate": 9.634420538185059e-07, + "loss": 0.6204, + "step": 17644 + }, + { + "epoch": 2.63, + "grad_norm": 2.951903203264599, + "learning_rate": 9.633455106574054e-07, + "loss": 0.6302, + "step": 17645 + }, + { + "epoch": 2.63, + "grad_norm": 4.093785371479615, + "learning_rate": 9.632489678384044e-07, + "loss": 0.6178, + "step": 17646 + }, + { + "epoch": 2.63, + "grad_norm": 4.003741744258961, + "learning_rate": 9.631524253624042e-07, + "loss": 0.6276, + "step": 17647 + }, + { + "epoch": 2.63, + "grad_norm": 2.7302586212190563, + "learning_rate": 9.630558832303058e-07, + "loss": 0.6064, + "step": 17648 + }, + { + "epoch": 2.63, + "grad_norm": 3.1170411939158065, + "learning_rate": 9.629593414430102e-07, + "loss": 0.6621, + "step": 17649 + }, + { + "epoch": 2.63, + "grad_norm": 2.9483576941421337, + "learning_rate": 9.62862800001418e-07, + "loss": 0.651, + "step": 17650 + }, + { + "epoch": 2.63, + "grad_norm": 4.041050625484223, + "learning_rate": 9.627662589064306e-07, + "loss": 0.7285, + "step": 17651 + }, + { + "epoch": 2.63, + "grad_norm": 2.834806545635652, + "learning_rate": 9.626697181589493e-07, + "loss": 0.6491, + "step": 17652 + }, + { + "epoch": 2.63, + "grad_norm": 3.440048501088676, + "learning_rate": 9.625731777598744e-07, + "loss": 0.6406, + "step": 17653 + }, + { + "epoch": 2.63, + "grad_norm": 2.5294714327031382, + "learning_rate": 9.624766377101077e-07, + "loss": 0.6322, + "step": 17654 + }, + { + "epoch": 2.63, + "grad_norm": 4.637121763909491, + "learning_rate": 9.623800980105495e-07, + "loss": 0.6595, + "step": 17655 + }, + { + "epoch": 2.63, + "grad_norm": 3.06179136340959, + "learning_rate": 9.622835586621017e-07, + "loss": 0.6243, + "step": 17656 + }, + { + "epoch": 2.63, + "grad_norm": 3.069170436255303, + "learning_rate": 9.621870196656647e-07, + "loss": 0.6406, + "step": 17657 + }, + { + "epoch": 2.63, + "grad_norm": 3.141104837490692, + "learning_rate": 9.620904810221393e-07, + "loss": 0.5964, + "step": 17658 + }, + { + "epoch": 2.63, + "grad_norm": 2.4992447495633487, + "learning_rate": 9.619939427324267e-07, + "loss": 0.6595, + "step": 17659 + }, + { + "epoch": 2.63, + "grad_norm": 4.601358709503619, + "learning_rate": 9.618974047974281e-07, + "loss": 0.6335, + "step": 17660 + }, + { + "epoch": 2.63, + "grad_norm": 2.563387837358582, + "learning_rate": 9.618008672180444e-07, + "loss": 0.6061, + "step": 17661 + }, + { + "epoch": 2.63, + "grad_norm": 2.8540892366008346, + "learning_rate": 9.61704329995177e-07, + "loss": 0.6204, + "step": 17662 + }, + { + "epoch": 2.63, + "grad_norm": 3.8529711878954114, + "learning_rate": 9.61607793129726e-07, + "loss": 0.6797, + "step": 17663 + }, + { + "epoch": 2.63, + "grad_norm": 2.9012980965492376, + "learning_rate": 9.61511256622593e-07, + "loss": 0.666, + "step": 17664 + }, + { + "epoch": 2.63, + "grad_norm": 4.649116634841249, + "learning_rate": 9.614147204746788e-07, + "loss": 0.6589, + "step": 17665 + }, + { + "epoch": 2.63, + "grad_norm": 4.192293400200361, + "learning_rate": 9.613181846868843e-07, + "loss": 0.627, + "step": 17666 + }, + { + "epoch": 2.63, + "grad_norm": 3.4871830592082427, + "learning_rate": 9.612216492601108e-07, + "loss": 0.6569, + "step": 17667 + }, + { + "epoch": 2.64, + "grad_norm": 2.7522567525716672, + "learning_rate": 9.61125114195259e-07, + "loss": 0.6328, + "step": 17668 + }, + { + "epoch": 2.64, + "grad_norm": 3.2835102455111707, + "learning_rate": 9.6102857949323e-07, + "loss": 0.6022, + "step": 17669 + }, + { + "epoch": 2.64, + "grad_norm": 3.6872313512894834, + "learning_rate": 9.60932045154925e-07, + "loss": 0.6608, + "step": 17670 + }, + { + "epoch": 2.64, + "grad_norm": 3.357758093844444, + "learning_rate": 9.608355111812442e-07, + "loss": 0.6829, + "step": 17671 + }, + { + "epoch": 2.64, + "grad_norm": 4.837123589793168, + "learning_rate": 9.607389775730892e-07, + "loss": 0.6087, + "step": 17672 + }, + { + "epoch": 2.64, + "grad_norm": 3.0203462361995754, + "learning_rate": 9.606424443313607e-07, + "loss": 0.6576, + "step": 17673 + }, + { + "epoch": 2.64, + "grad_norm": 3.0420334333446304, + "learning_rate": 9.605459114569598e-07, + "loss": 0.557, + "step": 17674 + }, + { + "epoch": 2.64, + "grad_norm": 3.0530698581585844, + "learning_rate": 9.604493789507874e-07, + "loss": 0.6725, + "step": 17675 + }, + { + "epoch": 2.64, + "grad_norm": 3.517119436999365, + "learning_rate": 9.603528468137448e-07, + "loss": 0.6641, + "step": 17676 + }, + { + "epoch": 2.64, + "grad_norm": 3.1123064562282714, + "learning_rate": 9.60256315046732e-07, + "loss": 0.6719, + "step": 17677 + }, + { + "epoch": 2.64, + "grad_norm": 2.843801195559219, + "learning_rate": 9.60159783650651e-07, + "loss": 0.6387, + "step": 17678 + }, + { + "epoch": 2.64, + "grad_norm": 3.053601948684102, + "learning_rate": 9.600632526264019e-07, + "loss": 0.64, + "step": 17679 + }, + { + "epoch": 2.64, + "grad_norm": 2.7493307257386967, + "learning_rate": 9.599667219748862e-07, + "loss": 0.6432, + "step": 17680 + }, + { + "epoch": 2.64, + "grad_norm": 3.247753621858947, + "learning_rate": 9.598701916970044e-07, + "loss": 0.6543, + "step": 17681 + }, + { + "epoch": 2.64, + "grad_norm": 3.051797973223644, + "learning_rate": 9.597736617936582e-07, + "loss": 0.6042, + "step": 17682 + }, + { + "epoch": 2.64, + "grad_norm": 3.131845974729202, + "learning_rate": 9.596771322657479e-07, + "loss": 0.6504, + "step": 17683 + }, + { + "epoch": 2.64, + "grad_norm": 3.76029961804081, + "learning_rate": 9.595806031141738e-07, + "loss": 0.6367, + "step": 17684 + }, + { + "epoch": 2.64, + "grad_norm": 3.266661934104959, + "learning_rate": 9.594840743398378e-07, + "loss": 0.6074, + "step": 17685 + }, + { + "epoch": 2.64, + "grad_norm": 3.2319251260727198, + "learning_rate": 9.593875459436408e-07, + "loss": 0.6621, + "step": 17686 + }, + { + "epoch": 2.64, + "grad_norm": 2.7593880904630277, + "learning_rate": 9.59291017926483e-07, + "loss": 0.5931, + "step": 17687 + }, + { + "epoch": 2.64, + "grad_norm": 3.689116854775534, + "learning_rate": 9.59194490289266e-07, + "loss": 0.5951, + "step": 17688 + }, + { + "epoch": 2.64, + "grad_norm": 7.285647996430094, + "learning_rate": 9.59097963032891e-07, + "loss": 0.724, + "step": 17689 + }, + { + "epoch": 2.64, + "grad_norm": 5.8123589090284185, + "learning_rate": 9.590014361582573e-07, + "loss": 0.6569, + "step": 17690 + }, + { + "epoch": 2.64, + "grad_norm": 3.9042496591593525, + "learning_rate": 9.589049096662674e-07, + "loss": 0.623, + "step": 17691 + }, + { + "epoch": 2.64, + "grad_norm": 3.1618651290119715, + "learning_rate": 9.588083835578214e-07, + "loss": 0.6562, + "step": 17692 + }, + { + "epoch": 2.64, + "grad_norm": 4.012831752326475, + "learning_rate": 9.587118578338205e-07, + "loss": 0.6406, + "step": 17693 + }, + { + "epoch": 2.64, + "grad_norm": 4.368534030655689, + "learning_rate": 9.586153324951652e-07, + "loss": 0.6895, + "step": 17694 + }, + { + "epoch": 2.64, + "grad_norm": 3.841963129158022, + "learning_rate": 9.585188075427568e-07, + "loss": 0.6315, + "step": 17695 + }, + { + "epoch": 2.64, + "grad_norm": 4.38000343442709, + "learning_rate": 9.584222829774965e-07, + "loss": 0.6543, + "step": 17696 + }, + { + "epoch": 2.64, + "grad_norm": 3.907538379199676, + "learning_rate": 9.583257588002844e-07, + "loss": 0.7161, + "step": 17697 + }, + { + "epoch": 2.64, + "grad_norm": 3.5207896994199483, + "learning_rate": 9.582292350120214e-07, + "loss": 0.6569, + "step": 17698 + }, + { + "epoch": 2.64, + "grad_norm": 3.3607139061172253, + "learning_rate": 9.58132711613609e-07, + "loss": 0.6934, + "step": 17699 + }, + { + "epoch": 2.64, + "grad_norm": 3.376779767741697, + "learning_rate": 9.580361886059474e-07, + "loss": 0.6536, + "step": 17700 + }, + { + "epoch": 2.64, + "grad_norm": 3.0120588283228713, + "learning_rate": 9.579396659899382e-07, + "loss": 0.6217, + "step": 17701 + }, + { + "epoch": 2.64, + "grad_norm": 3.575641281845368, + "learning_rate": 9.578431437664819e-07, + "loss": 0.6257, + "step": 17702 + }, + { + "epoch": 2.64, + "grad_norm": 3.9644766649792516, + "learning_rate": 9.577466219364785e-07, + "loss": 0.599, + "step": 17703 + }, + { + "epoch": 2.64, + "grad_norm": 3.3341160618553216, + "learning_rate": 9.576501005008302e-07, + "loss": 0.6309, + "step": 17704 + }, + { + "epoch": 2.64, + "grad_norm": 4.450260144515322, + "learning_rate": 9.57553579460437e-07, + "loss": 0.6322, + "step": 17705 + }, + { + "epoch": 2.64, + "grad_norm": 4.295493632736048, + "learning_rate": 9.574570588162e-07, + "loss": 0.6374, + "step": 17706 + }, + { + "epoch": 2.64, + "grad_norm": 4.366294865714473, + "learning_rate": 9.573605385690201e-07, + "loss": 0.6706, + "step": 17707 + }, + { + "epoch": 2.64, + "grad_norm": 5.189527884563679, + "learning_rate": 9.57264018719798e-07, + "loss": 0.666, + "step": 17708 + }, + { + "epoch": 2.64, + "grad_norm": 2.572093323704653, + "learning_rate": 9.571674992694351e-07, + "loss": 0.5911, + "step": 17709 + }, + { + "epoch": 2.64, + "grad_norm": 5.466011738674901, + "learning_rate": 9.570709802188313e-07, + "loss": 0.6198, + "step": 17710 + }, + { + "epoch": 2.64, + "grad_norm": 2.8794592051735513, + "learning_rate": 9.569744615688878e-07, + "loss": 0.6419, + "step": 17711 + }, + { + "epoch": 2.64, + "grad_norm": 2.8057487430129546, + "learning_rate": 9.568779433205054e-07, + "loss": 0.5918, + "step": 17712 + }, + { + "epoch": 2.64, + "grad_norm": 5.915781043574451, + "learning_rate": 9.567814254745849e-07, + "loss": 0.668, + "step": 17713 + }, + { + "epoch": 2.64, + "grad_norm": 3.448428054153503, + "learning_rate": 9.566849080320276e-07, + "loss": 0.6589, + "step": 17714 + }, + { + "epoch": 2.64, + "grad_norm": 3.0389501708086923, + "learning_rate": 9.565883909937339e-07, + "loss": 0.6536, + "step": 17715 + }, + { + "epoch": 2.64, + "grad_norm": 3.7492859927163296, + "learning_rate": 9.56491874360604e-07, + "loss": 0.6257, + "step": 17716 + }, + { + "epoch": 2.64, + "grad_norm": 3.975912493600467, + "learning_rate": 9.563953581335396e-07, + "loss": 0.612, + "step": 17717 + }, + { + "epoch": 2.64, + "grad_norm": 4.164447030473554, + "learning_rate": 9.562988423134412e-07, + "loss": 0.6029, + "step": 17718 + }, + { + "epoch": 2.64, + "grad_norm": 4.099400730016192, + "learning_rate": 9.562023269012094e-07, + "loss": 0.6667, + "step": 17719 + }, + { + "epoch": 2.64, + "grad_norm": 3.2987597311295365, + "learning_rate": 9.561058118977455e-07, + "loss": 0.6797, + "step": 17720 + }, + { + "epoch": 2.64, + "grad_norm": 5.845491361791202, + "learning_rate": 9.560092973039495e-07, + "loss": 0.666, + "step": 17721 + }, + { + "epoch": 2.64, + "grad_norm": 6.643431572798844, + "learning_rate": 9.559127831207233e-07, + "loss": 0.6322, + "step": 17722 + }, + { + "epoch": 2.64, + "grad_norm": 2.6037818263995924, + "learning_rate": 9.558162693489666e-07, + "loss": 0.6133, + "step": 17723 + }, + { + "epoch": 2.64, + "grad_norm": 4.017053028051504, + "learning_rate": 9.557197559895803e-07, + "loss": 0.6204, + "step": 17724 + }, + { + "epoch": 2.64, + "grad_norm": 3.317755690651298, + "learning_rate": 9.556232430434659e-07, + "loss": 0.5999, + "step": 17725 + }, + { + "epoch": 2.64, + "grad_norm": 4.007476100116577, + "learning_rate": 9.555267305115232e-07, + "loss": 0.6217, + "step": 17726 + }, + { + "epoch": 2.64, + "grad_norm": 3.5982140800163, + "learning_rate": 9.554302183946537e-07, + "loss": 0.6009, + "step": 17727 + }, + { + "epoch": 2.64, + "grad_norm": 4.960104793424752, + "learning_rate": 9.553337066937577e-07, + "loss": 0.6292, + "step": 17728 + }, + { + "epoch": 2.64, + "grad_norm": 2.993505888396005, + "learning_rate": 9.552371954097368e-07, + "loss": 0.6289, + "step": 17729 + }, + { + "epoch": 2.64, + "grad_norm": 3.972491778012177, + "learning_rate": 9.551406845434906e-07, + "loss": 0.6523, + "step": 17730 + }, + { + "epoch": 2.64, + "grad_norm": 3.222647623580929, + "learning_rate": 9.550441740959205e-07, + "loss": 0.6615, + "step": 17731 + }, + { + "epoch": 2.64, + "grad_norm": 3.9311389468514, + "learning_rate": 9.54947664067927e-07, + "loss": 0.7064, + "step": 17732 + }, + { + "epoch": 2.64, + "grad_norm": 3.9502926624027825, + "learning_rate": 9.54851154460411e-07, + "loss": 0.6549, + "step": 17733 + }, + { + "epoch": 2.64, + "grad_norm": 5.306484414441007, + "learning_rate": 9.54754645274273e-07, + "loss": 0.6608, + "step": 17734 + }, + { + "epoch": 2.65, + "grad_norm": 3.5835169835367844, + "learning_rate": 9.546581365104143e-07, + "loss": 0.6738, + "step": 17735 + }, + { + "epoch": 2.65, + "grad_norm": 3.052731705151504, + "learning_rate": 9.54561628169735e-07, + "loss": 0.6491, + "step": 17736 + }, + { + "epoch": 2.65, + "grad_norm": 3.952112064736356, + "learning_rate": 9.54465120253136e-07, + "loss": 0.6152, + "step": 17737 + }, + { + "epoch": 2.65, + "grad_norm": 4.351726185450151, + "learning_rate": 9.543686127615182e-07, + "loss": 0.6921, + "step": 17738 + }, + { + "epoch": 2.65, + "grad_norm": 3.4511820678713194, + "learning_rate": 9.54272105695782e-07, + "loss": 0.6595, + "step": 17739 + }, + { + "epoch": 2.65, + "grad_norm": 3.2349257024757323, + "learning_rate": 9.54175599056828e-07, + "loss": 0.6198, + "step": 17740 + }, + { + "epoch": 2.65, + "grad_norm": 3.7669546690496563, + "learning_rate": 9.540790928455577e-07, + "loss": 0.6576, + "step": 17741 + }, + { + "epoch": 2.65, + "grad_norm": 6.132191704301948, + "learning_rate": 9.539825870628714e-07, + "loss": 0.6549, + "step": 17742 + }, + { + "epoch": 2.65, + "grad_norm": 3.8772122348282774, + "learning_rate": 9.538860817096693e-07, + "loss": 0.7077, + "step": 17743 + }, + { + "epoch": 2.65, + "grad_norm": 3.064278289145504, + "learning_rate": 9.537895767868525e-07, + "loss": 0.6178, + "step": 17744 + }, + { + "epoch": 2.65, + "grad_norm": 5.182273478151772, + "learning_rate": 9.536930722953218e-07, + "loss": 0.6276, + "step": 17745 + }, + { + "epoch": 2.65, + "grad_norm": 4.354841387878707, + "learning_rate": 9.535965682359777e-07, + "loss": 0.5846, + "step": 17746 + }, + { + "epoch": 2.65, + "grad_norm": 2.9586224785838016, + "learning_rate": 9.535000646097207e-07, + "loss": 0.6706, + "step": 17747 + }, + { + "epoch": 2.65, + "grad_norm": 3.0096345591430516, + "learning_rate": 9.534035614174523e-07, + "loss": 0.6016, + "step": 17748 + }, + { + "epoch": 2.65, + "grad_norm": 6.676344519500544, + "learning_rate": 9.533070586600723e-07, + "loss": 0.6081, + "step": 17749 + }, + { + "epoch": 2.65, + "grad_norm": 6.776061542900787, + "learning_rate": 9.532105563384814e-07, + "loss": 0.64, + "step": 17750 + }, + { + "epoch": 2.65, + "grad_norm": 3.668383334292478, + "learning_rate": 9.531140544535805e-07, + "loss": 0.6426, + "step": 17751 + }, + { + "epoch": 2.65, + "grad_norm": 3.042107397540998, + "learning_rate": 9.530175530062705e-07, + "loss": 0.6641, + "step": 17752 + }, + { + "epoch": 2.65, + "grad_norm": 3.859766706229997, + "learning_rate": 9.529210519974516e-07, + "loss": 0.6126, + "step": 17753 + }, + { + "epoch": 2.65, + "grad_norm": 2.9547815074192365, + "learning_rate": 9.528245514280249e-07, + "loss": 0.6686, + "step": 17754 + }, + { + "epoch": 2.65, + "grad_norm": 3.8882214571007885, + "learning_rate": 9.527280512988913e-07, + "loss": 0.6283, + "step": 17755 + }, + { + "epoch": 2.65, + "grad_norm": 3.5907832636827073, + "learning_rate": 9.526315516109502e-07, + "loss": 0.5742, + "step": 17756 + }, + { + "epoch": 2.65, + "grad_norm": 6.016262501919679, + "learning_rate": 9.525350523651035e-07, + "loss": 0.6165, + "step": 17757 + }, + { + "epoch": 2.65, + "grad_norm": 6.645683629356357, + "learning_rate": 9.52438553562251e-07, + "loss": 0.6634, + "step": 17758 + }, + { + "epoch": 2.65, + "grad_norm": 2.809781581811608, + "learning_rate": 9.523420552032939e-07, + "loss": 0.64, + "step": 17759 + }, + { + "epoch": 2.65, + "grad_norm": 4.270344880954026, + "learning_rate": 9.522455572891323e-07, + "loss": 0.653, + "step": 17760 + }, + { + "epoch": 2.65, + "grad_norm": 3.1132507101368443, + "learning_rate": 9.521490598206674e-07, + "loss": 0.6393, + "step": 17761 + }, + { + "epoch": 2.65, + "grad_norm": 3.4004535313018693, + "learning_rate": 9.520525627987997e-07, + "loss": 0.6393, + "step": 17762 + }, + { + "epoch": 2.65, + "grad_norm": 5.546635188669679, + "learning_rate": 9.519560662244297e-07, + "loss": 0.6354, + "step": 17763 + }, + { + "epoch": 2.65, + "grad_norm": 3.8694821917372595, + "learning_rate": 9.518595700984577e-07, + "loss": 0.6341, + "step": 17764 + }, + { + "epoch": 2.65, + "grad_norm": 2.867619698896037, + "learning_rate": 9.517630744217847e-07, + "loss": 0.6419, + "step": 17765 + }, + { + "epoch": 2.65, + "grad_norm": 3.5104362553389783, + "learning_rate": 9.516665791953111e-07, + "loss": 0.6497, + "step": 17766 + }, + { + "epoch": 2.65, + "grad_norm": 7.145952585975903, + "learning_rate": 9.515700844199377e-07, + "loss": 0.6374, + "step": 17767 + }, + { + "epoch": 2.65, + "grad_norm": 3.7612156810185544, + "learning_rate": 9.514735900965653e-07, + "loss": 0.6387, + "step": 17768 + }, + { + "epoch": 2.65, + "grad_norm": 3.1951885350970315, + "learning_rate": 9.513770962260937e-07, + "loss": 0.6816, + "step": 17769 + }, + { + "epoch": 2.65, + "grad_norm": 4.405518722243147, + "learning_rate": 9.512806028094241e-07, + "loss": 0.6061, + "step": 17770 + }, + { + "epoch": 2.65, + "grad_norm": 5.050806283219642, + "learning_rate": 9.511841098474569e-07, + "loss": 0.6667, + "step": 17771 + }, + { + "epoch": 2.65, + "grad_norm": 3.3253533563406656, + "learning_rate": 9.510876173410925e-07, + "loss": 0.6003, + "step": 17772 + }, + { + "epoch": 2.65, + "grad_norm": 3.098123418623327, + "learning_rate": 9.509911252912321e-07, + "loss": 0.6452, + "step": 17773 + }, + { + "epoch": 2.65, + "grad_norm": 3.6563009117982, + "learning_rate": 9.508946336987756e-07, + "loss": 0.6348, + "step": 17774 + }, + { + "epoch": 2.65, + "grad_norm": 4.406808080048502, + "learning_rate": 9.507981425646242e-07, + "loss": 0.6673, + "step": 17775 + }, + { + "epoch": 2.65, + "grad_norm": 3.401229749257686, + "learning_rate": 9.507016518896779e-07, + "loss": 0.6764, + "step": 17776 + }, + { + "epoch": 2.65, + "grad_norm": 5.156126028979888, + "learning_rate": 9.506051616748373e-07, + "loss": 0.6123, + "step": 17777 + }, + { + "epoch": 2.65, + "grad_norm": 2.854037409216599, + "learning_rate": 9.505086719210031e-07, + "loss": 0.6432, + "step": 17778 + }, + { + "epoch": 2.65, + "grad_norm": 3.149375324337243, + "learning_rate": 9.504121826290759e-07, + "loss": 0.6426, + "step": 17779 + }, + { + "epoch": 2.65, + "grad_norm": 8.545947619721105, + "learning_rate": 9.503156937999562e-07, + "loss": 0.638, + "step": 17780 + }, + { + "epoch": 2.65, + "grad_norm": 3.2228744017072284, + "learning_rate": 9.502192054345449e-07, + "loss": 0.6628, + "step": 17781 + }, + { + "epoch": 2.65, + "grad_norm": 4.02609203710027, + "learning_rate": 9.501227175337417e-07, + "loss": 0.6146, + "step": 17782 + }, + { + "epoch": 2.65, + "grad_norm": 3.5288294400796576, + "learning_rate": 9.500262300984474e-07, + "loss": 0.681, + "step": 17783 + }, + { + "epoch": 2.65, + "grad_norm": 2.8571190814781784, + "learning_rate": 9.499297431295631e-07, + "loss": 0.627, + "step": 17784 + }, + { + "epoch": 2.65, + "grad_norm": 5.266238505743333, + "learning_rate": 9.498332566279887e-07, + "loss": 0.6712, + "step": 17785 + }, + { + "epoch": 2.65, + "grad_norm": 3.6288669002711487, + "learning_rate": 9.497367705946251e-07, + "loss": 0.6322, + "step": 17786 + }, + { + "epoch": 2.65, + "grad_norm": 3.0288325970921166, + "learning_rate": 9.496402850303724e-07, + "loss": 0.6126, + "step": 17787 + }, + { + "epoch": 2.65, + "grad_norm": 3.7496152954460173, + "learning_rate": 9.49543799936132e-07, + "loss": 0.6064, + "step": 17788 + }, + { + "epoch": 2.65, + "grad_norm": 6.131984269037891, + "learning_rate": 9.494473153128034e-07, + "loss": 0.6771, + "step": 17789 + }, + { + "epoch": 2.65, + "grad_norm": 3.662014682722589, + "learning_rate": 9.493508311612874e-07, + "loss": 0.6055, + "step": 17790 + }, + { + "epoch": 2.65, + "grad_norm": 3.1911257208618564, + "learning_rate": 9.492543474824845e-07, + "loss": 0.6914, + "step": 17791 + }, + { + "epoch": 2.65, + "grad_norm": 2.992644942234791, + "learning_rate": 9.491578642772954e-07, + "loss": 0.651, + "step": 17792 + }, + { + "epoch": 2.65, + "grad_norm": 4.953613929939214, + "learning_rate": 9.490613815466206e-07, + "loss": 0.6654, + "step": 17793 + }, + { + "epoch": 2.65, + "grad_norm": 3.1392708253047346, + "learning_rate": 9.4896489929136e-07, + "loss": 0.6029, + "step": 17794 + }, + { + "epoch": 2.65, + "grad_norm": 2.989116609013228, + "learning_rate": 9.48868417512415e-07, + "loss": 0.6387, + "step": 17795 + }, + { + "epoch": 2.65, + "grad_norm": 3.3920392836235282, + "learning_rate": 9.487719362106853e-07, + "loss": 0.625, + "step": 17796 + }, + { + "epoch": 2.65, + "grad_norm": 3.32543515705019, + "learning_rate": 9.486754553870718e-07, + "loss": 0.653, + "step": 17797 + }, + { + "epoch": 2.65, + "grad_norm": 3.280465073624446, + "learning_rate": 9.485789750424745e-07, + "loss": 0.6562, + "step": 17798 + }, + { + "epoch": 2.65, + "grad_norm": 3.4797212363672294, + "learning_rate": 9.484824951777946e-07, + "loss": 0.6445, + "step": 17799 + }, + { + "epoch": 2.65, + "grad_norm": 3.326373495615128, + "learning_rate": 9.483860157939317e-07, + "loss": 0.6523, + "step": 17800 + }, + { + "epoch": 2.65, + "grad_norm": 3.074735003077626, + "learning_rate": 9.482895368917872e-07, + "loss": 0.6367, + "step": 17801 + }, + { + "epoch": 2.66, + "grad_norm": 2.8121289151440254, + "learning_rate": 9.481930584722608e-07, + "loss": 0.5964, + "step": 17802 + }, + { + "epoch": 2.66, + "grad_norm": 3.6064312149840703, + "learning_rate": 9.480965805362529e-07, + "loss": 0.679, + "step": 17803 + }, + { + "epoch": 2.66, + "grad_norm": 4.09470713927055, + "learning_rate": 9.480001030846646e-07, + "loss": 0.6263, + "step": 17804 + }, + { + "epoch": 2.66, + "grad_norm": 3.603185572018974, + "learning_rate": 9.479036261183956e-07, + "loss": 0.6217, + "step": 17805 + }, + { + "epoch": 2.66, + "grad_norm": 4.599281636176589, + "learning_rate": 9.478071496383466e-07, + "loss": 0.6374, + "step": 17806 + }, + { + "epoch": 2.66, + "grad_norm": 2.863498728990495, + "learning_rate": 9.477106736454184e-07, + "loss": 0.597, + "step": 17807 + }, + { + "epoch": 2.66, + "grad_norm": 2.91806984115593, + "learning_rate": 9.476141981405112e-07, + "loss": 0.6126, + "step": 17808 + }, + { + "epoch": 2.66, + "grad_norm": 2.9187601556522975, + "learning_rate": 9.47517723124525e-07, + "loss": 0.612, + "step": 17809 + }, + { + "epoch": 2.66, + "grad_norm": 3.352073925767498, + "learning_rate": 9.474212485983608e-07, + "loss": 0.6992, + "step": 17810 + }, + { + "epoch": 2.66, + "grad_norm": 2.946485371609603, + "learning_rate": 9.473247745629185e-07, + "loss": 0.6003, + "step": 17811 + }, + { + "epoch": 2.66, + "grad_norm": 3.1195677559472443, + "learning_rate": 9.472283010190989e-07, + "loss": 0.64, + "step": 17812 + }, + { + "epoch": 2.66, + "grad_norm": 3.314849235999905, + "learning_rate": 9.47131827967802e-07, + "loss": 0.6569, + "step": 17813 + }, + { + "epoch": 2.66, + "grad_norm": 2.9791385957615786, + "learning_rate": 9.470353554099288e-07, + "loss": 0.6217, + "step": 17814 + }, + { + "epoch": 2.66, + "grad_norm": 3.4669929475689187, + "learning_rate": 9.469388833463792e-07, + "loss": 0.6849, + "step": 17815 + }, + { + "epoch": 2.66, + "grad_norm": 3.3629916742209884, + "learning_rate": 9.468424117780534e-07, + "loss": 0.6582, + "step": 17816 + }, + { + "epoch": 2.66, + "grad_norm": 3.3894852430743203, + "learning_rate": 9.467459407058521e-07, + "loss": 0.6452, + "step": 17817 + }, + { + "epoch": 2.66, + "grad_norm": 3.1593581137130244, + "learning_rate": 9.46649470130676e-07, + "loss": 0.6074, + "step": 17818 + }, + { + "epoch": 2.66, + "grad_norm": 5.117741762883233, + "learning_rate": 9.465530000534247e-07, + "loss": 0.6367, + "step": 17819 + }, + { + "epoch": 2.66, + "grad_norm": 3.745684923314711, + "learning_rate": 9.464565304749993e-07, + "loss": 0.6452, + "step": 17820 + }, + { + "epoch": 2.66, + "grad_norm": 3.1498376457538813, + "learning_rate": 9.463600613963e-07, + "loss": 0.6341, + "step": 17821 + }, + { + "epoch": 2.66, + "grad_norm": 3.4910987604095345, + "learning_rate": 9.462635928182266e-07, + "loss": 0.6328, + "step": 17822 + }, + { + "epoch": 2.66, + "grad_norm": 4.414887217638438, + "learning_rate": 9.4616712474168e-07, + "loss": 0.6289, + "step": 17823 + }, + { + "epoch": 2.66, + "grad_norm": 3.3537943653713214, + "learning_rate": 9.460706571675603e-07, + "loss": 0.6341, + "step": 17824 + }, + { + "epoch": 2.66, + "grad_norm": 3.4668329759720633, + "learning_rate": 9.45974190096768e-07, + "loss": 0.6393, + "step": 17825 + }, + { + "epoch": 2.66, + "grad_norm": 3.0584078452599512, + "learning_rate": 9.458777235302032e-07, + "loss": 0.6465, + "step": 17826 + }, + { + "epoch": 2.66, + "grad_norm": 5.798016504745578, + "learning_rate": 9.457812574687663e-07, + "loss": 0.6159, + "step": 17827 + }, + { + "epoch": 2.66, + "grad_norm": 4.04191622351149, + "learning_rate": 9.456847919133584e-07, + "loss": 0.6536, + "step": 17828 + }, + { + "epoch": 2.66, + "grad_norm": 2.898288051426951, + "learning_rate": 9.455883268648787e-07, + "loss": 0.6217, + "step": 17829 + }, + { + "epoch": 2.66, + "grad_norm": 3.019952540974176, + "learning_rate": 9.454918623242278e-07, + "loss": 0.6113, + "step": 17830 + }, + { + "epoch": 2.66, + "grad_norm": 3.450857650604771, + "learning_rate": 9.453953982923064e-07, + "loss": 0.6543, + "step": 17831 + }, + { + "epoch": 2.66, + "grad_norm": 4.370418530536745, + "learning_rate": 9.452989347700143e-07, + "loss": 0.6621, + "step": 17832 + }, + { + "epoch": 2.66, + "grad_norm": 3.9237517286448353, + "learning_rate": 9.452024717582524e-07, + "loss": 0.6257, + "step": 17833 + }, + { + "epoch": 2.66, + "grad_norm": 3.2248221163066018, + "learning_rate": 9.451060092579208e-07, + "loss": 0.6276, + "step": 17834 + }, + { + "epoch": 2.66, + "grad_norm": 2.8994214821921207, + "learning_rate": 9.450095472699193e-07, + "loss": 0.6087, + "step": 17835 + }, + { + "epoch": 2.66, + "grad_norm": 2.8879810677550006, + "learning_rate": 9.449130857951488e-07, + "loss": 0.6315, + "step": 17836 + }, + { + "epoch": 2.66, + "grad_norm": 3.3773317371503904, + "learning_rate": 9.44816624834509e-07, + "loss": 0.6393, + "step": 17837 + }, + { + "epoch": 2.66, + "grad_norm": 3.5293178748777114, + "learning_rate": 9.447201643889007e-07, + "loss": 0.6562, + "step": 17838 + }, + { + "epoch": 2.66, + "grad_norm": 7.260834330108559, + "learning_rate": 9.44623704459224e-07, + "loss": 0.6608, + "step": 17839 + }, + { + "epoch": 2.66, + "grad_norm": 3.5814216867871713, + "learning_rate": 9.445272450463791e-07, + "loss": 0.668, + "step": 17840 + }, + { + "epoch": 2.66, + "grad_norm": 3.6806712497237934, + "learning_rate": 9.444307861512667e-07, + "loss": 0.653, + "step": 17841 + }, + { + "epoch": 2.66, + "grad_norm": 5.181598328854309, + "learning_rate": 9.443343277747864e-07, + "loss": 0.653, + "step": 17842 + }, + { + "epoch": 2.66, + "grad_norm": 3.714309715911518, + "learning_rate": 9.442378699178387e-07, + "loss": 0.6341, + "step": 17843 + }, + { + "epoch": 2.66, + "grad_norm": 4.043618896149813, + "learning_rate": 9.441414125813241e-07, + "loss": 0.6276, + "step": 17844 + }, + { + "epoch": 2.66, + "grad_norm": 3.3442691846574415, + "learning_rate": 9.440449557661423e-07, + "loss": 0.6654, + "step": 17845 + }, + { + "epoch": 2.66, + "grad_norm": 3.2171128351455827, + "learning_rate": 9.439484994731941e-07, + "loss": 0.6738, + "step": 17846 + }, + { + "epoch": 2.66, + "grad_norm": 4.040257802820499, + "learning_rate": 9.438520437033793e-07, + "loss": 0.6406, + "step": 17847 + }, + { + "epoch": 2.66, + "grad_norm": 2.9047497910442712, + "learning_rate": 9.437555884575989e-07, + "loss": 0.6185, + "step": 17848 + }, + { + "epoch": 2.66, + "grad_norm": 4.442966994224173, + "learning_rate": 9.43659133736752e-07, + "loss": 0.6471, + "step": 17849 + }, + { + "epoch": 2.66, + "grad_norm": 3.459294529941251, + "learning_rate": 9.435626795417396e-07, + "loss": 0.6908, + "step": 17850 + }, + { + "epoch": 2.66, + "grad_norm": 4.807502357995917, + "learning_rate": 9.434662258734616e-07, + "loss": 0.5775, + "step": 17851 + }, + { + "epoch": 2.66, + "grad_norm": 3.043287840208821, + "learning_rate": 9.433697727328184e-07, + "loss": 0.6654, + "step": 17852 + }, + { + "epoch": 2.66, + "grad_norm": 3.6243038203433824, + "learning_rate": 9.4327332012071e-07, + "loss": 0.5977, + "step": 17853 + }, + { + "epoch": 2.66, + "grad_norm": 3.857696175442868, + "learning_rate": 9.43176868038037e-07, + "loss": 0.6719, + "step": 17854 + }, + { + "epoch": 2.66, + "grad_norm": 3.189832288205719, + "learning_rate": 9.430804164856994e-07, + "loss": 0.5954, + "step": 17855 + }, + { + "epoch": 2.66, + "grad_norm": 2.867602072372314, + "learning_rate": 9.429839654645968e-07, + "loss": 0.6862, + "step": 17856 + }, + { + "epoch": 2.66, + "grad_norm": 2.9493073255880544, + "learning_rate": 9.428875149756303e-07, + "loss": 0.6204, + "step": 17857 + }, + { + "epoch": 2.66, + "grad_norm": 6.3602049797995575, + "learning_rate": 9.427910650196993e-07, + "loss": 0.6452, + "step": 17858 + }, + { + "epoch": 2.66, + "grad_norm": 3.4720687941810016, + "learning_rate": 9.426946155977046e-07, + "loss": 0.6406, + "step": 17859 + }, + { + "epoch": 2.66, + "grad_norm": 3.350326622593579, + "learning_rate": 9.42598166710546e-07, + "loss": 0.6452, + "step": 17860 + }, + { + "epoch": 2.66, + "grad_norm": 5.212699621642072, + "learning_rate": 9.425017183591241e-07, + "loss": 0.6315, + "step": 17861 + }, + { + "epoch": 2.66, + "grad_norm": 3.641475066912086, + "learning_rate": 9.424052705443383e-07, + "loss": 0.6243, + "step": 17862 + }, + { + "epoch": 2.66, + "grad_norm": 3.3489624548839294, + "learning_rate": 9.423088232670896e-07, + "loss": 0.6719, + "step": 17863 + }, + { + "epoch": 2.66, + "grad_norm": 3.5806683180414356, + "learning_rate": 9.422123765282775e-07, + "loss": 0.6348, + "step": 17864 + }, + { + "epoch": 2.66, + "grad_norm": 3.1670665318602658, + "learning_rate": 9.421159303288024e-07, + "loss": 0.668, + "step": 17865 + }, + { + "epoch": 2.66, + "grad_norm": 3.836204979538136, + "learning_rate": 9.420194846695644e-07, + "loss": 0.6178, + "step": 17866 + }, + { + "epoch": 2.66, + "grad_norm": 3.4078881700576047, + "learning_rate": 9.419230395514642e-07, + "loss": 0.6178, + "step": 17867 + }, + { + "epoch": 2.66, + "grad_norm": 5.9091739263688465, + "learning_rate": 9.418265949754009e-07, + "loss": 0.7129, + "step": 17868 + }, + { + "epoch": 2.67, + "grad_norm": 2.8711964734025006, + "learning_rate": 9.417301509422751e-07, + "loss": 0.6074, + "step": 17869 + }, + { + "epoch": 2.67, + "grad_norm": 3.548767581774667, + "learning_rate": 9.416337074529872e-07, + "loss": 0.6172, + "step": 17870 + }, + { + "epoch": 2.67, + "grad_norm": 3.454311975433809, + "learning_rate": 9.415372645084368e-07, + "loss": 0.6283, + "step": 17871 + }, + { + "epoch": 2.67, + "grad_norm": 3.346210842905487, + "learning_rate": 9.414408221095244e-07, + "loss": 0.6569, + "step": 17872 + }, + { + "epoch": 2.67, + "grad_norm": 3.1409386527160676, + "learning_rate": 9.4134438025715e-07, + "loss": 0.6745, + "step": 17873 + }, + { + "epoch": 2.67, + "grad_norm": 3.3366902214123035, + "learning_rate": 9.412479389522141e-07, + "loss": 0.6465, + "step": 17874 + }, + { + "epoch": 2.67, + "grad_norm": 3.03690933445413, + "learning_rate": 9.411514981956158e-07, + "loss": 0.6367, + "step": 17875 + }, + { + "epoch": 2.67, + "grad_norm": 4.185252267316281, + "learning_rate": 9.41055057988256e-07, + "loss": 0.6517, + "step": 17876 + }, + { + "epoch": 2.67, + "grad_norm": 3.544962603956109, + "learning_rate": 9.409586183310344e-07, + "loss": 0.6478, + "step": 17877 + }, + { + "epoch": 2.67, + "grad_norm": 5.2147660235974715, + "learning_rate": 9.408621792248514e-07, + "loss": 0.6641, + "step": 17878 + }, + { + "epoch": 2.67, + "grad_norm": 3.7465404013659644, + "learning_rate": 9.407657406706068e-07, + "loss": 0.6497, + "step": 17879 + }, + { + "epoch": 2.67, + "grad_norm": 6.138148305906151, + "learning_rate": 9.406693026692009e-07, + "loss": 0.6725, + "step": 17880 + }, + { + "epoch": 2.67, + "grad_norm": 5.298741715973607, + "learning_rate": 9.405728652215339e-07, + "loss": 0.6178, + "step": 17881 + }, + { + "epoch": 2.67, + "grad_norm": 3.0495495145165745, + "learning_rate": 9.404764283285052e-07, + "loss": 0.6471, + "step": 17882 + }, + { + "epoch": 2.67, + "grad_norm": 4.999007003285316, + "learning_rate": 9.403799919910152e-07, + "loss": 0.6738, + "step": 17883 + }, + { + "epoch": 2.67, + "grad_norm": 3.9917105053433732, + "learning_rate": 9.402835562099641e-07, + "loss": 0.651, + "step": 17884 + }, + { + "epoch": 2.67, + "grad_norm": 3.7956645511841494, + "learning_rate": 9.401871209862519e-07, + "loss": 0.6523, + "step": 17885 + }, + { + "epoch": 2.67, + "grad_norm": 3.173530918092411, + "learning_rate": 9.400906863207786e-07, + "loss": 0.6712, + "step": 17886 + }, + { + "epoch": 2.67, + "grad_norm": 2.8783403129960545, + "learning_rate": 9.399942522144445e-07, + "loss": 0.6276, + "step": 17887 + }, + { + "epoch": 2.67, + "grad_norm": 9.236081394600406, + "learning_rate": 9.39897818668149e-07, + "loss": 0.6068, + "step": 17888 + }, + { + "epoch": 2.67, + "grad_norm": 2.880758800163883, + "learning_rate": 9.398013856827927e-07, + "loss": 0.6458, + "step": 17889 + }, + { + "epoch": 2.67, + "grad_norm": 3.1605068571666126, + "learning_rate": 9.39704953259275e-07, + "loss": 0.6335, + "step": 17890 + }, + { + "epoch": 2.67, + "grad_norm": 3.5628727065459427, + "learning_rate": 9.396085213984967e-07, + "loss": 0.6816, + "step": 17891 + }, + { + "epoch": 2.67, + "grad_norm": 2.780433640533812, + "learning_rate": 9.39512090101357e-07, + "loss": 0.6497, + "step": 17892 + }, + { + "epoch": 2.67, + "grad_norm": 3.096387524329257, + "learning_rate": 9.394156593687564e-07, + "loss": 0.6458, + "step": 17893 + }, + { + "epoch": 2.67, + "grad_norm": 2.999331498902498, + "learning_rate": 9.393192292015954e-07, + "loss": 0.6725, + "step": 17894 + }, + { + "epoch": 2.67, + "grad_norm": 3.360588192701973, + "learning_rate": 9.39222799600773e-07, + "loss": 0.638, + "step": 17895 + }, + { + "epoch": 2.67, + "grad_norm": 7.447104745616933, + "learning_rate": 9.391263705671893e-07, + "loss": 0.6628, + "step": 17896 + }, + { + "epoch": 2.67, + "grad_norm": 3.72096616092581, + "learning_rate": 9.39029942101745e-07, + "loss": 0.6328, + "step": 17897 + }, + { + "epoch": 2.67, + "grad_norm": 2.9246699186474054, + "learning_rate": 9.389335142053393e-07, + "loss": 0.6367, + "step": 17898 + }, + { + "epoch": 2.67, + "grad_norm": 6.513640552709621, + "learning_rate": 9.388370868788728e-07, + "loss": 0.6413, + "step": 17899 + }, + { + "epoch": 2.67, + "grad_norm": 4.387108195403713, + "learning_rate": 9.387406601232454e-07, + "loss": 0.6348, + "step": 17900 + }, + { + "epoch": 2.67, + "grad_norm": 4.599091466220337, + "learning_rate": 9.386442339393563e-07, + "loss": 0.6413, + "step": 17901 + }, + { + "epoch": 2.67, + "grad_norm": 2.858426466796927, + "learning_rate": 9.385478083281064e-07, + "loss": 0.6562, + "step": 17902 + }, + { + "epoch": 2.67, + "grad_norm": 3.358051652479251, + "learning_rate": 9.384513832903948e-07, + "loss": 0.6048, + "step": 17903 + }, + { + "epoch": 2.67, + "grad_norm": 3.4430207421473686, + "learning_rate": 9.383549588271219e-07, + "loss": 0.6608, + "step": 17904 + }, + { + "epoch": 2.67, + "grad_norm": 4.801908507917144, + "learning_rate": 9.382585349391879e-07, + "loss": 0.6432, + "step": 17905 + }, + { + "epoch": 2.67, + "grad_norm": 2.8966989068488678, + "learning_rate": 9.381621116274922e-07, + "loss": 0.6816, + "step": 17906 + }, + { + "epoch": 2.67, + "grad_norm": 3.2554673108673353, + "learning_rate": 9.380656888929356e-07, + "loss": 0.6165, + "step": 17907 + }, + { + "epoch": 2.67, + "grad_norm": 2.6996982694004332, + "learning_rate": 9.37969266736417e-07, + "loss": 0.5918, + "step": 17908 + }, + { + "epoch": 2.67, + "grad_norm": 3.6103893594571552, + "learning_rate": 9.378728451588365e-07, + "loss": 0.6178, + "step": 17909 + }, + { + "epoch": 2.67, + "grad_norm": 2.925123329395304, + "learning_rate": 9.377764241610945e-07, + "loss": 0.6419, + "step": 17910 + }, + { + "epoch": 2.67, + "grad_norm": 3.297159666355628, + "learning_rate": 9.376800037440904e-07, + "loss": 0.6413, + "step": 17911 + }, + { + "epoch": 2.67, + "grad_norm": 2.8693104719577254, + "learning_rate": 9.375835839087246e-07, + "loss": 0.6328, + "step": 17912 + }, + { + "epoch": 2.67, + "grad_norm": 3.376844866592482, + "learning_rate": 9.374871646558965e-07, + "loss": 0.6725, + "step": 17913 + }, + { + "epoch": 2.67, + "grad_norm": 2.960753929944141, + "learning_rate": 9.373907459865066e-07, + "loss": 0.6387, + "step": 17914 + }, + { + "epoch": 2.67, + "grad_norm": 3.568143900105626, + "learning_rate": 9.37294327901454e-07, + "loss": 0.6367, + "step": 17915 + }, + { + "epoch": 2.67, + "grad_norm": 3.448951292601405, + "learning_rate": 9.371979104016392e-07, + "loss": 0.6224, + "step": 17916 + }, + { + "epoch": 2.67, + "grad_norm": 4.11184533659313, + "learning_rate": 9.371014934879616e-07, + "loss": 0.668, + "step": 17917 + }, + { + "epoch": 2.67, + "grad_norm": 3.3748625629391262, + "learning_rate": 9.370050771613216e-07, + "loss": 0.6406, + "step": 17918 + }, + { + "epoch": 2.67, + "grad_norm": 6.218726292212582, + "learning_rate": 9.369086614226186e-07, + "loss": 0.6823, + "step": 17919 + }, + { + "epoch": 2.67, + "grad_norm": 2.943685072766547, + "learning_rate": 9.36812246272753e-07, + "loss": 0.6283, + "step": 17920 + }, + { + "epoch": 2.67, + "grad_norm": 4.29196695929045, + "learning_rate": 9.367158317126242e-07, + "loss": 0.6243, + "step": 17921 + }, + { + "epoch": 2.67, + "grad_norm": 3.2593622232678228, + "learning_rate": 9.366194177431318e-07, + "loss": 0.6725, + "step": 17922 + }, + { + "epoch": 2.67, + "grad_norm": 3.8438918683884307, + "learning_rate": 9.365230043651762e-07, + "loss": 0.5977, + "step": 17923 + }, + { + "epoch": 2.67, + "grad_norm": 3.0494944598802696, + "learning_rate": 9.364265915796568e-07, + "loss": 0.612, + "step": 17924 + }, + { + "epoch": 2.67, + "grad_norm": 3.0053848794495948, + "learning_rate": 9.36330179387474e-07, + "loss": 0.6426, + "step": 17925 + }, + { + "epoch": 2.67, + "grad_norm": 3.635943441632832, + "learning_rate": 9.362337677895267e-07, + "loss": 0.6497, + "step": 17926 + }, + { + "epoch": 2.67, + "grad_norm": 2.8731789456862042, + "learning_rate": 9.361373567867162e-07, + "loss": 0.6029, + "step": 17927 + }, + { + "epoch": 2.67, + "grad_norm": 4.284774882854601, + "learning_rate": 9.360409463799405e-07, + "loss": 0.6204, + "step": 17928 + }, + { + "epoch": 2.67, + "grad_norm": 4.167554068706813, + "learning_rate": 9.359445365701008e-07, + "loss": 0.6706, + "step": 17929 + }, + { + "epoch": 2.67, + "grad_norm": 3.2422295723675942, + "learning_rate": 9.358481273580962e-07, + "loss": 0.6432, + "step": 17930 + }, + { + "epoch": 2.67, + "grad_norm": 3.235961242552443, + "learning_rate": 9.357517187448269e-07, + "loss": 0.612, + "step": 17931 + }, + { + "epoch": 2.67, + "grad_norm": 3.6824816031766345, + "learning_rate": 9.356553107311921e-07, + "loss": 0.6634, + "step": 17932 + }, + { + "epoch": 2.67, + "grad_norm": 3.7324169553038757, + "learning_rate": 9.355589033180925e-07, + "loss": 0.6081, + "step": 17933 + }, + { + "epoch": 2.67, + "grad_norm": 3.8005414737376215, + "learning_rate": 9.354624965064271e-07, + "loss": 0.6387, + "step": 17934 + }, + { + "epoch": 2.67, + "grad_norm": 3.8706160393006965, + "learning_rate": 9.353660902970958e-07, + "loss": 0.6387, + "step": 17935 + }, + { + "epoch": 2.68, + "grad_norm": 3.5320562298717237, + "learning_rate": 9.352696846909988e-07, + "loss": 0.7012, + "step": 17936 + }, + { + "epoch": 2.68, + "grad_norm": 3.3929699293137516, + "learning_rate": 9.351732796890352e-07, + "loss": 0.6732, + "step": 17937 + }, + { + "epoch": 2.68, + "grad_norm": 4.259898658805264, + "learning_rate": 9.350768752921051e-07, + "loss": 0.6549, + "step": 17938 + }, + { + "epoch": 2.68, + "grad_norm": 3.262679274296862, + "learning_rate": 9.349804715011084e-07, + "loss": 0.6751, + "step": 17939 + }, + { + "epoch": 2.68, + "grad_norm": 6.8926628065112086, + "learning_rate": 9.348840683169452e-07, + "loss": 0.6283, + "step": 17940 + }, + { + "epoch": 2.68, + "grad_norm": 3.702144126721415, + "learning_rate": 9.347876657405142e-07, + "loss": 0.6595, + "step": 17941 + }, + { + "epoch": 2.68, + "grad_norm": 4.164960573064039, + "learning_rate": 9.346912637727157e-07, + "loss": 0.6224, + "step": 17942 + }, + { + "epoch": 2.68, + "grad_norm": 4.2736300943077765, + "learning_rate": 9.345948624144494e-07, + "loss": 0.6576, + "step": 17943 + }, + { + "epoch": 2.68, + "grad_norm": 3.3819446142289338, + "learning_rate": 9.344984616666153e-07, + "loss": 0.6367, + "step": 17944 + }, + { + "epoch": 2.68, + "grad_norm": 3.154448235733686, + "learning_rate": 9.344020615301125e-07, + "loss": 0.6309, + "step": 17945 + }, + { + "epoch": 2.68, + "grad_norm": 3.1449646396881135, + "learning_rate": 9.343056620058412e-07, + "loss": 0.6439, + "step": 17946 + }, + { + "epoch": 2.68, + "grad_norm": 4.152660914726504, + "learning_rate": 9.342092630947014e-07, + "loss": 0.6517, + "step": 17947 + }, + { + "epoch": 2.68, + "grad_norm": 3.313531763633607, + "learning_rate": 9.341128647975918e-07, + "loss": 0.5957, + "step": 17948 + }, + { + "epoch": 2.68, + "grad_norm": 3.2266285023645405, + "learning_rate": 9.340164671154127e-07, + "loss": 0.6523, + "step": 17949 + }, + { + "epoch": 2.68, + "grad_norm": 3.9941357875333656, + "learning_rate": 9.339200700490639e-07, + "loss": 0.6367, + "step": 17950 + }, + { + "epoch": 2.68, + "grad_norm": 5.081114350543267, + "learning_rate": 9.338236735994448e-07, + "loss": 0.6582, + "step": 17951 + }, + { + "epoch": 2.68, + "grad_norm": 3.9381118504682657, + "learning_rate": 9.337272777674554e-07, + "loss": 0.6908, + "step": 17952 + }, + { + "epoch": 2.68, + "grad_norm": 5.3869472512690875, + "learning_rate": 9.336308825539954e-07, + "loss": 0.6406, + "step": 17953 + }, + { + "epoch": 2.68, + "grad_norm": 3.6342751720018214, + "learning_rate": 9.335344879599638e-07, + "loss": 0.6432, + "step": 17954 + }, + { + "epoch": 2.68, + "grad_norm": 2.980417496182122, + "learning_rate": 9.33438093986261e-07, + "loss": 0.6029, + "step": 17955 + }, + { + "epoch": 2.68, + "grad_norm": 6.846777874794096, + "learning_rate": 9.333417006337859e-07, + "loss": 0.6452, + "step": 17956 + }, + { + "epoch": 2.68, + "grad_norm": 3.577459667922083, + "learning_rate": 9.33245307903439e-07, + "loss": 0.638, + "step": 17957 + }, + { + "epoch": 2.68, + "grad_norm": 4.471219156056992, + "learning_rate": 9.331489157961193e-07, + "loss": 0.6882, + "step": 17958 + }, + { + "epoch": 2.68, + "grad_norm": 3.2791512801997236, + "learning_rate": 9.330525243127267e-07, + "loss": 0.6862, + "step": 17959 + }, + { + "epoch": 2.68, + "grad_norm": 3.609263640272435, + "learning_rate": 9.329561334541613e-07, + "loss": 0.6589, + "step": 17960 + }, + { + "epoch": 2.68, + "grad_norm": 3.6544122761779336, + "learning_rate": 9.328597432213218e-07, + "loss": 0.6491, + "step": 17961 + }, + { + "epoch": 2.68, + "grad_norm": 3.7071211269195015, + "learning_rate": 9.327633536151083e-07, + "loss": 0.6302, + "step": 17962 + }, + { + "epoch": 2.68, + "grad_norm": 3.2685628263368756, + "learning_rate": 9.326669646364203e-07, + "loss": 0.6419, + "step": 17963 + }, + { + "epoch": 2.68, + "grad_norm": 3.1086654715195268, + "learning_rate": 9.325705762861576e-07, + "loss": 0.6302, + "step": 17964 + }, + { + "epoch": 2.68, + "grad_norm": 3.7191869747324495, + "learning_rate": 9.324741885652196e-07, + "loss": 0.6217, + "step": 17965 + }, + { + "epoch": 2.68, + "grad_norm": 3.5857885598944086, + "learning_rate": 9.323778014745063e-07, + "loss": 0.6204, + "step": 17966 + }, + { + "epoch": 2.68, + "grad_norm": 4.003493136819434, + "learning_rate": 9.322814150149165e-07, + "loss": 0.6576, + "step": 17967 + }, + { + "epoch": 2.68, + "grad_norm": 4.129641355386616, + "learning_rate": 9.321850291873505e-07, + "loss": 0.6589, + "step": 17968 + }, + { + "epoch": 2.68, + "grad_norm": 2.8794877160673105, + "learning_rate": 9.320886439927074e-07, + "loss": 0.6478, + "step": 17969 + }, + { + "epoch": 2.68, + "grad_norm": 2.940416052150813, + "learning_rate": 9.31992259431887e-07, + "loss": 0.6289, + "step": 17970 + }, + { + "epoch": 2.68, + "grad_norm": 3.2243418664620713, + "learning_rate": 9.318958755057891e-07, + "loss": 0.6855, + "step": 17971 + }, + { + "epoch": 2.68, + "grad_norm": 3.657784684431376, + "learning_rate": 9.317994922153127e-07, + "loss": 0.6204, + "step": 17972 + }, + { + "epoch": 2.68, + "grad_norm": 3.135419653212132, + "learning_rate": 9.317031095613581e-07, + "loss": 0.6322, + "step": 17973 + }, + { + "epoch": 2.68, + "grad_norm": 3.858286084602687, + "learning_rate": 9.316067275448242e-07, + "loss": 0.6178, + "step": 17974 + }, + { + "epoch": 2.68, + "grad_norm": 4.386941543302486, + "learning_rate": 9.315103461666107e-07, + "loss": 0.6439, + "step": 17975 + }, + { + "epoch": 2.68, + "grad_norm": 3.4155622881101615, + "learning_rate": 9.314139654276173e-07, + "loss": 0.627, + "step": 17976 + }, + { + "epoch": 2.68, + "grad_norm": 3.3195469758906926, + "learning_rate": 9.313175853287433e-07, + "loss": 0.5807, + "step": 17977 + }, + { + "epoch": 2.68, + "grad_norm": 3.3401202060399107, + "learning_rate": 9.312212058708884e-07, + "loss": 0.6556, + "step": 17978 + }, + { + "epoch": 2.68, + "grad_norm": 3.507279740327574, + "learning_rate": 9.31124827054952e-07, + "loss": 0.5996, + "step": 17979 + }, + { + "epoch": 2.68, + "grad_norm": 3.9206732053731463, + "learning_rate": 9.310284488818341e-07, + "loss": 0.6471, + "step": 17980 + }, + { + "epoch": 2.68, + "grad_norm": 4.162020746988836, + "learning_rate": 9.309320713524332e-07, + "loss": 0.6439, + "step": 17981 + }, + { + "epoch": 2.68, + "grad_norm": 5.497321624638129, + "learning_rate": 9.308356944676498e-07, + "loss": 0.6165, + "step": 17982 + }, + { + "epoch": 2.68, + "grad_norm": 3.6640400478407438, + "learning_rate": 9.307393182283826e-07, + "loss": 0.6634, + "step": 17983 + }, + { + "epoch": 2.68, + "grad_norm": 5.627770530854225, + "learning_rate": 9.306429426355317e-07, + "loss": 0.6914, + "step": 17984 + }, + { + "epoch": 2.68, + "grad_norm": 3.7148762472613392, + "learning_rate": 9.305465676899963e-07, + "loss": 0.5931, + "step": 17985 + }, + { + "epoch": 2.68, + "grad_norm": 3.352004606790111, + "learning_rate": 9.304501933926762e-07, + "loss": 0.6491, + "step": 17986 + }, + { + "epoch": 2.68, + "grad_norm": 3.9678014163915427, + "learning_rate": 9.303538197444705e-07, + "loss": 0.6003, + "step": 17987 + }, + { + "epoch": 2.68, + "grad_norm": 4.778852105139559, + "learning_rate": 9.302574467462784e-07, + "loss": 0.6387, + "step": 17988 + }, + { + "epoch": 2.68, + "grad_norm": 4.2254309576883635, + "learning_rate": 9.30161074399e-07, + "loss": 0.6165, + "step": 17989 + }, + { + "epoch": 2.68, + "grad_norm": 3.4895795441831954, + "learning_rate": 9.300647027035342e-07, + "loss": 0.6576, + "step": 17990 + }, + { + "epoch": 2.68, + "grad_norm": 3.806108763250136, + "learning_rate": 9.299683316607811e-07, + "loss": 0.6419, + "step": 17991 + }, + { + "epoch": 2.68, + "grad_norm": 4.913244696576081, + "learning_rate": 9.298719612716394e-07, + "loss": 0.6208, + "step": 17992 + }, + { + "epoch": 2.68, + "grad_norm": 3.334315766659914, + "learning_rate": 9.297755915370093e-07, + "loss": 0.5905, + "step": 17993 + }, + { + "epoch": 2.68, + "grad_norm": 3.8563435055068394, + "learning_rate": 9.296792224577894e-07, + "loss": 0.6426, + "step": 17994 + }, + { + "epoch": 2.68, + "grad_norm": 3.321481426874746, + "learning_rate": 9.295828540348798e-07, + "loss": 0.638, + "step": 17995 + }, + { + "epoch": 2.68, + "grad_norm": 2.898467313533734, + "learning_rate": 9.294864862691795e-07, + "loss": 0.623, + "step": 17996 + }, + { + "epoch": 2.68, + "grad_norm": 3.3124182990517186, + "learning_rate": 9.293901191615882e-07, + "loss": 0.6615, + "step": 17997 + }, + { + "epoch": 2.68, + "grad_norm": 3.297376188451019, + "learning_rate": 9.29293752713005e-07, + "loss": 0.651, + "step": 17998 + }, + { + "epoch": 2.68, + "grad_norm": 3.8189223503535326, + "learning_rate": 9.291973869243294e-07, + "loss": 0.6045, + "step": 17999 + }, + { + "epoch": 2.68, + "grad_norm": 3.402453303602096, + "learning_rate": 9.291010217964614e-07, + "loss": 0.6654, + "step": 18000 + }, + { + "epoch": 2.68, + "grad_norm": 3.9168477435806874, + "learning_rate": 9.290046573302992e-07, + "loss": 0.679, + "step": 18001 + }, + { + "epoch": 2.68, + "grad_norm": 3.065307798660695, + "learning_rate": 9.289082935267431e-07, + "loss": 0.6243, + "step": 18002 + }, + { + "epoch": 2.69, + "grad_norm": 4.241746510004309, + "learning_rate": 9.28811930386692e-07, + "loss": 0.6738, + "step": 18003 + }, + { + "epoch": 2.69, + "grad_norm": 3.036108866630069, + "learning_rate": 9.287155679110455e-07, + "loss": 0.6087, + "step": 18004 + }, + { + "epoch": 2.69, + "grad_norm": 3.5438684780584278, + "learning_rate": 9.28619206100703e-07, + "loss": 0.6426, + "step": 18005 + }, + { + "epoch": 2.69, + "grad_norm": 4.1767122204471425, + "learning_rate": 9.285228449565641e-07, + "loss": 0.6126, + "step": 18006 + }, + { + "epoch": 2.69, + "grad_norm": 3.454021456959534, + "learning_rate": 9.284264844795272e-07, + "loss": 0.6927, + "step": 18007 + }, + { + "epoch": 2.69, + "grad_norm": 3.8591056868427787, + "learning_rate": 9.283301246704926e-07, + "loss": 0.6536, + "step": 18008 + }, + { + "epoch": 2.69, + "grad_norm": 3.790307462584372, + "learning_rate": 9.28233765530359e-07, + "loss": 0.6393, + "step": 18009 + }, + { + "epoch": 2.69, + "grad_norm": 4.292315391623623, + "learning_rate": 9.281374070600263e-07, + "loss": 0.6107, + "step": 18010 + }, + { + "epoch": 2.69, + "grad_norm": 2.9419869460210433, + "learning_rate": 9.280410492603933e-07, + "loss": 0.571, + "step": 18011 + }, + { + "epoch": 2.69, + "grad_norm": 3.2284458665410796, + "learning_rate": 9.279446921323598e-07, + "loss": 0.6003, + "step": 18012 + }, + { + "epoch": 2.69, + "grad_norm": 4.794436196841769, + "learning_rate": 9.27848335676825e-07, + "loss": 0.6302, + "step": 18013 + }, + { + "epoch": 2.69, + "grad_norm": 2.834054942308517, + "learning_rate": 9.277519798946877e-07, + "loss": 0.6068, + "step": 18014 + }, + { + "epoch": 2.69, + "grad_norm": 4.283476705006508, + "learning_rate": 9.276556247868474e-07, + "loss": 0.6895, + "step": 18015 + }, + { + "epoch": 2.69, + "grad_norm": 4.835622343599892, + "learning_rate": 9.275592703542038e-07, + "loss": 0.6465, + "step": 18016 + }, + { + "epoch": 2.69, + "grad_norm": 3.787025415921265, + "learning_rate": 9.274629165976559e-07, + "loss": 0.6647, + "step": 18017 + }, + { + "epoch": 2.69, + "grad_norm": 3.5561792648640687, + "learning_rate": 9.273665635181032e-07, + "loss": 0.6354, + "step": 18018 + }, + { + "epoch": 2.69, + "grad_norm": 3.343225161130771, + "learning_rate": 9.272702111164449e-07, + "loss": 0.638, + "step": 18019 + }, + { + "epoch": 2.69, + "grad_norm": 4.590874806146387, + "learning_rate": 9.271738593935798e-07, + "loss": 0.5983, + "step": 18020 + }, + { + "epoch": 2.69, + "grad_norm": 3.7854330881626215, + "learning_rate": 9.270775083504076e-07, + "loss": 0.6823, + "step": 18021 + }, + { + "epoch": 2.69, + "grad_norm": 4.3298819016287915, + "learning_rate": 9.269811579878274e-07, + "loss": 0.6178, + "step": 18022 + }, + { + "epoch": 2.69, + "grad_norm": 4.285146187095742, + "learning_rate": 9.268848083067386e-07, + "loss": 0.6243, + "step": 18023 + }, + { + "epoch": 2.69, + "grad_norm": 3.558716905422219, + "learning_rate": 9.267884593080403e-07, + "loss": 0.6576, + "step": 18024 + }, + { + "epoch": 2.69, + "grad_norm": 4.554157902063901, + "learning_rate": 9.266921109926317e-07, + "loss": 0.6569, + "step": 18025 + }, + { + "epoch": 2.69, + "grad_norm": 4.3461645005393095, + "learning_rate": 9.265957633614125e-07, + "loss": 0.6335, + "step": 18026 + }, + { + "epoch": 2.69, + "grad_norm": 3.198368347609128, + "learning_rate": 9.264994164152812e-07, + "loss": 0.6126, + "step": 18027 + }, + { + "epoch": 2.69, + "grad_norm": 3.631275926792149, + "learning_rate": 9.264030701551374e-07, + "loss": 0.6628, + "step": 18028 + }, + { + "epoch": 2.69, + "grad_norm": 4.7973821333159785, + "learning_rate": 9.263067245818803e-07, + "loss": 0.6367, + "step": 18029 + }, + { + "epoch": 2.69, + "grad_norm": 4.256748107921519, + "learning_rate": 9.262103796964089e-07, + "loss": 0.6882, + "step": 18030 + }, + { + "epoch": 2.69, + "grad_norm": 4.400116745182738, + "learning_rate": 9.261140354996227e-07, + "loss": 0.6432, + "step": 18031 + }, + { + "epoch": 2.69, + "grad_norm": 5.187619435489262, + "learning_rate": 9.260176919924207e-07, + "loss": 0.6243, + "step": 18032 + }, + { + "epoch": 2.69, + "grad_norm": 6.1375429820026985, + "learning_rate": 9.259213491757023e-07, + "loss": 0.668, + "step": 18033 + }, + { + "epoch": 2.69, + "grad_norm": 4.394875009228196, + "learning_rate": 9.258250070503665e-07, + "loss": 0.6419, + "step": 18034 + }, + { + "epoch": 2.69, + "grad_norm": 4.456723966506128, + "learning_rate": 9.257286656173121e-07, + "loss": 0.6243, + "step": 18035 + }, + { + "epoch": 2.69, + "grad_norm": 4.023989539618392, + "learning_rate": 9.256323248774387e-07, + "loss": 0.6413, + "step": 18036 + }, + { + "epoch": 2.69, + "grad_norm": 3.312432145453305, + "learning_rate": 9.255359848316456e-07, + "loss": 0.666, + "step": 18037 + }, + { + "epoch": 2.69, + "grad_norm": 3.987040490416904, + "learning_rate": 9.254396454808316e-07, + "loss": 0.7005, + "step": 18038 + }, + { + "epoch": 2.69, + "grad_norm": 3.4231980089066054, + "learning_rate": 9.253433068258963e-07, + "loss": 0.6432, + "step": 18039 + }, + { + "epoch": 2.69, + "grad_norm": 6.449758879997457, + "learning_rate": 9.252469688677383e-07, + "loss": 0.5911, + "step": 18040 + }, + { + "epoch": 2.69, + "grad_norm": 4.485759839984833, + "learning_rate": 9.251506316072568e-07, + "loss": 0.6406, + "step": 18041 + }, + { + "epoch": 2.69, + "grad_norm": 2.9487025746671565, + "learning_rate": 9.250542950453513e-07, + "loss": 0.6562, + "step": 18042 + }, + { + "epoch": 2.69, + "grad_norm": 3.040826307637409, + "learning_rate": 9.249579591829204e-07, + "loss": 0.64, + "step": 18043 + }, + { + "epoch": 2.69, + "grad_norm": 3.8754474390835725, + "learning_rate": 9.248616240208637e-07, + "loss": 0.6901, + "step": 18044 + }, + { + "epoch": 2.69, + "grad_norm": 3.0290449534018244, + "learning_rate": 9.2476528956008e-07, + "loss": 0.5892, + "step": 18045 + }, + { + "epoch": 2.69, + "grad_norm": 5.195195528049991, + "learning_rate": 9.24668955801469e-07, + "loss": 0.6497, + "step": 18046 + }, + { + "epoch": 2.69, + "grad_norm": 4.41974518632869, + "learning_rate": 9.245726227459288e-07, + "loss": 0.6094, + "step": 18047 + }, + { + "epoch": 2.69, + "grad_norm": 3.244353602469189, + "learning_rate": 9.24476290394359e-07, + "loss": 0.6706, + "step": 18048 + }, + { + "epoch": 2.69, + "grad_norm": 2.5534971826449513, + "learning_rate": 9.243799587476586e-07, + "loss": 0.638, + "step": 18049 + }, + { + "epoch": 2.69, + "grad_norm": 3.6944871461432722, + "learning_rate": 9.24283627806727e-07, + "loss": 0.6439, + "step": 18050 + }, + { + "epoch": 2.69, + "grad_norm": 2.7942152281693735, + "learning_rate": 9.241872975724627e-07, + "loss": 0.6628, + "step": 18051 + }, + { + "epoch": 2.69, + "grad_norm": 3.411337117430386, + "learning_rate": 9.240909680457655e-07, + "loss": 0.6309, + "step": 18052 + }, + { + "epoch": 2.69, + "grad_norm": 2.835921040513171, + "learning_rate": 9.239946392275338e-07, + "loss": 0.5977, + "step": 18053 + }, + { + "epoch": 2.69, + "grad_norm": 4.606073950359294, + "learning_rate": 9.238983111186666e-07, + "loss": 0.6517, + "step": 18054 + }, + { + "epoch": 2.69, + "grad_norm": 2.806482225157005, + "learning_rate": 9.238019837200636e-07, + "loss": 0.6322, + "step": 18055 + }, + { + "epoch": 2.69, + "grad_norm": 3.3977386381364383, + "learning_rate": 9.237056570326231e-07, + "loss": 0.6465, + "step": 18056 + }, + { + "epoch": 2.69, + "grad_norm": 2.9600789207673834, + "learning_rate": 9.236093310572446e-07, + "loss": 0.6458, + "step": 18057 + }, + { + "epoch": 2.69, + "grad_norm": 5.5440801140776035, + "learning_rate": 9.235130057948267e-07, + "loss": 0.6465, + "step": 18058 + }, + { + "epoch": 2.69, + "grad_norm": 5.084828143781226, + "learning_rate": 9.234166812462693e-07, + "loss": 0.6107, + "step": 18059 + }, + { + "epoch": 2.69, + "grad_norm": 3.7875494361233195, + "learning_rate": 9.233203574124702e-07, + "loss": 0.6497, + "step": 18060 + }, + { + "epoch": 2.69, + "grad_norm": 3.3293363348092293, + "learning_rate": 9.232240342943293e-07, + "loss": 0.6562, + "step": 18061 + }, + { + "epoch": 2.69, + "grad_norm": 3.9500748285821716, + "learning_rate": 9.231277118927449e-07, + "loss": 0.653, + "step": 18062 + }, + { + "epoch": 2.69, + "grad_norm": 3.3997727234270303, + "learning_rate": 9.230313902086167e-07, + "loss": 0.6439, + "step": 18063 + }, + { + "epoch": 2.69, + "grad_norm": 3.6985666422922927, + "learning_rate": 9.229350692428432e-07, + "loss": 0.599, + "step": 18064 + }, + { + "epoch": 2.69, + "grad_norm": 3.271351495468433, + "learning_rate": 9.228387489963236e-07, + "loss": 0.6201, + "step": 18065 + }, + { + "epoch": 2.69, + "grad_norm": 3.222094516013259, + "learning_rate": 9.227424294699571e-07, + "loss": 0.5833, + "step": 18066 + }, + { + "epoch": 2.69, + "grad_norm": 3.1857829139387084, + "learning_rate": 9.226461106646418e-07, + "loss": 0.6048, + "step": 18067 + }, + { + "epoch": 2.69, + "grad_norm": 3.409097702130692, + "learning_rate": 9.225497925812775e-07, + "loss": 0.6016, + "step": 18068 + }, + { + "epoch": 2.69, + "grad_norm": 3.9279052027519845, + "learning_rate": 9.224534752207626e-07, + "loss": 0.6094, + "step": 18069 + }, + { + "epoch": 2.7, + "grad_norm": 3.6157743143745575, + "learning_rate": 9.223571585839963e-07, + "loss": 0.6042, + "step": 18070 + }, + { + "epoch": 2.7, + "grad_norm": 4.255708176434571, + "learning_rate": 9.222608426718776e-07, + "loss": 0.6484, + "step": 18071 + }, + { + "epoch": 2.7, + "grad_norm": 3.101784906143681, + "learning_rate": 9.221645274853056e-07, + "loss": 0.6351, + "step": 18072 + }, + { + "epoch": 2.7, + "grad_norm": 3.1833687985422006, + "learning_rate": 9.220682130251786e-07, + "loss": 0.6159, + "step": 18073 + }, + { + "epoch": 2.7, + "grad_norm": 5.149633906044712, + "learning_rate": 9.219718992923959e-07, + "loss": 0.6582, + "step": 18074 + }, + { + "epoch": 2.7, + "grad_norm": 3.5323546863918276, + "learning_rate": 9.218755862878563e-07, + "loss": 0.5758, + "step": 18075 + }, + { + "epoch": 2.7, + "grad_norm": 3.584234949970057, + "learning_rate": 9.21779274012459e-07, + "loss": 0.6035, + "step": 18076 + }, + { + "epoch": 2.7, + "grad_norm": 4.475165377837814, + "learning_rate": 9.216829624671023e-07, + "loss": 0.6517, + "step": 18077 + }, + { + "epoch": 2.7, + "grad_norm": 4.202628077765428, + "learning_rate": 9.215866516526857e-07, + "loss": 0.6257, + "step": 18078 + }, + { + "epoch": 2.7, + "grad_norm": 4.982071610648805, + "learning_rate": 9.214903415701082e-07, + "loss": 0.6536, + "step": 18079 + }, + { + "epoch": 2.7, + "grad_norm": 3.3767023552644617, + "learning_rate": 9.213940322202677e-07, + "loss": 0.5859, + "step": 18080 + }, + { + "epoch": 2.7, + "grad_norm": 4.276115188225346, + "learning_rate": 9.212977236040636e-07, + "loss": 0.6647, + "step": 18081 + }, + { + "epoch": 2.7, + "grad_norm": 3.9692475065215307, + "learning_rate": 9.21201415722395e-07, + "loss": 0.6725, + "step": 18082 + }, + { + "epoch": 2.7, + "grad_norm": 4.121839105491323, + "learning_rate": 9.211051085761606e-07, + "loss": 0.6348, + "step": 18083 + }, + { + "epoch": 2.7, + "grad_norm": 3.0828693093895407, + "learning_rate": 9.210088021662592e-07, + "loss": 0.6576, + "step": 18084 + }, + { + "epoch": 2.7, + "grad_norm": 6.361864343008302, + "learning_rate": 9.209124964935898e-07, + "loss": 0.6185, + "step": 18085 + }, + { + "epoch": 2.7, + "grad_norm": 4.156013802551538, + "learning_rate": 9.208161915590508e-07, + "loss": 0.5967, + "step": 18086 + }, + { + "epoch": 2.7, + "grad_norm": 3.4116929097419906, + "learning_rate": 9.207198873635413e-07, + "loss": 0.6673, + "step": 18087 + }, + { + "epoch": 2.7, + "grad_norm": 3.2938241493108973, + "learning_rate": 9.206235839079602e-07, + "loss": 0.6198, + "step": 18088 + }, + { + "epoch": 2.7, + "grad_norm": 4.090473543895346, + "learning_rate": 9.205272811932061e-07, + "loss": 0.6009, + "step": 18089 + }, + { + "epoch": 2.7, + "grad_norm": 3.304391302460514, + "learning_rate": 9.204309792201779e-07, + "loss": 0.6146, + "step": 18090 + }, + { + "epoch": 2.7, + "grad_norm": 3.437747530029953, + "learning_rate": 9.203346779897743e-07, + "loss": 0.666, + "step": 18091 + }, + { + "epoch": 2.7, + "grad_norm": 4.11777780087901, + "learning_rate": 9.202383775028947e-07, + "loss": 0.6393, + "step": 18092 + }, + { + "epoch": 2.7, + "grad_norm": 3.842866237233455, + "learning_rate": 9.201420777604372e-07, + "loss": 0.5794, + "step": 18093 + }, + { + "epoch": 2.7, + "grad_norm": 3.1756350874005226, + "learning_rate": 9.200457787633004e-07, + "loss": 0.6289, + "step": 18094 + }, + { + "epoch": 2.7, + "grad_norm": 4.900272923030467, + "learning_rate": 9.199494805123837e-07, + "loss": 0.6751, + "step": 18095 + }, + { + "epoch": 2.7, + "grad_norm": 3.577942150165335, + "learning_rate": 9.198531830085854e-07, + "loss": 0.6003, + "step": 18096 + }, + { + "epoch": 2.7, + "grad_norm": 4.038124141170987, + "learning_rate": 9.197568862528046e-07, + "loss": 0.6204, + "step": 18097 + }, + { + "epoch": 2.7, + "grad_norm": 3.5737864916180584, + "learning_rate": 9.196605902459396e-07, + "loss": 0.653, + "step": 18098 + }, + { + "epoch": 2.7, + "grad_norm": 3.536081426376318, + "learning_rate": 9.195642949888899e-07, + "loss": 0.6068, + "step": 18099 + }, + { + "epoch": 2.7, + "grad_norm": 3.338119805673534, + "learning_rate": 9.194680004825536e-07, + "loss": 0.5768, + "step": 18100 + }, + { + "epoch": 2.7, + "grad_norm": 5.353644032924196, + "learning_rate": 9.193717067278293e-07, + "loss": 0.6185, + "step": 18101 + }, + { + "epoch": 2.7, + "grad_norm": 5.289447331863877, + "learning_rate": 9.192754137256161e-07, + "loss": 0.6836, + "step": 18102 + }, + { + "epoch": 2.7, + "grad_norm": 3.247450216646051, + "learning_rate": 9.191791214768125e-07, + "loss": 0.5944, + "step": 18103 + }, + { + "epoch": 2.7, + "grad_norm": 3.456305846015594, + "learning_rate": 9.190828299823173e-07, + "loss": 0.6484, + "step": 18104 + }, + { + "epoch": 2.7, + "grad_norm": 4.319535245853474, + "learning_rate": 9.189865392430297e-07, + "loss": 0.6738, + "step": 18105 + }, + { + "epoch": 2.7, + "grad_norm": 3.987303812912631, + "learning_rate": 9.188902492598476e-07, + "loss": 0.6458, + "step": 18106 + }, + { + "epoch": 2.7, + "grad_norm": 3.3234308052936163, + "learning_rate": 9.187939600336698e-07, + "loss": 0.6615, + "step": 18107 + }, + { + "epoch": 2.7, + "grad_norm": 3.3824335313595437, + "learning_rate": 9.186976715653953e-07, + "loss": 0.6667, + "step": 18108 + }, + { + "epoch": 2.7, + "grad_norm": 3.314077228820131, + "learning_rate": 9.186013838559226e-07, + "loss": 0.61, + "step": 18109 + }, + { + "epoch": 2.7, + "grad_norm": 4.598522306658079, + "learning_rate": 9.185050969061504e-07, + "loss": 0.6641, + "step": 18110 + }, + { + "epoch": 2.7, + "grad_norm": 3.707597025464148, + "learning_rate": 9.184088107169771e-07, + "loss": 0.6022, + "step": 18111 + }, + { + "epoch": 2.7, + "grad_norm": 3.118504224443284, + "learning_rate": 9.183125252893022e-07, + "loss": 0.653, + "step": 18112 + }, + { + "epoch": 2.7, + "grad_norm": 4.211124553787781, + "learning_rate": 9.182162406240231e-07, + "loss": 0.6152, + "step": 18113 + }, + { + "epoch": 2.7, + "grad_norm": 3.893839408759137, + "learning_rate": 9.181199567220395e-07, + "loss": 0.6465, + "step": 18114 + }, + { + "epoch": 2.7, + "grad_norm": 3.2693068104817264, + "learning_rate": 9.180236735842492e-07, + "loss": 0.6836, + "step": 18115 + }, + { + "epoch": 2.7, + "grad_norm": 3.771089442246397, + "learning_rate": 9.179273912115514e-07, + "loss": 0.6458, + "step": 18116 + }, + { + "epoch": 2.7, + "grad_norm": 4.602721098292529, + "learning_rate": 9.178311096048443e-07, + "loss": 0.64, + "step": 18117 + }, + { + "epoch": 2.7, + "grad_norm": 3.164301917450453, + "learning_rate": 9.177348287650273e-07, + "loss": 0.6419, + "step": 18118 + }, + { + "epoch": 2.7, + "grad_norm": 6.488285155826243, + "learning_rate": 9.176385486929981e-07, + "loss": 0.6686, + "step": 18119 + }, + { + "epoch": 2.7, + "grad_norm": 3.426274496421779, + "learning_rate": 9.175422693896553e-07, + "loss": 0.6758, + "step": 18120 + }, + { + "epoch": 2.7, + "grad_norm": 3.7456599006762348, + "learning_rate": 9.174459908558982e-07, + "loss": 0.681, + "step": 18121 + }, + { + "epoch": 2.7, + "grad_norm": 3.900765787659097, + "learning_rate": 9.173497130926245e-07, + "loss": 0.6667, + "step": 18122 + }, + { + "epoch": 2.7, + "grad_norm": 2.8494846789310806, + "learning_rate": 9.172534361007337e-07, + "loss": 0.6523, + "step": 18123 + }, + { + "epoch": 2.7, + "grad_norm": 3.100763448607279, + "learning_rate": 9.171571598811236e-07, + "loss": 0.6406, + "step": 18124 + }, + { + "epoch": 2.7, + "grad_norm": 2.8232119490901284, + "learning_rate": 9.170608844346935e-07, + "loss": 0.6055, + "step": 18125 + }, + { + "epoch": 2.7, + "grad_norm": 5.587911868924834, + "learning_rate": 9.16964609762341e-07, + "loss": 0.6419, + "step": 18126 + }, + { + "epoch": 2.7, + "grad_norm": 3.13627365525406, + "learning_rate": 9.168683358649653e-07, + "loss": 0.6426, + "step": 18127 + }, + { + "epoch": 2.7, + "grad_norm": 4.367491518039062, + "learning_rate": 9.167720627434648e-07, + "loss": 0.6302, + "step": 18128 + }, + { + "epoch": 2.7, + "grad_norm": 3.0931317387437325, + "learning_rate": 9.166757903987381e-07, + "loss": 0.6771, + "step": 18129 + }, + { + "epoch": 2.7, + "grad_norm": 2.924271670126176, + "learning_rate": 9.165795188316833e-07, + "loss": 0.6335, + "step": 18130 + }, + { + "epoch": 2.7, + "grad_norm": 3.020027402081508, + "learning_rate": 9.164832480431996e-07, + "loss": 0.6322, + "step": 18131 + }, + { + "epoch": 2.7, + "grad_norm": 2.562092251016536, + "learning_rate": 9.163869780341852e-07, + "loss": 0.6289, + "step": 18132 + }, + { + "epoch": 2.7, + "grad_norm": 3.185989782403527, + "learning_rate": 9.162907088055382e-07, + "loss": 0.6172, + "step": 18133 + }, + { + "epoch": 2.7, + "grad_norm": 3.18674756645961, + "learning_rate": 9.161944403581575e-07, + "loss": 0.6257, + "step": 18134 + }, + { + "epoch": 2.7, + "grad_norm": 5.788869352507617, + "learning_rate": 9.160981726929414e-07, + "loss": 0.61, + "step": 18135 + }, + { + "epoch": 2.7, + "grad_norm": 4.77737002447882, + "learning_rate": 9.160019058107885e-07, + "loss": 0.5944, + "step": 18136 + }, + { + "epoch": 2.7, + "grad_norm": 3.904818864654025, + "learning_rate": 9.159056397125974e-07, + "loss": 0.6296, + "step": 18137 + }, + { + "epoch": 2.71, + "grad_norm": 2.817070019309717, + "learning_rate": 9.158093743992667e-07, + "loss": 0.6061, + "step": 18138 + }, + { + "epoch": 2.71, + "grad_norm": 2.8597253107620135, + "learning_rate": 9.15713109871694e-07, + "loss": 0.6126, + "step": 18139 + }, + { + "epoch": 2.71, + "grad_norm": 3.337682796214984, + "learning_rate": 9.156168461307785e-07, + "loss": 0.6615, + "step": 18140 + }, + { + "epoch": 2.71, + "grad_norm": 3.622732825150123, + "learning_rate": 9.155205831774183e-07, + "loss": 0.6423, + "step": 18141 + }, + { + "epoch": 2.71, + "grad_norm": 5.329592685537281, + "learning_rate": 9.154243210125122e-07, + "loss": 0.6237, + "step": 18142 + }, + { + "epoch": 2.71, + "grad_norm": 4.923177347317991, + "learning_rate": 9.153280596369582e-07, + "loss": 0.6536, + "step": 18143 + }, + { + "epoch": 2.71, + "grad_norm": 3.8004037282501497, + "learning_rate": 9.15231799051655e-07, + "loss": 0.653, + "step": 18144 + }, + { + "epoch": 2.71, + "grad_norm": 3.8290465635874837, + "learning_rate": 9.151355392575012e-07, + "loss": 0.599, + "step": 18145 + }, + { + "epoch": 2.71, + "grad_norm": 3.3425841918446375, + "learning_rate": 9.150392802553945e-07, + "loss": 0.6419, + "step": 18146 + }, + { + "epoch": 2.71, + "grad_norm": 3.416536044510755, + "learning_rate": 9.149430220462337e-07, + "loss": 0.5964, + "step": 18147 + }, + { + "epoch": 2.71, + "grad_norm": 4.054075125724332, + "learning_rate": 9.148467646309175e-07, + "loss": 0.6133, + "step": 18148 + }, + { + "epoch": 2.71, + "grad_norm": 3.0053941372707556, + "learning_rate": 9.147505080103436e-07, + "loss": 0.6243, + "step": 18149 + }, + { + "epoch": 2.71, + "grad_norm": 4.644271665627855, + "learning_rate": 9.14654252185411e-07, + "loss": 0.6188, + "step": 18150 + }, + { + "epoch": 2.71, + "grad_norm": 3.3604381988282395, + "learning_rate": 9.145579971570177e-07, + "loss": 0.6452, + "step": 18151 + }, + { + "epoch": 2.71, + "grad_norm": 3.321865098532106, + "learning_rate": 9.144617429260625e-07, + "loss": 0.6341, + "step": 18152 + }, + { + "epoch": 2.71, + "grad_norm": 3.693569344972487, + "learning_rate": 9.143654894934433e-07, + "loss": 0.6647, + "step": 18153 + }, + { + "epoch": 2.71, + "grad_norm": 4.4457999363389575, + "learning_rate": 9.142692368600582e-07, + "loss": 0.6569, + "step": 18154 + }, + { + "epoch": 2.71, + "grad_norm": 3.643255321809023, + "learning_rate": 9.141729850268062e-07, + "loss": 0.6426, + "step": 18155 + }, + { + "epoch": 2.71, + "grad_norm": 4.115093180476964, + "learning_rate": 9.140767339945851e-07, + "loss": 0.5957, + "step": 18156 + }, + { + "epoch": 2.71, + "grad_norm": 4.090527448017933, + "learning_rate": 9.139804837642935e-07, + "loss": 0.6673, + "step": 18157 + }, + { + "epoch": 2.71, + "grad_norm": 3.667164766847176, + "learning_rate": 9.138842343368299e-07, + "loss": 0.6165, + "step": 18158 + }, + { + "epoch": 2.71, + "grad_norm": 7.128525209553047, + "learning_rate": 9.137879857130923e-07, + "loss": 0.6178, + "step": 18159 + }, + { + "epoch": 2.71, + "grad_norm": 3.8852586912674747, + "learning_rate": 9.136917378939788e-07, + "loss": 0.6543, + "step": 18160 + }, + { + "epoch": 2.71, + "grad_norm": 3.481129712284533, + "learning_rate": 9.135954908803882e-07, + "loss": 0.6126, + "step": 18161 + }, + { + "epoch": 2.71, + "grad_norm": 3.7743357282892798, + "learning_rate": 9.134992446732182e-07, + "loss": 0.6374, + "step": 18162 + }, + { + "epoch": 2.71, + "grad_norm": 3.7360800423710994, + "learning_rate": 9.134029992733677e-07, + "loss": 0.6445, + "step": 18163 + }, + { + "epoch": 2.71, + "grad_norm": 3.2022180831310956, + "learning_rate": 9.133067546817344e-07, + "loss": 0.6068, + "step": 18164 + }, + { + "epoch": 2.71, + "grad_norm": 3.178119365361533, + "learning_rate": 9.132105108992173e-07, + "loss": 0.571, + "step": 18165 + }, + { + "epoch": 2.71, + "grad_norm": 3.639202277281453, + "learning_rate": 9.131142679267139e-07, + "loss": 0.6152, + "step": 18166 + }, + { + "epoch": 2.71, + "grad_norm": 3.5987556872988793, + "learning_rate": 9.130180257651227e-07, + "loss": 0.6133, + "step": 18167 + }, + { + "epoch": 2.71, + "grad_norm": 4.1258757705675695, + "learning_rate": 9.129217844153418e-07, + "loss": 0.6348, + "step": 18168 + }, + { + "epoch": 2.71, + "grad_norm": 4.908474067960074, + "learning_rate": 9.128255438782697e-07, + "loss": 0.6849, + "step": 18169 + }, + { + "epoch": 2.71, + "grad_norm": 5.789175521778792, + "learning_rate": 9.127293041548044e-07, + "loss": 0.7012, + "step": 18170 + }, + { + "epoch": 2.71, + "grad_norm": 3.7318750119191995, + "learning_rate": 9.126330652458447e-07, + "loss": 0.6471, + "step": 18171 + }, + { + "epoch": 2.71, + "grad_norm": 4.26274793616103, + "learning_rate": 9.125368271522881e-07, + "loss": 0.6204, + "step": 18172 + }, + { + "epoch": 2.71, + "grad_norm": 6.004910380569, + "learning_rate": 9.124405898750328e-07, + "loss": 0.6136, + "step": 18173 + }, + { + "epoch": 2.71, + "grad_norm": 4.636443585673921, + "learning_rate": 9.123443534149774e-07, + "loss": 0.6432, + "step": 18174 + }, + { + "epoch": 2.71, + "grad_norm": 4.125385836932116, + "learning_rate": 9.122481177730197e-07, + "loss": 0.6621, + "step": 18175 + }, + { + "epoch": 2.71, + "grad_norm": 3.8713513844104495, + "learning_rate": 9.121518829500583e-07, + "loss": 0.681, + "step": 18176 + }, + { + "epoch": 2.71, + "grad_norm": 4.215554464610303, + "learning_rate": 9.12055648946991e-07, + "loss": 0.7135, + "step": 18177 + }, + { + "epoch": 2.71, + "grad_norm": 4.7573237018756735, + "learning_rate": 9.119594157647163e-07, + "loss": 0.6185, + "step": 18178 + }, + { + "epoch": 2.71, + "grad_norm": 4.142019555930571, + "learning_rate": 9.118631834041319e-07, + "loss": 0.668, + "step": 18179 + }, + { + "epoch": 2.71, + "grad_norm": 3.9007288060918444, + "learning_rate": 9.117669518661365e-07, + "loss": 0.5638, + "step": 18180 + }, + { + "epoch": 2.71, + "grad_norm": 3.3637255217504616, + "learning_rate": 9.116707211516276e-07, + "loss": 0.5745, + "step": 18181 + }, + { + "epoch": 2.71, + "grad_norm": 3.5831745815936995, + "learning_rate": 9.115744912615039e-07, + "loss": 0.625, + "step": 18182 + }, + { + "epoch": 2.71, + "grad_norm": 3.619856969729984, + "learning_rate": 9.114782621966632e-07, + "loss": 0.6751, + "step": 18183 + }, + { + "epoch": 2.71, + "grad_norm": 3.462447573823886, + "learning_rate": 9.113820339580037e-07, + "loss": 0.6432, + "step": 18184 + }, + { + "epoch": 2.71, + "grad_norm": 3.2524602129399325, + "learning_rate": 9.11285806546424e-07, + "loss": 0.6335, + "step": 18185 + }, + { + "epoch": 2.71, + "grad_norm": 3.701709670679784, + "learning_rate": 9.111895799628211e-07, + "loss": 0.64, + "step": 18186 + }, + { + "epoch": 2.71, + "grad_norm": 3.2207303736701736, + "learning_rate": 9.110933542080938e-07, + "loss": 0.6517, + "step": 18187 + }, + { + "epoch": 2.71, + "grad_norm": 3.406136711411864, + "learning_rate": 9.109971292831401e-07, + "loss": 0.6152, + "step": 18188 + }, + { + "epoch": 2.71, + "grad_norm": 4.081087191467893, + "learning_rate": 9.109009051888582e-07, + "loss": 0.6439, + "step": 18189 + }, + { + "epoch": 2.71, + "grad_norm": 3.678648088870373, + "learning_rate": 9.108046819261459e-07, + "loss": 0.5983, + "step": 18190 + }, + { + "epoch": 2.71, + "grad_norm": 4.802124775078641, + "learning_rate": 9.107084594959017e-07, + "loss": 0.6406, + "step": 18191 + }, + { + "epoch": 2.71, + "grad_norm": 3.558880195831095, + "learning_rate": 9.10612237899023e-07, + "loss": 0.6445, + "step": 18192 + }, + { + "epoch": 2.71, + "grad_norm": 6.675063016071835, + "learning_rate": 9.105160171364084e-07, + "loss": 0.6406, + "step": 18193 + }, + { + "epoch": 2.71, + "grad_norm": 3.348777888472594, + "learning_rate": 9.104197972089553e-07, + "loss": 0.6087, + "step": 18194 + }, + { + "epoch": 2.71, + "grad_norm": 3.1344093629295453, + "learning_rate": 9.103235781175626e-07, + "loss": 0.6882, + "step": 18195 + }, + { + "epoch": 2.71, + "grad_norm": 3.1443358381636792, + "learning_rate": 9.102273598631275e-07, + "loss": 0.6497, + "step": 18196 + }, + { + "epoch": 2.71, + "grad_norm": 3.0421079517166234, + "learning_rate": 9.101311424465489e-07, + "loss": 0.625, + "step": 18197 + }, + { + "epoch": 2.71, + "grad_norm": 3.380269227505805, + "learning_rate": 9.100349258687242e-07, + "loss": 0.64, + "step": 18198 + }, + { + "epoch": 2.71, + "grad_norm": 3.4791889332117827, + "learning_rate": 9.099387101305513e-07, + "loss": 0.627, + "step": 18199 + }, + { + "epoch": 2.71, + "grad_norm": 3.1378182679735467, + "learning_rate": 9.098424952329284e-07, + "loss": 0.6699, + "step": 18200 + }, + { + "epoch": 2.71, + "grad_norm": 3.1859077755375473, + "learning_rate": 9.097462811767533e-07, + "loss": 0.64, + "step": 18201 + }, + { + "epoch": 2.71, + "grad_norm": 3.1424853905518177, + "learning_rate": 9.096500679629242e-07, + "loss": 0.5957, + "step": 18202 + }, + { + "epoch": 2.71, + "grad_norm": 6.337498457470772, + "learning_rate": 9.095538555923392e-07, + "loss": 0.6185, + "step": 18203 + }, + { + "epoch": 2.71, + "grad_norm": 3.225174939089237, + "learning_rate": 9.094576440658963e-07, + "loss": 0.6322, + "step": 18204 + }, + { + "epoch": 2.72, + "grad_norm": 4.6184353598153765, + "learning_rate": 9.093614333844927e-07, + "loss": 0.6257, + "step": 18205 + }, + { + "epoch": 2.72, + "grad_norm": 4.569148673328583, + "learning_rate": 9.092652235490271e-07, + "loss": 0.6406, + "step": 18206 + }, + { + "epoch": 2.72, + "grad_norm": 3.415467863135101, + "learning_rate": 9.09169014560397e-07, + "loss": 0.6146, + "step": 18207 + }, + { + "epoch": 2.72, + "grad_norm": 3.234231927589386, + "learning_rate": 9.090728064195008e-07, + "loss": 0.6445, + "step": 18208 + }, + { + "epoch": 2.72, + "grad_norm": 3.321810834254782, + "learning_rate": 9.089765991272358e-07, + "loss": 0.6165, + "step": 18209 + }, + { + "epoch": 2.72, + "grad_norm": 4.9990713325767775, + "learning_rate": 9.088803926845006e-07, + "loss": 0.6432, + "step": 18210 + }, + { + "epoch": 2.72, + "grad_norm": 3.3240009919678175, + "learning_rate": 9.087841870921929e-07, + "loss": 0.6185, + "step": 18211 + }, + { + "epoch": 2.72, + "grad_norm": 7.027374764297422, + "learning_rate": 9.086879823512099e-07, + "loss": 0.6816, + "step": 18212 + }, + { + "epoch": 2.72, + "grad_norm": 3.302208783460543, + "learning_rate": 9.085917784624501e-07, + "loss": 0.6289, + "step": 18213 + }, + { + "epoch": 2.72, + "grad_norm": 4.266054125638214, + "learning_rate": 9.084955754268116e-07, + "loss": 0.638, + "step": 18214 + }, + { + "epoch": 2.72, + "grad_norm": 3.733800327276837, + "learning_rate": 9.083993732451918e-07, + "loss": 0.612, + "step": 18215 + }, + { + "epoch": 2.72, + "grad_norm": 3.344563084735018, + "learning_rate": 9.08303171918489e-07, + "loss": 0.6589, + "step": 18216 + }, + { + "epoch": 2.72, + "grad_norm": 7.613574430509282, + "learning_rate": 9.082069714476005e-07, + "loss": 0.6504, + "step": 18217 + }, + { + "epoch": 2.72, + "grad_norm": 3.375606424129696, + "learning_rate": 9.081107718334249e-07, + "loss": 0.638, + "step": 18218 + }, + { + "epoch": 2.72, + "grad_norm": 4.565006351632212, + "learning_rate": 9.080145730768594e-07, + "loss": 0.6673, + "step": 18219 + }, + { + "epoch": 2.72, + "grad_norm": 4.571497743861842, + "learning_rate": 9.079183751788016e-07, + "loss": 0.6842, + "step": 18220 + }, + { + "epoch": 2.72, + "grad_norm": 3.657974958944332, + "learning_rate": 9.078221781401503e-07, + "loss": 0.5996, + "step": 18221 + }, + { + "epoch": 2.72, + "grad_norm": 4.00651032680078, + "learning_rate": 9.077259819618022e-07, + "loss": 0.6354, + "step": 18222 + }, + { + "epoch": 2.72, + "grad_norm": 6.0661068449665345, + "learning_rate": 9.07629786644656e-07, + "loss": 0.6445, + "step": 18223 + }, + { + "epoch": 2.72, + "grad_norm": 5.806421820101088, + "learning_rate": 9.075335921896095e-07, + "loss": 0.6439, + "step": 18224 + }, + { + "epoch": 2.72, + "grad_norm": 6.0669652329281725, + "learning_rate": 9.074373985975598e-07, + "loss": 0.6602, + "step": 18225 + }, + { + "epoch": 2.72, + "grad_norm": 5.709747493012315, + "learning_rate": 9.073412058694049e-07, + "loss": 0.6882, + "step": 18226 + }, + { + "epoch": 2.72, + "grad_norm": 3.7147973048511895, + "learning_rate": 9.07245014006043e-07, + "loss": 0.6465, + "step": 18227 + }, + { + "epoch": 2.72, + "grad_norm": 4.166982526552127, + "learning_rate": 9.071488230083714e-07, + "loss": 0.6081, + "step": 18228 + }, + { + "epoch": 2.72, + "grad_norm": 6.536468816342005, + "learning_rate": 9.070526328772882e-07, + "loss": 0.6673, + "step": 18229 + }, + { + "epoch": 2.72, + "grad_norm": 2.9503490236893914, + "learning_rate": 9.069564436136908e-07, + "loss": 0.6035, + "step": 18230 + }, + { + "epoch": 2.72, + "grad_norm": 4.383141848223786, + "learning_rate": 9.068602552184775e-07, + "loss": 0.6686, + "step": 18231 + }, + { + "epoch": 2.72, + "grad_norm": 3.3928224391650943, + "learning_rate": 9.067640676925455e-07, + "loss": 0.6458, + "step": 18232 + }, + { + "epoch": 2.72, + "grad_norm": 2.962909427310737, + "learning_rate": 9.066678810367926e-07, + "loss": 0.6211, + "step": 18233 + }, + { + "epoch": 2.72, + "grad_norm": 4.146912240721751, + "learning_rate": 9.065716952521166e-07, + "loss": 0.6432, + "step": 18234 + }, + { + "epoch": 2.72, + "grad_norm": 4.6073605185959785, + "learning_rate": 9.064755103394152e-07, + "loss": 0.6419, + "step": 18235 + }, + { + "epoch": 2.72, + "grad_norm": 5.649165239828051, + "learning_rate": 9.063793262995861e-07, + "loss": 0.6582, + "step": 18236 + }, + { + "epoch": 2.72, + "grad_norm": 3.5984541770498772, + "learning_rate": 9.062831431335275e-07, + "loss": 0.6224, + "step": 18237 + }, + { + "epoch": 2.72, + "grad_norm": 3.77731401218629, + "learning_rate": 9.061869608421363e-07, + "loss": 0.6673, + "step": 18238 + }, + { + "epoch": 2.72, + "grad_norm": 3.170561744046858, + "learning_rate": 9.060907794263103e-07, + "loss": 0.6172, + "step": 18239 + }, + { + "epoch": 2.72, + "grad_norm": 3.9383492485525102, + "learning_rate": 9.059945988869477e-07, + "loss": 0.6445, + "step": 18240 + }, + { + "epoch": 2.72, + "grad_norm": 3.0324132265249277, + "learning_rate": 9.058984192249454e-07, + "loss": 0.6745, + "step": 18241 + }, + { + "epoch": 2.72, + "grad_norm": 3.355677555794187, + "learning_rate": 9.058022404412018e-07, + "loss": 0.64, + "step": 18242 + }, + { + "epoch": 2.72, + "grad_norm": 3.3797492617247573, + "learning_rate": 9.057060625366139e-07, + "loss": 0.6706, + "step": 18243 + }, + { + "epoch": 2.72, + "grad_norm": 3.0737314031003073, + "learning_rate": 9.056098855120803e-07, + "loss": 0.6621, + "step": 18244 + }, + { + "epoch": 2.72, + "grad_norm": 5.8506019913576655, + "learning_rate": 9.055137093684977e-07, + "loss": 0.6569, + "step": 18245 + }, + { + "epoch": 2.72, + "grad_norm": 2.8372179401823674, + "learning_rate": 9.054175341067637e-07, + "loss": 0.6178, + "step": 18246 + }, + { + "epoch": 2.72, + "grad_norm": 3.560056148561132, + "learning_rate": 9.053213597277762e-07, + "loss": 0.6113, + "step": 18247 + }, + { + "epoch": 2.72, + "grad_norm": 3.5691756955292435, + "learning_rate": 9.052251862324332e-07, + "loss": 0.6374, + "step": 18248 + }, + { + "epoch": 2.72, + "grad_norm": 2.9247914459582716, + "learning_rate": 9.051290136216315e-07, + "loss": 0.6237, + "step": 18249 + }, + { + "epoch": 2.72, + "grad_norm": 4.08931870798089, + "learning_rate": 9.050328418962695e-07, + "loss": 0.6432, + "step": 18250 + }, + { + "epoch": 2.72, + "grad_norm": 3.8199042946239565, + "learning_rate": 9.049366710572444e-07, + "loss": 0.6608, + "step": 18251 + }, + { + "epoch": 2.72, + "grad_norm": 3.5109587764281818, + "learning_rate": 9.048405011054535e-07, + "loss": 0.6348, + "step": 18252 + }, + { + "epoch": 2.72, + "grad_norm": 2.8834622721756746, + "learning_rate": 9.047443320417949e-07, + "loss": 0.6309, + "step": 18253 + }, + { + "epoch": 2.72, + "grad_norm": 3.402150723466513, + "learning_rate": 9.046481638671654e-07, + "loss": 0.6335, + "step": 18254 + }, + { + "epoch": 2.72, + "grad_norm": 3.5573506279055027, + "learning_rate": 9.045519965824635e-07, + "loss": 0.6302, + "step": 18255 + }, + { + "epoch": 2.72, + "grad_norm": 2.8091121706845215, + "learning_rate": 9.044558301885858e-07, + "loss": 0.6582, + "step": 18256 + }, + { + "epoch": 2.72, + "grad_norm": 3.344345546376157, + "learning_rate": 9.043596646864309e-07, + "loss": 0.6491, + "step": 18257 + }, + { + "epoch": 2.72, + "grad_norm": 7.245976216688334, + "learning_rate": 9.042635000768953e-07, + "loss": 0.64, + "step": 18258 + }, + { + "epoch": 2.72, + "grad_norm": 4.192767834778094, + "learning_rate": 9.04167336360877e-07, + "loss": 0.6374, + "step": 18259 + }, + { + "epoch": 2.72, + "grad_norm": 4.444611525074938, + "learning_rate": 9.040711735392731e-07, + "loss": 0.6543, + "step": 18260 + }, + { + "epoch": 2.72, + "grad_norm": 4.317623310956304, + "learning_rate": 9.03975011612982e-07, + "loss": 0.6471, + "step": 18261 + }, + { + "epoch": 2.72, + "grad_norm": 3.114466597548727, + "learning_rate": 9.038788505829001e-07, + "loss": 0.6426, + "step": 18262 + }, + { + "epoch": 2.72, + "grad_norm": 2.7349117847232645, + "learning_rate": 9.037826904499257e-07, + "loss": 0.5853, + "step": 18263 + }, + { + "epoch": 2.72, + "grad_norm": 4.531006550600416, + "learning_rate": 9.036865312149562e-07, + "loss": 0.6237, + "step": 18264 + }, + { + "epoch": 2.72, + "grad_norm": 2.939798796801863, + "learning_rate": 9.035903728788883e-07, + "loss": 0.6393, + "step": 18265 + }, + { + "epoch": 2.72, + "grad_norm": 3.1851847881281374, + "learning_rate": 9.034942154426202e-07, + "loss": 0.6549, + "step": 18266 + }, + { + "epoch": 2.72, + "grad_norm": 3.5954291423902442, + "learning_rate": 9.03398058907049e-07, + "loss": 0.6641, + "step": 18267 + }, + { + "epoch": 2.72, + "grad_norm": 3.779779299860197, + "learning_rate": 9.033019032730722e-07, + "loss": 0.6829, + "step": 18268 + }, + { + "epoch": 2.72, + "grad_norm": 3.079425380793196, + "learning_rate": 9.032057485415876e-07, + "loss": 0.6406, + "step": 18269 + }, + { + "epoch": 2.72, + "grad_norm": 4.9276323744324975, + "learning_rate": 9.031095947134923e-07, + "loss": 0.6204, + "step": 18270 + }, + { + "epoch": 2.72, + "grad_norm": 3.100011079752312, + "learning_rate": 9.030134417896835e-07, + "loss": 0.625, + "step": 18271 + }, + { + "epoch": 2.73, + "grad_norm": 3.9304244266846875, + "learning_rate": 9.02917289771059e-07, + "loss": 0.6022, + "step": 18272 + }, + { + "epoch": 2.73, + "grad_norm": 3.471118484985715, + "learning_rate": 9.028211386585157e-07, + "loss": 0.6169, + "step": 18273 + }, + { + "epoch": 2.73, + "grad_norm": 4.7145839080249985, + "learning_rate": 9.027249884529517e-07, + "loss": 0.7168, + "step": 18274 + }, + { + "epoch": 2.73, + "grad_norm": 3.713954233976205, + "learning_rate": 9.026288391552636e-07, + "loss": 0.6061, + "step": 18275 + }, + { + "epoch": 2.73, + "grad_norm": 2.8361783635973787, + "learning_rate": 9.025326907663495e-07, + "loss": 0.5934, + "step": 18276 + }, + { + "epoch": 2.73, + "grad_norm": 3.4805158342506153, + "learning_rate": 9.024365432871065e-07, + "loss": 0.6087, + "step": 18277 + }, + { + "epoch": 2.73, + "grad_norm": 3.373653373642267, + "learning_rate": 9.023403967184316e-07, + "loss": 0.6055, + "step": 18278 + }, + { + "epoch": 2.73, + "grad_norm": 3.245473197015915, + "learning_rate": 9.022442510612223e-07, + "loss": 0.596, + "step": 18279 + }, + { + "epoch": 2.73, + "grad_norm": 4.212364415117663, + "learning_rate": 9.021481063163763e-07, + "loss": 0.6426, + "step": 18280 + }, + { + "epoch": 2.73, + "grad_norm": 3.00520093837294, + "learning_rate": 9.020519624847904e-07, + "loss": 0.6211, + "step": 18281 + }, + { + "epoch": 2.73, + "grad_norm": 5.633585047085869, + "learning_rate": 9.019558195673624e-07, + "loss": 0.6777, + "step": 18282 + }, + { + "epoch": 2.73, + "grad_norm": 3.5339583759892776, + "learning_rate": 9.018596775649893e-07, + "loss": 0.6178, + "step": 18283 + }, + { + "epoch": 2.73, + "grad_norm": 4.6081059504282535, + "learning_rate": 9.017635364785689e-07, + "loss": 0.6712, + "step": 18284 + }, + { + "epoch": 2.73, + "grad_norm": 5.6974316850431315, + "learning_rate": 9.016673963089979e-07, + "loss": 0.668, + "step": 18285 + }, + { + "epoch": 2.73, + "grad_norm": 3.4821231514968973, + "learning_rate": 9.015712570571736e-07, + "loss": 0.6478, + "step": 18286 + }, + { + "epoch": 2.73, + "grad_norm": 3.322925927184762, + "learning_rate": 9.014751187239935e-07, + "loss": 0.5885, + "step": 18287 + }, + { + "epoch": 2.73, + "grad_norm": 3.641770734849863, + "learning_rate": 9.013789813103548e-07, + "loss": 0.6374, + "step": 18288 + }, + { + "epoch": 2.73, + "grad_norm": 3.5373466829724673, + "learning_rate": 9.012828448171548e-07, + "loss": 0.5957, + "step": 18289 + }, + { + "epoch": 2.73, + "grad_norm": 3.2793686490578473, + "learning_rate": 9.01186709245291e-07, + "loss": 0.6276, + "step": 18290 + }, + { + "epoch": 2.73, + "grad_norm": 4.272035222282402, + "learning_rate": 9.010905745956603e-07, + "loss": 0.6361, + "step": 18291 + }, + { + "epoch": 2.73, + "grad_norm": 4.777644078013765, + "learning_rate": 9.009944408691598e-07, + "loss": 0.6608, + "step": 18292 + }, + { + "epoch": 2.73, + "grad_norm": 3.4727318435254015, + "learning_rate": 9.008983080666871e-07, + "loss": 0.6654, + "step": 18293 + }, + { + "epoch": 2.73, + "grad_norm": 4.4289742299333685, + "learning_rate": 9.008021761891391e-07, + "loss": 0.6673, + "step": 18294 + }, + { + "epoch": 2.73, + "grad_norm": 3.993340659464121, + "learning_rate": 9.007060452374133e-07, + "loss": 0.6491, + "step": 18295 + }, + { + "epoch": 2.73, + "grad_norm": 3.4621049365301286, + "learning_rate": 9.006099152124066e-07, + "loss": 0.6413, + "step": 18296 + }, + { + "epoch": 2.73, + "grad_norm": 3.262285739585931, + "learning_rate": 9.005137861150167e-07, + "loss": 0.6543, + "step": 18297 + }, + { + "epoch": 2.73, + "grad_norm": 4.579014670311217, + "learning_rate": 9.004176579461403e-07, + "loss": 0.679, + "step": 18298 + }, + { + "epoch": 2.73, + "grad_norm": 3.2663453864725907, + "learning_rate": 9.003215307066744e-07, + "loss": 0.6387, + "step": 18299 + }, + { + "epoch": 2.73, + "grad_norm": 4.893096549780903, + "learning_rate": 9.002254043975166e-07, + "loss": 0.6361, + "step": 18300 + }, + { + "epoch": 2.73, + "grad_norm": 3.5988253776861496, + "learning_rate": 9.001292790195641e-07, + "loss": 0.6549, + "step": 18301 + }, + { + "epoch": 2.73, + "grad_norm": 3.6140026359701243, + "learning_rate": 9.000331545737137e-07, + "loss": 0.6328, + "step": 18302 + }, + { + "epoch": 2.73, + "grad_norm": 3.1879751033492885, + "learning_rate": 8.99937031060863e-07, + "loss": 0.6426, + "step": 18303 + }, + { + "epoch": 2.73, + "grad_norm": 3.229040838679359, + "learning_rate": 8.998409084819087e-07, + "loss": 0.6595, + "step": 18304 + }, + { + "epoch": 2.73, + "grad_norm": 3.132162634028491, + "learning_rate": 8.997447868377479e-07, + "loss": 0.6055, + "step": 18305 + }, + { + "epoch": 2.73, + "grad_norm": 3.94857299648251, + "learning_rate": 8.99648666129278e-07, + "loss": 0.6693, + "step": 18306 + }, + { + "epoch": 2.73, + "grad_norm": 3.2186271678689335, + "learning_rate": 8.995525463573958e-07, + "loss": 0.668, + "step": 18307 + }, + { + "epoch": 2.73, + "grad_norm": 3.7408961024761997, + "learning_rate": 8.994564275229987e-07, + "loss": 0.6549, + "step": 18308 + }, + { + "epoch": 2.73, + "grad_norm": 2.980323764778065, + "learning_rate": 8.993603096269835e-07, + "loss": 0.6051, + "step": 18309 + }, + { + "epoch": 2.73, + "grad_norm": 3.0455746925399816, + "learning_rate": 8.992641926702479e-07, + "loss": 0.625, + "step": 18310 + }, + { + "epoch": 2.73, + "grad_norm": 4.103277198651808, + "learning_rate": 8.991680766536883e-07, + "loss": 0.6426, + "step": 18311 + }, + { + "epoch": 2.73, + "grad_norm": 3.6793503238865677, + "learning_rate": 8.990719615782017e-07, + "loss": 0.6673, + "step": 18312 + }, + { + "epoch": 2.73, + "grad_norm": 3.048104260263636, + "learning_rate": 8.989758474446854e-07, + "loss": 0.5573, + "step": 18313 + }, + { + "epoch": 2.73, + "grad_norm": 4.190087094799505, + "learning_rate": 8.988797342540367e-07, + "loss": 0.5684, + "step": 18314 + }, + { + "epoch": 2.73, + "grad_norm": 4.330680909548514, + "learning_rate": 8.987836220071521e-07, + "loss": 0.6439, + "step": 18315 + }, + { + "epoch": 2.73, + "grad_norm": 3.0759624238472707, + "learning_rate": 8.986875107049291e-07, + "loss": 0.6582, + "step": 18316 + }, + { + "epoch": 2.73, + "grad_norm": 6.154466164293348, + "learning_rate": 8.985914003482649e-07, + "loss": 0.6309, + "step": 18317 + }, + { + "epoch": 2.73, + "grad_norm": 3.4894649929296278, + "learning_rate": 8.984952909380556e-07, + "loss": 0.6348, + "step": 18318 + }, + { + "epoch": 2.73, + "grad_norm": 4.8116778625582866, + "learning_rate": 8.983991824751989e-07, + "loss": 0.6536, + "step": 18319 + }, + { + "epoch": 2.73, + "grad_norm": 3.6050346369517396, + "learning_rate": 8.983030749605914e-07, + "loss": 0.6361, + "step": 18320 + }, + { + "epoch": 2.73, + "grad_norm": 3.2777133397674736, + "learning_rate": 8.982069683951305e-07, + "loss": 0.6432, + "step": 18321 + }, + { + "epoch": 2.73, + "grad_norm": 3.357500808205302, + "learning_rate": 8.981108627797128e-07, + "loss": 0.6829, + "step": 18322 + }, + { + "epoch": 2.73, + "grad_norm": 3.1593275315971057, + "learning_rate": 8.980147581152358e-07, + "loss": 0.5996, + "step": 18323 + }, + { + "epoch": 2.73, + "grad_norm": 4.420034320980556, + "learning_rate": 8.979186544025958e-07, + "loss": 0.6051, + "step": 18324 + }, + { + "epoch": 2.73, + "grad_norm": 4.169487310625603, + "learning_rate": 8.978225516426901e-07, + "loss": 0.6901, + "step": 18325 + }, + { + "epoch": 2.73, + "grad_norm": 3.2177401927880607, + "learning_rate": 8.977264498364154e-07, + "loss": 0.624, + "step": 18326 + }, + { + "epoch": 2.73, + "grad_norm": 4.725892455633093, + "learning_rate": 8.976303489846691e-07, + "loss": 0.6406, + "step": 18327 + }, + { + "epoch": 2.73, + "grad_norm": 4.852582008416363, + "learning_rate": 8.975342490883474e-07, + "loss": 0.6543, + "step": 18328 + }, + { + "epoch": 2.73, + "grad_norm": 4.07905801995058, + "learning_rate": 8.974381501483478e-07, + "loss": 0.6719, + "step": 18329 + }, + { + "epoch": 2.73, + "grad_norm": 4.032940411388419, + "learning_rate": 8.973420521655675e-07, + "loss": 0.6628, + "step": 18330 + }, + { + "epoch": 2.73, + "grad_norm": 4.085900047650812, + "learning_rate": 8.972459551409022e-07, + "loss": 0.666, + "step": 18331 + }, + { + "epoch": 2.73, + "grad_norm": 4.399569325361978, + "learning_rate": 8.9714985907525e-07, + "loss": 0.6445, + "step": 18332 + }, + { + "epoch": 2.73, + "grad_norm": 3.455195528684534, + "learning_rate": 8.970537639695069e-07, + "loss": 0.6452, + "step": 18333 + }, + { + "epoch": 2.73, + "grad_norm": 2.946637993197398, + "learning_rate": 8.9695766982457e-07, + "loss": 0.6198, + "step": 18334 + }, + { + "epoch": 2.73, + "grad_norm": 3.282500091597273, + "learning_rate": 8.968615766413365e-07, + "loss": 0.6439, + "step": 18335 + }, + { + "epoch": 2.73, + "grad_norm": 2.9340864960526156, + "learning_rate": 8.96765484420703e-07, + "loss": 0.6165, + "step": 18336 + }, + { + "epoch": 2.73, + "grad_norm": 3.437993685718336, + "learning_rate": 8.966693931635667e-07, + "loss": 0.6471, + "step": 18337 + }, + { + "epoch": 2.73, + "grad_norm": 3.30635302216227, + "learning_rate": 8.965733028708239e-07, + "loss": 0.6042, + "step": 18338 + }, + { + "epoch": 2.74, + "grad_norm": 2.947039192286493, + "learning_rate": 8.964772135433714e-07, + "loss": 0.6159, + "step": 18339 + }, + { + "epoch": 2.74, + "grad_norm": 4.5708646540631745, + "learning_rate": 8.963811251821063e-07, + "loss": 0.64, + "step": 18340 + }, + { + "epoch": 2.74, + "grad_norm": 3.1587144455410963, + "learning_rate": 8.962850377879252e-07, + "loss": 0.653, + "step": 18341 + }, + { + "epoch": 2.74, + "grad_norm": 3.2409733794469044, + "learning_rate": 8.961889513617254e-07, + "loss": 0.638, + "step": 18342 + }, + { + "epoch": 2.74, + "grad_norm": 3.3086003552791876, + "learning_rate": 8.960928659044033e-07, + "loss": 0.6191, + "step": 18343 + }, + { + "epoch": 2.74, + "grad_norm": 3.0064934091326214, + "learning_rate": 8.959967814168552e-07, + "loss": 0.6074, + "step": 18344 + }, + { + "epoch": 2.74, + "grad_norm": 4.750931041226108, + "learning_rate": 8.959006978999785e-07, + "loss": 0.6882, + "step": 18345 + }, + { + "epoch": 2.74, + "grad_norm": 3.4804693665363517, + "learning_rate": 8.958046153546699e-07, + "loss": 0.6061, + "step": 18346 + }, + { + "epoch": 2.74, + "grad_norm": 3.083039159243666, + "learning_rate": 8.957085337818259e-07, + "loss": 0.6439, + "step": 18347 + }, + { + "epoch": 2.74, + "grad_norm": 5.89756170114702, + "learning_rate": 8.956124531823436e-07, + "loss": 0.6621, + "step": 18348 + }, + { + "epoch": 2.74, + "grad_norm": 2.9361164400979285, + "learning_rate": 8.955163735571191e-07, + "loss": 0.6497, + "step": 18349 + }, + { + "epoch": 2.74, + "grad_norm": 3.184556256738686, + "learning_rate": 8.954202949070502e-07, + "loss": 0.6217, + "step": 18350 + }, + { + "epoch": 2.74, + "grad_norm": 4.768781323002286, + "learning_rate": 8.953242172330326e-07, + "loss": 0.6771, + "step": 18351 + }, + { + "epoch": 2.74, + "grad_norm": 3.0750965952911264, + "learning_rate": 8.952281405359632e-07, + "loss": 0.6361, + "step": 18352 + }, + { + "epoch": 2.74, + "grad_norm": 3.5535696855202428, + "learning_rate": 8.95132064816739e-07, + "loss": 0.6406, + "step": 18353 + }, + { + "epoch": 2.74, + "grad_norm": 2.849575296640186, + "learning_rate": 8.950359900762563e-07, + "loss": 0.6198, + "step": 18354 + }, + { + "epoch": 2.74, + "grad_norm": 3.0160582624217955, + "learning_rate": 8.949399163154118e-07, + "loss": 0.627, + "step": 18355 + }, + { + "epoch": 2.74, + "grad_norm": 3.1526210370464187, + "learning_rate": 8.948438435351031e-07, + "loss": 0.6165, + "step": 18356 + }, + { + "epoch": 2.74, + "grad_norm": 6.108876419107641, + "learning_rate": 8.947477717362257e-07, + "loss": 0.6178, + "step": 18357 + }, + { + "epoch": 2.74, + "grad_norm": 3.612966571757245, + "learning_rate": 8.946517009196766e-07, + "loss": 0.6543, + "step": 18358 + }, + { + "epoch": 2.74, + "grad_norm": 3.425421339371662, + "learning_rate": 8.945556310863524e-07, + "loss": 0.6426, + "step": 18359 + }, + { + "epoch": 2.74, + "grad_norm": 3.265562422977711, + "learning_rate": 8.944595622371499e-07, + "loss": 0.5706, + "step": 18360 + }, + { + "epoch": 2.74, + "grad_norm": 3.4561925671884017, + "learning_rate": 8.943634943729658e-07, + "loss": 0.6126, + "step": 18361 + }, + { + "epoch": 2.74, + "grad_norm": 3.8262187676586255, + "learning_rate": 8.942674274946963e-07, + "loss": 0.6152, + "step": 18362 + }, + { + "epoch": 2.74, + "grad_norm": 3.803348773623065, + "learning_rate": 8.941713616032387e-07, + "loss": 0.5967, + "step": 18363 + }, + { + "epoch": 2.74, + "grad_norm": 4.760916329509893, + "learning_rate": 8.940752966994889e-07, + "loss": 0.5866, + "step": 18364 + }, + { + "epoch": 2.74, + "grad_norm": 3.068411265565115, + "learning_rate": 8.939792327843434e-07, + "loss": 0.5898, + "step": 18365 + }, + { + "epoch": 2.74, + "grad_norm": 3.3969958094214427, + "learning_rate": 8.938831698586993e-07, + "loss": 0.5986, + "step": 18366 + }, + { + "epoch": 2.74, + "grad_norm": 3.4207816414734427, + "learning_rate": 8.937871079234531e-07, + "loss": 0.5794, + "step": 18367 + }, + { + "epoch": 2.74, + "grad_norm": 3.560126499993377, + "learning_rate": 8.936910469795009e-07, + "loss": 0.5938, + "step": 18368 + }, + { + "epoch": 2.74, + "grad_norm": 3.7711980299757295, + "learning_rate": 8.935949870277398e-07, + "loss": 0.6322, + "step": 18369 + }, + { + "epoch": 2.74, + "grad_norm": 3.8434927168723085, + "learning_rate": 8.934989280690665e-07, + "loss": 0.6211, + "step": 18370 + }, + { + "epoch": 2.74, + "grad_norm": 3.617937174265819, + "learning_rate": 8.934028701043765e-07, + "loss": 0.6328, + "step": 18371 + }, + { + "epoch": 2.74, + "grad_norm": 5.1801483197462606, + "learning_rate": 8.933068131345671e-07, + "loss": 0.6966, + "step": 18372 + }, + { + "epoch": 2.74, + "grad_norm": 5.730934670562446, + "learning_rate": 8.932107571605346e-07, + "loss": 0.6491, + "step": 18373 + }, + { + "epoch": 2.74, + "grad_norm": 3.96987959426323, + "learning_rate": 8.931147021831758e-07, + "loss": 0.6764, + "step": 18374 + }, + { + "epoch": 2.74, + "grad_norm": 4.6835608163136975, + "learning_rate": 8.930186482033865e-07, + "loss": 0.6289, + "step": 18375 + }, + { + "epoch": 2.74, + "grad_norm": 3.9820744777316057, + "learning_rate": 8.929225952220643e-07, + "loss": 0.6745, + "step": 18376 + }, + { + "epoch": 2.74, + "grad_norm": 3.4063092565106023, + "learning_rate": 8.928265432401046e-07, + "loss": 0.6237, + "step": 18377 + }, + { + "epoch": 2.74, + "grad_norm": 4.264484926751685, + "learning_rate": 8.92730492258404e-07, + "loss": 0.6208, + "step": 18378 + }, + { + "epoch": 2.74, + "grad_norm": 4.120029398964007, + "learning_rate": 8.926344422778594e-07, + "loss": 0.6396, + "step": 18379 + }, + { + "epoch": 2.74, + "grad_norm": 3.8180924250883637, + "learning_rate": 8.925383932993671e-07, + "loss": 0.6693, + "step": 18380 + }, + { + "epoch": 2.74, + "grad_norm": 3.739101037358439, + "learning_rate": 8.924423453238233e-07, + "loss": 0.6289, + "step": 18381 + }, + { + "epoch": 2.74, + "grad_norm": 3.3676046999879476, + "learning_rate": 8.923462983521249e-07, + "loss": 0.6543, + "step": 18382 + }, + { + "epoch": 2.74, + "grad_norm": 3.603989081806606, + "learning_rate": 8.922502523851682e-07, + "loss": 0.6302, + "step": 18383 + }, + { + "epoch": 2.74, + "grad_norm": 3.6740166298796653, + "learning_rate": 8.921542074238488e-07, + "loss": 0.6745, + "step": 18384 + }, + { + "epoch": 2.74, + "grad_norm": 3.686661527202374, + "learning_rate": 8.920581634690642e-07, + "loss": 0.6693, + "step": 18385 + }, + { + "epoch": 2.74, + "grad_norm": 4.138111481163349, + "learning_rate": 8.9196212052171e-07, + "loss": 0.6523, + "step": 18386 + }, + { + "epoch": 2.74, + "grad_norm": 4.073664816128324, + "learning_rate": 8.91866078582683e-07, + "loss": 0.6921, + "step": 18387 + }, + { + "epoch": 2.74, + "grad_norm": 3.380419521162681, + "learning_rate": 8.917700376528794e-07, + "loss": 0.6038, + "step": 18388 + }, + { + "epoch": 2.74, + "grad_norm": 3.288941759137504, + "learning_rate": 8.916739977331958e-07, + "loss": 0.6615, + "step": 18389 + }, + { + "epoch": 2.74, + "grad_norm": 3.922004770257112, + "learning_rate": 8.915779588245282e-07, + "loss": 0.653, + "step": 18390 + }, + { + "epoch": 2.74, + "grad_norm": 9.441626027904165, + "learning_rate": 8.914819209277732e-07, + "loss": 0.666, + "step": 18391 + }, + { + "epoch": 2.74, + "grad_norm": 4.14772968736576, + "learning_rate": 8.913858840438267e-07, + "loss": 0.6523, + "step": 18392 + }, + { + "epoch": 2.74, + "grad_norm": 5.8612987841627495, + "learning_rate": 8.912898481735858e-07, + "loss": 0.641, + "step": 18393 + }, + { + "epoch": 2.74, + "grad_norm": 4.04252009349974, + "learning_rate": 8.911938133179459e-07, + "loss": 0.6868, + "step": 18394 + }, + { + "epoch": 2.74, + "grad_norm": 5.3370730199613, + "learning_rate": 8.910977794778042e-07, + "loss": 0.6608, + "step": 18395 + }, + { + "epoch": 2.74, + "grad_norm": 2.9971134484638218, + "learning_rate": 8.910017466540567e-07, + "loss": 0.6257, + "step": 18396 + }, + { + "epoch": 2.74, + "grad_norm": 2.808052601707986, + "learning_rate": 8.90905714847599e-07, + "loss": 0.6263, + "step": 18397 + }, + { + "epoch": 2.74, + "grad_norm": 3.0102469502706355, + "learning_rate": 8.908096840593282e-07, + "loss": 0.627, + "step": 18398 + }, + { + "epoch": 2.74, + "grad_norm": 2.8912776821861987, + "learning_rate": 8.907136542901401e-07, + "loss": 0.6152, + "step": 18399 + }, + { + "epoch": 2.74, + "grad_norm": 3.0307948343168625, + "learning_rate": 8.906176255409311e-07, + "loss": 0.6576, + "step": 18400 + }, + { + "epoch": 2.74, + "grad_norm": 5.029211164515479, + "learning_rate": 8.905215978125978e-07, + "loss": 0.6452, + "step": 18401 + }, + { + "epoch": 2.74, + "grad_norm": 4.129455668132014, + "learning_rate": 8.904255711060357e-07, + "loss": 0.6204, + "step": 18402 + }, + { + "epoch": 2.74, + "grad_norm": 3.481688572478496, + "learning_rate": 8.903295454221418e-07, + "loss": 0.6178, + "step": 18403 + }, + { + "epoch": 2.74, + "grad_norm": 3.407360990920802, + "learning_rate": 8.90233520761812e-07, + "loss": 0.6439, + "step": 18404 + }, + { + "epoch": 2.74, + "grad_norm": 5.181572466575967, + "learning_rate": 8.901374971259421e-07, + "loss": 0.6237, + "step": 18405 + }, + { + "epoch": 2.75, + "grad_norm": 3.2495795996084595, + "learning_rate": 8.900414745154289e-07, + "loss": 0.6777, + "step": 18406 + }, + { + "epoch": 2.75, + "grad_norm": 3.1226855535574214, + "learning_rate": 8.899454529311681e-07, + "loss": 0.6146, + "step": 18407 + }, + { + "epoch": 2.75, + "grad_norm": 3.8250020460018956, + "learning_rate": 8.898494323740564e-07, + "loss": 0.6309, + "step": 18408 + }, + { + "epoch": 2.75, + "grad_norm": 4.0442715696966935, + "learning_rate": 8.897534128449898e-07, + "loss": 0.6081, + "step": 18409 + }, + { + "epoch": 2.75, + "grad_norm": 4.462350903598123, + "learning_rate": 8.89657394344864e-07, + "loss": 0.6361, + "step": 18410 + }, + { + "epoch": 2.75, + "grad_norm": 5.6761330866923325, + "learning_rate": 8.895613768745755e-07, + "loss": 0.6419, + "step": 18411 + }, + { + "epoch": 2.75, + "grad_norm": 3.625652888423128, + "learning_rate": 8.894653604350206e-07, + "loss": 0.5964, + "step": 18412 + }, + { + "epoch": 2.75, + "grad_norm": 3.3556165869507644, + "learning_rate": 8.893693450270951e-07, + "loss": 0.6484, + "step": 18413 + }, + { + "epoch": 2.75, + "grad_norm": 4.761949244419, + "learning_rate": 8.892733306516954e-07, + "loss": 0.6549, + "step": 18414 + }, + { + "epoch": 2.75, + "grad_norm": 4.380409170447536, + "learning_rate": 8.891773173097175e-07, + "loss": 0.6237, + "step": 18415 + }, + { + "epoch": 2.75, + "grad_norm": 3.2601214885581546, + "learning_rate": 8.890813050020577e-07, + "loss": 0.6146, + "step": 18416 + }, + { + "epoch": 2.75, + "grad_norm": 5.580150644653418, + "learning_rate": 8.889852937296117e-07, + "loss": 0.6471, + "step": 18417 + }, + { + "epoch": 2.75, + "grad_norm": 3.5144652077673832, + "learning_rate": 8.888892834932756e-07, + "loss": 0.612, + "step": 18418 + }, + { + "epoch": 2.75, + "grad_norm": 4.316346083959718, + "learning_rate": 8.887932742939459e-07, + "loss": 0.6387, + "step": 18419 + }, + { + "epoch": 2.75, + "grad_norm": 6.660774190960675, + "learning_rate": 8.886972661325183e-07, + "loss": 0.6374, + "step": 18420 + }, + { + "epoch": 2.75, + "grad_norm": 3.8866657708664127, + "learning_rate": 8.886012590098887e-07, + "loss": 0.6367, + "step": 18421 + }, + { + "epoch": 2.75, + "grad_norm": 3.439921051396002, + "learning_rate": 8.88505252926954e-07, + "loss": 0.6354, + "step": 18422 + }, + { + "epoch": 2.75, + "grad_norm": 3.569218042381998, + "learning_rate": 8.884092478846094e-07, + "loss": 0.6882, + "step": 18423 + }, + { + "epoch": 2.75, + "grad_norm": 2.8698316577819694, + "learning_rate": 8.883132438837509e-07, + "loss": 0.584, + "step": 18424 + }, + { + "epoch": 2.75, + "grad_norm": 3.68633962794471, + "learning_rate": 8.882172409252751e-07, + "loss": 0.6953, + "step": 18425 + }, + { + "epoch": 2.75, + "grad_norm": 3.560931525316673, + "learning_rate": 8.881212390100773e-07, + "loss": 0.6126, + "step": 18426 + }, + { + "epoch": 2.75, + "grad_norm": 3.6890104248012703, + "learning_rate": 8.880252381390541e-07, + "loss": 0.6556, + "step": 18427 + }, + { + "epoch": 2.75, + "grad_norm": 3.313009282024038, + "learning_rate": 8.87929238313101e-07, + "loss": 0.5996, + "step": 18428 + }, + { + "epoch": 2.75, + "grad_norm": 4.221311372891644, + "learning_rate": 8.878332395331149e-07, + "loss": 0.6276, + "step": 18429 + }, + { + "epoch": 2.75, + "grad_norm": 4.208726150472772, + "learning_rate": 8.877372417999906e-07, + "loss": 0.6693, + "step": 18430 + }, + { + "epoch": 2.75, + "grad_norm": 4.068925308494456, + "learning_rate": 8.876412451146246e-07, + "loss": 0.5859, + "step": 18431 + }, + { + "epoch": 2.75, + "grad_norm": 6.810541687069363, + "learning_rate": 8.875452494779126e-07, + "loss": 0.6823, + "step": 18432 + }, + { + "epoch": 2.75, + "grad_norm": 3.3870053075112545, + "learning_rate": 8.874492548907511e-07, + "loss": 0.6029, + "step": 18433 + }, + { + "epoch": 2.75, + "grad_norm": 3.7298929765445976, + "learning_rate": 8.873532613540354e-07, + "loss": 0.6439, + "step": 18434 + }, + { + "epoch": 2.75, + "grad_norm": 3.3529046340721753, + "learning_rate": 8.872572688686617e-07, + "loss": 0.6113, + "step": 18435 + }, + { + "epoch": 2.75, + "grad_norm": 4.330681225778168, + "learning_rate": 8.871612774355262e-07, + "loss": 0.638, + "step": 18436 + } + ], + "logging_steps": 1.0, + "max_steps": 33525, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 1676, + "total_flos": 276711916732416.0, + "train_batch_size": 22, + "trial_name": null, + "trial_params": null +}