{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.72549019607843, "eval_steps": 31, "global_step": 310, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03137254901960784, "grad_norm": 17.784626007080078, "learning_rate": 2e-05, "loss": 7.7199, "num_input_tokens_seen": 789, "step": 1 }, { "epoch": 0.06274509803921569, "grad_norm": 12.485037803649902, "learning_rate": 4e-05, "loss": 5.2762, "num_input_tokens_seen": 1656, "step": 2 }, { "epoch": 0.09411764705882353, "grad_norm": 7.9316840171813965, "learning_rate": 6e-05, "loss": 4.3107, "num_input_tokens_seen": 2529, "step": 3 }, { "epoch": 0.12549019607843137, "grad_norm": 9.721095085144043, "learning_rate": 8e-05, "loss": 4.8054, "num_input_tokens_seen": 3389, "step": 4 }, { "epoch": 0.1568627450980392, "grad_norm": 8.977057456970215, "learning_rate": 0.0001, "loss": 5.4644, "num_input_tokens_seen": 4244, "step": 5 }, { "epoch": 0.18823529411764706, "grad_norm": 10.086724281311035, "learning_rate": 0.00012, "loss": 5.6263, "num_input_tokens_seen": 5059, "step": 6 }, { "epoch": 0.2196078431372549, "grad_norm": 13.49176025390625, "learning_rate": 0.00014, "loss": 5.4706, "num_input_tokens_seen": 5918, "step": 7 }, { "epoch": 0.25098039215686274, "grad_norm": 11.12248706817627, "learning_rate": 0.00016, "loss": 4.4177, "num_input_tokens_seen": 6758, "step": 8 }, { "epoch": 0.2823529411764706, "grad_norm": 6.923247814178467, "learning_rate": 0.00018, "loss": 3.9388, "num_input_tokens_seen": 7653, "step": 9 }, { "epoch": 0.3137254901960784, "grad_norm": 11.242279052734375, "learning_rate": 0.0002, "loss": 4.3202, "num_input_tokens_seen": 8464, "step": 10 }, { "epoch": 0.34509803921568627, "grad_norm": 8.588418960571289, "learning_rate": 0.00019995065603657316, "loss": 3.116, "num_input_tokens_seen": 9348, "step": 11 }, { "epoch": 0.3764705882352941, "grad_norm": 9.73305892944336, "learning_rate": 0.00019980267284282717, "loss": 2.723, "num_input_tokens_seen": 10204, "step": 12 }, { "epoch": 0.40784313725490196, "grad_norm": 15.396673202514648, "learning_rate": 0.00019955619646030802, "loss": 2.4929, "num_input_tokens_seen": 11030, "step": 13 }, { "epoch": 0.4392156862745098, "grad_norm": 24.212421417236328, "learning_rate": 0.0001992114701314478, "loss": 2.7489, "num_input_tokens_seen": 11845, "step": 14 }, { "epoch": 0.47058823529411764, "grad_norm": 14.499921798706055, "learning_rate": 0.00019876883405951377, "loss": 1.8677, "num_input_tokens_seen": 12686, "step": 15 }, { "epoch": 0.5019607843137255, "grad_norm": 16.502588272094727, "learning_rate": 0.0001982287250728689, "loss": 1.3637, "num_input_tokens_seen": 13505, "step": 16 }, { "epoch": 0.5333333333333333, "grad_norm": 14.297043800354004, "learning_rate": 0.00019759167619387476, "loss": 1.5026, "num_input_tokens_seen": 14344, "step": 17 }, { "epoch": 0.5647058823529412, "grad_norm": 9.703812599182129, "learning_rate": 0.0001968583161128631, "loss": 1.4165, "num_input_tokens_seen": 15218, "step": 18 }, { "epoch": 0.596078431372549, "grad_norm": 12.656659126281738, "learning_rate": 0.0001960293685676943, "loss": 1.0906, "num_input_tokens_seen": 16040, "step": 19 }, { "epoch": 0.6274509803921569, "grad_norm": 58.49094772338867, "learning_rate": 0.00019510565162951537, "loss": 0.9817, "num_input_tokens_seen": 16874, "step": 20 }, { "epoch": 0.6588235294117647, "grad_norm": 9.382222175598145, "learning_rate": 0.00019408807689542257, "loss": 1.1198, "num_input_tokens_seen": 17682, "step": 21 }, { "epoch": 0.6901960784313725, "grad_norm": 10.244654655456543, "learning_rate": 0.00019297764858882514, "loss": 1.2596, "num_input_tokens_seen": 18539, "step": 22 }, { "epoch": 0.7215686274509804, "grad_norm": 8.23478889465332, "learning_rate": 0.00019177546256839812, "loss": 1.2768, "num_input_tokens_seen": 19432, "step": 23 }, { "epoch": 0.7529411764705882, "grad_norm": 19.18866729736328, "learning_rate": 0.00019048270524660196, "loss": 1.3043, "num_input_tokens_seen": 20264, "step": 24 }, { "epoch": 0.7843137254901961, "grad_norm": 6.661506175994873, "learning_rate": 0.0001891006524188368, "loss": 0.8769, "num_input_tokens_seen": 21113, "step": 25 }, { "epoch": 0.8156862745098039, "grad_norm": 6.4282546043396, "learning_rate": 0.00018763066800438636, "loss": 1.0473, "num_input_tokens_seen": 21966, "step": 26 }, { "epoch": 0.8470588235294118, "grad_norm": 11.599508285522461, "learning_rate": 0.0001860742027003944, "loss": 1.0205, "num_input_tokens_seen": 22821, "step": 27 }, { "epoch": 0.8784313725490196, "grad_norm": 9.272802352905273, "learning_rate": 0.00018443279255020152, "loss": 1.2292, "num_input_tokens_seen": 23733, "step": 28 }, { "epoch": 0.9098039215686274, "grad_norm": 7.99858283996582, "learning_rate": 0.00018270805742745617, "loss": 1.1247, "num_input_tokens_seen": 24593, "step": 29 }, { "epoch": 0.9411764705882353, "grad_norm": 10.046642303466797, "learning_rate": 0.00018090169943749476, "loss": 1.2097, "num_input_tokens_seen": 25425, "step": 30 }, { "epoch": 0.9725490196078431, "grad_norm": 9.499373435974121, "learning_rate": 0.00017901550123756904, "loss": 1.1345, "num_input_tokens_seen": 26228, "step": 31 }, { "epoch": 1.003921568627451, "grad_norm": 5.625377178192139, "learning_rate": 0.00017705132427757895, "loss": 1.1178, "num_input_tokens_seen": 27085, "step": 32 }, { "epoch": 1.035294117647059, "grad_norm": 4.716068744659424, "learning_rate": 0.00017501110696304596, "loss": 0.8997, "num_input_tokens_seen": 27917, "step": 33 }, { "epoch": 1.0666666666666667, "grad_norm": 4.880837440490723, "learning_rate": 0.00017289686274214118, "loss": 0.654, "num_input_tokens_seen": 28796, "step": 34 }, { "epoch": 1.0980392156862746, "grad_norm": 3.4638915061950684, "learning_rate": 0.00017071067811865476, "loss": 0.7083, "num_input_tokens_seen": 29650, "step": 35 }, { "epoch": 1.1294117647058823, "grad_norm": 6.4757561683654785, "learning_rate": 0.00016845471059286887, "loss": 0.981, "num_input_tokens_seen": 30477, "step": 36 }, { "epoch": 1.1607843137254903, "grad_norm": 6.666485786437988, "learning_rate": 0.00016613118653236518, "loss": 1.1384, "num_input_tokens_seen": 31314, "step": 37 }, { "epoch": 1.192156862745098, "grad_norm": 6.810871601104736, "learning_rate": 0.000163742398974869, "loss": 1.0914, "num_input_tokens_seen": 32129, "step": 38 }, { "epoch": 1.223529411764706, "grad_norm": 4.342460632324219, "learning_rate": 0.00016129070536529766, "loss": 0.784, "num_input_tokens_seen": 32980, "step": 39 }, { "epoch": 1.2549019607843137, "grad_norm": 6.6151204109191895, "learning_rate": 0.0001587785252292473, "loss": 1.151, "num_input_tokens_seen": 33811, "step": 40 }, { "epoch": 1.2862745098039214, "grad_norm": 6.701898097991943, "learning_rate": 0.00015620833778521307, "loss": 1.1066, "num_input_tokens_seen": 34640, "step": 41 }, { "epoch": 1.3176470588235294, "grad_norm": 3.7148549556732178, "learning_rate": 0.00015358267949789966, "loss": 1.1097, "num_input_tokens_seen": 35540, "step": 42 }, { "epoch": 1.3490196078431373, "grad_norm": 4.791841506958008, "learning_rate": 0.00015090414157503714, "loss": 0.9391, "num_input_tokens_seen": 36388, "step": 43 }, { "epoch": 1.380392156862745, "grad_norm": 6.712942123413086, "learning_rate": 0.00014817536741017152, "loss": 0.8025, "num_input_tokens_seen": 37254, "step": 44 }, { "epoch": 1.4117647058823528, "grad_norm": 3.204245090484619, "learning_rate": 0.00014539904997395468, "loss": 0.6842, "num_input_tokens_seen": 38098, "step": 45 }, { "epoch": 1.4431372549019608, "grad_norm": 4.535635471343994, "learning_rate": 0.00014257792915650728, "loss": 0.7039, "num_input_tokens_seen": 39024, "step": 46 }, { "epoch": 1.4745098039215687, "grad_norm": 5.812551021575928, "learning_rate": 0.00013971478906347806, "loss": 1.0178, "num_input_tokens_seen": 39876, "step": 47 }, { "epoch": 1.5058823529411764, "grad_norm": 5.7899169921875, "learning_rate": 0.00013681245526846783, "loss": 0.8895, "num_input_tokens_seen": 40691, "step": 48 }, { "epoch": 1.5372549019607842, "grad_norm": 4.779058933258057, "learning_rate": 0.00013387379202452917, "loss": 0.7015, "num_input_tokens_seen": 41529, "step": 49 }, { "epoch": 1.5686274509803921, "grad_norm": 4.17733907699585, "learning_rate": 0.00013090169943749476, "loss": 0.8385, "num_input_tokens_seen": 42347, "step": 50 }, { "epoch": 1.6, "grad_norm": 3.26069974899292, "learning_rate": 0.00012789911060392294, "loss": 0.6932, "num_input_tokens_seen": 43170, "step": 51 }, { "epoch": 1.6313725490196078, "grad_norm": 3.7764294147491455, "learning_rate": 0.00012486898871648546, "loss": 0.6722, "num_input_tokens_seen": 44017, "step": 52 }, { "epoch": 1.6627450980392156, "grad_norm": 5.631955146789551, "learning_rate": 0.00012181432413965425, "loss": 0.8225, "num_input_tokens_seen": 44895, "step": 53 }, { "epoch": 1.6941176470588235, "grad_norm": 3.0410749912261963, "learning_rate": 0.00011873813145857249, "loss": 0.5853, "num_input_tokens_seen": 45780, "step": 54 }, { "epoch": 1.7254901960784315, "grad_norm": 6.754289627075195, "learning_rate": 0.00011564344650402312, "loss": 0.8272, "num_input_tokens_seen": 46623, "step": 55 }, { "epoch": 1.7568627450980392, "grad_norm": 4.92588472366333, "learning_rate": 0.00011253332335643046, "loss": 0.588, "num_input_tokens_seen": 47446, "step": 56 }, { "epoch": 1.788235294117647, "grad_norm": 5.982916831970215, "learning_rate": 0.00010941083133185143, "loss": 0.9413, "num_input_tokens_seen": 48260, "step": 57 }, { "epoch": 1.8196078431372549, "grad_norm": 3.09069561958313, "learning_rate": 0.00010627905195293135, "loss": 0.7281, "num_input_tokens_seen": 49095, "step": 58 }, { "epoch": 1.8509803921568628, "grad_norm": 7.942086219787598, "learning_rate": 0.00010314107590781284, "loss": 0.8998, "num_input_tokens_seen": 49948, "step": 59 }, { "epoch": 1.8823529411764706, "grad_norm": 6.040829181671143, "learning_rate": 0.0001, "loss": 0.7494, "num_input_tokens_seen": 50794, "step": 60 }, { "epoch": 1.9137254901960783, "grad_norm": 5.524172782897949, "learning_rate": 9.685892409218717e-05, "loss": 1.1642, "num_input_tokens_seen": 51649, "step": 61 }, { "epoch": 1.9450980392156862, "grad_norm": 4.093544960021973, "learning_rate": 9.372094804706867e-05, "loss": 0.9137, "num_input_tokens_seen": 52499, "step": 62 }, { "epoch": 1.9764705882352942, "grad_norm": 3.3669676780700684, "learning_rate": 9.058916866814858e-05, "loss": 0.7343, "num_input_tokens_seen": 53345, "step": 63 }, { "epoch": 2.007843137254902, "grad_norm": 6.184525966644287, "learning_rate": 8.746667664356956e-05, "loss": 0.8992, "num_input_tokens_seen": 54201, "step": 64 }, { "epoch": 2.0392156862745097, "grad_norm": 4.793566703796387, "learning_rate": 8.435655349597695e-05, "loss": 0.6028, "num_input_tokens_seen": 55036, "step": 65 }, { "epoch": 2.070588235294118, "grad_norm": 4.015380859375, "learning_rate": 8.126186854142752e-05, "loss": 0.6597, "num_input_tokens_seen": 55868, "step": 66 }, { "epoch": 2.1019607843137256, "grad_norm": 2.2828500270843506, "learning_rate": 7.818567586034575e-05, "loss": 0.5288, "num_input_tokens_seen": 56724, "step": 67 }, { "epoch": 2.1333333333333333, "grad_norm": 3.3936123847961426, "learning_rate": 7.513101128351454e-05, "loss": 0.6818, "num_input_tokens_seen": 57579, "step": 68 }, { "epoch": 2.164705882352941, "grad_norm": 3.684924602508545, "learning_rate": 7.210088939607708e-05, "loss": 0.7798, "num_input_tokens_seen": 58425, "step": 69 }, { "epoch": 2.196078431372549, "grad_norm": 3.1493935585021973, "learning_rate": 6.909830056250524e-05, "loss": 0.7207, "num_input_tokens_seen": 59270, "step": 70 }, { "epoch": 2.227450980392157, "grad_norm": 5.103849411010742, "learning_rate": 6.612620797547087e-05, "loss": 0.6483, "num_input_tokens_seen": 60127, "step": 71 }, { "epoch": 2.2588235294117647, "grad_norm": 1.9273674488067627, "learning_rate": 6.318754473153221e-05, "loss": 0.5992, "num_input_tokens_seen": 61059, "step": 72 }, { "epoch": 2.2901960784313724, "grad_norm": 2.9266533851623535, "learning_rate": 6.0285210936521955e-05, "loss": 0.6952, "num_input_tokens_seen": 61905, "step": 73 }, { "epoch": 2.3215686274509806, "grad_norm": 2.3648922443389893, "learning_rate": 5.7422070843492734e-05, "loss": 0.5068, "num_input_tokens_seen": 62745, "step": 74 }, { "epoch": 2.3529411764705883, "grad_norm": 3.8562815189361572, "learning_rate": 5.4600950026045326e-05, "loss": 0.7024, "num_input_tokens_seen": 63594, "step": 75 }, { "epoch": 2.384313725490196, "grad_norm": 2.478416919708252, "learning_rate": 5.182463258982846e-05, "loss": 0.5721, "num_input_tokens_seen": 64453, "step": 76 }, { "epoch": 2.4156862745098038, "grad_norm": 2.117917776107788, "learning_rate": 4.909585842496292e-05, "loss": 0.4472, "num_input_tokens_seen": 65284, "step": 77 }, { "epoch": 2.447058823529412, "grad_norm": 2.4827864170074463, "learning_rate": 4.641732050210036e-05, "loss": 0.414, "num_input_tokens_seen": 66110, "step": 78 }, { "epoch": 2.4784313725490197, "grad_norm": 3.7768778800964355, "learning_rate": 4.3791662214786934e-05, "loss": 0.5808, "num_input_tokens_seen": 66934, "step": 79 }, { "epoch": 2.5098039215686274, "grad_norm": 3.4247841835021973, "learning_rate": 4.12214747707527e-05, "loss": 0.5447, "num_input_tokens_seen": 67775, "step": 80 }, { "epoch": 2.541176470588235, "grad_norm": 4.182628631591797, "learning_rate": 3.8709294634702376e-05, "loss": 0.4573, "num_input_tokens_seen": 68630, "step": 81 }, { "epoch": 2.572549019607843, "grad_norm": 4.045778274536133, "learning_rate": 3.6257601025131026e-05, "loss": 0.7857, "num_input_tokens_seen": 69484, "step": 82 }, { "epoch": 2.603921568627451, "grad_norm": 2.6151039600372314, "learning_rate": 3.386881346763483e-05, "loss": 0.7443, "num_input_tokens_seen": 70355, "step": 83 }, { "epoch": 2.635294117647059, "grad_norm": 3.3898870944976807, "learning_rate": 3.154528940713113e-05, "loss": 0.5147, "num_input_tokens_seen": 71198, "step": 84 }, { "epoch": 2.6666666666666665, "grad_norm": 1.8886942863464355, "learning_rate": 2.9289321881345254e-05, "loss": 0.3022, "num_input_tokens_seen": 72040, "step": 85 }, { "epoch": 2.6980392156862747, "grad_norm": 5.151848316192627, "learning_rate": 2.7103137257858868e-05, "loss": 0.3934, "num_input_tokens_seen": 72907, "step": 86 }, { "epoch": 2.7294117647058824, "grad_norm": 3.1100125312805176, "learning_rate": 2.4988893036954043e-05, "loss": 0.6222, "num_input_tokens_seen": 73742, "step": 87 }, { "epoch": 2.76078431372549, "grad_norm": 2.5638794898986816, "learning_rate": 2.2948675722421086e-05, "loss": 0.5721, "num_input_tokens_seen": 74587, "step": 88 }, { "epoch": 2.792156862745098, "grad_norm": 3.2768828868865967, "learning_rate": 2.098449876243098e-05, "loss": 0.5144, "num_input_tokens_seen": 75433, "step": 89 }, { "epoch": 2.8235294117647056, "grad_norm": 4.178157806396484, "learning_rate": 1.9098300562505266e-05, "loss": 0.463, "num_input_tokens_seen": 76218, "step": 90 }, { "epoch": 2.854901960784314, "grad_norm": 4.432765960693359, "learning_rate": 1.7291942572543807e-05, "loss": 0.6351, "num_input_tokens_seen": 77053, "step": 91 }, { "epoch": 2.8862745098039215, "grad_norm": 2.8439605236053467, "learning_rate": 1.5567207449798515e-05, "loss": 0.5897, "num_input_tokens_seen": 77899, "step": 92 }, { "epoch": 2.9176470588235293, "grad_norm": 3.0817153453826904, "learning_rate": 1.3925797299605625e-05, "loss": 0.6083, "num_input_tokens_seen": 78767, "step": 93 }, { "epoch": 2.9490196078431374, "grad_norm": 3.2820534706115723, "learning_rate": 1.2369331995613643e-05, "loss": 0.5075, "num_input_tokens_seen": 79591, "step": 94 }, { "epoch": 2.980392156862745, "grad_norm": 4.051931858062744, "learning_rate": 1.0899347581163221e-05, "loss": 0.4958, "num_input_tokens_seen": 80422, "step": 95 }, { "epoch": 3.011764705882353, "grad_norm": 3.6439077854156494, "learning_rate": 9.517294753398042e-06, "loss": 0.8383, "num_input_tokens_seen": 81282, "step": 96 }, { "epoch": 3.0431372549019606, "grad_norm": 3.559818983078003, "learning_rate": 8.224537431601908e-06, "loss": 0.5131, "num_input_tokens_seen": 82101, "step": 97 }, { "epoch": 3.074509803921569, "grad_norm": 2.2776846885681152, "learning_rate": 7.022351411174866e-06, "loss": 0.5896, "num_input_tokens_seen": 82989, "step": 98 }, { "epoch": 3.1058823529411765, "grad_norm": 2.8956410884857178, "learning_rate": 5.911923104577455e-06, "loss": 0.5902, "num_input_tokens_seen": 83838, "step": 99 }, { "epoch": 3.1372549019607843, "grad_norm": 2.3653640747070312, "learning_rate": 4.8943483704846585e-06, "loss": 0.278, "num_input_tokens_seen": 84654, "step": 100 }, { "epoch": 3.168627450980392, "grad_norm": 2.7463765144348145, "learning_rate": 3.970631432305694e-06, "loss": 0.4051, "num_input_tokens_seen": 85493, "step": 101 }, { "epoch": 3.2, "grad_norm": 1.9491956233978271, "learning_rate": 3.141683887136904e-06, "loss": 0.3255, "num_input_tokens_seen": 86381, "step": 102 }, { "epoch": 3.231372549019608, "grad_norm": 3.7543513774871826, "learning_rate": 2.4083238061252678e-06, "loss": 0.5123, "num_input_tokens_seen": 87182, "step": 103 }, { "epoch": 3.2627450980392156, "grad_norm": 2.68157958984375, "learning_rate": 1.771274927131128e-06, "loss": 0.5409, "num_input_tokens_seen": 88059, "step": 104 }, { "epoch": 3.2941176470588234, "grad_norm": 2.971251964569092, "learning_rate": 1.231165940486234e-06, "loss": 0.3435, "num_input_tokens_seen": 88889, "step": 105 }, { "epoch": 3.3254901960784315, "grad_norm": 2.606720209121704, "learning_rate": 7.885298685522235e-07, "loss": 0.5119, "num_input_tokens_seen": 89747, "step": 106 }, { "epoch": 3.3568627450980393, "grad_norm": 2.622124433517456, "learning_rate": 4.438035396920004e-07, "loss": 0.5565, "num_input_tokens_seen": 90612, "step": 107 }, { "epoch": 3.388235294117647, "grad_norm": 2.256873846054077, "learning_rate": 1.973271571728441e-07, "loss": 0.4164, "num_input_tokens_seen": 91453, "step": 108 }, { "epoch": 3.4196078431372547, "grad_norm": 3.2237167358398438, "learning_rate": 4.934396342684e-08, "loss": 0.5135, "num_input_tokens_seen": 92304, "step": 109 }, { "epoch": 3.450980392156863, "grad_norm": 2.807044267654419, "learning_rate": 0.0002, "loss": 0.4391, "num_input_tokens_seen": 93130, "step": 110 }, { "epoch": 3.4823529411764707, "grad_norm": 2.9154107570648193, "learning_rate": 0.00019995065603657316, "loss": 0.5719, "num_input_tokens_seen": 93953, "step": 111 }, { "epoch": 3.5137254901960784, "grad_norm": 3.265385150909424, "learning_rate": 0.00019980267284282717, "loss": 0.7463, "num_input_tokens_seen": 94803, "step": 112 }, { "epoch": 3.545098039215686, "grad_norm": 5.992981910705566, "learning_rate": 0.00019955619646030802, "loss": 0.7756, "num_input_tokens_seen": 95673, "step": 113 }, { "epoch": 3.576470588235294, "grad_norm": 6.979043483734131, "learning_rate": 0.0001992114701314478, "loss": 0.5761, "num_input_tokens_seen": 96559, "step": 114 }, { "epoch": 3.607843137254902, "grad_norm": 9.418047904968262, "learning_rate": 0.00019876883405951377, "loss": 0.7796, "num_input_tokens_seen": 97366, "step": 115 }, { "epoch": 3.6392156862745098, "grad_norm": 3.6083357334136963, "learning_rate": 0.00019822872507286888, "loss": 0.3696, "num_input_tokens_seen": 98197, "step": 116 }, { "epoch": 3.6705882352941175, "grad_norm": 4.184770584106445, "learning_rate": 0.00019759167619387476, "loss": 0.4349, "num_input_tokens_seen": 99019, "step": 117 }, { "epoch": 3.7019607843137257, "grad_norm": 5.8201799392700195, "learning_rate": 0.0001968583161128631, "loss": 0.434, "num_input_tokens_seen": 99866, "step": 118 }, { "epoch": 3.7333333333333334, "grad_norm": 5.535106182098389, "learning_rate": 0.0001960293685676943, "loss": 0.6474, "num_input_tokens_seen": 100723, "step": 119 }, { "epoch": 3.764705882352941, "grad_norm": 3.3299312591552734, "learning_rate": 0.00019510565162951537, "loss": 0.4361, "num_input_tokens_seen": 101581, "step": 120 }, { "epoch": 3.796078431372549, "grad_norm": 7.6674323081970215, "learning_rate": 0.00019408807689542257, "loss": 0.575, "num_input_tokens_seen": 102418, "step": 121 }, { "epoch": 3.8274509803921566, "grad_norm": 7.189317226409912, "learning_rate": 0.00019297764858882514, "loss": 0.8572, "num_input_tokens_seen": 103281, "step": 122 }, { "epoch": 3.8588235294117648, "grad_norm": 4.600097179412842, "learning_rate": 0.00019177546256839812, "loss": 0.6253, "num_input_tokens_seen": 104128, "step": 123 }, { "epoch": 3.8901960784313725, "grad_norm": 6.0382585525512695, "learning_rate": 0.00019048270524660193, "loss": 0.4465, "num_input_tokens_seen": 104972, "step": 124 }, { "epoch": 3.9215686274509802, "grad_norm": 5.221591949462891, "learning_rate": 0.0001891006524188368, "loss": 0.6236, "num_input_tokens_seen": 105833, "step": 125 }, { "epoch": 3.9529411764705884, "grad_norm": 4.43485164642334, "learning_rate": 0.0001876306680043864, "loss": 0.6026, "num_input_tokens_seen": 106656, "step": 126 }, { "epoch": 3.984313725490196, "grad_norm": 4.633317947387695, "learning_rate": 0.0001860742027003944, "loss": 0.4891, "num_input_tokens_seen": 107479, "step": 127 }, { "epoch": 4.015686274509804, "grad_norm": 2.710853338241577, "learning_rate": 0.00018443279255020155, "loss": 0.4361, "num_input_tokens_seen": 108376, "step": 128 }, { "epoch": 4.047058823529412, "grad_norm": 3.0707015991210938, "learning_rate": 0.0001827080574274562, "loss": 0.4198, "num_input_tokens_seen": 109228, "step": 129 }, { "epoch": 4.078431372549019, "grad_norm": 2.8015506267547607, "learning_rate": 0.0001809016994374947, "loss": 0.4515, "num_input_tokens_seen": 110061, "step": 130 }, { "epoch": 4.109803921568627, "grad_norm": 3.3442440032958984, "learning_rate": 0.00017901550123756906, "loss": 0.3318, "num_input_tokens_seen": 110927, "step": 131 }, { "epoch": 4.141176470588236, "grad_norm": 2.747676372528076, "learning_rate": 0.00017705132427757895, "loss": 0.4279, "num_input_tokens_seen": 111786, "step": 132 }, { "epoch": 4.172549019607843, "grad_norm": 3.85593581199646, "learning_rate": 0.00017501110696304596, "loss": 0.3526, "num_input_tokens_seen": 112632, "step": 133 }, { "epoch": 4.203921568627451, "grad_norm": 3.7865078449249268, "learning_rate": 0.00017289686274214118, "loss": 0.4617, "num_input_tokens_seen": 113486, "step": 134 }, { "epoch": 4.235294117647059, "grad_norm": 2.6525180339813232, "learning_rate": 0.00017071067811865476, "loss": 0.2388, "num_input_tokens_seen": 114336, "step": 135 }, { "epoch": 4.266666666666667, "grad_norm": 10.199384689331055, "learning_rate": 0.00016845471059286887, "loss": 0.4286, "num_input_tokens_seen": 115202, "step": 136 }, { "epoch": 4.298039215686274, "grad_norm": 3.65986967086792, "learning_rate": 0.00016613118653236518, "loss": 0.478, "num_input_tokens_seen": 116036, "step": 137 }, { "epoch": 4.329411764705882, "grad_norm": 2.575523614883423, "learning_rate": 0.000163742398974869, "loss": 0.229, "num_input_tokens_seen": 116868, "step": 138 }, { "epoch": 4.36078431372549, "grad_norm": 5.756189346313477, "learning_rate": 0.00016129070536529766, "loss": 0.5141, "num_input_tokens_seen": 117745, "step": 139 }, { "epoch": 4.392156862745098, "grad_norm": 3.640237331390381, "learning_rate": 0.0001587785252292473, "loss": 0.2679, "num_input_tokens_seen": 118582, "step": 140 }, { "epoch": 4.423529411764706, "grad_norm": 4.511353015899658, "learning_rate": 0.00015620833778521304, "loss": 0.381, "num_input_tokens_seen": 119445, "step": 141 }, { "epoch": 4.454901960784314, "grad_norm": 12.773442268371582, "learning_rate": 0.00015358267949789966, "loss": 0.5296, "num_input_tokens_seen": 120241, "step": 142 }, { "epoch": 4.486274509803922, "grad_norm": 3.347665548324585, "learning_rate": 0.00015090414157503714, "loss": 0.3427, "num_input_tokens_seen": 121096, "step": 143 }, { "epoch": 4.517647058823529, "grad_norm": 3.0967466831207275, "learning_rate": 0.00014817536741017158, "loss": 0.3916, "num_input_tokens_seen": 121910, "step": 144 }, { "epoch": 4.549019607843137, "grad_norm": 2.6861841678619385, "learning_rate": 0.00014539904997395468, "loss": 0.3054, "num_input_tokens_seen": 122762, "step": 145 }, { "epoch": 4.580392156862745, "grad_norm": 4.615218162536621, "learning_rate": 0.0001425779291565073, "loss": 0.3923, "num_input_tokens_seen": 123593, "step": 146 }, { "epoch": 4.6117647058823525, "grad_norm": 6.789853572845459, "learning_rate": 0.00013971478906347803, "loss": 0.438, "num_input_tokens_seen": 124446, "step": 147 }, { "epoch": 4.643137254901961, "grad_norm": 4.128298282623291, "learning_rate": 0.00013681245526846777, "loss": 0.4208, "num_input_tokens_seen": 125313, "step": 148 }, { "epoch": 4.674509803921569, "grad_norm": 3.4282140731811523, "learning_rate": 0.00013387379202452917, "loss": 0.3717, "num_input_tokens_seen": 126184, "step": 149 }, { "epoch": 4.705882352941177, "grad_norm": 6.779294967651367, "learning_rate": 0.00013090169943749476, "loss": 0.4152, "num_input_tokens_seen": 127026, "step": 150 }, { "epoch": 4.737254901960784, "grad_norm": 3.086122989654541, "learning_rate": 0.00012789911060392294, "loss": 0.3011, "num_input_tokens_seen": 127883, "step": 151 }, { "epoch": 4.768627450980392, "grad_norm": 5.97057580947876, "learning_rate": 0.0001248689887164855, "loss": 0.3705, "num_input_tokens_seen": 128689, "step": 152 }, { "epoch": 4.8, "grad_norm": 4.796477317810059, "learning_rate": 0.00012181432413965421, "loss": 0.4362, "num_input_tokens_seen": 129520, "step": 153 }, { "epoch": 4.8313725490196076, "grad_norm": 3.3298208713531494, "learning_rate": 0.00011873813145857249, "loss": 0.4988, "num_input_tokens_seen": 130406, "step": 154 }, { "epoch": 4.862745098039216, "grad_norm": 3.887026071548462, "learning_rate": 0.00011564344650402312, "loss": 0.5444, "num_input_tokens_seen": 131276, "step": 155 }, { "epoch": 4.894117647058824, "grad_norm": 6.220418453216553, "learning_rate": 0.00011253332335643046, "loss": 0.4078, "num_input_tokens_seen": 132090, "step": 156 }, { "epoch": 4.925490196078432, "grad_norm": 2.757842779159546, "learning_rate": 0.00010941083133185146, "loss": 0.3539, "num_input_tokens_seen": 132909, "step": 157 }, { "epoch": 4.956862745098039, "grad_norm": 2.275514602661133, "learning_rate": 0.00010627905195293135, "loss": 0.3512, "num_input_tokens_seen": 133799, "step": 158 }, { "epoch": 4.988235294117647, "grad_norm": 3.4296388626098633, "learning_rate": 0.00010314107590781284, "loss": 0.3426, "num_input_tokens_seen": 134640, "step": 159 }, { "epoch": 5.019607843137255, "grad_norm": 1.8311214447021484, "learning_rate": 0.0001, "loss": 0.1619, "num_input_tokens_seen": 135491, "step": 160 }, { "epoch": 5.050980392156863, "grad_norm": 2.676769495010376, "learning_rate": 9.685892409218724e-05, "loss": 0.221, "num_input_tokens_seen": 136317, "step": 161 }, { "epoch": 5.08235294117647, "grad_norm": 1.8601126670837402, "learning_rate": 9.372094804706867e-05, "loss": 0.2427, "num_input_tokens_seen": 137191, "step": 162 }, { "epoch": 5.113725490196078, "grad_norm": 1.8501461744308472, "learning_rate": 9.058916866814858e-05, "loss": 0.2443, "num_input_tokens_seen": 138029, "step": 163 }, { "epoch": 5.145098039215687, "grad_norm": 1.612574815750122, "learning_rate": 8.746667664356956e-05, "loss": 0.1391, "num_input_tokens_seen": 138848, "step": 164 }, { "epoch": 5.176470588235294, "grad_norm": 1.7896589040756226, "learning_rate": 8.435655349597684e-05, "loss": 0.0778, "num_input_tokens_seen": 139648, "step": 165 }, { "epoch": 5.207843137254902, "grad_norm": 3.172090530395508, "learning_rate": 8.126186854142752e-05, "loss": 0.2737, "num_input_tokens_seen": 140500, "step": 166 }, { "epoch": 5.23921568627451, "grad_norm": 2.280182361602783, "learning_rate": 7.818567586034581e-05, "loss": 0.18, "num_input_tokens_seen": 141331, "step": 167 }, { "epoch": 5.270588235294118, "grad_norm": 3.10487699508667, "learning_rate": 7.513101128351459e-05, "loss": 0.2636, "num_input_tokens_seen": 142188, "step": 168 }, { "epoch": 5.301960784313725, "grad_norm": 2.16359543800354, "learning_rate": 7.210088939607707e-05, "loss": 0.1788, "num_input_tokens_seen": 143005, "step": 169 }, { "epoch": 5.333333333333333, "grad_norm": 3.690769910812378, "learning_rate": 6.909830056250524e-05, "loss": 0.1271, "num_input_tokens_seen": 143796, "step": 170 }, { "epoch": 5.364705882352941, "grad_norm": 4.808859348297119, "learning_rate": 6.612620797547091e-05, "loss": 0.1235, "num_input_tokens_seen": 144594, "step": 171 }, { "epoch": 5.396078431372549, "grad_norm": 2.4017834663391113, "learning_rate": 6.318754473153218e-05, "loss": 0.1417, "num_input_tokens_seen": 145432, "step": 172 }, { "epoch": 5.427450980392157, "grad_norm": 10.861905097961426, "learning_rate": 6.0285210936521976e-05, "loss": 0.3499, "num_input_tokens_seen": 146307, "step": 173 }, { "epoch": 5.458823529411765, "grad_norm": 6.508179664611816, "learning_rate": 5.742207084349277e-05, "loss": 0.2636, "num_input_tokens_seen": 147159, "step": 174 }, { "epoch": 5.490196078431373, "grad_norm": 3.551328659057617, "learning_rate": 5.4600950026045285e-05, "loss": 0.1685, "num_input_tokens_seen": 147990, "step": 175 }, { "epoch": 5.52156862745098, "grad_norm": 5.636049747467041, "learning_rate": 5.182463258982846e-05, "loss": 0.2804, "num_input_tokens_seen": 148782, "step": 176 }, { "epoch": 5.552941176470588, "grad_norm": 4.598341941833496, "learning_rate": 4.909585842496292e-05, "loss": 0.3087, "num_input_tokens_seen": 149687, "step": 177 }, { "epoch": 5.584313725490196, "grad_norm": 5.29741907119751, "learning_rate": 4.6417320502100316e-05, "loss": 0.3299, "num_input_tokens_seen": 150565, "step": 178 }, { "epoch": 5.6156862745098035, "grad_norm": 3.7548561096191406, "learning_rate": 4.379166221478697e-05, "loss": 0.3047, "num_input_tokens_seen": 151419, "step": 179 }, { "epoch": 5.647058823529412, "grad_norm": 2.995479106903076, "learning_rate": 4.12214747707527e-05, "loss": 0.2234, "num_input_tokens_seen": 152249, "step": 180 }, { "epoch": 5.67843137254902, "grad_norm": 2.391935110092163, "learning_rate": 3.8709294634702376e-05, "loss": 0.1819, "num_input_tokens_seen": 153097, "step": 181 }, { "epoch": 5.709803921568628, "grad_norm": 2.5566658973693848, "learning_rate": 3.625760102513099e-05, "loss": 0.2642, "num_input_tokens_seen": 153970, "step": 182 }, { "epoch": 5.741176470588235, "grad_norm": 2.3486950397491455, "learning_rate": 3.386881346763483e-05, "loss": 0.1399, "num_input_tokens_seen": 154821, "step": 183 }, { "epoch": 5.772549019607843, "grad_norm": 3.2803895473480225, "learning_rate": 3.15452894071312e-05, "loss": 0.2416, "num_input_tokens_seen": 155654, "step": 184 }, { "epoch": 5.803921568627451, "grad_norm": 3.5313189029693604, "learning_rate": 2.9289321881345254e-05, "loss": 0.1808, "num_input_tokens_seen": 156464, "step": 185 }, { "epoch": 5.8352941176470585, "grad_norm": 2.8547775745391846, "learning_rate": 2.7103137257858868e-05, "loss": 0.2565, "num_input_tokens_seen": 157360, "step": 186 }, { "epoch": 5.866666666666667, "grad_norm": 2.3218977451324463, "learning_rate": 2.4988893036954043e-05, "loss": 0.1691, "num_input_tokens_seen": 158193, "step": 187 }, { "epoch": 5.898039215686275, "grad_norm": 2.2196223735809326, "learning_rate": 2.2948675722421032e-05, "loss": 0.4082, "num_input_tokens_seen": 159122, "step": 188 }, { "epoch": 5.929411764705883, "grad_norm": 2.5707294940948486, "learning_rate": 2.098449876243096e-05, "loss": 0.2091, "num_input_tokens_seen": 159987, "step": 189 }, { "epoch": 5.96078431372549, "grad_norm": 3.849001407623291, "learning_rate": 1.909830056250529e-05, "loss": 0.258, "num_input_tokens_seen": 160876, "step": 190 }, { "epoch": 5.992156862745098, "grad_norm": 7.267693042755127, "learning_rate": 1.7291942572543807e-05, "loss": 0.3389, "num_input_tokens_seen": 161713, "step": 191 }, { "epoch": 6.023529411764706, "grad_norm": 2.107445240020752, "learning_rate": 1.5567207449798495e-05, "loss": 0.1413, "num_input_tokens_seen": 162530, "step": 192 }, { "epoch": 6.0549019607843135, "grad_norm": 1.9133341312408447, "learning_rate": 1.3925797299605625e-05, "loss": 0.098, "num_input_tokens_seen": 163339, "step": 193 }, { "epoch": 6.086274509803921, "grad_norm": 2.2540628910064697, "learning_rate": 1.2369331995613665e-05, "loss": 0.1409, "num_input_tokens_seen": 164183, "step": 194 }, { "epoch": 6.117647058823529, "grad_norm": 1.5692024230957031, "learning_rate": 1.08993475811632e-05, "loss": 0.1951, "num_input_tokens_seen": 165009, "step": 195 }, { "epoch": 6.149019607843138, "grad_norm": 1.885223150253296, "learning_rate": 9.517294753398064e-06, "loss": 0.1493, "num_input_tokens_seen": 165856, "step": 196 }, { "epoch": 6.180392156862745, "grad_norm": 1.3335139751434326, "learning_rate": 8.224537431601908e-06, "loss": 0.054, "num_input_tokens_seen": 166686, "step": 197 }, { "epoch": 6.211764705882353, "grad_norm": 2.255988359451294, "learning_rate": 7.022351411174855e-06, "loss": 0.1965, "num_input_tokens_seen": 167544, "step": 198 }, { "epoch": 6.243137254901961, "grad_norm": 4.721761703491211, "learning_rate": 5.911923104577455e-06, "loss": 0.1935, "num_input_tokens_seen": 168348, "step": 199 }, { "epoch": 6.2745098039215685, "grad_norm": 2.2551510334014893, "learning_rate": 4.8943483704846585e-06, "loss": 0.1393, "num_input_tokens_seen": 169231, "step": 200 }, { "epoch": 6.305882352941176, "grad_norm": 1.8611195087432861, "learning_rate": 3.970631432305694e-06, "loss": 0.1288, "num_input_tokens_seen": 170060, "step": 201 }, { "epoch": 6.337254901960784, "grad_norm": 2.192089796066284, "learning_rate": 3.141683887136904e-06, "loss": 0.2037, "num_input_tokens_seen": 170917, "step": 202 }, { "epoch": 6.368627450980393, "grad_norm": 1.2144159078598022, "learning_rate": 2.4083238061252678e-06, "loss": 0.0774, "num_input_tokens_seen": 171774, "step": 203 }, { "epoch": 6.4, "grad_norm": 1.9125508069992065, "learning_rate": 1.771274927131139e-06, "loss": 0.1388, "num_input_tokens_seen": 172589, "step": 204 }, { "epoch": 6.431372549019608, "grad_norm": 1.6493868827819824, "learning_rate": 1.231165940486223e-06, "loss": 0.1192, "num_input_tokens_seen": 173418, "step": 205 }, { "epoch": 6.462745098039216, "grad_norm": 1.91011643409729, "learning_rate": 7.885298685522235e-07, "loss": 0.0885, "num_input_tokens_seen": 174236, "step": 206 }, { "epoch": 6.4941176470588236, "grad_norm": 1.8174355030059814, "learning_rate": 4.438035396920115e-07, "loss": 0.2875, "num_input_tokens_seen": 175172, "step": 207 }, { "epoch": 6.525490196078431, "grad_norm": 1.7092390060424805, "learning_rate": 1.973271571728441e-07, "loss": 0.1039, "num_input_tokens_seen": 176023, "step": 208 }, { "epoch": 6.556862745098039, "grad_norm": 1.8284764289855957, "learning_rate": 4.934396342684e-08, "loss": 0.1365, "num_input_tokens_seen": 176906, "step": 209 }, { "epoch": 6.588235294117647, "grad_norm": 1.697880506515503, "learning_rate": 0.0002, "loss": 0.1889, "num_input_tokens_seen": 177795, "step": 210 }, { "epoch": 6.6196078431372545, "grad_norm": 1.8329994678497314, "learning_rate": 0.00019995065603657314, "loss": 0.2207, "num_input_tokens_seen": 178672, "step": 211 }, { "epoch": 6.650980392156863, "grad_norm": 2.528137683868408, "learning_rate": 0.00019980267284282717, "loss": 0.1294, "num_input_tokens_seen": 179466, "step": 212 }, { "epoch": 6.682352941176471, "grad_norm": 5.707077503204346, "learning_rate": 0.00019955619646030802, "loss": 0.1878, "num_input_tokens_seen": 180332, "step": 213 }, { "epoch": 6.713725490196079, "grad_norm": 5.38962459564209, "learning_rate": 0.0001992114701314478, "loss": 0.1539, "num_input_tokens_seen": 181169, "step": 214 }, { "epoch": 6.745098039215686, "grad_norm": 8.556888580322266, "learning_rate": 0.00019876883405951377, "loss": 0.3257, "num_input_tokens_seen": 182059, "step": 215 }, { "epoch": 6.776470588235294, "grad_norm": 5.112634658813477, "learning_rate": 0.00019822872507286888, "loss": 0.1682, "num_input_tokens_seen": 182890, "step": 216 }, { "epoch": 6.807843137254902, "grad_norm": 5.386922359466553, "learning_rate": 0.00019759167619387476, "loss": 0.1355, "num_input_tokens_seen": 183734, "step": 217 }, { "epoch": 6.8392156862745095, "grad_norm": 9.67380142211914, "learning_rate": 0.0001968583161128631, "loss": 0.1565, "num_input_tokens_seen": 184542, "step": 218 }, { "epoch": 6.870588235294118, "grad_norm": 4.442196369171143, "learning_rate": 0.00019602936856769434, "loss": 0.1822, "num_input_tokens_seen": 185420, "step": 219 }, { "epoch": 6.901960784313726, "grad_norm": 6.283408164978027, "learning_rate": 0.0001951056516295154, "loss": 0.2803, "num_input_tokens_seen": 186278, "step": 220 }, { "epoch": 6.933333333333334, "grad_norm": 2.322974920272827, "learning_rate": 0.0001940880768954225, "loss": 0.175, "num_input_tokens_seen": 187146, "step": 221 }, { "epoch": 6.964705882352941, "grad_norm": 4.757282733917236, "learning_rate": 0.00019297764858882514, "loss": 0.2458, "num_input_tokens_seen": 187998, "step": 222 }, { "epoch": 6.996078431372549, "grad_norm": 10.94787883758545, "learning_rate": 0.00019177546256839812, "loss": 0.4605, "num_input_tokens_seen": 188820, "step": 223 }, { "epoch": 7.027450980392157, "grad_norm": 3.9081947803497314, "learning_rate": 0.00019048270524660193, "loss": 0.1243, "num_input_tokens_seen": 189615, "step": 224 }, { "epoch": 7.0588235294117645, "grad_norm": 4.866150856018066, "learning_rate": 0.0001891006524188368, "loss": 0.2121, "num_input_tokens_seen": 190498, "step": 225 }, { "epoch": 7.090196078431372, "grad_norm": 2.8364100456237793, "learning_rate": 0.00018763066800438636, "loss": 0.2427, "num_input_tokens_seen": 191385, "step": 226 }, { "epoch": 7.12156862745098, "grad_norm": 1.5343717336654663, "learning_rate": 0.0001860742027003944, "loss": 0.0782, "num_input_tokens_seen": 192218, "step": 227 }, { "epoch": 7.152941176470589, "grad_norm": 2.824655771255493, "learning_rate": 0.0001844327925502015, "loss": 0.0852, "num_input_tokens_seen": 193056, "step": 228 }, { "epoch": 7.184313725490196, "grad_norm": 6.053825378417969, "learning_rate": 0.0001827080574274562, "loss": 0.2932, "num_input_tokens_seen": 193874, "step": 229 }, { "epoch": 7.215686274509804, "grad_norm": 7.972029685974121, "learning_rate": 0.0001809016994374948, "loss": 0.1712, "num_input_tokens_seen": 194758, "step": 230 }, { "epoch": 7.247058823529412, "grad_norm": 3.326019287109375, "learning_rate": 0.00017901550123756906, "loss": 0.1383, "num_input_tokens_seen": 195658, "step": 231 }, { "epoch": 7.2784313725490195, "grad_norm": 2.4085988998413086, "learning_rate": 0.000177051324277579, "loss": 0.0777, "num_input_tokens_seen": 196482, "step": 232 }, { "epoch": 7.309803921568627, "grad_norm": 4.651546001434326, "learning_rate": 0.00017501110696304596, "loss": 0.1553, "num_input_tokens_seen": 197329, "step": 233 }, { "epoch": 7.341176470588235, "grad_norm": 2.4471681118011475, "learning_rate": 0.00017289686274214112, "loss": 0.1341, "num_input_tokens_seen": 198158, "step": 234 }, { "epoch": 7.372549019607844, "grad_norm": 4.855953216552734, "learning_rate": 0.00017071067811865476, "loss": 0.081, "num_input_tokens_seen": 198961, "step": 235 }, { "epoch": 7.403921568627451, "grad_norm": 6.432670593261719, "learning_rate": 0.00016845471059286893, "loss": 0.0946, "num_input_tokens_seen": 199788, "step": 236 }, { "epoch": 7.435294117647059, "grad_norm": 4.775556564331055, "learning_rate": 0.00016613118653236518, "loss": 0.1548, "num_input_tokens_seen": 200637, "step": 237 }, { "epoch": 7.466666666666667, "grad_norm": 4.532850742340088, "learning_rate": 0.0001637423989748689, "loss": 0.0952, "num_input_tokens_seen": 201482, "step": 238 }, { "epoch": 7.4980392156862745, "grad_norm": 7.549116134643555, "learning_rate": 0.00016129070536529766, "loss": 0.2452, "num_input_tokens_seen": 202333, "step": 239 }, { "epoch": 7.529411764705882, "grad_norm": 3.5515332221984863, "learning_rate": 0.00015877852522924726, "loss": 0.1033, "num_input_tokens_seen": 203151, "step": 240 }, { "epoch": 7.56078431372549, "grad_norm": 4.602539539337158, "learning_rate": 0.00015620833778521304, "loss": 0.1349, "num_input_tokens_seen": 204022, "step": 241 }, { "epoch": 7.592156862745098, "grad_norm": 2.6608591079711914, "learning_rate": 0.00015358267949789971, "loss": 0.1057, "num_input_tokens_seen": 204856, "step": 242 }, { "epoch": 7.623529411764705, "grad_norm": 5.953851699829102, "learning_rate": 0.00015090414157503714, "loss": 0.168, "num_input_tokens_seen": 205751, "step": 243 }, { "epoch": 7.654901960784314, "grad_norm": 2.4022793769836426, "learning_rate": 0.00014817536741017158, "loss": 0.1534, "num_input_tokens_seen": 206614, "step": 244 }, { "epoch": 7.686274509803922, "grad_norm": 5.277859210968018, "learning_rate": 0.00014539904997395468, "loss": 0.1876, "num_input_tokens_seen": 207442, "step": 245 }, { "epoch": 7.7176470588235295, "grad_norm": 3.6555447578430176, "learning_rate": 0.0001425779291565073, "loss": 0.1528, "num_input_tokens_seen": 208268, "step": 246 }, { "epoch": 7.749019607843137, "grad_norm": 13.82836627960205, "learning_rate": 0.00013971478906347803, "loss": 0.2893, "num_input_tokens_seen": 209161, "step": 247 }, { "epoch": 7.780392156862745, "grad_norm": 4.631089687347412, "learning_rate": 0.00013681245526846785, "loss": 0.1996, "num_input_tokens_seen": 209987, "step": 248 }, { "epoch": 7.811764705882353, "grad_norm": 5.084930419921875, "learning_rate": 0.00013387379202452922, "loss": 0.2305, "num_input_tokens_seen": 210827, "step": 249 }, { "epoch": 7.8431372549019605, "grad_norm": 4.121197700500488, "learning_rate": 0.00013090169943749463, "loss": 0.3338, "num_input_tokens_seen": 211698, "step": 250 }, { "epoch": 7.874509803921569, "grad_norm": 3.6635348796844482, "learning_rate": 0.0001278991106039229, "loss": 0.2121, "num_input_tokens_seen": 212549, "step": 251 }, { "epoch": 7.905882352941177, "grad_norm": 2.271726369857788, "learning_rate": 0.0001248689887164855, "loss": 0.0888, "num_input_tokens_seen": 213388, "step": 252 }, { "epoch": 7.9372549019607845, "grad_norm": 2.883363962173462, "learning_rate": 0.00012181432413965421, "loss": 0.2263, "num_input_tokens_seen": 214226, "step": 253 }, { "epoch": 7.968627450980392, "grad_norm": 5.223812580108643, "learning_rate": 0.00011873813145857249, "loss": 0.1085, "num_input_tokens_seen": 215089, "step": 254 }, { "epoch": 8.0, "grad_norm": 3.1294803619384766, "learning_rate": 0.00011564344650402304, "loss": 0.2069, "num_input_tokens_seen": 215904, "step": 255 }, { "epoch": 8.031372549019608, "grad_norm": 3.5736420154571533, "learning_rate": 0.00011253332335643046, "loss": 0.0774, "num_input_tokens_seen": 216721, "step": 256 }, { "epoch": 8.062745098039215, "grad_norm": 1.6127935647964478, "learning_rate": 0.00010941083133185139, "loss": 0.0487, "num_input_tokens_seen": 217568, "step": 257 }, { "epoch": 8.094117647058823, "grad_norm": 1.9943934679031372, "learning_rate": 0.00010627905195293135, "loss": 0.049, "num_input_tokens_seen": 218393, "step": 258 }, { "epoch": 8.125490196078431, "grad_norm": 5.851587772369385, "learning_rate": 0.00010314107590781291, "loss": 0.121, "num_input_tokens_seen": 219230, "step": 259 }, { "epoch": 8.156862745098039, "grad_norm": 2.693629026412964, "learning_rate": 0.0001, "loss": 0.0664, "num_input_tokens_seen": 220048, "step": 260 }, { "epoch": 8.188235294117646, "grad_norm": 6.576751708984375, "learning_rate": 9.685892409218724e-05, "loss": 0.0818, "num_input_tokens_seen": 220906, "step": 261 }, { "epoch": 8.219607843137254, "grad_norm": 1.8238303661346436, "learning_rate": 9.372094804706867e-05, "loss": 0.0977, "num_input_tokens_seen": 221795, "step": 262 }, { "epoch": 8.250980392156862, "grad_norm": 2.1661157608032227, "learning_rate": 9.058916866814851e-05, "loss": 0.0617, "num_input_tokens_seen": 222644, "step": 263 }, { "epoch": 8.282352941176471, "grad_norm": 3.1537587642669678, "learning_rate": 8.746667664356956e-05, "loss": 0.1102, "num_input_tokens_seen": 223465, "step": 264 }, { "epoch": 8.313725490196079, "grad_norm": 12.85554027557373, "learning_rate": 8.435655349597699e-05, "loss": 0.1478, "num_input_tokens_seen": 224302, "step": 265 }, { "epoch": 8.345098039215687, "grad_norm": 4.586164474487305, "learning_rate": 8.126186854142752e-05, "loss": 0.2202, "num_input_tokens_seen": 225154, "step": 266 }, { "epoch": 8.376470588235295, "grad_norm": 3.9367291927337646, "learning_rate": 7.818567586034566e-05, "loss": 0.0682, "num_input_tokens_seen": 226040, "step": 267 }, { "epoch": 8.407843137254902, "grad_norm": 2.0368969440460205, "learning_rate": 7.513101128351452e-05, "loss": 0.0402, "num_input_tokens_seen": 226847, "step": 268 }, { "epoch": 8.43921568627451, "grad_norm": 1.0436691045761108, "learning_rate": 7.210088939607714e-05, "loss": 0.0185, "num_input_tokens_seen": 227680, "step": 269 }, { "epoch": 8.470588235294118, "grad_norm": 1.8092098236083984, "learning_rate": 6.909830056250524e-05, "loss": 0.0442, "num_input_tokens_seen": 228543, "step": 270 }, { "epoch": 8.501960784313725, "grad_norm": 1.526930332183838, "learning_rate": 6.612620797547091e-05, "loss": 0.0362, "num_input_tokens_seen": 229373, "step": 271 }, { "epoch": 8.533333333333333, "grad_norm": 1.8258460760116577, "learning_rate": 6.318754473153218e-05, "loss": 0.0457, "num_input_tokens_seen": 230190, "step": 272 }, { "epoch": 8.564705882352941, "grad_norm": 1.3038586378097534, "learning_rate": 6.0285210936521976e-05, "loss": 0.0831, "num_input_tokens_seen": 231024, "step": 273 }, { "epoch": 8.596078431372549, "grad_norm": 5.259337902069092, "learning_rate": 5.7422070843492734e-05, "loss": 0.1228, "num_input_tokens_seen": 231915, "step": 274 }, { "epoch": 8.627450980392156, "grad_norm": 0.8257218599319458, "learning_rate": 5.460095002604537e-05, "loss": 0.0171, "num_input_tokens_seen": 232711, "step": 275 }, { "epoch": 8.658823529411764, "grad_norm": 1.649411916732788, "learning_rate": 5.182463258982846e-05, "loss": 0.0366, "num_input_tokens_seen": 233534, "step": 276 }, { "epoch": 8.690196078431372, "grad_norm": 4.529787540435791, "learning_rate": 4.909585842496292e-05, "loss": 0.1006, "num_input_tokens_seen": 234406, "step": 277 }, { "epoch": 8.72156862745098, "grad_norm": 3.721914052963257, "learning_rate": 4.6417320502100425e-05, "loss": 0.1088, "num_input_tokens_seen": 235270, "step": 278 }, { "epoch": 8.75294117647059, "grad_norm": 8.44381046295166, "learning_rate": 4.379166221478697e-05, "loss": 0.1685, "num_input_tokens_seen": 236150, "step": 279 }, { "epoch": 8.784313725490197, "grad_norm": 3.0152742862701416, "learning_rate": 4.122147477075266e-05, "loss": 0.0961, "num_input_tokens_seen": 237030, "step": 280 }, { "epoch": 8.815686274509805, "grad_norm": 2.386852502822876, "learning_rate": 3.8709294634702376e-05, "loss": 0.112, "num_input_tokens_seen": 237889, "step": 281 }, { "epoch": 8.847058823529412, "grad_norm": 3.819979667663574, "learning_rate": 3.6257601025131094e-05, "loss": 0.0342, "num_input_tokens_seen": 238703, "step": 282 }, { "epoch": 8.87843137254902, "grad_norm": 2.879319190979004, "learning_rate": 3.386881346763483e-05, "loss": 0.1328, "num_input_tokens_seen": 239619, "step": 283 }, { "epoch": 8.909803921568628, "grad_norm": 4.421603679656982, "learning_rate": 3.1545289407131097e-05, "loss": 0.141, "num_input_tokens_seen": 240466, "step": 284 }, { "epoch": 8.941176470588236, "grad_norm": 1.440587043762207, "learning_rate": 2.9289321881345254e-05, "loss": 0.0686, "num_input_tokens_seen": 241295, "step": 285 }, { "epoch": 8.972549019607843, "grad_norm": 1.706859827041626, "learning_rate": 2.71031372578588e-05, "loss": 0.0272, "num_input_tokens_seen": 242157, "step": 286 }, { "epoch": 9.003921568627451, "grad_norm": 1.8288636207580566, "learning_rate": 2.4988893036954043e-05, "loss": 0.0571, "num_input_tokens_seen": 243008, "step": 287 }, { "epoch": 9.035294117647059, "grad_norm": 0.7292274236679077, "learning_rate": 2.294867572242112e-05, "loss": 0.0109, "num_input_tokens_seen": 243807, "step": 288 }, { "epoch": 9.066666666666666, "grad_norm": 1.1452817916870117, "learning_rate": 2.098449876243096e-05, "loss": 0.0239, "num_input_tokens_seen": 244652, "step": 289 }, { "epoch": 9.098039215686274, "grad_norm": 1.0900837182998657, "learning_rate": 1.909830056250529e-05, "loss": 0.0197, "num_input_tokens_seen": 245514, "step": 290 }, { "epoch": 9.129411764705882, "grad_norm": 1.1233842372894287, "learning_rate": 1.7291942572543807e-05, "loss": 0.059, "num_input_tokens_seen": 246368, "step": 291 }, { "epoch": 9.16078431372549, "grad_norm": 1.2153921127319336, "learning_rate": 1.5567207449798515e-05, "loss": 0.0487, "num_input_tokens_seen": 247234, "step": 292 }, { "epoch": 9.192156862745097, "grad_norm": 1.7913979291915894, "learning_rate": 1.3925797299605625e-05, "loss": 0.0462, "num_input_tokens_seen": 248080, "step": 293 }, { "epoch": 9.223529411764705, "grad_norm": 0.9466875195503235, "learning_rate": 1.2369331995613665e-05, "loss": 0.0196, "num_input_tokens_seen": 248920, "step": 294 }, { "epoch": 9.254901960784313, "grad_norm": 2.3814680576324463, "learning_rate": 1.0899347581163277e-05, "loss": 0.021, "num_input_tokens_seen": 249739, "step": 295 }, { "epoch": 9.286274509803922, "grad_norm": 1.2178651094436646, "learning_rate": 9.517294753397998e-06, "loss": 0.0089, "num_input_tokens_seen": 250537, "step": 296 }, { "epoch": 9.31764705882353, "grad_norm": 3.2499518394470215, "learning_rate": 8.224537431601864e-06, "loss": 0.0794, "num_input_tokens_seen": 251375, "step": 297 }, { "epoch": 9.349019607843138, "grad_norm": 2.116999626159668, "learning_rate": 7.022351411174888e-06, "loss": 0.0815, "num_input_tokens_seen": 252200, "step": 298 }, { "epoch": 9.380392156862746, "grad_norm": 1.607701301574707, "learning_rate": 5.911923104577455e-06, "loss": 0.0686, "num_input_tokens_seen": 253122, "step": 299 }, { "epoch": 9.411764705882353, "grad_norm": 1.501824975013733, "learning_rate": 4.8943483704846585e-06, "loss": 0.0453, "num_input_tokens_seen": 253966, "step": 300 }, { "epoch": 9.443137254901961, "grad_norm": 0.8923909664154053, "learning_rate": 3.970631432305694e-06, "loss": 0.0136, "num_input_tokens_seen": 254841, "step": 301 }, { "epoch": 9.474509803921569, "grad_norm": 1.3790185451507568, "learning_rate": 3.141683887136904e-06, "loss": 0.0812, "num_input_tokens_seen": 255706, "step": 302 }, { "epoch": 9.505882352941176, "grad_norm": 0.6012554168701172, "learning_rate": 2.4083238061252567e-06, "loss": 0.012, "num_input_tokens_seen": 256525, "step": 303 }, { "epoch": 9.537254901960784, "grad_norm": 0.9253413677215576, "learning_rate": 1.771274927131139e-06, "loss": 0.0088, "num_input_tokens_seen": 257340, "step": 304 }, { "epoch": 9.568627450980392, "grad_norm": 0.8364208936691284, "learning_rate": 1.2311659404862453e-06, "loss": 0.0128, "num_input_tokens_seen": 258165, "step": 305 }, { "epoch": 9.6, "grad_norm": 1.1087877750396729, "learning_rate": 7.885298685522235e-07, "loss": 0.0406, "num_input_tokens_seen": 259015, "step": 306 }, { "epoch": 9.631372549019607, "grad_norm": 1.227319359779358, "learning_rate": 4.438035396920115e-07, "loss": 0.0136, "num_input_tokens_seen": 259833, "step": 307 }, { "epoch": 9.662745098039215, "grad_norm": 1.0235058069229126, "learning_rate": 1.973271571728441e-07, "loss": 0.037, "num_input_tokens_seen": 260686, "step": 308 }, { "epoch": 9.694117647058823, "grad_norm": 1.069370150566101, "learning_rate": 4.934396342684e-08, "loss": 0.0366, "num_input_tokens_seen": 261513, "step": 309 }, { "epoch": 9.72549019607843, "grad_norm": 1.2721508741378784, "learning_rate": 0.0, "loss": 0.0282, "num_input_tokens_seen": 262364, "step": 310 } ], "logging_steps": 1, "max_steps": 310, "num_input_tokens_seen": 262364, "num_train_epochs": 10, "save_steps": 31, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5870993062052984e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }