{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.72549019607843,
  "eval_steps": 31,
  "global_step": 310,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03137254901960784,
      "grad_norm": 17.784626007080078,
      "learning_rate": 2e-05,
      "loss": 7.7199,
      "num_input_tokens_seen": 789,
      "step": 1
    },
    {
      "epoch": 0.06274509803921569,
      "grad_norm": 12.485037803649902,
      "learning_rate": 4e-05,
      "loss": 5.2762,
      "num_input_tokens_seen": 1656,
      "step": 2
    },
    {
      "epoch": 0.09411764705882353,
      "grad_norm": 7.9316840171813965,
      "learning_rate": 6e-05,
      "loss": 4.3107,
      "num_input_tokens_seen": 2529,
      "step": 3
    },
    {
      "epoch": 0.12549019607843137,
      "grad_norm": 9.721095085144043,
      "learning_rate": 8e-05,
      "loss": 4.8054,
      "num_input_tokens_seen": 3389,
      "step": 4
    },
    {
      "epoch": 0.1568627450980392,
      "grad_norm": 8.977057456970215,
      "learning_rate": 0.0001,
      "loss": 5.4644,
      "num_input_tokens_seen": 4244,
      "step": 5
    },
    {
      "epoch": 0.18823529411764706,
      "grad_norm": 10.086724281311035,
      "learning_rate": 0.00012,
      "loss": 5.6263,
      "num_input_tokens_seen": 5059,
      "step": 6
    },
    {
      "epoch": 0.2196078431372549,
      "grad_norm": 13.49176025390625,
      "learning_rate": 0.00014,
      "loss": 5.4706,
      "num_input_tokens_seen": 5918,
      "step": 7
    },
    {
      "epoch": 0.25098039215686274,
      "grad_norm": 11.12248706817627,
      "learning_rate": 0.00016,
      "loss": 4.4177,
      "num_input_tokens_seen": 6758,
      "step": 8
    },
    {
      "epoch": 0.2823529411764706,
      "grad_norm": 6.923247814178467,
      "learning_rate": 0.00018,
      "loss": 3.9388,
      "num_input_tokens_seen": 7653,
      "step": 9
    },
    {
      "epoch": 0.3137254901960784,
      "grad_norm": 11.242279052734375,
      "learning_rate": 0.0002,
      "loss": 4.3202,
      "num_input_tokens_seen": 8464,
      "step": 10
    },
    {
      "epoch": 0.34509803921568627,
      "grad_norm": 8.588418960571289,
      "learning_rate": 0.00019995065603657316,
      "loss": 3.116,
      "num_input_tokens_seen": 9348,
      "step": 11
    },
    {
      "epoch": 0.3764705882352941,
      "grad_norm": 9.73305892944336,
      "learning_rate": 0.00019980267284282717,
      "loss": 2.723,
      "num_input_tokens_seen": 10204,
      "step": 12
    },
    {
      "epoch": 0.40784313725490196,
      "grad_norm": 15.396673202514648,
      "learning_rate": 0.00019955619646030802,
      "loss": 2.4929,
      "num_input_tokens_seen": 11030,
      "step": 13
    },
    {
      "epoch": 0.4392156862745098,
      "grad_norm": 24.212421417236328,
      "learning_rate": 0.0001992114701314478,
      "loss": 2.7489,
      "num_input_tokens_seen": 11845,
      "step": 14
    },
    {
      "epoch": 0.47058823529411764,
      "grad_norm": 14.499921798706055,
      "learning_rate": 0.00019876883405951377,
      "loss": 1.8677,
      "num_input_tokens_seen": 12686,
      "step": 15
    },
    {
      "epoch": 0.5019607843137255,
      "grad_norm": 16.502588272094727,
      "learning_rate": 0.0001982287250728689,
      "loss": 1.3637,
      "num_input_tokens_seen": 13505,
      "step": 16
    },
    {
      "epoch": 0.5333333333333333,
      "grad_norm": 14.297043800354004,
      "learning_rate": 0.00019759167619387476,
      "loss": 1.5026,
      "num_input_tokens_seen": 14344,
      "step": 17
    },
    {
      "epoch": 0.5647058823529412,
      "grad_norm": 9.703812599182129,
      "learning_rate": 0.0001968583161128631,
      "loss": 1.4165,
      "num_input_tokens_seen": 15218,
      "step": 18
    },
    {
      "epoch": 0.596078431372549,
      "grad_norm": 12.656659126281738,
      "learning_rate": 0.0001960293685676943,
      "loss": 1.0906,
      "num_input_tokens_seen": 16040,
      "step": 19
    },
    {
      "epoch": 0.6274509803921569,
      "grad_norm": 58.49094772338867,
      "learning_rate": 0.00019510565162951537,
      "loss": 0.9817,
      "num_input_tokens_seen": 16874,
      "step": 20
    },
    {
      "epoch": 0.6588235294117647,
      "grad_norm": 9.382222175598145,
      "learning_rate": 0.00019408807689542257,
      "loss": 1.1198,
      "num_input_tokens_seen": 17682,
      "step": 21
    },
    {
      "epoch": 0.6901960784313725,
      "grad_norm": 10.244654655456543,
      "learning_rate": 0.00019297764858882514,
      "loss": 1.2596,
      "num_input_tokens_seen": 18539,
      "step": 22
    },
    {
      "epoch": 0.7215686274509804,
      "grad_norm": 8.23478889465332,
      "learning_rate": 0.00019177546256839812,
      "loss": 1.2768,
      "num_input_tokens_seen": 19432,
      "step": 23
    },
    {
      "epoch": 0.7529411764705882,
      "grad_norm": 19.18866729736328,
      "learning_rate": 0.00019048270524660196,
      "loss": 1.3043,
      "num_input_tokens_seen": 20264,
      "step": 24
    },
    {
      "epoch": 0.7843137254901961,
      "grad_norm": 6.661506175994873,
      "learning_rate": 0.0001891006524188368,
      "loss": 0.8769,
      "num_input_tokens_seen": 21113,
      "step": 25
    },
    {
      "epoch": 0.8156862745098039,
      "grad_norm": 6.4282546043396,
      "learning_rate": 0.00018763066800438636,
      "loss": 1.0473,
      "num_input_tokens_seen": 21966,
      "step": 26
    },
    {
      "epoch": 0.8470588235294118,
      "grad_norm": 11.599508285522461,
      "learning_rate": 0.0001860742027003944,
      "loss": 1.0205,
      "num_input_tokens_seen": 22821,
      "step": 27
    },
    {
      "epoch": 0.8784313725490196,
      "grad_norm": 9.272802352905273,
      "learning_rate": 0.00018443279255020152,
      "loss": 1.2292,
      "num_input_tokens_seen": 23733,
      "step": 28
    },
    {
      "epoch": 0.9098039215686274,
      "grad_norm": 7.99858283996582,
      "learning_rate": 0.00018270805742745617,
      "loss": 1.1247,
      "num_input_tokens_seen": 24593,
      "step": 29
    },
    {
      "epoch": 0.9411764705882353,
      "grad_norm": 10.046642303466797,
      "learning_rate": 0.00018090169943749476,
      "loss": 1.2097,
      "num_input_tokens_seen": 25425,
      "step": 30
    },
    {
      "epoch": 0.9725490196078431,
      "grad_norm": 9.499373435974121,
      "learning_rate": 0.00017901550123756904,
      "loss": 1.1345,
      "num_input_tokens_seen": 26228,
      "step": 31
    },
    {
      "epoch": 1.003921568627451,
      "grad_norm": 5.625377178192139,
      "learning_rate": 0.00017705132427757895,
      "loss": 1.1178,
      "num_input_tokens_seen": 27085,
      "step": 32
    },
    {
      "epoch": 1.035294117647059,
      "grad_norm": 4.716068744659424,
      "learning_rate": 0.00017501110696304596,
      "loss": 0.8997,
      "num_input_tokens_seen": 27917,
      "step": 33
    },
    {
      "epoch": 1.0666666666666667,
      "grad_norm": 4.880837440490723,
      "learning_rate": 0.00017289686274214118,
      "loss": 0.654,
      "num_input_tokens_seen": 28796,
      "step": 34
    },
    {
      "epoch": 1.0980392156862746,
      "grad_norm": 3.4638915061950684,
      "learning_rate": 0.00017071067811865476,
      "loss": 0.7083,
      "num_input_tokens_seen": 29650,
      "step": 35
    },
    {
      "epoch": 1.1294117647058823,
      "grad_norm": 6.4757561683654785,
      "learning_rate": 0.00016845471059286887,
      "loss": 0.981,
      "num_input_tokens_seen": 30477,
      "step": 36
    },
    {
      "epoch": 1.1607843137254903,
      "grad_norm": 6.666485786437988,
      "learning_rate": 0.00016613118653236518,
      "loss": 1.1384,
      "num_input_tokens_seen": 31314,
      "step": 37
    },
    {
      "epoch": 1.192156862745098,
      "grad_norm": 6.810871601104736,
      "learning_rate": 0.000163742398974869,
      "loss": 1.0914,
      "num_input_tokens_seen": 32129,
      "step": 38
    },
    {
      "epoch": 1.223529411764706,
      "grad_norm": 4.342460632324219,
      "learning_rate": 0.00016129070536529766,
      "loss": 0.784,
      "num_input_tokens_seen": 32980,
      "step": 39
    },
    {
      "epoch": 1.2549019607843137,
      "grad_norm": 6.6151204109191895,
      "learning_rate": 0.0001587785252292473,
      "loss": 1.151,
      "num_input_tokens_seen": 33811,
      "step": 40
    },
    {
      "epoch": 1.2862745098039214,
      "grad_norm": 6.701898097991943,
      "learning_rate": 0.00015620833778521307,
      "loss": 1.1066,
      "num_input_tokens_seen": 34640,
      "step": 41
    },
    {
      "epoch": 1.3176470588235294,
      "grad_norm": 3.7148549556732178,
      "learning_rate": 0.00015358267949789966,
      "loss": 1.1097,
      "num_input_tokens_seen": 35540,
      "step": 42
    },
    {
      "epoch": 1.3490196078431373,
      "grad_norm": 4.791841506958008,
      "learning_rate": 0.00015090414157503714,
      "loss": 0.9391,
      "num_input_tokens_seen": 36388,
      "step": 43
    },
    {
      "epoch": 1.380392156862745,
      "grad_norm": 6.712942123413086,
      "learning_rate": 0.00014817536741017152,
      "loss": 0.8025,
      "num_input_tokens_seen": 37254,
      "step": 44
    },
    {
      "epoch": 1.4117647058823528,
      "grad_norm": 3.204245090484619,
      "learning_rate": 0.00014539904997395468,
      "loss": 0.6842,
      "num_input_tokens_seen": 38098,
      "step": 45
    },
    {
      "epoch": 1.4431372549019608,
      "grad_norm": 4.535635471343994,
      "learning_rate": 0.00014257792915650728,
      "loss": 0.7039,
      "num_input_tokens_seen": 39024,
      "step": 46
    },
    {
      "epoch": 1.4745098039215687,
      "grad_norm": 5.812551021575928,
      "learning_rate": 0.00013971478906347806,
      "loss": 1.0178,
      "num_input_tokens_seen": 39876,
      "step": 47
    },
    {
      "epoch": 1.5058823529411764,
      "grad_norm": 5.7899169921875,
      "learning_rate": 0.00013681245526846783,
      "loss": 0.8895,
      "num_input_tokens_seen": 40691,
      "step": 48
    },
    {
      "epoch": 1.5372549019607842,
      "grad_norm": 4.779058933258057,
      "learning_rate": 0.00013387379202452917,
      "loss": 0.7015,
      "num_input_tokens_seen": 41529,
      "step": 49
    },
    {
      "epoch": 1.5686274509803921,
      "grad_norm": 4.17733907699585,
      "learning_rate": 0.00013090169943749476,
      "loss": 0.8385,
      "num_input_tokens_seen": 42347,
      "step": 50
    },
    {
      "epoch": 1.6,
      "grad_norm": 3.26069974899292,
      "learning_rate": 0.00012789911060392294,
      "loss": 0.6932,
      "num_input_tokens_seen": 43170,
      "step": 51
    },
    {
      "epoch": 1.6313725490196078,
      "grad_norm": 3.7764294147491455,
      "learning_rate": 0.00012486898871648546,
      "loss": 0.6722,
      "num_input_tokens_seen": 44017,
      "step": 52
    },
    {
      "epoch": 1.6627450980392156,
      "grad_norm": 5.631955146789551,
      "learning_rate": 0.00012181432413965425,
      "loss": 0.8225,
      "num_input_tokens_seen": 44895,
      "step": 53
    },
    {
      "epoch": 1.6941176470588235,
      "grad_norm": 3.0410749912261963,
      "learning_rate": 0.00011873813145857249,
      "loss": 0.5853,
      "num_input_tokens_seen": 45780,
      "step": 54
    },
    {
      "epoch": 1.7254901960784315,
      "grad_norm": 6.754289627075195,
      "learning_rate": 0.00011564344650402312,
      "loss": 0.8272,
      "num_input_tokens_seen": 46623,
      "step": 55
    },
    {
      "epoch": 1.7568627450980392,
      "grad_norm": 4.92588472366333,
      "learning_rate": 0.00011253332335643046,
      "loss": 0.588,
      "num_input_tokens_seen": 47446,
      "step": 56
    },
    {
      "epoch": 1.788235294117647,
      "grad_norm": 5.982916831970215,
      "learning_rate": 0.00010941083133185143,
      "loss": 0.9413,
      "num_input_tokens_seen": 48260,
      "step": 57
    },
    {
      "epoch": 1.8196078431372549,
      "grad_norm": 3.09069561958313,
      "learning_rate": 0.00010627905195293135,
      "loss": 0.7281,
      "num_input_tokens_seen": 49095,
      "step": 58
    },
    {
      "epoch": 1.8509803921568628,
      "grad_norm": 7.942086219787598,
      "learning_rate": 0.00010314107590781284,
      "loss": 0.8998,
      "num_input_tokens_seen": 49948,
      "step": 59
    },
    {
      "epoch": 1.8823529411764706,
      "grad_norm": 6.040829181671143,
      "learning_rate": 0.0001,
      "loss": 0.7494,
      "num_input_tokens_seen": 50794,
      "step": 60
    },
    {
      "epoch": 1.9137254901960783,
      "grad_norm": 5.524172782897949,
      "learning_rate": 9.685892409218717e-05,
      "loss": 1.1642,
      "num_input_tokens_seen": 51649,
      "step": 61
    },
    {
      "epoch": 1.9450980392156862,
      "grad_norm": 4.093544960021973,
      "learning_rate": 9.372094804706867e-05,
      "loss": 0.9137,
      "num_input_tokens_seen": 52499,
      "step": 62
    },
    {
      "epoch": 1.9764705882352942,
      "grad_norm": 3.3669676780700684,
      "learning_rate": 9.058916866814858e-05,
      "loss": 0.7343,
      "num_input_tokens_seen": 53345,
      "step": 63
    },
    {
      "epoch": 2.007843137254902,
      "grad_norm": 6.184525966644287,
      "learning_rate": 8.746667664356956e-05,
      "loss": 0.8992,
      "num_input_tokens_seen": 54201,
      "step": 64
    },
    {
      "epoch": 2.0392156862745097,
      "grad_norm": 4.793566703796387,
      "learning_rate": 8.435655349597695e-05,
      "loss": 0.6028,
      "num_input_tokens_seen": 55036,
      "step": 65
    },
    {
      "epoch": 2.070588235294118,
      "grad_norm": 4.015380859375,
      "learning_rate": 8.126186854142752e-05,
      "loss": 0.6597,
      "num_input_tokens_seen": 55868,
      "step": 66
    },
    {
      "epoch": 2.1019607843137256,
      "grad_norm": 2.2828500270843506,
      "learning_rate": 7.818567586034575e-05,
      "loss": 0.5288,
      "num_input_tokens_seen": 56724,
      "step": 67
    },
    {
      "epoch": 2.1333333333333333,
      "grad_norm": 3.3936123847961426,
      "learning_rate": 7.513101128351454e-05,
      "loss": 0.6818,
      "num_input_tokens_seen": 57579,
      "step": 68
    },
    {
      "epoch": 2.164705882352941,
      "grad_norm": 3.684924602508545,
      "learning_rate": 7.210088939607708e-05,
      "loss": 0.7798,
      "num_input_tokens_seen": 58425,
      "step": 69
    },
    {
      "epoch": 2.196078431372549,
      "grad_norm": 3.1493935585021973,
      "learning_rate": 6.909830056250524e-05,
      "loss": 0.7207,
      "num_input_tokens_seen": 59270,
      "step": 70
    },
    {
      "epoch": 2.227450980392157,
      "grad_norm": 5.103849411010742,
      "learning_rate": 6.612620797547087e-05,
      "loss": 0.6483,
      "num_input_tokens_seen": 60127,
      "step": 71
    },
    {
      "epoch": 2.2588235294117647,
      "grad_norm": 1.9273674488067627,
      "learning_rate": 6.318754473153221e-05,
      "loss": 0.5992,
      "num_input_tokens_seen": 61059,
      "step": 72
    },
    {
      "epoch": 2.2901960784313724,
      "grad_norm": 2.9266533851623535,
      "learning_rate": 6.0285210936521955e-05,
      "loss": 0.6952,
      "num_input_tokens_seen": 61905,
      "step": 73
    },
    {
      "epoch": 2.3215686274509806,
      "grad_norm": 2.3648922443389893,
      "learning_rate": 5.7422070843492734e-05,
      "loss": 0.5068,
      "num_input_tokens_seen": 62745,
      "step": 74
    },
    {
      "epoch": 2.3529411764705883,
      "grad_norm": 3.8562815189361572,
      "learning_rate": 5.4600950026045326e-05,
      "loss": 0.7024,
      "num_input_tokens_seen": 63594,
      "step": 75
    },
    {
      "epoch": 2.384313725490196,
      "grad_norm": 2.478416919708252,
      "learning_rate": 5.182463258982846e-05,
      "loss": 0.5721,
      "num_input_tokens_seen": 64453,
      "step": 76
    },
    {
      "epoch": 2.4156862745098038,
      "grad_norm": 2.117917776107788,
      "learning_rate": 4.909585842496292e-05,
      "loss": 0.4472,
      "num_input_tokens_seen": 65284,
      "step": 77
    },
    {
      "epoch": 2.447058823529412,
      "grad_norm": 2.4827864170074463,
      "learning_rate": 4.641732050210036e-05,
      "loss": 0.414,
      "num_input_tokens_seen": 66110,
      "step": 78
    },
    {
      "epoch": 2.4784313725490197,
      "grad_norm": 3.7768778800964355,
      "learning_rate": 4.3791662214786934e-05,
      "loss": 0.5808,
      "num_input_tokens_seen": 66934,
      "step": 79
    },
    {
      "epoch": 2.5098039215686274,
      "grad_norm": 3.4247841835021973,
      "learning_rate": 4.12214747707527e-05,
      "loss": 0.5447,
      "num_input_tokens_seen": 67775,
      "step": 80
    },
    {
      "epoch": 2.541176470588235,
      "grad_norm": 4.182628631591797,
      "learning_rate": 3.8709294634702376e-05,
      "loss": 0.4573,
      "num_input_tokens_seen": 68630,
      "step": 81
    },
    {
      "epoch": 2.572549019607843,
      "grad_norm": 4.045778274536133,
      "learning_rate": 3.6257601025131026e-05,
      "loss": 0.7857,
      "num_input_tokens_seen": 69484,
      "step": 82
    },
    {
      "epoch": 2.603921568627451,
      "grad_norm": 2.6151039600372314,
      "learning_rate": 3.386881346763483e-05,
      "loss": 0.7443,
      "num_input_tokens_seen": 70355,
      "step": 83
    },
    {
      "epoch": 2.635294117647059,
      "grad_norm": 3.3898870944976807,
      "learning_rate": 3.154528940713113e-05,
      "loss": 0.5147,
      "num_input_tokens_seen": 71198,
      "step": 84
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 1.8886942863464355,
      "learning_rate": 2.9289321881345254e-05,
      "loss": 0.3022,
      "num_input_tokens_seen": 72040,
      "step": 85
    },
    {
      "epoch": 2.6980392156862747,
      "grad_norm": 5.151848316192627,
      "learning_rate": 2.7103137257858868e-05,
      "loss": 0.3934,
      "num_input_tokens_seen": 72907,
      "step": 86
    },
    {
      "epoch": 2.7294117647058824,
      "grad_norm": 3.1100125312805176,
      "learning_rate": 2.4988893036954043e-05,
      "loss": 0.6222,
      "num_input_tokens_seen": 73742,
      "step": 87
    },
    {
      "epoch": 2.76078431372549,
      "grad_norm": 2.5638794898986816,
      "learning_rate": 2.2948675722421086e-05,
      "loss": 0.5721,
      "num_input_tokens_seen": 74587,
      "step": 88
    },
    {
      "epoch": 2.792156862745098,
      "grad_norm": 3.2768828868865967,
      "learning_rate": 2.098449876243098e-05,
      "loss": 0.5144,
      "num_input_tokens_seen": 75433,
      "step": 89
    },
    {
      "epoch": 2.8235294117647056,
      "grad_norm": 4.178157806396484,
      "learning_rate": 1.9098300562505266e-05,
      "loss": 0.463,
      "num_input_tokens_seen": 76218,
      "step": 90
    },
    {
      "epoch": 2.854901960784314,
      "grad_norm": 4.432765960693359,
      "learning_rate": 1.7291942572543807e-05,
      "loss": 0.6351,
      "num_input_tokens_seen": 77053,
      "step": 91
    },
    {
      "epoch": 2.8862745098039215,
      "grad_norm": 2.8439605236053467,
      "learning_rate": 1.5567207449798515e-05,
      "loss": 0.5897,
      "num_input_tokens_seen": 77899,
      "step": 92
    },
    {
      "epoch": 2.9176470588235293,
      "grad_norm": 3.0817153453826904,
      "learning_rate": 1.3925797299605625e-05,
      "loss": 0.6083,
      "num_input_tokens_seen": 78767,
      "step": 93
    },
    {
      "epoch": 2.9490196078431374,
      "grad_norm": 3.2820534706115723,
      "learning_rate": 1.2369331995613643e-05,
      "loss": 0.5075,
      "num_input_tokens_seen": 79591,
      "step": 94
    },
    {
      "epoch": 2.980392156862745,
      "grad_norm": 4.051931858062744,
      "learning_rate": 1.0899347581163221e-05,
      "loss": 0.4958,
      "num_input_tokens_seen": 80422,
      "step": 95
    },
    {
      "epoch": 3.011764705882353,
      "grad_norm": 3.6439077854156494,
      "learning_rate": 9.517294753398042e-06,
      "loss": 0.8383,
      "num_input_tokens_seen": 81282,
      "step": 96
    },
    {
      "epoch": 3.0431372549019606,
      "grad_norm": 3.559818983078003,
      "learning_rate": 8.224537431601908e-06,
      "loss": 0.5131,
      "num_input_tokens_seen": 82101,
      "step": 97
    },
    {
      "epoch": 3.074509803921569,
      "grad_norm": 2.2776846885681152,
      "learning_rate": 7.022351411174866e-06,
      "loss": 0.5896,
      "num_input_tokens_seen": 82989,
      "step": 98
    },
    {
      "epoch": 3.1058823529411765,
      "grad_norm": 2.8956410884857178,
      "learning_rate": 5.911923104577455e-06,
      "loss": 0.5902,
      "num_input_tokens_seen": 83838,
      "step": 99
    },
    {
      "epoch": 3.1372549019607843,
      "grad_norm": 2.3653640747070312,
      "learning_rate": 4.8943483704846585e-06,
      "loss": 0.278,
      "num_input_tokens_seen": 84654,
      "step": 100
    },
    {
      "epoch": 3.168627450980392,
      "grad_norm": 2.7463765144348145,
      "learning_rate": 3.970631432305694e-06,
      "loss": 0.4051,
      "num_input_tokens_seen": 85493,
      "step": 101
    },
    {
      "epoch": 3.2,
      "grad_norm": 1.9491956233978271,
      "learning_rate": 3.141683887136904e-06,
      "loss": 0.3255,
      "num_input_tokens_seen": 86381,
      "step": 102
    },
    {
      "epoch": 3.231372549019608,
      "grad_norm": 3.7543513774871826,
      "learning_rate": 2.4083238061252678e-06,
      "loss": 0.5123,
      "num_input_tokens_seen": 87182,
      "step": 103
    },
    {
      "epoch": 3.2627450980392156,
      "grad_norm": 2.68157958984375,
      "learning_rate": 1.771274927131128e-06,
      "loss": 0.5409,
      "num_input_tokens_seen": 88059,
      "step": 104
    },
    {
      "epoch": 3.2941176470588234,
      "grad_norm": 2.971251964569092,
      "learning_rate": 1.231165940486234e-06,
      "loss": 0.3435,
      "num_input_tokens_seen": 88889,
      "step": 105
    },
    {
      "epoch": 3.3254901960784315,
      "grad_norm": 2.606720209121704,
      "learning_rate": 7.885298685522235e-07,
      "loss": 0.5119,
      "num_input_tokens_seen": 89747,
      "step": 106
    },
    {
      "epoch": 3.3568627450980393,
      "grad_norm": 2.622124433517456,
      "learning_rate": 4.438035396920004e-07,
      "loss": 0.5565,
      "num_input_tokens_seen": 90612,
      "step": 107
    },
    {
      "epoch": 3.388235294117647,
      "grad_norm": 2.256873846054077,
      "learning_rate": 1.973271571728441e-07,
      "loss": 0.4164,
      "num_input_tokens_seen": 91453,
      "step": 108
    },
    {
      "epoch": 3.4196078431372547,
      "grad_norm": 3.2237167358398438,
      "learning_rate": 4.934396342684e-08,
      "loss": 0.5135,
      "num_input_tokens_seen": 92304,
      "step": 109
    },
    {
      "epoch": 3.450980392156863,
      "grad_norm": 2.807044267654419,
      "learning_rate": 0.0002,
      "loss": 0.4391,
      "num_input_tokens_seen": 93130,
      "step": 110
    },
    {
      "epoch": 3.4823529411764707,
      "grad_norm": 2.9154107570648193,
      "learning_rate": 0.00019995065603657316,
      "loss": 0.5719,
      "num_input_tokens_seen": 93953,
      "step": 111
    },
    {
      "epoch": 3.5137254901960784,
      "grad_norm": 3.265385150909424,
      "learning_rate": 0.00019980267284282717,
      "loss": 0.7463,
      "num_input_tokens_seen": 94803,
      "step": 112
    },
    {
      "epoch": 3.545098039215686,
      "grad_norm": 5.992981910705566,
      "learning_rate": 0.00019955619646030802,
      "loss": 0.7756,
      "num_input_tokens_seen": 95673,
      "step": 113
    },
    {
      "epoch": 3.576470588235294,
      "grad_norm": 6.979043483734131,
      "learning_rate": 0.0001992114701314478,
      "loss": 0.5761,
      "num_input_tokens_seen": 96559,
      "step": 114
    },
    {
      "epoch": 3.607843137254902,
      "grad_norm": 9.418047904968262,
      "learning_rate": 0.00019876883405951377,
      "loss": 0.7796,
      "num_input_tokens_seen": 97366,
      "step": 115
    },
    {
      "epoch": 3.6392156862745098,
      "grad_norm": 3.6083357334136963,
      "learning_rate": 0.00019822872507286888,
      "loss": 0.3696,
      "num_input_tokens_seen": 98197,
      "step": 116
    },
    {
      "epoch": 3.6705882352941175,
      "grad_norm": 4.184770584106445,
      "learning_rate": 0.00019759167619387476,
      "loss": 0.4349,
      "num_input_tokens_seen": 99019,
      "step": 117
    },
    {
      "epoch": 3.7019607843137257,
      "grad_norm": 5.8201799392700195,
      "learning_rate": 0.0001968583161128631,
      "loss": 0.434,
      "num_input_tokens_seen": 99866,
      "step": 118
    },
    {
      "epoch": 3.7333333333333334,
      "grad_norm": 5.535106182098389,
      "learning_rate": 0.0001960293685676943,
      "loss": 0.6474,
      "num_input_tokens_seen": 100723,
      "step": 119
    },
    {
      "epoch": 3.764705882352941,
      "grad_norm": 3.3299312591552734,
      "learning_rate": 0.00019510565162951537,
      "loss": 0.4361,
      "num_input_tokens_seen": 101581,
      "step": 120
    },
    {
      "epoch": 3.796078431372549,
      "grad_norm": 7.6674323081970215,
      "learning_rate": 0.00019408807689542257,
      "loss": 0.575,
      "num_input_tokens_seen": 102418,
      "step": 121
    },
    {
      "epoch": 3.8274509803921566,
      "grad_norm": 7.189317226409912,
      "learning_rate": 0.00019297764858882514,
      "loss": 0.8572,
      "num_input_tokens_seen": 103281,
      "step": 122
    },
    {
      "epoch": 3.8588235294117648,
      "grad_norm": 4.600097179412842,
      "learning_rate": 0.00019177546256839812,
      "loss": 0.6253,
      "num_input_tokens_seen": 104128,
      "step": 123
    },
    {
      "epoch": 3.8901960784313725,
      "grad_norm": 6.0382585525512695,
      "learning_rate": 0.00019048270524660193,
      "loss": 0.4465,
      "num_input_tokens_seen": 104972,
      "step": 124
    },
    {
      "epoch": 3.9215686274509802,
      "grad_norm": 5.221591949462891,
      "learning_rate": 0.0001891006524188368,
      "loss": 0.6236,
      "num_input_tokens_seen": 105833,
      "step": 125
    },
    {
      "epoch": 3.9529411764705884,
      "grad_norm": 4.43485164642334,
      "learning_rate": 0.0001876306680043864,
      "loss": 0.6026,
      "num_input_tokens_seen": 106656,
      "step": 126
    },
    {
      "epoch": 3.984313725490196,
      "grad_norm": 4.633317947387695,
      "learning_rate": 0.0001860742027003944,
      "loss": 0.4891,
      "num_input_tokens_seen": 107479,
      "step": 127
    },
    {
      "epoch": 4.015686274509804,
      "grad_norm": 2.710853338241577,
      "learning_rate": 0.00018443279255020155,
      "loss": 0.4361,
      "num_input_tokens_seen": 108376,
      "step": 128
    },
    {
      "epoch": 4.047058823529412,
      "grad_norm": 3.0707015991210938,
      "learning_rate": 0.0001827080574274562,
      "loss": 0.4198,
      "num_input_tokens_seen": 109228,
      "step": 129
    },
    {
      "epoch": 4.078431372549019,
      "grad_norm": 2.8015506267547607,
      "learning_rate": 0.0001809016994374947,
      "loss": 0.4515,
      "num_input_tokens_seen": 110061,
      "step": 130
    },
    {
      "epoch": 4.109803921568627,
      "grad_norm": 3.3442440032958984,
      "learning_rate": 0.00017901550123756906,
      "loss": 0.3318,
      "num_input_tokens_seen": 110927,
      "step": 131
    },
    {
      "epoch": 4.141176470588236,
      "grad_norm": 2.747676372528076,
      "learning_rate": 0.00017705132427757895,
      "loss": 0.4279,
      "num_input_tokens_seen": 111786,
      "step": 132
    },
    {
      "epoch": 4.172549019607843,
      "grad_norm": 3.85593581199646,
      "learning_rate": 0.00017501110696304596,
      "loss": 0.3526,
      "num_input_tokens_seen": 112632,
      "step": 133
    },
    {
      "epoch": 4.203921568627451,
      "grad_norm": 3.7865078449249268,
      "learning_rate": 0.00017289686274214118,
      "loss": 0.4617,
      "num_input_tokens_seen": 113486,
      "step": 134
    },
    {
      "epoch": 4.235294117647059,
      "grad_norm": 2.6525180339813232,
      "learning_rate": 0.00017071067811865476,
      "loss": 0.2388,
      "num_input_tokens_seen": 114336,
      "step": 135
    },
    {
      "epoch": 4.266666666666667,
      "grad_norm": 10.199384689331055,
      "learning_rate": 0.00016845471059286887,
      "loss": 0.4286,
      "num_input_tokens_seen": 115202,
      "step": 136
    },
    {
      "epoch": 4.298039215686274,
      "grad_norm": 3.65986967086792,
      "learning_rate": 0.00016613118653236518,
      "loss": 0.478,
      "num_input_tokens_seen": 116036,
      "step": 137
    },
    {
      "epoch": 4.329411764705882,
      "grad_norm": 2.575523614883423,
      "learning_rate": 0.000163742398974869,
      "loss": 0.229,
      "num_input_tokens_seen": 116868,
      "step": 138
    },
    {
      "epoch": 4.36078431372549,
      "grad_norm": 5.756189346313477,
      "learning_rate": 0.00016129070536529766,
      "loss": 0.5141,
      "num_input_tokens_seen": 117745,
      "step": 139
    },
    {
      "epoch": 4.392156862745098,
      "grad_norm": 3.640237331390381,
      "learning_rate": 0.0001587785252292473,
      "loss": 0.2679,
      "num_input_tokens_seen": 118582,
      "step": 140
    },
    {
      "epoch": 4.423529411764706,
      "grad_norm": 4.511353015899658,
      "learning_rate": 0.00015620833778521304,
      "loss": 0.381,
      "num_input_tokens_seen": 119445,
      "step": 141
    },
    {
      "epoch": 4.454901960784314,
      "grad_norm": 12.773442268371582,
      "learning_rate": 0.00015358267949789966,
      "loss": 0.5296,
      "num_input_tokens_seen": 120241,
      "step": 142
    },
    {
      "epoch": 4.486274509803922,
      "grad_norm": 3.347665548324585,
      "learning_rate": 0.00015090414157503714,
      "loss": 0.3427,
      "num_input_tokens_seen": 121096,
      "step": 143
    },
    {
      "epoch": 4.517647058823529,
      "grad_norm": 3.0967466831207275,
      "learning_rate": 0.00014817536741017158,
      "loss": 0.3916,
      "num_input_tokens_seen": 121910,
      "step": 144
    },
    {
      "epoch": 4.549019607843137,
      "grad_norm": 2.6861841678619385,
      "learning_rate": 0.00014539904997395468,
      "loss": 0.3054,
      "num_input_tokens_seen": 122762,
      "step": 145
    },
    {
      "epoch": 4.580392156862745,
      "grad_norm": 4.615218162536621,
      "learning_rate": 0.0001425779291565073,
      "loss": 0.3923,
      "num_input_tokens_seen": 123593,
      "step": 146
    },
    {
      "epoch": 4.6117647058823525,
      "grad_norm": 6.789853572845459,
      "learning_rate": 0.00013971478906347803,
      "loss": 0.438,
      "num_input_tokens_seen": 124446,
      "step": 147
    },
    {
      "epoch": 4.643137254901961,
      "grad_norm": 4.128298282623291,
      "learning_rate": 0.00013681245526846777,
      "loss": 0.4208,
      "num_input_tokens_seen": 125313,
      "step": 148
    },
    {
      "epoch": 4.674509803921569,
      "grad_norm": 3.4282140731811523,
      "learning_rate": 0.00013387379202452917,
      "loss": 0.3717,
      "num_input_tokens_seen": 126184,
      "step": 149
    },
    {
      "epoch": 4.705882352941177,
      "grad_norm": 6.779294967651367,
      "learning_rate": 0.00013090169943749476,
      "loss": 0.4152,
      "num_input_tokens_seen": 127026,
      "step": 150
    },
    {
      "epoch": 4.737254901960784,
      "grad_norm": 3.086122989654541,
      "learning_rate": 0.00012789911060392294,
      "loss": 0.3011,
      "num_input_tokens_seen": 127883,
      "step": 151
    },
    {
      "epoch": 4.768627450980392,
      "grad_norm": 5.97057580947876,
      "learning_rate": 0.0001248689887164855,
      "loss": 0.3705,
      "num_input_tokens_seen": 128689,
      "step": 152
    },
    {
      "epoch": 4.8,
      "grad_norm": 4.796477317810059,
      "learning_rate": 0.00012181432413965421,
      "loss": 0.4362,
      "num_input_tokens_seen": 129520,
      "step": 153
    },
    {
      "epoch": 4.8313725490196076,
      "grad_norm": 3.3298208713531494,
      "learning_rate": 0.00011873813145857249,
      "loss": 0.4988,
      "num_input_tokens_seen": 130406,
      "step": 154
    },
    {
      "epoch": 4.862745098039216,
      "grad_norm": 3.887026071548462,
      "learning_rate": 0.00011564344650402312,
      "loss": 0.5444,
      "num_input_tokens_seen": 131276,
      "step": 155
    },
    {
      "epoch": 4.894117647058824,
      "grad_norm": 6.220418453216553,
      "learning_rate": 0.00011253332335643046,
      "loss": 0.4078,
      "num_input_tokens_seen": 132090,
      "step": 156
    },
    {
      "epoch": 4.925490196078432,
      "grad_norm": 2.757842779159546,
      "learning_rate": 0.00010941083133185146,
      "loss": 0.3539,
      "num_input_tokens_seen": 132909,
      "step": 157
    },
    {
      "epoch": 4.956862745098039,
      "grad_norm": 2.275514602661133,
      "learning_rate": 0.00010627905195293135,
      "loss": 0.3512,
      "num_input_tokens_seen": 133799,
      "step": 158
    },
    {
      "epoch": 4.988235294117647,
      "grad_norm": 3.4296388626098633,
      "learning_rate": 0.00010314107590781284,
      "loss": 0.3426,
      "num_input_tokens_seen": 134640,
      "step": 159
    },
    {
      "epoch": 5.019607843137255,
      "grad_norm": 1.8311214447021484,
      "learning_rate": 0.0001,
      "loss": 0.1619,
      "num_input_tokens_seen": 135491,
      "step": 160
    },
    {
      "epoch": 5.050980392156863,
      "grad_norm": 2.676769495010376,
      "learning_rate": 9.685892409218724e-05,
      "loss": 0.221,
      "num_input_tokens_seen": 136317,
      "step": 161
    },
    {
      "epoch": 5.08235294117647,
      "grad_norm": 1.8601126670837402,
      "learning_rate": 9.372094804706867e-05,
      "loss": 0.2427,
      "num_input_tokens_seen": 137191,
      "step": 162
    },
    {
      "epoch": 5.113725490196078,
      "grad_norm": 1.8501461744308472,
      "learning_rate": 9.058916866814858e-05,
      "loss": 0.2443,
      "num_input_tokens_seen": 138029,
      "step": 163
    },
    {
      "epoch": 5.145098039215687,
      "grad_norm": 1.612574815750122,
      "learning_rate": 8.746667664356956e-05,
      "loss": 0.1391,
      "num_input_tokens_seen": 138848,
      "step": 164
    },
    {
      "epoch": 5.176470588235294,
      "grad_norm": 1.7896589040756226,
      "learning_rate": 8.435655349597684e-05,
      "loss": 0.0778,
      "num_input_tokens_seen": 139648,
      "step": 165
    },
    {
      "epoch": 5.207843137254902,
      "grad_norm": 3.172090530395508,
      "learning_rate": 8.126186854142752e-05,
      "loss": 0.2737,
      "num_input_tokens_seen": 140500,
      "step": 166
    },
    {
      "epoch": 5.23921568627451,
      "grad_norm": 2.280182361602783,
      "learning_rate": 7.818567586034581e-05,
      "loss": 0.18,
      "num_input_tokens_seen": 141331,
      "step": 167
    },
    {
      "epoch": 5.270588235294118,
      "grad_norm": 3.10487699508667,
      "learning_rate": 7.513101128351459e-05,
      "loss": 0.2636,
      "num_input_tokens_seen": 142188,
      "step": 168
    },
    {
      "epoch": 5.301960784313725,
      "grad_norm": 2.16359543800354,
      "learning_rate": 7.210088939607707e-05,
      "loss": 0.1788,
      "num_input_tokens_seen": 143005,
      "step": 169
    },
    {
      "epoch": 5.333333333333333,
      "grad_norm": 3.690769910812378,
      "learning_rate": 6.909830056250524e-05,
      "loss": 0.1271,
      "num_input_tokens_seen": 143796,
      "step": 170
    },
    {
      "epoch": 5.364705882352941,
      "grad_norm": 4.808859348297119,
      "learning_rate": 6.612620797547091e-05,
      "loss": 0.1235,
      "num_input_tokens_seen": 144594,
      "step": 171
    },
    {
      "epoch": 5.396078431372549,
      "grad_norm": 2.4017834663391113,
      "learning_rate": 6.318754473153218e-05,
      "loss": 0.1417,
      "num_input_tokens_seen": 145432,
      "step": 172
    },
    {
      "epoch": 5.427450980392157,
      "grad_norm": 10.861905097961426,
      "learning_rate": 6.0285210936521976e-05,
      "loss": 0.3499,
      "num_input_tokens_seen": 146307,
      "step": 173
    },
    {
      "epoch": 5.458823529411765,
      "grad_norm": 6.508179664611816,
      "learning_rate": 5.742207084349277e-05,
      "loss": 0.2636,
      "num_input_tokens_seen": 147159,
      "step": 174
    },
    {
      "epoch": 5.490196078431373,
      "grad_norm": 3.551328659057617,
      "learning_rate": 5.4600950026045285e-05,
      "loss": 0.1685,
      "num_input_tokens_seen": 147990,
      "step": 175
    },
    {
      "epoch": 5.52156862745098,
      "grad_norm": 5.636049747467041,
      "learning_rate": 5.182463258982846e-05,
      "loss": 0.2804,
      "num_input_tokens_seen": 148782,
      "step": 176
    },
    {
      "epoch": 5.552941176470588,
      "grad_norm": 4.598341941833496,
      "learning_rate": 4.909585842496292e-05,
      "loss": 0.3087,
      "num_input_tokens_seen": 149687,
      "step": 177
    },
    {
      "epoch": 5.584313725490196,
      "grad_norm": 5.29741907119751,
      "learning_rate": 4.6417320502100316e-05,
      "loss": 0.3299,
      "num_input_tokens_seen": 150565,
      "step": 178
    },
    {
      "epoch": 5.6156862745098035,
      "grad_norm": 3.7548561096191406,
      "learning_rate": 4.379166221478697e-05,
      "loss": 0.3047,
      "num_input_tokens_seen": 151419,
      "step": 179
    },
    {
      "epoch": 5.647058823529412,
      "grad_norm": 2.995479106903076,
      "learning_rate": 4.12214747707527e-05,
      "loss": 0.2234,
      "num_input_tokens_seen": 152249,
      "step": 180
    },
    {
      "epoch": 5.67843137254902,
      "grad_norm": 2.391935110092163,
      "learning_rate": 3.8709294634702376e-05,
      "loss": 0.1819,
      "num_input_tokens_seen": 153097,
      "step": 181
    },
    {
      "epoch": 5.709803921568628,
      "grad_norm": 2.5566658973693848,
      "learning_rate": 3.625760102513099e-05,
      "loss": 0.2642,
      "num_input_tokens_seen": 153970,
      "step": 182
    },
    {
      "epoch": 5.741176470588235,
      "grad_norm": 2.3486950397491455,
      "learning_rate": 3.386881346763483e-05,
      "loss": 0.1399,
      "num_input_tokens_seen": 154821,
      "step": 183
    },
    {
      "epoch": 5.772549019607843,
      "grad_norm": 3.2803895473480225,
      "learning_rate": 3.15452894071312e-05,
      "loss": 0.2416,
      "num_input_tokens_seen": 155654,
      "step": 184
    },
    {
      "epoch": 5.803921568627451,
      "grad_norm": 3.5313189029693604,
      "learning_rate": 2.9289321881345254e-05,
      "loss": 0.1808,
      "num_input_tokens_seen": 156464,
      "step": 185
    },
    {
      "epoch": 5.8352941176470585,
      "grad_norm": 2.8547775745391846,
      "learning_rate": 2.7103137257858868e-05,
      "loss": 0.2565,
      "num_input_tokens_seen": 157360,
      "step": 186
    },
    {
      "epoch": 5.866666666666667,
      "grad_norm": 2.3218977451324463,
      "learning_rate": 2.4988893036954043e-05,
      "loss": 0.1691,
      "num_input_tokens_seen": 158193,
      "step": 187
    },
    {
      "epoch": 5.898039215686275,
      "grad_norm": 2.2196223735809326,
      "learning_rate": 2.2948675722421032e-05,
      "loss": 0.4082,
      "num_input_tokens_seen": 159122,
      "step": 188
    },
    {
      "epoch": 5.929411764705883,
      "grad_norm": 2.5707294940948486,
      "learning_rate": 2.098449876243096e-05,
      "loss": 0.2091,
      "num_input_tokens_seen": 159987,
      "step": 189
    },
    {
      "epoch": 5.96078431372549,
      "grad_norm": 3.849001407623291,
      "learning_rate": 1.909830056250529e-05,
      "loss": 0.258,
      "num_input_tokens_seen": 160876,
      "step": 190
    },
    {
      "epoch": 5.992156862745098,
      "grad_norm": 7.267693042755127,
      "learning_rate": 1.7291942572543807e-05,
      "loss": 0.3389,
      "num_input_tokens_seen": 161713,
      "step": 191
    },
    {
      "epoch": 6.023529411764706,
      "grad_norm": 2.107445240020752,
      "learning_rate": 1.5567207449798495e-05,
      "loss": 0.1413,
      "num_input_tokens_seen": 162530,
      "step": 192
    },
    {
      "epoch": 6.0549019607843135,
      "grad_norm": 1.9133341312408447,
      "learning_rate": 1.3925797299605625e-05,
      "loss": 0.098,
      "num_input_tokens_seen": 163339,
      "step": 193
    },
    {
      "epoch": 6.086274509803921,
      "grad_norm": 2.2540628910064697,
      "learning_rate": 1.2369331995613665e-05,
      "loss": 0.1409,
      "num_input_tokens_seen": 164183,
      "step": 194
    },
    {
      "epoch": 6.117647058823529,
      "grad_norm": 1.5692024230957031,
      "learning_rate": 1.08993475811632e-05,
      "loss": 0.1951,
      "num_input_tokens_seen": 165009,
      "step": 195
    },
    {
      "epoch": 6.149019607843138,
      "grad_norm": 1.885223150253296,
      "learning_rate": 9.517294753398064e-06,
      "loss": 0.1493,
      "num_input_tokens_seen": 165856,
      "step": 196
    },
    {
      "epoch": 6.180392156862745,
      "grad_norm": 1.3335139751434326,
      "learning_rate": 8.224537431601908e-06,
      "loss": 0.054,
      "num_input_tokens_seen": 166686,
      "step": 197
    },
    {
      "epoch": 6.211764705882353,
      "grad_norm": 2.255988359451294,
      "learning_rate": 7.022351411174855e-06,
      "loss": 0.1965,
      "num_input_tokens_seen": 167544,
      "step": 198
    },
    {
      "epoch": 6.243137254901961,
      "grad_norm": 4.721761703491211,
      "learning_rate": 5.911923104577455e-06,
      "loss": 0.1935,
      "num_input_tokens_seen": 168348,
      "step": 199
    },
    {
      "epoch": 6.2745098039215685,
      "grad_norm": 2.2551510334014893,
      "learning_rate": 4.8943483704846585e-06,
      "loss": 0.1393,
      "num_input_tokens_seen": 169231,
      "step": 200
    },
    {
      "epoch": 6.305882352941176,
      "grad_norm": 1.8611195087432861,
      "learning_rate": 3.970631432305694e-06,
      "loss": 0.1288,
      "num_input_tokens_seen": 170060,
      "step": 201
    },
    {
      "epoch": 6.337254901960784,
      "grad_norm": 2.192089796066284,
      "learning_rate": 3.141683887136904e-06,
      "loss": 0.2037,
      "num_input_tokens_seen": 170917,
      "step": 202
    },
    {
      "epoch": 6.368627450980393,
      "grad_norm": 1.2144159078598022,
      "learning_rate": 2.4083238061252678e-06,
      "loss": 0.0774,
      "num_input_tokens_seen": 171774,
      "step": 203
    },
    {
      "epoch": 6.4,
      "grad_norm": 1.9125508069992065,
      "learning_rate": 1.771274927131139e-06,
      "loss": 0.1388,
      "num_input_tokens_seen": 172589,
      "step": 204
    },
    {
      "epoch": 6.431372549019608,
      "grad_norm": 1.6493868827819824,
      "learning_rate": 1.231165940486223e-06,
      "loss": 0.1192,
      "num_input_tokens_seen": 173418,
      "step": 205
    },
    {
      "epoch": 6.462745098039216,
      "grad_norm": 1.91011643409729,
      "learning_rate": 7.885298685522235e-07,
      "loss": 0.0885,
      "num_input_tokens_seen": 174236,
      "step": 206
    },
    {
      "epoch": 6.4941176470588236,
      "grad_norm": 1.8174355030059814,
      "learning_rate": 4.438035396920115e-07,
      "loss": 0.2875,
      "num_input_tokens_seen": 175172,
      "step": 207
    },
    {
      "epoch": 6.525490196078431,
      "grad_norm": 1.7092390060424805,
      "learning_rate": 1.973271571728441e-07,
      "loss": 0.1039,
      "num_input_tokens_seen": 176023,
      "step": 208
    },
    {
      "epoch": 6.556862745098039,
      "grad_norm": 1.8284764289855957,
      "learning_rate": 4.934396342684e-08,
      "loss": 0.1365,
      "num_input_tokens_seen": 176906,
      "step": 209
    },
    {
      "epoch": 6.588235294117647,
      "grad_norm": 1.697880506515503,
      "learning_rate": 0.0002,
      "loss": 0.1889,
      "num_input_tokens_seen": 177795,
      "step": 210
    },
    {
      "epoch": 6.6196078431372545,
      "grad_norm": 1.8329994678497314,
      "learning_rate": 0.00019995065603657314,
      "loss": 0.2207,
      "num_input_tokens_seen": 178672,
      "step": 211
    },
    {
      "epoch": 6.650980392156863,
      "grad_norm": 2.528137683868408,
      "learning_rate": 0.00019980267284282717,
      "loss": 0.1294,
      "num_input_tokens_seen": 179466,
      "step": 212
    },
    {
      "epoch": 6.682352941176471,
      "grad_norm": 5.707077503204346,
      "learning_rate": 0.00019955619646030802,
      "loss": 0.1878,
      "num_input_tokens_seen": 180332,
      "step": 213
    },
    {
      "epoch": 6.713725490196079,
      "grad_norm": 5.38962459564209,
      "learning_rate": 0.0001992114701314478,
      "loss": 0.1539,
      "num_input_tokens_seen": 181169,
      "step": 214
    },
    {
      "epoch": 6.745098039215686,
      "grad_norm": 8.556888580322266,
      "learning_rate": 0.00019876883405951377,
      "loss": 0.3257,
      "num_input_tokens_seen": 182059,
      "step": 215
    },
    {
      "epoch": 6.776470588235294,
      "grad_norm": 5.112634658813477,
      "learning_rate": 0.00019822872507286888,
      "loss": 0.1682,
      "num_input_tokens_seen": 182890,
      "step": 216
    },
    {
      "epoch": 6.807843137254902,
      "grad_norm": 5.386922359466553,
      "learning_rate": 0.00019759167619387476,
      "loss": 0.1355,
      "num_input_tokens_seen": 183734,
      "step": 217
    },
    {
      "epoch": 6.8392156862745095,
      "grad_norm": 9.67380142211914,
      "learning_rate": 0.0001968583161128631,
      "loss": 0.1565,
      "num_input_tokens_seen": 184542,
      "step": 218
    },
    {
      "epoch": 6.870588235294118,
      "grad_norm": 4.442196369171143,
      "learning_rate": 0.00019602936856769434,
      "loss": 0.1822,
      "num_input_tokens_seen": 185420,
      "step": 219
    },
    {
      "epoch": 6.901960784313726,
      "grad_norm": 6.283408164978027,
      "learning_rate": 0.0001951056516295154,
      "loss": 0.2803,
      "num_input_tokens_seen": 186278,
      "step": 220
    },
    {
      "epoch": 6.933333333333334,
      "grad_norm": 2.322974920272827,
      "learning_rate": 0.0001940880768954225,
      "loss": 0.175,
      "num_input_tokens_seen": 187146,
      "step": 221
    },
    {
      "epoch": 6.964705882352941,
      "grad_norm": 4.757282733917236,
      "learning_rate": 0.00019297764858882514,
      "loss": 0.2458,
      "num_input_tokens_seen": 187998,
      "step": 222
    },
    {
      "epoch": 6.996078431372549,
      "grad_norm": 10.94787883758545,
      "learning_rate": 0.00019177546256839812,
      "loss": 0.4605,
      "num_input_tokens_seen": 188820,
      "step": 223
    },
    {
      "epoch": 7.027450980392157,
      "grad_norm": 3.9081947803497314,
      "learning_rate": 0.00019048270524660193,
      "loss": 0.1243,
      "num_input_tokens_seen": 189615,
      "step": 224
    },
    {
      "epoch": 7.0588235294117645,
      "grad_norm": 4.866150856018066,
      "learning_rate": 0.0001891006524188368,
      "loss": 0.2121,
      "num_input_tokens_seen": 190498,
      "step": 225
    },
    {
      "epoch": 7.090196078431372,
      "grad_norm": 2.8364100456237793,
      "learning_rate": 0.00018763066800438636,
      "loss": 0.2427,
      "num_input_tokens_seen": 191385,
      "step": 226
    },
    {
      "epoch": 7.12156862745098,
      "grad_norm": 1.5343717336654663,
      "learning_rate": 0.0001860742027003944,
      "loss": 0.0782,
      "num_input_tokens_seen": 192218,
      "step": 227
    },
    {
      "epoch": 7.152941176470589,
      "grad_norm": 2.824655771255493,
      "learning_rate": 0.0001844327925502015,
      "loss": 0.0852,
      "num_input_tokens_seen": 193056,
      "step": 228
    },
    {
      "epoch": 7.184313725490196,
      "grad_norm": 6.053825378417969,
      "learning_rate": 0.0001827080574274562,
      "loss": 0.2932,
      "num_input_tokens_seen": 193874,
      "step": 229
    },
    {
      "epoch": 7.215686274509804,
      "grad_norm": 7.972029685974121,
      "learning_rate": 0.0001809016994374948,
      "loss": 0.1712,
      "num_input_tokens_seen": 194758,
      "step": 230
    },
    {
      "epoch": 7.247058823529412,
      "grad_norm": 3.326019287109375,
      "learning_rate": 0.00017901550123756906,
      "loss": 0.1383,
      "num_input_tokens_seen": 195658,
      "step": 231
    },
    {
      "epoch": 7.2784313725490195,
      "grad_norm": 2.4085988998413086,
      "learning_rate": 0.000177051324277579,
      "loss": 0.0777,
      "num_input_tokens_seen": 196482,
      "step": 232
    },
    {
      "epoch": 7.309803921568627,
      "grad_norm": 4.651546001434326,
      "learning_rate": 0.00017501110696304596,
      "loss": 0.1553,
      "num_input_tokens_seen": 197329,
      "step": 233
    },
    {
      "epoch": 7.341176470588235,
      "grad_norm": 2.4471681118011475,
      "learning_rate": 0.00017289686274214112,
      "loss": 0.1341,
      "num_input_tokens_seen": 198158,
      "step": 234
    },
    {
      "epoch": 7.372549019607844,
      "grad_norm": 4.855953216552734,
      "learning_rate": 0.00017071067811865476,
      "loss": 0.081,
      "num_input_tokens_seen": 198961,
      "step": 235
    },
    {
      "epoch": 7.403921568627451,
      "grad_norm": 6.432670593261719,
      "learning_rate": 0.00016845471059286893,
      "loss": 0.0946,
      "num_input_tokens_seen": 199788,
      "step": 236
    },
    {
      "epoch": 7.435294117647059,
      "grad_norm": 4.775556564331055,
      "learning_rate": 0.00016613118653236518,
      "loss": 0.1548,
      "num_input_tokens_seen": 200637,
      "step": 237
    },
    {
      "epoch": 7.466666666666667,
      "grad_norm": 4.532850742340088,
      "learning_rate": 0.0001637423989748689,
      "loss": 0.0952,
      "num_input_tokens_seen": 201482,
      "step": 238
    },
    {
      "epoch": 7.4980392156862745,
      "grad_norm": 7.549116134643555,
      "learning_rate": 0.00016129070536529766,
      "loss": 0.2452,
      "num_input_tokens_seen": 202333,
      "step": 239
    },
    {
      "epoch": 7.529411764705882,
      "grad_norm": 3.5515332221984863,
      "learning_rate": 0.00015877852522924726,
      "loss": 0.1033,
      "num_input_tokens_seen": 203151,
      "step": 240
    },
    {
      "epoch": 7.56078431372549,
      "grad_norm": 4.602539539337158,
      "learning_rate": 0.00015620833778521304,
      "loss": 0.1349,
      "num_input_tokens_seen": 204022,
      "step": 241
    },
    {
      "epoch": 7.592156862745098,
      "grad_norm": 2.6608591079711914,
      "learning_rate": 0.00015358267949789971,
      "loss": 0.1057,
      "num_input_tokens_seen": 204856,
      "step": 242
    },
    {
      "epoch": 7.623529411764705,
      "grad_norm": 5.953851699829102,
      "learning_rate": 0.00015090414157503714,
      "loss": 0.168,
      "num_input_tokens_seen": 205751,
      "step": 243
    },
    {
      "epoch": 7.654901960784314,
      "grad_norm": 2.4022793769836426,
      "learning_rate": 0.00014817536741017158,
      "loss": 0.1534,
      "num_input_tokens_seen": 206614,
      "step": 244
    },
    {
      "epoch": 7.686274509803922,
      "grad_norm": 5.277859210968018,
      "learning_rate": 0.00014539904997395468,
      "loss": 0.1876,
      "num_input_tokens_seen": 207442,
      "step": 245
    },
    {
      "epoch": 7.7176470588235295,
      "grad_norm": 3.6555447578430176,
      "learning_rate": 0.0001425779291565073,
      "loss": 0.1528,
      "num_input_tokens_seen": 208268,
      "step": 246
    },
    {
      "epoch": 7.749019607843137,
      "grad_norm": 13.82836627960205,
      "learning_rate": 0.00013971478906347803,
      "loss": 0.2893,
      "num_input_tokens_seen": 209161,
      "step": 247
    },
    {
      "epoch": 7.780392156862745,
      "grad_norm": 4.631089687347412,
      "learning_rate": 0.00013681245526846785,
      "loss": 0.1996,
      "num_input_tokens_seen": 209987,
      "step": 248
    },
    {
      "epoch": 7.811764705882353,
      "grad_norm": 5.084930419921875,
      "learning_rate": 0.00013387379202452922,
      "loss": 0.2305,
      "num_input_tokens_seen": 210827,
      "step": 249
    },
    {
      "epoch": 7.8431372549019605,
      "grad_norm": 4.121197700500488,
      "learning_rate": 0.00013090169943749463,
      "loss": 0.3338,
      "num_input_tokens_seen": 211698,
      "step": 250
    },
    {
      "epoch": 7.874509803921569,
      "grad_norm": 3.6635348796844482,
      "learning_rate": 0.0001278991106039229,
      "loss": 0.2121,
      "num_input_tokens_seen": 212549,
      "step": 251
    },
    {
      "epoch": 7.905882352941177,
      "grad_norm": 2.271726369857788,
      "learning_rate": 0.0001248689887164855,
      "loss": 0.0888,
      "num_input_tokens_seen": 213388,
      "step": 252
    },
    {
      "epoch": 7.9372549019607845,
      "grad_norm": 2.883363962173462,
      "learning_rate": 0.00012181432413965421,
      "loss": 0.2263,
      "num_input_tokens_seen": 214226,
      "step": 253
    },
    {
      "epoch": 7.968627450980392,
      "grad_norm": 5.223812580108643,
      "learning_rate": 0.00011873813145857249,
      "loss": 0.1085,
      "num_input_tokens_seen": 215089,
      "step": 254
    },
    {
      "epoch": 8.0,
      "grad_norm": 3.1294803619384766,
      "learning_rate": 0.00011564344650402304,
      "loss": 0.2069,
      "num_input_tokens_seen": 215904,
      "step": 255
    },
    {
      "epoch": 8.031372549019608,
      "grad_norm": 3.5736420154571533,
      "learning_rate": 0.00011253332335643046,
      "loss": 0.0774,
      "num_input_tokens_seen": 216721,
      "step": 256
    },
    {
      "epoch": 8.062745098039215,
      "grad_norm": 1.6127935647964478,
      "learning_rate": 0.00010941083133185139,
      "loss": 0.0487,
      "num_input_tokens_seen": 217568,
      "step": 257
    },
    {
      "epoch": 8.094117647058823,
      "grad_norm": 1.9943934679031372,
      "learning_rate": 0.00010627905195293135,
      "loss": 0.049,
      "num_input_tokens_seen": 218393,
      "step": 258
    },
    {
      "epoch": 8.125490196078431,
      "grad_norm": 5.851587772369385,
      "learning_rate": 0.00010314107590781291,
      "loss": 0.121,
      "num_input_tokens_seen": 219230,
      "step": 259
    },
    {
      "epoch": 8.156862745098039,
      "grad_norm": 2.693629026412964,
      "learning_rate": 0.0001,
      "loss": 0.0664,
      "num_input_tokens_seen": 220048,
      "step": 260
    },
    {
      "epoch": 8.188235294117646,
      "grad_norm": 6.576751708984375,
      "learning_rate": 9.685892409218724e-05,
      "loss": 0.0818,
      "num_input_tokens_seen": 220906,
      "step": 261
    },
    {
      "epoch": 8.219607843137254,
      "grad_norm": 1.8238303661346436,
      "learning_rate": 9.372094804706867e-05,
      "loss": 0.0977,
      "num_input_tokens_seen": 221795,
      "step": 262
    },
    {
      "epoch": 8.250980392156862,
      "grad_norm": 2.1661157608032227,
      "learning_rate": 9.058916866814851e-05,
      "loss": 0.0617,
      "num_input_tokens_seen": 222644,
      "step": 263
    },
    {
      "epoch": 8.282352941176471,
      "grad_norm": 3.1537587642669678,
      "learning_rate": 8.746667664356956e-05,
      "loss": 0.1102,
      "num_input_tokens_seen": 223465,
      "step": 264
    },
    {
      "epoch": 8.313725490196079,
      "grad_norm": 12.85554027557373,
      "learning_rate": 8.435655349597699e-05,
      "loss": 0.1478,
      "num_input_tokens_seen": 224302,
      "step": 265
    },
    {
      "epoch": 8.345098039215687,
      "grad_norm": 4.586164474487305,
      "learning_rate": 8.126186854142752e-05,
      "loss": 0.2202,
      "num_input_tokens_seen": 225154,
      "step": 266
    },
    {
      "epoch": 8.376470588235295,
      "grad_norm": 3.9367291927337646,
      "learning_rate": 7.818567586034566e-05,
      "loss": 0.0682,
      "num_input_tokens_seen": 226040,
      "step": 267
    },
    {
      "epoch": 8.407843137254902,
      "grad_norm": 2.0368969440460205,
      "learning_rate": 7.513101128351452e-05,
      "loss": 0.0402,
      "num_input_tokens_seen": 226847,
      "step": 268
    },
    {
      "epoch": 8.43921568627451,
      "grad_norm": 1.0436691045761108,
      "learning_rate": 7.210088939607714e-05,
      "loss": 0.0185,
      "num_input_tokens_seen": 227680,
      "step": 269
    },
    {
      "epoch": 8.470588235294118,
      "grad_norm": 1.8092098236083984,
      "learning_rate": 6.909830056250524e-05,
      "loss": 0.0442,
      "num_input_tokens_seen": 228543,
      "step": 270
    },
    {
      "epoch": 8.501960784313725,
      "grad_norm": 1.526930332183838,
      "learning_rate": 6.612620797547091e-05,
      "loss": 0.0362,
      "num_input_tokens_seen": 229373,
      "step": 271
    },
    {
      "epoch": 8.533333333333333,
      "grad_norm": 1.8258460760116577,
      "learning_rate": 6.318754473153218e-05,
      "loss": 0.0457,
      "num_input_tokens_seen": 230190,
      "step": 272
    },
    {
      "epoch": 8.564705882352941,
      "grad_norm": 1.3038586378097534,
      "learning_rate": 6.0285210936521976e-05,
      "loss": 0.0831,
      "num_input_tokens_seen": 231024,
      "step": 273
    },
    {
      "epoch": 8.596078431372549,
      "grad_norm": 5.259337902069092,
      "learning_rate": 5.7422070843492734e-05,
      "loss": 0.1228,
      "num_input_tokens_seen": 231915,
      "step": 274
    },
    {
      "epoch": 8.627450980392156,
      "grad_norm": 0.8257218599319458,
      "learning_rate": 5.460095002604537e-05,
      "loss": 0.0171,
      "num_input_tokens_seen": 232711,
      "step": 275
    },
    {
      "epoch": 8.658823529411764,
      "grad_norm": 1.649411916732788,
      "learning_rate": 5.182463258982846e-05,
      "loss": 0.0366,
      "num_input_tokens_seen": 233534,
      "step": 276
    },
    {
      "epoch": 8.690196078431372,
      "grad_norm": 4.529787540435791,
      "learning_rate": 4.909585842496292e-05,
      "loss": 0.1006,
      "num_input_tokens_seen": 234406,
      "step": 277
    },
    {
      "epoch": 8.72156862745098,
      "grad_norm": 3.721914052963257,
      "learning_rate": 4.6417320502100425e-05,
      "loss": 0.1088,
      "num_input_tokens_seen": 235270,
      "step": 278
    },
    {
      "epoch": 8.75294117647059,
      "grad_norm": 8.44381046295166,
      "learning_rate": 4.379166221478697e-05,
      "loss": 0.1685,
      "num_input_tokens_seen": 236150,
      "step": 279
    },
    {
      "epoch": 8.784313725490197,
      "grad_norm": 3.0152742862701416,
      "learning_rate": 4.122147477075266e-05,
      "loss": 0.0961,
      "num_input_tokens_seen": 237030,
      "step": 280
    },
    {
      "epoch": 8.815686274509805,
      "grad_norm": 2.386852502822876,
      "learning_rate": 3.8709294634702376e-05,
      "loss": 0.112,
      "num_input_tokens_seen": 237889,
      "step": 281
    },
    {
      "epoch": 8.847058823529412,
      "grad_norm": 3.819979667663574,
      "learning_rate": 3.6257601025131094e-05,
      "loss": 0.0342,
      "num_input_tokens_seen": 238703,
      "step": 282
    },
    {
      "epoch": 8.87843137254902,
      "grad_norm": 2.879319190979004,
      "learning_rate": 3.386881346763483e-05,
      "loss": 0.1328,
      "num_input_tokens_seen": 239619,
      "step": 283
    },
    {
      "epoch": 8.909803921568628,
      "grad_norm": 4.421603679656982,
      "learning_rate": 3.1545289407131097e-05,
      "loss": 0.141,
      "num_input_tokens_seen": 240466,
      "step": 284
    },
    {
      "epoch": 8.941176470588236,
      "grad_norm": 1.440587043762207,
      "learning_rate": 2.9289321881345254e-05,
      "loss": 0.0686,
      "num_input_tokens_seen": 241295,
      "step": 285
    },
    {
      "epoch": 8.972549019607843,
      "grad_norm": 1.706859827041626,
      "learning_rate": 2.71031372578588e-05,
      "loss": 0.0272,
      "num_input_tokens_seen": 242157,
      "step": 286
    },
    {
      "epoch": 9.003921568627451,
      "grad_norm": 1.8288636207580566,
      "learning_rate": 2.4988893036954043e-05,
      "loss": 0.0571,
      "num_input_tokens_seen": 243008,
      "step": 287
    },
    {
      "epoch": 9.035294117647059,
      "grad_norm": 0.7292274236679077,
      "learning_rate": 2.294867572242112e-05,
      "loss": 0.0109,
      "num_input_tokens_seen": 243807,
      "step": 288
    },
    {
      "epoch": 9.066666666666666,
      "grad_norm": 1.1452817916870117,
      "learning_rate": 2.098449876243096e-05,
      "loss": 0.0239,
      "num_input_tokens_seen": 244652,
      "step": 289
    },
    {
      "epoch": 9.098039215686274,
      "grad_norm": 1.0900837182998657,
      "learning_rate": 1.909830056250529e-05,
      "loss": 0.0197,
      "num_input_tokens_seen": 245514,
      "step": 290
    },
    {
      "epoch": 9.129411764705882,
      "grad_norm": 1.1233842372894287,
      "learning_rate": 1.7291942572543807e-05,
      "loss": 0.059,
      "num_input_tokens_seen": 246368,
      "step": 291
    },
    {
      "epoch": 9.16078431372549,
      "grad_norm": 1.2153921127319336,
      "learning_rate": 1.5567207449798515e-05,
      "loss": 0.0487,
      "num_input_tokens_seen": 247234,
      "step": 292
    },
    {
      "epoch": 9.192156862745097,
      "grad_norm": 1.7913979291915894,
      "learning_rate": 1.3925797299605625e-05,
      "loss": 0.0462,
      "num_input_tokens_seen": 248080,
      "step": 293
    },
    {
      "epoch": 9.223529411764705,
      "grad_norm": 0.9466875195503235,
      "learning_rate": 1.2369331995613665e-05,
      "loss": 0.0196,
      "num_input_tokens_seen": 248920,
      "step": 294
    },
    {
      "epoch": 9.254901960784313,
      "grad_norm": 2.3814680576324463,
      "learning_rate": 1.0899347581163277e-05,
      "loss": 0.021,
      "num_input_tokens_seen": 249739,
      "step": 295
    },
    {
      "epoch": 9.286274509803922,
      "grad_norm": 1.2178651094436646,
      "learning_rate": 9.517294753397998e-06,
      "loss": 0.0089,
      "num_input_tokens_seen": 250537,
      "step": 296
    },
    {
      "epoch": 9.31764705882353,
      "grad_norm": 3.2499518394470215,
      "learning_rate": 8.224537431601864e-06,
      "loss": 0.0794,
      "num_input_tokens_seen": 251375,
      "step": 297
    },
    {
      "epoch": 9.349019607843138,
      "grad_norm": 2.116999626159668,
      "learning_rate": 7.022351411174888e-06,
      "loss": 0.0815,
      "num_input_tokens_seen": 252200,
      "step": 298
    },
    {
      "epoch": 9.380392156862746,
      "grad_norm": 1.607701301574707,
      "learning_rate": 5.911923104577455e-06,
      "loss": 0.0686,
      "num_input_tokens_seen": 253122,
      "step": 299
    },
    {
      "epoch": 9.411764705882353,
      "grad_norm": 1.501824975013733,
      "learning_rate": 4.8943483704846585e-06,
      "loss": 0.0453,
      "num_input_tokens_seen": 253966,
      "step": 300
    },
    {
      "epoch": 9.443137254901961,
      "grad_norm": 0.8923909664154053,
      "learning_rate": 3.970631432305694e-06,
      "loss": 0.0136,
      "num_input_tokens_seen": 254841,
      "step": 301
    },
    {
      "epoch": 9.474509803921569,
      "grad_norm": 1.3790185451507568,
      "learning_rate": 3.141683887136904e-06,
      "loss": 0.0812,
      "num_input_tokens_seen": 255706,
      "step": 302
    },
    {
      "epoch": 9.505882352941176,
      "grad_norm": 0.6012554168701172,
      "learning_rate": 2.4083238061252567e-06,
      "loss": 0.012,
      "num_input_tokens_seen": 256525,
      "step": 303
    },
    {
      "epoch": 9.537254901960784,
      "grad_norm": 0.9253413677215576,
      "learning_rate": 1.771274927131139e-06,
      "loss": 0.0088,
      "num_input_tokens_seen": 257340,
      "step": 304
    },
    {
      "epoch": 9.568627450980392,
      "grad_norm": 0.8364208936691284,
      "learning_rate": 1.2311659404862453e-06,
      "loss": 0.0128,
      "num_input_tokens_seen": 258165,
      "step": 305
    },
    {
      "epoch": 9.6,
      "grad_norm": 1.1087877750396729,
      "learning_rate": 7.885298685522235e-07,
      "loss": 0.0406,
      "num_input_tokens_seen": 259015,
      "step": 306
    },
    {
      "epoch": 9.631372549019607,
      "grad_norm": 1.227319359779358,
      "learning_rate": 4.438035396920115e-07,
      "loss": 0.0136,
      "num_input_tokens_seen": 259833,
      "step": 307
    },
    {
      "epoch": 9.662745098039215,
      "grad_norm": 1.0235058069229126,
      "learning_rate": 1.973271571728441e-07,
      "loss": 0.037,
      "num_input_tokens_seen": 260686,
      "step": 308
    },
    {
      "epoch": 9.694117647058823,
      "grad_norm": 1.069370150566101,
      "learning_rate": 4.934396342684e-08,
      "loss": 0.0366,
      "num_input_tokens_seen": 261513,
      "step": 309
    },
    {
      "epoch": 9.72549019607843,
      "grad_norm": 1.2721508741378784,
      "learning_rate": 0.0,
      "loss": 0.0282,
      "num_input_tokens_seen": 262364,
      "step": 310
    }
  ],
  "logging_steps": 1,
  "max_steps": 310,
  "num_input_tokens_seen": 262364,
  "num_train_epochs": 10,
  "save_steps": 31,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.5870993062052984e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}