|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 816,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0012254901960784314,
      "grad_norm": 1.603988652495189,
      "learning_rate": 1.2195121951219513e-05,
      "loss": 1.3541,
      "step": 1
    },
    {
      "epoch": 0.006127450980392157,
      "grad_norm": 1.4587743423854005,
      "learning_rate": 6.097560975609756e-05,
      "loss": 1.355,
      "step": 5
    },
    {
      "epoch": 0.012254901960784314,
      "grad_norm": 1.538685925430638,
      "learning_rate": 0.00012195121951219512,
      "loss": 1.3083,
      "step": 10
    },
    {
      "epoch": 0.01838235294117647,
      "grad_norm": 0.5820643527511544,
      "learning_rate": 0.00018292682926829268,
      "loss": 1.2226,
      "step": 15
    },
    {
      "epoch": 0.024509803921568627,
      "grad_norm": 0.37609725156071605,
      "learning_rate": 0.00024390243902439024,
      "loss": 1.1399,
      "step": 20
    },
    {
      "epoch": 0.030637254901960783,
      "grad_norm": 0.2477379120591674,
      "learning_rate": 0.0003048780487804878,
      "loss": 1.0942,
      "step": 25
    },
    {
      "epoch": 0.03676470588235294,
      "grad_norm": 0.18934222364015724,
      "learning_rate": 0.00036585365853658537,
      "loss": 1.0785,
      "step": 30
    },
    {
      "epoch": 0.0428921568627451,
      "grad_norm": 0.14574615338739755,
      "learning_rate": 0.0004268292682926829,
      "loss": 1.0549,
      "step": 35
    },
    {
      "epoch": 0.049019607843137254,
      "grad_norm": 0.12815166481708085,
      "learning_rate": 0.0004878048780487805,
      "loss": 1.0493,
      "step": 40
    },
    {
      "epoch": 0.05514705882352941,
      "grad_norm": 0.15010519509218812,
      "learning_rate": 0.0005487804878048781,
      "loss": 1.0306,
      "step": 45
    },
    {
      "epoch": 0.061274509803921566,
      "grad_norm": 0.13010925959434533,
      "learning_rate": 0.0006097560975609756,
      "loss": 1.0204,
      "step": 50
    },
    {
      "epoch": 0.06740196078431372,
      "grad_norm": 0.11891525726508857,
      "learning_rate": 0.0006707317073170732,
      "loss": 1.0281,
      "step": 55
    },
    {
      "epoch": 0.07352941176470588,
      "grad_norm": 0.12117005404429922,
      "learning_rate": 0.0007317073170731707,
      "loss": 1.0187,
      "step": 60
    },
    {
      "epoch": 0.07965686274509803,
      "grad_norm": 0.11923924460163615,
      "learning_rate": 0.0007926829268292683,
      "loss": 1.0019,
      "step": 65
    },
    {
      "epoch": 0.0857843137254902,
      "grad_norm": 0.13523477315023974,
      "learning_rate": 0.0008536585365853659,
      "loss": 1.0044,
      "step": 70
    },
    {
      "epoch": 0.09191176470588236,
      "grad_norm": 0.11307823129618054,
      "learning_rate": 0.0009146341463414635,
      "loss": 1.0071,
      "step": 75
    },
    {
      "epoch": 0.09803921568627451,
      "grad_norm": 0.1206648748330027,
      "learning_rate": 0.000975609756097561,
      "loss": 0.9962,
      "step": 80
    },
    {
      "epoch": 0.10416666666666667,
      "grad_norm": 0.15935283992889565,
      "learning_rate": 0.000999958782259877,
      "loss": 0.9998,
      "step": 85
    },
    {
      "epoch": 0.11029411764705882,
      "grad_norm": 0.13630538643217202,
      "learning_rate": 0.0009997069206794246,
      "loss": 1.0101,
      "step": 90
    },
    {
      "epoch": 0.11642156862745098,
      "grad_norm": 0.13281831595913912,
      "learning_rate": 0.0009992262114666653,
      "loss": 0.9904,
      "step": 95
    },
    {
      "epoch": 0.12254901960784313,
      "grad_norm": 0.13570583268291556,
      "learning_rate": 0.0009985168747689707,
      "loss": 0.986,
      "step": 100
    },
    {
      "epoch": 0.12867647058823528,
      "grad_norm": 0.14302939089927838,
      "learning_rate": 0.0009975792354368017,
      "loss": 0.9934,
      "step": 105
    },
    {
      "epoch": 0.13480392156862744,
      "grad_norm": 0.14349254312543258,
      "learning_rate": 0.0009964137228749407,
      "loss": 0.9961,
      "step": 110
    },
    {
      "epoch": 0.1409313725490196,
      "grad_norm": 0.13092834458351188,
      "learning_rate": 0.000995020870845837,
      "loss": 0.9949,
      "step": 115
    },
    {
      "epoch": 0.14705882352941177,
      "grad_norm": 0.1251118799739706,
      "learning_rate": 0.0009934013172251653,
      "loss": 0.9824,
      "step": 120
    },
    {
      "epoch": 0.15318627450980393,
      "grad_norm": 0.1323892618605143,
      "learning_rate": 0.0009915558037097002,
      "loss": 0.977,
      "step": 125
    },
    {
      "epoch": 0.15931372549019607,
      "grad_norm": 0.13068519381648078,
      "learning_rate": 0.0009894851754776472,
      "loss": 0.9712,
      "step": 130
    },
    {
      "epoch": 0.16544117647058823,
      "grad_norm": 0.1310198992984819,
      "learning_rate": 0.0009871903808015812,
      "loss": 0.9807,
      "step": 135
    },
    {
      "epoch": 0.1715686274509804,
      "grad_norm": 0.10811315309592277,
      "learning_rate": 0.0009846724706141716,
      "loss": 0.977,
      "step": 140
    },
    {
      "epoch": 0.17769607843137256,
      "grad_norm": 0.11603574555194691,
      "learning_rate": 0.0009819325980268945,
      "loss": 0.9743,
      "step": 145
    },
    {
      "epoch": 0.18382352941176472,
      "grad_norm": 0.11664960595520962,
      "learning_rate": 0.0009789720178019483,
      "loss": 0.9742,
      "step": 150
    },
    {
      "epoch": 0.18995098039215685,
      "grad_norm": 0.11920852297334043,
      "learning_rate": 0.0009757920857776188,
      "loss": 0.9635,
      "step": 155
    },
    {
      "epoch": 0.19607843137254902,
      "grad_norm": 0.13745202686899544,
      "learning_rate": 0.0009723942582473544,
      "loss": 0.9544,
      "step": 160
    },
    {
      "epoch": 0.20220588235294118,
      "grad_norm": 0.15444220703514816,
      "learning_rate": 0.0009687800912928362,
      "loss": 0.9697,
      "step": 165
    },
    {
      "epoch": 0.20833333333333334,
      "grad_norm": 0.11511217870343073,
      "learning_rate": 0.0009649512400713498,
      "loss": 0.963,
      "step": 170
    },
    {
      "epoch": 0.21446078431372548,
      "grad_norm": 0.163845433820889,
      "learning_rate": 0.0009609094580577824,
      "loss": 0.9601,
      "step": 175
    },
    {
      "epoch": 0.22058823529411764,
      "grad_norm": 0.12370218334013189,
      "learning_rate": 0.0009566565962415959,
      "loss": 0.9578,
      "step": 180
    },
    {
      "epoch": 0.2267156862745098,
      "grad_norm": 0.138057520129555,
      "learning_rate": 0.0009521946022791401,
      "loss": 0.9555,
      "step": 185
    },
    {
      "epoch": 0.23284313725490197,
      "grad_norm": 0.161151229045878,
      "learning_rate": 0.0009475255196016972,
      "loss": 0.9579,
      "step": 190
    },
    {
      "epoch": 0.23897058823529413,
      "grad_norm": 0.13900328482304902,
      "learning_rate": 0.0009426514864796647,
      "loss": 0.9494,
      "step": 195
    },
    {
      "epoch": 0.24509803921568626,
      "grad_norm": 0.14057545846182565,
      "learning_rate": 0.0009375747350433044,
      "loss": 0.9478,
      "step": 200
    },
    {
      "epoch": 0.2512254901960784,
      "grad_norm": 0.15616371521107208,
      "learning_rate": 0.0009322975902605082,
      "loss": 0.9654,
      "step": 205
    },
    {
      "epoch": 0.25735294117647056,
      "grad_norm": 0.11827026404580182,
      "learning_rate": 0.0009268224688720474,
      "loss": 0.9445,
      "step": 210
    },
    {
      "epoch": 0.26348039215686275,
      "grad_norm": 0.11103911913637518,
      "learning_rate": 0.0009211518782847931,
      "loss": 0.9424,
      "step": 215
    },
    {
      "epoch": 0.2696078431372549,
      "grad_norm": 0.11604427070566481,
      "learning_rate": 0.0009152884154234145,
      "loss": 0.9451,
      "step": 220
    },
    {
      "epoch": 0.2757352941176471,
      "grad_norm": 0.1099562215414043,
      "learning_rate": 0.0009092347655410818,
      "loss": 0.9402,
      "step": 225
    },
    {
      "epoch": 0.2818627450980392,
      "grad_norm": 0.14837520991789005,
      "learning_rate": 0.0009029937009897176,
      "loss": 0.9349,
      "step": 230
    },
    {
      "epoch": 0.28799019607843135,
      "grad_norm": 0.10928552841333679,
      "learning_rate": 0.0008965680799503608,
      "loss": 0.9329,
      "step": 235
    },
    {
      "epoch": 0.29411764705882354,
      "grad_norm": 0.11407153214331639,
      "learning_rate": 0.0008899608451242233,
      "loss": 0.9379,
      "step": 240
    },
    {
      "epoch": 0.3002450980392157,
      "grad_norm": 0.11231916470556697,
      "learning_rate": 0.0008831750223850389,
      "loss": 0.9229,
      "step": 245
    },
    {
      "epoch": 0.30637254901960786,
      "grad_norm": 0.11185052745256109,
      "learning_rate": 0.0008762137193933241,
      "loss": 0.9296,
      "step": 250
    },
    {
      "epoch": 0.3125,
      "grad_norm": 0.11855657350077958,
      "learning_rate": 0.0008690801241731818,
      "loss": 0.9207,
      "step": 255
    },
    {
      "epoch": 0.31862745098039214,
      "grad_norm": 1.8537407128611012,
      "learning_rate": 0.0008617775036523015,
      "loss": 0.9387,
      "step": 260
    },
    {
      "epoch": 0.3247549019607843,
      "grad_norm": 0.11676606107692747,
      "learning_rate": 0.0008543092021658259,
      "loss": 0.9367,
      "step": 265
    },
    {
      "epoch": 0.33088235294117646,
      "grad_norm": 0.10492580984162286,
      "learning_rate": 0.0008466786399247663,
      "loss": 0.928,
      "step": 270
    },
    {
      "epoch": 0.33700980392156865,
      "grad_norm": 0.10281631398110604,
      "learning_rate": 0.0008388893114496705,
      "loss": 0.935,
      "step": 275
    },
    {
      "epoch": 0.3431372549019608,
      "grad_norm": 0.11217225067437296,
      "learning_rate": 0.0008309447839702582,
      "loss": 0.9298,
      "step": 280
    },
    {
      "epoch": 0.3492647058823529,
      "grad_norm": 0.11327220268180357,
      "learning_rate": 0.0008228486957917607,
      "loss": 0.9219,
      "step": 285
    },
    {
      "epoch": 0.3553921568627451,
      "grad_norm": 0.11554152008646122,
      "learning_rate": 0.0008146047546287076,
      "loss": 0.934,
      "step": 290
    },
    {
      "epoch": 0.36151960784313725,
      "grad_norm": 0.13610027478132888,
      "learning_rate": 0.0008062167359069301,
      "loss": 0.9276,
      "step": 295
    },
    {
      "epoch": 0.36764705882352944,
      "grad_norm": 0.12248610966496465,
      "learning_rate": 0.000797688481034551,
      "loss": 0.9175,
      "step": 300
    },
    {
      "epoch": 0.3737745098039216,
      "grad_norm": 0.10512495641494239,
      "learning_rate": 0.00078902389564276,
      "loss": 0.9239,
      "step": 305
    },
    {
      "epoch": 0.3799019607843137,
      "grad_norm": 0.12079056888085157,
      "learning_rate": 0.0007802269477971771,
      "loss": 0.9167,
      "step": 310
    },
    {
      "epoch": 0.3860294117647059,
      "grad_norm": 0.1311550506036977,
      "learning_rate": 0.0007713016661806211,
      "loss": 0.9165,
      "step": 315
    },
    {
      "epoch": 0.39215686274509803,
      "grad_norm": 0.12748855363301959,
      "learning_rate": 0.0007622521382481208,
      "loss": 0.9099,
      "step": 320
    },
    {
      "epoch": 0.39828431372549017,
      "grad_norm": 0.11389138878908127,
      "learning_rate": 0.0007530825083550073,
      "loss": 0.9034,
      "step": 325
    },
    {
      "epoch": 0.40441176470588236,
      "grad_norm": 0.10172199627242663,
      "learning_rate": 0.0007437969758589507,
      "loss": 0.9147,
      "step": 330
    },
    {
      "epoch": 0.4105392156862745,
      "grad_norm": 0.1136698134249708,
      "learning_rate": 0.0007343997931968067,
      "loss": 0.9076,
      "step": 335
    },
    {
      "epoch": 0.4166666666666667,
      "grad_norm": 0.1110896296260987,
      "learning_rate": 0.0007248952639371542,
      "loss": 0.9075,
      "step": 340
    },
    {
      "epoch": 0.4227941176470588,
      "grad_norm": 0.10357314484765201,
      "learning_rate": 0.0007152877408094178,
      "loss": 0.8998,
      "step": 345
    },
    {
      "epoch": 0.42892156862745096,
      "grad_norm": 0.11773981651015025,
      "learning_rate": 0.0007055816237104753,
      "loss": 0.9094,
      "step": 350
    },
    {
      "epoch": 0.43504901960784315,
      "grad_norm": 0.1283630128752841,
      "learning_rate": 0.0006957813576896647,
      "loss": 0.899,
      "step": 355
    },
    {
      "epoch": 0.4411764705882353,
      "grad_norm": 0.1326640375854421,
      "learning_rate": 0.000685891430913113,
      "loss": 0.9091,
      "step": 360
    },
    {
      "epoch": 0.44730392156862747,
      "grad_norm": 0.12057333477888295,
      "learning_rate": 0.0006759163726083191,
      "loss": 0.9005,
      "step": 365
    },
    {
      "epoch": 0.4534313725490196,
      "grad_norm": 0.10157867473834796,
      "learning_rate": 0.0006658607509899319,
      "loss": 0.8995,
      "step": 370
    },
    {
      "epoch": 0.45955882352941174,
      "grad_norm": 0.13679116304924,
      "learning_rate": 0.0006557291711676738,
      "loss": 0.9064,
      "step": 375
    },
    {
      "epoch": 0.46568627450980393,
      "grad_norm": 0.10228308226469025,
      "learning_rate": 0.0006455262730373672,
      "loss": 0.8902,
      "step": 380
    },
    {
      "epoch": 0.47181372549019607,
      "grad_norm": 0.11810749832493427,
      "learning_rate": 0.0006352567291560318,
      "loss": 0.8947,
      "step": 385
    },
    {
      "epoch": 0.47794117647058826,
      "grad_norm": 0.11253919001414733,
      "learning_rate": 0.0006249252426020216,
      "loss": 0.8984,
      "step": 390
    },
    {
      "epoch": 0.4840686274509804,
      "grad_norm": 0.10889918340035115,
      "learning_rate": 0.0006145365448211866,
      "loss": 0.9001,
      "step": 395
    },
    {
      "epoch": 0.49019607843137253,
      "grad_norm": 0.10602494662106901,
      "learning_rate": 0.0006040953934600423,
      "loss": 0.8924,
      "step": 400
    },
    {
      "epoch": 0.4963235294117647,
      "grad_norm": 0.09537450461248778,
      "learning_rate": 0.0005936065701869403,
      "loss": 0.8971,
      "step": 405
    },
    {
      "epoch": 0.5024509803921569,
      "grad_norm": 0.1135732875240647,
      "learning_rate": 0.0005830748785022368,
      "loss": 0.8956,
      "step": 410
    },
    {
      "epoch": 0.508578431372549,
      "grad_norm": 0.11824825784313651,
      "learning_rate": 0.0005725051415384657,
      "loss": 0.9014,
      "step": 415
    },
    {
      "epoch": 0.5147058823529411,
      "grad_norm": 2.3957029087137602,
      "learning_rate": 0.0005619021998515165,
      "loss": 0.8937,
      "step": 420
    },
    {
      "epoch": 0.5208333333333334,
      "grad_norm": 0.1305239745293032,
      "learning_rate": 0.000551270909203838,
      "loss": 0.889,
      "step": 425
    },
    {
      "epoch": 0.5269607843137255,
      "grad_norm": 0.10923687170047386,
      "learning_rate": 0.0005406161383406731,
      "loss": 0.9009,
      "step": 430
    },
    {
      "epoch": 0.5330882352941176,
      "grad_norm": 0.11720531307848668,
      "learning_rate": 0.0005299427667603515,
      "loss": 0.9035,
      "step": 435
    },
    {
      "epoch": 0.5392156862745098,
      "grad_norm": 0.1043777454103823,
      "learning_rate": 0.0005192556824796568,
      "loss": 0.887,
      "step": 440
    },
    {
      "epoch": 0.5453431372549019,
      "grad_norm": 0.12019301588246883,
      "learning_rate": 0.0005085597797952905,
      "loss": 0.8852,
      "step": 445
    },
    {
      "epoch": 0.5514705882352942,
      "grad_norm": 0.09829925409523375,
      "learning_rate": 0.0004978599570424639,
      "loss": 0.8841,
      "step": 450
    },
    {
      "epoch": 0.5575980392156863,
      "grad_norm": 0.110813034496191,
      "learning_rate": 0.0004871611143516367,
      "loss": 0.8888,
      "step": 455
    },
    {
      "epoch": 0.5637254901960784,
      "grad_norm": 0.14013694091933743,
      "learning_rate": 0.0004764681514044362,
      "loss": 0.8863,
      "step": 460
    },
    {
      "epoch": 0.5698529411764706,
      "grad_norm": 0.10955250297933698,
      "learning_rate": 0.0004657859651897806,
      "loss": 0.8904,
      "step": 465
    },
    {
      "epoch": 0.5759803921568627,
      "grad_norm": 0.13711186271821346,
      "learning_rate": 0.00045511944776123513,
      "loss": 0.8789,
      "step": 470
    },
    {
      "epoch": 0.5821078431372549,
      "grad_norm": 0.09396380277187082,
      "learning_rate": 0.00044447348399663056,
      "loss": 0.8847,
      "step": 475
    },
    {
      "epoch": 0.5882352941176471,
      "grad_norm": 0.21392349020058346,
      "learning_rate": 0.0004338529493609647,
      "loss": 0.8824,
      "step": 480
    },
    {
      "epoch": 0.5943627450980392,
      "grad_norm": 0.12755805564480172,
      "learning_rate": 0.00042326270767361815,
      "loss": 0.8884,
      "step": 485
    },
    {
      "epoch": 0.6004901960784313,
      "grad_norm": 0.09157375745294742,
      "learning_rate": 0.00041270760888089997,
      "loss": 0.8825,
      "step": 490
    },
    {
      "epoch": 0.6066176470588235,
      "grad_norm": 0.10173653886247282,
      "learning_rate": 0.00040219248683494925,
      "loss": 0.8637,
      "step": 495
    },
    {
      "epoch": 0.6127450980392157,
      "grad_norm": 0.12386704656315299,
      "learning_rate": 0.0003917221570800065,
      "loss": 0.8719,
      "step": 500
    },
    {
      "epoch": 0.6188725490196079,
      "grad_norm": 0.10921071757131698,
      "learning_rate": 0.000381301414647068,
      "loss": 0.8707,
      "step": 505
    },
    {
      "epoch": 0.625,
      "grad_norm": 0.10860919138034633,
      "learning_rate": 0.0003709350318579371,
      "loss": 0.8934,
      "step": 510
    },
    {
      "epoch": 0.6311274509803921,
      "grad_norm": 0.08765926558701954,
      "learning_rate": 0.0003606277561396726,
      "loss": 0.8595,
      "step": 515
    },
    {
      "epoch": 0.6372549019607843,
      "grad_norm": 0.08795902636008367,
      "learning_rate": 0.00035038430785044053,
      "loss": 0.8629,
      "step": 520
    },
    {
      "epoch": 0.6433823529411765,
      "grad_norm": 0.10125788693590333,
      "learning_rate": 0.00034020937811776156,
      "loss": 0.8597,
      "step": 525
    },
    {
      "epoch": 0.6495098039215687,
      "grad_norm": 0.09640732281156021,
      "learning_rate": 0.00033010762669014347,
      "loss": 0.8672,
      "step": 530
    },
    {
      "epoch": 0.6556372549019608,
      "grad_norm": 0.09206201588796137,
      "learning_rate": 0.00032008367980308734,
      "loss": 0.8723,
      "step": 535
    },
    {
      "epoch": 0.6617647058823529,
      "grad_norm": 0.089094237721721,
      "learning_rate": 0.0003101421280604379,
      "loss": 0.884,
      "step": 540
    },
    {
      "epoch": 0.6678921568627451,
      "grad_norm": 0.10047930336023028,
      "learning_rate": 0.00030028752433205476,
      "loss": 0.8612,
      "step": 545
    },
    {
      "epoch": 0.6740196078431373,
      "grad_norm": 0.09796290633516842,
      "learning_rate": 0.00029052438166876307,
      "loss": 0.8527,
      "step": 550
    },
    {
      "epoch": 0.6801470588235294,
      "grad_norm": 0.08908481799962162,
      "learning_rate": 0.0002808571712355389,
      "loss": 0.8636,
      "step": 555
    },
    {
      "epoch": 0.6862745098039216,
      "grad_norm": 0.09854862986040251,
      "learning_rate": 0.00027129032026388045,
      "loss": 0.8581,
      "step": 560
    },
    {
      "epoch": 0.6924019607843137,
      "grad_norm": 0.096989721310236,
      "learning_rate": 0.00026182821002429345,
      "loss": 0.8617,
      "step": 565
    },
    {
      "epoch": 0.6985294117647058,
      "grad_norm": 0.09027729876751488,
      "learning_rate": 0.00025247517381983136,
      "loss": 0.8654,
      "step": 570
    },
    {
      "epoch": 0.7046568627450981,
      "grad_norm": 0.10227245851698821,
      "learning_rate": 0.00024323549500159802,
      "loss": 0.8618,
      "step": 575
    },
    {
      "epoch": 0.7107843137254902,
      "grad_norm": 0.09927553647728089,
      "learning_rate": 0.0002341134050071283,
      "loss": 0.855,
      "step": 580
    },
    {
      "epoch": 0.7169117647058824,
      "grad_norm": 0.09142338818988954,
      "learning_rate": 0.00022511308142254488,
      "loss": 0.8577,
      "step": 585
    },
    {
      "epoch": 0.7230392156862745,
      "grad_norm": 0.10507626286878373,
      "learning_rate": 0.000216238646069373,
      "loss": 0.8605,
      "step": 590
    },
    {
      "epoch": 0.7291666666666666,
      "grad_norm": 0.09773601600409339,
      "learning_rate": 0.00020749416311689845,
      "loss": 0.8605,
      "step": 595
    },
    {
      "epoch": 0.7352941176470589,
      "grad_norm": 0.1053760063340528,
      "learning_rate": 0.00019888363722092372,
      "loss": 0.8631,
      "step": 600
    },
    {
      "epoch": 0.741421568627451,
      "grad_norm": 0.09919853848427344,
      "learning_rate": 0.00019041101168978093,
      "loss": 0.8589,
      "step": 605
    },
    {
      "epoch": 0.7475490196078431,
      "grad_norm": 0.09240852582600491,
      "learning_rate": 0.00018208016667844152,
      "loss": 0.8616,
      "step": 610
    },
    {
      "epoch": 0.7536764705882353,
      "grad_norm": 0.09385869340911827,
      "learning_rate": 0.00017389491741154372,
      "loss": 0.8543,
      "step": 615
    },
    {
      "epoch": 0.7598039215686274,
      "grad_norm": 0.08602993504708097,
      "learning_rate": 0.00016585901243616042,
      "loss": 0.8566,
      "step": 620
    },
    {
      "epoch": 0.7659313725490197,
      "grad_norm": 0.08661913403120794,
      "learning_rate": 0.0001579761319050991,
      "loss": 0.8546,
      "step": 625
    },
    {
      "epoch": 0.7720588235294118,
      "grad_norm": 0.08756073235275695,
      "learning_rate": 0.00015024988589152537,
      "loss": 0.8582,
      "step": 630
    },
    {
      "epoch": 0.7781862745098039,
      "grad_norm": 0.08339963011288148,
      "learning_rate": 0.0001426838127356823,
      "loss": 0.8541,
      "step": 635
    },
    {
      "epoch": 0.7843137254901961,
      "grad_norm": 0.07897307103939846,
      "learning_rate": 0.0001352813774244565,
      "loss": 0.849,
      "step": 640
    },
    {
      "epoch": 0.7904411764705882,
      "grad_norm": 0.08692536794832408,
      "learning_rate": 0.00012804597000454215,
      "loss": 0.8559,
      "step": 645
    },
    {
      "epoch": 0.7965686274509803,
      "grad_norm": 0.08695762926336753,
      "learning_rate": 0.00012098090402992085,
      "loss": 0.8665,
      "step": 650
    },
    {
      "epoch": 0.8026960784313726,
      "grad_norm": 0.08241315305272631,
      "learning_rate": 0.00011408941504437532,
      "loss": 0.8544,
      "step": 655
    },
    {
      "epoch": 0.8088235294117647,
      "grad_norm": 0.07821925622204019,
      "learning_rate": 0.00010737465909972776,
      "loss": 0.8474,
      "step": 660
    },
    {
      "epoch": 0.8149509803921569,
      "grad_norm": 0.08521471066806094,
      "learning_rate": 0.00010083971131048159,
      "loss": 0.8495,
      "step": 665
    },
    {
      "epoch": 0.821078431372549,
      "grad_norm": 0.08381156457580924,
      "learning_rate": 9.448756444553224e-05,
      "loss": 0.8506,
      "step": 670
    },
    {
      "epoch": 0.8272058823529411,
      "grad_norm": 0.080205577901611,
      "learning_rate": 8.832112755758598e-05,
      "loss": 0.8482,
      "step": 675
    },
    {
      "epoch": 0.8333333333333334,
      "grad_norm": 0.08210122013268317,
      "learning_rate": 8.234322465092047e-05,
      "loss": 0.8491,
      "step": 680
    },
    {
      "epoch": 0.8394607843137255,
      "grad_norm": 0.08195821975889148,
      "learning_rate": 7.655659338809329e-05,
      "loss": 0.8484,
      "step": 685
    },
    {
      "epoch": 0.8455882352941176,
      "grad_norm": 0.08225068138923354,
      "learning_rate": 7.096388383619079e-05,
      "loss": 0.8436,
      "step": 690
    },
    {
      "epoch": 0.8517156862745098,
      "grad_norm": 0.10816220803390626,
      "learning_rate": 6.556765725319525e-05,
      "loss": 0.8479,
      "step": 695
    },
    {
      "epoch": 0.8578431372549019,
      "grad_norm": 0.08954785260614277,
      "learning_rate": 6.037038491501978e-05,
      "loss": 0.8524,
      "step": 700
    },
    {
      "epoch": 0.8639705882352942,
      "grad_norm": 0.07665058203914679,
      "learning_rate": 5.53744469837551e-05,
      "loss": 0.8431,
      "step": 705
    },
    {
      "epoch": 0.8700980392156863,
      "grad_norm": 0.0782138298232773,
      "learning_rate": 5.058213141764151e-05,
      "loss": 0.8438,
      "step": 710
    },
    {
      "epoch": 0.8762254901960784,
      "grad_norm": 0.08269251578264038,
      "learning_rate": 4.599563292326592e-05,
      "loss": 0.8485,
      "step": 715
    },
    {
      "epoch": 0.8823529411764706,
      "grad_norm": 0.0907687363220474,
      "learning_rate": 4.161705195046761e-05,
      "loss": 0.8443,
      "step": 720
    },
    {
      "epoch": 0.8884803921568627,
      "grad_norm": 0.08259230750361556,
      "learning_rate": 3.744839373040682e-05,
      "loss": 0.8467,
      "step": 725
    },
    {
      "epoch": 0.8946078431372549,
      "grad_norm": 0.07999560967778772,
      "learning_rate": 3.349156735724274e-05,
      "loss": 0.848,
      "step": 730
    },
    {
      "epoch": 0.9007352941176471,
      "grad_norm": 0.07524504711853225,
      "learning_rate": 2.9748384913837522e-05,
      "loss": 0.8348,
      "step": 735
    },
    {
      "epoch": 0.9068627450980392,
      "grad_norm": 0.08135847243984051,
      "learning_rate": 2.622056064188738e-05,
      "loss": 0.854,
      "step": 740
    },
    {
      "epoch": 0.9129901960784313,
      "grad_norm": 0.07885634814452873,
      "learning_rate": 2.2909710156863274e-05,
      "loss": 0.8514,
      "step": 745
    },
    {
      "epoch": 0.9191176470588235,
      "grad_norm": 0.08736339560766254,
      "learning_rate": 1.981734970811644e-05,
      "loss": 0.8417,
      "step": 750
    },
    {
      "epoch": 0.9252450980392157,
      "grad_norm": 0.0767085793238129,
      "learning_rate": 1.6944895484492072e-05,
      "loss": 0.8523,
      "step": 755
    },
    {
      "epoch": 0.9313725490196079,
      "grad_norm": 0.08318918651152993,
      "learning_rate": 1.429366296576623e-05,
      "loss": 0.8511,
      "step": 760
    },
    {
      "epoch": 0.9375,
      "grad_norm": 0.07933377923909153,
      "learning_rate": 1.1864866320203115e-05,
      "loss": 0.8479,
      "step": 765
    },
    {
      "epoch": 0.9436274509803921,
      "grad_norm": 0.09008515851237198,
      "learning_rate": 9.659617848510882e-06,
      "loss": 0.8449,
      "step": 770
    },
    {
      "epoch": 0.9497549019607843,
      "grad_norm": 0.07787795748629618,
      "learning_rate": 7.678927474447817e-06,
      "loss": 0.8446,
      "step": 775
    },
    {
      "epoch": 0.9558823529411765,
      "grad_norm": 0.08423127476840589,
      "learning_rate": 5.923702282314092e-06,
      "loss": 0.8466,
      "step": 780
    },
    {
      "epoch": 0.9620098039215687,
      "grad_norm": 0.07739503343274702,
      "learning_rate": 4.394746101540115e-06,
      "loss": 0.8423,
      "step": 785
    },
    {
      "epoch": 0.9681372549019608,
      "grad_norm": 0.07932764141883414,
      "learning_rate": 3.092759138561607e-06,
      "loss": 0.8405,
      "step": 790
    },
    {
      "epoch": 0.9742647058823529,
      "grad_norm": 0.08063542360073593,
      "learning_rate": 2.018337656150726e-06,
      "loss": 0.8461,
      "step": 795
    },
    {
      "epoch": 0.9803921568627451,
      "grad_norm": 0.08993712709783745,
      "learning_rate": 1.1719737003492159e-06,
      "loss": 0.8388,
      "step": 800
    },
    {
      "epoch": 0.9865196078431373,
      "grad_norm": 0.0792762901452283,
      "learning_rate": 5.540548751292173e-07,
      "loss": 0.8346,
      "step": 805
    },
    {
      "epoch": 0.9926470588235294,
      "grad_norm": 0.09250878108386706,
      "learning_rate": 1.6486416488459277e-07,
      "loss": 0.8478,
      "step": 810
    },
    {
      "epoch": 0.9987745098039216,
      "grad_norm": 0.07786954435284818,
      "learning_rate": 4.579804834703438e-09,
      "loss": 0.8401,
      "step": 815
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.202426552772522,
      "eval_runtime": 111.4035,
      "eval_samples_per_second": 187.974,
      "eval_steps_per_second": 5.88,
      "step": 816
    },
    {
      "epoch": 1.0,
      "step": 816,
      "total_flos": 80063181619200.0,
      "train_loss": 0.914715180794398,
      "train_runtime": 1874.4496,
      "train_samples_per_second": 55.714,
      "train_steps_per_second": 0.435
    }
  ],
  "logging_steps": 5,
  "max_steps": 816,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 80063181619200.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}