{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 816, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012254901960784314, "grad_norm": 1.603988652495189, "learning_rate": 1.2195121951219513e-05, "loss": 1.3541, "step": 1 }, { "epoch": 0.006127450980392157, "grad_norm": 1.4587743423854005, "learning_rate": 6.097560975609756e-05, "loss": 1.355, "step": 5 }, { "epoch": 0.012254901960784314, "grad_norm": 1.538685925430638, "learning_rate": 0.00012195121951219512, "loss": 1.3083, "step": 10 }, { "epoch": 0.01838235294117647, "grad_norm": 0.5820643527511544, "learning_rate": 0.00018292682926829268, "loss": 1.2226, "step": 15 }, { "epoch": 0.024509803921568627, "grad_norm": 0.37609725156071605, "learning_rate": 0.00024390243902439024, "loss": 1.1399, "step": 20 }, { "epoch": 0.030637254901960783, "grad_norm": 0.2477379120591674, "learning_rate": 0.0003048780487804878, "loss": 1.0942, "step": 25 }, { "epoch": 0.03676470588235294, "grad_norm": 0.18934222364015724, "learning_rate": 0.00036585365853658537, "loss": 1.0785, "step": 30 }, { "epoch": 0.0428921568627451, "grad_norm": 0.14574615338739755, "learning_rate": 0.0004268292682926829, "loss": 1.0549, "step": 35 }, { "epoch": 0.049019607843137254, "grad_norm": 0.12815166481708085, "learning_rate": 0.0004878048780487805, "loss": 1.0493, "step": 40 }, { "epoch": 0.05514705882352941, "grad_norm": 0.15010519509218812, "learning_rate": 0.0005487804878048781, "loss": 1.0306, "step": 45 }, { "epoch": 0.061274509803921566, "grad_norm": 0.13010925959434533, "learning_rate": 0.0006097560975609756, "loss": 1.0204, "step": 50 }, { "epoch": 0.06740196078431372, "grad_norm": 0.11891525726508857, "learning_rate": 0.0006707317073170732, "loss": 1.0281, "step": 55 }, { "epoch": 0.07352941176470588, "grad_norm": 0.12117005404429922, "learning_rate": 0.0007317073170731707, "loss": 1.0187, "step": 60 }, { "epoch": 0.07965686274509803, "grad_norm": 0.11923924460163615, "learning_rate": 0.0007926829268292683, "loss": 1.0019, "step": 65 }, { "epoch": 0.0857843137254902, "grad_norm": 0.13523477315023974, "learning_rate": 0.0008536585365853659, "loss": 1.0044, "step": 70 }, { "epoch": 0.09191176470588236, "grad_norm": 0.11307823129618054, "learning_rate": 0.0009146341463414635, "loss": 1.0071, "step": 75 }, { "epoch": 0.09803921568627451, "grad_norm": 0.1206648748330027, "learning_rate": 0.000975609756097561, "loss": 0.9962, "step": 80 }, { "epoch": 0.10416666666666667, "grad_norm": 0.15935283992889565, "learning_rate": 0.000999958782259877, "loss": 0.9998, "step": 85 }, { "epoch": 0.11029411764705882, "grad_norm": 0.13630538643217202, "learning_rate": 0.0009997069206794246, "loss": 1.0101, "step": 90 }, { "epoch": 0.11642156862745098, "grad_norm": 0.13281831595913912, "learning_rate": 0.0009992262114666653, "loss": 0.9904, "step": 95 }, { "epoch": 0.12254901960784313, "grad_norm": 0.13570583268291556, "learning_rate": 0.0009985168747689707, "loss": 0.986, "step": 100 }, { "epoch": 0.12867647058823528, "grad_norm": 0.14302939089927838, "learning_rate": 0.0009975792354368017, "loss": 0.9934, "step": 105 }, { "epoch": 0.13480392156862744, "grad_norm": 0.14349254312543258, "learning_rate": 0.0009964137228749407, "loss": 0.9961, "step": 110 }, { "epoch": 0.1409313725490196, "grad_norm": 0.13092834458351188, "learning_rate": 0.000995020870845837, "loss": 0.9949, "step": 115 }, { "epoch": 0.14705882352941177, "grad_norm": 0.1251118799739706, "learning_rate": 0.0009934013172251653, "loss": 0.9824, "step": 120 }, { "epoch": 0.15318627450980393, "grad_norm": 0.1323892618605143, "learning_rate": 0.0009915558037097002, "loss": 0.977, "step": 125 }, { "epoch": 0.15931372549019607, "grad_norm": 0.13068519381648078, "learning_rate": 0.0009894851754776472, "loss": 0.9712, "step": 130 }, { "epoch": 0.16544117647058823, "grad_norm": 0.1310198992984819, "learning_rate": 0.0009871903808015812, "loss": 0.9807, "step": 135 }, { "epoch": 0.1715686274509804, "grad_norm": 0.10811315309592277, "learning_rate": 0.0009846724706141716, "loss": 0.977, "step": 140 }, { "epoch": 0.17769607843137256, "grad_norm": 0.11603574555194691, "learning_rate": 0.0009819325980268945, "loss": 0.9743, "step": 145 }, { "epoch": 0.18382352941176472, "grad_norm": 0.11664960595520962, "learning_rate": 0.0009789720178019483, "loss": 0.9742, "step": 150 }, { "epoch": 0.18995098039215685, "grad_norm": 0.11920852297334043, "learning_rate": 0.0009757920857776188, "loss": 0.9635, "step": 155 }, { "epoch": 0.19607843137254902, "grad_norm": 0.13745202686899544, "learning_rate": 0.0009723942582473544, "loss": 0.9544, "step": 160 }, { "epoch": 0.20220588235294118, "grad_norm": 0.15444220703514816, "learning_rate": 0.0009687800912928362, "loss": 0.9697, "step": 165 }, { "epoch": 0.20833333333333334, "grad_norm": 0.11511217870343073, "learning_rate": 0.0009649512400713498, "loss": 0.963, "step": 170 }, { "epoch": 0.21446078431372548, "grad_norm": 0.163845433820889, "learning_rate": 0.0009609094580577824, "loss": 0.9601, "step": 175 }, { "epoch": 0.22058823529411764, "grad_norm": 0.12370218334013189, "learning_rate": 0.0009566565962415959, "loss": 0.9578, "step": 180 }, { "epoch": 0.2267156862745098, "grad_norm": 0.138057520129555, "learning_rate": 0.0009521946022791401, "loss": 0.9555, "step": 185 }, { "epoch": 0.23284313725490197, "grad_norm": 0.161151229045878, "learning_rate": 0.0009475255196016972, "loss": 0.9579, "step": 190 }, { "epoch": 0.23897058823529413, "grad_norm": 0.13900328482304902, "learning_rate": 0.0009426514864796647, "loss": 0.9494, "step": 195 }, { "epoch": 0.24509803921568626, "grad_norm": 0.14057545846182565, "learning_rate": 0.0009375747350433044, "loss": 0.9478, "step": 200 }, { "epoch": 0.2512254901960784, "grad_norm": 0.15616371521107208, "learning_rate": 0.0009322975902605082, "loss": 0.9654, "step": 205 }, { "epoch": 0.25735294117647056, "grad_norm": 0.11827026404580182, "learning_rate": 0.0009268224688720474, "loss": 0.9445, "step": 210 }, { "epoch": 0.26348039215686275, "grad_norm": 0.11103911913637518, "learning_rate": 0.0009211518782847931, "loss": 0.9424, "step": 215 }, { "epoch": 0.2696078431372549, "grad_norm": 0.11604427070566481, "learning_rate": 0.0009152884154234145, "loss": 0.9451, "step": 220 }, { "epoch": 0.2757352941176471, "grad_norm": 0.1099562215414043, "learning_rate": 0.0009092347655410818, "loss": 0.9402, "step": 225 }, { "epoch": 0.2818627450980392, "grad_norm": 0.14837520991789005, "learning_rate": 0.0009029937009897176, "loss": 0.9349, "step": 230 }, { "epoch": 0.28799019607843135, "grad_norm": 0.10928552841333679, "learning_rate": 0.0008965680799503608, "loss": 0.9329, "step": 235 }, { "epoch": 0.29411764705882354, "grad_norm": 0.11407153214331639, "learning_rate": 0.0008899608451242233, "loss": 0.9379, "step": 240 }, { "epoch": 0.3002450980392157, "grad_norm": 0.11231916470556697, "learning_rate": 0.0008831750223850389, "loss": 0.9229, "step": 245 }, { "epoch": 0.30637254901960786, "grad_norm": 0.11185052745256109, "learning_rate": 0.0008762137193933241, "loss": 0.9296, "step": 250 }, { "epoch": 0.3125, "grad_norm": 0.11855657350077958, "learning_rate": 0.0008690801241731818, "loss": 0.9207, "step": 255 }, { "epoch": 0.31862745098039214, "grad_norm": 1.8537407128611012, "learning_rate": 0.0008617775036523015, "loss": 0.9387, "step": 260 }, { "epoch": 0.3247549019607843, "grad_norm": 0.11676606107692747, "learning_rate": 0.0008543092021658259, "loss": 0.9367, "step": 265 }, { "epoch": 0.33088235294117646, "grad_norm": 0.10492580984162286, "learning_rate": 0.0008466786399247663, "loss": 0.928, "step": 270 }, { "epoch": 0.33700980392156865, "grad_norm": 0.10281631398110604, "learning_rate": 0.0008388893114496705, "loss": 0.935, "step": 275 }, { "epoch": 0.3431372549019608, "grad_norm": 0.11217225067437296, "learning_rate": 0.0008309447839702582, "loss": 0.9298, "step": 280 }, { "epoch": 0.3492647058823529, "grad_norm": 0.11327220268180357, "learning_rate": 0.0008228486957917607, "loss": 0.9219, "step": 285 }, { "epoch": 0.3553921568627451, "grad_norm": 0.11554152008646122, "learning_rate": 0.0008146047546287076, "loss": 0.934, "step": 290 }, { "epoch": 0.36151960784313725, "grad_norm": 0.13610027478132888, "learning_rate": 0.0008062167359069301, "loss": 0.9276, "step": 295 }, { "epoch": 0.36764705882352944, "grad_norm": 0.12248610966496465, "learning_rate": 0.000797688481034551, "loss": 0.9175, "step": 300 }, { "epoch": 0.3737745098039216, "grad_norm": 0.10512495641494239, "learning_rate": 0.00078902389564276, "loss": 0.9239, "step": 305 }, { "epoch": 0.3799019607843137, "grad_norm": 0.12079056888085157, "learning_rate": 0.0007802269477971771, "loss": 0.9167, "step": 310 }, { "epoch": 0.3860294117647059, "grad_norm": 0.1311550506036977, "learning_rate": 0.0007713016661806211, "loss": 0.9165, "step": 315 }, { "epoch": 0.39215686274509803, "grad_norm": 0.12748855363301959, "learning_rate": 0.0007622521382481208, "loss": 0.9099, "step": 320 }, { "epoch": 0.39828431372549017, "grad_norm": 0.11389138878908127, "learning_rate": 0.0007530825083550073, "loss": 0.9034, "step": 325 }, { "epoch": 0.40441176470588236, "grad_norm": 0.10172199627242663, "learning_rate": 0.0007437969758589507, "loss": 0.9147, "step": 330 }, { "epoch": 0.4105392156862745, "grad_norm": 0.1136698134249708, "learning_rate": 0.0007343997931968067, "loss": 0.9076, "step": 335 }, { "epoch": 0.4166666666666667, "grad_norm": 0.1110896296260987, "learning_rate": 0.0007248952639371542, "loss": 0.9075, "step": 340 }, { "epoch": 0.4227941176470588, "grad_norm": 0.10357314484765201, "learning_rate": 0.0007152877408094178, "loss": 0.8998, "step": 345 }, { "epoch": 0.42892156862745096, "grad_norm": 0.11773981651015025, "learning_rate": 0.0007055816237104753, "loss": 0.9094, "step": 350 }, { "epoch": 0.43504901960784315, "grad_norm": 0.1283630128752841, "learning_rate": 0.0006957813576896647, "loss": 0.899, "step": 355 }, { "epoch": 0.4411764705882353, "grad_norm": 0.1326640375854421, "learning_rate": 0.000685891430913113, "loss": 0.9091, "step": 360 }, { "epoch": 0.44730392156862747, "grad_norm": 0.12057333477888295, "learning_rate": 0.0006759163726083191, "loss": 0.9005, "step": 365 }, { "epoch": 0.4534313725490196, "grad_norm": 0.10157867473834796, "learning_rate": 0.0006658607509899319, "loss": 0.8995, "step": 370 }, { "epoch": 0.45955882352941174, "grad_norm": 0.13679116304924, "learning_rate": 0.0006557291711676738, "loss": 0.9064, "step": 375 }, { "epoch": 0.46568627450980393, "grad_norm": 0.10228308226469025, "learning_rate": 0.0006455262730373672, "loss": 0.8902, "step": 380 }, { "epoch": 0.47181372549019607, "grad_norm": 0.11810749832493427, "learning_rate": 0.0006352567291560318, "loss": 0.8947, "step": 385 }, { "epoch": 0.47794117647058826, "grad_norm": 0.11253919001414733, "learning_rate": 0.0006249252426020216, "loss": 0.8984, "step": 390 }, { "epoch": 0.4840686274509804, "grad_norm": 0.10889918340035115, "learning_rate": 0.0006145365448211866, "loss": 0.9001, "step": 395 }, { "epoch": 0.49019607843137253, "grad_norm": 0.10602494662106901, "learning_rate": 0.0006040953934600423, "loss": 0.8924, "step": 400 }, { "epoch": 0.4963235294117647, "grad_norm": 0.09537450461248778, "learning_rate": 0.0005936065701869403, "loss": 0.8971, "step": 405 }, { "epoch": 0.5024509803921569, "grad_norm": 0.1135732875240647, "learning_rate": 0.0005830748785022368, "loss": 0.8956, "step": 410 }, { "epoch": 0.508578431372549, "grad_norm": 0.11824825784313651, "learning_rate": 0.0005725051415384657, "loss": 0.9014, "step": 415 }, { "epoch": 0.5147058823529411, "grad_norm": 2.3957029087137602, "learning_rate": 0.0005619021998515165, "loss": 0.8937, "step": 420 }, { "epoch": 0.5208333333333334, "grad_norm": 0.1305239745293032, "learning_rate": 0.000551270909203838, "loss": 0.889, "step": 425 }, { "epoch": 0.5269607843137255, "grad_norm": 0.10923687170047386, "learning_rate": 0.0005406161383406731, "loss": 0.9009, "step": 430 }, { "epoch": 0.5330882352941176, "grad_norm": 0.11720531307848668, "learning_rate": 0.0005299427667603515, "loss": 0.9035, "step": 435 }, { "epoch": 0.5392156862745098, "grad_norm": 0.1043777454103823, "learning_rate": 0.0005192556824796568, "loss": 0.887, "step": 440 }, { "epoch": 0.5453431372549019, "grad_norm": 0.12019301588246883, "learning_rate": 0.0005085597797952905, "loss": 0.8852, "step": 445 }, { "epoch": 0.5514705882352942, "grad_norm": 0.09829925409523375, "learning_rate": 0.0004978599570424639, "loss": 0.8841, "step": 450 }, { "epoch": 0.5575980392156863, "grad_norm": 0.110813034496191, "learning_rate": 0.0004871611143516367, "loss": 0.8888, "step": 455 }, { "epoch": 0.5637254901960784, "grad_norm": 0.14013694091933743, "learning_rate": 0.0004764681514044362, "loss": 0.8863, "step": 460 }, { "epoch": 0.5698529411764706, "grad_norm": 0.10955250297933698, "learning_rate": 0.0004657859651897806, "loss": 0.8904, "step": 465 }, { "epoch": 0.5759803921568627, "grad_norm": 0.13711186271821346, "learning_rate": 0.00045511944776123513, "loss": 0.8789, "step": 470 }, { "epoch": 0.5821078431372549, "grad_norm": 0.09396380277187082, "learning_rate": 0.00044447348399663056, "loss": 0.8847, "step": 475 }, { "epoch": 0.5882352941176471, "grad_norm": 0.21392349020058346, "learning_rate": 0.0004338529493609647, "loss": 0.8824, "step": 480 }, { "epoch": 0.5943627450980392, "grad_norm": 0.12755805564480172, "learning_rate": 0.00042326270767361815, "loss": 0.8884, "step": 485 }, { "epoch": 0.6004901960784313, "grad_norm": 0.09157375745294742, "learning_rate": 0.00041270760888089997, "loss": 0.8825, "step": 490 }, { "epoch": 0.6066176470588235, "grad_norm": 0.10173653886247282, "learning_rate": 0.00040219248683494925, "loss": 0.8637, "step": 495 }, { "epoch": 0.6127450980392157, "grad_norm": 0.12386704656315299, "learning_rate": 0.0003917221570800065, "loss": 0.8719, "step": 500 }, { "epoch": 0.6188725490196079, "grad_norm": 0.10921071757131698, "learning_rate": 0.000381301414647068, "loss": 0.8707, "step": 505 }, { "epoch": 0.625, "grad_norm": 0.10860919138034633, "learning_rate": 0.0003709350318579371, "loss": 0.8934, "step": 510 }, { "epoch": 0.6311274509803921, "grad_norm": 0.08765926558701954, "learning_rate": 0.0003606277561396726, "loss": 0.8595, "step": 515 }, { "epoch": 0.6372549019607843, "grad_norm": 0.08795902636008367, "learning_rate": 0.00035038430785044053, "loss": 0.8629, "step": 520 }, { "epoch": 0.6433823529411765, "grad_norm": 0.10125788693590333, "learning_rate": 0.00034020937811776156, "loss": 0.8597, "step": 525 }, { "epoch": 0.6495098039215687, "grad_norm": 0.09640732281156021, "learning_rate": 0.00033010762669014347, "loss": 0.8672, "step": 530 }, { "epoch": 0.6556372549019608, "grad_norm": 0.09206201588796137, "learning_rate": 0.00032008367980308734, "loss": 0.8723, "step": 535 }, { "epoch": 0.6617647058823529, "grad_norm": 0.089094237721721, "learning_rate": 0.0003101421280604379, "loss": 0.884, "step": 540 }, { "epoch": 0.6678921568627451, "grad_norm": 0.10047930336023028, "learning_rate": 0.00030028752433205476, "loss": 0.8612, "step": 545 }, { "epoch": 0.6740196078431373, "grad_norm": 0.09796290633516842, "learning_rate": 0.00029052438166876307, "loss": 0.8527, "step": 550 }, { "epoch": 0.6801470588235294, "grad_norm": 0.08908481799962162, "learning_rate": 0.0002808571712355389, "loss": 0.8636, "step": 555 }, { "epoch": 0.6862745098039216, "grad_norm": 0.09854862986040251, "learning_rate": 0.00027129032026388045, "loss": 0.8581, "step": 560 }, { "epoch": 0.6924019607843137, "grad_norm": 0.096989721310236, "learning_rate": 0.00026182821002429345, "loss": 0.8617, "step": 565 }, { "epoch": 0.6985294117647058, "grad_norm": 0.09027729876751488, "learning_rate": 0.00025247517381983136, "loss": 0.8654, "step": 570 }, { "epoch": 0.7046568627450981, "grad_norm": 0.10227245851698821, "learning_rate": 0.00024323549500159802, "loss": 0.8618, "step": 575 }, { "epoch": 0.7107843137254902, "grad_norm": 0.09927553647728089, "learning_rate": 0.0002341134050071283, "loss": 0.855, "step": 580 }, { "epoch": 0.7169117647058824, "grad_norm": 0.09142338818988954, "learning_rate": 0.00022511308142254488, "loss": 0.8577, "step": 585 }, { "epoch": 0.7230392156862745, "grad_norm": 0.10507626286878373, "learning_rate": 0.000216238646069373, "loss": 0.8605, "step": 590 }, { "epoch": 0.7291666666666666, "grad_norm": 0.09773601600409339, "learning_rate": 0.00020749416311689845, "loss": 0.8605, "step": 595 }, { "epoch": 0.7352941176470589, "grad_norm": 0.1053760063340528, "learning_rate": 0.00019888363722092372, "loss": 0.8631, "step": 600 }, { "epoch": 0.741421568627451, "grad_norm": 0.09919853848427344, "learning_rate": 0.00019041101168978093, "loss": 0.8589, "step": 605 }, { "epoch": 0.7475490196078431, "grad_norm": 0.09240852582600491, "learning_rate": 0.00018208016667844152, "loss": 0.8616, "step": 610 }, { "epoch": 0.7536764705882353, "grad_norm": 0.09385869340911827, "learning_rate": 0.00017389491741154372, "loss": 0.8543, "step": 615 }, { "epoch": 0.7598039215686274, "grad_norm": 0.08602993504708097, "learning_rate": 0.00016585901243616042, "loss": 0.8566, "step": 620 }, { "epoch": 0.7659313725490197, "grad_norm": 0.08661913403120794, "learning_rate": 0.0001579761319050991, "loss": 0.8546, "step": 625 }, { "epoch": 0.7720588235294118, "grad_norm": 0.08756073235275695, "learning_rate": 0.00015024988589152537, "loss": 0.8582, "step": 630 }, { "epoch": 0.7781862745098039, "grad_norm": 0.08339963011288148, "learning_rate": 0.0001426838127356823, "loss": 0.8541, "step": 635 }, { "epoch": 0.7843137254901961, "grad_norm": 0.07897307103939846, "learning_rate": 0.0001352813774244565, "loss": 0.849, "step": 640 }, { "epoch": 0.7904411764705882, "grad_norm": 0.08692536794832408, "learning_rate": 0.00012804597000454215, "loss": 0.8559, "step": 645 }, { "epoch": 0.7965686274509803, "grad_norm": 0.08695762926336753, "learning_rate": 0.00012098090402992085, "loss": 0.8665, "step": 650 }, { "epoch": 0.8026960784313726, "grad_norm": 0.08241315305272631, "learning_rate": 0.00011408941504437532, "loss": 0.8544, "step": 655 }, { "epoch": 0.8088235294117647, "grad_norm": 0.07821925622204019, "learning_rate": 0.00010737465909972776, "loss": 0.8474, "step": 660 }, { "epoch": 0.8149509803921569, "grad_norm": 0.08521471066806094, "learning_rate": 0.00010083971131048159, "loss": 0.8495, "step": 665 }, { "epoch": 0.821078431372549, "grad_norm": 0.08381156457580924, "learning_rate": 9.448756444553224e-05, "loss": 0.8506, "step": 670 }, { "epoch": 0.8272058823529411, "grad_norm": 0.080205577901611, "learning_rate": 8.832112755758598e-05, "loss": 0.8482, "step": 675 }, { "epoch": 0.8333333333333334, "grad_norm": 0.08210122013268317, "learning_rate": 8.234322465092047e-05, "loss": 0.8491, "step": 680 }, { "epoch": 0.8394607843137255, "grad_norm": 0.08195821975889148, "learning_rate": 7.655659338809329e-05, "loss": 0.8484, "step": 685 }, { "epoch": 0.8455882352941176, "grad_norm": 0.08225068138923354, "learning_rate": 7.096388383619079e-05, "loss": 0.8436, "step": 690 }, { "epoch": 0.8517156862745098, "grad_norm": 0.10816220803390626, "learning_rate": 6.556765725319525e-05, "loss": 0.8479, "step": 695 }, { "epoch": 0.8578431372549019, "grad_norm": 0.08954785260614277, "learning_rate": 6.037038491501978e-05, "loss": 0.8524, "step": 700 }, { "epoch": 0.8639705882352942, "grad_norm": 0.07665058203914679, "learning_rate": 5.53744469837551e-05, "loss": 0.8431, "step": 705 }, { "epoch": 0.8700980392156863, "grad_norm": 0.0782138298232773, "learning_rate": 5.058213141764151e-05, "loss": 0.8438, "step": 710 }, { "epoch": 0.8762254901960784, "grad_norm": 0.08269251578264038, "learning_rate": 4.599563292326592e-05, "loss": 0.8485, "step": 715 }, { "epoch": 0.8823529411764706, "grad_norm": 0.0907687363220474, "learning_rate": 4.161705195046761e-05, "loss": 0.8443, "step": 720 }, { "epoch": 0.8884803921568627, "grad_norm": 0.08259230750361556, "learning_rate": 3.744839373040682e-05, "loss": 0.8467, "step": 725 }, { "epoch": 0.8946078431372549, "grad_norm": 0.07999560967778772, "learning_rate": 3.349156735724274e-05, "loss": 0.848, "step": 730 }, { "epoch": 0.9007352941176471, "grad_norm": 0.07524504711853225, "learning_rate": 2.9748384913837522e-05, "loss": 0.8348, "step": 735 }, { "epoch": 0.9068627450980392, "grad_norm": 0.08135847243984051, "learning_rate": 2.622056064188738e-05, "loss": 0.854, "step": 740 }, { "epoch": 0.9129901960784313, "grad_norm": 0.07885634814452873, "learning_rate": 2.2909710156863274e-05, "loss": 0.8514, "step": 745 }, { "epoch": 0.9191176470588235, "grad_norm": 0.08736339560766254, "learning_rate": 1.981734970811644e-05, "loss": 0.8417, "step": 750 }, { "epoch": 0.9252450980392157, "grad_norm": 0.0767085793238129, "learning_rate": 1.6944895484492072e-05, "loss": 0.8523, "step": 755 }, { "epoch": 0.9313725490196079, "grad_norm": 0.08318918651152993, "learning_rate": 1.429366296576623e-05, "loss": 0.8511, "step": 760 }, { "epoch": 0.9375, "grad_norm": 0.07933377923909153, "learning_rate": 1.1864866320203115e-05, "loss": 0.8479, "step": 765 }, { "epoch": 0.9436274509803921, "grad_norm": 0.09008515851237198, "learning_rate": 9.659617848510882e-06, "loss": 0.8449, "step": 770 }, { "epoch": 0.9497549019607843, "grad_norm": 0.07787795748629618, "learning_rate": 7.678927474447817e-06, "loss": 0.8446, "step": 775 }, { "epoch": 0.9558823529411765, "grad_norm": 0.08423127476840589, "learning_rate": 5.923702282314092e-06, "loss": 0.8466, "step": 780 }, { "epoch": 0.9620098039215687, "grad_norm": 0.07739503343274702, "learning_rate": 4.394746101540115e-06, "loss": 0.8423, "step": 785 }, { "epoch": 0.9681372549019608, "grad_norm": 0.07932764141883414, "learning_rate": 3.092759138561607e-06, "loss": 0.8405, "step": 790 }, { "epoch": 0.9742647058823529, "grad_norm": 0.08063542360073593, "learning_rate": 2.018337656150726e-06, "loss": 0.8461, "step": 795 }, { "epoch": 0.9803921568627451, "grad_norm": 0.08993712709783745, "learning_rate": 1.1719737003492159e-06, "loss": 0.8388, "step": 800 }, { "epoch": 0.9865196078431373, "grad_norm": 0.0792762901452283, "learning_rate": 5.540548751292173e-07, "loss": 0.8346, "step": 805 }, { "epoch": 0.9926470588235294, "grad_norm": 0.09250878108386706, "learning_rate": 1.6486416488459277e-07, "loss": 0.8478, "step": 810 }, { "epoch": 0.9987745098039216, "grad_norm": 0.07786954435284818, "learning_rate": 4.579804834703438e-09, "loss": 0.8401, "step": 815 }, { "epoch": 1.0, "eval_loss": 1.202426552772522, "eval_runtime": 111.4035, "eval_samples_per_second": 187.974, "eval_steps_per_second": 5.88, "step": 816 }, { "epoch": 1.0, "step": 816, "total_flos": 80063181619200.0, "train_loss": 0.914715180794398, "train_runtime": 1874.4496, "train_samples_per_second": 55.714, "train_steps_per_second": 0.435 } ], "logging_steps": 5, "max_steps": 816, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 80063181619200.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }