{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 816, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012254901960784314, "grad_norm": 1.6036396146523473, "learning_rate": 1.2195121951219513e-05, "loss": 1.3541, "step": 1 }, { "epoch": 0.006127450980392157, "grad_norm": 1.4541999892274449, "learning_rate": 6.097560975609756e-05, "loss": 1.355, "step": 5 }, { "epoch": 0.012254901960784314, "grad_norm": 1.5390535097118918, "learning_rate": 0.00012195121951219512, "loss": 1.3083, "step": 10 }, { "epoch": 0.01838235294117647, "grad_norm": 0.5811818759637415, "learning_rate": 0.00018292682926829268, "loss": 1.2226, "step": 15 }, { "epoch": 0.024509803921568627, "grad_norm": 0.3755084090839048, "learning_rate": 0.00024390243902439024, "loss": 1.14, "step": 20 }, { "epoch": 0.030637254901960783, "grad_norm": 0.24648929470069889, "learning_rate": 0.0003048780487804878, "loss": 1.0942, "step": 25 }, { "epoch": 0.03676470588235294, "grad_norm": 0.18518290277618207, "learning_rate": 0.00036585365853658537, "loss": 1.0785, "step": 30 }, { "epoch": 0.0428921568627451, "grad_norm": 0.14372846391436536, "learning_rate": 0.0004268292682926829, "loss": 1.0549, "step": 35 }, { "epoch": 0.049019607843137254, "grad_norm": 0.1389801769443967, "learning_rate": 0.0004878048780487805, "loss": 1.0493, "step": 40 }, { "epoch": 0.05514705882352941, "grad_norm": 0.15520999728279067, "learning_rate": 0.0005487804878048781, "loss": 1.0306, "step": 45 }, { "epoch": 0.061274509803921566, "grad_norm": 0.13121885809742784, "learning_rate": 0.0006097560975609756, "loss": 1.0204, "step": 50 }, { "epoch": 0.06740196078431372, "grad_norm": 0.11703518229748765, "learning_rate": 0.0006707317073170732, "loss": 1.0281, "step": 55 }, { "epoch": 0.07352941176470588, "grad_norm": 0.12071199354118181, "learning_rate": 0.0007317073170731707, "loss": 1.0188, "step": 60 }, { "epoch": 0.07965686274509803, "grad_norm": 0.11482998270254378, "learning_rate": 0.0007926829268292683, "loss": 1.0019, "step": 65 }, { "epoch": 0.0857843137254902, "grad_norm": 0.13413766368345428, "learning_rate": 0.0008536585365853659, "loss": 1.0043, "step": 70 }, { "epoch": 0.09191176470588236, "grad_norm": 0.1471493243236106, "learning_rate": 0.0009146341463414635, "loss": 1.0071, "step": 75 }, { "epoch": 0.09803921568627451, "grad_norm": 0.1259997868133445, "learning_rate": 0.000975609756097561, "loss": 0.9962, "step": 80 }, { "epoch": 0.10416666666666667, "grad_norm": 0.16099811489081525, "learning_rate": 0.000999958782259877, "loss": 0.9999, "step": 85 }, { "epoch": 0.11029411764705882, "grad_norm": 0.13529736823555488, "learning_rate": 0.0009997069206794246, "loss": 1.01, "step": 90 }, { "epoch": 0.11642156862745098, "grad_norm": 0.1287112736990173, "learning_rate": 0.0009992262114666653, "loss": 0.9904, "step": 95 }, { "epoch": 0.12254901960784313, "grad_norm": 0.13096915650623966, "learning_rate": 0.0009985168747689707, "loss": 0.9859, "step": 100 }, { "epoch": 0.12867647058823528, "grad_norm": 0.14334941177812624, "learning_rate": 0.0009975792354368017, "loss": 0.9934, "step": 105 }, { "epoch": 0.13480392156862744, "grad_norm": 0.13671174612094514, "learning_rate": 0.0009964137228749407, "loss": 0.9961, "step": 110 }, { "epoch": 0.1409313725490196, "grad_norm": 0.12218713761592166, "learning_rate": 0.000995020870845837, "loss": 0.9948, "step": 115 }, { "epoch": 0.14705882352941177, "grad_norm": 0.1291445702524626, "learning_rate": 0.0009934013172251653, "loss": 0.9824, "step": 120 }, { "epoch": 0.15318627450980393, "grad_norm": 0.1346131782998567, "learning_rate": 0.0009915558037097002, "loss": 0.977, "step": 125 }, { "epoch": 0.15931372549019607, "grad_norm": 0.13410866481050307, "learning_rate": 0.0009894851754776472, "loss": 0.9712, "step": 130 }, { "epoch": 0.16544117647058823, "grad_norm": 0.12662709799384195, "learning_rate": 0.0009871903808015812, "loss": 0.9807, "step": 135 }, { "epoch": 0.1715686274509804, "grad_norm": 0.10699714724935337, "learning_rate": 0.0009846724706141716, "loss": 0.977, "step": 140 }, { "epoch": 0.17769607843137256, "grad_norm": 0.11416365647326593, "learning_rate": 0.0009819325980268945, "loss": 0.9743, "step": 145 }, { "epoch": 0.18382352941176472, "grad_norm": 0.11826552720186441, "learning_rate": 0.0009789720178019483, "loss": 0.9741, "step": 150 }, { "epoch": 0.18995098039215685, "grad_norm": 0.11966728016095998, "learning_rate": 0.0009757920857776188, "loss": 0.9633, "step": 155 }, { "epoch": 0.19607843137254902, "grad_norm": 0.1224502041103689, "learning_rate": 0.0009723942582473544, "loss": 0.9544, "step": 160 }, { "epoch": 0.20220588235294118, "grad_norm": 0.14383092165933975, "learning_rate": 0.0009687800912928362, "loss": 0.9696, "step": 165 }, { "epoch": 0.20833333333333334, "grad_norm": 0.11172166270095091, "learning_rate": 0.0009649512400713498, "loss": 0.963, "step": 170 }, { "epoch": 0.21446078431372548, "grad_norm": 0.15338163624311216, "learning_rate": 0.0009609094580577824, "loss": 0.96, "step": 175 }, { "epoch": 0.22058823529411764, "grad_norm": 0.13464799651247097, "learning_rate": 0.0009566565962415959, "loss": 0.9578, "step": 180 }, { "epoch": 0.2267156862745098, "grad_norm": 0.14069989404410843, "learning_rate": 0.0009521946022791401, "loss": 0.9555, "step": 185 }, { "epoch": 0.23284313725490197, "grad_norm": 0.1563880044910766, "learning_rate": 0.0009475255196016972, "loss": 0.9579, "step": 190 }, { "epoch": 0.23897058823529413, "grad_norm": 0.1354522777364055, "learning_rate": 0.0009426514864796647, "loss": 0.9494, "step": 195 }, { "epoch": 0.24509803921568626, "grad_norm": 0.14141184524556524, "learning_rate": 0.0009375747350433044, "loss": 0.9479, "step": 200 }, { "epoch": 0.2512254901960784, "grad_norm": 0.16178153584659036, "learning_rate": 0.0009322975902605082, "loss": 0.9655, "step": 205 }, { "epoch": 0.25735294117647056, "grad_norm": 0.12500026452110888, "learning_rate": 0.0009268224688720474, "loss": 0.9446, "step": 210 }, { "epoch": 0.26348039215686275, "grad_norm": 0.11225181351597031, "learning_rate": 0.0009211518782847931, "loss": 0.9425, "step": 215 }, { "epoch": 0.2696078431372549, "grad_norm": 0.11760271912658449, "learning_rate": 0.0009152884154234145, "loss": 0.9451, "step": 220 }, { "epoch": 0.2757352941176471, "grad_norm": 0.1118694290603578, "learning_rate": 0.0009092347655410818, "loss": 0.9403, "step": 225 }, { "epoch": 0.2818627450980392, "grad_norm": 0.1420082600855828, "learning_rate": 0.0009029937009897176, "loss": 0.9349, "step": 230 }, { "epoch": 0.28799019607843135, "grad_norm": 0.1056893276215326, "learning_rate": 0.0008965680799503608, "loss": 0.9329, "step": 235 }, { "epoch": 0.29411764705882354, "grad_norm": 0.11697165985204966, "learning_rate": 0.0008899608451242233, "loss": 0.9396, "step": 240 }, { "epoch": 0.3002450980392157, "grad_norm": 0.11807737475048682, "learning_rate": 0.0008831750223850389, "loss": 0.923, "step": 245 }, { "epoch": 0.30637254901960786, "grad_norm": 0.1050901075842651, "learning_rate": 0.0008762137193933241, "loss": 0.9296, "step": 250 }, { "epoch": 0.3125, "grad_norm": 0.12149544868604345, "learning_rate": 0.0008690801241731818, "loss": 0.9209, "step": 255 }, { "epoch": 0.31862745098039214, "grad_norm": 1.9072328081474224, "learning_rate": 0.0008617775036523015, "loss": 0.9392, "step": 260 }, { "epoch": 0.3247549019607843, "grad_norm": 0.11658317470657904, "learning_rate": 0.0008543092021658259, "loss": 0.9367, "step": 265 }, { "epoch": 0.33088235294117646, "grad_norm": 0.11777705186781876, "learning_rate": 0.0008466786399247663, "loss": 0.9285, "step": 270 }, { "epoch": 0.33700980392156865, "grad_norm": 0.10974224954880234, "learning_rate": 0.0008388893114496705, "loss": 0.9357, "step": 275 }, { "epoch": 0.3431372549019608, "grad_norm": 0.10762311122261868, "learning_rate": 0.0008309447839702582, "loss": 0.9303, "step": 280 }, { "epoch": 0.3492647058823529, "grad_norm": 0.10853358544847327, "learning_rate": 0.0008228486957917607, "loss": 0.9222, "step": 285 }, { "epoch": 0.3553921568627451, "grad_norm": 0.11469387675689356, "learning_rate": 0.0008146047546287076, "loss": 0.9343, "step": 290 }, { "epoch": 0.36151960784313725, "grad_norm": 0.11795665158917668, "learning_rate": 0.0008062167359069301, "loss": 0.9277, "step": 295 }, { "epoch": 0.36764705882352944, "grad_norm": 0.11929317566114471, "learning_rate": 0.000797688481034551, "loss": 0.9176, "step": 300 }, { "epoch": 0.3737745098039216, "grad_norm": 0.10301604347512731, "learning_rate": 0.00078902389564276, "loss": 0.9239, "step": 305 }, { "epoch": 0.3799019607843137, "grad_norm": 0.1271379375111038, "learning_rate": 0.0007802269477971771, "loss": 0.9166, "step": 310 }, { "epoch": 0.3860294117647059, "grad_norm": 0.13842829612868068, "learning_rate": 0.0007713016661806211, "loss": 0.9162, "step": 315 }, { "epoch": 0.39215686274509803, "grad_norm": 0.1258803633770378, "learning_rate": 0.0007622521382481208, "loss": 0.9096, "step": 320 }, { "epoch": 0.39828431372549017, "grad_norm": 0.10943327109661027, "learning_rate": 0.0007530825083550073, "loss": 0.9031, "step": 325 }, { "epoch": 0.40441176470588236, "grad_norm": 0.10473608787205252, "learning_rate": 0.0007437969758589507, "loss": 0.9144, "step": 330 }, { "epoch": 0.4105392156862745, "grad_norm": 0.1120426574406447, "learning_rate": 0.0007343997931968067, "loss": 0.9073, "step": 335 }, { "epoch": 0.4166666666666667, "grad_norm": 0.10888401544172292, "learning_rate": 0.0007248952639371542, "loss": 0.9073, "step": 340 }, { "epoch": 0.4227941176470588, "grad_norm": 0.11326294156455767, "learning_rate": 0.0007152877408094178, "loss": 0.8996, "step": 345 }, { "epoch": 0.42892156862745096, "grad_norm": 0.12674079463497812, "learning_rate": 0.0007055816237104753, "loss": 0.9092, "step": 350 }, { "epoch": 0.43504901960784315, "grad_norm": 0.12252284768767446, "learning_rate": 0.0006957813576896647, "loss": 0.8988, "step": 355 }, { "epoch": 0.4411764705882353, "grad_norm": 0.12424572403107578, "learning_rate": 0.000685891430913113, "loss": 0.9088, "step": 360 }, { "epoch": 0.44730392156862747, "grad_norm": 0.11202550868881908, "learning_rate": 0.0006759163726083191, "loss": 0.9002, "step": 365 }, { "epoch": 0.4534313725490196, "grad_norm": 0.09998982889163562, "learning_rate": 0.0006658607509899319, "loss": 0.8993, "step": 370 }, { "epoch": 0.45955882352941174, "grad_norm": 0.11977953776420541, "learning_rate": 0.0006557291711676738, "loss": 0.9062, "step": 375 }, { "epoch": 0.46568627450980393, "grad_norm": 0.11102474447162053, "learning_rate": 0.0006455262730373672, "loss": 0.8898, "step": 380 }, { "epoch": 0.47181372549019607, "grad_norm": 0.12262996603961465, "learning_rate": 0.0006352567291560318, "loss": 0.8945, "step": 385 }, { "epoch": 0.47794117647058826, "grad_norm": 0.11193151635262173, "learning_rate": 0.0006249252426020216, "loss": 0.8974, "step": 390 }, { "epoch": 0.4840686274509804, "grad_norm": 0.11911248377352072, "learning_rate": 0.0006145365448211866, "loss": 0.8995, "step": 395 }, { "epoch": 0.49019607843137253, "grad_norm": 0.11024344557839909, "learning_rate": 0.0006040953934600423, "loss": 0.8919, "step": 400 }, { "epoch": 0.4963235294117647, "grad_norm": 0.10313300838358162, "learning_rate": 0.0005936065701869403, "loss": 0.8965, "step": 405 }, { "epoch": 0.5024509803921569, "grad_norm": 0.1517513971243366, "learning_rate": 0.0005830748785022368, "loss": 0.8951, "step": 410 }, { "epoch": 0.508578431372549, "grad_norm": 0.10621777821428764, "learning_rate": 0.0005725051415384657, "loss": 0.9009, "step": 415 }, { "epoch": 0.5147058823529411, "grad_norm": 0.120824242008392, "learning_rate": 0.0005619021998515165, "loss": 0.8916, "step": 420 }, { "epoch": 0.5208333333333334, "grad_norm": 0.10796312687200485, "learning_rate": 0.000551270909203838, "loss": 0.8875, "step": 425 }, { "epoch": 0.5269607843137255, "grad_norm": 0.10485643705406462, "learning_rate": 0.0005406161383406731, "loss": 0.8995, "step": 430 }, { "epoch": 0.5330882352941176, "grad_norm": 0.1040747798660248, "learning_rate": 0.0005299427667603515, "loss": 0.9022, "step": 435 }, { "epoch": 0.5392156862745098, "grad_norm": 0.10303234276114956, "learning_rate": 0.0005192556824796568, "loss": 0.8858, "step": 440 }, { "epoch": 0.5453431372549019, "grad_norm": 0.13041962513060196, "learning_rate": 0.0005085597797952905, "loss": 0.8842, "step": 445 }, { "epoch": 0.5514705882352942, "grad_norm": 0.09392051916112838, "learning_rate": 0.0004978599570424639, "loss": 0.8832, "step": 450 }, { "epoch": 0.5575980392156863, "grad_norm": 0.11180233058561544, "learning_rate": 0.0004871611143516367, "loss": 0.8878, "step": 455 }, { "epoch": 0.5637254901960784, "grad_norm": 0.12367570385780484, "learning_rate": 0.0004764681514044362, "loss": 0.8859, "step": 460 }, { "epoch": 0.5698529411764706, "grad_norm": 0.10034153908219615, "learning_rate": 0.0004657859651897806, "loss": 0.8889, "step": 465 }, { "epoch": 0.5759803921568627, "grad_norm": 0.09279117510206411, "learning_rate": 0.00045511944776123513, "loss": 0.878, "step": 470 }, { "epoch": 0.5821078431372549, "grad_norm": 0.09436846491514878, "learning_rate": 0.00044447348399663056, "loss": 0.8842, "step": 475 }, { "epoch": 0.5882352941176471, "grad_norm": 0.2016423837068627, "learning_rate": 0.0004338529493609647, "loss": 0.8815, "step": 480 }, { "epoch": 0.5943627450980392, "grad_norm": 0.1280806078271419, "learning_rate": 0.00042326270767361815, "loss": 0.8877, "step": 485 }, { "epoch": 0.6004901960784313, "grad_norm": 0.08986898470466548, "learning_rate": 0.00041270760888089997, "loss": 0.8819, "step": 490 }, { "epoch": 0.6066176470588235, "grad_norm": 0.0982311145214648, "learning_rate": 0.00040219248683494925, "loss": 0.8629, "step": 495 }, { "epoch": 0.6127450980392157, "grad_norm": 0.12205296661938488, "learning_rate": 0.0003917221570800065, "loss": 0.8713, "step": 500 }, { "epoch": 0.6188725490196079, "grad_norm": 0.10671335419272648, "learning_rate": 0.000381301414647068, "loss": 0.8703, "step": 505 }, { "epoch": 0.625, "grad_norm": 0.10436549804544415, "learning_rate": 0.0003709350318579371, "loss": 0.8929, "step": 510 }, { "epoch": 0.6311274509803921, "grad_norm": 0.09117916773772033, "learning_rate": 0.0003606277561396726, "loss": 0.8591, "step": 515 }, { "epoch": 0.6372549019607843, "grad_norm": 0.09178929014053801, "learning_rate": 0.00035038430785044053, "loss": 0.8625, "step": 520 }, { "epoch": 0.6433823529411765, "grad_norm": 0.09612760306153695, "learning_rate": 0.00034020937811776156, "loss": 0.8594, "step": 525 }, { "epoch": 0.6495098039215687, "grad_norm": 0.09538202932858882, "learning_rate": 0.00033010762669014347, "loss": 0.867, "step": 530 }, { "epoch": 0.6556372549019608, "grad_norm": 0.09494219161832793, "learning_rate": 0.00032008367980308734, "loss": 0.872, "step": 535 }, { "epoch": 0.6617647058823529, "grad_norm": 0.08714205666482473, "learning_rate": 0.0003101421280604379, "loss": 0.8838, "step": 540 }, { "epoch": 0.6678921568627451, "grad_norm": 0.0969140933539997, "learning_rate": 0.00030028752433205476, "loss": 0.8608, "step": 545 }, { "epoch": 0.6740196078431373, "grad_norm": 0.09815891195004724, "learning_rate": 0.00029052438166876307, "loss": 0.8525, "step": 550 }, { "epoch": 0.6801470588235294, "grad_norm": 0.09615743129166938, "learning_rate": 0.0002808571712355389, "loss": 0.8638, "step": 555 }, { "epoch": 0.6862745098039216, "grad_norm": 0.10788692448970114, "learning_rate": 0.00027129032026388045, "loss": 0.8579, "step": 560 }, { "epoch": 0.6924019607843137, "grad_norm": 0.09213201587158737, "learning_rate": 0.00026182821002429345, "loss": 0.8615, "step": 565 }, { "epoch": 0.6985294117647058, "grad_norm": 0.09011406445898068, "learning_rate": 0.00025247517381983136, "loss": 0.8653, "step": 570 }, { "epoch": 0.7046568627450981, "grad_norm": 0.09736667368082612, "learning_rate": 0.00024323549500159802, "loss": 0.8617, "step": 575 }, { "epoch": 0.7107843137254902, "grad_norm": 0.09441622008962705, "learning_rate": 0.0002341134050071283, "loss": 0.8549, "step": 580 }, { "epoch": 0.7169117647058824, "grad_norm": 0.10000573346346127, "learning_rate": 0.00022511308142254488, "loss": 0.8575, "step": 585 }, { "epoch": 0.7230392156862745, "grad_norm": 0.1122829077347512, "learning_rate": 0.000216238646069373, "loss": 0.8604, "step": 590 }, { "epoch": 0.7291666666666666, "grad_norm": 0.09161355750906706, "learning_rate": 0.00020749416311689845, "loss": 0.8604, "step": 595 }, { "epoch": 0.7352941176470589, "grad_norm": 0.10349631376405924, "learning_rate": 0.00019888363722092372, "loss": 0.8629, "step": 600 }, { "epoch": 0.741421568627451, "grad_norm": 0.088451467518437, "learning_rate": 0.00019041101168978093, "loss": 0.8587, "step": 605 }, { "epoch": 0.7475490196078431, "grad_norm": 0.09610692111696861, "learning_rate": 0.00018208016667844152, "loss": 0.8613, "step": 610 }, { "epoch": 0.7536764705882353, "grad_norm": 0.09635232236992683, "learning_rate": 0.00017389491741154372, "loss": 0.8541, "step": 615 }, { "epoch": 0.7598039215686274, "grad_norm": 0.08400718304881724, "learning_rate": 0.00016585901243616042, "loss": 0.8564, "step": 620 }, { "epoch": 0.7659313725490197, "grad_norm": 0.08527935145250837, "learning_rate": 0.0001579761319050991, "loss": 0.8545, "step": 625 }, { "epoch": 0.7720588235294118, "grad_norm": 0.08535848929061582, "learning_rate": 0.00015024988589152537, "loss": 0.858, "step": 630 }, { "epoch": 0.7781862745098039, "grad_norm": 0.08525484362176303, "learning_rate": 0.0001426838127356823, "loss": 0.8538, "step": 635 }, { "epoch": 0.7843137254901961, "grad_norm": 0.08278907118602048, "learning_rate": 0.0001352813774244565, "loss": 0.8488, "step": 640 }, { "epoch": 0.7904411764705882, "grad_norm": 0.08784154420560207, "learning_rate": 0.00012804597000454215, "loss": 0.8556, "step": 645 }, { "epoch": 0.7965686274509803, "grad_norm": 0.08728645873334986, "learning_rate": 0.00012098090402992085, "loss": 0.8662, "step": 650 }, { "epoch": 0.8026960784313726, "grad_norm": 0.08368590385119791, "learning_rate": 0.00011408941504437532, "loss": 0.8541, "step": 655 }, { "epoch": 0.8088235294117647, "grad_norm": 0.12573965965935716, "learning_rate": 0.00010737465909972776, "loss": 0.8472, "step": 660 }, { "epoch": 0.8149509803921569, "grad_norm": 0.08047884146311494, "learning_rate": 0.00010083971131048159, "loss": 0.8492, "step": 665 }, { "epoch": 0.821078431372549, "grad_norm": 0.07946237543030905, "learning_rate": 9.448756444553224e-05, "loss": 0.8503, "step": 670 }, { "epoch": 0.8272058823529411, "grad_norm": 0.08237915177176067, "learning_rate": 8.832112755758598e-05, "loss": 0.848, "step": 675 }, { "epoch": 0.8333333333333334, "grad_norm": 0.08064567161653642, "learning_rate": 8.234322465092047e-05, "loss": 0.8489, "step": 680 }, { "epoch": 0.8394607843137255, "grad_norm": 0.08580518309504229, "learning_rate": 7.655659338809329e-05, "loss": 0.8481, "step": 685 }, { "epoch": 0.8455882352941176, "grad_norm": 0.08240105778083462, "learning_rate": 7.096388383619079e-05, "loss": 0.8434, "step": 690 }, { "epoch": 0.8517156862745098, "grad_norm": 0.10667272643643334, "learning_rate": 6.556765725319525e-05, "loss": 0.8477, "step": 695 }, { "epoch": 0.8578431372549019, "grad_norm": 0.09351829478479391, "learning_rate": 6.037038491501978e-05, "loss": 0.8521, "step": 700 }, { "epoch": 0.8639705882352942, "grad_norm": 0.07805110075360967, "learning_rate": 5.53744469837551e-05, "loss": 0.843, "step": 705 }, { "epoch": 0.8700980392156863, "grad_norm": 0.08112250711019013, "learning_rate": 5.058213141764151e-05, "loss": 0.8434, "step": 710 }, { "epoch": 0.8762254901960784, "grad_norm": 0.09988697539615195, "learning_rate": 4.599563292326592e-05, "loss": 0.8483, "step": 715 }, { "epoch": 0.8823529411764706, "grad_norm": 0.08158094478141424, "learning_rate": 4.161705195046761e-05, "loss": 0.8441, "step": 720 }, { "epoch": 0.8884803921568627, "grad_norm": 0.08831427591672994, "learning_rate": 3.744839373040682e-05, "loss": 0.8465, "step": 725 }, { "epoch": 0.8946078431372549, "grad_norm": 0.08621255370586131, "learning_rate": 3.349156735724274e-05, "loss": 0.8478, "step": 730 }, { "epoch": 0.9007352941176471, "grad_norm": 0.07423138411019962, "learning_rate": 2.9748384913837522e-05, "loss": 0.8345, "step": 735 }, { "epoch": 0.9068627450980392, "grad_norm": 0.0795306653386215, "learning_rate": 2.622056064188738e-05, "loss": 0.8537, "step": 740 }, { "epoch": 0.9129901960784313, "grad_norm": 0.07964808233910256, "learning_rate": 2.2909710156863274e-05, "loss": 0.8512, "step": 745 }, { "epoch": 0.9191176470588235, "grad_norm": 0.08073502455884987, "learning_rate": 1.981734970811644e-05, "loss": 0.8415, "step": 750 }, { "epoch": 0.9252450980392157, "grad_norm": 0.07585958934986796, "learning_rate": 1.6944895484492072e-05, "loss": 0.8521, "step": 755 }, { "epoch": 0.9313725490196079, "grad_norm": 0.08108459367268084, "learning_rate": 1.429366296576623e-05, "loss": 0.851, "step": 760 }, { "epoch": 0.9375, "grad_norm": 0.08450154778950293, "learning_rate": 1.1864866320203115e-05, "loss": 0.8478, "step": 765 }, { "epoch": 0.9436274509803921, "grad_norm": 0.08368284039729793, "learning_rate": 9.659617848510882e-06, "loss": 0.8447, "step": 770 }, { "epoch": 0.9497549019607843, "grad_norm": 0.08046653108562293, "learning_rate": 7.678927474447817e-06, "loss": 0.8443, "step": 775 }, { "epoch": 0.9558823529411765, "grad_norm": 0.08163069514122503, "learning_rate": 5.923702282314092e-06, "loss": 0.8467, "step": 780 }, { "epoch": 0.9620098039215687, "grad_norm": 0.07894532908798362, "learning_rate": 4.394746101540115e-06, "loss": 0.8421, "step": 785 }, { "epoch": 0.9681372549019608, "grad_norm": 0.08066759347765237, "learning_rate": 3.092759138561607e-06, "loss": 0.8403, "step": 790 }, { "epoch": 0.9742647058823529, "grad_norm": 0.08225885021056388, "learning_rate": 2.018337656150726e-06, "loss": 0.8459, "step": 795 }, { "epoch": 0.9803921568627451, "grad_norm": 0.0939601026942546, "learning_rate": 1.1719737003492159e-06, "loss": 0.8385, "step": 800 }, { "epoch": 0.9865196078431373, "grad_norm": 0.07754259049318248, "learning_rate": 5.540548751292173e-07, "loss": 0.8343, "step": 805 }, { "epoch": 0.9926470588235294, "grad_norm": 0.08221470163380953, "learning_rate": 1.6486416488459277e-07, "loss": 0.8475, "step": 810 }, { "epoch": 0.9987745098039216, "grad_norm": 0.08134157509220469, "learning_rate": 4.579804834703438e-09, "loss": 0.8399, "step": 815 }, { "epoch": 1.0, "eval_loss": 1.202902913093567, "eval_runtime": 113.99, "eval_samples_per_second": 183.709, "eval_steps_per_second": 5.746, "step": 816 }, { "epoch": 1.0, "step": 816, "total_flos": 80063181619200.0, "train_loss": 0.9145085595402063, "train_runtime": 1902.7506, "train_samples_per_second": 54.885, "train_steps_per_second": 0.429 } ], "logging_steps": 5, "max_steps": 816, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 80063181619200.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }