{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 1208, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.026490066225165563, "grad_norm": 0.12706246972084045, "learning_rate": 0.0004324324324324325, "loss": 4.2683, "step": 16 }, { "epoch": 0.052980132450331126, "grad_norm": 0.1603141874074936, "learning_rate": 0.000864864864864865, "loss": 4.2333, "step": 32 }, { "epoch": 0.07947019867549669, "grad_norm": 0.11292944103479385, "learning_rate": 0.0009997822892796068, "loss": 4.2693, "step": 48 }, { "epoch": 0.10596026490066225, "grad_norm": 0.126583069562912, "learning_rate": 0.000998688816161266, "loss": 4.2745, "step": 64 }, { "epoch": 0.13245033112582782, "grad_norm": 0.1791575849056244, "learning_rate": 0.0009966766110013582, "loss": 4.2707, "step": 80 }, { "epoch": 0.15894039735099338, "grad_norm": 0.16615551710128784, "learning_rate": 0.0009937493808759087, "loss": 4.3332, "step": 96 }, { "epoch": 0.18543046357615894, "grad_norm": 0.17333188652992249, "learning_rate": 0.0009899125186070988, "loss": 4.2671, "step": 112 }, { "epoch": 0.2119205298013245, "grad_norm": 0.14814811944961548, "learning_rate": 0.0009851730928280944, "loss": 4.2474, "step": 128 }, { "epoch": 0.23841059602649006, "grad_norm": 0.16701027750968933, "learning_rate": 0.0009795398349605373, "loss": 4.248, "step": 144 }, { "epoch": 0.26490066225165565, "grad_norm": 0.11271824687719345, "learning_rate": 0.0009730231231286876, "loss": 4.2506, "step": 160 }, { "epoch": 0.2913907284768212, "grad_norm": 0.10354705899953842, "learning_rate": 0.0009656349630398554, "loss": 4.2424, "step": 176 }, { "epoch": 0.31788079470198677, "grad_norm": 0.1165025606751442, "learning_rate": 0.0009573889658663424, "loss": 4.2973, "step": 192 }, { "epoch": 0.3443708609271523, "grad_norm": 0.1250074803829193, "learning_rate": 0.0009483003231696446, "loss": 4.2571, "step": 208 }, { "epoch": 0.3708609271523179, "grad_norm": 0.13243281841278076, "learning_rate": 0.0009383857789131097, "loss": 4.2659, "step": 224 }, { "epoch": 0.3973509933774834, "grad_norm": 0.10707055032253265, "learning_rate": 0.0009276635986146136, "loss": 4.2331, "step": 240 }, { "epoch": 0.423841059602649, "grad_norm": 0.12676522135734558, "learning_rate": 0.0009161535356960828, "loss": 4.2585, "step": 256 }, { "epoch": 0.4503311258278146, "grad_norm": 0.10248049348592758, "learning_rate": 0.0009038767950918592, "loss": 4.2375, "step": 272 }, { "epoch": 0.4768211920529801, "grad_norm": 0.16809116303920746, "learning_rate": 0.0008908559941829497, "loss": 4.2249, "step": 288 }, { "epoch": 0.5033112582781457, "grad_norm": 0.11593299359083176, "learning_rate": 0.0008771151211291332, "loss": 4.2411, "step": 304 }, { "epoch": 0.5298013245033113, "grad_norm": 0.11452265083789825, "learning_rate": 0.0008626794906756866, "loss": 4.2158, "step": 320 }, { "epoch": 0.5562913907284768, "grad_norm": 0.14648571610450745, "learning_rate": 0.0008475756975161504, "loss": 4.2718, "step": 336 }, { "epoch": 0.5827814569536424, "grad_norm": 0.11873907595872879, "learning_rate": 0.00083183156729705, "loss": 4.2431, "step": 352 }, { "epoch": 0.609271523178808, "grad_norm": 0.13245505094528198, "learning_rate": 0.0008154761053548404, "loss": 4.2546, "step": 368 }, { "epoch": 0.6357615894039735, "grad_norm": 0.12086515128612518, "learning_rate": 0.000798539443279511, "loss": 4.2042, "step": 384 }, { "epoch": 0.6622516556291391, "grad_norm": 0.09804444760084152, "learning_rate": 0.0007810527834033009, "loss": 4.2046, "step": 400 }, { "epoch": 0.6887417218543046, "grad_norm": 0.12007380276918411, "learning_rate": 0.00076304834131679, "loss": 4.2311, "step": 416 }, { "epoch": 0.7152317880794702, "grad_norm": 0.12332040816545486, "learning_rate": 0.0007445592865182695, "loss": 4.2304, "step": 432 }, { "epoch": 0.7417218543046358, "grad_norm": 0.12303052097558975, "learning_rate": 0.0007256196813057318, "loss": 4.2351, "step": 448 }, { "epoch": 0.7682119205298014, "grad_norm": 0.09435882419347763, "learning_rate": 0.0007062644180240614, "loss": 4.1903, "step": 464 }, { "epoch": 0.7947019867549668, "grad_norm": 0.12024106830358505, "learning_rate": 0.0006865291547830324, "loss": 4.2468, "step": 480 }, { "epoch": 0.8211920529801324, "grad_norm": 0.09838169813156128, "learning_rate": 0.000666450249764542, "loss": 4.1978, "step": 496 }, { "epoch": 0.847682119205298, "grad_norm": 0.10983674973249435, "learning_rate": 0.0006460646942401058, "loss": 4.2443, "step": 512 }, { "epoch": 0.8741721854304636, "grad_norm": 0.09954366832971573, "learning_rate": 0.0006254100444220115, "loss": 4.227, "step": 528 }, { "epoch": 0.9006622516556292, "grad_norm": 0.12015046179294586, "learning_rate": 0.0006045243522736885, "loss": 4.2154, "step": 544 }, { "epoch": 0.9271523178807947, "grad_norm": 0.10851255804300308, "learning_rate": 0.0005834460954067559, "loss": 4.242, "step": 560 }, { "epoch": 0.9536423841059603, "grad_norm": 0.0975833460688591, "learning_rate": 0.0005622141061939006, "loss": 4.2135, "step": 576 }, { "epoch": 0.9801324503311258, "grad_norm": 0.10621072351932526, "learning_rate": 0.0005408675002281818, "loss": 4.2932, "step": 592 }, { "epoch": 1.0, "eval_bleu": 0.1135024238637086, "eval_cap_loss": 1.2033403850351738, "eval_con_loss": 1.786089795906812, "eval_loss": 2.9894301812380353, "step": 604 }, { "epoch": 1.0, "eval_bleu": 0.1135024238637086, "eval_cap_loss": 1.2033403850351738, "eval_con_loss": 1.786089795906812, "eval_loss": 2.9894301812380353, "eval_runtime": 416.7065, "eval_samples_per_second": 11.588, "eval_steps_per_second": 1.449, "step": 604 }, { "epoch": 1.0066225165562914, "grad_norm": 0.13524451851844788, "learning_rate": 0.0005194456042605587, "loss": 4.1897, "step": 608 }, { "epoch": 1.033112582781457, "grad_norm": 0.09372173994779587, "learning_rate": 0.0004979878837484043, "loss": 4.2234, "step": 624 }, { "epoch": 1.0596026490066226, "grad_norm": 0.10636495053768158, "learning_rate": 0.00047653387014848014, "loss": 4.2304, "step": 640 }, { "epoch": 1.086092715231788, "grad_norm": 0.1295643150806427, "learning_rate": 0.0004551230880883208, "loss": 4.2388, "step": 656 }, { "epoch": 1.1125827814569536, "grad_norm": 0.10932028293609619, "learning_rate": 0.00043379498255020037, "loss": 4.2505, "step": 672 }, { "epoch": 1.1390728476821192, "grad_norm": 0.11317116022109985, "learning_rate": 0.00041258884620182804, "loss": 4.2256, "step": 688 }, { "epoch": 1.1655629139072847, "grad_norm": 0.15960782766342163, "learning_rate": 0.00039154374700765316, "loss": 4.2248, "step": 704 }, { "epoch": 1.1920529801324504, "grad_norm": 0.14448010921478271, "learning_rate": 0.00037069845625413954, "loss": 4.248, "step": 720 }, { "epoch": 1.218543046357616, "grad_norm": 0.12560078501701355, "learning_rate": 0.0003500913771216081, "loss": 4.2264, "step": 736 }, { "epoch": 1.2450331125827814, "grad_norm": 0.15087343752384186, "learning_rate": 0.0003297604739342396, "loss": 4.2582, "step": 752 }, { "epoch": 1.271523178807947, "grad_norm": 0.14385798573493958, "learning_rate": 0.00030974320221858066, "loss": 4.2413, "step": 768 }, { "epoch": 1.2980132450331126, "grad_norm": 0.12157344818115234, "learning_rate": 0.0002900764396994049, "loss": 4.2285, "step": 784 }, { "epoch": 1.3245033112582782, "grad_norm": 0.14533455669879913, "learning_rate": 0.00027079641836005473, "loss": 4.2719, "step": 800 }, { "epoch": 1.3509933774834437, "grad_norm": 0.11581210792064667, "learning_rate": 0.0002519386576924303, "loss": 4.2191, "step": 816 }, { "epoch": 1.3774834437086092, "grad_norm": 0.11933822929859161, "learning_rate": 0.0002335378992595995, "loss": 4.1929, "step": 832 }, { "epoch": 1.403973509933775, "grad_norm": 0.13587485253810883, "learning_rate": 0.0002156280426915786, "loss": 4.2147, "step": 848 }, { "epoch": 1.4304635761589404, "grad_norm": 0.11555938422679901, "learning_rate": 0.00019824208323220656, "loss": 4.2268, "step": 864 }, { "epoch": 1.4569536423841059, "grad_norm": 0.12277619540691376, "learning_rate": 0.00018141205095216294, "loss": 4.2261, "step": 880 }, { "epoch": 1.4834437086092715, "grad_norm": 0.11538510024547577, "learning_rate": 0.00016516895174012043, "loss": 4.1956, "step": 896 }, { "epoch": 1.5099337748344372, "grad_norm": 0.10595931112766266, "learning_rate": 0.00014954271018074368, "loss": 4.2703, "step": 912 }, { "epoch": 1.5364238410596025, "grad_norm": 0.12333059310913086, "learning_rate": 0.00013456211442476813, "loss": 4.1822, "step": 928 }, { "epoch": 1.5629139072847682, "grad_norm": 0.10703016817569733, "learning_rate": 0.00012025476315272743, "loss": 4.2289, "step": 944 }, { "epoch": 1.589403973509934, "grad_norm": 0.11507616937160492, "learning_rate": 0.00010664701473003396, "loss": 4.2223, "step": 960 }, { "epoch": 1.6158940397350994, "grad_norm": 0.09708770364522934, "learning_rate": 9.376393864708821e-05, "loss": 4.2223, "step": 976 }, { "epoch": 1.6423841059602649, "grad_norm": 0.11372318863868713, "learning_rate": 8.162926933387499e-05, "loss": 4.2433, "step": 992 }, { "epoch": 1.6688741721854305, "grad_norm": 0.16325490176677704, "learning_rate": 7.026536243413539e-05, "loss": 4.2427, "step": 1008 }, { "epoch": 1.695364238410596, "grad_norm": 0.10167232900857925, "learning_rate": 5.969315361967087e-05, "loss": 4.1995, "step": 1024 }, { "epoch": 1.7218543046357615, "grad_norm": 0.11737249046564102, "learning_rate": 4.9932120020654116e-05, "loss": 4.2522, "step": 1040 }, { "epoch": 1.7483443708609272, "grad_norm": 0.10857795923948288, "learning_rate": 4.100024434300437e-05, "loss": 4.1555, "step": 1056 }, { "epoch": 1.7748344370860927, "grad_norm": 0.11734642088413239, "learning_rate": 3.2913981738933395e-05, "loss": 4.208, "step": 1072 }, { "epoch": 1.8013245033112582, "grad_norm": 0.10702993720769882, "learning_rate": 2.5688229491697356e-05, "loss": 4.2835, "step": 1088 }, { "epoch": 1.8278145695364238, "grad_norm": 0.12669378519058228, "learning_rate": 1.9336299570401396e-05, "loss": 4.2288, "step": 1104 }, { "epoch": 1.8543046357615895, "grad_norm": 0.0894075259566307, "learning_rate": 1.3869894105423109e-05, "loss": 4.2051, "step": 1120 }, { "epoch": 1.8807947019867548, "grad_norm": 0.09530144929885864, "learning_rate": 9.299083829632516e-06, "loss": 4.2449, "step": 1136 }, { "epoch": 1.9072847682119205, "grad_norm": 0.13094228506088257, "learning_rate": 5.632289525129064e-06, "loss": 4.2017, "step": 1152 }, { "epoch": 1.9337748344370862, "grad_norm": 0.12416987866163254, "learning_rate": 2.8762665096744854e-06, "loss": 4.1943, "step": 1168 }, { "epoch": 1.9602649006622517, "grad_norm": 0.10805880278348923, "learning_rate": 1.036092191402882e-06, "loss": 4.1936, "step": 1184 }, { "epoch": 1.9867549668874172, "grad_norm": 0.11822624504566193, "learning_rate": 1.1515671473599775e-07, "loss": 4.2833, "step": 1200 }, { "epoch": 2.0, "eval_bleu": 0.1159956729511227, "eval_cap_loss": 1.1952345237037203, "eval_con_loss": 1.7797789394066035, "eval_loss": 2.975013460544561, "step": 1208 }, { "epoch": 2.0, "eval_bleu": 0.1159956729511227, "eval_cap_loss": 1.1952345237037203, "eval_con_loss": 1.7797789394066035, "eval_loss": 2.975013460544561, "eval_runtime": 297.535, "eval_samples_per_second": 16.23, "eval_steps_per_second": 2.03, "step": 1208 } ], "logging_steps": 16, "max_steps": 1208, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null, "tau_value": 5.2037 }