{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 1208,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.026490066225165563,
      "grad_norm": 0.13324208557605743,
      "learning_rate": 0.0002191780821917808,
      "loss": 4.2964,
      "step": 16
    },
    {
      "epoch": 0.052980132450331126,
      "grad_norm": 0.1445072889328003,
      "learning_rate": 0.0004383561643835616,
      "loss": 4.2649,
      "step": 32
    },
    {
      "epoch": 0.07947019867549669,
      "grad_norm": 0.14529070258140564,
      "learning_rate": 0.0006575342465753425,
      "loss": 4.3014,
      "step": 48
    },
    {
      "epoch": 0.10596026490066225,
      "grad_norm": 0.2007322758436203,
      "learning_rate": 0.0008767123287671232,
      "loss": 4.3097,
      "step": 64
    },
    {
      "epoch": 0.13245033112582782,
      "grad_norm": NaN,
      "learning_rate": 0.0009999959548269918,
      "loss": 4.3854,
      "step": 80
    },
    {
      "epoch": 0.15894039735099338,
      "grad_norm": 0.2345261424779892,
      "learning_rate": 0.0009998377521727646,
      "loss": 4.4083,
      "step": 96
    },
    {
      "epoch": 0.18543046357615894,
      "grad_norm": 0.14606653153896332,
      "learning_rate": 0.0009994495073085382,
      "loss": 4.3126,
      "step": 112
    },
    {
      "epoch": 0.2119205298013245,
      "grad_norm": 0.12169423699378967,
      "learning_rate": 0.0009988313989177076,
      "loss": 4.2892,
      "step": 128
    },
    {
      "epoch": 0.23841059602649006,
      "grad_norm": 0.21712887287139893,
      "learning_rate": 0.0009979837114746243,
      "loss": 4.2891,
      "step": 144
    },
    {
      "epoch": 0.26490066225165565,
      "grad_norm": 0.09952688217163086,
      "learning_rate": 0.0009969068351136706,
      "loss": 4.2913,
      "step": 160
    },
    {
      "epoch": 0.2913907284768212,
      "grad_norm": 0.1054968312382698,
      "learning_rate": 0.0009956012654497074,
      "loss": 4.2794,
      "step": 176
    },
    {
      "epoch": 0.31788079470198677,
      "grad_norm": 0.11566981673240662,
      "learning_rate": 0.000994067603349975,
      "loss": 4.3372,
      "step": 192
    },
    {
      "epoch": 0.3443708609271523,
      "grad_norm": 0.11426500976085663,
      "learning_rate": 0.0009923065546575544,
      "loss": 4.2932,
      "step": 208
    },
    {
      "epoch": 0.3708609271523179,
      "grad_norm": 0.1628023236989975,
      "learning_rate": 0.000990318929866513,
      "loss": 4.3015,
      "step": 224
    },
    {
      "epoch": 0.3973509933774834,
      "grad_norm": 0.1205693706870079,
      "learning_rate": 0.00098810564374889,
      "loss": 4.2702,
      "step": 240
    },
    {
      "epoch": 0.423841059602649,
      "grad_norm": 0.10916964709758759,
      "learning_rate": 0.0009856677149336858,
      "loss": 4.2937,
      "step": 256
    },
    {
      "epoch": 0.4503311258278146,
      "grad_norm": 0.1388256847858429,
      "learning_rate": 0.0009830062654380548,
      "loss": 4.2742,
      "step": 272
    },
    {
      "epoch": 0.4768211920529801,
      "grad_norm": 0.17313151061534882,
      "learning_rate": 0.0009801225201509158,
      "loss": 4.2593,
      "step": 288
    },
    {
      "epoch": 0.5033112582781457,
      "grad_norm": 0.13390372693538666,
      "learning_rate": 0.0009770178062692165,
      "loss": 4.2749,
      "step": 304
    },
    {
      "epoch": 0.5298013245033113,
      "grad_norm": 0.1548251509666443,
      "learning_rate": 0.000973693552687112,
      "loss": 4.2509,
      "step": 320
    },
    {
      "epoch": 0.5562913907284768,
      "grad_norm": 0.13545338809490204,
      "learning_rate": 0.0009701512893383407,
      "loss": 4.308,
      "step": 336
    },
    {
      "epoch": 0.5827814569536424,
      "grad_norm": 0.12648995220661163,
      "learning_rate": 0.0009663926464920958,
      "loss": 4.2801,
      "step": 352
    },
    {
      "epoch": 0.609271523178808,
      "grad_norm": 0.13534340262413025,
      "learning_rate": 0.0009624193540027217,
      "loss": 4.2898,
      "step": 368
    },
    {
      "epoch": 0.6357615894039735,
      "grad_norm": 0.12199301272630692,
      "learning_rate": 0.0009582332405135758,
      "loss": 4.2394,
      "step": 384
    },
    {
      "epoch": 0.6622516556291391,
      "grad_norm": 0.12553353607654572,
      "learning_rate": 0.0009538362326154249,
      "loss": 4.2391,
      "step": 400
    },
    {
      "epoch": 0.6887417218543046,
      "grad_norm": 0.12340961396694183,
      "learning_rate": 0.0009492303539597636,
      "loss": 4.2676,
      "step": 416
    },
    {
      "epoch": 0.7152317880794702,
      "grad_norm": 0.10325583815574646,
      "learning_rate": 0.0009444177243274617,
      "loss": 4.2635,
      "step": 432
    },
    {
      "epoch": 0.7417218543046358,
      "grad_norm": 0.13559825718402863,
      "learning_rate": 0.0009394005586531688,
      "loss": 4.2688,
      "step": 448
    },
    {
      "epoch": 0.7682119205298014,
      "grad_norm": 0.10041986405849457,
      "learning_rate": 0.0009341811660059271,
      "loss": 4.2227,
      "step": 464
    },
    {
      "epoch": 0.7947019867549668,
      "grad_norm": 0.1226314976811409,
      "learning_rate": 0.0009287619485264596,
      "loss": 4.2819,
      "step": 480
    },
    {
      "epoch": 0.8211920529801324,
      "grad_norm": 0.1134864017367363,
      "learning_rate": 0.0009231454003216239,
      "loss": 4.2315,
      "step": 496
    },
    {
      "epoch": 0.847682119205298,
      "grad_norm": 0.1089298278093338,
      "learning_rate": 0.0009173341063165405,
      "loss": 4.2791,
      "step": 512
    },
    {
      "epoch": 0.8741721854304636,
      "grad_norm": 0.1100950837135315,
      "learning_rate": 0.0009113307410649221,
      "loss": 4.2643,
      "step": 528
    },
    {
      "epoch": 0.9006622516556292,
      "grad_norm": 0.11873114854097366,
      "learning_rate": 0.000905138067518154,
      "loss": 4.2508,
      "step": 544
    },
    {
      "epoch": 0.9271523178807947,
      "grad_norm": 0.10352523624897003,
      "learning_rate": 0.0008987589357536914,
      "loss": 4.277,
      "step": 560
    },
    {
      "epoch": 0.9536423841059603,
      "grad_norm": 0.0963631346821785,
      "learning_rate": 0.0008921962816633561,
      "loss": 4.2486,
      "step": 576
    },
    {
      "epoch": 0.9801324503311258,
      "grad_norm": 0.12682202458381653,
      "learning_rate": 0.000885453125602141,
      "loss": 4.3274,
      "step": 592
    },
    {
      "epoch": 1.0,
      "eval_bleu": 0.11091629800496128,
      "eval_cap_loss": 1.2195911861413362,
      "eval_con_loss": 1.8316854106274663,
      "eval_loss": 3.0512766001240306,
      "eval_runtime": 242.8774,
      "eval_samples_per_second": 19.882,
      "eval_steps_per_second": 2.487,
      "step": 604
    },
    {
      "epoch": 1.0066225165562914,
      "grad_norm": 0.16604916751384735,
      "learning_rate": 0.0008785325709981404,
      "loss": 4.226,
      "step": 608
    },
    {
      "epoch": 1.033112582781457,
      "grad_norm": 0.1470925211906433,
      "learning_rate": 0.0008714378029242477,
      "loss": 4.2582,
      "step": 624
    },
    {
      "epoch": 1.0596026490066226,
      "grad_norm": 0.09578409790992737,
      "learning_rate": 0.0008641720866322773,
      "loss": 4.2641,
      "step": 640
    },
    {
      "epoch": 1.086092715231788,
      "grad_norm": 0.13495096564292908,
      "learning_rate": 0.0008567387660501852,
      "loss": 4.2752,
      "step": 656
    },
    {
      "epoch": 1.1125827814569536,
      "grad_norm": 0.18220090866088867,
      "learning_rate": 0.000849141262243081,
      "loss": 4.2859,
      "step": 672
    },
    {
      "epoch": 1.1390728476821192,
      "grad_norm": 0.10595650970935822,
      "learning_rate": 0.0008413830718387375,
      "loss": 4.2599,
      "step": 688
    },
    {
      "epoch": 1.1655629139072847,
      "grad_norm": 0.2255912721157074,
      "learning_rate": 0.0008334677654183254,
      "loss": 4.2598,
      "step": 704
    },
    {
      "epoch": 1.1920529801324504,
      "grad_norm": 0.15248772501945496,
      "learning_rate": 0.0008253989858731106,
      "loss": 4.2828,
      "step": 720
    },
    {
      "epoch": 1.218543046357616,
      "grad_norm": 0.10478087514638901,
      "learning_rate": 0.0008171804467278729,
      "loss": 4.2615,
      "step": 736
    },
    {
      "epoch": 1.2450331125827814,
      "grad_norm": 0.13858726620674133,
      "learning_rate": 0.0008088159304318166,
      "loss": 4.2932,
      "step": 752
    },
    {
      "epoch": 1.271523178807947,
      "grad_norm": 0.13052597641944885,
      "learning_rate": 0.0008003092866177592,
      "loss": 4.2778,
      "step": 768
    },
    {
      "epoch": 1.2980132450331126,
      "grad_norm": 0.1585925668478012,
      "learning_rate": 0.0007916644303304013,
      "loss": 4.2626,
      "step": 784
    },
    {
      "epoch": 1.3245033112582782,
      "grad_norm": 0.15588369965553284,
      "learning_rate": 0.0007828853402244896,
      "loss": 4.3056,
      "step": 800
    },
    {
      "epoch": 1.3509933774834437,
      "grad_norm": 0.09688248485326767,
      "learning_rate": 0.0007739760567337073,
      "loss": 4.2525,
      "step": 816
    },
    {
      "epoch": 1.3774834437086092,
      "grad_norm": 0.12090156972408295,
      "learning_rate": 0.0007649406802111283,
      "loss": 4.223,
      "step": 832
    },
    {
      "epoch": 1.403973509933775,
      "grad_norm": 0.11892067641019821,
      "learning_rate": 0.0007557833690420974,
      "loss": 4.248,
      "step": 848
    },
    {
      "epoch": 1.4304635761589404,
      "grad_norm": 0.1257970631122589,
      "learning_rate": 0.0007465083377304009,
      "loss": 4.2604,
      "step": 864
    },
    {
      "epoch": 1.4569536423841059,
      "grad_norm": 0.1225869208574295,
      "learning_rate": 0.0007371198549586091,
      "loss": 4.2583,
      "step": 880
    },
    {
      "epoch": 1.4834437086092715,
      "grad_norm": 0.12114247679710388,
      "learning_rate": 0.000727622241623485,
      "loss": 4.2293,
      "step": 896
    },
    {
      "epoch": 1.5099337748344372,
      "grad_norm": 0.14327970147132874,
      "learning_rate": 0.0007180198688473614,
      "loss": 4.3047,
      "step": 912
    },
    {
      "epoch": 1.5364238410596025,
      "grad_norm": 0.12314064800739288,
      "learning_rate": 0.0007083171559664032,
      "loss": 4.2156,
      "step": 928
    },
    {
      "epoch": 1.5629139072847682,
      "grad_norm": 0.10841790586709976,
      "learning_rate": 0.0006985185684966791,
      "loss": 4.2602,
      "step": 944
    },
    {
      "epoch": 1.589403973509934,
      "grad_norm": 0.13060128688812256,
      "learning_rate": 0.0006886286160789805,
      "loss": 4.2554,
      "step": 960
    },
    {
      "epoch": 1.6158940397350994,
      "grad_norm": 0.10328269004821777,
      "learning_rate": 0.0006786518504033333,
      "loss": 4.2572,
      "step": 976
    },
    {
      "epoch": 1.6423841059602649,
      "grad_norm": 0.15084649622440338,
      "learning_rate": 0.0006685928631141552,
      "loss": 4.278,
      "step": 992
    },
    {
      "epoch": 1.6688741721854305,
      "grad_norm": 0.1322745680809021,
      "learning_rate": 0.0006584562836970271,
      "loss": 4.2747,
      "step": 1008
    },
    {
      "epoch": 1.695364238410596,
      "grad_norm": 0.09931815415620804,
      "learning_rate": 0.0006482467773480468,
      "loss": 4.2314,
      "step": 1024
    },
    {
      "epoch": 1.7218543046357615,
      "grad_norm": 0.12023808807134628,
      "learning_rate": 0.0006379690428267482,
      "loss": 4.2832,
      "step": 1040
    },
    {
      "epoch": 1.7483443708609272,
      "grad_norm": 0.12222672253847122,
      "learning_rate": 0.000627627810293574,
      "loss": 4.1856,
      "step": 1056
    },
    {
      "epoch": 1.7748344370860927,
      "grad_norm": 0.14069874584674835,
      "learning_rate": 0.0006172278391328957,
      "loss": 4.2394,
      "step": 1072
    },
    {
      "epoch": 1.8013245033112582,
      "grad_norm": 0.12728360295295715,
      "learning_rate": 0.0006067739157625848,
      "loss": 4.3155,
      "step": 1088
    },
    {
      "epoch": 1.8278145695364238,
      "grad_norm": 0.15248538553714752,
      "learning_rate": 0.0005962708514311411,
      "loss": 4.2606,
      "step": 1104
    },
    {
      "epoch": 1.8543046357615895,
      "grad_norm": 0.08556041121482849,
      "learning_rate": 0.0005857234800033936,
      "loss": 4.2379,
      "step": 1120
    },
    {
      "epoch": 1.8807947019867548,
      "grad_norm": 0.10861990600824356,
      "learning_rate": 0.0005751366557357933,
      "loss": 4.2761,
      "step": 1136
    },
    {
      "epoch": 1.9072847682119205,
      "grad_norm": 0.1670146882534027,
      "learning_rate": 0.0005645152510423204,
      "loss": 4.2325,
      "step": 1152
    },
    {
      "epoch": 1.9337748344370862,
      "grad_norm": 0.14551334083080292,
      "learning_rate": 0.0005538641542520343,
      "loss": 4.2249,
      "step": 1168
    },
    {
      "epoch": 1.9602649006622517,
      "grad_norm": 0.12024614959955215,
      "learning_rate": 0.0005431882673592999,
      "loss": 4.2265,
      "step": 1184
    },
    {
      "epoch": 1.9867549668874172,
      "grad_norm": 0.15291069447994232,
      "learning_rate": 0.0005324925037677243,
      "loss": 4.3172,
      "step": 1200
    },
    {
      "epoch": 2.0,
      "eval_bleu": 0.11370215395599652,
      "eval_cap_loss": 1.2129609353889692,
      "eval_con_loss": 1.808429909265594,
      "eval_loss": 3.0213908505755542,
      "eval_runtime": 241.5496,
      "eval_samples_per_second": 19.992,
      "eval_steps_per_second": 2.501,
      "step": 1208
    }
  ],
  "logging_steps": 16,
  "max_steps": 2416,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null,
  "tau_value": 4.5962
}