{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.07226442208991993,
  "eval_steps": 88,
  "global_step": 88,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.000821186614658181,
      "grad_norm": 4.866796016693115,
      "learning_rate": 2e-05,
      "loss": 1.5096,
      "step": 1
    },
    {
      "epoch": 0.001642373229316362,
      "grad_norm": 6.223461627960205,
      "learning_rate": 4e-05,
      "loss": 1.4917,
      "step": 2
    },
    {
      "epoch": 0.002463559843974543,
      "grad_norm": 2.4966073036193848,
      "learning_rate": 6e-05,
      "loss": 1.474,
      "step": 3
    },
    {
      "epoch": 0.003284746458632724,
      "grad_norm": 3.924987316131592,
      "learning_rate": 8e-05,
      "loss": 1.4303,
      "step": 4
    },
    {
      "epoch": 0.0041059330732909054,
      "grad_norm": 1.223948359489441,
      "learning_rate": 0.0001,
      "loss": 1.2468,
      "step": 5
    },
    {
      "epoch": 0.004927119687949086,
      "grad_norm": 0.7974618077278137,
      "learning_rate": 9.997257268239166e-05,
      "loss": 1.2757,
      "step": 6
    },
    {
      "epoch": 0.005748306302607267,
      "grad_norm": 0.3620994985103607,
      "learning_rate": 9.994514536478333e-05,
      "loss": 1.1615,
      "step": 7
    },
    {
      "epoch": 0.006569492917265448,
      "grad_norm": 0.2654111087322235,
      "learning_rate": 9.9917718047175e-05,
      "loss": 1.0671,
      "step": 8
    },
    {
      "epoch": 0.00739067953192363,
      "grad_norm": 0.2543610632419586,
      "learning_rate": 9.989029072956665e-05,
      "loss": 0.9546,
      "step": 9
    },
    {
      "epoch": 0.008211866146581811,
      "grad_norm": 0.2834194600582123,
      "learning_rate": 9.986286341195832e-05,
      "loss": 0.8958,
      "step": 10
    },
    {
      "epoch": 0.009033052761239991,
      "grad_norm": 0.43697845935821533,
      "learning_rate": 9.983543609434997e-05,
      "loss": 0.7524,
      "step": 11
    },
    {
      "epoch": 0.009854239375898173,
      "grad_norm": 1.3894637823104858,
      "learning_rate": 9.980800877674164e-05,
      "loss": 0.7509,
      "step": 12
    },
    {
      "epoch": 0.010675425990556354,
      "grad_norm": 0.5497334003448486,
      "learning_rate": 9.978058145913331e-05,
      "loss": 0.6424,
      "step": 13
    },
    {
      "epoch": 0.011496612605214535,
      "grad_norm": 0.4010011851787567,
      "learning_rate": 9.975315414152496e-05,
      "loss": 0.6201,
      "step": 14
    },
    {
      "epoch": 0.012317799219872716,
      "grad_norm": 0.41307681798934937,
      "learning_rate": 9.972572682391662e-05,
      "loss": 0.544,
      "step": 15
    },
    {
      "epoch": 0.013138985834530896,
      "grad_norm": 0.39948198199272156,
      "learning_rate": 9.969829950630828e-05,
      "loss": 0.4942,
      "step": 16
    },
    {
      "epoch": 0.013960172449189078,
      "grad_norm": 0.3839815557003021,
      "learning_rate": 9.967087218869995e-05,
      "loss": 0.4952,
      "step": 17
    },
    {
      "epoch": 0.01478135906384726,
      "grad_norm": 0.40473300218582153,
      "learning_rate": 9.96434448710916e-05,
      "loss": 0.4832,
      "step": 18
    },
    {
      "epoch": 0.01560254567850544,
      "grad_norm": 0.2300078272819519,
      "learning_rate": 9.961601755348327e-05,
      "loss": 0.4624,
      "step": 19
    },
    {
      "epoch": 0.016423732293163622,
      "grad_norm": 0.20218200981616974,
      "learning_rate": 9.958859023587493e-05,
      "loss": 0.4354,
      "step": 20
    },
    {
      "epoch": 0.017244918907821802,
      "grad_norm": 0.20956912636756897,
      "learning_rate": 9.95611629182666e-05,
      "loss": 0.4463,
      "step": 21
    },
    {
      "epoch": 0.018066105522479982,
      "grad_norm": 0.16660131514072418,
      "learning_rate": 9.953373560065826e-05,
      "loss": 0.4104,
      "step": 22
    },
    {
      "epoch": 0.018887292137138165,
      "grad_norm": 0.15235203504562378,
      "learning_rate": 9.950630828304992e-05,
      "loss": 0.4328,
      "step": 23
    },
    {
      "epoch": 0.019708478751796345,
      "grad_norm": 0.14054065942764282,
      "learning_rate": 9.947888096544159e-05,
      "loss": 0.4126,
      "step": 24
    },
    {
      "epoch": 0.020529665366454525,
      "grad_norm": 0.18133644759655,
      "learning_rate": 9.945145364783325e-05,
      "loss": 0.4214,
      "step": 25
    },
    {
      "epoch": 0.02135085198111271,
      "grad_norm": 0.1237025335431099,
      "learning_rate": 9.942402633022491e-05,
      "loss": 0.4138,
      "step": 26
    },
    {
      "epoch": 0.02217203859577089,
      "grad_norm": 0.1338941603899002,
      "learning_rate": 9.939659901261658e-05,
      "loss": 0.4198,
      "step": 27
    },
    {
      "epoch": 0.02299322521042907,
      "grad_norm": 0.24965497851371765,
      "learning_rate": 9.936917169500823e-05,
      "loss": 0.4292,
      "step": 28
    },
    {
      "epoch": 0.023814411825087253,
      "grad_norm": 0.2095515877008438,
      "learning_rate": 9.93417443773999e-05,
      "loss": 0.4321,
      "step": 29
    },
    {
      "epoch": 0.024635598439745433,
      "grad_norm": 0.14506715536117554,
      "learning_rate": 9.931431705979157e-05,
      "loss": 0.403,
      "step": 30
    },
    {
      "epoch": 0.025456785054403613,
      "grad_norm": 0.13434380292892456,
      "learning_rate": 9.928688974218322e-05,
      "loss": 0.4205,
      "step": 31
    },
    {
      "epoch": 0.026277971669061793,
      "grad_norm": 0.14898717403411865,
      "learning_rate": 9.925946242457488e-05,
      "loss": 0.4095,
      "step": 32
    },
    {
      "epoch": 0.027099158283719976,
      "grad_norm": 0.1183394193649292,
      "learning_rate": 9.923203510696654e-05,
      "loss": 0.3941,
      "step": 33
    },
    {
      "epoch": 0.027920344898378156,
      "grad_norm": 0.14402946829795837,
      "learning_rate": 9.920460778935821e-05,
      "loss": 0.4224,
      "step": 34
    },
    {
      "epoch": 0.028741531513036336,
      "grad_norm": 0.14066942036151886,
      "learning_rate": 9.917718047174987e-05,
      "loss": 0.4728,
      "step": 35
    },
    {
      "epoch": 0.02956271812769452,
      "grad_norm": 2.1825764179229736,
      "learning_rate": 9.914975315414153e-05,
      "loss": 0.4013,
      "step": 36
    },
    {
      "epoch": 0.0303839047423527,
      "grad_norm": 0.15306037664413452,
      "learning_rate": 9.912232583653319e-05,
      "loss": 0.3776,
      "step": 37
    },
    {
      "epoch": 0.03120509135701088,
      "grad_norm": 1.2928482294082642,
      "learning_rate": 9.909489851892486e-05,
      "loss": 0.3766,
      "step": 38
    },
    {
      "epoch": 0.032026277971669063,
      "grad_norm": 0.12138387560844421,
      "learning_rate": 9.906747120131652e-05,
      "loss": 0.4439,
      "step": 39
    },
    {
      "epoch": 0.032847464586327244,
      "grad_norm": 0.13965290784835815,
      "learning_rate": 9.904004388370818e-05,
      "loss": 0.3758,
      "step": 40
    },
    {
      "epoch": 0.033668651200985424,
      "grad_norm": 0.11665050685405731,
      "learning_rate": 9.901261656609983e-05,
      "loss": 0.3539,
      "step": 41
    },
    {
      "epoch": 0.034489837815643604,
      "grad_norm": 0.12246105074882507,
      "learning_rate": 9.89851892484915e-05,
      "loss": 0.385,
      "step": 42
    },
    {
      "epoch": 0.035311024430301784,
      "grad_norm": 0.11154136061668396,
      "learning_rate": 9.895776193088317e-05,
      "loss": 0.3675,
      "step": 43
    },
    {
      "epoch": 0.036132211044959964,
      "grad_norm": 0.13517113029956818,
      "learning_rate": 9.893033461327482e-05,
      "loss": 0.409,
      "step": 44
    },
    {
      "epoch": 0.03695339765961815,
      "grad_norm": 0.1510034054517746,
      "learning_rate": 9.890290729566649e-05,
      "loss": 0.356,
      "step": 45
    },
    {
      "epoch": 0.03777458427427633,
      "grad_norm": 0.12618917226791382,
      "learning_rate": 9.887547997805814e-05,
      "loss": 0.3635,
      "step": 46
    },
    {
      "epoch": 0.03859577088893451,
      "grad_norm": 0.17770665884017944,
      "learning_rate": 9.884805266044981e-05,
      "loss": 0.3801,
      "step": 47
    },
    {
      "epoch": 0.03941695750359269,
      "grad_norm": 0.13217146694660187,
      "learning_rate": 9.882062534284148e-05,
      "loss": 0.3771,
      "step": 48
    },
    {
      "epoch": 0.04023814411825087,
      "grad_norm": 0.11666197329759598,
      "learning_rate": 9.879319802523313e-05,
      "loss": 0.3837,
      "step": 49
    },
    {
      "epoch": 0.04105933073290905,
      "grad_norm": 0.20090733468532562,
      "learning_rate": 9.876577070762479e-05,
      "loss": 0.3767,
      "step": 50
    },
    {
      "epoch": 0.04188051734756724,
      "grad_norm": 0.3209711015224457,
      "learning_rate": 9.873834339001646e-05,
      "loss": 0.4027,
      "step": 51
    },
    {
      "epoch": 0.04270170396222542,
      "grad_norm": 0.11906739324331284,
      "learning_rate": 9.871091607240812e-05,
      "loss": 0.3776,
      "step": 52
    },
    {
      "epoch": 0.0435228905768836,
      "grad_norm": 0.3295115530490875,
      "learning_rate": 9.868348875479978e-05,
      "loss": 0.3484,
      "step": 53
    },
    {
      "epoch": 0.04434407719154178,
      "grad_norm": 0.10566671937704086,
      "learning_rate": 9.865606143719145e-05,
      "loss": 0.3645,
      "step": 54
    },
    {
      "epoch": 0.04516526380619996,
      "grad_norm": 0.18777306377887726,
      "learning_rate": 9.86286341195831e-05,
      "loss": 0.4219,
      "step": 55
    },
    {
      "epoch": 0.04598645042085814,
      "grad_norm": 0.11774461716413498,
      "learning_rate": 9.860120680197478e-05,
      "loss": 0.375,
      "step": 56
    },
    {
      "epoch": 0.04680763703551632,
      "grad_norm": 0.1274806559085846,
      "learning_rate": 9.857377948436644e-05,
      "loss": 0.4609,
      "step": 57
    },
    {
      "epoch": 0.047628823650174505,
      "grad_norm": 0.1770283281803131,
      "learning_rate": 9.854635216675809e-05,
      "loss": 0.3577,
      "step": 58
    },
    {
      "epoch": 0.048450010264832685,
      "grad_norm": 0.278679758310318,
      "learning_rate": 9.851892484914976e-05,
      "loss": 0.3748,
      "step": 59
    },
    {
      "epoch": 0.049271196879490865,
      "grad_norm": 0.13674406707286835,
      "learning_rate": 9.849149753154143e-05,
      "loss": 0.3828,
      "step": 60
    },
    {
      "epoch": 0.050092383494149045,
      "grad_norm": 0.1524430513381958,
      "learning_rate": 9.846407021393308e-05,
      "loss": 0.3906,
      "step": 61
    },
    {
      "epoch": 0.050913570108807225,
      "grad_norm": 0.12199753522872925,
      "learning_rate": 9.843664289632475e-05,
      "loss": 0.4007,
      "step": 62
    },
    {
      "epoch": 0.051734756723465405,
      "grad_norm": 0.19670936465263367,
      "learning_rate": 9.84092155787164e-05,
      "loss": 0.4018,
      "step": 63
    },
    {
      "epoch": 0.052555943338123585,
      "grad_norm": 0.1128976121544838,
      "learning_rate": 9.838178826110807e-05,
      "loss": 0.3909,
      "step": 64
    },
    {
      "epoch": 0.05337712995278177,
      "grad_norm": 0.1778184324502945,
      "learning_rate": 9.835436094349974e-05,
      "loss": 0.3736,
      "step": 65
    },
    {
      "epoch": 0.05419831656743995,
      "grad_norm": 0.19817706942558289,
      "learning_rate": 9.83269336258914e-05,
      "loss": 0.3505,
      "step": 66
    },
    {
      "epoch": 0.05501950318209813,
      "grad_norm": 0.09127096086740494,
      "learning_rate": 9.829950630828305e-05,
      "loss": 0.3504,
      "step": 67
    },
    {
      "epoch": 0.05584068979675631,
      "grad_norm": 0.13604852557182312,
      "learning_rate": 9.827207899067472e-05,
      "loss": 0.4266,
      "step": 68
    },
    {
      "epoch": 0.05666187641141449,
      "grad_norm": 0.11077171564102173,
      "learning_rate": 9.824465167306638e-05,
      "loss": 0.3602,
      "step": 69
    },
    {
      "epoch": 0.05748306302607267,
      "grad_norm": 0.10381105542182922,
      "learning_rate": 9.821722435545804e-05,
      "loss": 0.3405,
      "step": 70
    },
    {
      "epoch": 0.05830424964073085,
      "grad_norm": 0.13518977165222168,
      "learning_rate": 9.81897970378497e-05,
      "loss": 0.3348,
      "step": 71
    },
    {
      "epoch": 0.05912543625538904,
      "grad_norm": 0.10194771736860275,
      "learning_rate": 9.816236972024136e-05,
      "loss": 0.349,
      "step": 72
    },
    {
      "epoch": 0.05994662287004722,
      "grad_norm": 0.12088090181350708,
      "learning_rate": 9.813494240263303e-05,
      "loss": 0.357,
      "step": 73
    },
    {
      "epoch": 0.0607678094847054,
      "grad_norm": 0.1529798060655594,
      "learning_rate": 9.81075150850247e-05,
      "loss": 0.3618,
      "step": 74
    },
    {
      "epoch": 0.06158899609936358,
      "grad_norm": 0.10943326354026794,
      "learning_rate": 9.808008776741635e-05,
      "loss": 0.3273,
      "step": 75
    },
    {
      "epoch": 0.06241018271402176,
      "grad_norm": 0.11236156523227692,
      "learning_rate": 9.8052660449808e-05,
      "loss": 0.3511,
      "step": 76
    },
    {
      "epoch": 0.06323136932867994,
      "grad_norm": 0.11936212331056595,
      "learning_rate": 9.802523313219967e-05,
      "loss": 0.3669,
      "step": 77
    },
    {
      "epoch": 0.06405255594333813,
      "grad_norm": 0.2718499004840851,
      "learning_rate": 9.799780581459134e-05,
      "loss": 0.3488,
      "step": 78
    },
    {
      "epoch": 0.0648737425579963,
      "grad_norm": 0.13413332402706146,
      "learning_rate": 9.7970378496983e-05,
      "loss": 0.3741,
      "step": 79
    },
    {
      "epoch": 0.06569492917265449,
      "grad_norm": 0.4024653136730194,
      "learning_rate": 9.794295117937466e-05,
      "loss": 0.3714,
      "step": 80
    },
    {
      "epoch": 0.06651611578731266,
      "grad_norm": 0.12206799536943436,
      "learning_rate": 9.791552386176632e-05,
      "loss": 0.4094,
      "step": 81
    },
    {
      "epoch": 0.06733730240197085,
      "grad_norm": 0.17678625881671906,
      "learning_rate": 9.788809654415799e-05,
      "loss": 0.3662,
      "step": 82
    },
    {
      "epoch": 0.06815848901662903,
      "grad_norm": 0.1201493889093399,
      "learning_rate": 9.786066922654965e-05,
      "loss": 0.3974,
      "step": 83
    },
    {
      "epoch": 0.06897967563128721,
      "grad_norm": 0.11645176261663437,
      "learning_rate": 9.783324190894131e-05,
      "loss": 0.3676,
      "step": 84
    },
    {
      "epoch": 0.0698008622459454,
      "grad_norm": 0.20770376920700073,
      "learning_rate": 9.780581459133296e-05,
      "loss": 0.3442,
      "step": 85
    },
    {
      "epoch": 0.07062204886060357,
      "grad_norm": 0.3476441502571106,
      "learning_rate": 9.777838727372464e-05,
      "loss": 0.4063,
      "step": 86
    },
    {
      "epoch": 0.07144323547526175,
      "grad_norm": 0.10448214411735535,
      "learning_rate": 9.77509599561163e-05,
      "loss": 0.381,
      "step": 87
    },
    {
      "epoch": 0.07226442208991993,
      "grad_norm": 0.11250001937150955,
      "learning_rate": 9.772353263850797e-05,
      "loss": 0.3508,
      "step": 88
    },
    {
      "epoch": 0.07226442208991993,
      "eval_runtime": 492.6417,
      "eval_samples_per_second": 0.4,
      "eval_steps_per_second": 0.201,
      "step": 88
    }
  ],
  "logging_steps": 1,
  "max_steps": 3651,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 88,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.872555937317585e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}