{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.07226442208991993, "eval_steps": 88, "global_step": 88, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000821186614658181, "grad_norm": 4.866796016693115, "learning_rate": 2e-05, "loss": 1.5096, "step": 1 }, { "epoch": 0.001642373229316362, "grad_norm": 6.223461627960205, "learning_rate": 4e-05, "loss": 1.4917, "step": 2 }, { "epoch": 0.002463559843974543, "grad_norm": 2.4966073036193848, "learning_rate": 6e-05, "loss": 1.474, "step": 3 }, { "epoch": 0.003284746458632724, "grad_norm": 3.924987316131592, "learning_rate": 8e-05, "loss": 1.4303, "step": 4 }, { "epoch": 0.0041059330732909054, "grad_norm": 1.223948359489441, "learning_rate": 0.0001, "loss": 1.2468, "step": 5 }, { "epoch": 0.004927119687949086, "grad_norm": 0.7974618077278137, "learning_rate": 9.997257268239166e-05, "loss": 1.2757, "step": 6 }, { "epoch": 0.005748306302607267, "grad_norm": 0.3620994985103607, "learning_rate": 9.994514536478333e-05, "loss": 1.1615, "step": 7 }, { "epoch": 0.006569492917265448, "grad_norm": 0.2654111087322235, "learning_rate": 9.9917718047175e-05, "loss": 1.0671, "step": 8 }, { "epoch": 0.00739067953192363, "grad_norm": 0.2543610632419586, "learning_rate": 9.989029072956665e-05, "loss": 0.9546, "step": 9 }, { "epoch": 0.008211866146581811, "grad_norm": 0.2834194600582123, "learning_rate": 9.986286341195832e-05, "loss": 0.8958, "step": 10 }, { "epoch": 0.009033052761239991, "grad_norm": 0.43697845935821533, "learning_rate": 9.983543609434997e-05, "loss": 0.7524, "step": 11 }, { "epoch": 0.009854239375898173, "grad_norm": 1.3894637823104858, "learning_rate": 9.980800877674164e-05, "loss": 0.7509, "step": 12 }, { "epoch": 0.010675425990556354, "grad_norm": 0.5497334003448486, "learning_rate": 9.978058145913331e-05, "loss": 0.6424, "step": 13 }, { "epoch": 0.011496612605214535, "grad_norm": 0.4010011851787567, "learning_rate": 9.975315414152496e-05, "loss": 0.6201, "step": 14 }, { "epoch": 0.012317799219872716, "grad_norm": 0.41307681798934937, "learning_rate": 9.972572682391662e-05, "loss": 0.544, "step": 15 }, { "epoch": 0.013138985834530896, "grad_norm": 0.39948198199272156, "learning_rate": 9.969829950630828e-05, "loss": 0.4942, "step": 16 }, { "epoch": 0.013960172449189078, "grad_norm": 0.3839815557003021, "learning_rate": 9.967087218869995e-05, "loss": 0.4952, "step": 17 }, { "epoch": 0.01478135906384726, "grad_norm": 0.40473300218582153, "learning_rate": 9.96434448710916e-05, "loss": 0.4832, "step": 18 }, { "epoch": 0.01560254567850544, "grad_norm": 0.2300078272819519, "learning_rate": 9.961601755348327e-05, "loss": 0.4624, "step": 19 }, { "epoch": 0.016423732293163622, "grad_norm": 0.20218200981616974, "learning_rate": 9.958859023587493e-05, "loss": 0.4354, "step": 20 }, { "epoch": 0.017244918907821802, "grad_norm": 0.20956912636756897, "learning_rate": 9.95611629182666e-05, "loss": 0.4463, "step": 21 }, { "epoch": 0.018066105522479982, "grad_norm": 0.16660131514072418, "learning_rate": 9.953373560065826e-05, "loss": 0.4104, "step": 22 }, { "epoch": 0.018887292137138165, "grad_norm": 0.15235203504562378, "learning_rate": 9.950630828304992e-05, "loss": 0.4328, "step": 23 }, { "epoch": 0.019708478751796345, "grad_norm": 0.14054065942764282, "learning_rate": 9.947888096544159e-05, "loss": 0.4126, "step": 24 }, { "epoch": 0.020529665366454525, "grad_norm": 0.18133644759655, "learning_rate": 9.945145364783325e-05, "loss": 0.4214, "step": 25 }, { "epoch": 
0.02135085198111271, "grad_norm": 0.1237025335431099, "learning_rate": 9.942402633022491e-05, "loss": 0.4138, "step": 26 }, { "epoch": 0.02217203859577089, "grad_norm": 0.1338941603899002, "learning_rate": 9.939659901261658e-05, "loss": 0.4198, "step": 27 }, { "epoch": 0.02299322521042907, "grad_norm": 0.24965497851371765, "learning_rate": 9.936917169500823e-05, "loss": 0.4292, "step": 28 }, { "epoch": 0.023814411825087253, "grad_norm": 0.2095515877008438, "learning_rate": 9.93417443773999e-05, "loss": 0.4321, "step": 29 }, { "epoch": 0.024635598439745433, "grad_norm": 0.14506715536117554, "learning_rate": 9.931431705979157e-05, "loss": 0.403, "step": 30 }, { "epoch": 0.025456785054403613, "grad_norm": 0.13434380292892456, "learning_rate": 9.928688974218322e-05, "loss": 0.4205, "step": 31 }, { "epoch": 0.026277971669061793, "grad_norm": 0.14898717403411865, "learning_rate": 9.925946242457488e-05, "loss": 0.4095, "step": 32 }, { "epoch": 0.027099158283719976, "grad_norm": 0.1183394193649292, "learning_rate": 9.923203510696654e-05, "loss": 0.3941, "step": 33 }, { "epoch": 0.027920344898378156, "grad_norm": 0.14402946829795837, "learning_rate": 9.920460778935821e-05, "loss": 0.4224, "step": 34 }, { "epoch": 0.028741531513036336, "grad_norm": 0.14066942036151886, "learning_rate": 9.917718047174987e-05, "loss": 0.4728, "step": 35 }, { "epoch": 0.02956271812769452, "grad_norm": 2.1825764179229736, "learning_rate": 9.914975315414153e-05, "loss": 0.4013, "step": 36 }, { "epoch": 0.0303839047423527, "grad_norm": 0.15306037664413452, "learning_rate": 9.912232583653319e-05, "loss": 0.3776, "step": 37 }, { "epoch": 0.03120509135701088, "grad_norm": 1.2928482294082642, "learning_rate": 9.909489851892486e-05, "loss": 0.3766, "step": 38 }, { "epoch": 0.032026277971669063, "grad_norm": 0.12138387560844421, "learning_rate": 9.906747120131652e-05, "loss": 0.4439, "step": 39 }, { "epoch": 0.032847464586327244, "grad_norm": 0.13965290784835815, "learning_rate": 9.904004388370818e-05, "loss": 0.3758, "step": 40 }, { "epoch": 0.033668651200985424, "grad_norm": 0.11665050685405731, "learning_rate": 9.901261656609983e-05, "loss": 0.3539, "step": 41 }, { "epoch": 0.034489837815643604, "grad_norm": 0.12246105074882507, "learning_rate": 9.89851892484915e-05, "loss": 0.385, "step": 42 }, { "epoch": 0.035311024430301784, "grad_norm": 0.11154136061668396, "learning_rate": 9.895776193088317e-05, "loss": 0.3675, "step": 43 }, { "epoch": 0.036132211044959964, "grad_norm": 0.13517113029956818, "learning_rate": 9.893033461327482e-05, "loss": 0.409, "step": 44 }, { "epoch": 0.03695339765961815, "grad_norm": 0.1510034054517746, "learning_rate": 9.890290729566649e-05, "loss": 0.356, "step": 45 }, { "epoch": 0.03777458427427633, "grad_norm": 0.12618917226791382, "learning_rate": 9.887547997805814e-05, "loss": 0.3635, "step": 46 }, { "epoch": 0.03859577088893451, "grad_norm": 0.17770665884017944, "learning_rate": 9.884805266044981e-05, "loss": 0.3801, "step": 47 }, { "epoch": 0.03941695750359269, "grad_norm": 0.13217146694660187, "learning_rate": 9.882062534284148e-05, "loss": 0.3771, "step": 48 }, { "epoch": 0.04023814411825087, "grad_norm": 0.11666197329759598, "learning_rate": 9.879319802523313e-05, "loss": 0.3837, "step": 49 }, { "epoch": 0.04105933073290905, "grad_norm": 0.20090733468532562, "learning_rate": 9.876577070762479e-05, "loss": 0.3767, "step": 50 }, { "epoch": 0.04188051734756724, "grad_norm": 0.3209711015224457, "learning_rate": 9.873834339001646e-05, "loss": 0.4027, "step": 51 }, { "epoch": 0.04270170396222542, 
"grad_norm": 0.11906739324331284, "learning_rate": 9.871091607240812e-05, "loss": 0.3776, "step": 52 }, { "epoch": 0.0435228905768836, "grad_norm": 0.3295115530490875, "learning_rate": 9.868348875479978e-05, "loss": 0.3484, "step": 53 }, { "epoch": 0.04434407719154178, "grad_norm": 0.10566671937704086, "learning_rate": 9.865606143719145e-05, "loss": 0.3645, "step": 54 }, { "epoch": 0.04516526380619996, "grad_norm": 0.18777306377887726, "learning_rate": 9.86286341195831e-05, "loss": 0.4219, "step": 55 }, { "epoch": 0.04598645042085814, "grad_norm": 0.11774461716413498, "learning_rate": 9.860120680197478e-05, "loss": 0.375, "step": 56 }, { "epoch": 0.04680763703551632, "grad_norm": 0.1274806559085846, "learning_rate": 9.857377948436644e-05, "loss": 0.4609, "step": 57 }, { "epoch": 0.047628823650174505, "grad_norm": 0.1770283281803131, "learning_rate": 9.854635216675809e-05, "loss": 0.3577, "step": 58 }, { "epoch": 0.048450010264832685, "grad_norm": 0.278679758310318, "learning_rate": 9.851892484914976e-05, "loss": 0.3748, "step": 59 }, { "epoch": 0.049271196879490865, "grad_norm": 0.13674406707286835, "learning_rate": 9.849149753154143e-05, "loss": 0.3828, "step": 60 }, { "epoch": 0.050092383494149045, "grad_norm": 0.1524430513381958, "learning_rate": 9.846407021393308e-05, "loss": 0.3906, "step": 61 }, { "epoch": 0.050913570108807225, "grad_norm": 0.12199753522872925, "learning_rate": 9.843664289632475e-05, "loss": 0.4007, "step": 62 }, { "epoch": 0.051734756723465405, "grad_norm": 0.19670936465263367, "learning_rate": 9.84092155787164e-05, "loss": 0.4018, "step": 63 }, { "epoch": 0.052555943338123585, "grad_norm": 0.1128976121544838, "learning_rate": 9.838178826110807e-05, "loss": 0.3909, "step": 64 }, { "epoch": 0.05337712995278177, "grad_norm": 0.1778184324502945, "learning_rate": 9.835436094349974e-05, "loss": 0.3736, "step": 65 }, { "epoch": 0.05419831656743995, "grad_norm": 0.19817706942558289, "learning_rate": 9.83269336258914e-05, "loss": 0.3505, "step": 66 }, { "epoch": 0.05501950318209813, "grad_norm": 0.09127096086740494, "learning_rate": 9.829950630828305e-05, "loss": 0.3504, "step": 67 }, { "epoch": 0.05584068979675631, "grad_norm": 0.13604852557182312, "learning_rate": 9.827207899067472e-05, "loss": 0.4266, "step": 68 }, { "epoch": 0.05666187641141449, "grad_norm": 0.11077171564102173, "learning_rate": 9.824465167306638e-05, "loss": 0.3602, "step": 69 }, { "epoch": 0.05748306302607267, "grad_norm": 0.10381105542182922, "learning_rate": 9.821722435545804e-05, "loss": 0.3405, "step": 70 }, { "epoch": 0.05830424964073085, "grad_norm": 0.13518977165222168, "learning_rate": 9.81897970378497e-05, "loss": 0.3348, "step": 71 }, { "epoch": 0.05912543625538904, "grad_norm": 0.10194771736860275, "learning_rate": 9.816236972024136e-05, "loss": 0.349, "step": 72 }, { "epoch": 0.05994662287004722, "grad_norm": 0.12088090181350708, "learning_rate": 9.813494240263303e-05, "loss": 0.357, "step": 73 }, { "epoch": 0.0607678094847054, "grad_norm": 0.1529798060655594, "learning_rate": 9.81075150850247e-05, "loss": 0.3618, "step": 74 }, { "epoch": 0.06158899609936358, "grad_norm": 0.10943326354026794, "learning_rate": 9.808008776741635e-05, "loss": 0.3273, "step": 75 }, { "epoch": 0.06241018271402176, "grad_norm": 0.11236156523227692, "learning_rate": 9.8052660449808e-05, "loss": 0.3511, "step": 76 }, { "epoch": 0.06323136932867994, "grad_norm": 0.11936212331056595, "learning_rate": 9.802523313219967e-05, "loss": 0.3669, "step": 77 }, { "epoch": 0.06405255594333813, "grad_norm": 0.2718499004840851, 
"learning_rate": 9.799780581459134e-05, "loss": 0.3488, "step": 78 }, { "epoch": 0.0648737425579963, "grad_norm": 0.13413332402706146, "learning_rate": 9.7970378496983e-05, "loss": 0.3741, "step": 79 }, { "epoch": 0.06569492917265449, "grad_norm": 0.4024653136730194, "learning_rate": 9.794295117937466e-05, "loss": 0.3714, "step": 80 }, { "epoch": 0.06651611578731266, "grad_norm": 0.12206799536943436, "learning_rate": 9.791552386176632e-05, "loss": 0.4094, "step": 81 }, { "epoch": 0.06733730240197085, "grad_norm": 0.17678625881671906, "learning_rate": 9.788809654415799e-05, "loss": 0.3662, "step": 82 }, { "epoch": 0.06815848901662903, "grad_norm": 0.1201493889093399, "learning_rate": 9.786066922654965e-05, "loss": 0.3974, "step": 83 }, { "epoch": 0.06897967563128721, "grad_norm": 0.11645176261663437, "learning_rate": 9.783324190894131e-05, "loss": 0.3676, "step": 84 }, { "epoch": 0.0698008622459454, "grad_norm": 0.20770376920700073, "learning_rate": 9.780581459133296e-05, "loss": 0.3442, "step": 85 }, { "epoch": 0.07062204886060357, "grad_norm": 0.3476441502571106, "learning_rate": 9.777838727372464e-05, "loss": 0.4063, "step": 86 }, { "epoch": 0.07144323547526175, "grad_norm": 0.10448214411735535, "learning_rate": 9.77509599561163e-05, "loss": 0.381, "step": 87 }, { "epoch": 0.07226442208991993, "grad_norm": 0.11250001937150955, "learning_rate": 9.772353263850797e-05, "loss": 0.3508, "step": 88 }, { "epoch": 0.07226442208991993, "eval_runtime": 492.6417, "eval_samples_per_second": 0.4, "eval_steps_per_second": 0.201, "step": 88 } ], "logging_steps": 1, "max_steps": 3651, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 88, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.872555937317585e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }