{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.644453259288642, "eval_steps": 1000, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06611133148221605, "grad_norm": 1.0232534408569336, "learning_rate": 4.4080049369655294e-05, "loss": 5.6838, "step": 500 }, { "epoch": 0.1322226629644321, "grad_norm": 1.0166140794754028, "learning_rate": 8.816009873931059e-05, "loss": 3.8378, "step": 1000 }, { "epoch": 0.1322226629644321, "eval_accuracy": 0.4720710052887577, "eval_loss": 3.6431853771209717, "eval_runtime": 65.5904, "eval_samples_per_second": 28.053, "eval_steps_per_second": 1.174, "step": 1000 }, { "epoch": 0.19833399444664815, "grad_norm": 1.0467888116836548, "learning_rate": 9.830315009952811e-05, "loss": 3.3712, "step": 1500 }, { "epoch": 0.2644453259288642, "grad_norm": 1.1263777017593384, "learning_rate": 9.59831475011252e-05, "loss": 3.0922, "step": 2000 }, { "epoch": 0.2644453259288642, "eval_accuracy": 0.5138609524011809, "eval_loss": 3.076597213745117, "eval_runtime": 64.6228, "eval_samples_per_second": 28.473, "eval_steps_per_second": 1.192, "step": 2000 }, { "epoch": 0.33055665741108026, "grad_norm": 1.4438892602920532, "learning_rate": 9.366314490272228e-05, "loss": 2.9066, "step": 2500 }, { "epoch": 0.3966679888932963, "grad_norm": 1.3693314790725708, "learning_rate": 9.134314230431938e-05, "loss": 2.7993, "step": 3000 }, { "epoch": 0.3966679888932963, "eval_accuracy": 0.5319845054268176, "eval_loss": 2.84745454788208, "eval_runtime": 64.9029, "eval_samples_per_second": 28.35, "eval_steps_per_second": 1.186, "step": 3000 }, { "epoch": 0.46277932037551234, "grad_norm": 1.3279718160629272, "learning_rate": 8.902313970591646e-05, "loss": 2.7166, "step": 3500 }, { "epoch": 0.5288906518577284, "grad_norm": 1.465155839920044, "learning_rate": 8.670313710751356e-05, "loss": 2.7115, "step": 4000 }, { "epoch": 0.5288906518577284, "eval_accuracy": 0.5392130052462777, "eval_loss": 2.7528512477874756, "eval_runtime": 65.039, "eval_samples_per_second": 28.291, "eval_steps_per_second": 1.184, "step": 4000 }, { "epoch": 0.5950019833399445, "grad_norm": 2.4618444442749023, "learning_rate": 8.438313450911065e-05, "loss": 2.644, "step": 4500 }, { "epoch": 0.6611133148221605, "grad_norm": 3.049086093902588, "learning_rate": 8.206313191070773e-05, "loss": 2.6702, "step": 5000 }, { "epoch": 0.6611133148221605, "eval_accuracy": 0.5420291625071685, "eval_loss": 2.7150135040283203, "eval_runtime": 64.9223, "eval_samples_per_second": 28.342, "eval_steps_per_second": 1.186, "step": 5000 }, { "epoch": 0.7272246463043766, "grad_norm": 3.927698850631714, "learning_rate": 7.974312931230483e-05, "loss": 2.6029, "step": 5500 }, { "epoch": 0.7933359777865926, "grad_norm": 4.909026622772217, "learning_rate": 7.742312671390191e-05, "loss": 2.6484, "step": 6000 }, { "epoch": 0.7933359777865926, "eval_accuracy": 0.543187538497483, "eval_loss": 2.696218729019165, "eval_runtime": 64.8633, "eval_samples_per_second": 28.367, "eval_steps_per_second": 1.187, "step": 6000 }, { "epoch": 0.8594473092688086, "grad_norm": 10.72818660736084, "learning_rate": 7.510312411549901e-05, "loss": 2.6474, "step": 6500 }, { "epoch": 0.9255586407510247, "grad_norm": 12.435935020446777, "learning_rate": 7.278312151709609e-05, "loss": 2.6419, "step": 7000 }, { "epoch": 0.9255586407510247, "eval_accuracy": 0.5387701514411334, "eval_loss": 2.7223353385925293, "eval_runtime": 68.3123, "eval_samples_per_second": 26.935, "eval_steps_per_second": 1.127, "step": 7000 }, { "epoch": 0.9916699722332408, "grad_norm": 15.605013847351074, "learning_rate": 7.046311891869319e-05, "loss": 2.6239, "step": 7500 }, { "epoch": 1.0577813037154569, "grad_norm": 55.199256896972656, "learning_rate": 6.814311632029027e-05, "loss": 2.5853, "step": 8000 }, { "epoch": 1.0577813037154569, "eval_accuracy": 0.5401743803232727, "eval_loss": 2.7088677883148193, "eval_runtime": 66.8855, "eval_samples_per_second": 27.51, "eval_steps_per_second": 1.151, "step": 8000 }, { "epoch": 1.1238926351976728, "grad_norm": 19.066770553588867, "learning_rate": 6.582311372188736e-05, "loss": 2.616, "step": 8500 }, { "epoch": 1.190003966679889, "grad_norm": 25.54907989501953, "learning_rate": 6.350311112348446e-05, "loss": 2.6009, "step": 9000 }, { "epoch": 1.190003966679889, "eval_accuracy": 0.5401903103162634, "eval_loss": 2.703549861907959, "eval_runtime": 65.0156, "eval_samples_per_second": 28.301, "eval_steps_per_second": 1.184, "step": 9000 }, { "epoch": 1.256115298162105, "grad_norm": 24.64689826965332, "learning_rate": 6.118310852508154e-05, "loss": 2.622, "step": 9500 }, { "epoch": 1.322226629644321, "grad_norm": 39.75895309448242, "learning_rate": 5.886310592667864e-05, "loss": 2.6347, "step": 10000 }, { "epoch": 1.322226629644321, "eval_accuracy": 0.5368293472950872, "eval_loss": 2.7321841716766357, "eval_runtime": 70.4109, "eval_samples_per_second": 26.132, "eval_steps_per_second": 1.094, "step": 10000 }, { "epoch": 1.388337961126537, "grad_norm": 63.85321044921875, "learning_rate": 5.654310332827573e-05, "loss": 2.634, "step": 10500 }, { "epoch": 1.454449292608753, "grad_norm": 38.93082046508789, "learning_rate": 5.422310072987282e-05, "loss": 2.7407, "step": 11000 }, { "epoch": 1.454449292608753, "eval_accuracy": 0.5244159002570039, "eval_loss": 2.8357815742492676, "eval_runtime": 64.8895, "eval_samples_per_second": 28.356, "eval_steps_per_second": 1.187, "step": 11000 }, { "epoch": 1.5205606240909693, "grad_norm": 95.31773376464844, "learning_rate": 5.1903098131469904e-05, "loss": 2.787, "step": 11500 }, { "epoch": 1.5866719555731852, "grad_norm": 388.4613952636719, "learning_rate": 4.9583095533066995e-05, "loss": 2.8981, "step": 12000 }, { "epoch": 1.5866719555731852, "eval_accuracy": 0.5073068222849982, "eval_loss": 2.9791054725646973, "eval_runtime": 65.179, "eval_samples_per_second": 28.23, "eval_steps_per_second": 1.181, "step": 12000 }, { "epoch": 1.6527832870554013, "grad_norm": 612.2838745117188, "learning_rate": 4.7263092934664086e-05, "loss": 2.9835, "step": 12500 }, { "epoch": 1.7188946185376173, "grad_norm": 115.95877838134766, "learning_rate": 4.4943090336261176e-05, "loss": 3.1243, "step": 13000 }, { "epoch": 1.7188946185376173, "eval_accuracy": 0.4558367494318302, "eval_loss": 3.454159736633301, "eval_runtime": 64.9781, "eval_samples_per_second": 28.317, "eval_steps_per_second": 1.185, "step": 13000 }, { "epoch": 1.7850059500198334, "grad_norm": 152.87709045410156, "learning_rate": 4.262308773785827e-05, "loss": 3.2902, "step": 13500 }, { "epoch": 1.8511172815020496, "grad_norm": 235.900390625, "learning_rate": 4.030308513945535e-05, "loss": 3.2186, "step": 14000 }, { "epoch": 1.8511172815020496, "eval_accuracy": 0.4863910069879569, "eval_loss": 3.138493061065674, "eval_runtime": 65.7386, "eval_samples_per_second": 27.99, "eval_steps_per_second": 1.171, "step": 14000 }, { "epoch": 1.9172286129842655, "grad_norm": 22.522586822509766, "learning_rate": 3.798308254105244e-05, "loss": 3.0901, "step": 14500 }, { "epoch": 1.9833399444664814, "grad_norm": 186.2947235107422, "learning_rate": 3.566307994264953e-05, "loss": 2.9741, "step": 15000 }, { "epoch": 1.9833399444664814, "eval_accuracy": 0.4909350374885835, "eval_loss": 3.0912961959838867, "eval_runtime": 64.5786, "eval_samples_per_second": 28.492, "eval_steps_per_second": 1.192, "step": 15000 }, { "epoch": 2.0494512759486976, "grad_norm": 68.64359283447266, "learning_rate": 3.3343077344246624e-05, "loss": 2.8854, "step": 15500 }, { "epoch": 2.1155626074309137, "grad_norm": 95.0623550415039, "learning_rate": 3.1023074745843715e-05, "loss": 2.8322, "step": 16000 }, { "epoch": 2.1155626074309137, "eval_accuracy": 0.5131106497313142, "eval_loss": 2.899470329284668, "eval_runtime": 64.8181, "eval_samples_per_second": 28.387, "eval_steps_per_second": 1.188, "step": 16000 }, { "epoch": 2.18167393891313, "grad_norm": 34.978458404541016, "learning_rate": 2.8703072147440806e-05, "loss": 2.8632, "step": 16500 }, { "epoch": 2.2477852703953456, "grad_norm": 94.10592651367188, "learning_rate": 2.6383069549037897e-05, "loss": 2.8482, "step": 17000 }, { "epoch": 2.2477852703953456, "eval_accuracy": 0.505882149911854, "eval_loss": 2.944797992706299, "eval_runtime": 64.8393, "eval_samples_per_second": 28.378, "eval_steps_per_second": 1.188, "step": 17000 }, { "epoch": 2.3138966018775617, "grad_norm": 68.33843994140625, "learning_rate": 2.4063066950634984e-05, "loss": 2.8747, "step": 17500 }, { "epoch": 2.380007933359778, "grad_norm": 83.47583770751953, "learning_rate": 2.1743064352232075e-05, "loss": 2.8697, "step": 18000 }, { "epoch": 2.380007933359778, "eval_accuracy": 0.503569911429239, "eval_loss": 2.9733448028564453, "eval_runtime": 64.8844, "eval_samples_per_second": 28.358, "eval_steps_per_second": 1.187, "step": 18000 }, { "epoch": 2.446119264841994, "grad_norm": 132.79673767089844, "learning_rate": 1.9423061753829162e-05, "loss": 2.8369, "step": 18500 }, { "epoch": 2.51223059632421, "grad_norm": 189.53028869628906, "learning_rate": 1.7103059155426253e-05, "loss": 2.8289, "step": 19000 }, { "epoch": 2.51223059632421, "eval_accuracy": 0.5066844905588241, "eval_loss": 2.94022536277771, "eval_runtime": 65.2056, "eval_samples_per_second": 28.218, "eval_steps_per_second": 1.181, "step": 19000 }, { "epoch": 2.578341927806426, "grad_norm": 130.18309020996094, "learning_rate": 1.4783056557023344e-05, "loss": 2.8655, "step": 19500 }, { "epoch": 2.644453259288642, "grad_norm": 136.07675170898438, "learning_rate": 1.2463053958620433e-05, "loss": 2.8319, "step": 20000 }, { "epoch": 2.644453259288642, "eval_accuracy": 0.506187209277628, "eval_loss": 2.9402058124542236, "eval_runtime": 65.0148, "eval_samples_per_second": 28.301, "eval_steps_per_second": 1.184, "step": 20000 } ], "logging_steps": 500, "max_steps": 22689, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2813992666018611e+17, "train_batch_size": 12, "trial_name": null, "trial_params": null }