lesso02's picture
Training in progress, step 500, checkpoint
7c4ed74 verified
{
"best_metric": 0.26981785893440247,
"best_model_checkpoint": "miner_id_24/checkpoint-500",
"epoch": 0.09550186228631459,
"eval_steps": 50,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00019100372457262916,
"eval_loss": 2.06730318069458,
"eval_runtime": 63.62,
"eval_samples_per_second": 34.659,
"eval_steps_per_second": 8.677,
"step": 1
},
{
"epoch": 0.0019100372457262916,
"grad_norm": 7.6455230712890625,
"learning_rate": 4.0400000000000006e-05,
"loss": 3.1643,
"step": 10
},
{
"epoch": 0.003820074491452583,
"grad_norm": 4.942553997039795,
"learning_rate": 8.080000000000001e-05,
"loss": 2.6878,
"step": 20
},
{
"epoch": 0.005730111737178875,
"grad_norm": 4.34798526763916,
"learning_rate": 0.00012119999999999999,
"loss": 1.7231,
"step": 30
},
{
"epoch": 0.007640148982905166,
"grad_norm": 5.313558578491211,
"learning_rate": 0.00016160000000000002,
"loss": 1.7659,
"step": 40
},
{
"epoch": 0.009550186228631458,
"grad_norm": 6.8922014236450195,
"learning_rate": 0.000202,
"loss": 1.3088,
"step": 50
},
{
"epoch": 0.009550186228631458,
"eval_loss": 0.8562319278717041,
"eval_runtime": 63.8502,
"eval_samples_per_second": 34.534,
"eval_steps_per_second": 8.645,
"step": 50
},
{
"epoch": 0.01146022347435775,
"grad_norm": 3.368533134460449,
"learning_rate": 0.00020175396907624226,
"loss": 1.4345,
"step": 60
},
{
"epoch": 0.013370260720084041,
"grad_norm": 3.520836591720581,
"learning_rate": 0.0002010170749428986,
"loss": 1.3208,
"step": 70
},
{
"epoch": 0.015280297965810333,
"grad_norm": 3.200895309448242,
"learning_rate": 0.00019979290767411438,
"loss": 1.2472,
"step": 80
},
{
"epoch": 0.017190335211536626,
"grad_norm": 3.8880531787872314,
"learning_rate": 0.0001980874312897702,
"loss": 0.9301,
"step": 90
},
{
"epoch": 0.019100372457262916,
"grad_norm": 5.285026550292969,
"learning_rate": 0.00019590895469937675,
"loss": 1.0703,
"step": 100
},
{
"epoch": 0.019100372457262916,
"eval_loss": 0.6190745830535889,
"eval_runtime": 63.8537,
"eval_samples_per_second": 34.532,
"eval_steps_per_second": 8.645,
"step": 100
},
{
"epoch": 0.02101040970298921,
"grad_norm": 2.872575283050537,
"learning_rate": 0.0001932680912219027,
"loss": 1.0179,
"step": 110
},
{
"epoch": 0.0229204469487155,
"grad_norm": 3.166882038116455,
"learning_rate": 0.00019017770687875164,
"loss": 0.9739,
"step": 120
},
{
"epoch": 0.024830484194441792,
"grad_norm": 3.3663108348846436,
"learning_rate": 0.000186652857711799,
"loss": 0.8537,
"step": 130
},
{
"epoch": 0.026740521440168082,
"grad_norm": 2.7930142879486084,
"learning_rate": 0.00018271071643186968,
"loss": 0.7789,
"step": 140
},
{
"epoch": 0.028650558685894376,
"grad_norm": 4.935205459594727,
"learning_rate": 0.00017837048875501678,
"loss": 1.0222,
"step": 150
},
{
"epoch": 0.028650558685894376,
"eval_loss": 0.5046119689941406,
"eval_runtime": 63.8937,
"eval_samples_per_second": 34.51,
"eval_steps_per_second": 8.639,
"step": 150
},
{
"epoch": 0.030560595931620665,
"grad_norm": 2.4444825649261475,
"learning_rate": 0.00017365331983420376,
"loss": 0.9172,
"step": 160
},
{
"epoch": 0.032470633177346955,
"grad_norm": 3.213865280151367,
"learning_rate": 0.0001685821912422447,
"loss": 0.8119,
"step": 170
},
{
"epoch": 0.03438067042307325,
"grad_norm": 2.578925371170044,
"learning_rate": 0.00016318180900789148,
"loss": 0.7952,
"step": 180
},
{
"epoch": 0.03629070766879954,
"grad_norm": 3.1453428268432617,
"learning_rate": 0.00015747848325054544,
"loss": 0.774,
"step": 190
},
{
"epoch": 0.03820074491452583,
"grad_norm": 6.084349155426025,
"learning_rate": 0.0001515,
"loss": 0.8455,
"step": 200
},
{
"epoch": 0.03820074491452583,
"eval_loss": 0.4781758189201355,
"eval_runtime": 63.8211,
"eval_samples_per_second": 34.55,
"eval_steps_per_second": 8.649,
"step": 200
},
{
"epoch": 0.04011078216025212,
"grad_norm": 2.6562137603759766,
"learning_rate": 0.00014527548582569683,
"loss": 0.9388,
"step": 210
},
{
"epoch": 0.04202081940597842,
"grad_norm": 2.2333340644836426,
"learning_rate": 0.00013883526593500714,
"loss": 0.7923,
"step": 220
},
{
"epoch": 0.04393085665170471,
"grad_norm": 2.876622200012207,
"learning_rate": 0.0001322107164318697,
"loss": 0.8484,
"step": 230
},
{
"epoch": 0.045840893897431,
"grad_norm": 3.026136636734009,
"learning_rate": 0.00012543411145556643,
"loss": 0.714,
"step": 240
},
{
"epoch": 0.047750931143157295,
"grad_norm": 4.8779826164245605,
"learning_rate": 0.00011853846594435998,
"loss": 0.6556,
"step": 250
},
{
"epoch": 0.047750931143157295,
"eval_loss": 0.3865682780742645,
"eval_runtime": 63.8484,
"eval_samples_per_second": 34.535,
"eval_steps_per_second": 8.645,
"step": 250
},
{
"epoch": 0.049660968388883585,
"grad_norm": 2.4860901832580566,
"learning_rate": 0.00011155737479003301,
"loss": 0.634,
"step": 260
},
{
"epoch": 0.051571005634609875,
"grad_norm": 1.8885411024093628,
"learning_rate": 0.00010452484916695262,
"loss": 0.7666,
"step": 270
},
{
"epoch": 0.053481042880336165,
"grad_norm": 3.745715856552124,
"learning_rate": 9.747515083304742e-05,
"loss": 0.742,
"step": 280
},
{
"epoch": 0.05539108012606246,
"grad_norm": 2.1745243072509766,
"learning_rate": 9.044262520996702e-05,
"loss": 0.6505,
"step": 290
},
{
"epoch": 0.05730111737178875,
"grad_norm": 3.0038623809814453,
"learning_rate": 8.346153405564004e-05,
"loss": 0.6772,
"step": 300
},
{
"epoch": 0.05730111737178875,
"eval_loss": 0.34666135907173157,
"eval_runtime": 63.8243,
"eval_samples_per_second": 34.548,
"eval_steps_per_second": 8.649,
"step": 300
},
{
"epoch": 0.05921115461751504,
"grad_norm": 1.8677786588668823,
"learning_rate": 7.656588854443357e-05,
"loss": 0.5809,
"step": 310
},
{
"epoch": 0.06112119186324133,
"grad_norm": 1.749708652496338,
"learning_rate": 6.978928356813031e-05,
"loss": 0.5507,
"step": 320
},
{
"epoch": 0.06303122910896762,
"grad_norm": 2.1635003089904785,
"learning_rate": 6.316473406499288e-05,
"loss": 0.5366,
"step": 330
},
{
"epoch": 0.06494126635469391,
"grad_norm": 3.309890031814575,
"learning_rate": 5.672451417430317e-05,
"loss": 0.6328,
"step": 340
},
{
"epoch": 0.06685130360042021,
"grad_norm": 4.125185966491699,
"learning_rate": 5.050000000000002e-05,
"loss": 0.5743,
"step": 350
},
{
"epoch": 0.06685130360042021,
"eval_loss": 0.3164260983467102,
"eval_runtime": 63.6764,
"eval_samples_per_second": 34.628,
"eval_steps_per_second": 8.669,
"step": 350
},
{
"epoch": 0.0687613408461465,
"grad_norm": 1.8781318664550781,
"learning_rate": 4.452151674945458e-05,
"loss": 0.6682,
"step": 360
},
{
"epoch": 0.0706713780918728,
"grad_norm": 1.7442470788955688,
"learning_rate": 3.8818190992108515e-05,
"loss": 0.5477,
"step": 370
},
{
"epoch": 0.07258141533759908,
"grad_norm": 2.8514626026153564,
"learning_rate": 3.3417808757755355e-05,
"loss": 0.5742,
"step": 380
},
{
"epoch": 0.07449145258332537,
"grad_norm": 2.2705271244049072,
"learning_rate": 2.8346680165796253e-05,
"loss": 0.6682,
"step": 390
},
{
"epoch": 0.07640148982905166,
"grad_norm": 4.242289066314697,
"learning_rate": 2.362951124498323e-05,
"loss": 0.7781,
"step": 400
},
{
"epoch": 0.07640148982905166,
"eval_loss": 0.288888543844223,
"eval_runtime": 63.7944,
"eval_samples_per_second": 34.564,
"eval_steps_per_second": 8.653,
"step": 400
},
{
"epoch": 0.07831152707477795,
"grad_norm": 2.7676401138305664,
"learning_rate": 1.928928356813032e-05,
"loss": 0.4907,
"step": 410
},
{
"epoch": 0.08022156432050424,
"grad_norm": 2.335130214691162,
"learning_rate": 1.5347142288200977e-05,
"loss": 0.6022,
"step": 420
},
{
"epoch": 0.08213160156623055,
"grad_norm": 2.4311466217041016,
"learning_rate": 1.1822293121248375e-05,
"loss": 0.5102,
"step": 430
},
{
"epoch": 0.08404163881195684,
"grad_norm": 2.3990116119384766,
"learning_rate": 8.731908778097302e-06,
"loss": 0.6047,
"step": 440
},
{
"epoch": 0.08595167605768313,
"grad_norm": 4.477560043334961,
"learning_rate": 6.09104530062326e-06,
"loss": 0.7265,
"step": 450
},
{
"epoch": 0.08595167605768313,
"eval_loss": 0.27164798974990845,
"eval_runtime": 63.8913,
"eval_samples_per_second": 34.512,
"eval_steps_per_second": 8.64,
"step": 450
},
{
"epoch": 0.08786171330340942,
"grad_norm": 2.161862373352051,
"learning_rate": 3.912568710229791e-06,
"loss": 0.5218,
"step": 460
},
{
"epoch": 0.0897717505491357,
"grad_norm": 1.7147796154022217,
"learning_rate": 2.2070923258856255e-06,
"loss": 0.4866,
"step": 470
},
{
"epoch": 0.091681787794862,
"grad_norm": 2.2955782413482666,
"learning_rate": 9.829250571013935e-07,
"loss": 0.516,
"step": 480
},
{
"epoch": 0.09359182504058829,
"grad_norm": 3.3165369033813477,
"learning_rate": 2.4603092375775605e-07,
"loss": 0.4647,
"step": 490
},
{
"epoch": 0.09550186228631459,
"grad_norm": 2.52410888671875,
"learning_rate": 0.0,
"loss": 0.5188,
"step": 500
},
{
"epoch": 0.09550186228631459,
"eval_loss": 0.26981785893440247,
"eval_runtime": 64.0421,
"eval_samples_per_second": 34.43,
"eval_steps_per_second": 8.619,
"step": 500
}
],
"logging_steps": 10,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.300574941413376e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}