{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 834,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03597122302158273,
"grad_norm": 26.93948859971876,
"learning_rate": 5e-06,
"loss": 1.0175,
"step": 10
},
{
"epoch": 0.07194244604316546,
"grad_norm": 2.6743423167480063,
"learning_rate": 5e-06,
"loss": 0.9337,
"step": 20
},
{
"epoch": 0.1079136690647482,
"grad_norm": 1.100805597904231,
"learning_rate": 5e-06,
"loss": 0.8923,
"step": 30
},
{
"epoch": 0.14388489208633093,
"grad_norm": 0.8355539701078896,
"learning_rate": 5e-06,
"loss": 0.8673,
"step": 40
},
{
"epoch": 0.17985611510791366,
"grad_norm": 0.7157506047100403,
"learning_rate": 5e-06,
"loss": 0.8553,
"step": 50
},
{
"epoch": 0.2158273381294964,
"grad_norm": 0.9806631521043339,
"learning_rate": 5e-06,
"loss": 0.8492,
"step": 60
},
{
"epoch": 0.2517985611510791,
"grad_norm": 0.8360835611944488,
"learning_rate": 5e-06,
"loss": 0.8382,
"step": 70
},
{
"epoch": 0.28776978417266186,
"grad_norm": 0.7078472519601653,
"learning_rate": 5e-06,
"loss": 0.8318,
"step": 80
},
{
"epoch": 0.3237410071942446,
"grad_norm": 0.6255785562847258,
"learning_rate": 5e-06,
"loss": 0.825,
"step": 90
},
{
"epoch": 0.3597122302158273,
"grad_norm": 0.6950072028339258,
"learning_rate": 5e-06,
"loss": 0.8225,
"step": 100
},
{
"epoch": 0.39568345323741005,
"grad_norm": 0.622757689781733,
"learning_rate": 5e-06,
"loss": 0.8165,
"step": 110
},
{
"epoch": 0.4316546762589928,
"grad_norm": 0.6855173384055511,
"learning_rate": 5e-06,
"loss": 0.8162,
"step": 120
},
{
"epoch": 0.4676258992805755,
"grad_norm": 0.555459004966806,
"learning_rate": 5e-06,
"loss": 0.8141,
"step": 130
},
{
"epoch": 0.5035971223021583,
"grad_norm": 0.7189252900166325,
"learning_rate": 5e-06,
"loss": 0.8113,
"step": 140
},
{
"epoch": 0.539568345323741,
"grad_norm": 0.8411135438726722,
"learning_rate": 5e-06,
"loss": 0.8069,
"step": 150
},
{
"epoch": 0.5755395683453237,
"grad_norm": 0.9141854769887011,
"learning_rate": 5e-06,
"loss": 0.8087,
"step": 160
},
{
"epoch": 0.6115107913669064,
"grad_norm": 0.6527584548807389,
"learning_rate": 5e-06,
"loss": 0.8048,
"step": 170
},
{
"epoch": 0.6474820143884892,
"grad_norm": 0.6986581112545092,
"learning_rate": 5e-06,
"loss": 0.8051,
"step": 180
},
{
"epoch": 0.6834532374100719,
"grad_norm": 0.6094857952430536,
"learning_rate": 5e-06,
"loss": 0.8044,
"step": 190
},
{
"epoch": 0.7194244604316546,
"grad_norm": 0.74096920276776,
"learning_rate": 5e-06,
"loss": 0.7989,
"step": 200
},
{
"epoch": 0.7553956834532374,
"grad_norm": 0.6584952886572538,
"learning_rate": 5e-06,
"loss": 0.8025,
"step": 210
},
{
"epoch": 0.7913669064748201,
"grad_norm": 0.5838446606699556,
"learning_rate": 5e-06,
"loss": 0.7988,
"step": 220
},
{
"epoch": 0.8273381294964028,
"grad_norm": 0.5916175411049406,
"learning_rate": 5e-06,
"loss": 0.7985,
"step": 230
},
{
"epoch": 0.8633093525179856,
"grad_norm": 0.626471567693148,
"learning_rate": 5e-06,
"loss": 0.7973,
"step": 240
},
{
"epoch": 0.8992805755395683,
"grad_norm": 0.6338741269795162,
"learning_rate": 5e-06,
"loss": 0.7933,
"step": 250
},
{
"epoch": 0.935251798561151,
"grad_norm": 0.8343555675066444,
"learning_rate": 5e-06,
"loss": 0.7969,
"step": 260
},
{
"epoch": 0.9712230215827338,
"grad_norm": 0.6221641429373133,
"learning_rate": 5e-06,
"loss": 0.7933,
"step": 270
},
{
"epoch": 1.0,
"eval_loss": 0.7923575043678284,
"eval_runtime": 27.9533,
"eval_samples_per_second": 267.732,
"eval_steps_per_second": 1.073,
"step": 278
},
{
"epoch": 1.0071942446043165,
"grad_norm": 0.8944971285319924,
"learning_rate": 5e-06,
"loss": 0.7823,
"step": 280
},
{
"epoch": 1.0431654676258992,
"grad_norm": 0.7668083853056575,
"learning_rate": 5e-06,
"loss": 0.7574,
"step": 290
},
{
"epoch": 1.079136690647482,
"grad_norm": 0.6176816592509634,
"learning_rate": 5e-06,
"loss": 0.7529,
"step": 300
},
{
"epoch": 1.1151079136690647,
"grad_norm": 0.6475301176330789,
"learning_rate": 5e-06,
"loss": 0.7558,
"step": 310
},
{
"epoch": 1.1510791366906474,
"grad_norm": 0.5811910989874788,
"learning_rate": 5e-06,
"loss": 0.7623,
"step": 320
},
{
"epoch": 1.1870503597122302,
"grad_norm": 0.6269454462814978,
"learning_rate": 5e-06,
"loss": 0.7601,
"step": 330
},
{
"epoch": 1.223021582733813,
"grad_norm": 0.5423886247053047,
"learning_rate": 5e-06,
"loss": 0.7535,
"step": 340
},
{
"epoch": 1.2589928057553956,
"grad_norm": 0.6670401432003603,
"learning_rate": 5e-06,
"loss": 0.757,
"step": 350
},
{
"epoch": 1.2949640287769784,
"grad_norm": 0.7095322132659916,
"learning_rate": 5e-06,
"loss": 0.759,
"step": 360
},
{
"epoch": 1.330935251798561,
"grad_norm": 0.6870367808903867,
"learning_rate": 5e-06,
"loss": 0.7567,
"step": 370
},
{
"epoch": 1.3669064748201438,
"grad_norm": 0.6640094117573664,
"learning_rate": 5e-06,
"loss": 0.7592,
"step": 380
},
{
"epoch": 1.4028776978417266,
"grad_norm": 0.5994950619117767,
"learning_rate": 5e-06,
"loss": 0.7529,
"step": 390
},
{
"epoch": 1.4388489208633093,
"grad_norm": 0.7392872817621052,
"learning_rate": 5e-06,
"loss": 0.7554,
"step": 400
},
{
"epoch": 1.474820143884892,
"grad_norm": 0.5656749568866071,
"learning_rate": 5e-06,
"loss": 0.7547,
"step": 410
},
{
"epoch": 1.5107913669064748,
"grad_norm": 0.921484641426356,
"learning_rate": 5e-06,
"loss": 0.7532,
"step": 420
},
{
"epoch": 1.5467625899280577,
"grad_norm": 0.540059029380678,
"learning_rate": 5e-06,
"loss": 0.7585,
"step": 430
},
{
"epoch": 1.5827338129496402,
"grad_norm": 0.6558652758296812,
"learning_rate": 5e-06,
"loss": 0.7515,
"step": 440
},
{
"epoch": 1.6187050359712232,
"grad_norm": 0.57268163367781,
"learning_rate": 5e-06,
"loss": 0.7562,
"step": 450
},
{
"epoch": 1.6546762589928057,
"grad_norm": 0.5407189047091853,
"learning_rate": 5e-06,
"loss": 0.7559,
"step": 460
},
{
"epoch": 1.6906474820143886,
"grad_norm": 0.6077940984618293,
"learning_rate": 5e-06,
"loss": 0.757,
"step": 470
},
{
"epoch": 1.7266187050359711,
"grad_norm": 1.001124812241379,
"learning_rate": 5e-06,
"loss": 0.7552,
"step": 480
},
{
"epoch": 1.762589928057554,
"grad_norm": 0.6254013722291123,
"learning_rate": 5e-06,
"loss": 0.753,
"step": 490
},
{
"epoch": 1.7985611510791366,
"grad_norm": 0.5767617312575639,
"learning_rate": 5e-06,
"loss": 0.7594,
"step": 500
},
{
"epoch": 1.8345323741007196,
"grad_norm": 0.665915353902276,
"learning_rate": 5e-06,
"loss": 0.7554,
"step": 510
},
{
"epoch": 1.870503597122302,
"grad_norm": 0.5596777388150926,
"learning_rate": 5e-06,
"loss": 0.7537,
"step": 520
},
{
"epoch": 1.906474820143885,
"grad_norm": 0.5547398560915929,
"learning_rate": 5e-06,
"loss": 0.7555,
"step": 530
},
{
"epoch": 1.9424460431654675,
"grad_norm": 0.5874602156110944,
"learning_rate": 5e-06,
"loss": 0.7509,
"step": 540
},
{
"epoch": 1.9784172661870505,
"grad_norm": 0.6369533697170318,
"learning_rate": 5e-06,
"loss": 0.7503,
"step": 550
},
{
"epoch": 2.0,
"eval_loss": 0.7788412570953369,
"eval_runtime": 27.8988,
"eval_samples_per_second": 268.255,
"eval_steps_per_second": 1.075,
"step": 556
},
{
"epoch": 2.014388489208633,
"grad_norm": 1.0929207520027995,
"learning_rate": 5e-06,
"loss": 0.735,
"step": 560
},
{
"epoch": 2.050359712230216,
"grad_norm": 0.687310495052166,
"learning_rate": 5e-06,
"loss": 0.7131,
"step": 570
},
{
"epoch": 2.0863309352517985,
"grad_norm": 0.6848749958758751,
"learning_rate": 5e-06,
"loss": 0.7129,
"step": 580
},
{
"epoch": 2.1223021582733814,
"grad_norm": 0.9700661070159223,
"learning_rate": 5e-06,
"loss": 0.7154,
"step": 590
},
{
"epoch": 2.158273381294964,
"grad_norm": 0.7429316335562708,
"learning_rate": 5e-06,
"loss": 0.7163,
"step": 600
},
{
"epoch": 2.194244604316547,
"grad_norm": 0.5731198010767242,
"learning_rate": 5e-06,
"loss": 0.7197,
"step": 610
},
{
"epoch": 2.2302158273381294,
"grad_norm": 0.6519774548706885,
"learning_rate": 5e-06,
"loss": 0.7192,
"step": 620
},
{
"epoch": 2.2661870503597124,
"grad_norm": 0.7092939571259266,
"learning_rate": 5e-06,
"loss": 0.717,
"step": 630
},
{
"epoch": 2.302158273381295,
"grad_norm": 0.8300683342338049,
"learning_rate": 5e-06,
"loss": 0.7171,
"step": 640
},
{
"epoch": 2.338129496402878,
"grad_norm": 0.6364079517115279,
"learning_rate": 5e-06,
"loss": 0.7179,
"step": 650
},
{
"epoch": 2.3741007194244603,
"grad_norm": 0.6830216482631195,
"learning_rate": 5e-06,
"loss": 0.7208,
"step": 660
},
{
"epoch": 2.4100719424460433,
"grad_norm": 0.580810416113199,
"learning_rate": 5e-06,
"loss": 0.7201,
"step": 670
},
{
"epoch": 2.446043165467626,
"grad_norm": 0.7709663647446697,
"learning_rate": 5e-06,
"loss": 0.7165,
"step": 680
},
{
"epoch": 2.4820143884892087,
"grad_norm": 0.6587806242655105,
"learning_rate": 5e-06,
"loss": 0.7199,
"step": 690
},
{
"epoch": 2.5179856115107913,
"grad_norm": 0.6679031168226195,
"learning_rate": 5e-06,
"loss": 0.7228,
"step": 700
},
{
"epoch": 2.553956834532374,
"grad_norm": 0.5802019851320436,
"learning_rate": 5e-06,
"loss": 0.7211,
"step": 710
},
{
"epoch": 2.5899280575539567,
"grad_norm": 0.633360775543426,
"learning_rate": 5e-06,
"loss": 0.7192,
"step": 720
},
{
"epoch": 2.6258992805755397,
"grad_norm": 0.7014721250700231,
"learning_rate": 5e-06,
"loss": 0.7208,
"step": 730
},
{
"epoch": 2.661870503597122,
"grad_norm": 0.5972726636881343,
"learning_rate": 5e-06,
"loss": 0.7184,
"step": 740
},
{
"epoch": 2.697841726618705,
"grad_norm": 0.5454556975289979,
"learning_rate": 5e-06,
"loss": 0.7139,
"step": 750
},
{
"epoch": 2.7338129496402876,
"grad_norm": 0.5626224999737693,
"learning_rate": 5e-06,
"loss": 0.7207,
"step": 760
},
{
"epoch": 2.7697841726618706,
"grad_norm": 0.5106193565014756,
"learning_rate": 5e-06,
"loss": 0.7193,
"step": 770
},
{
"epoch": 2.805755395683453,
"grad_norm": 0.6138738602878809,
"learning_rate": 5e-06,
"loss": 0.7185,
"step": 780
},
{
"epoch": 2.841726618705036,
"grad_norm": 0.6093685279993987,
"learning_rate": 5e-06,
"loss": 0.7217,
"step": 790
},
{
"epoch": 2.8776978417266186,
"grad_norm": 0.5564883285882788,
"learning_rate": 5e-06,
"loss": 0.7213,
"step": 800
},
{
"epoch": 2.9136690647482015,
"grad_norm": 0.5906548449538034,
"learning_rate": 5e-06,
"loss": 0.7183,
"step": 810
},
{
"epoch": 2.949640287769784,
"grad_norm": 0.5460219561244413,
"learning_rate": 5e-06,
"loss": 0.7216,
"step": 820
},
{
"epoch": 2.985611510791367,
"grad_norm": 0.6453368774762195,
"learning_rate": 5e-06,
"loss": 0.7198,
"step": 830
},
{
"epoch": 3.0,
"eval_loss": 0.7752296328544617,
"eval_runtime": 27.5746,
"eval_samples_per_second": 271.409,
"eval_steps_per_second": 1.088,
"step": 834
},
{
"epoch": 3.0,
"step": 834,
"total_flos": 1396981062696960.0,
"train_loss": 0.7675551453368555,
"train_runtime": 5571.5313,
"train_samples_per_second": 76.563,
"train_steps_per_second": 0.15
}
],
"logging_steps": 10,
"max_steps": 834,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1396981062696960.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}