{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 100,
"global_step": 370,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"grad_norm": 31.31255863547505,
"learning_rate": 5.405405405405406e-07,
"loss": 1.5205,
"step": 1
},
{
"epoch": 0.03,
"grad_norm": 10.29562052531629,
"learning_rate": 2.702702702702703e-06,
"loss": 1.4439,
"step": 5
},
{
"epoch": 0.05,
"grad_norm": 7.285096223172462,
"learning_rate": 5.405405405405406e-06,
"loss": 1.3513,
"step": 10
},
{
"epoch": 0.08,
"grad_norm": 4.503641754079376,
"learning_rate": 8.108108108108109e-06,
"loss": 1.2532,
"step": 15
},
{
"epoch": 0.11,
"grad_norm": 4.068221865118029,
"learning_rate": 1.0810810810810812e-05,
"loss": 1.2089,
"step": 20
},
{
"epoch": 0.14,
"grad_norm": 3.8587372955314843,
"learning_rate": 1.3513513513513515e-05,
"loss": 1.1817,
"step": 25
},
{
"epoch": 0.16,
"grad_norm": 3.73116376117763,
"learning_rate": 1.6216216216216218e-05,
"loss": 1.1559,
"step": 30
},
{
"epoch": 0.19,
"grad_norm": 3.832939537050073,
"learning_rate": 1.891891891891892e-05,
"loss": 1.1344,
"step": 35
},
{
"epoch": 0.22,
"grad_norm": 3.747693244279986,
"learning_rate": 1.999599507118322e-05,
"loss": 1.1199,
"step": 40
},
{
"epoch": 0.24,
"grad_norm": 3.68684932685565,
"learning_rate": 1.9971532122280466e-05,
"loss": 1.0902,
"step": 45
},
{
"epoch": 0.27,
"grad_norm": 3.694485690651995,
"learning_rate": 1.992488554155135e-05,
"loss": 1.0918,
"step": 50
},
{
"epoch": 0.3,
"grad_norm": 3.9192389906538514,
"learning_rate": 1.9856159103477085e-05,
"loss": 1.0677,
"step": 55
},
{
"epoch": 0.32,
"grad_norm": 3.395113740791023,
"learning_rate": 1.9765505703518494e-05,
"loss": 1.0449,
"step": 60
},
{
"epoch": 0.35,
"grad_norm": 55.64552490454145,
"learning_rate": 1.9653127017970035e-05,
"loss": 1.0412,
"step": 65
},
{
"epoch": 0.38,
"grad_norm": 5.907895975695113,
"learning_rate": 1.9519273055291266e-05,
"loss": 1.0291,
"step": 70
},
{
"epoch": 0.41,
"grad_norm": 16.980325045210197,
"learning_rate": 1.9364241599913923e-05,
"loss": 1.2227,
"step": 75
},
{
"epoch": 0.43,
"grad_norm": 26.879221211035098,
"learning_rate": 1.9188377549761962e-05,
"loss": 1.1577,
"step": 80
},
{
"epoch": 0.46,
"grad_norm": 17.465078721382948,
"learning_rate": 1.8992072148958368e-05,
"loss": 1.0105,
"step": 85
},
{
"epoch": 0.49,
"grad_norm": 4.258708985650342,
"learning_rate": 1.8775762117425777e-05,
"loss": 0.9719,
"step": 90
},
{
"epoch": 0.51,
"grad_norm": 3.9262468540271627,
"learning_rate": 1.853992867931721e-05,
"loss": 0.9714,
"step": 95
},
{
"epoch": 0.54,
"grad_norm": 4.127884777228108,
"learning_rate": 1.8285096492438424e-05,
"loss": 0.9227,
"step": 100
},
{
"epoch": 0.54,
"eval_loss": 1.3365702629089355,
"eval_runtime": 17.7256,
"eval_samples_per_second": 33.454,
"eval_steps_per_second": 1.072,
"step": 100
},
{
"epoch": 0.57,
"grad_norm": 3.6876289826671718,
"learning_rate": 1.8011832481043577e-05,
"loss": 0.9179,
"step": 105
},
{
"epoch": 0.59,
"grad_norm": 4.5580612433526255,
"learning_rate": 1.7720744574600865e-05,
"loss": 0.8885,
"step": 110
},
{
"epoch": 0.62,
"grad_norm": 3.840600055255248,
"learning_rate": 1.7412480355334006e-05,
"loss": 0.8921,
"step": 115
},
{
"epoch": 0.65,
"grad_norm": 4.014232990384134,
"learning_rate": 1.7087725617548385e-05,
"loss": 0.8761,
"step": 120
},
{
"epoch": 0.68,
"grad_norm": 3.362002410991992,
"learning_rate": 1.6747202841946928e-05,
"loss": 0.8497,
"step": 125
},
{
"epoch": 0.7,
"grad_norm": 3.270370211049122,
"learning_rate": 1.639166958832985e-05,
"loss": 0.8659,
"step": 130
},
{
"epoch": 0.73,
"grad_norm": 3.2957573116367196,
"learning_rate": 1.6021916810254096e-05,
"loss": 0.8237,
"step": 135
},
{
"epoch": 0.76,
"grad_norm": 3.968603893718009,
"learning_rate": 1.5638767095401778e-05,
"loss": 0.8345,
"step": 140
},
{
"epoch": 0.78,
"grad_norm": 3.458171302980965,
"learning_rate": 1.5243072835572319e-05,
"loss": 0.8189,
"step": 145
},
{
"epoch": 0.81,
"grad_norm": 3.695152856632861,
"learning_rate": 1.4835714330369445e-05,
"loss": 0.8028,
"step": 150
},
{
"epoch": 0.84,
"grad_norm": 4.148345741206558,
"learning_rate": 1.4417597828801833e-05,
"loss": 0.812,
"step": 155
},
{
"epoch": 0.86,
"grad_norm": 3.2188121700972543,
"learning_rate": 1.3989653513154165e-05,
"loss": 0.7882,
"step": 160
},
{
"epoch": 0.89,
"grad_norm": 3.4338055838751536,
"learning_rate": 1.3552833429613939e-05,
"loss": 0.7801,
"step": 165
},
{
"epoch": 0.92,
"grad_norm": 3.6193578206738666,
"learning_rate": 1.3108109370257714e-05,
"loss": 0.7733,
"step": 170
},
{
"epoch": 0.95,
"grad_norm": 3.313724488690459,
"learning_rate": 1.2656470711108763e-05,
"loss": 0.7493,
"step": 175
},
{
"epoch": 0.97,
"grad_norm": 3.1599413989481864,
"learning_rate": 1.2198922211075779e-05,
"loss": 0.7459,
"step": 180
},
{
"epoch": 1.0,
"grad_norm": 3.2871458343993627,
"learning_rate": 1.1736481776669307e-05,
"loss": 0.712,
"step": 185
},
{
"epoch": 1.03,
"grad_norm": 3.4396952311119473,
"learning_rate": 1.1270178197468788e-05,
"loss": 0.3904,
"step": 190
},
{
"epoch": 1.05,
"grad_norm": 3.8937788749830875,
"learning_rate": 1.080104885737807e-05,
"loss": 0.3743,
"step": 195
},
{
"epoch": 1.08,
"grad_norm": 3.6367405057812885,
"learning_rate": 1.0330137426761136e-05,
"loss": 0.3659,
"step": 200
},
{
"epoch": 1.08,
"eval_loss": 1.4554423093795776,
"eval_runtime": 16.8492,
"eval_samples_per_second": 35.195,
"eval_steps_per_second": 1.128,
"step": 200
},
{
"epoch": 1.11,
"grad_norm": 3.6071182321442823,
"learning_rate": 9.858491540592383e-06,
"loss": 0.3717,
"step": 205
},
{
"epoch": 1.14,
"grad_norm": 3.0993281926679,
"learning_rate": 9.38716046778684e-06,
"loss": 0.3657,
"step": 210
},
{
"epoch": 1.16,
"grad_norm": 4.1194373147335,
"learning_rate": 8.917192776895382e-06,
"loss": 0.368,
"step": 215
},
{
"epoch": 1.19,
"grad_norm": 3.0585597878049553,
"learning_rate": 8.449634003358022e-06,
"loss": 0.3613,
"step": 220
},
{
"epoch": 1.22,
"grad_norm": 3.0753210269434197,
"learning_rate": 7.985524323504948e-06,
"loss": 0.3539,
"step": 225
},
{
"epoch": 1.24,
"grad_norm": 2.922314637903982,
"learning_rate": 7.525896240479977e-06,
"loss": 0.3594,
"step": 230
},
{
"epoch": 1.27,
"grad_norm": 3.2882731601562942,
"learning_rate": 7.071772287234497e-06,
"loss": 0.3469,
"step": 235
},
{
"epoch": 1.3,
"grad_norm": 2.9886234420402147,
"learning_rate": 6.624162751702077e-06,
"loss": 0.3513,
"step": 240
},
{
"epoch": 1.32,
"grad_norm": 3.0577844685937214,
"learning_rate": 6.184063429214515e-06,
"loss": 0.3432,
"step": 245
},
{
"epoch": 1.35,
"grad_norm": 17.4039721868179,
"learning_rate": 5.752453407159521e-06,
"loss": 0.3573,
"step": 250
},
{
"epoch": 1.38,
"grad_norm": 3.2984407603690156,
"learning_rate": 5.33029288680852e-06,
"loss": 0.366,
"step": 255
},
{
"epoch": 1.41,
"grad_norm": 3.1677473809022634,
"learning_rate": 4.918521047160309e-06,
"loss": 0.3602,
"step": 260
},
{
"epoch": 1.43,
"grad_norm": 3.0805955031351195,
"learning_rate": 4.518053955552903e-06,
"loss": 0.3487,
"step": 265
},
{
"epoch": 1.46,
"grad_norm": 2.9525772320976666,
"learning_rate": 4.1297825296918145e-06,
"loss": 0.3533,
"step": 270
},
{
"epoch": 1.49,
"grad_norm": 2.719592667191477,
"learning_rate": 3.754570555628613e-06,
"loss": 0.319,
"step": 275
},
{
"epoch": 1.51,
"grad_norm": 2.862304557924882,
"learning_rate": 3.3932527660991877e-06,
"loss": 0.3238,
"step": 280
},
{
"epoch": 1.54,
"grad_norm": 2.876259708390993,
"learning_rate": 3.0466329834968234e-06,
"loss": 0.337,
"step": 285
},
{
"epoch": 1.57,
"grad_norm": 2.833727652532009,
"learning_rate": 2.715482331611393e-06,
"loss": 0.3315,
"step": 290
},
{
"epoch": 1.59,
"grad_norm": 3.160644601967737,
"learning_rate": 2.4005375201130275e-06,
"loss": 0.3334,
"step": 295
},
{
"epoch": 1.62,
"grad_norm": 2.908187326503437,
"learning_rate": 2.102499205596743e-06,
"loss": 0.3187,
"step": 300
},
{
"epoch": 1.62,
"eval_loss": 1.4746283292770386,
"eval_runtime": 16.7664,
"eval_samples_per_second": 35.368,
"eval_steps_per_second": 1.133,
"step": 300
},
{
"epoch": 1.65,
"grad_norm": 2.8504402811606853,
"learning_rate": 1.8220304328342253e-06,
"loss": 0.32,
"step": 305
},
{
"epoch": 1.68,
"grad_norm": 2.7717294675757183,
"learning_rate": 1.5597551597004968e-06,
"loss": 0.3187,
"step": 310
},
{
"epoch": 1.7,
"grad_norm": 2.778864670191994,
"learning_rate": 1.3162568690570743e-06,
"loss": 0.3187,
"step": 315
},
{
"epoch": 1.73,
"grad_norm": 3.88092742516773,
"learning_rate": 1.0920772706797166e-06,
"loss": 0.3131,
"step": 320
},
{
"epoch": 1.76,
"grad_norm": 2.825202349538424,
"learning_rate": 8.87715096118642e-07,
"loss": 0.3096,
"step": 325
},
{
"epoch": 1.78,
"grad_norm": 2.8002712016640277,
"learning_rate": 7.03624989172228e-07,
"loss": 0.3058,
"step": 330
},
{
"epoch": 1.81,
"grad_norm": 2.9740828096451057,
"learning_rate": 5.402164944425758e-07,
"loss": 0.3041,
"step": 335
},
{
"epoch": 1.84,
"grad_norm": 2.780072576121478,
"learning_rate": 3.97853146223105e-07,
"loss": 0.3133,
"step": 340
},
{
"epoch": 1.86,
"grad_norm": 3.0969397831376018,
"learning_rate": 2.7685165974510987e-07,
"loss": 0.3179,
"step": 345
},
{
"epoch": 1.89,
"grad_norm": 2.7014720778740218,
"learning_rate": 1.7748122658251877e-07,
"loss": 0.3043,
"step": 350
},
{
"epoch": 1.92,
"grad_norm": 2.837315264450012,
"learning_rate": 9.996291578236228e-08,
"loss": 0.3143,
"step": 355
},
{
"epoch": 1.95,
"grad_norm": 2.8950081674485992,
"learning_rate": 4.44691820532539e-08,
"loss": 0.31,
"step": 360
},
{
"epoch": 1.97,
"grad_norm": 2.570055503485132,
"learning_rate": 1.1123482106021322e-08,
"loss": 0.3036,
"step": 365
},
{
"epoch": 2.0,
"grad_norm": 2.616680349359466,
"learning_rate": 0.0,
"loss": 0.3027,
"step": 370
},
{
"epoch": 2.0,
"step": 370,
"total_flos": 19367618150400.0,
"train_loss": 0.660463113075978,
"train_runtime": 1759.0522,
"train_samples_per_second": 6.713,
"train_steps_per_second": 0.21
}
],
"logging_steps": 5,
"max_steps": 370,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"total_flos": 19367618150400.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}