deberta-v3-xsmall-zyda-2-quality / trainer_state.json
{
"best_metric": 0.31647276878356934,
"best_model_checkpoint": "deberta-v3-xsmall-zyda-2-transformed-quality-new/checkpoint-37947",
"epoch": 3.0,
"eval_steps": 500,
"global_step": 37947,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03952881650723377,
"grad_norm": 2.9192252159118652,
"learning_rate": 4.9341186391546106e-05,
"loss": 0.6141,
"step": 500
},
{
"epoch": 0.07905763301446754,
"grad_norm": 2.9683828353881836,
"learning_rate": 4.868237278309221e-05,
"loss": 0.5164,
"step": 1000
},
{
"epoch": 0.11858644952170132,
"grad_norm": 2.587739944458008,
"learning_rate": 4.802355917463832e-05,
"loss": 0.4719,
"step": 1500
},
{
"epoch": 0.15811526602893508,
"grad_norm": 2.8187127113342285,
"learning_rate": 4.7364745566184415e-05,
"loss": 0.457,
"step": 2000
},
{
"epoch": 0.19764408253616886,
"grad_norm": 3.325552463531494,
"learning_rate": 4.670593195773052e-05,
"loss": 0.4503,
"step": 2500
},
{
"epoch": 0.23717289904340264,
"grad_norm": 2.489327907562256,
"learning_rate": 4.604711834927662e-05,
"loss": 0.4345,
"step": 3000
},
{
"epoch": 0.27670171555063644,
"grad_norm": 3.6023690700531006,
"learning_rate": 4.538830474082273e-05,
"loss": 0.4185,
"step": 3500
},
{
"epoch": 0.31623053205787016,
"grad_norm": 2.4426589012145996,
"learning_rate": 4.4729491132368835e-05,
"loss": 0.4095,
"step": 4000
},
{
"epoch": 0.35575934856510394,
"grad_norm": 3.424283981323242,
"learning_rate": 4.407067752391494e-05,
"loss": 0.399,
"step": 4500
},
{
"epoch": 0.3952881650723377,
"grad_norm": 2.923548936843872,
"learning_rate": 4.3411863915461035e-05,
"loss": 0.3984,
"step": 5000
},
{
"epoch": 0.4348169815795715,
"grad_norm": 2.6173856258392334,
"learning_rate": 4.2753050307007145e-05,
"loss": 0.3896,
"step": 5500
},
{
"epoch": 0.47434579808680527,
"grad_norm": 2.0746352672576904,
"learning_rate": 4.209423669855325e-05,
"loss": 0.3809,
"step": 6000
},
{
"epoch": 0.513874614594039,
"grad_norm": 1.683587908744812,
"learning_rate": 4.143542309009935e-05,
"loss": 0.3805,
"step": 6500
},
{
"epoch": 0.5534034311012729,
"grad_norm": 3.254350185394287,
"learning_rate": 4.0776609481645455e-05,
"loss": 0.3761,
"step": 7000
},
{
"epoch": 0.5929322476085066,
"grad_norm": 2.623117685317993,
"learning_rate": 4.011779587319156e-05,
"loss": 0.3806,
"step": 7500
},
{
"epoch": 0.6324610641157403,
"grad_norm": 2.246891975402832,
"learning_rate": 3.945898226473767e-05,
"loss": 0.3609,
"step": 8000
},
{
"epoch": 0.6719898806229742,
"grad_norm": 3.263880729675293,
"learning_rate": 3.8800168656283764e-05,
"loss": 0.3717,
"step": 8500
},
{
"epoch": 0.7115186971302079,
"grad_norm": 2.449371337890625,
"learning_rate": 3.814135504782987e-05,
"loss": 0.3693,
"step": 9000
},
{
"epoch": 0.7510475136374417,
"grad_norm": 2.3527493476867676,
"learning_rate": 3.748254143937597e-05,
"loss": 0.3623,
"step": 9500
},
{
"epoch": 0.7905763301446754,
"grad_norm": 2.8773348331451416,
"learning_rate": 3.682372783092208e-05,
"loss": 0.3615,
"step": 10000
},
{
"epoch": 0.8301051466519093,
"grad_norm": 2.7280490398406982,
"learning_rate": 3.6164914222468184e-05,
"loss": 0.3598,
"step": 10500
},
{
"epoch": 0.869633963159143,
"grad_norm": 2.2904133796691895,
"learning_rate": 3.550610061401429e-05,
"loss": 0.3539,
"step": 11000
},
{
"epoch": 0.9091627796663768,
"grad_norm": 2.197935104370117,
"learning_rate": 3.4847287005560384e-05,
"loss": 0.3563,
"step": 11500
},
{
"epoch": 0.9486915961736105,
"grad_norm": 2.6248714923858643,
"learning_rate": 3.4188473397106494e-05,
"loss": 0.353,
"step": 12000
},
{
"epoch": 0.9882204126808444,
"grad_norm": 2.276596784591675,
"learning_rate": 3.35296597886526e-05,
"loss": 0.3506,
"step": 12500
},
{
"epoch": 1.0,
"eval_loss": 0.35121020674705505,
"eval_mse": 0.35121019676996473,
"eval_runtime": 148.1236,
"eval_samples_per_second": 675.112,
"eval_steps_per_second": 84.389,
"step": 12649
},
{
"epoch": 1.027749229188078,
"grad_norm": 2.749833822250366,
"learning_rate": 3.28708461801987e-05,
"loss": 0.3211,
"step": 13000
},
{
"epoch": 1.067278045695312,
"grad_norm": 2.4262359142303467,
"learning_rate": 3.2212032571744803e-05,
"loss": 0.303,
"step": 13500
},
{
"epoch": 1.1068068622025455,
"grad_norm": 2.7832248210906982,
"learning_rate": 3.155321896329091e-05,
"loss": 0.3047,
"step": 14000
},
{
"epoch": 1.1463356787097794,
"grad_norm": 2.4965128898620605,
"learning_rate": 3.089440535483701e-05,
"loss": 0.2994,
"step": 14500
},
{
"epoch": 1.1858644952170132,
"grad_norm": 2.200387716293335,
"learning_rate": 3.0235591746383113e-05,
"loss": 0.3,
"step": 15000
},
{
"epoch": 1.225393311724247,
"grad_norm": 2.688816547393799,
"learning_rate": 2.9576778137929216e-05,
"loss": 0.2993,
"step": 15500
},
{
"epoch": 1.2649221282314809,
"grad_norm": 3.349059820175171,
"learning_rate": 2.8917964529475323e-05,
"loss": 0.297,
"step": 16000
},
{
"epoch": 1.3044509447387145,
"grad_norm": 2.224276542663574,
"learning_rate": 2.8259150921021426e-05,
"loss": 0.2986,
"step": 16500
},
{
"epoch": 1.3439797612459483,
"grad_norm": 2.157297134399414,
"learning_rate": 2.7600337312567533e-05,
"loss": 0.2978,
"step": 17000
},
{
"epoch": 1.3835085777531821,
"grad_norm": 2.9185595512390137,
"learning_rate": 2.6941523704113636e-05,
"loss": 0.2964,
"step": 17500
},
{
"epoch": 1.4230373942604158,
"grad_norm": 2.66011381149292,
"learning_rate": 2.6282710095659736e-05,
"loss": 0.2931,
"step": 18000
},
{
"epoch": 1.4625662107676496,
"grad_norm": 3.047621011734009,
"learning_rate": 2.562389648720584e-05,
"loss": 0.2941,
"step": 18500
},
{
"epoch": 1.5020950272748834,
"grad_norm": 2.8634958267211914,
"learning_rate": 2.4965082878751946e-05,
"loss": 0.2933,
"step": 19000
},
{
"epoch": 1.541623843782117,
"grad_norm": 2.827955722808838,
"learning_rate": 2.430626927029805e-05,
"loss": 0.2909,
"step": 19500
},
{
"epoch": 1.581152660289351,
"grad_norm": 2.969160556793213,
"learning_rate": 2.3647455661844152e-05,
"loss": 0.2892,
"step": 20000
},
{
"epoch": 1.6206814767965847,
"grad_norm": 2.9614064693450928,
"learning_rate": 2.2988642053390256e-05,
"loss": 0.2897,
"step": 20500
},
{
"epoch": 1.6602102933038185,
"grad_norm": 2.6613972187042236,
"learning_rate": 2.232982844493636e-05,
"loss": 0.2886,
"step": 21000
},
{
"epoch": 1.6997391098110524,
"grad_norm": 2.762899398803711,
"learning_rate": 2.1671014836482465e-05,
"loss": 0.2872,
"step": 21500
},
{
"epoch": 1.739267926318286,
"grad_norm": 3.2839698791503906,
"learning_rate": 2.1012201228028565e-05,
"loss": 0.2895,
"step": 22000
},
{
"epoch": 1.7787967428255198,
"grad_norm": 2.6103265285491943,
"learning_rate": 2.0353387619574672e-05,
"loss": 0.2872,
"step": 22500
},
{
"epoch": 1.8183255593327536,
"grad_norm": 3.014410972595215,
"learning_rate": 1.9694574011120775e-05,
"loss": 0.2902,
"step": 23000
},
{
"epoch": 1.8578543758399872,
"grad_norm": 3.351289987564087,
"learning_rate": 1.903576040266688e-05,
"loss": 0.2876,
"step": 23500
},
{
"epoch": 1.897383192347221,
"grad_norm": 3.1844663619995117,
"learning_rate": 1.837694679421298e-05,
"loss": 0.2855,
"step": 24000
},
{
"epoch": 1.936912008854455,
"grad_norm": 2.6804494857788086,
"learning_rate": 1.7718133185759085e-05,
"loss": 0.2859,
"step": 24500
},
{
"epoch": 1.9764408253616885,
"grad_norm": 2.4572935104370117,
"learning_rate": 1.7059319577305188e-05,
"loss": 0.28,
"step": 25000
},
{
"epoch": 2.0,
"eval_loss": 0.3186613619327545,
"eval_mse": 0.31866135950110924,
"eval_runtime": 145.5886,
"eval_samples_per_second": 686.867,
"eval_steps_per_second": 85.858,
"step": 25298
},
{
"epoch": 2.0159696418689226,
"grad_norm": 2.0255794525146484,
"learning_rate": 1.640050596885129e-05,
"loss": 0.2658,
"step": 25500
},
{
"epoch": 2.055498458376156,
"grad_norm": 2.699540853500366,
"learning_rate": 1.5741692360397398e-05,
"loss": 0.2477,
"step": 26000
},
{
"epoch": 2.09502727488339,
"grad_norm": 3.3947625160217285,
"learning_rate": 1.5082878751943503e-05,
"loss": 0.2466,
"step": 26500
},
{
"epoch": 2.134556091390624,
"grad_norm": 2.3115530014038086,
"learning_rate": 1.4424065143489604e-05,
"loss": 0.2465,
"step": 27000
},
{
"epoch": 2.1740849078978575,
"grad_norm": 2.9776363372802734,
"learning_rate": 1.376525153503571e-05,
"loss": 0.2446,
"step": 27500
},
{
"epoch": 2.213613724405091,
"grad_norm": 1.9914076328277588,
"learning_rate": 1.3106437926581813e-05,
"loss": 0.2417,
"step": 28000
},
{
"epoch": 2.253142540912325,
"grad_norm": 2.153264284133911,
"learning_rate": 1.2447624318127916e-05,
"loss": 0.2436,
"step": 28500
},
{
"epoch": 2.2926713574195587,
"grad_norm": 2.458815574645996,
"learning_rate": 1.1788810709674019e-05,
"loss": 0.2446,
"step": 29000
},
{
"epoch": 2.332200173926793,
"grad_norm": 2.1611809730529785,
"learning_rate": 1.1129997101220122e-05,
"loss": 0.2408,
"step": 29500
},
{
"epoch": 2.3717289904340264,
"grad_norm": 3.270725727081299,
"learning_rate": 1.0471183492766227e-05,
"loss": 0.2446,
"step": 30000
},
{
"epoch": 2.41125780694126,
"grad_norm": 2.7944698333740234,
"learning_rate": 9.81236988431233e-06,
"loss": 0.242,
"step": 30500
},
{
"epoch": 2.450786623448494,
"grad_norm": 2.138493299484253,
"learning_rate": 9.153556275858434e-06,
"loss": 0.2428,
"step": 31000
},
{
"epoch": 2.4903154399557277,
"grad_norm": 2.3900632858276367,
"learning_rate": 8.494742667404539e-06,
"loss": 0.2435,
"step": 31500
},
{
"epoch": 2.5298442564629617,
"grad_norm": 3.044337749481201,
"learning_rate": 7.835929058950642e-06,
"loss": 0.2411,
"step": 32000
},
{
"epoch": 2.5693730729701953,
"grad_norm": 2.801241159439087,
"learning_rate": 7.177115450496746e-06,
"loss": 0.2437,
"step": 32500
},
{
"epoch": 2.608901889477429,
"grad_norm": 2.554879903793335,
"learning_rate": 6.518301842042849e-06,
"loss": 0.2436,
"step": 33000
},
{
"epoch": 2.648430705984663,
"grad_norm": 2.7090861797332764,
"learning_rate": 5.859488233588953e-06,
"loss": 0.2403,
"step": 33500
},
{
"epoch": 2.6879595224918966,
"grad_norm": 2.0840466022491455,
"learning_rate": 5.200674625135057e-06,
"loss": 0.2396,
"step": 34000
},
{
"epoch": 2.7274883389991302,
"grad_norm": 2.8196446895599365,
"learning_rate": 4.541861016681161e-06,
"loss": 0.2372,
"step": 34500
},
{
"epoch": 2.7670171555063643,
"grad_norm": 2.4318623542785645,
"learning_rate": 3.883047408227265e-06,
"loss": 0.2399,
"step": 35000
},
{
"epoch": 2.806545972013598,
"grad_norm": 3.474367380142212,
"learning_rate": 3.2242337997733685e-06,
"loss": 0.2368,
"step": 35500
},
{
"epoch": 2.8460747885208315,
"grad_norm": 2.736551523208618,
"learning_rate": 2.565420191319472e-06,
"loss": 0.2367,
"step": 36000
},
{
"epoch": 2.8856036050280656,
"grad_norm": 1.705733060836792,
"learning_rate": 1.9066065828655756e-06,
"loss": 0.2397,
"step": 36500
},
{
"epoch": 2.925132421535299,
"grad_norm": 2.315380811691284,
"learning_rate": 1.2477929744116795e-06,
"loss": 0.2388,
"step": 37000
},
{
"epoch": 2.964661238042533,
"grad_norm": 2.4691686630249023,
"learning_rate": 5.889793659577832e-07,
"loss": 0.2398,
"step": 37500
},
{
"epoch": 3.0,
"eval_loss": 0.31647276878356934,
"eval_mse": 0.3164727601450849,
"eval_runtime": 132.665,
"eval_samples_per_second": 753.778,
"eval_steps_per_second": 94.222,
"step": 37947
},
{
"epoch": 3.0,
"step": 37947,
"total_flos": 3.999532597714253e+16,
"train_loss": 0.31243033600999526,
"train_runtime": 5395.7257,
"train_samples_per_second": 450.097,
"train_steps_per_second": 7.033
}
],
"logging_steps": 500,
"max_steps": 37947,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.999532597714253e+16,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}
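
The JSON above is the standard trainer_state.json layout written by the Hugging Face Trainer: top-level fields such as best_metric and best_model_checkpoint, plus a log_history array whose entries carry either per-step training logs (loss, learning_rate, grad_norm) or per-epoch evaluation results (eval_loss, eval_mse). A minimal sketch for inspecting it is shown below; the local path "trainer_state.json" is an assumption, and the snippet only reads the keys that appear in this file.

```python
import json

# Load the trainer state exported by the Hugging Face Trainer.
# The path is an assumption; point it at the downloaded trainer_state.json.
with open("trainer_state.json", "r", encoding="utf-8") as f:
    state = json.load(f)

print("best checkpoint:", state["best_model_checkpoint"])
print("best eval loss: ", state["best_metric"])

# Per-epoch evaluation entries are the ones that contain "eval_loss";
# per-step training entries contain "loss" and "learning_rate" instead.
for entry in state["log_history"]:
    if "eval_loss" in entry:
        print(f"epoch {entry['epoch']:.0f}: "
              f"eval_loss={entry['eval_loss']:.4f}, "
              f"eval_mse={entry['eval_mse']:.4f}")
```

For this run the loop would print the three per-epoch rows (eval_loss 0.3512, 0.3187, 0.3165), matching best_metric at the epoch-3 checkpoint.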