{
  "best_metric": 0.31647276878356934,
  "best_model_checkpoint": "deberta-v3-xsmall-zyda-2-transformed-quality-new/checkpoint-37947",
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 37947,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03952881650723377,
      "grad_norm": 2.9192252159118652,
      "learning_rate": 4.9341186391546106e-05,
      "loss": 0.6141,
      "step": 500
    },
    {
      "epoch": 0.07905763301446754,
      "grad_norm": 2.9683828353881836,
      "learning_rate": 4.868237278309221e-05,
      "loss": 0.5164,
      "step": 1000
    },
    {
      "epoch": 0.11858644952170132,
      "grad_norm": 2.587739944458008,
      "learning_rate": 4.802355917463832e-05,
      "loss": 0.4719,
      "step": 1500
    },
    {
      "epoch": 0.15811526602893508,
      "grad_norm": 2.8187127113342285,
      "learning_rate": 4.7364745566184415e-05,
      "loss": 0.457,
      "step": 2000
    },
    {
      "epoch": 0.19764408253616886,
      "grad_norm": 3.325552463531494,
      "learning_rate": 4.670593195773052e-05,
      "loss": 0.4503,
      "step": 2500
    },
    {
      "epoch": 0.23717289904340264,
      "grad_norm": 2.489327907562256,
      "learning_rate": 4.604711834927662e-05,
      "loss": 0.4345,
      "step": 3000
    },
    {
      "epoch": 0.27670171555063644,
      "grad_norm": 3.6023690700531006,
      "learning_rate": 4.538830474082273e-05,
      "loss": 0.4185,
      "step": 3500
    },
    {
      "epoch": 0.31623053205787016,
      "grad_norm": 2.4426589012145996,
      "learning_rate": 4.4729491132368835e-05,
      "loss": 0.4095,
      "step": 4000
    },
    {
      "epoch": 0.35575934856510394,
      "grad_norm": 3.424283981323242,
      "learning_rate": 4.407067752391494e-05,
      "loss": 0.399,
      "step": 4500
    },
    {
      "epoch": 0.3952881650723377,
      "grad_norm": 2.923548936843872,
      "learning_rate": 4.3411863915461035e-05,
      "loss": 0.3984,
      "step": 5000
    },
    {
      "epoch": 0.4348169815795715,
      "grad_norm": 2.6173856258392334,
      "learning_rate": 4.2753050307007145e-05,
      "loss": 0.3896,
      "step": 5500
    },
    {
      "epoch": 0.47434579808680527,
      "grad_norm": 2.0746352672576904,
      "learning_rate": 4.209423669855325e-05,
      "loss": 0.3809,
      "step": 6000
    },
    {
      "epoch": 0.513874614594039,
      "grad_norm": 1.683587908744812,
      "learning_rate": 4.143542309009935e-05,
      "loss": 0.3805,
      "step": 6500
    },
    {
      "epoch": 0.5534034311012729,
      "grad_norm": 3.254350185394287,
      "learning_rate": 4.0776609481645455e-05,
      "loss": 0.3761,
      "step": 7000
    },
    {
      "epoch": 0.5929322476085066,
      "grad_norm": 2.623117685317993,
      "learning_rate": 4.011779587319156e-05,
      "loss": 0.3806,
      "step": 7500
    },
    {
      "epoch": 0.6324610641157403,
      "grad_norm": 2.246891975402832,
      "learning_rate": 3.945898226473767e-05,
      "loss": 0.3609,
      "step": 8000
    },
    {
      "epoch": 0.6719898806229742,
      "grad_norm": 3.263880729675293,
      "learning_rate": 3.8800168656283764e-05,
      "loss": 0.3717,
      "step": 8500
    },
    {
      "epoch": 0.7115186971302079,
      "grad_norm": 2.449371337890625,
      "learning_rate": 3.814135504782987e-05,
      "loss": 0.3693,
      "step": 9000
    },
    {
      "epoch": 0.7510475136374417,
      "grad_norm": 2.3527493476867676,
      "learning_rate": 3.748254143937597e-05,
      "loss": 0.3623,
      "step": 9500
    },
    {
      "epoch": 0.7905763301446754,
      "grad_norm": 2.8773348331451416,
      "learning_rate": 3.682372783092208e-05,
      "loss": 0.3615,
      "step": 10000
    },
    {
      "epoch": 0.8301051466519093,
      "grad_norm": 2.7280490398406982,
      "learning_rate": 3.6164914222468184e-05,
      "loss": 0.3598,
      "step": 10500
    },
    {
      "epoch": 0.869633963159143,
      "grad_norm": 2.2904133796691895,
      "learning_rate": 3.550610061401429e-05,
      "loss": 0.3539,
      "step": 11000
    },
    {
      "epoch": 0.9091627796663768,
      "grad_norm": 2.197935104370117,
      "learning_rate": 3.4847287005560384e-05,
      "loss": 0.3563,
      "step": 11500
    },
    {
      "epoch": 0.9486915961736105,
      "grad_norm": 2.6248714923858643,
      "learning_rate": 3.4188473397106494e-05,
      "loss": 0.353,
      "step": 12000
    },
    {
      "epoch": 0.9882204126808444,
      "grad_norm": 2.276596784591675,
      "learning_rate": 3.35296597886526e-05,
      "loss": 0.3506,
      "step": 12500
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.35121020674705505,
      "eval_mse": 0.35121019676996473,
      "eval_runtime": 148.1236,
      "eval_samples_per_second": 675.112,
      "eval_steps_per_second": 84.389,
      "step": 12649
    },
    {
      "epoch": 1.027749229188078,
      "grad_norm": 2.749833822250366,
      "learning_rate": 3.28708461801987e-05,
      "loss": 0.3211,
      "step": 13000
    },
    {
      "epoch": 1.067278045695312,
      "grad_norm": 2.4262359142303467,
      "learning_rate": 3.2212032571744803e-05,
      "loss": 0.303,
      "step": 13500
    },
    {
      "epoch": 1.1068068622025455,
      "grad_norm": 2.7832248210906982,
      "learning_rate": 3.155321896329091e-05,
      "loss": 0.3047,
      "step": 14000
    },
    {
      "epoch": 1.1463356787097794,
      "grad_norm": 2.4965128898620605,
      "learning_rate": 3.089440535483701e-05,
      "loss": 0.2994,
      "step": 14500
    },
    {
      "epoch": 1.1858644952170132,
      "grad_norm": 2.200387716293335,
      "learning_rate": 3.0235591746383113e-05,
      "loss": 0.3,
      "step": 15000
    },
    {
      "epoch": 1.225393311724247,
      "grad_norm": 2.688816547393799,
      "learning_rate": 2.9576778137929216e-05,
      "loss": 0.2993,
      "step": 15500
    },
    {
      "epoch": 1.2649221282314809,
      "grad_norm": 3.349059820175171,
      "learning_rate": 2.8917964529475323e-05,
      "loss": 0.297,
      "step": 16000
    },
    {
      "epoch": 1.3044509447387145,
      "grad_norm": 2.224276542663574,
      "learning_rate": 2.8259150921021426e-05,
      "loss": 0.2986,
      "step": 16500
    },
    {
      "epoch": 1.3439797612459483,
      "grad_norm": 2.157297134399414,
      "learning_rate": 2.7600337312567533e-05,
      "loss": 0.2978,
      "step": 17000
    },
    {
      "epoch": 1.3835085777531821,
      "grad_norm": 2.9185595512390137,
      "learning_rate": 2.6941523704113636e-05,
      "loss": 0.2964,
      "step": 17500
    },
    {
      "epoch": 1.4230373942604158,
      "grad_norm": 2.66011381149292,
      "learning_rate": 2.6282710095659736e-05,
      "loss": 0.2931,
      "step": 18000
    },
    {
      "epoch": 1.4625662107676496,
      "grad_norm": 3.047621011734009,
      "learning_rate": 2.562389648720584e-05,
      "loss": 0.2941,
      "step": 18500
    },
    {
      "epoch": 1.5020950272748834,
      "grad_norm": 2.8634958267211914,
      "learning_rate": 2.4965082878751946e-05,
      "loss": 0.2933,
      "step": 19000
    },
    {
      "epoch": 1.541623843782117,
      "grad_norm": 2.827955722808838,
      "learning_rate": 2.430626927029805e-05,
      "loss": 0.2909,
      "step": 19500
    },
    {
      "epoch": 1.581152660289351,
      "grad_norm": 2.969160556793213,
      "learning_rate": 2.3647455661844152e-05,
      "loss": 0.2892,
      "step": 20000
    },
    {
      "epoch": 1.6206814767965847,
      "grad_norm": 2.9614064693450928,
      "learning_rate": 2.2988642053390256e-05,
      "loss": 0.2897,
      "step": 20500
    },
    {
      "epoch": 1.6602102933038185,
      "grad_norm": 2.6613972187042236,
      "learning_rate": 2.232982844493636e-05,
      "loss": 0.2886,
      "step": 21000
    },
    {
      "epoch": 1.6997391098110524,
      "grad_norm": 2.762899398803711,
      "learning_rate": 2.1671014836482465e-05,
      "loss": 0.2872,
      "step": 21500
    },
    {
      "epoch": 1.739267926318286,
      "grad_norm": 3.2839698791503906,
      "learning_rate": 2.1012201228028565e-05,
      "loss": 0.2895,
      "step": 22000
    },
    {
      "epoch": 1.7787967428255198,
      "grad_norm": 2.6103265285491943,
      "learning_rate": 2.0353387619574672e-05,
      "loss": 0.2872,
      "step": 22500
    },
    {
      "epoch": 1.8183255593327536,
      "grad_norm": 3.014410972595215,
      "learning_rate": 1.9694574011120775e-05,
      "loss": 0.2902,
      "step": 23000
    },
    {
      "epoch": 1.8578543758399872,
      "grad_norm": 3.351289987564087,
      "learning_rate": 1.903576040266688e-05,
      "loss": 0.2876,
      "step": 23500
    },
    {
      "epoch": 1.897383192347221,
      "grad_norm": 3.1844663619995117,
      "learning_rate": 1.837694679421298e-05,
      "loss": 0.2855,
      "step": 24000
    },
    {
      "epoch": 1.936912008854455,
      "grad_norm": 2.6804494857788086,
      "learning_rate": 1.7718133185759085e-05,
      "loss": 0.2859,
      "step": 24500
    },
    {
      "epoch": 1.9764408253616885,
      "grad_norm": 2.4572935104370117,
      "learning_rate": 1.7059319577305188e-05,
      "loss": 0.28,
      "step": 25000
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.3186613619327545,
      "eval_mse": 0.31866135950110924,
      "eval_runtime": 145.5886,
      "eval_samples_per_second": 686.867,
      "eval_steps_per_second": 85.858,
      "step": 25298
    },
    {
      "epoch": 2.0159696418689226,
      "grad_norm": 2.0255794525146484,
      "learning_rate": 1.640050596885129e-05,
      "loss": 0.2658,
      "step": 25500
    },
    {
      "epoch": 2.055498458376156,
      "grad_norm": 2.699540853500366,
      "learning_rate": 1.5741692360397398e-05,
      "loss": 0.2477,
      "step": 26000
    },
    {
      "epoch": 2.09502727488339,
      "grad_norm": 3.3947625160217285,
      "learning_rate": 1.5082878751943503e-05,
      "loss": 0.2466,
      "step": 26500
    },
    {
      "epoch": 2.134556091390624,
      "grad_norm": 2.3115530014038086,
      "learning_rate": 1.4424065143489604e-05,
      "loss": 0.2465,
      "step": 27000
    },
    {
      "epoch": 2.1740849078978575,
      "grad_norm": 2.9776363372802734,
      "learning_rate": 1.376525153503571e-05,
      "loss": 0.2446,
      "step": 27500
    },
    {
      "epoch": 2.213613724405091,
      "grad_norm": 1.9914076328277588,
      "learning_rate": 1.3106437926581813e-05,
      "loss": 0.2417,
      "step": 28000
    },
    {
      "epoch": 2.253142540912325,
      "grad_norm": 2.153264284133911,
      "learning_rate": 1.2447624318127916e-05,
      "loss": 0.2436,
      "step": 28500
    },
    {
      "epoch": 2.2926713574195587,
      "grad_norm": 2.458815574645996,
      "learning_rate": 1.1788810709674019e-05,
      "loss": 0.2446,
      "step": 29000
    },
    {
      "epoch": 2.332200173926793,
      "grad_norm": 2.1611809730529785,
      "learning_rate": 1.1129997101220122e-05,
      "loss": 0.2408,
      "step": 29500
    },
    {
      "epoch": 2.3717289904340264,
      "grad_norm": 3.270725727081299,
      "learning_rate": 1.0471183492766227e-05,
      "loss": 0.2446,
      "step": 30000
    },
    {
      "epoch": 2.41125780694126,
      "grad_norm": 2.7944698333740234,
      "learning_rate": 9.81236988431233e-06,
      "loss": 0.242,
      "step": 30500
    },
    {
      "epoch": 2.450786623448494,
      "grad_norm": 2.138493299484253,
      "learning_rate": 9.153556275858434e-06,
      "loss": 0.2428,
      "step": 31000
    },
    {
      "epoch": 2.4903154399557277,
      "grad_norm": 2.3900632858276367,
      "learning_rate": 8.494742667404539e-06,
      "loss": 0.2435,
      "step": 31500
    },
    {
      "epoch": 2.5298442564629617,
      "grad_norm": 3.044337749481201,
      "learning_rate": 7.835929058950642e-06,
      "loss": 0.2411,
      "step": 32000
    },
    {
      "epoch": 2.5693730729701953,
      "grad_norm": 2.801241159439087,
      "learning_rate": 7.177115450496746e-06,
      "loss": 0.2437,
      "step": 32500
    },
    {
      "epoch": 2.608901889477429,
      "grad_norm": 2.554879903793335,
      "learning_rate": 6.518301842042849e-06,
      "loss": 0.2436,
      "step": 33000
    },
    {
      "epoch": 2.648430705984663,
      "grad_norm": 2.7090861797332764,
      "learning_rate": 5.859488233588953e-06,
      "loss": 0.2403,
      "step": 33500
    },
    {
      "epoch": 2.6879595224918966,
      "grad_norm": 2.0840466022491455,
      "learning_rate": 5.200674625135057e-06,
      "loss": 0.2396,
      "step": 34000
    },
    {
      "epoch": 2.7274883389991302,
      "grad_norm": 2.8196446895599365,
      "learning_rate": 4.541861016681161e-06,
      "loss": 0.2372,
      "step": 34500
    },
    {
      "epoch": 2.7670171555063643,
      "grad_norm": 2.4318623542785645,
      "learning_rate": 3.883047408227265e-06,
      "loss": 0.2399,
      "step": 35000
    },
    {
      "epoch": 2.806545972013598,
      "grad_norm": 3.474367380142212,
      "learning_rate": 3.2242337997733685e-06,
      "loss": 0.2368,
      "step": 35500
    },
    {
      "epoch": 2.8460747885208315,
      "grad_norm": 2.736551523208618,
      "learning_rate": 2.565420191319472e-06,
      "loss": 0.2367,
      "step": 36000
    },
    {
      "epoch": 2.8856036050280656,
      "grad_norm": 1.705733060836792,
      "learning_rate": 1.9066065828655756e-06,
      "loss": 0.2397,
      "step": 36500
    },
    {
      "epoch": 2.925132421535299,
      "grad_norm": 2.315380811691284,
      "learning_rate": 1.2477929744116795e-06,
      "loss": 0.2388,
      "step": 37000
    },
    {
      "epoch": 2.964661238042533,
      "grad_norm": 2.4691686630249023,
      "learning_rate": 5.889793659577832e-07,
      "loss": 0.2398,
      "step": 37500
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.31647276878356934,
      "eval_mse": 0.3164727601450849,
      "eval_runtime": 132.665,
      "eval_samples_per_second": 753.778,
      "eval_steps_per_second": 94.222,
      "step": 37947
    },
    {
      "epoch": 3.0,
      "step": 37947,
      "total_flos": 3.999532597714253e+16,
      "train_loss": 0.31243033600999526,
      "train_runtime": 5395.7257,
      "train_samples_per_second": 450.097,
      "train_steps_per_second": 7.033
    }
  ],
  "logging_steps": 500,
  "max_steps": 37947,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.999532597714253e+16,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}