{ "best_metric": 0.31647276878356934, "best_model_checkpoint": "deberta-v3-xsmall-zyda-2-transformed-quality-new/checkpoint-37947", "epoch": 3.0, "eval_steps": 500, "global_step": 37947, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03952881650723377, "grad_norm": 2.9192252159118652, "learning_rate": 4.9341186391546106e-05, "loss": 0.6141, "step": 500 }, { "epoch": 0.07905763301446754, "grad_norm": 2.9683828353881836, "learning_rate": 4.868237278309221e-05, "loss": 0.5164, "step": 1000 }, { "epoch": 0.11858644952170132, "grad_norm": 2.587739944458008, "learning_rate": 4.802355917463832e-05, "loss": 0.4719, "step": 1500 }, { "epoch": 0.15811526602893508, "grad_norm": 2.8187127113342285, "learning_rate": 4.7364745566184415e-05, "loss": 0.457, "step": 2000 }, { "epoch": 0.19764408253616886, "grad_norm": 3.325552463531494, "learning_rate": 4.670593195773052e-05, "loss": 0.4503, "step": 2500 }, { "epoch": 0.23717289904340264, "grad_norm": 2.489327907562256, "learning_rate": 4.604711834927662e-05, "loss": 0.4345, "step": 3000 }, { "epoch": 0.27670171555063644, "grad_norm": 3.6023690700531006, "learning_rate": 4.538830474082273e-05, "loss": 0.4185, "step": 3500 }, { "epoch": 0.31623053205787016, "grad_norm": 2.4426589012145996, "learning_rate": 4.4729491132368835e-05, "loss": 0.4095, "step": 4000 }, { "epoch": 0.35575934856510394, "grad_norm": 3.424283981323242, "learning_rate": 4.407067752391494e-05, "loss": 0.399, "step": 4500 }, { "epoch": 0.3952881650723377, "grad_norm": 2.923548936843872, "learning_rate": 4.3411863915461035e-05, "loss": 0.3984, "step": 5000 }, { "epoch": 0.4348169815795715, "grad_norm": 2.6173856258392334, "learning_rate": 4.2753050307007145e-05, "loss": 0.3896, "step": 5500 }, { "epoch": 0.47434579808680527, "grad_norm": 2.0746352672576904, "learning_rate": 4.209423669855325e-05, "loss": 0.3809, "step": 6000 }, { "epoch": 0.513874614594039, "grad_norm": 1.683587908744812, "learning_rate": 4.143542309009935e-05, "loss": 0.3805, "step": 6500 }, { "epoch": 0.5534034311012729, "grad_norm": 3.254350185394287, "learning_rate": 4.0776609481645455e-05, "loss": 0.3761, "step": 7000 }, { "epoch": 0.5929322476085066, "grad_norm": 2.623117685317993, "learning_rate": 4.011779587319156e-05, "loss": 0.3806, "step": 7500 }, { "epoch": 0.6324610641157403, "grad_norm": 2.246891975402832, "learning_rate": 3.945898226473767e-05, "loss": 0.3609, "step": 8000 }, { "epoch": 0.6719898806229742, "grad_norm": 3.263880729675293, "learning_rate": 3.8800168656283764e-05, "loss": 0.3717, "step": 8500 }, { "epoch": 0.7115186971302079, "grad_norm": 2.449371337890625, "learning_rate": 3.814135504782987e-05, "loss": 0.3693, "step": 9000 }, { "epoch": 0.7510475136374417, "grad_norm": 2.3527493476867676, "learning_rate": 3.748254143937597e-05, "loss": 0.3623, "step": 9500 }, { "epoch": 0.7905763301446754, "grad_norm": 2.8773348331451416, "learning_rate": 3.682372783092208e-05, "loss": 0.3615, "step": 10000 }, { "epoch": 0.8301051466519093, "grad_norm": 2.7280490398406982, "learning_rate": 3.6164914222468184e-05, "loss": 0.3598, "step": 10500 }, { "epoch": 0.869633963159143, "grad_norm": 2.2904133796691895, "learning_rate": 3.550610061401429e-05, "loss": 0.3539, "step": 11000 }, { "epoch": 0.9091627796663768, "grad_norm": 2.197935104370117, "learning_rate": 3.4847287005560384e-05, "loss": 0.3563, "step": 11500 }, { "epoch": 0.9486915961736105, "grad_norm": 2.6248714923858643, "learning_rate": 3.4188473397106494e-05, "loss": 0.353, 
"step": 12000 }, { "epoch": 0.9882204126808444, "grad_norm": 2.276596784591675, "learning_rate": 3.35296597886526e-05, "loss": 0.3506, "step": 12500 }, { "epoch": 1.0, "eval_loss": 0.35121020674705505, "eval_mse": 0.35121019676996473, "eval_runtime": 148.1236, "eval_samples_per_second": 675.112, "eval_steps_per_second": 84.389, "step": 12649 }, { "epoch": 1.027749229188078, "grad_norm": 2.749833822250366, "learning_rate": 3.28708461801987e-05, "loss": 0.3211, "step": 13000 }, { "epoch": 1.067278045695312, "grad_norm": 2.4262359142303467, "learning_rate": 3.2212032571744803e-05, "loss": 0.303, "step": 13500 }, { "epoch": 1.1068068622025455, "grad_norm": 2.7832248210906982, "learning_rate": 3.155321896329091e-05, "loss": 0.3047, "step": 14000 }, { "epoch": 1.1463356787097794, "grad_norm": 2.4965128898620605, "learning_rate": 3.089440535483701e-05, "loss": 0.2994, "step": 14500 }, { "epoch": 1.1858644952170132, "grad_norm": 2.200387716293335, "learning_rate": 3.0235591746383113e-05, "loss": 0.3, "step": 15000 }, { "epoch": 1.225393311724247, "grad_norm": 2.688816547393799, "learning_rate": 2.9576778137929216e-05, "loss": 0.2993, "step": 15500 }, { "epoch": 1.2649221282314809, "grad_norm": 3.349059820175171, "learning_rate": 2.8917964529475323e-05, "loss": 0.297, "step": 16000 }, { "epoch": 1.3044509447387145, "grad_norm": 2.224276542663574, "learning_rate": 2.8259150921021426e-05, "loss": 0.2986, "step": 16500 }, { "epoch": 1.3439797612459483, "grad_norm": 2.157297134399414, "learning_rate": 2.7600337312567533e-05, "loss": 0.2978, "step": 17000 }, { "epoch": 1.3835085777531821, "grad_norm": 2.9185595512390137, "learning_rate": 2.6941523704113636e-05, "loss": 0.2964, "step": 17500 }, { "epoch": 1.4230373942604158, "grad_norm": 2.66011381149292, "learning_rate": 2.6282710095659736e-05, "loss": 0.2931, "step": 18000 }, { "epoch": 1.4625662107676496, "grad_norm": 3.047621011734009, "learning_rate": 2.562389648720584e-05, "loss": 0.2941, "step": 18500 }, { "epoch": 1.5020950272748834, "grad_norm": 2.8634958267211914, "learning_rate": 2.4965082878751946e-05, "loss": 0.2933, "step": 19000 }, { "epoch": 1.541623843782117, "grad_norm": 2.827955722808838, "learning_rate": 2.430626927029805e-05, "loss": 0.2909, "step": 19500 }, { "epoch": 1.581152660289351, "grad_norm": 2.969160556793213, "learning_rate": 2.3647455661844152e-05, "loss": 0.2892, "step": 20000 }, { "epoch": 1.6206814767965847, "grad_norm": 2.9614064693450928, "learning_rate": 2.2988642053390256e-05, "loss": 0.2897, "step": 20500 }, { "epoch": 1.6602102933038185, "grad_norm": 2.6613972187042236, "learning_rate": 2.232982844493636e-05, "loss": 0.2886, "step": 21000 }, { "epoch": 1.6997391098110524, "grad_norm": 2.762899398803711, "learning_rate": 2.1671014836482465e-05, "loss": 0.2872, "step": 21500 }, { "epoch": 1.739267926318286, "grad_norm": 3.2839698791503906, "learning_rate": 2.1012201228028565e-05, "loss": 0.2895, "step": 22000 }, { "epoch": 1.7787967428255198, "grad_norm": 2.6103265285491943, "learning_rate": 2.0353387619574672e-05, "loss": 0.2872, "step": 22500 }, { "epoch": 1.8183255593327536, "grad_norm": 3.014410972595215, "learning_rate": 1.9694574011120775e-05, "loss": 0.2902, "step": 23000 }, { "epoch": 1.8578543758399872, "grad_norm": 3.351289987564087, "learning_rate": 1.903576040266688e-05, "loss": 0.2876, "step": 23500 }, { "epoch": 1.897383192347221, "grad_norm": 3.1844663619995117, "learning_rate": 1.837694679421298e-05, "loss": 0.2855, "step": 24000 }, { "epoch": 1.936912008854455, "grad_norm": 2.6804494857788086, 
"learning_rate": 1.7718133185759085e-05, "loss": 0.2859, "step": 24500 }, { "epoch": 1.9764408253616885, "grad_norm": 2.4572935104370117, "learning_rate": 1.7059319577305188e-05, "loss": 0.28, "step": 25000 }, { "epoch": 2.0, "eval_loss": 0.3186613619327545, "eval_mse": 0.31866135950110924, "eval_runtime": 145.5886, "eval_samples_per_second": 686.867, "eval_steps_per_second": 85.858, "step": 25298 }, { "epoch": 2.0159696418689226, "grad_norm": 2.0255794525146484, "learning_rate": 1.640050596885129e-05, "loss": 0.2658, "step": 25500 }, { "epoch": 2.055498458376156, "grad_norm": 2.699540853500366, "learning_rate": 1.5741692360397398e-05, "loss": 0.2477, "step": 26000 }, { "epoch": 2.09502727488339, "grad_norm": 3.3947625160217285, "learning_rate": 1.5082878751943503e-05, "loss": 0.2466, "step": 26500 }, { "epoch": 2.134556091390624, "grad_norm": 2.3115530014038086, "learning_rate": 1.4424065143489604e-05, "loss": 0.2465, "step": 27000 }, { "epoch": 2.1740849078978575, "grad_norm": 2.9776363372802734, "learning_rate": 1.376525153503571e-05, "loss": 0.2446, "step": 27500 }, { "epoch": 2.213613724405091, "grad_norm": 1.9914076328277588, "learning_rate": 1.3106437926581813e-05, "loss": 0.2417, "step": 28000 }, { "epoch": 2.253142540912325, "grad_norm": 2.153264284133911, "learning_rate": 1.2447624318127916e-05, "loss": 0.2436, "step": 28500 }, { "epoch": 2.2926713574195587, "grad_norm": 2.458815574645996, "learning_rate": 1.1788810709674019e-05, "loss": 0.2446, "step": 29000 }, { "epoch": 2.332200173926793, "grad_norm": 2.1611809730529785, "learning_rate": 1.1129997101220122e-05, "loss": 0.2408, "step": 29500 }, { "epoch": 2.3717289904340264, "grad_norm": 3.270725727081299, "learning_rate": 1.0471183492766227e-05, "loss": 0.2446, "step": 30000 }, { "epoch": 2.41125780694126, "grad_norm": 2.7944698333740234, "learning_rate": 9.81236988431233e-06, "loss": 0.242, "step": 30500 }, { "epoch": 2.450786623448494, "grad_norm": 2.138493299484253, "learning_rate": 9.153556275858434e-06, "loss": 0.2428, "step": 31000 }, { "epoch": 2.4903154399557277, "grad_norm": 2.3900632858276367, "learning_rate": 8.494742667404539e-06, "loss": 0.2435, "step": 31500 }, { "epoch": 2.5298442564629617, "grad_norm": 3.044337749481201, "learning_rate": 7.835929058950642e-06, "loss": 0.2411, "step": 32000 }, { "epoch": 2.5693730729701953, "grad_norm": 2.801241159439087, "learning_rate": 7.177115450496746e-06, "loss": 0.2437, "step": 32500 }, { "epoch": 2.608901889477429, "grad_norm": 2.554879903793335, "learning_rate": 6.518301842042849e-06, "loss": 0.2436, "step": 33000 }, { "epoch": 2.648430705984663, "grad_norm": 2.7090861797332764, "learning_rate": 5.859488233588953e-06, "loss": 0.2403, "step": 33500 }, { "epoch": 2.6879595224918966, "grad_norm": 2.0840466022491455, "learning_rate": 5.200674625135057e-06, "loss": 0.2396, "step": 34000 }, { "epoch": 2.7274883389991302, "grad_norm": 2.8196446895599365, "learning_rate": 4.541861016681161e-06, "loss": 0.2372, "step": 34500 }, { "epoch": 2.7670171555063643, "grad_norm": 2.4318623542785645, "learning_rate": 3.883047408227265e-06, "loss": 0.2399, "step": 35000 }, { "epoch": 2.806545972013598, "grad_norm": 3.474367380142212, "learning_rate": 3.2242337997733685e-06, "loss": 0.2368, "step": 35500 }, { "epoch": 2.8460747885208315, "grad_norm": 2.736551523208618, "learning_rate": 2.565420191319472e-06, "loss": 0.2367, "step": 36000 }, { "epoch": 2.8856036050280656, "grad_norm": 1.705733060836792, "learning_rate": 1.9066065828655756e-06, "loss": 0.2397, "step": 36500 }, { "epoch": 
2.925132421535299, "grad_norm": 2.315380811691284, "learning_rate": 1.2477929744116795e-06, "loss": 0.2388, "step": 37000 }, { "epoch": 2.964661238042533, "grad_norm": 2.4691686630249023, "learning_rate": 5.889793659577832e-07, "loss": 0.2398, "step": 37500 }, { "epoch": 3.0, "eval_loss": 0.31647276878356934, "eval_mse": 0.3164727601450849, "eval_runtime": 132.665, "eval_samples_per_second": 753.778, "eval_steps_per_second": 94.222, "step": 37947 }, { "epoch": 3.0, "step": 37947, "total_flos": 3.999532597714253e+16, "train_loss": 0.31243033600999526, "train_runtime": 5395.7257, "train_samples_per_second": 450.097, "train_steps_per_second": 7.033 } ], "logging_steps": 500, "max_steps": 37947, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.999532597714253e+16, "train_batch_size": 64, "trial_name": null, "trial_params": null }