|
{ |
|
"best_metric": 0.0490318201482296, |
|
"best_model_checkpoint": "/home1/datahome/villien/project_hub/DinoVdeau/models/Kamoulox-large-2024_10_31-batch-size64_freeze_monolabel/checkpoint-387504", |
|
"epoch": 150.0, |
|
"eval_steps": 500, |
|
"global_step": 403650, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.18580453363062058, |
|
"grad_norm": 2.81394624710083, |
|
"learning_rate": 0.001, |
|
"loss": 1.1787, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.37160906726124115, |
|
"grad_norm": 2.4735236167907715, |
|
"learning_rate": 0.001, |
|
"loss": 0.98, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.5574136008918618, |
|
"grad_norm": 1.8667224645614624, |
|
"learning_rate": 0.001, |
|
"loss": 0.9449, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.7432181345224823, |
|
"grad_norm": 1.7729698419570923, |
|
"learning_rate": 0.001, |
|
"loss": 0.9107, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.929022668153103, |
|
"grad_norm": 1.785703182220459, |
|
"learning_rate": 0.001, |
|
"loss": 0.9079, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.7326527412414418, |
|
"eval_f1_macro": 0.2723318326456157, |
|
"eval_f1_micro": 0.7326527412414418, |
|
"eval_loss": 0.8028618097305298, |
|
"eval_runtime": 507.3076, |
|
"eval_samples_per_second": 113.148, |
|
"eval_steps_per_second": 1.768, |
|
"learning_rate": 0.001, |
|
"step": 2691 |
|
}, |
|
{ |
|
"epoch": 1.1148272017837235, |
|
"grad_norm": 1.4361521005630493, |
|
"learning_rate": 0.001, |
|
"loss": 0.9035, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.3006317354143442, |
|
"grad_norm": 1.2895761728286743, |
|
"learning_rate": 0.001, |
|
"loss": 0.8945, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.4864362690449646, |
|
"grad_norm": 1.5815945863723755, |
|
"learning_rate": 0.001, |
|
"loss": 0.8879, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.6722408026755853, |
|
"grad_norm": 1.3825953006744385, |
|
"learning_rate": 0.001, |
|
"loss": 0.8901, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.8580453363062057, |
|
"grad_norm": 1.0456748008728027, |
|
"learning_rate": 0.001, |
|
"loss": 0.8824, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.7288723192975732, |
|
"eval_f1_macro": 0.290650504066453, |
|
"eval_f1_micro": 0.7288723192975732, |
|
"eval_loss": 0.8038854002952576, |
|
"eval_runtime": 507.5267, |
|
"eval_samples_per_second": 113.099, |
|
"eval_steps_per_second": 1.767, |
|
"learning_rate": 0.001, |
|
"step": 5382 |
|
}, |
|
{ |
|
"epoch": 2.0438498699368264, |
|
"grad_norm": 1.1838769912719727, |
|
"learning_rate": 0.001, |
|
"loss": 0.8883, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.229654403567447, |
|
"grad_norm": 1.2504063844680786, |
|
"learning_rate": 0.001, |
|
"loss": 0.8713, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.4154589371980677, |
|
"grad_norm": 1.2367465496063232, |
|
"learning_rate": 0.001, |
|
"loss": 0.8738, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 2.6012634708286884, |
|
"grad_norm": 1.2117840051651, |
|
"learning_rate": 0.001, |
|
"loss": 0.8664, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 2.787068004459309, |
|
"grad_norm": 0.9993311762809753, |
|
"learning_rate": 0.001, |
|
"loss": 0.8667, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 2.9728725380899292, |
|
"grad_norm": 1.0402483940124512, |
|
"learning_rate": 0.001, |
|
"loss": 0.8655, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.7408581732025574, |
|
"eval_f1_macro": 0.326985383349658, |
|
"eval_f1_micro": 0.7408581732025574, |
|
"eval_loss": 0.7705450654029846, |
|
"eval_runtime": 518.2697, |
|
"eval_samples_per_second": 110.755, |
|
"eval_steps_per_second": 1.731, |
|
"learning_rate": 0.001, |
|
"step": 8073 |
|
}, |
|
{ |
|
"epoch": 3.15867707172055, |
|
"grad_norm": 1.2037022113800049, |
|
"learning_rate": 0.001, |
|
"loss": 0.8485, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 3.3444816053511706, |
|
"grad_norm": 1.2167097330093384, |
|
"learning_rate": 0.001, |
|
"loss": 0.8707, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 3.530286138981791, |
|
"grad_norm": 1.150800108909607, |
|
"learning_rate": 0.001, |
|
"loss": 0.8543, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 3.716090672612412, |
|
"grad_norm": 0.8481634855270386, |
|
"learning_rate": 0.001, |
|
"loss": 0.848, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 3.901895206243032, |
|
"grad_norm": 0.9482496380805969, |
|
"learning_rate": 0.001, |
|
"loss": 0.8514, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.7417118168673019, |
|
"eval_f1_macro": 0.310238611675961, |
|
"eval_f1_micro": 0.7417118168673019, |
|
"eval_loss": 0.7623223066329956, |
|
"eval_runtime": 526.8057, |
|
"eval_samples_per_second": 108.96, |
|
"eval_steps_per_second": 1.703, |
|
"learning_rate": 0.001, |
|
"step": 10764 |
|
}, |
|
{ |
|
"epoch": 4.087699739873653, |
|
"grad_norm": 0.9820572137832642, |
|
"learning_rate": 0.001, |
|
"loss": 0.8542, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 4.273504273504273, |
|
"grad_norm": 1.2227846384048462, |
|
"learning_rate": 0.001, |
|
"loss": 0.8536, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 4.459308807134894, |
|
"grad_norm": 0.9903749823570251, |
|
"learning_rate": 0.001, |
|
"loss": 0.8473, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 4.645113340765515, |
|
"grad_norm": 0.802663266658783, |
|
"learning_rate": 0.001, |
|
"loss": 0.8475, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 4.830917874396135, |
|
"grad_norm": 0.8765626549720764, |
|
"learning_rate": 0.001, |
|
"loss": 0.844, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.7383495061061653, |
|
"eval_f1_macro": 0.3107947240093857, |
|
"eval_f1_micro": 0.7383495061061653, |
|
"eval_loss": 0.7626621127128601, |
|
"eval_runtime": 531.6556, |
|
"eval_samples_per_second": 107.967, |
|
"eval_steps_per_second": 1.687, |
|
"learning_rate": 0.001, |
|
"step": 13455 |
|
}, |
|
{ |
|
"epoch": 5.016722408026756, |
|
"grad_norm": 0.846638560295105, |
|
"learning_rate": 0.001, |
|
"loss": 0.8519, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 5.202526941657377, |
|
"grad_norm": 0.9798027276992798, |
|
"learning_rate": 0.001, |
|
"loss": 0.8477, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 5.388331475287997, |
|
"grad_norm": 1.171106219291687, |
|
"learning_rate": 0.001, |
|
"loss": 0.8377, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 5.574136008918618, |
|
"grad_norm": 5.736990451812744, |
|
"learning_rate": 0.001, |
|
"loss": 0.8358, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 5.759940542549238, |
|
"grad_norm": 0.9148433208465576, |
|
"learning_rate": 0.001, |
|
"loss": 0.8361, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 5.9457450761798585, |
|
"grad_norm": 0.9974657297134399, |
|
"learning_rate": 0.001, |
|
"loss": 0.8437, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.7447257016428285, |
|
"eval_f1_macro": 0.34331624846537584, |
|
"eval_f1_micro": 0.7447257016428285, |
|
"eval_loss": 0.7451828122138977, |
|
"eval_runtime": 532.2923, |
|
"eval_samples_per_second": 107.837, |
|
"eval_steps_per_second": 1.685, |
|
"learning_rate": 0.001, |
|
"step": 16146 |
|
}, |
|
{ |
|
"epoch": 6.131549609810479, |
|
"grad_norm": 0.7431060671806335, |
|
"learning_rate": 0.001, |
|
"loss": 0.8485, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 6.3173541434411, |
|
"grad_norm": 0.899385392665863, |
|
"learning_rate": 0.001, |
|
"loss": 0.8323, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 6.5031586770717205, |
|
"grad_norm": 0.8070719838142395, |
|
"learning_rate": 0.001, |
|
"loss": 0.8388, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 6.688963210702341, |
|
"grad_norm": 0.7958774566650391, |
|
"learning_rate": 0.001, |
|
"loss": 0.8428, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 6.874767744332962, |
|
"grad_norm": 0.9422007203102112, |
|
"learning_rate": 0.001, |
|
"loss": 0.8289, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.7467291510600861, |
|
"eval_f1_macro": 0.3283260141196405, |
|
"eval_f1_micro": 0.7467291510600861, |
|
"eval_loss": 0.7458378672599792, |
|
"eval_runtime": 529.9283, |
|
"eval_samples_per_second": 108.318, |
|
"eval_steps_per_second": 1.693, |
|
"learning_rate": 0.001, |
|
"step": 18837 |
|
}, |
|
{ |
|
"epoch": 7.060572277963582, |
|
"grad_norm": 0.8445965051651001, |
|
"learning_rate": 0.001, |
|
"loss": 0.8383, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 7.246376811594203, |
|
"grad_norm": 0.9642378687858582, |
|
"learning_rate": 0.001, |
|
"loss": 0.8319, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 7.432181345224824, |
|
"grad_norm": 0.9426402449607849, |
|
"learning_rate": 0.001, |
|
"loss": 0.8365, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 7.617985878855444, |
|
"grad_norm": 1.0661524534225464, |
|
"learning_rate": 0.001, |
|
"loss": 0.8348, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 7.803790412486064, |
|
"grad_norm": 0.7548210620880127, |
|
"learning_rate": 0.001, |
|
"loss": 0.8436, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 7.989594946116685, |
|
"grad_norm": 1.0589923858642578, |
|
"learning_rate": 0.001, |
|
"loss": 0.8402, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.7458929286946221, |
|
"eval_f1_macro": 0.3352756381207012, |
|
"eval_f1_micro": 0.7458929286946221, |
|
"eval_loss": 0.7398682832717896, |
|
"eval_runtime": 542.5535, |
|
"eval_samples_per_second": 105.798, |
|
"eval_steps_per_second": 1.653, |
|
"learning_rate": 0.001, |
|
"step": 21528 |
|
}, |
|
{ |
|
"epoch": 8.175399479747306, |
|
"grad_norm": 0.9055599570274353, |
|
"learning_rate": 0.001, |
|
"loss": 0.8301, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 8.361204013377927, |
|
"grad_norm": 0.6724838614463806, |
|
"learning_rate": 0.001, |
|
"loss": 0.8362, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 8.547008547008547, |
|
"grad_norm": 0.9552667140960693, |
|
"learning_rate": 0.001, |
|
"loss": 0.837, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 8.732813080639168, |
|
"grad_norm": 0.7529712319374084, |
|
"learning_rate": 0.001, |
|
"loss": 0.8335, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 8.918617614269788, |
|
"grad_norm": 0.9149639010429382, |
|
"learning_rate": 0.001, |
|
"loss": 0.8274, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.7455270814097316, |
|
"eval_f1_macro": 0.329402464136349, |
|
"eval_f1_micro": 0.7455270814097316, |
|
"eval_loss": 0.7424591779708862, |
|
"eval_runtime": 539.3684, |
|
"eval_samples_per_second": 106.423, |
|
"eval_steps_per_second": 1.663, |
|
"learning_rate": 0.001, |
|
"step": 24219 |
|
}, |
|
{ |
|
"epoch": 9.104422147900408, |
|
"grad_norm": 0.9954981207847595, |
|
"learning_rate": 0.001, |
|
"loss": 0.8204, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 9.29022668153103, |
|
"grad_norm": 0.8292973041534424, |
|
"learning_rate": 0.001, |
|
"loss": 0.8373, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 9.47603121516165, |
|
"grad_norm": 0.6125388741493225, |
|
"learning_rate": 0.001, |
|
"loss": 0.834, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 9.66183574879227, |
|
"grad_norm": 0.7945879697799683, |
|
"learning_rate": 0.001, |
|
"loss": 0.8523, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 9.84764028242289, |
|
"grad_norm": 0.8813033699989319, |
|
"learning_rate": 0.001, |
|
"loss": 0.8289, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.7474782669291475, |
|
"eval_f1_macro": 0.31573051576045374, |
|
"eval_f1_micro": 0.7474782669291475, |
|
"eval_loss": 0.7364382147789001, |
|
"eval_runtime": 565.5402, |
|
"eval_samples_per_second": 101.498, |
|
"eval_steps_per_second": 1.586, |
|
"learning_rate": 0.001, |
|
"step": 26910 |
|
}, |
|
{ |
|
"epoch": 10.033444816053512, |
|
"grad_norm": 0.7872824668884277, |
|
"learning_rate": 0.001, |
|
"loss": 0.8307, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 10.219249349684132, |
|
"grad_norm": 0.9812045097351074, |
|
"learning_rate": 0.001, |
|
"loss": 0.8275, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 10.405053883314753, |
|
"grad_norm": 0.9366200566291809, |
|
"learning_rate": 0.001, |
|
"loss": 0.8271, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 10.590858416945373, |
|
"grad_norm": 0.8672580122947693, |
|
"learning_rate": 0.001, |
|
"loss": 0.8314, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 10.776662950575995, |
|
"grad_norm": 0.8797865509986877, |
|
"learning_rate": 0.001, |
|
"loss": 0.845, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 10.962467484206615, |
|
"grad_norm": 0.708544135093689, |
|
"learning_rate": 0.001, |
|
"loss": 0.8368, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.7465375167680005, |
|
"eval_f1_macro": 0.34419509138343996, |
|
"eval_f1_micro": 0.7465375167680005, |
|
"eval_loss": 0.7368418574333191, |
|
"eval_runtime": 540.8249, |
|
"eval_samples_per_second": 106.136, |
|
"eval_steps_per_second": 1.659, |
|
"learning_rate": 0.001, |
|
"step": 29601 |
|
}, |
|
{ |
|
"epoch": 11.148272017837236, |
|
"grad_norm": 0.7683461904525757, |
|
"learning_rate": 0.001, |
|
"loss": 0.8328, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 11.334076551467856, |
|
"grad_norm": 0.8067987561225891, |
|
"learning_rate": 0.001, |
|
"loss": 0.8367, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 11.519881085098476, |
|
"grad_norm": 0.8982560038566589, |
|
"learning_rate": 0.001, |
|
"loss": 0.8333, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 11.705685618729097, |
|
"grad_norm": 0.8005286455154419, |
|
"learning_rate": 0.001, |
|
"loss": 0.8264, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 11.891490152359717, |
|
"grad_norm": 0.848416805267334, |
|
"learning_rate": 0.001, |
|
"loss": 0.8329, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.7428093587219735, |
|
"eval_f1_macro": 0.3321199292959056, |
|
"eval_f1_micro": 0.7428093587219735, |
|
"eval_loss": 0.7442134022712708, |
|
"eval_runtime": 602.0094, |
|
"eval_samples_per_second": 95.349, |
|
"eval_steps_per_second": 1.49, |
|
"learning_rate": 0.001, |
|
"step": 32292 |
|
}, |
|
{ |
|
"epoch": 12.077294685990339, |
|
"grad_norm": 0.9192249178886414, |
|
"learning_rate": 0.001, |
|
"loss": 0.8242, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 12.263099219620958, |
|
"grad_norm": 0.9414801597595215, |
|
"learning_rate": 0.001, |
|
"loss": 0.8211, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 12.44890375325158, |
|
"grad_norm": 0.8428712487220764, |
|
"learning_rate": 0.001, |
|
"loss": 0.8252, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 12.6347082868822, |
|
"grad_norm": 0.8568095564842224, |
|
"learning_rate": 0.001, |
|
"loss": 0.834, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 12.820512820512821, |
|
"grad_norm": 0.552550196647644, |
|
"learning_rate": 0.001, |
|
"loss": 0.8359, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.7479312207104406, |
|
"eval_f1_macro": 0.35283143267744377, |
|
"eval_f1_micro": 0.7479312207104406, |
|
"eval_loss": 0.7384127378463745, |
|
"eval_runtime": 536.6777, |
|
"eval_samples_per_second": 106.956, |
|
"eval_steps_per_second": 1.671, |
|
"learning_rate": 0.001, |
|
"step": 34983 |
|
}, |
|
{ |
|
"epoch": 13.006317354143441, |
|
"grad_norm": 0.790766179561615, |
|
"learning_rate": 0.001, |
|
"loss": 0.8317, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 13.192121887774062, |
|
"grad_norm": 0.6483836770057678, |
|
"learning_rate": 0.001, |
|
"loss": 0.8142, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 13.377926421404682, |
|
"grad_norm": 0.7646375298500061, |
|
"learning_rate": 0.001, |
|
"loss": 0.8336, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 13.563730955035304, |
|
"grad_norm": 0.8401319980621338, |
|
"learning_rate": 0.001, |
|
"loss": 0.8318, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 13.749535488665924, |
|
"grad_norm": 0.8368563055992126, |
|
"learning_rate": 0.001, |
|
"loss": 0.8305, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 13.935340022296543, |
|
"grad_norm": 1.0513092279434204, |
|
"learning_rate": 0.001, |
|
"loss": 0.8388, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.7463633037751956, |
|
"eval_f1_macro": 0.33455884660313173, |
|
"eval_f1_micro": 0.7463633037751956, |
|
"eval_loss": 0.7464041113853455, |
|
"eval_runtime": 519.5938, |
|
"eval_samples_per_second": 110.473, |
|
"eval_steps_per_second": 1.726, |
|
"learning_rate": 0.001, |
|
"step": 37674 |
|
}, |
|
{ |
|
"epoch": 14.121144555927165, |
|
"grad_norm": 0.9093230962753296, |
|
"learning_rate": 0.001, |
|
"loss": 0.8277, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 14.306949089557785, |
|
"grad_norm": 0.7018683552742004, |
|
"learning_rate": 0.001, |
|
"loss": 0.8154, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 14.492753623188406, |
|
"grad_norm": 1.060300588607788, |
|
"learning_rate": 0.001, |
|
"loss": 0.8342, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 14.678558156819026, |
|
"grad_norm": 0.9249637126922607, |
|
"learning_rate": 0.001, |
|
"loss": 0.8382, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 14.864362690449648, |
|
"grad_norm": 0.7552688717842102, |
|
"learning_rate": 0.001, |
|
"loss": 0.8306, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.7446734377449871, |
|
"eval_f1_macro": 0.34277495445998946, |
|
"eval_f1_micro": 0.7446734377449871, |
|
"eval_loss": 0.7394037842750549, |
|
"eval_runtime": 526.6425, |
|
"eval_samples_per_second": 108.994, |
|
"eval_steps_per_second": 1.703, |
|
"learning_rate": 0.001, |
|
"step": 40365 |
|
}, |
|
{ |
|
"epoch": 15.050167224080267, |
|
"grad_norm": 0.7855104207992554, |
|
"learning_rate": 0.001, |
|
"loss": 0.8399, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 15.235971757710889, |
|
"grad_norm": 0.709564745426178, |
|
"learning_rate": 0.001, |
|
"loss": 0.8384, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 15.421776291341509, |
|
"grad_norm": 0.7003925442695618, |
|
"learning_rate": 0.001, |
|
"loss": 0.8236, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 15.607580824972128, |
|
"grad_norm": 0.7792437672615051, |
|
"learning_rate": 0.001, |
|
"loss": 0.8296, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 15.79338535860275, |
|
"grad_norm": 0.8424106240272522, |
|
"learning_rate": 0.001, |
|
"loss": 0.8329, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 15.97918989223337, |
|
"grad_norm": 0.8874714374542236, |
|
"learning_rate": 0.001, |
|
"loss": 0.8304, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.7478789568125991, |
|
"eval_f1_macro": 0.3506456767847629, |
|
"eval_f1_micro": 0.7478789568125991, |
|
"eval_loss": 0.7397111058235168, |
|
"eval_runtime": 533.4385, |
|
"eval_samples_per_second": 107.606, |
|
"eval_steps_per_second": 1.682, |
|
"learning_rate": 0.001, |
|
"step": 43056 |
|
}, |
|
{ |
|
"epoch": 16.16499442586399, |
|
"grad_norm": 0.7367419600486755, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8145, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 16.35079895949461, |
|
"grad_norm": 0.9213405251502991, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7917, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 16.53660349312523, |
|
"grad_norm": 0.7902110815048218, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7909, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 16.722408026755854, |
|
"grad_norm": 0.7533911466598511, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8054, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 16.908212560386474, |
|
"grad_norm": 0.6558223962783813, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7886, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.7554398007003362, |
|
"eval_f1_macro": 0.3747287292827094, |
|
"eval_f1_micro": 0.7554398007003362, |
|
"eval_loss": 0.7110718488693237, |
|
"eval_runtime": 519.8124, |
|
"eval_samples_per_second": 110.426, |
|
"eval_steps_per_second": 1.726, |
|
"learning_rate": 0.0001, |
|
"step": 45747 |
|
}, |
|
{ |
|
"epoch": 17.094017094017094, |
|
"grad_norm": 0.8171585202217102, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7962, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 17.279821627647713, |
|
"grad_norm": 0.9112296104431152, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7774, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 17.465626161278337, |
|
"grad_norm": 0.7937678694725037, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7802, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 17.651430694908957, |
|
"grad_norm": 0.8527361154556274, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7867, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 17.837235228539576, |
|
"grad_norm": 0.8812717199325562, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7815, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.7567463981463738, |
|
"eval_f1_macro": 0.3792648315920964, |
|
"eval_f1_micro": 0.7567463981463738, |
|
"eval_loss": 0.7041681408882141, |
|
"eval_runtime": 506.1969, |
|
"eval_samples_per_second": 113.397, |
|
"eval_steps_per_second": 1.772, |
|
"learning_rate": 0.0001, |
|
"step": 48438 |
|
}, |
|
{ |
|
"epoch": 18.023039762170196, |
|
"grad_norm": 0.9349254965782166, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7883, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 18.208844295800816, |
|
"grad_norm": 0.8031138777732849, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7735, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 18.39464882943144, |
|
"grad_norm": 0.8354169130325317, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7861, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 18.58045336306206, |
|
"grad_norm": 0.9157727956771851, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7743, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 18.76625789669268, |
|
"grad_norm": 0.8417840600013733, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7785, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 18.9520624303233, |
|
"grad_norm": 0.8945490121841431, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7682, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.7582446298844968, |
|
"eval_f1_macro": 0.38576725361345504, |
|
"eval_f1_micro": 0.7582446298844968, |
|
"eval_loss": 0.7004917860031128, |
|
"eval_runtime": 517.5109, |
|
"eval_samples_per_second": 110.917, |
|
"eval_steps_per_second": 1.733, |
|
"learning_rate": 0.0001, |
|
"step": 51129 |
|
}, |
|
{ |
|
"epoch": 19.137866963953922, |
|
"grad_norm": 1.05229914188385, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7689, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 19.32367149758454, |
|
"grad_norm": 0.6953094601631165, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7721, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 19.50947603121516, |
|
"grad_norm": 0.7234140634536743, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7757, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 19.69528056484578, |
|
"grad_norm": 0.8757944703102112, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7688, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 19.881085098476404, |
|
"grad_norm": 1.1684017181396484, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7788, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.7602829219003153, |
|
"eval_f1_macro": 0.39339544323315423, |
|
"eval_f1_micro": 0.7602829219003153, |
|
"eval_loss": 0.6942671537399292, |
|
"eval_runtime": 534.9756, |
|
"eval_samples_per_second": 107.296, |
|
"eval_steps_per_second": 1.677, |
|
"learning_rate": 0.0001, |
|
"step": 53820 |
|
}, |
|
{ |
|
"epoch": 20.066889632107024, |
|
"grad_norm": 1.1924428939819336, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7598, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 20.252694165737644, |
|
"grad_norm": 0.6945735812187195, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7695, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 20.438498699368264, |
|
"grad_norm": 0.8915848135948181, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7645, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 20.624303232998884, |
|
"grad_norm": 0.9735463857650757, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7747, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 20.810107766629507, |
|
"grad_norm": 1.0891400575637817, |
|
"learning_rate": 0.0001, |
|
"loss": 0.774, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 20.995912300260127, |
|
"grad_norm": 0.9328783750534058, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7735, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_accuracy": 0.7592899078413268, |
|
"eval_f1_macro": 0.3942710537489151, |
|
"eval_f1_micro": 0.7592899078413268, |
|
"eval_loss": 0.6919424533843994, |
|
"eval_runtime": 511.1285, |
|
"eval_samples_per_second": 112.302, |
|
"eval_steps_per_second": 1.755, |
|
"learning_rate": 0.0001, |
|
"step": 56511 |
|
}, |
|
{ |
|
"epoch": 21.181716833890746, |
|
"grad_norm": 0.7840440273284912, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7777, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 21.367521367521366, |
|
"grad_norm": 0.9738557934761047, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7598, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 21.55332590115199, |
|
"grad_norm": 0.8154065608978271, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7661, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 21.73913043478261, |
|
"grad_norm": 1.0773000717163086, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7691, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 21.92493496841323, |
|
"grad_norm": 1.0894912481307983, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7602, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_accuracy": 0.7606313478859253, |
|
"eval_f1_macro": 0.3925331634038099, |
|
"eval_f1_micro": 0.7606313478859253, |
|
"eval_loss": 0.6903713941574097, |
|
"eval_runtime": 507.2562, |
|
"eval_samples_per_second": 113.16, |
|
"eval_steps_per_second": 1.768, |
|
"learning_rate": 0.0001, |
|
"step": 59202 |
|
}, |
|
{ |
|
"epoch": 22.11073950204385, |
|
"grad_norm": 1.1766314506530762, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7665, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 22.296544035674472, |
|
"grad_norm": 1.028473973274231, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7638, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 22.482348569305092, |
|
"grad_norm": 0.900393545627594, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7633, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 22.66815310293571, |
|
"grad_norm": 0.9336509704589844, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7636, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 22.85395763656633, |
|
"grad_norm": 1.0151203870773315, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7572, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_accuracy": 0.7607010330830474, |
|
"eval_f1_macro": 0.39534246826429575, |
|
"eval_f1_micro": 0.7607010330830474, |
|
"eval_loss": 0.6874070167541504, |
|
"eval_runtime": 507.2598, |
|
"eval_samples_per_second": 113.159, |
|
"eval_steps_per_second": 1.768, |
|
"learning_rate": 0.0001, |
|
"step": 61893 |
|
}, |
|
{ |
|
"epoch": 23.03976217019695, |
|
"grad_norm": 1.0469074249267578, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7692, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 23.225566703827575, |
|
"grad_norm": 0.9608765840530396, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7497, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 23.411371237458194, |
|
"grad_norm": 1.2574784755706787, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7566, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 23.597175771088814, |
|
"grad_norm": 1.0878256559371948, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7658, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 23.782980304719434, |
|
"grad_norm": 1.1496306657791138, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7642, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 23.968784838350057, |
|
"grad_norm": 1.0485353469848633, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7593, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.7612236720614624, |
|
"eval_f1_macro": 0.3933203620961568, |
|
"eval_f1_micro": 0.7612236720614624, |
|
"eval_loss": 0.6864963173866272, |
|
"eval_runtime": 500.8584, |
|
"eval_samples_per_second": 114.605, |
|
"eval_steps_per_second": 1.791, |
|
"learning_rate": 0.0001, |
|
"step": 64584 |
|
}, |
|
{ |
|
"epoch": 24.154589371980677, |
|
"grad_norm": 1.0143301486968994, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7559, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 24.340393905611297, |
|
"grad_norm": 0.8291743993759155, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7643, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 24.526198439241917, |
|
"grad_norm": 1.1790424585342407, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7602, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 24.71200297287254, |
|
"grad_norm": 1.1937212944030762, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7636, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 24.89780750650316, |
|
"grad_norm": 1.230569839477539, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7548, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_accuracy": 0.7614153063535478, |
|
"eval_f1_macro": 0.402343965759826, |
|
"eval_f1_micro": 0.7614153063535478, |
|
"eval_loss": 0.684335470199585, |
|
"eval_runtime": 514.3947, |
|
"eval_samples_per_second": 111.589, |
|
"eval_steps_per_second": 1.744, |
|
"learning_rate": 0.0001, |
|
"step": 67275 |
|
}, |
|
{ |
|
"epoch": 25.08361204013378, |
|
"grad_norm": 1.0271650552749634, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7568, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 25.2694165737644, |
|
"grad_norm": 1.042904019355774, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7524, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 25.45522110739502, |
|
"grad_norm": 1.03508460521698, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7555, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 25.641025641025642, |
|
"grad_norm": 0.8760964870452881, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7605, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 25.826830174656262, |
|
"grad_norm": 1.2160849571228027, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7557, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_accuracy": 0.7629309593909513, |
|
"eval_f1_macro": 0.4055175650763901, |
|
"eval_f1_micro": 0.7629309593909513, |
|
"eval_loss": 0.6830293536186218, |
|
"eval_runtime": 568.9631, |
|
"eval_samples_per_second": 100.887, |
|
"eval_steps_per_second": 1.577, |
|
"learning_rate": 0.0001, |
|
"step": 69966 |
|
}, |
|
{ |
|
"epoch": 26.012634708286882, |
|
"grad_norm": 0.9436312317848206, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7523, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 26.1984392419175, |
|
"grad_norm": 1.0304313898086548, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7623, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 26.384243775548125, |
|
"grad_norm": 1.1352418661117554, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7476, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 26.570048309178745, |
|
"grad_norm": 0.825731098651886, |
|
"learning_rate": 0.0001, |
|
"loss": 0.752, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 26.755852842809364, |
|
"grad_norm": 0.9680258631706238, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7572, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 26.941657376439984, |
|
"grad_norm": 0.8693468570709229, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7534, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_accuracy": 0.7630703297851954, |
|
"eval_f1_macro": 0.40742696523740624, |
|
"eval_f1_micro": 0.7630703297851954, |
|
"eval_loss": 0.6827249526977539, |
|
"eval_runtime": 550.472, |
|
"eval_samples_per_second": 104.276, |
|
"eval_steps_per_second": 1.63, |
|
"learning_rate": 0.0001, |
|
"step": 72657 |
|
}, |
|
{ |
|
"epoch": 27.127461910070604, |
|
"grad_norm": 1.1599422693252563, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7587, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 27.313266443701227, |
|
"grad_norm": 1.182005763053894, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7495, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 27.499070977331847, |
|
"grad_norm": 0.935971736907959, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7522, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 27.684875510962467, |
|
"grad_norm": 1.0990965366363525, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7513, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 27.870680044593087, |
|
"grad_norm": 0.834299623966217, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7609, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_accuracy": 0.7629309593909513, |
|
"eval_f1_macro": 0.413578373717749, |
|
"eval_f1_micro": 0.7629309593909513, |
|
"eval_loss": 0.6805527210235596, |
|
"eval_runtime": 553.9776, |
|
"eval_samples_per_second": 103.616, |
|
"eval_steps_per_second": 1.619, |
|
"learning_rate": 0.0001, |
|
"step": 75348 |
|
}, |
|
{ |
|
"epoch": 28.05648457822371, |
|
"grad_norm": 1.1609550714492798, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7471, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 28.24228911185433, |
|
"grad_norm": 0.9144974946975708, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7524, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 28.42809364548495, |
|
"grad_norm": 1.08271324634552, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7461, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 28.61389817911557, |
|
"grad_norm": 1.0623596906661987, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7497, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 28.799702712746193, |
|
"grad_norm": 1.168769359588623, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7433, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 28.985507246376812, |
|
"grad_norm": 0.9733015298843384, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7537, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_accuracy": 0.7625825334053413, |
|
"eval_f1_macro": 0.4138447052049864, |
|
"eval_f1_micro": 0.7625825334053413, |
|
"eval_loss": 0.6796479225158691, |
|
"eval_runtime": 535.8607, |
|
"eval_samples_per_second": 107.119, |
|
"eval_steps_per_second": 1.674, |
|
"learning_rate": 0.0001, |
|
"step": 78039 |
|
}, |
|
{ |
|
"epoch": 29.171311780007432, |
|
"grad_norm": 1.3112975358963013, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7435, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 29.357116313638052, |
|
"grad_norm": 1.1170014142990112, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7524, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 29.54292084726867, |
|
"grad_norm": 0.9579864144325256, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7544, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 29.728725380899295, |
|
"grad_norm": 0.9765433073043823, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7498, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 29.914529914529915, |
|
"grad_norm": 1.1268893480300903, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7533, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_accuracy": 0.7635929687636104, |
|
"eval_f1_macro": 0.41283784117218203, |
|
"eval_f1_micro": 0.7635929687636104, |
|
"eval_loss": 0.6774595379829407, |
|
"eval_runtime": 535.3939, |
|
"eval_samples_per_second": 107.213, |
|
"eval_steps_per_second": 1.675, |
|
"learning_rate": 0.0001, |
|
"step": 80730 |
|
}, |
|
{ |
|
"epoch": 30.100334448160535, |
|
"grad_norm": 1.268797755241394, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7368, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 30.286138981791154, |
|
"grad_norm": 1.029023289680481, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7478, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 30.471943515421778, |
|
"grad_norm": 1.3363114595413208, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7487, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 30.657748049052397, |
|
"grad_norm": 1.4572941064834595, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7467, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 30.843552582683017, |
|
"grad_norm": 1.1603232622146606, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7481, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"eval_accuracy": 0.7643769272312328, |
|
"eval_f1_macro": 0.4100283180344899, |
|
"eval_f1_micro": 0.7643769272312328, |
|
"eval_loss": 0.677918553352356, |
|
"eval_runtime": 535.5808, |
|
"eval_samples_per_second": 107.175, |
|
"eval_steps_per_second": 1.675, |
|
"learning_rate": 0.0001, |
|
"step": 83421 |
|
}, |
|
{ |
|
"epoch": 31.029357116313637, |
|
"grad_norm": 1.3755314350128174, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7545, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 31.21516164994426, |
|
"grad_norm": 1.2730791568756104, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7523, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 31.40096618357488, |
|
"grad_norm": 1.3255988359451294, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7459, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 31.5867707172055, |
|
"grad_norm": 1.246517300605774, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7411, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 31.77257525083612, |
|
"grad_norm": 1.3307832479476929, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7313, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 31.95837978446674, |
|
"grad_norm": 1.4022419452667236, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7523, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_accuracy": 0.7641504503405864, |
|
"eval_f1_macro": 0.41093585032897456, |
|
"eval_f1_micro": 0.7641504503405864, |
|
"eval_loss": 0.6754601001739502, |
|
"eval_runtime": 518.6265, |
|
"eval_samples_per_second": 110.679, |
|
"eval_steps_per_second": 1.73, |
|
"learning_rate": 0.0001, |
|
"step": 86112 |
|
}, |
|
{ |
|
"epoch": 32.14418431809736, |
|
"grad_norm": 1.3993639945983887, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7392, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 32.32998885172798, |
|
"grad_norm": 1.2422406673431396, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7434, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 32.515793385358606, |
|
"grad_norm": 1.2804062366485596, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7366, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 32.70159791898922, |
|
"grad_norm": 1.6868325471878052, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7515, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 32.887402452619845, |
|
"grad_norm": 1.067484974861145, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7459, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"eval_accuracy": 0.7645162976254769, |
|
"eval_f1_macro": 0.4185852176848548, |
|
"eval_f1_micro": 0.7645162976254769, |
|
"eval_loss": 0.6749601364135742, |
|
"eval_runtime": 515.3075, |
|
"eval_samples_per_second": 111.392, |
|
"eval_steps_per_second": 1.741, |
|
"learning_rate": 0.0001, |
|
"step": 88803 |
|
}, |
|
{ |
|
"epoch": 33.07320698625046, |
|
"grad_norm": 1.3152741193771362, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7448, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 33.259011519881085, |
|
"grad_norm": 1.0844374895095825, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7353, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 33.44481605351171, |
|
"grad_norm": 1.1964083909988403, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7439, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 33.630620587142324, |
|
"grad_norm": 1.1734319925308228, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7434, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 33.81642512077295, |
|
"grad_norm": 1.2583281993865967, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7454, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_accuracy": 0.7650040940053309, |
|
"eval_f1_macro": 0.41458520697205553, |
|
"eval_f1_micro": 0.7650040940053309, |
|
"eval_loss": 0.6746455430984497, |
|
"eval_runtime": 513.1282, |
|
"eval_samples_per_second": 111.865, |
|
"eval_steps_per_second": 1.748, |
|
"learning_rate": 0.0001, |
|
"step": 91494 |
|
}, |
|
{ |
|
"epoch": 34.002229654403564, |
|
"grad_norm": 1.368170142173767, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7475, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 34.18803418803419, |
|
"grad_norm": 1.1858837604522705, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7363, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 34.37383872166481, |
|
"grad_norm": 1.5839638710021973, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7352, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 34.55964325529543, |
|
"grad_norm": 1.560922622680664, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7371, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 34.74544778892605, |
|
"grad_norm": 1.3220325708389282, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7442, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 34.931252322556674, |
|
"grad_norm": 1.275272250175476, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7426, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"eval_accuracy": 0.7641852929391474, |
|
"eval_f1_macro": 0.42549626405712787, |
|
"eval_f1_micro": 0.7641852929391474, |
|
"eval_loss": 0.6740487813949585, |
|
"eval_runtime": 514.2418, |
|
"eval_samples_per_second": 111.623, |
|
"eval_steps_per_second": 1.744, |
|
"learning_rate": 0.0001, |
|
"step": 94185 |
|
}, |
|
{ |
|
"epoch": 35.11705685618729, |
|
"grad_norm": 1.3994961977005005, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7432, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 35.30286138981791, |
|
"grad_norm": 1.2828725576400757, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7443, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 35.48866592344853, |
|
"grad_norm": 1.3517532348632812, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7426, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 35.67447045707915, |
|
"grad_norm": 1.2311525344848633, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7274, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 35.860274990709776, |
|
"grad_norm": 1.209617257118225, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7446, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_accuracy": 0.7646905106182819, |
|
"eval_f1_macro": 0.42042185334833837, |
|
"eval_f1_micro": 0.7646905106182819, |
|
"eval_loss": 0.6740365624427795, |
|
"eval_runtime": 526.8929, |
|
"eval_samples_per_second": 108.942, |
|
"eval_steps_per_second": 1.702, |
|
"learning_rate": 0.0001, |
|
"step": 96876 |
|
}, |
|
{ |
|
"epoch": 36.04607952434039, |
|
"grad_norm": 1.2096632719039917, |
|
"learning_rate": 0.0001, |
|
"loss": 0.738, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 36.231884057971016, |
|
"grad_norm": 1.258972406387329, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7421, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 36.41768859160163, |
|
"grad_norm": 1.5985496044158936, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7426, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 36.603493125232255, |
|
"grad_norm": 1.261830449104309, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7384, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 36.78929765886288, |
|
"grad_norm": 1.5109082460403442, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7383, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 36.975102192493495, |
|
"grad_norm": 1.1752451658248901, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7431, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"eval_accuracy": 0.7641678716398669, |
|
"eval_f1_macro": 0.4194338951085209, |
|
"eval_f1_micro": 0.7641678716398669, |
|
"eval_loss": 0.6731483936309814, |
|
"eval_runtime": 503.5131, |
|
"eval_samples_per_second": 114.001, |
|
"eval_steps_per_second": 1.781, |
|
"learning_rate": 0.0001, |
|
"step": 99567 |
|
}, |
|
{ |
|
"epoch": 37.16090672612412, |
|
"grad_norm": 1.4535642862319946, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7282, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 37.34671125975474, |
|
"grad_norm": 1.5286682844161987, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7332, |
|
"step": 100500 |
|
}, |
|
{ |
|
"epoch": 37.53251579338536, |
|
"grad_norm": 1.6406091451644897, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7387, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 37.71832032701598, |
|
"grad_norm": 1.0090203285217285, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7364, |
|
"step": 101500 |
|
}, |
|
{ |
|
"epoch": 37.9041248606466, |
|
"grad_norm": 1.3233065605163574, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7468, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"eval_accuracy": 0.7648821449103674, |
|
"eval_f1_macro": 0.42433192329853336, |
|
"eval_f1_micro": 0.7648821449103674, |
|
"eval_loss": 0.671998918056488, |
|
"eval_runtime": 536.8447, |
|
"eval_samples_per_second": 106.923, |
|
"eval_steps_per_second": 1.671, |
|
"learning_rate": 0.0001, |
|
"step": 102258 |
|
}, |
|
{ |
|
"epoch": 38.08992939427722, |
|
"grad_norm": 1.2303552627563477, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7463, |
|
"step": 102500 |
|
}, |
|
{ |
|
"epoch": 38.275733927907844, |
|
"grad_norm": 1.200378179550171, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7296, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 38.46153846153846, |
|
"grad_norm": 1.3014321327209473, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7437, |
|
"step": 103500 |
|
}, |
|
{ |
|
"epoch": 38.64734299516908, |
|
"grad_norm": 1.429656982421875, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7444, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 38.8331475287997, |
|
"grad_norm": 1.3417949676513672, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7307, |
|
"step": 104500 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"eval_accuracy": 0.7659274228671974, |
|
"eval_f1_macro": 0.42040362773805245, |
|
"eval_f1_micro": 0.7659274228671974, |
|
"eval_loss": 0.6694707870483398, |
|
"eval_runtime": 527.8534, |
|
"eval_samples_per_second": 108.744, |
|
"eval_steps_per_second": 1.699, |
|
"learning_rate": 0.0001, |
|
"step": 104949 |
|
}, |
|
{ |
|
"epoch": 39.01895206243032, |
|
"grad_norm": 1.656445860862732, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7423, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 39.204756596060946, |
|
"grad_norm": 1.33378267288208, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7407, |
|
"step": 105500 |
|
}, |
|
{ |
|
"epoch": 39.39056112969156, |
|
"grad_norm": 1.814835548400879, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7395, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 39.576365663322186, |
|
"grad_norm": 1.5420753955841064, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7321, |
|
"step": 106500 |
|
}, |
|
{ |
|
"epoch": 39.76217019695281, |
|
"grad_norm": 1.2062960863113403, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7302, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 39.947974730583425, |
|
"grad_norm": 1.4515705108642578, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7404, |
|
"step": 107500 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_accuracy": 0.7665023257434539, |
|
"eval_f1_macro": 0.42052839725843943, |
|
"eval_f1_micro": 0.7665023257434539, |
|
"eval_loss": 0.6693674325942993, |
|
"eval_runtime": 517.9605, |
|
"eval_samples_per_second": 110.821, |
|
"eval_steps_per_second": 1.732, |
|
"learning_rate": 0.0001, |
|
"step": 107640 |
|
}, |
|
{ |
|
"epoch": 40.13377926421405, |
|
"grad_norm": 1.211496114730835, |
|
"learning_rate": 0.0001, |
|
"loss": 0.728, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 40.319583797844665, |
|
"grad_norm": 1.0356364250183105, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7307, |
|
"step": 108500 |
|
}, |
|
{ |
|
"epoch": 40.50538833147529, |
|
"grad_norm": 1.508729338645935, |
|
"learning_rate": 0.0001, |
|
"loss": 0.732, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 40.69119286510591, |
|
"grad_norm": 1.315461277961731, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7471, |
|
"step": 109500 |
|
}, |
|
{ |
|
"epoch": 40.87699739873653, |
|
"grad_norm": 1.2628021240234375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7355, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 41.0, |
|
"eval_accuracy": 0.7658228950715145, |
|
"eval_f1_macro": 0.4176095837463332, |
|
"eval_f1_micro": 0.7658228950715145, |
|
"eval_loss": 0.6682748198509216, |
|
"eval_runtime": 491.8206, |
|
"eval_samples_per_second": 116.711, |
|
"eval_steps_per_second": 1.824, |
|
"learning_rate": 0.0001, |
|
"step": 110331 |
|
}, |
|
{ |
|
"epoch": 41.06280193236715, |
|
"grad_norm": 1.4534225463867188, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7362, |
|
"step": 110500 |
|
}, |
|
{ |
|
"epoch": 41.24860646599777, |
|
"grad_norm": 1.305442214012146, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7292, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 41.43441099962839, |
|
"grad_norm": 1.2671395540237427, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7293, |
|
"step": 111500 |
|
}, |
|
{ |
|
"epoch": 41.620215533259014, |
|
"grad_norm": 1.7886228561401367, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7379, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 41.80602006688963, |
|
"grad_norm": 1.565897822380066, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7388, |
|
"step": 112500 |
|
}, |
|
{ |
|
"epoch": 41.99182460052025, |
|
"grad_norm": 1.4743560552597046, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7508, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"eval_accuracy": 0.7665371683420149, |
|
"eval_f1_macro": 0.4307432846853069, |
|
"eval_f1_micro": 0.7665371683420149, |
|
"eval_loss": 0.6682831645011902, |
|
"eval_runtime": 484.4423, |
|
"eval_samples_per_second": 118.489, |
|
"eval_steps_per_second": 1.852, |
|
"learning_rate": 0.0001, |
|
"step": 113022 |
|
}, |
|
{ |
|
"epoch": 42.17762913415088, |
|
"grad_norm": 1.530715823173523, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7309, |
|
"step": 113500 |
|
}, |
|
{ |
|
"epoch": 42.36343366778149, |
|
"grad_norm": 1.4992977380752563, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7282, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 42.549238201412116, |
|
"grad_norm": 1.5467846393585205, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7377, |
|
"step": 114500 |
|
}, |
|
{ |
|
"epoch": 42.73504273504273, |
|
"grad_norm": 1.5137412548065186, |
|
"learning_rate": 0.0001, |
|
"loss": 0.733, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 42.920847268673356, |
|
"grad_norm": 1.6055561304092407, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7368, |
|
"step": 115500 |
|
}, |
|
{ |
|
"epoch": 43.0, |
|
"eval_accuracy": 0.7664152192470515, |
|
"eval_f1_macro": 0.42637174689527435, |
|
"eval_f1_micro": 0.7664152192470515, |
|
"eval_loss": 0.6695142984390259, |
|
"eval_runtime": 488.5528, |
|
"eval_samples_per_second": 117.492, |
|
"eval_steps_per_second": 1.836, |
|
"learning_rate": 0.0001, |
|
"step": 115713 |
|
}, |
|
{ |
|
"epoch": 43.10665180230398, |
|
"grad_norm": 1.460276484489441, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7371, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 43.292456335934595, |
|
"grad_norm": 1.4303547143936157, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7276, |
|
"step": 116500 |
|
}, |
|
{ |
|
"epoch": 43.47826086956522, |
|
"grad_norm": 1.5059736967086792, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7282, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 43.664065403195835, |
|
"grad_norm": 1.4985263347625732, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7347, |
|
"step": 117500 |
|
}, |
|
{ |
|
"epoch": 43.84986993682646, |
|
"grad_norm": 1.1422086954116821, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7362, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"eval_accuracy": 0.765840316370795, |
|
"eval_f1_macro": 0.42609728385101026, |
|
"eval_f1_micro": 0.765840316370795, |
|
"eval_loss": 0.669391393661499, |
|
"eval_runtime": 508.1911, |
|
"eval_samples_per_second": 112.952, |
|
"eval_steps_per_second": 1.765, |
|
"learning_rate": 0.0001, |
|
"step": 118404 |
|
}, |
|
{ |
|
"epoch": 44.03567447045708, |
|
"grad_norm": 1.8496322631835938, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7336, |
|
"step": 118500 |
|
}, |
|
{ |
|
"epoch": 44.2214790040877, |
|
"grad_norm": 1.4898383617401123, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7361, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 44.40728353771832, |
|
"grad_norm": 1.341386079788208, |
|
"learning_rate": 0.0001, |
|
"loss": 0.733, |
|
"step": 119500 |
|
}, |
|
{ |
|
"epoch": 44.593088071348944, |
|
"grad_norm": 1.747814416885376, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7256, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 44.77889260497956, |
|
"grad_norm": 1.5329481363296509, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7203, |
|
"step": 120500 |
|
}, |
|
{ |
|
"epoch": 44.964697138610184, |
|
"grad_norm": 1.4953057765960693, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7287, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 45.0, |
|
"eval_accuracy": 0.7649518301074895, |
|
"eval_f1_macro": 0.4307801257532618, |
|
"eval_f1_micro": 0.7649518301074895, |
|
"eval_loss": 0.6695447564125061, |
|
"eval_runtime": 486.6366, |
|
"eval_samples_per_second": 117.955, |
|
"eval_steps_per_second": 1.843, |
|
"learning_rate": 0.0001, |
|
"step": 121095 |
|
}, |
|
{ |
|
"epoch": 45.1505016722408, |
|
"grad_norm": 1.5713474750518799, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7282, |
|
"step": 121500 |
|
}, |
|
{ |
|
"epoch": 45.33630620587142, |
|
"grad_norm": 1.133755087852478, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7333, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 45.52211073950205, |
|
"grad_norm": 1.4431047439575195, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7271, |
|
"step": 122500 |
|
}, |
|
{ |
|
"epoch": 45.70791527313266, |
|
"grad_norm": 1.942878007888794, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7298, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 45.893719806763286, |
|
"grad_norm": 1.6559330224990845, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7374, |
|
"step": 123500 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"eval_accuracy": 0.7673908120067595, |
|
"eval_f1_macro": 0.4362286100546047, |
|
"eval_f1_micro": 0.7673908120067595, |
|
"eval_loss": 0.6653340458869934, |
|
"eval_runtime": 483.687, |
|
"eval_samples_per_second": 118.674, |
|
"eval_steps_per_second": 1.855, |
|
"learning_rate": 0.0001, |
|
"step": 123786 |
|
}, |
|
{ |
|
"epoch": 46.0795243403939, |
|
"grad_norm": 1.346835732460022, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7369, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 46.265328874024526, |
|
"grad_norm": 1.3699783086776733, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7344, |
|
"step": 124500 |
|
}, |
|
{ |
|
"epoch": 46.45113340765515, |
|
"grad_norm": 1.4523323774337769, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7322, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 46.636937941285765, |
|
"grad_norm": 1.5713220834732056, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7276, |
|
"step": 125500 |
|
}, |
|
{ |
|
"epoch": 46.82274247491639, |
|
"grad_norm": 1.4702354669570923, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7321, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 47.0, |
|
"eval_accuracy": 0.7674430759046009, |
|
"eval_f1_macro": 0.43279477858934173, |
|
"eval_f1_micro": 0.7674430759046009, |
|
"eval_loss": 0.6659862995147705, |
|
"eval_runtime": 487.7507, |
|
"eval_samples_per_second": 117.685, |
|
"eval_steps_per_second": 1.839, |
|
"learning_rate": 0.0001, |
|
"step": 126477 |
|
}, |
|
{ |
|
"epoch": 47.00854700854701, |
|
"grad_norm": 1.5675249099731445, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7255, |
|
"step": 126500 |
|
}, |
|
{ |
|
"epoch": 47.19435154217763, |
|
"grad_norm": 1.4069844484329224, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7272, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 47.38015607580825, |
|
"grad_norm": 1.2489932775497437, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7273, |
|
"step": 127500 |
|
}, |
|
{ |
|
"epoch": 47.56596060943887, |
|
"grad_norm": 1.8463833332061768, |
|
"learning_rate": 0.0001, |
|
"loss": 0.74, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 47.75176514306949, |
|
"grad_norm": 1.3903892040252686, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7391, |
|
"step": 128500 |
|
}, |
|
{ |
|
"epoch": 47.937569676700114, |
|
"grad_norm": 1.2857866287231445, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7352, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"eval_accuracy": 0.7668855943276249, |
|
"eval_f1_macro": 0.43077383038275147, |
|
"eval_f1_micro": 0.7668855943276249, |
|
"eval_loss": 0.665557861328125, |
|
"eval_runtime": 486.8422, |
|
"eval_samples_per_second": 117.905, |
|
"eval_steps_per_second": 1.842, |
|
"learning_rate": 0.0001, |
|
"step": 129168 |
|
}, |
|
{ |
|
"epoch": 48.12337421033073, |
|
"grad_norm": 1.7764297723770142, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7203, |
|
"step": 129500 |
|
}, |
|
{ |
|
"epoch": 48.309178743961354, |
|
"grad_norm": 1.469336748123169, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7292, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 48.49498327759197, |
|
"grad_norm": 1.4002306461334229, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7279, |
|
"step": 130500 |
|
}, |
|
{ |
|
"epoch": 48.680787811222594, |
|
"grad_norm": 1.6152621507644653, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7386, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 48.86659234485322, |
|
"grad_norm": 1.5064826011657715, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7373, |
|
"step": 131500 |
|
}, |
|
{ |
|
"epoch": 49.0, |
|
"eval_accuracy": 0.7666242748384174, |
|
"eval_f1_macro": 0.4241681801855841, |
|
"eval_f1_micro": 0.7666242748384174, |
|
"eval_loss": 0.6672787666320801, |
|
"eval_runtime": 484.1155, |
|
"eval_samples_per_second": 118.569, |
|
"eval_steps_per_second": 1.853, |
|
"learning_rate": 0.0001, |
|
"step": 131859 |
|
}, |
|
{ |
|
"epoch": 49.05239687848383, |
|
"grad_norm": 1.329869031906128, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7295, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 49.238201412114456, |
|
"grad_norm": 1.749746322631836, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7301, |
|
"step": 132500 |
|
}, |
|
{ |
|
"epoch": 49.42400594574507, |
|
"grad_norm": 2.059324026107788, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7214, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 49.609810479375696, |
|
"grad_norm": 1.6400606632232666, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7285, |
|
"step": 133500 |
|
}, |
|
{ |
|
"epoch": 49.79561501300632, |
|
"grad_norm": 1.6473073959350586, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7283, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 49.981419546636936, |
|
"grad_norm": 1.7654767036437988, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7307, |
|
"step": 134500 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"eval_accuracy": 0.7662584275535269, |
|
"eval_f1_macro": 0.43151276516162357, |
|
"eval_f1_micro": 0.7662584275535269, |
|
"eval_loss": 0.6661437749862671, |
|
"eval_runtime": 486.5547, |
|
"eval_samples_per_second": 117.974, |
|
"eval_steps_per_second": 1.844, |
|
"learning_rate": 0.0001, |
|
"step": 134550 |
|
}, |
|
{ |
|
"epoch": 50.16722408026756, |
|
"grad_norm": 1.484832525253296, |
|
"learning_rate": 0.0001, |
|
"loss": 0.731, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 50.35302861389818, |
|
"grad_norm": 1.4548848867416382, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7262, |
|
"step": 135500 |
|
}, |
|
{ |
|
"epoch": 50.5388331475288, |
|
"grad_norm": 1.7120981216430664, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7345, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 50.72463768115942, |
|
"grad_norm": 1.789556860923767, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7253, |
|
"step": 136500 |
|
}, |
|
{ |
|
"epoch": 50.91044221479004, |
|
"grad_norm": 2.5206713676452637, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7235, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 51.0, |
|
"eval_accuracy": 0.7667288026341005, |
|
"eval_f1_macro": 0.4307690989303114, |
|
"eval_f1_micro": 0.7667288026341005, |
|
"eval_loss": 0.6638755798339844, |
|
"eval_runtime": 505.3338, |
|
"eval_samples_per_second": 113.59, |
|
"eval_steps_per_second": 1.775, |
|
"learning_rate": 0.0001, |
|
"step": 137241 |
|
}, |
|
{ |
|
"epoch": 51.09624674842066, |
|
"grad_norm": 1.9092504978179932, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7266, |
|
"step": 137500 |
|
}, |
|
{ |
|
"epoch": 51.282051282051285, |
|
"grad_norm": 1.6534066200256348, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7266, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 51.4678558156819, |
|
"grad_norm": 1.7287977933883667, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7304, |
|
"step": 138500 |
|
}, |
|
{ |
|
"epoch": 51.653660349312524, |
|
"grad_norm": 2.113309860229492, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7356, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 51.83946488294314, |
|
"grad_norm": 1.4403417110443115, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7295, |
|
"step": 139500 |
|
}, |
|
{ |
|
"epoch": 52.0, |
|
"eval_accuracy": 0.7679134509851745, |
|
"eval_f1_macro": 0.4427799679621408, |
|
"eval_f1_micro": 0.7679134509851745, |
|
"eval_loss": 0.6654694676399231, |
|
"eval_runtime": 502.5773, |
|
"eval_samples_per_second": 114.213, |
|
"eval_steps_per_second": 1.785, |
|
"learning_rate": 0.0001, |
|
"step": 139932 |
|
}, |
|
{ |
|
"epoch": 52.025269416573764, |
|
"grad_norm": 1.7017192840576172, |
|
"learning_rate": 0.0001, |
|
"loss": 0.712, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 52.21107395020439, |
|
"grad_norm": 1.3907127380371094, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7329, |
|
"step": 140500 |
|
}, |
|
{ |
|
"epoch": 52.396878483835, |
|
"grad_norm": 1.791027307510376, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7276, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 52.58268301746563, |
|
"grad_norm": 1.384203553199768, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7126, |
|
"step": 141500 |
|
}, |
|
{ |
|
"epoch": 52.76848755109625, |
|
"grad_norm": 2.309049367904663, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7284, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 52.954292084726866, |
|
"grad_norm": 1.4892642498016357, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7267, |
|
"step": 142500 |
|
}, |
|
{ |
|
"epoch": 53.0, |
|
"eval_accuracy": 0.767234020313235, |
|
"eval_f1_macro": 0.4341825403324277, |
|
"eval_f1_micro": 0.767234020313235, |
|
"eval_loss": 0.6643231511116028, |
|
"eval_runtime": 484.553, |
|
"eval_samples_per_second": 118.462, |
|
"eval_steps_per_second": 1.851, |
|
"learning_rate": 0.0001, |
|
"step": 142623 |
|
}, |
|
{ |
|
"epoch": 53.14009661835749, |
|
"grad_norm": 1.9168522357940674, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7301, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 53.325901151988106, |
|
"grad_norm": 2.0074169635772705, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7194, |
|
"step": 143500 |
|
}, |
|
{ |
|
"epoch": 53.51170568561873, |
|
"grad_norm": 1.7252275943756104, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7248, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 53.69751021924935, |
|
"grad_norm": 1.5249263048171997, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7318, |
|
"step": 144500 |
|
}, |
|
{ |
|
"epoch": 53.88331475287997, |
|
"grad_norm": 1.5287233591079712, |
|
"learning_rate": 0.0001, |
|
"loss": 0.724, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 54.0, |
|
"eval_accuracy": 0.7663455340499294, |
|
"eval_f1_macro": 0.4459616186399035, |
|
"eval_f1_micro": 0.7663455340499294, |
|
"eval_loss": 0.667382001876831, |
|
"eval_runtime": 485.0681, |
|
"eval_samples_per_second": 118.336, |
|
"eval_steps_per_second": 1.849, |
|
"learning_rate": 0.0001, |
|
"step": 145314 |
|
}, |
|
{ |
|
"epoch": 54.06911928651059, |
|
"grad_norm": 2.074103593826294, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7333, |
|
"step": 145500 |
|
}, |
|
{ |
|
"epoch": 54.25492382014121, |
|
"grad_norm": 1.8293678760528564, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7265, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 54.44072835377183, |
|
"grad_norm": 1.985289454460144, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7241, |
|
"step": 146500 |
|
}, |
|
{ |
|
"epoch": 54.626532887402455, |
|
"grad_norm": 1.5796940326690674, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7188, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 54.81233742103307, |
|
"grad_norm": 2.1631064414978027, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7321, |
|
"step": 147500 |
|
}, |
|
{ |
|
"epoch": 54.998141954663694, |
|
"grad_norm": 1.892568588256836, |
|
"learning_rate": 0.0001, |
|
"loss": 0.734, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 55.0, |
|
"eval_accuracy": 0.7684709325621505, |
|
"eval_f1_macro": 0.4389385481332223, |
|
"eval_f1_micro": 0.7684709325621505, |
|
"eval_loss": 0.6627440452575684, |
|
"eval_runtime": 477.8566, |
|
"eval_samples_per_second": 120.122, |
|
"eval_steps_per_second": 1.877, |
|
"learning_rate": 0.0001, |
|
"step": 148005 |
|
}, |
|
{ |
|
"epoch": 55.18394648829432, |
|
"grad_norm": 1.820019006729126, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7167, |
|
"step": 148500 |
|
}, |
|
{ |
|
"epoch": 55.369751021924934, |
|
"grad_norm": 1.9633277654647827, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7347, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 55.55555555555556, |
|
"grad_norm": 1.589463710784912, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7285, |
|
"step": 149500 |
|
}, |
|
{ |
|
"epoch": 55.74136008918617, |
|
"grad_norm": 1.771761417388916, |
|
"learning_rate": 0.0001, |
|
"loss": 0.73, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 55.9271646228168, |
|
"grad_norm": 1.93351411819458, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7285, |
|
"step": 150500 |
|
}, |
|
{ |
|
"epoch": 56.0, |
|
"eval_accuracy": 0.767094649918991, |
|
"eval_f1_macro": 0.43857797557707, |
|
"eval_f1_micro": 0.767094649918991, |
|
"eval_loss": 0.6627209186553955, |
|
"eval_runtime": 479.9805, |
|
"eval_samples_per_second": 119.59, |
|
"eval_steps_per_second": 1.869, |
|
"learning_rate": 0.0001, |
|
"step": 150696 |
|
}, |
|
{ |
|
"epoch": 56.11296915644742, |
|
"grad_norm": 1.5868228673934937, |
|
"learning_rate": 0.0001, |
|
"loss": 0.724, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 56.298773690078036, |
|
"grad_norm": 1.6344318389892578, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7218, |
|
"step": 151500 |
|
}, |
|
{ |
|
"epoch": 56.48457822370866, |
|
"grad_norm": 1.6172913312911987, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7248, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 56.670382757339276, |
|
"grad_norm": 1.5940032005310059, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7336, |
|
"step": 152500 |
|
}, |
|
{ |
|
"epoch": 56.8561872909699, |
|
"grad_norm": 1.6969693899154663, |
|
"learning_rate": 0.0001, |
|
"loss": 0.729, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 57.0, |
|
"eval_accuracy": 0.7669204369261859, |
|
"eval_f1_macro": 0.43847119749006624, |
|
"eval_f1_micro": 0.7669204369261859, |
|
"eval_loss": 0.6640397310256958, |
|
"eval_runtime": 484.1334, |
|
"eval_samples_per_second": 118.564, |
|
"eval_steps_per_second": 1.853, |
|
"learning_rate": 0.0001, |
|
"step": 153387 |
|
}, |
|
{ |
|
"epoch": 57.04199182460052, |
|
"grad_norm": 1.940199851989746, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7136, |
|
"step": 153500 |
|
}, |
|
{ |
|
"epoch": 57.22779635823114, |
|
"grad_norm": 2.240196466445923, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7264, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 57.41360089186176, |
|
"grad_norm": 1.6355632543563843, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7226, |
|
"step": 154500 |
|
}, |
|
{ |
|
"epoch": 57.599405425492385, |
|
"grad_norm": 1.930558681488037, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7322, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 57.785209959123, |
|
"grad_norm": 2.192221164703369, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7322, |
|
"step": 155500 |
|
}, |
|
{ |
|
"epoch": 57.971014492753625, |
|
"grad_norm": 1.9269661903381348, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7179, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 58.0, |
|
"eval_accuracy": 0.7672862842110765, |
|
"eval_f1_macro": 0.43760916892251167, |
|
"eval_f1_micro": 0.7672862842110765, |
|
"eval_loss": 0.6627684235572815, |
|
"eval_runtime": 479.7239, |
|
"eval_samples_per_second": 119.654, |
|
"eval_steps_per_second": 1.87, |
|
"learning_rate": 0.0001, |
|
"step": 156078 |
|
}, |
|
{ |
|
"epoch": 58.15681902638424, |
|
"grad_norm": 2.0105526447296143, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7321, |
|
"step": 156500 |
|
}, |
|
{ |
|
"epoch": 58.342623560014864, |
|
"grad_norm": 1.7992244958877563, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7319, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 58.52842809364549, |
|
"grad_norm": 2.2000529766082764, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7215, |
|
"step": 157500 |
|
}, |
|
{ |
|
"epoch": 58.714232627276104, |
|
"grad_norm": 2.374115228652954, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7189, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 58.90003716090673, |
|
"grad_norm": 2.0600476264953613, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7257, |
|
"step": 158500 |
|
}, |
|
{ |
|
"epoch": 59.0, |
|
"eval_accuracy": 0.7679134509851745, |
|
"eval_f1_macro": 0.439932052501894, |
|
"eval_f1_micro": 0.7679134509851745, |
|
"eval_loss": 0.6614954471588135, |
|
"eval_runtime": 476.1801, |
|
"eval_samples_per_second": 120.545, |
|
"eval_steps_per_second": 1.884, |
|
"learning_rate": 0.0001, |
|
"step": 158769 |
|
}, |
|
{ |
|
"epoch": 59.08584169453734, |
|
"grad_norm": 1.518589735031128, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7106, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 59.27164622816797, |
|
"grad_norm": 1.8264387845993042, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7262, |
|
"step": 159500 |
|
}, |
|
{ |
|
"epoch": 59.45745076179859, |
|
"grad_norm": 2.5099358558654785, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7217, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 59.643255295429206, |
|
"grad_norm": 2.0228655338287354, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7234, |
|
"step": 160500 |
|
}, |
|
{ |
|
"epoch": 59.82905982905983, |
|
"grad_norm": 1.8197671175003052, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7297, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"eval_accuracy": 0.766990122123308, |
|
"eval_f1_macro": 0.44188579219640917, |
|
"eval_f1_micro": 0.766990122123308, |
|
"eval_loss": 0.6633245944976807, |
|
"eval_runtime": 478.7046, |
|
"eval_samples_per_second": 119.909, |
|
"eval_steps_per_second": 1.874, |
|
"learning_rate": 0.0001, |
|
"step": 161460 |
|
}, |
|
{ |
|
"epoch": 60.01486436269045, |
|
"grad_norm": 1.6832709312438965, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7319, |
|
"step": 161500 |
|
}, |
|
{ |
|
"epoch": 60.20066889632107, |
|
"grad_norm": 2.1717429161071777, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7281, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 60.38647342995169, |
|
"grad_norm": 2.8050575256347656, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7216, |
|
"step": 162500 |
|
}, |
|
{ |
|
"epoch": 60.57227796358231, |
|
"grad_norm": 2.1643614768981934, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7272, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 60.75808249721293, |
|
"grad_norm": 2.2237448692321777, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7238, |
|
"step": 163500 |
|
}, |
|
{ |
|
"epoch": 60.943887030843555, |
|
"grad_norm": 2.3279480934143066, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7297, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 61.0, |
|
"eval_accuracy": 0.7685754603578335, |
|
"eval_f1_macro": 0.4370683373238057, |
|
"eval_f1_micro": 0.7685754603578335, |
|
"eval_loss": 0.6611309051513672, |
|
"eval_runtime": 490.7606, |
|
"eval_samples_per_second": 116.963, |
|
"eval_steps_per_second": 1.828, |
|
"learning_rate": 0.0001, |
|
"step": 164151 |
|
}, |
|
{ |
|
"epoch": 61.12969156447417, |
|
"grad_norm": 1.1877753734588623, |
|
"learning_rate": 0.0001, |
|
"loss": 0.721, |
|
"step": 164500 |
|
}, |
|
{ |
|
"epoch": 61.315496098104795, |
|
"grad_norm": 1.6631951332092285, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7222, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 61.50130063173541, |
|
"grad_norm": 1.9209131002426147, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7283, |
|
"step": 165500 |
|
}, |
|
{ |
|
"epoch": 61.687105165366034, |
|
"grad_norm": 2.088667392730713, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7217, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 61.87290969899666, |
|
"grad_norm": 2.503835916519165, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7262, |
|
"step": 166500 |
|
}, |
|
{ |
|
"epoch": 62.0, |
|
"eval_accuracy": 0.7684360899635895, |
|
"eval_f1_macro": 0.45352172583851785, |
|
"eval_f1_micro": 0.7684360899635895, |
|
"eval_loss": 0.660831093788147, |
|
"eval_runtime": 497.2138, |
|
"eval_samples_per_second": 115.445, |
|
"eval_steps_per_second": 1.804, |
|
"learning_rate": 0.0001, |
|
"step": 166842 |
|
}, |
|
{ |
|
"epoch": 62.058714232627274, |
|
"grad_norm": 1.5878970623016357, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7166, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 62.2445187662579, |
|
"grad_norm": 1.8134915828704834, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7234, |
|
"step": 167500 |
|
}, |
|
{ |
|
"epoch": 62.43032329988852, |
|
"grad_norm": 1.9176697731018066, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7246, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 62.61612783351914, |
|
"grad_norm": 1.6024200916290283, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7322, |
|
"step": 168500 |
|
}, |
|
{ |
|
"epoch": 62.80193236714976, |
|
"grad_norm": 1.7893162965774536, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7227, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 62.98773690078038, |
|
"grad_norm": 1.9036942720413208, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7204, |
|
"step": 169500 |
|
}, |
|
{ |
|
"epoch": 63.0, |
|
"eval_accuracy": 0.767791501890211, |
|
"eval_f1_macro": 0.44611945476580944, |
|
"eval_f1_micro": 0.767791501890211, |
|
"eval_loss": 0.6621896028518677, |
|
"eval_runtime": 505.1543, |
|
"eval_samples_per_second": 113.631, |
|
"eval_steps_per_second": 1.776, |
|
"learning_rate": 0.0001, |
|
"step": 169533 |
|
}, |
|
{ |
|
"epoch": 63.173541434411, |
|
"grad_norm": 2.1570420265197754, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7229, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 63.35934596804162, |
|
"grad_norm": 1.816713571548462, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7209, |
|
"step": 170500 |
|
}, |
|
{ |
|
"epoch": 63.54515050167224, |
|
"grad_norm": 1.9060814380645752, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7279, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 63.73095503530286, |
|
"grad_norm": 1.8209328651428223, |
|
"learning_rate": 0.0001, |
|
"loss": 0.726, |
|
"step": 171500 |
|
}, |
|
{ |
|
"epoch": 63.91675956893348, |
|
"grad_norm": 2.5898454189300537, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7296, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 64.0, |
|
"eval_accuracy": 0.767547603700284, |
|
"eval_f1_macro": 0.4439334258360575, |
|
"eval_f1_micro": 0.767547603700284, |
|
"eval_loss": 0.6610415577888489, |
|
"eval_runtime": 518.2702, |
|
"eval_samples_per_second": 110.755, |
|
"eval_steps_per_second": 1.731, |
|
"learning_rate": 0.0001, |
|
"step": 172224 |
|
}, |
|
{ |
|
"epoch": 64.1025641025641, |
|
"grad_norm": 2.03393816947937, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7116, |
|
"step": 172500 |
|
}, |
|
{ |
|
"epoch": 64.28836863619472, |
|
"grad_norm": 2.2850654125213623, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7204, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 64.47417316982535, |
|
"grad_norm": 2.1166341304779053, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7139, |
|
"step": 173500 |
|
}, |
|
{ |
|
"epoch": 64.65997770345597, |
|
"grad_norm": 2.2916886806488037, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7235, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 64.84578223708658, |
|
"grad_norm": 2.0275001525878906, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7253, |
|
"step": 174500 |
|
}, |
|
{ |
|
"epoch": 65.0, |
|
"eval_accuracy": 0.7680354000801379, |
|
"eval_f1_macro": 0.434573899819693, |
|
"eval_f1_micro": 0.7680354000801379, |
|
"eval_loss": 0.6589834690093994, |
|
"eval_runtime": 514.9526, |
|
"eval_samples_per_second": 111.469, |
|
"eval_steps_per_second": 1.742, |
|
"learning_rate": 0.0001, |
|
"step": 174915 |
|
}, |
|
{ |
|
"epoch": 65.03158677071721, |
|
"grad_norm": 1.5037002563476562, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7336, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 65.21739130434783, |
|
"grad_norm": 1.769515037536621, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7181, |
|
"step": 175500 |
|
}, |
|
{ |
|
"epoch": 65.40319583797844, |
|
"grad_norm": 2.0847535133361816, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7236, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 65.58900037160906, |
|
"grad_norm": 2.0402798652648926, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7196, |
|
"step": 176500 |
|
}, |
|
{ |
|
"epoch": 65.77480490523969, |
|
"grad_norm": 2.2478229999542236, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7277, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 65.9606094388703, |
|
"grad_norm": 2.4031426906585693, |
|
"learning_rate": 0.0001, |
|
"loss": 0.723, |
|
"step": 177500 |
|
}, |
|
{ |
|
"epoch": 66.0, |
|
"eval_accuracy": 0.76845351126287, |
|
"eval_f1_macro": 0.4397010901020239, |
|
"eval_f1_micro": 0.76845351126287, |
|
"eval_loss": 0.6599727272987366, |
|
"eval_runtime": 512.5743, |
|
"eval_samples_per_second": 111.986, |
|
"eval_steps_per_second": 1.75, |
|
"learning_rate": 0.0001, |
|
"step": 177606 |
|
}, |
|
{ |
|
"epoch": 66.14641397250092, |
|
"grad_norm": 1.965077519416809, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7167, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 66.33221850613155, |
|
"grad_norm": 2.626265525817871, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7315, |
|
"step": 178500 |
|
}, |
|
{ |
|
"epoch": 66.51802303976217, |
|
"grad_norm": 2.360399007797241, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7101, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 66.70382757339279, |
|
"grad_norm": 2.442690849304199, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7336, |
|
"step": 179500 |
|
}, |
|
{ |
|
"epoch": 66.88963210702342, |
|
"grad_norm": 1.652727723121643, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7257, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 67.0, |
|
"eval_accuracy": 0.769045835438407, |
|
"eval_f1_macro": 0.44838573955584105, |
|
"eval_f1_micro": 0.769045835438407, |
|
"eval_loss": 0.6572328209877014, |
|
"eval_runtime": 509.8483, |
|
"eval_samples_per_second": 112.584, |
|
"eval_steps_per_second": 1.759, |
|
"learning_rate": 0.0001, |
|
"step": 180297 |
|
}, |
|
{ |
|
"epoch": 67.07543664065403, |
|
"grad_norm": 2.3557684421539307, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7238, |
|
"step": 180500 |
|
}, |
|
{ |
|
"epoch": 67.26124117428465, |
|
"grad_norm": 2.1195363998413086, |
|
"learning_rate": 0.0001, |
|
"loss": 0.718, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 67.44704570791528, |
|
"grad_norm": 2.189373731613159, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7148, |
|
"step": 181500 |
|
}, |
|
{ |
|
"epoch": 67.6328502415459, |
|
"grad_norm": 2.0466713905334473, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7295, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 67.81865477517651, |
|
"grad_norm": 2.2755849361419678, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7257, |
|
"step": 182500 |
|
}, |
|
{ |
|
"epoch": 68.0, |
|
"eval_accuracy": 0.7686799881535165, |
|
"eval_f1_macro": 0.4442341389313245, |
|
"eval_f1_micro": 0.7686799881535165, |
|
"eval_loss": 0.658860445022583, |
|
"eval_runtime": 508.6729, |
|
"eval_samples_per_second": 112.845, |
|
"eval_steps_per_second": 1.763, |
|
"learning_rate": 0.0001, |
|
"step": 182988 |
|
}, |
|
{ |
|
"epoch": 68.00445930880713, |
|
"grad_norm": 2.214465618133545, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7245, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 68.19026384243776, |
|
"grad_norm": 2.169665813446045, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7131, |
|
"step": 183500 |
|
}, |
|
{ |
|
"epoch": 68.37606837606837, |
|
"grad_norm": 2.195838451385498, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7217, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 68.56187290969899, |
|
"grad_norm": 2.631565809249878, |
|
"learning_rate": 0.0001, |
|
"loss": 0.714, |
|
"step": 184500 |
|
}, |
|
{ |
|
"epoch": 68.74767744332962, |
|
"grad_norm": 2.0778391361236572, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7235, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 68.93348197696024, |
|
"grad_norm": 2.478853464126587, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7299, |
|
"step": 185500 |
|
}, |
|
{ |
|
"epoch": 69.0, |
|
"eval_accuracy": 0.7688542011463215, |
|
"eval_f1_macro": 0.43926108990057783, |
|
"eval_f1_micro": 0.7688542011463215, |
|
"eval_loss": 0.659292995929718, |
|
"eval_runtime": 515.951, |
|
"eval_samples_per_second": 111.253, |
|
"eval_steps_per_second": 1.739, |
|
"learning_rate": 0.0001, |
|
"step": 185679 |
|
}, |
|
{ |
|
"epoch": 69.11928651059085, |
|
"grad_norm": 2.1239874362945557, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7327, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 69.30509104422148, |
|
"grad_norm": 2.156639814376831, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7178, |
|
"step": 186500 |
|
}, |
|
{ |
|
"epoch": 69.4908955778521, |
|
"grad_norm": 2.094212532043457, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7205, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 69.67670011148272, |
|
"grad_norm": 2.293548822402954, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7275, |
|
"step": 187500 |
|
}, |
|
{ |
|
"epoch": 69.86250464511335, |
|
"grad_norm": 2.42189359664917, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7289, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 70.0, |
|
"eval_accuracy": 0.7679134509851745, |
|
"eval_f1_macro": 0.4357439201261406, |
|
"eval_f1_micro": 0.7679134509851745, |
|
"eval_loss": 0.658970832824707, |
|
"eval_runtime": 507.1382, |
|
"eval_samples_per_second": 113.186, |
|
"eval_steps_per_second": 1.769, |
|
"learning_rate": 0.0001, |
|
"step": 188370 |
|
}, |
|
{ |
|
"epoch": 70.04830917874396, |
|
"grad_norm": 2.229328155517578, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7243, |
|
"step": 188500 |
|
}, |
|
{ |
|
"epoch": 70.23411371237458, |
|
"grad_norm": 2.238511323928833, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7194, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 70.4199182460052, |
|
"grad_norm": 2.1684393882751465, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7217, |
|
"step": 189500 |
|
}, |
|
{ |
|
"epoch": 70.60572277963583, |
|
"grad_norm": 2.2486910820007324, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7249, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 70.79152731326644, |
|
"grad_norm": 2.3368637561798096, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7169, |
|
"step": 190500 |
|
}, |
|
{ |
|
"epoch": 70.97733184689706, |
|
"grad_norm": 2.2436201572418213, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7179, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 71.0, |
|
"eval_accuracy": 0.768244455671504, |
|
"eval_f1_macro": 0.4432082950234043, |
|
"eval_f1_micro": 0.768244455671504, |
|
"eval_loss": 0.6567061543464661, |
|
"eval_runtime": 510.955, |
|
"eval_samples_per_second": 112.341, |
|
"eval_steps_per_second": 1.756, |
|
"learning_rate": 0.0001, |
|
"step": 191061 |
|
}, |
|
{ |
|
"epoch": 71.16313638052769, |
|
"grad_norm": 2.196202039718628, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7218, |
|
"step": 191500 |
|
}, |
|
{ |
|
"epoch": 71.3489409141583, |
|
"grad_norm": 1.8094837665557861, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7271, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 71.53474544778892, |
|
"grad_norm": 2.3732166290283203, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7135, |
|
"step": 192500 |
|
}, |
|
{ |
|
"epoch": 71.72054998141955, |
|
"grad_norm": 2.5137126445770264, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7195, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 71.90635451505017, |
|
"grad_norm": 1.83713960647583, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7292, |
|
"step": 193500 |
|
}, |
|
{ |
|
"epoch": 72.0, |
|
"eval_accuracy": 0.7681225065765405, |
|
"eval_f1_macro": 0.4369170898714745, |
|
"eval_f1_micro": 0.7681225065765405, |
|
"eval_loss": 0.65887850522995, |
|
"eval_runtime": 522.7505, |
|
"eval_samples_per_second": 109.806, |
|
"eval_steps_per_second": 1.716, |
|
"learning_rate": 0.0001, |
|
"step": 193752 |
|
}, |
|
{ |
|
"epoch": 72.09215904868078, |
|
"grad_norm": 2.539991855621338, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7165, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 72.27796358231141, |
|
"grad_norm": 2.3056955337524414, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7267, |
|
"step": 194500 |
|
}, |
|
{ |
|
"epoch": 72.46376811594203, |
|
"grad_norm": 2.263315200805664, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7303, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 72.64957264957265, |
|
"grad_norm": 2.132422685623169, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7237, |
|
"step": 195500 |
|
}, |
|
{ |
|
"epoch": 72.83537718320326, |
|
"grad_norm": 2.034406900405884, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7139, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 73.0, |
|
"eval_accuracy": 0.7675998675981255, |
|
"eval_f1_macro": 0.4436833314710758, |
|
"eval_f1_micro": 0.7675998675981255, |
|
"eval_loss": 0.6611541509628296, |
|
"eval_runtime": 506.2664, |
|
"eval_samples_per_second": 113.381, |
|
"eval_steps_per_second": 1.772, |
|
"learning_rate": 0.0001, |
|
"step": 196443 |
|
}, |
|
{ |
|
"epoch": 73.0211817168339, |
|
"grad_norm": 1.860060691833496, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7214, |
|
"step": 196500 |
|
}, |
|
{ |
|
"epoch": 73.20698625046451, |
|
"grad_norm": 2.3336594104766846, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7223, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 73.39279078409513, |
|
"grad_norm": 2.637392282485962, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7284, |
|
"step": 197500 |
|
}, |
|
{ |
|
"epoch": 73.57859531772576, |
|
"grad_norm": 1.9666019678115845, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7141, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 73.76439985135637, |
|
"grad_norm": 2.4116933345794678, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7158, |
|
"step": 198500 |
|
}, |
|
{ |
|
"epoch": 73.95020438498699, |
|
"grad_norm": 2.568808078765869, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7307, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 74.0, |
|
"eval_accuracy": 0.768488353861431, |
|
"eval_f1_macro": 0.44906341810873124, |
|
"eval_f1_micro": 0.768488353861431, |
|
"eval_loss": 0.6570971012115479, |
|
"eval_runtime": 519.8036, |
|
"eval_samples_per_second": 110.428, |
|
"eval_steps_per_second": 1.726, |
|
"learning_rate": 0.0001, |
|
"step": 199134 |
|
}, |
|
{ |
|
"epoch": 74.13600891861762, |
|
"grad_norm": 2.4878387451171875, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7172, |
|
"step": 199500 |
|
}, |
|
{ |
|
"epoch": 74.32181345224824, |
|
"grad_norm": 2.029940605163574, |
|
"learning_rate": 0.0001, |
|
"loss": 0.713, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 74.50761798587885, |
|
"grad_norm": 1.8782607316970825, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7148, |
|
"step": 200500 |
|
}, |
|
{ |
|
"epoch": 74.69342251950948, |
|
"grad_norm": 2.0745551586151123, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7249, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 74.8792270531401, |
|
"grad_norm": 2.2313883304595947, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7238, |
|
"step": 201500 |
|
}, |
|
{ |
|
"epoch": 75.0, |
|
"eval_accuracy": 0.768174770474382, |
|
"eval_f1_macro": 0.44439364748025234, |
|
"eval_f1_micro": 0.768174770474382, |
|
"eval_loss": 0.6557245254516602, |
|
"eval_runtime": 535.3584, |
|
"eval_samples_per_second": 107.22, |
|
"eval_steps_per_second": 1.676, |
|
"learning_rate": 0.0001, |
|
"step": 201825 |
|
}, |
|
{ |
|
"epoch": 75.06503158677071, |
|
"grad_norm": 2.656343936920166, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7334, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 75.25083612040133, |
|
"grad_norm": 2.0540146827697754, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7186, |
|
"step": 202500 |
|
}, |
|
{ |
|
"epoch": 75.43664065403196, |
|
"grad_norm": 1.8651381731033325, |
|
"learning_rate": 0.0001, |
|
"loss": 0.723, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 75.62244518766258, |
|
"grad_norm": 1.9370218515396118, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7211, |
|
"step": 203500 |
|
}, |
|
{ |
|
"epoch": 75.8082497212932, |
|
"grad_norm": 1.9121958017349243, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7205, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 75.99405425492382, |
|
"grad_norm": 2.61309552192688, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7257, |
|
"step": 204500 |
|
}, |
|
{ |
|
"epoch": 76.0, |
|
"eval_accuracy": 0.7683489834671869, |
|
"eval_f1_macro": 0.4478607285233603, |
|
"eval_f1_micro": 0.7683489834671869, |
|
"eval_loss": 0.658838152885437, |
|
"eval_runtime": 499.8427, |
|
"eval_samples_per_second": 114.838, |
|
"eval_steps_per_second": 1.795, |
|
"learning_rate": 0.0001, |
|
"step": 204516 |
|
}, |
|
{ |
|
"epoch": 76.17985878855444, |
|
"grad_norm": 2.235535144805908, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7205, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 76.36566332218506, |
|
"grad_norm": 1.883389949798584, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7299, |
|
"step": 205500 |
|
}, |
|
{ |
|
"epoch": 76.55146785581569, |
|
"grad_norm": 2.6287364959716797, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7118, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 76.7372723894463, |
|
"grad_norm": 2.0496041774749756, |
|
"learning_rate": 0.0001, |
|
"loss": 0.72, |
|
"step": 206500 |
|
}, |
|
{ |
|
"epoch": 76.92307692307692, |
|
"grad_norm": 2.1624155044555664, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7252, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 77.0, |
|
"eval_accuracy": 0.7686277242556749, |
|
"eval_f1_macro": 0.44887316801028765, |
|
"eval_f1_micro": 0.7686277242556749, |
|
"eval_loss": 0.6572225093841553, |
|
"eval_runtime": 492.6745, |
|
"eval_samples_per_second": 116.509, |
|
"eval_steps_per_second": 1.821, |
|
"learning_rate": 0.0001, |
|
"step": 207207 |
|
}, |
|
{ |
|
"epoch": 77.10888145670755, |
|
"grad_norm": 2.232707977294922, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7144, |
|
"step": 207500 |
|
}, |
|
{ |
|
"epoch": 77.29468599033817, |
|
"grad_norm": 2.15569806098938, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7182, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 77.48049052396878, |
|
"grad_norm": 2.2068002223968506, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7128, |
|
"step": 208500 |
|
}, |
|
{ |
|
"epoch": 77.6662950575994, |
|
"grad_norm": 2.9470760822296143, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7247, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 77.85209959123003, |
|
"grad_norm": 2.5690219402313232, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7231, |
|
"step": 209500 |
|
}, |
|
{ |
|
"epoch": 78.0, |
|
"eval_accuracy": 0.768767094649919, |
|
"eval_f1_macro": 0.4439839848601264, |
|
"eval_f1_micro": 0.768767094649919, |
|
"eval_loss": 0.6562930941581726, |
|
"eval_runtime": 495.5045, |
|
"eval_samples_per_second": 115.844, |
|
"eval_steps_per_second": 1.81, |
|
"learning_rate": 0.0001, |
|
"step": 209898 |
|
}, |
|
{ |
|
"epoch": 78.03790412486065, |
|
"grad_norm": 1.879889726638794, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7258, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 78.22370865849126, |
|
"grad_norm": 1.8929120302200317, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7188, |
|
"step": 210500 |
|
}, |
|
{ |
|
"epoch": 78.40951319212189, |
|
"grad_norm": 2.2578604221343994, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7239, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 78.59531772575251, |
|
"grad_norm": 2.1204187870025635, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7255, |
|
"step": 211500 |
|
}, |
|
{ |
|
"epoch": 78.78112225938312, |
|
"grad_norm": 1.968096375465393, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7206, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 78.96692679301376, |
|
"grad_norm": 2.7695274353027344, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7207, |
|
"step": 212500 |
|
}, |
|
{ |
|
"epoch": 79.0, |
|
"eval_accuracy": 0.76810508527726, |
|
"eval_f1_macro": 0.4379298766193662, |
|
"eval_f1_micro": 0.76810508527726, |
|
"eval_loss": 0.6564787030220032, |
|
"eval_runtime": 498.2393, |
|
"eval_samples_per_second": 115.208, |
|
"eval_steps_per_second": 1.8, |
|
"learning_rate": 0.0001, |
|
"step": 212589 |
|
}, |
|
{ |
|
"epoch": 79.15273132664437, |
|
"grad_norm": 2.0273005962371826, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7177, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 79.33853586027499, |
|
"grad_norm": 2.40535306930542, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7288, |
|
"step": 213500 |
|
}, |
|
{ |
|
"epoch": 79.52434039390562, |
|
"grad_norm": 2.329434871673584, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7139, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 79.71014492753623, |
|
"grad_norm": 2.750331401824951, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7244, |
|
"step": 214500 |
|
}, |
|
{ |
|
"epoch": 79.89594946116685, |
|
"grad_norm": 2.482513427734375, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7179, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"eval_accuracy": 0.768383826065748, |
|
"eval_f1_macro": 0.4460529321244195, |
|
"eval_f1_micro": 0.768383826065748, |
|
"eval_loss": 0.661143958568573, |
|
"eval_runtime": 505.2043, |
|
"eval_samples_per_second": 113.619, |
|
"eval_steps_per_second": 1.776, |
|
"learning_rate": 0.0001, |
|
"step": 215280 |
|
}, |
|
{ |
|
"epoch": 80.08175399479747, |
|
"grad_norm": 2.6754283905029297, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7247, |
|
"step": 215500 |
|
}, |
|
{ |
|
"epoch": 80.2675585284281, |
|
"grad_norm": 3.016185760498047, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7194, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 80.45336306205871, |
|
"grad_norm": 2.425431489944458, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7197, |
|
"step": 216500 |
|
}, |
|
{ |
|
"epoch": 80.63916759568933, |
|
"grad_norm": 2.2331862449645996, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7217, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 80.82497212931996, |
|
"grad_norm": 2.3233511447906494, |
|
"learning_rate": 0.0001, |
|
"loss": 0.7275, |
|
"step": 217500 |
|
}, |
|
{ |
|
"epoch": 81.0, |
|
"eval_accuracy": 0.768941307642724, |
|
"eval_f1_macro": 0.44750591322776384, |
|
"eval_f1_micro": 0.768941307642724, |
|
"eval_loss": 0.660437285900116, |
|
"eval_runtime": 497.6996, |
|
"eval_samples_per_second": 115.333, |
|
"eval_steps_per_second": 1.802, |
|
"learning_rate": 0.0001, |
|
"step": 217971 |
|
}, |
|
{ |
|
"epoch": 81.01077666295058, |
|
"grad_norm": 2.5221219062805176, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7201, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 81.19658119658119, |
|
"grad_norm": 2.2925214767456055, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7095, |
|
"step": 218500 |
|
}, |
|
{ |
|
"epoch": 81.38238573021182, |
|
"grad_norm": 2.733447313308716, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7125, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 81.56819026384244, |
|
"grad_norm": 2.3250811100006104, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7193, |
|
"step": 219500 |
|
}, |
|
{ |
|
"epoch": 81.75399479747306, |
|
"grad_norm": 2.4225757122039795, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7081, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 81.93979933110369, |
|
"grad_norm": 2.1880016326904297, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7101, |
|
"step": 220500 |
|
}, |
|
{ |
|
"epoch": 82.0, |
|
"eval_accuracy": 0.7705614884758105, |
|
"eval_f1_macro": 0.4526720456188751, |
|
"eval_f1_micro": 0.7705614884758105, |
|
"eval_loss": 0.6531779766082764, |
|
"eval_runtime": 493.1117, |
|
"eval_samples_per_second": 116.406, |
|
"eval_steps_per_second": 1.819, |
|
"learning_rate": 1e-05, |
|
"step": 220662 |
|
}, |
|
{ |
|
"epoch": 82.1256038647343, |
|
"grad_norm": 2.038079261779785, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7052, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 82.31140839836492, |
|
"grad_norm": 2.8099365234375, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7054, |
|
"step": 221500 |
|
}, |
|
{ |
|
"epoch": 82.49721293199553, |
|
"grad_norm": 2.683091878890991, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7013, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 82.68301746562616, |
|
"grad_norm": 2.0887763500213623, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7024, |
|
"step": 222500 |
|
}, |
|
{ |
|
"epoch": 82.86882199925678, |
|
"grad_norm": 2.346670627593994, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7063, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 83.0, |
|
"eval_accuracy": 0.77019564119092, |
|
"eval_f1_macro": 0.4489367718812771, |
|
"eval_f1_micro": 0.77019564119092, |
|
"eval_loss": 0.6532895565032959, |
|
"eval_runtime": 492.0028, |
|
"eval_samples_per_second": 116.668, |
|
"eval_steps_per_second": 1.823, |
|
"learning_rate": 1e-05, |
|
"step": 223353 |
|
}, |
|
{ |
|
"epoch": 83.0546265328874, |
|
"grad_norm": 2.1046745777130127, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7073, |
|
"step": 223500 |
|
}, |
|
{ |
|
"epoch": 83.24043106651803, |
|
"grad_norm": 1.7634518146514893, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7081, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 83.42623560014864, |
|
"grad_norm": 2.451301097869873, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7151, |
|
"step": 224500 |
|
}, |
|
{ |
|
"epoch": 83.61204013377926, |
|
"grad_norm": 2.347801923751831, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6973, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 83.79784466740989, |
|
"grad_norm": 2.34899640083313, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7013, |
|
"step": 225500 |
|
}, |
|
{ |
|
"epoch": 83.9836492010405, |
|
"grad_norm": 2.269644260406494, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7067, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 84.0, |
|
"eval_accuracy": 0.7705266458772495, |
|
"eval_f1_macro": 0.45139096558153424, |
|
"eval_f1_micro": 0.7705266458772495, |
|
"eval_loss": 0.6505005359649658, |
|
"eval_runtime": 486.6669, |
|
"eval_samples_per_second": 117.947, |
|
"eval_steps_per_second": 1.843, |
|
"learning_rate": 1e-05, |
|
"step": 226044 |
|
}, |
|
{ |
|
"epoch": 84.16945373467112, |
|
"grad_norm": 2.6176042556762695, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6912, |
|
"step": 226500 |
|
}, |
|
{ |
|
"epoch": 84.35525826830175, |
|
"grad_norm": 2.330493211746216, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7045, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 84.54106280193237, |
|
"grad_norm": 2.351470470428467, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7049, |
|
"step": 227500 |
|
}, |
|
{ |
|
"epoch": 84.72686733556299, |
|
"grad_norm": 2.444443702697754, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7122, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 84.9126718691936, |
|
"grad_norm": 2.2119200229644775, |
|
"learning_rate": 1e-05, |
|
"loss": 0.707, |
|
"step": 228500 |
|
}, |
|
{ |
|
"epoch": 85.0, |
|
"eval_accuracy": 0.7708053866657375, |
|
"eval_f1_macro": 0.4565625671001629, |
|
"eval_f1_micro": 0.7708053866657375, |
|
"eval_loss": 0.6501905918121338, |
|
"eval_runtime": 485.2188, |
|
"eval_samples_per_second": 118.299, |
|
"eval_steps_per_second": 1.849, |
|
"learning_rate": 1e-05, |
|
"step": 228735 |
|
}, |
|
{ |
|
"epoch": 85.09847640282423, |
|
"grad_norm": 2.385796546936035, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6966, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 85.28428093645485, |
|
"grad_norm": 2.420588254928589, |
|
"learning_rate": 1e-05, |
|
"loss": 0.705, |
|
"step": 229500 |
|
}, |
|
{ |
|
"epoch": 85.47008547008546, |
|
"grad_norm": 2.1321732997894287, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6976, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 85.6558900037161, |
|
"grad_norm": 2.215148448944092, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6998, |
|
"step": 230500 |
|
}, |
|
{ |
|
"epoch": 85.84169453734671, |
|
"grad_norm": 2.643437385559082, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6944, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 86.0, |
|
"eval_accuracy": 0.7707182801693351, |
|
"eval_f1_macro": 0.4559124472677571, |
|
"eval_f1_micro": 0.7707182801693351, |
|
"eval_loss": 0.6507149338722229, |
|
"eval_runtime": 493.7179, |
|
"eval_samples_per_second": 116.263, |
|
"eval_steps_per_second": 1.817, |
|
"learning_rate": 1e-05, |
|
"step": 231426 |
|
}, |
|
{ |
|
"epoch": 86.02749907097733, |
|
"grad_norm": 2.8356385231018066, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6966, |
|
"step": 231500 |
|
}, |
|
{ |
|
"epoch": 86.21330360460796, |
|
"grad_norm": 2.272819757461548, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7011, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 86.39910813823857, |
|
"grad_norm": 2.5435869693756104, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7019, |
|
"step": 232500 |
|
}, |
|
{ |
|
"epoch": 86.58491267186919, |
|
"grad_norm": 2.345691204071045, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6964, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 86.77071720549982, |
|
"grad_norm": 1.9666670560836792, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6989, |
|
"step": 233500 |
|
}, |
|
{ |
|
"epoch": 86.95652173913044, |
|
"grad_norm": 2.101868152618408, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6958, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 87.0, |
|
"eval_accuracy": 0.770927335760701, |
|
"eval_f1_macro": 0.45838476700319497, |
|
"eval_f1_micro": 0.770927335760701, |
|
"eval_loss": 0.6484472751617432, |
|
"eval_runtime": 497.0614, |
|
"eval_samples_per_second": 115.481, |
|
"eval_steps_per_second": 1.805, |
|
"learning_rate": 1e-05, |
|
"step": 234117 |
|
}, |
|
{ |
|
"epoch": 87.14232627276105, |
|
"grad_norm": 3.2775235176086426, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6957, |
|
"step": 234500 |
|
}, |
|
{ |
|
"epoch": 87.32813080639167, |
|
"grad_norm": 2.4724538326263428, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6963, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 87.5139353400223, |
|
"grad_norm": 2.9358458518981934, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6987, |
|
"step": 235500 |
|
}, |
|
{ |
|
"epoch": 87.69973987365292, |
|
"grad_norm": 2.5557122230529785, |
|
"learning_rate": 1e-05, |
|
"loss": 0.708, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 87.88554440728353, |
|
"grad_norm": 2.3544118404388428, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6967, |
|
"step": 236500 |
|
}, |
|
{ |
|
"epoch": 88.0, |
|
"eval_accuracy": 0.77054406717653, |
|
"eval_f1_macro": 0.4569340367642959, |
|
"eval_f1_micro": 0.77054406717653, |
|
"eval_loss": 0.6496042013168335, |
|
"eval_runtime": 501.5965, |
|
"eval_samples_per_second": 114.437, |
|
"eval_steps_per_second": 1.788, |
|
"learning_rate": 1e-05, |
|
"step": 236808 |
|
}, |
|
{ |
|
"epoch": 88.07134894091416, |
|
"grad_norm": 2.06174898147583, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6987, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 88.25715347454478, |
|
"grad_norm": 1.922559380531311, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7057, |
|
"step": 237500 |
|
}, |
|
{ |
|
"epoch": 88.4429580081754, |
|
"grad_norm": 2.603978157043457, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7019, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 88.62876254180603, |
|
"grad_norm": 1.859614372253418, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6921, |
|
"step": 238500 |
|
}, |
|
{ |
|
"epoch": 88.81456707543664, |
|
"grad_norm": 2.208031177520752, |
|
"learning_rate": 1e-05, |
|
"loss": 0.698, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 89.0, |
|
"eval_accuracy": 0.7713977108412745, |
|
"eval_f1_macro": 0.45601319436503734, |
|
"eval_f1_micro": 0.7713977108412745, |
|
"eval_loss": 0.6486304402351379, |
|
"eval_runtime": 503.0751, |
|
"eval_samples_per_second": 114.1, |
|
"eval_steps_per_second": 1.783, |
|
"learning_rate": 1e-05, |
|
"step": 239499 |
|
}, |
|
{ |
|
"epoch": 89.00037160906726, |
|
"grad_norm": 2.6880383491516113, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6938, |
|
"step": 239500 |
|
}, |
|
{ |
|
"epoch": 89.18617614269789, |
|
"grad_norm": 2.5231385231018066, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7063, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 89.3719806763285, |
|
"grad_norm": 2.500443935394287, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6931, |
|
"step": 240500 |
|
}, |
|
{ |
|
"epoch": 89.55778520995912, |
|
"grad_norm": 2.0993025302886963, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7048, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 89.74358974358974, |
|
"grad_norm": 2.4644758701324463, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7037, |
|
"step": 241500 |
|
}, |
|
{ |
|
"epoch": 89.92939427722037, |
|
"grad_norm": 2.391343116760254, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6966, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 90.0, |
|
"eval_accuracy": 0.7711886552499085, |
|
"eval_f1_macro": 0.45742417795749246, |
|
"eval_f1_micro": 0.7711886552499085, |
|
"eval_loss": 0.6490767598152161, |
|
"eval_runtime": 499.9247, |
|
"eval_samples_per_second": 114.819, |
|
"eval_steps_per_second": 1.794, |
|
"learning_rate": 1e-05, |
|
"step": 242190 |
|
}, |
|
{ |
|
"epoch": 90.11519881085098, |
|
"grad_norm": 2.728985071182251, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6981, |
|
"step": 242500 |
|
}, |
|
{ |
|
"epoch": 90.3010033444816, |
|
"grad_norm": 2.9933202266693115, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6972, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 90.48680787811223, |
|
"grad_norm": 2.6327061653137207, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6901, |
|
"step": 243500 |
|
}, |
|
{ |
|
"epoch": 90.67261241174285, |
|
"grad_norm": 2.532104969024658, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6879, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 90.85841694537346, |
|
"grad_norm": 2.649080753326416, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7017, |
|
"step": 244500 |
|
}, |
|
{ |
|
"epoch": 91.0, |
|
"eval_accuracy": 0.7704221180815666, |
|
"eval_f1_macro": 0.45182909085509243, |
|
"eval_f1_micro": 0.7704221180815666, |
|
"eval_loss": 0.6481940746307373, |
|
"eval_runtime": 494.0364, |
|
"eval_samples_per_second": 116.188, |
|
"eval_steps_per_second": 1.816, |
|
"learning_rate": 1e-05, |
|
"step": 244881 |
|
}, |
|
{ |
|
"epoch": 91.0442214790041, |
|
"grad_norm": 2.7153379917144775, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6977, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 91.23002601263471, |
|
"grad_norm": 2.7882251739501953, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6979, |
|
"step": 245500 |
|
}, |
|
{ |
|
"epoch": 91.41583054626533, |
|
"grad_norm": 2.632366418838501, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6984, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 91.60163507989596, |
|
"grad_norm": 2.7723135948181152, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6972, |
|
"step": 246500 |
|
}, |
|
{ |
|
"epoch": 91.78743961352657, |
|
"grad_norm": 2.384683132171631, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6989, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 91.97324414715719, |
|
"grad_norm": 2.210071086883545, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7, |
|
"step": 247500 |
|
}, |
|
{ |
|
"epoch": 92.0, |
|
"eval_accuracy": 0.7715545025347991, |
|
"eval_f1_macro": 0.45503456574154166, |
|
"eval_f1_micro": 0.7715545025347991, |
|
"eval_loss": 0.6477252244949341, |
|
"eval_runtime": 495.0222, |
|
"eval_samples_per_second": 115.956, |
|
"eval_steps_per_second": 1.812, |
|
"learning_rate": 1e-05, |
|
"step": 247572 |
|
}, |
|
{ |
|
"epoch": 92.1590486807878, |
|
"grad_norm": 2.723433494567871, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6915, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 92.34485321441844, |
|
"grad_norm": 2.203484535217285, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6938, |
|
"step": 248500 |
|
}, |
|
{ |
|
"epoch": 92.53065774804905, |
|
"grad_norm": 2.4244284629821777, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6994, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 92.71646228167967, |
|
"grad_norm": 2.495445728302002, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7034, |
|
"step": 249500 |
|
}, |
|
{ |
|
"epoch": 92.9022668153103, |
|
"grad_norm": 2.1715991497039795, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 93.0, |
|
"eval_accuracy": 0.771206076549189, |
|
"eval_f1_macro": 0.4518083431267937, |
|
"eval_f1_micro": 0.771206076549189, |
|
"eval_loss": 0.6489835381507874, |
|
"eval_runtime": 487.2388, |
|
"eval_samples_per_second": 117.809, |
|
"eval_steps_per_second": 1.841, |
|
"learning_rate": 1e-05, |
|
"step": 250263 |
|
}, |
|
{ |
|
"epoch": 93.08807134894091, |
|
"grad_norm": 2.252865791320801, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6916, |
|
"step": 250500 |
|
}, |
|
{ |
|
"epoch": 93.27387588257153, |
|
"grad_norm": 2.484537363052368, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6953, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 93.45968041620216, |
|
"grad_norm": 1.7372355461120605, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6918, |
|
"step": 251500 |
|
}, |
|
{ |
|
"epoch": 93.64548494983278, |
|
"grad_norm": 2.283757448196411, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7009, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 93.8312894834634, |
|
"grad_norm": 2.009822130203247, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7049, |
|
"step": 252500 |
|
}, |
|
{ |
|
"epoch": 94.0, |
|
"eval_accuracy": 0.770753122767896, |
|
"eval_f1_macro": 0.45105746851856937, |
|
"eval_f1_micro": 0.770753122767896, |
|
"eval_loss": 0.6485304832458496, |
|
"eval_runtime": 500.9843, |
|
"eval_samples_per_second": 114.576, |
|
"eval_steps_per_second": 1.79, |
|
"learning_rate": 1e-05, |
|
"step": 252954 |
|
}, |
|
{ |
|
"epoch": 94.01709401709402, |
|
"grad_norm": 2.0452888011932373, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6917, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 94.20289855072464, |
|
"grad_norm": 2.5866355895996094, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7064, |
|
"step": 253500 |
|
}, |
|
{ |
|
"epoch": 94.38870308435526, |
|
"grad_norm": 2.3897805213928223, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6944, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 94.57450761798587, |
|
"grad_norm": 2.8837811946868896, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6925, |
|
"step": 254500 |
|
}, |
|
{ |
|
"epoch": 94.7603121516165, |
|
"grad_norm": 2.44002628326416, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6989, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 94.94611668524712, |
|
"grad_norm": 3.1637446880340576, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6949, |
|
"step": 255500 |
|
}, |
|
{ |
|
"epoch": 95.0, |
|
"eval_accuracy": 0.771659030330482, |
|
"eval_f1_macro": 0.45671492126995916, |
|
"eval_f1_micro": 0.771659030330482, |
|
"eval_loss": 0.647895336151123, |
|
"eval_runtime": 490.9837, |
|
"eval_samples_per_second": 116.91, |
|
"eval_steps_per_second": 1.827, |
|
"learning_rate": 1e-05, |
|
"step": 255645 |
|
}, |
|
{ |
|
"epoch": 95.13192121887774, |
|
"grad_norm": 2.9006567001342773, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6916, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 95.31772575250837, |
|
"grad_norm": 2.7343852519989014, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6968, |
|
"step": 256500 |
|
}, |
|
{ |
|
"epoch": 95.50353028613898, |
|
"grad_norm": 2.1861000061035156, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6921, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 95.6893348197696, |
|
"grad_norm": 2.3142426013946533, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7016, |
|
"step": 257500 |
|
}, |
|
{ |
|
"epoch": 95.87513935340023, |
|
"grad_norm": 3.253568649291992, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6998, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 96.0, |
|
"eval_accuracy": 0.7715022386369575, |
|
"eval_f1_macro": 0.4596959338122155, |
|
"eval_f1_micro": 0.7715022386369575, |
|
"eval_loss": 0.6472702622413635, |
|
"eval_runtime": 484.103, |
|
"eval_samples_per_second": 118.572, |
|
"eval_steps_per_second": 1.853, |
|
"learning_rate": 1e-05, |
|
"step": 258336 |
|
}, |
|
{ |
|
"epoch": 96.06094388703085, |
|
"grad_norm": 2.4987857341766357, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6887, |
|
"step": 258500 |
|
}, |
|
{ |
|
"epoch": 96.24674842066146, |
|
"grad_norm": 3.320939064025879, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6984, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 96.43255295429208, |
|
"grad_norm": 2.3504180908203125, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6984, |
|
"step": 259500 |
|
}, |
|
{ |
|
"epoch": 96.61835748792271, |
|
"grad_norm": 2.383805513381958, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6963, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 96.80416202155332, |
|
"grad_norm": 2.4699482917785645, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6847, |
|
"step": 260500 |
|
}, |
|
{ |
|
"epoch": 96.98996655518394, |
|
"grad_norm": 2.491687297821045, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6968, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 97.0, |
|
"eval_accuracy": 0.7713977108412745, |
|
"eval_f1_macro": 0.4625401473178366, |
|
"eval_f1_micro": 0.7713977108412745, |
|
"eval_loss": 0.6460831165313721, |
|
"eval_runtime": 491.1356, |
|
"eval_samples_per_second": 116.874, |
|
"eval_steps_per_second": 1.826, |
|
"learning_rate": 1e-05, |
|
"step": 261027 |
|
}, |
|
{ |
|
"epoch": 97.17577108881457, |
|
"grad_norm": 1.9844508171081543, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7004, |
|
"step": 261500 |
|
}, |
|
{ |
|
"epoch": 97.36157562244519, |
|
"grad_norm": 2.3233275413513184, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6867, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 97.5473801560758, |
|
"grad_norm": 2.318446397781372, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6992, |
|
"step": 262500 |
|
}, |
|
{ |
|
"epoch": 97.73318468970643, |
|
"grad_norm": 2.4279417991638184, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6933, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 97.91898922333705, |
|
"grad_norm": 2.6880953311920166, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7055, |
|
"step": 263500 |
|
}, |
|
{ |
|
"epoch": 98.0, |
|
"eval_accuracy": 0.7722165119074581, |
|
"eval_f1_macro": 0.45892155771298887, |
|
"eval_f1_micro": 0.7722165119074581, |
|
"eval_loss": 0.6463102698326111, |
|
"eval_runtime": 504.464, |
|
"eval_samples_per_second": 113.786, |
|
"eval_steps_per_second": 1.778, |
|
"learning_rate": 1e-05, |
|
"step": 263718 |
|
}, |
|
{ |
|
"epoch": 98.10479375696767, |
|
"grad_norm": 2.778778553009033, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6994, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 98.2905982905983, |
|
"grad_norm": 2.2124552726745605, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6842, |
|
"step": 264500 |
|
}, |
|
{ |
|
"epoch": 98.47640282422891, |
|
"grad_norm": 3.063041925430298, |
|
"learning_rate": 1e-05, |
|
"loss": 0.696, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 98.66220735785953, |
|
"grad_norm": 2.1842856407165527, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6972, |
|
"step": 265500 |
|
}, |
|
{ |
|
"epoch": 98.84801189149015, |
|
"grad_norm": 3.16538143157959, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6931, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 99.0, |
|
"eval_accuracy": 0.77089249316214, |
|
"eval_f1_macro": 0.4549241891767946, |
|
"eval_f1_micro": 0.77089249316214, |
|
"eval_loss": 0.646852433681488, |
|
"eval_runtime": 498.8103, |
|
"eval_samples_per_second": 115.076, |
|
"eval_steps_per_second": 1.798, |
|
"learning_rate": 1e-05, |
|
"step": 266409 |
|
}, |
|
{ |
|
"epoch": 99.03381642512078, |
|
"grad_norm": 2.8139493465423584, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6938, |
|
"step": 266500 |
|
}, |
|
{ |
|
"epoch": 99.21962095875139, |
|
"grad_norm": 2.5012643337249756, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6978, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 99.40542549238201, |
|
"grad_norm": 2.0635502338409424, |
|
"learning_rate": 1e-05, |
|
"loss": 0.702, |
|
"step": 267500 |
|
}, |
|
{ |
|
"epoch": 99.59123002601264, |
|
"grad_norm": 1.9686059951782227, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6867, |
|
"step": 268000 |
|
}, |
|
{ |
|
"epoch": 99.77703455964325, |
|
"grad_norm": 2.32633900642395, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6916, |
|
"step": 268500 |
|
}, |
|
{ |
|
"epoch": 99.96283909327387, |
|
"grad_norm": 2.2013280391693115, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6872, |
|
"step": 269000 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"eval_accuracy": 0.7723384610024215, |
|
"eval_f1_macro": 0.45970146016594643, |
|
"eval_f1_micro": 0.7723384610024215, |
|
"eval_loss": 0.6456441879272461, |
|
"eval_runtime": 523.4938, |
|
"eval_samples_per_second": 109.65, |
|
"eval_steps_per_second": 1.713, |
|
"learning_rate": 1e-05, |
|
"step": 269100 |
|
}, |
|
{ |
|
"epoch": 100.1486436269045, |
|
"grad_norm": 2.9276328086853027, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6936, |
|
"step": 269500 |
|
}, |
|
{ |
|
"epoch": 100.33444816053512, |
|
"grad_norm": 2.9882853031158447, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6916, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 100.52025269416573, |
|
"grad_norm": 2.50858998298645, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7004, |
|
"step": 270500 |
|
}, |
|
{ |
|
"epoch": 100.70605722779636, |
|
"grad_norm": 2.6219215393066406, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7033, |
|
"step": 271000 |
|
}, |
|
{ |
|
"epoch": 100.89186176142698, |
|
"grad_norm": 2.8792617321014404, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6822, |
|
"step": 271500 |
|
}, |
|
{ |
|
"epoch": 101.0, |
|
"eval_accuracy": 0.7717461368268845, |
|
"eval_f1_macro": 0.4573593202819609, |
|
"eval_f1_micro": 0.7717461368268845, |
|
"eval_loss": 0.6469387412071228, |
|
"eval_runtime": 517.8794, |
|
"eval_samples_per_second": 110.839, |
|
"eval_steps_per_second": 1.732, |
|
"learning_rate": 1e-05, |
|
"step": 271791 |
|
}, |
|
{ |
|
"epoch": 101.0776662950576, |
|
"grad_norm": 2.3658642768859863, |
|
"learning_rate": 1e-05, |
|
"loss": 0.707, |
|
"step": 272000 |
|
}, |
|
{ |
|
"epoch": 101.26347082868821, |
|
"grad_norm": 3.0172219276428223, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6982, |
|
"step": 272500 |
|
}, |
|
{ |
|
"epoch": 101.44927536231884, |
|
"grad_norm": 3.0194907188415527, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6899, |
|
"step": 273000 |
|
}, |
|
{ |
|
"epoch": 101.63507989594946, |
|
"grad_norm": 2.587480306625366, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6917, |
|
"step": 273500 |
|
}, |
|
{ |
|
"epoch": 101.82088442958008, |
|
"grad_norm": 2.457369089126587, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6875, |
|
"step": 274000 |
|
}, |
|
{ |
|
"epoch": 102.0, |
|
"eval_accuracy": 0.7717809794254455, |
|
"eval_f1_macro": 0.4593480600391769, |
|
"eval_f1_micro": 0.7717809794254455, |
|
"eval_loss": 0.646738588809967, |
|
"eval_runtime": 515.0733, |
|
"eval_samples_per_second": 111.442, |
|
"eval_steps_per_second": 1.741, |
|
"learning_rate": 1e-05, |
|
"step": 274482 |
|
}, |
|
{ |
|
"epoch": 102.0066889632107, |
|
"grad_norm": 3.1219208240509033, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6994, |
|
"step": 274500 |
|
}, |
|
{ |
|
"epoch": 102.19249349684132, |
|
"grad_norm": 2.158254623413086, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6944, |
|
"step": 275000 |
|
}, |
|
{ |
|
"epoch": 102.37829803047194, |
|
"grad_norm": 2.474193811416626, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6939, |
|
"step": 275500 |
|
}, |
|
{ |
|
"epoch": 102.56410256410257, |
|
"grad_norm": 2.223621129989624, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7028, |
|
"step": 276000 |
|
}, |
|
{ |
|
"epoch": 102.74990709773319, |
|
"grad_norm": 2.8731536865234375, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6972, |
|
"step": 276500 |
|
}, |
|
{ |
|
"epoch": 102.9357116313638, |
|
"grad_norm": 1.9848276376724243, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6983, |
|
"step": 277000 |
|
}, |
|
{ |
|
"epoch": 103.0, |
|
"eval_accuracy": 0.7723036184038605, |
|
"eval_f1_macro": 0.4576617106244536, |
|
"eval_f1_micro": 0.7723036184038605, |
|
"eval_loss": 0.6467755436897278, |
|
"eval_runtime": 512.6943, |
|
"eval_samples_per_second": 111.959, |
|
"eval_steps_per_second": 1.75, |
|
"learning_rate": 1e-05, |
|
"step": 277173 |
|
}, |
|
{ |
|
"epoch": 103.12151616499443, |
|
"grad_norm": 2.9539742469787598, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6854, |
|
"step": 277500 |
|
}, |
|
{ |
|
"epoch": 103.30732069862505, |
|
"grad_norm": 3.1808767318725586, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6903, |
|
"step": 278000 |
|
}, |
|
{ |
|
"epoch": 103.49312523225566, |
|
"grad_norm": 2.223482131958008, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6962, |
|
"step": 278500 |
|
}, |
|
{ |
|
"epoch": 103.67892976588628, |
|
"grad_norm": 2.4267632961273193, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6889, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 103.86473429951691, |
|
"grad_norm": 2.554532527923584, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6902, |
|
"step": 279500 |
|
}, |
|
{ |
|
"epoch": 104.0, |
|
"eval_accuracy": 0.7725475165937876, |
|
"eval_f1_macro": 0.4578893476938316, |
|
"eval_f1_micro": 0.7725475165937876, |
|
"eval_loss": 0.6456966400146484, |
|
"eval_runtime": 510.0416, |
|
"eval_samples_per_second": 112.542, |
|
"eval_steps_per_second": 1.759, |
|
"learning_rate": 1e-05, |
|
"step": 279864 |
|
}, |
|
{ |
|
"epoch": 104.05053883314753, |
|
"grad_norm": 2.5456807613372803, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6956, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 104.23634336677814, |
|
"grad_norm": 2.9953627586364746, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7025, |
|
"step": 280500 |
|
}, |
|
{ |
|
"epoch": 104.42214790040877, |
|
"grad_norm": 2.574535369873047, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6825, |
|
"step": 281000 |
|
}, |
|
{ |
|
"epoch": 104.60795243403939, |
|
"grad_norm": 3.287419557571411, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6953, |
|
"step": 281500 |
|
}, |
|
{ |
|
"epoch": 104.79375696767, |
|
"grad_norm": 2.3188724517822266, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6979, |
|
"step": 282000 |
|
}, |
|
{ |
|
"epoch": 104.97956150130064, |
|
"grad_norm": 2.4571897983551025, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6876, |
|
"step": 282500 |
|
}, |
|
{ |
|
"epoch": 105.0, |
|
"eval_accuracy": 0.7719377711189701, |
|
"eval_f1_macro": 0.45556962818046953, |
|
"eval_f1_micro": 0.7719377711189701, |
|
"eval_loss": 0.6455578804016113, |
|
"eval_runtime": 507.9867, |
|
"eval_samples_per_second": 112.997, |
|
"eval_steps_per_second": 1.766, |
|
"learning_rate": 1e-05, |
|
"step": 282555 |
|
}, |
|
{ |
|
"epoch": 105.16536603493125, |
|
"grad_norm": 2.7945613861083984, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6903, |
|
"step": 283000 |
|
}, |
|
{ |
|
"epoch": 105.35117056856187, |
|
"grad_norm": 2.2913684844970703, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7094, |
|
"step": 283500 |
|
}, |
|
{ |
|
"epoch": 105.5369751021925, |
|
"grad_norm": 2.745809555053711, |
|
"learning_rate": 1e-05, |
|
"loss": 0.685, |
|
"step": 284000 |
|
}, |
|
{ |
|
"epoch": 105.72277963582312, |
|
"grad_norm": 2.8689208030700684, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6873, |
|
"step": 284500 |
|
}, |
|
{ |
|
"epoch": 105.90858416945373, |
|
"grad_norm": 2.0228753089904785, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6849, |
|
"step": 285000 |
|
}, |
|
{ |
|
"epoch": 106.0, |
|
"eval_accuracy": 0.7723384610024215, |
|
"eval_f1_macro": 0.4644160307431161, |
|
"eval_f1_micro": 0.7723384610024215, |
|
"eval_loss": 0.6444206237792969, |
|
"eval_runtime": 515.3095, |
|
"eval_samples_per_second": 111.391, |
|
"eval_steps_per_second": 1.741, |
|
"learning_rate": 1e-05, |
|
"step": 285246 |
|
}, |
|
{ |
|
"epoch": 106.09438870308435, |
|
"grad_norm": 0.45027607679367065, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6656, |
|
"step": 285500 |
|
}, |
|
{ |
|
"epoch": 106.28019323671498, |
|
"grad_norm": 0.3893894553184509, |
|
"learning_rate": 1e-05, |
|
"loss": 0.5644, |
|
"step": 286000 |
|
}, |
|
{ |
|
"epoch": 106.4659977703456, |
|
"grad_norm": 0.3509347438812256, |
|
"learning_rate": 1e-05, |
|
"loss": 0.4686, |
|
"step": 286500 |
|
}, |
|
{ |
|
"epoch": 106.65180230397621, |
|
"grad_norm": 0.32040271162986755, |
|
"learning_rate": 1e-05, |
|
"loss": 0.3969, |
|
"step": 287000 |
|
}, |
|
{ |
|
"epoch": 106.83760683760684, |
|
"grad_norm": 0.4328668415546417, |
|
"learning_rate": 1e-05, |
|
"loss": 0.3411, |
|
"step": 287500 |
|
}, |
|
{ |
|
"epoch": 107.0, |
|
"eval_accuracy": 0.04797825821849794, |
|
"eval_f1_macro": 0.360721302464235, |
|
"eval_f1_micro": 0.4910938804941607, |
|
"eval_loss": 0.26552170515060425, |
|
"eval_runtime": 519.2635, |
|
"eval_samples_per_second": 110.543, |
|
"eval_steps_per_second": 1.727, |
|
"learning_rate": 1e-05, |
|
"step": 287937 |
|
}, |
|
{ |
|
"epoch": 107.02341137123746, |
|
"grad_norm": 0.26703259348869324, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2913, |
|
"step": 288000 |
|
}, |
|
{ |
|
"epoch": 107.20921590486807, |
|
"grad_norm": 0.2284982055425644, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2501, |
|
"step": 288500 |
|
}, |
|
{ |
|
"epoch": 107.3950204384987, |
|
"grad_norm": 1.4908051490783691, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2169, |
|
"step": 289000 |
|
}, |
|
{ |
|
"epoch": 107.58082497212932, |
|
"grad_norm": 0.18897071480751038, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1873, |
|
"step": 289500 |
|
}, |
|
{ |
|
"epoch": 107.76662950575994, |
|
"grad_norm": 0.17562171816825867, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1619, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 107.95243403939057, |
|
"grad_norm": 0.19028235971927643, |
|
"learning_rate": 1e-05, |
|
"loss": 0.141, |
|
"step": 290500 |
|
}, |
|
{ |
|
"epoch": 108.0, |
|
"eval_accuracy": 0.44983536872179924, |
|
"eval_f1_macro": 0.22462065038139475, |
|
"eval_f1_micro": 0.6693680656054029, |
|
"eval_loss": 0.1419014185667038, |
|
"eval_runtime": 516.6186, |
|
"eval_samples_per_second": 111.109, |
|
"eval_steps_per_second": 1.736, |
|
"learning_rate": 1e-05, |
|
"step": 290628 |
|
}, |
|
{ |
|
"epoch": 108.13823857302118, |
|
"grad_norm": 0.1382223218679428, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1234, |
|
"step": 291000 |
|
}, |
|
{ |
|
"epoch": 108.3240431066518, |
|
"grad_norm": 0.2637607753276825, |
|
"learning_rate": 1e-05, |
|
"loss": 0.109, |
|
"step": 291500 |
|
}, |
|
{ |
|
"epoch": 108.50984764028242, |
|
"grad_norm": 0.16210788488388062, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0977, |
|
"step": 292000 |
|
}, |
|
{ |
|
"epoch": 108.69565217391305, |
|
"grad_norm": 0.1515241414308548, |
|
"learning_rate": 1e-05, |
|
"loss": 0.088, |
|
"step": 292500 |
|
}, |
|
{ |
|
"epoch": 108.88145670754366, |
|
"grad_norm": 0.15725761651992798, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0809, |
|
"step": 293000 |
|
}, |
|
{ |
|
"epoch": 109.0, |
|
"eval_accuracy": 0.6714691381683245, |
|
"eval_f1_macro": 0.2136927959803109, |
|
"eval_f1_micro": 0.7449736568518617, |
|
"eval_loss": 0.07755623757839203, |
|
"eval_runtime": 503.5448, |
|
"eval_samples_per_second": 113.994, |
|
"eval_steps_per_second": 1.781, |
|
"learning_rate": 1e-05, |
|
"step": 293319 |
|
}, |
|
{ |
|
"epoch": 109.06726124117428, |
|
"grad_norm": 0.13999715447425842, |
|
"learning_rate": 1e-05, |
|
"loss": 0.076, |
|
"step": 293500 |
|
}, |
|
{ |
|
"epoch": 109.25306577480491, |
|
"grad_norm": 0.12521325051784515, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0714, |
|
"step": 294000 |
|
}, |
|
{ |
|
"epoch": 109.43887030843553, |
|
"grad_norm": 0.16435325145721436, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0688, |
|
"step": 294500 |
|
}, |
|
{ |
|
"epoch": 109.62467484206614, |
|
"grad_norm": 0.1280326545238495, |
|
"learning_rate": 1e-05, |
|
"loss": 0.066, |
|
"step": 295000 |
|
}, |
|
{ |
|
"epoch": 109.81047937569677, |
|
"grad_norm": 0.1201464980840683, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0643, |
|
"step": 295500 |
|
}, |
|
{ |
|
"epoch": 109.99628390932739, |
|
"grad_norm": 0.09229467064142227, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0621, |
|
"step": 296000 |
|
}, |
|
{ |
|
"epoch": 110.0, |
|
"eval_accuracy": 0.6837163115625163, |
|
"eval_f1_macro": 0.26414762709864326, |
|
"eval_f1_micro": 0.7489470111853911, |
|
"eval_loss": 0.05802077427506447, |
|
"eval_runtime": 515.3576, |
|
"eval_samples_per_second": 111.381, |
|
"eval_steps_per_second": 1.741, |
|
"learning_rate": 1e-05, |
|
"step": 296010 |
|
}, |
|
{ |
|
"epoch": 110.182088442958, |
|
"grad_norm": 0.12133761495351791, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0613, |
|
"step": 296500 |
|
}, |
|
{ |
|
"epoch": 110.36789297658864, |
|
"grad_norm": 0.11678178608417511, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0601, |
|
"step": 297000 |
|
}, |
|
{ |
|
"epoch": 110.55369751021925, |
|
"grad_norm": 0.11925112456083298, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0596, |
|
"step": 297500 |
|
}, |
|
{ |
|
"epoch": 110.73950204384987, |
|
"grad_norm": 0.16815921664237976, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0589, |
|
"step": 298000 |
|
}, |
|
{ |
|
"epoch": 110.92530657748048, |
|
"grad_norm": 0.11897558718919754, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0582, |
|
"step": 298500 |
|
}, |
|
{ |
|
"epoch": 111.0, |
|
"eval_accuracy": 0.6935245030574381, |
|
"eval_f1_macro": 0.331603552491686, |
|
"eval_f1_micro": 0.7547299175391458, |
|
"eval_loss": 0.053473543375730515, |
|
"eval_runtime": 517.7576, |
|
"eval_samples_per_second": 110.865, |
|
"eval_steps_per_second": 1.732, |
|
"learning_rate": 1e-05, |
|
"step": 298701 |
|
}, |
|
{ |
|
"epoch": 111.11111111111111, |
|
"grad_norm": 0.20368880033493042, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0588, |
|
"step": 299000 |
|
}, |
|
{ |
|
"epoch": 111.29691564474173, |
|
"grad_norm": 0.1355835497379303, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0577, |
|
"step": 299500 |
|
}, |
|
{ |
|
"epoch": 111.48272017837235, |
|
"grad_norm": 0.15652874112129211, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0569, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 111.66852471200298, |
|
"grad_norm": 0.1435033082962036, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0575, |
|
"step": 300500 |
|
}, |
|
{ |
|
"epoch": 111.8543292456336, |
|
"grad_norm": 0.13060764968395233, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0568, |
|
"step": 301000 |
|
}, |
|
{ |
|
"epoch": 112.0, |
|
"eval_accuracy": 0.6998135920976987, |
|
"eval_f1_macro": 0.3483537603081725, |
|
"eval_f1_micro": 0.7585280588776449, |
|
"eval_loss": 0.05167479068040848, |
|
"eval_runtime": 513.0638, |
|
"eval_samples_per_second": 111.879, |
|
"eval_steps_per_second": 1.748, |
|
"learning_rate": 1e-05, |
|
"step": 301392 |
|
}, |
|
{ |
|
"epoch": 112.04013377926421, |
|
"grad_norm": 0.12529785931110382, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0571, |
|
"step": 301500 |
|
}, |
|
{ |
|
"epoch": 112.22593831289484, |
|
"grad_norm": 0.16798335313796997, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0568, |
|
"step": 302000 |
|
}, |
|
{ |
|
"epoch": 112.41174284652546, |
|
"grad_norm": 0.1425975114107132, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0557, |
|
"step": 302500 |
|
}, |
|
{ |
|
"epoch": 112.59754738015607, |
|
"grad_norm": 0.14742553234100342, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0553, |
|
"step": 303000 |
|
}, |
|
{ |
|
"epoch": 112.7833519137867, |
|
"grad_norm": 0.1510126292705536, |
|
"learning_rate": 1e-05, |
|
"loss": 0.056, |
|
"step": 303500 |
|
}, |
|
{ |
|
"epoch": 112.96915644741732, |
|
"grad_norm": 0.14474214613437653, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0557, |
|
"step": 304000 |
|
}, |
|
{ |
|
"epoch": 113.0, |
|
"eval_accuracy": 0.7042734447135067, |
|
"eval_f1_macro": 0.3378893620669972, |
|
"eval_f1_micro": 0.7611001027447527, |
|
"eval_loss": 0.05106380954384804, |
|
"eval_runtime": 521.661, |
|
"eval_samples_per_second": 110.035, |
|
"eval_steps_per_second": 1.72, |
|
"learning_rate": 1e-05, |
|
"step": 304083 |
|
}, |
|
{ |
|
"epoch": 113.15496098104794, |
|
"grad_norm": 0.12822121381759644, |
|
"learning_rate": 1e-05, |
|
"loss": 0.056, |
|
"step": 304500 |
|
}, |
|
{ |
|
"epoch": 113.34076551467855, |
|
"grad_norm": 0.14637312293052673, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0556, |
|
"step": 305000 |
|
}, |
|
{ |
|
"epoch": 113.52657004830918, |
|
"grad_norm": 0.1259402185678482, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0554, |
|
"step": 305500 |
|
}, |
|
{ |
|
"epoch": 113.7123745819398, |
|
"grad_norm": 0.13158832490444183, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0556, |
|
"step": 306000 |
|
}, |
|
{ |
|
"epoch": 113.89817911557041, |
|
"grad_norm": 0.12858903408050537, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0552, |
|
"step": 306500 |
|
}, |
|
{ |
|
"epoch": 114.0, |
|
"eval_accuracy": 0.7053535652688978, |
|
"eval_f1_macro": 0.35703913789456687, |
|
"eval_f1_micro": 0.7622047244094489, |
|
"eval_loss": 0.05065497010946274, |
|
"eval_runtime": 534.7718, |
|
"eval_samples_per_second": 107.337, |
|
"eval_steps_per_second": 1.677, |
|
"learning_rate": 1e-05, |
|
"step": 306774 |
|
}, |
|
{ |
|
"epoch": 114.08398364920104, |
|
"grad_norm": 0.16015706956386566, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0548, |
|
"step": 307000 |
|
}, |
|
{ |
|
"epoch": 114.26978818283166, |
|
"grad_norm": 0.13509486615657806, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0551, |
|
"step": 307500 |
|
}, |
|
{ |
|
"epoch": 114.45559271646228, |
|
"grad_norm": 0.14582909643650055, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0547, |
|
"step": 308000 |
|
}, |
|
{ |
|
"epoch": 114.64139725009291, |
|
"grad_norm": 0.14752507209777832, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0552, |
|
"step": 308500 |
|
}, |
|
{ |
|
"epoch": 114.82720178372352, |
|
"grad_norm": 0.1321249157190323, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0555, |
|
"step": 309000 |
|
}, |
|
{ |
|
"epoch": 115.0, |
|
"eval_accuracy": 0.7117820247034023, |
|
"eval_f1_macro": 0.36428903036337407, |
|
"eval_f1_micro": 0.7647240545893983, |
|
"eval_loss": 0.05039990693330765, |
|
"eval_runtime": 609.7033, |
|
"eval_samples_per_second": 94.146, |
|
"eval_steps_per_second": 1.471, |
|
"learning_rate": 1e-05, |
|
"step": 309465 |
|
}, |
|
{ |
|
"epoch": 115.01300631735414, |
|
"grad_norm": 0.1596326380968094, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0546, |
|
"step": 309500 |
|
}, |
|
{ |
|
"epoch": 115.19881085098477, |
|
"grad_norm": 0.19358591735363007, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0547, |
|
"step": 310000 |
|
}, |
|
{ |
|
"epoch": 115.38461538461539, |
|
"grad_norm": 0.1181555911898613, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0551, |
|
"step": 310500 |
|
}, |
|
{ |
|
"epoch": 115.570419918246, |
|
"grad_norm": 0.16747964918613434, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0551, |
|
"step": 311000 |
|
}, |
|
{ |
|
"epoch": 115.75622445187662, |
|
"grad_norm": 0.1441546380519867, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0544, |
|
"step": 311500 |
|
}, |
|
{ |
|
"epoch": 115.94202898550725, |
|
"grad_norm": 0.1473866105079651, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0546, |
|
"step": 312000 |
|
}, |
|
{ |
|
"epoch": 116.0, |
|
"eval_accuracy": 0.7101966864688769, |
|
"eval_f1_macro": 0.3622993891059384, |
|
"eval_f1_micro": 0.7647289615591668, |
|
"eval_loss": 0.05015714839100838, |
|
"eval_runtime": 523.6323, |
|
"eval_samples_per_second": 109.621, |
|
"eval_steps_per_second": 1.713, |
|
"learning_rate": 1e-05, |
|
"step": 312156 |
|
}, |
|
{ |
|
"epoch": 116.12783351913787, |
|
"grad_norm": 0.18043090403079987, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0548, |
|
"step": 312500 |
|
}, |
|
{ |
|
"epoch": 116.31363805276848, |
|
"grad_norm": 0.1623302400112152, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0542, |
|
"step": 313000 |
|
}, |
|
{ |
|
"epoch": 116.49944258639911, |
|
"grad_norm": 0.1569686233997345, |
|
"learning_rate": 1e-05, |
|
"loss": 0.055, |
|
"step": 313500 |
|
}, |
|
{ |
|
"epoch": 116.68524712002973, |
|
"grad_norm": 0.17043337225914001, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0541, |
|
"step": 314000 |
|
}, |
|
{ |
|
"epoch": 116.87105165366034, |
|
"grad_norm": 0.15649768710136414, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0545, |
|
"step": 314500 |
|
}, |
|
{ |
|
"epoch": 117.0, |
|
"eval_accuracy": 0.712914409156635, |
|
"eval_f1_macro": 0.36544151863175506, |
|
"eval_f1_micro": 0.7657223847509677, |
|
"eval_loss": 0.050175271928310394, |
|
"eval_runtime": 598.1573, |
|
"eval_samples_per_second": 95.963, |
|
"eval_steps_per_second": 1.5, |
|
"learning_rate": 1e-05, |
|
"step": 314847 |
|
}, |
|
{ |
|
"epoch": 117.05685618729098, |
|
"grad_norm": 0.1493714600801468, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0549, |
|
"step": 315000 |
|
}, |
|
{ |
|
"epoch": 117.24266072092159, |
|
"grad_norm": 0.1313825249671936, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0547, |
|
"step": 315500 |
|
}, |
|
{ |
|
"epoch": 117.42846525455221, |
|
"grad_norm": 0.13993392884731293, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0541, |
|
"step": 316000 |
|
}, |
|
{ |
|
"epoch": 117.61426978818284, |
|
"grad_norm": 0.13758248090744019, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0546, |
|
"step": 316500 |
|
}, |
|
{ |
|
"epoch": 117.80007432181345, |
|
"grad_norm": 0.16916200518608093, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0545, |
|
"step": 317000 |
|
}, |
|
{ |
|
"epoch": 117.98587885544407, |
|
"grad_norm": 0.1452600359916687, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0535, |
|
"step": 317500 |
|
}, |
|
{ |
|
"epoch": 118.0, |
|
"eval_accuracy": 0.7141687427048309, |
|
"eval_f1_macro": 0.3524192831401073, |
|
"eval_f1_micro": 0.7654782537680462, |
|
"eval_loss": 0.050468478351831436, |
|
"eval_runtime": 549.4659, |
|
"eval_samples_per_second": 104.467, |
|
"eval_steps_per_second": 1.632, |
|
"learning_rate": 1e-05, |
|
"step": 317538 |
|
}, |
|
{ |
|
"epoch": 118.17168338907469, |
|
"grad_norm": 0.12589971721172333, |
|
"learning_rate": 1e-05, |
|
"loss": 0.054, |
|
"step": 318000 |
|
}, |
|
{ |
|
"epoch": 118.35748792270532, |
|
"grad_norm": 0.17545440793037415, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0539, |
|
"step": 318500 |
|
}, |
|
{ |
|
"epoch": 118.54329245633593, |
|
"grad_norm": 0.15301626920700073, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0541, |
|
"step": 319000 |
|
}, |
|
{ |
|
"epoch": 118.72909698996655, |
|
"grad_norm": 0.1384187638759613, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0545, |
|
"step": 319500 |
|
}, |
|
{ |
|
"epoch": 118.91490152359718, |
|
"grad_norm": 0.16759108006954193, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0539, |
|
"step": 320000 |
|
}, |
|
{ |
|
"epoch": 119.0, |
|
"eval_accuracy": 0.7127053535652689, |
|
"eval_f1_macro": 0.34416444697858145, |
|
"eval_f1_micro": 0.7658673932788375, |
|
"eval_loss": 0.049900032579898834, |
|
"eval_runtime": 491.987, |
|
"eval_samples_per_second": 116.672, |
|
"eval_steps_per_second": 1.823, |
|
"learning_rate": 1e-05, |
|
"step": 320229 |
|
}, |
|
{ |
|
"epoch": 119.1007060572278, |
|
"grad_norm": 0.15850204229354858, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0538, |
|
"step": 320500 |
|
}, |
|
{ |
|
"epoch": 119.28651059085841, |
|
"grad_norm": 0.18434040248394012, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0538, |
|
"step": 321000 |
|
}, |
|
{ |
|
"epoch": 119.47231512448904, |
|
"grad_norm": 0.15472249686717987, |
|
"learning_rate": 1e-05, |
|
"loss": 0.054, |
|
"step": 321500 |
|
}, |
|
{ |
|
"epoch": 119.65811965811966, |
|
"grad_norm": 0.1760583370923996, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0544, |
|
"step": 322000 |
|
}, |
|
{ |
|
"epoch": 119.84392419175028, |
|
"grad_norm": 0.1788097620010376, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0541, |
|
"step": 322500 |
|
}, |
|
{ |
|
"epoch": 120.0, |
|
"eval_accuracy": 0.7130886221494399, |
|
"eval_f1_macro": 0.35077501544817247, |
|
"eval_f1_micro": 0.7657258505633957, |
|
"eval_loss": 0.049903545528650284, |
|
"eval_runtime": 513.1716, |
|
"eval_samples_per_second": 111.855, |
|
"eval_steps_per_second": 1.748, |
|
"learning_rate": 1e-05, |
|
"step": 322920 |
|
}, |
|
{ |
|
"epoch": 120.0297287253809, |
|
"grad_norm": 0.1507652848958969, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0538, |
|
"step": 323000 |
|
}, |
|
{ |
|
"epoch": 120.21553325901152, |
|
"grad_norm": 0.1735050082206726, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0539, |
|
"step": 323500 |
|
}, |
|
{ |
|
"epoch": 120.40133779264214, |
|
"grad_norm": 0.14820708334445953, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0543, |
|
"step": 324000 |
|
}, |
|
{ |
|
"epoch": 120.58714232627275, |
|
"grad_norm": 0.14484427869319916, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0537, |
|
"step": 324500 |
|
}, |
|
{ |
|
"epoch": 120.77294685990339, |
|
"grad_norm": 0.15814656019210815, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0542, |
|
"step": 325000 |
|
}, |
|
{ |
|
"epoch": 120.958751393534, |
|
"grad_norm": 0.19824689626693726, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0539, |
|
"step": 325500 |
|
}, |
|
{ |
|
"epoch": 121.0, |
|
"eval_accuracy": 0.7140816362084285, |
|
"eval_f1_macro": 0.3627670615797559, |
|
"eval_f1_micro": 0.7665756914119359, |
|
"eval_loss": 0.04957958310842514, |
|
"eval_runtime": 509.187, |
|
"eval_samples_per_second": 112.731, |
|
"eval_steps_per_second": 1.762, |
|
"learning_rate": 1e-05, |
|
"step": 325611 |
|
}, |
|
{ |
|
"epoch": 121.14455592716462, |
|
"grad_norm": 0.16547606885433197, |
|
"learning_rate": 1e-05, |
|
"loss": 0.054, |
|
"step": 326000 |
|
}, |
|
{ |
|
"epoch": 121.33036046079525, |
|
"grad_norm": 0.2055787444114685, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0538, |
|
"step": 326500 |
|
}, |
|
{ |
|
"epoch": 121.51616499442586, |
|
"grad_norm": 0.2001011222600937, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0539, |
|
"step": 327000 |
|
}, |
|
{ |
|
"epoch": 121.70196952805648, |
|
"grad_norm": 0.17856118083000183, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0535, |
|
"step": 327500 |
|
}, |
|
{ |
|
"epoch": 121.88777406168711, |
|
"grad_norm": 0.14508357644081116, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0542, |
|
"step": 328000 |
|
}, |
|
{ |
|
"epoch": 122.0, |
|
"eval_accuracy": 0.7163812477134545, |
|
"eval_f1_macro": 0.35293584502475456, |
|
"eval_f1_micro": 0.7672263726699065, |
|
"eval_loss": 0.04973344877362251, |
|
"eval_runtime": 495.422, |
|
"eval_samples_per_second": 115.863, |
|
"eval_steps_per_second": 1.811, |
|
"learning_rate": 1e-05, |
|
"step": 328302 |
|
}, |
|
{ |
|
"epoch": 122.07357859531773, |
|
"grad_norm": 0.1540980488061905, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0543, |
|
"step": 328500 |
|
}, |
|
{ |
|
"epoch": 122.25938312894834, |
|
"grad_norm": 0.18639309704303741, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0542, |
|
"step": 329000 |
|
}, |
|
{ |
|
"epoch": 122.44518766257897, |
|
"grad_norm": 0.15830326080322266, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0532, |
|
"step": 329500 |
|
}, |
|
{ |
|
"epoch": 122.63099219620959, |
|
"grad_norm": 0.17811599373817444, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0537, |
|
"step": 330000 |
|
}, |
|
{ |
|
"epoch": 122.8167967298402, |
|
"grad_norm": 0.1285717487335205, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0536, |
|
"step": 330500 |
|
}, |
|
{ |
|
"epoch": 123.0, |
|
"eval_accuracy": 0.7153533910559049, |
|
"eval_f1_macro": 0.36741171960145996, |
|
"eval_f1_micro": 0.7661930650098223, |
|
"eval_loss": 0.04949206858873367, |
|
"eval_runtime": 496.6111, |
|
"eval_samples_per_second": 115.585, |
|
"eval_steps_per_second": 1.806, |
|
"learning_rate": 1e-05, |
|
"step": 330993 |
|
}, |
|
{ |
|
"epoch": 123.00260126347082, |
|
"grad_norm": 0.17617733776569366, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0537, |
|
"step": 331000 |
|
}, |
|
{ |
|
"epoch": 123.18840579710145, |
|
"grad_norm": 0.20897239446640015, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0539, |
|
"step": 331500 |
|
}, |
|
{ |
|
"epoch": 123.37421033073207, |
|
"grad_norm": 0.1522960364818573, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0536, |
|
"step": 332000 |
|
}, |
|
{ |
|
"epoch": 123.56001486436269, |
|
"grad_norm": 0.18495243787765503, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0542, |
|
"step": 332500 |
|
}, |
|
{ |
|
"epoch": 123.74581939799332, |
|
"grad_norm": 0.15397348999977112, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0541, |
|
"step": 333000 |
|
}, |
|
{ |
|
"epoch": 123.93162393162393, |
|
"grad_norm": 0.16448819637298584, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0539, |
|
"step": 333500 |
|
}, |
|
{ |
|
"epoch": 124.0, |
|
"eval_accuracy": 0.7160676643264055, |
|
"eval_f1_macro": 0.36413807211107735, |
|
"eval_f1_micro": 0.7673595994775795, |
|
"eval_loss": 0.049613192677497864, |
|
"eval_runtime": 491.9647, |
|
"eval_samples_per_second": 116.677, |
|
"eval_steps_per_second": 1.823, |
|
"learning_rate": 1e-05, |
|
"step": 333684 |
|
}, |
|
{ |
|
"epoch": 124.11742846525455, |
|
"grad_norm": 0.14923396706581116, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0532, |
|
"step": 334000 |
|
}, |
|
{ |
|
"epoch": 124.30323299888518, |
|
"grad_norm": 0.1714375615119934, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0534, |
|
"step": 334500 |
|
}, |
|
{ |
|
"epoch": 124.4890375325158, |
|
"grad_norm": 0.17104235291481018, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0542, |
|
"step": 335000 |
|
}, |
|
{ |
|
"epoch": 124.67484206614641, |
|
"grad_norm": 0.16875581443309784, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0542, |
|
"step": 335500 |
|
}, |
|
{ |
|
"epoch": 124.86064659977704, |
|
"grad_norm": 0.14356301724910736, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0535, |
|
"step": 336000 |
|
}, |
|
{ |
|
"epoch": 125.0, |
|
"eval_accuracy": 0.7124440340760614, |
|
"eval_f1_macro": 0.3509397162428964, |
|
"eval_f1_micro": 0.7658070643240676, |
|
"eval_loss": 0.04959910735487938, |
|
"eval_runtime": 487.4427, |
|
"eval_samples_per_second": 117.759, |
|
"eval_steps_per_second": 1.84, |
|
"learning_rate": 1e-05, |
|
"step": 336375 |
|
}, |
|
{ |
|
"epoch": 125.04645113340766, |
|
"grad_norm": 0.15746818482875824, |
|
"learning_rate": 1e-05, |
|
"loss": 0.053, |
|
"step": 336500 |
|
}, |
|
{ |
|
"epoch": 125.23225566703827, |
|
"grad_norm": 0.1541147232055664, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0534, |
|
"step": 337000 |
|
}, |
|
{ |
|
"epoch": 125.41806020066889, |
|
"grad_norm": 0.15555234253406525, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0534, |
|
"step": 337500 |
|
}, |
|
{ |
|
"epoch": 125.60386473429952, |
|
"grad_norm": 0.17335093021392822, |
|
"learning_rate": 1e-05, |
|
"loss": 0.054, |
|
"step": 338000 |
|
}, |
|
{ |
|
"epoch": 125.78966926793014, |
|
"grad_norm": 0.18515528738498688, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0538, |
|
"step": 338500 |
|
}, |
|
{ |
|
"epoch": 125.97547380156075, |
|
"grad_norm": 0.17530472576618195, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0536, |
|
"step": 339000 |
|
}, |
|
{ |
|
"epoch": 126.0, |
|
"eval_accuracy": 0.7152662845595025, |
|
"eval_f1_macro": 0.37424111866414195, |
|
"eval_f1_micro": 0.7660751240774316, |
|
"eval_loss": 0.0494619682431221, |
|
"eval_runtime": 502.6945, |
|
"eval_samples_per_second": 114.187, |
|
"eval_steps_per_second": 1.784, |
|
"learning_rate": 1e-05, |
|
"step": 339066 |
|
}, |
|
{ |
|
"epoch": 126.16127833519138, |
|
"grad_norm": 0.1517336517572403, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0534, |
|
"step": 339500 |
|
}, |
|
{ |
|
"epoch": 126.347082868822, |
|
"grad_norm": 0.1815425604581833, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0542, |
|
"step": 340000 |
|
}, |
|
{ |
|
"epoch": 126.53288740245262, |
|
"grad_norm": 0.17021115124225616, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0538, |
|
"step": 340500 |
|
}, |
|
{ |
|
"epoch": 126.71869193608325, |
|
"grad_norm": 0.19511722028255463, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0534, |
|
"step": 341000 |
|
}, |
|
{ |
|
"epoch": 126.90449646971386, |
|
"grad_norm": 0.22142985463142395, |
|
"learning_rate": 1e-05, |
|
"loss": 0.054, |
|
"step": 341500 |
|
}, |
|
{ |
|
"epoch": 127.0, |
|
"eval_accuracy": 0.7149178585738925, |
|
"eval_f1_macro": 0.35735030768195364, |
|
"eval_f1_micro": 0.766314294299216, |
|
"eval_loss": 0.049399666488170624, |
|
"eval_runtime": 486.665, |
|
"eval_samples_per_second": 117.948, |
|
"eval_steps_per_second": 1.843, |
|
"learning_rate": 1e-05, |
|
"step": 341757 |
|
}, |
|
{ |
|
"epoch": 127.09030100334448, |
|
"grad_norm": 0.1855282485485077, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0527, |
|
"step": 342000 |
|
}, |
|
{ |
|
"epoch": 127.27610553697511, |
|
"grad_norm": 0.14693668484687805, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0531, |
|
"step": 342500 |
|
}, |
|
{ |
|
"epoch": 127.46191007060573, |
|
"grad_norm": 0.19144034385681152, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0537, |
|
"step": 343000 |
|
}, |
|
{ |
|
"epoch": 127.64771460423634, |
|
"grad_norm": 0.16127558052539825, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0536, |
|
"step": 343500 |
|
}, |
|
{ |
|
"epoch": 127.83351913786696, |
|
"grad_norm": 0.20746751129627228, |
|
"learning_rate": 1e-05, |
|
"loss": 0.054, |
|
"step": 344000 |
|
}, |
|
{ |
|
"epoch": 128.0, |
|
"eval_accuracy": 0.714412640894758, |
|
"eval_f1_macro": 0.36010176970077795, |
|
"eval_f1_micro": 0.7664776721721585, |
|
"eval_loss": 0.04938925430178642, |
|
"eval_runtime": 486.3797, |
|
"eval_samples_per_second": 118.017, |
|
"eval_steps_per_second": 1.844, |
|
"learning_rate": 1e-05, |
|
"step": 344448 |
|
}, |
|
{ |
|
"epoch": 128.01932367149757, |
|
"grad_norm": 0.13175125420093536, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0534, |
|
"step": 344500 |
|
}, |
|
{ |
|
"epoch": 128.2051282051282, |
|
"grad_norm": 0.1861082762479782, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0534, |
|
"step": 345000 |
|
}, |
|
{ |
|
"epoch": 128.39093273875883, |
|
"grad_norm": 0.16761593520641327, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0533, |
|
"step": 345500 |
|
}, |
|
{ |
|
"epoch": 128.57673727238944, |
|
"grad_norm": 0.18102224171161652, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0536, |
|
"step": 346000 |
|
}, |
|
{ |
|
"epoch": 128.76254180602007, |
|
"grad_norm": 0.166373148560524, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0535, |
|
"step": 346500 |
|
}, |
|
{ |
|
"epoch": 128.9483463396507, |
|
"grad_norm": 0.2147304117679596, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0538, |
|
"step": 347000 |
|
}, |
|
{ |
|
"epoch": 129.0, |
|
"eval_accuracy": 0.717931743349419, |
|
"eval_f1_macro": 0.3641550142658243, |
|
"eval_f1_micro": 0.7674323253122921, |
|
"eval_loss": 0.049368634819984436, |
|
"eval_runtime": 497.4863, |
|
"eval_samples_per_second": 115.382, |
|
"eval_steps_per_second": 1.803, |
|
"learning_rate": 1e-05, |
|
"step": 347139 |
|
}, |
|
{ |
|
"epoch": 129.1341508732813, |
|
"grad_norm": 0.15825386345386505, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0536, |
|
"step": 347500 |
|
}, |
|
{ |
|
"epoch": 129.31995540691193, |
|
"grad_norm": 0.17780201137065887, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0532, |
|
"step": 348000 |
|
}, |
|
{ |
|
"epoch": 129.50575994054256, |
|
"grad_norm": 0.1677912026643753, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0535, |
|
"step": 348500 |
|
}, |
|
{ |
|
"epoch": 129.69156447417316, |
|
"grad_norm": 0.18808604776859283, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0534, |
|
"step": 349000 |
|
}, |
|
{ |
|
"epoch": 129.8773690078038, |
|
"grad_norm": 0.1602342426776886, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0538, |
|
"step": 349500 |
|
}, |
|
{ |
|
"epoch": 130.0, |
|
"eval_accuracy": 0.717687845159492, |
|
"eval_f1_macro": 0.3602711408206009, |
|
"eval_f1_micro": 0.7667705923765463, |
|
"eval_loss": 0.049409620463848114, |
|
"eval_runtime": 497.5454, |
|
"eval_samples_per_second": 115.368, |
|
"eval_steps_per_second": 1.803, |
|
"learning_rate": 1e-05, |
|
"step": 349830 |
|
}, |
|
{ |
|
"epoch": 130.06317354143442, |
|
"grad_norm": 0.23750899732112885, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0532, |
|
"step": 350000 |
|
}, |
|
{ |
|
"epoch": 130.24897807506503, |
|
"grad_norm": 0.16858229041099548, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0529, |
|
"step": 350500 |
|
}, |
|
{ |
|
"epoch": 130.43478260869566, |
|
"grad_norm": 0.19223752617835999, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0535, |
|
"step": 351000 |
|
}, |
|
{ |
|
"epoch": 130.6205871423263, |
|
"grad_norm": 0.1930113583803177, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0536, |
|
"step": 351500 |
|
}, |
|
{ |
|
"epoch": 130.8063916759569, |
|
"grad_norm": 0.20461726188659668, |
|
"learning_rate": 1e-05, |
|
"loss": 0.054, |
|
"step": 352000 |
|
}, |
|
{ |
|
"epoch": 130.99219620958752, |
|
"grad_norm": 0.20236244797706604, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0531, |
|
"step": 352500 |
|
}, |
|
{ |
|
"epoch": 131.0, |
|
"eval_accuracy": 0.718210484137907, |
|
"eval_f1_macro": 0.3664294602272974, |
|
"eval_f1_micro": 0.7665082507046622, |
|
"eval_loss": 0.04939533770084381, |
|
"eval_runtime": 493.8302, |
|
"eval_samples_per_second": 116.236, |
|
"eval_steps_per_second": 1.816, |
|
"learning_rate": 1e-05, |
|
"step": 352521 |
|
}, |
|
{ |
|
"epoch": 131.17800074321812, |
|
"grad_norm": 0.16268064081668854, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0536, |
|
"step": 353000 |
|
}, |
|
{ |
|
"epoch": 131.36380527684875, |
|
"grad_norm": 0.22975020110607147, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0534, |
|
"step": 353500 |
|
}, |
|
{ |
|
"epoch": 131.54960981047938, |
|
"grad_norm": 0.16796068847179413, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0533, |
|
"step": 354000 |
|
}, |
|
{ |
|
"epoch": 131.73541434410998, |
|
"grad_norm": 0.2104586660861969, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0533, |
|
"step": 354500 |
|
}, |
|
{ |
|
"epoch": 131.9212188777406, |
|
"grad_norm": 0.17215226590633392, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0528, |
|
"step": 355000 |
|
}, |
|
{ |
|
"epoch": 132.0, |
|
"eval_accuracy": 0.717583317363809, |
|
"eval_f1_macro": 0.3651176446191739, |
|
"eval_f1_micro": 0.7664564319910658, |
|
"eval_loss": 0.04943186417222023, |
|
"eval_runtime": 487.4839, |
|
"eval_samples_per_second": 117.75, |
|
"eval_steps_per_second": 1.84, |
|
"learning_rate": 1e-05, |
|
"step": 355212 |
|
}, |
|
{ |
|
"epoch": 132.10702341137124, |
|
"grad_norm": 0.1763976663351059, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0534, |
|
"step": 355500 |
|
}, |
|
{ |
|
"epoch": 132.29282794500185, |
|
"grad_norm": 0.19165627658367157, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0534, |
|
"step": 356000 |
|
}, |
|
{ |
|
"epoch": 132.47863247863248, |
|
"grad_norm": 0.18794529139995575, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0535, |
|
"step": 356500 |
|
}, |
|
{ |
|
"epoch": 132.6644370122631, |
|
"grad_norm": 0.19783887267112732, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0535, |
|
"step": 357000 |
|
}, |
|
{ |
|
"epoch": 132.8502415458937, |
|
"grad_norm": 0.14160636067390442, |
|
"learning_rate": 1e-05, |
|
"loss": 0.053, |
|
"step": 357500 |
|
}, |
|
{ |
|
"epoch": 133.0, |
|
"eval_accuracy": 0.714621696486124, |
|
"eval_f1_macro": 0.36115748131858555, |
|
"eval_f1_micro": 0.7658437005098911, |
|
"eval_loss": 0.049288176000118256, |
|
"eval_runtime": 479.1036, |
|
"eval_samples_per_second": 119.809, |
|
"eval_steps_per_second": 1.872, |
|
"learning_rate": 1e-05, |
|
"step": 357903 |
|
}, |
|
{ |
|
"epoch": 133.03604607952434, |
|
"grad_norm": 0.18090824782848358, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0533, |
|
"step": 358000 |
|
}, |
|
{ |
|
"epoch": 133.22185061315497, |
|
"grad_norm": 0.19353412091732025, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0536, |
|
"step": 358500 |
|
}, |
|
{ |
|
"epoch": 133.40765514678557, |
|
"grad_norm": 0.22670786082744598, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0535, |
|
"step": 359000 |
|
}, |
|
{ |
|
"epoch": 133.5934596804162, |
|
"grad_norm": 0.18295426666736603, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0533, |
|
"step": 359500 |
|
}, |
|
{ |
|
"epoch": 133.77926421404683, |
|
"grad_norm": 0.20159971714019775, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0532, |
|
"step": 360000 |
|
}, |
|
{ |
|
"epoch": 133.96506874767744, |
|
"grad_norm": 0.21708709001541138, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0534, |
|
"step": 360500 |
|
}, |
|
{ |
|
"epoch": 134.0, |
|
"eval_accuracy": 0.7154753401508684, |
|
"eval_f1_macro": 0.3677607284274943, |
|
"eval_f1_micro": 0.7659699195779215, |
|
"eval_loss": 0.04927274212241173, |
|
"eval_runtime": 492.3636, |
|
"eval_samples_per_second": 116.583, |
|
"eval_steps_per_second": 1.822, |
|
"learning_rate": 1e-05, |
|
"step": 360594 |
|
}, |
|
{ |
|
"epoch": 134.15087328130807, |
|
"grad_norm": 0.1716473251581192, |
|
"learning_rate": 1e-05, |
|
"loss": 0.053, |
|
"step": 361000 |
|
}, |
|
{ |
|
"epoch": 134.3366778149387, |
|
"grad_norm": 0.1569896936416626, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0534, |
|
"step": 361500 |
|
}, |
|
{ |
|
"epoch": 134.5224823485693, |
|
"grad_norm": 0.18859770894050598, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0534, |
|
"step": 362000 |
|
}, |
|
{ |
|
"epoch": 134.70828688219993, |
|
"grad_norm": 0.18753333389759064, |
|
"learning_rate": 1e-05, |
|
"loss": 0.053, |
|
"step": 362500 |
|
}, |
|
{ |
|
"epoch": 134.89409141583056, |
|
"grad_norm": 0.19746656715869904, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0528, |
|
"step": 363000 |
|
}, |
|
{ |
|
"epoch": 135.0, |
|
"eval_accuracy": 0.7189944426055295, |
|
"eval_f1_macro": 0.37226237632641596, |
|
"eval_f1_micro": 0.7674080308866179, |
|
"eval_loss": 0.04929700121283531, |
|
"eval_runtime": 476.5646, |
|
"eval_samples_per_second": 120.447, |
|
"eval_steps_per_second": 1.882, |
|
"learning_rate": 1e-05, |
|
"step": 363285 |
|
}, |
|
{ |
|
"epoch": 135.07989594946116, |
|
"grad_norm": 0.1843235194683075, |
|
"learning_rate": 1e-05, |
|
"loss": 0.053, |
|
"step": 363500 |
|
}, |
|
{ |
|
"epoch": 135.2657004830918, |
|
"grad_norm": 0.17511311173439026, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0532, |
|
"step": 364000 |
|
}, |
|
{ |
|
"epoch": 135.45150501672242, |
|
"grad_norm": 0.20711584389209747, |
|
"learning_rate": 1e-05, |
|
"loss": 0.053, |
|
"step": 364500 |
|
}, |
|
{ |
|
"epoch": 135.63730955035302, |
|
"grad_norm": 0.19971944391727448, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0529, |
|
"step": 365000 |
|
}, |
|
{ |
|
"epoch": 135.82311408398365, |
|
"grad_norm": 0.2045559585094452, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0533, |
|
"step": 365500 |
|
}, |
|
{ |
|
"epoch": 136.0, |
|
"eval_accuracy": 0.7151094928659779, |
|
"eval_f1_macro": 0.35969130867283144, |
|
"eval_f1_micro": 0.766711291239524, |
|
"eval_loss": 0.049187980592250824, |
|
"eval_runtime": 502.7189, |
|
"eval_samples_per_second": 114.181, |
|
"eval_steps_per_second": 1.784, |
|
"learning_rate": 1e-05, |
|
"step": 365976 |
|
}, |
|
{ |
|
"epoch": 136.00891861761426, |
|
"grad_norm": 0.17318807542324066, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0529, |
|
"step": 366000 |
|
}, |
|
{ |
|
"epoch": 136.1947231512449, |
|
"grad_norm": 0.20163071155548096, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0528, |
|
"step": 366500 |
|
}, |
|
{ |
|
"epoch": 136.38052768487552, |
|
"grad_norm": 0.17554792761802673, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0534, |
|
"step": 367000 |
|
}, |
|
{ |
|
"epoch": 136.56633221850612, |
|
"grad_norm": 0.20222993195056915, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0531, |
|
"step": 367500 |
|
}, |
|
{ |
|
"epoch": 136.75213675213675, |
|
"grad_norm": 0.2069316953420639, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0531, |
|
"step": 368000 |
|
}, |
|
{ |
|
"epoch": 136.93794128576738, |
|
"grad_norm": 0.17365820705890656, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0534, |
|
"step": 368500 |
|
}, |
|
{ |
|
"epoch": 137.0, |
|
"eval_accuracy": 0.7157889235379175, |
|
"eval_f1_macro": 0.3631043131303743, |
|
"eval_f1_micro": 0.7664713487937058, |
|
"eval_loss": 0.0491538941860199, |
|
"eval_runtime": 518.3587, |
|
"eval_samples_per_second": 110.736, |
|
"eval_steps_per_second": 1.73, |
|
"learning_rate": 1e-05, |
|
"step": 368667 |
|
}, |
|
{ |
|
"epoch": 137.12374581939798, |
|
"grad_norm": 0.19572311639785767, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0533, |
|
"step": 369000 |
|
}, |
|
{ |
|
"epoch": 137.3095503530286, |
|
"grad_norm": 0.1570323258638382, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0528, |
|
"step": 369500 |
|
}, |
|
{ |
|
"epoch": 137.49535488665924, |
|
"grad_norm": 0.16947729885578156, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0533, |
|
"step": 370000 |
|
}, |
|
{ |
|
"epoch": 137.68115942028984, |
|
"grad_norm": 0.16606110334396362, |
|
"learning_rate": 1e-05, |
|
"loss": 0.053, |
|
"step": 370500 |
|
}, |
|
{ |
|
"epoch": 137.86696395392048, |
|
"grad_norm": 0.2260875552892685, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0534, |
|
"step": 371000 |
|
}, |
|
{ |
|
"epoch": 138.0, |
|
"eval_accuracy": 0.7178446368530165, |
|
"eval_f1_macro": 0.3687935119842292, |
|
"eval_f1_micro": 0.7665450277813434, |
|
"eval_loss": 0.04930136725306511, |
|
"eval_runtime": 510.8639, |
|
"eval_samples_per_second": 112.361, |
|
"eval_steps_per_second": 1.756, |
|
"learning_rate": 1e-05, |
|
"step": 371358 |
|
}, |
|
{ |
|
"epoch": 138.0527684875511, |
|
"grad_norm": 0.163988396525383, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0532, |
|
"step": 371500 |
|
}, |
|
{ |
|
"epoch": 138.2385730211817, |
|
"grad_norm": 0.20549984276294708, |
|
"learning_rate": 1e-05, |
|
"loss": 0.053, |
|
"step": 372000 |
|
}, |
|
{ |
|
"epoch": 138.42437755481234, |
|
"grad_norm": 0.18827009201049805, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0534, |
|
"step": 372500 |
|
}, |
|
{ |
|
"epoch": 138.61018208844297, |
|
"grad_norm": 0.2782110571861267, |
|
"learning_rate": 1e-05, |
|
"loss": 0.053, |
|
"step": 373000 |
|
}, |
|
{ |
|
"epoch": 138.79598662207357, |
|
"grad_norm": 0.1959678679704666, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0538, |
|
"step": 373500 |
|
}, |
|
{ |
|
"epoch": 138.9817911557042, |
|
"grad_norm": 0.18641294538974762, |
|
"learning_rate": 1e-05, |
|
"loss": 0.053, |
|
"step": 374000 |
|
}, |
|
{ |
|
"epoch": 139.0, |
|
"eval_accuracy": 0.7182279054371875, |
|
"eval_f1_macro": 0.35626916649899915, |
|
"eval_f1_micro": 0.766155421092079, |
|
"eval_loss": 0.04927237331867218, |
|
"eval_runtime": 508.9115, |
|
"eval_samples_per_second": 112.792, |
|
"eval_steps_per_second": 1.763, |
|
"learning_rate": 1e-05, |
|
"step": 374049 |
|
}, |
|
{ |
|
"epoch": 139.16759568933483, |
|
"grad_norm": 0.16375960409641266, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0531, |
|
"step": 374500 |
|
}, |
|
{ |
|
"epoch": 139.35340022296543, |
|
"grad_norm": 0.19818973541259766, |
|
"learning_rate": 1e-05, |
|
"loss": 0.053, |
|
"step": 375000 |
|
}, |
|
{ |
|
"epoch": 139.53920475659606, |
|
"grad_norm": 0.19776101410388947, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0536, |
|
"step": 375500 |
|
}, |
|
{ |
|
"epoch": 139.7250092902267, |
|
"grad_norm": 0.24181506037712097, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0532, |
|
"step": 376000 |
|
}, |
|
{ |
|
"epoch": 139.9108138238573, |
|
"grad_norm": 0.21174757182598114, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0528, |
|
"step": 376500 |
|
}, |
|
{ |
|
"epoch": 140.0, |
|
"eval_accuracy": 0.7197958223724326, |
|
"eval_f1_macro": 0.3699733355385332, |
|
"eval_f1_micro": 0.767376184687937, |
|
"eval_loss": 0.04918988421559334, |
|
"eval_runtime": 505.3732, |
|
"eval_samples_per_second": 113.581, |
|
"eval_steps_per_second": 1.775, |
|
"learning_rate": 1e-05, |
|
"step": 376740 |
|
}, |
|
{ |
|
"epoch": 140.09661835748793, |
|
"grad_norm": 0.18043100833892822, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0531, |
|
"step": 377000 |
|
}, |
|
{ |
|
"epoch": 140.28242289111856, |
|
"grad_norm": 0.18150250613689423, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0531, |
|
"step": 377500 |
|
}, |
|
{ |
|
"epoch": 140.46822742474916, |
|
"grad_norm": 0.17685070633888245, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0523, |
|
"step": 378000 |
|
}, |
|
{ |
|
"epoch": 140.6540319583798, |
|
"grad_norm": 0.2478715479373932, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0532, |
|
"step": 378500 |
|
}, |
|
{ |
|
"epoch": 140.8398364920104, |
|
"grad_norm": 0.20271550118923187, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0529, |
|
"step": 379000 |
|
}, |
|
{ |
|
"epoch": 141.0, |
|
"eval_accuracy": 0.716468354209857, |
|
"eval_f1_macro": 0.35072596287625124, |
|
"eval_f1_micro": 0.7666041104041745, |
|
"eval_loss": 0.04920462518930435, |
|
"eval_runtime": 510.1227, |
|
"eval_samples_per_second": 112.524, |
|
"eval_steps_per_second": 1.758, |
|
"learning_rate": 1e-05, |
|
"step": 379431 |
|
}, |
|
{ |
|
"epoch": 141.02564102564102, |
|
"grad_norm": 0.25310391187667847, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0539, |
|
"step": 379500 |
|
}, |
|
{ |
|
"epoch": 141.21144555927165, |
|
"grad_norm": 0.2286742925643921, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0532, |
|
"step": 380000 |
|
}, |
|
{ |
|
"epoch": 141.39725009290225, |
|
"grad_norm": 0.2335425764322281, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0536, |
|
"step": 380500 |
|
}, |
|
{ |
|
"epoch": 141.58305462653288, |
|
"grad_norm": 0.21884822845458984, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0531, |
|
"step": 381000 |
|
}, |
|
{ |
|
"epoch": 141.76885916016352, |
|
"grad_norm": 0.23641781508922577, |
|
"learning_rate": 1e-05, |
|
"loss": 0.053, |
|
"step": 381500 |
|
}, |
|
{ |
|
"epoch": 141.95466369379412, |
|
"grad_norm": 0.19402460753917694, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0529, |
|
"step": 382000 |
|
}, |
|
{ |
|
"epoch": 142.0, |
|
"eval_accuracy": 0.7194473963868225, |
|
"eval_f1_macro": 0.36600085102264546, |
|
"eval_f1_micro": 0.7669340748803981, |
|
"eval_loss": 0.04919710010290146, |
|
"eval_runtime": 506.4483, |
|
"eval_samples_per_second": 113.34, |
|
"eval_steps_per_second": 1.771, |
|
"learning_rate": 1e-05, |
|
"step": 382122 |
|
}, |
|
{ |
|
"epoch": 142.14046822742475, |
|
"grad_norm": 0.21041558682918549, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0529, |
|
"step": 382500 |
|
}, |
|
{ |
|
"epoch": 142.32627276105538, |
|
"grad_norm": 0.21750488877296448, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0533, |
|
"step": 383000 |
|
}, |
|
{ |
|
"epoch": 142.51207729468598, |
|
"grad_norm": 0.2097017467021942, |
|
"learning_rate": 1e-05, |
|
"loss": 0.053, |
|
"step": 383500 |
|
}, |
|
{ |
|
"epoch": 142.6978818283166, |
|
"grad_norm": 0.2606968879699707, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0535, |
|
"step": 384000 |
|
}, |
|
{ |
|
"epoch": 142.88368636194724, |
|
"grad_norm": 0.18851463496685028, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0534, |
|
"step": 384500 |
|
}, |
|
{ |
|
"epoch": 143.0, |
|
"eval_accuracy": 0.7168342014947475, |
|
"eval_f1_macro": 0.3673237139632794, |
|
"eval_f1_micro": 0.765517685242224, |
|
"eval_loss": 0.04930509999394417, |
|
"eval_runtime": 513.7066, |
|
"eval_samples_per_second": 111.739, |
|
"eval_steps_per_second": 1.746, |
|
"learning_rate": 1e-05, |
|
"step": 384813 |
|
}, |
|
{ |
|
"epoch": 143.06949089557784, |
|
"grad_norm": 0.243364155292511, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0527, |
|
"step": 385000 |
|
}, |
|
{ |
|
"epoch": 143.25529542920847, |
|
"grad_norm": 0.2838144600391388, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.053, |
|
"step": 385500 |
|
}, |
|
{ |
|
"epoch": 143.4410999628391, |
|
"grad_norm": 0.24525409936904907, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0528, |
|
"step": 386000 |
|
}, |
|
{ |
|
"epoch": 143.6269044964697, |
|
"grad_norm": 0.2145887315273285, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0527, |
|
"step": 386500 |
|
}, |
|
{ |
|
"epoch": 143.81270903010034, |
|
"grad_norm": 0.16669905185699463, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0531, |
|
"step": 387000 |
|
}, |
|
{ |
|
"epoch": 143.99851356373097, |
|
"grad_norm": 0.23091119527816772, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0528, |
|
"step": 387500 |
|
}, |
|
{ |
|
"epoch": 144.0, |
|
"eval_accuracy": 0.7171477848817965, |
|
"eval_f1_macro": 0.3554021435508309, |
|
"eval_f1_micro": 0.7667940015206897, |
|
"eval_loss": 0.0490318201482296, |
|
"eval_runtime": 525.9642, |
|
"eval_samples_per_second": 109.135, |
|
"eval_steps_per_second": 1.705, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"step": 387504 |
|
}, |
|
{ |
|
"epoch": 144.18431809736157, |
|
"grad_norm": 0.2074396163225174, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0531, |
|
"step": 388000 |
|
}, |
|
{ |
|
"epoch": 144.3701226309922, |
|
"grad_norm": 0.2579312026500702, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0529, |
|
"step": 388500 |
|
}, |
|
{ |
|
"epoch": 144.55592716462283, |
|
"grad_norm": 0.1861879676580429, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0528, |
|
"step": 389000 |
|
}, |
|
{ |
|
"epoch": 144.74173169825343, |
|
"grad_norm": 0.21441423892974854, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0526, |
|
"step": 389500 |
|
}, |
|
{ |
|
"epoch": 144.92753623188406, |
|
"grad_norm": 0.23621511459350586, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0534, |
|
"step": 390000 |
|
}, |
|
{ |
|
"epoch": 145.0, |
|
"eval_accuracy": 0.7201616696573231, |
|
"eval_f1_macro": 0.3711029550898432, |
|
"eval_f1_micro": 0.7677822164123848, |
|
"eval_loss": 0.04918621480464935, |
|
"eval_runtime": 507.392, |
|
"eval_samples_per_second": 113.129, |
|
"eval_steps_per_second": 1.768, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"step": 390195 |
|
}, |
|
{ |
|
"epoch": 145.1133407655147, |
|
"grad_norm": 0.2494996339082718, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0533, |
|
"step": 390500 |
|
}, |
|
{ |
|
"epoch": 145.2991452991453, |
|
"grad_norm": 0.19221335649490356, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0526, |
|
"step": 391000 |
|
}, |
|
{ |
|
"epoch": 145.48494983277592, |
|
"grad_norm": 0.19597986340522766, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0533, |
|
"step": 391500 |
|
}, |
|
{ |
|
"epoch": 145.67075436640653, |
|
"grad_norm": 0.19304433465003967, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0526, |
|
"step": 392000 |
|
}, |
|
{ |
|
"epoch": 145.85655890003716, |
|
"grad_norm": 0.21061711013317108, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0529, |
|
"step": 392500 |
|
}, |
|
{ |
|
"epoch": 146.0, |
|
"eval_accuracy": 0.717130363582516, |
|
"eval_f1_macro": 0.368326447075977, |
|
"eval_f1_micro": 0.7665065530257804, |
|
"eval_loss": 0.04903709515929222, |
|
"eval_runtime": 590.7348, |
|
"eval_samples_per_second": 97.169, |
|
"eval_steps_per_second": 1.518, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"step": 392886 |
|
}, |
|
{ |
|
"epoch": 146.0423634336678, |
|
"grad_norm": 0.21325454115867615, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0529, |
|
"step": 393000 |
|
}, |
|
{ |
|
"epoch": 146.2281679672984, |
|
"grad_norm": 0.15669451653957367, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0525, |
|
"step": 393500 |
|
}, |
|
{ |
|
"epoch": 146.41397250092902, |
|
"grad_norm": 0.22324424982070923, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.053, |
|
"step": 394000 |
|
}, |
|
{ |
|
"epoch": 146.59977703455965, |
|
"grad_norm": 0.192140132188797, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0531, |
|
"step": 394500 |
|
}, |
|
{ |
|
"epoch": 146.78558156819025, |
|
"grad_norm": 0.20880526304244995, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0532, |
|
"step": 395000 |
|
}, |
|
{ |
|
"epoch": 146.97138610182088, |
|
"grad_norm": 0.21843858063220978, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0532, |
|
"step": 395500 |
|
}, |
|
{ |
|
"epoch": 147.0, |
|
"eval_accuracy": 0.720823679029982, |
|
"eval_f1_macro": 0.37476196073915324, |
|
"eval_f1_micro": 0.768544776459646, |
|
"eval_loss": 0.049094948917627335, |
|
"eval_runtime": 525.4912, |
|
"eval_samples_per_second": 109.233, |
|
"eval_steps_per_second": 1.707, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"step": 395577 |
|
}, |
|
{ |
|
"epoch": 147.1571906354515, |
|
"grad_norm": 0.17876408994197845, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0521, |
|
"step": 396000 |
|
}, |
|
{ |
|
"epoch": 147.34299516908212, |
|
"grad_norm": 0.25762057304382324, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0528, |
|
"step": 396500 |
|
}, |
|
{ |
|
"epoch": 147.52879970271275, |
|
"grad_norm": 0.20070230960845947, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.053, |
|
"step": 397000 |
|
}, |
|
{ |
|
"epoch": 147.71460423634338, |
|
"grad_norm": 0.20209959149360657, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0536, |
|
"step": 397500 |
|
}, |
|
{ |
|
"epoch": 147.90040876997398, |
|
"grad_norm": 0.22516289353370667, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.053, |
|
"step": 398000 |
|
}, |
|
{ |
|
"epoch": 148.0, |
|
"eval_accuracy": 0.7167296736990645, |
|
"eval_f1_macro": 0.3649988602534311, |
|
"eval_f1_micro": 0.7667018106807243, |
|
"eval_loss": 0.04907181113958359, |
|
"eval_runtime": 525.537, |
|
"eval_samples_per_second": 109.224, |
|
"eval_steps_per_second": 1.707, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"step": 398268 |
|
}, |
|
{ |
|
"epoch": 148.0862133036046, |
|
"grad_norm": 0.2316817343235016, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0532, |
|
"step": 398500 |
|
}, |
|
{ |
|
"epoch": 148.27201783723524, |
|
"grad_norm": 0.2039523720741272, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0525, |
|
"step": 399000 |
|
}, |
|
{ |
|
"epoch": 148.45782237086584, |
|
"grad_norm": 0.22887806594371796, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0533, |
|
"step": 399500 |
|
}, |
|
{ |
|
"epoch": 148.64362690449647, |
|
"grad_norm": 0.20459462702274323, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0527, |
|
"step": 400000 |
|
}, |
|
{ |
|
"epoch": 148.8294314381271, |
|
"grad_norm": 0.20509076118469238, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0526, |
|
"step": 400500 |
|
}, |
|
{ |
|
"epoch": 149.0, |
|
"eval_accuracy": 0.718210484137907, |
|
"eval_f1_macro": 0.3787860055208151, |
|
"eval_f1_micro": 0.7671139893046166, |
|
"eval_loss": 0.04904184117913246, |
|
"eval_runtime": 553.2009, |
|
"eval_samples_per_second": 103.762, |
|
"eval_steps_per_second": 1.621, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"step": 400959 |
|
}, |
|
{ |
|
"epoch": 149.0152359717577, |
|
"grad_norm": 0.2034793645143509, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0536, |
|
"step": 401000 |
|
}, |
|
{ |
|
"epoch": 149.20104050538833, |
|
"grad_norm": 0.21667377650737762, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0531, |
|
"step": 401500 |
|
}, |
|
{ |
|
"epoch": 149.38684503901897, |
|
"grad_norm": 0.23273786902427673, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0526, |
|
"step": 402000 |
|
}, |
|
{ |
|
"epoch": 149.57264957264957, |
|
"grad_norm": 0.2024523764848709, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0528, |
|
"step": 402500 |
|
}, |
|
{ |
|
"epoch": 149.7584541062802, |
|
"grad_norm": 0.19701418280601501, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0519, |
|
"step": 403000 |
|
}, |
|
{ |
|
"epoch": 149.94425863991083, |
|
"grad_norm": 0.20358909666538239, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0535, |
|
"step": 403500 |
|
}, |
|
{ |
|
"epoch": 150.0, |
|
"eval_accuracy": 0.7154404975523074, |
|
"eval_f1_macro": 0.3726446424747262, |
|
"eval_f1_micro": 0.7667920374277589, |
|
"eval_loss": 0.04912904277443886, |
|
"eval_runtime": 540.2131, |
|
"eval_samples_per_second": 106.256, |
|
"eval_steps_per_second": 1.66, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"step": 403650 |
|
}, |
|
{ |
|
"epoch": 150.0, |
|
"learning_rate": 1.0000000000000002e-07, |
|
"step": 403650, |
|
"total_flos": 2.3283011301956885e+21, |
|
"train_loss": 0.019644906779846472, |
|
"train_runtime": 108552.2715, |
|
"train_samples_per_second": 237.919, |
|
"train_steps_per_second": 3.718 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 403650, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 150, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 10, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.3283011301956885e+21, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|