{ "best_metric": 0.0418265163898468, "best_model_checkpoint": "./results/answerdotai/ModernBERT-base/trial-5/checkpoint-3012", "epoch": 2.0, "eval_steps": 500, "global_step": 3012, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.033200531208499334, "grad_norm": 6.311530113220215, "learning_rate": 1.279094112727349e-06, "loss": 0.7104, "step": 50 }, { "epoch": 0.06640106241699867, "grad_norm": 17.497058868408203, "learning_rate": 1.2748333062225943e-06, "loss": 0.5729, "step": 100 }, { "epoch": 0.099601593625498, "grad_norm": 7.590151309967041, "learning_rate": 1.2705724997178397e-06, "loss": 0.4714, "step": 150 }, { "epoch": 0.13280212483399734, "grad_norm": 6.96728515625, "learning_rate": 1.2663116932130851e-06, "loss": 0.3881, "step": 200 }, { "epoch": 0.16600265604249667, "grad_norm": 4.9838714599609375, "learning_rate": 1.2620508867083303e-06, "loss": 0.3194, "step": 250 }, { "epoch": 0.199203187250996, "grad_norm": 6.317371368408203, "learning_rate": 1.2577900802035758e-06, "loss": 0.2976, "step": 300 }, { "epoch": 0.23240371845949534, "grad_norm": 15.331583023071289, "learning_rate": 1.2535292736988212e-06, "loss": 0.2392, "step": 350 }, { "epoch": 0.2656042496679947, "grad_norm": 15.493165016174316, "learning_rate": 1.2492684671940664e-06, "loss": 0.2337, "step": 400 }, { "epoch": 0.29880478087649404, "grad_norm": 3.7081472873687744, "learning_rate": 1.2450076606893118e-06, "loss": 0.2037, "step": 450 }, { "epoch": 0.33200531208499334, "grad_norm": 4.029483318328857, "learning_rate": 1.240746854184557e-06, "loss": 0.2054, "step": 500 }, { "epoch": 0.3652058432934927, "grad_norm": 4.573270797729492, "learning_rate": 1.2364860476798024e-06, "loss": 0.1555, "step": 550 }, { "epoch": 0.398406374501992, "grad_norm": 15.748998641967773, "learning_rate": 1.2322252411750478e-06, "loss": 0.1486, "step": 600 }, { "epoch": 0.4316069057104914, "grad_norm": 12.240307807922363, "learning_rate": 1.227964434670293e-06, "loss": 0.1552, "step": 650 }, { "epoch": 0.4648074369189907, "grad_norm": 17.192546844482422, "learning_rate": 1.2237036281655385e-06, "loss": 0.1234, "step": 700 }, { "epoch": 0.49800796812749004, "grad_norm": 11.04953670501709, "learning_rate": 1.2194428216607839e-06, "loss": 0.1212, "step": 750 }, { "epoch": 0.5312084993359893, "grad_norm": 4.883615016937256, "learning_rate": 1.215182015156029e-06, "loss": 0.1059, "step": 800 }, { "epoch": 0.5644090305444888, "grad_norm": 4.633565425872803, "learning_rate": 1.2109212086512745e-06, "loss": 0.0788, "step": 850 }, { "epoch": 0.5976095617529881, "grad_norm": 2.6228833198547363, "learning_rate": 1.20666040214652e-06, "loss": 0.087, "step": 900 }, { "epoch": 0.6308100929614874, "grad_norm": 6.4782915115356445, "learning_rate": 1.2023995956417651e-06, "loss": 0.0802, "step": 950 }, { "epoch": 0.6640106241699867, "grad_norm": 5.229304313659668, "learning_rate": 1.1981387891370103e-06, "loss": 0.077, "step": 1000 }, { "epoch": 0.6972111553784861, "grad_norm": 6.034313201904297, "learning_rate": 1.1938779826322558e-06, "loss": 0.0703, "step": 1050 }, { "epoch": 0.7304116865869854, "grad_norm": 9.29736614227295, "learning_rate": 1.1896171761275012e-06, "loss": 0.066, "step": 1100 }, { "epoch": 0.7636122177954847, "grad_norm": 0.6172637343406677, "learning_rate": 1.1853563696227464e-06, "loss": 0.0692, "step": 1150 }, { "epoch": 0.796812749003984, "grad_norm": 1.642548680305481, "learning_rate": 1.1810955631179918e-06, "loss": 0.0437, "step": 1200 }, { "epoch": 0.8300132802124834, "grad_norm": 3.888737916946411, "learning_rate": 1.176834756613237e-06, "loss": 0.0474, "step": 1250 }, { "epoch": 0.8632138114209827, "grad_norm": 14.787779808044434, "learning_rate": 1.1725739501084824e-06, "loss": 0.0501, "step": 1300 }, { "epoch": 0.896414342629482, "grad_norm": 0.8571153283119202, "learning_rate": 1.1683131436037278e-06, "loss": 0.0439, "step": 1350 }, { "epoch": 0.9296148738379814, "grad_norm": 0.6915457248687744, "learning_rate": 1.164052337098973e-06, "loss": 0.0455, "step": 1400 }, { "epoch": 0.9628154050464808, "grad_norm": 8.8081636428833, "learning_rate": 1.1597915305942185e-06, "loss": 0.0347, "step": 1450 }, { "epoch": 0.9960159362549801, "grad_norm": 8.551522254943848, "learning_rate": 1.1555307240894639e-06, "loss": 0.0346, "step": 1500 }, { "epoch": 1.0, "eval_accuracy": 0.982824427480916, "eval_f1": 0.9838970307302017, "eval_loss": 0.05475565418601036, "eval_precision": 0.986134299459291, "eval_recall": 0.982824427480916, "eval_runtime": 31.8933, "eval_samples_per_second": 262.877, "eval_steps_per_second": 8.215, "step": 1506 }, { "epoch": 1.0292164674634794, "grad_norm": 13.078969955444336, "learning_rate": 1.151269917584709e-06, "loss": 0.0379, "step": 1550 }, { "epoch": 1.0624169986719787, "grad_norm": 1.906078815460205, "learning_rate": 1.1470091110799545e-06, "loss": 0.0338, "step": 1600 }, { "epoch": 1.095617529880478, "grad_norm": 0.4020080864429474, "learning_rate": 1.1427483045752e-06, "loss": 0.0298, "step": 1650 }, { "epoch": 1.1288180610889773, "grad_norm": 2.647258758544922, "learning_rate": 1.1384874980704451e-06, "loss": 0.023, "step": 1700 }, { "epoch": 1.1620185922974768, "grad_norm": 2.046747922897339, "learning_rate": 1.1342266915656906e-06, "loss": 0.0253, "step": 1750 }, { "epoch": 1.1952191235059761, "grad_norm": 13.14510726928711, "learning_rate": 1.129965885060936e-06, "loss": 0.0268, "step": 1800 }, { "epoch": 1.2284196547144755, "grad_norm": 0.12764006853103638, "learning_rate": 1.1257050785561812e-06, "loss": 0.0099, "step": 1850 }, { "epoch": 1.2616201859229748, "grad_norm": 1.6261545419692993, "learning_rate": 1.1214442720514266e-06, "loss": 0.0252, "step": 1900 }, { "epoch": 1.294820717131474, "grad_norm": 5.552518844604492, "learning_rate": 1.117183465546672e-06, "loss": 0.036, "step": 1950 }, { "epoch": 1.3280212483399734, "grad_norm": 24.064516067504883, "learning_rate": 1.1129226590419172e-06, "loss": 0.0169, "step": 2000 }, { "epoch": 1.361221779548473, "grad_norm": 0.00925782322883606, "learning_rate": 1.1086618525371626e-06, "loss": 0.0184, "step": 2050 }, { "epoch": 1.3944223107569722, "grad_norm": 16.54283905029297, "learning_rate": 1.1044010460324078e-06, "loss": 0.0139, "step": 2100 }, { "epoch": 1.4276228419654715, "grad_norm": 0.24406713247299194, "learning_rate": 1.1001402395276533e-06, "loss": 0.0126, "step": 2150 }, { "epoch": 1.4608233731739708, "grad_norm": 0.02731563337147236, "learning_rate": 1.0958794330228987e-06, "loss": 0.0198, "step": 2200 }, { "epoch": 1.4940239043824701, "grad_norm": 17.53055191040039, "learning_rate": 1.0916186265181439e-06, "loss": 0.0303, "step": 2250 }, { "epoch": 1.5272244355909694, "grad_norm": 0.07282107323408127, "learning_rate": 1.0873578200133893e-06, "loss": 0.0016, "step": 2300 }, { "epoch": 1.5604249667994687, "grad_norm": 20.794416427612305, "learning_rate": 1.0830970135086347e-06, "loss": 0.0225, "step": 2350 }, { "epoch": 1.593625498007968, "grad_norm": 0.052418053150177, "learning_rate": 1.07883620700388e-06, "loss": 0.0076, "step": 2400 }, { "epoch": 1.6268260292164674, "grad_norm": 0.21063362061977386, "learning_rate": 1.0745754004991254e-06, "loss": 0.0159, "step": 2450 }, { "epoch": 1.6600265604249667, "grad_norm": 10.455537796020508, "learning_rate": 1.0703145939943708e-06, "loss": 0.0105, "step": 2500 }, { "epoch": 1.6932270916334662, "grad_norm": 6.205326557159424, "learning_rate": 1.066053787489616e-06, "loss": 0.0081, "step": 2550 }, { "epoch": 1.7264276228419655, "grad_norm": 6.523694038391113, "learning_rate": 1.0617929809848614e-06, "loss": 0.0159, "step": 2600 }, { "epoch": 1.7596281540504648, "grad_norm": 0.010043232701718807, "learning_rate": 1.0575321744801068e-06, "loss": 0.0113, "step": 2650 }, { "epoch": 1.792828685258964, "grad_norm": 0.00458578672260046, "learning_rate": 1.053271367975352e-06, "loss": 0.0086, "step": 2700 }, { "epoch": 1.8260292164674636, "grad_norm": 0.10986531525850296, "learning_rate": 1.0490105614705974e-06, "loss": 0.008, "step": 2750 }, { "epoch": 1.859229747675963, "grad_norm": 0.12284637242555618, "learning_rate": 1.0447497549658429e-06, "loss": 0.0052, "step": 2800 }, { "epoch": 1.8924302788844622, "grad_norm": 0.14606119692325592, "learning_rate": 1.040488948461088e-06, "loss": 0.0176, "step": 2850 }, { "epoch": 1.9256308100929616, "grad_norm": 0.020491423085331917, "learning_rate": 1.0362281419563333e-06, "loss": 0.0102, "step": 2900 }, { "epoch": 1.9588313413014609, "grad_norm": 0.05764462426304817, "learning_rate": 1.0319673354515787e-06, "loss": 0.0044, "step": 2950 }, { "epoch": 1.9920318725099602, "grad_norm": 0.7329011559486389, "learning_rate": 1.027706528946824e-06, "loss": 0.0139, "step": 3000 }, { "epoch": 2.0, "eval_accuracy": 0.9924856870229007, "eval_f1": 0.9924235722235019, "eval_loss": 0.0418265163898468, "eval_precision": 0.9923830636545329, "eval_recall": 0.9924856870229007, "eval_runtime": 31.6222, "eval_samples_per_second": 265.131, "eval_steps_per_second": 8.285, "step": 3012 } ], "logging_steps": 50, "max_steps": 15060, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.282861088518144e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }