{ "best_metric": 0.45181071758270264, "best_model_checkpoint": "website-classifier/checkpoint-1800", "epoch": 3.0, "eval_steps": 500, "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "grad_norm": 4.491672039031982, "learning_rate": 8.333333333333334e-06, "loss": 2.7552, "step": 30 }, { "epoch": 0.1, "grad_norm": 6.483436107635498, "learning_rate": 1.6666666666666667e-05, "loss": 2.7259, "step": 60 }, { "epoch": 0.15, "grad_norm": 8.029026985168457, "learning_rate": 2.4444444444444445e-05, "loss": 2.6183, "step": 90 }, { "epoch": 0.2, "grad_norm": 10.646072387695312, "learning_rate": 3.277777777777778e-05, "loss": 2.598, "step": 120 }, { "epoch": 0.25, "grad_norm": 10.159834861755371, "learning_rate": 4.111111111111111e-05, "loss": 2.1777, "step": 150 }, { "epoch": 0.3, "grad_norm": 14.836869239807129, "learning_rate": 4.9166666666666665e-05, "loss": 1.7938, "step": 180 }, { "epoch": 0.35, "grad_norm": 10.7587890625, "learning_rate": 4.9166666666666665e-05, "loss": 1.95, "step": 210 }, { "epoch": 0.4, "grad_norm": 31.001468658447266, "learning_rate": 4.8240740740740744e-05, "loss": 1.7602, "step": 240 }, { "epoch": 0.45, "grad_norm": 9.250581741333008, "learning_rate": 4.731481481481482e-05, "loss": 1.6735, "step": 270 }, { "epoch": 0.5, "grad_norm": 14.803194046020508, "learning_rate": 4.638888888888889e-05, "loss": 1.2047, "step": 300 }, { "epoch": 0.55, "grad_norm": 41.04203796386719, "learning_rate": 4.546296296296296e-05, "loss": 1.4807, "step": 330 }, { "epoch": 0.6, "grad_norm": 15.050979614257812, "learning_rate": 4.4537037037037036e-05, "loss": 1.3084, "step": 360 }, { "epoch": 0.65, "grad_norm": 13.59727668762207, "learning_rate": 4.3611111111111116e-05, "loss": 1.3297, "step": 390 }, { "epoch": 0.7, "grad_norm": 6.688912391662598, "learning_rate": 4.268518518518519e-05, "loss": 1.3379, "step": 420 }, { "epoch": 0.75, "grad_norm": 7.42189359664917, "learning_rate": 4.1820987654320994e-05, "loss": 1.4812, "step": 450 }, { "epoch": 0.8, "grad_norm": 6.354011058807373, "learning_rate": 4.089506172839506e-05, "loss": 1.5017, "step": 480 }, { "epoch": 0.85, "grad_norm": 25.94729995727539, "learning_rate": 3.996913580246914e-05, "loss": 1.1041, "step": 510 }, { "epoch": 0.9, "grad_norm": 24.99347686767578, "learning_rate": 3.904320987654321e-05, "loss": 1.109, "step": 540 }, { "epoch": 0.95, "grad_norm": 41.66750717163086, "learning_rate": 3.8117283950617286e-05, "loss": 1.073, "step": 570 }, { "epoch": 1.0, "grad_norm": 6.238738059997559, "learning_rate": 3.719135802469136e-05, "loss": 1.2524, "step": 600 }, { "epoch": 1.0, "eval_accuracy": 0.775, "eval_f1_macro": 0.7753958544292094, "eval_f1_micro": 0.775, "eval_f1_weighted": 0.7753958544292096, "eval_loss": 0.9330267310142517, "eval_precision_macro": 0.796061342783982, "eval_precision_micro": 0.775, "eval_precision_weighted": 0.7960613427839818, "eval_recall_macro": 0.775, "eval_recall_micro": 0.775, "eval_recall_weighted": 0.775, "eval_runtime": 24.168, "eval_samples_per_second": 49.652, "eval_steps_per_second": 3.103, "step": 600 }, { "epoch": 1.05, "grad_norm": 8.009809494018555, "learning_rate": 3.626543209876543e-05, "loss": 0.9211, "step": 630 }, { "epoch": 1.1, "grad_norm": 17.96474838256836, "learning_rate": 3.533950617283951e-05, "loss": 1.0106, "step": 660 }, { "epoch": 1.15, "grad_norm": 8.952888488769531, "learning_rate": 3.441358024691358e-05, "loss": 0.8015, "step": 690 }, { "epoch": 1.2, "grad_norm": 45.105072021484375, 
"learning_rate": 3.348765432098766e-05, "loss": 0.7559, "step": 720 }, { "epoch": 1.25, "grad_norm": 7.327918529510498, "learning_rate": 3.256172839506173e-05, "loss": 1.0116, "step": 750 }, { "epoch": 1.3, "grad_norm": 48.14884948730469, "learning_rate": 3.16358024691358e-05, "loss": 1.5269, "step": 780 }, { "epoch": 1.35, "grad_norm": 10.317117691040039, "learning_rate": 3.0709876543209876e-05, "loss": 0.9947, "step": 810 }, { "epoch": 1.4, "grad_norm": 39.09126281738281, "learning_rate": 2.9783950617283952e-05, "loss": 0.8826, "step": 840 }, { "epoch": 1.45, "grad_norm": 7.045171737670898, "learning_rate": 2.8858024691358025e-05, "loss": 0.8446, "step": 870 }, { "epoch": 1.5, "grad_norm": 10.068076133728027, "learning_rate": 2.79320987654321e-05, "loss": 0.7001, "step": 900 }, { "epoch": 1.55, "grad_norm": 15.41380500793457, "learning_rate": 2.700617283950617e-05, "loss": 0.8341, "step": 930 }, { "epoch": 1.6, "grad_norm": 9.957711219787598, "learning_rate": 2.6080246913580247e-05, "loss": 0.6945, "step": 960 }, { "epoch": 1.65, "grad_norm": 4.9840545654296875, "learning_rate": 2.5154320987654324e-05, "loss": 0.7595, "step": 990 }, { "epoch": 1.7, "grad_norm": 14.547338485717773, "learning_rate": 2.4228395061728396e-05, "loss": 0.4924, "step": 1020 }, { "epoch": 1.75, "grad_norm": 0.45237699151039124, "learning_rate": 2.3302469135802473e-05, "loss": 0.9098, "step": 1050 }, { "epoch": 1.8, "grad_norm": 17.84574317932129, "learning_rate": 2.2376543209876542e-05, "loss": 0.758, "step": 1080 }, { "epoch": 1.85, "grad_norm": 3.32464599609375, "learning_rate": 2.145061728395062e-05, "loss": 0.6431, "step": 1110 }, { "epoch": 1.9, "grad_norm": 6.051296234130859, "learning_rate": 2.052469135802469e-05, "loss": 0.8403, "step": 1140 }, { "epoch": 1.95, "grad_norm": 36.67551040649414, "learning_rate": 1.9598765432098768e-05, "loss": 0.8311, "step": 1170 }, { "epoch": 2.0, "grad_norm": 14.538824081420898, "learning_rate": 1.867283950617284e-05, "loss": 0.6665, "step": 1200 }, { "epoch": 2.0, "eval_accuracy": 0.855, "eval_f1_macro": 0.8538278779437969, "eval_f1_micro": 0.855, "eval_f1_weighted": 0.853827877943797, "eval_loss": 0.631514310836792, "eval_precision_macro": 0.8595148734247968, "eval_precision_micro": 0.855, "eval_precision_weighted": 0.8595148734247969, "eval_recall_macro": 0.8549999999999999, "eval_recall_micro": 0.855, "eval_recall_weighted": 0.855, "eval_runtime": 24.1701, "eval_samples_per_second": 49.648, "eval_steps_per_second": 3.103, "step": 1200 }, { "epoch": 2.05, "grad_norm": 4.500637531280518, "learning_rate": 1.7746913580246917e-05, "loss": 0.4377, "step": 1230 }, { "epoch": 2.1, "grad_norm": 0.43099209666252136, "learning_rate": 1.682098765432099e-05, "loss": 0.4461, "step": 1260 }, { "epoch": 2.15, "grad_norm": 15.149191856384277, "learning_rate": 1.5895061728395063e-05, "loss": 0.5035, "step": 1290 }, { "epoch": 2.2, "grad_norm": 20.928913116455078, "learning_rate": 1.4969135802469136e-05, "loss": 0.5992, "step": 1320 }, { "epoch": 2.25, "grad_norm": 12.046993255615234, "learning_rate": 1.4043209876543212e-05, "loss": 0.4523, "step": 1350 }, { "epoch": 2.3, "grad_norm": 0.5978269577026367, "learning_rate": 1.3117283950617285e-05, "loss": 0.483, "step": 1380 }, { "epoch": 2.35, "grad_norm": 7.10316801071167, "learning_rate": 1.219135802469136e-05, "loss": 0.4932, "step": 1410 }, { "epoch": 2.4, "grad_norm": 10.279694557189941, "learning_rate": 1.1265432098765432e-05, "loss": 0.3396, "step": 1440 }, { "epoch": 2.45, "grad_norm": 0.13266117870807648, "learning_rate": 
1.0339506172839507e-05, "loss": 0.4377, "step": 1470 }, { "epoch": 2.5, "grad_norm": 24.841278076171875, "learning_rate": 9.413580246913581e-06, "loss": 0.4452, "step": 1500 }, { "epoch": 2.55, "grad_norm": 1.116528034210205, "learning_rate": 8.487654320987654e-06, "loss": 0.4, "step": 1530 }, { "epoch": 2.6, "grad_norm": 38.56851577758789, "learning_rate": 7.561728395061729e-06, "loss": 0.3195, "step": 1560 }, { "epoch": 2.65, "grad_norm": 12.757609367370605, "learning_rate": 6.635802469135803e-06, "loss": 0.4735, "step": 1590 }, { "epoch": 2.7, "grad_norm": 0.18463416397571564, "learning_rate": 5.7098765432098764e-06, "loss": 0.3327, "step": 1620 }, { "epoch": 2.75, "grad_norm": 1.0354253053665161, "learning_rate": 4.78395061728395e-06, "loss": 0.2695, "step": 1650 }, { "epoch": 2.8, "grad_norm": 6.2124433517456055, "learning_rate": 3.858024691358025e-06, "loss": 0.4273, "step": 1680 }, { "epoch": 2.85, "grad_norm": 51.33966827392578, "learning_rate": 2.932098765432099e-06, "loss": 0.3228, "step": 1710 }, { "epoch": 2.9, "grad_norm": 0.17042000591754913, "learning_rate": 2.0061728395061727e-06, "loss": 0.45, "step": 1740 }, { "epoch": 2.95, "grad_norm": 12.142667770385742, "learning_rate": 1.0802469135802469e-06, "loss": 0.3076, "step": 1770 }, { "epoch": 3.0, "grad_norm": 0.21961359679698944, "learning_rate": 1.54320987654321e-07, "loss": 0.3804, "step": 1800 }, { "epoch": 3.0, "eval_accuracy": 0.9108333333333334, "eval_f1_macro": 0.9103583954158208, "eval_f1_micro": 0.9108333333333334, "eval_f1_weighted": 0.9103583954158205, "eval_loss": 0.45181071758270264, "eval_precision_macro": 0.9123282026272069, "eval_precision_micro": 0.9108333333333334, "eval_precision_weighted": 0.9123282026272069, "eval_recall_macro": 0.9108333333333334, "eval_recall_micro": 0.9108333333333334, "eval_recall_weighted": 0.9108333333333334, "eval_runtime": 25.1673, "eval_samples_per_second": 47.681, "eval_steps_per_second": 2.98, "step": 1800 } ], "logging_steps": 30, "max_steps": 1800, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 1.34204916842496e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }