{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 2619, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0572737686139748, "grad_norm": 1.6889350414276123, "learning_rate": 4.904543718976709e-05, "loss": 0.6367, "step": 50 }, { "epoch": 0.1145475372279496, "grad_norm": 1.8329870700836182, "learning_rate": 4.809087437953417e-05, "loss": 0.5759, "step": 100 }, { "epoch": 0.1718213058419244, "grad_norm": 1.3400354385375977, "learning_rate": 4.713631156930126e-05, "loss": 0.5711, "step": 150 }, { "epoch": 0.2290950744558992, "grad_norm": 1.22416090965271, "learning_rate": 4.618174875906835e-05, "loss": 0.5597, "step": 200 }, { "epoch": 0.286368843069874, "grad_norm": 1.6629807949066162, "learning_rate": 4.522718594883544e-05, "loss": 0.5442, "step": 250 }, { "epoch": 0.3436426116838488, "grad_norm": 1.5647610425949097, "learning_rate": 4.427262313860252e-05, "loss": 0.5287, "step": 300 }, { "epoch": 0.4009163802978236, "grad_norm": 1.5147887468338013, "learning_rate": 4.331806032836961e-05, "loss": 0.5362, "step": 350 }, { "epoch": 0.4581901489117984, "grad_norm": 1.2945597171783447, "learning_rate": 4.2363497518136695e-05, "loss": 0.5235, "step": 400 }, { "epoch": 0.5154639175257731, "grad_norm": 1.9351801872253418, "learning_rate": 4.140893470790378e-05, "loss": 0.5196, "step": 450 }, { "epoch": 0.572737686139748, "grad_norm": 1.9045623540878296, "learning_rate": 4.0454371897670865e-05, "loss": 0.5093, "step": 500 }, { "epoch": 0.6300114547537228, "grad_norm": 1.5518434047698975, "learning_rate": 3.949980908743795e-05, "loss": 0.5106, "step": 550 }, { "epoch": 0.6872852233676976, "grad_norm": 1.466840386390686, "learning_rate": 3.854524627720504e-05, "loss": 0.5121, "step": 600 }, { "epoch": 0.7445589919816724, "grad_norm": 1.359466791152954, "learning_rate": 3.759068346697213e-05, "loss": 0.507, "step": 650 }, { "epoch": 0.8018327605956472, "grad_norm": 1.9663983583450317, "learning_rate": 3.663612065673922e-05, "loss": 0.5028, "step": 700 }, { "epoch": 0.8591065292096219, "grad_norm": 1.6117963790893555, "learning_rate": 3.5681557846506306e-05, "loss": 0.4896, "step": 750 }, { "epoch": 0.9163802978235968, "grad_norm": 1.4263460636138916, "learning_rate": 3.4726995036273394e-05, "loss": 0.4917, "step": 800 }, { "epoch": 0.9736540664375716, "grad_norm": 1.079898715019226, "learning_rate": 3.3772432226040476e-05, "loss": 0.5124, "step": 850 }, { "epoch": 1.0, "eval_accuracy": 0.738814878479683, "eval_f1": 0.728374217896514, "eval_loss": 0.4868564307689667, "eval_runtime": 56.2275, "eval_samples_per_second": 372.469, "eval_steps_per_second": 3.895, "step": 873 }, { "epoch": 1.0309278350515463, "grad_norm": 1.4357503652572632, "learning_rate": 3.2817869415807564e-05, "loss": 0.4585, "step": 900 }, { "epoch": 1.088201603665521, "grad_norm": 2.1344144344329834, "learning_rate": 3.186330660557465e-05, "loss": 0.4482, "step": 950 }, { "epoch": 1.145475372279496, "grad_norm": 1.5237584114074707, "learning_rate": 3.0908743795341734e-05, "loss": 0.445, "step": 1000 }, { "epoch": 1.2027491408934707, "grad_norm": 2.1996545791625977, "learning_rate": 2.9954180985108822e-05, "loss": 0.4414, "step": 1050 }, { "epoch": 1.2600229095074456, "grad_norm": 1.8122806549072266, "learning_rate": 2.899961817487591e-05, "loss": 0.4379, "step": 1100 }, { "epoch": 1.3172966781214204, "grad_norm": 1.9378130435943604, "learning_rate": 2.8045055364643e-05, "loss": 0.4358, "step": 1150 }, { "epoch": 1.3745704467353952, "grad_norm": 2.2606778144836426, "learning_rate": 2.709049255441008e-05, "loss": 0.4321, "step": 1200 }, { "epoch": 1.43184421534937, "grad_norm": 1.9233603477478027, "learning_rate": 2.6135929744177168e-05, "loss": 0.4191, "step": 1250 }, { "epoch": 1.4891179839633448, "grad_norm": 2.874886989593506, "learning_rate": 2.5181366933944256e-05, "loss": 0.4399, "step": 1300 }, { "epoch": 1.5463917525773194, "grad_norm": 2.801513671875, "learning_rate": 2.422680412371134e-05, "loss": 0.4263, "step": 1350 }, { "epoch": 1.6036655211912945, "grad_norm": 2.149822950363159, "learning_rate": 2.3272241313478426e-05, "loss": 0.4294, "step": 1400 }, { "epoch": 1.660939289805269, "grad_norm": 1.6502012014389038, "learning_rate": 2.2317678503245514e-05, "loss": 0.4293, "step": 1450 }, { "epoch": 1.718213058419244, "grad_norm": 3.247627019882202, "learning_rate": 2.13631156930126e-05, "loss": 0.4325, "step": 1500 }, { "epoch": 1.7754868270332187, "grad_norm": 2.349097967147827, "learning_rate": 2.0408552882779688e-05, "loss": 0.4328, "step": 1550 }, { "epoch": 1.8327605956471937, "grad_norm": 2.392411708831787, "learning_rate": 1.9453990072546772e-05, "loss": 0.4273, "step": 1600 }, { "epoch": 1.8900343642611683, "grad_norm": 1.5783507823944092, "learning_rate": 1.849942726231386e-05, "loss": 0.4228, "step": 1650 }, { "epoch": 1.9473081328751431, "grad_norm": 2.832991600036621, "learning_rate": 1.754486445208095e-05, "loss": 0.4246, "step": 1700 }, { "epoch": 2.0, "eval_accuracy": 0.752041254834551, "eval_f1": 0.7557499647241428, "eval_loss": 0.47728216648101807, "eval_runtime": 55.7924, "eval_samples_per_second": 375.373, "eval_steps_per_second": 3.925, "step": 1746 }, { "epoch": 2.004581901489118, "grad_norm": 1.2103527784347534, "learning_rate": 1.6590301641848037e-05, "loss": 0.4277, "step": 1750 }, { "epoch": 2.0618556701030926, "grad_norm": 2.3143885135650635, "learning_rate": 1.5635738831615122e-05, "loss": 0.3644, "step": 1800 }, { "epoch": 2.1191294387170676, "grad_norm": 1.9333767890930176, "learning_rate": 1.4681176021382207e-05, "loss": 0.3809, "step": 1850 }, { "epoch": 2.176403207331042, "grad_norm": 1.8364256620407104, "learning_rate": 1.3726613211149295e-05, "loss": 0.3638, "step": 1900 }, { "epoch": 2.2336769759450172, "grad_norm": 2.7426836490631104, "learning_rate": 1.277205040091638e-05, "loss": 0.3711, "step": 1950 }, { "epoch": 2.290950744558992, "grad_norm": 2.5422801971435547, "learning_rate": 1.1817487590683468e-05, "loss": 0.3711, "step": 2000 }, { "epoch": 2.348224513172967, "grad_norm": 3.0044119358062744, "learning_rate": 1.0862924780450553e-05, "loss": 0.372, "step": 2050 }, { "epoch": 2.4054982817869415, "grad_norm": 3.1833741664886475, "learning_rate": 9.90836197021764e-06, "loss": 0.3645, "step": 2100 }, { "epoch": 2.4627720504009165, "grad_norm": 2.613365650177002, "learning_rate": 8.953799159984726e-06, "loss": 0.3673, "step": 2150 }, { "epoch": 2.520045819014891, "grad_norm": 2.555938959121704, "learning_rate": 7.999236349751815e-06, "loss": 0.3718, "step": 2200 }, { "epoch": 2.5773195876288657, "grad_norm": 2.8468923568725586, "learning_rate": 7.0446735395189e-06, "loss": 0.3649, "step": 2250 }, { "epoch": 2.6345933562428407, "grad_norm": 3.8473777770996094, "learning_rate": 6.090110729285988e-06, "loss": 0.3702, "step": 2300 }, { "epoch": 2.691867124856816, "grad_norm": 2.7918660640716553, "learning_rate": 5.135547919053074e-06, "loss": 0.3564, "step": 2350 }, { "epoch": 2.7491408934707904, "grad_norm": 2.7832207679748535, "learning_rate": 4.18098510882016e-06, "loss": 0.3647, "step": 2400 }, { "epoch": 2.806414662084765, "grad_norm": 3.194080352783203, "learning_rate": 3.226422298587247e-06, "loss": 0.3695, "step": 2450 }, { "epoch": 2.86368843069874, "grad_norm": 4.3022966384887695, "learning_rate": 2.271859488354334e-06, "loss": 0.358, "step": 2500 }, { "epoch": 2.9209621993127146, "grad_norm": 1.7924271821975708, "learning_rate": 1.3172966781214204e-06, "loss": 0.3721, "step": 2550 }, { "epoch": 2.9782359679266897, "grad_norm": 1.7951252460479736, "learning_rate": 3.627338678885071e-07, "loss": 0.3626, "step": 2600 }, { "epoch": 3.0, "eval_accuracy": 0.7545241846917825, "eval_f1": 0.7647893123484467, "eval_loss": 0.4975164234638214, "eval_runtime": 55.5446, "eval_samples_per_second": 377.048, "eval_steps_per_second": 3.943, "step": 2619 } ], "logging_steps": 50, "max_steps": 2619, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.3289189749651456e+16, "train_batch_size": 96, "trial_name": null, "trial_params": null }