{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 804, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03731343283582089, "grad_norm": 5.129316662113785, "learning_rate": 5e-06, "loss": 1.0435, "step": 10 }, { "epoch": 0.07462686567164178, "grad_norm": 2.1317631972477087, "learning_rate": 5e-06, "loss": 0.9082, "step": 20 }, { "epoch": 0.11194029850746269, "grad_norm": 1.5020282787388823, "learning_rate": 5e-06, "loss": 0.8736, "step": 30 }, { "epoch": 0.14925373134328357, "grad_norm": 3.6821955914980964, "learning_rate": 5e-06, "loss": 0.8445, "step": 40 }, { "epoch": 0.1865671641791045, "grad_norm": 1.477965409345328, "learning_rate": 5e-06, "loss": 0.8328, "step": 50 }, { "epoch": 0.22388059701492538, "grad_norm": 1.4425543172677124, "learning_rate": 5e-06, "loss": 0.8133, "step": 60 }, { "epoch": 0.26119402985074625, "grad_norm": 1.4802380657963947, "learning_rate": 5e-06, "loss": 0.8034, "step": 70 }, { "epoch": 0.29850746268656714, "grad_norm": 1.0004956548053758, "learning_rate": 5e-06, "loss": 0.7885, "step": 80 }, { "epoch": 0.3358208955223881, "grad_norm": 1.4400220028195896, "learning_rate": 5e-06, "loss": 0.7874, "step": 90 }, { "epoch": 0.373134328358209, "grad_norm": 0.9071777971399051, "learning_rate": 5e-06, "loss": 0.7751, "step": 100 }, { "epoch": 0.41044776119402987, "grad_norm": 0.7749771898812693, "learning_rate": 5e-06, "loss": 0.7758, "step": 110 }, { "epoch": 0.44776119402985076, "grad_norm": 0.6682583813483205, "learning_rate": 5e-06, "loss": 0.7709, "step": 120 }, { "epoch": 0.48507462686567165, "grad_norm": 0.6955540536471363, "learning_rate": 5e-06, "loss": 0.7681, "step": 130 }, { "epoch": 0.5223880597014925, "grad_norm": 0.7759575467087485, "learning_rate": 5e-06, "loss": 0.771, "step": 140 }, { "epoch": 0.5597014925373134, "grad_norm": 0.7880732627454844, "learning_rate": 5e-06, "loss": 0.7651, "step": 150 }, { "epoch": 0.5970149253731343, "grad_norm": 0.7164079633513424, "learning_rate": 5e-06, "loss": 0.7583, "step": 160 }, { "epoch": 0.6343283582089553, "grad_norm": 1.0021390337711888, "learning_rate": 5e-06, "loss": 0.7596, "step": 170 }, { "epoch": 0.6716417910447762, "grad_norm": 0.7967524582691976, "learning_rate": 5e-06, "loss": 0.757, "step": 180 }, { "epoch": 0.7089552238805971, "grad_norm": 0.6753075665370271, "learning_rate": 5e-06, "loss": 0.7514, "step": 190 }, { "epoch": 0.746268656716418, "grad_norm": 0.6338912233238941, "learning_rate": 5e-06, "loss": 0.7537, "step": 200 }, { "epoch": 0.7835820895522388, "grad_norm": 0.6643080911717585, "learning_rate": 5e-06, "loss": 0.7475, "step": 210 }, { "epoch": 0.8208955223880597, "grad_norm": 1.1273298911098508, "learning_rate": 5e-06, "loss": 0.7484, "step": 220 }, { "epoch": 0.8582089552238806, "grad_norm": 0.6268883136610622, "learning_rate": 5e-06, "loss": 0.7413, "step": 230 }, { "epoch": 0.8955223880597015, "grad_norm": 0.7006621707262826, "learning_rate": 5e-06, "loss": 0.7515, "step": 240 }, { "epoch": 0.9328358208955224, "grad_norm": 0.6277470228472463, "learning_rate": 5e-06, "loss": 0.7434, "step": 250 }, { "epoch": 0.9701492537313433, "grad_norm": 0.5840149658488234, "learning_rate": 5e-06, "loss": 0.7394, "step": 260 }, { "epoch": 1.0, "eval_loss": 0.7421078681945801, "eval_runtime": 26.819, "eval_samples_per_second": 268.541, "eval_steps_per_second": 1.081, "step": 268 }, { "epoch": 1.007462686567164, "grad_norm": 0.9665430005922097, "learning_rate": 5e-06, "loss": 0.7328, "step": 270 }, { "epoch": 1.044776119402985, "grad_norm": 0.9256892941459078, "learning_rate": 5e-06, "loss": 0.6907, "step": 280 }, { "epoch": 1.0820895522388059, "grad_norm": 0.967723083049159, "learning_rate": 5e-06, "loss": 0.6908, "step": 290 }, { "epoch": 1.1194029850746268, "grad_norm": 0.8029316363459901, "learning_rate": 5e-06, "loss": 0.6888, "step": 300 }, { "epoch": 1.1567164179104479, "grad_norm": 0.7290665987177927, "learning_rate": 5e-06, "loss": 0.6911, "step": 310 }, { "epoch": 1.1940298507462686, "grad_norm": 0.6436493293281464, "learning_rate": 5e-06, "loss": 0.6941, "step": 320 }, { "epoch": 1.2313432835820897, "grad_norm": 0.8372380897778048, "learning_rate": 5e-06, "loss": 0.6905, "step": 330 }, { "epoch": 1.2686567164179103, "grad_norm": 0.6466378822784399, "learning_rate": 5e-06, "loss": 0.6897, "step": 340 }, { "epoch": 1.3059701492537314, "grad_norm": 0.7069833954506336, "learning_rate": 5e-06, "loss": 0.6956, "step": 350 }, { "epoch": 1.3432835820895521, "grad_norm": 0.6662497449824378, "learning_rate": 5e-06, "loss": 0.6925, "step": 360 }, { "epoch": 1.3805970149253732, "grad_norm": 0.6645762813254901, "learning_rate": 5e-06, "loss": 0.691, "step": 370 }, { "epoch": 1.417910447761194, "grad_norm": 0.7214541836649476, "learning_rate": 5e-06, "loss": 0.6871, "step": 380 }, { "epoch": 1.455223880597015, "grad_norm": 0.6615102951118991, "learning_rate": 5e-06, "loss": 0.6917, "step": 390 }, { "epoch": 1.4925373134328357, "grad_norm": 0.7904981868828695, "learning_rate": 5e-06, "loss": 0.691, "step": 400 }, { "epoch": 1.5298507462686568, "grad_norm": 0.5496929542007543, "learning_rate": 5e-06, "loss": 0.6908, "step": 410 }, { "epoch": 1.5671641791044775, "grad_norm": 0.8674806763284545, "learning_rate": 5e-06, "loss": 0.6866, "step": 420 }, { "epoch": 1.6044776119402986, "grad_norm": 0.600884315991915, "learning_rate": 5e-06, "loss": 0.6876, "step": 430 }, { "epoch": 1.6417910447761193, "grad_norm": 0.5777569966365784, "learning_rate": 5e-06, "loss": 0.6847, "step": 440 }, { "epoch": 1.6791044776119404, "grad_norm": 0.6211438139603345, "learning_rate": 5e-06, "loss": 0.6896, "step": 450 }, { "epoch": 1.716417910447761, "grad_norm": 0.727953569263571, "learning_rate": 5e-06, "loss": 0.688, "step": 460 }, { "epoch": 1.7537313432835822, "grad_norm": 0.622502417555673, "learning_rate": 5e-06, "loss": 0.686, "step": 470 }, { "epoch": 1.7910447761194028, "grad_norm": 0.6153773392613141, "learning_rate": 5e-06, "loss": 0.6854, "step": 480 }, { "epoch": 1.828358208955224, "grad_norm": 0.6095342231767215, "learning_rate": 5e-06, "loss": 0.6863, "step": 490 }, { "epoch": 1.8656716417910446, "grad_norm": 0.5652844139112992, "learning_rate": 5e-06, "loss": 0.6895, "step": 500 }, { "epoch": 1.9029850746268657, "grad_norm": 0.6616935338217843, "learning_rate": 5e-06, "loss": 0.6914, "step": 510 }, { "epoch": 1.9402985074626866, "grad_norm": 0.7942726961974429, "learning_rate": 5e-06, "loss": 0.6899, "step": 520 }, { "epoch": 1.9776119402985075, "grad_norm": 0.5529814711973833, "learning_rate": 5e-06, "loss": 0.6862, "step": 530 }, { "epoch": 2.0, "eval_loss": 0.7329658269882202, "eval_runtime": 26.4426, "eval_samples_per_second": 272.363, "eval_steps_per_second": 1.097, "step": 536 }, { "epoch": 2.014925373134328, "grad_norm": 0.9977731986027086, "learning_rate": 5e-06, "loss": 0.6582, "step": 540 }, { "epoch": 2.0522388059701493, "grad_norm": 0.8166779288921567, "learning_rate": 5e-06, "loss": 0.6336, "step": 550 }, { "epoch": 2.08955223880597, "grad_norm": 0.644735248584532, "learning_rate": 5e-06, "loss": 0.6298, "step": 560 }, { "epoch": 2.126865671641791, "grad_norm": 0.9113995264610321, "learning_rate": 5e-06, "loss": 0.6323, "step": 570 }, { "epoch": 2.1641791044776117, "grad_norm": 0.6613701694950963, "learning_rate": 5e-06, "loss": 0.6354, "step": 580 }, { "epoch": 2.201492537313433, "grad_norm": 0.7980599593461063, "learning_rate": 5e-06, "loss": 0.6358, "step": 590 }, { "epoch": 2.2388059701492535, "grad_norm": 0.6871277711804801, "learning_rate": 5e-06, "loss": 0.6372, "step": 600 }, { "epoch": 2.2761194029850746, "grad_norm": 0.6529528658078483, "learning_rate": 5e-06, "loss": 0.6332, "step": 610 }, { "epoch": 2.3134328358208958, "grad_norm": 0.7094072749960915, "learning_rate": 5e-06, "loss": 0.633, "step": 620 }, { "epoch": 2.3507462686567164, "grad_norm": 0.7388443835438889, "learning_rate": 5e-06, "loss": 0.6383, "step": 630 }, { "epoch": 2.388059701492537, "grad_norm": 0.6171857744927907, "learning_rate": 5e-06, "loss": 0.6431, "step": 640 }, { "epoch": 2.425373134328358, "grad_norm": 0.6957912647507755, "learning_rate": 5e-06, "loss": 0.6316, "step": 650 }, { "epoch": 2.4626865671641793, "grad_norm": 0.8112600102237454, "learning_rate": 5e-06, "loss": 0.6345, "step": 660 }, { "epoch": 2.5, "grad_norm": 0.8803543543008359, "learning_rate": 5e-06, "loss": 0.6377, "step": 670 }, { "epoch": 2.5373134328358207, "grad_norm": 0.5890611580028059, "learning_rate": 5e-06, "loss": 0.6395, "step": 680 }, { "epoch": 2.574626865671642, "grad_norm": 0.7210880973347081, "learning_rate": 5e-06, "loss": 0.6372, "step": 690 }, { "epoch": 2.611940298507463, "grad_norm": 0.8470523550111462, "learning_rate": 5e-06, "loss": 0.6412, "step": 700 }, { "epoch": 2.6492537313432836, "grad_norm": 0.5831290814615696, "learning_rate": 5e-06, "loss": 0.6356, "step": 710 }, { "epoch": 2.6865671641791042, "grad_norm": 0.5855306287357733, "learning_rate": 5e-06, "loss": 0.6372, "step": 720 }, { "epoch": 2.7238805970149254, "grad_norm": 0.7808371100057291, "learning_rate": 5e-06, "loss": 0.6411, "step": 730 }, { "epoch": 2.7611940298507465, "grad_norm": 0.6443731992295694, "learning_rate": 5e-06, "loss": 0.6409, "step": 740 }, { "epoch": 2.798507462686567, "grad_norm": 0.7456200084192686, "learning_rate": 5e-06, "loss": 0.6367, "step": 750 }, { "epoch": 2.835820895522388, "grad_norm": 0.7082935135503028, "learning_rate": 5e-06, "loss": 0.6368, "step": 760 }, { "epoch": 2.873134328358209, "grad_norm": 0.6098997492184298, "learning_rate": 5e-06, "loss": 0.6336, "step": 770 }, { "epoch": 2.91044776119403, "grad_norm": 0.6865537806373121, "learning_rate": 5e-06, "loss": 0.6391, "step": 780 }, { "epoch": 2.9477611940298507, "grad_norm": 0.6037883340004743, "learning_rate": 5e-06, "loss": 0.6424, "step": 790 }, { "epoch": 2.9850746268656714, "grad_norm": 0.878289263065242, "learning_rate": 5e-06, "loss": 0.6402, "step": 800 }, { "epoch": 3.0, "eval_loss": 0.7372995018959045, "eval_runtime": 25.9879, "eval_samples_per_second": 277.128, "eval_steps_per_second": 1.116, "step": 804 }, { "epoch": 3.0, "step": 804, "total_flos": 1346729945333760.0, "train_loss": 0.7052390824502973, "train_runtime": 5207.6966, "train_samples_per_second": 78.826, "train_steps_per_second": 0.154 } ], "logging_steps": 10, "max_steps": 804, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1346729945333760.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }