{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999438727782975, "eval_steps": 500, "global_step": 1002, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.029934518241347054, "grad_norm": 1.3711546008530633, "learning_rate": 5e-06, "loss": 0.7728, "step": 10 }, { "epoch": 0.05986903648269411, "grad_norm": 0.8532931970235523, "learning_rate": 5e-06, "loss": 0.6958, "step": 20 }, { "epoch": 0.08980355472404115, "grad_norm": 0.9311714188883703, "learning_rate": 5e-06, "loss": 0.6702, "step": 30 }, { "epoch": 0.11973807296538821, "grad_norm": 0.7891322057145671, "learning_rate": 5e-06, "loss": 0.6661, "step": 40 }, { "epoch": 0.14967259120673526, "grad_norm": 0.6667175607439161, "learning_rate": 5e-06, "loss": 0.6572, "step": 50 }, { "epoch": 0.1796071094480823, "grad_norm": 0.6870807802747252, "learning_rate": 5e-06, "loss": 0.6491, "step": 60 }, { "epoch": 0.20954162768942938, "grad_norm": 0.7923438547434251, "learning_rate": 5e-06, "loss": 0.6458, "step": 70 }, { "epoch": 0.23947614593077643, "grad_norm": 0.7026808650849072, "learning_rate": 5e-06, "loss": 0.6473, "step": 80 }, { "epoch": 0.2694106641721235, "grad_norm": 0.9113469922865229, "learning_rate": 5e-06, "loss": 0.6385, "step": 90 }, { "epoch": 0.2993451824134705, "grad_norm": 0.6673532156562788, "learning_rate": 5e-06, "loss": 0.6397, "step": 100 }, { "epoch": 0.3292797006548176, "grad_norm": 0.7024293184544007, "learning_rate": 5e-06, "loss": 0.6355, "step": 110 }, { "epoch": 0.3592142188961646, "grad_norm": 0.7252389528859061, "learning_rate": 5e-06, "loss": 0.6343, "step": 120 }, { "epoch": 0.3891487371375117, "grad_norm": 0.7166274742588952, "learning_rate": 5e-06, "loss": 0.6345, "step": 130 }, { "epoch": 0.41908325537885877, "grad_norm": 1.044790711824065, "learning_rate": 5e-06, "loss": 0.6283, "step": 140 }, { "epoch": 0.4490177736202058, "grad_norm": 0.6941559597539316, "learning_rate": 5e-06, "loss": 0.6299, "step": 150 }, { "epoch": 0.47895229186155286, "grad_norm": 0.6459395559042749, "learning_rate": 5e-06, "loss": 0.6287, "step": 160 }, { "epoch": 0.5088868101028999, "grad_norm": 0.8200206082293998, "learning_rate": 5e-06, "loss": 0.6231, "step": 170 }, { "epoch": 0.538821328344247, "grad_norm": 0.8147867268820529, "learning_rate": 5e-06, "loss": 0.6247, "step": 180 }, { "epoch": 0.568755846585594, "grad_norm": 0.7086057878705686, "learning_rate": 5e-06, "loss": 0.6225, "step": 190 }, { "epoch": 0.598690364826941, "grad_norm": 0.6936168023515021, "learning_rate": 5e-06, "loss": 0.6302, "step": 200 }, { "epoch": 0.6286248830682881, "grad_norm": 0.7668440736078843, "learning_rate": 5e-06, "loss": 0.6242, "step": 210 }, { "epoch": 0.6585594013096352, "grad_norm": 0.9231622196440751, "learning_rate": 5e-06, "loss": 0.6223, "step": 220 }, { "epoch": 0.6884939195509823, "grad_norm": 0.7268095301330812, "learning_rate": 5e-06, "loss": 0.6248, "step": 230 }, { "epoch": 0.7184284377923292, "grad_norm": 0.7253911527197313, "learning_rate": 5e-06, "loss": 0.6237, "step": 240 }, { "epoch": 0.7483629560336763, "grad_norm": 0.624367345766973, "learning_rate": 5e-06, "loss": 0.6202, "step": 250 }, { "epoch": 0.7782974742750234, "grad_norm": 0.7459567304697926, "learning_rate": 5e-06, "loss": 0.619, "step": 260 }, { "epoch": 0.8082319925163705, "grad_norm": 0.7173470697470193, "learning_rate": 5e-06, "loss": 0.6264, "step": 270 }, { "epoch": 0.8381665107577175, "grad_norm": 0.6676024114471235, "learning_rate": 5e-06, "loss": 0.6165, "step": 280 }, { "epoch": 0.8681010289990645, "grad_norm": 0.7422170215057577, "learning_rate": 5e-06, "loss": 0.6133, "step": 290 }, { "epoch": 0.8980355472404116, "grad_norm": 0.770476706070457, "learning_rate": 5e-06, "loss": 0.6147, "step": 300 }, { "epoch": 0.9279700654817586, "grad_norm": 0.7280386561072427, "learning_rate": 5e-06, "loss": 0.6135, "step": 310 }, { "epoch": 0.9579045837231057, "grad_norm": 0.6377527403933052, "learning_rate": 5e-06, "loss": 0.6155, "step": 320 }, { "epoch": 0.9878391019644528, "grad_norm": 0.9160036426859673, "learning_rate": 5e-06, "loss": 0.6084, "step": 330 }, { "epoch": 0.9998129092609915, "eval_loss": 0.6195828914642334, "eval_runtime": 514.5322, "eval_samples_per_second": 17.494, "eval_steps_per_second": 0.548, "step": 334 }, { "epoch": 1.0177736202057999, "grad_norm": 0.8698832997202014, "learning_rate": 5e-06, "loss": 0.6386, "step": 340 }, { "epoch": 1.047708138447147, "grad_norm": 0.818554628982315, "learning_rate": 5e-06, "loss": 0.5525, "step": 350 }, { "epoch": 1.077642656688494, "grad_norm": 0.7038909119697851, "learning_rate": 5e-06, "loss": 0.5481, "step": 360 }, { "epoch": 1.1075771749298409, "grad_norm": 0.7417662574743294, "learning_rate": 5e-06, "loss": 0.5481, "step": 370 }, { "epoch": 1.137511693171188, "grad_norm": 0.6971090185954912, "learning_rate": 5e-06, "loss": 0.5483, "step": 380 }, { "epoch": 1.167446211412535, "grad_norm": 0.6877839912070208, "learning_rate": 5e-06, "loss": 0.55, "step": 390 }, { "epoch": 1.197380729653882, "grad_norm": 0.7726546070457438, "learning_rate": 5e-06, "loss": 0.55, "step": 400 }, { "epoch": 1.2273152478952292, "grad_norm": 0.627193492530289, "learning_rate": 5e-06, "loss": 0.5524, "step": 410 }, { "epoch": 1.2572497661365762, "grad_norm": 0.7305679948147004, "learning_rate": 5e-06, "loss": 0.5567, "step": 420 }, { "epoch": 1.2871842843779233, "grad_norm": 0.6901834732706111, "learning_rate": 5e-06, "loss": 0.5513, "step": 430 }, { "epoch": 1.3171188026192704, "grad_norm": 0.7078951488952088, "learning_rate": 5e-06, "loss": 0.5533, "step": 440 }, { "epoch": 1.3470533208606175, "grad_norm": 0.6469932896858731, "learning_rate": 5e-06, "loss": 0.5474, "step": 450 }, { "epoch": 1.3769878391019645, "grad_norm": 0.6984755612841984, "learning_rate": 5e-06, "loss": 0.5579, "step": 460 }, { "epoch": 1.4069223573433116, "grad_norm": 0.7417844126416642, "learning_rate": 5e-06, "loss": 0.5571, "step": 470 }, { "epoch": 1.4368568755846587, "grad_norm": 0.717182519772729, "learning_rate": 5e-06, "loss": 0.5591, "step": 480 }, { "epoch": 1.4667913938260055, "grad_norm": 0.6313303871675323, "learning_rate": 5e-06, "loss": 0.5585, "step": 490 }, { "epoch": 1.4967259120673526, "grad_norm": 0.7084037752703477, "learning_rate": 5e-06, "loss": 0.5546, "step": 500 }, { "epoch": 1.5266604303086997, "grad_norm": 0.6531206728146401, "learning_rate": 5e-06, "loss": 0.5577, "step": 510 }, { "epoch": 1.5565949485500468, "grad_norm": 0.654485495629501, "learning_rate": 5e-06, "loss": 0.5555, "step": 520 }, { "epoch": 1.5865294667913938, "grad_norm": 0.6725661891430049, "learning_rate": 5e-06, "loss": 0.5561, "step": 530 }, { "epoch": 1.616463985032741, "grad_norm": 0.698329402386774, "learning_rate": 5e-06, "loss": 0.5599, "step": 540 }, { "epoch": 1.646398503274088, "grad_norm": 0.6690114035686613, "learning_rate": 5e-06, "loss": 0.5592, "step": 550 }, { "epoch": 1.6763330215154348, "grad_norm": 0.66011999214286, "learning_rate": 5e-06, "loss": 0.5498, "step": 560 }, { "epoch": 1.706267539756782, "grad_norm": 0.8040306195233741, "learning_rate": 5e-06, "loss": 0.5518, "step": 570 }, { "epoch": 1.736202057998129, "grad_norm": 0.7060300162023749, "learning_rate": 5e-06, "loss": 0.5529, "step": 580 }, { "epoch": 1.766136576239476, "grad_norm": 0.6673011361622929, "learning_rate": 5e-06, "loss": 0.5558, "step": 590 }, { "epoch": 1.7960710944808231, "grad_norm": 0.7571655613304361, "learning_rate": 5e-06, "loss": 0.5564, "step": 600 }, { "epoch": 1.8260056127221702, "grad_norm": 0.642019011432012, "learning_rate": 5e-06, "loss": 0.5543, "step": 610 }, { "epoch": 1.8559401309635173, "grad_norm": 0.6324305634532394, "learning_rate": 5e-06, "loss": 0.5524, "step": 620 }, { "epoch": 1.8858746492048644, "grad_norm": 0.6741301602507832, "learning_rate": 5e-06, "loss": 0.5543, "step": 630 }, { "epoch": 1.9158091674462114, "grad_norm": 0.7541104518831577, "learning_rate": 5e-06, "loss": 0.556, "step": 640 }, { "epoch": 1.9457436856875585, "grad_norm": 0.6493122341038763, "learning_rate": 5e-06, "loss": 0.5541, "step": 650 }, { "epoch": 1.9756782039289056, "grad_norm": 0.8173207800207763, "learning_rate": 5e-06, "loss": 0.559, "step": 660 }, { "epoch": 1.999625818521983, "eval_loss": 0.6155872941017151, "eval_runtime": 513.9247, "eval_samples_per_second": 17.514, "eval_steps_per_second": 0.549, "step": 668 }, { "epoch": 2.0056127221702527, "grad_norm": 1.2127948243552868, "learning_rate": 5e-06, "loss": 0.6011, "step": 670 }, { "epoch": 2.0355472404115997, "grad_norm": 0.8183124334466546, "learning_rate": 5e-06, "loss": 0.4861, "step": 680 }, { "epoch": 2.065481758652947, "grad_norm": 0.7278809818448668, "learning_rate": 5e-06, "loss": 0.4822, "step": 690 }, { "epoch": 2.095416276894294, "grad_norm": 0.6907315590571528, "learning_rate": 5e-06, "loss": 0.4866, "step": 700 }, { "epoch": 2.125350795135641, "grad_norm": 0.7350401872049226, "learning_rate": 5e-06, "loss": 0.4864, "step": 710 }, { "epoch": 2.155285313376988, "grad_norm": 0.8001247911358127, "learning_rate": 5e-06, "loss": 0.4898, "step": 720 }, { "epoch": 2.185219831618335, "grad_norm": 0.7054933220563573, "learning_rate": 5e-06, "loss": 0.4897, "step": 730 }, { "epoch": 2.2151543498596817, "grad_norm": 0.7802502700344892, "learning_rate": 5e-06, "loss": 0.4901, "step": 740 }, { "epoch": 2.245088868101029, "grad_norm": 0.7215036811131982, "learning_rate": 5e-06, "loss": 0.4923, "step": 750 }, { "epoch": 2.275023386342376, "grad_norm": 0.6977267806305402, "learning_rate": 5e-06, "loss": 0.4878, "step": 760 }, { "epoch": 2.304957904583723, "grad_norm": 0.7014967299126638, "learning_rate": 5e-06, "loss": 0.4929, "step": 770 }, { "epoch": 2.33489242282507, "grad_norm": 0.6887436747500622, "learning_rate": 5e-06, "loss": 0.4887, "step": 780 }, { "epoch": 2.364826941066417, "grad_norm": 0.6792765123813863, "learning_rate": 5e-06, "loss": 0.4905, "step": 790 }, { "epoch": 2.394761459307764, "grad_norm": 0.7699608340540777, "learning_rate": 5e-06, "loss": 0.4933, "step": 800 }, { "epoch": 2.4246959775491113, "grad_norm": 0.7845823284512902, "learning_rate": 5e-06, "loss": 0.5002, "step": 810 }, { "epoch": 2.4546304957904583, "grad_norm": 0.7943786395213068, "learning_rate": 5e-06, "loss": 0.4973, "step": 820 }, { "epoch": 2.4845650140318054, "grad_norm": 0.7944786954591401, "learning_rate": 5e-06, "loss": 0.497, "step": 830 }, { "epoch": 2.5144995322731525, "grad_norm": 0.7339137535531504, "learning_rate": 5e-06, "loss": 0.4944, "step": 840 }, { "epoch": 2.5444340505144996, "grad_norm": 0.6904479862339539, "learning_rate": 5e-06, "loss": 0.4978, "step": 850 }, { "epoch": 2.5743685687558466, "grad_norm": 0.7239438693443128, "learning_rate": 5e-06, "loss": 0.496, "step": 860 }, { "epoch": 2.6043030869971937, "grad_norm": 0.6527578782432896, "learning_rate": 5e-06, "loss": 0.4936, "step": 870 }, { "epoch": 2.634237605238541, "grad_norm": 0.7144829092908652, "learning_rate": 5e-06, "loss": 0.492, "step": 880 }, { "epoch": 2.664172123479888, "grad_norm": 0.7507359205267485, "learning_rate": 5e-06, "loss": 0.498, "step": 890 }, { "epoch": 2.694106641721235, "grad_norm": 0.698227000026423, "learning_rate": 5e-06, "loss": 0.5006, "step": 900 }, { "epoch": 2.724041159962582, "grad_norm": 0.6978873719386385, "learning_rate": 5e-06, "loss": 0.4942, "step": 910 }, { "epoch": 2.753975678203929, "grad_norm": 0.6619027440664284, "learning_rate": 5e-06, "loss": 0.4955, "step": 920 }, { "epoch": 2.7839101964452757, "grad_norm": 0.6892171519418343, "learning_rate": 5e-06, "loss": 0.5013, "step": 930 }, { "epoch": 2.8138447146866232, "grad_norm": 0.7177100967350676, "learning_rate": 5e-06, "loss": 0.4994, "step": 940 }, { "epoch": 2.84377923292797, "grad_norm": 0.7555576282537906, "learning_rate": 5e-06, "loss": 0.4988, "step": 950 }, { "epoch": 2.8737137511693174, "grad_norm": 0.8751247226062729, "learning_rate": 5e-06, "loss": 0.5009, "step": 960 }, { "epoch": 2.903648269410664, "grad_norm": 0.730851224108063, "learning_rate": 5e-06, "loss": 0.4964, "step": 970 }, { "epoch": 2.933582787652011, "grad_norm": 0.7044959813969391, "learning_rate": 5e-06, "loss": 0.502, "step": 980 }, { "epoch": 2.963517305893358, "grad_norm": 0.7404990276858111, "learning_rate": 5e-06, "loss": 0.5036, "step": 990 }, { "epoch": 2.9934518241347052, "grad_norm": 0.7940138869090708, "learning_rate": 5e-06, "loss": 0.5047, "step": 1000 }, { "epoch": 2.999438727782975, "eval_loss": 0.6338862776756287, "eval_runtime": 513.4832, "eval_samples_per_second": 17.529, "eval_steps_per_second": 0.549, "step": 1002 }, { "epoch": 2.999438727782975, "step": 1002, "total_flos": 3818092983484416.0, "train_loss": 0.5629288574416719, "train_runtime": 90608.8212, "train_samples_per_second": 5.662, "train_steps_per_second": 0.011 } ], "logging_steps": 10, "max_steps": 1002, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3818092983484416.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }