{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9987063389391979, "eval_steps": 500, "global_step": 386, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00258732212160414, "grad_norm": 1.5696258219863308e+34, "learning_rate": 1.282051282051282e-08, "loss": 34.5991, "step": 1 }, { "epoch": 0.0129366106080207, "grad_norm": 2.4800051592297763e+32, "learning_rate": 6.410256410256409e-08, "loss": 35.1328, "step": 5 }, { "epoch": 0.0258732212160414, "grad_norm": 2.54523168619682e+30, "learning_rate": 1.2820512820512818e-07, "loss": 35.3514, "step": 10 }, { "epoch": 0.03880983182406209, "grad_norm": 2.9350637382111784e+28, "learning_rate": 1.9230769230769231e-07, "loss": 35.2418, "step": 15 }, { "epoch": 0.0517464424320828, "grad_norm": 6.845379509396579e+24, "learning_rate": 2.5641025641025636e-07, "loss": 35.4029, "step": 20 }, { "epoch": 0.0646830530401035, "grad_norm": 9.432421146054794e+16, "learning_rate": 3.2051282051282055e-07, "loss": 35.4307, "step": 25 }, { "epoch": 0.07761966364812418, "grad_norm": 103996205868.7185, "learning_rate": 3.8461538461538463e-07, "loss": 35.3465, "step": 30 }, { "epoch": 0.09055627425614489, "grad_norm": 7902.581384813984, "learning_rate": 4.487179487179487e-07, "loss": 35.2609, "step": 35 }, { "epoch": 0.1034928848641656, "grad_norm": 3891.3695056510064, "learning_rate": 4.999897541535663e-07, "loss": 30.0438, "step": 40 }, { "epoch": 0.11642949547218628, "grad_norm": 2358.218700340259, "learning_rate": 4.996312377016688e-07, "loss": 20.8297, "step": 45 }, { "epoch": 0.129366106080207, "grad_norm": 330.05121170654246, "learning_rate": 4.987612684376705e-07, "loss": 12.961, "step": 50 }, { "epoch": 0.1423027166882277, "grad_norm": 193.68319402032964, "learning_rate": 4.973816287836379e-07, "loss": 10.809, "step": 55 }, { "epoch": 0.15523932729624837, "grad_norm": 63.544778628785686, "learning_rate": 4.954951453913442e-07, "loss": 10.385, "step": 60 }, { "epoch": 0.16817593790426907, "grad_norm": 26.370287565962407, "learning_rate": 4.931056833509313e-07, "loss": 10.1709, "step": 65 }, { "epoch": 0.18111254851228978, "grad_norm": 99.65626298246401, "learning_rate": 4.902181382719843e-07, "loss": 10.0785, "step": 70 }, { "epoch": 0.19404915912031048, "grad_norm": 91.97251247083376, "learning_rate": 4.868384262532425e-07, "loss": 10.0305, "step": 75 }, { "epoch": 0.2069857697283312, "grad_norm": 133.24722674771854, "learning_rate": 4.829734717614995e-07, "loss": 9.9683, "step": 80 }, { "epoch": 0.21992238033635186, "grad_norm": 49.44784904214244, "learning_rate": 4.78631193444524e-07, "loss": 9.8086, "step": 85 }, { "epoch": 0.23285899094437257, "grad_norm": 39.83457542693803, "learning_rate": 4.738204879070702e-07, "loss": 9.6444, "step": 90 }, { "epoch": 0.24579560155239327, "grad_norm": 35.524332578393064, "learning_rate": 4.6855121148321705e-07, "loss": 9.5317, "step": 95 }, { "epoch": 0.258732212160414, "grad_norm": 21.89346239249435, "learning_rate": 4.6283416004238185e-07, "loss": 9.4199, "step": 100 }, { "epoch": 0.2716688227684347, "grad_norm": 18.371198207871824, "learning_rate": 4.566810468703828e-07, "loss": 9.3422, "step": 105 }, { "epoch": 0.2846054333764554, "grad_norm": 14.363049133916364, "learning_rate": 4.5010447867086775e-07, "loss": 9.2499, "step": 110 }, { "epoch": 0.2975420439844761, "grad_norm": 25.239020822491135, "learning_rate": 4.431179297362797e-07, "loss": 9.1466, "step": 115 }, { "epoch": 0.31047865459249674, "grad_norm": 50.305464578443626, "learning_rate": 4.3573571434127553e-07, "loss": 9.1168, "step": 120 }, { "epoch": 0.32341526520051744, "grad_norm": 29.946521630452818, "learning_rate": 4.2797295741516337e-07, "loss": 9.0978, "step": 125 }, { "epoch": 0.33635187580853815, "grad_norm": 35.470152215729236, "learning_rate": 4.1984556355344205e-07, "loss": 9.0445, "step": 130 }, { "epoch": 0.34928848641655885, "grad_norm": 26.3769064790735, "learning_rate": 4.1137018443193496e-07, "loss": 9.0052, "step": 135 }, { "epoch": 0.36222509702457956, "grad_norm": 28.18463108090832, "learning_rate": 4.025641846902812e-07, "loss": 8.9554, "step": 140 }, { "epoch": 0.37516170763260026, "grad_norm": 23.521159788503216, "learning_rate": 3.9344560635468183e-07, "loss": 8.9328, "step": 145 }, { "epoch": 0.38809831824062097, "grad_norm": 20.007227770493905, "learning_rate": 3.8403313187279446e-07, "loss": 8.8458, "step": 150 }, { "epoch": 0.40103492884864167, "grad_norm": 20.506423559359675, "learning_rate": 3.743460458365114e-07, "loss": 8.8147, "step": 155 }, { "epoch": 0.4139715394566624, "grad_norm": 20.107542907366796, "learning_rate": 3.644041954710432e-07, "loss": 8.7916, "step": 160 }, { "epoch": 0.4269081500646831, "grad_norm": 18.743137987266405, "learning_rate": 3.5422794997126223e-07, "loss": 8.7202, "step": 165 }, { "epoch": 0.4398447606727037, "grad_norm": 21.53271448781464, "learning_rate": 3.438381587686152e-07, "loss": 8.6901, "step": 170 }, { "epoch": 0.45278137128072443, "grad_norm": 29.119808165824566, "learning_rate": 3.3325610881411314e-07, "loss": 8.6836, "step": 175 }, { "epoch": 0.46571798188874514, "grad_norm": 19.101582760449293, "learning_rate": 3.225034809649149e-07, "loss": 8.6107, "step": 180 }, { "epoch": 0.47865459249676584, "grad_norm": 26.344817395383185, "learning_rate": 3.116023055638638e-07, "loss": 8.5785, "step": 185 }, { "epoch": 0.49159120310478654, "grad_norm": 22.757551882374766, "learning_rate": 3.005749173029856e-07, "loss": 8.5407, "step": 190 }, { "epoch": 0.5045278137128072, "grad_norm": 17.287888797801536, "learning_rate": 2.894439094634258e-07, "loss": 8.4908, "step": 195 }, { "epoch": 0.517464424320828, "grad_norm": 18.78518698061505, "learning_rate": 2.782320876255818e-07, "loss": 8.4408, "step": 200 }, { "epoch": 0.5304010349288486, "grad_norm": 26.147386948788057, "learning_rate": 2.6696242294426794e-07, "loss": 8.4371, "step": 205 }, { "epoch": 0.5433376455368694, "grad_norm": 20.864739238751334, "learning_rate": 2.5565800508464693e-07, "loss": 8.3946, "step": 210 }, { "epoch": 0.55627425614489, "grad_norm": 25.03041019448578, "learning_rate": 2.443419949153531e-07, "loss": 8.3438, "step": 215 }, { "epoch": 0.5692108667529108, "grad_norm": 20.710123234104667, "learning_rate": 2.3303757705573201e-07, "loss": 8.3685, "step": 220 }, { "epoch": 0.5821474773609314, "grad_norm": 20.005194039944502, "learning_rate": 2.217679123744182e-07, "loss": 8.2943, "step": 225 }, { "epoch": 0.5950840879689522, "grad_norm": 20.64021311062467, "learning_rate": 2.1055609053657423e-07, "loss": 8.2681, "step": 230 }, { "epoch": 0.6080206985769728, "grad_norm": 18.041357467771427, "learning_rate": 1.9942508269701447e-07, "loss": 8.2574, "step": 235 }, { "epoch": 0.6209573091849935, "grad_norm": 24.18533351853082, "learning_rate": 1.883976944361362e-07, "loss": 8.2044, "step": 240 }, { "epoch": 0.6338939197930142, "grad_norm": 19.308910630600565, "learning_rate": 1.7749651903508505e-07, "loss": 8.1439, "step": 245 }, { "epoch": 0.6468305304010349, "grad_norm": 18.940923202491184, "learning_rate": 1.6674389118588684e-07, "loss": 8.2016, "step": 250 }, { "epoch": 0.6597671410090556, "grad_norm": 22.62903264366213, "learning_rate": 1.5616184123138476e-07, "loss": 8.1482, "step": 255 }, { "epoch": 0.6727037516170763, "grad_norm": 18.71414418245413, "learning_rate": 1.457720500287379e-07, "loss": 8.1104, "step": 260 }, { "epoch": 0.685640362225097, "grad_norm": 18.16814176627225, "learning_rate": 1.3559580452895682e-07, "loss": 8.1109, "step": 265 }, { "epoch": 0.6985769728331177, "grad_norm": 19.84573072806573, "learning_rate": 1.2565395416348867e-07, "loss": 8.0936, "step": 270 }, { "epoch": 0.7115135834411385, "grad_norm": 19.40761310343157, "learning_rate": 1.1596686812720555e-07, "loss": 8.081, "step": 275 }, { "epoch": 0.7244501940491591, "grad_norm": 17.694041027215786, "learning_rate": 1.065543936453182e-07, "loss": 8.0583, "step": 280 }, { "epoch": 0.7373868046571799, "grad_norm": 19.473414715193275, "learning_rate": 9.743581530971878e-08, "loss": 8.0536, "step": 285 }, { "epoch": 0.7503234152652005, "grad_norm": 20.279917617864715, "learning_rate": 8.862981556806499e-08, "loss": 8.0876, "step": 290 }, { "epoch": 0.7632600258732212, "grad_norm": 18.215356879442886, "learning_rate": 8.0154436446558e-08, "loss": 8.0188, "step": 295 }, { "epoch": 0.7761966364812419, "grad_norm": 17.250109490543473, "learning_rate": 7.202704258483663e-08, "loss": 8.0066, "step": 300 }, { "epoch": 0.7891332470892626, "grad_norm": 16.664504705841736, "learning_rate": 6.426428565872443e-08, "loss": 8.0198, "step": 305 }, { "epoch": 0.8020698576972833, "grad_norm": 21.103826588222557, "learning_rate": 5.688207026372027e-08, "loss": 8.0141, "step": 310 }, { "epoch": 0.815006468305304, "grad_norm": 17.388005585117728, "learning_rate": 4.989552132913219e-08, "loss": 7.9909, "step": 315 }, { "epoch": 0.8279430789133247, "grad_norm": 17.869968121368707, "learning_rate": 4.331895312961725e-08, "loss": 7.9866, "step": 320 }, { "epoch": 0.8408796895213454, "grad_norm": 19.683008826871088, "learning_rate": 3.7165839957618156e-08, "loss": 7.9899, "step": 325 }, { "epoch": 0.8538163001293662, "grad_norm": 17.460030572155425, "learning_rate": 3.144878851678298e-08, "loss": 7.9855, "step": 330 }, { "epoch": 0.8667529107373868, "grad_norm": 17.02791948891275, "learning_rate": 2.617951209292979e-08, "loss": 8.0126, "step": 335 }, { "epoch": 0.8796895213454075, "grad_norm": 20.515442084242775, "learning_rate": 2.136880655547596e-08, "loss": 8.0, "step": 340 }, { "epoch": 0.8926261319534282, "grad_norm": 16.829511963680876, "learning_rate": 1.7026528238500426e-08, "loss": 7.9638, "step": 345 }, { "epoch": 0.9055627425614489, "grad_norm": 17.7377760938193, "learning_rate": 1.3161573746757415e-08, "loss": 7.923, "step": 350 }, { "epoch": 0.9184993531694696, "grad_norm": 17.924298918004375, "learning_rate": 9.78186172801565e-09, "loss": 7.9911, "step": 355 }, { "epoch": 0.9314359637774903, "grad_norm": 17.683222436895043, "learning_rate": 6.894316649068643e-09, "loss": 7.9886, "step": 360 }, { "epoch": 0.944372574385511, "grad_norm": 17.323184239913775, "learning_rate": 4.50485460865585e-09, "loss": 7.9649, "step": 365 }, { "epoch": 0.9573091849935317, "grad_norm": 16.759552285384956, "learning_rate": 2.6183712163621308e-09, "loss": 7.9909, "step": 370 }, { "epoch": 0.9702457956015524, "grad_norm": 16.40682058120129, "learning_rate": 1.2387315623294536e-09, "loss": 8.0056, "step": 375 }, { "epoch": 0.9831824062095731, "grad_norm": 16.270242929964763, "learning_rate": 3.6876229833118776e-10, "loss": 7.9802, "step": 380 }, { "epoch": 0.9961190168175937, "grad_norm": 17.17813750433213, "learning_rate": 1.0245846433665217e-11, "loss": 7.9839, "step": 385 }, { "epoch": 0.9987063389391979, "eval_loss": 7.827143669128418, "eval_runtime": 3.6254, "eval_samples_per_second": 62.613, "eval_steps_per_second": 1.103, "step": 386 }, { "epoch": 0.9987063389391979, "step": 386, "total_flos": 161536404357120.0, "train_loss": 11.514628536342958, "train_runtime": 5043.2067, "train_samples_per_second": 19.608, "train_steps_per_second": 0.077 } ], "logging_steps": 5, "max_steps": 386, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 161536404357120.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }