{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.996777029006739, "eval_steps": 500, "global_step": 1278, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.023439789041898623, "grad_norm": 9.967006175549518, "learning_rate": 5e-06, "loss": 0.6132, "step": 10 }, { "epoch": 0.046879578083797245, "grad_norm": 1.708901732081446, "learning_rate": 5e-06, "loss": 0.5268, "step": 20 }, { "epoch": 0.07031936712569586, "grad_norm": 1.6601249126276376, "learning_rate": 5e-06, "loss": 0.4939, "step": 30 }, { "epoch": 0.09375915616759449, "grad_norm": 0.9058674995172575, "learning_rate": 5e-06, "loss": 0.4729, "step": 40 }, { "epoch": 0.11719894520949312, "grad_norm": 1.1831753366659354, "learning_rate": 5e-06, "loss": 0.4606, "step": 50 }, { "epoch": 0.14063873425139173, "grad_norm": 0.6857017193079774, "learning_rate": 5e-06, "loss": 0.4448, "step": 60 }, { "epoch": 0.16407852329329037, "grad_norm": 0.665824933523209, "learning_rate": 5e-06, "loss": 0.4416, "step": 70 }, { "epoch": 0.18751831233518898, "grad_norm": 1.5098116393031806, "learning_rate": 5e-06, "loss": 0.436, "step": 80 }, { "epoch": 0.2109581013770876, "grad_norm": 0.8814351421187862, "learning_rate": 5e-06, "loss": 0.4249, "step": 90 }, { "epoch": 0.23439789041898623, "grad_norm": 0.5818056460875761, "learning_rate": 5e-06, "loss": 0.416, "step": 100 }, { "epoch": 0.25783767946088487, "grad_norm": 0.6332178617430071, "learning_rate": 5e-06, "loss": 0.4143, "step": 110 }, { "epoch": 0.28127746850278346, "grad_norm": 0.6385133089285979, "learning_rate": 5e-06, "loss": 0.4142, "step": 120 }, { "epoch": 0.3047172575446821, "grad_norm": 0.5190122301041274, "learning_rate": 5e-06, "loss": 0.4054, "step": 130 }, { "epoch": 0.32815704658658074, "grad_norm": 0.72677604074983, "learning_rate": 5e-06, "loss": 0.4061, "step": 140 }, { "epoch": 0.3515968356284793, "grad_norm": 0.5862867262610982, "learning_rate": 5e-06, "loss": 0.4002, "step": 150 }, { "epoch": 0.37503662467037796, "grad_norm": 0.4943669370299919, "learning_rate": 5e-06, "loss": 0.4078, "step": 160 }, { "epoch": 0.3984764137122766, "grad_norm": 0.617733036177013, "learning_rate": 5e-06, "loss": 0.396, "step": 170 }, { "epoch": 0.4219162027541752, "grad_norm": 0.501523217408356, "learning_rate": 5e-06, "loss": 0.398, "step": 180 }, { "epoch": 0.4453559917960738, "grad_norm": 0.5513765393124738, "learning_rate": 5e-06, "loss": 0.3915, "step": 190 }, { "epoch": 0.46879578083797246, "grad_norm": 0.48423414375531915, "learning_rate": 5e-06, "loss": 0.3928, "step": 200 }, { "epoch": 0.4922355698798711, "grad_norm": 0.6668034319234166, "learning_rate": 5e-06, "loss": 0.3897, "step": 210 }, { "epoch": 0.5156753589217697, "grad_norm": 0.6844513719684475, "learning_rate": 5e-06, "loss": 0.3943, "step": 220 }, { "epoch": 0.5391151479636683, "grad_norm": 0.5634978239261383, "learning_rate": 5e-06, "loss": 0.3867, "step": 230 }, { "epoch": 0.5625549370055669, "grad_norm": 0.48253201050555344, "learning_rate": 5e-06, "loss": 0.3951, "step": 240 }, { "epoch": 0.5859947260474656, "grad_norm": 0.5583859071799139, "learning_rate": 5e-06, "loss": 0.3896, "step": 250 }, { "epoch": 0.6094345150893642, "grad_norm": 0.5725388703107269, "learning_rate": 5e-06, "loss": 0.3888, "step": 260 }, { "epoch": 0.6328743041312628, "grad_norm": 0.8156082811133835, "learning_rate": 5e-06, "loss": 0.379, "step": 270 }, { "epoch": 0.6563140931731615, "grad_norm": 0.5966575531920639, "learning_rate": 5e-06, "loss": 0.3864, "step": 280 }, { "epoch": 0.6797538822150601, "grad_norm": 0.48850438172794114, "learning_rate": 5e-06, "loss": 0.3873, "step": 290 }, { "epoch": 0.7031936712569586, "grad_norm": 0.5962793861443083, "learning_rate": 5e-06, "loss": 0.3869, "step": 300 }, { "epoch": 0.7266334602988573, "grad_norm": 0.5171258822649074, "learning_rate": 5e-06, "loss": 0.3818, "step": 310 }, { "epoch": 0.7500732493407559, "grad_norm": 0.5592073208479841, "learning_rate": 5e-06, "loss": 0.3755, "step": 320 }, { "epoch": 0.7735130383826545, "grad_norm": 0.533904690846359, "learning_rate": 5e-06, "loss": 0.3745, "step": 330 }, { "epoch": 0.7969528274245532, "grad_norm": 0.548421936851489, "learning_rate": 5e-06, "loss": 0.3794, "step": 340 }, { "epoch": 0.8203926164664518, "grad_norm": 0.5826483187799658, "learning_rate": 5e-06, "loss": 0.3729, "step": 350 }, { "epoch": 0.8438324055083504, "grad_norm": 0.65088471853365, "learning_rate": 5e-06, "loss": 0.3728, "step": 360 }, { "epoch": 0.8672721945502491, "grad_norm": 0.7534193430854838, "learning_rate": 5e-06, "loss": 0.3763, "step": 370 }, { "epoch": 0.8907119835921476, "grad_norm": 0.6782662978614515, "learning_rate": 5e-06, "loss": 0.3794, "step": 380 }, { "epoch": 0.9141517726340463, "grad_norm": 0.4771544648840299, "learning_rate": 5e-06, "loss": 0.3702, "step": 390 }, { "epoch": 0.9375915616759449, "grad_norm": 0.5535379054118112, "learning_rate": 5e-06, "loss": 0.3768, "step": 400 }, { "epoch": 0.9610313507178435, "grad_norm": 0.4608571118764034, "learning_rate": 5e-06, "loss": 0.3788, "step": 410 }, { "epoch": 0.9844711397597422, "grad_norm": 0.5668677920657946, "learning_rate": 5e-06, "loss": 0.3673, "step": 420 }, { "epoch": 0.9985350131848814, "eval_loss": 0.37145182490348816, "eval_runtime": 299.8322, "eval_samples_per_second": 38.341, "eval_steps_per_second": 0.6, "step": 426 }, { "epoch": 1.0084969235276882, "grad_norm": 0.6984558481581081, "learning_rate": 5e-06, "loss": 0.3801, "step": 430 }, { "epoch": 1.0319367125695869, "grad_norm": 0.5239390968899379, "learning_rate": 5e-06, "loss": 0.3221, "step": 440 }, { "epoch": 1.0553765016114856, "grad_norm": 0.5695148407791144, "learning_rate": 5e-06, "loss": 0.3257, "step": 450 }, { "epoch": 1.078816290653384, "grad_norm": 0.5709165769711674, "learning_rate": 5e-06, "loss": 0.3205, "step": 460 }, { "epoch": 1.1022560796952827, "grad_norm": 0.5285237377857552, "learning_rate": 5e-06, "loss": 0.3267, "step": 470 }, { "epoch": 1.1256958687371814, "grad_norm": 0.5164839735723259, "learning_rate": 5e-06, "loss": 0.3269, "step": 480 }, { "epoch": 1.1491356577790799, "grad_norm": 0.4651823010378352, "learning_rate": 5e-06, "loss": 0.3299, "step": 490 }, { "epoch": 1.1725754468209786, "grad_norm": 0.4611493940935449, "learning_rate": 5e-06, "loss": 0.3307, "step": 500 }, { "epoch": 1.1960152358628773, "grad_norm": 0.5290456767213961, "learning_rate": 5e-06, "loss": 0.3338, "step": 510 }, { "epoch": 1.219455024904776, "grad_norm": 0.49326918332510095, "learning_rate": 5e-06, "loss": 0.3253, "step": 520 }, { "epoch": 1.2428948139466744, "grad_norm": 0.551961171965543, "learning_rate": 5e-06, "loss": 0.3296, "step": 530 }, { "epoch": 1.2663346029885731, "grad_norm": 0.49768749823200925, "learning_rate": 5e-06, "loss": 0.3256, "step": 540 }, { "epoch": 1.2897743920304716, "grad_norm": 0.5626038647037341, "learning_rate": 5e-06, "loss": 0.3255, "step": 550 }, { "epoch": 1.3132141810723703, "grad_norm": 0.5103540222464168, "learning_rate": 5e-06, "loss": 0.3225, "step": 560 }, { "epoch": 1.336653970114269, "grad_norm": 0.540424196440726, "learning_rate": 5e-06, "loss": 0.3264, "step": 570 }, { "epoch": 1.3600937591561677, "grad_norm": 0.5745930638703757, "learning_rate": 5e-06, "loss": 0.3196, "step": 580 }, { "epoch": 1.3835335481980662, "grad_norm": 0.6445462088875488, "learning_rate": 5e-06, "loss": 0.3244, "step": 590 }, { "epoch": 1.4069733372399649, "grad_norm": 0.481096562270643, "learning_rate": 5e-06, "loss": 0.3224, "step": 600 }, { "epoch": 1.4304131262818633, "grad_norm": 0.5092132313164293, "learning_rate": 5e-06, "loss": 0.3251, "step": 610 }, { "epoch": 1.453852915323762, "grad_norm": 0.5067387262720331, "learning_rate": 5e-06, "loss": 0.3265, "step": 620 }, { "epoch": 1.4772927043656607, "grad_norm": 0.48943368667359805, "learning_rate": 5e-06, "loss": 0.3279, "step": 630 }, { "epoch": 1.5007324934075594, "grad_norm": 0.5183176032348761, "learning_rate": 5e-06, "loss": 0.3227, "step": 640 }, { "epoch": 1.524172282449458, "grad_norm": 0.5247150397228669, "learning_rate": 5e-06, "loss": 0.3181, "step": 650 }, { "epoch": 1.5476120714913566, "grad_norm": 0.5298290612904054, "learning_rate": 5e-06, "loss": 0.3275, "step": 660 }, { "epoch": 1.571051860533255, "grad_norm": 0.5335846082176007, "learning_rate": 5e-06, "loss": 0.3228, "step": 670 }, { "epoch": 1.5944916495751538, "grad_norm": 0.49588298624982247, "learning_rate": 5e-06, "loss": 0.3199, "step": 680 }, { "epoch": 1.6179314386170525, "grad_norm": 0.5142614839866478, "learning_rate": 5e-06, "loss": 0.3317, "step": 690 }, { "epoch": 1.6413712276589512, "grad_norm": 0.4957666564649026, "learning_rate": 5e-06, "loss": 0.3239, "step": 700 }, { "epoch": 1.6648110167008499, "grad_norm": 0.5027765035894468, "learning_rate": 5e-06, "loss": 0.3208, "step": 710 }, { "epoch": 1.6882508057427483, "grad_norm": 0.6791514332148134, "learning_rate": 5e-06, "loss": 0.3184, "step": 720 }, { "epoch": 1.7116905947846468, "grad_norm": 0.4925105147109606, "learning_rate": 5e-06, "loss": 0.3252, "step": 730 }, { "epoch": 1.7351303838265455, "grad_norm": 0.5049447941188098, "learning_rate": 5e-06, "loss": 0.3201, "step": 740 }, { "epoch": 1.7585701728684442, "grad_norm": 0.544693187707417, "learning_rate": 5e-06, "loss": 0.3258, "step": 750 }, { "epoch": 1.7820099619103429, "grad_norm": 0.4756047714296392, "learning_rate": 5e-06, "loss": 0.3231, "step": 760 }, { "epoch": 1.8054497509522416, "grad_norm": 0.5599548628855956, "learning_rate": 5e-06, "loss": 0.3246, "step": 770 }, { "epoch": 1.82888953999414, "grad_norm": 0.4756090546668893, "learning_rate": 5e-06, "loss": 0.3219, "step": 780 }, { "epoch": 1.8523293290360385, "grad_norm": 0.6124115056884903, "learning_rate": 5e-06, "loss": 0.3217, "step": 790 }, { "epoch": 1.8757691180779372, "grad_norm": 0.46580289937046976, "learning_rate": 5e-06, "loss": 0.3259, "step": 800 }, { "epoch": 1.899208907119836, "grad_norm": 0.5943245513187589, "learning_rate": 5e-06, "loss": 0.3197, "step": 810 }, { "epoch": 1.9226486961617346, "grad_norm": 0.49042860120039294, "learning_rate": 5e-06, "loss": 0.3226, "step": 820 }, { "epoch": 1.9460884852036333, "grad_norm": 0.5267257339110174, "learning_rate": 5e-06, "loss": 0.3221, "step": 830 }, { "epoch": 1.9695282742455318, "grad_norm": 0.717669514593216, "learning_rate": 5e-06, "loss": 0.3282, "step": 840 }, { "epoch": 1.9929680632874303, "grad_norm": 0.46905846352186026, "learning_rate": 5e-06, "loss": 0.319, "step": 850 }, { "epoch": 1.99765602109581, "eval_loss": 0.36132651567459106, "eval_runtime": 296.4899, "eval_samples_per_second": 38.774, "eval_steps_per_second": 0.607, "step": 852 }, { "epoch": 2.0169938470553763, "grad_norm": 0.6240807456372731, "learning_rate": 5e-06, "loss": 0.3065, "step": 860 }, { "epoch": 2.040433636097275, "grad_norm": 0.6288943822511728, "learning_rate": 5e-06, "loss": 0.2738, "step": 870 }, { "epoch": 2.0638734251391737, "grad_norm": 0.5311356208759552, "learning_rate": 5e-06, "loss": 0.2731, "step": 880 }, { "epoch": 2.0873132141810724, "grad_norm": 0.56075413432555, "learning_rate": 5e-06, "loss": 0.2744, "step": 890 }, { "epoch": 2.110753003222971, "grad_norm": 0.5639112603035239, "learning_rate": 5e-06, "loss": 0.2751, "step": 900 }, { "epoch": 2.13419279226487, "grad_norm": 0.5591523409924748, "learning_rate": 5e-06, "loss": 0.2765, "step": 910 }, { "epoch": 2.157632581306768, "grad_norm": 0.5663817052316904, "learning_rate": 5e-06, "loss": 0.2737, "step": 920 }, { "epoch": 2.1810723703486667, "grad_norm": 0.623581090691007, "learning_rate": 5e-06, "loss": 0.2741, "step": 930 }, { "epoch": 2.2045121593905654, "grad_norm": 0.47856596919111055, "learning_rate": 5e-06, "loss": 0.2756, "step": 940 }, { "epoch": 2.227951948432464, "grad_norm": 0.48512193272255605, "learning_rate": 5e-06, "loss": 0.2737, "step": 950 }, { "epoch": 2.251391737474363, "grad_norm": 0.5099859245096886, "learning_rate": 5e-06, "loss": 0.2796, "step": 960 }, { "epoch": 2.2748315265162615, "grad_norm": 0.5305125040779604, "learning_rate": 5e-06, "loss": 0.273, "step": 970 }, { "epoch": 2.2982713155581598, "grad_norm": 0.557582967315411, "learning_rate": 5e-06, "loss": 0.273, "step": 980 }, { "epoch": 2.3217111046000585, "grad_norm": 0.5807484141453545, "learning_rate": 5e-06, "loss": 0.2755, "step": 990 }, { "epoch": 2.345150893641957, "grad_norm": 0.49544753860231366, "learning_rate": 5e-06, "loss": 0.281, "step": 1000 }, { "epoch": 2.368590682683856, "grad_norm": 0.5035116546897159, "learning_rate": 5e-06, "loss": 0.275, "step": 1010 }, { "epoch": 2.3920304717257546, "grad_norm": 0.5229341109864724, "learning_rate": 5e-06, "loss": 0.2778, "step": 1020 }, { "epoch": 2.4154702607676533, "grad_norm": 0.5788744389040322, "learning_rate": 5e-06, "loss": 0.2843, "step": 1030 }, { "epoch": 2.438910049809552, "grad_norm": 0.587705401996759, "learning_rate": 5e-06, "loss": 0.278, "step": 1040 }, { "epoch": 2.46234983885145, "grad_norm": 0.526667348496211, "learning_rate": 5e-06, "loss": 0.2758, "step": 1050 }, { "epoch": 2.485789627893349, "grad_norm": 0.5557320228470611, "learning_rate": 5e-06, "loss": 0.2749, "step": 1060 }, { "epoch": 2.5092294169352476, "grad_norm": 0.5455334258984845, "learning_rate": 5e-06, "loss": 0.2829, "step": 1070 }, { "epoch": 2.5326692059771463, "grad_norm": 0.6049156598816975, "learning_rate": 5e-06, "loss": 0.2792, "step": 1080 }, { "epoch": 2.556108995019045, "grad_norm": 0.6063878997150756, "learning_rate": 5e-06, "loss": 0.2768, "step": 1090 }, { "epoch": 2.5795487840609432, "grad_norm": 0.5807051957808216, "learning_rate": 5e-06, "loss": 0.2837, "step": 1100 }, { "epoch": 2.602988573102842, "grad_norm": 0.48690825278368993, "learning_rate": 5e-06, "loss": 0.2732, "step": 1110 }, { "epoch": 2.6264283621447406, "grad_norm": 0.47298208429795785, "learning_rate": 5e-06, "loss": 0.2838, "step": 1120 }, { "epoch": 2.6498681511866393, "grad_norm": 0.5117446887050897, "learning_rate": 5e-06, "loss": 0.2799, "step": 1130 }, { "epoch": 2.673307940228538, "grad_norm": 0.5788651437910809, "learning_rate": 5e-06, "loss": 0.2831, "step": 1140 }, { "epoch": 2.6967477292704367, "grad_norm": 0.48115847246690635, "learning_rate": 5e-06, "loss": 0.2795, "step": 1150 }, { "epoch": 2.7201875183123354, "grad_norm": 0.4752829940802366, "learning_rate": 5e-06, "loss": 0.282, "step": 1160 }, { "epoch": 2.7436273073542337, "grad_norm": 0.4817048174863979, "learning_rate": 5e-06, "loss": 0.2798, "step": 1170 }, { "epoch": 2.7670670963961324, "grad_norm": 0.5163232884368958, "learning_rate": 5e-06, "loss": 0.2785, "step": 1180 }, { "epoch": 2.790506885438031, "grad_norm": 0.49724121376113645, "learning_rate": 5e-06, "loss": 0.2871, "step": 1190 }, { "epoch": 2.8139466744799297, "grad_norm": 0.575214274563183, "learning_rate": 5e-06, "loss": 0.277, "step": 1200 }, { "epoch": 2.8373864635218284, "grad_norm": 0.49524252636931043, "learning_rate": 5e-06, "loss": 0.284, "step": 1210 }, { "epoch": 2.8608262525637267, "grad_norm": 0.5731018313488945, "learning_rate": 5e-06, "loss": 0.279, "step": 1220 }, { "epoch": 2.8842660416056254, "grad_norm": 0.5359950242882849, "learning_rate": 5e-06, "loss": 0.2805, "step": 1230 }, { "epoch": 2.907705830647524, "grad_norm": 0.5534733032853985, "learning_rate": 5e-06, "loss": 0.2819, "step": 1240 }, { "epoch": 2.9311456196894228, "grad_norm": 0.5653588586425916, "learning_rate": 5e-06, "loss": 0.276, "step": 1250 }, { "epoch": 2.9545854087313215, "grad_norm": 0.5400026090208886, "learning_rate": 5e-06, "loss": 0.2795, "step": 1260 }, { "epoch": 2.97802519777322, "grad_norm": 0.5145229483499503, "learning_rate": 5e-06, "loss": 0.2817, "step": 1270 }, { "epoch": 2.996777029006739, "eval_loss": 0.3670854866504669, "eval_runtime": 285.3181, "eval_samples_per_second": 40.292, "eval_steps_per_second": 0.631, "step": 1278 }, { "epoch": 2.996777029006739, "step": 1278, "total_flos": 2140488220016640.0, "train_loss": 0.33715206393986613, "train_runtime": 42576.9653, "train_samples_per_second": 15.39, "train_steps_per_second": 0.03 } ], "logging_steps": 10, "max_steps": 1278, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2140488220016640.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }