{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 10920, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009157509157509158, "grad_norm": 0.05460265651345253, "learning_rate": 2.289639381797367e-07, "loss": 2.2583, "step": 100 }, { "epoch": 0.018315018315018316, "grad_norm": 0.07254950702190399, "learning_rate": 4.579278763594734e-07, "loss": 2.2556, "step": 200 }, { "epoch": 0.027472527472527472, "grad_norm": 0.07023929059505463, "learning_rate": 6.868918145392102e-07, "loss": 2.2658, "step": 300 }, { "epoch": 0.03663003663003663, "grad_norm": 0.08579763770103455, "learning_rate": 9.158557527189468e-07, "loss": 2.2546, "step": 400 }, { "epoch": 0.045787545787545784, "grad_norm": 0.13748684525489807, "learning_rate": 1.1448196908986836e-06, "loss": 2.2453, "step": 500 }, { "epoch": 0.054945054945054944, "grad_norm": 0.0766037255525589, "learning_rate": 1.3737836290784203e-06, "loss": 2.2471, "step": 600 }, { "epoch": 0.0641025641025641, "grad_norm": 0.18016816675662994, "learning_rate": 1.6027475672581568e-06, "loss": 2.2481, "step": 700 }, { "epoch": 0.07326007326007326, "grad_norm": 0.10347414016723633, "learning_rate": 1.8317115054378937e-06, "loss": 2.2388, "step": 800 }, { "epoch": 0.08241758241758242, "grad_norm": 0.14010679721832275, "learning_rate": 2.0606754436176306e-06, "loss": 2.2189, "step": 900 }, { "epoch": 0.09157509157509157, "grad_norm": 0.12083210051059723, "learning_rate": 2.2896393817973673e-06, "loss": 2.2308, "step": 1000 }, { "epoch": 0.10073260073260074, "grad_norm": 0.13420909643173218, "learning_rate": 2.5186033199771035e-06, "loss": 2.215, "step": 1100 }, { "epoch": 0.10989010989010989, "grad_norm": 0.15881897509098053, "learning_rate": 2.7475672581568406e-06, "loss": 2.2177, "step": 1200 }, { "epoch": 0.11904761904761904, "grad_norm": 0.1493181735277176, "learning_rate": 2.97424155695478e-06, "loss": 2.2062, "step": 1300 }, { "epoch": 0.1282051282051282, "grad_norm": 0.17832180857658386, "learning_rate": 3.2032054951345167e-06, "loss": 2.2018, "step": 1400 }, { "epoch": 0.13736263736263737, "grad_norm": 0.16811606287956238, "learning_rate": 3.4321694333142534e-06, "loss": 2.1941, "step": 1500 }, { "epoch": 0.14652014652014653, "grad_norm": 0.16803540289402008, "learning_rate": 3.66113337149399e-06, "loss": 2.1665, "step": 1600 }, { "epoch": 0.15567765567765568, "grad_norm": 0.18348592519760132, "learning_rate": 3.890097309673726e-06, "loss": 2.1702, "step": 1700 }, { "epoch": 0.16483516483516483, "grad_norm": 0.19313548505306244, "learning_rate": 4.119061247853463e-06, "loss": 2.154, "step": 1800 }, { "epoch": 0.17399267399267399, "grad_norm": 0.1911778450012207, "learning_rate": 4.3480251860332005e-06, "loss": 2.1665, "step": 1900 }, { "epoch": 0.18315018315018314, "grad_norm": 0.20000025629997253, "learning_rate": 4.574699484831139e-06, "loss": 2.1645, "step": 2000 }, { "epoch": 0.19230769230769232, "grad_norm": 0.23437415063381195, "learning_rate": 4.803663423010876e-06, "loss": 2.1638, "step": 2100 }, { "epoch": 0.20146520146520147, "grad_norm": 0.23130947351455688, "learning_rate": 5.032627361190613e-06, "loss": 2.1656, "step": 2200 }, { "epoch": 0.21062271062271062, "grad_norm": 0.22546882927417755, "learning_rate": 5.26159129937035e-06, "loss": 2.1426, "step": 2300 }, { "epoch": 0.21978021978021978, "grad_norm": 0.23832601308822632, "learning_rate": 5.490555237550086e-06, "loss": 2.1404, "step": 2400 }, { "epoch": 0.22893772893772893, "grad_norm": 0.2475137710571289, "learning_rate": 5.719519175729823e-06, "loss": 2.132, "step": 2500 }, { "epoch": 0.23809523809523808, "grad_norm": 0.27017173171043396, "learning_rate": 5.94848311390956e-06, "loss": 2.1334, "step": 2600 }, { "epoch": 0.24725274725274726, "grad_norm": 0.31527864933013916, "learning_rate": 6.177447052089296e-06, "loss": 2.1221, "step": 2700 }, { "epoch": 0.2564102564102564, "grad_norm": 0.39517447352409363, "learning_rate": 6.406410990269033e-06, "loss": 2.1134, "step": 2800 }, { "epoch": 0.26556776556776557, "grad_norm": 0.2927546501159668, "learning_rate": 6.6353749284487705e-06, "loss": 2.1196, "step": 2900 }, { "epoch": 0.27472527472527475, "grad_norm": 0.28731632232666016, "learning_rate": 6.864338866628507e-06, "loss": 2.0895, "step": 3000 }, { "epoch": 0.2838827838827839, "grad_norm": 0.33531680703163147, "learning_rate": 7.093302804808243e-06, "loss": 2.0915, "step": 3100 }, { "epoch": 0.29304029304029305, "grad_norm": 0.2969292998313904, "learning_rate": 7.32226674298798e-06, "loss": 2.0853, "step": 3200 }, { "epoch": 0.3021978021978022, "grad_norm": 0.32316532731056213, "learning_rate": 7.551230681167717e-06, "loss": 2.0859, "step": 3300 }, { "epoch": 0.31135531135531136, "grad_norm": 0.3245663344860077, "learning_rate": 7.780194619347453e-06, "loss": 2.0655, "step": 3400 }, { "epoch": 0.32051282051282054, "grad_norm": 0.6936108469963074, "learning_rate": 8.00915855752719e-06, "loss": 2.06, "step": 3500 }, { "epoch": 0.32967032967032966, "grad_norm": 0.36271369457244873, "learning_rate": 8.238122495706927e-06, "loss": 2.0726, "step": 3600 }, { "epoch": 0.33882783882783885, "grad_norm": 0.3658630847930908, "learning_rate": 8.467086433886663e-06, "loss": 2.0698, "step": 3700 }, { "epoch": 0.34798534798534797, "grad_norm": 0.4526204764842987, "learning_rate": 8.696050372066401e-06, "loss": 2.0497, "step": 3800 }, { "epoch": 0.35714285714285715, "grad_norm": 0.3695688843727112, "learning_rate": 8.925014310246137e-06, "loss": 2.0638, "step": 3900 }, { "epoch": 0.3663003663003663, "grad_norm": 0.3850938379764557, "learning_rate": 9.153978248425874e-06, "loss": 2.0525, "step": 4000 }, { "epoch": 0.37545787545787546, "grad_norm": 0.42660748958587646, "learning_rate": 9.38294218660561e-06, "loss": 2.06, "step": 4100 }, { "epoch": 0.38461538461538464, "grad_norm": 0.41539886593818665, "learning_rate": 9.611906124785348e-06, "loss": 2.0474, "step": 4200 }, { "epoch": 0.39377289377289376, "grad_norm": 0.40648505091667175, "learning_rate": 9.840870062965084e-06, "loss": 2.0379, "step": 4300 }, { "epoch": 0.40293040293040294, "grad_norm": 0.5012279152870178, "learning_rate": 1.006983400114482e-05, "loss": 2.043, "step": 4400 }, { "epoch": 0.41208791208791207, "grad_norm": 0.38856762647628784, "learning_rate": 1.0298797939324557e-05, "loss": 2.0299, "step": 4500 }, { "epoch": 0.42124542124542125, "grad_norm": 0.4283836781978607, "learning_rate": 1.0527761877504295e-05, "loss": 2.0429, "step": 4600 }, { "epoch": 0.43040293040293043, "grad_norm": 0.5659844279289246, "learning_rate": 1.075672581568403e-05, "loss": 2.0415, "step": 4700 }, { "epoch": 0.43956043956043955, "grad_norm": 0.47702351212501526, "learning_rate": 1.0985689753863767e-05, "loss": 2.0181, "step": 4800 }, { "epoch": 0.44871794871794873, "grad_norm": 0.4613106846809387, "learning_rate": 1.1212364052661706e-05, "loss": 2.0224, "step": 4900 }, { "epoch": 0.45787545787545786, "grad_norm": 0.4302317500114441, "learning_rate": 1.1441327990841444e-05, "loss": 2.031, "step": 5000 }, { "epoch": 0.46703296703296704, "grad_norm": 0.41913115978240967, "learning_rate": 1.167029192902118e-05, "loss": 2.0182, "step": 5100 }, { "epoch": 0.47619047619047616, "grad_norm": 0.5346239805221558, "learning_rate": 1.1899255867200917e-05, "loss": 2.0158, "step": 5200 }, { "epoch": 0.48534798534798534, "grad_norm": 0.4939456284046173, "learning_rate": 1.2128219805380655e-05, "loss": 2.0215, "step": 5300 }, { "epoch": 0.4945054945054945, "grad_norm": 0.42702722549438477, "learning_rate": 1.2357183743560391e-05, "loss": 2.0081, "step": 5400 }, { "epoch": 0.5036630036630036, "grad_norm": 0.5567723512649536, "learning_rate": 1.2586147681740126e-05, "loss": 2.0116, "step": 5500 }, { "epoch": 0.5128205128205128, "grad_norm": 0.4391794204711914, "learning_rate": 1.2815111619919864e-05, "loss": 2.0117, "step": 5600 }, { "epoch": 0.521978021978022, "grad_norm": 0.6212931275367737, "learning_rate": 1.30440755580996e-05, "loss": 2.0111, "step": 5700 }, { "epoch": 0.5311355311355311, "grad_norm": 0.4400519132614136, "learning_rate": 1.3273039496279336e-05, "loss": 2.0099, "step": 5800 }, { "epoch": 0.5402930402930403, "grad_norm": 0.5068506598472595, "learning_rate": 1.3502003434459074e-05, "loss": 2.014, "step": 5900 }, { "epoch": 0.5494505494505495, "grad_norm": 0.5231282711029053, "learning_rate": 1.373096737263881e-05, "loss": 2.0055, "step": 6000 }, { "epoch": 0.5586080586080586, "grad_norm": 0.5154737234115601, "learning_rate": 1.3959931310818547e-05, "loss": 2.0046, "step": 6100 }, { "epoch": 0.5677655677655677, "grad_norm": 0.4546205699443817, "learning_rate": 1.4188895248998285e-05, "loss": 1.9846, "step": 6200 }, { "epoch": 0.5769230769230769, "grad_norm": 0.5952470302581787, "learning_rate": 1.441785918717802e-05, "loss": 1.9931, "step": 6300 }, { "epoch": 0.5860805860805861, "grad_norm": 0.4483753740787506, "learning_rate": 1.4646823125357757e-05, "loss": 1.9976, "step": 6400 }, { "epoch": 0.5952380952380952, "grad_norm": 0.5263549089431763, "learning_rate": 1.4875787063537495e-05, "loss": 1.9938, "step": 6500 }, { "epoch": 0.6043956043956044, "grad_norm": 0.5647267699241638, "learning_rate": 1.5104751001717231e-05, "loss": 2.0073, "step": 6600 }, { "epoch": 0.6135531135531136, "grad_norm": 0.4960547089576721, "learning_rate": 1.5333714939896968e-05, "loss": 1.9973, "step": 6700 }, { "epoch": 0.6227106227106227, "grad_norm": 0.6195764541625977, "learning_rate": 1.5562678878076704e-05, "loss": 1.9929, "step": 6800 }, { "epoch": 0.6318681318681318, "grad_norm": 0.48598435521125793, "learning_rate": 1.579164281625644e-05, "loss": 1.9864, "step": 6900 }, { "epoch": 0.6410256410256411, "grad_norm": 0.6534644365310669, "learning_rate": 1.6020606754436176e-05, "loss": 1.9883, "step": 7000 }, { "epoch": 0.6501831501831502, "grad_norm": 0.5080538392066956, "learning_rate": 1.6249570692615916e-05, "loss": 1.9651, "step": 7100 }, { "epoch": 0.6593406593406593, "grad_norm": 0.502566397190094, "learning_rate": 1.6478534630795652e-05, "loss": 1.9877, "step": 7200 }, { "epoch": 0.6684981684981685, "grad_norm": 0.5017273426055908, "learning_rate": 1.670749856897539e-05, "loss": 1.9841, "step": 7300 }, { "epoch": 0.6776556776556777, "grad_norm": 0.6656813621520996, "learning_rate": 1.6936462507155125e-05, "loss": 1.9798, "step": 7400 }, { "epoch": 0.6868131868131868, "grad_norm": 0.5949518084526062, "learning_rate": 1.716542644533486e-05, "loss": 1.9929, "step": 7500 }, { "epoch": 0.6959706959706959, "grad_norm": 0.738613486289978, "learning_rate": 1.7394390383514597e-05, "loss": 1.996, "step": 7600 }, { "epoch": 0.7051282051282052, "grad_norm": 0.5886286497116089, "learning_rate": 1.7623354321694334e-05, "loss": 1.9669, "step": 7700 }, { "epoch": 0.7142857142857143, "grad_norm": 0.552151083946228, "learning_rate": 1.785231825987407e-05, "loss": 1.9565, "step": 7800 }, { "epoch": 0.7234432234432234, "grad_norm": 0.5507295727729797, "learning_rate": 1.808128219805381e-05, "loss": 1.9857, "step": 7900 }, { "epoch": 0.7326007326007326, "grad_norm": 0.7184144258499146, "learning_rate": 1.8310246136233546e-05, "loss": 1.969, "step": 8000 }, { "epoch": 0.7417582417582418, "grad_norm": 0.5713202953338623, "learning_rate": 1.8539210074413282e-05, "loss": 1.9797, "step": 8100 }, { "epoch": 0.7509157509157509, "grad_norm": 0.563890278339386, "learning_rate": 1.8768174012593018e-05, "loss": 1.9643, "step": 8200 }, { "epoch": 0.76007326007326, "grad_norm": 0.6094496846199036, "learning_rate": 1.8997137950772755e-05, "loss": 1.9758, "step": 8300 }, { "epoch": 0.7692307692307693, "grad_norm": 0.8818446397781372, "learning_rate": 1.922610188895249e-05, "loss": 1.966, "step": 8400 }, { "epoch": 0.7783882783882784, "grad_norm": 0.5603904128074646, "learning_rate": 1.945506582713223e-05, "loss": 1.97, "step": 8500 }, { "epoch": 0.7875457875457875, "grad_norm": 0.6132967472076416, "learning_rate": 1.9684029765311963e-05, "loss": 1.9606, "step": 8600 }, { "epoch": 0.7967032967032966, "grad_norm": 0.5720909237861633, "learning_rate": 1.99129937034917e-05, "loss": 1.9789, "step": 8700 }, { "epoch": 0.8058608058608059, "grad_norm": 0.5442408919334412, "learning_rate": 1.9960293406409122e-05, "loss": 1.9667, "step": 8800 }, { "epoch": 0.815018315018315, "grad_norm": 0.6531534194946289, "learning_rate": 1.9729957256223114e-05, "loss": 1.9748, "step": 8900 }, { "epoch": 0.8241758241758241, "grad_norm": 0.5392922163009644, "learning_rate": 1.9298823053295396e-05, "loss": 1.9635, "step": 9000 }, { "epoch": 0.8333333333333334, "grad_norm": 0.6137576699256897, "learning_rate": 1.8675788155151654e-05, "loss": 1.9586, "step": 9100 }, { "epoch": 0.8424908424908425, "grad_norm": 0.5812509655952454, "learning_rate": 1.787371019220515e-05, "loss": 1.9504, "step": 9200 }, { "epoch": 0.8516483516483516, "grad_norm": 0.6085131764411926, "learning_rate": 1.6909141723615757e-05, "loss": 1.9583, "step": 9300 }, { "epoch": 0.8608058608058609, "grad_norm": 0.5231878757476807, "learning_rate": 1.5801988640553246e-05, "loss": 1.959, "step": 9400 }, { "epoch": 0.86996336996337, "grad_norm": 0.5782052278518677, "learning_rate": 1.4575099366429102e-05, "loss": 1.9578, "step": 9500 }, { "epoch": 0.8791208791208791, "grad_norm": 0.7281983494758606, "learning_rate": 1.3253793331810845e-05, "loss": 1.9626, "step": 9600 }, { "epoch": 0.8882783882783882, "grad_norm": 0.5513895750045776, "learning_rate": 1.1865338454926736e-05, "loss": 1.954, "step": 9700 }, { "epoch": 0.8974358974358975, "grad_norm": 0.6550291180610657, "learning_rate": 1.0438388411045472e-05, "loss": 1.9417, "step": 9800 }, { "epoch": 0.9065934065934066, "grad_norm": 0.6144053936004639, "learning_rate": 9.002391303856074e-06, "loss": 1.938, "step": 9900 }, { "epoch": 0.9157509157509157, "grad_norm": 0.6759700775146484, "learning_rate": 7.586981942153e-06, "loss": 1.964, "step": 10000 }, { "epoch": 0.924908424908425, "grad_norm": 0.6374625563621521, "learning_rate": 6.221370263470199e-06, "loss": 1.9537, "step": 10100 }, { "epoch": 0.9340659340659341, "grad_norm": 0.6961682438850403, "learning_rate": 4.933738525823817e-06, "loss": 1.9329, "step": 10200 }, { "epoch": 0.9432234432234432, "grad_norm": 0.6878940463066101, "learning_rate": 3.750659707774723e-06, "loss": 1.9359, "step": 10300 }, { "epoch": 0.9523809523809523, "grad_norm": 0.5925911068916321, "learning_rate": 2.696549119344161e-06, "loss": 1.9381, "step": 10400 }, { "epoch": 0.9615384615384616, "grad_norm": 0.5766844749450684, "learning_rate": 1.7931605409401575e-06, "loss": 1.9414, "step": 10500 }, { "epoch": 0.9706959706959707, "grad_norm": 0.5555196404457092, "learning_rate": 1.0591372885234885e-06, "loss": 1.9472, "step": 10600 }, { "epoch": 0.9798534798534798, "grad_norm": 0.7189186811447144, "learning_rate": 5.096274697241354e-07, "loss": 1.9425, "step": 10700 }, { "epoch": 0.989010989010989, "grad_norm": 0.5945115685462952, "learning_rate": 1.5597137090428095e-07, "loss": 1.9325, "step": 10800 }, { "epoch": 0.9981684981684982, "grad_norm": 0.5117030739784241, "learning_rate": 5.467426590739511e-09, "loss": 1.9445, "step": 10900 }, { "epoch": 1.0, "step": 10920, "total_flos": 1.59124040841796e+18, "train_loss": 2.048668092511076, "train_runtime": 3400.905, "train_samples_per_second": 51.372, "train_steps_per_second": 3.211 } ], "logging_steps": 100, "max_steps": 10920, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.59124040841796e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }