{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9700214132762313, "eval_steps": 500, "global_step": 310, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 0.15481117367744446, "learning_rate": 2e-05, "loss": 1.1213, "step": 1 }, { "epoch": 0.01, "grad_norm": 0.16627414524555206, "learning_rate": 4e-05, "loss": 1.1341, "step": 2 }, { "epoch": 0.02, "grad_norm": 0.14720524847507477, "learning_rate": 6e-05, "loss": 1.148, "step": 3 }, { "epoch": 0.03, "grad_norm": 0.15325099229812622, "learning_rate": 8e-05, "loss": 1.1435, "step": 4 }, { "epoch": 0.03, "grad_norm": 0.16704852879047394, "learning_rate": 0.0001, "loss": 1.0895, "step": 5 }, { "epoch": 0.04, "grad_norm": 0.11686641722917557, "learning_rate": 0.00012, "loss": 1.0784, "step": 6 }, { "epoch": 0.04, "grad_norm": 0.09641632437705994, "learning_rate": 0.00014, "loss": 1.0612, "step": 7 }, { "epoch": 0.05, "grad_norm": 0.12384118884801865, "learning_rate": 0.00016, "loss": 1.0566, "step": 8 }, { "epoch": 0.06, "grad_norm": 0.07287071645259857, "learning_rate": 0.00018, "loss": 1.0442, "step": 9 }, { "epoch": 0.06, "grad_norm": 0.07469318807125092, "learning_rate": 0.0002, "loss": 1.0083, "step": 10 }, { "epoch": 0.07, "grad_norm": 0.08364757895469666, "learning_rate": 0.00019999761633493753, "loss": 1.0169, "step": 11 }, { "epoch": 0.08, "grad_norm": 0.07763934135437012, "learning_rate": 0.0001999904654533872, "loss": 1.0348, "step": 12 }, { "epoch": 0.08, "grad_norm": 0.06744416803121567, "learning_rate": 0.0001999785476962552, "loss": 1.0123, "step": 13 }, { "epoch": 0.09, "grad_norm": 0.07724820077419281, "learning_rate": 0.00019996186363170035, "loss": 1.0188, "step": 14 }, { "epoch": 0.1, "grad_norm": 0.0746370479464531, "learning_rate": 0.00019994041405510705, "loss": 1.0164, "step": 15 }, { "epoch": 0.1, "grad_norm": 0.055804282426834106, "learning_rate": 0.00019991419998904747, "loss": 1.0587, "step": 16 }, { "epoch": 0.11, "grad_norm": 0.054163169115781784, "learning_rate": 0.00019988322268323268, "loss": 1.0149, "step": 17 }, { "epoch": 0.12, "grad_norm": 0.05897677689790726, "learning_rate": 0.00019984748361445308, "loss": 1.0136, "step": 18 }, { "epoch": 0.12, "grad_norm": 0.05603804066777229, "learning_rate": 0.00019980698448650804, "loss": 0.9996, "step": 19 }, { "epoch": 0.13, "grad_norm": 0.06250110268592834, "learning_rate": 0.0001997617272301248, "loss": 1.0145, "step": 20 }, { "epoch": 0.13, "grad_norm": 0.05678323656320572, "learning_rate": 0.000199711714002866, "loss": 1.0005, "step": 21 }, { "epoch": 0.14, "grad_norm": 0.05278163403272629, "learning_rate": 0.00019965694718902745, "loss": 1.0034, "step": 22 }, { "epoch": 0.15, "grad_norm": 0.05601625144481659, "learning_rate": 0.00019959742939952392, "loss": 0.9915, "step": 23 }, { "epoch": 0.15, "grad_norm": 0.054547134786844254, "learning_rate": 0.00019953316347176488, "loss": 1.0115, "step": 24 }, { "epoch": 0.16, "grad_norm": 0.06417939066886902, "learning_rate": 0.0001994641524695193, "loss": 0.9862, "step": 25 }, { "epoch": 0.17, "grad_norm": 0.061326365917921066, "learning_rate": 0.0001993903996827694, "loss": 0.9889, "step": 26 }, { "epoch": 0.17, "grad_norm": 0.05376205965876579, "learning_rate": 0.00019931190862755417, "loss": 0.9604, "step": 27 }, { "epoch": 0.18, "grad_norm": 0.0678999274969101, "learning_rate": 0.00019922868304580118, "loss": 1.0492, "step": 28 }, { "epoch": 0.19, "grad_norm": 0.053755760192871094, "learning_rate": 0.0001991407269051487, "loss": 0.9985, "step": 29 }, { "epoch": 0.19, "grad_norm": 0.05401955544948578, "learning_rate": 0.00019904804439875633, "loss": 0.9787, "step": 30 }, { "epoch": 0.2, "grad_norm": 0.05666874349117279, "learning_rate": 0.0001989506399451051, "loss": 0.9886, "step": 31 }, { "epoch": 0.21, "grad_norm": 0.05425805598497391, "learning_rate": 0.00019884851818778693, "loss": 1.0197, "step": 32 }, { "epoch": 0.21, "grad_norm": 0.06086369976401329, "learning_rate": 0.00019874168399528305, "loss": 0.9879, "step": 33 }, { "epoch": 0.22, "grad_norm": 0.05577366426587105, "learning_rate": 0.00019863014246073214, "loss": 0.9808, "step": 34 }, { "epoch": 0.22, "grad_norm": 0.05853752791881561, "learning_rate": 0.0001985138989016874, "loss": 0.957, "step": 35 }, { "epoch": 0.23, "grad_norm": 0.0582735501229763, "learning_rate": 0.00019839295885986296, "loss": 0.9732, "step": 36 }, { "epoch": 0.24, "grad_norm": 0.07255622744560242, "learning_rate": 0.00019826732810086998, "loss": 1.0199, "step": 37 }, { "epoch": 0.24, "grad_norm": 0.06085599586367607, "learning_rate": 0.00019813701261394136, "loss": 0.9946, "step": 38 }, { "epoch": 0.25, "grad_norm": 0.06600817292928696, "learning_rate": 0.00019800201861164664, "loss": 0.9646, "step": 39 }, { "epoch": 0.26, "grad_norm": 0.0634196326136589, "learning_rate": 0.00019786235252959553, "loss": 1.0092, "step": 40 }, { "epoch": 0.26, "grad_norm": 0.05254920944571495, "learning_rate": 0.00019771802102613127, "loss": 0.9535, "step": 41 }, { "epoch": 0.27, "grad_norm": 0.06008182466030121, "learning_rate": 0.00019756903098201308, "loss": 0.9897, "step": 42 }, { "epoch": 0.28, "grad_norm": 0.07715290039777756, "learning_rate": 0.00019741538950008818, "loss": 1.0132, "step": 43 }, { "epoch": 0.28, "grad_norm": 0.06104297190904617, "learning_rate": 0.0001972571039049533, "loss": 0.9938, "step": 44 }, { "epoch": 0.29, "grad_norm": 0.06008582562208176, "learning_rate": 0.0001970941817426052, "loss": 0.9889, "step": 45 }, { "epoch": 0.3, "grad_norm": 0.05699775367975235, "learning_rate": 0.00019692663078008132, "loss": 0.9843, "step": 46 }, { "epoch": 0.3, "grad_norm": 0.05760645866394043, "learning_rate": 0.00019675445900508909, "loss": 0.9677, "step": 47 }, { "epoch": 0.31, "grad_norm": 0.06075143814086914, "learning_rate": 0.00019657767462562544, "loss": 0.9929, "step": 48 }, { "epoch": 0.31, "grad_norm": 0.059820640832185745, "learning_rate": 0.00019639628606958533, "loss": 0.9889, "step": 49 }, { "epoch": 0.32, "grad_norm": 0.059315863996744156, "learning_rate": 0.00019621030198436006, "loss": 0.9994, "step": 50 }, { "epoch": 0.33, "grad_norm": 0.05949851870536804, "learning_rate": 0.00019601973123642492, "loss": 0.9593, "step": 51 }, { "epoch": 0.33, "grad_norm": 0.06036762520670891, "learning_rate": 0.00019582458291091663, "loss": 0.9669, "step": 52 }, { "epoch": 0.34, "grad_norm": 0.05799931660294533, "learning_rate": 0.00019562486631120006, "loss": 0.9731, "step": 53 }, { "epoch": 0.35, "grad_norm": 0.05733400583267212, "learning_rate": 0.00019542059095842485, "loss": 0.9676, "step": 54 }, { "epoch": 0.35, "grad_norm": 0.06894834339618683, "learning_rate": 0.00019521176659107142, "loss": 1.0142, "step": 55 }, { "epoch": 0.36, "grad_norm": 0.0657154992222786, "learning_rate": 0.00019499840316448673, "loss": 0.9598, "step": 56 }, { "epoch": 0.37, "grad_norm": 0.0570225715637207, "learning_rate": 0.00019478051085040975, "loss": 0.9979, "step": 57 }, { "epoch": 0.37, "grad_norm": 0.06271913647651672, "learning_rate": 0.00019455810003648637, "loss": 0.9694, "step": 58 }, { "epoch": 0.38, "grad_norm": 0.05831892415881157, "learning_rate": 0.0001943311813257743, "loss": 0.9934, "step": 59 }, { "epoch": 0.39, "grad_norm": 0.061948519200086594, "learning_rate": 0.00019409976553623766, "loss": 0.9812, "step": 60 }, { "epoch": 0.39, "grad_norm": 0.06726415455341339, "learning_rate": 0.00019386386370023103, "loss": 0.9837, "step": 61 }, { "epoch": 0.4, "grad_norm": 0.057408351451158524, "learning_rate": 0.00019362348706397373, "loss": 0.9512, "step": 62 }, { "epoch": 0.4, "grad_norm": 0.057638928294181824, "learning_rate": 0.00019337864708701357, "loss": 0.9622, "step": 63 }, { "epoch": 0.41, "grad_norm": 0.06090644374489784, "learning_rate": 0.00019312935544168048, "loss": 0.9927, "step": 64 }, { "epoch": 0.42, "grad_norm": 0.05811937153339386, "learning_rate": 0.00019287562401253022, "loss": 0.9905, "step": 65 }, { "epoch": 0.42, "grad_norm": 0.05927939713001251, "learning_rate": 0.00019261746489577765, "loss": 0.9604, "step": 66 }, { "epoch": 0.43, "grad_norm": 0.06021604314446449, "learning_rate": 0.0001923548903987201, "loss": 0.9535, "step": 67 }, { "epoch": 0.44, "grad_norm": 0.07004135102033615, "learning_rate": 0.00019208791303915063, "loss": 1.0032, "step": 68 }, { "epoch": 0.44, "grad_norm": 0.05626143515110016, "learning_rate": 0.0001918165455447614, "loss": 0.9726, "step": 69 }, { "epoch": 0.45, "grad_norm": 0.06130916625261307, "learning_rate": 0.00019154080085253666, "loss": 0.9646, "step": 70 }, { "epoch": 0.46, "grad_norm": 0.06467108428478241, "learning_rate": 0.0001912606921081362, "loss": 0.9516, "step": 71 }, { "epoch": 0.46, "grad_norm": 0.06047213450074196, "learning_rate": 0.0001909762326652686, "loss": 0.9664, "step": 72 }, { "epoch": 0.47, "grad_norm": 0.05907664820551872, "learning_rate": 0.00019068743608505455, "loss": 0.9796, "step": 73 }, { "epoch": 0.48, "grad_norm": 0.06679921597242355, "learning_rate": 0.00019039431613538047, "loss": 0.9678, "step": 74 }, { "epoch": 0.48, "grad_norm": 0.06133173033595085, "learning_rate": 0.0001900968867902419, "loss": 0.9875, "step": 75 }, { "epoch": 0.49, "grad_norm": 0.05841493234038353, "learning_rate": 0.00018979516222907775, "loss": 0.9686, "step": 76 }, { "epoch": 0.49, "grad_norm": 0.0660431906580925, "learning_rate": 0.00018948915683609388, "loss": 0.9863, "step": 77 }, { "epoch": 0.5, "grad_norm": 0.05776617303490639, "learning_rate": 0.00018917888519957754, "loss": 0.9417, "step": 78 }, { "epoch": 0.51, "grad_norm": 0.05937017872929573, "learning_rate": 0.00018886436211120193, "loss": 0.9995, "step": 79 }, { "epoch": 0.51, "grad_norm": 0.06314114481210709, "learning_rate": 0.000188545602565321, "loss": 0.9806, "step": 80 }, { "epoch": 0.52, "grad_norm": 0.060519032180309296, "learning_rate": 0.00018822262175825462, "loss": 0.9741, "step": 81 }, { "epoch": 0.53, "grad_norm": 0.06154269725084305, "learning_rate": 0.00018789543508756408, "loss": 0.9793, "step": 82 }, { "epoch": 0.53, "grad_norm": 0.06176121160387993, "learning_rate": 0.00018756405815131813, "loss": 0.9453, "step": 83 }, { "epoch": 0.54, "grad_norm": 0.06044905260205269, "learning_rate": 0.00018722850674734927, "loss": 0.9462, "step": 84 }, { "epoch": 0.55, "grad_norm": 0.05896229296922684, "learning_rate": 0.00018688879687250067, "loss": 0.9963, "step": 85 }, { "epoch": 0.55, "grad_norm": 0.06071419641375542, "learning_rate": 0.0001865449447218635, "loss": 0.9914, "step": 86 }, { "epoch": 0.56, "grad_norm": 0.0697932317852974, "learning_rate": 0.00018619696668800492, "loss": 0.9726, "step": 87 }, { "epoch": 0.57, "grad_norm": 0.062443289905786514, "learning_rate": 0.00018584487936018661, "loss": 1.0084, "step": 88 }, { "epoch": 0.57, "grad_norm": 0.059460923075675964, "learning_rate": 0.0001854886995235738, "loss": 0.9404, "step": 89 }, { "epoch": 0.58, "grad_norm": 0.058260347694158554, "learning_rate": 0.00018512844415843514, "loss": 0.9796, "step": 90 }, { "epoch": 0.58, "grad_norm": 0.05946533381938934, "learning_rate": 0.00018476413043933313, "loss": 0.9418, "step": 91 }, { "epoch": 0.59, "grad_norm": 0.06572849303483963, "learning_rate": 0.00018439577573430555, "loss": 0.9785, "step": 92 }, { "epoch": 0.6, "grad_norm": 0.06783867627382278, "learning_rate": 0.00018402339760403713, "loss": 0.9747, "step": 93 }, { "epoch": 0.6, "grad_norm": 0.06454402953386307, "learning_rate": 0.00018364701380102266, "loss": 0.9779, "step": 94 }, { "epoch": 0.61, "grad_norm": 0.06309663504362106, "learning_rate": 0.00018326664226872065, "loss": 0.9643, "step": 95 }, { "epoch": 0.62, "grad_norm": 0.05967305600643158, "learning_rate": 0.00018288230114069765, "loss": 0.9752, "step": 96 }, { "epoch": 0.62, "grad_norm": 0.05811592936515808, "learning_rate": 0.0001824940087397641, "loss": 0.9551, "step": 97 }, { "epoch": 0.63, "grad_norm": 0.0642295628786087, "learning_rate": 0.00018210178357710058, "loss": 0.9522, "step": 98 }, { "epoch": 0.64, "grad_norm": 0.05724099278450012, "learning_rate": 0.0001817056443513754, "loss": 1.0051, "step": 99 }, { "epoch": 0.64, "grad_norm": 0.056155964732170105, "learning_rate": 0.00018130560994785325, "loss": 0.9778, "step": 100 }, { "epoch": 0.65, "grad_norm": 0.058100346475839615, "learning_rate": 0.00018090169943749476, "loss": 0.9825, "step": 101 }, { "epoch": 0.66, "grad_norm": 0.06120794638991356, "learning_rate": 0.00018049393207604733, "loss": 0.9839, "step": 102 }, { "epoch": 0.66, "grad_norm": 0.056975312530994415, "learning_rate": 0.00018008232730312723, "loss": 0.9968, "step": 103 }, { "epoch": 0.67, "grad_norm": 0.06239038705825806, "learning_rate": 0.00017966690474129285, "loss": 0.9906, "step": 104 }, { "epoch": 0.67, "grad_norm": 0.5958348512649536, "learning_rate": 0.00017924768419510904, "loss": 2.6531, "step": 105 }, { "epoch": 0.68, "grad_norm": 0.06554935872554779, "learning_rate": 0.00017882468565020326, "loss": 1.0164, "step": 106 }, { "epoch": 0.69, "grad_norm": 0.05698655918240547, "learning_rate": 0.00017839792927231254, "loss": 0.9516, "step": 107 }, { "epoch": 0.69, "grad_norm": 0.06186239421367645, "learning_rate": 0.00017796743540632223, "loss": 0.9933, "step": 108 }, { "epoch": 0.7, "grad_norm": 0.05811876431107521, "learning_rate": 0.00017753322457529614, "loss": 0.9552, "step": 109 }, { "epoch": 0.71, "grad_norm": 0.06247268617153168, "learning_rate": 0.00017709531747949796, "loss": 0.9316, "step": 110 }, { "epoch": 0.71, "grad_norm": 0.06278502196073532, "learning_rate": 0.00017665373499540463, "loss": 0.9867, "step": 111 }, { "epoch": 0.72, "grad_norm": 0.06079186499118805, "learning_rate": 0.00017620849817471092, "loss": 1.0233, "step": 112 }, { "epoch": 0.73, "grad_norm": 0.05586745962500572, "learning_rate": 0.00017575962824332596, "loss": 0.9454, "step": 113 }, { "epoch": 0.73, "grad_norm": 0.059647805988788605, "learning_rate": 0.00017530714660036112, "loss": 0.9718, "step": 114 }, { "epoch": 0.74, "grad_norm": 0.060143355280160904, "learning_rate": 0.00017485107481711012, "loss": 0.9927, "step": 115 }, { "epoch": 0.75, "grad_norm": 0.4768543541431427, "learning_rate": 0.0001743914346360205, "loss": 2.4526, "step": 116 }, { "epoch": 0.75, "grad_norm": 0.059104178100824356, "learning_rate": 0.00017392824796965702, "loss": 0.9366, "step": 117 }, { "epoch": 0.76, "grad_norm": 0.06858639419078827, "learning_rate": 0.00017346153689965727, "loss": 0.9783, "step": 118 }, { "epoch": 0.76, "grad_norm": 0.06308155506849289, "learning_rate": 0.00017299132367567857, "loss": 0.9688, "step": 119 }, { "epoch": 0.77, "grad_norm": 0.0601269006729126, "learning_rate": 0.00017251763071433765, "loss": 0.9937, "step": 120 }, { "epoch": 0.78, "grad_norm": 0.06544536352157593, "learning_rate": 0.00017204048059814175, "loss": 0.9351, "step": 121 }, { "epoch": 0.78, "grad_norm": 0.06467759609222412, "learning_rate": 0.00017155989607441213, "loss": 0.9918, "step": 122 }, { "epoch": 0.79, "grad_norm": 0.061619073152542114, "learning_rate": 0.0001710759000541995, "loss": 0.9872, "step": 123 }, { "epoch": 0.8, "grad_norm": 0.06122846156358719, "learning_rate": 0.00017058851561119198, "loss": 0.968, "step": 124 }, { "epoch": 0.8, "grad_norm": 0.08277314156293869, "learning_rate": 0.00017009776598061495, "loss": 0.9869, "step": 125 }, { "epoch": 0.81, "grad_norm": 0.07559008151292801, "learning_rate": 0.00016960367455812336, "loss": 0.9804, "step": 126 }, { "epoch": 0.82, "grad_norm": 0.06251110136508942, "learning_rate": 0.00016910626489868649, "loss": 0.978, "step": 127 }, { "epoch": 0.82, "grad_norm": 0.06253345310688019, "learning_rate": 0.0001686055607154648, "loss": 0.9524, "step": 128 }, { "epoch": 0.83, "grad_norm": 0.05948334559798241, "learning_rate": 0.00016810158587867973, "loss": 0.9826, "step": 129 }, { "epoch": 0.84, "grad_norm": 0.06356865167617798, "learning_rate": 0.00016759436441447545, "loss": 0.9805, "step": 130 }, { "epoch": 0.84, "grad_norm": 0.06536010652780533, "learning_rate": 0.00016708392050377363, "loss": 1.0146, "step": 131 }, { "epoch": 0.85, "grad_norm": 0.06137322261929512, "learning_rate": 0.00016657027848112062, "loss": 0.9457, "step": 132 }, { "epoch": 0.85, "grad_norm": 0.05955340713262558, "learning_rate": 0.00016605346283352727, "loss": 0.9823, "step": 133 }, { "epoch": 0.86, "grad_norm": 0.0602475143969059, "learning_rate": 0.00016553349819930165, "loss": 1.0077, "step": 134 }, { "epoch": 0.87, "grad_norm": 0.05905039981007576, "learning_rate": 0.00016501040936687443, "loss": 0.9313, "step": 135 }, { "epoch": 0.87, "grad_norm": 0.06087419390678406, "learning_rate": 0.00016448422127361706, "loss": 0.9725, "step": 136 }, { "epoch": 0.88, "grad_norm": 0.059714119881391525, "learning_rate": 0.00016395495900465304, "loss": 0.9963, "step": 137 }, { "epoch": 0.89, "grad_norm": 0.06461544334888458, "learning_rate": 0.000163422647791662, "loss": 0.957, "step": 138 }, { "epoch": 0.89, "grad_norm": 0.06247726082801819, "learning_rate": 0.00016288731301167668, "loss": 0.9742, "step": 139 }, { "epoch": 0.9, "grad_norm": 0.06527213007211685, "learning_rate": 0.00016234898018587337, "loss": 0.9789, "step": 140 }, { "epoch": 0.91, "grad_norm": 0.059757690876722336, "learning_rate": 0.00016180767497835503, "loss": 0.9309, "step": 141 }, { "epoch": 0.91, "grad_norm": 0.059084221720695496, "learning_rate": 0.00016126342319492784, "loss": 0.9546, "step": 142 }, { "epoch": 0.92, "grad_norm": 1.778961181640625, "learning_rate": 0.00016071625078187114, "loss": 2.6066, "step": 143 }, { "epoch": 0.93, "grad_norm": 0.06468740105628967, "learning_rate": 0.00016016618382470012, "loss": 0.9472, "step": 144 }, { "epoch": 0.93, "grad_norm": 0.061647091060876846, "learning_rate": 0.00015961324854692254, "loss": 0.9836, "step": 145 }, { "epoch": 0.94, "grad_norm": 0.06061193719506264, "learning_rate": 0.0001590574713087885, "loss": 0.982, "step": 146 }, { "epoch": 0.94, "grad_norm": 0.06635820865631104, "learning_rate": 0.00015849887860603374, "loss": 0.9873, "step": 147 }, { "epoch": 0.95, "grad_norm": 0.06260058283805847, "learning_rate": 0.00015793749706861636, "loss": 0.9827, "step": 148 }, { "epoch": 0.96, "grad_norm": 0.06037148833274841, "learning_rate": 0.00015737335345944757, "loss": 1.0072, "step": 149 }, { "epoch": 0.96, "grad_norm": 0.06277037411928177, "learning_rate": 0.00015680647467311557, "loss": 0.9498, "step": 150 }, { "epoch": 0.97, "grad_norm": 0.06306284666061401, "learning_rate": 0.00015623688773460357, "loss": 0.9866, "step": 151 }, { "epoch": 0.98, "grad_norm": 0.07311715185642242, "learning_rate": 0.00015566461979800122, "loss": 0.9722, "step": 152 }, { "epoch": 0.98, "grad_norm": 0.06143077090382576, "learning_rate": 0.00015508969814521025, "loss": 0.9442, "step": 153 }, { "epoch": 0.99, "grad_norm": 0.067879818379879, "learning_rate": 0.00015451215018464387, "loss": 0.9416, "step": 154 }, { "epoch": 1.0, "grad_norm": 0.06907378137111664, "learning_rate": 0.00015393200344991995, "loss": 0.9813, "step": 155 }, { "epoch": 1.0, "grad_norm": 0.06299201399087906, "learning_rate": 0.0001533492855985485, "loss": 0.9684, "step": 156 }, { "epoch": 1.01, "grad_norm": 0.059243012219667435, "learning_rate": 0.0001527640244106133, "loss": 0.9883, "step": 157 }, { "epoch": 1.01, "grad_norm": 0.06446617841720581, "learning_rate": 0.00015217624778744718, "loss": 0.9836, "step": 158 }, { "epoch": 1.02, "grad_norm": 0.07795852422714233, "learning_rate": 0.00015158598375030217, "loss": 0.9682, "step": 159 }, { "epoch": 1.01, "grad_norm": 0.0846932977437973, "learning_rate": 0.0001509932604390136, "loss": 0.8964, "step": 160 }, { "epoch": 1.01, "grad_norm": 0.06378539651632309, "learning_rate": 0.0001503981061106584, "loss": 0.8712, "step": 161 }, { "epoch": 1.02, "grad_norm": 0.06959015876054764, "learning_rate": 0.00014980054913820814, "loss": 0.8538, "step": 162 }, { "epoch": 1.03, "grad_norm": 0.11089441180229187, "learning_rate": 0.00014920061800917638, "loss": 0.8898, "step": 163 }, { "epoch": 1.03, "grad_norm": 0.10314545780420303, "learning_rate": 0.0001485983413242606, "loss": 0.858, "step": 164 }, { "epoch": 1.04, "grad_norm": 0.06961672008037567, "learning_rate": 0.00014799374779597867, "loss": 0.8605, "step": 165 }, { "epoch": 1.04, "grad_norm": 0.07029856741428375, "learning_rate": 0.00014738686624729986, "loss": 0.8536, "step": 166 }, { "epoch": 1.05, "grad_norm": 0.07742595672607422, "learning_rate": 0.0001467777256102712, "loss": 0.875, "step": 167 }, { "epoch": 1.06, "grad_norm": 0.06792555004358292, "learning_rate": 0.00014616635492463776, "loss": 0.8829, "step": 168 }, { "epoch": 1.06, "grad_norm": 0.08564931154251099, "learning_rate": 0.00014555278333645833, "loss": 0.9109, "step": 169 }, { "epoch": 1.07, "grad_norm": 0.0962703675031662, "learning_rate": 0.00014493704009671613, "loss": 0.8707, "step": 170 }, { "epoch": 1.08, "grad_norm": 0.09053023904561996, "learning_rate": 0.00014431915455992414, "loss": 0.9005, "step": 171 }, { "epoch": 1.08, "grad_norm": 0.07491155713796616, "learning_rate": 0.00014369915618272567, "loss": 0.8805, "step": 172 }, { "epoch": 1.09, "grad_norm": 0.08745575696229935, "learning_rate": 0.00014307707452249012, "loss": 0.8766, "step": 173 }, { "epoch": 1.1, "grad_norm": 0.08251971006393433, "learning_rate": 0.0001424529392359039, "loss": 0.8939, "step": 174 }, { "epoch": 1.1, "grad_norm": 0.07616613060235977, "learning_rate": 0.0001418267800775565, "loss": 0.8678, "step": 175 }, { "epoch": 1.11, "grad_norm": 0.08193603903055191, "learning_rate": 0.00014119862689852223, "loss": 0.8473, "step": 176 }, { "epoch": 1.12, "grad_norm": 0.08167202025651932, "learning_rate": 0.0001405685096449367, "loss": 0.8669, "step": 177 }, { "epoch": 1.12, "grad_norm": 0.08531814068555832, "learning_rate": 0.00013993645835656953, "loss": 0.8637, "step": 178 }, { "epoch": 1.13, "grad_norm": 0.08357888460159302, "learning_rate": 0.00013930250316539238, "loss": 0.846, "step": 179 }, { "epoch": 1.13, "grad_norm": 0.583361804485321, "learning_rate": 0.0001386666742941419, "loss": 2.3925, "step": 180 }, { "epoch": 1.14, "grad_norm": 0.08598628640174866, "learning_rate": 0.00013802900205487948, "loss": 0.8702, "step": 181 }, { "epoch": 1.15, "grad_norm": 0.0913216769695282, "learning_rate": 0.00013738951684754585, "loss": 0.8492, "step": 182 }, { "epoch": 1.15, "grad_norm": 0.08879151940345764, "learning_rate": 0.00013674824915851192, "loss": 0.8721, "step": 183 }, { "epoch": 1.16, "grad_norm": 0.09359906613826752, "learning_rate": 0.0001361052295591255, "loss": 0.8678, "step": 184 }, { "epoch": 1.17, "grad_norm": 0.09475123882293701, "learning_rate": 0.00013546048870425356, "loss": 0.8514, "step": 185 }, { "epoch": 1.17, "grad_norm": 0.1002567857503891, "learning_rate": 0.00013481405733082116, "loss": 0.8688, "step": 186 }, { "epoch": 1.18, "grad_norm": 0.09523095935583115, "learning_rate": 0.00013416596625634593, "loss": 0.8776, "step": 187 }, { "epoch": 1.19, "grad_norm": 0.09465932101011276, "learning_rate": 0.00013351624637746886, "loss": 0.8965, "step": 188 }, { "epoch": 1.19, "grad_norm": 0.09145588427782059, "learning_rate": 0.00013286492866848142, "loss": 0.8587, "step": 189 }, { "epoch": 1.2, "grad_norm": 0.09743087738752365, "learning_rate": 0.00013221204417984908, "loss": 0.823, "step": 190 }, { "epoch": 1.21, "grad_norm": 0.09468277543783188, "learning_rate": 0.00013155762403673063, "loss": 0.8578, "step": 191 }, { "epoch": 1.21, "grad_norm": 0.09516102075576782, "learning_rate": 0.00013090169943749476, "loss": 0.8944, "step": 192 }, { "epoch": 1.22, "grad_norm": 0.09514753520488739, "learning_rate": 0.00013024430165223244, "loss": 0.8175, "step": 193 }, { "epoch": 1.22, "grad_norm": 0.10116668790578842, "learning_rate": 0.0001295854620212664, "loss": 0.8572, "step": 194 }, { "epoch": 1.23, "grad_norm": 0.09801662713289261, "learning_rate": 0.00012892521195365678, "loss": 0.8532, "step": 195 }, { "epoch": 1.24, "grad_norm": 0.09754762053489685, "learning_rate": 0.00012826358292570398, "loss": 0.8653, "step": 196 }, { "epoch": 1.24, "grad_norm": 0.10124494135379791, "learning_rate": 0.00012760060647944795, "loss": 0.9007, "step": 197 }, { "epoch": 1.25, "grad_norm": 0.09749335795640945, "learning_rate": 0.00012693631422116454, "loss": 0.8304, "step": 198 }, { "epoch": 1.26, "grad_norm": 0.09953852742910385, "learning_rate": 0.0001262707378198587, "loss": 0.8759, "step": 199 }, { "epoch": 1.26, "grad_norm": 0.10160723328590393, "learning_rate": 0.0001256039090057547, "loss": 0.8923, "step": 200 }, { "epoch": 1.27, "grad_norm": 0.1000576838850975, "learning_rate": 0.00012493585956878354, "loss": 0.8397, "step": 201 }, { "epoch": 1.28, "grad_norm": 0.10324119031429291, "learning_rate": 0.0001242666213570672, "loss": 0.8787, "step": 202 }, { "epoch": 1.28, "grad_norm": 0.1018192246556282, "learning_rate": 0.00012359622627540058, "loss": 0.8318, "step": 203 }, { "epoch": 1.29, "grad_norm": 0.10028495639562607, "learning_rate": 0.00012292470628373037, "loss": 0.8354, "step": 204 }, { "epoch": 1.3, "grad_norm": 0.10520325601100922, "learning_rate": 0.00012225209339563145, "loss": 0.8586, "step": 205 }, { "epoch": 1.3, "grad_norm": 0.10229421406984329, "learning_rate": 0.00012157841967678063, "loss": 0.8961, "step": 206 }, { "epoch": 1.31, "grad_norm": 0.10230256617069244, "learning_rate": 0.00012090371724342804, "loss": 0.8851, "step": 207 }, { "epoch": 1.31, "grad_norm": 0.10282580554485321, "learning_rate": 0.00012022801826086609, "loss": 0.8588, "step": 208 }, { "epoch": 1.32, "grad_norm": 0.1008041650056839, "learning_rate": 0.00011955135494189588, "loss": 0.8462, "step": 209 }, { "epoch": 1.33, "grad_norm": 0.10294941067695618, "learning_rate": 0.00011887375954529168, "loss": 0.8301, "step": 210 }, { "epoch": 1.33, "grad_norm": 0.10411438345909119, "learning_rate": 0.00011819526437426298, "loss": 0.8299, "step": 211 }, { "epoch": 1.34, "grad_norm": 0.10526308417320251, "learning_rate": 0.0001175159017749144, "loss": 0.8624, "step": 212 }, { "epoch": 1.35, "grad_norm": 0.10539798438549042, "learning_rate": 0.00011683570413470383, "loss": 0.8473, "step": 213 }, { "epoch": 1.35, "grad_norm": 0.09961718320846558, "learning_rate": 0.00011615470388089835, "loss": 0.8571, "step": 214 }, { "epoch": 1.36, "grad_norm": 0.10262471437454224, "learning_rate": 0.00011547293347902812, "loss": 0.8539, "step": 215 }, { "epoch": 1.37, "grad_norm": 0.10586554557085037, "learning_rate": 0.00011479042543133895, "loss": 0.8559, "step": 216 }, { "epoch": 1.37, "grad_norm": 0.10486755520105362, "learning_rate": 0.00011410721227524255, "loss": 0.8726, "step": 217 }, { "epoch": 1.38, "grad_norm": 0.10485829412937164, "learning_rate": 0.00011342332658176555, "loss": 0.8903, "step": 218 }, { "epoch": 1.39, "grad_norm": 0.10306116938591003, "learning_rate": 0.00011273880095399667, "loss": 0.8495, "step": 219 }, { "epoch": 1.39, "grad_norm": 0.10205783694982529, "learning_rate": 0.0001120536680255323, "loss": 0.86, "step": 220 }, { "epoch": 1.4, "grad_norm": 0.10305880010128021, "learning_rate": 0.00011136796045892102, "loss": 0.8617, "step": 221 }, { "epoch": 1.4, "grad_norm": 0.10516184568405151, "learning_rate": 0.00011068171094410618, "loss": 0.8942, "step": 222 }, { "epoch": 1.41, "grad_norm": 0.10830646008253098, "learning_rate": 0.00010999495219686762, "loss": 0.8853, "step": 223 }, { "epoch": 1.42, "grad_norm": 0.10334376990795135, "learning_rate": 0.00010930771695726201, "loss": 0.8183, "step": 224 }, { "epoch": 1.42, "grad_norm": 0.10024702548980713, "learning_rate": 0.00010862003798806196, "loss": 0.8503, "step": 225 }, { "epoch": 1.43, "grad_norm": 0.10792502015829086, "learning_rate": 0.00010793194807319408, "loss": 0.8746, "step": 226 }, { "epoch": 1.44, "grad_norm": 0.10143101215362549, "learning_rate": 0.00010724348001617625, "loss": 0.8321, "step": 227 }, { "epoch": 1.44, "grad_norm": 0.09850587695837021, "learning_rate": 0.00010655466663855349, "loss": 0.8649, "step": 228 }, { "epoch": 1.45, "grad_norm": 0.10333626717329025, "learning_rate": 0.00010586554077833347, "loss": 0.8722, "step": 229 }, { "epoch": 1.46, "grad_norm": 0.10461243987083435, "learning_rate": 0.00010517613528842097, "loss": 0.8961, "step": 230 }, { "epoch": 1.46, "grad_norm": 0.10886025428771973, "learning_rate": 0.00010448648303505151, "loss": 0.8921, "step": 231 }, { "epoch": 1.47, "grad_norm": 0.10471045225858688, "learning_rate": 0.00010379661689622477, "loss": 0.8647, "step": 232 }, { "epoch": 1.48, "grad_norm": 0.10855681449174881, "learning_rate": 0.00010310656976013705, "loss": 0.8783, "step": 233 }, { "epoch": 1.48, "grad_norm": 0.11153094470500946, "learning_rate": 0.00010241637452361323, "loss": 0.8732, "step": 234 }, { "epoch": 1.49, "grad_norm": 0.10941874235868454, "learning_rate": 0.00010172606409053886, "loss": 0.8555, "step": 235 }, { "epoch": 1.49, "grad_norm": 0.10388068854808807, "learning_rate": 0.0001010356713702911, "loss": 0.8549, "step": 236 }, { "epoch": 1.5, "grad_norm": 0.1045478880405426, "learning_rate": 0.00010034522927617014, "loss": 0.8706, "step": 237 }, { "epoch": 1.51, "grad_norm": 0.10188283771276474, "learning_rate": 9.96547707238299e-05, "loss": 0.8916, "step": 238 }, { "epoch": 1.51, "grad_norm": 0.10076084733009338, "learning_rate": 9.896432862970892e-05, "loss": 0.8841, "step": 239 }, { "epoch": 1.52, "grad_norm": 0.10353115946054459, "learning_rate": 9.827393590946116e-05, "loss": 0.8723, "step": 240 }, { "epoch": 1.53, "grad_norm": 0.10559534281492233, "learning_rate": 9.75836254763868e-05, "loss": 0.8679, "step": 241 }, { "epoch": 1.53, "grad_norm": 0.10226694494485855, "learning_rate": 9.689343023986302e-05, "loss": 0.8434, "step": 242 }, { "epoch": 1.54, "grad_norm": 0.10569630563259125, "learning_rate": 9.620338310377525e-05, "loss": 0.8768, "step": 243 }, { "epoch": 1.55, "grad_norm": 0.10112857073545456, "learning_rate": 9.551351696494854e-05, "loss": 0.8249, "step": 244 }, { "epoch": 1.55, "grad_norm": 0.10926086455583572, "learning_rate": 9.482386471157904e-05, "loss": 0.8634, "step": 245 }, { "epoch": 1.56, "grad_norm": 0.1026775911450386, "learning_rate": 9.413445922166653e-05, "loss": 0.8679, "step": 246 }, { "epoch": 1.57, "grad_norm": 0.10021597146987915, "learning_rate": 9.344533336144652e-05, "loss": 0.8596, "step": 247 }, { "epoch": 1.57, "grad_norm": 0.1025286465883255, "learning_rate": 9.275651998382377e-05, "loss": 0.8302, "step": 248 }, { "epoch": 1.58, "grad_norm": 0.0997297540307045, "learning_rate": 9.206805192680593e-05, "loss": 0.8818, "step": 249 }, { "epoch": 1.58, "grad_norm": 0.10259710997343063, "learning_rate": 9.137996201193805e-05, "loss": 0.8558, "step": 250 }, { "epoch": 1.59, "grad_norm": 0.1076403483748436, "learning_rate": 9.069228304273802e-05, "loss": 0.8922, "step": 251 }, { "epoch": 1.6, "grad_norm": 0.10561492294073105, "learning_rate": 9.00050478031324e-05, "loss": 0.8593, "step": 252 }, { "epoch": 1.6, "grad_norm": 0.10473283380270004, "learning_rate": 8.931828905589385e-05, "loss": 0.8893, "step": 253 }, { "epoch": 1.61, "grad_norm": 0.11126771569252014, "learning_rate": 8.863203954107902e-05, "loss": 0.8761, "step": 254 }, { "epoch": 1.62, "grad_norm": 0.10318706929683685, "learning_rate": 8.79463319744677e-05, "loss": 0.8688, "step": 255 }, { "epoch": 1.62, "grad_norm": 0.10156065970659256, "learning_rate": 8.726119904600336e-05, "loss": 0.8683, "step": 256 }, { "epoch": 1.63, "grad_norm": 0.0986093133687973, "learning_rate": 8.657667341823448e-05, "loss": 0.8375, "step": 257 }, { "epoch": 1.64, "grad_norm": 0.10292702913284302, "learning_rate": 8.589278772475749e-05, "loss": 0.8623, "step": 258 }, { "epoch": 1.64, "grad_norm": 0.1031164601445198, "learning_rate": 8.520957456866107e-05, "loss": 0.8633, "step": 259 }, { "epoch": 1.65, "grad_norm": 0.0997462198138237, "learning_rate": 8.452706652097186e-05, "loss": 0.8462, "step": 260 }, { "epoch": 1.66, "grad_norm": 0.11357836425304413, "learning_rate": 8.384529611910163e-05, "loss": 0.8316, "step": 261 }, { "epoch": 1.66, "grad_norm": 0.10290369391441345, "learning_rate": 8.316429586529615e-05, "loss": 0.8357, "step": 262 }, { "epoch": 1.67, "grad_norm": 0.10479974746704102, "learning_rate": 8.248409822508561e-05, "loss": 0.8591, "step": 263 }, { "epoch": 1.67, "grad_norm": 0.1035972386598587, "learning_rate": 8.180473562573705e-05, "loss": 0.8351, "step": 264 }, { "epoch": 1.68, "grad_norm": 0.10530169308185577, "learning_rate": 8.112624045470835e-05, "loss": 0.8387, "step": 265 }, { "epoch": 1.69, "grad_norm": 0.09917616844177246, "learning_rate": 8.044864505810414e-05, "loss": 0.8699, "step": 266 }, { "epoch": 1.69, "grad_norm": 0.10452553629875183, "learning_rate": 7.977198173913394e-05, "loss": 0.859, "step": 267 }, { "epoch": 1.7, "grad_norm": 0.10981190949678421, "learning_rate": 7.909628275657198e-05, "loss": 0.8443, "step": 268 }, { "epoch": 1.71, "grad_norm": 0.10094719380140305, "learning_rate": 7.84215803232194e-05, "loss": 0.8461, "step": 269 }, { "epoch": 1.71, "grad_norm": 0.11014903336763382, "learning_rate": 7.774790660436858e-05, "loss": 0.8409, "step": 270 }, { "epoch": 1.72, "grad_norm": 0.10279539972543716, "learning_rate": 7.707529371626965e-05, "loss": 0.8588, "step": 271 }, { "epoch": 1.73, "grad_norm": 0.10260649770498276, "learning_rate": 7.640377372459945e-05, "loss": 0.8702, "step": 272 }, { "epoch": 1.73, "grad_norm": 0.10066790878772736, "learning_rate": 7.573337864293283e-05, "loss": 0.8523, "step": 273 }, { "epoch": 1.74, "grad_norm": 0.10237132757902145, "learning_rate": 7.506414043121647e-05, "loss": 0.8676, "step": 274 }, { "epoch": 1.75, "grad_norm": 0.10095226764678955, "learning_rate": 7.43960909942453e-05, "loss": 0.8802, "step": 275 }, { "epoch": 1.75, "grad_norm": 0.10220375657081604, "learning_rate": 7.372926218014131e-05, "loss": 0.8347, "step": 276 }, { "epoch": 1.76, "grad_norm": 0.09893085062503815, "learning_rate": 7.306368577883547e-05, "loss": 0.8565, "step": 277 }, { "epoch": 1.76, "grad_norm": 0.10145033895969391, "learning_rate": 7.239939352055208e-05, "loss": 0.8565, "step": 278 }, { "epoch": 1.77, "grad_norm": 0.10070981830358505, "learning_rate": 7.173641707429606e-05, "loss": 0.8475, "step": 279 }, { "epoch": 1.78, "grad_norm": 0.10595450550317764, "learning_rate": 7.107478804634325e-05, "loss": 0.8368, "step": 280 }, { "epoch": 1.78, "grad_norm": 0.1085483580827713, "learning_rate": 7.041453797873363e-05, "loss": 0.866, "step": 281 }, { "epoch": 1.79, "grad_norm": 0.1005505919456482, "learning_rate": 6.975569834776758e-05, "loss": 0.8393, "step": 282 }, { "epoch": 1.8, "grad_norm": 0.10384807735681534, "learning_rate": 6.909830056250527e-05, "loss": 0.8571, "step": 283 }, { "epoch": 1.8, "grad_norm": 0.10558132082223892, "learning_rate": 6.844237596326941e-05, "loss": 0.8477, "step": 284 }, { "epoch": 1.81, "grad_norm": 0.10049811750650406, "learning_rate": 6.778795582015097e-05, "loss": 0.8317, "step": 285 }, { "epoch": 1.82, "grad_norm": 0.10088322311639786, "learning_rate": 6.713507133151857e-05, "loss": 0.8602, "step": 286 }, { "epoch": 1.82, "grad_norm": 0.09972478449344635, "learning_rate": 6.648375362253118e-05, "loss": 0.8514, "step": 287 }, { "epoch": 1.83, "grad_norm": 0.10518650710582733, "learning_rate": 6.583403374365405e-05, "loss": 0.8732, "step": 288 }, { "epoch": 1.84, "grad_norm": 0.10432999581098557, "learning_rate": 6.518594266917882e-05, "loss": 0.9014, "step": 289 }, { "epoch": 1.84, "grad_norm": 0.10011251270771027, "learning_rate": 6.453951129574644e-05, "loss": 0.8192, "step": 290 }, { "epoch": 1.85, "grad_norm": 0.09896595031023026, "learning_rate": 6.389477044087452e-05, "loss": 0.8148, "step": 291 }, { "epoch": 1.85, "grad_norm": 0.10062045603990555, "learning_rate": 6.325175084148809e-05, "loss": 0.8303, "step": 292 }, { "epoch": 1.86, "grad_norm": 0.1033136248588562, "learning_rate": 6.261048315245419e-05, "loss": 0.8537, "step": 293 }, { "epoch": 1.87, "grad_norm": 0.10438770055770874, "learning_rate": 6.197099794512056e-05, "loss": 0.8881, "step": 294 }, { "epoch": 1.87, "grad_norm": 0.1036936491727829, "learning_rate": 6.133332570585812e-05, "loss": 0.85, "step": 295 }, { "epoch": 1.88, "grad_norm": 0.10618463903665543, "learning_rate": 6.069749683460765e-05, "loss": 0.8618, "step": 296 }, { "epoch": 1.89, "grad_norm": 0.10110071301460266, "learning_rate": 6.006354164343046e-05, "loss": 0.8632, "step": 297 }, { "epoch": 1.89, "grad_norm": 0.10008033365011215, "learning_rate": 5.943149035506337e-05, "loss": 0.8648, "step": 298 }, { "epoch": 1.9, "grad_norm": 0.09985347092151642, "learning_rate": 5.880137310147782e-05, "loss": 0.893, "step": 299 }, { "epoch": 1.91, "grad_norm": 0.09989628195762634, "learning_rate": 5.817321992244351e-05, "loss": 0.8682, "step": 300 }, { "epoch": 1.91, "grad_norm": 0.10163696855306625, "learning_rate": 5.754706076409613e-05, "loss": 0.8554, "step": 301 }, { "epoch": 1.92, "grad_norm": 0.10319127887487411, "learning_rate": 5.692292547750988e-05, "loss": 0.8603, "step": 302 }, { "epoch": 1.93, "grad_norm": 0.10187959671020508, "learning_rate": 5.630084381727434e-05, "loss": 0.8628, "step": 303 }, { "epoch": 1.93, "grad_norm": 0.09999451041221619, "learning_rate": 5.568084544007588e-05, "loss": 0.8471, "step": 304 }, { "epoch": 1.94, "grad_norm": 0.10175001621246338, "learning_rate": 5.506295990328385e-05, "loss": 0.8371, "step": 305 }, { "epoch": 1.94, "grad_norm": 0.10206517577171326, "learning_rate": 5.444721666354169e-05, "loss": 0.8836, "step": 306 }, { "epoch": 1.95, "grad_norm": 0.10593275725841522, "learning_rate": 5.383364507536229e-05, "loss": 0.8397, "step": 307 }, { "epoch": 1.96, "grad_norm": 0.10138902068138123, "learning_rate": 5.32222743897288e-05, "loss": 0.8749, "step": 308 }, { "epoch": 1.96, "grad_norm": 0.1002379059791565, "learning_rate": 5.261313375270014e-05, "loss": 0.8338, "step": 309 }, { "epoch": 1.97, "grad_norm": 0.11006593704223633, "learning_rate": 5.200625220402139e-05, "loss": 0.8779, "step": 310 } ], "logging_steps": 1, "max_steps": 465, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 155, "total_flos": 6.721047353232458e+18, "train_batch_size": 5, "trial_name": null, "trial_params": null }