{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9921259842519685, "eval_steps": 500, "global_step": 380, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005249343832020997, "grad_norm": 1.1348930782232016, "learning_rate": 1.5000000000000002e-07, "loss": 1.1087, "step": 1 }, { "epoch": 0.010498687664041995, "grad_norm": 1.123696373079589, "learning_rate": 3.0000000000000004e-07, "loss": 1.1356, "step": 2 }, { "epoch": 0.015748031496062992, "grad_norm": 1.0989081863562118, "learning_rate": 4.5e-07, "loss": 1.1158, "step": 3 }, { "epoch": 0.02099737532808399, "grad_norm": 1.0628548113414964, "learning_rate": 6.000000000000001e-07, "loss": 1.0986, "step": 4 }, { "epoch": 0.026246719160104987, "grad_norm": 1.0629069543612368, "learning_rate": 7.5e-07, "loss": 1.0727, "step": 5 }, { "epoch": 0.031496062992125984, "grad_norm": 1.1219311917213644, "learning_rate": 9e-07, "loss": 1.1513, "step": 6 }, { "epoch": 0.03674540682414698, "grad_norm": 1.068318638334139, "learning_rate": 1.05e-06, "loss": 1.0978, "step": 7 }, { "epoch": 0.04199475065616798, "grad_norm": 1.0335025624008565, "learning_rate": 1.2000000000000002e-06, "loss": 1.0932, "step": 8 }, { "epoch": 0.047244094488188976, "grad_norm": 0.9514112971268772, "learning_rate": 1.35e-06, "loss": 1.1046, "step": 9 }, { "epoch": 0.05249343832020997, "grad_norm": 0.8944230714776324, "learning_rate": 1.5e-06, "loss": 1.0638, "step": 10 }, { "epoch": 0.05774278215223097, "grad_norm": 0.8720343077794245, "learning_rate": 1.65e-06, "loss": 1.1132, "step": 11 }, { "epoch": 0.06299212598425197, "grad_norm": 0.7519518665820406, "learning_rate": 1.8e-06, "loss": 1.0788, "step": 12 }, { "epoch": 0.06824146981627296, "grad_norm": 0.7768466543241798, "learning_rate": 1.95e-06, "loss": 1.0795, "step": 13 }, { "epoch": 0.07349081364829396, "grad_norm": 0.7109922479048013, "learning_rate": 2.1e-06, "loss": 1.1012, "step": 14 }, { "epoch": 0.07874015748031496, "grad_norm": 0.6312078880187205, "learning_rate": 2.25e-06, "loss": 1.0851, "step": 15 }, { "epoch": 0.08398950131233596, "grad_norm": 0.5514473048370377, "learning_rate": 2.4000000000000003e-06, "loss": 1.1041, "step": 16 }, { "epoch": 0.08923884514435695, "grad_norm": 0.6271281070432462, "learning_rate": 2.55e-06, "loss": 1.0855, "step": 17 }, { "epoch": 0.09448818897637795, "grad_norm": 0.7059888078645049, "learning_rate": 2.7e-06, "loss": 1.0473, "step": 18 }, { "epoch": 0.09973753280839895, "grad_norm": 0.7226157330393405, "learning_rate": 2.85e-06, "loss": 1.0665, "step": 19 }, { "epoch": 0.10498687664041995, "grad_norm": 0.7244742832208652, "learning_rate": 3e-06, "loss": 1.0604, "step": 20 }, { "epoch": 0.11023622047244094, "grad_norm": 0.7088251146482789, "learning_rate": 3.1500000000000003e-06, "loss": 1.0516, "step": 21 }, { "epoch": 0.11548556430446194, "grad_norm": 0.5987242362229293, "learning_rate": 3.3e-06, "loss": 1.084, "step": 22 }, { "epoch": 0.12073490813648294, "grad_norm": 0.5730637810768702, "learning_rate": 3.45e-06, "loss": 1.0621, "step": 23 }, { "epoch": 0.12598425196850394, "grad_norm": 0.5894968443138215, "learning_rate": 3.6e-06, "loss": 1.0797, "step": 24 }, { "epoch": 0.13123359580052493, "grad_norm": 0.5798124303184627, "learning_rate": 3.75e-06, "loss": 1.0035, "step": 25 }, { "epoch": 0.13648293963254593, "grad_norm": 0.643205751513686, "learning_rate": 3.9e-06, "loss": 1.0455, "step": 26 }, { "epoch": 0.14173228346456693, "grad_norm": 0.5621970774702022, "learning_rate": 4.05e-06, "loss": 1.0576, "step": 27 }, { "epoch": 0.14698162729658792, "grad_norm": 0.5506084571895594, "learning_rate": 4.2e-06, "loss": 1.0298, "step": 28 }, { "epoch": 0.15223097112860892, "grad_norm": 0.48741149421912777, "learning_rate": 4.35e-06, "loss": 1.0018, "step": 29 }, { "epoch": 0.15748031496062992, "grad_norm": 0.46403007703544275, "learning_rate": 4.5e-06, "loss": 0.9872, "step": 30 }, { "epoch": 0.16272965879265092, "grad_norm": 0.4754381818573106, "learning_rate": 4.65e-06, "loss": 1.0271, "step": 31 }, { "epoch": 0.1679790026246719, "grad_norm": 0.9362850890979981, "learning_rate": 4.800000000000001e-06, "loss": 1.0437, "step": 32 }, { "epoch": 0.1732283464566929, "grad_norm": 0.47391181595772164, "learning_rate": 4.95e-06, "loss": 1.0437, "step": 33 }, { "epoch": 0.1784776902887139, "grad_norm": 0.5276920454851337, "learning_rate": 5.1e-06, "loss": 1.0557, "step": 34 }, { "epoch": 0.1837270341207349, "grad_norm": 0.4616075133913133, "learning_rate": 5.2500000000000006e-06, "loss": 1.0465, "step": 35 }, { "epoch": 0.1889763779527559, "grad_norm": 0.4555174555636226, "learning_rate": 5.4e-06, "loss": 1.0588, "step": 36 }, { "epoch": 0.1942257217847769, "grad_norm": 0.5071864534648831, "learning_rate": 5.55e-06, "loss": 1.044, "step": 37 }, { "epoch": 0.1994750656167979, "grad_norm": 0.4851367263882934, "learning_rate": 5.7e-06, "loss": 1.0464, "step": 38 }, { "epoch": 0.2047244094488189, "grad_norm": 0.44188022228811896, "learning_rate": 5.85e-06, "loss": 1.0182, "step": 39 }, { "epoch": 0.2099737532808399, "grad_norm": 0.43420740120454643, "learning_rate": 6e-06, "loss": 1.0188, "step": 40 }, { "epoch": 0.2152230971128609, "grad_norm": 0.4291543441241407, "learning_rate": 5.9998719351101036e-06, "loss": 1.0245, "step": 41 }, { "epoch": 0.2204724409448819, "grad_norm": 0.43326370236005163, "learning_rate": 5.999487751374158e-06, "loss": 1.0238, "step": 42 }, { "epoch": 0.22572178477690288, "grad_norm": 0.427571644972227, "learning_rate": 5.998847481592462e-06, "loss": 1.0311, "step": 43 }, { "epoch": 0.23097112860892388, "grad_norm": 0.4215063088273006, "learning_rate": 5.997951180429069e-06, "loss": 0.9925, "step": 44 }, { "epoch": 0.23622047244094488, "grad_norm": 0.4206536914503675, "learning_rate": 5.996798924407118e-06, "loss": 1.003, "step": 45 }, { "epoch": 0.24146981627296588, "grad_norm": 0.40910969064965136, "learning_rate": 5.995390811902302e-06, "loss": 0.9949, "step": 46 }, { "epoch": 0.24671916010498687, "grad_norm": 0.4165775049327623, "learning_rate": 5.993726963134471e-06, "loss": 0.9734, "step": 47 }, { "epoch": 0.25196850393700787, "grad_norm": 0.3832235501001726, "learning_rate": 5.9918075201573645e-06, "loss": 0.9485, "step": 48 }, { "epoch": 0.2572178477690289, "grad_norm": 0.37002495168808525, "learning_rate": 5.9896326468464835e-06, "loss": 0.9358, "step": 49 }, { "epoch": 0.26246719160104987, "grad_norm": 0.44836853406053057, "learning_rate": 5.987202528885104e-06, "loss": 0.9982, "step": 50 }, { "epoch": 0.2677165354330709, "grad_norm": 0.4080608606117312, "learning_rate": 5.984517373748417e-06, "loss": 1.0129, "step": 51 }, { "epoch": 0.27296587926509186, "grad_norm": 0.4001550595702573, "learning_rate": 5.981577410685822e-06, "loss": 0.9788, "step": 52 }, { "epoch": 0.2782152230971129, "grad_norm": 0.41021488877460305, "learning_rate": 5.978382890701347e-06, "loss": 1.0262, "step": 53 }, { "epoch": 0.28346456692913385, "grad_norm": 0.39997016380492506, "learning_rate": 5.9749340865322284e-06, "loss": 1.0275, "step": 54 }, { "epoch": 0.2887139107611549, "grad_norm": 0.3839823787027912, "learning_rate": 5.971231292625615e-06, "loss": 0.9374, "step": 55 }, { "epoch": 0.29396325459317585, "grad_norm": 0.4125068495663659, "learning_rate": 5.967274825113438e-06, "loss": 0.9954, "step": 56 }, { "epoch": 0.2992125984251969, "grad_norm": 0.3908377197765856, "learning_rate": 5.963065021785414e-06, "loss": 0.9671, "step": 57 }, { "epoch": 0.30446194225721784, "grad_norm": 0.3850488592862481, "learning_rate": 5.958602242060207e-06, "loss": 0.9657, "step": 58 }, { "epoch": 0.30971128608923887, "grad_norm": 0.3877990366088493, "learning_rate": 5.95388686695475e-06, "loss": 0.9678, "step": 59 }, { "epoch": 0.31496062992125984, "grad_norm": 0.40470471194287355, "learning_rate": 5.948919299051706e-06, "loss": 1.0149, "step": 60 }, { "epoch": 0.32020997375328086, "grad_norm": 0.42889495063392963, "learning_rate": 5.943699962465096e-06, "loss": 1.033, "step": 61 }, { "epoch": 0.32545931758530183, "grad_norm": 0.39164358737100274, "learning_rate": 5.9382293028040985e-06, "loss": 0.9761, "step": 62 }, { "epoch": 0.33070866141732286, "grad_norm": 0.3869342590567232, "learning_rate": 5.9325077871349975e-06, "loss": 0.9982, "step": 63 }, { "epoch": 0.3359580052493438, "grad_norm": 0.39264627926569035, "learning_rate": 5.9265359039413105e-06, "loss": 0.9667, "step": 64 }, { "epoch": 0.34120734908136485, "grad_norm": 0.3887717698297268, "learning_rate": 5.920314163082079e-06, "loss": 0.9806, "step": 65 }, { "epoch": 0.3464566929133858, "grad_norm": 0.40896336915084297, "learning_rate": 5.913843095748342e-06, "loss": 1.0135, "step": 66 }, { "epoch": 0.35170603674540685, "grad_norm": 0.3610209560875707, "learning_rate": 5.907123254417783e-06, "loss": 0.956, "step": 67 }, { "epoch": 0.3569553805774278, "grad_norm": 0.38154744815823505, "learning_rate": 5.9001552128075625e-06, "loss": 1.0045, "step": 68 }, { "epoch": 0.36220472440944884, "grad_norm": 0.4094826396119445, "learning_rate": 5.892939565825335e-06, "loss": 1.0069, "step": 69 }, { "epoch": 0.3674540682414698, "grad_norm": 0.39129138622932325, "learning_rate": 5.885476929518457e-06, "loss": 0.9525, "step": 70 }, { "epoch": 0.37270341207349084, "grad_norm": 0.3712890701175899, "learning_rate": 5.8777679410213956e-06, "loss": 0.9792, "step": 71 }, { "epoch": 0.3779527559055118, "grad_norm": 0.4086264062600148, "learning_rate": 5.869813258501323e-06, "loss": 0.9926, "step": 72 }, { "epoch": 0.38320209973753283, "grad_norm": 0.368975878599487, "learning_rate": 5.861613561101934e-06, "loss": 0.9643, "step": 73 }, { "epoch": 0.3884514435695538, "grad_norm": 0.36792811629461203, "learning_rate": 5.853169548885461e-06, "loss": 0.9867, "step": 74 }, { "epoch": 0.3937007874015748, "grad_norm": 0.3566251893981936, "learning_rate": 5.844481942772898e-06, "loss": 1.0069, "step": 75 }, { "epoch": 0.3989501312335958, "grad_norm": 0.4578529359685586, "learning_rate": 5.835551484482459e-06, "loss": 1.0173, "step": 76 }, { "epoch": 0.4041994750656168, "grad_norm": 0.3935925285922137, "learning_rate": 5.826378936466249e-06, "loss": 0.9743, "step": 77 }, { "epoch": 0.4094488188976378, "grad_norm": 0.4109939217838428, "learning_rate": 5.81696508184517e-06, "loss": 0.9866, "step": 78 }, { "epoch": 0.4146981627296588, "grad_norm": 0.3839870332489822, "learning_rate": 5.807310724342058e-06, "loss": 0.9516, "step": 79 }, { "epoch": 0.4199475065616798, "grad_norm": 0.3774576797883406, "learning_rate": 5.797416688213067e-06, "loss": 0.9895, "step": 80 }, { "epoch": 0.4251968503937008, "grad_norm": 0.3817468964498129, "learning_rate": 5.787283818177297e-06, "loss": 0.9632, "step": 81 }, { "epoch": 0.4304461942257218, "grad_norm": 0.60843002346461, "learning_rate": 5.776912979344669e-06, "loss": 1.0166, "step": 82 }, { "epoch": 0.4356955380577428, "grad_norm": 0.3858713700245362, "learning_rate": 5.766305057142073e-06, "loss": 0.9976, "step": 83 }, { "epoch": 0.4409448818897638, "grad_norm": 0.3724153436541016, "learning_rate": 5.755460957237769e-06, "loss": 0.9645, "step": 84 }, { "epoch": 0.4461942257217848, "grad_norm": 0.38201105695018567, "learning_rate": 5.744381605464064e-06, "loss": 0.9899, "step": 85 }, { "epoch": 0.45144356955380577, "grad_norm": 0.38383930861007165, "learning_rate": 5.7330679477382655e-06, "loss": 0.9919, "step": 86 }, { "epoch": 0.4566929133858268, "grad_norm": 0.4078870418259581, "learning_rate": 5.7215209499819296e-06, "loss": 0.9797, "step": 87 }, { "epoch": 0.46194225721784776, "grad_norm": 0.38463767466523974, "learning_rate": 5.709741598038387e-06, "loss": 0.9597, "step": 88 }, { "epoch": 0.4671916010498688, "grad_norm": 0.36309855116472584, "learning_rate": 5.697730897588577e-06, "loss": 0.9737, "step": 89 }, { "epoch": 0.47244094488188976, "grad_norm": 0.4106701446638758, "learning_rate": 5.685489874065187e-06, "loss": 0.9683, "step": 90 }, { "epoch": 0.4776902887139108, "grad_norm": 0.37110409255145443, "learning_rate": 5.673019572565103e-06, "loss": 1.0418, "step": 91 }, { "epoch": 0.48293963254593175, "grad_norm": 0.3558357783330656, "learning_rate": 5.660321057760186e-06, "loss": 1.0055, "step": 92 }, { "epoch": 0.4881889763779528, "grad_norm": 0.40499489938404787, "learning_rate": 5.6473954138063674e-06, "loss": 1.0113, "step": 93 }, { "epoch": 0.49343832020997375, "grad_norm": 0.39428526462199764, "learning_rate": 5.634243744251094e-06, "loss": 0.9875, "step": 94 }, { "epoch": 0.49868766404199477, "grad_norm": 0.3711741011240413, "learning_rate": 5.620867171939109e-06, "loss": 0.9749, "step": 95 }, { "epoch": 0.5039370078740157, "grad_norm": 0.3961340085644134, "learning_rate": 5.607266838916585e-06, "loss": 0.982, "step": 96 }, { "epoch": 0.5091863517060368, "grad_norm": 0.3784646685814138, "learning_rate": 5.593443906333624e-06, "loss": 0.9957, "step": 97 }, { "epoch": 0.5144356955380578, "grad_norm": 0.3750460397069026, "learning_rate": 5.579399554345118e-06, "loss": 0.9755, "step": 98 }, { "epoch": 0.5196850393700787, "grad_norm": 0.3746718538274792, "learning_rate": 5.565134982009994e-06, "loss": 0.9736, "step": 99 }, { "epoch": 0.5249343832020997, "grad_norm": 0.38418890409196027, "learning_rate": 5.550651407188843e-06, "loss": 0.9506, "step": 100 }, { "epoch": 0.5301837270341208, "grad_norm": 0.422976375435725, "learning_rate": 5.535950066439941e-06, "loss": 1.0141, "step": 101 }, { "epoch": 0.5354330708661418, "grad_norm": 0.38354451243133536, "learning_rate": 5.521032214913679e-06, "loss": 0.9618, "step": 102 }, { "epoch": 0.5406824146981627, "grad_norm": 0.38257660011773076, "learning_rate": 5.505899126245397e-06, "loss": 0.939, "step": 103 }, { "epoch": 0.5459317585301837, "grad_norm": 0.3768438915225408, "learning_rate": 5.490552092446652e-06, "loss": 0.9675, "step": 104 }, { "epoch": 0.5511811023622047, "grad_norm": 0.3749655286727107, "learning_rate": 5.474992423794907e-06, "loss": 0.9592, "step": 105 }, { "epoch": 0.5564304461942258, "grad_norm": 0.38461916993489687, "learning_rate": 5.459221448721664e-06, "loss": 0.9623, "step": 106 }, { "epoch": 0.5616797900262467, "grad_norm": 0.35648642966931204, "learning_rate": 5.443240513699045e-06, "loss": 0.985, "step": 107 }, { "epoch": 0.5669291338582677, "grad_norm": 0.4051560712719681, "learning_rate": 5.427050983124842e-06, "loss": 0.9407, "step": 108 }, { "epoch": 0.5721784776902887, "grad_norm": 0.3769879713701903, "learning_rate": 5.410654239206021e-06, "loss": 0.968, "step": 109 }, { "epoch": 0.5774278215223098, "grad_norm": 0.3746822083724367, "learning_rate": 5.394051681840719e-06, "loss": 0.9497, "step": 110 }, { "epoch": 0.5826771653543307, "grad_norm": 0.3987231911136733, "learning_rate": 5.3772447284987216e-06, "loss": 0.961, "step": 111 }, { "epoch": 0.5879265091863517, "grad_norm": 0.37848222525971176, "learning_rate": 5.36023481410045e-06, "loss": 0.9707, "step": 112 }, { "epoch": 0.5931758530183727, "grad_norm": 0.3794904855253974, "learning_rate": 5.343023390894446e-06, "loss": 0.9714, "step": 113 }, { "epoch": 0.5984251968503937, "grad_norm": 0.37452267525256994, "learning_rate": 5.325611928333389e-06, "loss": 0.9406, "step": 114 }, { "epoch": 0.6036745406824147, "grad_norm": 0.39474437059829304, "learning_rate": 5.308001912948637e-06, "loss": 0.9626, "step": 115 }, { "epoch": 0.6089238845144357, "grad_norm": 0.4023921986663554, "learning_rate": 5.290194848223309e-06, "loss": 0.9889, "step": 116 }, { "epoch": 0.6141732283464567, "grad_norm": 0.39963771712171875, "learning_rate": 5.272192254463929e-06, "loss": 0.9639, "step": 117 }, { "epoch": 0.6194225721784777, "grad_norm": 0.3893586064595733, "learning_rate": 5.2539956686706205e-06, "loss": 0.9469, "step": 118 }, { "epoch": 0.6246719160104987, "grad_norm": 0.4651495625439333, "learning_rate": 5.2356066444058875e-06, "loss": 0.9658, "step": 119 }, { "epoch": 0.6299212598425197, "grad_norm": 0.39599728107932586, "learning_rate": 5.217026751661978e-06, "loss": 1.0137, "step": 120 }, { "epoch": 0.6351706036745407, "grad_norm": 0.406988761369817, "learning_rate": 5.198257576726835e-06, "loss": 0.9306, "step": 121 }, { "epoch": 0.6404199475065617, "grad_norm": 0.3611939094322339, "learning_rate": 5.179300722048673e-06, "loss": 0.9462, "step": 122 }, { "epoch": 0.6456692913385826, "grad_norm": 0.3809841775392484, "learning_rate": 5.1601578060991645e-06, "loss": 0.953, "step": 123 }, { "epoch": 0.6509186351706037, "grad_norm": 0.46022843064705843, "learning_rate": 5.1408304632352575e-06, "loss": 0.9422, "step": 124 }, { "epoch": 0.6561679790026247, "grad_norm": 0.3979704646560941, "learning_rate": 5.1213203435596425e-06, "loss": 0.9751, "step": 125 }, { "epoch": 0.6614173228346457, "grad_norm": 0.39388496260457084, "learning_rate": 5.101629112779873e-06, "loss": 0.9722, "step": 126 }, { "epoch": 0.6666666666666666, "grad_norm": 0.3899148438115094, "learning_rate": 5.08175845206615e-06, "loss": 0.9652, "step": 127 }, { "epoch": 0.6719160104986877, "grad_norm": 0.37391882787694275, "learning_rate": 5.061710057907788e-06, "loss": 0.9621, "step": 128 }, { "epoch": 0.6771653543307087, "grad_norm": 0.39500875865406576, "learning_rate": 5.041485641968385e-06, "loss": 0.9899, "step": 129 }, { "epoch": 0.6824146981627297, "grad_norm": 0.37540362490802714, "learning_rate": 5.021086930939672e-06, "loss": 0.9472, "step": 130 }, { "epoch": 0.6876640419947506, "grad_norm": 0.3940788728379769, "learning_rate": 5.000515666394105e-06, "loss": 0.9479, "step": 131 }, { "epoch": 0.6929133858267716, "grad_norm": 0.3919125365655477, "learning_rate": 4.979773604636169e-06, "loss": 0.9624, "step": 132 }, { "epoch": 0.6981627296587927, "grad_norm": 0.3804552314744538, "learning_rate": 4.958862516552433e-06, "loss": 0.9806, "step": 133 }, { "epoch": 0.7034120734908137, "grad_norm": 0.3674434286105591, "learning_rate": 4.937784187460362e-06, "loss": 0.9511, "step": 134 }, { "epoch": 0.7086614173228346, "grad_norm": 0.4109777494732396, "learning_rate": 4.916540416955884e-06, "loss": 0.9943, "step": 135 }, { "epoch": 0.7139107611548556, "grad_norm": 0.40231567788837497, "learning_rate": 4.895133018759753e-06, "loss": 0.9798, "step": 136 }, { "epoch": 0.7191601049868767, "grad_norm": 0.3721834479908975, "learning_rate": 4.873563820562698e-06, "loss": 0.9504, "step": 137 }, { "epoch": 0.7244094488188977, "grad_norm": 0.36127526200518306, "learning_rate": 4.851834663869379e-06, "loss": 0.9517, "step": 138 }, { "epoch": 0.7296587926509186, "grad_norm": 0.3513827139135777, "learning_rate": 4.82994740384117e-06, "loss": 0.9835, "step": 139 }, { "epoch": 0.7349081364829396, "grad_norm": 0.36760728272750326, "learning_rate": 4.80790390913777e-06, "loss": 0.9503, "step": 140 }, { "epoch": 0.7401574803149606, "grad_norm": 0.36275280721999276, "learning_rate": 4.785706061757656e-06, "loss": 0.9743, "step": 141 }, { "epoch": 0.7454068241469817, "grad_norm": 0.3733380512329921, "learning_rate": 4.763355756877419e-06, "loss": 0.9384, "step": 142 }, { "epoch": 0.7506561679790026, "grad_norm": 0.3801691027568987, "learning_rate": 4.740854902689947e-06, "loss": 0.9296, "step": 143 }, { "epoch": 0.7559055118110236, "grad_norm": 0.39053906811778566, "learning_rate": 4.718205420241516e-06, "loss": 0.9488, "step": 144 }, { "epoch": 0.7611548556430446, "grad_norm": 0.3923993707534958, "learning_rate": 4.695409243267776e-06, "loss": 0.9383, "step": 145 }, { "epoch": 0.7664041994750657, "grad_norm": 0.364792552828712, "learning_rate": 4.672468318028657e-06, "loss": 0.9193, "step": 146 }, { "epoch": 0.7716535433070866, "grad_norm": 0.35070825551906964, "learning_rate": 4.649384603142202e-06, "loss": 0.9164, "step": 147 }, { "epoch": 0.7769028871391076, "grad_norm": 0.37099778180795795, "learning_rate": 4.626160069417348e-06, "loss": 0.9425, "step": 148 }, { "epoch": 0.7821522309711286, "grad_norm": 0.36954118968922517, "learning_rate": 4.602796699685665e-06, "loss": 0.9265, "step": 149 }, { "epoch": 0.7874015748031497, "grad_norm": 0.4076466706382121, "learning_rate": 4.579296488632067e-06, "loss": 1.0133, "step": 150 }, { "epoch": 0.7926509186351706, "grad_norm": 0.4015334925568992, "learning_rate": 4.5556614426245165e-06, "loss": 0.9486, "step": 151 }, { "epoch": 0.7979002624671916, "grad_norm": 0.39628644809730684, "learning_rate": 4.5318935795427206e-06, "loss": 0.9605, "step": 152 }, { "epoch": 0.8031496062992126, "grad_norm": 0.36792154742540445, "learning_rate": 4.507994928605862e-06, "loss": 0.9287, "step": 153 }, { "epoch": 0.8083989501312336, "grad_norm": 0.3887839296706913, "learning_rate": 4.483967530199337e-06, "loss": 0.951, "step": 154 }, { "epoch": 0.8136482939632546, "grad_norm": 0.36716852968968616, "learning_rate": 4.459813435700569e-06, "loss": 0.9702, "step": 155 }, { "epoch": 0.8188976377952756, "grad_norm": 0.3533521076976156, "learning_rate": 4.4355347073038595e-06, "loss": 0.9612, "step": 156 }, { "epoch": 0.8241469816272966, "grad_norm": 0.3499649930079787, "learning_rate": 4.411133417844328e-06, "loss": 0.9599, "step": 157 }, { "epoch": 0.8293963254593176, "grad_norm": 0.38582146832565867, "learning_rate": 4.38661165062094e-06, "loss": 0.9894, "step": 158 }, { "epoch": 0.8346456692913385, "grad_norm": 0.39040836855795735, "learning_rate": 4.36197149921864e-06, "loss": 0.9747, "step": 159 }, { "epoch": 0.8398950131233596, "grad_norm": 0.3798580758700489, "learning_rate": 4.3372150673296155e-06, "loss": 0.9654, "step": 160 }, { "epoch": 0.8451443569553806, "grad_norm": 0.3764456540061034, "learning_rate": 4.3123444685736795e-06, "loss": 0.9823, "step": 161 }, { "epoch": 0.8503937007874016, "grad_norm": 0.3771195417830333, "learning_rate": 4.287361826317827e-06, "loss": 0.9456, "step": 162 }, { "epoch": 0.8556430446194225, "grad_norm": 0.37650137746409273, "learning_rate": 4.262269273494946e-06, "loss": 1.0022, "step": 163 }, { "epoch": 0.8608923884514436, "grad_norm": 0.38148353077474145, "learning_rate": 4.237068952421711e-06, "loss": 0.964, "step": 164 }, { "epoch": 0.8661417322834646, "grad_norm": 0.3982519128695332, "learning_rate": 4.2117630146156845e-06, "loss": 0.9673, "step": 165 }, { "epoch": 0.8713910761154856, "grad_norm": 0.36000775624632003, "learning_rate": 4.186353620611627e-06, "loss": 0.9359, "step": 166 }, { "epoch": 0.8766404199475065, "grad_norm": 0.36850454735662447, "learning_rate": 4.160842939777036e-06, "loss": 0.9422, "step": 167 }, { "epoch": 0.8818897637795275, "grad_norm": 0.37804115639757085, "learning_rate": 4.135233150126931e-06, "loss": 0.9454, "step": 168 }, { "epoch": 0.8871391076115486, "grad_norm": 0.3689383402086321, "learning_rate": 4.109526438137908e-06, "loss": 0.9455, "step": 169 }, { "epoch": 0.8923884514435696, "grad_norm": 0.46527154775209717, "learning_rate": 4.08372499856146e-06, "loss": 0.9386, "step": 170 }, { "epoch": 0.8976377952755905, "grad_norm": 0.45653306710128705, "learning_rate": 4.0578310342365975e-06, "loss": 0.9616, "step": 171 }, { "epoch": 0.9028871391076115, "grad_norm": 0.3773630567359451, "learning_rate": 4.031846755901785e-06, "loss": 0.9285, "step": 172 }, { "epoch": 0.9081364829396326, "grad_norm": 0.3644595191521506, "learning_rate": 4.005774382006182e-06, "loss": 0.9663, "step": 173 }, { "epoch": 0.9133858267716536, "grad_norm": 0.3539767481135477, "learning_rate": 3.97961613852025e-06, "loss": 0.9564, "step": 174 }, { "epoch": 0.9186351706036745, "grad_norm": 0.3819676152776953, "learning_rate": 3.953374258745705e-06, "loss": 0.9607, "step": 175 }, { "epoch": 0.9238845144356955, "grad_norm": 0.38397675786726637, "learning_rate": 3.927050983124842e-06, "loss": 0.9539, "step": 176 }, { "epoch": 0.9291338582677166, "grad_norm": 0.3979084367711538, "learning_rate": 3.900648559049258e-06, "loss": 0.9505, "step": 177 }, { "epoch": 0.9343832020997376, "grad_norm": 0.3756154385935223, "learning_rate": 3.874169240667974e-06, "loss": 0.9519, "step": 178 }, { "epoch": 0.9396325459317585, "grad_norm": 0.40551973597201274, "learning_rate": 3.847615288694985e-06, "loss": 0.9727, "step": 179 }, { "epoch": 0.9448818897637795, "grad_norm": 0.4149625851710124, "learning_rate": 3.820988970216249e-06, "loss": 0.9464, "step": 180 }, { "epoch": 0.9501312335958005, "grad_norm": 0.35739115830542967, "learning_rate": 3.7942925584961272e-06, "loss": 0.9427, "step": 181 }, { "epoch": 0.9553805774278216, "grad_norm": 0.3759540038847051, "learning_rate": 3.767528332783307e-06, "loss": 0.9679, "step": 182 }, { "epoch": 0.9606299212598425, "grad_norm": 0.3525867658299593, "learning_rate": 3.740698578116199e-06, "loss": 0.9183, "step": 183 }, { "epoch": 0.9658792650918635, "grad_norm": 0.3557123352774738, "learning_rate": 3.7138055851278564e-06, "loss": 0.9383, "step": 184 }, { "epoch": 0.9711286089238845, "grad_norm": 0.3623514252763418, "learning_rate": 3.6868516498504025e-06, "loss": 0.9246, "step": 185 }, { "epoch": 0.9763779527559056, "grad_norm": 0.38495496418054853, "learning_rate": 3.6598390735190066e-06, "loss": 0.9612, "step": 186 }, { "epoch": 0.9816272965879265, "grad_norm": 0.3648599004428126, "learning_rate": 3.63277016237541e-06, "loss": 0.9293, "step": 187 }, { "epoch": 0.9868766404199475, "grad_norm": 0.38871547084803876, "learning_rate": 3.6056472274710305e-06, "loss": 0.9973, "step": 188 }, { "epoch": 0.9921259842519685, "grad_norm": 0.38590844403642666, "learning_rate": 3.578472584469651e-06, "loss": 0.9457, "step": 189 }, { "epoch": 0.9973753280839895, "grad_norm": 0.3872507088649178, "learning_rate": 3.5512485534497116e-06, "loss": 0.9462, "step": 190 }, { "epoch": 1.0, "grad_norm": 0.3872507088649178, "learning_rate": 3.523977458706237e-06, "loss": 0.9693, "step": 191 }, { "epoch": 1.005249343832021, "grad_norm": 0.6232728744646114, "learning_rate": 3.49666162855239e-06, "loss": 0.887, "step": 192 }, { "epoch": 1.010498687664042, "grad_norm": 0.4149641950734625, "learning_rate": 3.469303395120693e-06, "loss": 0.8826, "step": 193 }, { "epoch": 1.015748031496063, "grad_norm": 0.37273340109017755, "learning_rate": 3.441905094163913e-06, "loss": 0.8893, "step": 194 }, { "epoch": 1.020997375328084, "grad_norm": 0.4113832689982837, "learning_rate": 3.414469064855647e-06, "loss": 0.9205, "step": 195 }, { "epoch": 1.026246719160105, "grad_norm": 0.49485155842511663, "learning_rate": 3.3869976495906104e-06, "loss": 0.9074, "step": 196 }, { "epoch": 1.031496062992126, "grad_norm": 0.3736781934252868, "learning_rate": 3.3594931937846498e-06, "loss": 0.8966, "step": 197 }, { "epoch": 1.036745406824147, "grad_norm": 0.3758650059773124, "learning_rate": 3.3319580456745023e-06, "loss": 0.8759, "step": 198 }, { "epoch": 1.041994750656168, "grad_norm": 0.4056031624712629, "learning_rate": 3.3043945561173092e-06, "loss": 0.8788, "step": 199 }, { "epoch": 1.047244094488189, "grad_norm": 0.36344982085137467, "learning_rate": 3.2768050783899063e-06, "loss": 0.873, "step": 200 }, { "epoch": 1.05249343832021, "grad_norm": 0.3760103676246, "learning_rate": 3.249191967987912e-06, "loss": 0.899, "step": 201 }, { "epoch": 1.057742782152231, "grad_norm": 0.39433477834527153, "learning_rate": 3.221557582424622e-06, "loss": 0.9019, "step": 202 }, { "epoch": 1.0629921259842519, "grad_norm": 0.3595753440791428, "learning_rate": 3.1939042810297328e-06, "loss": 0.8781, "step": 203 }, { "epoch": 1.068241469816273, "grad_norm": 0.3743448170598354, "learning_rate": 3.16623442474791e-06, "loss": 0.8689, "step": 204 }, { "epoch": 1.073490813648294, "grad_norm": 0.3618551186966609, "learning_rate": 3.138550375937219e-06, "loss": 0.9094, "step": 205 }, { "epoch": 1.078740157480315, "grad_norm": 0.36577516842050983, "learning_rate": 3.1108544981674356e-06, "loss": 0.8668, "step": 206 }, { "epoch": 1.083989501312336, "grad_norm": 0.3985134455319658, "learning_rate": 3.0831491560182495e-06, "loss": 0.9016, "step": 207 }, { "epoch": 1.0892388451443569, "grad_norm": 0.37808489525197075, "learning_rate": 3.0554367148773897e-06, "loss": 0.895, "step": 208 }, { "epoch": 1.094488188976378, "grad_norm": 0.4112784941005797, "learning_rate": 3.027719540738673e-06, "loss": 0.859, "step": 209 }, { "epoch": 1.099737532808399, "grad_norm": 0.3830296759827936, "learning_rate": 3e-06, "loss": 0.8569, "step": 210 }, { "epoch": 1.10498687664042, "grad_norm": 0.3930755503999148, "learning_rate": 2.972280459261328e-06, "loss": 0.8774, "step": 211 }, { "epoch": 1.110236220472441, "grad_norm": 0.36738851637178116, "learning_rate": 2.944563285122611e-06, "loss": 0.9086, "step": 212 }, { "epoch": 1.1154855643044619, "grad_norm": 0.3897160841039193, "learning_rate": 2.9168508439817515e-06, "loss": 0.889, "step": 213 }, { "epoch": 1.120734908136483, "grad_norm": 0.39858146379374537, "learning_rate": 2.889145501832566e-06, "loss": 0.8964, "step": 214 }, { "epoch": 1.125984251968504, "grad_norm": 0.3739395525411432, "learning_rate": 2.861449624062782e-06, "loss": 0.8884, "step": 215 }, { "epoch": 1.1312335958005248, "grad_norm": 0.3755768464864809, "learning_rate": 2.83376557525209e-06, "loss": 0.851, "step": 216 }, { "epoch": 1.136482939632546, "grad_norm": 0.38260315757882735, "learning_rate": 2.8060957189702674e-06, "loss": 0.9152, "step": 217 }, { "epoch": 1.141732283464567, "grad_norm": 0.4205379839527009, "learning_rate": 2.7784424175753784e-06, "loss": 0.8683, "step": 218 }, { "epoch": 1.1469816272965878, "grad_norm": 0.38325260941818995, "learning_rate": 2.7508080320120888e-06, "loss": 0.8943, "step": 219 }, { "epoch": 1.152230971128609, "grad_norm": 0.3763198826603672, "learning_rate": 2.7231949216100943e-06, "loss": 0.8676, "step": 220 }, { "epoch": 1.1574803149606299, "grad_norm": 0.3767162287387105, "learning_rate": 2.6956054438826918e-06, "loss": 0.8482, "step": 221 }, { "epoch": 1.162729658792651, "grad_norm": 0.3486273740901837, "learning_rate": 2.668041954325498e-06, "loss": 0.8879, "step": 222 }, { "epoch": 1.167979002624672, "grad_norm": 0.39084218665366566, "learning_rate": 2.640506806215351e-06, "loss": 0.8679, "step": 223 }, { "epoch": 1.1732283464566928, "grad_norm": 0.3538552501730603, "learning_rate": 2.613002350409391e-06, "loss": 0.8871, "step": 224 }, { "epoch": 1.178477690288714, "grad_norm": 0.36544200913577, "learning_rate": 2.585530935144354e-06, "loss": 0.8616, "step": 225 }, { "epoch": 1.1837270341207349, "grad_norm": 0.3985990462573467, "learning_rate": 2.558094905836087e-06, "loss": 0.8917, "step": 226 }, { "epoch": 1.188976377952756, "grad_norm": 0.42608518999556655, "learning_rate": 2.5306966048793067e-06, "loss": 0.8817, "step": 227 }, { "epoch": 1.194225721784777, "grad_norm": 0.37952769789031354, "learning_rate": 2.5033383714476097e-06, "loss": 0.8985, "step": 228 }, { "epoch": 1.1994750656167978, "grad_norm": 0.40804864076806885, "learning_rate": 2.4760225412937633e-06, "loss": 0.9073, "step": 229 }, { "epoch": 1.204724409448819, "grad_norm": 0.4167713152946991, "learning_rate": 2.4487514465502885e-06, "loss": 0.8566, "step": 230 }, { "epoch": 1.20997375328084, "grad_norm": 0.4022153540631621, "learning_rate": 2.42152741553035e-06, "loss": 0.8713, "step": 231 }, { "epoch": 1.2152230971128608, "grad_norm": 0.4222065137992956, "learning_rate": 2.39435277252897e-06, "loss": 0.9035, "step": 232 }, { "epoch": 1.220472440944882, "grad_norm": 0.3666365807384159, "learning_rate": 2.3672298376245908e-06, "loss": 0.8637, "step": 233 }, { "epoch": 1.2257217847769029, "grad_norm": 0.3976853335036615, "learning_rate": 2.3401609264809953e-06, "loss": 0.9398, "step": 234 }, { "epoch": 1.2309711286089238, "grad_norm": 0.37956934109451046, "learning_rate": 2.3131483501495985e-06, "loss": 0.8353, "step": 235 }, { "epoch": 1.236220472440945, "grad_norm": 0.33722056538083744, "learning_rate": 2.2861944148721446e-06, "loss": 0.8786, "step": 236 }, { "epoch": 1.2414698162729658, "grad_norm": 0.49777382093647954, "learning_rate": 2.2593014218838e-06, "loss": 0.8834, "step": 237 }, { "epoch": 1.246719160104987, "grad_norm": 0.35315516410389436, "learning_rate": 2.232471667216693e-06, "loss": 0.8442, "step": 238 }, { "epoch": 1.2519685039370079, "grad_norm": 0.3816124424363711, "learning_rate": 2.2057074415038725e-06, "loss": 0.8573, "step": 239 }, { "epoch": 1.257217847769029, "grad_norm": 0.36319142999803095, "learning_rate": 2.1790110297837514e-06, "loss": 0.8481, "step": 240 }, { "epoch": 1.26246719160105, "grad_norm": 0.34672889281207053, "learning_rate": 2.152384711305015e-06, "loss": 0.8623, "step": 241 }, { "epoch": 1.2677165354330708, "grad_norm": 0.37448151544392105, "learning_rate": 2.1258307593320262e-06, "loss": 0.8751, "step": 242 }, { "epoch": 1.272965879265092, "grad_norm": 0.37082567424502005, "learning_rate": 2.099351440950742e-06, "loss": 0.8914, "step": 243 }, { "epoch": 1.2782152230971129, "grad_norm": 0.39074992783073415, "learning_rate": 2.072949016875158e-06, "loss": 0.9222, "step": 244 }, { "epoch": 1.2834645669291338, "grad_norm": 0.4150437401629804, "learning_rate": 2.046625741254295e-06, "loss": 0.9475, "step": 245 }, { "epoch": 1.288713910761155, "grad_norm": 0.4504166670407193, "learning_rate": 2.0203838614797505e-06, "loss": 0.9026, "step": 246 }, { "epoch": 1.2939632545931758, "grad_norm": 0.38345958484903814, "learning_rate": 1.994225617993819e-06, "loss": 0.9074, "step": 247 }, { "epoch": 1.2992125984251968, "grad_norm": 0.37086048031752866, "learning_rate": 1.9681532440982154e-06, "loss": 0.8755, "step": 248 }, { "epoch": 1.304461942257218, "grad_norm": 0.3775524407980251, "learning_rate": 1.942168965763402e-06, "loss": 0.8986, "step": 249 }, { "epoch": 1.3097112860892388, "grad_norm": 0.364796377340789, "learning_rate": 1.916275001438541e-06, "loss": 0.867, "step": 250 }, { "epoch": 1.3149606299212597, "grad_norm": 0.3705604843330414, "learning_rate": 1.8904735618620928e-06, "loss": 0.8875, "step": 251 }, { "epoch": 1.3202099737532809, "grad_norm": 0.3847344001283667, "learning_rate": 1.8647668498730693e-06, "loss": 0.8678, "step": 252 }, { "epoch": 1.3254593175853018, "grad_norm": 0.3507183610862785, "learning_rate": 1.8391570602229647e-06, "loss": 0.8895, "step": 253 }, { "epoch": 1.330708661417323, "grad_norm": 0.34464955572346173, "learning_rate": 1.8136463793883725e-06, "loss": 0.9112, "step": 254 }, { "epoch": 1.3359580052493438, "grad_norm": 0.3804540728076062, "learning_rate": 1.7882369853843155e-06, "loss": 0.8818, "step": 255 }, { "epoch": 1.341207349081365, "grad_norm": 0.38671544491057547, "learning_rate": 1.76293104757829e-06, "loss": 0.8712, "step": 256 }, { "epoch": 1.3464566929133859, "grad_norm": 0.35028636565033566, "learning_rate": 1.7377307265050559e-06, "loss": 0.8795, "step": 257 }, { "epoch": 1.3517060367454068, "grad_norm": 0.3596694021401425, "learning_rate": 1.7126381736821732e-06, "loss": 0.8791, "step": 258 }, { "epoch": 1.356955380577428, "grad_norm": 0.3833574983214166, "learning_rate": 1.6876555314263213e-06, "loss": 0.9108, "step": 259 }, { "epoch": 1.3622047244094488, "grad_norm": 0.3701840047085969, "learning_rate": 1.6627849326703855e-06, "loss": 0.8695, "step": 260 }, { "epoch": 1.3674540682414698, "grad_norm": 0.36098816535443995, "learning_rate": 1.6380285007813598e-06, "loss": 0.876, "step": 261 }, { "epoch": 1.372703412073491, "grad_norm": 0.3900890284585014, "learning_rate": 1.6133883493790609e-06, "loss": 0.8498, "step": 262 }, { "epoch": 1.3779527559055118, "grad_norm": 0.34906551126755136, "learning_rate": 1.5888665821556724e-06, "loss": 0.8513, "step": 263 }, { "epoch": 1.3832020997375327, "grad_norm": 0.3753732283477496, "learning_rate": 1.5644652926961407e-06, "loss": 0.8714, "step": 264 }, { "epoch": 1.3884514435695539, "grad_norm": 0.34748864593560347, "learning_rate": 1.5401865642994315e-06, "loss": 0.9124, "step": 265 }, { "epoch": 1.3937007874015748, "grad_norm": 0.36698053817770165, "learning_rate": 1.5160324698006642e-06, "loss": 0.8814, "step": 266 }, { "epoch": 1.3989501312335957, "grad_norm": 0.4000964153653425, "learning_rate": 1.4920050713941398e-06, "loss": 0.9082, "step": 267 }, { "epoch": 1.4041994750656168, "grad_norm": 0.3985391177875817, "learning_rate": 1.4681064204572798e-06, "loss": 0.8749, "step": 268 }, { "epoch": 1.4094488188976377, "grad_norm": 0.3578122677174226, "learning_rate": 1.4443385573754837e-06, "loss": 0.8608, "step": 269 }, { "epoch": 1.4146981627296589, "grad_norm": 0.3576093239254431, "learning_rate": 1.4207035113679322e-06, "loss": 0.8798, "step": 270 }, { "epoch": 1.4199475065616798, "grad_norm": 0.35299639204379674, "learning_rate": 1.3972033003143348e-06, "loss": 0.8972, "step": 271 }, { "epoch": 1.425196850393701, "grad_norm": 0.3937775289907907, "learning_rate": 1.3738399305826516e-06, "loss": 0.8736, "step": 272 }, { "epoch": 1.4304461942257218, "grad_norm": 0.3691998032129419, "learning_rate": 1.3506153968577983e-06, "loss": 0.8667, "step": 273 }, { "epoch": 1.4356955380577427, "grad_norm": 0.35764876894907843, "learning_rate": 1.3275316819713435e-06, "loss": 0.882, "step": 274 }, { "epoch": 1.4409448818897639, "grad_norm": 0.3859579688778526, "learning_rate": 1.3045907567322243e-06, "loss": 0.844, "step": 275 }, { "epoch": 1.4461942257217848, "grad_norm": 0.3736621084680505, "learning_rate": 1.2817945797584844e-06, "loss": 0.8525, "step": 276 }, { "epoch": 1.4514435695538057, "grad_norm": 0.36602372507940695, "learning_rate": 1.2591450973100532e-06, "loss": 0.8577, "step": 277 }, { "epoch": 1.4566929133858268, "grad_norm": 0.37926054124030645, "learning_rate": 1.236644243122581e-06, "loss": 0.8837, "step": 278 }, { "epoch": 1.4619422572178478, "grad_norm": 0.3680022216795608, "learning_rate": 1.214293938242344e-06, "loss": 0.8984, "step": 279 }, { "epoch": 1.4671916010498687, "grad_norm": 0.37824901927870175, "learning_rate": 1.1920960908622313e-06, "loss": 0.8745, "step": 280 }, { "epoch": 1.4724409448818898, "grad_norm": 0.3489273490529577, "learning_rate": 1.17005259615883e-06, "loss": 0.8628, "step": 281 }, { "epoch": 1.4776902887139107, "grad_norm": 0.3735770062938505, "learning_rate": 1.1481653361306215e-06, "loss": 0.8619, "step": 282 }, { "epoch": 1.4829396325459316, "grad_norm": 0.3458041443504503, "learning_rate": 1.1264361794373032e-06, "loss": 0.8761, "step": 283 }, { "epoch": 1.4881889763779528, "grad_norm": 0.35998420937846626, "learning_rate": 1.104866981240248e-06, "loss": 0.8844, "step": 284 }, { "epoch": 1.4934383202099737, "grad_norm": 0.4029178073367971, "learning_rate": 1.0834595830441168e-06, "loss": 0.8511, "step": 285 }, { "epoch": 1.4986876640419948, "grad_norm": 0.3763622650814437, "learning_rate": 1.0622158125396387e-06, "loss": 0.8599, "step": 286 }, { "epoch": 1.5039370078740157, "grad_norm": 0.3845965137728459, "learning_rate": 1.0411374834475678e-06, "loss": 0.9062, "step": 287 }, { "epoch": 1.5091863517060369, "grad_norm": 0.34964825506869784, "learning_rate": 1.020226395363833e-06, "loss": 0.8525, "step": 288 }, { "epoch": 1.5144356955380578, "grad_norm": 0.38214019455395715, "learning_rate": 9.994843336058968e-07, "loss": 0.8686, "step": 289 }, { "epoch": 1.5196850393700787, "grad_norm": 0.3808975526218143, "learning_rate": 9.789130690603284e-07, "loss": 0.8537, "step": 290 }, { "epoch": 1.5249343832020998, "grad_norm": 0.3761982373529746, "learning_rate": 9.585143580316153e-07, "loss": 0.8826, "step": 291 }, { "epoch": 1.5301837270341208, "grad_norm": 0.351389916026518, "learning_rate": 9.382899420922119e-07, "loss": 0.8683, "step": 292 }, { "epoch": 1.5354330708661417, "grad_norm": 0.3711139029247798, "learning_rate": 9.182415479338512e-07, "loss": 0.8878, "step": 293 }, { "epoch": 1.5406824146981628, "grad_norm": 0.3717732780588312, "learning_rate": 8.983708872201271e-07, "loss": 0.8585, "step": 294 }, { "epoch": 1.5459317585301837, "grad_norm": 0.3742271193984993, "learning_rate": 8.786796564403577e-07, "loss": 0.8579, "step": 295 }, { "epoch": 1.5511811023622046, "grad_norm": 0.42523434659053005, "learning_rate": 8.591695367647433e-07, "loss": 0.8746, "step": 296 }, { "epoch": 1.5564304461942258, "grad_norm": 0.3794388162880317, "learning_rate": 8.398421939008367e-07, "loss": 0.8479, "step": 297 }, { "epoch": 1.5616797900262467, "grad_norm": 0.3588910082794427, "learning_rate": 8.206992779513281e-07, "loss": 0.8329, "step": 298 }, { "epoch": 1.5669291338582676, "grad_norm": 0.37000389491476643, "learning_rate": 8.017424232731664e-07, "loss": 0.8693, "step": 299 }, { "epoch": 1.5721784776902887, "grad_norm": 0.4003207798760719, "learning_rate": 7.829732483380231e-07, "loss": 0.8886, "step": 300 }, { "epoch": 1.5774278215223099, "grad_norm": 0.37170546863230536, "learning_rate": 7.64393355594112e-07, "loss": 0.9035, "step": 301 }, { "epoch": 1.5826771653543306, "grad_norm": 0.35766484669954807, "learning_rate": 7.4600433132938e-07, "loss": 0.848, "step": 302 }, { "epoch": 1.5879265091863517, "grad_norm": 0.33788894124632585, "learning_rate": 7.278077455360717e-07, "loss": 0.8545, "step": 303 }, { "epoch": 1.5931758530183728, "grad_norm": 0.36604275227388566, "learning_rate": 7.09805151776691e-07, "loss": 0.8415, "step": 304 }, { "epoch": 1.5984251968503937, "grad_norm": 0.33845675028801603, "learning_rate": 6.919980870513633e-07, "loss": 0.8472, "step": 305 }, { "epoch": 1.6036745406824147, "grad_norm": 0.35701657873038517, "learning_rate": 6.743880716666104e-07, "loss": 0.8496, "step": 306 }, { "epoch": 1.6089238845144358, "grad_norm": 0.3779107207471187, "learning_rate": 6.569766091055539e-07, "loss": 0.8495, "step": 307 }, { "epoch": 1.6141732283464567, "grad_norm": 0.3872432739805792, "learning_rate": 6.397651858995504e-07, "loss": 0.851, "step": 308 }, { "epoch": 1.6194225721784776, "grad_norm": 0.3595059488828886, "learning_rate": 6.227552715012785e-07, "loss": 0.8855, "step": 309 }, { "epoch": 1.6246719160104988, "grad_norm": 0.37515207797149636, "learning_rate": 6.059483181592815e-07, "loss": 0.8858, "step": 310 }, { "epoch": 1.6299212598425197, "grad_norm": 0.3629206169447269, "learning_rate": 5.893457607939788e-07, "loss": 0.8807, "step": 311 }, { "epoch": 1.6351706036745406, "grad_norm": 0.3545486266254271, "learning_rate": 5.72949016875158e-07, "loss": 0.8955, "step": 312 }, { "epoch": 1.6404199475065617, "grad_norm": 0.3432909080087375, "learning_rate": 5.56759486300955e-07, "loss": 0.8681, "step": 313 }, { "epoch": 1.6456692913385826, "grad_norm": 0.3639385301913205, "learning_rate": 5.40778551278337e-07, "loss": 0.8733, "step": 314 }, { "epoch": 1.6509186351706036, "grad_norm": 0.38920489291790045, "learning_rate": 5.250075762050935e-07, "loss": 0.8745, "step": 315 }, { "epoch": 1.6561679790026247, "grad_norm": 0.3618641411341515, "learning_rate": 5.094479075533486e-07, "loss": 0.8832, "step": 316 }, { "epoch": 1.6614173228346458, "grad_norm": 0.38194003303963936, "learning_rate": 4.941008737546039e-07, "loss": 0.882, "step": 317 }, { "epoch": 1.6666666666666665, "grad_norm": 0.36004508832846943, "learning_rate": 4.789677850863222e-07, "loss": 0.8754, "step": 318 }, { "epoch": 1.6719160104986877, "grad_norm": 0.3713147294795857, "learning_rate": 4.6404993356005967e-07, "loss": 0.8496, "step": 319 }, { "epoch": 1.6771653543307088, "grad_norm": 0.40332844918244803, "learning_rate": 4.4934859281115804e-07, "loss": 0.8985, "step": 320 }, { "epoch": 1.6824146981627297, "grad_norm": 0.37460256078858306, "learning_rate": 4.34865017990007e-07, "loss": 0.8997, "step": 321 }, { "epoch": 1.6876640419947506, "grad_norm": 0.34681859755129757, "learning_rate": 4.2060044565488264e-07, "loss": 0.8596, "step": 322 }, { "epoch": 1.6929133858267718, "grad_norm": 0.36407790059697526, "learning_rate": 4.0655609366637635e-07, "loss": 0.8891, "step": 323 }, { "epoch": 1.6981627296587927, "grad_norm": 0.38031424057937346, "learning_rate": 3.9273316108341493e-07, "loss": 0.9026, "step": 324 }, { "epoch": 1.7034120734908136, "grad_norm": 0.37078158732866173, "learning_rate": 3.791328280608916e-07, "loss": 0.8676, "step": 325 }, { "epoch": 1.7086614173228347, "grad_norm": 0.3744777105843697, "learning_rate": 3.657562557489063e-07, "loss": 0.8692, "step": 326 }, { "epoch": 1.7139107611548556, "grad_norm": 0.35275476224983093, "learning_rate": 3.52604586193633e-07, "loss": 0.878, "step": 327 }, { "epoch": 1.7191601049868765, "grad_norm": 0.3845721220969486, "learning_rate": 3.396789422398143e-07, "loss": 0.8715, "step": 328 }, { "epoch": 1.7244094488188977, "grad_norm": 0.38567935425060995, "learning_rate": 3.269804274348966e-07, "loss": 0.8552, "step": 329 }, { "epoch": 1.7296587926509186, "grad_norm": 0.36984914245968326, "learning_rate": 3.145101259348133e-07, "loss": 0.8905, "step": 330 }, { "epoch": 1.7349081364829395, "grad_norm": 0.3862126346234947, "learning_rate": 3.022691024114234e-07, "loss": 0.8759, "step": 331 }, { "epoch": 1.7401574803149606, "grad_norm": 0.37276644156643624, "learning_rate": 2.9025840196161345e-07, "loss": 0.8996, "step": 332 }, { "epoch": 1.7454068241469818, "grad_norm": 0.3619714746103851, "learning_rate": 2.784790500180704e-07, "loss": 0.8734, "step": 333 }, { "epoch": 1.7506561679790025, "grad_norm": 0.3489848842196673, "learning_rate": 2.6693205226173466e-07, "loss": 0.852, "step": 334 }, { "epoch": 1.7559055118110236, "grad_norm": 0.47292345630417715, "learning_rate": 2.556183945359369e-07, "loss": 0.8524, "step": 335 }, { "epoch": 1.7611548556430447, "grad_norm": 0.3454751472880757, "learning_rate": 2.4453904276223093e-07, "loss": 0.8639, "step": 336 }, { "epoch": 1.7664041994750657, "grad_norm": 0.39520326916346893, "learning_rate": 2.3369494285792648e-07, "loss": 0.9011, "step": 337 }, { "epoch": 1.7716535433070866, "grad_norm": 0.36215879824858, "learning_rate": 2.2308702065533138e-07, "loss": 0.8475, "step": 338 }, { "epoch": 1.7769028871391077, "grad_norm": 0.3785025816595213, "learning_rate": 2.1271618182270402e-07, "loss": 0.8785, "step": 339 }, { "epoch": 1.7821522309711286, "grad_norm": 0.35017564001831825, "learning_rate": 2.0258331178693291e-07, "loss": 0.9251, "step": 340 }, { "epoch": 1.7874015748031495, "grad_norm": 0.3736191583691154, "learning_rate": 1.926892756579427e-07, "loss": 0.8638, "step": 341 }, { "epoch": 1.7926509186351707, "grad_norm": 0.3542533426451256, "learning_rate": 1.8303491815483076e-07, "loss": 0.8501, "step": 342 }, { "epoch": 1.7979002624671916, "grad_norm": 0.3653513060765524, "learning_rate": 1.7362106353375107e-07, "loss": 0.8704, "step": 343 }, { "epoch": 1.8031496062992125, "grad_norm": 0.3699565968914539, "learning_rate": 1.6444851551754158e-07, "loss": 0.8659, "step": 344 }, { "epoch": 1.8083989501312336, "grad_norm": 0.357867752615946, "learning_rate": 1.5551805722710245e-07, "loss": 0.8802, "step": 345 }, { "epoch": 1.8136482939632546, "grad_norm": 0.37125992932731333, "learning_rate": 1.4683045111453941e-07, "loss": 0.8368, "step": 346 }, { "epoch": 1.8188976377952755, "grad_norm": 0.3685836112400432, "learning_rate": 1.3838643889806568e-07, "loss": 0.9235, "step": 347 }, { "epoch": 1.8241469816272966, "grad_norm": 0.36408877164595227, "learning_rate": 1.3018674149867782e-07, "loss": 0.8799, "step": 348 }, { "epoch": 1.8293963254593177, "grad_norm": 0.3642144586766023, "learning_rate": 1.2223205897860533e-07, "loss": 0.8777, "step": 349 }, { "epoch": 1.8346456692913384, "grad_norm": 0.39073442531405206, "learning_rate": 1.1452307048154286e-07, "loss": 0.8797, "step": 350 }, { "epoch": 1.8398950131233596, "grad_norm": 0.3810371400467593, "learning_rate": 1.0706043417466549e-07, "loss": 0.91, "step": 351 }, { "epoch": 1.8451443569553807, "grad_norm": 0.35959663657652996, "learning_rate": 9.984478719243772e-08, "loss": 0.8714, "step": 352 }, { "epoch": 1.8503937007874016, "grad_norm": 0.36819474514599226, "learning_rate": 9.287674558221737e-08, "loss": 0.9048, "step": 353 }, { "epoch": 1.8556430446194225, "grad_norm": 0.3492451150451855, "learning_rate": 8.615690425165823e-08, "loss": 0.8589, "step": 354 }, { "epoch": 1.8608923884514437, "grad_norm": 0.3692991236824256, "learning_rate": 7.968583691792142e-08, "loss": 0.8502, "step": 355 }, { "epoch": 1.8661417322834646, "grad_norm": 0.36506034919430097, "learning_rate": 7.34640960586902e-08, "loss": 0.8948, "step": 356 }, { "epoch": 1.8713910761154855, "grad_norm": 0.39656772129331486, "learning_rate": 6.749221286500273e-08, "loss": 0.872, "step": 357 }, { "epoch": 1.8766404199475066, "grad_norm": 0.3766497135471153, "learning_rate": 6.177069719590234e-08, "loss": 0.8459, "step": 358 }, { "epoch": 1.8818897637795275, "grad_norm": 0.3420155942064905, "learning_rate": 5.6300037534904644e-08, "loss": 0.8797, "step": 359 }, { "epoch": 1.8871391076115485, "grad_norm": 0.3790798456525195, "learning_rate": 5.108070094829465e-08, "loss": 0.8374, "step": 360 }, { "epoch": 1.8923884514435696, "grad_norm": 0.36574760138577367, "learning_rate": 4.6113133045249225e-08, "loss": 0.8507, "step": 361 }, { "epoch": 1.8976377952755905, "grad_norm": 0.39215283711659366, "learning_rate": 4.139775793979228e-08, "loss": 0.9416, "step": 362 }, { "epoch": 1.9028871391076114, "grad_norm": 0.33743123097312766, "learning_rate": 3.693497821458702e-08, "loss": 0.8469, "step": 363 }, { "epoch": 1.9081364829396326, "grad_norm": 0.365685642660726, "learning_rate": 3.272517488656213e-08, "loss": 0.8809, "step": 364 }, { "epoch": 1.9133858267716537, "grad_norm": 0.35701737992686006, "learning_rate": 2.876870737438475e-08, "loss": 0.8576, "step": 365 }, { "epoch": 1.9186351706036744, "grad_norm": 0.4717726718121241, "learning_rate": 2.506591346777176e-08, "loss": 0.8882, "step": 366 }, { "epoch": 1.9238845144356955, "grad_norm": 0.3644123714802389, "learning_rate": 2.1617109298653126e-08, "loss": 0.8806, "step": 367 }, { "epoch": 1.9291338582677167, "grad_norm": 0.37039268056716995, "learning_rate": 1.842258931417917e-08, "loss": 0.8646, "step": 368 }, { "epoch": 1.9343832020997376, "grad_norm": 0.38307646079911417, "learning_rate": 1.5482626251583364e-08, "loss": 0.8605, "step": 369 }, { "epoch": 1.9396325459317585, "grad_norm": 0.34939756521186505, "learning_rate": 1.2797471114896598e-08, "loss": 0.8605, "step": 370 }, { "epoch": 1.9448818897637796, "grad_norm": 0.3630644212614912, "learning_rate": 1.0367353153516335e-08, "loss": 0.8874, "step": 371 }, { "epoch": 1.9501312335958005, "grad_norm": 0.36486914850748925, "learning_rate": 8.192479842635937e-09, "loss": 0.8488, "step": 372 }, { "epoch": 1.9553805774278215, "grad_norm": 0.3477602485063963, "learning_rate": 6.273036865529158e-09, "loss": 0.8865, "step": 373 }, { "epoch": 1.9606299212598426, "grad_norm": 0.3699792979427579, "learning_rate": 4.6091880976981695e-09, "loss": 0.8552, "step": 374 }, { "epoch": 1.9658792650918635, "grad_norm": 0.36569117211522567, "learning_rate": 3.201075592882741e-09, "loss": 0.8771, "step": 375 }, { "epoch": 1.9711286089238844, "grad_norm": 0.3673522439384638, "learning_rate": 2.0488195709312816e-09, "loss": 0.8316, "step": 376 }, { "epoch": 1.9763779527559056, "grad_norm": 0.40741119157495514, "learning_rate": 1.152518407537717e-09, "loss": 0.8686, "step": 377 }, { "epoch": 1.9816272965879265, "grad_norm": 0.3922247762663656, "learning_rate": 5.122486258418713e-10, "loss": 0.88, "step": 378 }, { "epoch": 1.9868766404199474, "grad_norm": 0.36724366518491103, "learning_rate": 1.2806488989636211e-10, "loss": 0.863, "step": 379 }, { "epoch": 1.9921259842519685, "grad_norm": 0.37347664771694333, "learning_rate": 0.0, "loss": 0.8553, "step": 380 } ], "logging_steps": 1, "max_steps": 380, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 95, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.333503833071944e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }