diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4338 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.4885193378594859, + "eval_steps": 500, + "global_step": 615, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0024410710199099855, + "grad_norm": 8.533896446228027, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.1655, + "step": 1 + }, + { + "epoch": 0.004882142039819971, + "grad_norm": 8.988560676574707, + "learning_rate": 4.000000000000001e-06, + "loss": 1.2516, + "step": 2 + }, + { + "epoch": 0.0073232130597299565, + "grad_norm": 7.550627708435059, + "learning_rate": 6e-06, + "loss": 1.1895, + "step": 3 + }, + { + "epoch": 0.009764284079639942, + "grad_norm": 3.6377415657043457, + "learning_rate": 8.000000000000001e-06, + "loss": 1.0982, + "step": 4 + }, + { + "epoch": 0.012205355099549928, + "grad_norm": 3.964740753173828, + "learning_rate": 1e-05, + "loss": 1.0622, + "step": 5 + }, + { + "epoch": 0.014646426119459913, + "grad_norm": 4.8016157150268555, + "learning_rate": 9.999962669988608e-06, + "loss": 1.0653, + "step": 6 + }, + { + "epoch": 0.017087497139369898, + "grad_norm": 2.9538488388061523, + "learning_rate": 9.999850680511844e-06, + "loss": 1.026, + "step": 7 + }, + { + "epoch": 0.019528568159279884, + "grad_norm": 2.869965076446533, + "learning_rate": 9.999664033241933e-06, + "loss": 1.0349, + "step": 8 + }, + { + "epoch": 0.02196963917918987, + "grad_norm": 1.8026058673858643, + "learning_rate": 9.999402730965894e-06, + "loss": 1.0421, + "step": 9 + }, + { + "epoch": 0.024410710199099857, + "grad_norm": 1.075210452079773, + "learning_rate": 9.999066777585496e-06, + "loss": 1.0008, + "step": 10 + }, + { + "epoch": 0.02685178121900984, + "grad_norm": 1.4493818283081055, + "learning_rate": 9.998656178117193e-06, + "loss": 0.9347, + "step": 11 + }, + { + "epoch": 0.029292852238919826, + "grad_norm": 1.2218502759933472, + "learning_rate": 9.99817093869206e-06, + "loss": 0.9537, + "step": 12 + }, + { + "epoch": 0.03173392325882981, + "grad_norm": 1.0389800071716309, + "learning_rate": 9.997611066555694e-06, + "loss": 0.9458, + "step": 13 + }, + { + "epoch": 0.034174994278739795, + "grad_norm": 0.959168016910553, + "learning_rate": 9.99697657006811e-06, + "loss": 0.9622, + "step": 14 + }, + { + "epoch": 0.03661606529864978, + "grad_norm": 1.0173426866531372, + "learning_rate": 9.99626745870361e-06, + "loss": 0.9594, + "step": 15 + }, + { + "epoch": 0.03905713631855977, + "grad_norm": 0.9893942475318909, + "learning_rate": 9.995483743050649e-06, + "loss": 0.9128, + "step": 16 + }, + { + "epoch": 0.041498207338469754, + "grad_norm": 0.9174278974533081, + "learning_rate": 9.99462543481167e-06, + "loss": 0.9108, + "step": 17 + }, + { + "epoch": 0.04393927835837974, + "grad_norm": 0.8355342745780945, + "learning_rate": 9.993692546802943e-06, + "loss": 0.9341, + "step": 18 + }, + { + "epoch": 0.04638034937828973, + "grad_norm": 0.9482454061508179, + "learning_rate": 9.992685092954347e-06, + "loss": 0.8488, + "step": 19 + }, + { + "epoch": 0.048821420398199714, + "grad_norm": 0.8152992129325867, + "learning_rate": 9.991603088309195e-06, + "loss": 0.9388, + "step": 20 + }, + { + "epoch": 0.05126249141810969, + "grad_norm": 0.7824520468711853, + "learning_rate": 9.990446549023977e-06, + "loss": 0.917, + "step": 21 + }, + { + "epoch": 0.05370356243801968, + "grad_norm": 0.8396065831184387, + 
"learning_rate": 9.989215492368152e-06, + "loss": 0.9043, + "step": 22 + }, + { + "epoch": 0.056144633457929666, + "grad_norm": 0.7503563761711121, + "learning_rate": 9.98790993672386e-06, + "loss": 0.9368, + "step": 23 + }, + { + "epoch": 0.05858570447783965, + "grad_norm": 0.846466600894928, + "learning_rate": 9.98652990158566e-06, + "loss": 0.8641, + "step": 24 + }, + { + "epoch": 0.06102677549774964, + "grad_norm": 0.8216990232467651, + "learning_rate": 9.985075407560247e-06, + "loss": 0.8744, + "step": 25 + }, + { + "epoch": 0.06346784651765962, + "grad_norm": 0.7758781313896179, + "learning_rate": 9.983546476366133e-06, + "loss": 0.8722, + "step": 26 + }, + { + "epoch": 0.06590891753756961, + "grad_norm": 0.8065202236175537, + "learning_rate": 9.981943130833323e-06, + "loss": 0.8582, + "step": 27 + }, + { + "epoch": 0.06834998855747959, + "grad_norm": 0.79361891746521, + "learning_rate": 9.980265394902982e-06, + "loss": 0.8549, + "step": 28 + }, + { + "epoch": 0.07079105957738958, + "grad_norm": 0.7769683003425598, + "learning_rate": 9.978513293627068e-06, + "loss": 0.8801, + "step": 29 + }, + { + "epoch": 0.07323213059729956, + "grad_norm": 0.7662413120269775, + "learning_rate": 9.976686853167967e-06, + "loss": 0.849, + "step": 30 + }, + { + "epoch": 0.07567320161720956, + "grad_norm": 0.7053027153015137, + "learning_rate": 9.974786100798098e-06, + "loss": 0.8925, + "step": 31 + }, + { + "epoch": 0.07811427263711954, + "grad_norm": 0.7407605051994324, + "learning_rate": 9.9728110648995e-06, + "loss": 0.8623, + "step": 32 + }, + { + "epoch": 0.08055534365702952, + "grad_norm": 0.7798149585723877, + "learning_rate": 9.970761774963421e-06, + "loss": 0.8711, + "step": 33 + }, + { + "epoch": 0.08299641467693951, + "grad_norm": 0.7310554385185242, + "learning_rate": 9.968638261589866e-06, + "loss": 0.9071, + "step": 34 + }, + { + "epoch": 0.08543748569684949, + "grad_norm": 0.8006892204284668, + "learning_rate": 9.966440556487149e-06, + "loss": 0.9026, + "step": 35 + }, + { + "epoch": 0.08787855671675948, + "grad_norm": 0.7774298787117004, + "learning_rate": 9.96416869247141e-06, + "loss": 0.8512, + "step": 36 + }, + { + "epoch": 0.09031962773666946, + "grad_norm": 0.7737051844596863, + "learning_rate": 9.961822703466131e-06, + "loss": 0.8629, + "step": 37 + }, + { + "epoch": 0.09276069875657945, + "grad_norm": 0.8388147950172424, + "learning_rate": 9.959402624501636e-06, + "loss": 0.803, + "step": 38 + }, + { + "epoch": 0.09520176977648943, + "grad_norm": 0.7394818067550659, + "learning_rate": 9.956908491714552e-06, + "loss": 0.8768, + "step": 39 + }, + { + "epoch": 0.09764284079639943, + "grad_norm": 0.8373251557350159, + "learning_rate": 9.95434034234728e-06, + "loss": 0.8604, + "step": 40 + }, + { + "epoch": 0.1000839118163094, + "grad_norm": 0.7941448092460632, + "learning_rate": 9.951698214747441e-06, + "loss": 0.8397, + "step": 41 + }, + { + "epoch": 0.10252498283621939, + "grad_norm": 0.7676767706871033, + "learning_rate": 9.948982148367294e-06, + "loss": 0.8434, + "step": 42 + }, + { + "epoch": 0.10496605385612938, + "grad_norm": 0.7958892583847046, + "learning_rate": 9.946192183763155e-06, + "loss": 0.8503, + "step": 43 + }, + { + "epoch": 0.10740712487603936, + "grad_norm": 0.793487012386322, + "learning_rate": 9.943328362594788e-06, + "loss": 0.8566, + "step": 44 + }, + { + "epoch": 0.10984819589594935, + "grad_norm": 0.716295599937439, + "learning_rate": 9.940390727624785e-06, + "loss": 0.8128, + "step": 45 + }, + { + "epoch": 0.11228926691585933, + "grad_norm": 
0.7760279178619385, + "learning_rate": 9.937379322717923e-06, + "loss": 0.8409, + "step": 46 + }, + { + "epoch": 0.11473033793576932, + "grad_norm": 0.8229836821556091, + "learning_rate": 9.934294192840518e-06, + "loss": 0.8429, + "step": 47 + }, + { + "epoch": 0.1171714089556793, + "grad_norm": 0.6973395347595215, + "learning_rate": 9.931135384059737e-06, + "loss": 0.8542, + "step": 48 + }, + { + "epoch": 0.11961247997558928, + "grad_norm": 0.7911590933799744, + "learning_rate": 9.927902943542932e-06, + "loss": 0.8554, + "step": 49 + }, + { + "epoch": 0.12205355099549928, + "grad_norm": 0.6992570757865906, + "learning_rate": 9.924596919556917e-06, + "loss": 0.8706, + "step": 50 + }, + { + "epoch": 0.12449462201540926, + "grad_norm": 0.7577567100524902, + "learning_rate": 9.921217361467259e-06, + "loss": 0.856, + "step": 51 + }, + { + "epoch": 0.12693569303531924, + "grad_norm": 0.8022581934928894, + "learning_rate": 9.917764319737533e-06, + "loss": 0.8276, + "step": 52 + }, + { + "epoch": 0.12937676405522924, + "grad_norm": 0.720230758190155, + "learning_rate": 9.914237845928574e-06, + "loss": 0.8613, + "step": 53 + }, + { + "epoch": 0.13181783507513922, + "grad_norm": 0.7254828214645386, + "learning_rate": 9.910637992697707e-06, + "loss": 0.8617, + "step": 54 + }, + { + "epoch": 0.1342589060950492, + "grad_norm": 0.7254623174667358, + "learning_rate": 9.906964813797955e-06, + "loss": 0.8543, + "step": 55 + }, + { + "epoch": 0.13669997711495918, + "grad_norm": 0.7306321859359741, + "learning_rate": 9.903218364077242e-06, + "loss": 0.8332, + "step": 56 + }, + { + "epoch": 0.13914104813486916, + "grad_norm": 0.7202122211456299, + "learning_rate": 9.899398699477573e-06, + "loss": 0.8663, + "step": 57 + }, + { + "epoch": 0.14158211915477917, + "grad_norm": 0.7067145109176636, + "learning_rate": 9.895505877034198e-06, + "loss": 0.8165, + "step": 58 + }, + { + "epoch": 0.14402319017468915, + "grad_norm": 0.7376930713653564, + "learning_rate": 9.891539954874758e-06, + "loss": 0.8267, + "step": 59 + }, + { + "epoch": 0.14646426119459913, + "grad_norm": 0.7250686883926392, + "learning_rate": 9.887500992218421e-06, + "loss": 0.8239, + "step": 60 + }, + { + "epoch": 0.1489053322145091, + "grad_norm": 0.7254573106765747, + "learning_rate": 9.883389049374998e-06, + "loss": 0.8452, + "step": 61 + }, + { + "epoch": 0.1513464032344191, + "grad_norm": 0.7461521029472351, + "learning_rate": 9.879204187744036e-06, + "loss": 0.803, + "step": 62 + }, + { + "epoch": 0.1537874742543291, + "grad_norm": 0.7778986096382141, + "learning_rate": 9.874946469813907e-06, + "loss": 0.8287, + "step": 63 + }, + { + "epoch": 0.15622854527423907, + "grad_norm": 0.7395936846733093, + "learning_rate": 9.870615959160876e-06, + "loss": 0.8781, + "step": 64 + }, + { + "epoch": 0.15866961629414905, + "grad_norm": 0.7308329343795776, + "learning_rate": 9.866212720448149e-06, + "loss": 0.807, + "step": 65 + }, + { + "epoch": 0.16111068731405903, + "grad_norm": 0.7851212620735168, + "learning_rate": 9.861736819424904e-06, + "loss": 0.821, + "step": 66 + }, + { + "epoch": 0.16355175833396904, + "grad_norm": 0.7638505697250366, + "learning_rate": 9.857188322925317e-06, + "loss": 0.8273, + "step": 67 + }, + { + "epoch": 0.16599282935387902, + "grad_norm": 0.7750548720359802, + "learning_rate": 9.852567298867557e-06, + "loss": 0.8523, + "step": 68 + }, + { + "epoch": 0.168433900373789, + "grad_norm": 0.7466771602630615, + "learning_rate": 9.84787381625278e-06, + "loss": 0.8415, + "step": 69 + }, + { + "epoch": 0.17087497139369898, + 
"grad_norm": 0.6956301331520081, + "learning_rate": 9.843107945164086e-06, + "loss": 0.8206, + "step": 70 + }, + { + "epoch": 0.17331604241360898, + "grad_norm": 0.7392652630805969, + "learning_rate": 9.838269756765483e-06, + "loss": 0.8098, + "step": 71 + }, + { + "epoch": 0.17575711343351896, + "grad_norm": 0.7311574220657349, + "learning_rate": 9.833359323300827e-06, + "loss": 0.8116, + "step": 72 + }, + { + "epoch": 0.17819818445342894, + "grad_norm": 0.6983757615089417, + "learning_rate": 9.82837671809273e-06, + "loss": 0.8436, + "step": 73 + }, + { + "epoch": 0.18063925547333892, + "grad_norm": 0.7569893598556519, + "learning_rate": 9.823322015541474e-06, + "loss": 0.8058, + "step": 74 + }, + { + "epoch": 0.1830803264932489, + "grad_norm": 0.7439902424812317, + "learning_rate": 9.818195291123903e-06, + "loss": 0.8424, + "step": 75 + }, + { + "epoch": 0.1855213975131589, + "grad_norm": 0.7790477275848389, + "learning_rate": 9.81299662139229e-06, + "loss": 0.8483, + "step": 76 + }, + { + "epoch": 0.1879624685330689, + "grad_norm": 0.7717331051826477, + "learning_rate": 9.807726083973192e-06, + "loss": 0.8214, + "step": 77 + }, + { + "epoch": 0.19040353955297887, + "grad_norm": 0.7872374057769775, + "learning_rate": 9.8023837575663e-06, + "loss": 0.7938, + "step": 78 + }, + { + "epoch": 0.19284461057288885, + "grad_norm": 0.8018149137496948, + "learning_rate": 9.796969721943257e-06, + "loss": 0.802, + "step": 79 + }, + { + "epoch": 0.19528568159279885, + "grad_norm": 0.709600031375885, + "learning_rate": 9.791484057946465e-06, + "loss": 0.7944, + "step": 80 + }, + { + "epoch": 0.19772675261270883, + "grad_norm": 0.8216169476509094, + "learning_rate": 9.785926847487885e-06, + "loss": 0.8181, + "step": 81 + }, + { + "epoch": 0.2001678236326188, + "grad_norm": 0.7138919830322266, + "learning_rate": 9.780298173547811e-06, + "loss": 0.8043, + "step": 82 + }, + { + "epoch": 0.2026088946525288, + "grad_norm": 0.7637642621994019, + "learning_rate": 9.774598120173625e-06, + "loss": 0.8034, + "step": 83 + }, + { + "epoch": 0.20504996567243877, + "grad_norm": 0.7272418141365051, + "learning_rate": 9.76882677247855e-06, + "loss": 0.8271, + "step": 84 + }, + { + "epoch": 0.20749103669234878, + "grad_norm": 0.7340764999389648, + "learning_rate": 9.762984216640378e-06, + "loss": 0.8508, + "step": 85 + }, + { + "epoch": 0.20993210771225876, + "grad_norm": 0.7231638431549072, + "learning_rate": 9.75707053990018e-06, + "loss": 0.823, + "step": 86 + }, + { + "epoch": 0.21237317873216874, + "grad_norm": 0.7670260071754456, + "learning_rate": 9.751085830561e-06, + "loss": 0.8595, + "step": 87 + }, + { + "epoch": 0.21481424975207872, + "grad_norm": 0.7142215371131897, + "learning_rate": 9.74503017798655e-06, + "loss": 0.8325, + "step": 88 + }, + { + "epoch": 0.2172553207719887, + "grad_norm": 0.7884289026260376, + "learning_rate": 9.738903672599858e-06, + "loss": 0.7751, + "step": 89 + }, + { + "epoch": 0.2196963917918987, + "grad_norm": 0.7771654725074768, + "learning_rate": 9.732706405881931e-06, + "loss": 0.7827, + "step": 90 + }, + { + "epoch": 0.22213746281180868, + "grad_norm": 0.7293388247489929, + "learning_rate": 9.726438470370385e-06, + "loss": 0.8724, + "step": 91 + }, + { + "epoch": 0.22457853383171866, + "grad_norm": 0.7578020095825195, + "learning_rate": 9.720099959658062e-06, + "loss": 0.8277, + "step": 92 + }, + { + "epoch": 0.22701960485162864, + "grad_norm": 0.7896732091903687, + "learning_rate": 9.713690968391634e-06, + "loss": 0.7769, + "step": 93 + }, + { + "epoch": 0.22946067587153865, 
+ "grad_norm": 0.6877868175506592, + "learning_rate": 9.707211592270183e-06, + "loss": 0.7938, + "step": 94 + }, + { + "epoch": 0.23190174689144863, + "grad_norm": 0.8047687411308289, + "learning_rate": 9.700661928043787e-06, + "loss": 0.7735, + "step": 95 + }, + { + "epoch": 0.2343428179113586, + "grad_norm": 0.7561459541320801, + "learning_rate": 9.69404207351206e-06, + "loss": 0.8079, + "step": 96 + }, + { + "epoch": 0.2367838889312686, + "grad_norm": 0.7163955569267273, + "learning_rate": 9.687352127522703e-06, + "loss": 0.8042, + "step": 97 + }, + { + "epoch": 0.23922495995117857, + "grad_norm": 0.7289466857910156, + "learning_rate": 9.680592189970015e-06, + "loss": 0.8449, + "step": 98 + }, + { + "epoch": 0.24166603097108857, + "grad_norm": 0.6951574087142944, + "learning_rate": 9.673762361793418e-06, + "loss": 0.7988, + "step": 99 + }, + { + "epoch": 0.24410710199099855, + "grad_norm": 0.7552266716957092, + "learning_rate": 9.666862744975938e-06, + "loss": 0.8323, + "step": 100 + }, + { + "epoch": 0.24654817301090853, + "grad_norm": 0.7086972594261169, + "learning_rate": 9.659893442542683e-06, + "loss": 0.8567, + "step": 101 + }, + { + "epoch": 0.2489892440308185, + "grad_norm": 0.7231544852256775, + "learning_rate": 9.652854558559309e-06, + "loss": 0.8265, + "step": 102 + }, + { + "epoch": 0.2514303150507285, + "grad_norm": 0.7094722986221313, + "learning_rate": 9.645746198130462e-06, + "loss": 0.7803, + "step": 103 + }, + { + "epoch": 0.25387138607063847, + "grad_norm": 0.6969436407089233, + "learning_rate": 9.638568467398215e-06, + "loss": 0.804, + "step": 104 + }, + { + "epoch": 0.25631245709054845, + "grad_norm": 0.7204388380050659, + "learning_rate": 9.631321473540476e-06, + "loss": 0.787, + "step": 105 + }, + { + "epoch": 0.2587535281104585, + "grad_norm": 0.6980841159820557, + "learning_rate": 9.62400532476939e-06, + "loss": 0.8294, + "step": 106 + }, + { + "epoch": 0.26119459913036847, + "grad_norm": 0.6793758273124695, + "learning_rate": 9.61662013032972e-06, + "loss": 0.7739, + "step": 107 + }, + { + "epoch": 0.26363567015027844, + "grad_norm": 0.7096854448318481, + "learning_rate": 9.60916600049723e-06, + "loss": 0.8035, + "step": 108 + }, + { + "epoch": 0.2660767411701884, + "grad_norm": 0.6875160932540894, + "learning_rate": 9.601643046577014e-06, + "loss": 0.8567, + "step": 109 + }, + { + "epoch": 0.2685178121900984, + "grad_norm": 0.7122709155082703, + "learning_rate": 9.59405138090186e-06, + "loss": 0.8153, + "step": 110 + }, + { + "epoch": 0.2709588832100084, + "grad_norm": 0.695655882358551, + "learning_rate": 9.586391116830549e-06, + "loss": 0.7813, + "step": 111 + }, + { + "epoch": 0.27339995422991836, + "grad_norm": 0.674659788608551, + "learning_rate": 9.578662368746183e-06, + "loss": 0.8802, + "step": 112 + }, + { + "epoch": 0.27584102524982834, + "grad_norm": 0.7121911644935608, + "learning_rate": 9.570865252054462e-06, + "loss": 0.8017, + "step": 113 + }, + { + "epoch": 0.2782820962697383, + "grad_norm": 0.7068195939064026, + "learning_rate": 9.562999883181968e-06, + "loss": 0.7817, + "step": 114 + }, + { + "epoch": 0.28072316728964836, + "grad_norm": 0.6847429275512695, + "learning_rate": 9.555066379574423e-06, + "loss": 0.801, + "step": 115 + }, + { + "epoch": 0.28316423830955834, + "grad_norm": 0.743248462677002, + "learning_rate": 9.547064859694943e-06, + "loss": 0.7978, + "step": 116 + }, + { + "epoch": 0.2856053093294683, + "grad_norm": 0.7640885710716248, + "learning_rate": 9.538995443022256e-06, + "loss": 0.7913, + "step": 117 + }, + { + "epoch": 
0.2880463803493783, + "grad_norm": 0.7139798402786255, + "learning_rate": 9.530858250048933e-06, + "loss": 0.7994, + "step": 118 + }, + { + "epoch": 0.2904874513692883, + "grad_norm": 0.7640753388404846, + "learning_rate": 9.52265340227957e-06, + "loss": 0.7946, + "step": 119 + }, + { + "epoch": 0.29292852238919825, + "grad_norm": 0.7454321980476379, + "learning_rate": 9.514381022228997e-06, + "loss": 0.809, + "step": 120 + }, + { + "epoch": 0.29536959340910823, + "grad_norm": 0.6853974461555481, + "learning_rate": 9.506041233420427e-06, + "loss": 0.8013, + "step": 121 + }, + { + "epoch": 0.2978106644290182, + "grad_norm": 0.723430335521698, + "learning_rate": 9.497634160383627e-06, + "loss": 0.7923, + "step": 122 + }, + { + "epoch": 0.3002517354489282, + "grad_norm": 0.7062557935714722, + "learning_rate": 9.489159928653047e-06, + "loss": 0.7702, + "step": 123 + }, + { + "epoch": 0.3026928064688382, + "grad_norm": 0.6789696216583252, + "learning_rate": 9.480618664765956e-06, + "loss": 0.7748, + "step": 124 + }, + { + "epoch": 0.3051338774887482, + "grad_norm": 0.7581243515014648, + "learning_rate": 9.472010496260545e-06, + "loss": 0.771, + "step": 125 + }, + { + "epoch": 0.3075749485086582, + "grad_norm": 0.7822269201278687, + "learning_rate": 9.463335551674024e-06, + "loss": 0.8, + "step": 126 + }, + { + "epoch": 0.31001601952856817, + "grad_norm": 0.7157217264175415, + "learning_rate": 9.454593960540709e-06, + "loss": 0.7883, + "step": 127 + }, + { + "epoch": 0.31245709054847814, + "grad_norm": 0.7614567875862122, + "learning_rate": 9.445785853390074e-06, + "loss": 0.7929, + "step": 128 + }, + { + "epoch": 0.3148981615683881, + "grad_norm": 0.7470414042472839, + "learning_rate": 9.436911361744817e-06, + "loss": 0.7826, + "step": 129 + }, + { + "epoch": 0.3173392325882981, + "grad_norm": 0.7033482193946838, + "learning_rate": 9.427970618118888e-06, + "loss": 0.8359, + "step": 130 + }, + { + "epoch": 0.3197803036082081, + "grad_norm": 0.7030816674232483, + "learning_rate": 9.418963756015511e-06, + "loss": 0.7966, + "step": 131 + }, + { + "epoch": 0.32222137462811806, + "grad_norm": 0.7050835490226746, + "learning_rate": 9.409890909925191e-06, + "loss": 0.7852, + "step": 132 + }, + { + "epoch": 0.3246624456480281, + "grad_norm": 0.7047673463821411, + "learning_rate": 9.400752215323712e-06, + "loss": 0.8134, + "step": 133 + }, + { + "epoch": 0.3271035166679381, + "grad_norm": 0.6739450693130493, + "learning_rate": 9.391547808670097e-06, + "loss": 0.8186, + "step": 134 + }, + { + "epoch": 0.32954458768784806, + "grad_norm": 0.7166461944580078, + "learning_rate": 9.38227782740459e-06, + "loss": 0.8118, + "step": 135 + }, + { + "epoch": 0.33198565870775804, + "grad_norm": 0.6905531287193298, + "learning_rate": 9.372942409946597e-06, + "loss": 0.8092, + "step": 136 + }, + { + "epoch": 0.334426729727668, + "grad_norm": 0.7552813291549683, + "learning_rate": 9.36354169569261e-06, + "loss": 0.7405, + "step": 137 + }, + { + "epoch": 0.336867800747578, + "grad_norm": 0.6745990514755249, + "learning_rate": 9.35407582501414e-06, + "loss": 0.8397, + "step": 138 + }, + { + "epoch": 0.339308871767488, + "grad_norm": 0.7749987840652466, + "learning_rate": 9.344544939255608e-06, + "loss": 0.7979, + "step": 139 + }, + { + "epoch": 0.34174994278739795, + "grad_norm": 0.7859154939651489, + "learning_rate": 9.334949180732245e-06, + "loss": 0.8217, + "step": 140 + }, + { + "epoch": 0.34419101380730793, + "grad_norm": 0.7111227512359619, + "learning_rate": 9.325288692727963e-06, + "loss": 0.7692, + "step": 141 + 
}, + { + "epoch": 0.34663208482721797, + "grad_norm": 0.824995219707489, + "learning_rate": 9.315563619493209e-06, + "loss": 0.7989, + "step": 142 + }, + { + "epoch": 0.34907315584712795, + "grad_norm": 0.7707095742225647, + "learning_rate": 9.305774106242825e-06, + "loss": 0.8115, + "step": 143 + }, + { + "epoch": 0.3515142268670379, + "grad_norm": 0.7036089301109314, + "learning_rate": 9.295920299153863e-06, + "loss": 0.8119, + "step": 144 + }, + { + "epoch": 0.3539552978869479, + "grad_norm": 0.7585278153419495, + "learning_rate": 9.286002345363418e-06, + "loss": 0.7853, + "step": 145 + }, + { + "epoch": 0.3563963689068579, + "grad_norm": 0.7351112961769104, + "learning_rate": 9.276020392966423e-06, + "loss": 0.7974, + "step": 146 + }, + { + "epoch": 0.35883743992676786, + "grad_norm": 0.7286148071289062, + "learning_rate": 9.265974591013434e-06, + "loss": 0.8044, + "step": 147 + }, + { + "epoch": 0.36127851094667784, + "grad_norm": 0.6930050253868103, + "learning_rate": 9.25586508950841e-06, + "loss": 0.8117, + "step": 148 + }, + { + "epoch": 0.3637195819665878, + "grad_norm": 0.8765610456466675, + "learning_rate": 9.24569203940648e-06, + "loss": 0.7551, + "step": 149 + }, + { + "epoch": 0.3661606529864978, + "grad_norm": 0.7214458584785461, + "learning_rate": 9.235455592611667e-06, + "loss": 0.7984, + "step": 150 + }, + { + "epoch": 0.36860172400640784, + "grad_norm": 0.7065439820289612, + "learning_rate": 9.225155901974645e-06, + "loss": 0.8106, + "step": 151 + }, + { + "epoch": 0.3710427950263178, + "grad_norm": 0.7775700092315674, + "learning_rate": 9.214793121290442e-06, + "loss": 0.8211, + "step": 152 + }, + { + "epoch": 0.3734838660462278, + "grad_norm": 0.7118616700172424, + "learning_rate": 9.204367405296144e-06, + "loss": 0.82, + "step": 153 + }, + { + "epoch": 0.3759249370661378, + "grad_norm": 0.7476733326911926, + "learning_rate": 9.193878909668591e-06, + "loss": 0.7584, + "step": 154 + }, + { + "epoch": 0.37836600808604776, + "grad_norm": 0.7488994002342224, + "learning_rate": 9.183327791022048e-06, + "loss": 0.7552, + "step": 155 + }, + { + "epoch": 0.38080707910595774, + "grad_norm": 0.7086935043334961, + "learning_rate": 9.172714206905866e-06, + "loss": 0.7993, + "step": 156 + }, + { + "epoch": 0.3832481501258677, + "grad_norm": 0.7513390183448792, + "learning_rate": 9.162038315802132e-06, + "loss": 0.7684, + "step": 157 + }, + { + "epoch": 0.3856892211457777, + "grad_norm": 0.6983102560043335, + "learning_rate": 9.1513002771233e-06, + "loss": 0.7904, + "step": 158 + }, + { + "epoch": 0.3881302921656877, + "grad_norm": 0.6591006517410278, + "learning_rate": 9.140500251209813e-06, + "loss": 0.7357, + "step": 159 + }, + { + "epoch": 0.3905713631855977, + "grad_norm": 0.7491998672485352, + "learning_rate": 9.129638399327707e-06, + "loss": 0.7964, + "step": 160 + }, + { + "epoch": 0.3930124342055077, + "grad_norm": 0.7312127947807312, + "learning_rate": 9.118714883666204e-06, + "loss": 0.7706, + "step": 161 + }, + { + "epoch": 0.39545350522541767, + "grad_norm": 0.7120770215988159, + "learning_rate": 9.107729867335287e-06, + "loss": 0.8367, + "step": 162 + }, + { + "epoch": 0.39789457624532765, + "grad_norm": 0.735023021697998, + "learning_rate": 9.096683514363275e-06, + "loss": 0.7832, + "step": 163 + }, + { + "epoch": 0.4003356472652376, + "grad_norm": 0.7334295511245728, + "learning_rate": 9.085575989694358e-06, + "loss": 0.7977, + "step": 164 + }, + { + "epoch": 0.4027767182851476, + "grad_norm": 0.7482827305793762, + "learning_rate": 9.074407459186144e-06, + "loss": 
0.868, + "step": 165 + }, + { + "epoch": 0.4052177893050576, + "grad_norm": 0.7395485043525696, + "learning_rate": 9.063178089607183e-06, + "loss": 0.7676, + "step": 166 + }, + { + "epoch": 0.40765886032496756, + "grad_norm": 0.6970906257629395, + "learning_rate": 9.051888048634471e-06, + "loss": 0.762, + "step": 167 + }, + { + "epoch": 0.41009993134487754, + "grad_norm": 0.7200821042060852, + "learning_rate": 9.040537504850954e-06, + "loss": 0.8067, + "step": 168 + }, + { + "epoch": 0.4125410023647875, + "grad_norm": 0.7742771506309509, + "learning_rate": 9.029126627743003e-06, + "loss": 0.7767, + "step": 169 + }, + { + "epoch": 0.41498207338469756, + "grad_norm": 0.7340243458747864, + "learning_rate": 9.017655587697885e-06, + "loss": 0.7816, + "step": 170 + }, + { + "epoch": 0.41742314440460754, + "grad_norm": 0.7570080161094666, + "learning_rate": 9.006124556001223e-06, + "loss": 0.8374, + "step": 171 + }, + { + "epoch": 0.4198642154245175, + "grad_norm": 0.7807502150535583, + "learning_rate": 8.994533704834435e-06, + "loss": 0.7749, + "step": 172 + }, + { + "epoch": 0.4223052864444275, + "grad_norm": 0.7137355208396912, + "learning_rate": 8.982883207272164e-06, + "loss": 0.7397, + "step": 173 + }, + { + "epoch": 0.4247463574643375, + "grad_norm": 0.7511448860168457, + "learning_rate": 8.971173237279693e-06, + "loss": 0.8006, + "step": 174 + }, + { + "epoch": 0.42718742848424746, + "grad_norm": 0.7791663408279419, + "learning_rate": 8.959403969710346e-06, + "loss": 0.7684, + "step": 175 + }, + { + "epoch": 0.42962849950415744, + "grad_norm": 0.7711341381072998, + "learning_rate": 8.947575580302879e-06, + "loss": 0.7905, + "step": 176 + }, + { + "epoch": 0.4320695705240674, + "grad_norm": 0.7793801426887512, + "learning_rate": 8.935688245678859e-06, + "loss": 0.8121, + "step": 177 + }, + { + "epoch": 0.4345106415439774, + "grad_norm": 0.7082055807113647, + "learning_rate": 8.92374214334002e-06, + "loss": 0.7657, + "step": 178 + }, + { + "epoch": 0.43695171256388743, + "grad_norm": 0.735462486743927, + "learning_rate": 8.911737451665616e-06, + "loss": 0.7833, + "step": 179 + }, + { + "epoch": 0.4393927835837974, + "grad_norm": 0.7432037591934204, + "learning_rate": 8.899674349909759e-06, + "loss": 0.7645, + "step": 180 + }, + { + "epoch": 0.4418338546037074, + "grad_norm": 0.7552315592765808, + "learning_rate": 8.887553018198738e-06, + "loss": 0.8018, + "step": 181 + }, + { + "epoch": 0.44427492562361737, + "grad_norm": 0.677143931388855, + "learning_rate": 8.875373637528336e-06, + "loss": 0.8029, + "step": 182 + }, + { + "epoch": 0.44671599664352735, + "grad_norm": 0.7790682911872864, + "learning_rate": 8.863136389761115e-06, + "loss": 0.792, + "step": 183 + }, + { + "epoch": 0.4491570676634373, + "grad_norm": 0.735373854637146, + "learning_rate": 8.85084145762372e-06, + "loss": 0.78, + "step": 184 + }, + { + "epoch": 0.4515981386833473, + "grad_norm": 0.7221420407295227, + "learning_rate": 8.838489024704131e-06, + "loss": 0.807, + "step": 185 + }, + { + "epoch": 0.4540392097032573, + "grad_norm": 0.7021591067314148, + "learning_rate": 8.826079275448934e-06, + "loss": 0.7828, + "step": 186 + }, + { + "epoch": 0.45648028072316726, + "grad_norm": 0.7104141712188721, + "learning_rate": 8.81361239516056e-06, + "loss": 0.8051, + "step": 187 + }, + { + "epoch": 0.4589213517430773, + "grad_norm": 0.749536395072937, + "learning_rate": 8.801088569994523e-06, + "loss": 0.7811, + "step": 188 + }, + { + "epoch": 0.4613624227629873, + "grad_norm": 0.7570759654045105, + "learning_rate": 
8.788507986956639e-06, + "loss": 0.8015, + "step": 189 + }, + { + "epoch": 0.46380349378289726, + "grad_norm": 0.6997769474983215, + "learning_rate": 8.775870833900226e-06, + "loss": 0.7816, + "step": 190 + }, + { + "epoch": 0.46624456480280724, + "grad_norm": 0.6764109134674072, + "learning_rate": 8.763177299523318e-06, + "loss": 0.7577, + "step": 191 + }, + { + "epoch": 0.4686856358227172, + "grad_norm": 0.7811216115951538, + "learning_rate": 8.750427573365825e-06, + "loss": 0.7324, + "step": 192 + }, + { + "epoch": 0.4711267068426272, + "grad_norm": 0.7098534107208252, + "learning_rate": 8.737621845806715e-06, + "loss": 0.7321, + "step": 193 + }, + { + "epoch": 0.4735677778625372, + "grad_norm": 0.7705920934677124, + "learning_rate": 8.724760308061172e-06, + "loss": 0.7501, + "step": 194 + }, + { + "epoch": 0.47600884888244716, + "grad_norm": 0.7170778512954712, + "learning_rate": 8.711843152177735e-06, + "loss": 0.767, + "step": 195 + }, + { + "epoch": 0.47844991990235713, + "grad_norm": 0.7175964713096619, + "learning_rate": 8.698870571035436e-06, + "loss": 0.7592, + "step": 196 + }, + { + "epoch": 0.48089099092226717, + "grad_norm": 0.7901434898376465, + "learning_rate": 8.685842758340912e-06, + "loss": 0.7921, + "step": 197 + }, + { + "epoch": 0.48333206194217715, + "grad_norm": 0.7608402371406555, + "learning_rate": 8.672759908625528e-06, + "loss": 0.8617, + "step": 198 + }, + { + "epoch": 0.48577313296208713, + "grad_norm": 0.7593024373054504, + "learning_rate": 8.65962221724245e-06, + "loss": 0.7674, + "step": 199 + }, + { + "epoch": 0.4882142039819971, + "grad_norm": 0.7110275626182556, + "learning_rate": 8.646429880363746e-06, + "loss": 0.7521, + "step": 200 + }, + { + "epoch": 0.4906552750019071, + "grad_norm": 0.7535459399223328, + "learning_rate": 8.633183094977453e-06, + "loss": 0.7296, + "step": 201 + }, + { + "epoch": 0.49309634602181707, + "grad_norm": 0.7531000971794128, + "learning_rate": 8.61988205888463e-06, + "loss": 0.7863, + "step": 202 + }, + { + "epoch": 0.49553741704172705, + "grad_norm": 0.7889319658279419, + "learning_rate": 8.60652697069641e-06, + "loss": 0.7784, + "step": 203 + }, + { + "epoch": 0.497978488061637, + "grad_norm": 0.6903645396232605, + "learning_rate": 8.593118029831025e-06, + "loss": 0.7954, + "step": 204 + }, + { + "epoch": 0.500419559081547, + "grad_norm": 0.7375295758247375, + "learning_rate": 8.579655436510847e-06, + "loss": 0.7764, + "step": 205 + }, + { + "epoch": 0.502860630101457, + "grad_norm": 0.7218457460403442, + "learning_rate": 8.566139391759378e-06, + "loss": 0.7852, + "step": 206 + }, + { + "epoch": 0.505301701121367, + "grad_norm": 0.7074956297874451, + "learning_rate": 8.552570097398262e-06, + "loss": 0.7824, + "step": 207 + }, + { + "epoch": 0.5077427721412769, + "grad_norm": 0.6844367384910583, + "learning_rate": 8.53894775604426e-06, + "loss": 0.8005, + "step": 208 + }, + { + "epoch": 0.5101838431611869, + "grad_norm": 0.7443989515304565, + "learning_rate": 8.525272571106242e-06, + "loss": 0.7761, + "step": 209 + }, + { + "epoch": 0.5126249141810969, + "grad_norm": 0.7639645338058472, + "learning_rate": 8.511544746782124e-06, + "loss": 0.8032, + "step": 210 + }, + { + "epoch": 0.515065985201007, + "grad_norm": 0.699748158454895, + "learning_rate": 8.497764488055848e-06, + "loss": 0.7801, + "step": 211 + }, + { + "epoch": 0.517507056220917, + "grad_norm": 0.7058794498443604, + "learning_rate": 8.483932000694295e-06, + "loss": 0.7693, + "step": 212 + }, + { + "epoch": 0.519948127240827, + "grad_norm": 0.7830145359039307, + 
"learning_rate": 8.470047491244232e-06, + "loss": 0.7684, + "step": 213 + }, + { + "epoch": 0.5223891982607369, + "grad_norm": 0.6766949892044067, + "learning_rate": 8.456111167029219e-06, + "loss": 0.8214, + "step": 214 + }, + { + "epoch": 0.5248302692806469, + "grad_norm": 0.7066507339477539, + "learning_rate": 8.442123236146509e-06, + "loss": 0.7639, + "step": 215 + }, + { + "epoch": 0.5272713403005569, + "grad_norm": 0.7286085486412048, + "learning_rate": 8.42808390746395e-06, + "loss": 0.7723, + "step": 216 + }, + { + "epoch": 0.5297124113204669, + "grad_norm": 0.7587203979492188, + "learning_rate": 8.413993390616865e-06, + "loss": 0.8034, + "step": 217 + }, + { + "epoch": 0.5321534823403768, + "grad_norm": 0.6527595520019531, + "learning_rate": 8.399851896004914e-06, + "loss": 0.7587, + "step": 218 + }, + { + "epoch": 0.5345945533602868, + "grad_norm": 0.8271955251693726, + "learning_rate": 8.385659634788959e-06, + "loss": 0.7846, + "step": 219 + }, + { + "epoch": 0.5370356243801968, + "grad_norm": 0.7351842522621155, + "learning_rate": 8.371416818887907e-06, + "loss": 0.8002, + "step": 220 + }, + { + "epoch": 0.5394766954001068, + "grad_norm": 0.7915340065956116, + "learning_rate": 8.357123660975553e-06, + "loss": 0.7511, + "step": 221 + }, + { + "epoch": 0.5419177664200168, + "grad_norm": 0.6955085396766663, + "learning_rate": 8.342780374477396e-06, + "loss": 0.7754, + "step": 222 + }, + { + "epoch": 0.5443588374399267, + "grad_norm": 0.7368732690811157, + "learning_rate": 8.328387173567453e-06, + "loss": 0.7775, + "step": 223 + }, + { + "epoch": 0.5467999084598367, + "grad_norm": 0.6908881068229675, + "learning_rate": 8.313944273165068e-06, + "loss": 0.7571, + "step": 224 + }, + { + "epoch": 0.5492409794797467, + "grad_norm": 0.700554370880127, + "learning_rate": 8.299451888931696e-06, + "loss": 0.7714, + "step": 225 + }, + { + "epoch": 0.5516820504996567, + "grad_norm": 0.7163404822349548, + "learning_rate": 8.284910237267681e-06, + "loss": 0.7767, + "step": 226 + }, + { + "epoch": 0.5541231215195667, + "grad_norm": 0.7443628311157227, + "learning_rate": 8.270319535309035e-06, + "loss": 0.7709, + "step": 227 + }, + { + "epoch": 0.5565641925394766, + "grad_norm": 0.691213071346283, + "learning_rate": 8.255680000924184e-06, + "loss": 0.7997, + "step": 228 + }, + { + "epoch": 0.5590052635593867, + "grad_norm": 0.7387362718582153, + "learning_rate": 8.240991852710724e-06, + "loss": 0.7502, + "step": 229 + }, + { + "epoch": 0.5614463345792967, + "grad_norm": 0.7051777243614197, + "learning_rate": 8.22625530999215e-06, + "loss": 0.7811, + "step": 230 + }, + { + "epoch": 0.5638874055992067, + "grad_norm": 0.6610181331634521, + "learning_rate": 8.211470592814586e-06, + "loss": 0.7884, + "step": 231 + }, + { + "epoch": 0.5663284766191167, + "grad_norm": 0.6575087904930115, + "learning_rate": 8.196637921943496e-06, + "loss": 0.7797, + "step": 232 + }, + { + "epoch": 0.5687695476390267, + "grad_norm": 0.7363637685775757, + "learning_rate": 8.181757518860387e-06, + "loss": 0.7369, + "step": 233 + }, + { + "epoch": 0.5712106186589366, + "grad_norm": 0.6955734491348267, + "learning_rate": 8.166829605759507e-06, + "loss": 0.7841, + "step": 234 + }, + { + "epoch": 0.5736516896788466, + "grad_norm": 0.7019768357276917, + "learning_rate": 8.151854405544526e-06, + "loss": 0.7702, + "step": 235 + }, + { + "epoch": 0.5760927606987566, + "grad_norm": 0.7041372656822205, + "learning_rate": 8.136832141825197e-06, + "loss": 0.7755, + "step": 236 + }, + { + "epoch": 0.5785338317186666, + "grad_norm": 
0.7138186693191528, + "learning_rate": 8.12176303891403e-06, + "loss": 0.7815, + "step": 237 + }, + { + "epoch": 0.5809749027385765, + "grad_norm": 0.6682398319244385, + "learning_rate": 8.106647321822943e-06, + "loss": 0.7573, + "step": 238 + }, + { + "epoch": 0.5834159737584865, + "grad_norm": 0.6600127816200256, + "learning_rate": 8.091485216259886e-06, + "loss": 0.7644, + "step": 239 + }, + { + "epoch": 0.5858570447783965, + "grad_norm": 0.7157433032989502, + "learning_rate": 8.076276948625495e-06, + "loss": 0.7661, + "step": 240 + }, + { + "epoch": 0.5882981157983065, + "grad_norm": 0.7180731892585754, + "learning_rate": 8.061022746009687e-06, + "loss": 0.756, + "step": 241 + }, + { + "epoch": 0.5907391868182165, + "grad_norm": 0.6941264271736145, + "learning_rate": 8.04572283618829e-06, + "loss": 0.7501, + "step": 242 + }, + { + "epoch": 0.5931802578381264, + "grad_norm": 0.7047881484031677, + "learning_rate": 8.030377447619622e-06, + "loss": 0.7564, + "step": 243 + }, + { + "epoch": 0.5956213288580364, + "grad_norm": 0.6860742568969727, + "learning_rate": 8.014986809441093e-06, + "loss": 0.8048, + "step": 244 + }, + { + "epoch": 0.5980623998779464, + "grad_norm": 0.6961259245872498, + "learning_rate": 7.999551151465793e-06, + "loss": 0.8085, + "step": 245 + }, + { + "epoch": 0.6005034708978564, + "grad_norm": 0.6859815716743469, + "learning_rate": 7.984070704179026e-06, + "loss": 0.7911, + "step": 246 + }, + { + "epoch": 0.6029445419177665, + "grad_norm": 0.739782452583313, + "learning_rate": 7.968545698734908e-06, + "loss": 0.7981, + "step": 247 + }, + { + "epoch": 0.6053856129376765, + "grad_norm": 0.7173625230789185, + "learning_rate": 7.952976366952888e-06, + "loss": 0.7738, + "step": 248 + }, + { + "epoch": 0.6078266839575864, + "grad_norm": 0.7094762921333313, + "learning_rate": 7.9373629413143e-06, + "loss": 0.7802, + "step": 249 + }, + { + "epoch": 0.6102677549774964, + "grad_norm": 0.6974766254425049, + "learning_rate": 7.921705654958886e-06, + "loss": 0.7956, + "step": 250 + }, + { + "epoch": 0.6127088259974064, + "grad_norm": 0.7235715389251709, + "learning_rate": 7.906004741681321e-06, + "loss": 0.7581, + "step": 251 + }, + { + "epoch": 0.6151498970173164, + "grad_norm": 0.68167644739151, + "learning_rate": 7.890260435927709e-06, + "loss": 0.7746, + "step": 252 + }, + { + "epoch": 0.6175909680372264, + "grad_norm": 0.6965702176094055, + "learning_rate": 7.874472972792097e-06, + "loss": 0.7638, + "step": 253 + }, + { + "epoch": 0.6200320390571363, + "grad_norm": 0.7000617384910583, + "learning_rate": 7.858642588012957e-06, + "loss": 0.7252, + "step": 254 + }, + { + "epoch": 0.6224731100770463, + "grad_norm": 0.6918544173240662, + "learning_rate": 7.842769517969665e-06, + "loss": 0.7867, + "step": 255 + }, + { + "epoch": 0.6249141810969563, + "grad_norm": 0.7168439626693726, + "learning_rate": 7.826853999678978e-06, + "loss": 0.7349, + "step": 256 + }, + { + "epoch": 0.6273552521168663, + "grad_norm": 0.6823673844337463, + "learning_rate": 7.810896270791484e-06, + "loss": 0.7749, + "step": 257 + }, + { + "epoch": 0.6297963231367762, + "grad_norm": 0.7399064898490906, + "learning_rate": 7.794896569588066e-06, + "loss": 0.7886, + "step": 258 + }, + { + "epoch": 0.6322373941566862, + "grad_norm": 0.6988884806632996, + "learning_rate": 7.778855134976334e-06, + "loss": 0.7329, + "step": 259 + }, + { + "epoch": 0.6346784651765962, + "grad_norm": 0.7028211951255798, + "learning_rate": 7.762772206487066e-06, + "loss": 0.8214, + "step": 260 + }, + { + "epoch": 0.6371195361965062, + 
"grad_norm": 0.7346593737602234, + "learning_rate": 7.74664802427062e-06, + "loss": 0.7626, + "step": 261 + }, + { + "epoch": 0.6395606072164162, + "grad_norm": 0.7249420881271362, + "learning_rate": 7.73048282909336e-06, + "loss": 0.7617, + "step": 262 + }, + { + "epoch": 0.6420016782363261, + "grad_norm": 0.7126630544662476, + "learning_rate": 7.714276862334051e-06, + "loss": 0.7599, + "step": 263 + }, + { + "epoch": 0.6444427492562361, + "grad_norm": 0.7718750238418579, + "learning_rate": 7.698030365980265e-06, + "loss": 0.8228, + "step": 264 + }, + { + "epoch": 0.6468838202761461, + "grad_norm": 0.7375472187995911, + "learning_rate": 7.681743582624761e-06, + "loss": 0.7458, + "step": 265 + }, + { + "epoch": 0.6493248912960562, + "grad_norm": 0.7044516205787659, + "learning_rate": 7.66541675546186e-06, + "loss": 0.779, + "step": 266 + }, + { + "epoch": 0.6517659623159662, + "grad_norm": 0.7249746322631836, + "learning_rate": 7.64905012828382e-06, + "loss": 0.7983, + "step": 267 + }, + { + "epoch": 0.6542070333358762, + "grad_norm": 0.7117093801498413, + "learning_rate": 7.632643945477195e-06, + "loss": 0.7436, + "step": 268 + }, + { + "epoch": 0.6566481043557861, + "grad_norm": 0.7090557217597961, + "learning_rate": 7.616198452019176e-06, + "loss": 0.7563, + "step": 269 + }, + { + "epoch": 0.6590891753756961, + "grad_norm": 0.720168948173523, + "learning_rate": 7.59971389347395e-06, + "loss": 0.7487, + "step": 270 + }, + { + "epoch": 0.6615302463956061, + "grad_norm": 0.6775338053703308, + "learning_rate": 7.583190515989022e-06, + "loss": 0.7708, + "step": 271 + }, + { + "epoch": 0.6639713174155161, + "grad_norm": 0.711544394493103, + "learning_rate": 7.566628566291537e-06, + "loss": 0.7732, + "step": 272 + }, + { + "epoch": 0.666412388435426, + "grad_norm": 0.7197690606117249, + "learning_rate": 7.550028291684603e-06, + "loss": 0.7681, + "step": 273 + }, + { + "epoch": 0.668853459455336, + "grad_norm": 0.7002537250518799, + "learning_rate": 7.5333899400435986e-06, + "loss": 0.7414, + "step": 274 + }, + { + "epoch": 0.671294530475246, + "grad_norm": 0.7534189820289612, + "learning_rate": 7.516713759812465e-06, + "loss": 0.7756, + "step": 275 + }, + { + "epoch": 0.673735601495156, + "grad_norm": 0.6993451714515686, + "learning_rate": 7.500000000000001e-06, + "loss": 0.7494, + "step": 276 + }, + { + "epoch": 0.676176672515066, + "grad_norm": 0.7160854339599609, + "learning_rate": 7.483248910176144e-06, + "loss": 0.7727, + "step": 277 + }, + { + "epoch": 0.678617743534976, + "grad_norm": 0.7340339422225952, + "learning_rate": 7.466460740468246e-06, + "loss": 0.7641, + "step": 278 + }, + { + "epoch": 0.6810588145548859, + "grad_norm": 0.6893506050109863, + "learning_rate": 7.44963574155733e-06, + "loss": 0.7859, + "step": 279 + }, + { + "epoch": 0.6834998855747959, + "grad_norm": 0.7197214365005493, + "learning_rate": 7.432774164674359e-06, + "loss": 0.7645, + "step": 280 + }, + { + "epoch": 0.6859409565947059, + "grad_norm": 0.722647488117218, + "learning_rate": 7.4158762615964744e-06, + "loss": 0.7614, + "step": 281 + }, + { + "epoch": 0.6883820276146159, + "grad_norm": 0.7112955451011658, + "learning_rate": 7.398942284643242e-06, + "loss": 0.7565, + "step": 282 + }, + { + "epoch": 0.6908230986345258, + "grad_norm": 0.7392273545265198, + "learning_rate": 7.381972486672886e-06, + "loss": 0.7474, + "step": 283 + }, + { + "epoch": 0.6932641696544359, + "grad_norm": 0.691473126411438, + "learning_rate": 7.3649671210785024e-06, + "loss": 0.7392, + "step": 284 + }, + { + "epoch": 
0.6957052406743459, + "grad_norm": 0.764784038066864, + "learning_rate": 7.34792644178429e-06, + "loss": 0.7341, + "step": 285 + }, + { + "epoch": 0.6981463116942559, + "grad_norm": 0.7281628251075745, + "learning_rate": 7.330850703241751e-06, + "loss": 0.7804, + "step": 286 + }, + { + "epoch": 0.7005873827141659, + "grad_norm": 0.843350887298584, + "learning_rate": 7.313740160425887e-06, + "loss": 0.7085, + "step": 287 + }, + { + "epoch": 0.7030284537340759, + "grad_norm": 0.7312772870063782, + "learning_rate": 7.296595068831406e-06, + "loss": 0.7638, + "step": 288 + }, + { + "epoch": 0.7054695247539858, + "grad_norm": 0.7479636073112488, + "learning_rate": 7.279415684468893e-06, + "loss": 0.7208, + "step": 289 + }, + { + "epoch": 0.7079105957738958, + "grad_norm": 0.6945940256118774, + "learning_rate": 7.262202263860989e-06, + "loss": 0.7133, + "step": 290 + }, + { + "epoch": 0.7103516667938058, + "grad_norm": 0.697964608669281, + "learning_rate": 7.244955064038574e-06, + "loss": 0.7478, + "step": 291 + }, + { + "epoch": 0.7127927378137158, + "grad_norm": 0.7676611542701721, + "learning_rate": 7.227674342536914e-06, + "loss": 0.7778, + "step": 292 + }, + { + "epoch": 0.7152338088336258, + "grad_norm": 0.6927245259284973, + "learning_rate": 7.210360357391818e-06, + "loss": 0.7041, + "step": 293 + }, + { + "epoch": 0.7176748798535357, + "grad_norm": 0.7169522643089294, + "learning_rate": 7.1930133671357915e-06, + "loss": 0.7493, + "step": 294 + }, + { + "epoch": 0.7201159508734457, + "grad_norm": 0.6862355470657349, + "learning_rate": 7.175633630794176e-06, + "loss": 0.7547, + "step": 295 + }, + { + "epoch": 0.7225570218933557, + "grad_norm": 0.7003543376922607, + "learning_rate": 7.1582214078812715e-06, + "loss": 0.7677, + "step": 296 + }, + { + "epoch": 0.7249980929132657, + "grad_norm": 0.6878992915153503, + "learning_rate": 7.140776958396468e-06, + "loss": 0.7663, + "step": 297 + }, + { + "epoch": 0.7274391639331756, + "grad_norm": 0.6947048306465149, + "learning_rate": 7.123300542820367e-06, + "loss": 0.7514, + "step": 298 + }, + { + "epoch": 0.7298802349530856, + "grad_norm": 0.6599522829055786, + "learning_rate": 7.1057924221108856e-06, + "loss": 0.7363, + "step": 299 + }, + { + "epoch": 0.7323213059729956, + "grad_norm": 0.6951528787612915, + "learning_rate": 7.08825285769936e-06, + "loss": 0.7247, + "step": 300 + }, + { + "epoch": 0.7347623769929056, + "grad_norm": 0.7057417035102844, + "learning_rate": 7.0706821114866475e-06, + "loss": 0.7829, + "step": 301 + }, + { + "epoch": 0.7372034480128157, + "grad_norm": 0.7661730647087097, + "learning_rate": 7.053080445839211e-06, + "loss": 0.7233, + "step": 302 + }, + { + "epoch": 0.7396445190327257, + "grad_norm": 0.6780954003334045, + "learning_rate": 7.035448123585201e-06, + "loss": 0.7549, + "step": 303 + }, + { + "epoch": 0.7420855900526356, + "grad_norm": 0.7154073715209961, + "learning_rate": 7.017785408010533e-06, + "loss": 0.7593, + "step": 304 + }, + { + "epoch": 0.7445266610725456, + "grad_norm": 0.72113436460495, + "learning_rate": 7.0000925628549595e-06, + "loss": 0.8079, + "step": 305 + }, + { + "epoch": 0.7469677320924556, + "grad_norm": 0.6903125643730164, + "learning_rate": 6.982369852308124e-06, + "loss": 0.7777, + "step": 306 + }, + { + "epoch": 0.7494088031123656, + "grad_norm": 0.7365685701370239, + "learning_rate": 6.964617541005617e-06, + "loss": 0.7827, + "step": 307 + }, + { + "epoch": 0.7518498741322756, + "grad_norm": 0.7428478002548218, + "learning_rate": 6.946835894025037e-06, + "loss": 0.7319, + "step": 308 
+ }, + { + "epoch": 0.7542909451521855, + "grad_norm": 0.7224217653274536, + "learning_rate": 6.929025176882016e-06, + "loss": 0.7758, + "step": 309 + }, + { + "epoch": 0.7567320161720955, + "grad_norm": 0.7415998578071594, + "learning_rate": 6.911185655526263e-06, + "loss": 0.7832, + "step": 310 + }, + { + "epoch": 0.7591730871920055, + "grad_norm": 0.7322662472724915, + "learning_rate": 6.893317596337592e-06, + "loss": 0.7323, + "step": 311 + }, + { + "epoch": 0.7616141582119155, + "grad_norm": 0.6983169913291931, + "learning_rate": 6.875421266121946e-06, + "loss": 0.7576, + "step": 312 + }, + { + "epoch": 0.7640552292318255, + "grad_norm": 0.7458372116088867, + "learning_rate": 6.857496932107407e-06, + "loss": 0.7549, + "step": 313 + }, + { + "epoch": 0.7664963002517354, + "grad_norm": 0.7724815011024475, + "learning_rate": 6.839544861940214e-06, + "loss": 0.7625, + "step": 314 + }, + { + "epoch": 0.7689373712716454, + "grad_norm": 0.7590445280075073, + "learning_rate": 6.821565323680759e-06, + "loss": 0.7422, + "step": 315 + }, + { + "epoch": 0.7713784422915554, + "grad_norm": 0.7100796699523926, + "learning_rate": 6.80355858579959e-06, + "loss": 0.748, + "step": 316 + }, + { + "epoch": 0.7738195133114654, + "grad_norm": 0.6766054034233093, + "learning_rate": 6.7855249171734e-06, + "loss": 0.7487, + "step": 317 + }, + { + "epoch": 0.7762605843313753, + "grad_norm": 0.7497526407241821, + "learning_rate": 6.76746458708101e-06, + "loss": 0.7584, + "step": 318 + }, + { + "epoch": 0.7787016553512853, + "grad_norm": 0.6761816740036011, + "learning_rate": 6.74937786519935e-06, + "loss": 0.7332, + "step": 319 + }, + { + "epoch": 0.7811427263711954, + "grad_norm": 0.6793827414512634, + "learning_rate": 6.731265021599437e-06, + "loss": 0.7387, + "step": 320 + }, + { + "epoch": 0.7835837973911054, + "grad_norm": 0.7181971669197083, + "learning_rate": 6.7131263267423305e-06, + "loss": 0.7588, + "step": 321 + }, + { + "epoch": 0.7860248684110154, + "grad_norm": 0.6722172498703003, + "learning_rate": 6.6949620514751075e-06, + "loss": 0.7264, + "step": 322 + }, + { + "epoch": 0.7884659394309254, + "grad_norm": 0.6741105914115906, + "learning_rate": 6.676772467026809e-06, + "loss": 0.7806, + "step": 323 + }, + { + "epoch": 0.7909070104508353, + "grad_norm": 0.6798011660575867, + "learning_rate": 6.65855784500439e-06, + "loss": 0.7281, + "step": 324 + }, + { + "epoch": 0.7933480814707453, + "grad_norm": 0.6723977327346802, + "learning_rate": 6.640318457388672e-06, + "loss": 0.7358, + "step": 325 + }, + { + "epoch": 0.7957891524906553, + "grad_norm": 0.6920611262321472, + "learning_rate": 6.622054576530275e-06, + "loss": 0.7754, + "step": 326 + }, + { + "epoch": 0.7982302235105653, + "grad_norm": 0.7018612623214722, + "learning_rate": 6.603766475145546e-06, + "loss": 0.7714, + "step": 327 + }, + { + "epoch": 0.8006712945304753, + "grad_norm": 0.7847645282745361, + "learning_rate": 6.585454426312506e-06, + "loss": 0.804, + "step": 328 + }, + { + "epoch": 0.8031123655503852, + "grad_norm": 0.6560506820678711, + "learning_rate": 6.5671187034667465e-06, + "loss": 0.7768, + "step": 329 + }, + { + "epoch": 0.8055534365702952, + "grad_norm": 0.720274031162262, + "learning_rate": 6.548759580397363e-06, + "loss": 0.7619, + "step": 330 + }, + { + "epoch": 0.8079945075902052, + "grad_norm": 0.6853426694869995, + "learning_rate": 6.53037733124287e-06, + "loss": 0.7147, + "step": 331 + }, + { + "epoch": 0.8104355786101152, + "grad_norm": 0.6448130011558533, + "learning_rate": 6.511972230487091e-06, + "loss": 0.7958, 
+ "step": 332 + }, + { + "epoch": 0.8128766496300251, + "grad_norm": 0.6405046582221985, + "learning_rate": 6.4935445529550775e-06, + "loss": 0.7659, + "step": 333 + }, + { + "epoch": 0.8153177206499351, + "grad_norm": 0.7028204798698425, + "learning_rate": 6.475094573808994e-06, + "loss": 0.7609, + "step": 334 + }, + { + "epoch": 0.8177587916698451, + "grad_norm": 0.7086704969406128, + "learning_rate": 6.456622568544012e-06, + "loss": 0.7719, + "step": 335 + }, + { + "epoch": 0.8201998626897551, + "grad_norm": 0.6891460418701172, + "learning_rate": 6.438128812984199e-06, + "loss": 0.7667, + "step": 336 + }, + { + "epoch": 0.8226409337096651, + "grad_norm": 0.7402656674385071, + "learning_rate": 6.419613583278395e-06, + "loss": 0.7833, + "step": 337 + }, + { + "epoch": 0.825082004729575, + "grad_norm": 0.7249888777732849, + "learning_rate": 6.401077155896098e-06, + "loss": 0.7031, + "step": 338 + }, + { + "epoch": 0.8275230757494851, + "grad_norm": 0.6789460182189941, + "learning_rate": 6.3825198076233255e-06, + "loss": 0.7739, + "step": 339 + }, + { + "epoch": 0.8299641467693951, + "grad_norm": 0.6832931041717529, + "learning_rate": 6.363941815558484e-06, + "loss": 0.7242, + "step": 340 + }, + { + "epoch": 0.8324052177893051, + "grad_norm": 0.64850252866745, + "learning_rate": 6.345343457108238e-06, + "loss": 0.7378, + "step": 341 + }, + { + "epoch": 0.8348462888092151, + "grad_norm": 0.7206950783729553, + "learning_rate": 6.32672500998336e-06, + "loss": 0.7718, + "step": 342 + }, + { + "epoch": 0.837287359829125, + "grad_norm": 0.690377950668335, + "learning_rate": 6.308086752194586e-06, + "loss": 0.7784, + "step": 343 + }, + { + "epoch": 0.839728430849035, + "grad_norm": 0.6793109774589539, + "learning_rate": 6.289428962048467e-06, + "loss": 0.7936, + "step": 344 + }, + { + "epoch": 0.842169501868945, + "grad_norm": 0.6914140582084656, + "learning_rate": 6.270751918143213e-06, + "loss": 0.7652, + "step": 345 + }, + { + "epoch": 0.844610572888855, + "grad_norm": 0.6733123064041138, + "learning_rate": 6.252055899364525e-06, + "loss": 0.8477, + "step": 346 + }, + { + "epoch": 0.847051643908765, + "grad_norm": 0.6884806156158447, + "learning_rate": 6.2333411848814415e-06, + "loss": 0.7544, + "step": 347 + }, + { + "epoch": 0.849492714928675, + "grad_norm": 0.6831750273704529, + "learning_rate": 6.214608054142167e-06, + "loss": 0.7333, + "step": 348 + }, + { + "epoch": 0.8519337859485849, + "grad_norm": 0.6560673713684082, + "learning_rate": 6.195856786869893e-06, + "loss": 0.7252, + "step": 349 + }, + { + "epoch": 0.8543748569684949, + "grad_norm": 0.6822741627693176, + "learning_rate": 6.177087663058626e-06, + "loss": 0.7181, + "step": 350 + }, + { + "epoch": 0.8568159279884049, + "grad_norm": 0.8162235021591187, + "learning_rate": 6.158300962969012e-06, + "loss": 0.7359, + "step": 351 + }, + { + "epoch": 0.8592569990083149, + "grad_norm": 0.7049196362495422, + "learning_rate": 6.13949696712414e-06, + "loss": 0.7592, + "step": 352 + }, + { + "epoch": 0.8616980700282248, + "grad_norm": 0.6802664399147034, + "learning_rate": 6.120675956305363e-06, + "loss": 0.7476, + "step": 353 + }, + { + "epoch": 0.8641391410481348, + "grad_norm": 0.7405847311019897, + "learning_rate": 6.101838211548099e-06, + "loss": 0.7368, + "step": 354 + }, + { + "epoch": 0.8665802120680448, + "grad_norm": 0.6568160057067871, + "learning_rate": 6.0829840141376385e-06, + "loss": 0.7519, + "step": 355 + }, + { + "epoch": 0.8690212830879548, + "grad_norm": 0.6947789192199707, + "learning_rate": 6.064113645604945e-06, + 
"loss": 0.7217, + "step": 356 + }, + { + "epoch": 0.8714623541078649, + "grad_norm": 0.6590073704719543, + "learning_rate": 6.045227387722445e-06, + "loss": 0.7516, + "step": 357 + }, + { + "epoch": 0.8739034251277749, + "grad_norm": 0.7120059132575989, + "learning_rate": 6.026325522499829e-06, + "loss": 0.7481, + "step": 358 + }, + { + "epoch": 0.8763444961476848, + "grad_norm": 0.6988116502761841, + "learning_rate": 6.007408332179836e-06, + "loss": 0.7995, + "step": 359 + }, + { + "epoch": 0.8787855671675948, + "grad_norm": 0.7005130648612976, + "learning_rate": 5.988476099234033e-06, + "loss": 0.7427, + "step": 360 + }, + { + "epoch": 0.8812266381875048, + "grad_norm": 0.675496518611908, + "learning_rate": 5.969529106358612e-06, + "loss": 0.7603, + "step": 361 + }, + { + "epoch": 0.8836677092074148, + "grad_norm": 0.7157605290412903, + "learning_rate": 5.95056763647016e-06, + "loss": 0.7788, + "step": 362 + }, + { + "epoch": 0.8861087802273248, + "grad_norm": 0.6584900617599487, + "learning_rate": 5.931591972701427e-06, + "loss": 0.7415, + "step": 363 + }, + { + "epoch": 0.8885498512472347, + "grad_norm": 0.6833528876304626, + "learning_rate": 5.9126023983971114e-06, + "loss": 0.7339, + "step": 364 + }, + { + "epoch": 0.8909909222671447, + "grad_norm": 0.6946988701820374, + "learning_rate": 5.893599197109625e-06, + "loss": 0.8115, + "step": 365 + }, + { + "epoch": 0.8934319932870547, + "grad_norm": 0.7004518508911133, + "learning_rate": 5.874582652594855e-06, + "loss": 0.75, + "step": 366 + }, + { + "epoch": 0.8958730643069647, + "grad_norm": 0.6791942119598389, + "learning_rate": 5.855553048807932e-06, + "loss": 0.7288, + "step": 367 + }, + { + "epoch": 0.8983141353268747, + "grad_norm": 0.6619113683700562, + "learning_rate": 5.836510669898984e-06, + "loss": 0.7408, + "step": 368 + }, + { + "epoch": 0.9007552063467846, + "grad_norm": 0.6960625052452087, + "learning_rate": 5.817455800208901e-06, + "loss": 0.7937, + "step": 369 + }, + { + "epoch": 0.9031962773666946, + "grad_norm": 0.6567366123199463, + "learning_rate": 5.798388724265085e-06, + "loss": 0.7555, + "step": 370 + }, + { + "epoch": 0.9056373483866046, + "grad_norm": 0.6739965677261353, + "learning_rate": 5.7793097267772e-06, + "loss": 0.7193, + "step": 371 + }, + { + "epoch": 0.9080784194065146, + "grad_norm": 0.6742777228355408, + "learning_rate": 5.760219092632924e-06, + "loss": 0.7308, + "step": 372 + }, + { + "epoch": 0.9105194904264245, + "grad_norm": 0.6787696480751038, + "learning_rate": 5.741117106893693e-06, + "loss": 0.7387, + "step": 373 + }, + { + "epoch": 0.9129605614463345, + "grad_norm": 0.6927146911621094, + "learning_rate": 5.722004054790442e-06, + "loss": 0.7435, + "step": 374 + }, + { + "epoch": 0.9154016324662446, + "grad_norm": 0.6972612142562866, + "learning_rate": 5.7028802217193565e-06, + "loss": 0.7605, + "step": 375 + }, + { + "epoch": 0.9178427034861546, + "grad_norm": 0.7299436926841736, + "learning_rate": 5.683745893237598e-06, + "loss": 0.7745, + "step": 376 + }, + { + "epoch": 0.9202837745060646, + "grad_norm": 0.6617533564567566, + "learning_rate": 5.664601355059044e-06, + "loss": 0.7718, + "step": 377 + }, + { + "epoch": 0.9227248455259746, + "grad_norm": 0.722768247127533, + "learning_rate": 5.645446893050029e-06, + "loss": 0.783, + "step": 378 + }, + { + "epoch": 0.9251659165458845, + "grad_norm": 0.686353862285614, + "learning_rate": 5.626282793225066e-06, + "loss": 0.7411, + "step": 379 + }, + { + "epoch": 0.9276069875657945, + "grad_norm": 0.7211350798606873, + "learning_rate": 
5.607109341742579e-06, + "loss": 0.7729, + "step": 380 + }, + { + "epoch": 0.9300480585857045, + "grad_norm": 0.6945457458496094, + "learning_rate": 5.587926824900637e-06, + "loss": 0.73, + "step": 381 + }, + { + "epoch": 0.9324891296056145, + "grad_norm": 0.641502320766449, + "learning_rate": 5.568735529132665e-06, + "loss": 0.7537, + "step": 382 + }, + { + "epoch": 0.9349302006255245, + "grad_norm": 0.7009978294372559, + "learning_rate": 5.5495357410031805e-06, + "loss": 0.7407, + "step": 383 + }, + { + "epoch": 0.9373712716454344, + "grad_norm": 0.6966471672058105, + "learning_rate": 5.530327747203507e-06, + "loss": 0.7287, + "step": 384 + }, + { + "epoch": 0.9398123426653444, + "grad_norm": 0.7361583709716797, + "learning_rate": 5.511111834547496e-06, + "loss": 0.7132, + "step": 385 + }, + { + "epoch": 0.9422534136852544, + "grad_norm": 0.8365876078605652, + "learning_rate": 5.491888289967241e-06, + "loss": 0.7517, + "step": 386 + }, + { + "epoch": 0.9446944847051644, + "grad_norm": 0.7376920580863953, + "learning_rate": 5.472657400508801e-06, + "loss": 0.7354, + "step": 387 + }, + { + "epoch": 0.9471355557250744, + "grad_norm": 0.6934507489204407, + "learning_rate": 5.4534194533279e-06, + "loss": 0.7418, + "step": 388 + }, + { + "epoch": 0.9495766267449843, + "grad_norm": 0.6778532862663269, + "learning_rate": 5.434174735685658e-06, + "loss": 0.7768, + "step": 389 + }, + { + "epoch": 0.9520176977648943, + "grad_norm": 0.7122485041618347, + "learning_rate": 5.414923534944283e-06, + "loss": 0.7674, + "step": 390 + }, + { + "epoch": 0.9544587687848043, + "grad_norm": 0.6961596608161926, + "learning_rate": 5.395666138562794e-06, + "loss": 0.7709, + "step": 391 + }, + { + "epoch": 0.9568998398047143, + "grad_norm": 0.7608603239059448, + "learning_rate": 5.376402834092721e-06, + "loss": 0.7787, + "step": 392 + }, + { + "epoch": 0.9593409108246242, + "grad_norm": 0.6927947402000427, + "learning_rate": 5.357133909173815e-06, + "loss": 0.7363, + "step": 393 + }, + { + "epoch": 0.9617819818445343, + "grad_norm": 0.7136049270629883, + "learning_rate": 5.337859651529747e-06, + "loss": 0.742, + "step": 394 + }, + { + "epoch": 0.9642230528644443, + "grad_norm": 0.7149622440338135, + "learning_rate": 5.318580348963826e-06, + "loss": 0.7501, + "step": 395 + }, + { + "epoch": 0.9666641238843543, + "grad_norm": 0.6613947749137878, + "learning_rate": 5.2992962893546804e-06, + "loss": 0.7045, + "step": 396 + }, + { + "epoch": 0.9691051949042643, + "grad_norm": 0.7548585534095764, + "learning_rate": 5.280007760651977e-06, + "loss": 0.7447, + "step": 397 + }, + { + "epoch": 0.9715462659241743, + "grad_norm": 0.6713358759880066, + "learning_rate": 5.260715050872119e-06, + "loss": 0.7356, + "step": 398 + }, + { + "epoch": 0.9739873369440842, + "grad_norm": 0.7130277156829834, + "learning_rate": 5.241418448093931e-06, + "loss": 0.7523, + "step": 399 + }, + { + "epoch": 0.9764284079639942, + "grad_norm": 0.7643216252326965, + "learning_rate": 5.222118240454376e-06, + "loss": 0.7581, + "step": 400 + }, + { + "epoch": 0.9788694789839042, + "grad_norm": 0.6513252258300781, + "learning_rate": 5.202814716144245e-06, + "loss": 0.7635, + "step": 401 + }, + { + "epoch": 0.9813105500038142, + "grad_norm": 0.7522091269493103, + "learning_rate": 5.1835081634038455e-06, + "loss": 0.7765, + "step": 402 + }, + { + "epoch": 0.9837516210237242, + "grad_norm": 0.7359428405761719, + "learning_rate": 5.164198870518714e-06, + "loss": 0.7626, + "step": 403 + }, + { + "epoch": 0.9861926920436341, + "grad_norm": 0.7049588561058044, 
+ "learning_rate": 5.144887125815301e-06, + "loss": 0.6856, + "step": 404 + }, + { + "epoch": 0.9886337630635441, + "grad_norm": 0.70652836561203, + "learning_rate": 5.125573217656664e-06, + "loss": 0.7479, + "step": 405 + }, + { + "epoch": 0.9910748340834541, + "grad_norm": 0.6710291504859924, + "learning_rate": 5.1062574344381686e-06, + "loss": 0.7419, + "step": 406 + }, + { + "epoch": 0.9935159051033641, + "grad_norm": 0.6925918459892273, + "learning_rate": 5.086940064583179e-06, + "loss": 0.7222, + "step": 407 + }, + { + "epoch": 0.995956976123274, + "grad_norm": 0.738444447517395, + "learning_rate": 5.067621396538747e-06, + "loss": 0.738, + "step": 408 + }, + { + "epoch": 0.998398047143184, + "grad_norm": 0.6995366811752319, + "learning_rate": 5.048301718771317e-06, + "loss": 0.7986, + "step": 409 + }, + { + "epoch": 1.000839118163094, + "grad_norm": 0.7283437848091125, + "learning_rate": 5.028981319762399e-06, + "loss": 0.7439, + "step": 410 + }, + { + "epoch": 1.003280189183004, + "grad_norm": 0.6933985948562622, + "learning_rate": 5.009660488004283e-06, + "loss": 0.7776, + "step": 411 + }, + { + "epoch": 1.005721260202914, + "grad_norm": 1.7147387266159058, + "learning_rate": 4.990339511995718e-06, + "loss": 0.7962, + "step": 412 + }, + { + "epoch": 1.008162331222824, + "grad_norm": 0.7424637675285339, + "learning_rate": 4.971018680237602e-06, + "loss": 0.7573, + "step": 413 + }, + { + "epoch": 1.010603402242734, + "grad_norm": 0.714701235294342, + "learning_rate": 4.951698281228686e-06, + "loss": 0.7582, + "step": 414 + }, + { + "epoch": 1.0003051338774887, + "grad_norm": 0.7080721855163574, + "learning_rate": 4.932378603461253e-06, + "loss": 0.7603, + "step": 415 + }, + { + "epoch": 1.0027462048973987, + "grad_norm": 1.1822799444198608, + "learning_rate": 4.913059935416822e-06, + "loss": 0.6213, + "step": 416 + }, + { + "epoch": 1.0051872759173086, + "grad_norm": 1.084796667098999, + "learning_rate": 4.893742565561832e-06, + "loss": 0.6391, + "step": 417 + }, + { + "epoch": 1.0076283469372187, + "grad_norm": 0.7524453401565552, + "learning_rate": 4.8744267823433374e-06, + "loss": 0.6246, + "step": 418 + }, + { + "epoch": 1.0100694179571288, + "grad_norm": 0.8332728147506714, + "learning_rate": 4.855112874184701e-06, + "loss": 0.615, + "step": 419 + }, + { + "epoch": 1.0125104889770387, + "grad_norm": 1.0015031099319458, + "learning_rate": 4.835801129481287e-06, + "loss": 0.6168, + "step": 420 + }, + { + "epoch": 1.0149515599969487, + "grad_norm": 0.9933431148529053, + "learning_rate": 4.816491836596157e-06, + "loss": 0.6083, + "step": 421 + }, + { + "epoch": 1.0173926310168586, + "grad_norm": 0.8857572674751282, + "learning_rate": 4.797185283855756e-06, + "loss": 0.6478, + "step": 422 + }, + { + "epoch": 1.0198337020367687, + "grad_norm": 1.0425084829330444, + "learning_rate": 4.777881759545625e-06, + "loss": 0.6198, + "step": 423 + }, + { + "epoch": 1.0222747730566786, + "grad_norm": 0.8083875775337219, + "learning_rate": 4.75858155190607e-06, + "loss": 0.6153, + "step": 424 + }, + { + "epoch": 1.0247158440765887, + "grad_norm": 0.8430187106132507, + "learning_rate": 4.7392849491278825e-06, + "loss": 0.5814, + "step": 425 + }, + { + "epoch": 1.0271569150964985, + "grad_norm": 0.8698053956031799, + "learning_rate": 4.719992239348024e-06, + "loss": 0.619, + "step": 426 + }, + { + "epoch": 1.0295979861164086, + "grad_norm": 0.8039147257804871, + "learning_rate": 4.700703710645322e-06, + "loss": 0.6716, + "step": 427 + }, + { + "epoch": 1.0320390571363185, + "grad_norm": 
0.7722117900848389, + "learning_rate": 4.681419651036177e-06, + "loss": 0.6423, + "step": 428 + }, + { + "epoch": 1.0344801281562286, + "grad_norm": 0.7633837461471558, + "learning_rate": 4.662140348470253e-06, + "loss": 0.6122, + "step": 429 + }, + { + "epoch": 1.0369211991761385, + "grad_norm": 0.845450758934021, + "learning_rate": 4.642866090826187e-06, + "loss": 0.617, + "step": 430 + }, + { + "epoch": 1.0393622701960485, + "grad_norm": 0.7823710441589355, + "learning_rate": 4.6235971659072806e-06, + "loss": 0.6102, + "step": 431 + }, + { + "epoch": 1.0418033412159584, + "grad_norm": 0.6919700503349304, + "learning_rate": 4.604333861437207e-06, + "loss": 0.5872, + "step": 432 + }, + { + "epoch": 1.0442444122358685, + "grad_norm": 0.8099319338798523, + "learning_rate": 4.585076465055719e-06, + "loss": 0.6435, + "step": 433 + }, + { + "epoch": 1.0466854832557784, + "grad_norm": 0.8152591586112976, + "learning_rate": 4.565825264314344e-06, + "loss": 0.6321, + "step": 434 + }, + { + "epoch": 1.0491265542756885, + "grad_norm": 0.7012869119644165, + "learning_rate": 4.5465805466721e-06, + "loss": 0.6248, + "step": 435 + }, + { + "epoch": 1.0515676252955983, + "grad_norm": 0.8160309195518494, + "learning_rate": 4.5273425994912e-06, + "loss": 0.6366, + "step": 436 + }, + { + "epoch": 1.0540086963155084, + "grad_norm": 0.7725083827972412, + "learning_rate": 4.5081117100327594e-06, + "loss": 0.6205, + "step": 437 + }, + { + "epoch": 1.0564497673354185, + "grad_norm": 0.7493377923965454, + "learning_rate": 4.488888165452506e-06, + "loss": 0.643, + "step": 438 + }, + { + "epoch": 1.0588908383553284, + "grad_norm": 0.7511170506477356, + "learning_rate": 4.469672252796495e-06, + "loss": 0.6245, + "step": 439 + }, + { + "epoch": 1.0613319093752385, + "grad_norm": 0.7679057717323303, + "learning_rate": 4.450464258996822e-06, + "loss": 0.6131, + "step": 440 + }, + { + "epoch": 1.0637729803951483, + "grad_norm": 0.7596908807754517, + "learning_rate": 4.4312644708673375e-06, + "loss": 0.6655, + "step": 441 + }, + { + "epoch": 1.0662140514150584, + "grad_norm": 0.709562361240387, + "learning_rate": 4.412073175099365e-06, + "loss": 0.637, + "step": 442 + }, + { + "epoch": 1.0686551224349683, + "grad_norm": 0.7175518274307251, + "learning_rate": 4.392890658257421e-06, + "loss": 0.6462, + "step": 443 + }, + { + "epoch": 1.0710961934548784, + "grad_norm": 0.7275789976119995, + "learning_rate": 4.373717206774935e-06, + "loss": 0.6773, + "step": 444 + }, + { + "epoch": 1.0735372644747883, + "grad_norm": 0.7145282626152039, + "learning_rate": 4.354553106949972e-06, + "loss": 0.5822, + "step": 445 + }, + { + "epoch": 1.0759783354946983, + "grad_norm": 0.7076422572135925, + "learning_rate": 4.335398644940958e-06, + "loss": 0.624, + "step": 446 + }, + { + "epoch": 1.0784194065146082, + "grad_norm": 0.7219144105911255, + "learning_rate": 4.316254106762404e-06, + "loss": 0.6099, + "step": 447 + }, + { + "epoch": 1.0808604775345183, + "grad_norm": 0.7198976874351501, + "learning_rate": 4.297119778280645e-06, + "loss": 0.6083, + "step": 448 + }, + { + "epoch": 1.0833015485544282, + "grad_norm": 0.7468734979629517, + "learning_rate": 4.277995945209558e-06, + "loss": 0.6438, + "step": 449 + }, + { + "epoch": 1.0857426195743383, + "grad_norm": 0.7466146945953369, + "learning_rate": 4.258882893106308e-06, + "loss": 0.609, + "step": 450 + }, + { + "epoch": 1.0881836905942481, + "grad_norm": 0.7118586897850037, + "learning_rate": 4.239780907367078e-06, + "loss": 0.6045, + "step": 451 + }, + { + "epoch": 1.0906247616141582, + 
"grad_norm": 0.7523221969604492, + "learning_rate": 4.220690273222802e-06, + "loss": 0.5949, + "step": 452 + }, + { + "epoch": 1.093065832634068, + "grad_norm": 0.7216614484786987, + "learning_rate": 4.201611275734916e-06, + "loss": 0.6117, + "step": 453 + }, + { + "epoch": 1.0955069036539782, + "grad_norm": 0.7935013771057129, + "learning_rate": 4.182544199791102e-06, + "loss": 0.6055, + "step": 454 + }, + { + "epoch": 1.0979479746738883, + "grad_norm": 0.7399945259094238, + "learning_rate": 4.163489330101017e-06, + "loss": 0.6147, + "step": 455 + }, + { + "epoch": 1.1003890456937981, + "grad_norm": 0.7488657236099243, + "learning_rate": 4.14444695119207e-06, + "loss": 0.6189, + "step": 456 + }, + { + "epoch": 1.1028301167137082, + "grad_norm": 0.7612137794494629, + "learning_rate": 4.125417347405147e-06, + "loss": 0.6291, + "step": 457 + }, + { + "epoch": 1.105271187733618, + "grad_norm": 0.7072088718414307, + "learning_rate": 4.106400802890377e-06, + "loss": 0.6332, + "step": 458 + }, + { + "epoch": 1.1077122587535282, + "grad_norm": 0.6833801865577698, + "learning_rate": 4.08739760160289e-06, + "loss": 0.5767, + "step": 459 + }, + { + "epoch": 1.110153329773438, + "grad_norm": 0.7195488214492798, + "learning_rate": 4.068408027298576e-06, + "loss": 0.6515, + "step": 460 + }, + { + "epoch": 1.1125944007933481, + "grad_norm": 0.7658545970916748, + "learning_rate": 4.049432363529842e-06, + "loss": 0.6528, + "step": 461 + }, + { + "epoch": 1.115035471813258, + "grad_norm": 0.713361382484436, + "learning_rate": 4.030470893641387e-06, + "loss": 0.6457, + "step": 462 + }, + { + "epoch": 1.117476542833168, + "grad_norm": 0.7208877205848694, + "learning_rate": 4.011523900765968e-06, + "loss": 0.657, + "step": 463 + }, + { + "epoch": 1.119917613853078, + "grad_norm": 0.7104807496070862, + "learning_rate": 3.992591667820166e-06, + "loss": 0.615, + "step": 464 + }, + { + "epoch": 1.122358684872988, + "grad_norm": 0.7092770338058472, + "learning_rate": 3.973674477500172e-06, + "loss": 0.6052, + "step": 465 + }, + { + "epoch": 1.124799755892898, + "grad_norm": 0.695006251335144, + "learning_rate": 3.954772612277557e-06, + "loss": 0.6211, + "step": 466 + }, + { + "epoch": 1.127240826912808, + "grad_norm": 0.6810059547424316, + "learning_rate": 3.935886354395057e-06, + "loss": 0.5862, + "step": 467 + }, + { + "epoch": 1.129681897932718, + "grad_norm": 0.6945995688438416, + "learning_rate": 3.917015985862364e-06, + "loss": 0.6112, + "step": 468 + }, + { + "epoch": 1.132122968952628, + "grad_norm": 0.7196330428123474, + "learning_rate": 3.8981617884519015e-06, + "loss": 0.6147, + "step": 469 + }, + { + "epoch": 1.1345640399725379, + "grad_norm": 0.6800129413604736, + "learning_rate": 3.8793240436946385e-06, + "loss": 0.6016, + "step": 470 + }, + { + "epoch": 1.137005110992448, + "grad_norm": 0.6729468107223511, + "learning_rate": 3.860503032875861e-06, + "loss": 0.6244, + "step": 471 + }, + { + "epoch": 1.139446182012358, + "grad_norm": 0.7448645234107971, + "learning_rate": 3.841699037030989e-06, + "loss": 0.5908, + "step": 472 + }, + { + "epoch": 1.141887253032268, + "grad_norm": 0.7207785248756409, + "learning_rate": 3.822912336941375e-06, + "loss": 0.6245, + "step": 473 + }, + { + "epoch": 1.1443283240521778, + "grad_norm": 0.6913513541221619, + "learning_rate": 3.80414321313011e-06, + "loss": 0.612, + "step": 474 + }, + { + "epoch": 1.1467693950720879, + "grad_norm": 0.7224765419960022, + "learning_rate": 3.7853919458578327e-06, + "loss": 0.627, + "step": 475 + }, + { + "epoch": 1.149210466091998, + 
"grad_norm": 0.7210829257965088, + "learning_rate": 3.7666588151185584e-06, + "loss": 0.6174, + "step": 476 + }, + { + "epoch": 1.1516515371119078, + "grad_norm": 0.7608765363693237, + "learning_rate": 3.7479441006354755e-06, + "loss": 0.6023, + "step": 477 + }, + { + "epoch": 1.154092608131818, + "grad_norm": 0.7259137034416199, + "learning_rate": 3.729248081856788e-06, + "loss": 0.611, + "step": 478 + }, + { + "epoch": 1.1565336791517278, + "grad_norm": 0.700013279914856, + "learning_rate": 3.7105710379515335e-06, + "loss": 0.6162, + "step": 479 + }, + { + "epoch": 1.1589747501716379, + "grad_norm": 0.7397728562355042, + "learning_rate": 3.6919132478054153e-06, + "loss": 0.567, + "step": 480 + }, + { + "epoch": 1.1614158211915477, + "grad_norm": 0.6973784565925598, + "learning_rate": 3.673274990016642e-06, + "loss": 0.6249, + "step": 481 + }, + { + "epoch": 1.1638568922114578, + "grad_norm": 0.7272012829780579, + "learning_rate": 3.6546565428917623e-06, + "loss": 0.6025, + "step": 482 + }, + { + "epoch": 1.1662979632313677, + "grad_norm": 0.7183772921562195, + "learning_rate": 3.6360581844415165e-06, + "loss": 0.6128, + "step": 483 + }, + { + "epoch": 1.1687390342512778, + "grad_norm": 0.7049492001533508, + "learning_rate": 3.6174801923766762e-06, + "loss": 0.6525, + "step": 484 + }, + { + "epoch": 1.1711801052711877, + "grad_norm": 0.7470197081565857, + "learning_rate": 3.5989228441039024e-06, + "loss": 0.6067, + "step": 485 + }, + { + "epoch": 1.1736211762910977, + "grad_norm": 0.66621994972229, + "learning_rate": 3.5803864167216055e-06, + "loss": 0.6597, + "step": 486 + }, + { + "epoch": 1.1760622473110076, + "grad_norm": 0.7643696665763855, + "learning_rate": 3.561871187015803e-06, + "loss": 0.6192, + "step": 487 + }, + { + "epoch": 1.1785033183309177, + "grad_norm": 0.7268946766853333, + "learning_rate": 3.543377431455991e-06, + "loss": 0.6335, + "step": 488 + }, + { + "epoch": 1.1809443893508278, + "grad_norm": 0.743349552154541, + "learning_rate": 3.5249054261910067e-06, + "loss": 0.6024, + "step": 489 + }, + { + "epoch": 1.1833854603707377, + "grad_norm": 0.6776405572891235, + "learning_rate": 3.506455447044923e-06, + "loss": 0.6601, + "step": 490 + }, + { + "epoch": 1.1858265313906475, + "grad_norm": 0.7699673771858215, + "learning_rate": 3.4880277695129095e-06, + "loss": 0.6093, + "step": 491 + }, + { + "epoch": 1.1882676024105576, + "grad_norm": 0.6653928160667419, + "learning_rate": 3.4696226687571317e-06, + "loss": 0.655, + "step": 492 + }, + { + "epoch": 1.1907086734304677, + "grad_norm": 0.6945586204528809, + "learning_rate": 3.4512404196026384e-06, + "loss": 0.6305, + "step": 493 + }, + { + "epoch": 1.1931497444503776, + "grad_norm": 0.7231103777885437, + "learning_rate": 3.432881296533257e-06, + "loss": 0.6186, + "step": 494 + }, + { + "epoch": 1.1955908154702877, + "grad_norm": 0.718945324420929, + "learning_rate": 3.4145455736874957e-06, + "loss": 0.6155, + "step": 495 + }, + { + "epoch": 1.1980318864901975, + "grad_norm": 0.738957941532135, + "learning_rate": 3.396233524854453e-06, + "loss": 0.6022, + "step": 496 + }, + { + "epoch": 1.2004729575101076, + "grad_norm": 0.6797804236412048, + "learning_rate": 3.377945423469727e-06, + "loss": 0.6448, + "step": 497 + }, + { + "epoch": 1.2029140285300175, + "grad_norm": 0.7095910906791687, + "learning_rate": 3.359681542611328e-06, + "loss": 0.6229, + "step": 498 + }, + { + "epoch": 1.2053550995499276, + "grad_norm": 0.6906200647354126, + "learning_rate": 3.3414421549956115e-06, + "loss": 0.6149, + "step": 499 + }, + { + 
"epoch": 1.2077961705698375, + "grad_norm": 0.7154446840286255, + "learning_rate": 3.323227532973193e-06, + "loss": 0.6198, + "step": 500 + }, + { + "epoch": 1.2102372415897475, + "grad_norm": 0.7295240759849548, + "learning_rate": 3.305037948524894e-06, + "loss": 0.6541, + "step": 501 + }, + { + "epoch": 1.2126783126096574, + "grad_norm": 0.7125757932662964, + "learning_rate": 3.2868736732576695e-06, + "loss": 0.6396, + "step": 502 + }, + { + "epoch": 1.2151193836295675, + "grad_norm": 0.6835936307907104, + "learning_rate": 3.268734978400564e-06, + "loss": 0.6576, + "step": 503 + }, + { + "epoch": 1.2175604546494774, + "grad_norm": 0.7498407363891602, + "learning_rate": 3.250622134800651e-06, + "loss": 0.6013, + "step": 504 + }, + { + "epoch": 1.2200015256693875, + "grad_norm": 0.7102557420730591, + "learning_rate": 3.2325354129189923e-06, + "loss": 0.6175, + "step": 505 + }, + { + "epoch": 1.2224425966892973, + "grad_norm": 0.7087395787239075, + "learning_rate": 3.214475082826602e-06, + "loss": 0.6391, + "step": 506 + }, + { + "epoch": 1.2248836677092074, + "grad_norm": 0.6935733556747437, + "learning_rate": 3.1964414142004123e-06, + "loss": 0.6234, + "step": 507 + }, + { + "epoch": 1.2273247387291173, + "grad_norm": 0.7102475166320801, + "learning_rate": 3.1784346763192437e-06, + "loss": 0.6339, + "step": 508 + }, + { + "epoch": 1.2297658097490274, + "grad_norm": 0.7003853917121887, + "learning_rate": 3.160455138059788e-06, + "loss": 0.6077, + "step": 509 + }, + { + "epoch": 1.2322068807689375, + "grad_norm": 0.6690055727958679, + "learning_rate": 3.142503067892594e-06, + "loss": 0.6097, + "step": 510 + }, + { + "epoch": 1.2346479517888473, + "grad_norm": 0.7464238405227661, + "learning_rate": 3.1245787338780555e-06, + "loss": 0.617, + "step": 511 + }, + { + "epoch": 1.2370890228087574, + "grad_norm": 0.7298991084098816, + "learning_rate": 3.1066824036624086e-06, + "loss": 0.6492, + "step": 512 + }, + { + "epoch": 1.2395300938286673, + "grad_norm": 0.686462938785553, + "learning_rate": 3.0888143444737395e-06, + "loss": 0.5875, + "step": 513 + }, + { + "epoch": 1.2419711648485774, + "grad_norm": 0.7086668610572815, + "learning_rate": 3.070974823117986e-06, + "loss": 0.6059, + "step": 514 + }, + { + "epoch": 1.2444122358684873, + "grad_norm": 0.7413360476493835, + "learning_rate": 3.053164105974964e-06, + "loss": 0.5894, + "step": 515 + }, + { + "epoch": 1.2468533068883974, + "grad_norm": 0.6833621859550476, + "learning_rate": 3.0353824589943835e-06, + "loss": 0.6254, + "step": 516 + }, + { + "epoch": 1.2492943779083072, + "grad_norm": 0.6703265309333801, + "learning_rate": 3.017630147691878e-06, + "loss": 0.6072, + "step": 517 + }, + { + "epoch": 1.2517354489282173, + "grad_norm": 0.7052757143974304, + "learning_rate": 2.999907437145042e-06, + "loss": 0.614, + "step": 518 + }, + { + "epoch": 1.2541765199481272, + "grad_norm": 0.7140656113624573, + "learning_rate": 2.9822145919894676e-06, + "loss": 0.598, + "step": 519 + }, + { + "epoch": 1.2566175909680373, + "grad_norm": 0.722187340259552, + "learning_rate": 2.964551876414801e-06, + "loss": 0.6277, + "step": 520 + }, + { + "epoch": 1.2590586619879471, + "grad_norm": 0.7111802101135254, + "learning_rate": 2.946919554160792e-06, + "loss": 0.6221, + "step": 521 + }, + { + "epoch": 1.2614997330078572, + "grad_norm": 0.695512056350708, + "learning_rate": 2.929317888513353e-06, + "loss": 0.6257, + "step": 522 + }, + { + "epoch": 1.2639408040277673, + "grad_norm": 0.7214482426643372, + "learning_rate": 2.9117471423006418e-06, + "loss": 0.6436, 
+ "step": 523 + }, + { + "epoch": 1.2663818750476772, + "grad_norm": 0.7167050242424011, + "learning_rate": 2.8942075778891153e-06, + "loss": 0.5728, + "step": 524 + }, + { + "epoch": 1.268822946067587, + "grad_norm": 0.737602710723877, + "learning_rate": 2.8766994571796336e-06, + "loss": 0.6247, + "step": 525 + }, + { + "epoch": 1.2712640170874971, + "grad_norm": 0.7210221290588379, + "learning_rate": 2.859223041603534e-06, + "loss": 0.6216, + "step": 526 + }, + { + "epoch": 1.2737050881074072, + "grad_norm": 0.6660017371177673, + "learning_rate": 2.84177859211873e-06, + "loss": 0.6212, + "step": 527 + }, + { + "epoch": 1.276146159127317, + "grad_norm": 0.7368438839912415, + "learning_rate": 2.8243663692058255e-06, + "loss": 0.6509, + "step": 528 + }, + { + "epoch": 1.278587230147227, + "grad_norm": 0.6850879788398743, + "learning_rate": 2.806986632864208e-06, + "loss": 0.6128, + "step": 529 + }, + { + "epoch": 1.281028301167137, + "grad_norm": 0.6914191842079163, + "learning_rate": 2.7896396426081844e-06, + "loss": 0.6038, + "step": 530 + }, + { + "epoch": 1.2834693721870472, + "grad_norm": 0.7017717361450195, + "learning_rate": 2.772325657463088e-06, + "loss": 0.6523, + "step": 531 + }, + { + "epoch": 1.285910443206957, + "grad_norm": 0.7203471660614014, + "learning_rate": 2.7550449359614272e-06, + "loss": 0.6852, + "step": 532 + }, + { + "epoch": 1.2883515142268671, + "grad_norm": 0.7346776127815247, + "learning_rate": 2.7377977361390118e-06, + "loss": 0.5981, + "step": 533 + }, + { + "epoch": 1.290792585246777, + "grad_norm": 0.6989855170249939, + "learning_rate": 2.7205843155311098e-06, + "loss": 0.6554, + "step": 534 + }, + { + "epoch": 1.293233656266687, + "grad_norm": 0.7189421653747559, + "learning_rate": 2.703404931168594e-06, + "loss": 0.6285, + "step": 535 + }, + { + "epoch": 1.295674727286597, + "grad_norm": 0.7148353457450867, + "learning_rate": 2.6862598395741136e-06, + "loss": 0.6629, + "step": 536 + }, + { + "epoch": 1.298115798306507, + "grad_norm": 0.7388694882392883, + "learning_rate": 2.66914929675825e-06, + "loss": 0.6393, + "step": 537 + }, + { + "epoch": 1.300556869326417, + "grad_norm": 0.7198134660720825, + "learning_rate": 2.652073558215711e-06, + "loss": 0.6458, + "step": 538 + }, + { + "epoch": 1.302997940346327, + "grad_norm": 0.7222850322723389, + "learning_rate": 2.6350328789215e-06, + "loss": 0.5939, + "step": 539 + }, + { + "epoch": 1.3054390113662369, + "grad_norm": 0.7000096440315247, + "learning_rate": 2.618027513327116e-06, + "loss": 0.6409, + "step": 540 + }, + { + "epoch": 1.307880082386147, + "grad_norm": 0.6793365478515625, + "learning_rate": 2.6010577153567597e-06, + "loss": 0.6155, + "step": 541 + }, + { + "epoch": 1.3103211534060568, + "grad_norm": 0.7198613882064819, + "learning_rate": 2.584123738403527e-06, + "loss": 0.6099, + "step": 542 + }, + { + "epoch": 1.312762224425967, + "grad_norm": 0.6797595620155334, + "learning_rate": 2.567225835325642e-06, + "loss": 0.6216, + "step": 543 + }, + { + "epoch": 1.315203295445877, + "grad_norm": 0.6998633742332458, + "learning_rate": 2.550364258442671e-06, + "loss": 0.6268, + "step": 544 + }, + { + "epoch": 1.3176443664657869, + "grad_norm": 0.706811785697937, + "learning_rate": 2.533539259531757e-06, + "loss": 0.5907, + "step": 545 + }, + { + "epoch": 1.3200854374856967, + "grad_norm": 0.7156489491462708, + "learning_rate": 2.5167510898238566e-06, + "loss": 0.62, + "step": 546 + }, + { + "epoch": 1.3225265085056068, + "grad_norm": 0.7096937894821167, + "learning_rate": 2.5000000000000015e-06, + 
"loss": 0.6087, + "step": 547 + }, + { + "epoch": 1.324967579525517, + "grad_norm": 0.6790308952331543, + "learning_rate": 2.483286240187538e-06, + "loss": 0.5822, + "step": 548 + }, + { + "epoch": 1.3274086505454268, + "grad_norm": 0.7136918306350708, + "learning_rate": 2.466610059956401e-06, + "loss": 0.6387, + "step": 549 + }, + { + "epoch": 1.3298497215653369, + "grad_norm": 0.7166918516159058, + "learning_rate": 2.4499717083153975e-06, + "loss": 0.6196, + "step": 550 + }, + { + "epoch": 1.3322907925852467, + "grad_norm": 0.6799836158752441, + "learning_rate": 2.433371433708465e-06, + "loss": 0.602, + "step": 551 + }, + { + "epoch": 1.3347318636051568, + "grad_norm": 0.6526500582695007, + "learning_rate": 2.4168094840109784e-06, + "loss": 0.598, + "step": 552 + }, + { + "epoch": 1.3371729346250667, + "grad_norm": 0.6871894001960754, + "learning_rate": 2.4002861065260506e-06, + "loss": 0.5954, + "step": 553 + }, + { + "epoch": 1.3396140056449768, + "grad_norm": 0.6922629475593567, + "learning_rate": 2.383801547980826e-06, + "loss": 0.6007, + "step": 554 + }, + { + "epoch": 1.3420550766648867, + "grad_norm": 0.7360095977783203, + "learning_rate": 2.3673560545228082e-06, + "loss": 0.6713, + "step": 555 + }, + { + "epoch": 1.3444961476847967, + "grad_norm": 0.6689037680625916, + "learning_rate": 2.3509498717161803e-06, + "loss": 0.6255, + "step": 556 + }, + { + "epoch": 1.3469372187047066, + "grad_norm": 0.7025970816612244, + "learning_rate": 2.3345832445381415e-06, + "loss": 0.6239, + "step": 557 + }, + { + "epoch": 1.3493782897246167, + "grad_norm": 0.7011806964874268, + "learning_rate": 2.31825641737524e-06, + "loss": 0.6243, + "step": 558 + }, + { + "epoch": 1.3518193607445266, + "grad_norm": 0.7212774753570557, + "learning_rate": 2.3019696340197358e-06, + "loss": 0.612, + "step": 559 + }, + { + "epoch": 1.3542604317644367, + "grad_norm": 0.6677849292755127, + "learning_rate": 2.2857231376659517e-06, + "loss": 0.6362, + "step": 560 + }, + { + "epoch": 1.3567015027843468, + "grad_norm": 0.7369222640991211, + "learning_rate": 2.2695171709066427e-06, + "loss": 0.5964, + "step": 561 + }, + { + "epoch": 1.3591425738042566, + "grad_norm": 0.7177897691726685, + "learning_rate": 2.2533519757293803e-06, + "loss": 0.6261, + "step": 562 + }, + { + "epoch": 1.3615836448241665, + "grad_norm": 0.7659596800804138, + "learning_rate": 2.237227793512935e-06, + "loss": 0.623, + "step": 563 + }, + { + "epoch": 1.3640247158440766, + "grad_norm": 0.7141837477684021, + "learning_rate": 2.221144865023666e-06, + "loss": 0.6207, + "step": 564 + }, + { + "epoch": 1.3664657868639867, + "grad_norm": 0.7026992440223694, + "learning_rate": 2.2051034304119344e-06, + "loss": 0.6172, + "step": 565 + }, + { + "epoch": 1.3689068578838965, + "grad_norm": 0.7084405422210693, + "learning_rate": 2.1891037292085177e-06, + "loss": 0.5976, + "step": 566 + }, + { + "epoch": 1.3713479289038064, + "grad_norm": 0.7147724032402039, + "learning_rate": 2.1731460003210255e-06, + "loss": 0.6032, + "step": 567 + }, + { + "epoch": 1.3737889999237165, + "grad_norm": 0.6965445876121521, + "learning_rate": 2.157230482030336e-06, + "loss": 0.6451, + "step": 568 + }, + { + "epoch": 1.3762300709436266, + "grad_norm": 0.7260875105857849, + "learning_rate": 2.141357411987044e-06, + "loss": 0.5887, + "step": 569 + }, + { + "epoch": 1.3786711419635365, + "grad_norm": 0.7008638978004456, + "learning_rate": 2.1255270272079044e-06, + "loss": 0.6476, + "step": 570 + }, + { + "epoch": 1.3811122129834466, + "grad_norm": 0.6777788996696472, + 
"learning_rate": 2.1097395640722916e-06, + "loss": 0.6201, + "step": 571 + }, + { + "epoch": 1.3835532840033564, + "grad_norm": 0.7306210994720459, + "learning_rate": 2.0939952583186806e-06, + "loss": 0.6373, + "step": 572 + }, + { + "epoch": 1.3859943550232665, + "grad_norm": 0.7415671944618225, + "learning_rate": 2.0782943450411148e-06, + "loss": 0.6038, + "step": 573 + }, + { + "epoch": 1.3884354260431764, + "grad_norm": 0.7014935612678528, + "learning_rate": 2.062637058685701e-06, + "loss": 0.6012, + "step": 574 + }, + { + "epoch": 1.3908764970630865, + "grad_norm": 0.664974570274353, + "learning_rate": 2.0470236330471125e-06, + "loss": 0.5804, + "step": 575 + }, + { + "epoch": 1.3933175680829963, + "grad_norm": 0.7382723689079285, + "learning_rate": 2.0314543012650934e-06, + "loss": 0.6291, + "step": 576 + }, + { + "epoch": 1.3957586391029064, + "grad_norm": 0.7389819622039795, + "learning_rate": 2.015929295820974e-06, + "loss": 0.6151, + "step": 577 + }, + { + "epoch": 1.3981997101228165, + "grad_norm": 0.7102298736572266, + "learning_rate": 2.000448848534209e-06, + "loss": 0.6381, + "step": 578 + }, + { + "epoch": 1.4006407811427264, + "grad_norm": 0.7227942943572998, + "learning_rate": 1.9850131905589065e-06, + "loss": 0.6524, + "step": 579 + }, + { + "epoch": 1.4030818521626363, + "grad_norm": 0.6704846620559692, + "learning_rate": 1.9696225523803803e-06, + "loss": 0.5928, + "step": 580 + }, + { + "epoch": 1.4055229231825463, + "grad_norm": 0.6964511871337891, + "learning_rate": 1.9542771638117124e-06, + "loss": 0.6072, + "step": 581 + }, + { + "epoch": 1.4079639942024564, + "grad_norm": 0.6633496284484863, + "learning_rate": 1.9389772539903123e-06, + "loss": 0.5949, + "step": 582 + }, + { + "epoch": 1.4104050652223663, + "grad_norm": 0.6861550211906433, + "learning_rate": 1.923723051374505e-06, + "loss": 0.607, + "step": 583 + }, + { + "epoch": 1.4128461362422762, + "grad_norm": 0.705496609210968, + "learning_rate": 1.908514783740114e-06, + "loss": 0.6053, + "step": 584 + }, + { + "epoch": 1.4152872072621863, + "grad_norm": 0.6763675808906555, + "learning_rate": 1.89335267817706e-06, + "loss": 0.6074, + "step": 585 + }, + { + "epoch": 1.4177282782820964, + "grad_norm": 0.7484830617904663, + "learning_rate": 1.8782369610859707e-06, + "loss": 0.5975, + "step": 586 + }, + { + "epoch": 1.4201693493020062, + "grad_norm": 0.6787152290344238, + "learning_rate": 1.8631678581748059e-06, + "loss": 0.6134, + "step": 587 + }, + { + "epoch": 1.4226104203219163, + "grad_norm": 0.6600430607795715, + "learning_rate": 1.848145594455477e-06, + "loss": 0.6017, + "step": 588 + }, + { + "epoch": 1.4250514913418262, + "grad_norm": 0.6843500137329102, + "learning_rate": 1.8331703942404932e-06, + "loss": 0.6027, + "step": 589 + }, + { + "epoch": 1.4274925623617363, + "grad_norm": 0.725928544998169, + "learning_rate": 1.8182424811396131e-06, + "loss": 0.5891, + "step": 590 + }, + { + "epoch": 1.4299336333816461, + "grad_norm": 0.7073947787284851, + "learning_rate": 1.8033620780565058e-06, + "loss": 0.6, + "step": 591 + }, + { + "epoch": 1.4323747044015562, + "grad_norm": 0.6989341378211975, + "learning_rate": 1.7885294071854159e-06, + "loss": 0.6025, + "step": 592 + }, + { + "epoch": 1.434815775421466, + "grad_norm": 0.7197926640510559, + "learning_rate": 1.7737446900078503e-06, + "loss": 0.5944, + "step": 593 + }, + { + "epoch": 1.4372568464413762, + "grad_norm": 0.6586629152297974, + "learning_rate": 1.7590081472892779e-06, + "loss": 0.6074, + "step": 594 + }, + { + "epoch": 1.439697917461286, + 
"grad_norm": 0.6680869460105896, + "learning_rate": 1.7443199990758168e-06, + "loss": 0.5892, + "step": 595 + }, + { + "epoch": 1.4421389884811961, + "grad_norm": 0.6742851734161377, + "learning_rate": 1.7296804646909654e-06, + "loss": 0.6125, + "step": 596 + }, + { + "epoch": 1.444580059501106, + "grad_norm": 0.6669274568557739, + "learning_rate": 1.71508976273232e-06, + "loss": 0.6625, + "step": 597 + }, + { + "epoch": 1.447021130521016, + "grad_norm": 0.6753084659576416, + "learning_rate": 1.7005481110683064e-06, + "loss": 0.6016, + "step": 598 + }, + { + "epoch": 1.4494622015409262, + "grad_norm": 0.6702268719673157, + "learning_rate": 1.686055726834932e-06, + "loss": 0.5682, + "step": 599 + }, + { + "epoch": 1.451903272560836, + "grad_norm": 0.6861889958381653, + "learning_rate": 1.6716128264325477e-06, + "loss": 0.6066, + "step": 600 + }, + { + "epoch": 1.454344343580746, + "grad_norm": 0.7023141980171204, + "learning_rate": 1.6572196255226063e-06, + "loss": 0.6164, + "step": 601 + }, + { + "epoch": 1.456785414600656, + "grad_norm": 0.678805410861969, + "learning_rate": 1.6428763390244462e-06, + "loss": 0.6037, + "step": 602 + }, + { + "epoch": 1.4592264856205661, + "grad_norm": 0.6716406941413879, + "learning_rate": 1.6285831811120938e-06, + "loss": 0.5865, + "step": 603 + }, + { + "epoch": 1.461667556640476, + "grad_norm": 0.6960821151733398, + "learning_rate": 1.614340365211044e-06, + "loss": 0.626, + "step": 604 + }, + { + "epoch": 1.464108627660386, + "grad_norm": 0.6855660080909729, + "learning_rate": 1.6001481039950872e-06, + "loss": 0.6583, + "step": 605 + }, + { + "epoch": 1.466549698680296, + "grad_norm": 0.686470091342926, + "learning_rate": 1.5860066093831366e-06, + "loss": 0.5914, + "step": 606 + }, + { + "epoch": 1.468990769700206, + "grad_norm": 0.6726788878440857, + "learning_rate": 1.5719160925360517e-06, + "loss": 0.6499, + "step": 607 + }, + { + "epoch": 1.471431840720116, + "grad_norm": 0.6723580956459045, + "learning_rate": 1.557876763853493e-06, + "loss": 0.6381, + "step": 608 + }, + { + "epoch": 1.473872911740026, + "grad_norm": 0.7011963129043579, + "learning_rate": 1.5438888329707824e-06, + "loss": 0.5925, + "step": 609 + }, + { + "epoch": 1.4763139827599359, + "grad_norm": 0.708925724029541, + "learning_rate": 1.5299525087557682e-06, + "loss": 0.5849, + "step": 610 + }, + { + "epoch": 1.478755053779846, + "grad_norm": 0.6862295866012573, + "learning_rate": 1.5160679993057048e-06, + "loss": 0.5964, + "step": 611 + }, + { + "epoch": 1.4811961247997558, + "grad_norm": 0.6844595670700073, + "learning_rate": 1.502235511944154e-06, + "loss": 0.5876, + "step": 612 + }, + { + "epoch": 1.483637195819666, + "grad_norm": 0.6619264483451843, + "learning_rate": 1.488455253217877e-06, + "loss": 0.6242, + "step": 613 + }, + { + "epoch": 1.4860782668395758, + "grad_norm": 0.6791555881500244, + "learning_rate": 1.4747274288937597e-06, + "loss": 0.624, + "step": 614 + }, + { + "epoch": 1.4885193378594859, + "grad_norm": 0.6787840723991394, + "learning_rate": 1.461052243955739e-06, + "loss": 0.6298, + "step": 615 + } + ], + "logging_steps": 1, + "max_steps": 818, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 205, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3668760344669979e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}