{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9712, "eval_steps": 500, "global_step": 308000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032, "grad_norm": 1.7230606079101562, "learning_rate": 4.99208e-05, "loss": 1.2281, "step": 500 }, { "epoch": 0.0064, "grad_norm": 3.655383348464966, "learning_rate": 4.9840800000000006e-05, "loss": 0.7566, "step": 1000 }, { "epoch": 0.0096, "grad_norm": 1.2925927639007568, "learning_rate": 4.97608e-05, "loss": 0.6764, "step": 1500 }, { "epoch": 0.0128, "grad_norm": 1.286004900932312, "learning_rate": 4.968080000000001e-05, "loss": 0.6304, "step": 2000 }, { "epoch": 0.016, "grad_norm": 1.2140214443206787, "learning_rate": 4.96008e-05, "loss": 0.5981, "step": 2500 }, { "epoch": 0.0192, "grad_norm": 1.2525482177734375, "learning_rate": 4.95208e-05, "loss": 0.5767, "step": 3000 }, { "epoch": 0.0224, "grad_norm": 1.2310410737991333, "learning_rate": 4.94408e-05, "loss": 0.5597, "step": 3500 }, { "epoch": 0.0256, "grad_norm": 1.1735206842422485, "learning_rate": 4.9360800000000004e-05, "loss": 0.5418, "step": 4000 }, { "epoch": 0.0288, "grad_norm": 1.114688754081726, "learning_rate": 4.9280800000000004e-05, "loss": 0.5335, "step": 4500 }, { "epoch": 0.032, "grad_norm": 0.8874593377113342, "learning_rate": 4.9200800000000005e-05, "loss": 0.5237, "step": 5000 }, { "epoch": 0.0352, "grad_norm": 1.1261299848556519, "learning_rate": 4.91208e-05, "loss": 0.5135, "step": 5500 }, { "epoch": 0.0384, "grad_norm": 0.9994556307792664, "learning_rate": 4.9040800000000007e-05, "loss": 0.5059, "step": 6000 }, { "epoch": 0.0416, "grad_norm": 1.2349673509597778, "learning_rate": 4.89608e-05, "loss": 0.4939, "step": 6500 }, { "epoch": 0.0448, "grad_norm": 0.9770995378494263, "learning_rate": 4.88808e-05, "loss": 0.4824, "step": 7000 }, { "epoch": 0.048, "grad_norm": 0.981966495513916, "learning_rate": 4.88008e-05, "loss": 0.4875, "step": 7500 }, { "epoch": 0.0512, "grad_norm": 1.0177415609359741, "learning_rate": 4.87208e-05, "loss": 0.4785, "step": 8000 }, { "epoch": 0.0544, "grad_norm": 1.0521667003631592, "learning_rate": 4.8640800000000004e-05, "loss": 0.4731, "step": 8500 }, { "epoch": 0.0576, "grad_norm": 0.8560615181922913, "learning_rate": 4.85608e-05, "loss": 0.4633, "step": 9000 }, { "epoch": 0.0608, "grad_norm": 1.0170217752456665, "learning_rate": 4.8480800000000005e-05, "loss": 0.4576, "step": 9500 }, { "epoch": 0.064, "grad_norm": 0.9891325831413269, "learning_rate": 4.84008e-05, "loss": 0.4556, "step": 10000 }, { "epoch": 0.0672, "grad_norm": 1.0609711408615112, "learning_rate": 4.832080000000001e-05, "loss": 0.4493, "step": 10500 }, { "epoch": 0.0704, "grad_norm": 0.8623799681663513, "learning_rate": 4.82408e-05, "loss": 0.4459, "step": 11000 }, { "epoch": 0.0736, "grad_norm": 0.9587870240211487, "learning_rate": 4.81608e-05, "loss": 0.4418, "step": 11500 }, { "epoch": 0.0768, "grad_norm": 0.8939447999000549, "learning_rate": 4.80808e-05, "loss": 0.4327, "step": 12000 }, { "epoch": 0.08, "grad_norm": 0.9886033535003662, "learning_rate": 4.80008e-05, "loss": 0.438, "step": 12500 }, { "epoch": 0.0832, "grad_norm": 0.9157513976097107, "learning_rate": 4.7920800000000004e-05, "loss": 0.4323, "step": 13000 }, { "epoch": 0.0864, "grad_norm": 0.9085854887962341, "learning_rate": 4.7840800000000005e-05, "loss": 0.4303, "step": 13500 }, { "epoch": 0.0896, "grad_norm": 0.9123984575271606, "learning_rate": 4.77608e-05, "loss": 0.4247, "step": 14000 }, { "epoch": 0.0928, "grad_norm": 0.839026689529419, "learning_rate": 4.7680960000000004e-05, "loss": 0.4233, "step": 14500 }, { "epoch": 0.096, "grad_norm": 0.8110847473144531, "learning_rate": 4.760096e-05, "loss": 0.4207, "step": 15000 }, { "epoch": 0.0992, "grad_norm": 0.8462579250335693, "learning_rate": 4.7520960000000005e-05, "loss": 0.421, "step": 15500 }, { "epoch": 0.1024, "grad_norm": 0.8980106711387634, "learning_rate": 4.744096e-05, "loss": 0.417, "step": 16000 }, { "epoch": 0.1056, "grad_norm": 0.8297702074050903, "learning_rate": 4.736096000000001e-05, "loss": 0.4139, "step": 16500 }, { "epoch": 0.1088, "grad_norm": 0.9856173992156982, "learning_rate": 4.728096e-05, "loss": 0.419, "step": 17000 }, { "epoch": 0.112, "grad_norm": 0.934256911277771, "learning_rate": 4.720096e-05, "loss": 0.4098, "step": 17500 }, { "epoch": 0.1152, "grad_norm": 0.9190649390220642, "learning_rate": 4.712096e-05, "loss": 0.412, "step": 18000 }, { "epoch": 0.1184, "grad_norm": 0.9078772664070129, "learning_rate": 4.704096e-05, "loss": 0.4043, "step": 18500 }, { "epoch": 0.1216, "grad_norm": 1.082939624786377, "learning_rate": 4.696112e-05, "loss": 0.4045, "step": 19000 }, { "epoch": 0.1248, "grad_norm": 0.9159390926361084, "learning_rate": 4.688112e-05, "loss": 0.4098, "step": 19500 }, { "epoch": 0.128, "grad_norm": 0.8420547842979431, "learning_rate": 4.680128e-05, "loss": 0.4033, "step": 20000 }, { "epoch": 0.1312, "grad_norm": 0.7658286094665527, "learning_rate": 4.672128e-05, "loss": 0.4002, "step": 20500 }, { "epoch": 0.1344, "grad_norm": 0.9074057340621948, "learning_rate": 4.664128e-05, "loss": 0.3964, "step": 21000 }, { "epoch": 0.1376, "grad_norm": 0.6065025329589844, "learning_rate": 4.656128e-05, "loss": 0.3984, "step": 21500 }, { "epoch": 0.1408, "grad_norm": 0.7523757219314575, "learning_rate": 4.6481280000000004e-05, "loss": 0.3959, "step": 22000 }, { "epoch": 0.144, "grad_norm": 0.807826042175293, "learning_rate": 4.6401280000000004e-05, "loss": 0.3921, "step": 22500 }, { "epoch": 0.1472, "grad_norm": 0.8530682325363159, "learning_rate": 4.632128e-05, "loss": 0.4002, "step": 23000 }, { "epoch": 0.1504, "grad_norm": 0.8661518692970276, "learning_rate": 4.6241280000000006e-05, "loss": 0.3856, "step": 23500 }, { "epoch": 0.1536, "grad_norm": 0.7473235130310059, "learning_rate": 4.616144e-05, "loss": 0.3854, "step": 24000 }, { "epoch": 0.1568, "grad_norm": 0.7954819202423096, "learning_rate": 4.6081440000000005e-05, "loss": 0.3871, "step": 24500 }, { "epoch": 0.16, "grad_norm": 0.8758727312088013, "learning_rate": 4.600144e-05, "loss": 0.3842, "step": 25000 }, { "epoch": 0.1632, "grad_norm": 0.8430293798446655, "learning_rate": 4.592144000000001e-05, "loss": 0.3886, "step": 25500 }, { "epoch": 0.1664, "grad_norm": 0.6557173728942871, "learning_rate": 4.584144e-05, "loss": 0.3854, "step": 26000 }, { "epoch": 0.1696, "grad_norm": 0.7791888117790222, "learning_rate": 4.576144e-05, "loss": 0.3796, "step": 26500 }, { "epoch": 0.1728, "grad_norm": 0.736084520816803, "learning_rate": 4.56816e-05, "loss": 0.3806, "step": 27000 }, { "epoch": 0.176, "grad_norm": 0.7714269161224365, "learning_rate": 4.56016e-05, "loss": 0.3781, "step": 27500 }, { "epoch": 0.1792, "grad_norm": 0.766144335269928, "learning_rate": 4.552176e-05, "loss": 0.3766, "step": 28000 }, { "epoch": 0.1824, "grad_norm": 0.7035301923751831, "learning_rate": 4.544176e-05, "loss": 0.3737, "step": 28500 }, { "epoch": 0.1856, "grad_norm": 0.7573793530464172, "learning_rate": 4.536176e-05, "loss": 0.3753, "step": 29000 }, { "epoch": 0.1888, "grad_norm": 0.8799508213996887, "learning_rate": 4.528176e-05, "loss": 0.373, "step": 29500 }, { "epoch": 0.192, "grad_norm": 0.8543264269828796, "learning_rate": 4.520176e-05, "loss": 0.3735, "step": 30000 }, { "epoch": 0.1952, "grad_norm": 0.6768947243690491, "learning_rate": 4.512176e-05, "loss": 0.3697, "step": 30500 }, { "epoch": 0.1984, "grad_norm": 0.8239702582359314, "learning_rate": 4.504176e-05, "loss": 0.3675, "step": 31000 }, { "epoch": 0.2016, "grad_norm": 0.8310449123382568, "learning_rate": 4.4961760000000004e-05, "loss": 0.3695, "step": 31500 }, { "epoch": 0.2048, "grad_norm": 0.8459475040435791, "learning_rate": 4.488176e-05, "loss": 0.3694, "step": 32000 }, { "epoch": 0.208, "grad_norm": 0.7346063852310181, "learning_rate": 4.4801760000000006e-05, "loss": 0.3646, "step": 32500 }, { "epoch": 0.2112, "grad_norm": 0.6958354115486145, "learning_rate": 4.472176e-05, "loss": 0.3704, "step": 33000 }, { "epoch": 0.2144, "grad_norm": 0.8244686722755432, "learning_rate": 4.464176000000001e-05, "loss": 0.3647, "step": 33500 }, { "epoch": 0.2176, "grad_norm": 0.7559502124786377, "learning_rate": 4.456192e-05, "loss": 0.3665, "step": 34000 }, { "epoch": 0.2208, "grad_norm": 0.9046504497528076, "learning_rate": 4.4481920000000007e-05, "loss": 0.3637, "step": 34500 }, { "epoch": 0.224, "grad_norm": 0.7771899700164795, "learning_rate": 4.440192e-05, "loss": 0.3648, "step": 35000 }, { "epoch": 0.2272, "grad_norm": 0.6887528300285339, "learning_rate": 4.432192e-05, "loss": 0.3562, "step": 35500 }, { "epoch": 0.2304, "grad_norm": 0.7471407055854797, "learning_rate": 4.424192e-05, "loss": 0.3639, "step": 36000 }, { "epoch": 0.2336, "grad_norm": 0.7198163270950317, "learning_rate": 4.416192e-05, "loss": 0.3604, "step": 36500 }, { "epoch": 0.2368, "grad_norm": 0.7383478879928589, "learning_rate": 4.4081920000000004e-05, "loss": 0.3592, "step": 37000 }, { "epoch": 0.24, "grad_norm": 0.8052579760551453, "learning_rate": 4.4001920000000004e-05, "loss": 0.3563, "step": 37500 }, { "epoch": 0.2432, "grad_norm": 0.7765107154846191, "learning_rate": 4.392224e-05, "loss": 0.3548, "step": 38000 }, { "epoch": 0.2464, "grad_norm": 0.7250288724899292, "learning_rate": 4.384224e-05, "loss": 0.3605, "step": 38500 }, { "epoch": 0.2496, "grad_norm": 0.6914694309234619, "learning_rate": 4.376224e-05, "loss": 0.3551, "step": 39000 }, { "epoch": 0.2528, "grad_norm": 0.6636275053024292, "learning_rate": 4.368224e-05, "loss": 0.3587, "step": 39500 }, { "epoch": 0.256, "grad_norm": 0.710564911365509, "learning_rate": 4.360224e-05, "loss": 0.3537, "step": 40000 }, { "epoch": 0.2592, "grad_norm": 0.6195800304412842, "learning_rate": 4.3522240000000004e-05, "loss": 0.3537, "step": 40500 }, { "epoch": 0.2624, "grad_norm": 0.7131514549255371, "learning_rate": 4.34424e-05, "loss": 0.3531, "step": 41000 }, { "epoch": 0.2656, "grad_norm": 0.6594410538673401, "learning_rate": 4.336256e-05, "loss": 0.3518, "step": 41500 }, { "epoch": 0.2688, "grad_norm": 0.7651230096817017, "learning_rate": 4.328256e-05, "loss": 0.3516, "step": 42000 }, { "epoch": 0.272, "grad_norm": 0.756515622138977, "learning_rate": 4.320256e-05, "loss": 0.3461, "step": 42500 }, { "epoch": 0.2752, "grad_norm": 0.7201528549194336, "learning_rate": 4.3122560000000003e-05, "loss": 0.3497, "step": 43000 }, { "epoch": 0.2784, "grad_norm": 0.7436856031417847, "learning_rate": 4.3042560000000004e-05, "loss": 0.3505, "step": 43500 }, { "epoch": 0.2816, "grad_norm": 0.7914199829101562, "learning_rate": 4.2962560000000005e-05, "loss": 0.3439, "step": 44000 }, { "epoch": 0.2848, "grad_norm": 0.7488194704055786, "learning_rate": 4.288256e-05, "loss": 0.349, "step": 44500 }, { "epoch": 0.288, "grad_norm": 0.8654124736785889, "learning_rate": 4.280256e-05, "loss": 0.3491, "step": 45000 }, { "epoch": 0.2912, "grad_norm": 0.6817401647567749, "learning_rate": 4.272272e-05, "loss": 0.3447, "step": 45500 }, { "epoch": 0.2944, "grad_norm": 0.6439715623855591, "learning_rate": 4.2642720000000006e-05, "loss": 0.3453, "step": 46000 }, { "epoch": 0.2976, "grad_norm": 1.3840138912200928, "learning_rate": 4.256272e-05, "loss": 0.3445, "step": 46500 }, { "epoch": 0.3008, "grad_norm": 0.7245766520500183, "learning_rate": 4.248272e-05, "loss": 0.3462, "step": 47000 }, { "epoch": 0.304, "grad_norm": 0.6877666711807251, "learning_rate": 4.240288e-05, "loss": 0.3465, "step": 47500 }, { "epoch": 0.3072, "grad_norm": 0.8494886159896851, "learning_rate": 4.2322880000000006e-05, "loss": 0.348, "step": 48000 }, { "epoch": 0.3104, "grad_norm": 0.6704971790313721, "learning_rate": 4.224288e-05, "loss": 0.3403, "step": 48500 }, { "epoch": 0.3136, "grad_norm": 0.6239964962005615, "learning_rate": 4.216288000000001e-05, "loss": 0.3382, "step": 49000 }, { "epoch": 0.3168, "grad_norm": 0.7317768335342407, "learning_rate": 4.208288e-05, "loss": 0.3385, "step": 49500 }, { "epoch": 0.32, "grad_norm": 0.7397735118865967, "learning_rate": 4.200288e-05, "loss": 0.3405, "step": 50000 }, { "epoch": 0.3232, "grad_norm": 1.1299536228179932, "learning_rate": 4.1922880000000003e-05, "loss": 0.3431, "step": 50500 }, { "epoch": 0.3264, "grad_norm": 0.6406556963920593, "learning_rate": 4.184304e-05, "loss": 0.3384, "step": 51000 }, { "epoch": 0.3296, "grad_norm": 0.8084424734115601, "learning_rate": 4.17632e-05, "loss": 0.3365, "step": 51500 }, { "epoch": 0.3328, "grad_norm": 0.7525010704994202, "learning_rate": 4.16832e-05, "loss": 0.3399, "step": 52000 }, { "epoch": 0.336, "grad_norm": 0.7382110953330994, "learning_rate": 4.16032e-05, "loss": 0.335, "step": 52500 }, { "epoch": 0.3392, "grad_norm": 0.6454793810844421, "learning_rate": 4.15232e-05, "loss": 0.3354, "step": 53000 }, { "epoch": 0.3424, "grad_norm": 0.639664351940155, "learning_rate": 4.14432e-05, "loss": 0.3371, "step": 53500 }, { "epoch": 0.3456, "grad_norm": 0.5574499368667603, "learning_rate": 4.1363200000000004e-05, "loss": 0.3341, "step": 54000 }, { "epoch": 0.3488, "grad_norm": 0.6772671341896057, "learning_rate": 4.12832e-05, "loss": 0.3331, "step": 54500 }, { "epoch": 0.352, "grad_norm": 0.6943195462226868, "learning_rate": 4.120336e-05, "loss": 0.3365, "step": 55000 }, { "epoch": 0.3552, "grad_norm": 0.7460485100746155, "learning_rate": 4.112336e-05, "loss": 0.3308, "step": 55500 }, { "epoch": 0.3584, "grad_norm": 0.7071924805641174, "learning_rate": 4.1043360000000005e-05, "loss": 0.3312, "step": 56000 }, { "epoch": 0.3616, "grad_norm": 0.6678891181945801, "learning_rate": 4.0963519999999996e-05, "loss": 0.3314, "step": 56500 }, { "epoch": 0.3648, "grad_norm": 0.7100914120674133, "learning_rate": 4.0883520000000004e-05, "loss": 0.3307, "step": 57000 }, { "epoch": 0.368, "grad_norm": 0.6085671782493591, "learning_rate": 4.080352e-05, "loss": 0.3282, "step": 57500 }, { "epoch": 0.3712, "grad_norm": 0.6634243130683899, "learning_rate": 4.0723520000000005e-05, "loss": 0.3321, "step": 58000 }, { "epoch": 0.3744, "grad_norm": 0.7203409075737, "learning_rate": 4.064352e-05, "loss": 0.3318, "step": 58500 }, { "epoch": 0.3776, "grad_norm": 0.7934884428977966, "learning_rate": 4.056352e-05, "loss": 0.3239, "step": 59000 }, { "epoch": 0.3808, "grad_norm": 0.8591666221618652, "learning_rate": 4.048352e-05, "loss": 0.3275, "step": 59500 }, { "epoch": 0.384, "grad_norm": 0.6306772232055664, "learning_rate": 4.040352e-05, "loss": 0.3308, "step": 60000 }, { "epoch": 0.3872, "grad_norm": 0.6059302687644958, "learning_rate": 4.032352e-05, "loss": 0.3266, "step": 60500 }, { "epoch": 0.3904, "grad_norm": 0.6875105500221252, "learning_rate": 4.024352e-05, "loss": 0.3265, "step": 61000 }, { "epoch": 0.3936, "grad_norm": 0.6397412419319153, "learning_rate": 4.0163520000000004e-05, "loss": 0.3268, "step": 61500 }, { "epoch": 0.3968, "grad_norm": 0.7801005840301514, "learning_rate": 4.0083520000000005e-05, "loss": 0.3314, "step": 62000 }, { "epoch": 0.4, "grad_norm": 0.6966884136199951, "learning_rate": 4.000352e-05, "loss": 0.3263, "step": 62500 }, { "epoch": 0.4032, "grad_norm": 0.7413304448127747, "learning_rate": 3.9923520000000006e-05, "loss": 0.3284, "step": 63000 }, { "epoch": 0.4064, "grad_norm": 0.7089780569076538, "learning_rate": 3.984352e-05, "loss": 0.3252, "step": 63500 }, { "epoch": 0.4096, "grad_norm": 0.6669878959655762, "learning_rate": 3.976352e-05, "loss": 0.3239, "step": 64000 }, { "epoch": 0.4128, "grad_norm": 0.7352403998374939, "learning_rate": 3.968368e-05, "loss": 0.3226, "step": 64500 }, { "epoch": 0.416, "grad_norm": 0.6916635036468506, "learning_rate": 3.9603840000000005e-05, "loss": 0.3234, "step": 65000 }, { "epoch": 0.4192, "grad_norm": 0.6800302863121033, "learning_rate": 3.952384e-05, "loss": 0.3224, "step": 65500 }, { "epoch": 0.4224, "grad_norm": 0.6685224771499634, "learning_rate": 3.9443840000000006e-05, "loss": 0.3197, "step": 66000 }, { "epoch": 0.4256, "grad_norm": 0.7219159603118896, "learning_rate": 3.936384e-05, "loss": 0.3185, "step": 66500 }, { "epoch": 0.4288, "grad_norm": 0.5928858518600464, "learning_rate": 3.928384e-05, "loss": 0.3291, "step": 67000 }, { "epoch": 0.432, "grad_norm": 0.6616542339324951, "learning_rate": 3.920384e-05, "loss": 0.3266, "step": 67500 }, { "epoch": 0.4352, "grad_norm": 0.5957266092300415, "learning_rate": 3.912384e-05, "loss": 0.32, "step": 68000 }, { "epoch": 0.4384, "grad_norm": 0.6576407551765442, "learning_rate": 3.904384e-05, "loss": 0.3246, "step": 68500 }, { "epoch": 0.4416, "grad_norm": 0.6852056384086609, "learning_rate": 3.896416e-05, "loss": 0.3268, "step": 69000 }, { "epoch": 0.4448, "grad_norm": 0.780893087387085, "learning_rate": 3.888416e-05, "loss": 0.3229, "step": 69500 }, { "epoch": 0.448, "grad_norm": 0.6741476655006409, "learning_rate": 3.880416e-05, "loss": 0.3188, "step": 70000 }, { "epoch": 0.4512, "grad_norm": 0.5919800400733948, "learning_rate": 3.872416e-05, "loss": 0.3208, "step": 70500 }, { "epoch": 0.4544, "grad_norm": 0.6476633548736572, "learning_rate": 3.864416e-05, "loss": 0.322, "step": 71000 }, { "epoch": 0.4576, "grad_norm": 0.5667979717254639, "learning_rate": 3.8564159999999996e-05, "loss": 0.3151, "step": 71500 }, { "epoch": 0.4608, "grad_norm": 0.6126554608345032, "learning_rate": 3.8484160000000004e-05, "loss": 0.3185, "step": 72000 }, { "epoch": 0.464, "grad_norm": 0.7995546460151672, "learning_rate": 3.840416e-05, "loss": 0.3174, "step": 72500 }, { "epoch": 0.4672, "grad_norm": 0.5964981317520142, "learning_rate": 3.8324160000000005e-05, "loss": 0.3187, "step": 73000 }, { "epoch": 0.4704, "grad_norm": 0.7718212008476257, "learning_rate": 3.824416e-05, "loss": 0.3156, "step": 73500 }, { "epoch": 0.4736, "grad_norm": 0.7086686491966248, "learning_rate": 3.8164320000000005e-05, "loss": 0.3189, "step": 74000 }, { "epoch": 0.4768, "grad_norm": 0.7988029718399048, "learning_rate": 3.808432e-05, "loss": 0.3151, "step": 74500 }, { "epoch": 0.48, "grad_norm": 0.6092699766159058, "learning_rate": 3.8004320000000006e-05, "loss": 0.3153, "step": 75000 }, { "epoch": 0.4832, "grad_norm": 0.6181166768074036, "learning_rate": 3.792432e-05, "loss": 0.3113, "step": 75500 }, { "epoch": 0.4864, "grad_norm": 0.5952243208885193, "learning_rate": 3.784432e-05, "loss": 0.3091, "step": 76000 }, { "epoch": 0.4896, "grad_norm": 0.5732501745223999, "learning_rate": 3.776432e-05, "loss": 0.3169, "step": 76500 }, { "epoch": 0.4928, "grad_norm": 0.5866090059280396, "learning_rate": 3.768432e-05, "loss": 0.3135, "step": 77000 }, { "epoch": 0.496, "grad_norm": 0.6748520135879517, "learning_rate": 3.760432e-05, "loss": 0.3134, "step": 77500 }, { "epoch": 0.4992, "grad_norm": 0.5922159552574158, "learning_rate": 3.752448e-05, "loss": 0.3156, "step": 78000 }, { "epoch": 0.5024, "grad_norm": 0.6446545124053955, "learning_rate": 3.744448e-05, "loss": 0.3171, "step": 78500 }, { "epoch": 0.5056, "grad_norm": 0.6506426334381104, "learning_rate": 3.736448e-05, "loss": 0.3138, "step": 79000 }, { "epoch": 0.5088, "grad_norm": 0.6826354265213013, "learning_rate": 3.728448e-05, "loss": 0.3164, "step": 79500 }, { "epoch": 0.512, "grad_norm": 0.6866195797920227, "learning_rate": 3.72048e-05, "loss": 0.315, "step": 80000 }, { "epoch": 0.5152, "grad_norm": 0.5590147376060486, "learning_rate": 3.7124960000000005e-05, "loss": 0.3094, "step": 80500 }, { "epoch": 0.5184, "grad_norm": 0.6728788614273071, "learning_rate": 3.704496e-05, "loss": 0.3194, "step": 81000 }, { "epoch": 0.5216, "grad_norm": 0.6108749508857727, "learning_rate": 3.696496000000001e-05, "loss": 0.3128, "step": 81500 }, { "epoch": 0.5248, "grad_norm": 0.5888856649398804, "learning_rate": 3.688496e-05, "loss": 0.3121, "step": 82000 }, { "epoch": 0.528, "grad_norm": 0.727268397808075, "learning_rate": 3.680496e-05, "loss": 0.3193, "step": 82500 }, { "epoch": 0.5312, "grad_norm": 0.6358634233474731, "learning_rate": 3.672496e-05, "loss": 0.3092, "step": 83000 }, { "epoch": 0.5344, "grad_norm": 0.6482620239257812, "learning_rate": 3.664496e-05, "loss": 0.3098, "step": 83500 }, { "epoch": 0.5376, "grad_norm": 0.5968552827835083, "learning_rate": 3.6564960000000004e-05, "loss": 0.3108, "step": 84000 }, { "epoch": 0.5408, "grad_norm": 0.6621351838111877, "learning_rate": 3.6484960000000004e-05, "loss": 0.3065, "step": 84500 }, { "epoch": 0.544, "grad_norm": 0.5520649552345276, "learning_rate": 3.640496e-05, "loss": 0.3088, "step": 85000 }, { "epoch": 0.5472, "grad_norm": 0.6885005831718445, "learning_rate": 3.632496e-05, "loss": 0.3075, "step": 85500 }, { "epoch": 0.5504, "grad_norm": 0.666653573513031, "learning_rate": 3.624512e-05, "loss": 0.3113, "step": 86000 }, { "epoch": 0.5536, "grad_norm": 0.6344409584999084, "learning_rate": 3.6165120000000005e-05, "loss": 0.3085, "step": 86500 }, { "epoch": 0.5568, "grad_norm": 0.5792534947395325, "learning_rate": 3.608512e-05, "loss": 0.3132, "step": 87000 }, { "epoch": 0.56, "grad_norm": 0.6864989995956421, "learning_rate": 3.600512e-05, "loss": 0.3079, "step": 87500 }, { "epoch": 0.5632, "grad_norm": 0.6077435612678528, "learning_rate": 3.592512e-05, "loss": 0.3095, "step": 88000 }, { "epoch": 0.5664, "grad_norm": 0.7073134779930115, "learning_rate": 3.584512e-05, "loss": 0.3116, "step": 88500 }, { "epoch": 0.5696, "grad_norm": 0.6477733850479126, "learning_rate": 3.576512e-05, "loss": 0.3062, "step": 89000 }, { "epoch": 0.5728, "grad_norm": 0.7786093354225159, "learning_rate": 3.568512e-05, "loss": 0.3017, "step": 89500 }, { "epoch": 0.576, "grad_norm": 0.6447868943214417, "learning_rate": 3.560528e-05, "loss": 0.3077, "step": 90000 }, { "epoch": 0.5792, "grad_norm": 0.6663397550582886, "learning_rate": 3.552528e-05, "loss": 0.3089, "step": 90500 }, { "epoch": 0.5824, "grad_norm": 0.533214807510376, "learning_rate": 3.544528e-05, "loss": 0.3064, "step": 91000 }, { "epoch": 0.5856, "grad_norm": 0.6517444849014282, "learning_rate": 3.5365280000000004e-05, "loss": 0.3108, "step": 91500 }, { "epoch": 0.5888, "grad_norm": 0.7635303735733032, "learning_rate": 3.528544e-05, "loss": 0.3028, "step": 92000 }, { "epoch": 0.592, "grad_norm": 0.6636632680892944, "learning_rate": 3.520544e-05, "loss": 0.3015, "step": 92500 }, { "epoch": 0.5952, "grad_norm": 0.7296783924102783, "learning_rate": 3.5125440000000004e-05, "loss": 0.305, "step": 93000 }, { "epoch": 0.5984, "grad_norm": 0.5089054703712463, "learning_rate": 3.50456e-05, "loss": 0.3092, "step": 93500 }, { "epoch": 0.6016, "grad_norm": 0.6761330366134644, "learning_rate": 3.49656e-05, "loss": 0.3055, "step": 94000 }, { "epoch": 0.6048, "grad_norm": 0.6327843070030212, "learning_rate": 3.4885600000000004e-05, "loss": 0.3055, "step": 94500 }, { "epoch": 0.608, "grad_norm": 0.5940554141998291, "learning_rate": 3.48056e-05, "loss": 0.3017, "step": 95000 }, { "epoch": 0.6112, "grad_norm": 0.516828179359436, "learning_rate": 3.4725600000000005e-05, "loss": 0.3035, "step": 95500 }, { "epoch": 0.6144, "grad_norm": 0.5835782289505005, "learning_rate": 3.46456e-05, "loss": 0.2978, "step": 96000 }, { "epoch": 0.6176, "grad_norm": 0.5978230237960815, "learning_rate": 3.456560000000001e-05, "loss": 0.301, "step": 96500 }, { "epoch": 0.6208, "grad_norm": 0.5460017323493958, "learning_rate": 3.44856e-05, "loss": 0.3052, "step": 97000 }, { "epoch": 0.624, "grad_norm": 0.6875701546669006, "learning_rate": 3.44056e-05, "loss": 0.3028, "step": 97500 }, { "epoch": 0.6272, "grad_norm": 0.5780492424964905, "learning_rate": 3.43256e-05, "loss": 0.2988, "step": 98000 }, { "epoch": 0.6304, "grad_norm": 0.5191554427146912, "learning_rate": 3.42456e-05, "loss": 0.3052, "step": 98500 }, { "epoch": 0.6336, "grad_norm": 0.6811420917510986, "learning_rate": 3.416576e-05, "loss": 0.3032, "step": 99000 }, { "epoch": 0.6368, "grad_norm": 0.6301366686820984, "learning_rate": 3.408576e-05, "loss": 0.2979, "step": 99500 }, { "epoch": 0.64, "grad_norm": 0.5777577757835388, "learning_rate": 3.400576e-05, "loss": 0.2991, "step": 100000 }, { "epoch": 0.6432, "grad_norm": 0.6444558501243591, "learning_rate": 3.392592e-05, "loss": 0.298, "step": 100500 }, { "epoch": 0.6464, "grad_norm": 0.4793080985546112, "learning_rate": 3.384592e-05, "loss": 0.3014, "step": 101000 }, { "epoch": 0.6496, "grad_norm": 0.6691552400588989, "learning_rate": 3.376608e-05, "loss": 0.3006, "step": 101500 }, { "epoch": 0.6528, "grad_norm": 0.6318476796150208, "learning_rate": 3.368608e-05, "loss": 0.3032, "step": 102000 }, { "epoch": 0.656, "grad_norm": 0.5805894136428833, "learning_rate": 3.360608e-05, "loss": 0.3014, "step": 102500 }, { "epoch": 0.6592, "grad_norm": 0.5658220648765564, "learning_rate": 3.352608e-05, "loss": 0.3, "step": 103000 }, { "epoch": 0.6624, "grad_norm": 0.6117516160011292, "learning_rate": 3.3446080000000004e-05, "loss": 0.3014, "step": 103500 }, { "epoch": 0.6656, "grad_norm": 0.6763502359390259, "learning_rate": 3.336608e-05, "loss": 0.3043, "step": 104000 }, { "epoch": 0.6688, "grad_norm": 0.6046746969223022, "learning_rate": 3.3286080000000005e-05, "loss": 0.2965, "step": 104500 }, { "epoch": 0.672, "grad_norm": 0.7453213930130005, "learning_rate": 3.320608e-05, "loss": 0.2964, "step": 105000 }, { "epoch": 0.6752, "grad_norm": 0.6010546088218689, "learning_rate": 3.3126080000000007e-05, "loss": 0.2975, "step": 105500 }, { "epoch": 0.6784, "grad_norm": 0.7377296686172485, "learning_rate": 3.304608e-05, "loss": 0.2993, "step": 106000 }, { "epoch": 0.6816, "grad_norm": 0.6612259745597839, "learning_rate": 3.2966240000000006e-05, "loss": 0.298, "step": 106500 }, { "epoch": 0.6848, "grad_norm": 0.6570013165473938, "learning_rate": 3.288624e-05, "loss": 0.296, "step": 107000 }, { "epoch": 0.688, "grad_norm": 0.633602499961853, "learning_rate": 3.280624e-05, "loss": 0.2989, "step": 107500 }, { "epoch": 0.6912, "grad_norm": 0.5594373345375061, "learning_rate": 3.272624e-05, "loss": 0.2977, "step": 108000 }, { "epoch": 0.6944, "grad_norm": 0.5643302202224731, "learning_rate": 3.264624e-05, "loss": 0.2941, "step": 108500 }, { "epoch": 0.6976, "grad_norm": 0.5127794146537781, "learning_rate": 3.256624e-05, "loss": 0.2953, "step": 109000 }, { "epoch": 0.7008, "grad_norm": 0.6273791790008545, "learning_rate": 3.24864e-05, "loss": 0.2944, "step": 109500 }, { "epoch": 0.704, "grad_norm": 0.5089157223701477, "learning_rate": 3.24064e-05, "loss": 0.3, "step": 110000 }, { "epoch": 0.7072, "grad_norm": 0.5816791653633118, "learning_rate": 3.232656e-05, "loss": 0.2957, "step": 110500 }, { "epoch": 0.7104, "grad_norm": 0.6407476663589478, "learning_rate": 3.224656e-05, "loss": 0.2974, "step": 111000 }, { "epoch": 0.7136, "grad_norm": 0.46444937586784363, "learning_rate": 3.216656e-05, "loss": 0.2969, "step": 111500 }, { "epoch": 0.7168, "grad_norm": 0.4997446835041046, "learning_rate": 3.2086559999999996e-05, "loss": 0.2966, "step": 112000 }, { "epoch": 0.72, "grad_norm": 0.6996490359306335, "learning_rate": 3.2006560000000003e-05, "loss": 0.2965, "step": 112500 }, { "epoch": 0.7232, "grad_norm": 0.5806016325950623, "learning_rate": 3.192672e-05, "loss": 0.2952, "step": 113000 }, { "epoch": 0.7264, "grad_norm": 0.6140916347503662, "learning_rate": 3.184672e-05, "loss": 0.2995, "step": 113500 }, { "epoch": 0.7296, "grad_norm": 0.45879319310188293, "learning_rate": 3.1766719999999997e-05, "loss": 0.292, "step": 114000 }, { "epoch": 0.7328, "grad_norm": 0.6141937971115112, "learning_rate": 3.1686720000000004e-05, "loss": 0.2945, "step": 114500 }, { "epoch": 0.736, "grad_norm": 0.6565462946891785, "learning_rate": 3.160672e-05, "loss": 0.2982, "step": 115000 }, { "epoch": 0.7392, "grad_norm": 0.5997145175933838, "learning_rate": 3.1526720000000006e-05, "loss": 0.2957, "step": 115500 }, { "epoch": 0.7424, "grad_norm": 0.736965537071228, "learning_rate": 3.144672e-05, "loss": 0.2953, "step": 116000 }, { "epoch": 0.7456, "grad_norm": 0.6587550640106201, "learning_rate": 3.136672e-05, "loss": 0.2917, "step": 116500 }, { "epoch": 0.7488, "grad_norm": 0.7265971302986145, "learning_rate": 3.128672e-05, "loss": 0.2908, "step": 117000 }, { "epoch": 0.752, "grad_norm": 0.6158114075660706, "learning_rate": 3.120672e-05, "loss": 0.2916, "step": 117500 }, { "epoch": 0.7552, "grad_norm": 0.6521216034889221, "learning_rate": 3.112672e-05, "loss": 0.2947, "step": 118000 }, { "epoch": 0.7584, "grad_norm": 0.5868868231773376, "learning_rate": 3.1046720000000004e-05, "loss": 0.2919, "step": 118500 }, { "epoch": 0.7616, "grad_norm": 0.6495432257652283, "learning_rate": 3.096672e-05, "loss": 0.2974, "step": 119000 }, { "epoch": 0.7648, "grad_norm": 0.6204816102981567, "learning_rate": 3.0886720000000005e-05, "loss": 0.2945, "step": 119500 }, { "epoch": 0.768, "grad_norm": 0.6333968639373779, "learning_rate": 3.080672e-05, "loss": 0.292, "step": 120000 }, { "epoch": 0.7712, "grad_norm": 0.5613961815834045, "learning_rate": 3.0726880000000004e-05, "loss": 0.2938, "step": 120500 }, { "epoch": 0.7744, "grad_norm": 0.6623988151550293, "learning_rate": 3.064688e-05, "loss": 0.2954, "step": 121000 }, { "epoch": 0.7776, "grad_norm": 0.6134264469146729, "learning_rate": 3.0566880000000006e-05, "loss": 0.2915, "step": 121500 }, { "epoch": 0.7808, "grad_norm": 0.6159347891807556, "learning_rate": 3.048688e-05, "loss": 0.2887, "step": 122000 }, { "epoch": 0.784, "grad_norm": 0.6079424023628235, "learning_rate": 3.0407040000000005e-05, "loss": 0.2915, "step": 122500 }, { "epoch": 0.7872, "grad_norm": 0.7703385353088379, "learning_rate": 3.0327040000000002e-05, "loss": 0.2901, "step": 123000 }, { "epoch": 0.7904, "grad_norm": 0.5626256465911865, "learning_rate": 3.024704e-05, "loss": 0.2938, "step": 123500 }, { "epoch": 0.7936, "grad_norm": 0.554914653301239, "learning_rate": 3.016704e-05, "loss": 0.2913, "step": 124000 }, { "epoch": 0.7968, "grad_norm": 0.6610060930252075, "learning_rate": 3.008704e-05, "loss": 0.2912, "step": 124500 }, { "epoch": 0.8, "grad_norm": 0.6194009780883789, "learning_rate": 3.0007040000000002e-05, "loss": 0.2901, "step": 125000 }, { "epoch": 0.8032, "grad_norm": 0.7150211930274963, "learning_rate": 2.992704e-05, "loss": 0.2895, "step": 125500 }, { "epoch": 0.8064, "grad_norm": 0.6945148706436157, "learning_rate": 2.9847040000000003e-05, "loss": 0.2878, "step": 126000 }, { "epoch": 0.8096, "grad_norm": 0.6546908617019653, "learning_rate": 2.9767200000000002e-05, "loss": 0.287, "step": 126500 }, { "epoch": 0.8128, "grad_norm": 0.535040020942688, "learning_rate": 2.9687360000000004e-05, "loss": 0.2901, "step": 127000 }, { "epoch": 0.816, "grad_norm": 0.6062806844711304, "learning_rate": 2.960736e-05, "loss": 0.2862, "step": 127500 }, { "epoch": 0.8192, "grad_norm": 0.6202298998832703, "learning_rate": 2.9527360000000005e-05, "loss": 0.2884, "step": 128000 }, { "epoch": 0.8224, "grad_norm": 0.5966545343399048, "learning_rate": 2.9447360000000003e-05, "loss": 0.2877, "step": 128500 }, { "epoch": 0.8256, "grad_norm": 0.5024796724319458, "learning_rate": 2.936736e-05, "loss": 0.2882, "step": 129000 }, { "epoch": 0.8288, "grad_norm": 0.5895559191703796, "learning_rate": 2.9287520000000002e-05, "loss": 0.288, "step": 129500 }, { "epoch": 0.832, "grad_norm": 0.9302066564559937, "learning_rate": 2.920752e-05, "loss": 0.286, "step": 130000 }, { "epoch": 0.8352, "grad_norm": 0.573466956615448, "learning_rate": 2.9127520000000003e-05, "loss": 0.2848, "step": 130500 }, { "epoch": 0.8384, "grad_norm": 0.5901783108711243, "learning_rate": 2.904768e-05, "loss": 0.2883, "step": 131000 }, { "epoch": 0.8416, "grad_norm": 0.7780030369758606, "learning_rate": 2.8967680000000002e-05, "loss": 0.2914, "step": 131500 }, { "epoch": 0.8448, "grad_norm": 0.6630533933639526, "learning_rate": 2.888768e-05, "loss": 0.2878, "step": 132000 }, { "epoch": 0.848, "grad_norm": 0.6001667976379395, "learning_rate": 2.8807680000000004e-05, "loss": 0.2818, "step": 132500 }, { "epoch": 0.8512, "grad_norm": 0.6324682831764221, "learning_rate": 2.872768e-05, "loss": 0.2849, "step": 133000 }, { "epoch": 0.8544, "grad_norm": 0.6814092993736267, "learning_rate": 2.864768e-05, "loss": 0.288, "step": 133500 }, { "epoch": 0.8576, "grad_norm": 0.651709794998169, "learning_rate": 2.8567680000000003e-05, "loss": 0.2872, "step": 134000 }, { "epoch": 0.8608, "grad_norm": 0.5912330746650696, "learning_rate": 2.848768e-05, "loss": 0.2824, "step": 134500 }, { "epoch": 0.864, "grad_norm": 0.5821974277496338, "learning_rate": 2.8407680000000004e-05, "loss": 0.2853, "step": 135000 }, { "epoch": 0.8672, "grad_norm": 0.6262611150741577, "learning_rate": 2.832784e-05, "loss": 0.2848, "step": 135500 }, { "epoch": 0.8704, "grad_norm": 0.5360976457595825, "learning_rate": 2.8247840000000004e-05, "loss": 0.2869, "step": 136000 }, { "epoch": 0.8736, "grad_norm": 0.6523284912109375, "learning_rate": 2.816784e-05, "loss": 0.2792, "step": 136500 }, { "epoch": 0.8768, "grad_norm": 0.6329330205917358, "learning_rate": 2.808784e-05, "loss": 0.2865, "step": 137000 }, { "epoch": 0.88, "grad_norm": 0.6053124666213989, "learning_rate": 2.8007840000000003e-05, "loss": 0.2844, "step": 137500 }, { "epoch": 0.8832, "grad_norm": 0.6887571811676025, "learning_rate": 2.7927999999999998e-05, "loss": 0.288, "step": 138000 }, { "epoch": 0.8864, "grad_norm": 0.7047476172447205, "learning_rate": 2.7848000000000002e-05, "loss": 0.2877, "step": 138500 }, { "epoch": 0.8896, "grad_norm": 0.598227858543396, "learning_rate": 2.7768e-05, "loss": 0.2867, "step": 139000 }, { "epoch": 0.8928, "grad_norm": 0.5094701051712036, "learning_rate": 2.7688000000000003e-05, "loss": 0.2832, "step": 139500 }, { "epoch": 0.896, "grad_norm": 0.5749739408493042, "learning_rate": 2.7608e-05, "loss": 0.2821, "step": 140000 }, { "epoch": 0.8992, "grad_norm": 0.4442578852176666, "learning_rate": 2.7528320000000003e-05, "loss": 0.282, "step": 140500 }, { "epoch": 0.9024, "grad_norm": 0.5418574213981628, "learning_rate": 2.744832e-05, "loss": 0.2816, "step": 141000 }, { "epoch": 0.9056, "grad_norm": 0.5984327793121338, "learning_rate": 2.736832e-05, "loss": 0.285, "step": 141500 }, { "epoch": 0.9088, "grad_norm": 0.6572843194007874, "learning_rate": 2.728832e-05, "loss": 0.2817, "step": 142000 }, { "epoch": 0.912, "grad_norm": 0.590993344783783, "learning_rate": 2.7208320000000003e-05, "loss": 0.288, "step": 142500 }, { "epoch": 0.9152, "grad_norm": 0.6096624135971069, "learning_rate": 2.712832e-05, "loss": 0.2861, "step": 143000 }, { "epoch": 0.9184, "grad_norm": 0.5189167261123657, "learning_rate": 2.7048319999999998e-05, "loss": 0.2857, "step": 143500 }, { "epoch": 0.9216, "grad_norm": 0.5812899470329285, "learning_rate": 2.6968320000000002e-05, "loss": 0.2888, "step": 144000 }, { "epoch": 0.9248, "grad_norm": 0.515201210975647, "learning_rate": 2.688832e-05, "loss": 0.2791, "step": 144500 }, { "epoch": 0.928, "grad_norm": 0.6398504972457886, "learning_rate": 2.6808320000000004e-05, "loss": 0.282, "step": 145000 }, { "epoch": 0.9312, "grad_norm": 0.5990891456604004, "learning_rate": 2.672832e-05, "loss": 0.28, "step": 145500 }, { "epoch": 0.9344, "grad_norm": 0.5883029699325562, "learning_rate": 2.664832e-05, "loss": 0.2777, "step": 146000 }, { "epoch": 0.9376, "grad_norm": 0.6432376503944397, "learning_rate": 2.656848e-05, "loss": 0.2804, "step": 146500 }, { "epoch": 0.9408, "grad_norm": 0.5375948548316956, "learning_rate": 2.6488479999999997e-05, "loss": 0.2807, "step": 147000 }, { "epoch": 0.944, "grad_norm": 0.6207411885261536, "learning_rate": 2.6408640000000003e-05, "loss": 0.283, "step": 147500 }, { "epoch": 0.9472, "grad_norm": 0.5854378342628479, "learning_rate": 2.632864e-05, "loss": 0.2854, "step": 148000 }, { "epoch": 0.9504, "grad_norm": 0.5260078310966492, "learning_rate": 2.6248800000000002e-05, "loss": 0.2836, "step": 148500 }, { "epoch": 0.9536, "grad_norm": 0.6284717917442322, "learning_rate": 2.61688e-05, "loss": 0.2824, "step": 149000 }, { "epoch": 0.9568, "grad_norm": 0.6092182397842407, "learning_rate": 2.608896e-05, "loss": 0.2804, "step": 149500 }, { "epoch": 0.96, "grad_norm": 0.6028911471366882, "learning_rate": 2.600896e-05, "loss": 0.281, "step": 150000 }, { "epoch": 0.9632, "grad_norm": 0.5008478164672852, "learning_rate": 2.5928960000000003e-05, "loss": 0.277, "step": 150500 }, { "epoch": 0.9664, "grad_norm": 0.5233867168426514, "learning_rate": 2.584896e-05, "loss": 0.2807, "step": 151000 }, { "epoch": 0.9696, "grad_norm": 0.5762408375740051, "learning_rate": 2.5768960000000004e-05, "loss": 0.2831, "step": 151500 }, { "epoch": 0.9728, "grad_norm": 0.6097844243049622, "learning_rate": 2.568896e-05, "loss": 0.2803, "step": 152000 }, { "epoch": 0.976, "grad_norm": 0.6696804761886597, "learning_rate": 2.560896e-05, "loss": 0.2742, "step": 152500 }, { "epoch": 0.9792, "grad_norm": 0.6028556823730469, "learning_rate": 2.5528960000000003e-05, "loss": 0.282, "step": 153000 }, { "epoch": 0.9824, "grad_norm": 0.6651898622512817, "learning_rate": 2.544896e-05, "loss": 0.2849, "step": 153500 }, { "epoch": 0.9856, "grad_norm": 0.5219380855560303, "learning_rate": 2.536896e-05, "loss": 0.2785, "step": 154000 }, { "epoch": 0.9888, "grad_norm": 0.6161176562309265, "learning_rate": 2.5288960000000002e-05, "loss": 0.2808, "step": 154500 }, { "epoch": 0.992, "grad_norm": 0.7915316224098206, "learning_rate": 2.5208960000000003e-05, "loss": 0.2777, "step": 155000 }, { "epoch": 0.9952, "grad_norm": 0.7261882424354553, "learning_rate": 2.512896e-05, "loss": 0.2767, "step": 155500 }, { "epoch": 0.9984, "grad_norm": 0.5452406406402588, "learning_rate": 2.5048959999999997e-05, "loss": 0.2764, "step": 156000 }, { "epoch": 1.0016, "grad_norm": 0.642181396484375, "learning_rate": 2.4969120000000003e-05, "loss": 0.2746, "step": 156500 }, { "epoch": 1.0048, "grad_norm": 0.5900291204452515, "learning_rate": 2.4889120000000003e-05, "loss": 0.2721, "step": 157000 }, { "epoch": 1.008, "grad_norm": 0.5960043668746948, "learning_rate": 2.480912e-05, "loss": 0.265, "step": 157500 }, { "epoch": 1.0112, "grad_norm": 0.582115650177002, "learning_rate": 2.472912e-05, "loss": 0.2673, "step": 158000 }, { "epoch": 1.0144, "grad_norm": 0.552392303943634, "learning_rate": 2.464912e-05, "loss": 0.2663, "step": 158500 }, { "epoch": 1.0176, "grad_norm": 0.5585765242576599, "learning_rate": 2.456912e-05, "loss": 0.2688, "step": 159000 }, { "epoch": 1.0208, "grad_norm": 0.6049332022666931, "learning_rate": 2.448912e-05, "loss": 0.266, "step": 159500 }, { "epoch": 1.024, "grad_norm": 0.5749877095222473, "learning_rate": 2.440912e-05, "loss": 0.2689, "step": 160000 }, { "epoch": 1.0272, "grad_norm": 0.5832675695419312, "learning_rate": 2.4329120000000002e-05, "loss": 0.2703, "step": 160500 }, { "epoch": 1.0304, "grad_norm": 0.8549031019210815, "learning_rate": 2.424928e-05, "loss": 0.2623, "step": 161000 }, { "epoch": 1.0336, "grad_norm": 0.5572855472564697, "learning_rate": 2.416928e-05, "loss": 0.2711, "step": 161500 }, { "epoch": 1.0368, "grad_norm": 0.6818140745162964, "learning_rate": 2.408928e-05, "loss": 0.2652, "step": 162000 }, { "epoch": 1.04, "grad_norm": 0.6900683045387268, "learning_rate": 2.400928e-05, "loss": 0.2669, "step": 162500 }, { "epoch": 1.0432, "grad_norm": 0.6015618443489075, "learning_rate": 2.392944e-05, "loss": 0.2654, "step": 163000 }, { "epoch": 1.0464, "grad_norm": 0.5343177318572998, "learning_rate": 2.3849440000000002e-05, "loss": 0.2656, "step": 163500 }, { "epoch": 1.0496, "grad_norm": 0.6130079627037048, "learning_rate": 2.3769440000000003e-05, "loss": 0.2592, "step": 164000 }, { "epoch": 1.0528, "grad_norm": 0.7150599956512451, "learning_rate": 2.368944e-05, "loss": 0.2634, "step": 164500 }, { "epoch": 1.056, "grad_norm": 0.6321354508399963, "learning_rate": 2.360944e-05, "loss": 0.2683, "step": 165000 }, { "epoch": 1.0592, "grad_norm": 0.6234462857246399, "learning_rate": 2.352976e-05, "loss": 0.2628, "step": 165500 }, { "epoch": 1.0624, "grad_norm": 0.6542537808418274, "learning_rate": 2.344976e-05, "loss": 0.2618, "step": 166000 }, { "epoch": 1.0656, "grad_norm": 0.6302633881568909, "learning_rate": 2.3369760000000002e-05, "loss": 0.2661, "step": 166500 }, { "epoch": 1.0688, "grad_norm": 0.5890353322029114, "learning_rate": 2.3289760000000002e-05, "loss": 0.2646, "step": 167000 }, { "epoch": 1.072, "grad_norm": 0.6490179300308228, "learning_rate": 2.320976e-05, "loss": 0.2635, "step": 167500 }, { "epoch": 1.0752, "grad_norm": 0.648162305355072, "learning_rate": 2.312976e-05, "loss": 0.2646, "step": 168000 }, { "epoch": 1.0784, "grad_norm": 0.675680935382843, "learning_rate": 2.304976e-05, "loss": 0.2626, "step": 168500 }, { "epoch": 1.0816, "grad_norm": 0.6192341446876526, "learning_rate": 2.2969760000000002e-05, "loss": 0.2641, "step": 169000 }, { "epoch": 1.0848, "grad_norm": 0.7046379446983337, "learning_rate": 2.288992e-05, "loss": 0.2643, "step": 169500 }, { "epoch": 1.088, "grad_norm": 0.5477197170257568, "learning_rate": 2.280992e-05, "loss": 0.265, "step": 170000 }, { "epoch": 1.0912, "grad_norm": 0.5775583982467651, "learning_rate": 2.2729920000000002e-05, "loss": 0.2645, "step": 170500 }, { "epoch": 1.0944, "grad_norm": 0.6389047503471375, "learning_rate": 2.2649920000000003e-05, "loss": 0.2634, "step": 171000 }, { "epoch": 1.0976, "grad_norm": 0.6169374585151672, "learning_rate": 2.256992e-05, "loss": 0.2642, "step": 171500 }, { "epoch": 1.1008, "grad_norm": 0.5913782715797424, "learning_rate": 2.2490080000000002e-05, "loss": 0.2658, "step": 172000 }, { "epoch": 1.104, "grad_norm": 0.7547928690910339, "learning_rate": 2.241008e-05, "loss": 0.2674, "step": 172500 }, { "epoch": 1.1072, "grad_norm": 0.6277585625648499, "learning_rate": 2.233024e-05, "loss": 0.2686, "step": 173000 }, { "epoch": 1.1104, "grad_norm": 0.6357282996177673, "learning_rate": 2.225024e-05, "loss": 0.2639, "step": 173500 }, { "epoch": 1.1136, "grad_norm": 0.5262208580970764, "learning_rate": 2.2170400000000004e-05, "loss": 0.2641, "step": 174000 }, { "epoch": 1.1168, "grad_norm": 0.6878075003623962, "learning_rate": 2.20904e-05, "loss": 0.2654, "step": 174500 }, { "epoch": 1.12, "grad_norm": 0.5332186222076416, "learning_rate": 2.2010400000000002e-05, "loss": 0.2638, "step": 175000 }, { "epoch": 1.1232, "grad_norm": 0.5562476515769958, "learning_rate": 2.19304e-05, "loss": 0.2648, "step": 175500 }, { "epoch": 1.1264, "grad_norm": 0.5924221277236938, "learning_rate": 2.18504e-05, "loss": 0.2627, "step": 176000 }, { "epoch": 1.1296, "grad_norm": 0.5250386595726013, "learning_rate": 2.17704e-05, "loss": 0.2619, "step": 176500 }, { "epoch": 1.1328, "grad_norm": 0.7426069378852844, "learning_rate": 2.16904e-05, "loss": 0.2628, "step": 177000 }, { "epoch": 1.1360000000000001, "grad_norm": 0.4925951063632965, "learning_rate": 2.16104e-05, "loss": 0.2661, "step": 177500 }, { "epoch": 1.1392, "grad_norm": 0.5707270503044128, "learning_rate": 2.15304e-05, "loss": 0.2622, "step": 178000 }, { "epoch": 1.1424, "grad_norm": 0.5793021321296692, "learning_rate": 2.14504e-05, "loss": 0.2671, "step": 178500 }, { "epoch": 1.1456, "grad_norm": 0.5736916661262512, "learning_rate": 2.13704e-05, "loss": 0.2648, "step": 179000 }, { "epoch": 1.1488, "grad_norm": 0.588550329208374, "learning_rate": 2.129056e-05, "loss": 0.2641, "step": 179500 }, { "epoch": 1.152, "grad_norm": 0.5504462122917175, "learning_rate": 2.121056e-05, "loss": 0.2643, "step": 180000 }, { "epoch": 1.1552, "grad_norm": 0.5439949035644531, "learning_rate": 2.113056e-05, "loss": 0.2639, "step": 180500 }, { "epoch": 1.1584, "grad_norm": 0.6882042288780212, "learning_rate": 2.105056e-05, "loss": 0.2595, "step": 181000 }, { "epoch": 1.1616, "grad_norm": 0.6735561490058899, "learning_rate": 2.097056e-05, "loss": 0.2624, "step": 181500 }, { "epoch": 1.1648, "grad_norm": 0.5545785427093506, "learning_rate": 2.089056e-05, "loss": 0.2625, "step": 182000 }, { "epoch": 1.168, "grad_norm": 0.6497994065284729, "learning_rate": 2.081056e-05, "loss": 0.2611, "step": 182500 }, { "epoch": 1.1712, "grad_norm": 0.5887815356254578, "learning_rate": 2.073056e-05, "loss": 0.2632, "step": 183000 }, { "epoch": 1.1743999999999999, "grad_norm": 0.6037270426750183, "learning_rate": 2.0650560000000002e-05, "loss": 0.2645, "step": 183500 }, { "epoch": 1.1776, "grad_norm": 0.636946439743042, "learning_rate": 2.057072e-05, "loss": 0.2628, "step": 184000 }, { "epoch": 1.1808, "grad_norm": 0.5285276770591736, "learning_rate": 2.049072e-05, "loss": 0.2629, "step": 184500 }, { "epoch": 1.184, "grad_norm": 0.4634397625923157, "learning_rate": 2.041072e-05, "loss": 0.2615, "step": 185000 }, { "epoch": 1.1872, "grad_norm": 0.5693604946136475, "learning_rate": 2.033072e-05, "loss": 0.2619, "step": 185500 }, { "epoch": 1.1904, "grad_norm": 0.6433858275413513, "learning_rate": 2.025072e-05, "loss": 0.2591, "step": 186000 }, { "epoch": 1.1936, "grad_norm": 0.5103280544281006, "learning_rate": 2.017088e-05, "loss": 0.2606, "step": 186500 }, { "epoch": 1.1968, "grad_norm": 0.5591945648193359, "learning_rate": 2.009104e-05, "loss": 0.2628, "step": 187000 }, { "epoch": 1.2, "grad_norm": 0.5560447573661804, "learning_rate": 2.001104e-05, "loss": 0.2603, "step": 187500 }, { "epoch": 1.2032, "grad_norm": 0.5321928262710571, "learning_rate": 1.9931040000000002e-05, "loss": 0.2576, "step": 188000 }, { "epoch": 1.2064, "grad_norm": 0.6455059051513672, "learning_rate": 1.9851040000000003e-05, "loss": 0.2615, "step": 188500 }, { "epoch": 1.2096, "grad_norm": 0.6237916946411133, "learning_rate": 1.977104e-05, "loss": 0.2626, "step": 189000 }, { "epoch": 1.2128, "grad_norm": 0.5269157886505127, "learning_rate": 1.969104e-05, "loss": 0.2597, "step": 189500 }, { "epoch": 1.216, "grad_norm": 0.5521387457847595, "learning_rate": 1.961104e-05, "loss": 0.257, "step": 190000 }, { "epoch": 1.2192, "grad_norm": 0.6061577796936035, "learning_rate": 1.953104e-05, "loss": 0.2626, "step": 190500 }, { "epoch": 1.2224, "grad_norm": 0.6479594111442566, "learning_rate": 1.945104e-05, "loss": 0.2586, "step": 191000 }, { "epoch": 1.2256, "grad_norm": 0.5330658555030823, "learning_rate": 1.937104e-05, "loss": 0.2573, "step": 191500 }, { "epoch": 1.2288000000000001, "grad_norm": 0.5984029173851013, "learning_rate": 1.9291200000000003e-05, "loss": 0.2591, "step": 192000 }, { "epoch": 1.232, "grad_norm": 0.8451948165893555, "learning_rate": 1.92112e-05, "loss": 0.2591, "step": 192500 }, { "epoch": 1.2352, "grad_norm": 0.6519868969917297, "learning_rate": 1.91312e-05, "loss": 0.2608, "step": 193000 }, { "epoch": 1.2384, "grad_norm": 0.6487559080123901, "learning_rate": 1.9051199999999998e-05, "loss": 0.2558, "step": 193500 }, { "epoch": 1.2416, "grad_norm": 0.5544815063476562, "learning_rate": 1.897152e-05, "loss": 0.2609, "step": 194000 }, { "epoch": 1.2448, "grad_norm": 0.594536542892456, "learning_rate": 1.8891520000000002e-05, "loss": 0.2592, "step": 194500 }, { "epoch": 1.248, "grad_norm": 0.5301911234855652, "learning_rate": 1.8811520000000002e-05, "loss": 0.2579, "step": 195000 }, { "epoch": 1.2511999999999999, "grad_norm": 0.6232271790504456, "learning_rate": 1.873152e-05, "loss": 0.2611, "step": 195500 }, { "epoch": 1.2544, "grad_norm": 0.6571745276451111, "learning_rate": 1.865152e-05, "loss": 0.2617, "step": 196000 }, { "epoch": 1.2576, "grad_norm": 0.6281866431236267, "learning_rate": 1.857152e-05, "loss": 0.2605, "step": 196500 }, { "epoch": 1.2608, "grad_norm": 0.6584866642951965, "learning_rate": 1.8491520000000002e-05, "loss": 0.2575, "step": 197000 }, { "epoch": 1.264, "grad_norm": 0.5791180729866028, "learning_rate": 1.8411520000000003e-05, "loss": 0.2572, "step": 197500 }, { "epoch": 1.2671999999999999, "grad_norm": 0.5907946228981018, "learning_rate": 1.8331520000000004e-05, "loss": 0.2576, "step": 198000 }, { "epoch": 1.2704, "grad_norm": 0.6532405614852905, "learning_rate": 1.8251680000000002e-05, "loss": 0.2588, "step": 198500 }, { "epoch": 1.2736, "grad_norm": 0.5683246850967407, "learning_rate": 1.8171680000000003e-05, "loss": 0.2597, "step": 199000 }, { "epoch": 1.2768, "grad_norm": 0.5847846865653992, "learning_rate": 1.809168e-05, "loss": 0.2628, "step": 199500 }, { "epoch": 1.28, "grad_norm": 0.5554783344268799, "learning_rate": 1.801168e-05, "loss": 0.2542, "step": 200000 }, { "epoch": 1.2832, "grad_norm": 0.6664928793907166, "learning_rate": 1.793184e-05, "loss": 0.2586, "step": 200500 }, { "epoch": 1.2864, "grad_norm": 0.5993084907531738, "learning_rate": 1.785184e-05, "loss": 0.2571, "step": 201000 }, { "epoch": 1.2896, "grad_norm": 0.4557185173034668, "learning_rate": 1.777184e-05, "loss": 0.2594, "step": 201500 }, { "epoch": 1.2928, "grad_norm": 0.7798305749893188, "learning_rate": 1.7691840000000002e-05, "loss": 0.2561, "step": 202000 }, { "epoch": 1.296, "grad_norm": 0.5406688451766968, "learning_rate": 1.7611840000000002e-05, "loss": 0.2562, "step": 202500 }, { "epoch": 1.2992, "grad_norm": 0.5173208117485046, "learning_rate": 1.7531840000000003e-05, "loss": 0.2606, "step": 203000 }, { "epoch": 1.3024, "grad_norm": 0.6803346872329712, "learning_rate": 1.7452e-05, "loss": 0.259, "step": 203500 }, { "epoch": 1.3056, "grad_norm": 0.5223200917243958, "learning_rate": 1.7372000000000002e-05, "loss": 0.2571, "step": 204000 }, { "epoch": 1.3088, "grad_norm": 0.6100528240203857, "learning_rate": 1.7292e-05, "loss": 0.2558, "step": 204500 }, { "epoch": 1.312, "grad_norm": 0.623023271560669, "learning_rate": 1.7212e-05, "loss": 0.2563, "step": 205000 }, { "epoch": 1.3152, "grad_norm": 0.5915964841842651, "learning_rate": 1.713216e-05, "loss": 0.2581, "step": 205500 }, { "epoch": 1.3184, "grad_norm": 0.538467526435852, "learning_rate": 1.705216e-05, "loss": 0.2554, "step": 206000 }, { "epoch": 1.3216, "grad_norm": 0.5382514595985413, "learning_rate": 1.697216e-05, "loss": 0.2581, "step": 206500 }, { "epoch": 1.3248, "grad_norm": 0.6466744542121887, "learning_rate": 1.689216e-05, "loss": 0.2573, "step": 207000 }, { "epoch": 1.328, "grad_norm": 0.742675244808197, "learning_rate": 1.6812160000000002e-05, "loss": 0.2572, "step": 207500 }, { "epoch": 1.3312, "grad_norm": 0.6123968362808228, "learning_rate": 1.673232e-05, "loss": 0.2598, "step": 208000 }, { "epoch": 1.3344, "grad_norm": 0.6710489392280579, "learning_rate": 1.665232e-05, "loss": 0.2604, "step": 208500 }, { "epoch": 1.3376000000000001, "grad_norm": 0.685879111289978, "learning_rate": 1.657232e-05, "loss": 0.2576, "step": 209000 }, { "epoch": 1.3408, "grad_norm": 0.5600978136062622, "learning_rate": 1.649232e-05, "loss": 0.2606, "step": 209500 }, { "epoch": 1.3439999999999999, "grad_norm": 0.5358079075813293, "learning_rate": 1.6412640000000002e-05, "loss": 0.2578, "step": 210000 }, { "epoch": 1.3472, "grad_norm": 0.7245667576789856, "learning_rate": 1.63328e-05, "loss": 0.2613, "step": 210500 }, { "epoch": 1.3504, "grad_norm": 0.5963015556335449, "learning_rate": 1.62528e-05, "loss": 0.2568, "step": 211000 }, { "epoch": 1.3536000000000001, "grad_norm": 0.6139352917671204, "learning_rate": 1.6172800000000002e-05, "loss": 0.2501, "step": 211500 }, { "epoch": 1.3568, "grad_norm": 0.5434224605560303, "learning_rate": 1.60928e-05, "loss": 0.2583, "step": 212000 }, { "epoch": 1.3599999999999999, "grad_norm": 0.5723361372947693, "learning_rate": 1.60128e-05, "loss": 0.2582, "step": 212500 }, { "epoch": 1.3632, "grad_norm": 0.5621342658996582, "learning_rate": 1.593296e-05, "loss": 0.2583, "step": 213000 }, { "epoch": 1.3664, "grad_norm": 0.5716707706451416, "learning_rate": 1.585296e-05, "loss": 0.2548, "step": 213500 }, { "epoch": 1.3696, "grad_norm": 0.6344952583312988, "learning_rate": 1.577296e-05, "loss": 0.2526, "step": 214000 }, { "epoch": 1.3728, "grad_norm": 0.6360082030296326, "learning_rate": 1.569296e-05, "loss": 0.259, "step": 214500 }, { "epoch": 1.376, "grad_norm": 0.5400614142417908, "learning_rate": 1.5612960000000002e-05, "loss": 0.26, "step": 215000 }, { "epoch": 1.3792, "grad_norm": 0.6992815732955933, "learning_rate": 1.5532960000000002e-05, "loss": 0.259, "step": 215500 }, { "epoch": 1.3824, "grad_norm": 0.4903436601161957, "learning_rate": 1.545296e-05, "loss": 0.2582, "step": 216000 }, { "epoch": 1.3856, "grad_norm": 0.5602136850357056, "learning_rate": 1.537296e-05, "loss": 0.2563, "step": 216500 }, { "epoch": 1.3888, "grad_norm": 0.5858916640281677, "learning_rate": 1.529296e-05, "loss": 0.2553, "step": 217000 }, { "epoch": 1.392, "grad_norm": 0.438550740480423, "learning_rate": 1.521312e-05, "loss": 0.2567, "step": 217500 }, { "epoch": 1.3952, "grad_norm": 0.5660952925682068, "learning_rate": 1.513312e-05, "loss": 0.2552, "step": 218000 }, { "epoch": 1.3984, "grad_norm": 0.6139314770698547, "learning_rate": 1.5053120000000001e-05, "loss": 0.2506, "step": 218500 }, { "epoch": 1.4016, "grad_norm": 0.5470092296600342, "learning_rate": 1.4973120000000002e-05, "loss": 0.2549, "step": 219000 }, { "epoch": 1.4048, "grad_norm": 0.5565882325172424, "learning_rate": 1.489328e-05, "loss": 0.256, "step": 219500 }, { "epoch": 1.408, "grad_norm": 0.4755209684371948, "learning_rate": 1.4813280000000001e-05, "loss": 0.2571, "step": 220000 }, { "epoch": 1.4112, "grad_norm": 0.5266921520233154, "learning_rate": 1.4733280000000002e-05, "loss": 0.2546, "step": 220500 }, { "epoch": 1.4144, "grad_norm": 0.5858850479125977, "learning_rate": 1.465328e-05, "loss": 0.251, "step": 221000 }, { "epoch": 1.4176, "grad_norm": 0.5382100343704224, "learning_rate": 1.457328e-05, "loss": 0.254, "step": 221500 }, { "epoch": 1.4208, "grad_norm": 0.6082443594932556, "learning_rate": 1.4493280000000001e-05, "loss": 0.2549, "step": 222000 }, { "epoch": 1.424, "grad_norm": 0.56458979845047, "learning_rate": 1.4413440000000001e-05, "loss": 0.2556, "step": 222500 }, { "epoch": 1.4272, "grad_norm": 0.5702414512634277, "learning_rate": 1.433344e-05, "loss": 0.2523, "step": 223000 }, { "epoch": 1.4304000000000001, "grad_norm": 0.5704798102378845, "learning_rate": 1.4253440000000001e-05, "loss": 0.2539, "step": 223500 }, { "epoch": 1.4336, "grad_norm": 0.675832211971283, "learning_rate": 1.4173440000000002e-05, "loss": 0.256, "step": 224000 }, { "epoch": 1.4368, "grad_norm": 0.8129279017448425, "learning_rate": 1.4093439999999999e-05, "loss": 0.2533, "step": 224500 }, { "epoch": 1.44, "grad_norm": 0.6167120933532715, "learning_rate": 1.401344e-05, "loss": 0.254, "step": 225000 }, { "epoch": 1.4432, "grad_norm": 0.6647797226905823, "learning_rate": 1.39336e-05, "loss": 0.2497, "step": 225500 }, { "epoch": 1.4464000000000001, "grad_norm": 0.5046721696853638, "learning_rate": 1.38536e-05, "loss": 0.2513, "step": 226000 }, { "epoch": 1.4496, "grad_norm": 0.5725312232971191, "learning_rate": 1.37736e-05, "loss": 0.2547, "step": 226500 }, { "epoch": 1.4527999999999999, "grad_norm": 0.514900803565979, "learning_rate": 1.36936e-05, "loss": 0.2504, "step": 227000 }, { "epoch": 1.456, "grad_norm": 0.3646963834762573, "learning_rate": 1.3613600000000001e-05, "loss": 0.2548, "step": 227500 }, { "epoch": 1.4592, "grad_norm": 0.6462276577949524, "learning_rate": 1.3533600000000002e-05, "loss": 0.2537, "step": 228000 }, { "epoch": 1.4624, "grad_norm": 0.5525985956192017, "learning_rate": 1.34536e-05, "loss": 0.253, "step": 228500 }, { "epoch": 1.4656, "grad_norm": 0.5146868824958801, "learning_rate": 1.33736e-05, "loss": 0.2531, "step": 229000 }, { "epoch": 1.4687999999999999, "grad_norm": 0.6087847948074341, "learning_rate": 1.329376e-05, "loss": 0.2517, "step": 229500 }, { "epoch": 1.472, "grad_norm": 0.5387943387031555, "learning_rate": 1.321376e-05, "loss": 0.2514, "step": 230000 }, { "epoch": 1.4752, "grad_norm": 0.5926588177680969, "learning_rate": 1.313376e-05, "loss": 0.2544, "step": 230500 }, { "epoch": 1.4784, "grad_norm": 0.5444154143333435, "learning_rate": 1.3053760000000001e-05, "loss": 0.253, "step": 231000 }, { "epoch": 1.4816, "grad_norm": 0.5707711577415466, "learning_rate": 1.2973760000000002e-05, "loss": 0.2509, "step": 231500 }, { "epoch": 1.4848, "grad_norm": 0.5120610594749451, "learning_rate": 1.2893760000000002e-05, "loss": 0.2535, "step": 232000 }, { "epoch": 1.488, "grad_norm": 0.6814270615577698, "learning_rate": 1.281392e-05, "loss": 0.2542, "step": 232500 }, { "epoch": 1.4912, "grad_norm": 0.5387424826622009, "learning_rate": 1.2733920000000002e-05, "loss": 0.2559, "step": 233000 }, { "epoch": 1.4944, "grad_norm": 0.6061420440673828, "learning_rate": 1.2653919999999999e-05, "loss": 0.2492, "step": 233500 }, { "epoch": 1.4976, "grad_norm": 0.5238478183746338, "learning_rate": 1.257392e-05, "loss": 0.2495, "step": 234000 }, { "epoch": 1.5008, "grad_norm": 0.6245620846748352, "learning_rate": 1.2494080000000002e-05, "loss": 0.2503, "step": 234500 }, { "epoch": 1.504, "grad_norm": 0.594336211681366, "learning_rate": 1.241408e-05, "loss": 0.2526, "step": 235000 }, { "epoch": 1.5072, "grad_norm": 0.6665235161781311, "learning_rate": 1.233424e-05, "loss": 0.2565, "step": 235500 }, { "epoch": 1.5104, "grad_norm": 0.4540468454360962, "learning_rate": 1.225424e-05, "loss": 0.2509, "step": 236000 }, { "epoch": 1.5135999999999998, "grad_norm": 0.48204490542411804, "learning_rate": 1.217424e-05, "loss": 0.2566, "step": 236500 }, { "epoch": 1.5168, "grad_norm": 0.7209044098854065, "learning_rate": 1.2094240000000001e-05, "loss": 0.2485, "step": 237000 }, { "epoch": 1.52, "grad_norm": 0.5661574006080627, "learning_rate": 1.201424e-05, "loss": 0.2496, "step": 237500 }, { "epoch": 1.5232, "grad_norm": 0.4637988805770874, "learning_rate": 1.1934240000000001e-05, "loss": 0.2498, "step": 238000 }, { "epoch": 1.5264, "grad_norm": 0.5440483093261719, "learning_rate": 1.1854240000000002e-05, "loss": 0.2539, "step": 238500 }, { "epoch": 1.5295999999999998, "grad_norm": 0.6143088936805725, "learning_rate": 1.1774240000000001e-05, "loss": 0.2507, "step": 239000 }, { "epoch": 1.5328, "grad_norm": 0.553655207157135, "learning_rate": 1.1694400000000001e-05, "loss": 0.2494, "step": 239500 }, { "epoch": 1.536, "grad_norm": 0.5812162160873413, "learning_rate": 1.16144e-05, "loss": 0.2494, "step": 240000 }, { "epoch": 1.5392000000000001, "grad_norm": 0.4919438660144806, "learning_rate": 1.1534400000000001e-05, "loss": 0.2471, "step": 240500 }, { "epoch": 1.5424, "grad_norm": 0.6260576844215393, "learning_rate": 1.1454400000000002e-05, "loss": 0.2518, "step": 241000 }, { "epoch": 1.5455999999999999, "grad_norm": 0.6452062726020813, "learning_rate": 1.137456e-05, "loss": 0.253, "step": 241500 }, { "epoch": 1.5488, "grad_norm": 0.5557950139045715, "learning_rate": 1.1294559999999999e-05, "loss": 0.2553, "step": 242000 }, { "epoch": 1.552, "grad_norm": 0.6445254683494568, "learning_rate": 1.121456e-05, "loss": 0.2513, "step": 242500 }, { "epoch": 1.5552000000000001, "grad_norm": 0.5771984457969666, "learning_rate": 1.113456e-05, "loss": 0.2518, "step": 243000 }, { "epoch": 1.5584, "grad_norm": 0.48172786831855774, "learning_rate": 1.105456e-05, "loss": 0.2529, "step": 243500 }, { "epoch": 1.5615999999999999, "grad_norm": 0.5962732434272766, "learning_rate": 1.0974720000000002e-05, "loss": 0.2552, "step": 244000 }, { "epoch": 1.5648, "grad_norm": 0.5713253617286682, "learning_rate": 1.089472e-05, "loss": 0.2518, "step": 244500 }, { "epoch": 1.568, "grad_norm": 0.7049676775932312, "learning_rate": 1.081472e-05, "loss": 0.2503, "step": 245000 }, { "epoch": 1.5712000000000002, "grad_norm": 0.5562995076179504, "learning_rate": 1.073472e-05, "loss": 0.2534, "step": 245500 }, { "epoch": 1.5744, "grad_norm": 0.5492623448371887, "learning_rate": 1.065472e-05, "loss": 0.2508, "step": 246000 }, { "epoch": 1.5776, "grad_norm": 0.6449033617973328, "learning_rate": 1.057472e-05, "loss": 0.2506, "step": 246500 }, { "epoch": 1.5808, "grad_norm": 0.5232768058776855, "learning_rate": 1.049488e-05, "loss": 0.2497, "step": 247000 }, { "epoch": 1.584, "grad_norm": 0.5512565970420837, "learning_rate": 1.0414880000000001e-05, "loss": 0.2534, "step": 247500 }, { "epoch": 1.5872000000000002, "grad_norm": 0.48962149024009705, "learning_rate": 1.033488e-05, "loss": 0.2552, "step": 248000 }, { "epoch": 1.5904, "grad_norm": 0.5635197162628174, "learning_rate": 1.025488e-05, "loss": 0.251, "step": 248500 }, { "epoch": 1.5936, "grad_norm": 0.5858097076416016, "learning_rate": 1.0175040000000001e-05, "loss": 0.2521, "step": 249000 }, { "epoch": 1.5968, "grad_norm": 0.5749566555023193, "learning_rate": 1.0095200000000001e-05, "loss": 0.2542, "step": 249500 }, { "epoch": 1.6, "grad_norm": 0.6057468056678772, "learning_rate": 1.00152e-05, "loss": 0.249, "step": 250000 }, { "epoch": 1.6032, "grad_norm": 0.68129962682724, "learning_rate": 9.9352e-06, "loss": 0.2496, "step": 250500 }, { "epoch": 1.6064, "grad_norm": 0.5518680810928345, "learning_rate": 9.8552e-06, "loss": 0.2483, "step": 251000 }, { "epoch": 1.6096, "grad_norm": 0.7354257702827454, "learning_rate": 9.775200000000001e-06, "loss": 0.251, "step": 251500 }, { "epoch": 1.6128, "grad_norm": 0.5537115335464478, "learning_rate": 9.6952e-06, "loss": 0.2519, "step": 252000 }, { "epoch": 1.616, "grad_norm": 0.5443572402000427, "learning_rate": 9.615360000000002e-06, "loss": 0.2505, "step": 252500 }, { "epoch": 1.6192, "grad_norm": 0.8157851099967957, "learning_rate": 9.53536e-06, "loss": 0.2456, "step": 253000 }, { "epoch": 1.6223999999999998, "grad_norm": 0.5709113478660583, "learning_rate": 9.455520000000001e-06, "loss": 0.2511, "step": 253500 }, { "epoch": 1.6256, "grad_norm": 0.5266199707984924, "learning_rate": 9.37552e-06, "loss": 0.2523, "step": 254000 }, { "epoch": 1.6288, "grad_norm": 0.6796950697898865, "learning_rate": 9.29552e-06, "loss": 0.2529, "step": 254500 }, { "epoch": 1.6320000000000001, "grad_norm": 0.5162604451179504, "learning_rate": 9.215520000000002e-06, "loss": 0.2486, "step": 255000 }, { "epoch": 1.6352, "grad_norm": 0.5577069520950317, "learning_rate": 9.13552e-06, "loss": 0.2501, "step": 255500 }, { "epoch": 1.6383999999999999, "grad_norm": 0.5930905342102051, "learning_rate": 9.055520000000001e-06, "loss": 0.2505, "step": 256000 }, { "epoch": 1.6416, "grad_norm": 0.5219632983207703, "learning_rate": 8.97552e-06, "loss": 0.2499, "step": 256500 }, { "epoch": 1.6448, "grad_norm": 0.5385752320289612, "learning_rate": 8.89552e-06, "loss": 0.2473, "step": 257000 }, { "epoch": 1.6480000000000001, "grad_norm": 0.5498505234718323, "learning_rate": 8.81552e-06, "loss": 0.2513, "step": 257500 }, { "epoch": 1.6512, "grad_norm": 0.5780929923057556, "learning_rate": 8.73552e-06, "loss": 0.2488, "step": 258000 }, { "epoch": 1.6543999999999999, "grad_norm": 0.6167399883270264, "learning_rate": 8.65552e-06, "loss": 0.2501, "step": 258500 }, { "epoch": 1.6576, "grad_norm": 0.6829573512077332, "learning_rate": 8.575520000000001e-06, "loss": 0.2477, "step": 259000 }, { "epoch": 1.6608, "grad_norm": 0.4874655604362488, "learning_rate": 8.49552e-06, "loss": 0.2507, "step": 259500 }, { "epoch": 1.6640000000000001, "grad_norm": 0.5769158601760864, "learning_rate": 8.41568e-06, "loss": 0.2477, "step": 260000 }, { "epoch": 1.6672, "grad_norm": 0.45717403292655945, "learning_rate": 8.335679999999999e-06, "loss": 0.2515, "step": 260500 }, { "epoch": 1.6703999999999999, "grad_norm": 0.5851497650146484, "learning_rate": 8.25568e-06, "loss": 0.2453, "step": 261000 }, { "epoch": 1.6736, "grad_norm": 0.7223703265190125, "learning_rate": 8.17568e-06, "loss": 0.2534, "step": 261500 }, { "epoch": 1.6768, "grad_norm": 0.5290210843086243, "learning_rate": 8.09584e-06, "loss": 0.2468, "step": 262000 }, { "epoch": 1.6800000000000002, "grad_norm": 0.5959377884864807, "learning_rate": 8.015999999999999e-06, "loss": 0.2498, "step": 262500 }, { "epoch": 1.6832, "grad_norm": 0.5404760241508484, "learning_rate": 7.936e-06, "loss": 0.2485, "step": 263000 }, { "epoch": 1.6864, "grad_norm": 0.496378093957901, "learning_rate": 7.856e-06, "loss": 0.2474, "step": 263500 }, { "epoch": 1.6896, "grad_norm": 0.54584801197052, "learning_rate": 7.776e-06, "loss": 0.2498, "step": 264000 }, { "epoch": 1.6928, "grad_norm": 0.5465365052223206, "learning_rate": 7.696160000000002e-06, "loss": 0.2468, "step": 264500 }, { "epoch": 1.696, "grad_norm": 0.5857728719711304, "learning_rate": 7.61616e-06, "loss": 0.2485, "step": 265000 }, { "epoch": 1.6992, "grad_norm": 0.5276440382003784, "learning_rate": 7.5361600000000005e-06, "loss": 0.2467, "step": 265500 }, { "epoch": 1.7024, "grad_norm": 0.5197107195854187, "learning_rate": 7.456160000000001e-06, "loss": 0.2457, "step": 266000 }, { "epoch": 1.7056, "grad_norm": 0.6030395030975342, "learning_rate": 7.37616e-06, "loss": 0.2449, "step": 266500 }, { "epoch": 1.7088, "grad_norm": 0.5553884506225586, "learning_rate": 7.29616e-06, "loss": 0.2459, "step": 267000 }, { "epoch": 1.712, "grad_norm": 0.6518832445144653, "learning_rate": 7.216160000000001e-06, "loss": 0.2469, "step": 267500 }, { "epoch": 1.7151999999999998, "grad_norm": 0.6981451511383057, "learning_rate": 7.13616e-06, "loss": 0.2493, "step": 268000 }, { "epoch": 1.7184, "grad_norm": 0.6021608114242554, "learning_rate": 7.056160000000001e-06, "loss": 0.2477, "step": 268500 }, { "epoch": 1.7216, "grad_norm": 0.6317922472953796, "learning_rate": 6.976160000000001e-06, "loss": 0.2461, "step": 269000 }, { "epoch": 1.7248, "grad_norm": 0.6130341291427612, "learning_rate": 6.89616e-06, "loss": 0.2508, "step": 269500 }, { "epoch": 1.728, "grad_norm": 0.6314118504524231, "learning_rate": 6.816160000000001e-06, "loss": 0.243, "step": 270000 }, { "epoch": 1.7311999999999999, "grad_norm": 0.6070537567138672, "learning_rate": 6.73616e-06, "loss": 0.2472, "step": 270500 }, { "epoch": 1.7344, "grad_norm": 0.5763754844665527, "learning_rate": 6.6561600000000005e-06, "loss": 0.2511, "step": 271000 }, { "epoch": 1.7376, "grad_norm": 0.6849692463874817, "learning_rate": 6.57632e-06, "loss": 0.2461, "step": 271500 }, { "epoch": 1.7408000000000001, "grad_norm": 0.6505193710327148, "learning_rate": 6.4963200000000005e-06, "loss": 0.2472, "step": 272000 }, { "epoch": 1.744, "grad_norm": 0.5150639414787292, "learning_rate": 6.4163200000000004e-06, "loss": 0.2456, "step": 272500 }, { "epoch": 1.7471999999999999, "grad_norm": 0.6367226839065552, "learning_rate": 6.3363199999999995e-06, "loss": 0.2478, "step": 273000 }, { "epoch": 1.7504, "grad_norm": 0.6016091704368591, "learning_rate": 6.25632e-06, "loss": 0.2457, "step": 273500 }, { "epoch": 1.7536, "grad_norm": 0.6344937682151794, "learning_rate": 6.17632e-06, "loss": 0.2449, "step": 274000 }, { "epoch": 1.7568000000000001, "grad_norm": 0.6031948924064636, "learning_rate": 6.09632e-06, "loss": 0.2474, "step": 274500 }, { "epoch": 1.76, "grad_norm": 0.6515588760375977, "learning_rate": 6.01648e-06, "loss": 0.2463, "step": 275000 }, { "epoch": 1.7631999999999999, "grad_norm": 0.562818706035614, "learning_rate": 5.93648e-06, "loss": 0.2507, "step": 275500 }, { "epoch": 1.7664, "grad_norm": 0.591066300868988, "learning_rate": 5.85648e-06, "loss": 0.2488, "step": 276000 }, { "epoch": 1.7696, "grad_norm": 0.544698178768158, "learning_rate": 5.77664e-06, "loss": 0.2445, "step": 276500 }, { "epoch": 1.7728000000000002, "grad_norm": 0.4837886691093445, "learning_rate": 5.6968e-06, "loss": 0.2497, "step": 277000 }, { "epoch": 1.776, "grad_norm": 0.5405265092849731, "learning_rate": 5.6168e-06, "loss": 0.2489, "step": 277500 }, { "epoch": 1.7792, "grad_norm": 0.560249924659729, "learning_rate": 5.5368e-06, "loss": 0.2466, "step": 278000 }, { "epoch": 1.7824, "grad_norm": 0.5680164098739624, "learning_rate": 5.4568e-06, "loss": 0.2475, "step": 278500 }, { "epoch": 1.7856, "grad_norm": 0.7152078747749329, "learning_rate": 5.376800000000001e-06, "loss": 0.2439, "step": 279000 }, { "epoch": 1.7888, "grad_norm": 0.6013668775558472, "learning_rate": 5.296800000000001e-06, "loss": 0.2482, "step": 279500 }, { "epoch": 1.792, "grad_norm": 0.5784064531326294, "learning_rate": 5.2168e-06, "loss": 0.2495, "step": 280000 }, { "epoch": 1.7952, "grad_norm": 0.5531567931175232, "learning_rate": 5.1368e-06, "loss": 0.2497, "step": 280500 }, { "epoch": 1.7984, "grad_norm": 0.5494315028190613, "learning_rate": 5.0568000000000004e-06, "loss": 0.2476, "step": 281000 }, { "epoch": 1.8016, "grad_norm": 0.6254246830940247, "learning_rate": 4.9768e-06, "loss": 0.243, "step": 281500 }, { "epoch": 1.8048, "grad_norm": 0.7309369444847107, "learning_rate": 4.8969600000000005e-06, "loss": 0.2428, "step": 282000 }, { "epoch": 1.808, "grad_norm": 0.587374210357666, "learning_rate": 4.81696e-06, "loss": 0.253, "step": 282500 }, { "epoch": 1.8112, "grad_norm": 0.5197418928146362, "learning_rate": 4.73696e-06, "loss": 0.2454, "step": 283000 }, { "epoch": 1.8144, "grad_norm": 0.5393714308738708, "learning_rate": 4.65696e-06, "loss": 0.2442, "step": 283500 }, { "epoch": 1.8176, "grad_norm": 0.6797386407852173, "learning_rate": 4.57696e-06, "loss": 0.2455, "step": 284000 }, { "epoch": 1.8208, "grad_norm": 0.5192613005638123, "learning_rate": 4.49696e-06, "loss": 0.2457, "step": 284500 }, { "epoch": 1.8239999999999998, "grad_norm": 0.5224815607070923, "learning_rate": 4.416960000000001e-06, "loss": 0.2442, "step": 285000 }, { "epoch": 1.8272, "grad_norm": 0.5999212861061096, "learning_rate": 4.336960000000001e-06, "loss": 0.2465, "step": 285500 }, { "epoch": 1.8304, "grad_norm": 0.6273928880691528, "learning_rate": 4.25712e-06, "loss": 0.2451, "step": 286000 }, { "epoch": 1.8336000000000001, "grad_norm": 0.4545860290527344, "learning_rate": 4.177120000000001e-06, "loss": 0.2455, "step": 286500 }, { "epoch": 1.8368, "grad_norm": 0.5125412344932556, "learning_rate": 4.09712e-06, "loss": 0.2471, "step": 287000 }, { "epoch": 1.8399999999999999, "grad_norm": 0.6210908889770508, "learning_rate": 4.01712e-06, "loss": 0.2504, "step": 287500 }, { "epoch": 1.8432, "grad_norm": 0.5454786419868469, "learning_rate": 3.9371200000000005e-06, "loss": 0.2459, "step": 288000 }, { "epoch": 1.8464, "grad_norm": 0.4733567535877228, "learning_rate": 3.8571200000000004e-06, "loss": 0.2417, "step": 288500 }, { "epoch": 1.8496000000000001, "grad_norm": 0.5461744666099548, "learning_rate": 3.7772800000000005e-06, "loss": 0.2448, "step": 289000 }, { "epoch": 1.8528, "grad_norm": 0.5227847695350647, "learning_rate": 3.69728e-06, "loss": 0.2491, "step": 289500 }, { "epoch": 1.8559999999999999, "grad_norm": 0.4567984938621521, "learning_rate": 3.61728e-06, "loss": 0.2491, "step": 290000 }, { "epoch": 1.8592, "grad_norm": 0.634410560131073, "learning_rate": 3.5372800000000003e-06, "loss": 0.249, "step": 290500 }, { "epoch": 1.8624, "grad_norm": 0.5894684195518494, "learning_rate": 3.45728e-06, "loss": 0.2468, "step": 291000 }, { "epoch": 1.8656000000000001, "grad_norm": 0.6827466487884521, "learning_rate": 3.37728e-06, "loss": 0.2446, "step": 291500 }, { "epoch": 1.8688, "grad_norm": 0.6229824423789978, "learning_rate": 3.2972799999999996e-06, "loss": 0.2487, "step": 292000 }, { "epoch": 1.8719999999999999, "grad_norm": 0.5481498837471008, "learning_rate": 3.2172800000000004e-06, "loss": 0.2461, "step": 292500 }, { "epoch": 1.8752, "grad_norm": 0.6237102746963501, "learning_rate": 3.13728e-06, "loss": 0.2474, "step": 293000 }, { "epoch": 1.8784, "grad_norm": 0.6718310117721558, "learning_rate": 3.0572800000000002e-06, "loss": 0.2512, "step": 293500 }, { "epoch": 1.8816000000000002, "grad_norm": 0.5450266003608704, "learning_rate": 2.97728e-06, "loss": 0.249, "step": 294000 }, { "epoch": 1.8848, "grad_norm": 0.6037828326225281, "learning_rate": 2.8974400000000002e-06, "loss": 0.2447, "step": 294500 }, { "epoch": 1.888, "grad_norm": 0.60282963514328, "learning_rate": 2.81744e-06, "loss": 0.2449, "step": 295000 }, { "epoch": 1.8912, "grad_norm": 0.5670416355133057, "learning_rate": 2.73744e-06, "loss": 0.2424, "step": 295500 }, { "epoch": 1.8944, "grad_norm": 0.6023501753807068, "learning_rate": 2.6576e-06, "loss": 0.2449, "step": 296000 }, { "epoch": 1.8976, "grad_norm": 0.5216399431228638, "learning_rate": 2.5776e-06, "loss": 0.243, "step": 296500 }, { "epoch": 1.9008, "grad_norm": 0.49217623472213745, "learning_rate": 2.4976000000000004e-06, "loss": 0.2432, "step": 297000 }, { "epoch": 1.904, "grad_norm": 0.46074241399765015, "learning_rate": 2.4176e-06, "loss": 0.2434, "step": 297500 }, { "epoch": 1.9072, "grad_norm": 0.5020151734352112, "learning_rate": 2.3376000000000003e-06, "loss": 0.2424, "step": 298000 }, { "epoch": 1.9104, "grad_norm": 0.5782959461212158, "learning_rate": 2.2576e-06, "loss": 0.246, "step": 298500 }, { "epoch": 1.9136, "grad_norm": 0.5627701282501221, "learning_rate": 2.1777600000000003e-06, "loss": 0.2483, "step": 299000 }, { "epoch": 1.9167999999999998, "grad_norm": 0.5413541793823242, "learning_rate": 2.09776e-06, "loss": 0.2472, "step": 299500 }, { "epoch": 1.92, "grad_norm": 0.5274430513381958, "learning_rate": 2.01776e-06, "loss": 0.2439, "step": 300000 }, { "epoch": 1.9232, "grad_norm": 0.5475273728370667, "learning_rate": 1.93776e-06, "loss": 0.2442, "step": 300500 }, { "epoch": 1.9264000000000001, "grad_norm": 0.42543721199035645, "learning_rate": 1.85776e-06, "loss": 0.2472, "step": 301000 }, { "epoch": 1.9296, "grad_norm": 0.6353417634963989, "learning_rate": 1.7777600000000001e-06, "loss": 0.2478, "step": 301500 }, { "epoch": 1.9327999999999999, "grad_norm": 0.6469547748565674, "learning_rate": 1.69776e-06, "loss": 0.2399, "step": 302000 }, { "epoch": 1.936, "grad_norm": 0.5442044734954834, "learning_rate": 1.6177600000000002e-06, "loss": 0.2461, "step": 302500 }, { "epoch": 1.9392, "grad_norm": 0.6031491756439209, "learning_rate": 1.53776e-06, "loss": 0.2463, "step": 303000 }, { "epoch": 1.9424000000000001, "grad_norm": 0.6695773005485535, "learning_rate": 1.4579200000000002e-06, "loss": 0.2475, "step": 303500 }, { "epoch": 1.9456, "grad_norm": 0.5614984631538391, "learning_rate": 1.37792e-06, "loss": 0.2442, "step": 304000 }, { "epoch": 1.9487999999999999, "grad_norm": 0.5449358224868774, "learning_rate": 1.2980800000000002e-06, "loss": 0.2482, "step": 304500 }, { "epoch": 1.952, "grad_norm": 0.4947919249534607, "learning_rate": 1.2180800000000001e-06, "loss": 0.2434, "step": 305000 }, { "epoch": 1.9552, "grad_norm": 0.5914204716682434, "learning_rate": 1.13808e-06, "loss": 0.2449, "step": 305500 }, { "epoch": 1.9584000000000001, "grad_norm": 0.5385975241661072, "learning_rate": 1.05808e-06, "loss": 0.2484, "step": 306000 }, { "epoch": 1.9616, "grad_norm": 0.5446351170539856, "learning_rate": 9.780799999999999e-07, "loss": 0.2434, "step": 306500 }, { "epoch": 1.9647999999999999, "grad_norm": 0.5600497722625732, "learning_rate": 8.980800000000001e-07, "loss": 0.2456, "step": 307000 }, { "epoch": 1.968, "grad_norm": 0.46547964215278625, "learning_rate": 8.1808e-07, "loss": 0.2467, "step": 307500 }, { "epoch": 1.9712, "grad_norm": 0.5435817241668701, "learning_rate": 7.380800000000001e-07, "loss": 0.25, "step": 308000 } ], "logging_steps": 500, "max_steps": 312500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.50047224233984e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }