{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.1872, "eval_steps": 500, "global_step": 185500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032, "grad_norm": 1.7230606079101562, "learning_rate": 4.99208e-05, "loss": 1.2281, "step": 500 }, { "epoch": 0.0064, "grad_norm": 3.655383348464966, "learning_rate": 4.9840800000000006e-05, "loss": 0.7566, "step": 1000 }, { "epoch": 0.0096, "grad_norm": 1.2925927639007568, "learning_rate": 4.97608e-05, "loss": 0.6764, "step": 1500 }, { "epoch": 0.0128, "grad_norm": 1.286004900932312, "learning_rate": 4.968080000000001e-05, "loss": 0.6304, "step": 2000 }, { "epoch": 0.016, "grad_norm": 1.2140214443206787, "learning_rate": 4.96008e-05, "loss": 0.5981, "step": 2500 }, { "epoch": 0.0192, "grad_norm": 1.2525482177734375, "learning_rate": 4.95208e-05, "loss": 0.5767, "step": 3000 }, { "epoch": 0.0224, "grad_norm": 1.2310410737991333, "learning_rate": 4.94408e-05, "loss": 0.5597, "step": 3500 }, { "epoch": 0.0256, "grad_norm": 1.1735206842422485, "learning_rate": 4.9360800000000004e-05, "loss": 0.5418, "step": 4000 }, { "epoch": 0.0288, "grad_norm": 1.114688754081726, "learning_rate": 4.9280800000000004e-05, "loss": 0.5335, "step": 4500 }, { "epoch": 0.032, "grad_norm": 0.8874593377113342, "learning_rate": 4.9200800000000005e-05, "loss": 0.5237, "step": 5000 }, { "epoch": 0.0352, "grad_norm": 1.1261299848556519, "learning_rate": 4.91208e-05, "loss": 0.5135, "step": 5500 }, { "epoch": 0.0384, "grad_norm": 0.9994556307792664, "learning_rate": 4.9040800000000007e-05, "loss": 0.5059, "step": 6000 }, { "epoch": 0.0416, "grad_norm": 1.2349673509597778, "learning_rate": 4.89608e-05, "loss": 0.4939, "step": 6500 }, { "epoch": 0.0448, "grad_norm": 0.9770995378494263, "learning_rate": 4.88808e-05, "loss": 0.4824, "step": 7000 }, { "epoch": 0.048, "grad_norm": 0.981966495513916, "learning_rate": 4.88008e-05, "loss": 0.4875, "step": 7500 }, { "epoch": 0.0512, "grad_norm": 1.0177415609359741, "learning_rate": 4.87208e-05, "loss": 0.4785, "step": 8000 }, { "epoch": 0.0544, "grad_norm": 1.0521667003631592, "learning_rate": 4.8640800000000004e-05, "loss": 0.4731, "step": 8500 }, { "epoch": 0.0576, "grad_norm": 0.8560615181922913, "learning_rate": 4.85608e-05, "loss": 0.4633, "step": 9000 }, { "epoch": 0.0608, "grad_norm": 1.0170217752456665, "learning_rate": 4.8480800000000005e-05, "loss": 0.4576, "step": 9500 }, { "epoch": 0.064, "grad_norm": 0.9891325831413269, "learning_rate": 4.84008e-05, "loss": 0.4556, "step": 10000 }, { "epoch": 0.0672, "grad_norm": 1.0609711408615112, "learning_rate": 4.832080000000001e-05, "loss": 0.4493, "step": 10500 }, { "epoch": 0.0704, "grad_norm": 0.8623799681663513, "learning_rate": 4.82408e-05, "loss": 0.4459, "step": 11000 }, { "epoch": 0.0736, "grad_norm": 0.9587870240211487, "learning_rate": 4.81608e-05, "loss": 0.4418, "step": 11500 }, { "epoch": 0.0768, "grad_norm": 0.8939447999000549, "learning_rate": 4.80808e-05, "loss": 0.4327, "step": 12000 }, { "epoch": 0.08, "grad_norm": 0.9886033535003662, "learning_rate": 4.80008e-05, "loss": 0.438, "step": 12500 }, { "epoch": 0.0832, "grad_norm": 0.9157513976097107, "learning_rate": 4.7920800000000004e-05, "loss": 0.4323, "step": 13000 }, { "epoch": 0.0864, "grad_norm": 0.9085854887962341, "learning_rate": 4.7840800000000005e-05, "loss": 0.4303, "step": 13500 }, { "epoch": 0.0896, "grad_norm": 0.9123984575271606, "learning_rate": 4.77608e-05, "loss": 0.4247, "step": 14000 }, { "epoch": 0.0928, "grad_norm": 0.839026689529419, "learning_rate": 4.7680960000000004e-05, "loss": 0.4233, "step": 14500 }, { "epoch": 0.096, "grad_norm": 0.8110847473144531, "learning_rate": 4.760096e-05, "loss": 0.4207, "step": 15000 }, { "epoch": 0.0992, "grad_norm": 0.8462579250335693, "learning_rate": 4.7520960000000005e-05, "loss": 0.421, "step": 15500 }, { "epoch": 0.1024, "grad_norm": 0.8980106711387634, "learning_rate": 4.744096e-05, "loss": 0.417, "step": 16000 }, { "epoch": 0.1056, "grad_norm": 0.8297702074050903, "learning_rate": 4.736096000000001e-05, "loss": 0.4139, "step": 16500 }, { "epoch": 0.1088, "grad_norm": 0.9856173992156982, "learning_rate": 4.728096e-05, "loss": 0.419, "step": 17000 }, { "epoch": 0.112, "grad_norm": 0.934256911277771, "learning_rate": 4.720096e-05, "loss": 0.4098, "step": 17500 }, { "epoch": 0.1152, "grad_norm": 0.9190649390220642, "learning_rate": 4.712096e-05, "loss": 0.412, "step": 18000 }, { "epoch": 0.1184, "grad_norm": 0.9078772664070129, "learning_rate": 4.704096e-05, "loss": 0.4043, "step": 18500 }, { "epoch": 0.1216, "grad_norm": 1.082939624786377, "learning_rate": 4.696112e-05, "loss": 0.4045, "step": 19000 }, { "epoch": 0.1248, "grad_norm": 0.9159390926361084, "learning_rate": 4.688112e-05, "loss": 0.4098, "step": 19500 }, { "epoch": 0.128, "grad_norm": 0.8420547842979431, "learning_rate": 4.680128e-05, "loss": 0.4033, "step": 20000 }, { "epoch": 0.1312, "grad_norm": 0.7658286094665527, "learning_rate": 4.672128e-05, "loss": 0.4002, "step": 20500 }, { "epoch": 0.1344, "grad_norm": 0.9074057340621948, "learning_rate": 4.664128e-05, "loss": 0.3964, "step": 21000 }, { "epoch": 0.1376, "grad_norm": 0.6065025329589844, "learning_rate": 4.656128e-05, "loss": 0.3984, "step": 21500 }, { "epoch": 0.1408, "grad_norm": 0.7523757219314575, "learning_rate": 4.6481280000000004e-05, "loss": 0.3959, "step": 22000 }, { "epoch": 0.144, "grad_norm": 0.807826042175293, "learning_rate": 4.6401280000000004e-05, "loss": 0.3921, "step": 22500 }, { "epoch": 0.1472, "grad_norm": 0.8530682325363159, "learning_rate": 4.632128e-05, "loss": 0.4002, "step": 23000 }, { "epoch": 0.1504, "grad_norm": 0.8661518692970276, "learning_rate": 4.6241280000000006e-05, "loss": 0.3856, "step": 23500 }, { "epoch": 0.1536, "grad_norm": 0.7473235130310059, "learning_rate": 4.616144e-05, "loss": 0.3854, "step": 24000 }, { "epoch": 0.1568, "grad_norm": 0.7954819202423096, "learning_rate": 4.6081440000000005e-05, "loss": 0.3871, "step": 24500 }, { "epoch": 0.16, "grad_norm": 0.8758727312088013, "learning_rate": 4.600144e-05, "loss": 0.3842, "step": 25000 }, { "epoch": 0.1632, "grad_norm": 0.8430293798446655, "learning_rate": 4.592144000000001e-05, "loss": 0.3886, "step": 25500 }, { "epoch": 0.1664, "grad_norm": 0.6557173728942871, "learning_rate": 4.584144e-05, "loss": 0.3854, "step": 26000 }, { "epoch": 0.1696, "grad_norm": 0.7791888117790222, "learning_rate": 4.576144e-05, "loss": 0.3796, "step": 26500 }, { "epoch": 0.1728, "grad_norm": 0.736084520816803, "learning_rate": 4.56816e-05, "loss": 0.3806, "step": 27000 }, { "epoch": 0.176, "grad_norm": 0.7714269161224365, "learning_rate": 4.56016e-05, "loss": 0.3781, "step": 27500 }, { "epoch": 0.1792, "grad_norm": 0.766144335269928, "learning_rate": 4.552176e-05, "loss": 0.3766, "step": 28000 }, { "epoch": 0.1824, "grad_norm": 0.7035301923751831, "learning_rate": 4.544176e-05, "loss": 0.3737, "step": 28500 }, { "epoch": 0.1856, "grad_norm": 0.7573793530464172, "learning_rate": 4.536176e-05, "loss": 0.3753, "step": 29000 }, { "epoch": 0.1888, "grad_norm": 0.8799508213996887, "learning_rate": 4.528176e-05, "loss": 0.373, "step": 29500 }, { "epoch": 0.192, "grad_norm": 0.8543264269828796, "learning_rate": 4.520176e-05, "loss": 0.3735, "step": 30000 }, { "epoch": 0.1952, "grad_norm": 0.6768947243690491, "learning_rate": 4.512176e-05, "loss": 0.3697, "step": 30500 }, { "epoch": 0.1984, "grad_norm": 0.8239702582359314, "learning_rate": 4.504176e-05, "loss": 0.3675, "step": 31000 }, { "epoch": 0.2016, "grad_norm": 0.8310449123382568, "learning_rate": 4.4961760000000004e-05, "loss": 0.3695, "step": 31500 }, { "epoch": 0.2048, "grad_norm": 0.8459475040435791, "learning_rate": 4.488176e-05, "loss": 0.3694, "step": 32000 }, { "epoch": 0.208, "grad_norm": 0.7346063852310181, "learning_rate": 4.4801760000000006e-05, "loss": 0.3646, "step": 32500 }, { "epoch": 0.2112, "grad_norm": 0.6958354115486145, "learning_rate": 4.472176e-05, "loss": 0.3704, "step": 33000 }, { "epoch": 0.2144, "grad_norm": 0.8244686722755432, "learning_rate": 4.464176000000001e-05, "loss": 0.3647, "step": 33500 }, { "epoch": 0.2176, "grad_norm": 0.7559502124786377, "learning_rate": 4.456192e-05, "loss": 0.3665, "step": 34000 }, { "epoch": 0.2208, "grad_norm": 0.9046504497528076, "learning_rate": 4.4481920000000007e-05, "loss": 0.3637, "step": 34500 }, { "epoch": 0.224, "grad_norm": 0.7771899700164795, "learning_rate": 4.440192e-05, "loss": 0.3648, "step": 35000 }, { "epoch": 0.2272, "grad_norm": 0.6887528300285339, "learning_rate": 4.432192e-05, "loss": 0.3562, "step": 35500 }, { "epoch": 0.2304, "grad_norm": 0.7471407055854797, "learning_rate": 4.424192e-05, "loss": 0.3639, "step": 36000 }, { "epoch": 0.2336, "grad_norm": 0.7198163270950317, "learning_rate": 4.416192e-05, "loss": 0.3604, "step": 36500 }, { "epoch": 0.2368, "grad_norm": 0.7383478879928589, "learning_rate": 4.4081920000000004e-05, "loss": 0.3592, "step": 37000 }, { "epoch": 0.24, "grad_norm": 0.8052579760551453, "learning_rate": 4.4001920000000004e-05, "loss": 0.3563, "step": 37500 }, { "epoch": 0.2432, "grad_norm": 0.7765107154846191, "learning_rate": 4.392224e-05, "loss": 0.3548, "step": 38000 }, { "epoch": 0.2464, "grad_norm": 0.7250288724899292, "learning_rate": 4.384224e-05, "loss": 0.3605, "step": 38500 }, { "epoch": 0.2496, "grad_norm": 0.6914694309234619, "learning_rate": 4.376224e-05, "loss": 0.3551, "step": 39000 }, { "epoch": 0.2528, "grad_norm": 0.6636275053024292, "learning_rate": 4.368224e-05, "loss": 0.3587, "step": 39500 }, { "epoch": 0.256, "grad_norm": 0.710564911365509, "learning_rate": 4.360224e-05, "loss": 0.3537, "step": 40000 }, { "epoch": 0.2592, "grad_norm": 0.6195800304412842, "learning_rate": 4.3522240000000004e-05, "loss": 0.3537, "step": 40500 }, { "epoch": 0.2624, "grad_norm": 0.7131514549255371, "learning_rate": 4.34424e-05, "loss": 0.3531, "step": 41000 }, { "epoch": 0.2656, "grad_norm": 0.6594410538673401, "learning_rate": 4.336256e-05, "loss": 0.3518, "step": 41500 }, { "epoch": 0.2688, "grad_norm": 0.7651230096817017, "learning_rate": 4.328256e-05, "loss": 0.3516, "step": 42000 }, { "epoch": 0.272, "grad_norm": 0.756515622138977, "learning_rate": 4.320256e-05, "loss": 0.3461, "step": 42500 }, { "epoch": 0.2752, "grad_norm": 0.7201528549194336, "learning_rate": 4.3122560000000003e-05, "loss": 0.3497, "step": 43000 }, { "epoch": 0.2784, "grad_norm": 0.7436856031417847, "learning_rate": 4.3042560000000004e-05, "loss": 0.3505, "step": 43500 }, { "epoch": 0.2816, "grad_norm": 0.7914199829101562, "learning_rate": 4.2962560000000005e-05, "loss": 0.3439, "step": 44000 }, { "epoch": 0.2848, "grad_norm": 0.7488194704055786, "learning_rate": 4.288256e-05, "loss": 0.349, "step": 44500 }, { "epoch": 0.288, "grad_norm": 0.8654124736785889, "learning_rate": 4.280256e-05, "loss": 0.3491, "step": 45000 }, { "epoch": 0.2912, "grad_norm": 0.6817401647567749, "learning_rate": 4.272272e-05, "loss": 0.3447, "step": 45500 }, { "epoch": 0.2944, "grad_norm": 0.6439715623855591, "learning_rate": 4.2642720000000006e-05, "loss": 0.3453, "step": 46000 }, { "epoch": 0.2976, "grad_norm": 1.3840138912200928, "learning_rate": 4.256272e-05, "loss": 0.3445, "step": 46500 }, { "epoch": 0.3008, "grad_norm": 0.7245766520500183, "learning_rate": 4.248272e-05, "loss": 0.3462, "step": 47000 }, { "epoch": 0.304, "grad_norm": 0.6877666711807251, "learning_rate": 4.240288e-05, "loss": 0.3465, "step": 47500 }, { "epoch": 0.3072, "grad_norm": 0.8494886159896851, "learning_rate": 4.2322880000000006e-05, "loss": 0.348, "step": 48000 }, { "epoch": 0.3104, "grad_norm": 0.6704971790313721, "learning_rate": 4.224288e-05, "loss": 0.3403, "step": 48500 }, { "epoch": 0.3136, "grad_norm": 0.6239964962005615, "learning_rate": 4.216288000000001e-05, "loss": 0.3382, "step": 49000 }, { "epoch": 0.3168, "grad_norm": 0.7317768335342407, "learning_rate": 4.208288e-05, "loss": 0.3385, "step": 49500 }, { "epoch": 0.32, "grad_norm": 0.7397735118865967, "learning_rate": 4.200288e-05, "loss": 0.3405, "step": 50000 }, { "epoch": 0.3232, "grad_norm": 1.1299536228179932, "learning_rate": 4.1922880000000003e-05, "loss": 0.3431, "step": 50500 }, { "epoch": 0.3264, "grad_norm": 0.6406556963920593, "learning_rate": 4.184304e-05, "loss": 0.3384, "step": 51000 }, { "epoch": 0.3296, "grad_norm": 0.8084424734115601, "learning_rate": 4.17632e-05, "loss": 0.3365, "step": 51500 }, { "epoch": 0.3328, "grad_norm": 0.7525010704994202, "learning_rate": 4.16832e-05, "loss": 0.3399, "step": 52000 }, { "epoch": 0.336, "grad_norm": 0.7382110953330994, "learning_rate": 4.16032e-05, "loss": 0.335, "step": 52500 }, { "epoch": 0.3392, "grad_norm": 0.6454793810844421, "learning_rate": 4.15232e-05, "loss": 0.3354, "step": 53000 }, { "epoch": 0.3424, "grad_norm": 0.639664351940155, "learning_rate": 4.14432e-05, "loss": 0.3371, "step": 53500 }, { "epoch": 0.3456, "grad_norm": 0.5574499368667603, "learning_rate": 4.1363200000000004e-05, "loss": 0.3341, "step": 54000 }, { "epoch": 0.3488, "grad_norm": 0.6772671341896057, "learning_rate": 4.12832e-05, "loss": 0.3331, "step": 54500 }, { "epoch": 0.352, "grad_norm": 0.6943195462226868, "learning_rate": 4.120336e-05, "loss": 0.3365, "step": 55000 }, { "epoch": 0.3552, "grad_norm": 0.7460485100746155, "learning_rate": 4.112336e-05, "loss": 0.3308, "step": 55500 }, { "epoch": 0.3584, "grad_norm": 0.7071924805641174, "learning_rate": 4.1043360000000005e-05, "loss": 0.3312, "step": 56000 }, { "epoch": 0.3616, "grad_norm": 0.6678891181945801, "learning_rate": 4.0963519999999996e-05, "loss": 0.3314, "step": 56500 }, { "epoch": 0.3648, "grad_norm": 0.7100914120674133, "learning_rate": 4.0883520000000004e-05, "loss": 0.3307, "step": 57000 }, { "epoch": 0.368, "grad_norm": 0.6085671782493591, "learning_rate": 4.080352e-05, "loss": 0.3282, "step": 57500 }, { "epoch": 0.3712, "grad_norm": 0.6634243130683899, "learning_rate": 4.0723520000000005e-05, "loss": 0.3321, "step": 58000 }, { "epoch": 0.3744, "grad_norm": 0.7203409075737, "learning_rate": 4.064352e-05, "loss": 0.3318, "step": 58500 }, { "epoch": 0.3776, "grad_norm": 0.7934884428977966, "learning_rate": 4.056352e-05, "loss": 0.3239, "step": 59000 }, { "epoch": 0.3808, "grad_norm": 0.8591666221618652, "learning_rate": 4.048352e-05, "loss": 0.3275, "step": 59500 }, { "epoch": 0.384, "grad_norm": 0.6306772232055664, "learning_rate": 4.040352e-05, "loss": 0.3308, "step": 60000 }, { "epoch": 0.3872, "grad_norm": 0.6059302687644958, "learning_rate": 4.032352e-05, "loss": 0.3266, "step": 60500 }, { "epoch": 0.3904, "grad_norm": 0.6875105500221252, "learning_rate": 4.024352e-05, "loss": 0.3265, "step": 61000 }, { "epoch": 0.3936, "grad_norm": 0.6397412419319153, "learning_rate": 4.0163520000000004e-05, "loss": 0.3268, "step": 61500 }, { "epoch": 0.3968, "grad_norm": 0.7801005840301514, "learning_rate": 4.0083520000000005e-05, "loss": 0.3314, "step": 62000 }, { "epoch": 0.4, "grad_norm": 0.6966884136199951, "learning_rate": 4.000352e-05, "loss": 0.3263, "step": 62500 }, { "epoch": 0.4032, "grad_norm": 0.7413304448127747, "learning_rate": 3.9923520000000006e-05, "loss": 0.3284, "step": 63000 }, { "epoch": 0.4064, "grad_norm": 0.7089780569076538, "learning_rate": 3.984352e-05, "loss": 0.3252, "step": 63500 }, { "epoch": 0.4096, "grad_norm": 0.6669878959655762, "learning_rate": 3.976352e-05, "loss": 0.3239, "step": 64000 }, { "epoch": 0.4128, "grad_norm": 0.7352403998374939, "learning_rate": 3.968368e-05, "loss": 0.3226, "step": 64500 }, { "epoch": 0.416, "grad_norm": 0.6916635036468506, "learning_rate": 3.9603840000000005e-05, "loss": 0.3234, "step": 65000 }, { "epoch": 0.4192, "grad_norm": 0.6800302863121033, "learning_rate": 3.952384e-05, "loss": 0.3224, "step": 65500 }, { "epoch": 0.4224, "grad_norm": 0.6685224771499634, "learning_rate": 3.9443840000000006e-05, "loss": 0.3197, "step": 66000 }, { "epoch": 0.4256, "grad_norm": 0.7219159603118896, "learning_rate": 3.936384e-05, "loss": 0.3185, "step": 66500 }, { "epoch": 0.4288, "grad_norm": 0.5928858518600464, "learning_rate": 3.928384e-05, "loss": 0.3291, "step": 67000 }, { "epoch": 0.432, "grad_norm": 0.6616542339324951, "learning_rate": 3.920384e-05, "loss": 0.3266, "step": 67500 }, { "epoch": 0.4352, "grad_norm": 0.5957266092300415, "learning_rate": 3.912384e-05, "loss": 0.32, "step": 68000 }, { "epoch": 0.4384, "grad_norm": 0.6576407551765442, "learning_rate": 3.904384e-05, "loss": 0.3246, "step": 68500 }, { "epoch": 0.4416, "grad_norm": 0.6852056384086609, "learning_rate": 3.896416e-05, "loss": 0.3268, "step": 69000 }, { "epoch": 0.4448, "grad_norm": 0.780893087387085, "learning_rate": 3.888416e-05, "loss": 0.3229, "step": 69500 }, { "epoch": 0.448, "grad_norm": 0.6741476655006409, "learning_rate": 3.880416e-05, "loss": 0.3188, "step": 70000 }, { "epoch": 0.4512, "grad_norm": 0.5919800400733948, "learning_rate": 3.872416e-05, "loss": 0.3208, "step": 70500 }, { "epoch": 0.4544, "grad_norm": 0.6476633548736572, "learning_rate": 3.864416e-05, "loss": 0.322, "step": 71000 }, { "epoch": 0.4576, "grad_norm": 0.5667979717254639, "learning_rate": 3.8564159999999996e-05, "loss": 0.3151, "step": 71500 }, { "epoch": 0.4608, "grad_norm": 0.6126554608345032, "learning_rate": 3.8484160000000004e-05, "loss": 0.3185, "step": 72000 }, { "epoch": 0.464, "grad_norm": 0.7995546460151672, "learning_rate": 3.840416e-05, "loss": 0.3174, "step": 72500 }, { "epoch": 0.4672, "grad_norm": 0.5964981317520142, "learning_rate": 3.8324160000000005e-05, "loss": 0.3187, "step": 73000 }, { "epoch": 0.4704, "grad_norm": 0.7718212008476257, "learning_rate": 3.824416e-05, "loss": 0.3156, "step": 73500 }, { "epoch": 0.4736, "grad_norm": 0.7086686491966248, "learning_rate": 3.8164320000000005e-05, "loss": 0.3189, "step": 74000 }, { "epoch": 0.4768, "grad_norm": 0.7988029718399048, "learning_rate": 3.808432e-05, "loss": 0.3151, "step": 74500 }, { "epoch": 0.48, "grad_norm": 0.6092699766159058, "learning_rate": 3.8004320000000006e-05, "loss": 0.3153, "step": 75000 }, { "epoch": 0.4832, "grad_norm": 0.6181166768074036, "learning_rate": 3.792432e-05, "loss": 0.3113, "step": 75500 }, { "epoch": 0.4864, "grad_norm": 0.5952243208885193, "learning_rate": 3.784432e-05, "loss": 0.3091, "step": 76000 }, { "epoch": 0.4896, "grad_norm": 0.5732501745223999, "learning_rate": 3.776432e-05, "loss": 0.3169, "step": 76500 }, { "epoch": 0.4928, "grad_norm": 0.5866090059280396, "learning_rate": 3.768432e-05, "loss": 0.3135, "step": 77000 }, { "epoch": 0.496, "grad_norm": 0.6748520135879517, "learning_rate": 3.760432e-05, "loss": 0.3134, "step": 77500 }, { "epoch": 0.4992, "grad_norm": 0.5922159552574158, "learning_rate": 3.752448e-05, "loss": 0.3156, "step": 78000 }, { "epoch": 0.5024, "grad_norm": 0.6446545124053955, "learning_rate": 3.744448e-05, "loss": 0.3171, "step": 78500 }, { "epoch": 0.5056, "grad_norm": 0.6506426334381104, "learning_rate": 3.736448e-05, "loss": 0.3138, "step": 79000 }, { "epoch": 0.5088, "grad_norm": 0.6826354265213013, "learning_rate": 3.728448e-05, "loss": 0.3164, "step": 79500 }, { "epoch": 0.512, "grad_norm": 0.6866195797920227, "learning_rate": 3.72048e-05, "loss": 0.315, "step": 80000 }, { "epoch": 0.5152, "grad_norm": 0.5590147376060486, "learning_rate": 3.7124960000000005e-05, "loss": 0.3094, "step": 80500 }, { "epoch": 0.5184, "grad_norm": 0.6728788614273071, "learning_rate": 3.704496e-05, "loss": 0.3194, "step": 81000 }, { "epoch": 0.5216, "grad_norm": 0.6108749508857727, "learning_rate": 3.696496000000001e-05, "loss": 0.3128, "step": 81500 }, { "epoch": 0.5248, "grad_norm": 0.5888856649398804, "learning_rate": 3.688496e-05, "loss": 0.3121, "step": 82000 }, { "epoch": 0.528, "grad_norm": 0.727268397808075, "learning_rate": 3.680496e-05, "loss": 0.3193, "step": 82500 }, { "epoch": 0.5312, "grad_norm": 0.6358634233474731, "learning_rate": 3.672496e-05, "loss": 0.3092, "step": 83000 }, { "epoch": 0.5344, "grad_norm": 0.6482620239257812, "learning_rate": 3.664496e-05, "loss": 0.3098, "step": 83500 }, { "epoch": 0.5376, "grad_norm": 0.5968552827835083, "learning_rate": 3.6564960000000004e-05, "loss": 0.3108, "step": 84000 }, { "epoch": 0.5408, "grad_norm": 0.6621351838111877, "learning_rate": 3.6484960000000004e-05, "loss": 0.3065, "step": 84500 }, { "epoch": 0.544, "grad_norm": 0.5520649552345276, "learning_rate": 3.640496e-05, "loss": 0.3088, "step": 85000 }, { "epoch": 0.5472, "grad_norm": 0.6885005831718445, "learning_rate": 3.632496e-05, "loss": 0.3075, "step": 85500 }, { "epoch": 0.5504, "grad_norm": 0.666653573513031, "learning_rate": 3.624512e-05, "loss": 0.3113, "step": 86000 }, { "epoch": 0.5536, "grad_norm": 0.6344409584999084, "learning_rate": 3.6165120000000005e-05, "loss": 0.3085, "step": 86500 }, { "epoch": 0.5568, "grad_norm": 0.5792534947395325, "learning_rate": 3.608512e-05, "loss": 0.3132, "step": 87000 }, { "epoch": 0.56, "grad_norm": 0.6864989995956421, "learning_rate": 3.600512e-05, "loss": 0.3079, "step": 87500 }, { "epoch": 0.5632, "grad_norm": 0.6077435612678528, "learning_rate": 3.592512e-05, "loss": 0.3095, "step": 88000 }, { "epoch": 0.5664, "grad_norm": 0.7073134779930115, "learning_rate": 3.584512e-05, "loss": 0.3116, "step": 88500 }, { "epoch": 0.5696, "grad_norm": 0.6477733850479126, "learning_rate": 3.576512e-05, "loss": 0.3062, "step": 89000 }, { "epoch": 0.5728, "grad_norm": 0.7786093354225159, "learning_rate": 3.568512e-05, "loss": 0.3017, "step": 89500 }, { "epoch": 0.576, "grad_norm": 0.6447868943214417, "learning_rate": 3.560528e-05, "loss": 0.3077, "step": 90000 }, { "epoch": 0.5792, "grad_norm": 0.6663397550582886, "learning_rate": 3.552528e-05, "loss": 0.3089, "step": 90500 }, { "epoch": 0.5824, "grad_norm": 0.533214807510376, "learning_rate": 3.544528e-05, "loss": 0.3064, "step": 91000 }, { "epoch": 0.5856, "grad_norm": 0.6517444849014282, "learning_rate": 3.5365280000000004e-05, "loss": 0.3108, "step": 91500 }, { "epoch": 0.5888, "grad_norm": 0.7635303735733032, "learning_rate": 3.528544e-05, "loss": 0.3028, "step": 92000 }, { "epoch": 0.592, "grad_norm": 0.6636632680892944, "learning_rate": 3.520544e-05, "loss": 0.3015, "step": 92500 }, { "epoch": 0.5952, "grad_norm": 0.7296783924102783, "learning_rate": 3.5125440000000004e-05, "loss": 0.305, "step": 93000 }, { "epoch": 0.5984, "grad_norm": 0.5089054703712463, "learning_rate": 3.50456e-05, "loss": 0.3092, "step": 93500 }, { "epoch": 0.6016, "grad_norm": 0.6761330366134644, "learning_rate": 3.49656e-05, "loss": 0.3055, "step": 94000 }, { "epoch": 0.6048, "grad_norm": 0.6327843070030212, "learning_rate": 3.4885600000000004e-05, "loss": 0.3055, "step": 94500 }, { "epoch": 0.608, "grad_norm": 0.5940554141998291, "learning_rate": 3.48056e-05, "loss": 0.3017, "step": 95000 }, { "epoch": 0.6112, "grad_norm": 0.516828179359436, "learning_rate": 3.4725600000000005e-05, "loss": 0.3035, "step": 95500 }, { "epoch": 0.6144, "grad_norm": 0.5835782289505005, "learning_rate": 3.46456e-05, "loss": 0.2978, "step": 96000 }, { "epoch": 0.6176, "grad_norm": 0.5978230237960815, "learning_rate": 3.456560000000001e-05, "loss": 0.301, "step": 96500 }, { "epoch": 0.6208, "grad_norm": 0.5460017323493958, "learning_rate": 3.44856e-05, "loss": 0.3052, "step": 97000 }, { "epoch": 0.624, "grad_norm": 0.6875701546669006, "learning_rate": 3.44056e-05, "loss": 0.3028, "step": 97500 }, { "epoch": 0.6272, "grad_norm": 0.5780492424964905, "learning_rate": 3.43256e-05, "loss": 0.2988, "step": 98000 }, { "epoch": 0.6304, "grad_norm": 0.5191554427146912, "learning_rate": 3.42456e-05, "loss": 0.3052, "step": 98500 }, { "epoch": 0.6336, "grad_norm": 0.6811420917510986, "learning_rate": 3.416576e-05, "loss": 0.3032, "step": 99000 }, { "epoch": 0.6368, "grad_norm": 0.6301366686820984, "learning_rate": 3.408576e-05, "loss": 0.2979, "step": 99500 }, { "epoch": 0.64, "grad_norm": 0.5777577757835388, "learning_rate": 3.400576e-05, "loss": 0.2991, "step": 100000 }, { "epoch": 0.6432, "grad_norm": 0.6444558501243591, "learning_rate": 3.392592e-05, "loss": 0.298, "step": 100500 }, { "epoch": 0.6464, "grad_norm": 0.4793080985546112, "learning_rate": 3.384592e-05, "loss": 0.3014, "step": 101000 }, { "epoch": 0.6496, "grad_norm": 0.6691552400588989, "learning_rate": 3.376608e-05, "loss": 0.3006, "step": 101500 }, { "epoch": 0.6528, "grad_norm": 0.6318476796150208, "learning_rate": 3.368608e-05, "loss": 0.3032, "step": 102000 }, { "epoch": 0.656, "grad_norm": 0.5805894136428833, "learning_rate": 3.360608e-05, "loss": 0.3014, "step": 102500 }, { "epoch": 0.6592, "grad_norm": 0.5658220648765564, "learning_rate": 3.352608e-05, "loss": 0.3, "step": 103000 }, { "epoch": 0.6624, "grad_norm": 0.6117516160011292, "learning_rate": 3.3446080000000004e-05, "loss": 0.3014, "step": 103500 }, { "epoch": 0.6656, "grad_norm": 0.6763502359390259, "learning_rate": 3.336608e-05, "loss": 0.3043, "step": 104000 }, { "epoch": 0.6688, "grad_norm": 0.6046746969223022, "learning_rate": 3.3286080000000005e-05, "loss": 0.2965, "step": 104500 }, { "epoch": 0.672, "grad_norm": 0.7453213930130005, "learning_rate": 3.320608e-05, "loss": 0.2964, "step": 105000 }, { "epoch": 0.6752, "grad_norm": 0.6010546088218689, "learning_rate": 3.3126080000000007e-05, "loss": 0.2975, "step": 105500 }, { "epoch": 0.6784, "grad_norm": 0.7377296686172485, "learning_rate": 3.304608e-05, "loss": 0.2993, "step": 106000 }, { "epoch": 0.6816, "grad_norm": 0.6612259745597839, "learning_rate": 3.2966240000000006e-05, "loss": 0.298, "step": 106500 }, { "epoch": 0.6848, "grad_norm": 0.6570013165473938, "learning_rate": 3.288624e-05, "loss": 0.296, "step": 107000 }, { "epoch": 0.688, "grad_norm": 0.633602499961853, "learning_rate": 3.280624e-05, "loss": 0.2989, "step": 107500 }, { "epoch": 0.6912, "grad_norm": 0.5594373345375061, "learning_rate": 3.272624e-05, "loss": 0.2977, "step": 108000 }, { "epoch": 0.6944, "grad_norm": 0.5643302202224731, "learning_rate": 3.264624e-05, "loss": 0.2941, "step": 108500 }, { "epoch": 0.6976, "grad_norm": 0.5127794146537781, "learning_rate": 3.256624e-05, "loss": 0.2953, "step": 109000 }, { "epoch": 0.7008, "grad_norm": 0.6273791790008545, "learning_rate": 3.24864e-05, "loss": 0.2944, "step": 109500 }, { "epoch": 0.704, "grad_norm": 0.5089157223701477, "learning_rate": 3.24064e-05, "loss": 0.3, "step": 110000 }, { "epoch": 0.7072, "grad_norm": 0.5816791653633118, "learning_rate": 3.232656e-05, "loss": 0.2957, "step": 110500 }, { "epoch": 0.7104, "grad_norm": 0.6407476663589478, "learning_rate": 3.224656e-05, "loss": 0.2974, "step": 111000 }, { "epoch": 0.7136, "grad_norm": 0.46444937586784363, "learning_rate": 3.216656e-05, "loss": 0.2969, "step": 111500 }, { "epoch": 0.7168, "grad_norm": 0.4997446835041046, "learning_rate": 3.2086559999999996e-05, "loss": 0.2966, "step": 112000 }, { "epoch": 0.72, "grad_norm": 0.6996490359306335, "learning_rate": 3.2006560000000003e-05, "loss": 0.2965, "step": 112500 }, { "epoch": 0.7232, "grad_norm": 0.5806016325950623, "learning_rate": 3.192672e-05, "loss": 0.2952, "step": 113000 }, { "epoch": 0.7264, "grad_norm": 0.6140916347503662, "learning_rate": 3.184672e-05, "loss": 0.2995, "step": 113500 }, { "epoch": 0.7296, "grad_norm": 0.45879319310188293, "learning_rate": 3.1766719999999997e-05, "loss": 0.292, "step": 114000 }, { "epoch": 0.7328, "grad_norm": 0.6141937971115112, "learning_rate": 3.1686720000000004e-05, "loss": 0.2945, "step": 114500 }, { "epoch": 0.736, "grad_norm": 0.6565462946891785, "learning_rate": 3.160672e-05, "loss": 0.2982, "step": 115000 }, { "epoch": 0.7392, "grad_norm": 0.5997145175933838, "learning_rate": 3.1526720000000006e-05, "loss": 0.2957, "step": 115500 }, { "epoch": 0.7424, "grad_norm": 0.736965537071228, "learning_rate": 3.144672e-05, "loss": 0.2953, "step": 116000 }, { "epoch": 0.7456, "grad_norm": 0.6587550640106201, "learning_rate": 3.136672e-05, "loss": 0.2917, "step": 116500 }, { "epoch": 0.7488, "grad_norm": 0.7265971302986145, "learning_rate": 3.128672e-05, "loss": 0.2908, "step": 117000 }, { "epoch": 0.752, "grad_norm": 0.6158114075660706, "learning_rate": 3.120672e-05, "loss": 0.2916, "step": 117500 }, { "epoch": 0.7552, "grad_norm": 0.6521216034889221, "learning_rate": 3.112672e-05, "loss": 0.2947, "step": 118000 }, { "epoch": 0.7584, "grad_norm": 0.5868868231773376, "learning_rate": 3.1046720000000004e-05, "loss": 0.2919, "step": 118500 }, { "epoch": 0.7616, "grad_norm": 0.6495432257652283, "learning_rate": 3.096672e-05, "loss": 0.2974, "step": 119000 }, { "epoch": 0.7648, "grad_norm": 0.6204816102981567, "learning_rate": 3.0886720000000005e-05, "loss": 0.2945, "step": 119500 }, { "epoch": 0.768, "grad_norm": 0.6333968639373779, "learning_rate": 3.080672e-05, "loss": 0.292, "step": 120000 }, { "epoch": 0.7712, "grad_norm": 0.5613961815834045, "learning_rate": 3.0726880000000004e-05, "loss": 0.2938, "step": 120500 }, { "epoch": 0.7744, "grad_norm": 0.6623988151550293, "learning_rate": 3.064688e-05, "loss": 0.2954, "step": 121000 }, { "epoch": 0.7776, "grad_norm": 0.6134264469146729, "learning_rate": 3.0566880000000006e-05, "loss": 0.2915, "step": 121500 }, { "epoch": 0.7808, "grad_norm": 0.6159347891807556, "learning_rate": 3.048688e-05, "loss": 0.2887, "step": 122000 }, { "epoch": 0.784, "grad_norm": 0.6079424023628235, "learning_rate": 3.0407040000000005e-05, "loss": 0.2915, "step": 122500 }, { "epoch": 0.7872, "grad_norm": 0.7703385353088379, "learning_rate": 3.0327040000000002e-05, "loss": 0.2901, "step": 123000 }, { "epoch": 0.7904, "grad_norm": 0.5626256465911865, "learning_rate": 3.024704e-05, "loss": 0.2938, "step": 123500 }, { "epoch": 0.7936, "grad_norm": 0.554914653301239, "learning_rate": 3.016704e-05, "loss": 0.2913, "step": 124000 }, { "epoch": 0.7968, "grad_norm": 0.6610060930252075, "learning_rate": 3.008704e-05, "loss": 0.2912, "step": 124500 }, { "epoch": 0.8, "grad_norm": 0.6194009780883789, "learning_rate": 3.0007040000000002e-05, "loss": 0.2901, "step": 125000 }, { "epoch": 0.8032, "grad_norm": 0.7150211930274963, "learning_rate": 2.992704e-05, "loss": 0.2895, "step": 125500 }, { "epoch": 0.8064, "grad_norm": 0.6945148706436157, "learning_rate": 2.9847040000000003e-05, "loss": 0.2878, "step": 126000 }, { "epoch": 0.8096, "grad_norm": 0.6546908617019653, "learning_rate": 2.9767200000000002e-05, "loss": 0.287, "step": 126500 }, { "epoch": 0.8128, "grad_norm": 0.535040020942688, "learning_rate": 2.9687360000000004e-05, "loss": 0.2901, "step": 127000 }, { "epoch": 0.816, "grad_norm": 0.6062806844711304, "learning_rate": 2.960736e-05, "loss": 0.2862, "step": 127500 }, { "epoch": 0.8192, "grad_norm": 0.6202298998832703, "learning_rate": 2.9527360000000005e-05, "loss": 0.2884, "step": 128000 }, { "epoch": 0.8224, "grad_norm": 0.5966545343399048, "learning_rate": 2.9447360000000003e-05, "loss": 0.2877, "step": 128500 }, { "epoch": 0.8256, "grad_norm": 0.5024796724319458, "learning_rate": 2.936736e-05, "loss": 0.2882, "step": 129000 }, { "epoch": 0.8288, "grad_norm": 0.5895559191703796, "learning_rate": 2.9287520000000002e-05, "loss": 0.288, "step": 129500 }, { "epoch": 0.832, "grad_norm": 0.9302066564559937, "learning_rate": 2.920752e-05, "loss": 0.286, "step": 130000 }, { "epoch": 0.8352, "grad_norm": 0.573466956615448, "learning_rate": 2.9127520000000003e-05, "loss": 0.2848, "step": 130500 }, { "epoch": 0.8384, "grad_norm": 0.5901783108711243, "learning_rate": 2.904768e-05, "loss": 0.2883, "step": 131000 }, { "epoch": 0.8416, "grad_norm": 0.7780030369758606, "learning_rate": 2.8967680000000002e-05, "loss": 0.2914, "step": 131500 }, { "epoch": 0.8448, "grad_norm": 0.6630533933639526, "learning_rate": 2.888768e-05, "loss": 0.2878, "step": 132000 }, { "epoch": 0.848, "grad_norm": 0.6001667976379395, "learning_rate": 2.8807680000000004e-05, "loss": 0.2818, "step": 132500 }, { "epoch": 0.8512, "grad_norm": 0.6324682831764221, "learning_rate": 2.872768e-05, "loss": 0.2849, "step": 133000 }, { "epoch": 0.8544, "grad_norm": 0.6814092993736267, "learning_rate": 2.864768e-05, "loss": 0.288, "step": 133500 }, { "epoch": 0.8576, "grad_norm": 0.651709794998169, "learning_rate": 2.8567680000000003e-05, "loss": 0.2872, "step": 134000 }, { "epoch": 0.8608, "grad_norm": 0.5912330746650696, "learning_rate": 2.848768e-05, "loss": 0.2824, "step": 134500 }, { "epoch": 0.864, "grad_norm": 0.5821974277496338, "learning_rate": 2.8407680000000004e-05, "loss": 0.2853, "step": 135000 }, { "epoch": 0.8672, "grad_norm": 0.6262611150741577, "learning_rate": 2.832784e-05, "loss": 0.2848, "step": 135500 }, { "epoch": 0.8704, "grad_norm": 0.5360976457595825, "learning_rate": 2.8247840000000004e-05, "loss": 0.2869, "step": 136000 }, { "epoch": 0.8736, "grad_norm": 0.6523284912109375, "learning_rate": 2.816784e-05, "loss": 0.2792, "step": 136500 }, { "epoch": 0.8768, "grad_norm": 0.6329330205917358, "learning_rate": 2.808784e-05, "loss": 0.2865, "step": 137000 }, { "epoch": 0.88, "grad_norm": 0.6053124666213989, "learning_rate": 2.8007840000000003e-05, "loss": 0.2844, "step": 137500 }, { "epoch": 0.8832, "grad_norm": 0.6887571811676025, "learning_rate": 2.7927999999999998e-05, "loss": 0.288, "step": 138000 }, { "epoch": 0.8864, "grad_norm": 0.7047476172447205, "learning_rate": 2.7848000000000002e-05, "loss": 0.2877, "step": 138500 }, { "epoch": 0.8896, "grad_norm": 0.598227858543396, "learning_rate": 2.7768e-05, "loss": 0.2867, "step": 139000 }, { "epoch": 0.8928, "grad_norm": 0.5094701051712036, "learning_rate": 2.7688000000000003e-05, "loss": 0.2832, "step": 139500 }, { "epoch": 0.896, "grad_norm": 0.5749739408493042, "learning_rate": 2.7608e-05, "loss": 0.2821, "step": 140000 }, { "epoch": 0.8992, "grad_norm": 0.4442578852176666, "learning_rate": 2.7528320000000003e-05, "loss": 0.282, "step": 140500 }, { "epoch": 0.9024, "grad_norm": 0.5418574213981628, "learning_rate": 2.744832e-05, "loss": 0.2816, "step": 141000 }, { "epoch": 0.9056, "grad_norm": 0.5984327793121338, "learning_rate": 2.736832e-05, "loss": 0.285, "step": 141500 }, { "epoch": 0.9088, "grad_norm": 0.6572843194007874, "learning_rate": 2.728832e-05, "loss": 0.2817, "step": 142000 }, { "epoch": 0.912, "grad_norm": 0.590993344783783, "learning_rate": 2.7208320000000003e-05, "loss": 0.288, "step": 142500 }, { "epoch": 0.9152, "grad_norm": 0.6096624135971069, "learning_rate": 2.712832e-05, "loss": 0.2861, "step": 143000 }, { "epoch": 0.9184, "grad_norm": 0.5189167261123657, "learning_rate": 2.7048319999999998e-05, "loss": 0.2857, "step": 143500 }, { "epoch": 0.9216, "grad_norm": 0.5812899470329285, "learning_rate": 2.6968320000000002e-05, "loss": 0.2888, "step": 144000 }, { "epoch": 0.9248, "grad_norm": 0.515201210975647, "learning_rate": 2.688832e-05, "loss": 0.2791, "step": 144500 }, { "epoch": 0.928, "grad_norm": 0.6398504972457886, "learning_rate": 2.6808320000000004e-05, "loss": 0.282, "step": 145000 }, { "epoch": 0.9312, "grad_norm": 0.5990891456604004, "learning_rate": 2.672832e-05, "loss": 0.28, "step": 145500 }, { "epoch": 0.9344, "grad_norm": 0.5883029699325562, "learning_rate": 2.664832e-05, "loss": 0.2777, "step": 146000 }, { "epoch": 0.9376, "grad_norm": 0.6432376503944397, "learning_rate": 2.656848e-05, "loss": 0.2804, "step": 146500 }, { "epoch": 0.9408, "grad_norm": 0.5375948548316956, "learning_rate": 2.6488479999999997e-05, "loss": 0.2807, "step": 147000 }, { "epoch": 0.944, "grad_norm": 0.6207411885261536, "learning_rate": 2.6408640000000003e-05, "loss": 0.283, "step": 147500 }, { "epoch": 0.9472, "grad_norm": 0.5854378342628479, "learning_rate": 2.632864e-05, "loss": 0.2854, "step": 148000 }, { "epoch": 0.9504, "grad_norm": 0.5260078310966492, "learning_rate": 2.6248800000000002e-05, "loss": 0.2836, "step": 148500 }, { "epoch": 0.9536, "grad_norm": 0.6284717917442322, "learning_rate": 2.61688e-05, "loss": 0.2824, "step": 149000 }, { "epoch": 0.9568, "grad_norm": 0.6092182397842407, "learning_rate": 2.608896e-05, "loss": 0.2804, "step": 149500 }, { "epoch": 0.96, "grad_norm": 0.6028911471366882, "learning_rate": 2.600896e-05, "loss": 0.281, "step": 150000 }, { "epoch": 0.9632, "grad_norm": 0.5008478164672852, "learning_rate": 2.5928960000000003e-05, "loss": 0.277, "step": 150500 }, { "epoch": 0.9664, "grad_norm": 0.5233867168426514, "learning_rate": 2.584896e-05, "loss": 0.2807, "step": 151000 }, { "epoch": 0.9696, "grad_norm": 0.5762408375740051, "learning_rate": 2.5768960000000004e-05, "loss": 0.2831, "step": 151500 }, { "epoch": 0.9728, "grad_norm": 0.6097844243049622, "learning_rate": 2.568896e-05, "loss": 0.2803, "step": 152000 }, { "epoch": 0.976, "grad_norm": 0.6696804761886597, "learning_rate": 2.560896e-05, "loss": 0.2742, "step": 152500 }, { "epoch": 0.9792, "grad_norm": 0.6028556823730469, "learning_rate": 2.5528960000000003e-05, "loss": 0.282, "step": 153000 }, { "epoch": 0.9824, "grad_norm": 0.6651898622512817, "learning_rate": 2.544896e-05, "loss": 0.2849, "step": 153500 }, { "epoch": 0.9856, "grad_norm": 0.5219380855560303, "learning_rate": 2.536896e-05, "loss": 0.2785, "step": 154000 }, { "epoch": 0.9888, "grad_norm": 0.6161176562309265, "learning_rate": 2.5288960000000002e-05, "loss": 0.2808, "step": 154500 }, { "epoch": 0.992, "grad_norm": 0.7915316224098206, "learning_rate": 2.5208960000000003e-05, "loss": 0.2777, "step": 155000 }, { "epoch": 0.9952, "grad_norm": 0.7261882424354553, "learning_rate": 2.512896e-05, "loss": 0.2767, "step": 155500 }, { "epoch": 0.9984, "grad_norm": 0.5452406406402588, "learning_rate": 2.5048959999999997e-05, "loss": 0.2764, "step": 156000 }, { "epoch": 1.0016, "grad_norm": 0.642181396484375, "learning_rate": 2.4969120000000003e-05, "loss": 0.2746, "step": 156500 }, { "epoch": 1.0048, "grad_norm": 0.5900291204452515, "learning_rate": 2.4889120000000003e-05, "loss": 0.2721, "step": 157000 }, { "epoch": 1.008, "grad_norm": 0.5960043668746948, "learning_rate": 2.480912e-05, "loss": 0.265, "step": 157500 }, { "epoch": 1.0112, "grad_norm": 0.582115650177002, "learning_rate": 2.472912e-05, "loss": 0.2673, "step": 158000 }, { "epoch": 1.0144, "grad_norm": 0.552392303943634, "learning_rate": 2.464912e-05, "loss": 0.2663, "step": 158500 }, { "epoch": 1.0176, "grad_norm": 0.5585765242576599, "learning_rate": 2.456912e-05, "loss": 0.2688, "step": 159000 }, { "epoch": 1.0208, "grad_norm": 0.6049332022666931, "learning_rate": 2.448912e-05, "loss": 0.266, "step": 159500 }, { "epoch": 1.024, "grad_norm": 0.5749877095222473, "learning_rate": 2.440912e-05, "loss": 0.2689, "step": 160000 }, { "epoch": 1.0272, "grad_norm": 0.5832675695419312, "learning_rate": 2.4329120000000002e-05, "loss": 0.2703, "step": 160500 }, { "epoch": 1.0304, "grad_norm": 0.8549031019210815, "learning_rate": 2.424928e-05, "loss": 0.2623, "step": 161000 }, { "epoch": 1.0336, "grad_norm": 0.5572855472564697, "learning_rate": 2.416928e-05, "loss": 0.2711, "step": 161500 }, { "epoch": 1.0368, "grad_norm": 0.6818140745162964, "learning_rate": 2.408928e-05, "loss": 0.2652, "step": 162000 }, { "epoch": 1.04, "grad_norm": 0.6900683045387268, "learning_rate": 2.400928e-05, "loss": 0.2669, "step": 162500 }, { "epoch": 1.0432, "grad_norm": 0.6015618443489075, "learning_rate": 2.392944e-05, "loss": 0.2654, "step": 163000 }, { "epoch": 1.0464, "grad_norm": 0.5343177318572998, "learning_rate": 2.3849440000000002e-05, "loss": 0.2656, "step": 163500 }, { "epoch": 1.0496, "grad_norm": 0.6130079627037048, "learning_rate": 2.3769440000000003e-05, "loss": 0.2592, "step": 164000 }, { "epoch": 1.0528, "grad_norm": 0.7150599956512451, "learning_rate": 2.368944e-05, "loss": 0.2634, "step": 164500 }, { "epoch": 1.056, "grad_norm": 0.6321354508399963, "learning_rate": 2.360944e-05, "loss": 0.2683, "step": 165000 }, { "epoch": 1.0592, "grad_norm": 0.6234462857246399, "learning_rate": 2.352976e-05, "loss": 0.2628, "step": 165500 }, { "epoch": 1.0624, "grad_norm": 0.6542537808418274, "learning_rate": 2.344976e-05, "loss": 0.2618, "step": 166000 }, { "epoch": 1.0656, "grad_norm": 0.6302633881568909, "learning_rate": 2.3369760000000002e-05, "loss": 0.2661, "step": 166500 }, { "epoch": 1.0688, "grad_norm": 0.5890353322029114, "learning_rate": 2.3289760000000002e-05, "loss": 0.2646, "step": 167000 }, { "epoch": 1.072, "grad_norm": 0.6490179300308228, "learning_rate": 2.320976e-05, "loss": 0.2635, "step": 167500 }, { "epoch": 1.0752, "grad_norm": 0.648162305355072, "learning_rate": 2.312976e-05, "loss": 0.2646, "step": 168000 }, { "epoch": 1.0784, "grad_norm": 0.675680935382843, "learning_rate": 2.304976e-05, "loss": 0.2626, "step": 168500 }, { "epoch": 1.0816, "grad_norm": 0.6192341446876526, "learning_rate": 2.2969760000000002e-05, "loss": 0.2641, "step": 169000 }, { "epoch": 1.0848, "grad_norm": 0.7046379446983337, "learning_rate": 2.288992e-05, "loss": 0.2643, "step": 169500 }, { "epoch": 1.088, "grad_norm": 0.5477197170257568, "learning_rate": 2.280992e-05, "loss": 0.265, "step": 170000 }, { "epoch": 1.0912, "grad_norm": 0.5775583982467651, "learning_rate": 2.2729920000000002e-05, "loss": 0.2645, "step": 170500 }, { "epoch": 1.0944, "grad_norm": 0.6389047503471375, "learning_rate": 2.2649920000000003e-05, "loss": 0.2634, "step": 171000 }, { "epoch": 1.0976, "grad_norm": 0.6169374585151672, "learning_rate": 2.256992e-05, "loss": 0.2642, "step": 171500 }, { "epoch": 1.1008, "grad_norm": 0.5913782715797424, "learning_rate": 2.2490080000000002e-05, "loss": 0.2658, "step": 172000 }, { "epoch": 1.104, "grad_norm": 0.7547928690910339, "learning_rate": 2.241008e-05, "loss": 0.2674, "step": 172500 }, { "epoch": 1.1072, "grad_norm": 0.6277585625648499, "learning_rate": 2.233024e-05, "loss": 0.2686, "step": 173000 }, { "epoch": 1.1104, "grad_norm": 0.6357282996177673, "learning_rate": 2.225024e-05, "loss": 0.2639, "step": 173500 }, { "epoch": 1.1136, "grad_norm": 0.5262208580970764, "learning_rate": 2.2170400000000004e-05, "loss": 0.2641, "step": 174000 }, { "epoch": 1.1168, "grad_norm": 0.6878075003623962, "learning_rate": 2.20904e-05, "loss": 0.2654, "step": 174500 }, { "epoch": 1.12, "grad_norm": 0.5332186222076416, "learning_rate": 2.2010400000000002e-05, "loss": 0.2638, "step": 175000 }, { "epoch": 1.1232, "grad_norm": 0.5562476515769958, "learning_rate": 2.19304e-05, "loss": 0.2648, "step": 175500 }, { "epoch": 1.1264, "grad_norm": 0.5924221277236938, "learning_rate": 2.18504e-05, "loss": 0.2627, "step": 176000 }, { "epoch": 1.1296, "grad_norm": 0.5250386595726013, "learning_rate": 2.17704e-05, "loss": 0.2619, "step": 176500 }, { "epoch": 1.1328, "grad_norm": 0.7426069378852844, "learning_rate": 2.16904e-05, "loss": 0.2628, "step": 177000 }, { "epoch": 1.1360000000000001, "grad_norm": 0.4925951063632965, "learning_rate": 2.16104e-05, "loss": 0.2661, "step": 177500 }, { "epoch": 1.1392, "grad_norm": 0.5707270503044128, "learning_rate": 2.15304e-05, "loss": 0.2622, "step": 178000 }, { "epoch": 1.1424, "grad_norm": 0.5793021321296692, "learning_rate": 2.14504e-05, "loss": 0.2671, "step": 178500 }, { "epoch": 1.1456, "grad_norm": 0.5736916661262512, "learning_rate": 2.13704e-05, "loss": 0.2648, "step": 179000 }, { "epoch": 1.1488, "grad_norm": 0.588550329208374, "learning_rate": 2.129056e-05, "loss": 0.2641, "step": 179500 }, { "epoch": 1.152, "grad_norm": 0.5504462122917175, "learning_rate": 2.121056e-05, "loss": 0.2643, "step": 180000 }, { "epoch": 1.1552, "grad_norm": 0.5439949035644531, "learning_rate": 2.113056e-05, "loss": 0.2639, "step": 180500 }, { "epoch": 1.1584, "grad_norm": 0.6882042288780212, "learning_rate": 2.105056e-05, "loss": 0.2595, "step": 181000 }, { "epoch": 1.1616, "grad_norm": 0.6735561490058899, "learning_rate": 2.097056e-05, "loss": 0.2624, "step": 181500 }, { "epoch": 1.1648, "grad_norm": 0.5545785427093506, "learning_rate": 2.089056e-05, "loss": 0.2625, "step": 182000 }, { "epoch": 1.168, "grad_norm": 0.6497994065284729, "learning_rate": 2.081056e-05, "loss": 0.2611, "step": 182500 }, { "epoch": 1.1712, "grad_norm": 0.5887815356254578, "learning_rate": 2.073056e-05, "loss": 0.2632, "step": 183000 }, { "epoch": 1.1743999999999999, "grad_norm": 0.6037270426750183, "learning_rate": 2.0650560000000002e-05, "loss": 0.2645, "step": 183500 }, { "epoch": 1.1776, "grad_norm": 0.636946439743042, "learning_rate": 2.057072e-05, "loss": 0.2628, "step": 184000 }, { "epoch": 1.1808, "grad_norm": 0.5285276770591736, "learning_rate": 2.049072e-05, "loss": 0.2629, "step": 184500 }, { "epoch": 1.184, "grad_norm": 0.4634397625923157, "learning_rate": 2.041072e-05, "loss": 0.2615, "step": 185000 }, { "epoch": 1.1872, "grad_norm": 0.5693604946136475, "learning_rate": 2.033072e-05, "loss": 0.2619, "step": 185500 } ], "logging_steps": 500, "max_steps": 312500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.0369350959104e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }