{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.416, "eval_steps": 500, "global_step": 65000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032, "grad_norm": 1.7230606079101562, "learning_rate": 4.99208e-05, "loss": 1.2281, "step": 500 }, { "epoch": 0.0064, "grad_norm": 3.655383348464966, "learning_rate": 4.9840800000000006e-05, "loss": 0.7566, "step": 1000 }, { "epoch": 0.0096, "grad_norm": 1.2925927639007568, "learning_rate": 4.97608e-05, "loss": 0.6764, "step": 1500 }, { "epoch": 0.0128, "grad_norm": 1.286004900932312, "learning_rate": 4.968080000000001e-05, "loss": 0.6304, "step": 2000 }, { "epoch": 0.016, "grad_norm": 1.2140214443206787, "learning_rate": 4.96008e-05, "loss": 0.5981, "step": 2500 }, { "epoch": 0.0192, "grad_norm": 1.2525482177734375, "learning_rate": 4.95208e-05, "loss": 0.5767, "step": 3000 }, { "epoch": 0.0224, "grad_norm": 1.2310410737991333, "learning_rate": 4.94408e-05, "loss": 0.5597, "step": 3500 }, { "epoch": 0.0256, "grad_norm": 1.1735206842422485, "learning_rate": 4.9360800000000004e-05, "loss": 0.5418, "step": 4000 }, { "epoch": 0.0288, "grad_norm": 1.114688754081726, "learning_rate": 4.9280800000000004e-05, "loss": 0.5335, "step": 4500 }, { "epoch": 0.032, "grad_norm": 0.8874593377113342, "learning_rate": 4.9200800000000005e-05, "loss": 0.5237, "step": 5000 }, { "epoch": 0.0352, "grad_norm": 1.1261299848556519, "learning_rate": 4.91208e-05, "loss": 0.5135, "step": 5500 }, { "epoch": 0.0384, "grad_norm": 0.9994556307792664, "learning_rate": 4.9040800000000007e-05, "loss": 0.5059, "step": 6000 }, { "epoch": 0.0416, "grad_norm": 1.2349673509597778, "learning_rate": 4.89608e-05, "loss": 0.4939, "step": 6500 }, { "epoch": 0.0448, "grad_norm": 0.9770995378494263, "learning_rate": 4.88808e-05, "loss": 0.4824, "step": 7000 }, { "epoch": 0.048, "grad_norm": 0.981966495513916, "learning_rate": 4.88008e-05, "loss": 0.4875, "step": 7500 }, { "epoch": 0.0512, "grad_norm": 1.0177415609359741, "learning_rate": 4.87208e-05, "loss": 0.4785, "step": 8000 }, { "epoch": 0.0544, "grad_norm": 1.0521667003631592, "learning_rate": 4.8640800000000004e-05, "loss": 0.4731, "step": 8500 }, { "epoch": 0.0576, "grad_norm": 0.8560615181922913, "learning_rate": 4.85608e-05, "loss": 0.4633, "step": 9000 }, { "epoch": 0.0608, "grad_norm": 1.0170217752456665, "learning_rate": 4.8480800000000005e-05, "loss": 0.4576, "step": 9500 }, { "epoch": 0.064, "grad_norm": 0.9891325831413269, "learning_rate": 4.84008e-05, "loss": 0.4556, "step": 10000 }, { "epoch": 0.0672, "grad_norm": 1.0609711408615112, "learning_rate": 4.832080000000001e-05, "loss": 0.4493, "step": 10500 }, { "epoch": 0.0704, "grad_norm": 0.8623799681663513, "learning_rate": 4.82408e-05, "loss": 0.4459, "step": 11000 }, { "epoch": 0.0736, "grad_norm": 0.9587870240211487, "learning_rate": 4.81608e-05, "loss": 0.4418, "step": 11500 }, { "epoch": 0.0768, "grad_norm": 0.8939447999000549, "learning_rate": 4.80808e-05, "loss": 0.4327, "step": 12000 }, { "epoch": 0.08, "grad_norm": 0.9886033535003662, "learning_rate": 4.80008e-05, "loss": 0.438, "step": 12500 }, { "epoch": 0.0832, "grad_norm": 0.9157513976097107, "learning_rate": 4.7920800000000004e-05, "loss": 0.4323, "step": 13000 }, { "epoch": 0.0864, "grad_norm": 0.9085854887962341, "learning_rate": 4.7840800000000005e-05, "loss": 0.4303, "step": 13500 }, { "epoch": 0.0896, "grad_norm": 0.9123984575271606, "learning_rate": 4.77608e-05, "loss": 0.4247, "step": 14000 }, { "epoch": 0.0928, "grad_norm": 0.839026689529419, "learning_rate": 4.7680960000000004e-05, "loss": 0.4233, "step": 14500 }, { "epoch": 0.096, "grad_norm": 0.8110847473144531, "learning_rate": 4.760096e-05, "loss": 0.4207, "step": 15000 }, { "epoch": 0.0992, "grad_norm": 0.8462579250335693, "learning_rate": 4.7520960000000005e-05, "loss": 0.421, "step": 15500 }, { "epoch": 0.1024, "grad_norm": 0.8980106711387634, "learning_rate": 4.744096e-05, "loss": 0.417, "step": 16000 }, { "epoch": 0.1056, "grad_norm": 0.8297702074050903, "learning_rate": 4.736096000000001e-05, "loss": 0.4139, "step": 16500 }, { "epoch": 0.1088, "grad_norm": 0.9856173992156982, "learning_rate": 4.728096e-05, "loss": 0.419, "step": 17000 }, { "epoch": 0.112, "grad_norm": 0.934256911277771, "learning_rate": 4.720096e-05, "loss": 0.4098, "step": 17500 }, { "epoch": 0.1152, "grad_norm": 0.9190649390220642, "learning_rate": 4.712096e-05, "loss": 0.412, "step": 18000 }, { "epoch": 0.1184, "grad_norm": 0.9078772664070129, "learning_rate": 4.704096e-05, "loss": 0.4043, "step": 18500 }, { "epoch": 0.1216, "grad_norm": 1.082939624786377, "learning_rate": 4.696112e-05, "loss": 0.4045, "step": 19000 }, { "epoch": 0.1248, "grad_norm": 0.9159390926361084, "learning_rate": 4.688112e-05, "loss": 0.4098, "step": 19500 }, { "epoch": 0.128, "grad_norm": 0.8420547842979431, "learning_rate": 4.680128e-05, "loss": 0.4033, "step": 20000 }, { "epoch": 0.1312, "grad_norm": 0.7658286094665527, "learning_rate": 4.672128e-05, "loss": 0.4002, "step": 20500 }, { "epoch": 0.1344, "grad_norm": 0.9074057340621948, "learning_rate": 4.664128e-05, "loss": 0.3964, "step": 21000 }, { "epoch": 0.1376, "grad_norm": 0.6065025329589844, "learning_rate": 4.656128e-05, "loss": 0.3984, "step": 21500 }, { "epoch": 0.1408, "grad_norm": 0.7523757219314575, "learning_rate": 4.6481280000000004e-05, "loss": 0.3959, "step": 22000 }, { "epoch": 0.144, "grad_norm": 0.807826042175293, "learning_rate": 4.6401280000000004e-05, "loss": 0.3921, "step": 22500 }, { "epoch": 0.1472, "grad_norm": 0.8530682325363159, "learning_rate": 4.632128e-05, "loss": 0.4002, "step": 23000 }, { "epoch": 0.1504, "grad_norm": 0.8661518692970276, "learning_rate": 4.6241280000000006e-05, "loss": 0.3856, "step": 23500 }, { "epoch": 0.1536, "grad_norm": 0.7473235130310059, "learning_rate": 4.616144e-05, "loss": 0.3854, "step": 24000 }, { "epoch": 0.1568, "grad_norm": 0.7954819202423096, "learning_rate": 4.6081440000000005e-05, "loss": 0.3871, "step": 24500 }, { "epoch": 0.16, "grad_norm": 0.8758727312088013, "learning_rate": 4.600144e-05, "loss": 0.3842, "step": 25000 }, { "epoch": 0.1632, "grad_norm": 0.8430293798446655, "learning_rate": 4.592144000000001e-05, "loss": 0.3886, "step": 25500 }, { "epoch": 0.1664, "grad_norm": 0.6557173728942871, "learning_rate": 4.584144e-05, "loss": 0.3854, "step": 26000 }, { "epoch": 0.1696, "grad_norm": 0.7791888117790222, "learning_rate": 4.576144e-05, "loss": 0.3796, "step": 26500 }, { "epoch": 0.1728, "grad_norm": 0.736084520816803, "learning_rate": 4.56816e-05, "loss": 0.3806, "step": 27000 }, { "epoch": 0.176, "grad_norm": 0.7714269161224365, "learning_rate": 4.56016e-05, "loss": 0.3781, "step": 27500 }, { "epoch": 0.1792, "grad_norm": 0.766144335269928, "learning_rate": 4.552176e-05, "loss": 0.3766, "step": 28000 }, { "epoch": 0.1824, "grad_norm": 0.7035301923751831, "learning_rate": 4.544176e-05, "loss": 0.3737, "step": 28500 }, { "epoch": 0.1856, "grad_norm": 0.7573793530464172, "learning_rate": 4.536176e-05, "loss": 0.3753, "step": 29000 }, { "epoch": 0.1888, "grad_norm": 0.8799508213996887, "learning_rate": 4.528176e-05, "loss": 0.373, "step": 29500 }, { "epoch": 0.192, "grad_norm": 0.8543264269828796, "learning_rate": 4.520176e-05, "loss": 0.3735, "step": 30000 }, { "epoch": 0.1952, "grad_norm": 0.6768947243690491, "learning_rate": 4.512176e-05, "loss": 0.3697, "step": 30500 }, { "epoch": 0.1984, "grad_norm": 0.8239702582359314, "learning_rate": 4.504176e-05, "loss": 0.3675, "step": 31000 }, { "epoch": 0.2016, "grad_norm": 0.8310449123382568, "learning_rate": 4.4961760000000004e-05, "loss": 0.3695, "step": 31500 }, { "epoch": 0.2048, "grad_norm": 0.8459475040435791, "learning_rate": 4.488176e-05, "loss": 0.3694, "step": 32000 }, { "epoch": 0.208, "grad_norm": 0.7346063852310181, "learning_rate": 4.4801760000000006e-05, "loss": 0.3646, "step": 32500 }, { "epoch": 0.2112, "grad_norm": 0.6958354115486145, "learning_rate": 4.472176e-05, "loss": 0.3704, "step": 33000 }, { "epoch": 0.2144, "grad_norm": 0.8244686722755432, "learning_rate": 4.464176000000001e-05, "loss": 0.3647, "step": 33500 }, { "epoch": 0.2176, "grad_norm": 0.7559502124786377, "learning_rate": 4.456192e-05, "loss": 0.3665, "step": 34000 }, { "epoch": 0.2208, "grad_norm": 0.9046504497528076, "learning_rate": 4.4481920000000007e-05, "loss": 0.3637, "step": 34500 }, { "epoch": 0.224, "grad_norm": 0.7771899700164795, "learning_rate": 4.440192e-05, "loss": 0.3648, "step": 35000 }, { "epoch": 0.2272, "grad_norm": 0.6887528300285339, "learning_rate": 4.432192e-05, "loss": 0.3562, "step": 35500 }, { "epoch": 0.2304, "grad_norm": 0.7471407055854797, "learning_rate": 4.424192e-05, "loss": 0.3639, "step": 36000 }, { "epoch": 0.2336, "grad_norm": 0.7198163270950317, "learning_rate": 4.416192e-05, "loss": 0.3604, "step": 36500 }, { "epoch": 0.2368, "grad_norm": 0.7383478879928589, "learning_rate": 4.4081920000000004e-05, "loss": 0.3592, "step": 37000 }, { "epoch": 0.24, "grad_norm": 0.8052579760551453, "learning_rate": 4.4001920000000004e-05, "loss": 0.3563, "step": 37500 }, { "epoch": 0.2432, "grad_norm": 0.7765107154846191, "learning_rate": 4.392224e-05, "loss": 0.3548, "step": 38000 }, { "epoch": 0.2464, "grad_norm": 0.7250288724899292, "learning_rate": 4.384224e-05, "loss": 0.3605, "step": 38500 }, { "epoch": 0.2496, "grad_norm": 0.6914694309234619, "learning_rate": 4.376224e-05, "loss": 0.3551, "step": 39000 }, { "epoch": 0.2528, "grad_norm": 0.6636275053024292, "learning_rate": 4.368224e-05, "loss": 0.3587, "step": 39500 }, { "epoch": 0.256, "grad_norm": 0.710564911365509, "learning_rate": 4.360224e-05, "loss": 0.3537, "step": 40000 }, { "epoch": 0.2592, "grad_norm": 0.6195800304412842, "learning_rate": 4.3522240000000004e-05, "loss": 0.3537, "step": 40500 }, { "epoch": 0.2624, "grad_norm": 0.7131514549255371, "learning_rate": 4.34424e-05, "loss": 0.3531, "step": 41000 }, { "epoch": 0.2656, "grad_norm": 0.6594410538673401, "learning_rate": 4.336256e-05, "loss": 0.3518, "step": 41500 }, { "epoch": 0.2688, "grad_norm": 0.7651230096817017, "learning_rate": 4.328256e-05, "loss": 0.3516, "step": 42000 }, { "epoch": 0.272, "grad_norm": 0.756515622138977, "learning_rate": 4.320256e-05, "loss": 0.3461, "step": 42500 }, { "epoch": 0.2752, "grad_norm": 0.7201528549194336, "learning_rate": 4.3122560000000003e-05, "loss": 0.3497, "step": 43000 }, { "epoch": 0.2784, "grad_norm": 0.7436856031417847, "learning_rate": 4.3042560000000004e-05, "loss": 0.3505, "step": 43500 }, { "epoch": 0.2816, "grad_norm": 0.7914199829101562, "learning_rate": 4.2962560000000005e-05, "loss": 0.3439, "step": 44000 }, { "epoch": 0.2848, "grad_norm": 0.7488194704055786, "learning_rate": 4.288256e-05, "loss": 0.349, "step": 44500 }, { "epoch": 0.288, "grad_norm": 0.8654124736785889, "learning_rate": 4.280256e-05, "loss": 0.3491, "step": 45000 }, { "epoch": 0.2912, "grad_norm": 0.6817401647567749, "learning_rate": 4.272272e-05, "loss": 0.3447, "step": 45500 }, { "epoch": 0.2944, "grad_norm": 0.6439715623855591, "learning_rate": 4.2642720000000006e-05, "loss": 0.3453, "step": 46000 }, { "epoch": 0.2976, "grad_norm": 1.3840138912200928, "learning_rate": 4.256272e-05, "loss": 0.3445, "step": 46500 }, { "epoch": 0.3008, "grad_norm": 0.7245766520500183, "learning_rate": 4.248272e-05, "loss": 0.3462, "step": 47000 }, { "epoch": 0.304, "grad_norm": 0.6877666711807251, "learning_rate": 4.240288e-05, "loss": 0.3465, "step": 47500 }, { "epoch": 0.3072, "grad_norm": 0.8494886159896851, "learning_rate": 4.2322880000000006e-05, "loss": 0.348, "step": 48000 }, { "epoch": 0.3104, "grad_norm": 0.6704971790313721, "learning_rate": 4.224288e-05, "loss": 0.3403, "step": 48500 }, { "epoch": 0.3136, "grad_norm": 0.6239964962005615, "learning_rate": 4.216288000000001e-05, "loss": 0.3382, "step": 49000 }, { "epoch": 0.3168, "grad_norm": 0.7317768335342407, "learning_rate": 4.208288e-05, "loss": 0.3385, "step": 49500 }, { "epoch": 0.32, "grad_norm": 0.7397735118865967, "learning_rate": 4.200288e-05, "loss": 0.3405, "step": 50000 }, { "epoch": 0.3232, "grad_norm": 1.1299536228179932, "learning_rate": 4.1922880000000003e-05, "loss": 0.3431, "step": 50500 }, { "epoch": 0.3264, "grad_norm": 0.6406556963920593, "learning_rate": 4.184304e-05, "loss": 0.3384, "step": 51000 }, { "epoch": 0.3296, "grad_norm": 0.8084424734115601, "learning_rate": 4.17632e-05, "loss": 0.3365, "step": 51500 }, { "epoch": 0.3328, "grad_norm": 0.7525010704994202, "learning_rate": 4.16832e-05, "loss": 0.3399, "step": 52000 }, { "epoch": 0.336, "grad_norm": 0.7382110953330994, "learning_rate": 4.16032e-05, "loss": 0.335, "step": 52500 }, { "epoch": 0.3392, "grad_norm": 0.6454793810844421, "learning_rate": 4.15232e-05, "loss": 0.3354, "step": 53000 }, { "epoch": 0.3424, "grad_norm": 0.639664351940155, "learning_rate": 4.14432e-05, "loss": 0.3371, "step": 53500 }, { "epoch": 0.3456, "grad_norm": 0.5574499368667603, "learning_rate": 4.1363200000000004e-05, "loss": 0.3341, "step": 54000 }, { "epoch": 0.3488, "grad_norm": 0.6772671341896057, "learning_rate": 4.12832e-05, "loss": 0.3331, "step": 54500 }, { "epoch": 0.352, "grad_norm": 0.6943195462226868, "learning_rate": 4.120336e-05, "loss": 0.3365, "step": 55000 }, { "epoch": 0.3552, "grad_norm": 0.7460485100746155, "learning_rate": 4.112336e-05, "loss": 0.3308, "step": 55500 }, { "epoch": 0.3584, "grad_norm": 0.7071924805641174, "learning_rate": 4.1043360000000005e-05, "loss": 0.3312, "step": 56000 }, { "epoch": 0.3616, "grad_norm": 0.6678891181945801, "learning_rate": 4.0963519999999996e-05, "loss": 0.3314, "step": 56500 }, { "epoch": 0.3648, "grad_norm": 0.7100914120674133, "learning_rate": 4.0883520000000004e-05, "loss": 0.3307, "step": 57000 }, { "epoch": 0.368, "grad_norm": 0.6085671782493591, "learning_rate": 4.080352e-05, "loss": 0.3282, "step": 57500 }, { "epoch": 0.3712, "grad_norm": 0.6634243130683899, "learning_rate": 4.0723520000000005e-05, "loss": 0.3321, "step": 58000 }, { "epoch": 0.3744, "grad_norm": 0.7203409075737, "learning_rate": 4.064352e-05, "loss": 0.3318, "step": 58500 }, { "epoch": 0.3776, "grad_norm": 0.7934884428977966, "learning_rate": 4.056352e-05, "loss": 0.3239, "step": 59000 }, { "epoch": 0.3808, "grad_norm": 0.8591666221618652, "learning_rate": 4.048352e-05, "loss": 0.3275, "step": 59500 }, { "epoch": 0.384, "grad_norm": 0.6306772232055664, "learning_rate": 4.040352e-05, "loss": 0.3308, "step": 60000 }, { "epoch": 0.3872, "grad_norm": 0.6059302687644958, "learning_rate": 4.032352e-05, "loss": 0.3266, "step": 60500 }, { "epoch": 0.3904, "grad_norm": 0.6875105500221252, "learning_rate": 4.024352e-05, "loss": 0.3265, "step": 61000 }, { "epoch": 0.3936, "grad_norm": 0.6397412419319153, "learning_rate": 4.0163520000000004e-05, "loss": 0.3268, "step": 61500 }, { "epoch": 0.3968, "grad_norm": 0.7801005840301514, "learning_rate": 4.0083520000000005e-05, "loss": 0.3314, "step": 62000 }, { "epoch": 0.4, "grad_norm": 0.6966884136199951, "learning_rate": 4.000352e-05, "loss": 0.3263, "step": 62500 }, { "epoch": 0.4032, "grad_norm": 0.7413304448127747, "learning_rate": 3.9923520000000006e-05, "loss": 0.3284, "step": 63000 }, { "epoch": 0.4064, "grad_norm": 0.7089780569076538, "learning_rate": 3.984352e-05, "loss": 0.3252, "step": 63500 }, { "epoch": 0.4096, "grad_norm": 0.6669878959655762, "learning_rate": 3.976352e-05, "loss": 0.3239, "step": 64000 }, { "epoch": 0.4128, "grad_norm": 0.7352403998374939, "learning_rate": 3.968368e-05, "loss": 0.3226, "step": 64500 }, { "epoch": 0.416, "grad_norm": 0.6916635036468506, "learning_rate": 3.9603840000000005e-05, "loss": 0.3234, "step": 65000 } ], "logging_steps": 500, "max_steps": 312500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.166581030912e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }