{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9555062252678312, "eval_steps": 100, "global_step": 9900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 5.168999671936035, "learning_rate": 2.483498899926662e-05, "loss": 1.8368, "step": 100 }, { "epoch": 0.01, "eval_loss": 1.4510709047317505, "eval_runtime": 27.2844, "eval_samples_per_second": 18.325, "eval_steps_per_second": 2.309, "step": 100 }, { "epoch": 0.02, "grad_norm": 5.417923927307129, "learning_rate": 2.466831122074805e-05, "loss": 1.6043, "step": 200 }, { "epoch": 0.02, "eval_loss": 1.4302723407745361, "eval_runtime": 27.2442, "eval_samples_per_second": 18.353, "eval_steps_per_second": 2.312, "step": 200 }, { "epoch": 0.03, "grad_norm": 5.932116985321045, "learning_rate": 2.450163344222948e-05, "loss": 1.7302, "step": 300 }, { "epoch": 0.03, "eval_loss": 1.4036405086517334, "eval_runtime": 27.2418, "eval_samples_per_second": 18.354, "eval_steps_per_second": 2.313, "step": 300 }, { "epoch": 0.04, "grad_norm": 3.3278746604919434, "learning_rate": 2.4334955663710914e-05, "loss": 1.5892, "step": 400 }, { "epoch": 0.04, "eval_loss": 1.3917973041534424, "eval_runtime": 27.2382, "eval_samples_per_second": 18.357, "eval_steps_per_second": 2.313, "step": 400 }, { "epoch": 0.05, "grad_norm": 5.543770790100098, "learning_rate": 2.4168277885192348e-05, "loss": 1.6516, "step": 500 }, { "epoch": 0.05, "eval_loss": 1.3847407102584839, "eval_runtime": 27.2419, "eval_samples_per_second": 18.354, "eval_steps_per_second": 2.313, "step": 500 }, { "epoch": 0.06, "grad_norm": 4.016234874725342, "learning_rate": 2.400160010667378e-05, "loss": 1.5757, "step": 600 }, { "epoch": 0.06, "eval_loss": 1.3731025457382202, "eval_runtime": 27.2348, "eval_samples_per_second": 18.359, "eval_steps_per_second": 2.313, "step": 600 }, { "epoch": 0.07, "grad_norm": 6.283770561218262, "learning_rate": 2.3834922328155212e-05, "loss": 1.6181, "step": 700 }, { "epoch": 0.07, "eval_loss": 1.3765405416488647, "eval_runtime": 27.2404, "eval_samples_per_second": 18.355, "eval_steps_per_second": 2.313, "step": 700 }, { "epoch": 0.08, "grad_norm": 7.656264781951904, "learning_rate": 2.3668244549636642e-05, "loss": 1.6269, "step": 800 }, { "epoch": 0.08, "eval_loss": 1.363603115081787, "eval_runtime": 27.2488, "eval_samples_per_second": 18.349, "eval_steps_per_second": 2.312, "step": 800 }, { "epoch": 0.09, "grad_norm": 3.9825522899627686, "learning_rate": 2.3501566771118076e-05, "loss": 1.5679, "step": 900 }, { "epoch": 0.09, "eval_loss": 1.3591631650924683, "eval_runtime": 27.2405, "eval_samples_per_second": 18.355, "eval_steps_per_second": 2.313, "step": 900 }, { "epoch": 0.1, "grad_norm": 2.61617112159729, "learning_rate": 2.3334888992599506e-05, "loss": 1.5055, "step": 1000 }, { "epoch": 0.1, "eval_loss": 1.3467729091644287, "eval_runtime": 27.2329, "eval_samples_per_second": 18.36, "eval_steps_per_second": 2.313, "step": 1000 }, { "epoch": 0.11, "grad_norm": 4.158337593078613, "learning_rate": 2.316821121408094e-05, "loss": 1.5367, "step": 1100 }, { "epoch": 0.11, "eval_loss": 1.3473803997039795, "eval_runtime": 27.2426, "eval_samples_per_second": 18.354, "eval_steps_per_second": 2.313, "step": 1100 }, { "epoch": 0.12, "grad_norm": 8.277384757995605, "learning_rate": 2.3001533435562373e-05, "loss": 1.5571, "step": 1200 }, { "epoch": 0.12, "eval_loss": 1.3315479755401611, "eval_runtime": 27.2582, "eval_samples_per_second": 18.343, "eval_steps_per_second": 2.311, "step": 1200 }, { "epoch": 0.13, "grad_norm": 6.899178504943848, "learning_rate": 2.2834855657043804e-05, "loss": 1.5624, "step": 1300 }, { "epoch": 0.13, "eval_loss": 1.329127311706543, "eval_runtime": 27.2334, "eval_samples_per_second": 18.36, "eval_steps_per_second": 2.313, "step": 1300 }, { "epoch": 0.14, "grad_norm": 3.201148271560669, "learning_rate": 2.2668177878525237e-05, "loss": 1.5005, "step": 1400 }, { "epoch": 0.14, "eval_loss": 1.3275610208511353, "eval_runtime": 27.242, "eval_samples_per_second": 18.354, "eval_steps_per_second": 2.313, "step": 1400 }, { "epoch": 0.14, "grad_norm": 7.74235725402832, "learning_rate": 2.2501500100006667e-05, "loss": 1.5763, "step": 1500 }, { "epoch": 0.14, "eval_loss": 1.3223873376846313, "eval_runtime": 27.2387, "eval_samples_per_second": 18.356, "eval_steps_per_second": 2.313, "step": 1500 }, { "epoch": 0.15, "grad_norm": 2.6546003818511963, "learning_rate": 2.23348223214881e-05, "loss": 1.4833, "step": 1600 }, { "epoch": 0.15, "eval_loss": 1.3203272819519043, "eval_runtime": 27.2472, "eval_samples_per_second": 18.35, "eval_steps_per_second": 2.312, "step": 1600 }, { "epoch": 0.16, "grad_norm": 12.014766693115234, "learning_rate": 2.216814454296953e-05, "loss": 1.5086, "step": 1700 }, { "epoch": 0.16, "eval_loss": 1.3142383098602295, "eval_runtime": 27.2319, "eval_samples_per_second": 18.361, "eval_steps_per_second": 2.313, "step": 1700 }, { "epoch": 0.17, "grad_norm": 6.633994102478027, "learning_rate": 2.2001466764450965e-05, "loss": 1.5172, "step": 1800 }, { "epoch": 0.17, "eval_loss": 1.3094617128372192, "eval_runtime": 27.2393, "eval_samples_per_second": 18.356, "eval_steps_per_second": 2.313, "step": 1800 }, { "epoch": 0.18, "grad_norm": 6.995003700256348, "learning_rate": 2.18347889859324e-05, "loss": 1.4299, "step": 1900 }, { "epoch": 0.18, "eval_loss": 1.3035434484481812, "eval_runtime": 27.2303, "eval_samples_per_second": 18.362, "eval_steps_per_second": 2.314, "step": 1900 }, { "epoch": 0.19, "grad_norm": 6.952856540679932, "learning_rate": 2.166811120741383e-05, "loss": 1.6068, "step": 2000 }, { "epoch": 0.19, "eval_loss": 1.2929936647415161, "eval_runtime": 27.2469, "eval_samples_per_second": 18.351, "eval_steps_per_second": 2.312, "step": 2000 }, { "epoch": 0.2, "grad_norm": 7.213510036468506, "learning_rate": 2.150143342889526e-05, "loss": 1.5253, "step": 2100 }, { "epoch": 0.2, "eval_loss": 1.294145941734314, "eval_runtime": 27.2323, "eval_samples_per_second": 18.361, "eval_steps_per_second": 2.313, "step": 2100 }, { "epoch": 0.21, "grad_norm": 10.12060832977295, "learning_rate": 2.1334755650376693e-05, "loss": 1.54, "step": 2200 }, { "epoch": 0.21, "eval_loss": 1.2877910137176514, "eval_runtime": 27.2416, "eval_samples_per_second": 18.354, "eval_steps_per_second": 2.313, "step": 2200 }, { "epoch": 0.22, "grad_norm": 9.764932632446289, "learning_rate": 2.1168077871858126e-05, "loss": 1.5397, "step": 2300 }, { "epoch": 0.22, "eval_loss": 1.2824313640594482, "eval_runtime": 27.2505, "eval_samples_per_second": 18.348, "eval_steps_per_second": 2.312, "step": 2300 }, { "epoch": 0.23, "grad_norm": 7.659645080566406, "learning_rate": 2.1001400093339556e-05, "loss": 1.4568, "step": 2400 }, { "epoch": 0.23, "eval_loss": 1.277322769165039, "eval_runtime": 27.244, "eval_samples_per_second": 18.353, "eval_steps_per_second": 2.312, "step": 2400 }, { "epoch": 0.24, "grad_norm": 9.128131866455078, "learning_rate": 2.083472231482099e-05, "loss": 1.4044, "step": 2500 }, { "epoch": 0.24, "eval_loss": 1.274811863899231, "eval_runtime": 27.2388, "eval_samples_per_second": 18.356, "eval_steps_per_second": 2.313, "step": 2500 }, { "epoch": 0.25, "grad_norm": 3.9920785427093506, "learning_rate": 2.0668044536302424e-05, "loss": 1.5648, "step": 2600 }, { "epoch": 0.25, "eval_loss": 1.2758684158325195, "eval_runtime": 27.2442, "eval_samples_per_second": 18.353, "eval_steps_per_second": 2.312, "step": 2600 }, { "epoch": 0.26, "grad_norm": 10.10120677947998, "learning_rate": 2.050136675778385e-05, "loss": 1.3923, "step": 2700 }, { "epoch": 0.26, "eval_loss": 1.2696377038955688, "eval_runtime": 27.2457, "eval_samples_per_second": 18.352, "eval_steps_per_second": 2.312, "step": 2700 }, { "epoch": 0.27, "grad_norm": 8.736724853515625, "learning_rate": 2.0334688979265284e-05, "loss": 1.5071, "step": 2800 }, { "epoch": 0.27, "eval_loss": 1.269506812095642, "eval_runtime": 27.2434, "eval_samples_per_second": 18.353, "eval_steps_per_second": 2.312, "step": 2800 }, { "epoch": 0.28, "grad_norm": 4.389509677886963, "learning_rate": 2.0168011200746718e-05, "loss": 1.4408, "step": 2900 }, { "epoch": 0.28, "eval_loss": 1.2669789791107178, "eval_runtime": 27.2401, "eval_samples_per_second": 18.355, "eval_steps_per_second": 2.313, "step": 2900 }, { "epoch": 0.29, "grad_norm": 7.288300514221191, "learning_rate": 2.0001333422228148e-05, "loss": 1.4174, "step": 3000 }, { "epoch": 0.29, "eval_loss": 1.2704719305038452, "eval_runtime": 27.225, "eval_samples_per_second": 18.365, "eval_steps_per_second": 2.314, "step": 3000 }, { "epoch": 0.3, "grad_norm": 3.918144464492798, "learning_rate": 1.983465564370958e-05, "loss": 1.4545, "step": 3100 }, { "epoch": 0.3, "eval_loss": 1.2649730443954468, "eval_runtime": 27.2304, "eval_samples_per_second": 18.362, "eval_steps_per_second": 2.314, "step": 3100 }, { "epoch": 0.31, "grad_norm": 7.6044392585754395, "learning_rate": 1.9667977865191015e-05, "loss": 1.4661, "step": 3200 }, { "epoch": 0.31, "eval_loss": 1.2659434080123901, "eval_runtime": 27.2453, "eval_samples_per_second": 18.352, "eval_steps_per_second": 2.312, "step": 3200 }, { "epoch": 0.32, "grad_norm": 2.494729518890381, "learning_rate": 1.9501300086672446e-05, "loss": 1.409, "step": 3300 }, { "epoch": 0.32, "eval_loss": 1.2590056657791138, "eval_runtime": 27.2367, "eval_samples_per_second": 18.358, "eval_steps_per_second": 2.313, "step": 3300 }, { "epoch": 0.33, "grad_norm": 5.691107273101807, "learning_rate": 1.9334622308153876e-05, "loss": 1.4775, "step": 3400 }, { "epoch": 0.33, "eval_loss": 1.2585530281066895, "eval_runtime": 27.2332, "eval_samples_per_second": 18.36, "eval_steps_per_second": 2.313, "step": 3400 }, { "epoch": 0.34, "grad_norm": 8.363253593444824, "learning_rate": 1.916794452963531e-05, "loss": 1.3749, "step": 3500 }, { "epoch": 0.34, "eval_loss": 1.2596063613891602, "eval_runtime": 27.2327, "eval_samples_per_second": 18.36, "eval_steps_per_second": 2.313, "step": 3500 }, { "epoch": 0.35, "grad_norm": 5.728103160858154, "learning_rate": 1.9001266751116743e-05, "loss": 1.438, "step": 3600 }, { "epoch": 0.35, "eval_loss": 1.25990629196167, "eval_runtime": 27.2309, "eval_samples_per_second": 18.361, "eval_steps_per_second": 2.314, "step": 3600 }, { "epoch": 0.36, "grad_norm": 8.10669994354248, "learning_rate": 1.8834588972598173e-05, "loss": 1.4927, "step": 3700 }, { "epoch": 0.36, "eval_loss": 1.2571576833724976, "eval_runtime": 27.2321, "eval_samples_per_second": 18.361, "eval_steps_per_second": 2.313, "step": 3700 }, { "epoch": 0.37, "grad_norm": 5.870044231414795, "learning_rate": 1.8667911194079607e-05, "loss": 1.3475, "step": 3800 }, { "epoch": 0.37, "eval_loss": 1.2502912282943726, "eval_runtime": 27.2283, "eval_samples_per_second": 18.363, "eval_steps_per_second": 2.314, "step": 3800 }, { "epoch": 0.38, "grad_norm": 8.73828125, "learning_rate": 1.850123341556104e-05, "loss": 1.467, "step": 3900 }, { "epoch": 0.38, "eval_loss": 1.250830054283142, "eval_runtime": 27.2279, "eval_samples_per_second": 18.364, "eval_steps_per_second": 2.314, "step": 3900 }, { "epoch": 0.39, "grad_norm": 6.171374797821045, "learning_rate": 1.833455563704247e-05, "loss": 1.3805, "step": 4000 }, { "epoch": 0.39, "eval_loss": 1.2475095987319946, "eval_runtime": 27.2405, "eval_samples_per_second": 18.355, "eval_steps_per_second": 2.313, "step": 4000 }, { "epoch": 0.4, "grad_norm": 9.766369819641113, "learning_rate": 1.81678778585239e-05, "loss": 1.4581, "step": 4100 }, { "epoch": 0.4, "eval_loss": 1.2426196336746216, "eval_runtime": 27.2316, "eval_samples_per_second": 18.361, "eval_steps_per_second": 2.313, "step": 4100 }, { "epoch": 0.41, "grad_norm": 8.565445899963379, "learning_rate": 1.8001200080005335e-05, "loss": 1.4319, "step": 4200 }, { "epoch": 0.41, "eval_loss": 1.2432831525802612, "eval_runtime": 27.2265, "eval_samples_per_second": 18.364, "eval_steps_per_second": 2.314, "step": 4200 }, { "epoch": 0.42, "grad_norm": 7.835436820983887, "learning_rate": 1.7834522301486768e-05, "loss": 1.4282, "step": 4300 }, { "epoch": 0.42, "eval_loss": 1.23819899559021, "eval_runtime": 27.239, "eval_samples_per_second": 18.356, "eval_steps_per_second": 2.313, "step": 4300 }, { "epoch": 0.42, "grad_norm": 5.4964985847473145, "learning_rate": 1.76678445229682e-05, "loss": 1.4429, "step": 4400 }, { "epoch": 0.42, "eval_loss": 1.2355051040649414, "eval_runtime": 27.2454, "eval_samples_per_second": 18.352, "eval_steps_per_second": 2.312, "step": 4400 }, { "epoch": 0.43, "grad_norm": 6.0724897384643555, "learning_rate": 1.7501166744449632e-05, "loss": 1.3815, "step": 4500 }, { "epoch": 0.43, "eval_loss": 1.2307285070419312, "eval_runtime": 27.248, "eval_samples_per_second": 18.35, "eval_steps_per_second": 2.312, "step": 4500 }, { "epoch": 0.44, "grad_norm": 5.445316314697266, "learning_rate": 1.7334488965931062e-05, "loss": 1.4194, "step": 4600 }, { "epoch": 0.44, "eval_loss": 1.225948691368103, "eval_runtime": 27.2592, "eval_samples_per_second": 18.342, "eval_steps_per_second": 2.311, "step": 4600 }, { "epoch": 0.45, "grad_norm": 6.078861236572266, "learning_rate": 1.7167811187412493e-05, "loss": 1.4719, "step": 4700 }, { "epoch": 0.45, "eval_loss": 1.2257155179977417, "eval_runtime": 27.234, "eval_samples_per_second": 18.359, "eval_steps_per_second": 2.313, "step": 4700 }, { "epoch": 0.46, "grad_norm": 4.971687316894531, "learning_rate": 1.7001133408893926e-05, "loss": 1.3587, "step": 4800 }, { "epoch": 0.46, "eval_loss": 1.219119668006897, "eval_runtime": 27.232, "eval_samples_per_second": 18.361, "eval_steps_per_second": 2.313, "step": 4800 }, { "epoch": 0.47, "grad_norm": 10.197978973388672, "learning_rate": 1.683445563037536e-05, "loss": 1.3599, "step": 4900 }, { "epoch": 0.47, "eval_loss": 1.2111576795578003, "eval_runtime": 27.2374, "eval_samples_per_second": 18.357, "eval_steps_per_second": 2.313, "step": 4900 }, { "epoch": 0.48, "grad_norm": 4.264425754547119, "learning_rate": 1.6667777851856793e-05, "loss": 1.4247, "step": 5000 }, { "epoch": 0.48, "eval_loss": 1.207905650138855, "eval_runtime": 27.2335, "eval_samples_per_second": 18.36, "eval_steps_per_second": 2.313, "step": 5000 }, { "epoch": 0.49, "grad_norm": 6.512348651885986, "learning_rate": 1.6501100073338224e-05, "loss": 1.2745, "step": 5100 }, { "epoch": 0.49, "eval_loss": 1.210147738456726, "eval_runtime": 27.2312, "eval_samples_per_second": 18.361, "eval_steps_per_second": 2.314, "step": 5100 }, { "epoch": 0.5, "grad_norm": 5.255068778991699, "learning_rate": 1.6334422294819657e-05, "loss": 1.4317, "step": 5200 }, { "epoch": 0.5, "eval_loss": 1.2059623003005981, "eval_runtime": 27.2372, "eval_samples_per_second": 18.357, "eval_steps_per_second": 2.313, "step": 5200 }, { "epoch": 0.51, "grad_norm": 11.69495677947998, "learning_rate": 1.6167744516301088e-05, "loss": 1.3463, "step": 5300 }, { "epoch": 0.51, "eval_loss": 1.2085806131362915, "eval_runtime": 27.238, "eval_samples_per_second": 18.357, "eval_steps_per_second": 2.313, "step": 5300 }, { "epoch": 0.52, "grad_norm": 4.8801398277282715, "learning_rate": 1.6001066737782518e-05, "loss": 1.3995, "step": 5400 }, { "epoch": 0.52, "eval_loss": 1.2063578367233276, "eval_runtime": 27.2504, "eval_samples_per_second": 18.348, "eval_steps_per_second": 2.312, "step": 5400 }, { "epoch": 0.53, "grad_norm": 6.397332668304443, "learning_rate": 1.583438895926395e-05, "loss": 1.3316, "step": 5500 }, { "epoch": 0.53, "eval_loss": 1.2056635618209839, "eval_runtime": 27.2573, "eval_samples_per_second": 18.344, "eval_steps_per_second": 2.311, "step": 5500 }, { "epoch": 0.54, "grad_norm": 8.16231918334961, "learning_rate": 1.5667711180745385e-05, "loss": 1.3064, "step": 5600 }, { "epoch": 0.54, "eval_loss": 1.1981565952301025, "eval_runtime": 27.2447, "eval_samples_per_second": 18.352, "eval_steps_per_second": 2.312, "step": 5600 }, { "epoch": 0.55, "grad_norm": 8.272992134094238, "learning_rate": 1.5501033402226815e-05, "loss": 1.2971, "step": 5700 }, { "epoch": 0.55, "eval_loss": 1.1983332633972168, "eval_runtime": 27.2527, "eval_samples_per_second": 18.347, "eval_steps_per_second": 2.312, "step": 5700 }, { "epoch": 0.56, "grad_norm": 7.432589054107666, "learning_rate": 1.533435562370825e-05, "loss": 1.4006, "step": 5800 }, { "epoch": 0.56, "eval_loss": 1.1970897912979126, "eval_runtime": 27.2266, "eval_samples_per_second": 18.364, "eval_steps_per_second": 2.314, "step": 5800 }, { "epoch": 0.57, "grad_norm": 12.17546272277832, "learning_rate": 1.5167677845189679e-05, "loss": 1.4033, "step": 5900 }, { "epoch": 0.57, "eval_loss": 1.1994664669036865, "eval_runtime": 27.2478, "eval_samples_per_second": 18.35, "eval_steps_per_second": 2.312, "step": 5900 }, { "epoch": 0.58, "grad_norm": 6.728982448577881, "learning_rate": 1.5001000066671111e-05, "loss": 1.3381, "step": 6000 }, { "epoch": 0.58, "eval_loss": 1.197486162185669, "eval_runtime": 27.234, "eval_samples_per_second": 18.359, "eval_steps_per_second": 2.313, "step": 6000 }, { "epoch": 0.59, "grad_norm": 5.387950420379639, "learning_rate": 1.4834322288152545e-05, "loss": 1.3284, "step": 6100 }, { "epoch": 0.59, "eval_loss": 1.19466233253479, "eval_runtime": 27.2424, "eval_samples_per_second": 18.354, "eval_steps_per_second": 2.313, "step": 6100 }, { "epoch": 0.6, "grad_norm": 6.059466361999512, "learning_rate": 1.4667644509633977e-05, "loss": 1.3492, "step": 6200 }, { "epoch": 0.6, "eval_loss": 1.191383719444275, "eval_runtime": 27.2468, "eval_samples_per_second": 18.351, "eval_steps_per_second": 2.312, "step": 6200 }, { "epoch": 0.61, "grad_norm": 4.771157264709473, "learning_rate": 1.4500966731115409e-05, "loss": 1.2956, "step": 6300 }, { "epoch": 0.61, "eval_loss": 1.1882104873657227, "eval_runtime": 27.2582, "eval_samples_per_second": 18.343, "eval_steps_per_second": 2.311, "step": 6300 }, { "epoch": 0.62, "grad_norm": 8.666051864624023, "learning_rate": 1.4334288952596842e-05, "loss": 1.3381, "step": 6400 }, { "epoch": 0.62, "eval_loss": 1.1888271570205688, "eval_runtime": 27.2369, "eval_samples_per_second": 18.357, "eval_steps_per_second": 2.313, "step": 6400 }, { "epoch": 0.63, "grad_norm": 4.501246452331543, "learning_rate": 1.416761117407827e-05, "loss": 1.4293, "step": 6500 }, { "epoch": 0.63, "eval_loss": 1.1847625970840454, "eval_runtime": 27.239, "eval_samples_per_second": 18.356, "eval_steps_per_second": 2.313, "step": 6500 }, { "epoch": 0.64, "grad_norm": 7.15999174118042, "learning_rate": 1.4000933395559704e-05, "loss": 1.2713, "step": 6600 }, { "epoch": 0.64, "eval_loss": 1.1845014095306396, "eval_runtime": 27.2437, "eval_samples_per_second": 18.353, "eval_steps_per_second": 2.312, "step": 6600 }, { "epoch": 0.65, "grad_norm": 4.205693244934082, "learning_rate": 1.3834255617041136e-05, "loss": 1.4094, "step": 6700 }, { "epoch": 0.65, "eval_loss": 1.1849124431610107, "eval_runtime": 27.2322, "eval_samples_per_second": 18.361, "eval_steps_per_second": 2.313, "step": 6700 }, { "epoch": 0.66, "grad_norm": 10.90744400024414, "learning_rate": 1.3667577838522568e-05, "loss": 1.3321, "step": 6800 }, { "epoch": 0.66, "eval_loss": 1.1828891038894653, "eval_runtime": 27.2545, "eval_samples_per_second": 18.346, "eval_steps_per_second": 2.312, "step": 6800 }, { "epoch": 0.67, "grad_norm": 5.98179292678833, "learning_rate": 1.3500900060004002e-05, "loss": 1.3464, "step": 6900 }, { "epoch": 0.67, "eval_loss": 1.181032419204712, "eval_runtime": 27.2781, "eval_samples_per_second": 18.33, "eval_steps_per_second": 2.31, "step": 6900 }, { "epoch": 0.68, "grad_norm": 7.574715614318848, "learning_rate": 1.3334222281485434e-05, "loss": 1.3912, "step": 7000 }, { "epoch": 0.68, "eval_loss": 1.178390383720398, "eval_runtime": 27.2322, "eval_samples_per_second": 18.361, "eval_steps_per_second": 2.313, "step": 7000 }, { "epoch": 0.69, "grad_norm": 7.541271209716797, "learning_rate": 1.3167544502966864e-05, "loss": 1.3588, "step": 7100 }, { "epoch": 0.69, "eval_loss": 1.177061915397644, "eval_runtime": 27.2447, "eval_samples_per_second": 18.352, "eval_steps_per_second": 2.312, "step": 7100 }, { "epoch": 0.69, "grad_norm": 9.224418640136719, "learning_rate": 1.3000866724448296e-05, "loss": 1.3083, "step": 7200 }, { "epoch": 0.69, "eval_loss": 1.1750773191452026, "eval_runtime": 27.2423, "eval_samples_per_second": 18.354, "eval_steps_per_second": 2.313, "step": 7200 }, { "epoch": 0.7, "grad_norm": 9.978618621826172, "learning_rate": 1.283418894592973e-05, "loss": 1.2933, "step": 7300 }, { "epoch": 0.7, "eval_loss": 1.174636960029602, "eval_runtime": 27.2335, "eval_samples_per_second": 18.36, "eval_steps_per_second": 2.313, "step": 7300 }, { "epoch": 0.71, "grad_norm": 6.905595779418945, "learning_rate": 1.2667511167411161e-05, "loss": 1.3069, "step": 7400 }, { "epoch": 0.71, "eval_loss": 1.1744636297225952, "eval_runtime": 27.2307, "eval_samples_per_second": 18.362, "eval_steps_per_second": 2.314, "step": 7400 }, { "epoch": 0.72, "grad_norm": 7.499482154846191, "learning_rate": 1.2500833388892593e-05, "loss": 1.3172, "step": 7500 }, { "epoch": 0.72, "eval_loss": 1.173417568206787, "eval_runtime": 27.2368, "eval_samples_per_second": 18.357, "eval_steps_per_second": 2.313, "step": 7500 }, { "epoch": 0.73, "grad_norm": 4.5594282150268555, "learning_rate": 1.2334155610374025e-05, "loss": 1.3392, "step": 7600 }, { "epoch": 0.73, "eval_loss": 1.1731162071228027, "eval_runtime": 27.2663, "eval_samples_per_second": 18.338, "eval_steps_per_second": 2.311, "step": 7600 }, { "epoch": 0.74, "grad_norm": 4.877806663513184, "learning_rate": 1.2167477831855457e-05, "loss": 1.2941, "step": 7700 }, { "epoch": 0.74, "eval_loss": 1.1754953861236572, "eval_runtime": 27.2391, "eval_samples_per_second": 18.356, "eval_steps_per_second": 2.313, "step": 7700 }, { "epoch": 0.75, "grad_norm": 6.909502029418945, "learning_rate": 1.200080005333689e-05, "loss": 1.3124, "step": 7800 }, { "epoch": 0.75, "eval_loss": 1.1714600324630737, "eval_runtime": 27.2351, "eval_samples_per_second": 18.359, "eval_steps_per_second": 2.313, "step": 7800 }, { "epoch": 0.76, "grad_norm": 3.706770181655884, "learning_rate": 1.1834122274818321e-05, "loss": 1.2626, "step": 7900 }, { "epoch": 0.76, "eval_loss": 1.1713511943817139, "eval_runtime": 27.2512, "eval_samples_per_second": 18.348, "eval_steps_per_second": 2.312, "step": 7900 }, { "epoch": 0.77, "grad_norm": 4.9828667640686035, "learning_rate": 1.1667444496299753e-05, "loss": 1.3136, "step": 8000 }, { "epoch": 0.77, "eval_loss": 1.1698399782180786, "eval_runtime": 27.2353, "eval_samples_per_second": 18.358, "eval_steps_per_second": 2.313, "step": 8000 }, { "epoch": 0.78, "grad_norm": 2.9339020252227783, "learning_rate": 1.1500766717781187e-05, "loss": 1.3125, "step": 8100 }, { "epoch": 0.78, "eval_loss": 1.171270489692688, "eval_runtime": 27.2365, "eval_samples_per_second": 18.358, "eval_steps_per_second": 2.313, "step": 8100 }, { "epoch": 0.79, "grad_norm": 10.367125511169434, "learning_rate": 1.1334088939262619e-05, "loss": 1.3017, "step": 8200 }, { "epoch": 0.79, "eval_loss": 1.1658000946044922, "eval_runtime": 27.2436, "eval_samples_per_second": 18.353, "eval_steps_per_second": 2.312, "step": 8200 }, { "epoch": 0.8, "grad_norm": 6.329850196838379, "learning_rate": 1.116741116074405e-05, "loss": 1.2615, "step": 8300 }, { "epoch": 0.8, "eval_loss": 1.1665682792663574, "eval_runtime": 27.2448, "eval_samples_per_second": 18.352, "eval_steps_per_second": 2.312, "step": 8300 }, { "epoch": 0.81, "grad_norm": 5.759530067443848, "learning_rate": 1.1000733382225482e-05, "loss": 1.3781, "step": 8400 }, { "epoch": 0.81, "eval_loss": 1.1660109758377075, "eval_runtime": 27.2383, "eval_samples_per_second": 18.356, "eval_steps_per_second": 2.313, "step": 8400 }, { "epoch": 0.82, "grad_norm": 6.256554126739502, "learning_rate": 1.0834055603706914e-05, "loss": 1.2429, "step": 8500 }, { "epoch": 0.82, "eval_loss": 1.1605762243270874, "eval_runtime": 27.2435, "eval_samples_per_second": 18.353, "eval_steps_per_second": 2.312, "step": 8500 }, { "epoch": 0.83, "grad_norm": 6.039596080780029, "learning_rate": 1.0667377825188346e-05, "loss": 1.3783, "step": 8600 }, { "epoch": 0.83, "eval_loss": 1.1602510213851929, "eval_runtime": 27.2519, "eval_samples_per_second": 18.347, "eval_steps_per_second": 2.312, "step": 8600 }, { "epoch": 0.84, "grad_norm": 12.013235092163086, "learning_rate": 1.0500700046669778e-05, "loss": 1.247, "step": 8700 }, { "epoch": 0.84, "eval_loss": 1.1587607860565186, "eval_runtime": 27.2412, "eval_samples_per_second": 18.355, "eval_steps_per_second": 2.313, "step": 8700 }, { "epoch": 0.85, "grad_norm": 2.92677640914917, "learning_rate": 1.0334022268151212e-05, "loss": 1.2676, "step": 8800 }, { "epoch": 0.85, "eval_loss": 1.157351016998291, "eval_runtime": 27.2387, "eval_samples_per_second": 18.356, "eval_steps_per_second": 2.313, "step": 8800 }, { "epoch": 0.86, "grad_norm": 2.9594922065734863, "learning_rate": 1.0167344489632642e-05, "loss": 1.1611, "step": 8900 }, { "epoch": 0.86, "eval_loss": 1.1563640832901, "eval_runtime": 27.2391, "eval_samples_per_second": 18.356, "eval_steps_per_second": 2.313, "step": 8900 }, { "epoch": 0.87, "grad_norm": 11.393871307373047, "learning_rate": 1.0000666711114074e-05, "loss": 1.2814, "step": 9000 }, { "epoch": 0.87, "eval_loss": 1.1527520418167114, "eval_runtime": 27.2403, "eval_samples_per_second": 18.355, "eval_steps_per_second": 2.313, "step": 9000 }, { "epoch": 0.88, "grad_norm": 13.028400421142578, "learning_rate": 9.833988932595508e-06, "loss": 1.2742, "step": 9100 }, { "epoch": 0.88, "eval_loss": 1.147485375404358, "eval_runtime": 27.2339, "eval_samples_per_second": 18.359, "eval_steps_per_second": 2.313, "step": 9100 }, { "epoch": 0.89, "grad_norm": 5.377606391906738, "learning_rate": 9.667311154076938e-06, "loss": 1.315, "step": 9200 }, { "epoch": 0.89, "eval_loss": 1.1482326984405518, "eval_runtime": 27.255, "eval_samples_per_second": 18.345, "eval_steps_per_second": 2.311, "step": 9200 }, { "epoch": 0.9, "grad_norm": 6.8872199058532715, "learning_rate": 9.500633375558372e-06, "loss": 1.2934, "step": 9300 }, { "epoch": 0.9, "eval_loss": 1.1486388444900513, "eval_runtime": 27.2466, "eval_samples_per_second": 18.351, "eval_steps_per_second": 2.312, "step": 9300 }, { "epoch": 0.91, "grad_norm": 8.581022262573242, "learning_rate": 9.333955597039803e-06, "loss": 1.3053, "step": 9400 }, { "epoch": 0.91, "eval_loss": 1.1475136280059814, "eval_runtime": 27.2485, "eval_samples_per_second": 18.35, "eval_steps_per_second": 2.312, "step": 9400 }, { "epoch": 0.92, "grad_norm": 6.223934173583984, "learning_rate": 9.167277818521235e-06, "loss": 1.2461, "step": 9500 }, { "epoch": 0.92, "eval_loss": 1.1443854570388794, "eval_runtime": 27.25, "eval_samples_per_second": 18.349, "eval_steps_per_second": 2.312, "step": 9500 }, { "epoch": 0.93, "grad_norm": 7.77838659286499, "learning_rate": 9.000600040002667e-06, "loss": 1.2461, "step": 9600 }, { "epoch": 0.93, "eval_loss": 1.1398392915725708, "eval_runtime": 27.2344, "eval_samples_per_second": 18.359, "eval_steps_per_second": 2.313, "step": 9600 }, { "epoch": 0.94, "grad_norm": 6.271127223968506, "learning_rate": 8.8339222614841e-06, "loss": 1.2545, "step": 9700 }, { "epoch": 0.94, "eval_loss": 1.139143466949463, "eval_runtime": 27.2452, "eval_samples_per_second": 18.352, "eval_steps_per_second": 2.312, "step": 9700 }, { "epoch": 0.95, "grad_norm": 5.507913112640381, "learning_rate": 8.667244482965531e-06, "loss": 1.265, "step": 9800 }, { "epoch": 0.95, "eval_loss": 1.1366428136825562, "eval_runtime": 27.2432, "eval_samples_per_second": 18.353, "eval_steps_per_second": 2.312, "step": 9800 }, { "epoch": 0.96, "grad_norm": 13.688467025756836, "learning_rate": 8.500566704446963e-06, "loss": 1.3709, "step": 9900 }, { "epoch": 0.96, "eval_loss": 1.133713722229004, "eval_runtime": 27.2421, "eval_samples_per_second": 18.354, "eval_steps_per_second": 2.313, "step": 9900 } ], "logging_steps": 100, "max_steps": 15000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "total_flos": 3.73253167214592e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }