|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9555062252678312, |
|
"eval_steps": 100, |
|
"global_step": 9900, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 5.168999671936035, |
|
"learning_rate": 2.483498899926662e-05, |
|
"loss": 1.8368, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 1.4510709047317505, |
|
"eval_runtime": 27.2844, |
|
"eval_samples_per_second": 18.325, |
|
"eval_steps_per_second": 2.309, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 5.417923927307129, |
|
"learning_rate": 2.466831122074805e-05, |
|
"loss": 1.6043, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 1.4302723407745361, |
|
"eval_runtime": 27.2442, |
|
"eval_samples_per_second": 18.353, |
|
"eval_steps_per_second": 2.312, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 5.932116985321045, |
|
"learning_rate": 2.450163344222948e-05, |
|
"loss": 1.7302, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 1.4036405086517334, |
|
"eval_runtime": 27.2418, |
|
"eval_samples_per_second": 18.354, |
|
"eval_steps_per_second": 2.313, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.3278746604919434, |
|
"learning_rate": 2.4334955663710914e-05, |
|
"loss": 1.5892, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 1.3917973041534424, |
|
"eval_runtime": 27.2382, |
|
"eval_samples_per_second": 18.357, |
|
"eval_steps_per_second": 2.313, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 5.543770790100098, |
|
"learning_rate": 2.4168277885192348e-05, |
|
"loss": 1.6516, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 1.3847407102584839, |
|
"eval_runtime": 27.2419, |
|
"eval_samples_per_second": 18.354, |
|
"eval_steps_per_second": 2.313, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 4.016234874725342, |
|
"learning_rate": 2.400160010667378e-05, |
|
"loss": 1.5757, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 1.3731025457382202, |
|
"eval_runtime": 27.2348, |
|
"eval_samples_per_second": 18.359, |
|
"eval_steps_per_second": 2.313, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 6.283770561218262, |
|
"learning_rate": 2.3834922328155212e-05, |
|
"loss": 1.6181, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 1.3765405416488647, |
|
"eval_runtime": 27.2404, |
|
"eval_samples_per_second": 18.355, |
|
"eval_steps_per_second": 2.313, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 7.656264781951904, |
|
"learning_rate": 2.3668244549636642e-05, |
|
"loss": 1.6269, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 1.363603115081787, |
|
"eval_runtime": 27.2488, |
|
"eval_samples_per_second": 18.349, |
|
"eval_steps_per_second": 2.312, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.9825522899627686, |
|
"learning_rate": 2.3501566771118076e-05, |
|
"loss": 1.5679, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 1.3591631650924683, |
|
"eval_runtime": 27.2405, |
|
"eval_samples_per_second": 18.355, |
|
"eval_steps_per_second": 2.313, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.61617112159729, |
|
"learning_rate": 2.3334888992599506e-05, |
|
"loss": 1.5055, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"eval_loss": 1.3467729091644287, |
|
"eval_runtime": 27.2329, |
|
"eval_samples_per_second": 18.36, |
|
"eval_steps_per_second": 2.313, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 4.158337593078613, |
|
"learning_rate": 2.316821121408094e-05, |
|
"loss": 1.5367, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"eval_loss": 1.3473803997039795, |
|
"eval_runtime": 27.2426, |
|
"eval_samples_per_second": 18.354, |
|
"eval_steps_per_second": 2.313, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 8.277384757995605, |
|
"learning_rate": 2.3001533435562373e-05, |
|
"loss": 1.5571, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"eval_loss": 1.3315479755401611, |
|
"eval_runtime": 27.2582, |
|
"eval_samples_per_second": 18.343, |
|
"eval_steps_per_second": 2.311, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 6.899178504943848, |
|
"learning_rate": 2.2834855657043804e-05, |
|
"loss": 1.5624, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"eval_loss": 1.329127311706543, |
|
"eval_runtime": 27.2334, |
|
"eval_samples_per_second": 18.36, |
|
"eval_steps_per_second": 2.313, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 3.201148271560669, |
|
"learning_rate": 2.2668177878525237e-05, |
|
"loss": 1.5005, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"eval_loss": 1.3275610208511353, |
|
"eval_runtime": 27.242, |
|
"eval_samples_per_second": 18.354, |
|
"eval_steps_per_second": 2.313, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 7.74235725402832, |
|
"learning_rate": 2.2501500100006667e-05, |
|
"loss": 1.5763, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"eval_loss": 1.3223873376846313, |
|
"eval_runtime": 27.2387, |
|
"eval_samples_per_second": 18.356, |
|
"eval_steps_per_second": 2.313, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 2.6546003818511963, |
|
"learning_rate": 2.23348223214881e-05, |
|
"loss": 1.4833, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"eval_loss": 1.3203272819519043, |
|
"eval_runtime": 27.2472, |
|
"eval_samples_per_second": 18.35, |
|
"eval_steps_per_second": 2.312, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 12.014766693115234, |
|
"learning_rate": 2.216814454296953e-05, |
|
"loss": 1.5086, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_loss": 1.3142383098602295, |
|
"eval_runtime": 27.2319, |
|
"eval_samples_per_second": 18.361, |
|
"eval_steps_per_second": 2.313, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 6.633994102478027, |
|
"learning_rate": 2.2001466764450965e-05, |
|
"loss": 1.5172, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"eval_loss": 1.3094617128372192, |
|
"eval_runtime": 27.2393, |
|
"eval_samples_per_second": 18.356, |
|
"eval_steps_per_second": 2.313, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 6.995003700256348, |
|
"learning_rate": 2.18347889859324e-05, |
|
"loss": 1.4299, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"eval_loss": 1.3035434484481812, |
|
"eval_runtime": 27.2303, |
|
"eval_samples_per_second": 18.362, |
|
"eval_steps_per_second": 2.314, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 6.952856540679932, |
|
"learning_rate": 2.166811120741383e-05, |
|
"loss": 1.6068, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"eval_loss": 1.2929936647415161, |
|
"eval_runtime": 27.2469, |
|
"eval_samples_per_second": 18.351, |
|
"eval_steps_per_second": 2.312, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 7.213510036468506, |
|
"learning_rate": 2.150143342889526e-05, |
|
"loss": 1.5253, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 1.294145941734314, |
|
"eval_runtime": 27.2323, |
|
"eval_samples_per_second": 18.361, |
|
"eval_steps_per_second": 2.313, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 10.12060832977295, |
|
"learning_rate": 2.1334755650376693e-05, |
|
"loss": 1.54, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"eval_loss": 1.2877910137176514, |
|
"eval_runtime": 27.2416, |
|
"eval_samples_per_second": 18.354, |
|
"eval_steps_per_second": 2.313, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 9.764932632446289, |
|
"learning_rate": 2.1168077871858126e-05, |
|
"loss": 1.5397, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"eval_loss": 1.2824313640594482, |
|
"eval_runtime": 27.2505, |
|
"eval_samples_per_second": 18.348, |
|
"eval_steps_per_second": 2.312, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 7.659645080566406, |
|
"learning_rate": 2.1001400093339556e-05, |
|
"loss": 1.4568, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"eval_loss": 1.277322769165039, |
|
"eval_runtime": 27.244, |
|
"eval_samples_per_second": 18.353, |
|
"eval_steps_per_second": 2.312, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 9.128131866455078, |
|
"learning_rate": 2.083472231482099e-05, |
|
"loss": 1.4044, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_loss": 1.274811863899231, |
|
"eval_runtime": 27.2388, |
|
"eval_samples_per_second": 18.356, |
|
"eval_steps_per_second": 2.313, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 3.9920785427093506, |
|
"learning_rate": 2.0668044536302424e-05, |
|
"loss": 1.5648, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 1.2758684158325195, |
|
"eval_runtime": 27.2442, |
|
"eval_samples_per_second": 18.353, |
|
"eval_steps_per_second": 2.312, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 10.10120677947998, |
|
"learning_rate": 2.050136675778385e-05, |
|
"loss": 1.3923, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"eval_loss": 1.2696377038955688, |
|
"eval_runtime": 27.2457, |
|
"eval_samples_per_second": 18.352, |
|
"eval_steps_per_second": 2.312, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 8.736724853515625, |
|
"learning_rate": 2.0334688979265284e-05, |
|
"loss": 1.5071, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"eval_loss": 1.269506812095642, |
|
"eval_runtime": 27.2434, |
|
"eval_samples_per_second": 18.353, |
|
"eval_steps_per_second": 2.312, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 4.389509677886963, |
|
"learning_rate": 2.0168011200746718e-05, |
|
"loss": 1.4408, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"eval_loss": 1.2669789791107178, |
|
"eval_runtime": 27.2401, |
|
"eval_samples_per_second": 18.355, |
|
"eval_steps_per_second": 2.313, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 7.288300514221191, |
|
"learning_rate": 2.0001333422228148e-05, |
|
"loss": 1.4174, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"eval_loss": 1.2704719305038452, |
|
"eval_runtime": 27.225, |
|
"eval_samples_per_second": 18.365, |
|
"eval_steps_per_second": 2.314, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 3.918144464492798, |
|
"learning_rate": 1.983465564370958e-05, |
|
"loss": 1.4545, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"eval_loss": 1.2649730443954468, |
|
"eval_runtime": 27.2304, |
|
"eval_samples_per_second": 18.362, |
|
"eval_steps_per_second": 2.314, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 7.6044392585754395, |
|
"learning_rate": 1.9667977865191015e-05, |
|
"loss": 1.4661, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"eval_loss": 1.2659434080123901, |
|
"eval_runtime": 27.2453, |
|
"eval_samples_per_second": 18.352, |
|
"eval_steps_per_second": 2.312, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 2.494729518890381, |
|
"learning_rate": 1.9501300086672446e-05, |
|
"loss": 1.409, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_loss": 1.2590056657791138, |
|
"eval_runtime": 27.2367, |
|
"eval_samples_per_second": 18.358, |
|
"eval_steps_per_second": 2.313, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 5.691107273101807, |
|
"learning_rate": 1.9334622308153876e-05, |
|
"loss": 1.4775, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"eval_loss": 1.2585530281066895, |
|
"eval_runtime": 27.2332, |
|
"eval_samples_per_second": 18.36, |
|
"eval_steps_per_second": 2.313, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 8.363253593444824, |
|
"learning_rate": 1.916794452963531e-05, |
|
"loss": 1.3749, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"eval_loss": 1.2596063613891602, |
|
"eval_runtime": 27.2327, |
|
"eval_samples_per_second": 18.36, |
|
"eval_steps_per_second": 2.313, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 5.728103160858154, |
|
"learning_rate": 1.9001266751116743e-05, |
|
"loss": 1.438, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"eval_loss": 1.25990629196167, |
|
"eval_runtime": 27.2309, |
|
"eval_samples_per_second": 18.361, |
|
"eval_steps_per_second": 2.314, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 8.10669994354248, |
|
"learning_rate": 1.8834588972598173e-05, |
|
"loss": 1.4927, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"eval_loss": 1.2571576833724976, |
|
"eval_runtime": 27.2321, |
|
"eval_samples_per_second": 18.361, |
|
"eval_steps_per_second": 2.313, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 5.870044231414795, |
|
"learning_rate": 1.8667911194079607e-05, |
|
"loss": 1.3475, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"eval_loss": 1.2502912282943726, |
|
"eval_runtime": 27.2283, |
|
"eval_samples_per_second": 18.363, |
|
"eval_steps_per_second": 2.314, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 8.73828125, |
|
"learning_rate": 1.850123341556104e-05, |
|
"loss": 1.467, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"eval_loss": 1.250830054283142, |
|
"eval_runtime": 27.2279, |
|
"eval_samples_per_second": 18.364, |
|
"eval_steps_per_second": 2.314, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 6.171374797821045, |
|
"learning_rate": 1.833455563704247e-05, |
|
"loss": 1.3805, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"eval_loss": 1.2475095987319946, |
|
"eval_runtime": 27.2405, |
|
"eval_samples_per_second": 18.355, |
|
"eval_steps_per_second": 2.313, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 9.766369819641113, |
|
"learning_rate": 1.81678778585239e-05, |
|
"loss": 1.4581, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 1.2426196336746216, |
|
"eval_runtime": 27.2316, |
|
"eval_samples_per_second": 18.361, |
|
"eval_steps_per_second": 2.313, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 8.565445899963379, |
|
"learning_rate": 1.8001200080005335e-05, |
|
"loss": 1.4319, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"eval_loss": 1.2432831525802612, |
|
"eval_runtime": 27.2265, |
|
"eval_samples_per_second": 18.364, |
|
"eval_steps_per_second": 2.314, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 7.835436820983887, |
|
"learning_rate": 1.7834522301486768e-05, |
|
"loss": 1.4282, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"eval_loss": 1.23819899559021, |
|
"eval_runtime": 27.239, |
|
"eval_samples_per_second": 18.356, |
|
"eval_steps_per_second": 2.313, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 5.4964985847473145, |
|
"learning_rate": 1.76678445229682e-05, |
|
"loss": 1.4429, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"eval_loss": 1.2355051040649414, |
|
"eval_runtime": 27.2454, |
|
"eval_samples_per_second": 18.352, |
|
"eval_steps_per_second": 2.312, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 6.0724897384643555, |
|
"learning_rate": 1.7501166744449632e-05, |
|
"loss": 1.3815, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"eval_loss": 1.2307285070419312, |
|
"eval_runtime": 27.248, |
|
"eval_samples_per_second": 18.35, |
|
"eval_steps_per_second": 2.312, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 5.445316314697266, |
|
"learning_rate": 1.7334488965931062e-05, |
|
"loss": 1.4194, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"eval_loss": 1.225948691368103, |
|
"eval_runtime": 27.2592, |
|
"eval_samples_per_second": 18.342, |
|
"eval_steps_per_second": 2.311, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 6.078861236572266, |
|
"learning_rate": 1.7167811187412493e-05, |
|
"loss": 1.4719, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"eval_loss": 1.2257155179977417, |
|
"eval_runtime": 27.234, |
|
"eval_samples_per_second": 18.359, |
|
"eval_steps_per_second": 2.313, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 4.971687316894531, |
|
"learning_rate": 1.7001133408893926e-05, |
|
"loss": 1.3587, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"eval_loss": 1.219119668006897, |
|
"eval_runtime": 27.232, |
|
"eval_samples_per_second": 18.361, |
|
"eval_steps_per_second": 2.313, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 10.197978973388672, |
|
"learning_rate": 1.683445563037536e-05, |
|
"loss": 1.3599, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"eval_loss": 1.2111576795578003, |
|
"eval_runtime": 27.2374, |
|
"eval_samples_per_second": 18.357, |
|
"eval_steps_per_second": 2.313, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 4.264425754547119, |
|
"learning_rate": 1.6667777851856793e-05, |
|
"loss": 1.4247, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"eval_loss": 1.207905650138855, |
|
"eval_runtime": 27.2335, |
|
"eval_samples_per_second": 18.36, |
|
"eval_steps_per_second": 2.313, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 6.512348651885986, |
|
"learning_rate": 1.6501100073338224e-05, |
|
"loss": 1.2745, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"eval_loss": 1.210147738456726, |
|
"eval_runtime": 27.2312, |
|
"eval_samples_per_second": 18.361, |
|
"eval_steps_per_second": 2.314, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 5.255068778991699, |
|
"learning_rate": 1.6334422294819657e-05, |
|
"loss": 1.4317, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 1.2059623003005981, |
|
"eval_runtime": 27.2372, |
|
"eval_samples_per_second": 18.357, |
|
"eval_steps_per_second": 2.313, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 11.69495677947998, |
|
"learning_rate": 1.6167744516301088e-05, |
|
"loss": 1.3463, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"eval_loss": 1.2085806131362915, |
|
"eval_runtime": 27.238, |
|
"eval_samples_per_second": 18.357, |
|
"eval_steps_per_second": 2.313, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 4.8801398277282715, |
|
"learning_rate": 1.6001066737782518e-05, |
|
"loss": 1.3995, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"eval_loss": 1.2063578367233276, |
|
"eval_runtime": 27.2504, |
|
"eval_samples_per_second": 18.348, |
|
"eval_steps_per_second": 2.312, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 6.397332668304443, |
|
"learning_rate": 1.583438895926395e-05, |
|
"loss": 1.3316, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"eval_loss": 1.2056635618209839, |
|
"eval_runtime": 27.2573, |
|
"eval_samples_per_second": 18.344, |
|
"eval_steps_per_second": 2.311, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 8.16231918334961, |
|
"learning_rate": 1.5667711180745385e-05, |
|
"loss": 1.3064, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"eval_loss": 1.1981565952301025, |
|
"eval_runtime": 27.2447, |
|
"eval_samples_per_second": 18.352, |
|
"eval_steps_per_second": 2.312, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 8.272992134094238, |
|
"learning_rate": 1.5501033402226815e-05, |
|
"loss": 1.2971, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"eval_loss": 1.1983332633972168, |
|
"eval_runtime": 27.2527, |
|
"eval_samples_per_second": 18.347, |
|
"eval_steps_per_second": 2.312, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 7.432589054107666, |
|
"learning_rate": 1.533435562370825e-05, |
|
"loss": 1.4006, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_loss": 1.1970897912979126, |
|
"eval_runtime": 27.2266, |
|
"eval_samples_per_second": 18.364, |
|
"eval_steps_per_second": 2.314, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 12.17546272277832, |
|
"learning_rate": 1.5167677845189679e-05, |
|
"loss": 1.4033, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"eval_loss": 1.1994664669036865, |
|
"eval_runtime": 27.2478, |
|
"eval_samples_per_second": 18.35, |
|
"eval_steps_per_second": 2.312, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 6.728982448577881, |
|
"learning_rate": 1.5001000066671111e-05, |
|
"loss": 1.3381, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"eval_loss": 1.197486162185669, |
|
"eval_runtime": 27.234, |
|
"eval_samples_per_second": 18.359, |
|
"eval_steps_per_second": 2.313, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 5.387950420379639, |
|
"learning_rate": 1.4834322288152545e-05, |
|
"loss": 1.3284, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"eval_loss": 1.19466233253479, |
|
"eval_runtime": 27.2424, |
|
"eval_samples_per_second": 18.354, |
|
"eval_steps_per_second": 2.313, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 6.059466361999512, |
|
"learning_rate": 1.4667644509633977e-05, |
|
"loss": 1.3492, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_loss": 1.191383719444275, |
|
"eval_runtime": 27.2468, |
|
"eval_samples_per_second": 18.351, |
|
"eval_steps_per_second": 2.312, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 4.771157264709473, |
|
"learning_rate": 1.4500966731115409e-05, |
|
"loss": 1.2956, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"eval_loss": 1.1882104873657227, |
|
"eval_runtime": 27.2582, |
|
"eval_samples_per_second": 18.343, |
|
"eval_steps_per_second": 2.311, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 8.666051864624023, |
|
"learning_rate": 1.4334288952596842e-05, |
|
"loss": 1.3381, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"eval_loss": 1.1888271570205688, |
|
"eval_runtime": 27.2369, |
|
"eval_samples_per_second": 18.357, |
|
"eval_steps_per_second": 2.313, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 4.501246452331543, |
|
"learning_rate": 1.416761117407827e-05, |
|
"loss": 1.4293, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"eval_loss": 1.1847625970840454, |
|
"eval_runtime": 27.239, |
|
"eval_samples_per_second": 18.356, |
|
"eval_steps_per_second": 2.313, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 7.15999174118042, |
|
"learning_rate": 1.4000933395559704e-05, |
|
"loss": 1.2713, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_loss": 1.1845014095306396, |
|
"eval_runtime": 27.2437, |
|
"eval_samples_per_second": 18.353, |
|
"eval_steps_per_second": 2.312, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 4.205693244934082, |
|
"learning_rate": 1.3834255617041136e-05, |
|
"loss": 1.4094, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"eval_loss": 1.1849124431610107, |
|
"eval_runtime": 27.2322, |
|
"eval_samples_per_second": 18.361, |
|
"eval_steps_per_second": 2.313, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 10.90744400024414, |
|
"learning_rate": 1.3667577838522568e-05, |
|
"loss": 1.3321, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"eval_loss": 1.1828891038894653, |
|
"eval_runtime": 27.2545, |
|
"eval_samples_per_second": 18.346, |
|
"eval_steps_per_second": 2.312, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 5.98179292678833, |
|
"learning_rate": 1.3500900060004002e-05, |
|
"loss": 1.3464, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"eval_loss": 1.181032419204712, |
|
"eval_runtime": 27.2781, |
|
"eval_samples_per_second": 18.33, |
|
"eval_steps_per_second": 2.31, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 7.574715614318848, |
|
"learning_rate": 1.3334222281485434e-05, |
|
"loss": 1.3912, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"eval_loss": 1.178390383720398, |
|
"eval_runtime": 27.2322, |
|
"eval_samples_per_second": 18.361, |
|
"eval_steps_per_second": 2.313, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 7.541271209716797, |
|
"learning_rate": 1.3167544502966864e-05, |
|
"loss": 1.3588, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"eval_loss": 1.177061915397644, |
|
"eval_runtime": 27.2447, |
|
"eval_samples_per_second": 18.352, |
|
"eval_steps_per_second": 2.312, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 9.224418640136719, |
|
"learning_rate": 1.3000866724448296e-05, |
|
"loss": 1.3083, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"eval_loss": 1.1750773191452026, |
|
"eval_runtime": 27.2423, |
|
"eval_samples_per_second": 18.354, |
|
"eval_steps_per_second": 2.313, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 9.978618621826172, |
|
"learning_rate": 1.283418894592973e-05, |
|
"loss": 1.2933, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"eval_loss": 1.174636960029602, |
|
"eval_runtime": 27.2335, |
|
"eval_samples_per_second": 18.36, |
|
"eval_steps_per_second": 2.313, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 6.905595779418945, |
|
"learning_rate": 1.2667511167411161e-05, |
|
"loss": 1.3069, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"eval_loss": 1.1744636297225952, |
|
"eval_runtime": 27.2307, |
|
"eval_samples_per_second": 18.362, |
|
"eval_steps_per_second": 2.314, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 7.499482154846191, |
|
"learning_rate": 1.2500833388892593e-05, |
|
"loss": 1.3172, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"eval_loss": 1.173417568206787, |
|
"eval_runtime": 27.2368, |
|
"eval_samples_per_second": 18.357, |
|
"eval_steps_per_second": 2.313, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 4.5594282150268555, |
|
"learning_rate": 1.2334155610374025e-05, |
|
"loss": 1.3392, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"eval_loss": 1.1731162071228027, |
|
"eval_runtime": 27.2663, |
|
"eval_samples_per_second": 18.338, |
|
"eval_steps_per_second": 2.311, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 4.877806663513184, |
|
"learning_rate": 1.2167477831855457e-05, |
|
"loss": 1.2941, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"eval_loss": 1.1754953861236572, |
|
"eval_runtime": 27.2391, |
|
"eval_samples_per_second": 18.356, |
|
"eval_steps_per_second": 2.313, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 6.909502029418945, |
|
"learning_rate": 1.200080005333689e-05, |
|
"loss": 1.3124, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 1.1714600324630737, |
|
"eval_runtime": 27.2351, |
|
"eval_samples_per_second": 18.359, |
|
"eval_steps_per_second": 2.313, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 3.706770181655884, |
|
"learning_rate": 1.1834122274818321e-05, |
|
"loss": 1.2626, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"eval_loss": 1.1713511943817139, |
|
"eval_runtime": 27.2512, |
|
"eval_samples_per_second": 18.348, |
|
"eval_steps_per_second": 2.312, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 4.9828667640686035, |
|
"learning_rate": 1.1667444496299753e-05, |
|
"loss": 1.3136, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"eval_loss": 1.1698399782180786, |
|
"eval_runtime": 27.2353, |
|
"eval_samples_per_second": 18.358, |
|
"eval_steps_per_second": 2.313, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.9339020252227783, |
|
"learning_rate": 1.1500766717781187e-05, |
|
"loss": 1.3125, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"eval_loss": 1.171270489692688, |
|
"eval_runtime": 27.2365, |
|
"eval_samples_per_second": 18.358, |
|
"eval_steps_per_second": 2.313, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 10.367125511169434, |
|
"learning_rate": 1.1334088939262619e-05, |
|
"loss": 1.3017, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"eval_loss": 1.1658000946044922, |
|
"eval_runtime": 27.2436, |
|
"eval_samples_per_second": 18.353, |
|
"eval_steps_per_second": 2.312, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 6.329850196838379, |
|
"learning_rate": 1.116741116074405e-05, |
|
"loss": 1.2615, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 1.1665682792663574, |
|
"eval_runtime": 27.2448, |
|
"eval_samples_per_second": 18.352, |
|
"eval_steps_per_second": 2.312, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 5.759530067443848, |
|
"learning_rate": 1.1000733382225482e-05, |
|
"loss": 1.3781, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"eval_loss": 1.1660109758377075, |
|
"eval_runtime": 27.2383, |
|
"eval_samples_per_second": 18.356, |
|
"eval_steps_per_second": 2.313, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 6.256554126739502, |
|
"learning_rate": 1.0834055603706914e-05, |
|
"loss": 1.2429, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"eval_loss": 1.1605762243270874, |
|
"eval_runtime": 27.2435, |
|
"eval_samples_per_second": 18.353, |
|
"eval_steps_per_second": 2.312, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 6.039596080780029, |
|
"learning_rate": 1.0667377825188346e-05, |
|
"loss": 1.3783, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"eval_loss": 1.1602510213851929, |
|
"eval_runtime": 27.2519, |
|
"eval_samples_per_second": 18.347, |
|
"eval_steps_per_second": 2.312, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 12.013235092163086, |
|
"learning_rate": 1.0500700046669778e-05, |
|
"loss": 1.247, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"eval_loss": 1.1587607860565186, |
|
"eval_runtime": 27.2412, |
|
"eval_samples_per_second": 18.355, |
|
"eval_steps_per_second": 2.313, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 2.92677640914917, |
|
"learning_rate": 1.0334022268151212e-05, |
|
"loss": 1.2676, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"eval_loss": 1.157351016998291, |
|
"eval_runtime": 27.2387, |
|
"eval_samples_per_second": 18.356, |
|
"eval_steps_per_second": 2.313, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 2.9594922065734863, |
|
"learning_rate": 1.0167344489632642e-05, |
|
"loss": 1.1611, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"eval_loss": 1.1563640832901, |
|
"eval_runtime": 27.2391, |
|
"eval_samples_per_second": 18.356, |
|
"eval_steps_per_second": 2.313, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 11.393871307373047, |
|
"learning_rate": 1.0000666711114074e-05, |
|
"loss": 1.2814, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"eval_loss": 1.1527520418167114, |
|
"eval_runtime": 27.2403, |
|
"eval_samples_per_second": 18.355, |
|
"eval_steps_per_second": 2.313, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 13.028400421142578, |
|
"learning_rate": 9.833988932595508e-06, |
|
"loss": 1.2742, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"eval_loss": 1.147485375404358, |
|
"eval_runtime": 27.2339, |
|
"eval_samples_per_second": 18.359, |
|
"eval_steps_per_second": 2.313, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 5.377606391906738, |
|
"learning_rate": 9.667311154076938e-06, |
|
"loss": 1.315, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"eval_loss": 1.1482326984405518, |
|
"eval_runtime": 27.255, |
|
"eval_samples_per_second": 18.345, |
|
"eval_steps_per_second": 2.311, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 6.8872199058532715, |
|
"learning_rate": 9.500633375558372e-06, |
|
"loss": 1.2934, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"eval_loss": 1.1486388444900513, |
|
"eval_runtime": 27.2466, |
|
"eval_samples_per_second": 18.351, |
|
"eval_steps_per_second": 2.312, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 8.581022262573242, |
|
"learning_rate": 9.333955597039803e-06, |
|
"loss": 1.3053, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"eval_loss": 1.1475136280059814, |
|
"eval_runtime": 27.2485, |
|
"eval_samples_per_second": 18.35, |
|
"eval_steps_per_second": 2.312, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 6.223934173583984, |
|
"learning_rate": 9.167277818521235e-06, |
|
"loss": 1.2461, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"eval_loss": 1.1443854570388794, |
|
"eval_runtime": 27.25, |
|
"eval_samples_per_second": 18.349, |
|
"eval_steps_per_second": 2.312, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 7.77838659286499, |
|
"learning_rate": 9.000600040002667e-06, |
|
"loss": 1.2461, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"eval_loss": 1.1398392915725708, |
|
"eval_runtime": 27.2344, |
|
"eval_samples_per_second": 18.359, |
|
"eval_steps_per_second": 2.313, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 6.271127223968506, |
|
"learning_rate": 8.8339222614841e-06, |
|
"loss": 1.2545, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"eval_loss": 1.139143466949463, |
|
"eval_runtime": 27.2452, |
|
"eval_samples_per_second": 18.352, |
|
"eval_steps_per_second": 2.312, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 5.507913112640381, |
|
"learning_rate": 8.667244482965531e-06, |
|
"loss": 1.265, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"eval_loss": 1.1366428136825562, |
|
"eval_runtime": 27.2432, |
|
"eval_samples_per_second": 18.353, |
|
"eval_steps_per_second": 2.312, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 13.688467025756836, |
|
"learning_rate": 8.500566704446963e-06, |
|
"loss": 1.3709, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"eval_loss": 1.133713722229004, |
|
"eval_runtime": 27.2421, |
|
"eval_samples_per_second": 18.354, |
|
"eval_steps_per_second": 2.313, |
|
"step": 9900 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 15000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 100, |
|
"total_flos": 3.73253167214592e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|