|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.36319612590799033, |
|
"eval_steps": 9, |
|
"global_step": 75, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004842615012106538, |
|
"grad_norm": 6.951022148132324, |
|
"learning_rate": 1e-05, |
|
"loss": 4.4499, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.004842615012106538, |
|
"eval_loss": 4.435294151306152, |
|
"eval_runtime": 19.3661, |
|
"eval_samples_per_second": 8.985, |
|
"eval_steps_per_second": 1.136, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.009685230024213076, |
|
"grad_norm": 7.089580535888672, |
|
"learning_rate": 2e-05, |
|
"loss": 4.5236, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.014527845036319613, |
|
"grad_norm": 5.838200092315674, |
|
"learning_rate": 3e-05, |
|
"loss": 4.7935, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.01937046004842615, |
|
"grad_norm": 7.749260425567627, |
|
"learning_rate": 4e-05, |
|
"loss": 4.2716, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.024213075060532687, |
|
"grad_norm": 5.93482780456543, |
|
"learning_rate": 5e-05, |
|
"loss": 4.0175, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.029055690072639227, |
|
"grad_norm": 5.986671447753906, |
|
"learning_rate": 6e-05, |
|
"loss": 4.0304, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.03389830508474576, |
|
"grad_norm": 9.270807266235352, |
|
"learning_rate": 7e-05, |
|
"loss": 3.8371, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.0387409200968523, |
|
"grad_norm": 4.2939581871032715, |
|
"learning_rate": 8e-05, |
|
"loss": 3.419, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.043583535108958835, |
|
"grad_norm": 5.0304856300354, |
|
"learning_rate": 9e-05, |
|
"loss": 3.234, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.043583535108958835, |
|
"eval_loss": 3.272956132888794, |
|
"eval_runtime": 18.9235, |
|
"eval_samples_per_second": 9.195, |
|
"eval_steps_per_second": 1.163, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.048426150121065374, |
|
"grad_norm": 3.3577680587768555, |
|
"learning_rate": 0.0001, |
|
"loss": 3.6277, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.053268765133171914, |
|
"grad_norm": 3.3006627559661865, |
|
"learning_rate": 9.99695413509548e-05, |
|
"loss": 3.0964, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.05811138014527845, |
|
"grad_norm": 3.9956471920013428, |
|
"learning_rate": 9.987820251299122e-05, |
|
"loss": 3.513, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.06295399515738499, |
|
"grad_norm": 3.6589813232421875, |
|
"learning_rate": 9.972609476841367e-05, |
|
"loss": 3.0589, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.06779661016949153, |
|
"grad_norm": 2.8601772785186768, |
|
"learning_rate": 9.951340343707852e-05, |
|
"loss": 3.1075, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.07263922518159806, |
|
"grad_norm": 2.350567579269409, |
|
"learning_rate": 9.924038765061042e-05, |
|
"loss": 2.9396, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0774818401937046, |
|
"grad_norm": 3.1738619804382324, |
|
"learning_rate": 9.890738003669029e-05, |
|
"loss": 3.4016, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.08232445520581114, |
|
"grad_norm": 2.2036235332489014, |
|
"learning_rate": 9.851478631379982e-05, |
|
"loss": 3.2708, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.08716707021791767, |
|
"grad_norm": 1.8742477893829346, |
|
"learning_rate": 9.806308479691595e-05, |
|
"loss": 3.0718, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.08716707021791767, |
|
"eval_loss": 3.21498966217041, |
|
"eval_runtime": 18.9258, |
|
"eval_samples_per_second": 9.194, |
|
"eval_steps_per_second": 1.162, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.09200968523002422, |
|
"grad_norm": 1.7467906475067139, |
|
"learning_rate": 9.755282581475769e-05, |
|
"loss": 3.2518, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.09685230024213075, |
|
"grad_norm": 1.91806161403656, |
|
"learning_rate": 9.698463103929542e-05, |
|
"loss": 3.6172, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.1016949152542373, |
|
"grad_norm": 3.020704746246338, |
|
"learning_rate": 9.635919272833938e-05, |
|
"loss": 3.3863, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.10653753026634383, |
|
"grad_norm": 2.5391972064971924, |
|
"learning_rate": 9.567727288213005e-05, |
|
"loss": 3.3373, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.11138014527845036, |
|
"grad_norm": 4.1953043937683105, |
|
"learning_rate": 9.493970231495835e-05, |
|
"loss": 3.2729, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.1162227602905569, |
|
"grad_norm": 3.60253643989563, |
|
"learning_rate": 9.414737964294636e-05, |
|
"loss": 3.3414, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.12106537530266344, |
|
"grad_norm": 1.7871415615081787, |
|
"learning_rate": 9.330127018922194e-05, |
|
"loss": 3.2021, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.12590799031476999, |
|
"grad_norm": 1.662522792816162, |
|
"learning_rate": 9.24024048078213e-05, |
|
"loss": 3.0991, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.13075060532687652, |
|
"grad_norm": 2.686455011367798, |
|
"learning_rate": 9.145187862775209e-05, |
|
"loss": 3.2832, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.13075060532687652, |
|
"eval_loss": 3.157177209854126, |
|
"eval_runtime": 18.9382, |
|
"eval_samples_per_second": 9.188, |
|
"eval_steps_per_second": 1.162, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.13559322033898305, |
|
"grad_norm": 2.5698466300964355, |
|
"learning_rate": 9.045084971874738e-05, |
|
"loss": 3.1181, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.14043583535108958, |
|
"grad_norm": 1.9049105644226074, |
|
"learning_rate": 8.940053768033609e-05, |
|
"loss": 2.9334, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.14527845036319612, |
|
"grad_norm": 2.964027166366577, |
|
"learning_rate": 8.83022221559489e-05, |
|
"loss": 3.2234, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.15012106537530268, |
|
"grad_norm": 1.3506639003753662, |
|
"learning_rate": 8.715724127386972e-05, |
|
"loss": 3.1192, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.1549636803874092, |
|
"grad_norm": 1.3407896757125854, |
|
"learning_rate": 8.596699001693255e-05, |
|
"loss": 3.0793, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.15980629539951574, |
|
"grad_norm": 2.121093988418579, |
|
"learning_rate": 8.473291852294987e-05, |
|
"loss": 3.0593, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.16464891041162227, |
|
"grad_norm": 1.9535635709762573, |
|
"learning_rate": 8.345653031794292e-05, |
|
"loss": 3.2278, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.1694915254237288, |
|
"grad_norm": 2.7124617099761963, |
|
"learning_rate": 8.213938048432697e-05, |
|
"loss": 3.4606, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.17433414043583534, |
|
"grad_norm": 1.7934988737106323, |
|
"learning_rate": 8.07830737662829e-05, |
|
"loss": 3.1656, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.17433414043583534, |
|
"eval_loss": 3.129340171813965, |
|
"eval_runtime": 18.9783, |
|
"eval_samples_per_second": 9.168, |
|
"eval_steps_per_second": 1.159, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.1791767554479419, |
|
"grad_norm": 1.6445538997650146, |
|
"learning_rate": 7.938926261462366e-05, |
|
"loss": 3.0207, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.18401937046004843, |
|
"grad_norm": 1.3065544366836548, |
|
"learning_rate": 7.795964517353735e-05, |
|
"loss": 2.7292, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.18886198547215496, |
|
"grad_norm": 1.4131739139556885, |
|
"learning_rate": 7.649596321166024e-05, |
|
"loss": 2.8626, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.1937046004842615, |
|
"grad_norm": 3.710179328918457, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 3.2187, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.19854721549636803, |
|
"grad_norm": 1.917811393737793, |
|
"learning_rate": 7.347357813929454e-05, |
|
"loss": 3.1257, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.2033898305084746, |
|
"grad_norm": 1.0857932567596436, |
|
"learning_rate": 7.191855733945387e-05, |
|
"loss": 3.0203, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.20823244552058112, |
|
"grad_norm": 0.9913411736488342, |
|
"learning_rate": 7.033683215379002e-05, |
|
"loss": 2.9028, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.21307506053268765, |
|
"grad_norm": 2.206404685974121, |
|
"learning_rate": 6.873032967079561e-05, |
|
"loss": 3.0664, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.2179176755447942, |
|
"grad_norm": 1.5370384454727173, |
|
"learning_rate": 6.710100716628344e-05, |
|
"loss": 2.9419, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.2179176755447942, |
|
"eval_loss": 3.1047704219818115, |
|
"eval_runtime": 18.9369, |
|
"eval_samples_per_second": 9.188, |
|
"eval_steps_per_second": 1.162, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.22276029055690072, |
|
"grad_norm": 1.9688664674758911, |
|
"learning_rate": 6.545084971874738e-05, |
|
"loss": 3.193, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.22760290556900725, |
|
"grad_norm": 2.1754133701324463, |
|
"learning_rate": 6.378186779084995e-05, |
|
"loss": 2.9325, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.2324455205811138, |
|
"grad_norm": 2.6705334186553955, |
|
"learning_rate": 6.209609477998338e-05, |
|
"loss": 3.2207, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.23728813559322035, |
|
"grad_norm": 1.4950735569000244, |
|
"learning_rate": 6.0395584540887963e-05, |
|
"loss": 3.0829, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.24213075060532688, |
|
"grad_norm": 2.104860782623291, |
|
"learning_rate": 5.868240888334653e-05, |
|
"loss": 3.1599, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.2469733656174334, |
|
"grad_norm": 1.263146996498108, |
|
"learning_rate": 5.695865504800327e-05, |
|
"loss": 2.975, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.25181598062953997, |
|
"grad_norm": 3.4207539558410645, |
|
"learning_rate": 5.522642316338268e-05, |
|
"loss": 3.4015, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.2566585956416465, |
|
"grad_norm": 1.5546566247940063, |
|
"learning_rate": 5.348782368720626e-05, |
|
"loss": 2.9378, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.26150121065375304, |
|
"grad_norm": 2.120374917984009, |
|
"learning_rate": 5.174497483512506e-05, |
|
"loss": 2.9896, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.26150121065375304, |
|
"eval_loss": 3.084446668624878, |
|
"eval_runtime": 18.926, |
|
"eval_samples_per_second": 9.194, |
|
"eval_steps_per_second": 1.162, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.26634382566585957, |
|
"grad_norm": 1.9824186563491821, |
|
"learning_rate": 5e-05, |
|
"loss": 3.0358, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.2711864406779661, |
|
"grad_norm": 1.7688870429992676, |
|
"learning_rate": 4.825502516487497e-05, |
|
"loss": 3.1711, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.27602905569007263, |
|
"grad_norm": 2.102198839187622, |
|
"learning_rate": 4.6512176312793736e-05, |
|
"loss": 3.4714, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.28087167070217917, |
|
"grad_norm": 1.664321780204773, |
|
"learning_rate": 4.477357683661734e-05, |
|
"loss": 3.0841, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 2.4341115951538086, |
|
"learning_rate": 4.3041344951996746e-05, |
|
"loss": 3.2948, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.29055690072639223, |
|
"grad_norm": 0.9612340927124023, |
|
"learning_rate": 4.131759111665349e-05, |
|
"loss": 2.9165, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.29539951573849876, |
|
"grad_norm": 1.2795825004577637, |
|
"learning_rate": 3.960441545911204e-05, |
|
"loss": 3.1389, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.30024213075060535, |
|
"grad_norm": 0.9679549932479858, |
|
"learning_rate": 3.790390522001662e-05, |
|
"loss": 2.9197, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.3050847457627119, |
|
"grad_norm": 1.515199899673462, |
|
"learning_rate": 3.6218132209150045e-05, |
|
"loss": 3.0526, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.3050847457627119, |
|
"eval_loss": 3.055881977081299, |
|
"eval_runtime": 18.9316, |
|
"eval_samples_per_second": 9.191, |
|
"eval_steps_per_second": 1.162, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.3099273607748184, |
|
"grad_norm": 1.8831665515899658, |
|
"learning_rate": 3.4549150281252636e-05, |
|
"loss": 3.1823, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.31476997578692495, |
|
"grad_norm": 1.6448265314102173, |
|
"learning_rate": 3.289899283371657e-05, |
|
"loss": 2.922, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.3196125907990315, |
|
"grad_norm": 1.7221674919128418, |
|
"learning_rate": 3.12696703292044e-05, |
|
"loss": 3.2304, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.324455205811138, |
|
"grad_norm": 1.9639180898666382, |
|
"learning_rate": 2.9663167846209998e-05, |
|
"loss": 3.296, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.32929782082324455, |
|
"grad_norm": 1.215235710144043, |
|
"learning_rate": 2.8081442660546125e-05, |
|
"loss": 2.6854, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.3341404358353511, |
|
"grad_norm": 2.490431785583496, |
|
"learning_rate": 2.6526421860705473e-05, |
|
"loss": 3.0375, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.3389830508474576, |
|
"grad_norm": 1.9049839973449707, |
|
"learning_rate": 2.500000000000001e-05, |
|
"loss": 3.2666, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.34382566585956414, |
|
"grad_norm": 1.5522865056991577, |
|
"learning_rate": 2.350403678833976e-05, |
|
"loss": 2.8866, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.3486682808716707, |
|
"grad_norm": 1.4543166160583496, |
|
"learning_rate": 2.2040354826462668e-05, |
|
"loss": 3.0314, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.3486682808716707, |
|
"eval_loss": 3.0360867977142334, |
|
"eval_runtime": 18.9447, |
|
"eval_samples_per_second": 9.185, |
|
"eval_steps_per_second": 1.161, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.35351089588377727, |
|
"grad_norm": 3.4857094287872314, |
|
"learning_rate": 2.061073738537635e-05, |
|
"loss": 3.3805, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.3583535108958838, |
|
"grad_norm": 1.6463013887405396, |
|
"learning_rate": 1.9216926233717085e-05, |
|
"loss": 3.0626, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.36319612590799033, |
|
"grad_norm": 1.6041393280029297, |
|
"learning_rate": 1.7860619515673033e-05, |
|
"loss": 2.9819, |
|
"step": 75 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.56415462670336e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|