|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 6.589743589743589, |
|
"eval_steps": 5, |
|
"global_step": 133, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05128205128205128, |
|
"grad_norm": 30.696048736572266, |
|
"learning_rate": 2e-07, |
|
"loss": 2.6145, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.05128205128205128, |
|
"eval_loss": 2.721662998199463, |
|
"eval_runtime": 0.1874, |
|
"eval_samples_per_second": 165.427, |
|
"eval_steps_per_second": 26.682, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.10256410256410256, |
|
"grad_norm": 31.234418869018555, |
|
"learning_rate": 4e-07, |
|
"loss": 2.839, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.15384615384615385, |
|
"grad_norm": 26.09066390991211, |
|
"learning_rate": 6e-07, |
|
"loss": 2.804, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.20512820512820512, |
|
"grad_norm": 25.11672019958496, |
|
"learning_rate": 8e-07, |
|
"loss": 2.7178, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.2564102564102564, |
|
"grad_norm": 25.194042205810547, |
|
"learning_rate": 1e-06, |
|
"loss": 2.7668, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.2564102564102564, |
|
"eval_loss": 2.701810359954834, |
|
"eval_runtime": 0.1865, |
|
"eval_samples_per_second": 166.189, |
|
"eval_steps_per_second": 26.805, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"grad_norm": 31.70111656188965, |
|
"learning_rate": 1.2e-06, |
|
"loss": 2.5639, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.358974358974359, |
|
"grad_norm": 25.10308837890625, |
|
"learning_rate": 1.4e-06, |
|
"loss": 2.6011, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.41025641025641024, |
|
"grad_norm": 25.298452377319336, |
|
"learning_rate": 1.6e-06, |
|
"loss": 2.6779, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.46153846153846156, |
|
"grad_norm": 22.12431526184082, |
|
"learning_rate": 1.8e-06, |
|
"loss": 2.5438, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.5128205128205128, |
|
"grad_norm": 17.181961059570312, |
|
"learning_rate": 2e-06, |
|
"loss": 2.6304, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.5128205128205128, |
|
"eval_loss": 2.5064780712127686, |
|
"eval_runtime": 0.1877, |
|
"eval_samples_per_second": 165.144, |
|
"eval_steps_per_second": 26.636, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.5641025641025641, |
|
"grad_norm": 15.064467430114746, |
|
"learning_rate": 1.9998476951563913e-06, |
|
"loss": 2.6119, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 15.15453815460205, |
|
"learning_rate": 1.9993908270190957e-06, |
|
"loss": 2.5618, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 14.976338386535645, |
|
"learning_rate": 1.998629534754574e-06, |
|
"loss": 2.5799, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.717948717948718, |
|
"grad_norm": 16.855302810668945, |
|
"learning_rate": 1.997564050259824e-06, |
|
"loss": 2.4803, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 14.893013954162598, |
|
"learning_rate": 1.9961946980917456e-06, |
|
"loss": 2.3635, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"eval_loss": 2.3580050468444824, |
|
"eval_runtime": 0.1876, |
|
"eval_samples_per_second": 165.285, |
|
"eval_steps_per_second": 26.659, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.8205128205128205, |
|
"grad_norm": 12.848993301391602, |
|
"learning_rate": 1.994521895368273e-06, |
|
"loss": 2.4411, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.8717948717948718, |
|
"grad_norm": 15.440024375915527, |
|
"learning_rate": 1.992546151641322e-06, |
|
"loss": 2.4781, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.9230769230769231, |
|
"grad_norm": 13.695003509521484, |
|
"learning_rate": 1.99026806874157e-06, |
|
"loss": 2.4198, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.9743589743589743, |
|
"grad_norm": 13.504029273986816, |
|
"learning_rate": 1.9876883405951377e-06, |
|
"loss": 2.4088, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 1.0256410256410255, |
|
"grad_norm": 16.270732879638672, |
|
"learning_rate": 1.984807753012208e-06, |
|
"loss": 2.4553, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.0256410256410255, |
|
"eval_loss": 2.281332015991211, |
|
"eval_runtime": 0.1875, |
|
"eval_samples_per_second": 165.302, |
|
"eval_steps_per_second": 26.662, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.0384615384615385, |
|
"grad_norm": 13.558752059936523, |
|
"learning_rate": 1.981627183447664e-06, |
|
"loss": 2.3328, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 1.0897435897435896, |
|
"grad_norm": 13.454627990722656, |
|
"learning_rate": 1.9781476007338054e-06, |
|
"loss": 2.3366, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 1.141025641025641, |
|
"grad_norm": 14.2904052734375, |
|
"learning_rate": 1.9743700647852355e-06, |
|
"loss": 2.174, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 1.1923076923076923, |
|
"grad_norm": 13.595693588256836, |
|
"learning_rate": 1.9702957262759963e-06, |
|
"loss": 2.2358, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 1.2435897435897436, |
|
"grad_norm": 12.418634414672852, |
|
"learning_rate": 1.965925826289068e-06, |
|
"loss": 2.2344, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 1.2435897435897436, |
|
"eval_loss": 2.233912467956543, |
|
"eval_runtime": 0.1904, |
|
"eval_samples_per_second": 162.822, |
|
"eval_steps_per_second": 26.262, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 1.294871794871795, |
|
"grad_norm": 15.914401054382324, |
|
"learning_rate": 1.9612616959383188e-06, |
|
"loss": 2.259, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 1.3461538461538463, |
|
"grad_norm": 12.605673789978027, |
|
"learning_rate": 1.9563047559630356e-06, |
|
"loss": 2.0799, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 1.3974358974358974, |
|
"grad_norm": 13.526497840881348, |
|
"learning_rate": 1.9510565162951534e-06, |
|
"loss": 2.1993, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 1.4487179487179487, |
|
"grad_norm": 12.563177108764648, |
|
"learning_rate": 1.945518575599317e-06, |
|
"loss": 2.2513, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 12.170258522033691, |
|
"learning_rate": 1.9396926207859082e-06, |
|
"loss": 2.4562, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"eval_loss": 2.2017483711242676, |
|
"eval_runtime": 0.1881, |
|
"eval_samples_per_second": 164.78, |
|
"eval_steps_per_second": 26.577, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.5512820512820513, |
|
"grad_norm": 13.118155479431152, |
|
"learning_rate": 1.9335804264972015e-06, |
|
"loss": 2.1825, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 1.6025641025641026, |
|
"grad_norm": 13.182004928588867, |
|
"learning_rate": 1.9271838545667875e-06, |
|
"loss": 2.2352, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.6538461538461537, |
|
"grad_norm": 14.191438674926758, |
|
"learning_rate": 1.9205048534524403e-06, |
|
"loss": 2.2883, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.7051282051282053, |
|
"grad_norm": 13.125994682312012, |
|
"learning_rate": 1.9135454576426007e-06, |
|
"loss": 2.204, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.7564102564102564, |
|
"grad_norm": 13.099204063415527, |
|
"learning_rate": 1.9063077870366499e-06, |
|
"loss": 2.0943, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.7564102564102564, |
|
"eval_loss": 2.1725800037384033, |
|
"eval_runtime": 0.1875, |
|
"eval_samples_per_second": 165.319, |
|
"eval_steps_per_second": 26.664, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.8076923076923077, |
|
"grad_norm": 12.349153518676758, |
|
"learning_rate": 1.8987940462991669e-06, |
|
"loss": 2.2073, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.858974358974359, |
|
"grad_norm": 12.74866008758545, |
|
"learning_rate": 1.8910065241883678e-06, |
|
"loss": 2.2062, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.9102564102564101, |
|
"grad_norm": 10.330320358276367, |
|
"learning_rate": 1.8829475928589268e-06, |
|
"loss": 2.0004, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.9615384615384617, |
|
"grad_norm": 13.375683784484863, |
|
"learning_rate": 1.8746197071393956e-06, |
|
"loss": 1.9728, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 2.0128205128205128, |
|
"grad_norm": 13.092984199523926, |
|
"learning_rate": 1.8660254037844386e-06, |
|
"loss": 2.0695, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 2.0128205128205128, |
|
"eval_loss": 2.1425397396087646, |
|
"eval_runtime": 0.1884, |
|
"eval_samples_per_second": 164.566, |
|
"eval_steps_per_second": 26.543, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 2.0256410256410255, |
|
"grad_norm": 12.576122283935547, |
|
"learning_rate": 1.8571673007021123e-06, |
|
"loss": 2.0414, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 2.076923076923077, |
|
"grad_norm": 13.123306274414062, |
|
"learning_rate": 1.8480480961564257e-06, |
|
"loss": 2.1836, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 2.128205128205128, |
|
"grad_norm": 11.772199630737305, |
|
"learning_rate": 1.838670567945424e-06, |
|
"loss": 2.0555, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 2.1794871794871793, |
|
"grad_norm": 12.407557487487793, |
|
"learning_rate": 1.8290375725550415e-06, |
|
"loss": 1.9841, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 2.230769230769231, |
|
"grad_norm": 10.64401626586914, |
|
"learning_rate": 1.8191520442889917e-06, |
|
"loss": 1.8616, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 2.230769230769231, |
|
"eval_loss": 2.117149591445923, |
|
"eval_runtime": 0.1871, |
|
"eval_samples_per_second": 165.686, |
|
"eval_steps_per_second": 26.724, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 2.282051282051282, |
|
"grad_norm": 11.632575035095215, |
|
"learning_rate": 1.8090169943749474e-06, |
|
"loss": 1.9493, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 2.3333333333333335, |
|
"grad_norm": 13.22929573059082, |
|
"learning_rate": 1.7986355100472927e-06, |
|
"loss": 1.9483, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 2.3846153846153846, |
|
"grad_norm": 13.824577331542969, |
|
"learning_rate": 1.7880107536067217e-06, |
|
"loss": 2.0555, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 2.435897435897436, |
|
"grad_norm": 10.910252571105957, |
|
"learning_rate": 1.7771459614569707e-06, |
|
"loss": 2.1374, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 2.4871794871794872, |
|
"grad_norm": 13.26654052734375, |
|
"learning_rate": 1.766044443118978e-06, |
|
"loss": 2.0498, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 2.4871794871794872, |
|
"eval_loss": 2.1040406227111816, |
|
"eval_runtime": 0.1886, |
|
"eval_samples_per_second": 164.403, |
|
"eval_steps_per_second": 26.517, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 2.5384615384615383, |
|
"grad_norm": 11.703288078308105, |
|
"learning_rate": 1.7547095802227721e-06, |
|
"loss": 1.9002, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 2.58974358974359, |
|
"grad_norm": 13.835978507995605, |
|
"learning_rate": 1.743144825477394e-06, |
|
"loss": 1.988, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 2.641025641025641, |
|
"grad_norm": 14.295548439025879, |
|
"learning_rate": 1.7313537016191704e-06, |
|
"loss": 2.0443, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 2.6923076923076925, |
|
"grad_norm": 11.679184913635254, |
|
"learning_rate": 1.719339800338651e-06, |
|
"loss": 1.9208, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 2.7435897435897436, |
|
"grad_norm": 15.062151908874512, |
|
"learning_rate": 1.7071067811865474e-06, |
|
"loss": 1.9028, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 2.7435897435897436, |
|
"eval_loss": 2.098405361175537, |
|
"eval_runtime": 0.186, |
|
"eval_samples_per_second": 166.69, |
|
"eval_steps_per_second": 26.886, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 2.7948717948717947, |
|
"grad_norm": 14.257363319396973, |
|
"learning_rate": 1.6946583704589972e-06, |
|
"loss": 1.9604, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 2.8461538461538463, |
|
"grad_norm": 12.327591896057129, |
|
"learning_rate": 1.6819983600624985e-06, |
|
"loss": 1.9919, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 2.8974358974358974, |
|
"grad_norm": 14.447932243347168, |
|
"learning_rate": 1.669130606358858e-06, |
|
"loss": 1.9196, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 2.948717948717949, |
|
"grad_norm": 12.253332138061523, |
|
"learning_rate": 1.6560590289905071e-06, |
|
"loss": 1.8955, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 14.021129608154297, |
|
"learning_rate": 1.6427876096865393e-06, |
|
"loss": 1.9057, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 2.084063768386841, |
|
"eval_runtime": 0.1878, |
|
"eval_samples_per_second": 165.031, |
|
"eval_steps_per_second": 26.618, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 3.0128205128205128, |
|
"grad_norm": 12.585602760314941, |
|
"learning_rate": 1.6293203910498375e-06, |
|
"loss": 1.9736, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 3.064102564102564, |
|
"grad_norm": 12.412880897521973, |
|
"learning_rate": 1.615661475325658e-06, |
|
"loss": 1.906, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 3.1153846153846154, |
|
"grad_norm": 12.772639274597168, |
|
"learning_rate": 1.6018150231520484e-06, |
|
"loss": 1.8674, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 3.1666666666666665, |
|
"grad_norm": 9.931306838989258, |
|
"learning_rate": 1.587785252292473e-06, |
|
"loss": 1.8862, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 3.217948717948718, |
|
"grad_norm": 13.5899658203125, |
|
"learning_rate": 1.573576436351046e-06, |
|
"loss": 1.7464, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 3.217948717948718, |
|
"eval_loss": 2.078381061553955, |
|
"eval_runtime": 0.1867, |
|
"eval_samples_per_second": 166.085, |
|
"eval_steps_per_second": 26.788, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 3.269230769230769, |
|
"grad_norm": 11.722041130065918, |
|
"learning_rate": 1.5591929034707466e-06, |
|
"loss": 1.8595, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 3.3205128205128207, |
|
"grad_norm": 12.511164665222168, |
|
"learning_rate": 1.544639035015027e-06, |
|
"loss": 1.8445, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 3.371794871794872, |
|
"grad_norm": 15.670218467712402, |
|
"learning_rate": 1.5299192642332049e-06, |
|
"loss": 1.8044, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 3.423076923076923, |
|
"grad_norm": 12.341389656066895, |
|
"learning_rate": 1.5150380749100543e-06, |
|
"loss": 1.811, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 3.4743589743589745, |
|
"grad_norm": 13.361737251281738, |
|
"learning_rate": 1.5e-06, |
|
"loss": 1.8284, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 3.4743589743589745, |
|
"eval_loss": 2.078845500946045, |
|
"eval_runtime": 0.1879, |
|
"eval_samples_per_second": 164.946, |
|
"eval_steps_per_second": 26.604, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 3.5256410256410255, |
|
"grad_norm": 14.585214614868164, |
|
"learning_rate": 1.4848096202463372e-06, |
|
"loss": 1.7391, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 3.5769230769230766, |
|
"grad_norm": 11.4587984085083, |
|
"learning_rate": 1.4694715627858908e-06, |
|
"loss": 1.8459, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 3.628205128205128, |
|
"grad_norm": 14.638727188110352, |
|
"learning_rate": 1.4539904997395467e-06, |
|
"loss": 1.814, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 3.6794871794871797, |
|
"grad_norm": 15.081775665283203, |
|
"learning_rate": 1.4383711467890773e-06, |
|
"loss": 1.9079, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 3.730769230769231, |
|
"grad_norm": 12.757416725158691, |
|
"learning_rate": 1.4226182617406994e-06, |
|
"loss": 1.8866, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 3.730769230769231, |
|
"eval_loss": 2.0760610103607178, |
|
"eval_runtime": 0.1867, |
|
"eval_samples_per_second": 166.063, |
|
"eval_steps_per_second": 26.784, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 3.782051282051282, |
|
"grad_norm": 14.678832054138184, |
|
"learning_rate": 1.4067366430758004e-06, |
|
"loss": 1.7503, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 3.8333333333333335, |
|
"grad_norm": 15.981603622436523, |
|
"learning_rate": 1.3907311284892735e-06, |
|
"loss": 1.7984, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 3.8846153846153846, |
|
"grad_norm": 14.856511116027832, |
|
"learning_rate": 1.374606593415912e-06, |
|
"loss": 1.7843, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 3.935897435897436, |
|
"grad_norm": 14.275514602661133, |
|
"learning_rate": 1.3583679495453e-06, |
|
"loss": 1.7888, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 3.9871794871794872, |
|
"grad_norm": 12.734882354736328, |
|
"learning_rate": 1.3420201433256689e-06, |
|
"loss": 1.8927, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 3.9871794871794872, |
|
"eval_loss": 2.067340135574341, |
|
"eval_runtime": 0.1861, |
|
"eval_samples_per_second": 166.583, |
|
"eval_steps_per_second": 26.868, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 4.038461538461538, |
|
"grad_norm": 14.663799285888672, |
|
"learning_rate": 1.3255681544571566e-06, |
|
"loss": 1.7531, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 4.051282051282051, |
|
"grad_norm": 12.570903778076172, |
|
"learning_rate": 1.3090169943749473e-06, |
|
"loss": 1.7588, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 4.102564102564102, |
|
"grad_norm": 11.108199119567871, |
|
"learning_rate": 1.2923717047227368e-06, |
|
"loss": 1.6173, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 4.153846153846154, |
|
"grad_norm": 14.328954696655273, |
|
"learning_rate": 1.275637355816999e-06, |
|
"loss": 1.7411, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 4.205128205128205, |
|
"grad_norm": 14.140481948852539, |
|
"learning_rate": 1.2588190451025207e-06, |
|
"loss": 1.5778, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 4.205128205128205, |
|
"eval_loss": 2.0778791904449463, |
|
"eval_runtime": 0.1891, |
|
"eval_samples_per_second": 163.917, |
|
"eval_steps_per_second": 26.438, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 4.256410256410256, |
|
"grad_norm": 13.933786392211914, |
|
"learning_rate": 1.2419218955996676e-06, |
|
"loss": 1.5578, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 4.3076923076923075, |
|
"grad_norm": 16.1457462310791, |
|
"learning_rate": 1.2249510543438651e-06, |
|
"loss": 1.6873, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 4.358974358974359, |
|
"grad_norm": 16.26984977722168, |
|
"learning_rate": 1.207911690817759e-06, |
|
"loss": 1.6605, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 4.410256410256411, |
|
"grad_norm": 19.391223907470703, |
|
"learning_rate": 1.1908089953765447e-06, |
|
"loss": 1.6272, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 4.461538461538462, |
|
"grad_norm": 19.38517951965332, |
|
"learning_rate": 1.1736481776669305e-06, |
|
"loss": 1.7274, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 4.461538461538462, |
|
"eval_loss": 2.0934271812438965, |
|
"eval_runtime": 0.1874, |
|
"eval_samples_per_second": 165.396, |
|
"eval_steps_per_second": 26.677, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 4.512820512820513, |
|
"grad_norm": 16.367389678955078, |
|
"learning_rate": 1.156434465040231e-06, |
|
"loss": 1.8406, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 4.564102564102564, |
|
"grad_norm": 18.22227668762207, |
|
"learning_rate": 1.1391731009600653e-06, |
|
"loss": 1.7469, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 4.615384615384615, |
|
"grad_norm": 14.44421100616455, |
|
"learning_rate": 1.1218693434051474e-06, |
|
"loss": 1.5867, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 4.666666666666667, |
|
"grad_norm": 13.295368194580078, |
|
"learning_rate": 1.1045284632676535e-06, |
|
"loss": 1.7081, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 4.717948717948718, |
|
"grad_norm": 15.499272346496582, |
|
"learning_rate": 1.0871557427476583e-06, |
|
"loss": 1.7431, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 4.717948717948718, |
|
"eval_loss": 2.065159559249878, |
|
"eval_runtime": 0.1863, |
|
"eval_samples_per_second": 166.408, |
|
"eval_steps_per_second": 26.84, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 4.769230769230769, |
|
"grad_norm": 15.949275016784668, |
|
"learning_rate": 1.069756473744125e-06, |
|
"loss": 1.6641, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 4.82051282051282, |
|
"grad_norm": 13.781301498413086, |
|
"learning_rate": 1.052335956242944e-06, |
|
"loss": 1.5421, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 4.871794871794872, |
|
"grad_norm": 16.268604278564453, |
|
"learning_rate": 1.034899496702501e-06, |
|
"loss": 1.7906, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 4.923076923076923, |
|
"grad_norm": 12.881053924560547, |
|
"learning_rate": 1.0174524064372837e-06, |
|
"loss": 1.7359, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 4.9743589743589745, |
|
"grad_norm": 15.596150398254395, |
|
"learning_rate": 1e-06, |
|
"loss": 1.8728, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 4.9743589743589745, |
|
"eval_loss": 2.0617754459381104, |
|
"eval_runtime": 0.1875, |
|
"eval_samples_per_second": 165.345, |
|
"eval_steps_per_second": 26.668, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 5.0256410256410255, |
|
"grad_norm": 16.61153221130371, |
|
"learning_rate": 9.825475935627165e-07, |
|
"loss": 1.6729, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 5.038461538461538, |
|
"grad_norm": 13.130430221557617, |
|
"learning_rate": 9.651005032974993e-07, |
|
"loss": 1.6707, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 5.089743589743589, |
|
"grad_norm": 14.977300643920898, |
|
"learning_rate": 9.476640437570561e-07, |
|
"loss": 1.5516, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 5.141025641025641, |
|
"grad_norm": 17.314029693603516, |
|
"learning_rate": 9.302435262558747e-07, |
|
"loss": 1.6449, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 5.1923076923076925, |
|
"grad_norm": 15.75112247467041, |
|
"learning_rate": 9.128442572523417e-07, |
|
"loss": 1.5729, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 5.1923076923076925, |
|
"eval_loss": 2.083660125732422, |
|
"eval_runtime": 0.187, |
|
"eval_samples_per_second": 165.747, |
|
"eval_steps_per_second": 26.733, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 5.243589743589744, |
|
"grad_norm": 19.511394500732422, |
|
"learning_rate": 8.954715367323466e-07, |
|
"loss": 1.5756, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 5.294871794871795, |
|
"grad_norm": 16.741764068603516, |
|
"learning_rate": 8.781306565948526e-07, |
|
"loss": 1.6627, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 5.346153846153846, |
|
"grad_norm": 16.6429443359375, |
|
"learning_rate": 8.608268990399348e-07, |
|
"loss": 1.6097, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 5.397435897435898, |
|
"grad_norm": 22.457843780517578, |
|
"learning_rate": 8.435655349597689e-07, |
|
"loss": 1.6192, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 5.448717948717949, |
|
"grad_norm": 13.546624183654785, |
|
"learning_rate": 8.263518223330696e-07, |
|
"loss": 1.4631, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 5.448717948717949, |
|
"eval_loss": 2.087294816970825, |
|
"eval_runtime": 0.1887, |
|
"eval_samples_per_second": 164.276, |
|
"eval_steps_per_second": 26.496, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"grad_norm": 16.943618774414062, |
|
"learning_rate": 8.091910046234551e-07, |
|
"loss": 1.5529, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 5.551282051282051, |
|
"grad_norm": 17.719892501831055, |
|
"learning_rate": 7.920883091822408e-07, |
|
"loss": 1.7165, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 5.602564102564102, |
|
"grad_norm": 14.0659818649292, |
|
"learning_rate": 7.750489456561351e-07, |
|
"loss": 1.5024, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 5.653846153846154, |
|
"grad_norm": 17.86212921142578, |
|
"learning_rate": 7.580781044003324e-07, |
|
"loss": 1.5745, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 5.705128205128205, |
|
"grad_norm": 17.252527236938477, |
|
"learning_rate": 7.411809548974791e-07, |
|
"loss": 1.4758, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 5.705128205128205, |
|
"eval_loss": 2.074392557144165, |
|
"eval_runtime": 0.1875, |
|
"eval_samples_per_second": 165.36, |
|
"eval_steps_per_second": 26.671, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 5.756410256410256, |
|
"grad_norm": 18.326730728149414, |
|
"learning_rate": 7.243626441830009e-07, |
|
"loss": 1.5874, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 5.8076923076923075, |
|
"grad_norm": 14.133539199829102, |
|
"learning_rate": 7.076282952772633e-07, |
|
"loss": 1.4556, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 5.858974358974359, |
|
"grad_norm": 16.187454223632812, |
|
"learning_rate": 6.909830056250526e-07, |
|
"loss": 1.5353, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 5.910256410256411, |
|
"grad_norm": 18.15951919555664, |
|
"learning_rate": 6.744318455428435e-07, |
|
"loss": 1.6346, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 5.961538461538462, |
|
"grad_norm": 14.860916137695312, |
|
"learning_rate": 6.579798566743313e-07, |
|
"loss": 1.5289, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 5.961538461538462, |
|
"eval_loss": 2.0899431705474854, |
|
"eval_runtime": 0.1896, |
|
"eval_samples_per_second": 163.49, |
|
"eval_steps_per_second": 26.369, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 6.012820512820513, |
|
"grad_norm": 23.091646194458008, |
|
"learning_rate": 6.416320504546997e-07, |
|
"loss": 1.6633, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 6.0256410256410255, |
|
"grad_norm": 19.409482955932617, |
|
"learning_rate": 6.253934065840879e-07, |
|
"loss": 1.6998, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 6.076923076923077, |
|
"grad_norm": 15.723928451538086, |
|
"learning_rate": 6.092688715107263e-07, |
|
"loss": 1.5407, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 6.128205128205128, |
|
"grad_norm": 17.410001754760742, |
|
"learning_rate": 5.932633569241999e-07, |
|
"loss": 1.4682, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 6.17948717948718, |
|
"grad_norm": 15.949166297912598, |
|
"learning_rate": 5.773817382593007e-07, |
|
"loss": 1.515, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 6.17948717948718, |
|
"eval_loss": 2.091871500015259, |
|
"eval_runtime": 0.1884, |
|
"eval_samples_per_second": 164.579, |
|
"eval_steps_per_second": 26.545, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 6.230769230769231, |
|
"grad_norm": 19.262935638427734, |
|
"learning_rate": 5.616288532109224e-07, |
|
"loss": 1.4557, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 6.282051282051282, |
|
"grad_norm": 18.071447372436523, |
|
"learning_rate": 5.460095002604532e-07, |
|
"loss": 1.4763, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 6.333333333333333, |
|
"grad_norm": 14.22094440460205, |
|
"learning_rate": 5.305284372141095e-07, |
|
"loss": 1.3375, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 6.384615384615385, |
|
"grad_norm": 19.112789154052734, |
|
"learning_rate": 5.15190379753663e-07, |
|
"loss": 1.5896, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 6.435897435897436, |
|
"grad_norm": 19.069456100463867, |
|
"learning_rate": 5.000000000000002e-07, |
|
"loss": 1.5757, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 6.435897435897436, |
|
"eval_loss": 2.0978188514709473, |
|
"eval_runtime": 0.1888, |
|
"eval_samples_per_second": 164.22, |
|
"eval_steps_per_second": 26.487, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 6.487179487179487, |
|
"grad_norm": 16.8870792388916, |
|
"learning_rate": 4.849619250899458e-07, |
|
"loss": 1.4204, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 6.538461538461538, |
|
"grad_norm": 20.033496856689453, |
|
"learning_rate": 4.700807357667952e-07, |
|
"loss": 1.6698, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 6.589743589743589, |
|
"grad_norm": 18.386215209960938, |
|
"learning_rate": 4.5536096498497287e-07, |
|
"loss": 1.4692, |
|
"step": 133 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 190, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 19, |
|
"total_flos": 1.733580238744453e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|