{
  "best_metric": 0.5393198132514954,
  "best_model_checkpoint": "/media/user/Expansion/flan-t5-small-ner/checkpoint-99955",
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 99955,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02501125506477915,
      "grad_norm": 27.66358757019043,
      "learning_rate": 4.974988744935221e-05,
      "loss": 4.6267,
      "num_input_tokens_seen": 1673336,
      "step": 500
    },
    {
      "epoch": 0.0500225101295583,
      "grad_norm": 9.223219871520996,
      "learning_rate": 4.949977489870442e-05,
      "loss": 2.1909,
      "num_input_tokens_seen": 3361736,
      "step": 1000
    },
    {
      "epoch": 0.07503376519433745,
      "grad_norm": 15.481683731079102,
      "learning_rate": 4.924966234805663e-05,
      "loss": 1.8355,
      "num_input_tokens_seen": 5013800,
      "step": 1500
    },
    {
      "epoch": 0.1000450202591166,
      "grad_norm": 14.396512985229492,
      "learning_rate": 4.8999549797408836e-05,
      "loss": 1.5648,
      "num_input_tokens_seen": 6667312,
      "step": 2000
    },
    {
      "epoch": 0.12505627532389577,
      "grad_norm": 7.068989276885986,
      "learning_rate": 4.8749437246761046e-05,
      "loss": 1.4252,
      "num_input_tokens_seen": 8347016,
      "step": 2500
    },
    {
      "epoch": 0.1500675303886749,
      "grad_norm": 11.330971717834473,
      "learning_rate": 4.849932469611325e-05,
      "loss": 1.3972,
      "num_input_tokens_seen": 10008296,
      "step": 3000
    },
    {
      "epoch": 0.17507878545345407,
      "grad_norm": 9.403321266174316,
      "learning_rate": 4.824921214546546e-05,
      "loss": 1.3001,
      "num_input_tokens_seen": 11658808,
      "step": 3500
    },
    {
      "epoch": 0.2000900405182332,
      "grad_norm": 8.147115707397461,
      "learning_rate": 4.799909959481767e-05,
      "loss": 1.2625,
      "num_input_tokens_seen": 13331648,
      "step": 4000
    },
    {
      "epoch": 0.22510129558301237,
      "grad_norm": 13.405184745788574,
      "learning_rate": 4.774898704416988e-05,
      "loss": 1.1839,
      "num_input_tokens_seen": 14982440,
      "step": 4500
    },
    {
      "epoch": 0.25011255064779153,
      "grad_norm": 20.70949363708496,
      "learning_rate": 4.749887449352209e-05,
      "loss": 1.1598,
      "num_input_tokens_seen": 16633632,
      "step": 5000
    },
    {
      "epoch": 0.27512380571257067,
      "grad_norm": 16.94267463684082,
      "learning_rate": 4.72487619428743e-05,
      "loss": 1.1473,
      "num_input_tokens_seen": 18311672,
      "step": 5500
    },
    {
      "epoch": 0.3001350607773498,
      "grad_norm": 8.609989166259766,
      "learning_rate": 4.69986493922265e-05,
      "loss": 1.1098,
      "num_input_tokens_seen": 19980456,
      "step": 6000
    },
    {
      "epoch": 0.32514631584212894,
      "grad_norm": 9.003643989562988,
      "learning_rate": 4.674853684157871e-05,
      "loss": 1.0973,
      "num_input_tokens_seen": 21646328,
      "step": 6500
    },
    {
      "epoch": 0.35015757090690813,
      "grad_norm": 18.364194869995117,
      "learning_rate": 4.649842429093092e-05,
      "loss": 1.0987,
      "num_input_tokens_seen": 23277400,
      "step": 7000
    },
    {
      "epoch": 0.37516882597168727,
      "grad_norm": 13.544733047485352,
      "learning_rate": 4.624831174028313e-05,
      "loss": 1.0642,
      "num_input_tokens_seen": 24915304,
      "step": 7500
    },
    {
      "epoch": 0.4001800810364664,
      "grad_norm": 14.257452011108398,
      "learning_rate": 4.5998199189635336e-05,
      "loss": 1.0414,
      "num_input_tokens_seen": 26590576,
      "step": 8000
    },
    {
      "epoch": 0.42519133610124554,
      "grad_norm": 10.29515266418457,
      "learning_rate": 4.5748086638987546e-05,
      "loss": 1.0634,
      "num_input_tokens_seen": 28236280,
      "step": 8500
    },
    {
      "epoch": 0.45020259116602473,
      "grad_norm": 13.840631484985352,
      "learning_rate": 4.5497974088339756e-05,
      "loss": 0.9817,
      "num_input_tokens_seen": 29891480,
      "step": 9000
    },
    {
      "epoch": 0.47521384623080387,
      "grad_norm": 12.118327140808105,
      "learning_rate": 4.5247861537691966e-05,
      "loss": 1.0122,
      "num_input_tokens_seen": 31551000,
      "step": 9500
    },
    {
      "epoch": 0.5002251012955831,
      "grad_norm": 8.115203857421875,
      "learning_rate": 4.499774898704417e-05,
      "loss": 0.9802,
      "num_input_tokens_seen": 33221384,
      "step": 10000
    },
    {
      "epoch": 0.5252363563603621,
      "grad_norm": 8.905954360961914,
      "learning_rate": 4.474763643639638e-05,
      "loss": 0.9796,
      "num_input_tokens_seen": 34891392,
      "step": 10500
    },
    {
      "epoch": 0.5502476114251413,
      "grad_norm": 10.70656681060791,
      "learning_rate": 4.449752388574859e-05,
      "loss": 1.0031,
      "num_input_tokens_seen": 36518768,
      "step": 11000
    },
    {
      "epoch": 0.5752588664899204,
      "grad_norm": 12.424896240234375,
      "learning_rate": 4.42474113351008e-05,
      "loss": 0.9591,
      "num_input_tokens_seen": 38147456,
      "step": 11500
    },
    {
      "epoch": 0.6002701215546996,
      "grad_norm": 10.77695083618164,
      "learning_rate": 4.399729878445301e-05,
      "loss": 0.9338,
      "num_input_tokens_seen": 39823976,
      "step": 12000
    },
    {
      "epoch": 0.6252813766194788,
      "grad_norm": 12.77743911743164,
      "learning_rate": 4.374718623380521e-05,
      "loss": 0.9112,
      "num_input_tokens_seen": 41493480,
      "step": 12500
    },
    {
      "epoch": 0.6502926316842579,
      "grad_norm": 16.060897827148438,
      "learning_rate": 4.349707368315742e-05,
      "loss": 0.915,
      "num_input_tokens_seen": 43130832,
      "step": 13000
    },
    {
      "epoch": 0.6753038867490371,
      "grad_norm": 17.562183380126953,
      "learning_rate": 4.324696113250963e-05,
      "loss": 0.9096,
      "num_input_tokens_seen": 44779392,
      "step": 13500
    },
    {
      "epoch": 0.7003151418138163,
      "grad_norm": 12.406323432922363,
      "learning_rate": 4.2996848581861835e-05,
      "loss": 0.9499,
      "num_input_tokens_seen": 46433856,
      "step": 14000
    },
    {
      "epoch": 0.7253263968785953,
      "grad_norm": 15.567843437194824,
      "learning_rate": 4.2746736031214045e-05,
      "loss": 0.923,
      "num_input_tokens_seen": 48102016,
      "step": 14500
    },
    {
      "epoch": 0.7503376519433745,
      "grad_norm": 9.45335578918457,
      "learning_rate": 4.2496623480566255e-05,
      "loss": 0.9285,
      "num_input_tokens_seen": 49796432,
      "step": 15000
    },
    {
      "epoch": 0.7753489070081536,
      "grad_norm": 7.158623695373535,
      "learning_rate": 4.2246510929918465e-05,
      "loss": 0.9023,
      "num_input_tokens_seen": 51432848,
      "step": 15500
    },
    {
      "epoch": 0.8003601620729328,
      "grad_norm": 9.542813301086426,
      "learning_rate": 4.1996398379270675e-05,
      "loss": 0.9237,
      "num_input_tokens_seen": 53083496,
      "step": 16000
    },
    {
      "epoch": 0.825371417137712,
      "grad_norm": 10.027923583984375,
      "learning_rate": 4.1746285828622885e-05,
      "loss": 0.8813,
      "num_input_tokens_seen": 54755032,
      "step": 16500
    },
    {
      "epoch": 0.8503826722024911,
      "grad_norm": 18.8748722076416,
      "learning_rate": 4.1496173277975095e-05,
      "loss": 0.9036,
      "num_input_tokens_seen": 56411184,
      "step": 17000
    },
    {
      "epoch": 0.8753939272672703,
      "grad_norm": 12.792276382446289,
      "learning_rate": 4.12460607273273e-05,
      "loss": 0.8589,
      "num_input_tokens_seen": 58070520,
      "step": 17500
    },
    {
      "epoch": 0.9004051823320495,
      "grad_norm": 6.8420491218566895,
      "learning_rate": 4.09959481766795e-05,
      "loss": 0.8855,
      "num_input_tokens_seen": 59745800,
      "step": 18000
    },
    {
      "epoch": 0.9254164373968286,
      "grad_norm": 9.066823959350586,
      "learning_rate": 4.074583562603171e-05,
      "loss": 0.8773,
      "num_input_tokens_seen": 61457288,
      "step": 18500
    },
    {
      "epoch": 0.9504276924616077,
      "grad_norm": 7.002307415008545,
      "learning_rate": 4.049572307538392e-05,
      "loss": 0.8747,
      "num_input_tokens_seen": 63139928,
      "step": 19000
    },
    {
      "epoch": 0.9754389475263868,
      "grad_norm": 14.685755729675293,
      "learning_rate": 4.024561052473613e-05,
      "loss": 0.8398,
      "num_input_tokens_seen": 64811920,
      "step": 19500
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.6227446794509888,
      "eval_runtime": 96.3481,
      "eval_samples_per_second": 414.964,
      "eval_steps_per_second": 51.874,
      "num_input_tokens_seen": 66451084,
      "step": 19991
    },
    {
      "epoch": 1.0004502025911661,
      "grad_norm": 13.560747146606445,
      "learning_rate": 3.999549797408834e-05,
      "loss": 0.852,
      "num_input_tokens_seen": 66482076,
      "step": 20000
    },
    {
      "epoch": 1.025461457655945,
      "grad_norm": 4.446373462677002,
      "learning_rate": 3.974538542344055e-05,
      "loss": 0.7973,
      "num_input_tokens_seen": 68132180,
      "step": 20500
    },
    {
      "epoch": 1.0504727127207243,
      "grad_norm": 3.456674098968506,
      "learning_rate": 3.949527287279276e-05,
      "loss": 0.8215,
      "num_input_tokens_seen": 69804380,
      "step": 21000
    },
    {
      "epoch": 1.0754839677855035,
      "grad_norm": 8.283075332641602,
      "learning_rate": 3.924516032214497e-05,
      "loss": 0.8081,
      "num_input_tokens_seen": 71452668,
      "step": 21500
    },
    {
      "epoch": 1.1004952228502827,
      "grad_norm": 9.358149528503418,
      "learning_rate": 3.8995047771497175e-05,
      "loss": 0.7991,
      "num_input_tokens_seen": 73104948,
      "step": 22000
    },
    {
      "epoch": 1.1255064779150619,
      "grad_norm": 9.011244773864746,
      "learning_rate": 3.8744935220849385e-05,
      "loss": 0.7839,
      "num_input_tokens_seen": 74751164,
      "step": 22500
    },
    {
      "epoch": 1.1505177329798408,
      "grad_norm": 5.775268077850342,
      "learning_rate": 3.849482267020159e-05,
      "loss": 0.7515,
      "num_input_tokens_seen": 76431460,
      "step": 23000
    },
    {
      "epoch": 1.17552898804462,
      "grad_norm": 13.273436546325684,
      "learning_rate": 3.82447101195538e-05,
      "loss": 0.7821,
      "num_input_tokens_seen": 78092124,
      "step": 23500
    },
    {
      "epoch": 1.2005402431093992,
      "grad_norm": 10.351176261901855,
      "learning_rate": 3.799459756890601e-05,
      "loss": 0.772,
      "num_input_tokens_seen": 79736012,
      "step": 24000
    },
    {
      "epoch": 1.2255514981741784,
      "grad_norm": 14.834792137145996,
      "learning_rate": 3.774448501825822e-05,
      "loss": 0.78,
      "num_input_tokens_seen": 81414220,
      "step": 24500
    },
    {
      "epoch": 1.2505627532389576,
      "grad_norm": 14.160717964172363,
      "learning_rate": 3.749437246761043e-05,
      "loss": 0.7767,
      "num_input_tokens_seen": 83081932,
      "step": 25000
    },
    {
      "epoch": 1.2755740083037366,
      "grad_norm": 8.410615921020508,
      "learning_rate": 3.724425991696264e-05,
      "loss": 0.7665,
      "num_input_tokens_seen": 84745948,
      "step": 25500
    },
    {
      "epoch": 1.3005852633685158,
      "grad_norm": 7.881125450134277,
      "learning_rate": 3.699414736631484e-05,
      "loss": 0.7626,
      "num_input_tokens_seen": 86421180,
      "step": 26000
    },
    {
      "epoch": 1.325596518433295,
      "grad_norm": 21.633901596069336,
      "learning_rate": 3.674403481566705e-05,
      "loss": 0.7645,
      "num_input_tokens_seen": 88075204,
      "step": 26500
    },
    {
      "epoch": 1.3506077734980741,
      "grad_norm": 14.725602149963379,
      "learning_rate": 3.649392226501926e-05,
      "loss": 0.751,
      "num_input_tokens_seen": 89740116,
      "step": 27000
    },
    {
      "epoch": 1.3756190285628533,
      "grad_norm": 6.119060039520264,
      "learning_rate": 3.6243809714371465e-05,
      "loss": 0.756,
      "num_input_tokens_seen": 91410556,
      "step": 27500
    },
    {
      "epoch": 1.4006302836276325,
      "grad_norm": 6.520070552825928,
      "learning_rate": 3.5993697163723675e-05,
      "loss": 0.7526,
      "num_input_tokens_seen": 93116396,
      "step": 28000
    },
    {
      "epoch": 1.4256415386924115,
      "grad_norm": 7.963521480560303,
      "learning_rate": 3.5743584613075885e-05,
      "loss": 0.7645,
      "num_input_tokens_seen": 94761716,
      "step": 28500
    },
    {
      "epoch": 1.4506527937571907,
      "grad_norm": 11.38167953491211,
      "learning_rate": 3.5493472062428095e-05,
      "loss": 0.7624,
      "num_input_tokens_seen": 96449700,
      "step": 29000
    },
    {
      "epoch": 1.4756640488219699,
      "grad_norm": 15.715912818908691,
      "learning_rate": 3.5243359511780305e-05,
      "loss": 0.7509,
      "num_input_tokens_seen": 98102252,
      "step": 29500
    },
    {
      "epoch": 1.500675303886749,
      "grad_norm": 7.735713005065918,
      "learning_rate": 3.499324696113251e-05,
      "loss": 0.7738,
      "num_input_tokens_seen": 99780396,
      "step": 30000
    },
    {
      "epoch": 1.525686558951528,
      "grad_norm": 8.079352378845215,
      "learning_rate": 3.474313441048472e-05,
      "loss": 0.7522,
      "num_input_tokens_seen": 101479956,
      "step": 30500
    },
    {
      "epoch": 1.5506978140163072,
      "grad_norm": 8.290655136108398,
      "learning_rate": 3.449302185983693e-05,
      "loss": 0.7381,
      "num_input_tokens_seen": 103149500,
      "step": 31000
    },
    {
      "epoch": 1.5757090690810864,
      "grad_norm": 8.904264450073242,
      "learning_rate": 3.424290930918914e-05,
      "loss": 0.7467,
      "num_input_tokens_seen": 104812996,
      "step": 31500
    },
    {
      "epoch": 1.6007203241458656,
      "grad_norm": 7.439008712768555,
      "learning_rate": 3.399279675854135e-05,
      "loss": 0.7507,
      "num_input_tokens_seen": 106479036,
      "step": 32000
    },
    {
      "epoch": 1.6257315792106448,
      "grad_norm": 7.584664344787598,
      "learning_rate": 3.374268420789355e-05,
      "loss": 0.7168,
      "num_input_tokens_seen": 108141364,
      "step": 32500
    },
    {
      "epoch": 1.650742834275424,
      "grad_norm": 8.953302383422852,
      "learning_rate": 3.349257165724576e-05,
      "loss": 0.7469,
      "num_input_tokens_seen": 109799916,
      "step": 33000
    },
    {
      "epoch": 1.6757540893402032,
      "grad_norm": 10.678362846374512,
      "learning_rate": 3.324245910659797e-05,
      "loss": 0.7468,
      "num_input_tokens_seen": 111436748,
      "step": 33500
    },
    {
      "epoch": 1.7007653444049824,
      "grad_norm": 11.628217697143555,
      "learning_rate": 3.2992346555950175e-05,
      "loss": 0.7358,
      "num_input_tokens_seen": 113068476,
      "step": 34000
    },
    {
      "epoch": 1.7257765994697614,
      "grad_norm": 12.741203308105469,
      "learning_rate": 3.2742234005302385e-05,
      "loss": 0.7402,
      "num_input_tokens_seen": 114748860,
      "step": 34500
    },
    {
      "epoch": 1.7507878545345406,
      "grad_norm": 9.066828727722168,
      "learning_rate": 3.2492121454654595e-05,
      "loss": 0.7728,
      "num_input_tokens_seen": 116441684,
      "step": 35000
    },
    {
      "epoch": 1.7757991095993197,
      "grad_norm": 7.780086517333984,
      "learning_rate": 3.2242008904006805e-05,
      "loss": 0.7424,
      "num_input_tokens_seen": 118093652,
      "step": 35500
    },
    {
      "epoch": 1.8008103646640987,
      "grad_norm": 5.290003299713135,
      "learning_rate": 3.1991896353359015e-05,
      "loss": 0.7121,
      "num_input_tokens_seen": 119756772,
      "step": 36000
    },
    {
      "epoch": 1.825821619728878,
      "grad_norm": 13.356730461120605,
      "learning_rate": 3.1741783802711225e-05,
      "loss": 0.789,
      "num_input_tokens_seen": 121419852,
      "step": 36500
    },
    {
      "epoch": 1.850832874793657,
      "grad_norm": 4.2140727043151855,
      "learning_rate": 3.149167125206343e-05,
      "loss": 0.7501,
      "num_input_tokens_seen": 123080420,
      "step": 37000
    },
    {
      "epoch": 1.8758441298584363,
      "grad_norm": 15.408193588256836,
      "learning_rate": 3.124155870141564e-05,
      "loss": 0.7576,
      "num_input_tokens_seen": 124733724,
      "step": 37500
    },
    {
      "epoch": 1.9008553849232155,
      "grad_norm": 8.88025951385498,
      "learning_rate": 3.099144615076784e-05,
      "loss": 0.7315,
      "num_input_tokens_seen": 126386636,
      "step": 38000
    },
    {
      "epoch": 1.9258666399879947,
      "grad_norm": 15.850674629211426,
      "learning_rate": 3.074133360012005e-05,
      "loss": 0.7289,
      "num_input_tokens_seen": 128054932,
      "step": 38500
    },
    {
      "epoch": 1.9508778950527739,
      "grad_norm": 10.460667610168457,
      "learning_rate": 3.049122104947226e-05,
      "loss": 0.7375,
      "num_input_tokens_seen": 129731780,
      "step": 39000
    },
    {
      "epoch": 1.975889150117553,
      "grad_norm": 4.816532135009766,
      "learning_rate": 3.024110849882447e-05,
      "loss": 0.7203,
      "num_input_tokens_seen": 131377564,
      "step": 39500
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.5678554773330688,
      "eval_runtime": 97.2769,
      "eval_samples_per_second": 411.002,
      "eval_steps_per_second": 51.379,
      "num_input_tokens_seen": 132976438,
      "step": 39982
    },
    {
      "epoch": 2.0009004051823323,
      "grad_norm": 8.531465530395508,
      "learning_rate": 2.999099594817668e-05,
      "loss": 0.7337,
      "num_input_tokens_seen": 133038726,
      "step": 40000
    },
    {
      "epoch": 2.025911660247111,
      "grad_norm": 17.74102783203125,
      "learning_rate": 2.974088339752889e-05,
      "loss": 0.6798,
      "num_input_tokens_seen": 134681590,
      "step": 40500
    },
    {
      "epoch": 2.05092291531189,
      "grad_norm": 16.203670501708984,
      "learning_rate": 2.9490770846881098e-05,
      "loss": 0.692,
      "num_input_tokens_seen": 136354910,
      "step": 41000
    },
    {
      "epoch": 2.0759341703766694,
      "grad_norm": 11.238871574401855,
      "learning_rate": 2.9240658296233308e-05,
      "loss": 0.653,
      "num_input_tokens_seen": 138014246,
      "step": 41500
    },
    {
      "epoch": 2.1009454254414486,
      "grad_norm": 8.781373023986816,
      "learning_rate": 2.899054574558551e-05,
      "loss": 0.6742,
      "num_input_tokens_seen": 139676526,
      "step": 42000
    },
    {
      "epoch": 2.1259566805062278,
      "grad_norm": 7.73007869720459,
      "learning_rate": 2.874043319493772e-05,
      "loss": 0.6739,
      "num_input_tokens_seen": 141326846,
      "step": 42500
    },
    {
      "epoch": 2.150967935571007,
      "grad_norm": 6.6758904457092285,
      "learning_rate": 2.849032064428993e-05,
      "loss": 0.6767,
      "num_input_tokens_seen": 142999126,
      "step": 43000
    },
    {
      "epoch": 2.175979190635786,
      "grad_norm": 9.964508056640625,
      "learning_rate": 2.824020809364214e-05,
      "loss": 0.6649,
      "num_input_tokens_seen": 144643454,
      "step": 43500
    },
    {
      "epoch": 2.2009904457005653,
      "grad_norm": 7.9148664474487305,
      "learning_rate": 2.7990095542994348e-05,
      "loss": 0.678,
      "num_input_tokens_seen": 146327686,
      "step": 44000
    },
    {
      "epoch": 2.2260017007653445,
      "grad_norm": 5.838576316833496,
      "learning_rate": 2.7739982992346558e-05,
      "loss": 0.6629,
      "num_input_tokens_seen": 147996750,
      "step": 44500
    },
    {
      "epoch": 2.2510129558301237,
      "grad_norm": 9.018148422241211,
      "learning_rate": 2.7489870441698768e-05,
      "loss": 0.6673,
      "num_input_tokens_seen": 149658382,
      "step": 45000
    },
    {
      "epoch": 2.276024210894903,
      "grad_norm": 5.56981897354126,
      "learning_rate": 2.7239757891050978e-05,
      "loss": 0.658,
      "num_input_tokens_seen": 151279470,
      "step": 45500
    },
    {
      "epoch": 2.3010354659596817,
      "grad_norm": 3.9373059272766113,
      "learning_rate": 2.698964534040318e-05,
      "loss": 0.6747,
      "num_input_tokens_seen": 152950878,
      "step": 46000
    },
    {
      "epoch": 2.326046721024461,
      "grad_norm": 7.596631050109863,
      "learning_rate": 2.6739532789755388e-05,
      "loss": 0.6824,
      "num_input_tokens_seen": 154603110,
      "step": 46500
    },
    {
      "epoch": 2.35105797608924,
      "grad_norm": 7.714618682861328,
      "learning_rate": 2.6489420239107598e-05,
      "loss": 0.6662,
      "num_input_tokens_seen": 156262254,
      "step": 47000
    },
    {
      "epoch": 2.3760692311540192,
      "grad_norm": 11.400321006774902,
      "learning_rate": 2.6239307688459808e-05,
      "loss": 0.6478,
      "num_input_tokens_seen": 157940526,
      "step": 47500
    },
    {
      "epoch": 2.4010804862187984,
      "grad_norm": 5.944780349731445,
      "learning_rate": 2.5989195137812018e-05,
      "loss": 0.6701,
      "num_input_tokens_seen": 159597926,
      "step": 48000
    },
    {
      "epoch": 2.4260917412835776,
      "grad_norm": 7.971735954284668,
      "learning_rate": 2.5739082587164225e-05,
      "loss": 0.6815,
      "num_input_tokens_seen": 161249054,
      "step": 48500
    },
    {
      "epoch": 2.451102996348357,
      "grad_norm": 8.019645690917969,
      "learning_rate": 2.5488970036516435e-05,
      "loss": 0.6823,
      "num_input_tokens_seen": 162937710,
      "step": 49000
    },
    {
      "epoch": 2.476114251413136,
      "grad_norm": 14.52238655090332,
      "learning_rate": 2.5238857485868645e-05,
      "loss": 0.6662,
      "num_input_tokens_seen": 164579550,
      "step": 49500
    },
    {
      "epoch": 2.501125506477915,
      "grad_norm": 8.065009117126465,
      "learning_rate": 2.498874493522085e-05,
      "loss": 0.6855,
      "num_input_tokens_seen": 166259486,
      "step": 50000
    },
    {
      "epoch": 2.526136761542694,
      "grad_norm": 3.0121171474456787,
      "learning_rate": 2.473863238457306e-05,
      "loss": 0.6597,
      "num_input_tokens_seen": 167925014,
      "step": 50500
    },
    {
      "epoch": 2.551148016607473,
      "grad_norm": 9.93840217590332,
      "learning_rate": 2.4488519833925268e-05,
      "loss": 0.6672,
      "num_input_tokens_seen": 169584230,
      "step": 51000
    },
    {
      "epoch": 2.5761592716722523,
      "grad_norm": 7.8001627922058105,
      "learning_rate": 2.4238407283277475e-05,
      "loss": 0.6419,
      "num_input_tokens_seen": 171205846,
      "step": 51500
    },
    {
      "epoch": 2.6011705267370315,
      "grad_norm": 5.621837139129639,
      "learning_rate": 2.3988294732629685e-05,
      "loss": 0.6679,
      "num_input_tokens_seen": 172867766,
      "step": 52000
    },
    {
      "epoch": 2.6261817818018107,
      "grad_norm": 18.287431716918945,
      "learning_rate": 2.3738182181981895e-05,
      "loss": 0.6601,
      "num_input_tokens_seen": 174508502,
      "step": 52500
    },
    {
      "epoch": 2.65119303686659,
      "grad_norm": 7.687650203704834,
      "learning_rate": 2.34880696313341e-05,
      "loss": 0.6722,
      "num_input_tokens_seen": 176179174,
      "step": 53000
    },
    {
      "epoch": 2.676204291931369,
      "grad_norm": 9.807682037353516,
      "learning_rate": 2.3237957080686308e-05,
      "loss": 0.666,
      "num_input_tokens_seen": 177874198,
      "step": 53500
    },
    {
      "epoch": 2.7012155469961483,
      "grad_norm": 9.2701416015625,
      "learning_rate": 2.2987844530038518e-05,
      "loss": 0.6811,
      "num_input_tokens_seen": 179531678,
      "step": 54000
    },
    {
      "epoch": 2.7262268020609275,
      "grad_norm": 8.37064266204834,
      "learning_rate": 2.2737731979390728e-05,
      "loss": 0.6505,
      "num_input_tokens_seen": 181197542,
      "step": 54500
    },
    {
      "epoch": 2.7512380571257067,
      "grad_norm": 5.556591033935547,
      "learning_rate": 2.2487619428742935e-05,
      "loss": 0.6711,
      "num_input_tokens_seen": 182849270,
      "step": 55000
    },
    {
      "epoch": 2.776249312190486,
      "grad_norm": 7.93866491317749,
      "learning_rate": 2.2237506878095145e-05,
      "loss": 0.6664,
      "num_input_tokens_seen": 184520526,
      "step": 55500
    },
    {
      "epoch": 2.801260567255265,
      "grad_norm": 6.768641471862793,
      "learning_rate": 2.198739432744735e-05,
      "loss": 0.6699,
      "num_input_tokens_seen": 186239974,
      "step": 56000
    },
    {
      "epoch": 2.8262718223200443,
      "grad_norm": 5.911066055297852,
      "learning_rate": 2.173728177679956e-05,
      "loss": 0.6649,
      "num_input_tokens_seen": 187875982,
      "step": 56500
    },
    {
      "epoch": 2.851283077384823,
      "grad_norm": 9.964897155761719,
      "learning_rate": 2.1487169226151768e-05,
      "loss": 0.6874,
      "num_input_tokens_seen": 189505118,
      "step": 57000
    },
    {
      "epoch": 2.876294332449602,
      "grad_norm": 8.109452247619629,
      "learning_rate": 2.1237056675503978e-05,
      "loss": 0.6762,
      "num_input_tokens_seen": 191184886,
      "step": 57500
    },
    {
      "epoch": 2.9013055875143814,
      "grad_norm": 8.556594848632812,
      "learning_rate": 2.0986944124856188e-05,
      "loss": 0.6491,
      "num_input_tokens_seen": 192859070,
      "step": 58000
    },
    {
      "epoch": 2.9263168425791606,
      "grad_norm": 5.430099010467529,
      "learning_rate": 2.0736831574208394e-05,
      "loss": 0.661,
      "num_input_tokens_seen": 194533102,
      "step": 58500
    },
    {
      "epoch": 2.9513280976439398,
      "grad_norm": 9.806259155273438,
      "learning_rate": 2.04867190235606e-05,
      "loss": 0.645,
      "num_input_tokens_seen": 196171870,
      "step": 59000
    },
    {
      "epoch": 2.976339352708719,
      "grad_norm": 8.950848579406738,
      "learning_rate": 2.023660647291281e-05,
      "loss": 0.6479,
      "num_input_tokens_seen": 197877830,
      "step": 59500
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.560497522354126,
      "eval_runtime": 98.7193,
      "eval_samples_per_second": 404.997,
      "eval_steps_per_second": 50.628,
      "num_input_tokens_seen": 199402582,
      "step": 59973
    },
    {
      "epoch": 3.001350607773498,
      "grad_norm": 6.855441093444824,
      "learning_rate": 1.998649392226502e-05,
      "loss": 0.6187,
      "num_input_tokens_seen": 199490934,
      "step": 60000
    },
    {
      "epoch": 3.0263618628382774,
      "grad_norm": 8.57907772064209,
      "learning_rate": 1.973638137161723e-05,
      "loss": 0.652,
      "num_input_tokens_seen": 201164870,
      "step": 60500
    },
    {
      "epoch": 3.0513731179030565,
      "grad_norm": 15.578742027282715,
      "learning_rate": 1.9486268820969438e-05,
      "loss": 0.6127,
      "num_input_tokens_seen": 202824222,
      "step": 61000
    },
    {
      "epoch": 3.0763843729678357,
      "grad_norm": 9.083669662475586,
      "learning_rate": 1.9236156270321644e-05,
      "loss": 0.6146,
      "num_input_tokens_seen": 204495334,
      "step": 61500
    },
    {
      "epoch": 3.1013956280326145,
      "grad_norm": 10.12027359008789,
      "learning_rate": 1.8986043719673854e-05,
      "loss": 0.6341,
      "num_input_tokens_seen": 206136214,
      "step": 62000
    },
    {
      "epoch": 3.1264068830973937,
      "grad_norm": 10.482580184936523,
      "learning_rate": 1.8735931169026064e-05,
      "loss": 0.603,
      "num_input_tokens_seen": 207809294,
      "step": 62500
    },
    {
      "epoch": 3.151418138162173,
      "grad_norm": 7.722796440124512,
      "learning_rate": 1.848581861837827e-05,
      "loss": 0.6184,
      "num_input_tokens_seen": 209485534,
      "step": 63000
    },
    {
      "epoch": 3.176429393226952,
      "grad_norm": 7.449066162109375,
      "learning_rate": 1.8235706067730478e-05,
      "loss": 0.621,
      "num_input_tokens_seen": 211143158,
      "step": 63500
    },
    {
      "epoch": 3.2014406482917313,
      "grad_norm": 8.766199111938477,
      "learning_rate": 1.7985593517082688e-05,
      "loss": 0.6165,
      "num_input_tokens_seen": 212777414,
      "step": 64000
    },
    {
      "epoch": 3.2264519033565104,
      "grad_norm": 4.193557262420654,
      "learning_rate": 1.7735480966434898e-05,
      "loss": 0.6188,
      "num_input_tokens_seen": 214420798,
      "step": 64500
    },
    {
      "epoch": 3.2514631584212896,
      "grad_norm": 6.699706554412842,
      "learning_rate": 1.7485368415787104e-05,
      "loss": 0.6095,
      "num_input_tokens_seen": 216073590,
      "step": 65000
    },
    {
      "epoch": 3.276474413486069,
      "grad_norm": 8.79476547241211,
      "learning_rate": 1.7235255865139314e-05,
      "loss": 0.6208,
      "num_input_tokens_seen": 217746214,
      "step": 65500
    },
    {
      "epoch": 3.301485668550848,
      "grad_norm": 6.685282230377197,
      "learning_rate": 1.698514331449152e-05,
      "loss": 0.6058,
      "num_input_tokens_seen": 219433446,
      "step": 66000
    },
    {
      "epoch": 3.326496923615627,
      "grad_norm": 10.743680953979492,
      "learning_rate": 1.673503076384373e-05,
      "loss": 0.6318,
      "num_input_tokens_seen": 221089694,
      "step": 66500
    },
    {
      "epoch": 3.3515081786804064,
      "grad_norm": 8.36410903930664,
      "learning_rate": 1.6484918213195938e-05,
      "loss": 0.6236,
      "num_input_tokens_seen": 222760502,
      "step": 67000
    },
    {
      "epoch": 3.376519433745185,
      "grad_norm": 7.1238274574279785,
      "learning_rate": 1.6234805662548148e-05,
      "loss": 0.6103,
      "num_input_tokens_seen": 224417582,
      "step": 67500
    },
    {
      "epoch": 3.4015306888099643,
      "grad_norm": 7.042121887207031,
      "learning_rate": 1.5984693111900358e-05,
      "loss": 0.6157,
      "num_input_tokens_seen": 226068982,
      "step": 68000
    },
    {
      "epoch": 3.4265419438747435,
      "grad_norm": 9.31881332397461,
      "learning_rate": 1.5734580561252564e-05,
      "loss": 0.6263,
      "num_input_tokens_seen": 227701038,
      "step": 68500
    },
    {
      "epoch": 3.4515531989395227,
      "grad_norm": 7.049442768096924,
      "learning_rate": 1.548446801060477e-05,
      "loss": 0.6237,
      "num_input_tokens_seen": 229359710,
      "step": 69000
    },
    {
      "epoch": 3.476564454004302,
      "grad_norm": 7.746445178985596,
      "learning_rate": 1.5234355459956981e-05,
      "loss": 0.6376,
      "num_input_tokens_seen": 231028950,
      "step": 69500
    },
    {
      "epoch": 3.501575709069081,
      "grad_norm": 4.588512420654297,
      "learning_rate": 1.4984242909309191e-05,
      "loss": 0.6189,
      "num_input_tokens_seen": 232663446,
      "step": 70000
    },
    {
      "epoch": 3.5265869641338603,
      "grad_norm": 9.873016357421875,
      "learning_rate": 1.47341303586614e-05,
      "loss": 0.5935,
      "num_input_tokens_seen": 234333558,
      "step": 70500
    },
    {
      "epoch": 3.5515982191986395,
      "grad_norm": 8.153191566467285,
      "learning_rate": 1.4484017808013606e-05,
      "loss": 0.6403,
      "num_input_tokens_seen": 236006758,
      "step": 71000
    },
    {
      "epoch": 3.5766094742634187,
      "grad_norm": 5.909561634063721,
      "learning_rate": 1.4233905257365814e-05,
      "loss": 0.6152,
      "num_input_tokens_seen": 237655630,
      "step": 71500
    },
    {
      "epoch": 3.6016207293281974,
      "grad_norm": 9.481532096862793,
      "learning_rate": 1.3983792706718024e-05,
      "loss": 0.5916,
      "num_input_tokens_seen": 239300238,
      "step": 72000
    },
    {
      "epoch": 3.6266319843929766,
      "grad_norm": 4.988440990447998,
      "learning_rate": 1.3733680156070232e-05,
      "loss": 0.6275,
      "num_input_tokens_seen": 240971214,
      "step": 72500
    },
    {
      "epoch": 3.651643239457756,
      "grad_norm": 6.159299850463867,
      "learning_rate": 1.3483567605422439e-05,
      "loss": 0.6101,
      "num_input_tokens_seen": 242634286,
      "step": 73000
    },
    {
      "epoch": 3.676654494522535,
      "grad_norm": 4.264859199523926,
      "learning_rate": 1.3233455054774649e-05,
      "loss": 0.6045,
      "num_input_tokens_seen": 244293870,
      "step": 73500
    },
    {
      "epoch": 3.701665749587314,
      "grad_norm": 5.82095193862915,
      "learning_rate": 1.2983342504126857e-05,
      "loss": 0.624,
      "num_input_tokens_seen": 245956374,
      "step": 74000
    },
    {
      "epoch": 3.7266770046520934,
      "grad_norm": 10.4242525100708,
      "learning_rate": 1.2733229953479067e-05,
      "loss": 0.6231,
      "num_input_tokens_seen": 247566166,
      "step": 74500
    },
    {
      "epoch": 3.7516882597168726,
      "grad_norm": 6.536423206329346,
      "learning_rate": 1.2483117402831276e-05,
      "loss": 0.6159,
      "num_input_tokens_seen": 249233118,
      "step": 75000
    },
    {
      "epoch": 3.776699514781652,
      "grad_norm": 10.467476844787598,
      "learning_rate": 1.2233004852183482e-05,
      "loss": 0.6252,
      "num_input_tokens_seen": 250919822,
      "step": 75500
    },
    {
      "epoch": 3.801710769846431,
      "grad_norm": 13.297423362731934,
      "learning_rate": 1.1982892301535692e-05,
      "loss": 0.6133,
      "num_input_tokens_seen": 252600838,
      "step": 76000
    },
    {
      "epoch": 3.82672202491121,
      "grad_norm": 6.729821681976318,
      "learning_rate": 1.1732779750887899e-05,
      "loss": 0.6201,
      "num_input_tokens_seen": 254292558,
      "step": 76500
    },
    {
      "epoch": 3.8517332799759894,
      "grad_norm": 5.975412845611572,
      "learning_rate": 1.1482667200240109e-05,
      "loss": 0.5976,
      "num_input_tokens_seen": 255961510,
      "step": 77000
    },
    {
      "epoch": 3.8767445350407685,
      "grad_norm": 16.30948257446289,
      "learning_rate": 1.1232554649592317e-05,
      "loss": 0.6023,
      "num_input_tokens_seen": 257630246,
      "step": 77500
    },
    {
      "epoch": 3.9017557901055477,
      "grad_norm": 7.327265739440918,
      "learning_rate": 1.0982442098944526e-05,
      "loss": 0.6145,
      "num_input_tokens_seen": 259305118,
      "step": 78000
    },
    {
      "epoch": 3.9267670451703265,
      "grad_norm": 12.45727825164795,
      "learning_rate": 1.0732329548296734e-05,
      "loss": 0.6311,
      "num_input_tokens_seen": 260978934,
      "step": 78500
    },
    {
      "epoch": 3.9517783002351057,
      "grad_norm": 10.317325592041016,
      "learning_rate": 1.0482216997648942e-05,
      "loss": 0.6346,
      "num_input_tokens_seen": 262670814,
      "step": 79000
    },
    {
      "epoch": 3.976789555299885,
      "grad_norm": 7.8411149978637695,
      "learning_rate": 1.023210444700115e-05,
      "loss": 0.6023,
      "num_input_tokens_seen": 264314614,
      "step": 79500
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.54269939661026,
      "eval_runtime": 96.9182,
      "eval_samples_per_second": 412.523,
      "eval_steps_per_second": 51.569,
      "num_input_tokens_seen": 265875340,
      "step": 79964
    },
    {
      "epoch": 4.0018008103646645,
      "grad_norm": 6.620047569274902,
      "learning_rate": 9.98199189635336e-06,
      "loss": 0.6268,
      "num_input_tokens_seen": 266010940,
      "step": 80000
    },
    {
      "epoch": 4.026812065429443,
      "grad_norm": 10.007366180419922,
      "learning_rate": 9.731879345705567e-06,
      "loss": 0.5924,
      "num_input_tokens_seen": 267660364,
      "step": 80500
    },
    {
      "epoch": 4.051823320494222,
      "grad_norm": 6.680395603179932,
      "learning_rate": 9.481766795057777e-06,
      "loss": 0.5786,
      "num_input_tokens_seen": 269338492,
      "step": 81000
    },
    {
      "epoch": 4.076834575559001,
      "grad_norm": 4.809377670288086,
      "learning_rate": 9.231654244409984e-06,
      "loss": 0.5942,
      "num_input_tokens_seen": 271024236,
      "step": 81500
    },
    {
      "epoch": 4.10184583062378,
      "grad_norm": 8.463695526123047,
      "learning_rate": 8.981541693762194e-06,
      "loss": 0.5796,
      "num_input_tokens_seen": 272672620,
      "step": 82000
    },
    {
      "epoch": 4.12685708568856,
      "grad_norm": 10.12741470336914,
      "learning_rate": 8.731429143114402e-06,
      "loss": 0.5879,
      "num_input_tokens_seen": 274353676,
      "step": 82500
    },
    {
      "epoch": 4.151868340753339,
      "grad_norm": 15.428593635559082,
      "learning_rate": 8.48131659246661e-06,
      "loss": 0.5977,
      "num_input_tokens_seen": 275998164,
      "step": 83000
    },
    {
      "epoch": 4.176879595818118,
      "grad_norm": 10.350814819335938,
      "learning_rate": 8.231204041818819e-06,
      "loss": 0.566,
      "num_input_tokens_seen": 277685356,
      "step": 83500
    },
    {
      "epoch": 4.201890850882897,
      "grad_norm": 11.962939262390137,
      "learning_rate": 7.981091491171027e-06,
      "loss": 0.5671,
      "num_input_tokens_seen": 279358548,
      "step": 84000
    },
    {
      "epoch": 4.226902105947676,
      "grad_norm": 10.32712459564209,
      "learning_rate": 7.730978940523236e-06,
      "loss": 0.5785,
      "num_input_tokens_seen": 280991044,
      "step": 84500
    },
    {
      "epoch": 4.2519133610124555,
      "grad_norm": 5.896986484527588,
      "learning_rate": 7.480866389875445e-06,
      "loss": 0.6051,
      "num_input_tokens_seen": 282646764,
      "step": 85000
    },
    {
      "epoch": 4.276924616077235,
      "grad_norm": 7.187685966491699,
      "learning_rate": 7.230753839227652e-06,
      "loss": 0.5943,
      "num_input_tokens_seen": 284342508,
      "step": 85500
    },
    {
      "epoch": 4.301935871142014,
      "grad_norm": 6.680044174194336,
      "learning_rate": 6.980641288579861e-06,
      "loss": 0.5765,
      "num_input_tokens_seen": 286036340,
      "step": 86000
    },
    {
      "epoch": 4.326947126206793,
      "grad_norm": 4.963362693786621,
      "learning_rate": 6.73052873793207e-06,
      "loss": 0.6137,
      "num_input_tokens_seen": 287681564,
      "step": 86500
    },
    {
      "epoch": 4.351958381271572,
      "grad_norm": 12.112903594970703,
      "learning_rate": 6.480416187284279e-06,
      "loss": 0.5983,
      "num_input_tokens_seen": 289353828,
      "step": 87000
    },
    {
      "epoch": 4.3769696363363515,
      "grad_norm": 5.938944339752197,
      "learning_rate": 6.230303636636486e-06,
      "loss": 0.6017,
      "num_input_tokens_seen": 291011668,
      "step": 87500
    },
    {
      "epoch": 4.401980891401131,
      "grad_norm": 4.485511302947998,
      "learning_rate": 5.980191085988695e-06,
      "loss": 0.5898,
      "num_input_tokens_seen": 292675092,
      "step": 88000
    },
    {
      "epoch": 4.42699214646591,
      "grad_norm": 9.15986442565918,
      "learning_rate": 5.730078535340903e-06,
      "loss": 0.5744,
      "num_input_tokens_seen": 294338212,
      "step": 88500
    },
    {
      "epoch": 4.452003401530689,
      "grad_norm": 3.6591997146606445,
      "learning_rate": 5.479965984693112e-06,
      "loss": 0.5948,
      "num_input_tokens_seen": 296004820,
      "step": 89000
    },
    {
      "epoch": 4.477014656595468,
      "grad_norm": 9.19853401184082,
      "learning_rate": 5.2298534340453205e-06,
      "loss": 0.5838,
      "num_input_tokens_seen": 297661964,
      "step": 89500
    },
    {
      "epoch": 4.5020259116602475,
      "grad_norm": 13.491796493530273,
      "learning_rate": 4.979740883397529e-06,
      "loss": 0.5726,
      "num_input_tokens_seen": 299312356,
      "step": 90000
    },
    {
      "epoch": 4.527037166725027,
      "grad_norm": 6.374147415161133,
      "learning_rate": 4.729628332749737e-06,
      "loss": 0.5728,
      "num_input_tokens_seen": 300978468,
      "step": 90500
    },
    {
      "epoch": 4.552048421789806,
      "grad_norm": 7.507421970367432,
      "learning_rate": 4.479515782101945e-06,
      "loss": 0.5903,
      "num_input_tokens_seen": 302639252,
      "step": 91000
    },
    {
      "epoch": 4.577059676854585,
      "grad_norm": 12.31728744506836,
      "learning_rate": 4.229403231454155e-06,
      "loss": 0.5916,
      "num_input_tokens_seen": 304289124,
      "step": 91500
    },
    {
      "epoch": 4.602070931919363,
      "grad_norm": 11.238248825073242,
      "learning_rate": 3.979290680806363e-06,
      "loss": 0.5617,
      "num_input_tokens_seen": 305968436,
      "step": 92000
    },
    {
      "epoch": 4.6270821869841425,
      "grad_norm": 6.74647331237793,
      "learning_rate": 3.7291781301585712e-06,
      "loss": 0.6249,
      "num_input_tokens_seen": 307616156,
      "step": 92500
    },
    {
      "epoch": 4.652093442048922,
      "grad_norm": 7.845546722412109,
      "learning_rate": 3.4790655795107795e-06,
      "loss": 0.6015,
      "num_input_tokens_seen": 309294188,
      "step": 93000
    },
    {
      "epoch": 4.677104697113701,
      "grad_norm": 5.631568431854248,
      "learning_rate": 3.2289530288629883e-06,
      "loss": 0.5747,
      "num_input_tokens_seen": 310930388,
      "step": 93500
    },
    {
      "epoch": 4.70211595217848,
      "grad_norm": 4.305506229400635,
      "learning_rate": 2.978840478215197e-06,
      "loss": 0.5957,
      "num_input_tokens_seen": 312600876,
      "step": 94000
    },
    {
      "epoch": 4.727127207243259,
      "grad_norm": 12.092133522033691,
      "learning_rate": 2.7287279275674053e-06,
      "loss": 0.5952,
      "num_input_tokens_seen": 314275796,
      "step": 94500
    },
    {
      "epoch": 4.7521384623080385,
      "grad_norm": 7.043518543243408,
      "learning_rate": 2.478615376919614e-06,
      "loss": 0.6013,
      "num_input_tokens_seen": 315945468,
      "step": 95000
    },
    {
      "epoch": 4.777149717372818,
      "grad_norm": 6.208098888397217,
      "learning_rate": 2.2285028262718224e-06,
      "loss": 0.591,
      "num_input_tokens_seen": 317595388,
      "step": 95500
    },
    {
      "epoch": 4.802160972437597,
      "grad_norm": 3.588547706604004,
      "learning_rate": 1.978390275624031e-06,
      "loss": 0.5846,
      "num_input_tokens_seen": 319229212,
      "step": 96000
    },
    {
      "epoch": 4.827172227502376,
      "grad_norm": 10.502739906311035,
      "learning_rate": 1.7282777249762395e-06,
      "loss": 0.5904,
      "num_input_tokens_seen": 320908604,
      "step": 96500
    },
    {
      "epoch": 4.852183482567155,
      "grad_norm": 8.170723915100098,
      "learning_rate": 1.4781651743284478e-06,
      "loss": 0.5925,
      "num_input_tokens_seen": 322558268,
      "step": 97000
    },
    {
      "epoch": 4.8771947376319345,
      "grad_norm": 10.083109855651855,
      "learning_rate": 1.2280526236806563e-06,
      "loss": 0.5977,
      "num_input_tokens_seen": 324205708,
      "step": 97500
    },
    {
      "epoch": 4.902205992696714,
      "grad_norm": 6.591386795043945,
      "learning_rate": 9.779400730328649e-07,
      "loss": 0.5633,
      "num_input_tokens_seen": 325850036,
      "step": 98000
    },
    {
      "epoch": 4.927217247761493,
      "grad_norm": 7.133991241455078,
      "learning_rate": 7.278275223850733e-07,
      "loss": 0.5786,
      "num_input_tokens_seen": 327509276,
      "step": 98500
    },
    {
      "epoch": 4.952228502826272,
      "grad_norm": 5.090227127075195,
      "learning_rate": 4.777149717372818e-07,
      "loss": 0.5886,
      "num_input_tokens_seen": 329175052,
      "step": 99000
    },
    {
      "epoch": 4.977239757891051,
      "grad_norm": 7.157599925994873,
      "learning_rate": 2.276024210894903e-07,
      "loss": 0.5879,
      "num_input_tokens_seen": 330819060,
      "step": 99500
    },
    {
      "epoch": 5.0,
      "eval_loss": 0.5393198132514954,
      "eval_runtime": 97.8527,
      "eval_samples_per_second": 408.584,
      "eval_steps_per_second": 51.077,
      "num_input_tokens_seen": 332318598,
      "step": 99955
    },
    {
      "epoch": 5.0,
      "num_input_tokens_seen": 332318598,
      "step": 99955,
      "total_flos": 1.2062750373789696e+17,
      "train_loss": 0.7601320325591829,
      "train_runtime": 7988.2275,
      "train_samples_per_second": 100.099,
      "train_steps_per_second": 12.513,
      "train_tokens_per_second": 41593.134
    }
  ],
  "logging_steps": 500,
  "max_steps": 99955,
  "num_input_tokens_seen": 332318598,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.2062750373789696e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}