|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.8859229638692638, |
|
"eval_steps": 500, |
|
"global_step": 1900000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004662752441417178, |
|
"grad_norm": 1.3671598434448242, |
|
"learning_rate": 4.976690900545356e-05, |
|
"loss": 4.8243, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.009325504882834356, |
|
"grad_norm": 1.3568960428237915, |
|
"learning_rate": 4.953384132466932e-05, |
|
"loss": 3.7236, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.013988257324251536, |
|
"grad_norm": 1.4119852781295776, |
|
"learning_rate": 4.930079695764729e-05, |
|
"loss": 3.4332, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.018651009765668712, |
|
"grad_norm": 1.1216883659362793, |
|
"learning_rate": 4.906772927686305e-05, |
|
"loss": 3.2825, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.023313762207085892, |
|
"grad_norm": 1.2323102951049805, |
|
"learning_rate": 4.883468490984102e-05, |
|
"loss": 3.1866, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 0.02797651464850307, |
|
"grad_norm": 0.9565121531486511, |
|
"learning_rate": 4.860161722905678e-05, |
|
"loss": 3.1126, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 0.03263926708992025, |
|
"grad_norm": 1.1173287630081177, |
|
"learning_rate": 4.836861948955917e-05, |
|
"loss": 3.0577, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 0.037302019531337424, |
|
"grad_norm": 1.4626446962356567, |
|
"learning_rate": 4.813555180877493e-05, |
|
"loss": 3.0145, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 0.041964771972754604, |
|
"grad_norm": 1.0016125440597534, |
|
"learning_rate": 4.790246081422848e-05, |
|
"loss": 2.973, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 0.046627524414171784, |
|
"grad_norm": 1.3417049646377563, |
|
"learning_rate": 4.766941644720645e-05, |
|
"loss": 2.9423, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 0.051290276855588963, |
|
"grad_norm": 1.2270045280456543, |
|
"learning_rate": 4.7436372080184424e-05, |
|
"loss": 2.9127, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 0.05595302929700614, |
|
"grad_norm": 1.0300427675247192, |
|
"learning_rate": 4.7203327713162395e-05, |
|
"loss": 2.8823, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 0.060615781738423316, |
|
"grad_norm": 0.8841068148612976, |
|
"learning_rate": 4.6970283346140366e-05, |
|
"loss": 2.8588, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 0.0652785341798405, |
|
"grad_norm": 1.0524730682373047, |
|
"learning_rate": 4.673721566535613e-05, |
|
"loss": 2.8425, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 0.06994128662125768, |
|
"grad_norm": 0.9874018430709839, |
|
"learning_rate": 4.650417129833409e-05, |
|
"loss": 2.8257, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 0.07460403906267485, |
|
"grad_norm": 0.9634119272232056, |
|
"learning_rate": 4.6271126931312064e-05, |
|
"loss": 2.8114, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 0.07926679150409204, |
|
"grad_norm": 0.885671854019165, |
|
"learning_rate": 4.603808256429003e-05, |
|
"loss": 2.7911, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 0.08392954394550921, |
|
"grad_norm": 1.0135940313339233, |
|
"learning_rate": 4.58050148835058e-05, |
|
"loss": 2.7786, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 0.0885922963869264, |
|
"grad_norm": 1.011932611465454, |
|
"learning_rate": 4.557194720272156e-05, |
|
"loss": 2.7636, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 0.09325504882834357, |
|
"grad_norm": 0.7796096205711365, |
|
"learning_rate": 4.533892614946173e-05, |
|
"loss": 2.753, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 0.09791780126976074, |
|
"grad_norm": 1.1194034814834595, |
|
"learning_rate": 4.5105858468677495e-05, |
|
"loss": 2.7371, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 0.10258055371117793, |
|
"grad_norm": 1.1135520935058594, |
|
"learning_rate": 4.4872814101655467e-05, |
|
"loss": 2.7285, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 0.1072433061525951, |
|
"grad_norm": 0.7772097587585449, |
|
"learning_rate": 4.463976973463344e-05, |
|
"loss": 2.7173, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 0.11190605859401229, |
|
"grad_norm": 1.096358299255371, |
|
"learning_rate": 4.44067020538492e-05, |
|
"loss": 2.7141, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 0.11656881103542946, |
|
"grad_norm": 0.8112640380859375, |
|
"learning_rate": 4.417363437306496e-05, |
|
"loss": 2.7073, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 0.12123156347684663, |
|
"grad_norm": 1.0931545495986938, |
|
"learning_rate": 4.394059000604293e-05, |
|
"loss": 2.6931, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 0.1258943159182638, |
|
"grad_norm": 1.1369918584823608, |
|
"learning_rate": 4.37075456390209e-05, |
|
"loss": 2.6824, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 0.130557068359681, |
|
"grad_norm": 1.258300542831421, |
|
"learning_rate": 4.347450127199887e-05, |
|
"loss": 2.6741, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 0.13521982080109818, |
|
"grad_norm": 0.9752686023712158, |
|
"learning_rate": 4.324143359121463e-05, |
|
"loss": 2.6645, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 0.13988257324251535, |
|
"grad_norm": 1.0001367330551147, |
|
"learning_rate": 4.3008412537954805e-05, |
|
"loss": 2.6592, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 0.14454532568393252, |
|
"grad_norm": 1.0314422845840454, |
|
"learning_rate": 4.277534485717057e-05, |
|
"loss": 2.6523, |
|
"step": 310000 |
|
}, |
|
{ |
|
"epoch": 0.1492080781253497, |
|
"grad_norm": 0.9287506937980652, |
|
"learning_rate": 4.254230049014854e-05, |
|
"loss": 2.6503, |
|
"step": 320000 |
|
}, |
|
{ |
|
"epoch": 0.1538708305667669, |
|
"grad_norm": 0.8209073543548584, |
|
"learning_rate": 4.23092328093643e-05, |
|
"loss": 2.6379, |
|
"step": 330000 |
|
}, |
|
{ |
|
"epoch": 0.15853358300818407, |
|
"grad_norm": 0.8727386593818665, |
|
"learning_rate": 4.207618844234227e-05, |
|
"loss": 2.6332, |
|
"step": 340000 |
|
}, |
|
{ |
|
"epoch": 0.16319633544960124, |
|
"grad_norm": 0.9841961860656738, |
|
"learning_rate": 4.184314407532024e-05, |
|
"loss": 2.6279, |
|
"step": 350000 |
|
}, |
|
{ |
|
"epoch": 0.16785908789101842, |
|
"grad_norm": 0.7831237316131592, |
|
"learning_rate": 4.1610099708298214e-05, |
|
"loss": 2.6237, |
|
"step": 360000 |
|
}, |
|
{ |
|
"epoch": 0.1725218403324356, |
|
"grad_norm": 0.9184048175811768, |
|
"learning_rate": 4.137707865503839e-05, |
|
"loss": 2.6167, |
|
"step": 370000 |
|
}, |
|
{ |
|
"epoch": 0.1771845927738528, |
|
"grad_norm": 0.9598727822303772, |
|
"learning_rate": 4.114401097425415e-05, |
|
"loss": 2.6082, |
|
"step": 380000 |
|
}, |
|
{ |
|
"epoch": 0.18184734521526996, |
|
"grad_norm": 0.8814136981964111, |
|
"learning_rate": 4.0910966607232115e-05, |
|
"loss": 2.6027, |
|
"step": 390000 |
|
}, |
|
{ |
|
"epoch": 0.18651009765668713, |
|
"grad_norm": 0.9080318212509155, |
|
"learning_rate": 4.067789892644788e-05, |
|
"loss": 2.6014, |
|
"step": 400000 |
|
}, |
|
{ |
|
"epoch": 0.1911728500981043, |
|
"grad_norm": 0.8321977257728577, |
|
"learning_rate": 4.044485455942585e-05, |
|
"loss": 2.5964, |
|
"step": 410000 |
|
}, |
|
{ |
|
"epoch": 0.19583560253952148, |
|
"grad_norm": 1.3375693559646606, |
|
"learning_rate": 4.021178687864161e-05, |
|
"loss": 2.5906, |
|
"step": 420000 |
|
}, |
|
{ |
|
"epoch": 0.20049835498093868, |
|
"grad_norm": 0.8211286067962646, |
|
"learning_rate": 3.997874251161958e-05, |
|
"loss": 2.5866, |
|
"step": 430000 |
|
}, |
|
{ |
|
"epoch": 0.20516110742235585, |
|
"grad_norm": 0.7890422940254211, |
|
"learning_rate": 3.9745698144597546e-05, |
|
"loss": 2.5817, |
|
"step": 440000 |
|
}, |
|
{ |
|
"epoch": 0.20982385986377303, |
|
"grad_norm": 1.0580294132232666, |
|
"learning_rate": 3.951263046381331e-05, |
|
"loss": 2.5776, |
|
"step": 450000 |
|
}, |
|
{ |
|
"epoch": 0.2144866123051902, |
|
"grad_norm": 1.0666168928146362, |
|
"learning_rate": 3.927956278302907e-05, |
|
"loss": 2.5729, |
|
"step": 460000 |
|
}, |
|
{ |
|
"epoch": 0.21914936474660737, |
|
"grad_norm": 1.0440067052841187, |
|
"learning_rate": 3.904651841600704e-05, |
|
"loss": 2.5748, |
|
"step": 470000 |
|
}, |
|
{ |
|
"epoch": 0.22381211718802457, |
|
"grad_norm": 0.8746099472045898, |
|
"learning_rate": 3.88134507352228e-05, |
|
"loss": 2.5704, |
|
"step": 480000 |
|
}, |
|
{ |
|
"epoch": 0.22847486962944175, |
|
"grad_norm": 0.882897675037384, |
|
"learning_rate": 3.858040636820078e-05, |
|
"loss": 2.5623, |
|
"step": 490000 |
|
}, |
|
{ |
|
"epoch": 0.23313762207085892, |
|
"grad_norm": 0.8458369970321655, |
|
"learning_rate": 3.834740862870316e-05, |
|
"loss": 2.5612, |
|
"step": 500000 |
|
}, |
|
{ |
|
"epoch": 0.2378003745122761, |
|
"grad_norm": 0.9579658508300781, |
|
"learning_rate": 3.811434094791892e-05, |
|
"loss": 2.5551, |
|
"step": 510000 |
|
}, |
|
{ |
|
"epoch": 0.24246312695369326, |
|
"grad_norm": 1.0498722791671753, |
|
"learning_rate": 3.78813198946591e-05, |
|
"loss": 2.5502, |
|
"step": 520000 |
|
}, |
|
{ |
|
"epoch": 0.24712587939511046, |
|
"grad_norm": 1.032334804534912, |
|
"learning_rate": 3.764825221387486e-05, |
|
"loss": 2.5534, |
|
"step": 530000 |
|
}, |
|
{ |
|
"epoch": 0.2517886318365276, |
|
"grad_norm": 0.9145790934562683, |
|
"learning_rate": 3.7415184533090624e-05, |
|
"loss": 2.547, |
|
"step": 540000 |
|
}, |
|
{ |
|
"epoch": 0.2564513842779448, |
|
"grad_norm": 1.0633904933929443, |
|
"learning_rate": 3.71821634798308e-05, |
|
"loss": 2.543, |
|
"step": 550000 |
|
}, |
|
{ |
|
"epoch": 0.261114136719362, |
|
"grad_norm": 0.9828123450279236, |
|
"learning_rate": 3.694911911280877e-05, |
|
"loss": 2.5398, |
|
"step": 560000 |
|
}, |
|
{ |
|
"epoch": 0.2657768891607792, |
|
"grad_norm": 0.8735861778259277, |
|
"learning_rate": 3.671607474578674e-05, |
|
"loss": 2.5345, |
|
"step": 570000 |
|
}, |
|
{ |
|
"epoch": 0.27043964160219636, |
|
"grad_norm": 1.1347264051437378, |
|
"learning_rate": 3.6483053692526915e-05, |
|
"loss": 2.5312, |
|
"step": 580000 |
|
}, |
|
{ |
|
"epoch": 0.27510239404361353, |
|
"grad_norm": 0.8275557160377502, |
|
"learning_rate": 3.625000932550489e-05, |
|
"loss": 2.5299, |
|
"step": 590000 |
|
}, |
|
{ |
|
"epoch": 0.2797651464850307, |
|
"grad_norm": 0.8891148567199707, |
|
"learning_rate": 3.601696495848285e-05, |
|
"loss": 2.5301, |
|
"step": 600000 |
|
}, |
|
{ |
|
"epoch": 0.2844278989264479, |
|
"grad_norm": 0.9649547338485718, |
|
"learning_rate": 3.578389727769861e-05, |
|
"loss": 2.5304, |
|
"step": 610000 |
|
}, |
|
{ |
|
"epoch": 0.28909065136786505, |
|
"grad_norm": 1.0385549068450928, |
|
"learning_rate": 3.5550876224438794e-05, |
|
"loss": 2.5179, |
|
"step": 620000 |
|
}, |
|
{ |
|
"epoch": 0.2937534038092822, |
|
"grad_norm": 1.1167703866958618, |
|
"learning_rate": 3.5317785229892346e-05, |
|
"loss": 2.5168, |
|
"step": 630000 |
|
}, |
|
{ |
|
"epoch": 0.2984161562506994, |
|
"grad_norm": 0.8988041281700134, |
|
"learning_rate": 3.508476417663252e-05, |
|
"loss": 2.519, |
|
"step": 640000 |
|
}, |
|
{ |
|
"epoch": 0.30307890869211657, |
|
"grad_norm": 1.2292715311050415, |
|
"learning_rate": 3.485169649584828e-05, |
|
"loss": 2.5102, |
|
"step": 650000 |
|
}, |
|
{ |
|
"epoch": 0.3077416611335338, |
|
"grad_norm": 0.9778872728347778, |
|
"learning_rate": 3.4618652128826254e-05, |
|
"loss": 2.5115, |
|
"step": 660000 |
|
}, |
|
{ |
|
"epoch": 0.31240441357495097, |
|
"grad_norm": 0.9290140271186829, |
|
"learning_rate": 3.4385584448042016e-05, |
|
"loss": 2.5068, |
|
"step": 670000 |
|
}, |
|
{ |
|
"epoch": 0.31706716601636814, |
|
"grad_norm": 0.8549484610557556, |
|
"learning_rate": 3.415254008101999e-05, |
|
"loss": 2.5045, |
|
"step": 680000 |
|
}, |
|
{ |
|
"epoch": 0.3217299184577853, |
|
"grad_norm": 0.9482207894325256, |
|
"learning_rate": 3.391949571399796e-05, |
|
"loss": 2.5002, |
|
"step": 690000 |
|
}, |
|
{ |
|
"epoch": 0.3263926708992025, |
|
"grad_norm": 0.9213985204696655, |
|
"learning_rate": 3.368642803321372e-05, |
|
"loss": 2.499, |
|
"step": 700000 |
|
}, |
|
{ |
|
"epoch": 0.33105542334061966, |
|
"grad_norm": 0.9165827631950378, |
|
"learning_rate": 3.345338366619169e-05, |
|
"loss": 2.4978, |
|
"step": 710000 |
|
}, |
|
{ |
|
"epoch": 0.33571817578203683, |
|
"grad_norm": 0.9712676405906677, |
|
"learning_rate": 3.3220315985407453e-05, |
|
"loss": 2.4923, |
|
"step": 720000 |
|
}, |
|
{ |
|
"epoch": 0.340380928223454, |
|
"grad_norm": 0.9476392865180969, |
|
"learning_rate": 3.2987271618385425e-05, |
|
"loss": 2.4921, |
|
"step": 730000 |
|
}, |
|
{ |
|
"epoch": 0.3450436806648712, |
|
"grad_norm": 0.8675413727760315, |
|
"learning_rate": 3.275422725136339e-05, |
|
"loss": 2.4913, |
|
"step": 740000 |
|
}, |
|
{ |
|
"epoch": 0.34970643310628835, |
|
"grad_norm": 1.0163840055465698, |
|
"learning_rate": 3.252115957057915e-05, |
|
"loss": 2.4876, |
|
"step": 750000 |
|
}, |
|
{ |
|
"epoch": 0.3543691855477056, |
|
"grad_norm": 0.8246685862541199, |
|
"learning_rate": 3.228809188979491e-05, |
|
"loss": 2.4848, |
|
"step": 760000 |
|
}, |
|
{ |
|
"epoch": 0.35903193798912275, |
|
"grad_norm": 1.1230520009994507, |
|
"learning_rate": 3.205507083653509e-05, |
|
"loss": 2.4856, |
|
"step": 770000 |
|
}, |
|
{ |
|
"epoch": 0.3636946904305399, |
|
"grad_norm": 1.0105488300323486, |
|
"learning_rate": 3.182200315575085e-05, |
|
"loss": 2.4815, |
|
"step": 780000 |
|
}, |
|
{ |
|
"epoch": 0.3683574428719571, |
|
"grad_norm": 1.0298407077789307, |
|
"learning_rate": 3.158893547496662e-05, |
|
"loss": 2.4801, |
|
"step": 790000 |
|
}, |
|
{ |
|
"epoch": 0.37302019531337427, |
|
"grad_norm": 1.1969521045684814, |
|
"learning_rate": 3.135589110794458e-05, |
|
"loss": 2.4776, |
|
"step": 800000 |
|
}, |
|
{ |
|
"epoch": 0.37768294775479144, |
|
"grad_norm": 0.9831893444061279, |
|
"learning_rate": 3.112287005468476e-05, |
|
"loss": 2.4787, |
|
"step": 810000 |
|
}, |
|
{ |
|
"epoch": 0.3823457001962086, |
|
"grad_norm": 1.2115819454193115, |
|
"learning_rate": 3.0889825687662735e-05, |
|
"loss": 2.4721, |
|
"step": 820000 |
|
}, |
|
{ |
|
"epoch": 0.3870084526376258, |
|
"grad_norm": 1.0241882801055908, |
|
"learning_rate": 3.06567813206407e-05, |
|
"loss": 2.4714, |
|
"step": 830000 |
|
}, |
|
{ |
|
"epoch": 0.39167120507904296, |
|
"grad_norm": 1.0166583061218262, |
|
"learning_rate": 3.042373695361867e-05, |
|
"loss": 2.4736, |
|
"step": 840000 |
|
}, |
|
{ |
|
"epoch": 0.39633395752046013, |
|
"grad_norm": 0.9006229639053345, |
|
"learning_rate": 3.0190645959072226e-05, |
|
"loss": 2.4678, |
|
"step": 850000 |
|
}, |
|
{ |
|
"epoch": 0.40099670996187736, |
|
"grad_norm": 1.0126405954360962, |
|
"learning_rate": 2.9957624905812404e-05, |
|
"loss": 2.4651, |
|
"step": 860000 |
|
}, |
|
{ |
|
"epoch": 0.40565946240329454, |
|
"grad_norm": 1.3557987213134766, |
|
"learning_rate": 2.972458053879037e-05, |
|
"loss": 2.466, |
|
"step": 870000 |
|
}, |
|
{ |
|
"epoch": 0.4103222148447117, |
|
"grad_norm": 1.241245150566101, |
|
"learning_rate": 2.9491559485530546e-05, |
|
"loss": 2.4611, |
|
"step": 880000 |
|
}, |
|
{ |
|
"epoch": 0.4149849672861289, |
|
"grad_norm": 1.0998499393463135, |
|
"learning_rate": 2.9258491804746315e-05, |
|
"loss": 2.4618, |
|
"step": 890000 |
|
}, |
|
{ |
|
"epoch": 0.41964771972754605, |
|
"grad_norm": 0.9139440655708313, |
|
"learning_rate": 2.902544743772428e-05, |
|
"loss": 2.461, |
|
"step": 900000 |
|
}, |
|
{ |
|
"epoch": 0.4243104721689632, |
|
"grad_norm": 0.8467987179756165, |
|
"learning_rate": 2.8792426384464453e-05, |
|
"loss": 2.4584, |
|
"step": 910000 |
|
}, |
|
{ |
|
"epoch": 0.4289732246103804, |
|
"grad_norm": 1.090574860572815, |
|
"learning_rate": 2.8559335389918012e-05, |
|
"loss": 2.4567, |
|
"step": 920000 |
|
}, |
|
{ |
|
"epoch": 0.43363597705179757, |
|
"grad_norm": 1.063254714012146, |
|
"learning_rate": 2.832629102289598e-05, |
|
"loss": 2.4572, |
|
"step": 930000 |
|
}, |
|
{ |
|
"epoch": 0.43829872949321474, |
|
"grad_norm": 0.962209165096283, |
|
"learning_rate": 2.8093223342111742e-05, |
|
"loss": 2.4525, |
|
"step": 940000 |
|
}, |
|
{ |
|
"epoch": 0.4429614819346319, |
|
"grad_norm": 0.9544676542282104, |
|
"learning_rate": 2.7860202288851923e-05, |
|
"loss": 2.4518, |
|
"step": 950000 |
|
}, |
|
{ |
|
"epoch": 0.44762423437604915, |
|
"grad_norm": 1.0543245077133179, |
|
"learning_rate": 2.7627134608067685e-05, |
|
"loss": 2.4423, |
|
"step": 960000 |
|
}, |
|
{ |
|
"epoch": 0.4522869868174663, |
|
"grad_norm": 1.1137259006500244, |
|
"learning_rate": 2.7394066927283447e-05, |
|
"loss": 2.445, |
|
"step": 970000 |
|
}, |
|
{ |
|
"epoch": 0.4569497392588835, |
|
"grad_norm": 0.8332359790802002, |
|
"learning_rate": 2.716102256026141e-05, |
|
"loss": 2.4483, |
|
"step": 980000 |
|
}, |
|
{ |
|
"epoch": 0.46161249170030066, |
|
"grad_norm": 0.9382948875427246, |
|
"learning_rate": 2.6927954879477173e-05, |
|
"loss": 2.4451, |
|
"step": 990000 |
|
}, |
|
{ |
|
"epoch": 0.46627524414171784, |
|
"grad_norm": 0.8949111700057983, |
|
"learning_rate": 2.6694910512455145e-05, |
|
"loss": 2.4431, |
|
"step": 1000000 |
|
}, |
|
{ |
|
"epoch": 0.470937996583135, |
|
"grad_norm": 0.8998690843582153, |
|
"learning_rate": 2.6461866145433116e-05, |
|
"loss": 2.4419, |
|
"step": 1010000 |
|
}, |
|
{ |
|
"epoch": 0.4756007490245522, |
|
"grad_norm": 0.9611771702766418, |
|
"learning_rate": 2.6228821778411084e-05, |
|
"loss": 2.4425, |
|
"step": 1020000 |
|
}, |
|
{ |
|
"epoch": 0.48026350146596936, |
|
"grad_norm": 1.0314167737960815, |
|
"learning_rate": 2.5995754097626846e-05, |
|
"loss": 2.4393, |
|
"step": 1030000 |
|
}, |
|
{ |
|
"epoch": 0.48492625390738653, |
|
"grad_norm": 1.0637314319610596, |
|
"learning_rate": 2.5762709730604817e-05, |
|
"loss": 2.4373, |
|
"step": 1040000 |
|
}, |
|
{ |
|
"epoch": 0.4895890063488037, |
|
"grad_norm": 1.1488375663757324, |
|
"learning_rate": 2.552964204982058e-05, |
|
"loss": 2.4348, |
|
"step": 1050000 |
|
}, |
|
{ |
|
"epoch": 0.49425175879022093, |
|
"grad_norm": 1.1408883333206177, |
|
"learning_rate": 2.529657436903634e-05, |
|
"loss": 2.4352, |
|
"step": 1060000 |
|
}, |
|
{ |
|
"epoch": 0.4989145112316381, |
|
"grad_norm": 0.9368888735771179, |
|
"learning_rate": 2.5063530002014312e-05, |
|
"loss": 2.4321, |
|
"step": 1070000 |
|
}, |
|
{ |
|
"epoch": 0.5035772636730552, |
|
"grad_norm": 0.9766519069671631, |
|
"learning_rate": 2.483046232123007e-05, |
|
"loss": 2.4319, |
|
"step": 1080000 |
|
}, |
|
{ |
|
"epoch": 0.5082400161144724, |
|
"grad_norm": 0.9681417942047119, |
|
"learning_rate": 2.4597417954208042e-05, |
|
"loss": 2.4329, |
|
"step": 1090000 |
|
}, |
|
{ |
|
"epoch": 0.5129027685558896, |
|
"grad_norm": 1.0913608074188232, |
|
"learning_rate": 2.4364373587186014e-05, |
|
"loss": 2.428, |
|
"step": 1100000 |
|
}, |
|
{ |
|
"epoch": 0.5175655209973068, |
|
"grad_norm": 0.9350466132164001, |
|
"learning_rate": 2.413132922016398e-05, |
|
"loss": 2.4241, |
|
"step": 1110000 |
|
}, |
|
{ |
|
"epoch": 0.522228273438724, |
|
"grad_norm": 0.8676067590713501, |
|
"learning_rate": 2.3898261539379743e-05, |
|
"loss": 2.4227, |
|
"step": 1120000 |
|
}, |
|
{ |
|
"epoch": 0.5268910258801411, |
|
"grad_norm": 0.8786779046058655, |
|
"learning_rate": 2.366521717235771e-05, |
|
"loss": 2.4208, |
|
"step": 1130000 |
|
}, |
|
{ |
|
"epoch": 0.5315537783215584, |
|
"grad_norm": 0.9234575629234314, |
|
"learning_rate": 2.3432126177811267e-05, |
|
"loss": 2.4249, |
|
"step": 1140000 |
|
}, |
|
{ |
|
"epoch": 0.5362165307629755, |
|
"grad_norm": 1.3854731321334839, |
|
"learning_rate": 2.3199105124551445e-05, |
|
"loss": 2.4245, |
|
"step": 1150000 |
|
}, |
|
{ |
|
"epoch": 0.5408792832043927, |
|
"grad_norm": 1.0635942220687866, |
|
"learning_rate": 2.2966060757529416e-05, |
|
"loss": 2.4202, |
|
"step": 1160000 |
|
}, |
|
{ |
|
"epoch": 0.5455420356458098, |
|
"grad_norm": 0.8858787417411804, |
|
"learning_rate": 2.2732993076745178e-05, |
|
"loss": 2.418, |
|
"step": 1170000 |
|
}, |
|
{ |
|
"epoch": 0.5502047880872271, |
|
"grad_norm": 1.309348702430725, |
|
"learning_rate": 2.2499948709723146e-05, |
|
"loss": 2.4148, |
|
"step": 1180000 |
|
}, |
|
{ |
|
"epoch": 0.5548675405286442, |
|
"grad_norm": 0.9109322428703308, |
|
"learning_rate": 2.2266904342701114e-05, |
|
"loss": 2.4151, |
|
"step": 1190000 |
|
}, |
|
{ |
|
"epoch": 0.5595302929700614, |
|
"grad_norm": 1.0102558135986328, |
|
"learning_rate": 2.2033836661916876e-05, |
|
"loss": 2.4134, |
|
"step": 1200000 |
|
}, |
|
{ |
|
"epoch": 0.5641930454114785, |
|
"grad_norm": 1.0807286500930786, |
|
"learning_rate": 2.1800815608657053e-05, |
|
"loss": 2.4135, |
|
"step": 1210000 |
|
}, |
|
{ |
|
"epoch": 0.5688557978528958, |
|
"grad_norm": 0.9256259799003601, |
|
"learning_rate": 2.156774792787282e-05, |
|
"loss": 2.4106, |
|
"step": 1220000 |
|
}, |
|
{ |
|
"epoch": 0.573518550294313, |
|
"grad_norm": 0.9988642930984497, |
|
"learning_rate": 2.1334703560850786e-05, |
|
"loss": 2.4127, |
|
"step": 1230000 |
|
}, |
|
{ |
|
"epoch": 0.5781813027357301, |
|
"grad_norm": 1.1274610757827759, |
|
"learning_rate": 2.1101659193828754e-05, |
|
"loss": 2.4096, |
|
"step": 1240000 |
|
}, |
|
{ |
|
"epoch": 0.5828440551771473, |
|
"grad_norm": 1.0102494955062866, |
|
"learning_rate": 2.0868591513044516e-05, |
|
"loss": 2.4066, |
|
"step": 1250000 |
|
}, |
|
{ |
|
"epoch": 0.5875068076185644, |
|
"grad_norm": 1.011261224746704, |
|
"learning_rate": 2.0635547146022484e-05, |
|
"loss": 2.4046, |
|
"step": 1260000 |
|
}, |
|
{ |
|
"epoch": 0.5921695600599817, |
|
"grad_norm": 1.1067317724227905, |
|
"learning_rate": 2.0402502779000456e-05, |
|
"loss": 2.4054, |
|
"step": 1270000 |
|
}, |
|
{ |
|
"epoch": 0.5968323125013988, |
|
"grad_norm": 1.117375135421753, |
|
"learning_rate": 2.0169435098216217e-05, |
|
"loss": 2.4057, |
|
"step": 1280000 |
|
}, |
|
{ |
|
"epoch": 0.601495064942816, |
|
"grad_norm": 1.067470669746399, |
|
"learning_rate": 1.993636741743198e-05, |
|
"loss": 2.4066, |
|
"step": 1290000 |
|
}, |
|
{ |
|
"epoch": 0.6061578173842331, |
|
"grad_norm": 1.0158133506774902, |
|
"learning_rate": 1.9703346364172157e-05, |
|
"loss": 2.4029, |
|
"step": 1300000 |
|
}, |
|
{ |
|
"epoch": 0.6108205698256504, |
|
"grad_norm": 1.1179207563400269, |
|
"learning_rate": 1.947027868338792e-05, |
|
"loss": 2.4006, |
|
"step": 1310000 |
|
}, |
|
{ |
|
"epoch": 0.6154833222670676, |
|
"grad_norm": 0.8885159492492676, |
|
"learning_rate": 1.923721100260368e-05, |
|
"loss": 2.4008, |
|
"step": 1320000 |
|
}, |
|
{ |
|
"epoch": 0.6201460747084847, |
|
"grad_norm": 0.9562169313430786, |
|
"learning_rate": 1.9004143321819446e-05, |
|
"loss": 2.4014, |
|
"step": 1330000 |
|
}, |
|
{ |
|
"epoch": 0.6248088271499019, |
|
"grad_norm": 1.0893275737762451, |
|
"learning_rate": 1.8771098954797414e-05, |
|
"loss": 2.3992, |
|
"step": 1340000 |
|
}, |
|
{ |
|
"epoch": 0.629471579591319, |
|
"grad_norm": 1.1396783590316772, |
|
"learning_rate": 1.853807790153759e-05, |
|
"loss": 2.3961, |
|
"step": 1350000 |
|
}, |
|
{ |
|
"epoch": 0.6341343320327363, |
|
"grad_norm": 0.894639790058136, |
|
"learning_rate": 1.830503353451556e-05, |
|
"loss": 2.3949, |
|
"step": 1360000 |
|
}, |
|
{ |
|
"epoch": 0.6387970844741534, |
|
"grad_norm": 1.0523122549057007, |
|
"learning_rate": 1.807196585373132e-05, |
|
"loss": 2.3924, |
|
"step": 1370000 |
|
}, |
|
{ |
|
"epoch": 0.6434598369155706, |
|
"grad_norm": 1.4329748153686523, |
|
"learning_rate": 1.7838898172947086e-05, |
|
"loss": 2.3965, |
|
"step": 1380000 |
|
}, |
|
{ |
|
"epoch": 0.6481225893569877, |
|
"grad_norm": 0.9407207369804382, |
|
"learning_rate": 1.7605853805925054e-05, |
|
"loss": 2.3944, |
|
"step": 1390000 |
|
}, |
|
{ |
|
"epoch": 0.652785341798405, |
|
"grad_norm": 1.1153851747512817, |
|
"learning_rate": 1.7372809438903022e-05, |
|
"loss": 2.3951, |
|
"step": 1400000 |
|
}, |
|
{ |
|
"epoch": 0.6574480942398221, |
|
"grad_norm": 1.4270461797714233, |
|
"learning_rate": 1.7139741758118784e-05, |
|
"loss": 2.3903, |
|
"step": 1410000 |
|
}, |
|
{ |
|
"epoch": 0.6621108466812393, |
|
"grad_norm": 0.9156707525253296, |
|
"learning_rate": 1.6906697391096756e-05, |
|
"loss": 2.387, |
|
"step": 1420000 |
|
}, |
|
{ |
|
"epoch": 0.6667735991226565, |
|
"grad_norm": 1.0517213344573975, |
|
"learning_rate": 1.6673653024074724e-05, |
|
"loss": 2.3908, |
|
"step": 1430000 |
|
}, |
|
{ |
|
"epoch": 0.6714363515640737, |
|
"grad_norm": 1.1789027452468872, |
|
"learning_rate": 1.64406319708149e-05, |
|
"loss": 2.3857, |
|
"step": 1440000 |
|
}, |
|
{ |
|
"epoch": 0.6760991040054909, |
|
"grad_norm": 0.9410611391067505, |
|
"learning_rate": 1.6207564290030663e-05, |
|
"loss": 2.3851, |
|
"step": 1450000 |
|
}, |
|
{ |
|
"epoch": 0.680761856446908, |
|
"grad_norm": 1.2597123384475708, |
|
"learning_rate": 1.597451992300863e-05, |
|
"loss": 2.3853, |
|
"step": 1460000 |
|
}, |
|
{ |
|
"epoch": 0.6854246088883252, |
|
"grad_norm": 1.111659288406372, |
|
"learning_rate": 1.5741452242224393e-05, |
|
"loss": 2.3849, |
|
"step": 1470000 |
|
}, |
|
{ |
|
"epoch": 0.6900873613297424, |
|
"grad_norm": 1.114686131477356, |
|
"learning_rate": 1.5508384561440158e-05, |
|
"loss": 2.3844, |
|
"step": 1480000 |
|
}, |
|
{ |
|
"epoch": 0.6947501137711596, |
|
"grad_norm": 1.3087519407272339, |
|
"learning_rate": 1.527531688065592e-05, |
|
"loss": 2.3811, |
|
"step": 1490000 |
|
}, |
|
{ |
|
"epoch": 0.6994128662125767, |
|
"grad_norm": 1.2704778909683228, |
|
"learning_rate": 1.5042319141158304e-05, |
|
"loss": 2.3793, |
|
"step": 1500000 |
|
}, |
|
{ |
|
"epoch": 0.7040756186539939, |
|
"grad_norm": 1.0817821025848389, |
|
"learning_rate": 1.4809251460374065e-05, |
|
"loss": 2.3793, |
|
"step": 1510000 |
|
}, |
|
{ |
|
"epoch": 0.7087383710954112, |
|
"grad_norm": 1.1640921831130981, |
|
"learning_rate": 1.4576230407114241e-05, |
|
"loss": 2.3826, |
|
"step": 1520000 |
|
}, |
|
{ |
|
"epoch": 0.7134011235368283, |
|
"grad_norm": 1.5091464519500732, |
|
"learning_rate": 1.4343162726330003e-05, |
|
"loss": 2.379, |
|
"step": 1530000 |
|
}, |
|
{ |
|
"epoch": 0.7180638759782455, |
|
"grad_norm": 1.3562886714935303, |
|
"learning_rate": 1.4110118359307974e-05, |
|
"loss": 2.3748, |
|
"step": 1540000 |
|
}, |
|
{ |
|
"epoch": 0.7227266284196626, |
|
"grad_norm": 0.9998787641525269, |
|
"learning_rate": 1.3877073992285944e-05, |
|
"loss": 2.375, |
|
"step": 1550000 |
|
}, |
|
{ |
|
"epoch": 0.7273893808610798, |
|
"grad_norm": 1.163294792175293, |
|
"learning_rate": 1.3644006311501706e-05, |
|
"loss": 2.3776, |
|
"step": 1560000 |
|
}, |
|
{ |
|
"epoch": 0.732052133302497, |
|
"grad_norm": 1.0799118280410767, |
|
"learning_rate": 1.3410985258241882e-05, |
|
"loss": 2.3732, |
|
"step": 1570000 |
|
}, |
|
{ |
|
"epoch": 0.7367148857439142, |
|
"grad_norm": 0.9467183351516724, |
|
"learning_rate": 1.317794089121985e-05, |
|
"loss": 2.3705, |
|
"step": 1580000 |
|
}, |
|
{ |
|
"epoch": 0.7413776381853313, |
|
"grad_norm": 1.2810046672821045, |
|
"learning_rate": 1.2944873210435612e-05, |
|
"loss": 2.3721, |
|
"step": 1590000 |
|
}, |
|
{ |
|
"epoch": 0.7460403906267485, |
|
"grad_norm": 1.2798866033554077, |
|
"learning_rate": 1.2711828843413585e-05, |
|
"loss": 2.3738, |
|
"step": 1600000 |
|
}, |
|
{ |
|
"epoch": 0.7507031430681657, |
|
"grad_norm": 1.221845030784607, |
|
"learning_rate": 1.2478784476391553e-05, |
|
"loss": 2.3683, |
|
"step": 1610000 |
|
}, |
|
{ |
|
"epoch": 0.7553658955095829, |
|
"grad_norm": 1.2743821144104004, |
|
"learning_rate": 1.2245740109369522e-05, |
|
"loss": 2.3724, |
|
"step": 1620000 |
|
}, |
|
{ |
|
"epoch": 0.7600286479510001, |
|
"grad_norm": 1.1069179773330688, |
|
"learning_rate": 1.201269574234749e-05, |
|
"loss": 2.3662, |
|
"step": 1630000 |
|
}, |
|
{ |
|
"epoch": 0.7646914003924172, |
|
"grad_norm": 1.4689267873764038, |
|
"learning_rate": 1.177965137532546e-05, |
|
"loss": 2.3713, |
|
"step": 1640000 |
|
}, |
|
{ |
|
"epoch": 0.7693541528338345, |
|
"grad_norm": 1.0129334926605225, |
|
"learning_rate": 1.154660700830343e-05, |
|
"loss": 2.3689, |
|
"step": 1650000 |
|
}, |
|
{ |
|
"epoch": 0.7740169052752516, |
|
"grad_norm": 0.9776953458786011, |
|
"learning_rate": 1.1313539327519193e-05, |
|
"loss": 2.363, |
|
"step": 1660000 |
|
}, |
|
{ |
|
"epoch": 0.7786796577166688, |
|
"grad_norm": 1.1849191188812256, |
|
"learning_rate": 1.1080494960497161e-05, |
|
"loss": 2.3671, |
|
"step": 1670000 |
|
}, |
|
{ |
|
"epoch": 0.7833424101580859, |
|
"grad_norm": 1.0659184455871582, |
|
"learning_rate": 1.0847427279712923e-05, |
|
"loss": 2.363, |
|
"step": 1680000 |
|
}, |
|
{ |
|
"epoch": 0.7880051625995032, |
|
"grad_norm": 1.0228557586669922, |
|
"learning_rate": 1.0614382912690895e-05, |
|
"loss": 2.362, |
|
"step": 1690000 |
|
}, |
|
{ |
|
"epoch": 0.7926679150409203, |
|
"grad_norm": 0.9540805816650391, |
|
"learning_rate": 1.0381315231906656e-05, |
|
"loss": 2.366, |
|
"step": 1700000 |
|
}, |
|
{ |
|
"epoch": 0.7973306674823375, |
|
"grad_norm": 1.1381940841674805, |
|
"learning_rate": 1.0148270864884624e-05, |
|
"loss": 2.3592, |
|
"step": 1710000 |
|
}, |
|
{ |
|
"epoch": 0.8019934199237547, |
|
"grad_norm": 1.1460505723953247, |
|
"learning_rate": 9.915226497862596e-06, |
|
"loss": 2.3591, |
|
"step": 1720000 |
|
}, |
|
{ |
|
"epoch": 0.8066561723651718, |
|
"grad_norm": 1.0586894750595093, |
|
"learning_rate": 9.682182130840564e-06, |
|
"loss": 2.3592, |
|
"step": 1730000 |
|
}, |
|
{ |
|
"epoch": 0.8113189248065891, |
|
"grad_norm": 1.3877402544021606, |
|
"learning_rate": 9.449114450056326e-06, |
|
"loss": 2.3635, |
|
"step": 1740000 |
|
}, |
|
{ |
|
"epoch": 0.8159816772480062, |
|
"grad_norm": 1.2622848749160767, |
|
"learning_rate": 9.216046769272089e-06, |
|
"loss": 2.3577, |
|
"step": 1750000 |
|
}, |
|
{ |
|
"epoch": 0.8206444296894234, |
|
"grad_norm": 1.1290611028671265, |
|
"learning_rate": 8.983002402250059e-06, |
|
"loss": 2.3587, |
|
"step": 1760000 |
|
}, |
|
{ |
|
"epoch": 0.8253071821308405, |
|
"grad_norm": 1.0407214164733887, |
|
"learning_rate": 8.74993472146582e-06, |
|
"loss": 2.3562, |
|
"step": 1770000 |
|
}, |
|
{ |
|
"epoch": 0.8299699345722578, |
|
"grad_norm": 1.1062073707580566, |
|
"learning_rate": 8.51689035444379e-06, |
|
"loss": 2.358, |
|
"step": 1780000 |
|
}, |
|
{ |
|
"epoch": 0.8346326870136749, |
|
"grad_norm": 1.04072904586792, |
|
"learning_rate": 8.28384598742176e-06, |
|
"loss": 2.3518, |
|
"step": 1790000 |
|
}, |
|
{ |
|
"epoch": 0.8392954394550921, |
|
"grad_norm": 1.0454237461090088, |
|
"learning_rate": 8.050801620399728e-06, |
|
"loss": 2.3587, |
|
"step": 1800000 |
|
}, |
|
{ |
|
"epoch": 0.8439581918965092, |
|
"grad_norm": 1.2492414712905884, |
|
"learning_rate": 7.817780567139906e-06, |
|
"loss": 2.3552, |
|
"step": 1810000 |
|
}, |
|
{ |
|
"epoch": 0.8486209443379265, |
|
"grad_norm": 1.2101612091064453, |
|
"learning_rate": 7.584712886355667e-06, |
|
"loss": 2.358, |
|
"step": 1820000 |
|
}, |
|
{ |
|
"epoch": 0.8532836967793437, |
|
"grad_norm": 1.0315169095993042, |
|
"learning_rate": 7.351668519333638e-06, |
|
"loss": 2.3515, |
|
"step": 1830000 |
|
}, |
|
{ |
|
"epoch": 0.8579464492207608, |
|
"grad_norm": 1.130194902420044, |
|
"learning_rate": 7.118624152311607e-06, |
|
"loss": 2.3535, |
|
"step": 1840000 |
|
}, |
|
{ |
|
"epoch": 0.862609201662178, |
|
"grad_norm": 1.1591068506240845, |
|
"learning_rate": 6.885579785289576e-06, |
|
"loss": 2.3484, |
|
"step": 1850000 |
|
}, |
|
{ |
|
"epoch": 0.8672719541035951, |
|
"grad_norm": 1.1694544553756714, |
|
"learning_rate": 6.652535418267545e-06, |
|
"loss": 2.3473, |
|
"step": 1860000 |
|
}, |
|
{ |
|
"epoch": 0.8719347065450124, |
|
"grad_norm": 1.2773854732513428, |
|
"learning_rate": 6.419491051245514e-06, |
|
"loss": 2.3464, |
|
"step": 1870000 |
|
}, |
|
{ |
|
"epoch": 0.8765974589864295, |
|
"grad_norm": 1.0938977003097534, |
|
"learning_rate": 6.186423370461277e-06, |
|
"loss": 2.3468, |
|
"step": 1880000 |
|
}, |
|
{ |
|
"epoch": 0.8812602114278467, |
|
"grad_norm": 1.178916573524475, |
|
"learning_rate": 5.9534023172014535e-06, |
|
"loss": 2.3455, |
|
"step": 1890000 |
|
}, |
|
{ |
|
"epoch": 0.8859229638692638, |
|
"grad_norm": 1.2058972120285034, |
|
"learning_rate": 5.720334636417215e-06, |
|
"loss": 2.3433, |
|
"step": 1900000 |
|
} |
|
], |
|
"logging_steps": 10000, |
|
"max_steps": 2144656, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 9223372036854775807, |
|
"save_steps": 100000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.684660451731086e+19, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|