{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.928806133625411,
  "eval_steps": 500,
  "global_step": 9000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.054764512595837894,
      "grad_norm": 4.206851005554199,
      "learning_rate": 1.095290251916758e-05,
      "loss": 2.3255,
      "step": 100
    },
    {
      "epoch": 0.10952902519167579,
      "grad_norm": 2.8459718227386475,
      "learning_rate": 2.190580503833516e-05,
      "loss": 1.8525,
      "step": 200
    },
    {
      "epoch": 0.16429353778751368,
      "grad_norm": 3.025334119796753,
      "learning_rate": 3.285870755750274e-05,
      "loss": 1.7615,
      "step": 300
    },
    {
      "epoch": 0.21905805038335158,
      "grad_norm": 3.115264654159546,
      "learning_rate": 4.381161007667032e-05,
      "loss": 1.7006,
      "step": 400
    },
    {
      "epoch": 0.2738225629791895,
      "grad_norm": 2.6407604217529297,
      "learning_rate": 5.47645125958379e-05,
      "loss": 1.6693,
      "step": 500
    },
    {
      "epoch": 0.32858707557502737,
      "grad_norm": 2.2635467052459717,
      "learning_rate": 6.571741511500547e-05,
      "loss": 1.6554,
      "step": 600
    },
    {
      "epoch": 0.3833515881708653,
      "grad_norm": 2.4853227138519287,
      "learning_rate": 7.667031763417306e-05,
      "loss": 1.589,
      "step": 700
    },
    {
      "epoch": 0.43811610076670315,
      "grad_norm": 2.712339162826538,
      "learning_rate": 8.762322015334064e-05,
      "loss": 1.5892,
      "step": 800
    },
    {
      "epoch": 0.4928806133625411,
      "grad_norm": 2.6400351524353027,
      "learning_rate": 9.857612267250822e-05,
      "loss": 1.5559,
      "step": 900
    },
    {
      "epoch": 0.547645125958379,
      "grad_norm": 3.203749656677246,
      "learning_rate": 9.997234258138696e-05,
      "loss": 1.5673,
      "step": 1000
    },
    {
      "epoch": 0.6024096385542169,
      "grad_norm": 2.3736095428466797,
      "learning_rate": 9.987226456522884e-05,
      "loss": 1.5324,
      "step": 1100
    },
    {
      "epoch": 0.6571741511500547,
      "grad_norm": 2.253758192062378,
      "learning_rate": 9.969929463456831e-05,
      "loss": 1.52,
      "step": 1200
    },
    {
      "epoch": 0.7119386637458927,
      "grad_norm": 2.2543797492980957,
      "learning_rate": 9.945368559744425e-05,
      "loss": 1.463,
      "step": 1300
    },
    {
      "epoch": 0.7667031763417306,
      "grad_norm": 2.094783067703247,
      "learning_rate": 9.913579642919276e-05,
      "loss": 1.4819,
      "step": 1400
    },
    {
      "epoch": 0.8214676889375685,
      "grad_norm": 2.028780937194824,
      "learning_rate": 9.874609174777887e-05,
      "loss": 1.4738,
      "step": 1500
    },
    {
      "epoch": 0.8762322015334063,
      "grad_norm": 2.2184860706329346,
      "learning_rate": 9.82851411347238e-05,
      "loss": 1.4481,
      "step": 1600
    },
    {
      "epoch": 0.9309967141292442,
      "grad_norm": 2.685318946838379,
      "learning_rate": 9.775361830262055e-05,
      "loss": 1.4143,
      "step": 1700
    },
    {
      "epoch": 0.9857612267250822,
      "grad_norm": 2.3410556316375732,
      "learning_rate": 9.715230011045415e-05,
      "loss": 1.3997,
      "step": 1800
    },
    {
      "epoch": 1.04052573932092,
      "grad_norm": 2.4343554973602295,
      "learning_rate": 9.648206542816636e-05,
      "loss": 1.2183,
      "step": 1900
    },
    {
      "epoch": 1.095290251916758,
      "grad_norm": 2.7375845909118652,
      "learning_rate": 9.574389385212366e-05,
      "loss": 1.1348,
      "step": 2000
    },
    {
      "epoch": 1.1500547645125958,
      "grad_norm": 2.488781213760376,
      "learning_rate": 9.493886427336657e-05,
      "loss": 1.1794,
      "step": 2100
    },
    {
      "epoch": 1.2048192771084336,
      "grad_norm": 2.216630697250366,
      "learning_rate": 9.406815330073244e-05,
      "loss": 1.1533,
      "step": 2200
    },
    {
      "epoch": 1.2595837897042717,
      "grad_norm": 2.5205495357513428,
      "learning_rate": 9.313303354115677e-05,
      "loss": 1.1383,
      "step": 2300
    },
    {
      "epoch": 1.3143483023001095,
      "grad_norm": 2.7046151161193848,
      "learning_rate": 9.213487173966623e-05,
      "loss": 1.1639,
      "step": 2400
    },
    {
      "epoch": 1.3691128148959475,
      "grad_norm": 2.2721831798553467,
      "learning_rate": 9.107512678178223e-05,
      "loss": 1.1572,
      "step": 2500
    },
    {
      "epoch": 1.4238773274917853,
      "grad_norm": 2.1899495124816895,
      "learning_rate": 8.99553475612544e-05,
      "loss": 1.0975,
      "step": 2600
    },
    {
      "epoch": 1.4786418400876231,
      "grad_norm": 2.541968822479248,
      "learning_rate": 8.877717071624055e-05,
      "loss": 1.1102,
      "step": 2700
    },
    {
      "epoch": 1.5334063526834611,
      "grad_norm": 1.9248064756393433,
      "learning_rate": 8.754231823724187e-05,
      "loss": 1.1012,
      "step": 2800
    },
    {
      "epoch": 1.588170865279299,
      "grad_norm": 2.002230644226074,
      "learning_rate": 8.62525949502896e-05,
      "loss": 1.0992,
      "step": 2900
    },
    {
      "epoch": 1.642935377875137,
      "grad_norm": 2.2088050842285156,
      "learning_rate": 8.490988587906137e-05,
      "loss": 1.1102,
      "step": 3000
    },
    {
      "epoch": 1.6976998904709748,
      "grad_norm": 2.2829463481903076,
      "learning_rate": 8.351615348978318e-05,
      "loss": 1.1058,
      "step": 3100
    },
    {
      "epoch": 1.7524644030668126,
      "grad_norm": 2.018911600112915,
      "learning_rate": 8.207343482294323e-05,
      "loss": 1.0693,
      "step": 3200
    },
    {
      "epoch": 1.8072289156626506,
      "grad_norm": 1.948006272315979,
      "learning_rate": 8.058383851601027e-05,
      "loss": 1.0797,
      "step": 3300
    },
    {
      "epoch": 1.8619934282584885,
      "grad_norm": 1.9828593730926514,
      "learning_rate": 7.904954172150776e-05,
      "loss": 1.0454,
      "step": 3400
    },
    {
      "epoch": 1.9167579408543265,
      "grad_norm": 2.2209079265594482,
      "learning_rate": 7.747278692494825e-05,
      "loss": 1.0665,
      "step": 3500
    },
    {
      "epoch": 1.9715224534501643,
      "grad_norm": 2.0689291954040527,
      "learning_rate": 7.585587866727898e-05,
      "loss": 1.0571,
      "step": 3600
    },
    {
      "epoch": 2.026286966046002,
      "grad_norm": 1.5938904285430908,
      "learning_rate": 7.420118017662894e-05,
      "loss": 0.8671,
      "step": 3700
    },
    {
      "epoch": 2.08105147864184,
      "grad_norm": 2.3023173809051514,
      "learning_rate": 7.251110991428034e-05,
      "loss": 0.6557,
      "step": 3800
    },
    {
      "epoch": 2.135815991237678,
      "grad_norm": 1.6988605260849,
      "learning_rate": 7.07881380399129e-05,
      "loss": 0.6836,
      "step": 3900
    },
    {
      "epoch": 2.190580503833516,
      "grad_norm": 1.5740976333618164,
      "learning_rate": 6.903478280128721e-05,
      "loss": 0.6712,
      "step": 4000
    },
    {
      "epoch": 2.245345016429354,
      "grad_norm": 2.228999137878418,
      "learning_rate": 6.725360685364384e-05,
      "loss": 0.668,
      "step": 4100
    },
    {
      "epoch": 2.3001095290251916,
      "grad_norm": 2.125016450881958,
      "learning_rate": 6.54472135141977e-05,
      "loss": 0.6735,
      "step": 4200
    },
    {
      "epoch": 2.3548740416210294,
      "grad_norm": 2.3794703483581543,
      "learning_rate": 6.361824295720199e-05,
      "loss": 0.6594,
      "step": 4300
    },
    {
      "epoch": 2.4096385542168672,
      "grad_norm": 2.231340169906616,
      "learning_rate": 6.176936835514312e-05,
      "loss": 0.6691,
      "step": 4400
    },
    {
      "epoch": 2.4644030668127055,
      "grad_norm": 2.1866331100463867,
      "learning_rate": 5.9903291971706e-05,
      "loss": 0.687,
      "step": 4500
    },
    {
      "epoch": 2.5191675794085433,
      "grad_norm": 1.7319729328155518,
      "learning_rate": 5.8022741212220623e-05,
      "loss": 0.6605,
      "step": 4600
    },
    {
      "epoch": 2.573932092004381,
      "grad_norm": 2.021991729736328,
      "learning_rate": 5.6130464637362466e-05,
      "loss": 0.6499,
      "step": 4700
    },
    {
      "epoch": 2.628696604600219,
      "grad_norm": 2.0492424964904785,
      "learning_rate": 5.4229227945932446e-05,
      "loss": 0.6634,
      "step": 4800
    },
    {
      "epoch": 2.6834611171960567,
      "grad_norm": 1.9110430479049683,
      "learning_rate": 5.2321809932588664e-05,
      "loss": 0.6407,
      "step": 4900
    },
    {
      "epoch": 2.738225629791895,
      "grad_norm": 2.151268482208252,
      "learning_rate": 5.041099842643736e-05,
      "loss": 0.6336,
      "step": 5000
    },
    {
      "epoch": 2.792990142387733,
      "grad_norm": 1.8292102813720703,
      "learning_rate": 4.849958621641945e-05,
      "loss": 0.6421,
      "step": 5100
    },
    {
      "epoch": 2.8477546549835706,
      "grad_norm": 1.5233772993087769,
      "learning_rate": 4.659036696944793e-05,
      "loss": 0.6052,
      "step": 5200
    },
    {
      "epoch": 2.9025191675794084,
      "grad_norm": 2.1103007793426514,
      "learning_rate": 4.4686131147261994e-05,
      "loss": 0.6316,
      "step": 5300
    },
    {
      "epoch": 2.9572836801752462,
      "grad_norm": 1.618807315826416,
      "learning_rate": 4.2789661927965795e-05,
      "loss": 0.635,
      "step": 5400
    },
    {
      "epoch": 3.0120481927710845,
      "grad_norm": 1.661723256111145,
      "learning_rate": 4.090373113821281e-05,
      "loss": 0.553,
      "step": 5500
    },
    {
      "epoch": 3.0668127053669223,
      "grad_norm": 1.5551363229751587,
      "learning_rate": 3.9031095201980976e-05,
      "loss": 0.3049,
      "step": 5600
    },
    {
      "epoch": 3.12157721796276,
      "grad_norm": 1.7245244979858398,
      "learning_rate": 3.717449111186025e-05,
      "loss": 0.3081,
      "step": 5700
    },
    {
      "epoch": 3.176341730558598,
      "grad_norm": 1.4479619264602661,
      "learning_rate": 3.5336632428740265e-05,
      "loss": 0.3103,
      "step": 5800
    },
    {
      "epoch": 3.2311062431544357,
      "grad_norm": 2.026216745376587,
      "learning_rate": 3.352020531574527e-05,
      "loss": 0.3035,
      "step": 5900
    },
    {
      "epoch": 3.285870755750274,
      "grad_norm": 1.2143455743789673,
      "learning_rate": 3.172786461221279e-05,
      "loss": 0.3146,
      "step": 6000
    },
    {
      "epoch": 3.340635268346112,
      "grad_norm": 1.7206776142120361,
      "learning_rate": 2.996222995345437e-05,
      "loss": 0.3138,
      "step": 6100
    },
    {
      "epoch": 3.3953997809419496,
      "grad_norm": 1.966302514076233,
      "learning_rate": 2.822588194196941e-05,
      "loss": 0.3059,
      "step": 6200
    },
    {
      "epoch": 3.4501642935377874,
      "grad_norm": 1.6685985326766968,
      "learning_rate": 2.6521358375708428e-05,
      "loss": 0.3006,
      "step": 6300
    },
    {
      "epoch": 3.5049288061336252,
      "grad_norm": 1.6338154077529907,
      "learning_rate": 2.4851150538898028e-05,
      "loss": 0.3017,
      "step": 6400
    },
    {
      "epoch": 3.5596933187294635,
      "grad_norm": 1.6607939004898071,
      "learning_rate": 2.321769956084937e-05,
      "loss": 0.2991,
      "step": 6500
    },
    {
      "epoch": 3.6144578313253013,
      "grad_norm": 1.7949954271316528,
      "learning_rate": 2.1623392848071354e-05,
      "loss": 0.3045,
      "step": 6600
    },
    {
      "epoch": 3.669222343921139,
      "grad_norm": 1.6750141382217407,
      "learning_rate": 2.007056059490364e-05,
      "loss": 0.3077,
      "step": 6700
    },
    {
      "epoch": 3.723986856516977,
      "grad_norm": 1.425621509552002,
      "learning_rate": 1.856147237776956e-05,
      "loss": 0.2989,
      "step": 6800
    },
    {
      "epoch": 3.7787513691128147,
      "grad_norm": 1.534443736076355,
      "learning_rate": 1.7098333838026275e-05,
      "loss": 0.2975,
      "step": 6900
    },
    {
      "epoch": 3.833515881708653,
      "grad_norm": 1.511762261390686,
      "learning_rate": 1.5683283458260718e-05,
      "loss": 0.2968,
      "step": 7000
    },
    {
      "epoch": 3.888280394304491,
      "grad_norm": 1.968000054359436,
      "learning_rate": 1.4318389436742962e-05,
      "loss": 0.2907,
      "step": 7100
    },
    {
      "epoch": 3.9430449069003286,
      "grad_norm": 1.5500705242156982,
      "learning_rate": 1.3005646664605165e-05,
      "loss": 0.2922,
      "step": 7200
    },
    {
      "epoch": 3.9978094194961664,
      "grad_norm": 1.7522929906845093,
      "learning_rate": 1.1746973810164147e-05,
      "loss": 0.2815,
      "step": 7300
    },
    {
      "epoch": 4.052573932092004,
      "grad_norm": 0.8427908420562744,
      "learning_rate": 1.0544210514649233e-05,
      "loss": 0.1758,
      "step": 7400
    },
    {
      "epoch": 4.1073384446878425,
      "grad_norm": 0.8239675760269165,
      "learning_rate": 9.399114703433688e-06,
      "loss": 0.17,
      "step": 7500
    },
    {
      "epoch": 4.16210295728368,
      "grad_norm": 0.85968416929245,
      "learning_rate": 8.313360016700011e-06,
      "loss": 0.1679,
      "step": 7600
    },
    {
      "epoch": 4.216867469879518,
      "grad_norm": 0.9793341159820557,
      "learning_rate": 7.288533363293959e-06,
      "loss": 0.1719,
      "step": 7700
    },
    {
      "epoch": 4.271631982475356,
      "grad_norm": 0.9244991540908813,
      "learning_rate": 6.32613260134271e-06,
      "loss": 0.1718,
      "step": 7800
    },
    {
      "epoch": 4.326396495071194,
      "grad_norm": 1.2018598318099976,
      "learning_rate": 5.427564349027098e-06,
      "loss": 0.1669,
      "step": 7900
    },
    {
      "epoch": 4.381161007667032,
      "grad_norm": 0.9171732664108276,
      "learning_rate": 4.594141928707629e-06,
      "loss": 0.1685,
      "step": 8000
    },
    {
      "epoch": 4.435925520262869,
      "grad_norm": 0.9013499021530151,
      "learning_rate": 3.8270834474090466e-06,
      "loss": 0.1684,
      "step": 8100
    },
    {
      "epoch": 4.490690032858708,
      "grad_norm": 0.8273316621780396,
      "learning_rate": 3.1275100164689543e-06,
      "loss": 0.1666,
      "step": 8200
    },
    {
      "epoch": 4.545454545454545,
      "grad_norm": 0.8490225076675415,
      "learning_rate": 2.496444112952734e-06,
      "loss": 0.1709,
      "step": 8300
    },
    {
      "epoch": 4.600219058050383,
      "grad_norm": 0.9616093635559082,
      "learning_rate": 1.9348080852294783e-06,
      "loss": 0.1676,
      "step": 8400
    },
    {
      "epoch": 4.6549835706462215,
      "grad_norm": 0.6257395148277283,
      "learning_rate": 1.4434228048932796e-06,
      "loss": 0.1657,
      "step": 8500
    },
    {
      "epoch": 4.709748083242059,
      "grad_norm": 1.020652174949646,
      "learning_rate": 1.023006467000115e-06,
      "loss": 0.1708,
      "step": 8600
    },
    {
      "epoch": 4.764512595837897,
      "grad_norm": 1.2859959602355957,
      "learning_rate": 6.741735403739901e-07,
      "loss": 0.1712,
      "step": 8700
    },
    {
      "epoch": 4.8192771084337345,
      "grad_norm": 0.8666063547134399,
      "learning_rate": 3.974338695163393e-07,
      "loss": 0.1657,
      "step": 8800
    },
    {
      "epoch": 4.874041621029573,
      "grad_norm": 0.6807364821434021,
      "learning_rate": 1.9319192943152986e-07,
      "loss": 0.1675,
      "step": 8900
    },
    {
      "epoch": 4.928806133625411,
      "grad_norm": 1.1967036724090576,
      "learning_rate": 6.174623445742155e-08,
      "loss": 0.1721,
      "step": 9000
    }
  ],
  "logging_steps": 100,
  "max_steps": 9130,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4.325169651624493e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}