{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9973634651600753,
  "eval_steps": 83,
  "global_step": 331,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0030131826741996233,
      "grad_norm": 7.092586040496826,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 2.9681,
      "step": 1
    },
    {
      "epoch": 0.0030131826741996233,
      "eval_loss": NaN,
      "eval_runtime": 92.9755,
      "eval_samples_per_second": 6.012,
      "eval_steps_per_second": 1.506,
      "step": 1
    },
    {
      "epoch": 0.006026365348399247,
      "grad_norm": 6.170314788818359,
      "learning_rate": 4.000000000000001e-06,
      "loss": 3.029,
      "step": 2
    },
    {
      "epoch": 0.00903954802259887,
      "grad_norm": 7.000847339630127,
      "learning_rate": 6e-06,
      "loss": 3.5581,
      "step": 3
    },
    {
      "epoch": 0.012052730696798493,
      "grad_norm": 7.161468029022217,
      "learning_rate": 8.000000000000001e-06,
      "loss": 3.5814,
      "step": 4
    },
    {
      "epoch": 0.015065913370998116,
      "grad_norm": 7.698644638061523,
      "learning_rate": 1e-05,
      "loss": 3.8585,
      "step": 5
    },
    {
      "epoch": 0.01807909604519774,
      "grad_norm": 8.074420928955078,
      "learning_rate": 1.2e-05,
      "loss": 4.001,
      "step": 6
    },
    {
      "epoch": 0.021092278719397364,
      "grad_norm": 7.968758583068848,
      "learning_rate": 1.4000000000000001e-05,
      "loss": 4.1068,
      "step": 7
    },
    {
      "epoch": 0.024105461393596987,
      "grad_norm": 8.725764274597168,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 3.6916,
      "step": 8
    },
    {
      "epoch": 0.02711864406779661,
      "grad_norm": 9.704715728759766,
      "learning_rate": 1.8e-05,
      "loss": 4.083,
      "step": 9
    },
    {
      "epoch": 0.030131826741996232,
      "grad_norm": 8.977635383605957,
      "learning_rate": 2e-05,
      "loss": 3.548,
      "step": 10
    },
    {
      "epoch": 0.03314500941619586,
      "grad_norm": 7.7479376792907715,
      "learning_rate": 2.2000000000000003e-05,
      "loss": 3.2281,
      "step": 11
    },
    {
      "epoch": 0.03615819209039548,
      "grad_norm": 6.718513011932373,
      "learning_rate": 2.4e-05,
      "loss": 3.0443,
      "step": 12
    },
    {
      "epoch": 0.039171374764595104,
      "grad_norm": 5.1653056144714355,
      "learning_rate": 2.6000000000000002e-05,
      "loss": 2.6208,
      "step": 13
    },
    {
      "epoch": 0.04218455743879473,
      "grad_norm": 4.840386867523193,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 2.3382,
      "step": 14
    },
    {
      "epoch": 0.04519774011299435,
      "grad_norm": 4.570226669311523,
      "learning_rate": 3e-05,
      "loss": 2.0896,
      "step": 15
    },
    {
      "epoch": 0.04821092278719397,
      "grad_norm": 5.179202079772949,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 2.0368,
      "step": 16
    },
    {
      "epoch": 0.051224105461393596,
      "grad_norm": 4.725672245025635,
      "learning_rate": 3.4000000000000007e-05,
      "loss": 1.8898,
      "step": 17
    },
    {
      "epoch": 0.05423728813559322,
      "grad_norm": 4.372887134552002,
      "learning_rate": 3.6e-05,
      "loss": 1.6599,
      "step": 18
    },
    {
      "epoch": 0.05725047080979284,
      "grad_norm": 4.00122594833374,
      "learning_rate": 3.8e-05,
      "loss": 1.4213,
      "step": 19
    },
    {
      "epoch": 0.060263653483992465,
      "grad_norm": 4.925197124481201,
      "learning_rate": 4e-05,
      "loss": 1.5454,
      "step": 20
    },
    {
      "epoch": 0.06327683615819209,
      "grad_norm": 9.11976146697998,
      "learning_rate": 4.2e-05,
      "loss": 2.1408,
      "step": 21
    },
    {
      "epoch": 0.06629001883239172,
      "grad_norm": 15.82564640045166,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 2.5483,
      "step": 22
    },
    {
      "epoch": 0.06930320150659133,
      "grad_norm": 15.502534866333008,
      "learning_rate": 4.600000000000001e-05,
      "loss": 1.8806,
      "step": 23
    },
    {
      "epoch": 0.07231638418079096,
      "grad_norm": 18.15461540222168,
      "learning_rate": 4.8e-05,
      "loss": 1.6405,
      "step": 24
    },
    {
      "epoch": 0.07532956685499058,
      "grad_norm": 23.61888313293457,
      "learning_rate": 5e-05,
      "loss": 1.2773,
      "step": 25
    },
    {
      "epoch": 0.07834274952919021,
      "grad_norm": 11.435036659240723,
      "learning_rate": 5.2000000000000004e-05,
      "loss": 1.3372,
      "step": 26
    },
    {
      "epoch": 0.08135593220338982,
      "grad_norm": 13.24569320678711,
      "learning_rate": 5.4000000000000005e-05,
      "loss": 1.3893,
      "step": 27
    },
    {
      "epoch": 0.08436911487758945,
      "grad_norm": 9.795696258544922,
      "learning_rate": 5.6000000000000006e-05,
      "loss": 1.3095,
      "step": 28
    },
    {
      "epoch": 0.08738229755178907,
      "grad_norm": 8.10338020324707,
      "learning_rate": 5.8e-05,
      "loss": 1.1856,
      "step": 29
    },
    {
      "epoch": 0.0903954802259887,
      "grad_norm": 5.573620796203613,
      "learning_rate": 6e-05,
      "loss": 1.0843,
      "step": 30
    },
    {
      "epoch": 0.09340866290018833,
      "grad_norm": 2.8769285678863525,
      "learning_rate": 6.2e-05,
      "loss": 0.9771,
      "step": 31
    },
    {
      "epoch": 0.09642184557438795,
      "grad_norm": 3.623781681060791,
      "learning_rate": 6.400000000000001e-05,
      "loss": 0.9225,
      "step": 32
    },
    {
      "epoch": 0.09943502824858758,
      "grad_norm": 3.1273627281188965,
      "learning_rate": 6.6e-05,
      "loss": 0.8808,
      "step": 33
    },
    {
      "epoch": 0.10244821092278719,
      "grad_norm": 2.4352469444274902,
      "learning_rate": 6.800000000000001e-05,
      "loss": 0.8043,
      "step": 34
    },
    {
      "epoch": 0.10546139359698682,
      "grad_norm": 2.5423271656036377,
      "learning_rate": 7e-05,
      "loss": 0.7235,
      "step": 35
    },
    {
      "epoch": 0.10847457627118644,
      "grad_norm": 2.7016396522521973,
      "learning_rate": 7.2e-05,
      "loss": 0.7387,
      "step": 36
    },
    {
      "epoch": 0.11148775894538607,
      "grad_norm": 3.2129478454589844,
      "learning_rate": 7.4e-05,
      "loss": 0.7625,
      "step": 37
    },
    {
      "epoch": 0.11450094161958568,
      "grad_norm": 2.897091865539551,
      "learning_rate": 7.6e-05,
      "loss": 0.7295,
      "step": 38
    },
    {
      "epoch": 0.11751412429378531,
      "grad_norm": 2.90976619720459,
      "learning_rate": 7.800000000000001e-05,
      "loss": 0.6797,
      "step": 39
    },
    {
      "epoch": 0.12052730696798493,
      "grad_norm": 2.8482582569122314,
      "learning_rate": 8e-05,
      "loss": 0.7018,
      "step": 40
    },
    {
      "epoch": 0.12354048964218456,
      "grad_norm": 2.0846457481384277,
      "learning_rate": 8.2e-05,
      "loss": 0.7266,
      "step": 41
    },
    {
      "epoch": 0.12655367231638417,
      "grad_norm": 2.0968143939971924,
      "learning_rate": 8.4e-05,
      "loss": 0.5808,
      "step": 42
    },
    {
      "epoch": 0.1295668549905838,
      "grad_norm": 2.948556423187256,
      "learning_rate": 8.6e-05,
      "loss": 0.7257,
      "step": 43
    },
    {
      "epoch": 0.13258003766478343,
      "grad_norm": 2.561649799346924,
      "learning_rate": 8.800000000000001e-05,
      "loss": 0.6828,
      "step": 44
    },
    {
      "epoch": 0.13559322033898305,
      "grad_norm": 2.9468488693237305,
      "learning_rate": 9e-05,
      "loss": 0.7191,
      "step": 45
    },
    {
      "epoch": 0.13860640301318267,
      "grad_norm": 4.2950825691223145,
      "learning_rate": 9.200000000000001e-05,
      "loss": 0.7896,
      "step": 46
    },
    {
      "epoch": 0.1416195856873823,
      "grad_norm": 8.009641647338867,
      "learning_rate": 9.4e-05,
      "loss": 1.2446,
      "step": 47
    },
    {
      "epoch": 0.14463276836158193,
      "grad_norm": 6.519936561584473,
      "learning_rate": 9.6e-05,
      "loss": 0.8017,
      "step": 48
    },
    {
      "epoch": 0.14764595103578154,
      "grad_norm": 5.3175530433654785,
      "learning_rate": 9.8e-05,
      "loss": 0.6389,
      "step": 49
    },
    {
      "epoch": 0.15065913370998116,
      "grad_norm": 7.807773113250732,
      "learning_rate": 0.0001,
      "loss": 0.5498,
      "step": 50
    },
    {
      "epoch": 0.1536723163841808,
      "grad_norm": 20.014755249023438,
      "learning_rate": 9.999687519737639e-05,
      "loss": 1.7708,
      "step": 51
    },
    {
      "epoch": 0.15668549905838042,
      "grad_norm": 14.703155517578125,
      "learning_rate": 9.998750118008115e-05,
      "loss": 1.7615,
      "step": 52
    },
    {
      "epoch": 0.15969868173258003,
      "grad_norm": 11.548951148986816,
      "learning_rate": 9.997187911979252e-05,
      "loss": 1.4554,
      "step": 53
    },
    {
      "epoch": 0.16271186440677965,
      "grad_norm": 8.304758071899414,
      "learning_rate": 9.995001096914461e-05,
      "loss": 1.2888,
      "step": 54
    },
    {
      "epoch": 0.1657250470809793,
      "grad_norm": 5.550910472869873,
      "learning_rate": 9.992189946148366e-05,
      "loss": 1.0796,
      "step": 55
    },
    {
      "epoch": 0.1687382297551789,
      "grad_norm": 3.1853792667388916,
      "learning_rate": 9.988754811052616e-05,
      "loss": 0.9803,
      "step": 56
    },
    {
      "epoch": 0.17175141242937852,
      "grad_norm": 2.140463352203369,
      "learning_rate": 9.984696120991978e-05,
      "loss": 0.8841,
      "step": 57
    },
    {
      "epoch": 0.17476459510357814,
      "grad_norm": 2.4900693893432617,
      "learning_rate": 9.980014383270668e-05,
      "loss": 0.8013,
      "step": 58
    },
    {
      "epoch": 0.17777777777777778,
      "grad_norm": 2.4014413356781006,
      "learning_rate": 9.974710183068935e-05,
      "loss": 0.7769,
      "step": 59
    },
    {
      "epoch": 0.1807909604519774,
      "grad_norm": 1.818248987197876,
      "learning_rate": 9.968784183369929e-05,
      "loss": 0.6339,
      "step": 60
    },
    {
      "epoch": 0.18380414312617702,
      "grad_norm": 2.038853406906128,
      "learning_rate": 9.962237124876828e-05,
      "loss": 0.7058,
      "step": 61
    },
    {
      "epoch": 0.18681732580037666,
      "grad_norm": 1.9188238382339478,
      "learning_rate": 9.955069825920249e-05,
      "loss": 0.6063,
      "step": 62
    },
    {
      "epoch": 0.18983050847457628,
      "grad_norm": 1.960972547531128,
      "learning_rate": 9.947283182355982e-05,
      "loss": 0.6681,
      "step": 63
    },
    {
      "epoch": 0.1928436911487759,
      "grad_norm": 1.8506237268447876,
      "learning_rate": 9.938878167452992e-05,
      "loss": 0.4687,
      "step": 64
    },
    {
      "epoch": 0.1958568738229755,
      "grad_norm": 1.7827894687652588,
      "learning_rate": 9.929855831771786e-05,
      "loss": 0.5907,
      "step": 65
    },
    {
      "epoch": 0.19887005649717515,
      "grad_norm": 1.788488745689392,
      "learning_rate": 9.92021730303309e-05,
      "loss": 0.5778,
      "step": 66
    },
    {
      "epoch": 0.20188323917137477,
      "grad_norm": 1.9174665212631226,
      "learning_rate": 9.909963785976903e-05,
      "loss": 0.5587,
      "step": 67
    },
    {
      "epoch": 0.20489642184557438,
      "grad_norm": 1.898708462715149,
      "learning_rate": 9.899096562211902e-05,
      "loss": 0.4941,
      "step": 68
    },
    {
      "epoch": 0.207909604519774,
      "grad_norm": 2.086160659790039,
      "learning_rate": 9.887616990055262e-05,
      "loss": 0.5702,
      "step": 69
    },
    {
      "epoch": 0.21092278719397364,
      "grad_norm": 3.054201126098633,
      "learning_rate": 9.875526504362869e-05,
      "loss": 0.6915,
      "step": 70
    },
    {
      "epoch": 0.21393596986817326,
      "grad_norm": 4.820160388946533,
      "learning_rate": 9.86282661634998e-05,
      "loss": 1.0488,
      "step": 71
    },
    {
      "epoch": 0.21694915254237288,
      "grad_norm": 4.556093692779541,
      "learning_rate": 9.849518913402334e-05,
      "loss": 0.7673,
      "step": 72
    },
    {
      "epoch": 0.2199623352165725,
      "grad_norm": 3.98148250579834,
      "learning_rate": 9.835605058877729e-05,
      "loss": 0.7301,
      "step": 73
    },
    {
      "epoch": 0.22297551789077213,
      "grad_norm": 4.126860618591309,
      "learning_rate": 9.821086791898134e-05,
      "loss": 0.8709,
      "step": 74
    },
    {
      "epoch": 0.22598870056497175,
      "grad_norm": 4.8316216468811035,
      "learning_rate": 9.805965927132295e-05,
      "loss": 0.7508,
      "step": 75
    },
    {
      "epoch": 0.22900188323917137,
      "grad_norm": 7.367558479309082,
      "learning_rate": 9.79024435456893e-05,
      "loss": 1.4311,
      "step": 76
    },
    {
      "epoch": 0.232015065913371,
      "grad_norm": 6.81304407119751,
      "learning_rate": 9.773924039280487e-05,
      "loss": 1.3631,
      "step": 77
    },
    {
      "epoch": 0.23502824858757063,
      "grad_norm": 5.532495021820068,
      "learning_rate": 9.75700702117753e-05,
      "loss": 1.2462,
      "step": 78
    },
    {
      "epoch": 0.23804143126177024,
      "grad_norm": 3.750790596008301,
      "learning_rate": 9.739495414753753e-05,
      "loss": 0.9057,
      "step": 79
    },
    {
      "epoch": 0.24105461393596986,
      "grad_norm": 2.6967413425445557,
      "learning_rate": 9.721391408821711e-05,
      "loss": 0.8322,
      "step": 80
    },
    {
      "epoch": 0.2440677966101695,
      "grad_norm": 1.915755271911621,
      "learning_rate": 9.702697266239212e-05,
      "loss": 0.6605,
      "step": 81
    },
    {
      "epoch": 0.24708097928436912,
      "grad_norm": 1.681639552116394,
      "learning_rate": 9.683415323626485e-05,
      "loss": 0.6949,
      "step": 82
    },
    {
      "epoch": 0.25009416195856876,
      "grad_norm": 1.6740490198135376,
      "learning_rate": 9.663547991074127e-05,
      "loss": 0.751,
      "step": 83
    },
    {
      "epoch": 0.25009416195856876,
      "eval_loss": NaN,
      "eval_runtime": 92.9339,
      "eval_samples_per_second": 6.015,
      "eval_steps_per_second": 1.506,
      "step": 83
    },
    {
      "epoch": 0.25310734463276835,
      "grad_norm": 1.4990551471710205,
      "learning_rate": 9.643097751841854e-05,
      "loss": 0.5189,
      "step": 84
    },
    {
      "epoch": 0.256120527306968,
      "grad_norm": 1.4080356359481812,
      "learning_rate": 9.622067162048112e-05,
      "loss": 0.5112,
      "step": 85
    },
    {
      "epoch": 0.2591337099811676,
      "grad_norm": 1.3661057949066162,
      "learning_rate": 9.600458850350588e-05,
      "loss": 0.4688,
      "step": 86
    },
    {
      "epoch": 0.2621468926553672,
      "grad_norm": 1.570552945137024,
      "learning_rate": 9.578275517617645e-05,
      "loss": 0.5058,
      "step": 87
    },
    {
      "epoch": 0.26516007532956687,
      "grad_norm": 1.6037708520889282,
      "learning_rate": 9.555519936590738e-05,
      "loss": 0.5201,
      "step": 88
    },
    {
      "epoch": 0.26817325800376646,
      "grad_norm": 1.5268930196762085,
      "learning_rate": 9.532194951537838e-05,
      "loss": 0.4661,
      "step": 89
    },
    {
      "epoch": 0.2711864406779661,
      "grad_norm": 1.7837523221969604,
      "learning_rate": 9.508303477897924e-05,
      "loss": 0.5005,
      "step": 90
    },
    {
      "epoch": 0.27419962335216574,
      "grad_norm": 1.3590326309204102,
      "learning_rate": 9.483848501916578e-05,
      "loss": 0.3866,
      "step": 91
    },
    {
      "epoch": 0.27721280602636533,
      "grad_norm": 1.5031671524047852,
      "learning_rate": 9.458833080272722e-05,
      "loss": 0.3559,
      "step": 92
    },
    {
      "epoch": 0.280225988700565,
      "grad_norm": 1.2212880849838257,
      "learning_rate": 9.433260339696563e-05,
      "loss": 0.3586,
      "step": 93
    },
    {
      "epoch": 0.2832391713747646,
      "grad_norm": 1.8385019302368164,
      "learning_rate": 9.407133476578778e-05,
      "loss": 0.4775,
      "step": 94
    },
    {
      "epoch": 0.2862523540489642,
      "grad_norm": 2.6899161338806152,
      "learning_rate": 9.38045575657098e-05,
      "loss": 0.6809,
      "step": 95
    },
    {
      "epoch": 0.28926553672316385,
      "grad_norm": 3.9981398582458496,
      "learning_rate": 9.353230514177552e-05,
      "loss": 0.8967,
      "step": 96
    },
    {
      "epoch": 0.29227871939736344,
      "grad_norm": 3.7616143226623535,
      "learning_rate": 9.325461152338846e-05,
      "loss": 0.9173,
      "step": 97
    },
    {
      "epoch": 0.2952919020715631,
      "grad_norm": 3.3938989639282227,
      "learning_rate": 9.297151142005851e-05,
      "loss": 0.7849,
      "step": 98
    },
    {
      "epoch": 0.2983050847457627,
      "grad_norm": 3.3373446464538574,
      "learning_rate": 9.268304021706349e-05,
      "loss": 0.6619,
      "step": 99
    },
    {
      "epoch": 0.3013182674199623,
      "grad_norm": 4.476459503173828,
      "learning_rate": 9.23892339710263e-05,
      "loss": 0.7758,
      "step": 100
    },
    {
      "epoch": 0.30433145009416196,
      "grad_norm": 2.75358510017395,
      "learning_rate": 9.209012940540805e-05,
      "loss": 0.7565,
      "step": 101
    },
    {
      "epoch": 0.3073446327683616,
      "grad_norm": 2.192662000656128,
      "learning_rate": 9.178576390591802e-05,
      "loss": 0.6634,
      "step": 102
    },
    {
      "epoch": 0.3103578154425612,
      "grad_norm": 2.3334836959838867,
      "learning_rate": 9.147617551584066e-05,
      "loss": 0.6961,
      "step": 103
    },
    {
      "epoch": 0.31337099811676083,
      "grad_norm": 1.9057625532150269,
      "learning_rate": 9.116140293128051e-05,
      "loss": 0.5762,
      "step": 104
    },
    {
      "epoch": 0.3163841807909605,
      "grad_norm": 1.5543274879455566,
      "learning_rate": 9.084148549632547e-05,
      "loss": 0.5249,
      "step": 105
    },
    {
      "epoch": 0.31939736346516007,
      "grad_norm": 1.3116902112960815,
      "learning_rate": 9.051646319812918e-05,
      "loss": 0.4895,
      "step": 106
    },
    {
      "epoch": 0.3224105461393597,
      "grad_norm": 1.6137094497680664,
      "learning_rate": 9.018637666191283e-05,
      "loss": 0.5036,
      "step": 107
    },
    {
      "epoch": 0.3254237288135593,
      "grad_norm": 1.4955766201019287,
      "learning_rate": 8.985126714588738e-05,
      "loss": 0.4571,
      "step": 108
    },
    {
      "epoch": 0.32843691148775894,
      "grad_norm": 1.5371748208999634,
      "learning_rate": 8.951117653609666e-05,
      "loss": 0.4958,
      "step": 109
    },
    {
      "epoch": 0.3314500941619586,
      "grad_norm": 1.2266839742660522,
      "learning_rate": 8.916614734118184e-05,
      "loss": 0.4171,
      "step": 110
    },
    {
      "epoch": 0.3344632768361582,
      "grad_norm": 1.21657133102417,
      "learning_rate": 8.881622268706825e-05,
      "loss": 0.421,
      "step": 111
    },
    {
      "epoch": 0.3374764595103578,
      "grad_norm": 1.2184901237487793,
      "learning_rate": 8.8461446311575e-05,
      "loss": 0.4307,
      "step": 112
    },
    {
      "epoch": 0.34048964218455746,
      "grad_norm": 1.5124021768569946,
      "learning_rate": 8.810186255894803e-05,
      "loss": 0.4865,
      "step": 113
    },
    {
      "epoch": 0.34350282485875705,
      "grad_norm": 1.078994870185852,
      "learning_rate": 8.773751637431748e-05,
      "loss": 0.3592,
      "step": 114
    },
    {
      "epoch": 0.3465160075329567,
      "grad_norm": 1.2173560857772827,
      "learning_rate": 8.736845329807993e-05,
      "loss": 0.3757,
      "step": 115
    },
    {
      "epoch": 0.3495291902071563,
      "grad_norm": 1.4223103523254395,
      "learning_rate": 8.69947194602061e-05,
      "loss": 0.4002,
      "step": 116
    },
    {
      "epoch": 0.3525423728813559,
      "grad_norm": 1.2369580268859863,
      "learning_rate": 8.66163615744751e-05,
      "loss": 0.3891,
      "step": 117
    },
    {
      "epoch": 0.35555555555555557,
      "grad_norm": 1.2306034564971924,
      "learning_rate": 8.623342693263548e-05,
      "loss": 0.3176,
      "step": 118
    },
    {
      "epoch": 0.35856873822975516,
      "grad_norm": 1.20809805393219,
      "learning_rate": 8.584596339849417e-05,
      "loss": 0.3715,
      "step": 119
    },
    {
      "epoch": 0.3615819209039548,
      "grad_norm": 1.59524405002594,
      "learning_rate": 8.545401940193392e-05,
      "loss": 0.4539,
      "step": 120
    },
    {
      "epoch": 0.36459510357815444,
      "grad_norm": 2.4288361072540283,
      "learning_rate": 8.505764393285984e-05,
      "loss": 0.7094,
      "step": 121
    },
    {
      "epoch": 0.36760828625235403,
      "grad_norm": 2.587125778198242,
      "learning_rate": 8.46568865350762e-05,
      "loss": 0.7052,
      "step": 122
    },
    {
      "epoch": 0.3706214689265537,
      "grad_norm": 3.610764980316162,
      "learning_rate": 8.425179730009368e-05,
      "loss": 0.6835,
      "step": 123
    },
    {
      "epoch": 0.3736346516007533,
      "grad_norm": 2.254451274871826,
      "learning_rate": 8.384242686086848e-05,
      "loss": 0.5733,
      "step": 124
    },
    {
      "epoch": 0.3766478342749529,
      "grad_norm": 3.2182092666625977,
      "learning_rate": 8.342882638547351e-05,
      "loss": 0.7416,
      "step": 125
    },
    {
      "epoch": 0.37966101694915255,
      "grad_norm": 2.0895962715148926,
      "learning_rate": 8.301104757070274e-05,
      "loss": 0.611,
      "step": 126
    },
    {
      "epoch": 0.38267419962335214,
      "grad_norm": 1.9307582378387451,
      "learning_rate": 8.258914263560971e-05,
      "loss": 0.6099,
      "step": 127
    },
    {
      "epoch": 0.3856873822975518,
      "grad_norm": 1.7885206937789917,
      "learning_rate": 8.216316431498028e-05,
      "loss": 0.4832,
      "step": 128
    },
    {
      "epoch": 0.3887005649717514,
      "grad_norm": 1.2265185117721558,
      "learning_rate": 8.173316585274145e-05,
      "loss": 0.4042,
      "step": 129
    },
    {
      "epoch": 0.391713747645951,
      "grad_norm": 1.369534969329834,
      "learning_rate": 8.129920099530607e-05,
      "loss": 0.4681,
      "step": 130
    },
    {
      "epoch": 0.39472693032015066,
      "grad_norm": 1.340951681137085,
      "learning_rate": 8.086132398485524e-05,
      "loss": 0.4775,
      "step": 131
    },
    {
      "epoch": 0.3977401129943503,
      "grad_norm": 1.1047234535217285,
      "learning_rate": 8.041958955255814e-05,
      "loss": 0.4508,
      "step": 132
    },
    {
      "epoch": 0.4007532956685499,
      "grad_norm": 1.0403156280517578,
      "learning_rate": 7.99740529117313e-05,
      "loss": 0.4217,
      "step": 133
    },
    {
      "epoch": 0.40376647834274954,
      "grad_norm": 0.9500618577003479,
      "learning_rate": 7.952476975093729e-05,
      "loss": 0.34,
      "step": 134
    },
    {
      "epoch": 0.4067796610169492,
      "grad_norm": 1.1021428108215332,
      "learning_rate": 7.907179622702408e-05,
      "loss": 0.392,
      "step": 135
    },
    {
      "epoch": 0.40979284369114877,
      "grad_norm": 1.2623156309127808,
      "learning_rate": 7.861518895810596e-05,
      "loss": 0.4238,
      "step": 136
    },
    {
      "epoch": 0.4128060263653484,
      "grad_norm": 1.395652413368225,
      "learning_rate": 7.815500501648653e-05,
      "loss": 0.4211,
      "step": 137
    },
    {
      "epoch": 0.415819209039548,
      "grad_norm": 1.3175368309020996,
      "learning_rate": 7.769130192152538e-05,
      "loss": 0.415,
      "step": 138
    },
    {
      "epoch": 0.41883239171374764,
      "grad_norm": 1.3882197141647339,
      "learning_rate": 7.722413763244838e-05,
      "loss": 0.422,
      "step": 139
    },
    {
      "epoch": 0.4218455743879473,
      "grad_norm": 1.396023154258728,
      "learning_rate": 7.675357054110336e-05,
      "loss": 0.466,
      "step": 140
    },
    {
      "epoch": 0.4248587570621469,
      "grad_norm": 1.0779083967208862,
      "learning_rate": 7.627965946466166e-05,
      "loss": 0.3576,
      "step": 141
    },
    {
      "epoch": 0.4278719397363465,
      "grad_norm": 1.2511008977890015,
      "learning_rate": 7.580246363826621e-05,
      "loss": 0.301,
      "step": 142
    },
    {
      "epoch": 0.43088512241054616,
      "grad_norm": 1.13119375705719,
      "learning_rate": 7.532204270762786e-05,
      "loss": 0.3332,
      "step": 143
    },
    {
      "epoch": 0.43389830508474575,
      "grad_norm": 2.0195682048797607,
      "learning_rate": 7.483845672156998e-05,
      "loss": 0.6475,
      "step": 144
    },
    {
      "epoch": 0.4369114877589454,
      "grad_norm": 2.429945230484009,
      "learning_rate": 7.435176612452286e-05,
      "loss": 0.7177,
      "step": 145
    },
    {
      "epoch": 0.439924670433145,
      "grad_norm": 3.0756828784942627,
      "learning_rate": 7.386203174896872e-05,
      "loss": 0.741,
      "step": 146
    },
    {
      "epoch": 0.4429378531073446,
      "grad_norm": 3.7236998081207275,
      "learning_rate": 7.336931480783801e-05,
      "loss": 0.7999,
      "step": 147
    },
    {
      "epoch": 0.44595103578154427,
      "grad_norm": 2.7121517658233643,
      "learning_rate": 7.287367688685835e-05,
      "loss": 0.6044,
      "step": 148
    },
    {
      "epoch": 0.44896421845574386,
      "grad_norm": 3.661588668823242,
      "learning_rate": 7.237517993685678e-05,
      "loss": 0.5553,
      "step": 149
    },
    {
      "epoch": 0.4519774011299435,
      "grad_norm": 4.68520975112915,
      "learning_rate": 7.187388626601637e-05,
      "loss": 0.411,
      "step": 150
    },
    {
      "epoch": 0.45499058380414314,
      "grad_norm": 1.866217017173767,
      "learning_rate": 7.136985853208824e-05,
      "loss": 0.5442,
      "step": 151
    },
    {
      "epoch": 0.45800376647834273,
      "grad_norm": 1.6526014804840088,
      "learning_rate": 7.086315973455981e-05,
      "loss": 0.5071,
      "step": 152
    },
    {
      "epoch": 0.4610169491525424,
      "grad_norm": 1.3213937282562256,
      "learning_rate": 7.035385320678036e-05,
      "loss": 0.4598,
      "step": 153
    },
    {
      "epoch": 0.464030131826742,
      "grad_norm": 0.959452211856842,
      "learning_rate": 6.984200260804484e-05,
      "loss": 0.3485,
      "step": 154
    },
    {
      "epoch": 0.4670433145009416,
      "grad_norm": 1.0355703830718994,
      "learning_rate": 6.932767191563703e-05,
      "loss": 0.3648,
      "step": 155
    },
    {
      "epoch": 0.47005649717514125,
      "grad_norm": 0.9991386532783508,
      "learning_rate": 6.881092541683278e-05,
      "loss": 0.3535,
      "step": 156
    },
    {
      "epoch": 0.47306967984934084,
      "grad_norm": 1.0915963649749756,
      "learning_rate": 6.829182770086474e-05,
      "loss": 0.3682,
      "step": 157
    },
    {
      "epoch": 0.4760828625235405,
      "grad_norm": 0.9837580323219299,
      "learning_rate": 6.777044365084907e-05,
      "loss": 0.3703,
      "step": 158
    },
    {
      "epoch": 0.47909604519774013,
      "grad_norm": 1.258581280708313,
      "learning_rate": 6.724683843567568e-05,
      "loss": 0.4104,
      "step": 159
    },
    {
      "epoch": 0.4821092278719397,
      "grad_norm": 0.832224428653717,
      "learning_rate": 6.672107750186255e-05,
      "loss": 0.2934,
      "step": 160
    },
    {
      "epoch": 0.48512241054613936,
      "grad_norm": 0.881106436252594,
      "learning_rate": 6.619322656537552e-05,
      "loss": 0.3127,
      "step": 161
    },
    {
      "epoch": 0.488135593220339,
      "grad_norm": 1.257350206375122,
      "learning_rate": 6.566335160341424e-05,
      "loss": 0.3804,
      "step": 162
    },
    {
      "epoch": 0.4911487758945386,
      "grad_norm": 1.6826411485671997,
      "learning_rate": 6.513151884616556e-05,
      "loss": 0.4807,
      "step": 163
    },
    {
      "epoch": 0.49416195856873824,
      "grad_norm": 1.31766676902771,
      "learning_rate": 6.459779476852528e-05,
      "loss": 0.3872,
      "step": 164
    },
    {
      "epoch": 0.4971751412429379,
      "grad_norm": 1.438594102859497,
      "learning_rate": 6.406224608178932e-05,
      "loss": 0.3868,
      "step": 165
    },
    {
      "epoch": 0.5001883239171375,
      "grad_norm": 1.198364496231079,
      "learning_rate": 6.352493972531534e-05,
      "loss": 0.3361,
      "step": 166
    },
    {
      "epoch": 0.5001883239171375,
      "eval_loss": NaN,
      "eval_runtime": 93.1419,
      "eval_samples_per_second": 6.002,
      "eval_steps_per_second": 1.503,
      "step": 166
    },
    {
      "epoch": 0.5032015065913371,
      "grad_norm": 1.2678625583648682,
      "learning_rate": 6.298594285815584e-05,
      "loss": 0.2982,
      "step": 167
    },
    {
      "epoch": 0.5062146892655367,
      "grad_norm": 1.2661633491516113,
      "learning_rate": 6.244532285066382e-05,
      "loss": 0.3381,
      "step": 168
    },
    {
      "epoch": 0.5092278719397364,
      "grad_norm": 1.160649061203003,
      "learning_rate": 6.190314727607196e-05,
      "loss": 0.3428,
      "step": 169
    },
    {
      "epoch": 0.512241054613936,
      "grad_norm": 2.1445140838623047,
      "learning_rate": 6.13594839020466e-05,
      "loss": 0.6844,
      "step": 170
    },
    {
      "epoch": 0.5152542372881356,
      "grad_norm": 1.9892338514328003,
      "learning_rate": 6.0814400682217234e-05,
      "loss": 0.559,
      "step": 171
    },
    {
      "epoch": 0.5182674199623352,
      "grad_norm": 3.0277585983276367,
      "learning_rate": 6.026796574768288e-05,
      "loss": 0.6495,
      "step": 172
    },
    {
      "epoch": 0.5212806026365349,
      "grad_norm": 3.0984301567077637,
      "learning_rate": 5.972024739849622e-05,
      "loss": 0.4114,
      "step": 173
    },
    {
      "epoch": 0.5242937853107345,
      "grad_norm": 3.296858549118042,
      "learning_rate": 5.917131409512663e-05,
      "loss": 0.5272,
      "step": 174
    },
    {
      "epoch": 0.527306967984934,
      "grad_norm": 4.093991756439209,
      "learning_rate": 5.862123444990318e-05,
      "loss": 0.5134,
      "step": 175
    },
    {
      "epoch": 0.5303201506591337,
      "grad_norm": 1.48560631275177,
      "learning_rate": 5.807007721843861e-05,
      "loss": 0.5482,
      "step": 176
    },
    {
      "epoch": 0.5333333333333333,
      "grad_norm": 1.6580932140350342,
      "learning_rate": 5.751791129103544e-05,
      "loss": 0.5894,
      "step": 177
    },
    {
      "epoch": 0.5363465160075329,
      "grad_norm": 1.1557704210281372,
      "learning_rate": 5.696480568407523e-05,
      "loss": 0.4388,
      "step": 178
    },
    {
      "epoch": 0.5393596986817326,
      "grad_norm": 1.1503976583480835,
      "learning_rate": 5.6410829531392006e-05,
      "loss": 0.3841,
      "step": 179
    },
    {
      "epoch": 0.5423728813559322,
      "grad_norm": 1.0935317277908325,
      "learning_rate": 5.585605207563124e-05,
      "loss": 0.38,
      "step": 180
    },
    {
      "epoch": 0.5453860640301318,
      "grad_norm": 1.0500688552856445,
      "learning_rate": 5.5300542659594854e-05,
      "loss": 0.37,
      "step": 181
    },
    {
      "epoch": 0.5483992467043315,
      "grad_norm": 0.910677433013916,
      "learning_rate": 5.47443707175741e-05,
      "loss": 0.335,
      "step": 182
    },
    {
      "epoch": 0.5514124293785311,
      "grad_norm": 1.0660496950149536,
      "learning_rate": 5.418760576667071e-05,
      "loss": 0.3516,
      "step": 183
    },
    {
      "epoch": 0.5544256120527307,
      "grad_norm": 0.8869411945343018,
      "learning_rate": 5.3630317398107864e-05,
      "loss": 0.3249,
      "step": 184
    },
    {
      "epoch": 0.5574387947269304,
      "grad_norm": 1.6419062614440918,
      "learning_rate": 5.3072575268531835e-05,
      "loss": 0.3531,
      "step": 185
    },
    {
      "epoch": 0.56045197740113,
      "grad_norm": 0.8144354820251465,
      "learning_rate": 5.2514449091305375e-05,
      "loss": 0.3002,
      "step": 186
    },
    {
      "epoch": 0.5634651600753295,
      "grad_norm": 1.1171809434890747,
      "learning_rate": 5.195600862779421e-05,
      "loss": 0.3776,
      "step": 187
    },
    {
      "epoch": 0.5664783427495292,
      "grad_norm": 1.14911687374115,
      "learning_rate": 5.139732367864736e-05,
      "loss": 0.3215,
      "step": 188
    },
    {
      "epoch": 0.5694915254237288,
      "grad_norm": 0.928807258605957,
      "learning_rate": 5.083846407507263e-05,
      "loss": 0.2747,
      "step": 189
    },
    {
      "epoch": 0.5725047080979284,
      "grad_norm": 1.024272084236145,
      "learning_rate": 5.0279499670108245e-05,
      "loss": 0.3024,
      "step": 190
    },
    {
      "epoch": 0.5755178907721281,
      "grad_norm": 1.2534008026123047,
      "learning_rate": 4.972050032989175e-05,
      "loss": 0.3704,
      "step": 191
    },
    {
      "epoch": 0.5785310734463277,
      "grad_norm": 0.9777889847755432,
      "learning_rate": 4.9161535924927374e-05,
      "loss": 0.271,
      "step": 192
    },
    {
      "epoch": 0.5815442561205273,
      "grad_norm": 1.0151127576828003,
      "learning_rate": 4.860267632135265e-05,
      "loss": 0.3179,
      "step": 193
    },
    {
      "epoch": 0.5845574387947269,
      "grad_norm": 0.9281553030014038,
      "learning_rate": 4.80439913722058e-05,
      "loss": 0.2508,
      "step": 194
    },
    {
      "epoch": 0.5875706214689266,
      "grad_norm": 1.6091618537902832,
      "learning_rate": 4.748555090869464e-05,
      "loss": 0.3701,
      "step": 195
    },
    {
      "epoch": 0.5905838041431262,
      "grad_norm": 2.309112548828125,
      "learning_rate": 4.692742473146818e-05,
      "loss": 0.5701,
      "step": 196
    },
    {
      "epoch": 0.5935969868173258,
      "grad_norm": 1.9035515785217285,
      "learning_rate": 4.636968260189214e-05,
      "loss": 0.5654,
      "step": 197
    },
    {
      "epoch": 0.5966101694915255,
      "grad_norm": 2.222712516784668,
      "learning_rate": 4.5812394233329305e-05,
      "loss": 0.51,
      "step": 198
    },
    {
      "epoch": 0.599623352165725,
      "grad_norm": 3.1770148277282715,
      "learning_rate": 4.525562928242592e-05,
      "loss": 0.6322,
      "step": 199
    },
    {
      "epoch": 0.6026365348399246,
      "grad_norm": 3.6650121212005615,
      "learning_rate": 4.4699457340405164e-05,
      "loss": 0.4471,
      "step": 200
    },
    {
      "epoch": 0.6056497175141243,
      "grad_norm": 1.350001573562622,
      "learning_rate": 4.414394792436877e-05,
      "loss": 0.4445,
      "step": 201
    },
    {
      "epoch": 0.6086629001883239,
      "grad_norm": 1.3186067342758179,
      "learning_rate": 4.3589170468607985e-05,
      "loss": 0.4268,
      "step": 202
    },
    {
      "epoch": 0.6116760828625235,
      "grad_norm": 1.0820094347000122,
      "learning_rate": 4.3035194315924785e-05,
      "loss": 0.349,
      "step": 203
    },
    {
      "epoch": 0.6146892655367232,
      "grad_norm": 1.016855001449585,
      "learning_rate": 4.248208870896456e-05,
      "loss": 0.3657,
      "step": 204
    },
    {
      "epoch": 0.6177024482109228,
      "grad_norm": 0.7627567648887634,
      "learning_rate": 4.192992278156141e-05,
      "loss": 0.282,
      "step": 205
    },
    {
      "epoch": 0.6207156308851224,
      "grad_norm": 0.8734930753707886,
      "learning_rate": 4.1378765550096835e-05,
      "loss": 0.3205,
      "step": 206
    },
    {
      "epoch": 0.6237288135593221,
      "grad_norm": 0.9233391880989075,
      "learning_rate": 4.082868590487339e-05,
      "loss": 0.2682,
      "step": 207
    },
    {
      "epoch": 0.6267419962335217,
      "grad_norm": 0.7341740131378174,
      "learning_rate": 4.027975260150381e-05,
      "loss": 0.2488,
      "step": 208
    },
    {
      "epoch": 0.6297551789077213,
      "grad_norm": 0.8570201992988586,
      "learning_rate": 3.973203425231715e-05,
      "loss": 0.2644,
      "step": 209
    },
    {
      "epoch": 0.632768361581921,
      "grad_norm": 0.8284196853637695,
      "learning_rate": 3.918559931778277e-05,
      "loss": 0.3093,
      "step": 210
    },
    {
      "epoch": 0.6357815442561205,
      "grad_norm": 0.7870326638221741,
      "learning_rate": 3.8640516097953405e-05,
      "loss": 0.2577,
      "step": 211
    },
    {
      "epoch": 0.6387947269303201,
      "grad_norm": 0.9884381294250488,
      "learning_rate": 3.809685272392804e-05,
      "loss": 0.3252,
      "step": 212
    },
    {
      "epoch": 0.6418079096045197,
      "grad_norm": 1.1668404340744019,
      "learning_rate": 3.755467714933619e-05,
      "loss": 0.3414,
      "step": 213
    },
    {
      "epoch": 0.6448210922787194,
      "grad_norm": 1.1960536241531372,
      "learning_rate": 3.701405714184416e-05,
      "loss": 0.3029,
      "step": 214
    },
    {
      "epoch": 0.647834274952919,
      "grad_norm": 0.7987526059150696,
      "learning_rate": 3.647506027468467e-05,
      "loss": 0.2501,
      "step": 215
    },
    {
      "epoch": 0.6508474576271186,
      "grad_norm": 1.0721232891082764,
      "learning_rate": 3.59377539182107e-05,
      "loss": 0.3132,
      "step": 216
    },
    {
      "epoch": 0.6538606403013183,
      "grad_norm": 0.9739212393760681,
      "learning_rate": 3.5402205231474736e-05,
      "loss": 0.2644,
      "step": 217
    },
    {
      "epoch": 0.6568738229755179,
      "grad_norm": 1.1942991018295288,
      "learning_rate": 3.486848115383445e-05,
      "loss": 0.3206,
      "step": 218
    },
    {
      "epoch": 0.6598870056497175,
      "grad_norm": 1.3838952779769897,
      "learning_rate": 3.4336648396585776e-05,
      "loss": 0.3569,
      "step": 219
    },
    {
      "epoch": 0.6629001883239172,
      "grad_norm": 2.448126792907715,
      "learning_rate": 3.380677343462447e-05,
      "loss": 0.5818,
      "step": 220
    },
    {
      "epoch": 0.6659133709981168,
      "grad_norm": 1.8376816511154175,
      "learning_rate": 3.327892249813745e-05,
      "loss": 0.4343,
      "step": 221
    },
    {
      "epoch": 0.6689265536723163,
      "grad_norm": 2.102494478225708,
      "learning_rate": 3.275316156432434e-05,
      "loss": 0.4626,
      "step": 222
    },
    {
      "epoch": 0.671939736346516,
      "grad_norm": 2.167078733444214,
      "learning_rate": 3.2229556349150945e-05,
      "loss": 0.4407,
      "step": 223
    },
    {
      "epoch": 0.6749529190207156,
      "grad_norm": 2.149308204650879,
      "learning_rate": 3.170817229913526e-05,
      "loss": 0.3198,
      "step": 224
    },
    {
      "epoch": 0.6779661016949152,
      "grad_norm": 4.306909084320068,
      "learning_rate": 3.118907458316722e-05,
      "loss": 0.5187,
      "step": 225
    },
    {
      "epoch": 0.6809792843691149,
      "grad_norm": 1.0858993530273438,
      "learning_rate": 3.067232808436299e-05,
      "loss": 0.3973,
      "step": 226
    },
    {
      "epoch": 0.6839924670433145,
      "grad_norm": 0.9466880559921265,
      "learning_rate": 3.0157997391955172e-05,
      "loss": 0.2911,
      "step": 227
    },
    {
      "epoch": 0.6870056497175141,
      "grad_norm": 1.053808569908142,
      "learning_rate": 2.964614679321966e-05,
      "loss": 0.3425,
      "step": 228
    },
    {
      "epoch": 0.6900188323917138,
      "grad_norm": 1.020087718963623,
      "learning_rate": 2.913684026544021e-05,
      "loss": 0.3171,
      "step": 229
    },
    {
      "epoch": 0.6930320150659134,
      "grad_norm": 0.9112816452980042,
      "learning_rate": 2.8630141467911775e-05,
      "loss": 0.289,
      "step": 230
    },
    {
      "epoch": 0.696045197740113,
      "grad_norm": 0.9472637176513672,
      "learning_rate": 2.812611373398365e-05,
      "loss": 0.2909,
      "step": 231
    },
    {
      "epoch": 0.6990583804143126,
      "grad_norm": 0.8144400715827942,
      "learning_rate": 2.762482006314324e-05,
      "loss": 0.2527,
      "step": 232
    },
    {
      "epoch": 0.7020715630885123,
      "grad_norm": 0.8899109363555908,
      "learning_rate": 2.712632311314165e-05,
      "loss": 0.2814,
      "step": 233
    },
    {
      "epoch": 0.7050847457627119,
      "grad_norm": 0.8338634967803955,
      "learning_rate": 2.6630685192161992e-05,
      "loss": 0.2684,
      "step": 234
    },
    {
      "epoch": 0.7080979284369114,
      "grad_norm": 1.1928447484970093,
      "learning_rate": 2.6137968251031287e-05,
      "loss": 0.327,
      "step": 235
    },
    {
      "epoch": 0.7111111111111111,
      "grad_norm": 0.9519332647323608,
      "learning_rate": 2.5648233875477157e-05,
      "loss": 0.2797,
      "step": 236
    },
    {
      "epoch": 0.7141242937853107,
      "grad_norm": 1.1364781856536865,
      "learning_rate": 2.5161543278430054e-05,
      "loss": 0.3121,
      "step": 237
    },
    {
      "epoch": 0.7171374764595103,
      "grad_norm": 1.0575398206710815,
      "learning_rate": 2.4677957292372167e-05,
      "loss": 0.2866,
      "step": 238
    },
    {
      "epoch": 0.72015065913371,
      "grad_norm": 0.7296847105026245,
      "learning_rate": 2.419753636173379e-05,
      "loss": 0.2432,
      "step": 239
    },
    {
      "epoch": 0.7231638418079096,
      "grad_norm": 0.7573246955871582,
      "learning_rate": 2.3720340535338348e-05,
      "loss": 0.2545,
      "step": 240
    },
    {
      "epoch": 0.7261770244821092,
      "grad_norm": 0.8128641247749329,
      "learning_rate": 2.3246429458896634e-05,
      "loss": 0.2548,
      "step": 241
    },
    {
      "epoch": 0.7291902071563089,
      "grad_norm": 0.8031213283538818,
      "learning_rate": 2.2775862367551644e-05,
      "loss": 0.2509,
      "step": 242
    },
    {
      "epoch": 0.7322033898305085,
      "grad_norm": 0.9279189109802246,
      "learning_rate": 2.2308698078474645e-05,
      "loss": 0.264,
      "step": 243
    },
    {
      "epoch": 0.7352165725047081,
      "grad_norm": 1.0536640882492065,
      "learning_rate": 2.1844994983513467e-05,
      "loss": 0.3295,
      "step": 244
    },
    {
      "epoch": 0.7382297551789078,
      "grad_norm": 2.177769422531128,
      "learning_rate": 2.1384811041894055e-05,
      "loss": 0.4975,
      "step": 245
    },
    {
      "epoch": 0.7412429378531074,
      "grad_norm": 1.5739905834197998,
      "learning_rate": 2.0928203772975917e-05,
      "loss": 0.4395,
      "step": 246
    },
    {
      "epoch": 0.7442561205273069,
      "grad_norm": 1.7161879539489746,
      "learning_rate": 2.0475230249062725e-05,
      "loss": 0.3947,
      "step": 247
    },
    {
      "epoch": 0.7472693032015066,
      "grad_norm": 3.185561180114746,
      "learning_rate": 2.0025947088268717e-05,
      "loss": 0.6166,
      "step": 248
    },
    {
      "epoch": 0.7502824858757062,
      "grad_norm": 2.201467275619507,
      "learning_rate": 1.958041044744186e-05,
      "loss": 0.3779,
      "step": 249
    },
    {
      "epoch": 0.7502824858757062,
      "eval_loss": NaN,
      "eval_runtime": 92.9008,
      "eval_samples_per_second": 6.017,
      "eval_steps_per_second": 1.507,
      "step": 249
    },
    {
      "epoch": 0.7532956685499058,
      "grad_norm": 4.4879302978515625,
      "learning_rate": 1.9138676015144764e-05,
      "loss": 0.4317,
      "step": 250
    },
    {
      "epoch": 0.7563088512241055,
      "grad_norm": 1.1887656450271606,
      "learning_rate": 1.870079900469392e-05,
      "loss": 0.3761,
      "step": 251
    },
    {
      "epoch": 0.7593220338983051,
      "grad_norm": 1.1247776746749878,
      "learning_rate": 1.8266834147258578e-05,
      "loss": 0.3736,
      "step": 252
    },
    {
      "epoch": 0.7623352165725047,
      "grad_norm": 1.001464605331421,
      "learning_rate": 1.7836835685019733e-05,
      "loss": 0.3244,
      "step": 253
    },
    {
      "epoch": 0.7653483992467043,
      "grad_norm": 0.8156184554100037,
      "learning_rate": 1.741085736439031e-05,
      "loss": 0.254,
      "step": 254
    },
    {
      "epoch": 0.768361581920904,
      "grad_norm": 0.8647398352622986,
      "learning_rate": 1.698895242929725e-05,
      "loss": 0.271,
      "step": 255
    },
    {
      "epoch": 0.7713747645951036,
      "grad_norm": 1.146620750427246,
      "learning_rate": 1.6571173614526507e-05,
      "loss": 0.3116,
      "step": 256
    },
    {
      "epoch": 0.7743879472693032,
      "grad_norm": 0.8247811198234558,
      "learning_rate": 1.6157573139131527e-05,
      "loss": 0.231,
      "step": 257
    },
    {
      "epoch": 0.7774011299435029,
      "grad_norm": 1.2048598527908325,
      "learning_rate": 1.5748202699906335e-05,
      "loss": 0.328,
      "step": 258
    },
    {
      "epoch": 0.7804143126177024,
      "grad_norm": 0.8339473009109497,
      "learning_rate": 1.534311346492381e-05,
      "loss": 0.2627,
      "step": 259
    },
    {
      "epoch": 0.783427495291902,
      "grad_norm": 1.1890642642974854,
      "learning_rate": 1.4942356067140162e-05,
      "loss": 0.3058,
      "step": 260
    },
    {
      "epoch": 0.7864406779661017,
      "grad_norm": 0.8932839035987854,
      "learning_rate": 1.454598059806609e-05,
      "loss": 0.2475,
      "step": 261
    },
    {
      "epoch": 0.7894538606403013,
      "grad_norm": 1.4562137126922607,
      "learning_rate": 1.4154036601505832e-05,
      "loss": 0.3751,
      "step": 262
    },
    {
      "epoch": 0.7924670433145009,
      "grad_norm": 1.0580323934555054,
      "learning_rate": 1.376657306736453e-05,
      "loss": 0.2916,
      "step": 263
    },
    {
      "epoch": 0.7954802259887006,
      "grad_norm": 0.9510928988456726,
      "learning_rate": 1.3383638425524908e-05,
      "loss": 0.2661,
      "step": 264
    },
    {
      "epoch": 0.7984934086629002,
      "grad_norm": 0.7758486866950989,
      "learning_rate": 1.3005280539793907e-05,
      "loss": 0.2241,
      "step": 265
    },
    {
      "epoch": 0.8015065913370998,
      "grad_norm": 1.2630360126495361,
      "learning_rate": 1.2631546701920071e-05,
      "loss": 0.2934,
      "step": 266
    },
    {
      "epoch": 0.8045197740112995,
      "grad_norm": 0.8792417645454407,
      "learning_rate": 1.2262483625682513e-05,
      "loss": 0.2504,
      "step": 267
    },
    {
      "epoch": 0.8075329566854991,
      "grad_norm": 1.050751805305481,
      "learning_rate": 1.1898137441051982e-05,
      "loss": 0.2669,
      "step": 268
    },
    {
      "epoch": 0.8105461393596987,
      "grad_norm": 0.9729787111282349,
      "learning_rate": 1.1538553688425003e-05,
      "loss": 0.266,
      "step": 269
    },
    {
      "epoch": 0.8135593220338984,
      "grad_norm": 2.0321502685546875,
      "learning_rate": 1.1183777312931748e-05,
      "loss": 0.4952,
      "step": 270
    },
    {
      "epoch": 0.816572504708098,
      "grad_norm": 1.9791584014892578,
      "learning_rate": 1.0833852658818166e-05,
      "loss": 0.4964,
      "step": 271
    },
    {
      "epoch": 0.8195856873822975,
      "grad_norm": 1.8695249557495117,
      "learning_rate": 1.0488823463903342e-05,
      "loss": 0.4246,
      "step": 272
    },
    {
      "epoch": 0.8225988700564971,
      "grad_norm": 1.9649566411972046,
      "learning_rate": 1.0148732854112619e-05,
      "loss": 0.3945,
      "step": 273
    },
    {
      "epoch": 0.8256120527306968,
      "grad_norm": 2.423754930496216,
      "learning_rate": 9.81362333808718e-06,
      "loss": 0.3351,
      "step": 274
    },
    {
      "epoch": 0.8286252354048964,
      "grad_norm": 7.4359540939331055,
      "learning_rate": 9.483536801870834e-06,
      "loss": 0.7226,
      "step": 275
    },
    {
      "epoch": 0.831638418079096,
      "grad_norm": 1.0027852058410645,
      "learning_rate": 9.158514503674543e-06,
      "loss": 0.4422,
      "step": 276
    },
    {
      "epoch": 0.8346516007532957,
      "grad_norm": 0.9810426235198975,
      "learning_rate": 8.838597068719518e-06,
      "loss": 0.4183,
      "step": 277
    },
    {
      "epoch": 0.8376647834274953,
      "grad_norm": 0.9647234678268433,
      "learning_rate": 8.523824484159349e-06,
      "loss": 0.342,
      "step": 278
    },
    {
      "epoch": 0.8406779661016949,
      "grad_norm": 1.0083119869232178,
      "learning_rate": 8.21423609408199e-06,
      "loss": 0.3176,
      "step": 279
    },
    {
      "epoch": 0.8436911487758946,
      "grad_norm": 1.0560261011123657,
      "learning_rate": 7.90987059459195e-06,
      "loss": 0.3063,
      "step": 280
    },
    {
      "epoch": 0.8467043314500942,
      "grad_norm": 0.7893863916397095,
      "learning_rate": 7.610766028973709e-06,
      "loss": 0.2334,
      "step": 281
    },
    {
      "epoch": 0.8497175141242937,
      "grad_norm": 0.8123882412910461,
      "learning_rate": 7.3169597829365165e-06,
      "loss": 0.2431,
      "step": 282
    },
    {
      "epoch": 0.8527306967984934,
      "grad_norm": 0.7712526917457581,
      "learning_rate": 7.028488579941506e-06,
      "loss": 0.2199,
      "step": 283
    },
    {
      "epoch": 0.855743879472693,
      "grad_norm": 0.821006715297699,
      "learning_rate": 6.745388476611553e-06,
      "loss": 0.2353,
      "step": 284
    },
    {
      "epoch": 0.8587570621468926,
      "grad_norm": 1.0905379056930542,
      "learning_rate": 6.467694858224488e-06,
      "loss": 0.2829,
      "step": 285
    },
    {
      "epoch": 0.8617702448210923,
      "grad_norm": 1.1444308757781982,
      "learning_rate": 6.1954424342902e-06,
      "loss": 0.2891,
      "step": 286
    },
    {
      "epoch": 0.8647834274952919,
      "grad_norm": 0.8933337926864624,
      "learning_rate": 5.928665234212233e-06,
      "loss": 0.2404,
      "step": 287
    },
    {
      "epoch": 0.8677966101694915,
      "grad_norm": 1.0412938594818115,
      "learning_rate": 5.66739660303437e-06,
      "loss": 0.2752,
      "step": 288
    },
    {
      "epoch": 0.8708097928436912,
      "grad_norm": 0.9225270748138428,
      "learning_rate": 5.411669197272795e-06,
      "loss": 0.2548,
      "step": 289
    },
    {
      "epoch": 0.8738229755178908,
      "grad_norm": 0.8741989731788635,
      "learning_rate": 5.161514980834231e-06,
      "loss": 0.2349,
      "step": 290
    },
    {
      "epoch": 0.8768361581920904,
      "grad_norm": 0.7688031792640686,
      "learning_rate": 4.916965221020753e-06,
      "loss": 0.2232,
      "step": 291
    },
    {
      "epoch": 0.87984934086629,
      "grad_norm": 0.9201337099075317,
      "learning_rate": 4.678050484621615e-06,
      "loss": 0.2486,
      "step": 292
    },
    {
      "epoch": 0.8828625235404897,
      "grad_norm": 0.8050046563148499,
      "learning_rate": 4.444800634092616e-06,
      "loss": 0.2339,
      "step": 293
    },
    {
      "epoch": 0.8858757062146893,
      "grad_norm": 0.916643500328064,
      "learning_rate": 4.217244823823546e-06,
      "loss": 0.2378,
      "step": 294
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 1.2140140533447266,
      "learning_rate": 3.995411496494134e-06,
      "loss": 0.2966,
      "step": 295
    },
    {
      "epoch": 0.8919020715630885,
      "grad_norm": 2.333122730255127,
      "learning_rate": 3.7793283795188984e-06,
      "loss": 0.5715,
      "step": 296
    },
    {
      "epoch": 0.8949152542372881,
      "grad_norm": 1.4072636365890503,
      "learning_rate": 3.56902248158148e-06,
      "loss": 0.3899,
      "step": 297
    },
    {
      "epoch": 0.8979284369114877,
      "grad_norm": 1.8280621767044067,
      "learning_rate": 3.364520089258727e-06,
      "loss": 0.3525,
      "step": 298
    },
    {
      "epoch": 0.9009416195856874,
      "grad_norm": 2.5405969619750977,
      "learning_rate": 3.165846763735153e-06,
      "loss": 0.34,
      "step": 299
    },
    {
      "epoch": 0.903954802259887,
      "grad_norm": 8.415023803710938,
      "learning_rate": 2.973027337607892e-06,
      "loss": 0.6817,
      "step": 300
    },
    {
      "epoch": 0.9069679849340866,
      "grad_norm": 1.0418468713760376,
      "learning_rate": 2.7860859117828987e-06,
      "loss": 0.4058,
      "step": 301
    },
    {
      "epoch": 0.9099811676082863,
      "grad_norm": 0.9007613062858582,
      "learning_rate": 2.605045852462473e-06,
      "loss": 0.3921,
      "step": 302
    },
    {
      "epoch": 0.9129943502824859,
      "grad_norm": 1.1040724515914917,
      "learning_rate": 2.429929788224722e-06,
      "loss": 0.344,
      "step": 303
    },
    {
      "epoch": 0.9160075329566855,
      "grad_norm": 0.8270070552825928,
      "learning_rate": 2.2607596071951286e-06,
      "loss": 0.2523,
      "step": 304
    },
    {
      "epoch": 0.9190207156308852,
      "grad_norm": 1.036444067955017,
      "learning_rate": 2.097556454310701e-06,
      "loss": 0.2802,
      "step": 305
    },
    {
      "epoch": 0.9220338983050848,
      "grad_norm": 1.1080818176269531,
      "learning_rate": 1.940340728677059e-06,
      "loss": 0.2779,
      "step": 306
    },
    {
      "epoch": 0.9250470809792843,
      "grad_norm": 0.9475107192993164,
      "learning_rate": 1.789132081018674e-06,
      "loss": 0.2591,
      "step": 307
    },
    {
      "epoch": 0.928060263653484,
      "grad_norm": 0.7941210269927979,
      "learning_rate": 1.6439494112227172e-06,
      "loss": 0.2142,
      "step": 308
    },
    {
      "epoch": 0.9310734463276836,
      "grad_norm": 0.8892014622688293,
      "learning_rate": 1.5048108659766691e-06,
      "loss": 0.2625,
      "step": 309
    },
    {
      "epoch": 0.9340866290018832,
      "grad_norm": 0.9323367476463318,
      "learning_rate": 1.3717338365001942e-06,
      "loss": 0.2504,
      "step": 310
    },
    {
      "epoch": 0.9370998116760829,
      "grad_norm": 1.1987720727920532,
      "learning_rate": 1.2447349563713184e-06,
      "loss": 0.2877,
      "step": 311
    },
    {
      "epoch": 0.9401129943502825,
      "grad_norm": 0.9108150005340576,
      "learning_rate": 1.1238300994473983e-06,
      "loss": 0.2055,
      "step": 312
    },
    {
      "epoch": 0.9431261770244821,
      "grad_norm": 0.8241666555404663,
      "learning_rate": 1.0090343778809908e-06,
      "loss": 0.2561,
      "step": 313
    },
    {
      "epoch": 0.9461393596986817,
      "grad_norm": 1.0146571397781372,
      "learning_rate": 9.003621402309814e-07,
      "loss": 0.271,
      "step": 314
    },
    {
      "epoch": 0.9491525423728814,
      "grad_norm": 0.7878392934799194,
      "learning_rate": 7.97826969669102e-07,
      "loss": 0.2338,
      "step": 315
    },
    {
      "epoch": 0.952165725047081,
      "grad_norm": 1.097957730293274,
      "learning_rate": 7.014416822821556e-07,
      "loss": 0.2901,
      "step": 316
    },
    {
      "epoch": 0.9551789077212806,
      "grad_norm": 0.8541196584701538,
      "learning_rate": 6.112183254700865e-07,
      "loss": 0.2408,
      "step": 317
    },
    {
      "epoch": 0.9581920903954803,
      "grad_norm": 0.9861576557159424,
      "learning_rate": 5.271681764401848e-07,
      "loss": 0.2545,
      "step": 318
    },
    {
      "epoch": 0.9612052730696798,
      "grad_norm": 1.3217852115631104,
      "learning_rate": 4.493017407975086e-07,
      "loss": 0.3015,
      "step": 319
    },
    {
      "epoch": 0.9642184557438794,
      "grad_norm": 1.4543648958206177,
      "learning_rate": 3.7762875123173445e-07,
      "loss": 0.3321,
      "step": 320
    },
    {
      "epoch": 0.9672316384180791,
      "grad_norm": 2.7887980937957764,
      "learning_rate": 3.1215816630071335e-07,
      "loss": 0.5545,
      "step": 321
    },
    {
      "epoch": 0.9702448210922787,
      "grad_norm": 1.5680711269378662,
      "learning_rate": 2.528981693106558e-07,
      "loss": 0.4085,
      "step": 322
    },
    {
      "epoch": 0.9732580037664783,
      "grad_norm": 2.545147180557251,
      "learning_rate": 1.9985616729332747e-07,
      "loss": 0.4379,
      "step": 323
    },
    {
      "epoch": 0.976271186440678,
      "grad_norm": 2.684563159942627,
      "learning_rate": 1.530387900802177e-07,
      "loss": 0.3865,
      "step": 324
    },
    {
      "epoch": 0.9792843691148776,
      "grad_norm": 3.9092459678649902,
      "learning_rate": 1.1245188947384134e-07,
      "loss": 0.4588,
      "step": 325
    },
    {
      "epoch": 0.9822975517890772,
      "grad_norm": 0.7552906274795532,
      "learning_rate": 7.81005385163458e-08,
      "loss": 0.2851,
      "step": 326
    },
    {
      "epoch": 0.9853107344632769,
      "grad_norm": 0.8308711051940918,
      "learning_rate": 4.998903085539075e-08,
      "loss": 0.251,
      "step": 327
    },
    {
      "epoch": 0.9883239171374765,
      "grad_norm": 0.9575715661048889,
      "learning_rate": 2.8120880207493928e-08,
      "loss": 0.2713,
      "step": 328
    },
    {
      "epoch": 0.9913370998116761,
      "grad_norm": 0.8187337517738342,
      "learning_rate": 1.2498819918843607e-08,
      "loss": 0.2334,
      "step": 329
    },
    {
      "epoch": 0.9943502824858758,
      "grad_norm": 0.9916722178459167,
      "learning_rate": 3.1248026236274652e-09,
      "loss": 0.2765,
      "step": 330
    },
    {
      "epoch": 0.9973634651600753,
      "grad_norm": 1.3465464115142822,
      "learning_rate": 0.0,
      "loss": 0.3738,
      "step": 331
    }
  ],
  "logging_steps": 1,
  "max_steps": 331,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 83,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.0904629834290299e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}