|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.169077889814409, |
|
"eval_steps": 500, |
|
"global_step": 2000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.001169077889814409, |
|
"grad_norm": 6.78125, |
|
"learning_rate": 3.898635477582846e-07, |
|
"loss": 5.0851, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.002338155779628818, |
|
"grad_norm": 7.0625, |
|
"learning_rate": 7.797270955165692e-07, |
|
"loss": 5.1921, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0035072336694432268, |
|
"grad_norm": 7.34375, |
|
"learning_rate": 1.1695906432748538e-06, |
|
"loss": 5.0756, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.004676311559257636, |
|
"grad_norm": 7.03125, |
|
"learning_rate": 1.5594541910331385e-06, |
|
"loss": 5.1097, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.005845389449072045, |
|
"grad_norm": 6.40625, |
|
"learning_rate": 1.949317738791423e-06, |
|
"loss": 5.0591, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0070144673388864535, |
|
"grad_norm": 5.8125, |
|
"learning_rate": 2.3391812865497075e-06, |
|
"loss": 5.096, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.008183545228700862, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 2.729044834307992e-06, |
|
"loss": 5.0234, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.009352623118515271, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 3.118908382066277e-06, |
|
"loss": 5.0266, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.01052170100832968, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 3.5087719298245615e-06, |
|
"loss": 5.031, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.01169077889814409, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 3.898635477582846e-06, |
|
"loss": 4.9845, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.012859856787958498, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 4.2884990253411305e-06, |
|
"loss": 4.9933, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.014028934677772907, |
|
"grad_norm": 4.625, |
|
"learning_rate": 4.678362573099415e-06, |
|
"loss": 5.0822, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.015198012567587316, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 5.0682261208576995e-06, |
|
"loss": 4.9392, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.016367090457401725, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 5.458089668615984e-06, |
|
"loss": 5.0441, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.017536168347216132, |
|
"grad_norm": 4.6875, |
|
"learning_rate": 5.8479532163742686e-06, |
|
"loss": 4.8865, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.018705246237030543, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 6.237816764132554e-06, |
|
"loss": 4.9227, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.01987432412684495, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 6.6276803118908384e-06, |
|
"loss": 4.9035, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.02104340201665936, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 7.017543859649123e-06, |
|
"loss": 4.863, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.022212479906473768, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 7.4074074074074075e-06, |
|
"loss": 4.7735, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.02338155779628818, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 7.797270955165692e-06, |
|
"loss": 4.7932, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.024550635686102586, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 8.187134502923977e-06, |
|
"loss": 4.6933, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.025719713575916996, |
|
"grad_norm": 7.15625, |
|
"learning_rate": 8.576998050682261e-06, |
|
"loss": 4.5906, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.026888791465731404, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 8.966861598440546e-06, |
|
"loss": 4.4291, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.028057869355545814, |
|
"grad_norm": 9.6875, |
|
"learning_rate": 9.35672514619883e-06, |
|
"loss": 4.221, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.02922694724536022, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 9.746588693957115e-06, |
|
"loss": 4.032, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.030396025135174632, |
|
"grad_norm": 9.0, |
|
"learning_rate": 1.0136452241715399e-05, |
|
"loss": 3.8603, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.03156510302498904, |
|
"grad_norm": 7.46875, |
|
"learning_rate": 1.0526315789473684e-05, |
|
"loss": 3.6522, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.03273418091480345, |
|
"grad_norm": 8.375, |
|
"learning_rate": 1.0916179337231968e-05, |
|
"loss": 3.4394, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.03390325880461786, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 1.1306042884990253e-05, |
|
"loss": 3.2265, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.035072336694432264, |
|
"grad_norm": 5.75, |
|
"learning_rate": 1.1695906432748537e-05, |
|
"loss": 3.091, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.03624141458424668, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 1.2085769980506823e-05, |
|
"loss": 2.8856, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.037410492474061086, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 1.2475633528265108e-05, |
|
"loss": 2.6299, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.03857957036387549, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 1.2865497076023392e-05, |
|
"loss": 2.5917, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.0397486482536899, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 1.3255360623781677e-05, |
|
"loss": 2.3757, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.040917726143504314, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 1.364522417153996e-05, |
|
"loss": 2.308, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.04208680403331872, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.4035087719298246e-05, |
|
"loss": 2.3042, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.04325588192313313, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.442495126705653e-05, |
|
"loss": 2.2149, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.044424959812947536, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.4814814814814815e-05, |
|
"loss": 2.0738, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.04559403770276195, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.5204678362573099e-05, |
|
"loss": 1.9663, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.04676311559257636, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.5594541910331384e-05, |
|
"loss": 1.9384, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.047932193482390764, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.5984405458089668e-05, |
|
"loss": 1.894, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.04910127137220517, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.6374269005847955e-05, |
|
"loss": 1.8204, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.050270349262019585, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 1.676413255360624e-05, |
|
"loss": 1.8051, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.05143942715183399, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.7153996101364522e-05, |
|
"loss": 1.5832, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.0526085050416484, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.7543859649122806e-05, |
|
"loss": 1.5298, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.05377758293146281, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.7933723196881093e-05, |
|
"loss": 1.4964, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.054946660821277214, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 1.8323586744639376e-05, |
|
"loss": 1.3987, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.05611573871109163, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.871345029239766e-05, |
|
"loss": 1.3959, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.057284816600906036, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 1.9103313840155944e-05, |
|
"loss": 1.3673, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.05845389449072044, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 1.949317738791423e-05, |
|
"loss": 1.2864, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05962297238053485, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 1.9883040935672515e-05, |
|
"loss": 1.2535, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.060792050270349264, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 2.0272904483430798e-05, |
|
"loss": 1.2242, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.06196112816016367, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 2.0662768031189085e-05, |
|
"loss": 1.1936, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.06313020604997809, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 2.105263157894737e-05, |
|
"loss": 1.1858, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.06429928393979249, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 2.1442495126705653e-05, |
|
"loss": 1.1438, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.0654683618296069, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 2.1832358674463936e-05, |
|
"loss": 1.1217, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.06663743971942131, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 2.2222222222222223e-05, |
|
"loss": 1.1508, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.06780651760923571, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 2.2612085769980507e-05, |
|
"loss": 1.0976, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.06897559549905012, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 2.300194931773879e-05, |
|
"loss": 1.0716, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.07014467338886453, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 2.3391812865497074e-05, |
|
"loss": 1.0152, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.07131375127867894, |
|
"grad_norm": 3.125, |
|
"learning_rate": 2.378167641325536e-05, |
|
"loss": 1.0265, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.07248282916849336, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 2.4171539961013645e-05, |
|
"loss": 1.2355, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.07365190705830776, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 2.456140350877193e-05, |
|
"loss": 1.0111, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.07482098494812217, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 2.4951267056530216e-05, |
|
"loss": 1.2317, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.07599006283793658, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 2.53411306042885e-05, |
|
"loss": 0.9988, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.07715914072775099, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 2.5730994152046783e-05, |
|
"loss": 1.0076, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.07832821861756539, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 2.6120857699805067e-05, |
|
"loss": 1.0155, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.0794972965073798, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 2.6510721247563354e-05, |
|
"loss": 1.1216, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.08066637439719421, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 2.6900584795321637e-05, |
|
"loss": 0.9788, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.08183545228700863, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 2.729044834307992e-05, |
|
"loss": 0.9372, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.08300453017682304, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 2.7680311890838205e-05, |
|
"loss": 0.897, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.08417360806663744, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 2.8070175438596492e-05, |
|
"loss": 0.9766, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.08534268595645185, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 2.8460038986354776e-05, |
|
"loss": 0.9315, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.08651176384626626, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 2.884990253411306e-05, |
|
"loss": 0.9425, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.08768084173608066, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 2.9239766081871346e-05, |
|
"loss": 0.9242, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.08884991962589507, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 2.962962962962963e-05, |
|
"loss": 0.9396, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.09001899751570948, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 3.0019493177387914e-05, |
|
"loss": 0.8932, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.0911880754055239, |
|
"grad_norm": 72.5, |
|
"learning_rate": 3.0409356725146197e-05, |
|
"loss": 0.9062, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.0923571532953383, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 3.0799220272904484e-05, |
|
"loss": 0.9534, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.09352623118515271, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 3.118908382066277e-05, |
|
"loss": 0.8715, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.09469530907496712, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 3.157894736842105e-05, |
|
"loss": 0.9029, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.09586438696478153, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 3.1968810916179335e-05, |
|
"loss": 0.8968, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.09703346485459594, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 3.235867446393762e-05, |
|
"loss": 0.9784, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.09820254274441034, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 3.274853801169591e-05, |
|
"loss": 0.9012, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.09937162063422475, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 3.313840155945419e-05, |
|
"loss": 0.8895, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.10054069852403917, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 3.352826510721248e-05, |
|
"loss": 0.8858, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.10170977641385358, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 3.391812865497076e-05, |
|
"loss": 0.8408, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.10287885430366799, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 3.4307992202729044e-05, |
|
"loss": 0.8896, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.10404793219348239, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 3.469785575048733e-05, |
|
"loss": 0.8941, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.1052170100832968, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 3.508771929824561e-05, |
|
"loss": 0.9013, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.1063860879731112, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 3.5477582846003895e-05, |
|
"loss": 0.8585, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.10755516586292561, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 3.5867446393762186e-05, |
|
"loss": 1.0231, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.10872424375274002, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 3.625730994152047e-05, |
|
"loss": 0.8835, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.10989332164255443, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 3.664717348927875e-05, |
|
"loss": 0.8465, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.11106239953236885, |
|
"grad_norm": 2.75, |
|
"learning_rate": 3.7037037037037037e-05, |
|
"loss": 1.1014, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.11223147742218326, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 3.742690058479532e-05, |
|
"loss": 0.8691, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.11340055531199766, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 3.7816764132553604e-05, |
|
"loss": 0.8547, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.11456963320181207, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 3.820662768031189e-05, |
|
"loss": 0.8725, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.11573871109162648, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 3.859649122807018e-05, |
|
"loss": 0.8393, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.11690778898144089, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 3.898635477582846e-05, |
|
"loss": 0.8345, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.11807686687125529, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 3.9376218323586745e-05, |
|
"loss": 0.8966, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.1192459447610697, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 3.976608187134503e-05, |
|
"loss": 0.8291, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.12041502265088412, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 4.015594541910331e-05, |
|
"loss": 0.8175, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.12158410054069853, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 4.0545808966861596e-05, |
|
"loss": 0.8353, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.12275317843051294, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 4.093567251461988e-05, |
|
"loss": 1.0084, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.12392225632032734, |
|
"grad_norm": 59.75, |
|
"learning_rate": 4.132553606237817e-05, |
|
"loss": 0.8478, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.12509133421014176, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 4.1715399610136454e-05, |
|
"loss": 0.8267, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.12626041209995617, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 4.210526315789474e-05, |
|
"loss": 0.836, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.12742948998977058, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 4.249512670565302e-05, |
|
"loss": 0.857, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.12859856787958499, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 4.2884990253411305e-05, |
|
"loss": 0.8478, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.1297676457693994, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 4.327485380116959e-05, |
|
"loss": 0.844, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.1309367236592138, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 4.366471734892787e-05, |
|
"loss": 0.8128, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.1321058015490282, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 4.4054580896686156e-05, |
|
"loss": 0.8276, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.13327487943884261, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 4.4444444444444447e-05, |
|
"loss": 0.8478, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.13444395732865702, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 4.483430799220273e-05, |
|
"loss": 0.814, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.13561303521847143, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 4.5224171539961014e-05, |
|
"loss": 0.765, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.13678211310828584, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 4.56140350877193e-05, |
|
"loss": 0.8277, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.13795119099810024, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 4.600389863547758e-05, |
|
"loss": 0.8103, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.13912026888791465, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 4.6393762183235865e-05, |
|
"loss": 0.7897, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.14028934677772906, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 4.678362573099415e-05, |
|
"loss": 0.7798, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.14145842466754346, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 4.717348927875244e-05, |
|
"loss": 0.8442, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.14262750255735787, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 4.756335282651072e-05, |
|
"loss": 0.7936, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.1437965804471723, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 4.7953216374269006e-05, |
|
"loss": 0.7726, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.1449656583369867, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 4.834307992202729e-05, |
|
"loss": 0.9626, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.14613473622680112, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 4.8732943469785574e-05, |
|
"loss": 0.7929, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.14730381411661553, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 4.912280701754386e-05, |
|
"loss": 0.8589, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.14847289200642994, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 4.951267056530214e-05, |
|
"loss": 0.7383, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.14964196989624434, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 4.990253411306043e-05, |
|
"loss": 0.8011, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.15081104778605875, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 5.0292397660818715e-05, |
|
"loss": 0.8404, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.15198012567587316, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 5.0682261208577e-05, |
|
"loss": 0.8301, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.15314920356568756, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 5.107212475633528e-05, |
|
"loss": 0.8157, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.15431828145550197, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 5.1461988304093566e-05, |
|
"loss": 0.7724, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.15548735934531638, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 5.185185185185185e-05, |
|
"loss": 0.7586, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.15665643723513079, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 5.2241715399610133e-05, |
|
"loss": 0.7812, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.1578255151249452, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 5.2631578947368424e-05, |
|
"loss": 0.7486, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.1589945930147596, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 5.302144249512671e-05, |
|
"loss": 0.7339, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.160163670904574, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 5.341130604288499e-05, |
|
"loss": 0.8381, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.16133274879438841, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 5.3801169590643275e-05, |
|
"loss": 0.774, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.16250182668420282, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 5.419103313840156e-05, |
|
"loss": 0.8114, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.16367090457401726, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 5.458089668615984e-05, |
|
"loss": 0.7549, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.16483998246383166, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 5.4970760233918126e-05, |
|
"loss": 0.7393, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.16600906035364607, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 5.536062378167641e-05, |
|
"loss": 0.7607, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.16717813824346048, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 5.57504873294347e-05, |
|
"loss": 0.8158, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.16834721613327489, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 5.6140350877192984e-05, |
|
"loss": 0.7403, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.1695162940230893, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 5.653021442495127e-05, |
|
"loss": 0.7278, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.1706853719129037, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 5.692007797270955e-05, |
|
"loss": 0.821, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.1718544498027181, |
|
"grad_norm": 3.1875, |
|
"learning_rate": 5.7309941520467835e-05, |
|
"loss": 0.8276, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.1730235276925325, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 5.769980506822612e-05, |
|
"loss": 0.8505, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.17419260558234692, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 5.80896686159844e-05, |
|
"loss": 0.7941, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.17536168347216133, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 5.847953216374269e-05, |
|
"loss": 0.8621, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.17653076136197574, |
|
"grad_norm": 2.25, |
|
"learning_rate": 5.8869395711500976e-05, |
|
"loss": 0.8071, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.17769983925179014, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 5.925925925925926e-05, |
|
"loss": 0.803, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.17886891714160455, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 5.9649122807017544e-05, |
|
"loss": 0.7569, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.18003799503141896, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 6.003898635477583e-05, |
|
"loss": 0.7722, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.18120707292123336, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 6.042884990253411e-05, |
|
"loss": 0.7746, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.1823761508110478, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 6.0818713450292395e-05, |
|
"loss": 0.7292, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.1835452287008622, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 6.120857699805068e-05, |
|
"loss": 0.7751, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.1847143065906766, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 6.159844054580897e-05, |
|
"loss": 0.7473, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.18588338448049102, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 6.198830409356725e-05, |
|
"loss": 0.7718, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.18705246237030543, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 6.237816764132554e-05, |
|
"loss": 0.7812, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.18822154026011984, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 6.276803118908382e-05, |
|
"loss": 0.7552, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.18939061814993424, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 6.31578947368421e-05, |
|
"loss": 0.7244, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.19055969603974865, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 6.354775828460039e-05, |
|
"loss": 0.8182, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.19172877392956306, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 6.393762183235867e-05, |
|
"loss": 0.7578, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.19289785181937746, |
|
"grad_norm": 0.875, |
|
"learning_rate": 6.432748538011695e-05, |
|
"loss": 0.8755, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.19406692970919187, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 6.471734892787524e-05, |
|
"loss": 0.7436, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.19523600759900628, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 6.510721247563352e-05, |
|
"loss": 0.7415, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.19640508548882069, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 6.549707602339182e-05, |
|
"loss": 0.8082, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.1975741633786351, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 6.58869395711501e-05, |
|
"loss": 0.7226, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.1987432412684495, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 6.627680311890839e-05, |
|
"loss": 0.7408, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.1999123191582639, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 0.7752, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.20108139704807834, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 6.705653021442495e-05, |
|
"loss": 0.7313, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.20225047493789275, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 6.744639376218324e-05, |
|
"loss": 0.7164, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.20341955282770716, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 6.783625730994152e-05, |
|
"loss": 0.7836, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.20458863071752156, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 6.82261208576998e-05, |
|
"loss": 0.8936, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.20575770860733597, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 6.861598440545809e-05, |
|
"loss": 0.7386, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.20692678649715038, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 6.900584795321637e-05, |
|
"loss": 0.8893, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.20809586438696479, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 6.939571150097466e-05, |
|
"loss": 0.715, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.2092649422767792, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 6.978557504873294e-05, |
|
"loss": 0.7087, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.2104340201665936, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 7.017543859649122e-05, |
|
"loss": 0.7067, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.211603098056408, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 7.05653021442495e-05, |
|
"loss": 0.6979, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.2127721759462224, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 7.095516569200779e-05, |
|
"loss": 0.8533, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.21394125383603682, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 7.134502923976609e-05, |
|
"loss": 0.6971, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.21511033172585123, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 7.173489278752437e-05, |
|
"loss": 0.7152, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.21627940961566564, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 7.212475633528265e-05, |
|
"loss": 0.7176, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.21744848750548004, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 7.251461988304094e-05, |
|
"loss": 0.7054, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.21861756539529445, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 7.290448343079922e-05, |
|
"loss": 0.7157, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.21978664328510886, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 7.32943469785575e-05, |
|
"loss": 0.742, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.2209557211749233, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 7.368421052631579e-05, |
|
"loss": 0.7016, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.2221247990647377, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 7.407407407407407e-05, |
|
"loss": 0.7263, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.2232938769545521, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 7.446393762183236e-05, |
|
"loss": 0.7386, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.2244629548443665, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 7.485380116959064e-05, |
|
"loss": 0.7488, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.22563203273418092, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 7.524366471734892e-05, |
|
"loss": 0.7304, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.22680111062399533, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 7.563352826510721e-05, |
|
"loss": 0.6926, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.22797018851380974, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 7.602339181286549e-05, |
|
"loss": 0.7258, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.22913926640362414, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 7.641325536062378e-05, |
|
"loss": 0.7169, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.23030834429343855, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 7.680311890838207e-05, |
|
"loss": 0.6962, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.23147742218325296, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 7.719298245614036e-05, |
|
"loss": 0.718, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.23264650007306736, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 7.758284600389864e-05, |
|
"loss": 0.7106, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.23381557796288177, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 7.797270955165692e-05, |
|
"loss": 0.739, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.23498465585269618, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 7.836257309941521e-05, |
|
"loss": 0.7037, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.23615373374251059, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 7.875243664717349e-05, |
|
"loss": 0.717, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.237322811632325, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 7.914230019493177e-05, |
|
"loss": 0.7041, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.2384918895221394, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 7.953216374269006e-05, |
|
"loss": 0.7016, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.23966096741195383, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 7.992202729044834e-05, |
|
"loss": 0.7423, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.24083004530176824, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 8.031189083820663e-05, |
|
"loss": 0.7128, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.24199912319158265, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 8.070175438596491e-05, |
|
"loss": 0.7144, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.24316820108139706, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 8.109161793372319e-05, |
|
"loss": 0.6987, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.24433727897121146, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 8.148148148148148e-05, |
|
"loss": 0.6946, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.24550635686102587, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 8.187134502923976e-05, |
|
"loss": 0.6954, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.24667543475084028, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 8.226120857699804e-05, |
|
"loss": 0.7507, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.24784451264065469, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 8.265107212475634e-05, |
|
"loss": 0.9267, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.2490135905304691, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 8.304093567251462e-05, |
|
"loss": 0.7222, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.2501826684202835, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 8.343079922027291e-05, |
|
"loss": 0.8919, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.25135174631009793, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 8.382066276803119e-05, |
|
"loss": 0.7451, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.25252082419991234, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 8.421052631578948e-05, |
|
"loss": 0.7027, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.25368990208972675, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 8.460038986354776e-05, |
|
"loss": 0.712, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.25485897997954116, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 8.499025341130604e-05, |
|
"loss": 0.7122, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.25602805786935556, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 8.538011695906433e-05, |
|
"loss": 0.6871, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.25719713575916997, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 8.576998050682261e-05, |
|
"loss": 0.7278, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.2583662136489844, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 8.61598440545809e-05, |
|
"loss": 0.7023, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.2595352915387988, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 8.654970760233918e-05, |
|
"loss": 0.7167, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.2607043694286132, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 8.693957115009746e-05, |
|
"loss": 0.7346, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.2618734473184276, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 8.732943469785574e-05, |
|
"loss": 0.7223, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.263042525208242, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 8.771929824561403e-05, |
|
"loss": 0.7032, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.2642116030980564, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 8.810916179337231e-05, |
|
"loss": 0.6804, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.2653806809878708, |
|
"grad_norm": 0.875, |
|
"learning_rate": 8.849902534113061e-05, |
|
"loss": 0.7224, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.26654975887768523, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 8.888888888888889e-05, |
|
"loss": 0.7138, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.26771883676749963, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 8.927875243664718e-05, |
|
"loss": 0.7065, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.26888791465731404, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 8.966861598440546e-05, |
|
"loss": 0.7289, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.27005699254712845, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 9.005847953216374e-05, |
|
"loss": 0.6917, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.27122607043694286, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 9.044834307992203e-05, |
|
"loss": 0.7493, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.27239514832675726, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 9.083820662768031e-05, |
|
"loss": 0.7075, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.27356422621657167, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 9.12280701754386e-05, |
|
"loss": 0.6926, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.2747333041063861, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 9.161793372319688e-05, |
|
"loss": 0.6714, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.2759023819962005, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 9.200779727095516e-05, |
|
"loss": 0.6973, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.2770714598860149, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 9.239766081871345e-05, |
|
"loss": 0.7289, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.2782405377758293, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 9.278752436647173e-05, |
|
"loss": 0.7296, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.2794096156656437, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 9.317738791423001e-05, |
|
"loss": 0.7229, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.2805786935554581, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 9.35672514619883e-05, |
|
"loss": 0.7158, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.2817477714452725, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 9.39571150097466e-05, |
|
"loss": 0.6862, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.28291684933508693, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 9.434697855750488e-05, |
|
"loss": 0.6535, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.28408592722490134, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 9.473684210526316e-05, |
|
"loss": 0.703, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.28525500511471574, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 9.512670565302145e-05, |
|
"loss": 0.7296, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.28642408300453015, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 9.551656920077973e-05, |
|
"loss": 0.7061, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.2875931608943446, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 9.590643274853801e-05, |
|
"loss": 0.6731, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.288762238784159, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 9.62962962962963e-05, |
|
"loss": 0.7141, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.2899313166739734, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 9.668615984405458e-05, |
|
"loss": 0.7874, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.29110039456378783, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 9.707602339181286e-05, |
|
"loss": 0.7179, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.29226947245360224, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 9.746588693957115e-05, |
|
"loss": 0.6819, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.29343855034341665, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 9.785575048732943e-05, |
|
"loss": 0.688, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.29460762823323106, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 9.824561403508771e-05, |
|
"loss": 0.8356, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.29577670612304546, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 9.8635477582846e-05, |
|
"loss": 0.7733, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.29694578401285987, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 9.902534113060428e-05, |
|
"loss": 0.7087, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.2981148619026743, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 9.941520467836257e-05, |
|
"loss": 0.7023, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.2992839397924887, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 9.980506822612086e-05, |
|
"loss": 0.7087, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.3004530176823031, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.00010019493177387915, |
|
"loss": 0.7002, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.3016220955721175, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.00010058479532163743, |
|
"loss": 0.7355, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.3027911734619319, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.00010097465886939573, |
|
"loss": 0.6983, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.3039602513517463, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 0.000101364522417154, |
|
"loss": 0.7016, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.3051293292415607, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.0001017543859649123, |
|
"loss": 0.7096, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.3062984071313751, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.00010214424951267056, |
|
"loss": 0.7099, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.30746748502118953, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 0.00010253411306042886, |
|
"loss": 0.6931, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.30863656291100394, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 0.00010292397660818713, |
|
"loss": 0.7427, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.30980564080081835, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.00010331384015594543, |
|
"loss": 0.6609, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.31097471869063276, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.0001037037037037037, |
|
"loss": 0.7195, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.31214379658044716, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.000104093567251462, |
|
"loss": 0.6735, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.31331287447026157, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.00010448343079922027, |
|
"loss": 0.7455, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.314481952360076, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 0.00010487329434697856, |
|
"loss": 0.6683, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.3156510302498904, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.00010526315789473685, |
|
"loss": 0.6981, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.3168201081397048, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.00010565302144249513, |
|
"loss": 0.65, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.3179891860295192, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 0.00010604288499025342, |
|
"loss": 0.7997, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.3191582639193336, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.00010643274853801171, |
|
"loss": 0.7143, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.320327341809148, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 0.00010682261208576998, |
|
"loss": 0.6619, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.3214964196989624, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 0.00010721247563352828, |
|
"loss": 0.6632, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.32266549758877683, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00010760233918128655, |
|
"loss": 0.6844, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.32383457547859124, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.00010799220272904485, |
|
"loss": 0.698, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.32500365336840564, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.00010838206627680312, |
|
"loss": 0.6132, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.3261727312582201, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00010877192982456141, |
|
"loss": 0.6507, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.3273418091480345, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.00010916179337231968, |
|
"loss": 0.696, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.3285108870378489, |
|
"grad_norm": 3.0, |
|
"learning_rate": 0.00010955165692007798, |
|
"loss": 0.926, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.3296799649276633, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 0.00010994152046783625, |
|
"loss": 0.7797, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.33084904281747773, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 0.00011033138401559455, |
|
"loss": 0.7379, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.33201812070729214, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 0.00011072124756335282, |
|
"loss": 0.679, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.33318719859710655, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.00011111111111111112, |
|
"loss": 0.6668, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.33435627648692096, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.0001115009746588694, |
|
"loss": 0.7378, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.33552535437673536, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.0001118908382066277, |
|
"loss": 0.7341, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.33669443226654977, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.00011228070175438597, |
|
"loss": 0.6793, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.3378635101563642, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.00011267056530214426, |
|
"loss": 0.6926, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.3390325880461786, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.00011306042884990253, |
|
"loss": 0.6393, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.340201665935993, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 0.00011345029239766083, |
|
"loss": 0.7207, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.3413707438258074, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.0001138401559454191, |
|
"loss": 0.6168, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.3425398217156218, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.0001142300194931774, |
|
"loss": 0.712, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.3437088996054362, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.00011461988304093567, |
|
"loss": 0.6532, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.3448779774952506, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.00011500974658869397, |
|
"loss": 0.6766, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.346047055385065, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.00011539961013645224, |
|
"loss": 0.6986, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.34721613327487943, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.00011578947368421053, |
|
"loss": 0.6867, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.34838521116469384, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 0.0001161793372319688, |
|
"loss": 0.8175, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.34955428905450825, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.0001165692007797271, |
|
"loss": 0.8122, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.35072336694432266, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 0.00011695906432748539, |
|
"loss": 0.7642, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.35189244483413706, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.00011734892787524367, |
|
"loss": 0.703, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.35306152272395147, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.00011773879142300195, |
|
"loss": 0.7031, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.3542306006137659, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 0.00011812865497076025, |
|
"loss": 0.686, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.3553996785035803, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.00011851851851851852, |
|
"loss": 0.6708, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.3565687563933947, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.00011890838206627682, |
|
"loss": 0.8084, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.3577378342832091, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.00011929824561403509, |
|
"loss": 0.6705, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.3589069121730235, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.00011968810916179338, |
|
"loss": 0.6409, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.3600759900628379, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 0.00012007797270955165, |
|
"loss": 0.6805, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.3612450679526523, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 0.00012046783625730995, |
|
"loss": 0.7129, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.36241414584246673, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 0.00012085769980506822, |
|
"loss": 0.6819, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.36358322373228114, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.00012124756335282652, |
|
"loss": 0.6424, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.3647523016220956, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.00012163742690058479, |
|
"loss": 0.704, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.36592137951191, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.00012202729044834309, |
|
"loss": 0.8172, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.3670904574017244, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.00012241715399610137, |
|
"loss": 0.6486, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.3682595352915388, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.00012280701754385965, |
|
"loss": 0.6772, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.3694286131813532, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.00012319688109161794, |
|
"loss": 0.6997, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.37059769107116763, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.00012358674463937622, |
|
"loss": 0.6984, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.37176676896098204, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0001239766081871345, |
|
"loss": 0.6796, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.37293584685079645, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.0001243664717348928, |
|
"loss": 0.8194, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.37410492474061086, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.00012475633528265107, |
|
"loss": 0.7189, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.37527400263042526, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 0.00012514619883040936, |
|
"loss": 0.6934, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.37644308052023967, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 0.00012553606237816764, |
|
"loss": 0.6885, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.3776121584100541, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 0.00012592592592592592, |
|
"loss": 0.671, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.3787812362998685, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.0001263157894736842, |
|
"loss": 0.6515, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.3799503141896829, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00012670565302144252, |
|
"loss": 0.6833, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.3811193920794973, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.00012709551656920077, |
|
"loss": 0.6363, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.3822884699693117, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.00012748538011695908, |
|
"loss": 0.6848, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.3834575478591261, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 0.00012787524366471734, |
|
"loss": 0.7383, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.3846266257489405, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.00012826510721247565, |
|
"loss": 0.7057, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.3857957036387549, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0001286549707602339, |
|
"loss": 0.7039, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.38696478152856933, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.00012904483430799222, |
|
"loss": 0.727, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 0.38813385941838374, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.00012943469785575048, |
|
"loss": 0.7128, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.38930293730819815, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0001298245614035088, |
|
"loss": 0.6446, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.39047201519801256, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00013021442495126704, |
|
"loss": 0.6434, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.39164109308782696, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00013060428849902535, |
|
"loss": 0.642, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.39281017097764137, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 0.00013099415204678364, |
|
"loss": 0.6805, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.3939792488674558, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.00013138401559454192, |
|
"loss": 0.6923, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 0.3951483267572702, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 0.0001317738791423002, |
|
"loss": 0.7944, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.3963174046470846, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.0001321637426900585, |
|
"loss": 0.6876, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.397486482536899, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00013255360623781677, |
|
"loss": 0.6857, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.3986555604267134, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.00013294346978557506, |
|
"loss": 0.6796, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 0.3998246383165278, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 0.6604, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.4009937162063422, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.00013372319688109162, |
|
"loss": 0.6999, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.4021627940961567, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.0001341130604288499, |
|
"loss": 0.663, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.4033318719859711, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.0001345029239766082, |
|
"loss": 0.6553, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.4045009498757855, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.00013489278752436647, |
|
"loss": 0.6892, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 0.4056700277655999, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.00013528265107212476, |
|
"loss": 0.6529, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 0.4068391056554143, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.00013567251461988304, |
|
"loss": 0.6728, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.4080081835452287, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00013606237816764133, |
|
"loss": 0.6847, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 0.4091772614350431, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.0001364522417153996, |
|
"loss": 0.7475, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.41034633932485753, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.0001368421052631579, |
|
"loss": 0.6805, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 0.41151541721467194, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.00013723196881091618, |
|
"loss": 0.8327, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.41268449510448635, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.0001376218323586745, |
|
"loss": 0.663, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 0.41385357299430076, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.00013801169590643274, |
|
"loss": 0.6675, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.41502265088411516, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 0.00013840155945419105, |
|
"loss": 0.7458, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.41619172877392957, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0001387914230019493, |
|
"loss": 0.6385, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.417360806663744, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 0.00013918128654970762, |
|
"loss": 0.7979, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.4185298845535584, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.00013957115009746588, |
|
"loss": 0.6949, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.4196989624433728, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0001399610136452242, |
|
"loss": 0.6713, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 0.4208680403331872, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.00014035087719298245, |
|
"loss": 0.6503, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.4220371182230016, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.00014074074074074076, |
|
"loss": 0.6641, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 0.423206196112816, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.000141130604288499, |
|
"loss": 0.7159, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 0.4243752740026304, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00014152046783625732, |
|
"loss": 0.786, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 0.4255443518924448, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.00014191033138401558, |
|
"loss": 0.6511, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.42671342978225923, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.0001423001949317739, |
|
"loss": 0.8375, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.42788250767207364, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 0.00014269005847953217, |
|
"loss": 0.7475, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.42905158556188805, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.00014307992202729046, |
|
"loss": 0.6989, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 0.43022066345170246, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 0.00014346978557504874, |
|
"loss": 0.6792, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.43138974134151686, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.00014385964912280703, |
|
"loss": 0.7084, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 0.43255881923133127, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.0001442495126705653, |
|
"loss": 0.7263, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.4337278971211457, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.0001446393762183236, |
|
"loss": 0.706, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.4348969750109601, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00014502923976608188, |
|
"loss": 0.6989, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.4360660529007745, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.00014541910331384016, |
|
"loss": 0.7097, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.4372351307905889, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00014580896686159844, |
|
"loss": 0.6767, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.4384042086804033, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00014619883040935673, |
|
"loss": 0.7045, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.4395732865702177, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.000146588693957115, |
|
"loss": 0.6492, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.4407423644600322, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.0001469785575048733, |
|
"loss": 0.6776, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.4419114423498466, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.00014736842105263158, |
|
"loss": 0.8008, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.443080520239661, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.00014775828460038986, |
|
"loss": 0.6677, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.4442495981294754, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00014814814814814815, |
|
"loss": 0.6799, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.4454186760192898, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.00014853801169590643, |
|
"loss": 0.6626, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.4465877539091042, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.0001489278752436647, |
|
"loss": 0.6814, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.4477568317989186, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00014931773879142302, |
|
"loss": 0.7181, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.448925909688733, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.00014970760233918128, |
|
"loss": 0.681, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.45009498757854743, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0001500974658869396, |
|
"loss": 0.6625, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.45126406546836184, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00015048732943469785, |
|
"loss": 0.6558, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.45243314335817625, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.00015087719298245616, |
|
"loss": 0.6968, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.45360222124799066, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.00015126705653021442, |
|
"loss": 0.7073, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.45477129913780506, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.00015165692007797273, |
|
"loss": 0.7151, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.45594037702761947, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.00015204678362573098, |
|
"loss": 0.7192, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.4571094549174339, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0001524366471734893, |
|
"loss": 0.6937, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.4582785328072483, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.00015282651072124755, |
|
"loss": 0.6859, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.4594476106970627, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.00015321637426900586, |
|
"loss": 0.6491, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.4606166885868771, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.00015360623781676414, |
|
"loss": 0.719, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.4617857664766915, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.00015399610136452243, |
|
"loss": 0.6906, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.4629548443665059, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0001543859649122807, |
|
"loss": 0.6957, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.4641239222563203, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.000154775828460039, |
|
"loss": 0.6863, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.4652930001461347, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.00015516569200779728, |
|
"loss": 0.6845, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.46646207803594913, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.00015555555555555556, |
|
"loss": 0.6393, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.46763115592576354, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.00015594541910331385, |
|
"loss": 0.6739, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.46880023381557795, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.00015633528265107213, |
|
"loss": 0.6919, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 0.46996931170539236, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.00015672514619883041, |
|
"loss": 0.6885, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 0.47113838959520676, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.0001571150097465887, |
|
"loss": 0.7075, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 0.47230746748502117, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.00015750487329434698, |
|
"loss": 0.6716, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.4734765453748356, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.00015789473684210527, |
|
"loss": 0.6595, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.47464562326465, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.00015828460038986355, |
|
"loss": 0.6272, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.4758147011544644, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00015867446393762183, |
|
"loss": 0.6849, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 0.4769837790442788, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.00015906432748538012, |
|
"loss": 0.6523, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.4781528569340932, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.0001594541910331384, |
|
"loss": 0.7112, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 0.47932193482390767, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.00015984405458089668, |
|
"loss": 0.7073, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.4804910127137221, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.000160233918128655, |
|
"loss": 0.6436, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 0.4816600906035365, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00016062378167641325, |
|
"loss": 0.669, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 0.4828291684933509, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.00016101364522417156, |
|
"loss": 0.8224, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 0.4839982463831653, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.00016140350877192982, |
|
"loss": 0.7526, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.4851673242729797, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.00016179337231968813, |
|
"loss": 0.6986, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.4863364021627941, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.00016218323586744639, |
|
"loss": 0.6496, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 0.4875054800526085, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0001625730994152047, |
|
"loss": 0.7165, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 0.4886745579424229, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00016296296296296295, |
|
"loss": 0.6453, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 0.48984363583223733, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.00016335282651072126, |
|
"loss": 0.617, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 0.49101271372205174, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.00016374269005847952, |
|
"loss": 0.6841, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.49218179161186615, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00016413255360623783, |
|
"loss": 0.6668, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 0.49335086950168056, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0001645224171539961, |
|
"loss": 0.7092, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 0.49451994739149496, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0001649122807017544, |
|
"loss": 0.6479, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 0.49568902528130937, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.00016530214424951268, |
|
"loss": 0.6687, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 0.4968581031711238, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.00016569200779727097, |
|
"loss": 0.6704, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.4980271810609382, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00016608187134502925, |
|
"loss": 0.6678, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 0.4991962589507526, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00016647173489278753, |
|
"loss": 0.6744, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 0.500365336840567, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.00016686159844054582, |
|
"loss": 0.6565, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 0.5015344147303814, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.0001672514619883041, |
|
"loss": 0.6499, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 0.5027034926201959, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.00016764132553606238, |
|
"loss": 0.6619, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.5038725705100102, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 0.00016803118908382067, |
|
"loss": 0.8945, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 0.5050416483998247, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.00016842105263157895, |
|
"loss": 0.7597, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.506210726289639, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.00016881091617933723, |
|
"loss": 0.6698, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 0.5073798041794535, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.00016920077972709552, |
|
"loss": 0.7135, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 0.5085488820692678, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.0001695906432748538, |
|
"loss": 0.8298, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.5097179599590823, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.00016998050682261209, |
|
"loss": 0.6749, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 0.5108870378488967, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.00017037037037037037, |
|
"loss": 0.8318, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 0.5120561157387111, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.00017076023391812865, |
|
"loss": 0.7329, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 0.5132251936285255, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.00017115009746588694, |
|
"loss": 0.7127, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 0.5143942715183399, |
|
"grad_norm": 9.375, |
|
"learning_rate": 0.00017153996101364522, |
|
"loss": 0.7969, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.5155633494081543, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 0.00017192982456140353, |
|
"loss": 0.8313, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.5167324272979688, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.0001723196881091618, |
|
"loss": 0.7693, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 0.5179015051877831, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.0001727095516569201, |
|
"loss": 0.7127, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 0.5190705830775976, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.00017309941520467836, |
|
"loss": 0.7283, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 0.5202396609674119, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.00017348927875243667, |
|
"loss": 0.6591, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.5214087388572264, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.00017387914230019492, |
|
"loss": 0.6794, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 0.5225778167470407, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.00017426900584795323, |
|
"loss": 0.6299, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 0.5237468946368552, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.0001746588693957115, |
|
"loss": 0.73, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.5249159725266695, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0001750487329434698, |
|
"loss": 0.6538, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 0.526085050416484, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.00017543859649122806, |
|
"loss": 0.7013, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.5272541283062984, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.00017582846003898637, |
|
"loss": 0.6633, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 0.5284232061961128, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00017621832358674462, |
|
"loss": 0.6638, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 0.5295922840859272, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00017660818713450294, |
|
"loss": 0.6616, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 0.5307613619757416, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00017699805068226122, |
|
"loss": 0.6935, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 0.531930439865556, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0001773879142300195, |
|
"loss": 0.6795, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.5330995177553705, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.00017777777777777779, |
|
"loss": 0.6797, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 0.5342685956451848, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.00017816764132553607, |
|
"loss": 0.6583, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 0.5354376735349993, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.00017855750487329435, |
|
"loss": 0.6592, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 0.5366067514248137, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.00017894736842105264, |
|
"loss": 0.7534, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 0.5377758293146281, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.00017933723196881092, |
|
"loss": 0.699, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.5389449072044425, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0001797270955165692, |
|
"loss": 0.6759, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 0.5401139850942569, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.0001801169590643275, |
|
"loss": 0.6676, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 0.5412830629840714, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.00018050682261208577, |
|
"loss": 0.6734, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 0.5424521408738857, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.00018089668615984406, |
|
"loss": 0.6545, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 0.5436212187637002, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00018128654970760234, |
|
"loss": 0.6863, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.5447902966535145, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00018167641325536062, |
|
"loss": 0.6536, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 0.545959374543329, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.0001820662768031189, |
|
"loss": 0.6825, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 0.5471284524331433, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0001824561403508772, |
|
"loss": 0.7075, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 0.5482975303229578, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.00018284600389863547, |
|
"loss": 0.8103, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 0.5494666082127722, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 0.00018323586744639376, |
|
"loss": 0.7081, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.5506356861025866, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.00018362573099415207, |
|
"loss": 0.6647, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 0.551804763992401, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 0.00018401559454191032, |
|
"loss": 0.7202, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 0.5529738418822154, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.00018440545808966864, |
|
"loss": 0.7734, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 0.5541429197720298, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.0001847953216374269, |
|
"loss": 0.6381, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 0.5553119976618442, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.0001851851851851852, |
|
"loss": 0.7235, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.5564810755516586, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.00018557504873294346, |
|
"loss": 0.8296, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 0.5576501534414731, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00018596491228070177, |
|
"loss": 0.6901, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 0.5588192313312874, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00018635477582846003, |
|
"loss": 0.6628, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 0.5599883092211019, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00018674463937621834, |
|
"loss": 0.6689, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 0.5611573871109162, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0001871345029239766, |
|
"loss": 0.7377, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.5623264650007307, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.0001875243664717349, |
|
"loss": 0.7217, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 0.563495542890545, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0001879142300194932, |
|
"loss": 0.6776, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 0.5646646207803595, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.00018830409356725147, |
|
"loss": 0.694, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 0.5658336986701739, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.00018869395711500976, |
|
"loss": 0.7305, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 0.5670027765599883, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00018908382066276804, |
|
"loss": 0.6659, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.5681718544498027, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.00018947368421052632, |
|
"loss": 0.705, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 0.5693409323396171, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.0001898635477582846, |
|
"loss": 0.8348, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 0.5705100102294315, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0001902534113060429, |
|
"loss": 0.7383, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 0.571679088119246, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.00019064327485380117, |
|
"loss": 0.7039, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 0.5728481660090603, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.00019103313840155946, |
|
"loss": 0.7163, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.5740172438988748, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00019142300194931774, |
|
"loss": 0.6648, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 0.5751863217886892, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00019181286549707603, |
|
"loss": 0.6896, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 0.5763553996785036, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0001922027290448343, |
|
"loss": 0.7127, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 0.577524477568318, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.0001925925925925926, |
|
"loss": 0.6549, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 0.5786935554581324, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.00019298245614035088, |
|
"loss": 0.6581, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.5798626333479469, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.00019337231968810916, |
|
"loss": 0.6802, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 0.5810317112377612, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.00019376218323586744, |
|
"loss": 0.6906, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 0.5822007891275757, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.00019415204678362573, |
|
"loss": 0.7383, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 0.58336986701739, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00019454191033138404, |
|
"loss": 0.7141, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 0.5845389449072045, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.0001949317738791423, |
|
"loss": 0.6729, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.5857080227970188, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.0001953216374269006, |
|
"loss": 0.6561, |
|
"step": 1002 |
|
}, |
|
{ |
|
"epoch": 0.5868771006868333, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.00019571150097465886, |
|
"loss": 0.9202, |
|
"step": 1004 |
|
}, |
|
{ |
|
"epoch": 0.5880461785766476, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 0.00019610136452241717, |
|
"loss": 0.7142, |
|
"step": 1006 |
|
}, |
|
{ |
|
"epoch": 0.5892152564664621, |
|
"grad_norm": 105.0, |
|
"learning_rate": 0.00019649122807017543, |
|
"loss": 1.1271, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 0.5903843343562765, |
|
"grad_norm": 2.125, |
|
"learning_rate": 0.00019688109161793374, |
|
"loss": 1.1422, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.5915534122460909, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.000197270955165692, |
|
"loss": 0.7422, |
|
"step": 1012 |
|
}, |
|
{ |
|
"epoch": 0.5927224901359053, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.0001976608187134503, |
|
"loss": 0.7443, |
|
"step": 1014 |
|
}, |
|
{ |
|
"epoch": 0.5938915680257197, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.00019805068226120856, |
|
"loss": 0.7227, |
|
"step": 1016 |
|
}, |
|
{ |
|
"epoch": 0.5950606459155341, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.00019844054580896687, |
|
"loss": 0.7005, |
|
"step": 1018 |
|
}, |
|
{ |
|
"epoch": 0.5962297238053486, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.00019883040935672513, |
|
"loss": 0.6514, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.5973988016951629, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 0.00019922027290448344, |
|
"loss": 0.7211, |
|
"step": 1022 |
|
}, |
|
{ |
|
"epoch": 0.5985678795849774, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.00019961013645224173, |
|
"loss": 0.6627, |
|
"step": 1024 |
|
}, |
|
{ |
|
"epoch": 0.5997369574747917, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6844, |
|
"step": 1026 |
|
}, |
|
{ |
|
"epoch": 0.6009060353646062, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.00019990253411306043, |
|
"loss": 0.7113, |
|
"step": 1028 |
|
}, |
|
{ |
|
"epoch": 0.6020751132544205, |
|
"grad_norm": 12.5, |
|
"learning_rate": 0.00019980506822612085, |
|
"loss": 0.751, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.603244191144235, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 0.00019970760233918128, |
|
"loss": 0.7035, |
|
"step": 1032 |
|
}, |
|
{ |
|
"epoch": 0.6044132690340493, |
|
"grad_norm": 3.96875, |
|
"learning_rate": 0.00019961013645224173, |
|
"loss": 0.8057, |
|
"step": 1034 |
|
}, |
|
{ |
|
"epoch": 0.6055823469238638, |
|
"grad_norm": 2.125, |
|
"learning_rate": 0.00019951267056530218, |
|
"loss": 1.0457, |
|
"step": 1036 |
|
}, |
|
{ |
|
"epoch": 0.6067514248136782, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.0001994152046783626, |
|
"loss": 0.7618, |
|
"step": 1038 |
|
}, |
|
{ |
|
"epoch": 0.6079205027034926, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.00019931773879142302, |
|
"loss": 0.8979, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.609089580593307, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.00019922027290448344, |
|
"loss": 0.7494, |
|
"step": 1042 |
|
}, |
|
{ |
|
"epoch": 0.6102586584831214, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 0.00019912280701754386, |
|
"loss": 0.7505, |
|
"step": 1044 |
|
}, |
|
{ |
|
"epoch": 0.6114277363729358, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.0001990253411306043, |
|
"loss": 0.7356, |
|
"step": 1046 |
|
}, |
|
{ |
|
"epoch": 0.6125968142627503, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.0001989278752436647, |
|
"loss": 0.7106, |
|
"step": 1048 |
|
}, |
|
{ |
|
"epoch": 0.6137658921525647, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.00019883040935672513, |
|
"loss": 0.7151, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.6149349700423791, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 0.00019873294346978558, |
|
"loss": 0.7294, |
|
"step": 1052 |
|
}, |
|
{ |
|
"epoch": 0.6161040479321935, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.00019863547758284603, |
|
"loss": 0.7628, |
|
"step": 1054 |
|
}, |
|
{ |
|
"epoch": 0.6172731258220079, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00019853801169590645, |
|
"loss": 0.6701, |
|
"step": 1056 |
|
}, |
|
{ |
|
"epoch": 0.6184422037118223, |
|
"grad_norm": 11.5, |
|
"learning_rate": 0.00019844054580896687, |
|
"loss": 0.8591, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 0.6196112816016367, |
|
"grad_norm": 6.125, |
|
"learning_rate": 0.0001983430799220273, |
|
"loss": 0.7596, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.6207803594914512, |
|
"grad_norm": 10.625, |
|
"learning_rate": 0.00019824561403508772, |
|
"loss": 0.7751, |
|
"step": 1062 |
|
}, |
|
{ |
|
"epoch": 0.6219494373812655, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 0.00019814814814814814, |
|
"loss": 0.8455, |
|
"step": 1064 |
|
}, |
|
{ |
|
"epoch": 0.62311851527108, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 0.00019805068226120856, |
|
"loss": 0.7299, |
|
"step": 1066 |
|
}, |
|
{ |
|
"epoch": 0.6242875931608943, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.000197953216374269, |
|
"loss": 0.7077, |
|
"step": 1068 |
|
}, |
|
{ |
|
"epoch": 0.6254566710507088, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.00019785575048732946, |
|
"loss": 0.6911, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.6266257489405231, |
|
"grad_norm": 10.375, |
|
"learning_rate": 0.00019775828460038989, |
|
"loss": 0.6806, |
|
"step": 1072 |
|
}, |
|
{ |
|
"epoch": 0.6277948268303376, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 0.0001976608187134503, |
|
"loss": 0.7528, |
|
"step": 1074 |
|
}, |
|
{ |
|
"epoch": 0.628963904720152, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 0.00019756335282651073, |
|
"loss": 0.6915, |
|
"step": 1076 |
|
}, |
|
{ |
|
"epoch": 0.6301329826099664, |
|
"grad_norm": 4.125, |
|
"learning_rate": 0.00019746588693957115, |
|
"loss": 0.7393, |
|
"step": 1078 |
|
}, |
|
{ |
|
"epoch": 0.6313020604997808, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 0.00019736842105263157, |
|
"loss": 0.698, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.6324711383895952, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.000197270955165692, |
|
"loss": 0.701, |
|
"step": 1082 |
|
}, |
|
{ |
|
"epoch": 0.6336402162794096, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 0.00019717348927875245, |
|
"loss": 0.6641, |
|
"step": 1084 |
|
}, |
|
{ |
|
"epoch": 0.634809294169224, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 0.00019707602339181287, |
|
"loss": 0.7128, |
|
"step": 1086 |
|
}, |
|
{ |
|
"epoch": 0.6359783720590384, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.00019697855750487332, |
|
"loss": 0.8843, |
|
"step": 1088 |
|
}, |
|
{ |
|
"epoch": 0.6371474499488529, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 0.00019688109161793374, |
|
"loss": 0.7482, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.6383165278386672, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 0.00019678362573099416, |
|
"loss": 0.8855, |
|
"step": 1092 |
|
}, |
|
{ |
|
"epoch": 0.6394856057284817, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 0.00019668615984405458, |
|
"loss": 0.7744, |
|
"step": 1094 |
|
}, |
|
{ |
|
"epoch": 0.640654683618296, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.000196588693957115, |
|
"loss": 0.72, |
|
"step": 1096 |
|
}, |
|
{ |
|
"epoch": 0.6418237615081105, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.00019649122807017543, |
|
"loss": 0.6577, |
|
"step": 1098 |
|
}, |
|
{ |
|
"epoch": 0.6429928393979248, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 0.00019639376218323588, |
|
"loss": 0.736, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.6441619172877393, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.0001962962962962963, |
|
"loss": 0.7025, |
|
"step": 1102 |
|
}, |
|
{ |
|
"epoch": 0.6453309951775537, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.00019619883040935675, |
|
"loss": 0.7076, |
|
"step": 1104 |
|
}, |
|
{ |
|
"epoch": 0.6465000730673681, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 0.00019610136452241717, |
|
"loss": 0.7121, |
|
"step": 1106 |
|
}, |
|
{ |
|
"epoch": 0.6476691509571825, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.0001960038986354776, |
|
"loss": 0.705, |
|
"step": 1108 |
|
}, |
|
{ |
|
"epoch": 0.6488382288469969, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 0.00019590643274853802, |
|
"loss": 0.8037, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.6500073067368113, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00019580896686159844, |
|
"loss": 0.7236, |
|
"step": 1112 |
|
}, |
|
{ |
|
"epoch": 0.6511763846266257, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.00019571150097465886, |
|
"loss": 0.764, |
|
"step": 1114 |
|
}, |
|
{ |
|
"epoch": 0.6523454625164402, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.0001956140350877193, |
|
"loss": 0.6921, |
|
"step": 1116 |
|
}, |
|
{ |
|
"epoch": 0.6535145404062546, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.00019551656920077973, |
|
"loss": 0.7026, |
|
"step": 1118 |
|
}, |
|
{ |
|
"epoch": 0.654683618296069, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 0.00019541910331384016, |
|
"loss": 0.6641, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.6558526961858834, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.0001953216374269006, |
|
"loss": 0.7031, |
|
"step": 1122 |
|
}, |
|
{ |
|
"epoch": 0.6570217740756978, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.00019522417153996103, |
|
"loss": 0.6727, |
|
"step": 1124 |
|
}, |
|
{ |
|
"epoch": 0.6581908519655122, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.00019512670565302145, |
|
"loss": 0.6963, |
|
"step": 1126 |
|
}, |
|
{ |
|
"epoch": 0.6593599298553267, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.00019502923976608187, |
|
"loss": 0.665, |
|
"step": 1128 |
|
}, |
|
{ |
|
"epoch": 0.660529007745141, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0001949317738791423, |
|
"loss": 0.6752, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.6616980856349555, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.00019483430799220274, |
|
"loss": 0.6616, |
|
"step": 1132 |
|
}, |
|
{ |
|
"epoch": 0.6628671635247698, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.00019473684210526317, |
|
"loss": 0.6426, |
|
"step": 1134 |
|
}, |
|
{ |
|
"epoch": 0.6640362414145843, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.0001946393762183236, |
|
"loss": 0.675, |
|
"step": 1136 |
|
}, |
|
{ |
|
"epoch": 0.6652053193043986, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.00019454191033138404, |
|
"loss": 0.732, |
|
"step": 1138 |
|
}, |
|
{ |
|
"epoch": 0.6663743971942131, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.00019444444444444446, |
|
"loss": 0.6701, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.6675434750840274, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.00019434697855750488, |
|
"loss": 0.6637, |
|
"step": 1142 |
|
}, |
|
{ |
|
"epoch": 0.6687125529738419, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.0001942495126705653, |
|
"loss": 0.8085, |
|
"step": 1144 |
|
}, |
|
{ |
|
"epoch": 0.6698816308636563, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.00019415204678362573, |
|
"loss": 0.6828, |
|
"step": 1146 |
|
}, |
|
{ |
|
"epoch": 0.6710507087534707, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.00019405458089668618, |
|
"loss": 0.7319, |
|
"step": 1148 |
|
}, |
|
{ |
|
"epoch": 0.6722197866432851, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0001939571150097466, |
|
"loss": 0.6919, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.6733888645330995, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.00019385964912280702, |
|
"loss": 0.7407, |
|
"step": 1152 |
|
}, |
|
{ |
|
"epoch": 0.6745579424229139, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.00019376218323586744, |
|
"loss": 0.7096, |
|
"step": 1154 |
|
}, |
|
{ |
|
"epoch": 0.6757270203127284, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.0001936647173489279, |
|
"loss": 0.6948, |
|
"step": 1156 |
|
}, |
|
{ |
|
"epoch": 0.6768960982025427, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.00019356725146198832, |
|
"loss": 0.6902, |
|
"step": 1158 |
|
}, |
|
{ |
|
"epoch": 0.6780651760923572, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00019346978557504874, |
|
"loss": 0.6991, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.6792342539821715, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.00019337231968810916, |
|
"loss": 0.7059, |
|
"step": 1162 |
|
}, |
|
{ |
|
"epoch": 0.680403331871986, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 0.0001932748538011696, |
|
"loss": 0.8268, |
|
"step": 1164 |
|
}, |
|
{ |
|
"epoch": 0.6815724097618003, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00019317738791423003, |
|
"loss": 0.6697, |
|
"step": 1166 |
|
}, |
|
{ |
|
"epoch": 0.6827414876516148, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.00019307992202729045, |
|
"loss": 0.6828, |
|
"step": 1168 |
|
}, |
|
{ |
|
"epoch": 0.6839105655414291, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.00019298245614035088, |
|
"loss": 0.8377, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.6850796434312436, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.00019288499025341133, |
|
"loss": 0.7405, |
|
"step": 1172 |
|
}, |
|
{ |
|
"epoch": 0.686248721321058, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.00019278752436647175, |
|
"loss": 0.7039, |
|
"step": 1174 |
|
}, |
|
{ |
|
"epoch": 0.6874177992108724, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.00019269005847953217, |
|
"loss": 0.7217, |
|
"step": 1176 |
|
}, |
|
{ |
|
"epoch": 0.6885868771006868, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 0.0001925925925925926, |
|
"loss": 0.6667, |
|
"step": 1178 |
|
}, |
|
{ |
|
"epoch": 0.6897559549905012, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00019249512670565304, |
|
"loss": 0.7186, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.6909250328803157, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.00019239766081871346, |
|
"loss": 0.7037, |
|
"step": 1182 |
|
}, |
|
{ |
|
"epoch": 0.69209411077013, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.0001923001949317739, |
|
"loss": 0.6953, |
|
"step": 1184 |
|
}, |
|
{ |
|
"epoch": 0.6932631886599445, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.0001922027290448343, |
|
"loss": 0.6952, |
|
"step": 1186 |
|
}, |
|
{ |
|
"epoch": 0.6944322665497589, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.00019210526315789473, |
|
"loss": 0.7018, |
|
"step": 1188 |
|
}, |
|
{ |
|
"epoch": 0.6956013444395733, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.00019200779727095518, |
|
"loss": 0.675, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.6967704223293877, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.0001919103313840156, |
|
"loss": 0.6897, |
|
"step": 1192 |
|
}, |
|
{ |
|
"epoch": 0.6979395002192021, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.00019181286549707603, |
|
"loss": 0.6644, |
|
"step": 1194 |
|
}, |
|
{ |
|
"epoch": 0.6991085781090165, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.00019171539961013647, |
|
"loss": 0.7869, |
|
"step": 1196 |
|
}, |
|
{ |
|
"epoch": 0.700277655998831, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0001916179337231969, |
|
"loss": 0.6748, |
|
"step": 1198 |
|
}, |
|
{ |
|
"epoch": 0.7014467338886453, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.00019152046783625732, |
|
"loss": 0.6426, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.7026158117784598, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00019142300194931774, |
|
"loss": 0.6642, |
|
"step": 1202 |
|
}, |
|
{ |
|
"epoch": 0.7037848896682741, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.00019132553606237816, |
|
"loss": 0.7248, |
|
"step": 1204 |
|
}, |
|
{ |
|
"epoch": 0.7049539675580886, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.0001912280701754386, |
|
"loss": 0.6599, |
|
"step": 1206 |
|
}, |
|
{ |
|
"epoch": 0.7061230454479029, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.00019113060428849904, |
|
"loss": 0.6483, |
|
"step": 1208 |
|
}, |
|
{ |
|
"epoch": 0.7072921233377174, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00019103313840155946, |
|
"loss": 0.7163, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.7084612012275318, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.0001909356725146199, |
|
"loss": 0.6607, |
|
"step": 1212 |
|
}, |
|
{ |
|
"epoch": 0.7096302791173462, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.00019083820662768033, |
|
"loss": 0.6647, |
|
"step": 1214 |
|
}, |
|
{ |
|
"epoch": 0.7107993570071606, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.00019074074074074075, |
|
"loss": 0.7187, |
|
"step": 1216 |
|
}, |
|
{ |
|
"epoch": 0.711968434896975, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00019064327485380117, |
|
"loss": 0.6364, |
|
"step": 1218 |
|
}, |
|
{ |
|
"epoch": 0.7131375127867894, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.0001905458089668616, |
|
"loss": 0.6514, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.7143065906766038, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00019044834307992202, |
|
"loss": 0.6375, |
|
"step": 1222 |
|
}, |
|
{ |
|
"epoch": 0.7154756685664182, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.00019035087719298247, |
|
"loss": 0.653, |
|
"step": 1224 |
|
}, |
|
{ |
|
"epoch": 0.7166447464562327, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.0001902534113060429, |
|
"loss": 0.711, |
|
"step": 1226 |
|
}, |
|
{ |
|
"epoch": 0.717813824346047, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00019015594541910334, |
|
"loss": 0.6696, |
|
"step": 1228 |
|
}, |
|
{ |
|
"epoch": 0.7189829022358615, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00019005847953216376, |
|
"loss": 0.6747, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.7201519801256758, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.00018996101364522418, |
|
"loss": 0.6863, |
|
"step": 1232 |
|
}, |
|
{ |
|
"epoch": 0.7213210580154903, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 0.0001898635477582846, |
|
"loss": 0.7781, |
|
"step": 1234 |
|
}, |
|
{ |
|
"epoch": 0.7224901359053046, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.00018976608187134503, |
|
"loss": 0.7072, |
|
"step": 1236 |
|
}, |
|
{ |
|
"epoch": 0.7236592137951191, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 0.00018966861598440545, |
|
"loss": 0.8231, |
|
"step": 1238 |
|
}, |
|
{ |
|
"epoch": 0.7248282916849335, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.0001895711500974659, |
|
"loss": 0.6973, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.7259973695747479, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.00018947368421052632, |
|
"loss": 0.6941, |
|
"step": 1242 |
|
}, |
|
{ |
|
"epoch": 0.7271664474645623, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00018937621832358677, |
|
"loss": 0.6481, |
|
"step": 1244 |
|
}, |
|
{ |
|
"epoch": 0.7283355253543767, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.0001892787524366472, |
|
"loss": 0.6984, |
|
"step": 1246 |
|
}, |
|
{ |
|
"epoch": 0.7295046032441912, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.00018918128654970762, |
|
"loss": 0.7148, |
|
"step": 1248 |
|
}, |
|
{ |
|
"epoch": 0.7306736811340055, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.00018908382066276804, |
|
"loss": 0.6359, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.73184275902382, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00018898635477582846, |
|
"loss": 0.6469, |
|
"step": 1252 |
|
}, |
|
{ |
|
"epoch": 0.7330118369136344, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.00018888888888888888, |
|
"loss": 0.6634, |
|
"step": 1254 |
|
}, |
|
{ |
|
"epoch": 0.7341809148034488, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.0001887914230019493, |
|
"loss": 0.788, |
|
"step": 1256 |
|
}, |
|
{ |
|
"epoch": 0.7353499926932632, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.00018869395711500976, |
|
"loss": 0.6582, |
|
"step": 1258 |
|
}, |
|
{ |
|
"epoch": 0.7365190705830776, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.0001885964912280702, |
|
"loss": 0.6793, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.737688148472892, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.00018849902534113063, |
|
"loss": 0.6758, |
|
"step": 1262 |
|
}, |
|
{ |
|
"epoch": 0.7388572263627065, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00018840155945419105, |
|
"loss": 0.6362, |
|
"step": 1264 |
|
}, |
|
{ |
|
"epoch": 0.7400263042525208, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.00018830409356725147, |
|
"loss": 0.692, |
|
"step": 1266 |
|
}, |
|
{ |
|
"epoch": 0.7411953821423353, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.0001882066276803119, |
|
"loss": 0.6731, |
|
"step": 1268 |
|
}, |
|
{ |
|
"epoch": 0.7423644600321496, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.00018810916179337232, |
|
"loss": 0.7731, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.7435335379219641, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.00018801169590643274, |
|
"loss": 0.6723, |
|
"step": 1272 |
|
}, |
|
{ |
|
"epoch": 0.7447026158117784, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.0001879142300194932, |
|
"loss": 0.6414, |
|
"step": 1274 |
|
}, |
|
{ |
|
"epoch": 0.7458716937015929, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.0001878167641325536, |
|
"loss": 0.6392, |
|
"step": 1276 |
|
}, |
|
{ |
|
"epoch": 0.7470407715914072, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.00018771929824561406, |
|
"loss": 0.6615, |
|
"step": 1278 |
|
}, |
|
{ |
|
"epoch": 0.7482098494812217, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00018762183235867448, |
|
"loss": 0.6489, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.7493789273710361, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.0001875243664717349, |
|
"loss": 0.6432, |
|
"step": 1282 |
|
}, |
|
{ |
|
"epoch": 0.7505480052608505, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.00018742690058479533, |
|
"loss": 0.7061, |
|
"step": 1284 |
|
}, |
|
{ |
|
"epoch": 0.7517170831506649, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00018732943469785575, |
|
"loss": 0.692, |
|
"step": 1286 |
|
}, |
|
{ |
|
"epoch": 0.7528861610404793, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.00018723196881091617, |
|
"loss": 0.641, |
|
"step": 1288 |
|
}, |
|
{ |
|
"epoch": 0.7540552389302937, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.0001871345029239766, |
|
"loss": 0.6842, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.7552243168201082, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00018703703703703704, |
|
"loss": 0.6471, |
|
"step": 1292 |
|
}, |
|
{ |
|
"epoch": 0.7563933947099225, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.0001869395711500975, |
|
"loss": 0.6824, |
|
"step": 1294 |
|
}, |
|
{ |
|
"epoch": 0.757562472599737, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.00018684210526315792, |
|
"loss": 0.6166, |
|
"step": 1296 |
|
}, |
|
{ |
|
"epoch": 0.7587315504895513, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.00018674463937621834, |
|
"loss": 0.749, |
|
"step": 1298 |
|
}, |
|
{ |
|
"epoch": 0.7599006283793658, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.00018664717348927876, |
|
"loss": 0.7687, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.7610697062691801, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.00018654970760233918, |
|
"loss": 0.7456, |
|
"step": 1302 |
|
}, |
|
{ |
|
"epoch": 0.7622387841589946, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.0001864522417153996, |
|
"loss": 0.706, |
|
"step": 1304 |
|
}, |
|
{ |
|
"epoch": 0.763407862048809, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.00018635477582846003, |
|
"loss": 0.9545, |
|
"step": 1306 |
|
}, |
|
{ |
|
"epoch": 0.7645769399386234, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00018625730994152048, |
|
"loss": 0.6858, |
|
"step": 1308 |
|
}, |
|
{ |
|
"epoch": 0.7657460178284379, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.0001861598440545809, |
|
"loss": 0.6233, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.7669150957182522, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.00018606237816764135, |
|
"loss": 0.6781, |
|
"step": 1312 |
|
}, |
|
{ |
|
"epoch": 0.7680841736080667, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00018596491228070177, |
|
"loss": 0.6687, |
|
"step": 1314 |
|
}, |
|
{ |
|
"epoch": 0.769253251497881, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.0001858674463937622, |
|
"loss": 0.7021, |
|
"step": 1316 |
|
}, |
|
{ |
|
"epoch": 0.7704223293876955, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.00018576998050682262, |
|
"loss": 0.6391, |
|
"step": 1318 |
|
}, |
|
{ |
|
"epoch": 0.7715914072775099, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.00018567251461988304, |
|
"loss": 0.6663, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.7727604851673243, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 0.00018557504873294346, |
|
"loss": 0.6668, |
|
"step": 1322 |
|
}, |
|
{ |
|
"epoch": 0.7739295630571387, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.0001854775828460039, |
|
"loss": 0.6051, |
|
"step": 1324 |
|
}, |
|
{ |
|
"epoch": 0.7750986409469531, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.00018538011695906433, |
|
"loss": 0.6486, |
|
"step": 1326 |
|
}, |
|
{ |
|
"epoch": 0.7762677188367675, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.00018528265107212478, |
|
"loss": 0.6645, |
|
"step": 1328 |
|
}, |
|
{ |
|
"epoch": 0.777436796726582, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.0001851851851851852, |
|
"loss": 0.6235, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.7786058746163963, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.00018508771929824563, |
|
"loss": 0.7003, |
|
"step": 1332 |
|
}, |
|
{ |
|
"epoch": 0.7797749525062108, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.00018499025341130605, |
|
"loss": 0.7938, |
|
"step": 1334 |
|
}, |
|
{ |
|
"epoch": 0.7809440303960251, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.00018489278752436647, |
|
"loss": 0.6972, |
|
"step": 1336 |
|
}, |
|
{ |
|
"epoch": 0.7821131082858396, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0001847953216374269, |
|
"loss": 0.6443, |
|
"step": 1338 |
|
}, |
|
{ |
|
"epoch": 0.7832821861756539, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.00018469785575048734, |
|
"loss": 0.5978, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.7844512640654684, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.00018460038986354776, |
|
"loss": 0.7214, |
|
"step": 1342 |
|
}, |
|
{ |
|
"epoch": 0.7856203419552827, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.00018450292397660819, |
|
"loss": 0.6563, |
|
"step": 1344 |
|
}, |
|
{ |
|
"epoch": 0.7867894198450972, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.00018440545808966864, |
|
"loss": 0.6051, |
|
"step": 1346 |
|
}, |
|
{ |
|
"epoch": 0.7879584977349116, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.00018430799220272906, |
|
"loss": 0.6615, |
|
"step": 1348 |
|
}, |
|
{ |
|
"epoch": 0.789127575624726, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.00018421052631578948, |
|
"loss": 0.727, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.7902966535145404, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.0001841130604288499, |
|
"loss": 0.6712, |
|
"step": 1352 |
|
}, |
|
{ |
|
"epoch": 0.7914657314043548, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 0.00018401559454191032, |
|
"loss": 0.7094, |
|
"step": 1354 |
|
}, |
|
{ |
|
"epoch": 0.7926348092941692, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.00018391812865497077, |
|
"loss": 0.6863, |
|
"step": 1356 |
|
}, |
|
{ |
|
"epoch": 0.7938038871839836, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.0001838206627680312, |
|
"loss": 0.6378, |
|
"step": 1358 |
|
}, |
|
{ |
|
"epoch": 0.794972965073798, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.00018372319688109162, |
|
"loss": 0.8658, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.7961420429636125, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00018362573099415207, |
|
"loss": 0.6507, |
|
"step": 1362 |
|
}, |
|
{ |
|
"epoch": 0.7973111208534268, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.0001835282651072125, |
|
"loss": 0.7661, |
|
"step": 1364 |
|
}, |
|
{ |
|
"epoch": 0.7984801987432413, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.0001834307992202729, |
|
"loss": 0.6932, |
|
"step": 1366 |
|
}, |
|
{ |
|
"epoch": 0.7996492766330556, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.00018333333333333334, |
|
"loss": 0.681, |
|
"step": 1368 |
|
}, |
|
{ |
|
"epoch": 0.8008183545228701, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.00018323586744639376, |
|
"loss": 0.6374, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.8019874324126844, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.0001831384015594542, |
|
"loss": 0.657, |
|
"step": 1372 |
|
}, |
|
{ |
|
"epoch": 0.8031565103024989, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.00018304093567251463, |
|
"loss": 0.6511, |
|
"step": 1374 |
|
}, |
|
{ |
|
"epoch": 0.8043255881923134, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.00018294346978557505, |
|
"loss": 0.6286, |
|
"step": 1376 |
|
}, |
|
{ |
|
"epoch": 0.8054946660821277, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.00018284600389863547, |
|
"loss": 0.6964, |
|
"step": 1378 |
|
}, |
|
{ |
|
"epoch": 0.8066637439719422, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.00018274853801169592, |
|
"loss": 0.6383, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.8078328218617565, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00018265107212475635, |
|
"loss": 0.6459, |
|
"step": 1382 |
|
}, |
|
{ |
|
"epoch": 0.809001899751571, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.00018255360623781677, |
|
"loss": 0.7644, |
|
"step": 1384 |
|
}, |
|
{ |
|
"epoch": 0.8101709776413853, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.0001824561403508772, |
|
"loss": 0.6872, |
|
"step": 1386 |
|
}, |
|
{ |
|
"epoch": 0.8113400555311998, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.00018235867446393764, |
|
"loss": 0.7238, |
|
"step": 1388 |
|
}, |
|
{ |
|
"epoch": 0.8125091334210142, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00018226120857699806, |
|
"loss": 0.6579, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.8136782113108286, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00018216374269005848, |
|
"loss": 0.6741, |
|
"step": 1392 |
|
}, |
|
{ |
|
"epoch": 0.814847289200643, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0001820662768031189, |
|
"loss": 0.6218, |
|
"step": 1394 |
|
}, |
|
{ |
|
"epoch": 0.8160163670904574, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.00018196881091617936, |
|
"loss": 0.6077, |
|
"step": 1396 |
|
}, |
|
{ |
|
"epoch": 0.8171854449802718, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.00018187134502923978, |
|
"loss": 0.6797, |
|
"step": 1398 |
|
}, |
|
{ |
|
"epoch": 0.8183545228700863, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.0001817738791423002, |
|
"loss": 0.6416, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.8195236007599006, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00018167641325536062, |
|
"loss": 0.6664, |
|
"step": 1402 |
|
}, |
|
{ |
|
"epoch": 0.8206926786497151, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.00018157894736842107, |
|
"loss": 0.6795, |
|
"step": 1404 |
|
}, |
|
{ |
|
"epoch": 0.8218617565395294, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.0001814814814814815, |
|
"loss": 0.7478, |
|
"step": 1406 |
|
}, |
|
{ |
|
"epoch": 0.8230308344293439, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00018138401559454192, |
|
"loss": 0.6926, |
|
"step": 1408 |
|
}, |
|
{ |
|
"epoch": 0.8241999123191582, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00018128654970760234, |
|
"loss": 0.6235, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.8253689902089727, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.00018118908382066276, |
|
"loss": 0.7571, |
|
"step": 1412 |
|
}, |
|
{ |
|
"epoch": 0.826538068098787, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.0001810916179337232, |
|
"loss": 0.6671, |
|
"step": 1414 |
|
}, |
|
{ |
|
"epoch": 0.8277071459886015, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.00018099415204678363, |
|
"loss": 0.715, |
|
"step": 1416 |
|
}, |
|
{ |
|
"epoch": 0.8288762238784159, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.00018089668615984406, |
|
"loss": 0.643, |
|
"step": 1418 |
|
}, |
|
{ |
|
"epoch": 0.8300453017682303, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.0001807992202729045, |
|
"loss": 0.6701, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.8312143796580447, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.00018070175438596493, |
|
"loss": 0.6674, |
|
"step": 1422 |
|
}, |
|
{ |
|
"epoch": 0.8323834575478591, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.00018060428849902535, |
|
"loss": 0.6442, |
|
"step": 1424 |
|
}, |
|
{ |
|
"epoch": 0.8335525354376735, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00018050682261208577, |
|
"loss": 0.6345, |
|
"step": 1426 |
|
}, |
|
{ |
|
"epoch": 0.834721613327488, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.0001804093567251462, |
|
"loss": 0.7375, |
|
"step": 1428 |
|
}, |
|
{ |
|
"epoch": 0.8358906912173023, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.00018031189083820664, |
|
"loss": 0.6404, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.8370597691071168, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.00018021442495126707, |
|
"loss": 0.6356, |
|
"step": 1432 |
|
}, |
|
{ |
|
"epoch": 0.8382288469969311, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0001801169590643275, |
|
"loss": 0.6207, |
|
"step": 1434 |
|
}, |
|
{ |
|
"epoch": 0.8393979248867456, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.00018001949317738794, |
|
"loss": 0.6375, |
|
"step": 1436 |
|
}, |
|
{ |
|
"epoch": 0.8405670027765599, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 0.00017992202729044836, |
|
"loss": 0.8081, |
|
"step": 1438 |
|
}, |
|
{ |
|
"epoch": 0.8417360806663744, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.00017982456140350878, |
|
"loss": 0.7778, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.8429051585561889, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.0001797270955165692, |
|
"loss": 0.7142, |
|
"step": 1442 |
|
}, |
|
{ |
|
"epoch": 0.8440742364460032, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.00017962962962962963, |
|
"loss": 0.6502, |
|
"step": 1444 |
|
}, |
|
{ |
|
"epoch": 0.8452433143358177, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.00017953216374269005, |
|
"loss": 0.6668, |
|
"step": 1446 |
|
}, |
|
{ |
|
"epoch": 0.846412392225632, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.0001794346978557505, |
|
"loss": 0.6663, |
|
"step": 1448 |
|
}, |
|
{ |
|
"epoch": 0.8475814701154465, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 0.00017933723196881092, |
|
"loss": 0.7231, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.8487505480052608, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.00017923976608187137, |
|
"loss": 0.6152, |
|
"step": 1452 |
|
}, |
|
{ |
|
"epoch": 0.8499196258950753, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.0001791423001949318, |
|
"loss": 0.7494, |
|
"step": 1454 |
|
}, |
|
{ |
|
"epoch": 0.8510887037848897, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00017904483430799221, |
|
"loss": 0.6777, |
|
"step": 1456 |
|
}, |
|
{ |
|
"epoch": 0.8522577816747041, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.00017894736842105264, |
|
"loss": 0.6667, |
|
"step": 1458 |
|
}, |
|
{ |
|
"epoch": 0.8534268595645185, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00017884990253411306, |
|
"loss": 0.6389, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.8545959374543329, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00017875243664717348, |
|
"loss": 0.6435, |
|
"step": 1462 |
|
}, |
|
{ |
|
"epoch": 0.8557650153441473, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.00017865497076023393, |
|
"loss": 0.654, |
|
"step": 1464 |
|
}, |
|
{ |
|
"epoch": 0.8569340932339617, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00017855750487329435, |
|
"loss": 0.6692, |
|
"step": 1466 |
|
}, |
|
{ |
|
"epoch": 0.8581031711237761, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.0001784600389863548, |
|
"loss": 0.6807, |
|
"step": 1468 |
|
}, |
|
{ |
|
"epoch": 0.8592722490135906, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00017836257309941523, |
|
"loss": 0.6314, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.8604413269034049, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.00017826510721247565, |
|
"loss": 0.6068, |
|
"step": 1472 |
|
}, |
|
{ |
|
"epoch": 0.8616104047932194, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.00017816764132553607, |
|
"loss": 0.6545, |
|
"step": 1474 |
|
}, |
|
{ |
|
"epoch": 0.8627794826830337, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.0001780701754385965, |
|
"loss": 0.6898, |
|
"step": 1476 |
|
}, |
|
{ |
|
"epoch": 0.8639485605728482, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00017797270955165691, |
|
"loss": 0.6213, |
|
"step": 1478 |
|
}, |
|
{ |
|
"epoch": 0.8651176384626625, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.00017787524366471734, |
|
"loss": 0.6487, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.866286716352477, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00017777777777777779, |
|
"loss": 0.6895, |
|
"step": 1482 |
|
}, |
|
{ |
|
"epoch": 0.8674557942422914, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.00017768031189083824, |
|
"loss": 0.6522, |
|
"step": 1484 |
|
}, |
|
{ |
|
"epoch": 0.8686248721321058, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.00017758284600389866, |
|
"loss": 0.6484, |
|
"step": 1486 |
|
}, |
|
{ |
|
"epoch": 0.8697939500219202, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.00017748538011695908, |
|
"loss": 0.6517, |
|
"step": 1488 |
|
}, |
|
{ |
|
"epoch": 0.8709630279117346, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.0001773879142300195, |
|
"loss": 0.6644, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.872132105801549, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.00017729044834307992, |
|
"loss": 0.6383, |
|
"step": 1492 |
|
}, |
|
{ |
|
"epoch": 0.8733011836913634, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.00017719298245614035, |
|
"loss": 0.7941, |
|
"step": 1494 |
|
}, |
|
{ |
|
"epoch": 0.8744702615811778, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.00017709551656920077, |
|
"loss": 0.6882, |
|
"step": 1496 |
|
}, |
|
{ |
|
"epoch": 0.8756393394709923, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00017699805068226122, |
|
"loss": 0.627, |
|
"step": 1498 |
|
}, |
|
{ |
|
"epoch": 0.8768084173608066, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.00017690058479532167, |
|
"loss": 0.6141, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.8779774952506211, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.0001768031189083821, |
|
"loss": 0.6226, |
|
"step": 1502 |
|
}, |
|
{ |
|
"epoch": 0.8791465731404354, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.0001767056530214425, |
|
"loss": 0.6834, |
|
"step": 1504 |
|
}, |
|
{ |
|
"epoch": 0.8803156510302499, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.00017660818713450294, |
|
"loss": 0.6269, |
|
"step": 1506 |
|
}, |
|
{ |
|
"epoch": 0.8814847289200644, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.00017651072124756336, |
|
"loss": 0.6494, |
|
"step": 1508 |
|
}, |
|
{ |
|
"epoch": 0.8826538068098787, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.00017641325536062378, |
|
"loss": 0.6028, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.8838228846996932, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.0001763157894736842, |
|
"loss": 0.6428, |
|
"step": 1512 |
|
}, |
|
{ |
|
"epoch": 0.8849919625895075, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00017621832358674462, |
|
"loss": 0.6303, |
|
"step": 1514 |
|
}, |
|
{ |
|
"epoch": 0.886161040479322, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00017612085769980507, |
|
"loss": 0.6522, |
|
"step": 1516 |
|
}, |
|
{ |
|
"epoch": 0.8873301183691363, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 0.00017602339181286552, |
|
"loss": 0.6279, |
|
"step": 1518 |
|
}, |
|
{ |
|
"epoch": 0.8884991962589508, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.00017592592592592595, |
|
"loss": 0.6507, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.8896682741487651, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.00017582846003898637, |
|
"loss": 0.6137, |
|
"step": 1522 |
|
}, |
|
{ |
|
"epoch": 0.8908373520385796, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.0001757309941520468, |
|
"loss": 0.649, |
|
"step": 1524 |
|
}, |
|
{ |
|
"epoch": 0.892006429928394, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.0001756335282651072, |
|
"loss": 0.6498, |
|
"step": 1526 |
|
}, |
|
{ |
|
"epoch": 0.8931755078182084, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 0.00017553606237816763, |
|
"loss": 0.6804, |
|
"step": 1528 |
|
}, |
|
{ |
|
"epoch": 0.8943445857080228, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.00017543859649122806, |
|
"loss": 0.6186, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.8955136635978372, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.0001753411306042885, |
|
"loss": 0.6518, |
|
"step": 1532 |
|
}, |
|
{ |
|
"epoch": 0.8966827414876516, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 0.00017524366471734896, |
|
"loss": 0.6458, |
|
"step": 1534 |
|
}, |
|
{ |
|
"epoch": 0.897851819377466, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.00017514619883040938, |
|
"loss": 0.6642, |
|
"step": 1536 |
|
}, |
|
{ |
|
"epoch": 0.8990208972672804, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.0001750487329434698, |
|
"loss": 0.6254, |
|
"step": 1538 |
|
}, |
|
{ |
|
"epoch": 0.9001899751570949, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.00017495126705653022, |
|
"loss": 0.6158, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.9013590530469092, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00017485380116959065, |
|
"loss": 0.6635, |
|
"step": 1542 |
|
}, |
|
{ |
|
"epoch": 0.9025281309367237, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.00017475633528265107, |
|
"loss": 0.6242, |
|
"step": 1544 |
|
}, |
|
{ |
|
"epoch": 0.903697208826538, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.0001746588693957115, |
|
"loss": 0.6503, |
|
"step": 1546 |
|
}, |
|
{ |
|
"epoch": 0.9048662867163525, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.00017456140350877194, |
|
"loss": 0.5875, |
|
"step": 1548 |
|
}, |
|
{ |
|
"epoch": 0.9060353646061668, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.00017446393762183236, |
|
"loss": 0.6683, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.9072044424959813, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.0001743664717348928, |
|
"loss": 0.6176, |
|
"step": 1552 |
|
}, |
|
{ |
|
"epoch": 0.9083735203857957, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.00017426900584795323, |
|
"loss": 0.6351, |
|
"step": 1554 |
|
}, |
|
{ |
|
"epoch": 0.9095425982756101, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.00017417153996101366, |
|
"loss": 0.668, |
|
"step": 1556 |
|
}, |
|
{ |
|
"epoch": 0.9107116761654245, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00017407407407407408, |
|
"loss": 0.6425, |
|
"step": 1558 |
|
}, |
|
{ |
|
"epoch": 0.9118807540552389, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.0001739766081871345, |
|
"loss": 0.6127, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.9130498319450533, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.00017387914230019492, |
|
"loss": 0.6055, |
|
"step": 1562 |
|
}, |
|
{ |
|
"epoch": 0.9142189098348678, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.00017378167641325537, |
|
"loss": 0.6276, |
|
"step": 1564 |
|
}, |
|
{ |
|
"epoch": 0.9153879877246821, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.0001736842105263158, |
|
"loss": 0.6818, |
|
"step": 1566 |
|
}, |
|
{ |
|
"epoch": 0.9165570656144966, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.00017358674463937624, |
|
"loss": 0.76, |
|
"step": 1568 |
|
}, |
|
{ |
|
"epoch": 0.9177261435043109, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 0.00017348927875243667, |
|
"loss": 0.6739, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.9188952213941254, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 0.0001733918128654971, |
|
"loss": 0.6578, |
|
"step": 1572 |
|
}, |
|
{ |
|
"epoch": 0.9200642992839398, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.0001732943469785575, |
|
"loss": 0.6705, |
|
"step": 1574 |
|
}, |
|
{ |
|
"epoch": 0.9212333771737542, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.00017319688109161793, |
|
"loss": 0.6605, |
|
"step": 1576 |
|
}, |
|
{ |
|
"epoch": 0.9224024550635687, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.00017309941520467836, |
|
"loss": 0.6346, |
|
"step": 1578 |
|
}, |
|
{ |
|
"epoch": 0.923571532953383, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.0001730019493177388, |
|
"loss": 0.6291, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.9247406108431975, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.00017290448343079923, |
|
"loss": 0.6774, |
|
"step": 1582 |
|
}, |
|
{ |
|
"epoch": 0.9259096887330118, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00017280701754385965, |
|
"loss": 0.6603, |
|
"step": 1584 |
|
}, |
|
{ |
|
"epoch": 0.9270787666228263, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 0.0001727095516569201, |
|
"loss": 0.6435, |
|
"step": 1586 |
|
}, |
|
{ |
|
"epoch": 0.9282478445126406, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.00017261208576998052, |
|
"loss": 0.7842, |
|
"step": 1588 |
|
}, |
|
{ |
|
"epoch": 0.9294169224024551, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 0.00017251461988304094, |
|
"loss": 0.6378, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.9305860002922695, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.00017241715399610137, |
|
"loss": 0.6809, |
|
"step": 1592 |
|
}, |
|
{ |
|
"epoch": 0.9317550781820839, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.0001723196881091618, |
|
"loss": 0.6226, |
|
"step": 1594 |
|
}, |
|
{ |
|
"epoch": 0.9329241560718983, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.00017222222222222224, |
|
"loss": 0.6763, |
|
"step": 1596 |
|
}, |
|
{ |
|
"epoch": 0.9340932339617127, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.00017212475633528266, |
|
"loss": 0.7279, |
|
"step": 1598 |
|
}, |
|
{ |
|
"epoch": 0.9352623118515271, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00017202729044834308, |
|
"loss": 0.6655, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.9364313897413415, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 0.00017192982456140353, |
|
"loss": 0.6658, |
|
"step": 1602 |
|
}, |
|
{ |
|
"epoch": 0.9376004676311559, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00017183235867446395, |
|
"loss": 0.7698, |
|
"step": 1604 |
|
}, |
|
{ |
|
"epoch": 0.9387695455209704, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.00017173489278752438, |
|
"loss": 0.6555, |
|
"step": 1606 |
|
}, |
|
{ |
|
"epoch": 0.9399386234107847, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.0001716374269005848, |
|
"loss": 0.6465, |
|
"step": 1608 |
|
}, |
|
{ |
|
"epoch": 0.9411077013005992, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.00017153996101364522, |
|
"loss": 0.6642, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.9422767791904135, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.00017144249512670567, |
|
"loss": 0.6579, |
|
"step": 1612 |
|
}, |
|
{ |
|
"epoch": 0.943445857080228, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.0001713450292397661, |
|
"loss": 0.6427, |
|
"step": 1614 |
|
}, |
|
{ |
|
"epoch": 0.9446149349700423, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.00017124756335282651, |
|
"loss": 0.6114, |
|
"step": 1616 |
|
}, |
|
{ |
|
"epoch": 0.9457840128598568, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.00017115009746588694, |
|
"loss": 0.5887, |
|
"step": 1618 |
|
}, |
|
{ |
|
"epoch": 0.9469530907496712, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.00017105263157894739, |
|
"loss": 0.6266, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.9481221686394856, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.0001709551656920078, |
|
"loss": 0.642, |
|
"step": 1622 |
|
}, |
|
{ |
|
"epoch": 0.9492912465293, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.00017085769980506823, |
|
"loss": 0.605, |
|
"step": 1624 |
|
}, |
|
{ |
|
"epoch": 0.9504603244191144, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00017076023391812865, |
|
"loss": 0.5981, |
|
"step": 1626 |
|
}, |
|
{ |
|
"epoch": 0.9516294023089288, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.0001706627680311891, |
|
"loss": 0.7126, |
|
"step": 1628 |
|
}, |
|
{ |
|
"epoch": 0.9527984801987432, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 0.00017056530214424952, |
|
"loss": 0.7645, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.9539675580885576, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.00017046783625730995, |
|
"loss": 0.7024, |
|
"step": 1632 |
|
}, |
|
{ |
|
"epoch": 0.9551366359783721, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 0.00017037037037037037, |
|
"loss": 0.7639, |
|
"step": 1634 |
|
}, |
|
{ |
|
"epoch": 0.9563057138681864, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.00017027290448343082, |
|
"loss": 0.624, |
|
"step": 1636 |
|
}, |
|
{ |
|
"epoch": 0.9574747917580009, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00017017543859649124, |
|
"loss": 0.6589, |
|
"step": 1638 |
|
}, |
|
{ |
|
"epoch": 0.9586438696478153, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00017007797270955166, |
|
"loss": 0.6068, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.9598129475376297, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.00016998050682261209, |
|
"loss": 0.6223, |
|
"step": 1642 |
|
}, |
|
{ |
|
"epoch": 0.9609820254274442, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.00016988304093567254, |
|
"loss": 0.6217, |
|
"step": 1644 |
|
}, |
|
{ |
|
"epoch": 0.9621511033172585, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.00016978557504873296, |
|
"loss": 0.6249, |
|
"step": 1646 |
|
}, |
|
{ |
|
"epoch": 0.963320181207073, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00016968810916179338, |
|
"loss": 0.6676, |
|
"step": 1648 |
|
}, |
|
{ |
|
"epoch": 0.9644892590968873, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.0001695906432748538, |
|
"loss": 0.6327, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.9656583369867018, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.00016949317738791422, |
|
"loss": 0.6233, |
|
"step": 1652 |
|
}, |
|
{ |
|
"epoch": 0.9668274148765161, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.00016939571150097467, |
|
"loss": 0.6466, |
|
"step": 1654 |
|
}, |
|
{ |
|
"epoch": 0.9679964927663306, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.0001692982456140351, |
|
"loss": 0.6173, |
|
"step": 1656 |
|
}, |
|
{ |
|
"epoch": 0.969165570656145, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00016920077972709552, |
|
"loss": 0.6279, |
|
"step": 1658 |
|
}, |
|
{ |
|
"epoch": 0.9703346485459594, |
|
"grad_norm": 4.78125, |
|
"learning_rate": 0.00016910331384015597, |
|
"loss": 0.8359, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.9715037264357738, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.0001690058479532164, |
|
"loss": 0.6702, |
|
"step": 1662 |
|
}, |
|
{ |
|
"epoch": 0.9726728043255882, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.0001689083820662768, |
|
"loss": 0.6553, |
|
"step": 1664 |
|
}, |
|
{ |
|
"epoch": 0.9738418822154026, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 0.00016881091617933723, |
|
"loss": 0.7719, |
|
"step": 1666 |
|
}, |
|
{ |
|
"epoch": 0.975010960105217, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.00016871345029239766, |
|
"loss": 0.6581, |
|
"step": 1668 |
|
}, |
|
{ |
|
"epoch": 0.9761800379950314, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.0001686159844054581, |
|
"loss": 0.6169, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.9773491158848459, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.00016851851851851853, |
|
"loss": 0.6449, |
|
"step": 1672 |
|
}, |
|
{ |
|
"epoch": 0.9785181937746602, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00016842105263157895, |
|
"loss": 0.6011, |
|
"step": 1674 |
|
}, |
|
{ |
|
"epoch": 0.9796872716644747, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.0001683235867446394, |
|
"loss": 0.6331, |
|
"step": 1676 |
|
}, |
|
{ |
|
"epoch": 0.980856349554289, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.00016822612085769982, |
|
"loss": 0.635, |
|
"step": 1678 |
|
}, |
|
{ |
|
"epoch": 0.9820254274441035, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.00016812865497076025, |
|
"loss": 0.6567, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.9831945053339178, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.00016803118908382067, |
|
"loss": 0.592, |
|
"step": 1682 |
|
}, |
|
{ |
|
"epoch": 0.9843635832237323, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.0001679337231968811, |
|
"loss": 0.6291, |
|
"step": 1684 |
|
}, |
|
{ |
|
"epoch": 0.9855326611135466, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.0001678362573099415, |
|
"loss": 0.6125, |
|
"step": 1686 |
|
}, |
|
{ |
|
"epoch": 0.9867017390033611, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.00016773879142300196, |
|
"loss": 0.5989, |
|
"step": 1688 |
|
}, |
|
{ |
|
"epoch": 0.9878708168931755, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 0.00016764132553606238, |
|
"loss": 0.6214, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.9890398947829899, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.00016754385964912283, |
|
"loss": 0.5794, |
|
"step": 1692 |
|
}, |
|
{ |
|
"epoch": 0.9902089726728043, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.00016744639376218326, |
|
"loss": 0.6138, |
|
"step": 1694 |
|
}, |
|
{ |
|
"epoch": 0.9913780505626187, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00016734892787524368, |
|
"loss": 0.5763, |
|
"step": 1696 |
|
}, |
|
{ |
|
"epoch": 0.9925471284524331, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 0.0001672514619883041, |
|
"loss": 0.5501, |
|
"step": 1698 |
|
}, |
|
{ |
|
"epoch": 0.9937162063422476, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.00016715399610136452, |
|
"loss": 0.605, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.9948852842320619, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.00016705653021442494, |
|
"loss": 0.5973, |
|
"step": 1702 |
|
}, |
|
{ |
|
"epoch": 0.9960543621218764, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.0001669590643274854, |
|
"loss": 0.5792, |
|
"step": 1704 |
|
}, |
|
{ |
|
"epoch": 0.9972234400116908, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.00016686159844054582, |
|
"loss": 0.5833, |
|
"step": 1706 |
|
}, |
|
{ |
|
"epoch": 0.9983925179015052, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.00016676413255360627, |
|
"loss": 0.6092, |
|
"step": 1708 |
|
}, |
|
{ |
|
"epoch": 0.9995615957913196, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.0001666666666666667, |
|
"loss": 0.6591, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.000730673681134, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.0001665692007797271, |
|
"loss": 0.5507, |
|
"step": 1712 |
|
}, |
|
{ |
|
"epoch": 1.0018997515709485, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.00016647173489278753, |
|
"loss": 0.5538, |
|
"step": 1714 |
|
}, |
|
{ |
|
"epoch": 1.0030688294607628, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.00016637426900584796, |
|
"loss": 0.511, |
|
"step": 1716 |
|
}, |
|
{ |
|
"epoch": 1.0042379073505772, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 0.00016627680311890838, |
|
"loss": 0.5208, |
|
"step": 1718 |
|
}, |
|
{ |
|
"epoch": 1.0054069852403917, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.0001661793372319688, |
|
"loss": 0.5428, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.006576063130206, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 0.00016608187134502925, |
|
"loss": 0.5979, |
|
"step": 1722 |
|
}, |
|
{ |
|
"epoch": 1.0077451410200204, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.0001659844054580897, |
|
"loss": 0.5444, |
|
"step": 1724 |
|
}, |
|
{ |
|
"epoch": 1.0089142189098348, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00016588693957115012, |
|
"loss": 0.574, |
|
"step": 1726 |
|
}, |
|
{ |
|
"epoch": 1.0100832967996494, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00016578947368421054, |
|
"loss": 0.4927, |
|
"step": 1728 |
|
}, |
|
{ |
|
"epoch": 1.0112523746894637, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.00016569200779727097, |
|
"loss": 0.5525, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.012421452579278, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.0001655945419103314, |
|
"loss": 0.5884, |
|
"step": 1732 |
|
}, |
|
{ |
|
"epoch": 1.0135905304690924, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.0001654970760233918, |
|
"loss": 0.4956, |
|
"step": 1734 |
|
}, |
|
{ |
|
"epoch": 1.014759608358907, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00016539961013645223, |
|
"loss": 0.5446, |
|
"step": 1736 |
|
}, |
|
{ |
|
"epoch": 1.0159286862487213, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.00016530214424951268, |
|
"loss": 0.5602, |
|
"step": 1738 |
|
}, |
|
{ |
|
"epoch": 1.0170977641385357, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.0001652046783625731, |
|
"loss": 0.5462, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.01826684202835, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00016510721247563355, |
|
"loss": 0.53, |
|
"step": 1742 |
|
}, |
|
{ |
|
"epoch": 1.0194359199181646, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.00016500974658869398, |
|
"loss": 0.5374, |
|
"step": 1744 |
|
}, |
|
{ |
|
"epoch": 1.020604997807979, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.0001649122807017544, |
|
"loss": 0.5258, |
|
"step": 1746 |
|
}, |
|
{ |
|
"epoch": 1.0217740756977933, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00016481481481481482, |
|
"loss": 0.5118, |
|
"step": 1748 |
|
}, |
|
{ |
|
"epoch": 1.0229431535876077, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.00016471734892787524, |
|
"loss": 0.4822, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.0241122314774223, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.00016461988304093567, |
|
"loss": 0.5435, |
|
"step": 1752 |
|
}, |
|
{ |
|
"epoch": 1.0252813093672366, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.0001645224171539961, |
|
"loss": 0.5703, |
|
"step": 1754 |
|
}, |
|
{ |
|
"epoch": 1.026450387257051, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.00016442495126705654, |
|
"loss": 0.587, |
|
"step": 1756 |
|
}, |
|
{ |
|
"epoch": 1.0276194651468653, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.00016432748538011699, |
|
"loss": 0.5146, |
|
"step": 1758 |
|
}, |
|
{ |
|
"epoch": 1.0287885430366799, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.0001642300194931774, |
|
"loss": 0.5102, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.0299576209264942, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.00016413255360623783, |
|
"loss": 0.5178, |
|
"step": 1762 |
|
}, |
|
{ |
|
"epoch": 1.0311266988163086, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.00016403508771929825, |
|
"loss": 0.4984, |
|
"step": 1764 |
|
}, |
|
{ |
|
"epoch": 1.032295776706123, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.00016393762183235868, |
|
"loss": 0.4915, |
|
"step": 1766 |
|
}, |
|
{ |
|
"epoch": 1.0334648545959375, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.0001638401559454191, |
|
"loss": 0.5493, |
|
"step": 1768 |
|
}, |
|
{ |
|
"epoch": 1.0346339324857519, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.00016374269005847952, |
|
"loss": 0.4906, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.0358030103755662, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00016364522417153997, |
|
"loss": 0.5478, |
|
"step": 1772 |
|
}, |
|
{ |
|
"epoch": 1.0369720882653808, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.0001635477582846004, |
|
"loss": 0.5187, |
|
"step": 1774 |
|
}, |
|
{ |
|
"epoch": 1.0381411661551951, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.00016345029239766084, |
|
"loss": 0.5421, |
|
"step": 1776 |
|
}, |
|
{ |
|
"epoch": 1.0393102440450095, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.00016335282651072126, |
|
"loss": 0.5583, |
|
"step": 1778 |
|
}, |
|
{ |
|
"epoch": 1.0404793219348238, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.00016325536062378169, |
|
"loss": 0.5283, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.0416483998246384, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.0001631578947368421, |
|
"loss": 0.5294, |
|
"step": 1782 |
|
}, |
|
{ |
|
"epoch": 1.0428174777144528, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00016306042884990253, |
|
"loss": 0.5682, |
|
"step": 1784 |
|
}, |
|
{ |
|
"epoch": 1.0439865556042671, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.00016296296296296295, |
|
"loss": 0.5088, |
|
"step": 1786 |
|
}, |
|
{ |
|
"epoch": 1.0451556334940815, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.0001628654970760234, |
|
"loss": 0.4887, |
|
"step": 1788 |
|
}, |
|
{ |
|
"epoch": 1.046324711383896, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.00016276803118908382, |
|
"loss": 0.5478, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.0474937892737104, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.00016267056530214427, |
|
"loss": 0.5156, |
|
"step": 1792 |
|
}, |
|
{ |
|
"epoch": 1.0486628671635247, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.0001625730994152047, |
|
"loss": 0.528, |
|
"step": 1794 |
|
}, |
|
{ |
|
"epoch": 1.049831945053339, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 0.00016247563352826512, |
|
"loss": 0.5, |
|
"step": 1796 |
|
}, |
|
{ |
|
"epoch": 1.0510010229431537, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 0.00016237816764132554, |
|
"loss": 0.6486, |
|
"step": 1798 |
|
}, |
|
{ |
|
"epoch": 1.052170100832968, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.00016228070175438596, |
|
"loss": 0.5215, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.0533391787227824, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00016218323586744639, |
|
"loss": 0.5179, |
|
"step": 1802 |
|
}, |
|
{ |
|
"epoch": 1.0545082566125967, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.00016208576998050683, |
|
"loss": 0.5772, |
|
"step": 1804 |
|
}, |
|
{ |
|
"epoch": 1.0556773345024113, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 0.00016198830409356726, |
|
"loss": 0.504, |
|
"step": 1806 |
|
}, |
|
{ |
|
"epoch": 1.0568464123922257, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.0001618908382066277, |
|
"loss": 0.5463, |
|
"step": 1808 |
|
}, |
|
{ |
|
"epoch": 1.05801549028204, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.00016179337231968813, |
|
"loss": 0.5146, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.0591845681718544, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.00016169590643274855, |
|
"loss": 0.5626, |
|
"step": 1812 |
|
}, |
|
{ |
|
"epoch": 1.060353646061669, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00016159844054580897, |
|
"loss": 0.5224, |
|
"step": 1814 |
|
}, |
|
{ |
|
"epoch": 1.0615227239514833, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.0001615009746588694, |
|
"loss": 0.5456, |
|
"step": 1816 |
|
}, |
|
{ |
|
"epoch": 1.0626918018412976, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.00016140350877192982, |
|
"loss": 0.5831, |
|
"step": 1818 |
|
}, |
|
{ |
|
"epoch": 1.063860879731112, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.00016130604288499027, |
|
"loss": 0.5131, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.0650299576209266, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.0001612085769980507, |
|
"loss": 0.6097, |
|
"step": 1822 |
|
}, |
|
{ |
|
"epoch": 1.066199035510741, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.0001611111111111111, |
|
"loss": 0.5895, |
|
"step": 1824 |
|
}, |
|
{ |
|
"epoch": 1.0673681134005553, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.00016101364522417156, |
|
"loss": 0.5048, |
|
"step": 1826 |
|
}, |
|
{ |
|
"epoch": 1.0685371912903698, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00016091617933723198, |
|
"loss": 0.5041, |
|
"step": 1828 |
|
}, |
|
{ |
|
"epoch": 1.0697062691801842, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0001608187134502924, |
|
"loss": 0.5293, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.0708753470699985, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.00016072124756335283, |
|
"loss": 0.5054, |
|
"step": 1832 |
|
}, |
|
{ |
|
"epoch": 1.072044424959813, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.00016062378167641325, |
|
"loss": 0.5345, |
|
"step": 1834 |
|
}, |
|
{ |
|
"epoch": 1.0732135028496272, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.0001605263157894737, |
|
"loss": 0.5057, |
|
"step": 1836 |
|
}, |
|
{ |
|
"epoch": 1.0743825807394418, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.00016042884990253412, |
|
"loss": 0.5478, |
|
"step": 1838 |
|
}, |
|
{ |
|
"epoch": 1.0755516586292562, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.00016033138401559454, |
|
"loss": 0.6639, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.0767207365190705, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.000160233918128655, |
|
"loss": 0.529, |
|
"step": 1842 |
|
}, |
|
{ |
|
"epoch": 1.077889814408885, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.00016013645224171542, |
|
"loss": 0.5433, |
|
"step": 1844 |
|
}, |
|
{ |
|
"epoch": 1.0790588922986994, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.00016003898635477584, |
|
"loss": 0.6416, |
|
"step": 1846 |
|
}, |
|
{ |
|
"epoch": 1.0802279701885138, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.00015994152046783626, |
|
"loss": 0.5125, |
|
"step": 1848 |
|
}, |
|
{ |
|
"epoch": 1.0813970480783281, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.00015984405458089668, |
|
"loss": 0.5199, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.0825661259681427, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.00015974658869395713, |
|
"loss": 0.5225, |
|
"step": 1852 |
|
}, |
|
{ |
|
"epoch": 1.083735203857957, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.00015964912280701756, |
|
"loss": 0.4977, |
|
"step": 1854 |
|
}, |
|
{ |
|
"epoch": 1.0849042817477714, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00015955165692007798, |
|
"loss": 0.5717, |
|
"step": 1856 |
|
}, |
|
{ |
|
"epoch": 1.0860733596375858, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.0001594541910331384, |
|
"loss": 0.5352, |
|
"step": 1858 |
|
}, |
|
{ |
|
"epoch": 1.0872424375274004, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 0.00015935672514619885, |
|
"loss": 0.5502, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.0884115154172147, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.00015925925925925927, |
|
"loss": 0.5091, |
|
"step": 1862 |
|
}, |
|
{ |
|
"epoch": 1.089580593307029, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.0001591617933723197, |
|
"loss": 0.5104, |
|
"step": 1864 |
|
}, |
|
{ |
|
"epoch": 1.0907496711968434, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 0.00015906432748538012, |
|
"loss": 0.5178, |
|
"step": 1866 |
|
}, |
|
{ |
|
"epoch": 1.091918749086658, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 0.00015896686159844057, |
|
"loss": 0.5414, |
|
"step": 1868 |
|
}, |
|
{ |
|
"epoch": 1.0930878269764723, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.000158869395711501, |
|
"loss": 0.4606, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.0942569048662867, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.0001587719298245614, |
|
"loss": 0.5489, |
|
"step": 1872 |
|
}, |
|
{ |
|
"epoch": 1.095425982756101, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.00015867446393762183, |
|
"loss": 0.4745, |
|
"step": 1874 |
|
}, |
|
{ |
|
"epoch": 1.0965950606459156, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.00015857699805068228, |
|
"loss": 0.5481, |
|
"step": 1876 |
|
}, |
|
{ |
|
"epoch": 1.09776413853573, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.0001584795321637427, |
|
"loss": 0.5632, |
|
"step": 1878 |
|
}, |
|
{ |
|
"epoch": 1.0989332164255443, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.00015838206627680313, |
|
"loss": 0.5136, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.1001022943153587, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.00015828460038986355, |
|
"loss": 0.51, |
|
"step": 1882 |
|
}, |
|
{ |
|
"epoch": 1.1012713722051732, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.000158187134502924, |
|
"loss": 0.5066, |
|
"step": 1884 |
|
}, |
|
{ |
|
"epoch": 1.1024404500949876, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.00015808966861598442, |
|
"loss": 0.6068, |
|
"step": 1886 |
|
}, |
|
{ |
|
"epoch": 1.103609527984802, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.00015799220272904484, |
|
"loss": 0.5732, |
|
"step": 1888 |
|
}, |
|
{ |
|
"epoch": 1.1047786058746163, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.00015789473684210527, |
|
"loss": 0.4733, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.1059476837644309, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.0001577972709551657, |
|
"loss": 0.5106, |
|
"step": 1892 |
|
}, |
|
{ |
|
"epoch": 1.1071167616542452, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.00015769980506822614, |
|
"loss": 0.545, |
|
"step": 1894 |
|
}, |
|
{ |
|
"epoch": 1.1082858395440596, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00015760233918128656, |
|
"loss": 0.5188, |
|
"step": 1896 |
|
}, |
|
{ |
|
"epoch": 1.1094549174338741, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00015750487329434698, |
|
"loss": 0.5393, |
|
"step": 1898 |
|
}, |
|
{ |
|
"epoch": 1.1106239953236885, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.00015740740740740743, |
|
"loss": 0.5315, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.1117930732135028, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.00015730994152046785, |
|
"loss": 0.5042, |
|
"step": 1902 |
|
}, |
|
{ |
|
"epoch": 1.1129621511033172, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.00015721247563352828, |
|
"loss": 0.5491, |
|
"step": 1904 |
|
}, |
|
{ |
|
"epoch": 1.1141312289931316, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0001571150097465887, |
|
"loss": 0.5612, |
|
"step": 1906 |
|
}, |
|
{ |
|
"epoch": 1.1153003068829461, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.00015701754385964912, |
|
"loss": 0.5288, |
|
"step": 1908 |
|
}, |
|
{ |
|
"epoch": 1.1164693847727605, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.00015692007797270957, |
|
"loss": 0.5249, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.1176384626625748, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.00015682261208577, |
|
"loss": 0.5217, |
|
"step": 1912 |
|
}, |
|
{ |
|
"epoch": 1.1188075405523894, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00015672514619883041, |
|
"loss": 0.5379, |
|
"step": 1914 |
|
}, |
|
{ |
|
"epoch": 1.1199766184422038, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.00015662768031189086, |
|
"loss": 0.5196, |
|
"step": 1916 |
|
}, |
|
{ |
|
"epoch": 1.121145696332018, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00015653021442495129, |
|
"loss": 0.5316, |
|
"step": 1918 |
|
}, |
|
{ |
|
"epoch": 1.1223147742218325, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.0001564327485380117, |
|
"loss": 0.5255, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.123483852111647, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.00015633528265107213, |
|
"loss": 0.5311, |
|
"step": 1922 |
|
}, |
|
{ |
|
"epoch": 1.1246529300014614, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.00015623781676413255, |
|
"loss": 0.5729, |
|
"step": 1924 |
|
}, |
|
{ |
|
"epoch": 1.1258220078912757, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.00015614035087719297, |
|
"loss": 0.495, |
|
"step": 1926 |
|
}, |
|
{ |
|
"epoch": 1.12699108578109, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00015604288499025342, |
|
"loss": 0.5984, |
|
"step": 1928 |
|
}, |
|
{ |
|
"epoch": 1.1281601636709047, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.00015594541910331385, |
|
"loss": 0.5055, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.129329241560719, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.0001558479532163743, |
|
"loss": 0.5671, |
|
"step": 1932 |
|
}, |
|
{ |
|
"epoch": 1.1304983194505334, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.00015575048732943472, |
|
"loss": 0.5577, |
|
"step": 1934 |
|
}, |
|
{ |
|
"epoch": 1.1316673973403477, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.00015565302144249514, |
|
"loss": 0.5637, |
|
"step": 1936 |
|
}, |
|
{ |
|
"epoch": 1.1328364752301623, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.00015555555555555556, |
|
"loss": 0.5337, |
|
"step": 1938 |
|
}, |
|
{ |
|
"epoch": 1.1340055531199766, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.00015545808966861599, |
|
"loss": 0.5213, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.135174631009791, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 0.0001553606237816764, |
|
"loss": 0.493, |
|
"step": 1942 |
|
}, |
|
{ |
|
"epoch": 1.1363437088996053, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00015526315789473686, |
|
"loss": 0.5246, |
|
"step": 1944 |
|
}, |
|
{ |
|
"epoch": 1.13751278678942, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 0.00015516569200779728, |
|
"loss": 0.4862, |
|
"step": 1946 |
|
}, |
|
{ |
|
"epoch": 1.1386818646792343, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.00015506822612085773, |
|
"loss": 0.6714, |
|
"step": 1948 |
|
}, |
|
{ |
|
"epoch": 1.1398509425690486, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.00015497076023391815, |
|
"loss": 0.57, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.1410200204588632, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.00015487329434697857, |
|
"loss": 0.488, |
|
"step": 1952 |
|
}, |
|
{ |
|
"epoch": 1.1421890983486775, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.000154775828460039, |
|
"loss": 0.5453, |
|
"step": 1954 |
|
}, |
|
{ |
|
"epoch": 1.143358176238492, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00015467836257309942, |
|
"loss": 0.5405, |
|
"step": 1956 |
|
}, |
|
{ |
|
"epoch": 1.1445272541283062, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.00015458089668615984, |
|
"loss": 0.4999, |
|
"step": 1958 |
|
}, |
|
{ |
|
"epoch": 1.1456963320181206, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.00015448343079922026, |
|
"loss": 0.5213, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.1468654099079352, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0001543859649122807, |
|
"loss": 0.5285, |
|
"step": 1962 |
|
}, |
|
{ |
|
"epoch": 1.1480344877977495, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 0.00015428849902534116, |
|
"loss": 0.6403, |
|
"step": 1964 |
|
}, |
|
{ |
|
"epoch": 1.1492035656875639, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.00015419103313840158, |
|
"loss": 0.5461, |
|
"step": 1966 |
|
}, |
|
{ |
|
"epoch": 1.1503726435773785, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.000154093567251462, |
|
"loss": 0.581, |
|
"step": 1968 |
|
}, |
|
{ |
|
"epoch": 1.1515417214671928, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.00015399610136452243, |
|
"loss": 0.5225, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.1527107993570072, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00015389863547758285, |
|
"loss": 0.5275, |
|
"step": 1972 |
|
}, |
|
{ |
|
"epoch": 1.1538798772468215, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.00015380116959064327, |
|
"loss": 0.5758, |
|
"step": 1974 |
|
}, |
|
{ |
|
"epoch": 1.1550489551366359, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 0.0001537037037037037, |
|
"loss": 0.7684, |
|
"step": 1976 |
|
}, |
|
{ |
|
"epoch": 1.1562180330264504, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.00015360623781676414, |
|
"loss": 0.6212, |
|
"step": 1978 |
|
}, |
|
{ |
|
"epoch": 1.1573871109162648, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.00015350877192982457, |
|
"loss": 0.5519, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.1585561888060791, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 0.00015341130604288502, |
|
"loss": 0.693, |
|
"step": 1982 |
|
}, |
|
{ |
|
"epoch": 1.1597252666958937, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.00015331384015594544, |
|
"loss": 0.5857, |
|
"step": 1984 |
|
}, |
|
{ |
|
"epoch": 1.160894344585708, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.00015321637426900586, |
|
"loss": 0.5599, |
|
"step": 1986 |
|
}, |
|
{ |
|
"epoch": 1.1620634224755224, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.00015311890838206628, |
|
"loss": 0.5946, |
|
"step": 1988 |
|
}, |
|
{ |
|
"epoch": 1.1632325003653368, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.0001530214424951267, |
|
"loss": 0.5712, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.1644015782551513, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.00015292397660818713, |
|
"loss": 0.5825, |
|
"step": 1992 |
|
}, |
|
{ |
|
"epoch": 1.1655706561449657, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.00015282651072124755, |
|
"loss": 0.5646, |
|
"step": 1994 |
|
}, |
|
{ |
|
"epoch": 1.16673973403478, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.000152729044834308, |
|
"loss": 0.5377, |
|
"step": 1996 |
|
}, |
|
{ |
|
"epoch": 1.1679088119245944, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.00015263157894736845, |
|
"loss": 0.5101, |
|
"step": 1998 |
|
}, |
|
{ |
|
"epoch": 1.169077889814409, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00015253411306042887, |
|
"loss": 0.4811, |
|
"step": 2000 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 5130, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.15246892515328e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|