|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9990426041168023, |
|
"eval_steps": 500, |
|
"global_step": 1044, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0019147917663954045, |
|
"grad_norm": 61.0, |
|
"learning_rate": 3.773584905660378e-07, |
|
"loss": 3.0718, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.003829583532790809, |
|
"grad_norm": 62.0, |
|
"learning_rate": 7.547169811320755e-07, |
|
"loss": 3.1052, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0057443752991862135, |
|
"grad_norm": 62.5, |
|
"learning_rate": 1.1320754716981133e-06, |
|
"loss": 3.1376, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.007659167065581618, |
|
"grad_norm": 103.5, |
|
"learning_rate": 1.509433962264151e-06, |
|
"loss": 3.0909, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.009573958831977022, |
|
"grad_norm": 62.0, |
|
"learning_rate": 1.8867924528301889e-06, |
|
"loss": 3.1064, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.011488750598372427, |
|
"grad_norm": 60.5, |
|
"learning_rate": 2.2641509433962266e-06, |
|
"loss": 3.0511, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.013403542364767831, |
|
"grad_norm": 58.75, |
|
"learning_rate": 2.6415094339622644e-06, |
|
"loss": 3.0435, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.015318334131163236, |
|
"grad_norm": 54.0, |
|
"learning_rate": 3.018867924528302e-06, |
|
"loss": 2.9226, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.01723312589755864, |
|
"grad_norm": 56.0, |
|
"learning_rate": 3.3962264150943395e-06, |
|
"loss": 3.0539, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.019147917663954045, |
|
"grad_norm": 50.5, |
|
"learning_rate": 3.7735849056603777e-06, |
|
"loss": 2.8775, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02106270943034945, |
|
"grad_norm": 46.75, |
|
"learning_rate": 4.150943396226416e-06, |
|
"loss": 2.8774, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.022977501196744854, |
|
"grad_norm": 42.5, |
|
"learning_rate": 4.528301886792453e-06, |
|
"loss": 2.6977, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.02489229296314026, |
|
"grad_norm": 35.5, |
|
"learning_rate": 4.905660377358491e-06, |
|
"loss": 2.6109, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.026807084729535663, |
|
"grad_norm": 30.0, |
|
"learning_rate": 5.283018867924529e-06, |
|
"loss": 2.5524, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.028721876495931067, |
|
"grad_norm": 24.125, |
|
"learning_rate": 5.660377358490566e-06, |
|
"loss": 2.5467, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.030636668262326472, |
|
"grad_norm": 19.625, |
|
"learning_rate": 6.037735849056604e-06, |
|
"loss": 2.3976, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.032551460028721876, |
|
"grad_norm": 17.25, |
|
"learning_rate": 6.415094339622642e-06, |
|
"loss": 2.349, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.03446625179511728, |
|
"grad_norm": 13.625, |
|
"learning_rate": 6.792452830188679e-06, |
|
"loss": 2.3495, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.036381043561512685, |
|
"grad_norm": 10.375, |
|
"learning_rate": 7.169811320754717e-06, |
|
"loss": 2.3404, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.03829583532790809, |
|
"grad_norm": 10.4375, |
|
"learning_rate": 7.5471698113207555e-06, |
|
"loss": 2.2341, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.040210627094303494, |
|
"grad_norm": 7.96875, |
|
"learning_rate": 7.924528301886793e-06, |
|
"loss": 2.1499, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.0421254188606989, |
|
"grad_norm": 7.125, |
|
"learning_rate": 8.301886792452832e-06, |
|
"loss": 2.1879, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.0440402106270943, |
|
"grad_norm": 5.875, |
|
"learning_rate": 8.67924528301887e-06, |
|
"loss": 2.147, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.04595500239348971, |
|
"grad_norm": 15.375, |
|
"learning_rate": 9.056603773584907e-06, |
|
"loss": 2.2175, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.04786979415988511, |
|
"grad_norm": 7.65625, |
|
"learning_rate": 9.433962264150944e-06, |
|
"loss": 2.1928, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.04978458592628052, |
|
"grad_norm": 4.125, |
|
"learning_rate": 9.811320754716981e-06, |
|
"loss": 2.1355, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.05169937769267592, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 1.018867924528302e-05, |
|
"loss": 2.117, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.053614169459071326, |
|
"grad_norm": 3.890625, |
|
"learning_rate": 1.0566037735849058e-05, |
|
"loss": 2.0604, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.05552896122546673, |
|
"grad_norm": 3.5, |
|
"learning_rate": 1.0943396226415095e-05, |
|
"loss": 2.0949, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.057443752991862135, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 1.1320754716981132e-05, |
|
"loss": 2.0517, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.05935854475825754, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 1.169811320754717e-05, |
|
"loss": 2.1012, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.061273336524652944, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 1.2075471698113209e-05, |
|
"loss": 2.0485, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.06318812829104835, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1.2452830188679246e-05, |
|
"loss": 1.9389, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.06510292005744375, |
|
"grad_norm": 3.0, |
|
"learning_rate": 1.2830188679245283e-05, |
|
"loss": 2.0672, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.06701771182383916, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.320754716981132e-05, |
|
"loss": 2.0884, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.06893250359023456, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.3584905660377358e-05, |
|
"loss": 2.0209, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.07084729535662997, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.3962264150943397e-05, |
|
"loss": 1.952, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.07276208712302537, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.4339622641509435e-05, |
|
"loss": 1.9835, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.07467687888942078, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 1.4716981132075472e-05, |
|
"loss": 1.962, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.07659167065581618, |
|
"grad_norm": 2.625, |
|
"learning_rate": 1.5094339622641511e-05, |
|
"loss": 1.9884, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.07850646242221158, |
|
"grad_norm": 6.9375, |
|
"learning_rate": 1.547169811320755e-05, |
|
"loss": 2.0301, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.08042125418860699, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.5849056603773586e-05, |
|
"loss": 1.9727, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.0823360459550024, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 1.6226415094339625e-05, |
|
"loss": 2.0236, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.0842508377213978, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.6603773584905664e-05, |
|
"loss": 1.9287, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.0861656294877932, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 1.69811320754717e-05, |
|
"loss": 1.9009, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.0880804212541886, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 1.735849056603774e-05, |
|
"loss": 1.9898, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.08999521302058401, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 1.7735849056603774e-05, |
|
"loss": 1.9054, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.09191000478697942, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.8113207547169813e-05, |
|
"loss": 1.9189, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.09382479655337482, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 1.8490566037735852e-05, |
|
"loss": 1.9284, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.09573958831977022, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.8867924528301888e-05, |
|
"loss": 1.8944, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.09765438008616563, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 1.9245283018867927e-05, |
|
"loss": 1.9709, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.09956917185256103, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1.9622641509433963e-05, |
|
"loss": 1.9177, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.10148396361895644, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 2e-05, |
|
"loss": 1.9661, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.10339875538535184, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 1.9999949751618577e-05, |
|
"loss": 1.8821, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.10531354715174725, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 1.9999799006979282e-05, |
|
"loss": 1.922, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.10722833891814265, |
|
"grad_norm": 2.875, |
|
"learning_rate": 1.9999547767597055e-05, |
|
"loss": 2.01, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.10914313068453806, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 1.9999196035996768e-05, |
|
"loss": 1.9654, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.11105792245093346, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.999874381571321e-05, |
|
"loss": 1.9159, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.11297271421732887, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 1.999819111129105e-05, |
|
"loss": 1.9388, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.11488750598372427, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 1.9997537928284783e-05, |
|
"loss": 1.8953, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.11680229775011967, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 1.9996784273258688e-05, |
|
"loss": 1.8523, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.11871708951651508, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.999593015378676e-05, |
|
"loss": 1.9153, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.12063188128291048, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 1.9994975578452626e-05, |
|
"loss": 1.817, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.12254667304930589, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.9993920556849448e-05, |
|
"loss": 1.845, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.12446146481570129, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.9992765099579853e-05, |
|
"loss": 1.8886, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.1263762565820967, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.999150921825582e-05, |
|
"loss": 1.8955, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.12829104834849211, |
|
"grad_norm": 2.25, |
|
"learning_rate": 1.9990152925498545e-05, |
|
"loss": 1.8308, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.1302058401148875, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.9988696234938333e-05, |
|
"loss": 1.9141, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.13212063188128292, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.998713916121445e-05, |
|
"loss": 1.9814, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.13403542364767831, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.9985481719974985e-05, |
|
"loss": 1.7735, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.13595021541407373, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.9983723927876685e-05, |
|
"loss": 1.8418, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.13786500718046912, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.9981865802584795e-05, |
|
"loss": 1.8867, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.13977979894686454, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 1.9979907362772865e-05, |
|
"loss": 1.8945, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.14169459071325993, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.997784862812259e-05, |
|
"loss": 1.8312, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.14360938247965535, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.997568961932358e-05, |
|
"loss": 1.8979, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.14552417424605074, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.997343035807318e-05, |
|
"loss": 1.8536, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.14743896601244616, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 1.9971070867076227e-05, |
|
"loss": 1.814, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.14935375777884155, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 1.996861117004485e-05, |
|
"loss": 1.8618, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.15126854954523697, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.9966051291698202e-05, |
|
"loss": 1.8376, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.15318334131163236, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.9963391257762234e-05, |
|
"loss": 1.8344, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.15509813307802778, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.9960631094969424e-05, |
|
"loss": 1.8546, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.15701292484442317, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.9957770831058518e-05, |
|
"loss": 1.8692, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.1589277166108186, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.995481049477424e-05, |
|
"loss": 1.8659, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.16084250837721398, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.9951750115867013e-05, |
|
"loss": 1.8511, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.1627573001436094, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.994858972509265e-05, |
|
"loss": 1.8137, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.1646720919100048, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.994532935421206e-05, |
|
"loss": 1.9099, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.1665868836764002, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.9941969035990913e-05, |
|
"loss": 1.8185, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.1685016754427956, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.9938508804199322e-05, |
|
"loss": 1.8309, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.170416467209191, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.9934948693611495e-05, |
|
"loss": 1.7971, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.1723312589755864, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.9931288740005388e-05, |
|
"loss": 1.8312, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.17424605074198182, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.992752898016235e-05, |
|
"loss": 1.7379, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.1761608425083772, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.9923669451866753e-05, |
|
"loss": 1.8071, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.17807563427477263, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.9919710193905607e-05, |
|
"loss": 1.8601, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.17999042604116802, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.9915651246068176e-05, |
|
"loss": 1.8539, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.18190521780756344, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.991149264914556e-05, |
|
"loss": 1.8179, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.18382000957395883, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 1.9907234444930328e-05, |
|
"loss": 1.8305, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.18573480134035425, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 1.9902876676216044e-05, |
|
"loss": 1.8367, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.18764959310674964, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 1.989841938679687e-05, |
|
"loss": 1.8328, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.18956438487314506, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.989386262146713e-05, |
|
"loss": 1.8388, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.19147917663954045, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.9889206426020837e-05, |
|
"loss": 1.834, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.19339396840593587, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.988445084725125e-05, |
|
"loss": 1.8004, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.19530876017233126, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 1.987959593295039e-05, |
|
"loss": 1.8231, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.19722355193872668, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.987464173190858e-05, |
|
"loss": 1.8879, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.19913834370512207, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.9869588293913932e-05, |
|
"loss": 1.8918, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.20105313547151749, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.986443566975187e-05, |
|
"loss": 1.8015, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.20296792723791288, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.9859183911204588e-05, |
|
"loss": 1.8785, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.2048827190043083, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.9853833071050567e-05, |
|
"loss": 1.8675, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.20679751077070369, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.9848383203064018e-05, |
|
"loss": 1.8918, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.2087123025370991, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.984283436201435e-05, |
|
"loss": 1.8418, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.2106270943034945, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.9837186603665615e-05, |
|
"loss": 1.8626, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2125418860698899, |
|
"grad_norm": 2.25, |
|
"learning_rate": 1.983143998477596e-05, |
|
"loss": 1.858, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.2144566778362853, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.9825594563097043e-05, |
|
"loss": 1.8028, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.21637146960268072, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.981965039737346e-05, |
|
"loss": 1.7789, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.2182862613690761, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.9813607547342152e-05, |
|
"loss": 1.7911, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.22020105313547153, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.9807466073731806e-05, |
|
"loss": 1.7719, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.22211584490186692, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.9801226038262244e-05, |
|
"loss": 1.8903, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.22403063666826234, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.9794887503643805e-05, |
|
"loss": 1.8288, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.22594542843465773, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 1.9788450533576708e-05, |
|
"loss": 1.7571, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.22786022020105315, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.9781915192750413e-05, |
|
"loss": 1.8572, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.22977501196744854, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 1.9775281546842985e-05, |
|
"loss": 1.7588, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.23168980373384396, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.976854966252042e-05, |
|
"loss": 1.8695, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.23360459550023935, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.9761719607435973e-05, |
|
"loss": 1.8384, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.23551938726663477, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 1.9754791450229485e-05, |
|
"loss": 1.8699, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.23743417903303016, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.9747765260526696e-05, |
|
"loss": 1.8323, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.23934897079942558, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.9740641108938538e-05, |
|
"loss": 1.7859, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.24126376256582097, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.9733419067060427e-05, |
|
"loss": 1.8269, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.24317855433221638, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.972609920747155e-05, |
|
"loss": 1.8094, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.24509334609861178, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.9718681603734124e-05, |
|
"loss": 1.8463, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.2470081378650072, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.971116633039266e-05, |
|
"loss": 1.9069, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.24892292963140258, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.9703553462973227e-05, |
|
"loss": 1.7865, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.250837721397798, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.9695843077982677e-05, |
|
"loss": 1.8369, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.2527525131641934, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.9688035252907888e-05, |
|
"loss": 1.823, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.2546673049305888, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.9680130066214968e-05, |
|
"loss": 1.8506, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.25658209669698423, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.9672127597348486e-05, |
|
"loss": 1.8261, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.2584968884633796, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.9664027926730664e-05, |
|
"loss": 1.7482, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.260411680229775, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.965583113576057e-05, |
|
"loss": 1.8663, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.26232647199617043, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.9647537306813303e-05, |
|
"loss": 1.8714, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.26424126376256585, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.9639146523239156e-05, |
|
"loss": 1.7767, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.2661560555289612, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.9630658869362786e-05, |
|
"loss": 1.7756, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.26807084729535663, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.9622074430482366e-05, |
|
"loss": 1.8538, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.26998563906175205, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.9613393292868735e-05, |
|
"loss": 1.8367, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.27190043082814747, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.9604615543764506e-05, |
|
"loss": 1.8716, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.27381522259454283, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.9595741271383225e-05, |
|
"loss": 1.8014, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.27573001436093825, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.9586770564908453e-05, |
|
"loss": 1.7858, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.27764480612733367, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 1.9577703514492886e-05, |
|
"loss": 1.7948, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.2795595978937291, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.9568540211257444e-05, |
|
"loss": 1.8298, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.28147438966012445, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.9559280747290362e-05, |
|
"loss": 1.8202, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.28338918142651986, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.954992521564625e-05, |
|
"loss": 1.8162, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.2853039731929153, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 1.9540473710345177e-05, |
|
"loss": 1.8356, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.2872187649593107, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.9530926326371712e-05, |
|
"loss": 1.8004, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.28913355672570606, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.952128315967397e-05, |
|
"loss": 1.8221, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.2910483484921015, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 1.9511544307162656e-05, |
|
"loss": 1.8628, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.2929631402584969, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.9501709866710086e-05, |
|
"loss": 1.7806, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.2948779320248923, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.9491779937149204e-05, |
|
"loss": 1.8573, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.2967927237912877, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.9481754618272585e-05, |
|
"loss": 1.8113, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.2987075155576831, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.947163401083144e-05, |
|
"loss": 1.8203, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.3006223073240785, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.9461418216534594e-05, |
|
"loss": 1.8106, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.30253709909047394, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.9451107338047478e-05, |
|
"loss": 1.792, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.3044518908568693, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.9440701478991078e-05, |
|
"loss": 1.8266, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.3063666826232647, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.9430200743940915e-05, |
|
"loss": 1.862, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.30828147438966014, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.9419605238425975e-05, |
|
"loss": 1.8587, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.31019626615605556, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.9408915068927653e-05, |
|
"loss": 1.7967, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.3121110579224509, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.93981303428787e-05, |
|
"loss": 1.7357, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.31402584968884634, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.9387251168662114e-05, |
|
"loss": 1.8246, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.31594064145524176, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.937627765561008e-05, |
|
"loss": 1.8022, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.3178554332216372, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.936520991400285e-05, |
|
"loss": 1.8312, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.31977022498803254, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.935404805506764e-05, |
|
"loss": 1.7942, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.32168501675442795, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.9342792190977532e-05, |
|
"loss": 1.8574, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.3235998085208234, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.93314424348503e-05, |
|
"loss": 1.78, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.3255146002872188, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.9319998900747327e-05, |
|
"loss": 1.8314, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.32742939205361415, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 1.930846170367243e-05, |
|
"loss": 1.8051, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.3293441838200096, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.9296830959570697e-05, |
|
"loss": 1.9012, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.331258975586405, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.9285106785327345e-05, |
|
"loss": 1.8383, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.3331737673528004, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.9273289298766532e-05, |
|
"loss": 1.808, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.3350885591191958, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.9261378618650166e-05, |
|
"loss": 1.7936, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.3370033508855912, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.9249374864676733e-05, |
|
"loss": 1.8398, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.3389181426519866, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.9237278157480074e-05, |
|
"loss": 1.7898, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.340832934418382, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.922508861862818e-05, |
|
"loss": 1.8345, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.3427477261847774, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.9212806370621963e-05, |
|
"loss": 1.8619, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.3446625179511728, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.920043153689405e-05, |
|
"loss": 1.8286, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.3465773097175682, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 1.9187964241807508e-05, |
|
"loss": 1.8079, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.34849210148396365, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.917540461065462e-05, |
|
"loss": 1.7952, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.350406893250359, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 1.916275276965561e-05, |
|
"loss": 1.8172, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.3523216850167544, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 1.9150008845957388e-05, |
|
"loss": 1.7986, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.35423647678314985, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.9137172967632262e-05, |
|
"loss": 1.7909, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.35615126854954526, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 1.912424526367665e-05, |
|
"loss": 1.7713, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.3580660603159406, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.9111225864009794e-05, |
|
"loss": 1.8165, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.35998085208233604, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.9098114899472443e-05, |
|
"loss": 1.8618, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.36189564384873146, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.9084912501825554e-05, |
|
"loss": 1.8634, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.3638104356151269, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 1.9071618803748945e-05, |
|
"loss": 1.8141, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.36572522738152224, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.9058233938839975e-05, |
|
"loss": 1.7962, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.36764001914791766, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.9044758041612207e-05, |
|
"loss": 1.8472, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.3695548109143131, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.9031191247494046e-05, |
|
"loss": 1.7234, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.3714696026807085, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.9017533692827383e-05, |
|
"loss": 1.8239, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.37338439444710386, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.9003785514866214e-05, |
|
"loss": 1.7942, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.3752991862134993, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.8989946851775287e-05, |
|
"loss": 1.8161, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.3772139779798947, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.8976017842628677e-05, |
|
"loss": 1.8224, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.3791287697462901, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.8961998627408424e-05, |
|
"loss": 1.8211, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.3810435615126855, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.89478893470031e-05, |
|
"loss": 1.8292, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.3829583532790809, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.89336901432064e-05, |
|
"loss": 1.8522, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3848731450454763, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.891940115871574e-05, |
|
"loss": 1.8274, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.38678793681187174, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.8905022537130774e-05, |
|
"loss": 1.8349, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.3887027285782671, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.8890554422952e-05, |
|
"loss": 1.8058, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.3906175203446625, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.8875996961579282e-05, |
|
"loss": 1.8383, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.39253231211105793, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.8861350299310397e-05, |
|
"loss": 1.7721, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.39444710387745335, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.8846614583339555e-05, |
|
"loss": 1.8389, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.3963618956438487, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.883178996175593e-05, |
|
"loss": 1.8014, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.39827668741024413, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.8816876583542177e-05, |
|
"loss": 1.8612, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.40019147917663955, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 1.8801874598572918e-05, |
|
"loss": 1.7983, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.40210627094303497, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.8786784157613242e-05, |
|
"loss": 1.8277, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.40402106270943033, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.8771605412317192e-05, |
|
"loss": 1.815, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.40593585447582575, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.875633851522625e-05, |
|
"loss": 1.7652, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.40785064624222117, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.874098361976779e-05, |
|
"loss": 1.8605, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.4097654380086166, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.872554088025354e-05, |
|
"loss": 1.7953, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.41168022977501195, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.8710010451878036e-05, |
|
"loss": 1.7491, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.41359502154140737, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.869439249071705e-05, |
|
"loss": 1.7363, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.4155098133078028, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 1.8678687153726037e-05, |
|
"loss": 1.8131, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.4174246050741982, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.866289459873855e-05, |
|
"loss": 1.8631, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.41933939684059357, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.8647014984464657e-05, |
|
"loss": 1.8336, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.421254188606989, |
|
"grad_norm": 2.25, |
|
"learning_rate": 1.8631048470489343e-05, |
|
"loss": 1.7929, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.4231689803733844, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.8614995217270893e-05, |
|
"loss": 1.7582, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.4250837721397798, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 1.859885538613932e-05, |
|
"loss": 1.8399, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.4269985639061752, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.8582629139294693e-05, |
|
"loss": 1.8379, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.4289133556725706, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 1.8566316639805543e-05, |
|
"loss": 1.711, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.430828147438966, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.854991805160721e-05, |
|
"loss": 1.8693, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.43274293920536144, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.8533433539500195e-05, |
|
"loss": 1.7657, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.4346577309717568, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.851686326914851e-05, |
|
"loss": 1.7919, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.4365725227381522, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.850020740707801e-05, |
|
"loss": 1.7852, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.43848731450454764, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.8483466120674708e-05, |
|
"loss": 1.7779, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.44040210627094306, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.846663957818312e-05, |
|
"loss": 1.8108, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.4423168980373384, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.8449727948704556e-05, |
|
"loss": 1.7799, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.44423168980373384, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.843273140219541e-05, |
|
"loss": 1.7874, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.44614648157012926, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.8415650109465473e-05, |
|
"loss": 1.7549, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.4480612733365247, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.8398484242176206e-05, |
|
"loss": 1.7661, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.44997606510292004, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.8381233972839027e-05, |
|
"loss": 1.8091, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.45189085686931546, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.836389947481355e-05, |
|
"loss": 1.7503, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.4538056486357109, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.834648092230587e-05, |
|
"loss": 1.829, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.4557204404021063, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.8328978490366804e-05, |
|
"loss": 1.8481, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.45763523216850166, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 1.831139235489013e-05, |
|
"loss": 1.8335, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.4595500239348971, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.8293722692610805e-05, |
|
"loss": 1.7653, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.4614648157012925, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.8275969681103226e-05, |
|
"loss": 1.8049, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.4633796074676879, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.8258133498779407e-05, |
|
"loss": 1.8257, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.4652943992340833, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.8240214324887212e-05, |
|
"loss": 1.8437, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.4672091910004787, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.8222212339508528e-05, |
|
"loss": 1.8295, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.4691239827668741, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.820412772355749e-05, |
|
"loss": 1.7919, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.47103877453326953, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.818596065877863e-05, |
|
"loss": 1.8298, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.4729535662996649, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.816771132774507e-05, |
|
"loss": 1.7665, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.4748683580660603, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 1.814937991385667e-05, |
|
"loss": 1.8966, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.47678314983245573, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 1.813096660133822e-05, |
|
"loss": 1.7429, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.47869794159885115, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.811247157523754e-05, |
|
"loss": 1.8369, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.4806127333652465, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 1.8093895021423658e-05, |
|
"loss": 1.7983, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.48252752513164193, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 1.807523712658493e-05, |
|
"loss": 1.764, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.48444231689803735, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 1.8056498078227152e-05, |
|
"loss": 1.8022, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.48635710866443277, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.8037678064671702e-05, |
|
"loss": 1.7991, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.48827190043082813, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.801877727505362e-05, |
|
"loss": 1.8253, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.49018669219722355, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.7999795899319718e-05, |
|
"loss": 1.8086, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.49210148396361897, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.7980734128226685e-05, |
|
"loss": 1.8171, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.4940162757300144, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.7961592153339146e-05, |
|
"loss": 1.8484, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.49593106749640975, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.7942370167027756e-05, |
|
"loss": 1.8499, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.49784585926280517, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.7923068362467252e-05, |
|
"loss": 1.8069, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.4997606510292006, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.7903686933634522e-05, |
|
"loss": 1.8241, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.501675442795596, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.7884226075306652e-05, |
|
"loss": 1.7349, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.5035902345619914, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.786468598305897e-05, |
|
"loss": 1.8709, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.5055050263283868, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 1.7845066853263068e-05, |
|
"loss": 1.7636, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.5074198180947822, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.7825368883084865e-05, |
|
"loss": 1.8139, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.5093346098611776, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.7805592270482576e-05, |
|
"loss": 1.7859, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.511249401627573, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.7785737214204752e-05, |
|
"loss": 1.7915, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.5131641933939685, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.776580391378829e-05, |
|
"loss": 1.7832, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.5150789851603638, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.7745792569556402e-05, |
|
"loss": 1.7741, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.5169937769267592, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.772570338261662e-05, |
|
"loss": 1.8348, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.5189085686931546, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 1.7705536554858768e-05, |
|
"loss": 1.768, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.52082336045955, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 1.768529228895294e-05, |
|
"loss": 1.8223, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.5227381522259454, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.7664970788347454e-05, |
|
"loss": 1.8459, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.5246529439923409, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.7644572257266807e-05, |
|
"loss": 1.7417, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.5265677357587363, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 1.762409690070964e-05, |
|
"loss": 1.8374, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.5284825275251317, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.7603544924446655e-05, |
|
"loss": 1.8115, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.530397319291527, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.7582916535018567e-05, |
|
"loss": 1.8349, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.5323121110579224, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 1.7562211939734e-05, |
|
"loss": 1.7624, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.5342269028243178, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.7541431346667447e-05, |
|
"loss": 1.765, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.5361416945907133, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 1.752057496465713e-05, |
|
"loss": 1.8085, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.5380564863571087, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.7499643003302943e-05, |
|
"loss": 1.7753, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.5399712781235041, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.7478635672964324e-05, |
|
"loss": 1.8409, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.5418860698898995, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.7457553184758135e-05, |
|
"loss": 1.7598, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.5438008616562949, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.7436395750556563e-05, |
|
"loss": 1.7612, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.5457156534226902, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.7415163582984972e-05, |
|
"loss": 1.8075, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.5476304451890857, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.739385689541977e-05, |
|
"loss": 1.775, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.5495452369554811, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.7372475901986275e-05, |
|
"loss": 1.8337, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.5514600287218765, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.7351020817556548e-05, |
|
"loss": 1.8364, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.5533748204882719, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.732949185774724e-05, |
|
"loss": 1.7762, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.5552896122546673, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.7307889238917424e-05, |
|
"loss": 1.8312, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5572044040210627, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.7286213178166434e-05, |
|
"loss": 1.7795, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.5591191957874582, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 1.726446389333166e-05, |
|
"loss": 1.7856, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.5610339875538535, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.7242641602986376e-05, |
|
"loss": 1.7926, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.5629487793202489, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.7220746526437535e-05, |
|
"loss": 1.7724, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.5648635710866443, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.7198778883723567e-05, |
|
"loss": 1.7524, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.5667783628530397, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.717673889561217e-05, |
|
"loss": 1.7831, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.5686931546194351, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 1.7154626783598092e-05, |
|
"loss": 1.7599, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.5706079463858306, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.7132442769900898e-05, |
|
"loss": 1.8121, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.572522738152226, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.7110187077462747e-05, |
|
"loss": 1.7459, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.5744375299186214, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.7087859929946144e-05, |
|
"loss": 1.7934, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5763523216850167, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.706546155173169e-05, |
|
"loss": 1.7771, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.5782671134514121, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.7042992167915836e-05, |
|
"loss": 1.8167, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.5801819052178075, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.7020452004308622e-05, |
|
"loss": 1.7243, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.582096696984203, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.699784128743139e-05, |
|
"loss": 1.7949, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.5840114887505984, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.6975160244514522e-05, |
|
"loss": 1.8373, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.5859262805169938, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.6952409103495163e-05, |
|
"loss": 1.8333, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.5878410722833892, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.6929588093014916e-05, |
|
"loss": 1.8455, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.5897558640497846, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.6906697442417547e-05, |
|
"loss": 1.8232, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.59167065581618, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.688373738174668e-05, |
|
"loss": 1.7521, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.5935854475825754, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.6860708141743498e-05, |
|
"loss": 1.7724, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.5955002393489708, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.6837609953844406e-05, |
|
"loss": 1.7663, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.5974150311153662, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 1.6814443050178713e-05, |
|
"loss": 1.8029, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.5993298228817616, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.6791207663566307e-05, |
|
"loss": 1.805, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.601244614648157, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.67679040275153e-05, |
|
"loss": 1.7871, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.6031594064145525, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.6744532376219688e-05, |
|
"loss": 1.7786, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.6050741981809479, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.6721092944557003e-05, |
|
"loss": 1.7525, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.6069889899473432, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.6697585968085942e-05, |
|
"loss": 1.7923, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.6089037817137386, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.667401168304401e-05, |
|
"loss": 1.7879, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.610818573480134, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 1.6650370326345146e-05, |
|
"loss": 1.797, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.6127333652465294, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 1.6626662135577324e-05, |
|
"loss": 1.7889, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.6146481570129249, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.660288734900019e-05, |
|
"loss": 1.7892, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.6165629487793203, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.6579046205542656e-05, |
|
"loss": 1.8961, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.6184777405457157, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.655513894480049e-05, |
|
"loss": 1.7397, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.6203925323121111, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.653116580703393e-05, |
|
"loss": 1.8193, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.6223073240785064, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.6507127033165243e-05, |
|
"loss": 1.8165, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.6242221158449018, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.6483022864776327e-05, |
|
"loss": 1.7779, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.6261369076112973, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.645885354410627e-05, |
|
"loss": 1.8215, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.6280516993776927, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 1.643461931404893e-05, |
|
"loss": 1.8383, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.6299664911440881, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.641032041815046e-05, |
|
"loss": 1.7761, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.6318812829104835, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.6385957100606912e-05, |
|
"loss": 1.7577, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.6337960746768789, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.636152960626173e-05, |
|
"loss": 1.7726, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.6357108664432743, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 1.6337038180603332e-05, |
|
"loss": 1.7556, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.6376256582096697, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.631248306976261e-05, |
|
"loss": 1.8344, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.6395404499760651, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.6287864520510483e-05, |
|
"loss": 1.7506, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.6414552417424605, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.62631827802554e-05, |
|
"loss": 1.84, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.6433700335088559, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.6238438097040866e-05, |
|
"loss": 1.6987, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.6452848252752513, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.621363071954293e-05, |
|
"loss": 1.7537, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.6471996170416467, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.6188760897067712e-05, |
|
"loss": 1.7974, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.6491144088080422, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.616382887954887e-05, |
|
"loss": 1.7815, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.6510292005744376, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.6138834917545115e-05, |
|
"loss": 1.8334, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.6529439923408329, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.611377926223767e-05, |
|
"loss": 1.7674, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.6548587841072283, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.6088662165427767e-05, |
|
"loss": 1.789, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.6567735758736237, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.6063483879534092e-05, |
|
"loss": 1.7917, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.6586883676400191, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.6038244657590266e-05, |
|
"loss": 1.7639, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.6606031594064146, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.6012944753242304e-05, |
|
"loss": 1.7936, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.66251795117281, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.598758442074605e-05, |
|
"loss": 1.7632, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.6644327429392054, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.5962163914964644e-05, |
|
"loss": 1.7101, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.6663475347056008, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.5936683491365933e-05, |
|
"loss": 1.7726, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.6682623264719961, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.5911143406019926e-05, |
|
"loss": 1.7999, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.6701771182383915, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 1.5885543915596215e-05, |
|
"loss": 1.863, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.672091910004787, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.5859885277361395e-05, |
|
"loss": 1.8356, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.6740067017711824, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.583416774917647e-05, |
|
"loss": 1.7902, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.6759214935375778, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.580839158949427e-05, |
|
"loss": 1.7567, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.6778362853039732, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.5782557057356866e-05, |
|
"loss": 1.7907, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.6797510770703686, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 1.575666441239293e-05, |
|
"loss": 1.7513, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.681665868836764, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.5730713914815174e-05, |
|
"loss": 1.853, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.6835806606031594, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.570470582541769e-05, |
|
"loss": 1.829, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.6854954523695548, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.5678640405573357e-05, |
|
"loss": 1.8129, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.6874102441359502, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.5652517917231212e-05, |
|
"loss": 1.8097, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.6893250359023456, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.5626338622913807e-05, |
|
"loss": 1.7673, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.691239827668741, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.5600102785714567e-05, |
|
"loss": 1.7461, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.6931546194351365, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.5573810669295176e-05, |
|
"loss": 1.7969, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.6950694112015319, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.554746253788288e-05, |
|
"loss": 1.7485, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.6969842029679273, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 1.5521058656267874e-05, |
|
"loss": 1.7933, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.6988989947343226, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.5494599289800627e-05, |
|
"loss": 1.8053, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.700813786500718, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.54680847043892e-05, |
|
"loss": 1.7844, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.7027285782671134, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 1.5441515166496593e-05, |
|
"loss": 1.8336, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.7046433700335089, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.5414890943138068e-05, |
|
"loss": 1.8394, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.7065581617999043, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.5388212301878445e-05, |
|
"loss": 1.7923, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.7084729535662997, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.5361479510829434e-05, |
|
"loss": 1.7798, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.7103877453326951, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 1.5334692838646927e-05, |
|
"loss": 1.7606, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.7123025370990905, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.5307852554528318e-05, |
|
"loss": 1.7701, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.7142173288654858, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.5280958928209763e-05, |
|
"loss": 1.7918, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.7161321206318813, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.5254012229963509e-05, |
|
"loss": 1.7718, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.7180469123982767, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.5227012730595146e-05, |
|
"loss": 1.7738, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.7199617041646721, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.5199960701440902e-05, |
|
"loss": 1.7932, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.7218764959310675, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.5172856414364916e-05, |
|
"loss": 1.7774, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.7237912876974629, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.5145700141756496e-05, |
|
"loss": 1.7727, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.7257060794638583, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.5118492156527395e-05, |
|
"loss": 1.8104, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.7276208712302538, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.5091232732109053e-05, |
|
"loss": 1.7703, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.7295356629966491, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.5063922142449857e-05, |
|
"loss": 1.7688, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.7314504547630445, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.5036560662012405e-05, |
|
"loss": 1.7204, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.7333652465294399, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 1.5009148565770707e-05, |
|
"loss": 1.7462, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.7352800382958353, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.498168612920746e-05, |
|
"loss": 1.7404, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.7371948300622307, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.4954173628311262e-05, |
|
"loss": 1.7549, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.7391096218286262, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.492661133957384e-05, |
|
"loss": 1.8004, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.7410244135950216, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 1.4898999539987273e-05, |
|
"loss": 1.7897, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.742939205361417, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 1.487133850704121e-05, |
|
"loss": 1.7573, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.7448539971278123, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.4843628518720076e-05, |
|
"loss": 1.8918, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.7467687888942077, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.4815869853500286e-05, |
|
"loss": 1.7759, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.7486835806606031, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.4788062790347437e-05, |
|
"loss": 1.8615, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.7505983724269986, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.4760207608713515e-05, |
|
"loss": 1.7365, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.752513164193394, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.4732304588534073e-05, |
|
"loss": 1.783, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.7544279559597894, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.4704354010225436e-05, |
|
"loss": 1.7981, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.7563427477261848, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.4676356154681867e-05, |
|
"loss": 1.8287, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.7582575394925802, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.464831130327275e-05, |
|
"loss": 1.7761, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.7601723312589755, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.4620219737839766e-05, |
|
"loss": 1.8013, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.762087123025371, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.4592081740694051e-05, |
|
"loss": 1.8035, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.7640019147917664, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.4563897594613368e-05, |
|
"loss": 1.8472, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.7659167065581618, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.453566758283926e-05, |
|
"loss": 1.7201, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7678314983245572, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.450739198907421e-05, |
|
"loss": 1.7396, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.7697462900909526, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 1.4479071097478778e-05, |
|
"loss": 1.8413, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.771661081857348, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.4450705192668763e-05, |
|
"loss": 1.8091, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.7735758736237435, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.442229455971232e-05, |
|
"loss": 1.7785, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.7754906653901388, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 1.4393839484127117e-05, |
|
"loss": 1.7627, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.7774054571565342, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.4365340251877446e-05, |
|
"loss": 1.7348, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.7793202489229296, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 1.4336797149371377e-05, |
|
"loss": 1.8185, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.781235040689325, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.4308210463457842e-05, |
|
"loss": 1.7564, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.7831498324557205, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.4279580481423778e-05, |
|
"loss": 1.7314, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.7850646242221159, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.4250907490991244e-05, |
|
"loss": 1.7787, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.7869794159885113, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.4222191780314508e-05, |
|
"loss": 1.7753, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.7888942077549067, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.4193433637977165e-05, |
|
"loss": 1.7371, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.790808999521302, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.416463335298924e-05, |
|
"loss": 1.7923, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.7927237912876974, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.4135791214784272e-05, |
|
"loss": 1.8241, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.7946385830540929, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.4106907513216412e-05, |
|
"loss": 1.7248, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.7965533748204883, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.4077982538557511e-05, |
|
"loss": 1.8105, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.7984681665868837, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.4049016581494204e-05, |
|
"loss": 1.8315, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.8003829583532791, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.402000993312498e-05, |
|
"loss": 1.7119, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.8022977501196745, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.3990962884957267e-05, |
|
"loss": 1.7673, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.8042125418860699, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.3961875728904495e-05, |
|
"loss": 1.8066, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.8061273336524652, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 1.3932748757283165e-05, |
|
"loss": 1.7593, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.8080421254188607, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 1.3903582262809918e-05, |
|
"loss": 1.8116, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.8099569171852561, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.3874376538598574e-05, |
|
"loss": 1.789, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.8118717089516515, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.3845131878157214e-05, |
|
"loss": 1.7977, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.8137865007180469, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.3815848575385207e-05, |
|
"loss": 1.8037, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.8157012924844423, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 1.3786526924570262e-05, |
|
"loss": 1.7475, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.8176160842508378, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.3757167220385483e-05, |
|
"loss": 1.7709, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.8195308760172332, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 1.3727769757886388e-05, |
|
"loss": 1.7576, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.8214456677836285, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.3698334832507962e-05, |
|
"loss": 1.8164, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.8233604595500239, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.366886274006168e-05, |
|
"loss": 1.7753, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.8252752513164193, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 1.3639353776732523e-05, |
|
"loss": 1.7188, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.8271900430828147, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.3609808239076025e-05, |
|
"loss": 1.7456, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.8291048348492102, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.3580226424015273e-05, |
|
"loss": 1.7591, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.8310196266156056, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.3550608628837933e-05, |
|
"loss": 1.8017, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.832934418382001, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.352095515119326e-05, |
|
"loss": 1.7858, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.8348492101483964, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 1.3491266289089107e-05, |
|
"loss": 1.7865, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.8367640019147917, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.3461542340888921e-05, |
|
"loss": 1.7595, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.8386787936811871, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.3431783605308761e-05, |
|
"loss": 1.8015, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.8405935854475826, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.3401990381414287e-05, |
|
"loss": 1.7645, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.842508377213978, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.3372162968617757e-05, |
|
"loss": 1.7324, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.8444231689803734, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.3342301666675013e-05, |
|
"loss": 1.7304, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.8463379607467688, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.3312406775682471e-05, |
|
"loss": 1.814, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.8482527525131642, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.328247859607411e-05, |
|
"loss": 1.7776, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.8501675442795597, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 1.3252517428618448e-05, |
|
"loss": 1.7183, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.852082336045955, |
|
"grad_norm": 1.875, |
|
"learning_rate": 1.3222523574415516e-05, |
|
"loss": 1.7699, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.8539971278123504, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.3192497334893842e-05, |
|
"loss": 1.8766, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.8559119195787458, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.316243901180741e-05, |
|
"loss": 1.7422, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.8578267113451412, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.3132348907232639e-05, |
|
"loss": 1.7802, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.8597415031115366, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.310222732356534e-05, |
|
"loss": 1.7016, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.861656294877932, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.3072074563517676e-05, |
|
"loss": 1.7917, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.8635710866443275, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.3041890930115125e-05, |
|
"loss": 1.7344, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.8654858784107229, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 1.3011676726693432e-05, |
|
"loss": 1.7282, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.8674006701771182, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.298143225689556e-05, |
|
"loss": 1.7817, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.8693154619435136, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.2951157824668645e-05, |
|
"loss": 1.7465, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.871230253709909, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 1.2920853734260925e-05, |
|
"loss": 1.7836, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.8731450454763044, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.2890520290218698e-05, |
|
"loss": 1.762, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.8750598372426999, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.286015779738326e-05, |
|
"loss": 1.7934, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.8769746290090953, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.2829766560887837e-05, |
|
"loss": 1.7687, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.8788894207754907, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.2799346886154513e-05, |
|
"loss": 1.8095, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.8808042125418861, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.2768899078891174e-05, |
|
"loss": 1.7651, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.8827190043082814, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.2738423445088429e-05, |
|
"loss": 1.7002, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.8846337960746768, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.2707920291016526e-05, |
|
"loss": 1.804, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.8865485878410723, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.2677389923222297e-05, |
|
"loss": 1.7597, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.8884633796074677, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.2646832648526048e-05, |
|
"loss": 1.7853, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.8903781713738631, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 1.2616248774018503e-05, |
|
"loss": 1.7303, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.8922929631402585, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.2585638607057698e-05, |
|
"loss": 1.7358, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.8942077549066539, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.25550024552659e-05, |
|
"loss": 1.8593, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.8961225466730494, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.2524340626526521e-05, |
|
"loss": 1.8011, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.8980373384394447, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.2493653428981014e-05, |
|
"loss": 1.7807, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.8999521302058401, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.2462941171025777e-05, |
|
"loss": 1.7492, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.9018669219722355, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.2432204161309063e-05, |
|
"loss": 1.7557, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.9037817137386309, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.2401442708727869e-05, |
|
"loss": 1.779, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.9056965055050263, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.2370657122424835e-05, |
|
"loss": 1.7955, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.9076112972714218, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.2339847711785139e-05, |
|
"loss": 1.7179, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.9095260890378172, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.2309014786433381e-05, |
|
"loss": 1.8023, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.9114408808042126, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.2278158656230486e-05, |
|
"loss": 1.8111, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.9133556725706079, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 1.224727963127057e-05, |
|
"loss": 1.7841, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.9152704643370033, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.2216378021877835e-05, |
|
"loss": 1.775, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.9171852561033987, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.2185454138603458e-05, |
|
"loss": 1.7291, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.9191000478697942, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.215450829222245e-05, |
|
"loss": 1.7993, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.9210148396361896, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.2123540793730554e-05, |
|
"loss": 1.7748, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.922929631402585, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.2092551954341104e-05, |
|
"loss": 1.7842, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.9248444231689804, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.2061542085481904e-05, |
|
"loss": 1.7519, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.9267592149353758, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 1.2030511498792095e-05, |
|
"loss": 1.8114, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.9286740067017711, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.199946050611903e-05, |
|
"loss": 1.7339, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.9305887984681666, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.1968389419515134e-05, |
|
"loss": 1.8057, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.932503590234562, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 1.1937298551234769e-05, |
|
"loss": 1.7458, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.9344183820009574, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.1906188213731099e-05, |
|
"loss": 1.7539, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.9363331737673528, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 1.187505871965294e-05, |
|
"loss": 1.8346, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.9382479655337482, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.1843910381841637e-05, |
|
"loss": 1.7451, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.9401627573001436, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 1.1812743513327896e-05, |
|
"loss": 1.7659, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.9420775490665391, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.1781558427328662e-05, |
|
"loss": 1.8026, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.9439923408329344, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 1.1750355437243947e-05, |
|
"loss": 1.7427, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.9459071325993298, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 1.1719134856653704e-05, |
|
"loss": 1.7436, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.9478219243657252, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.1687896999314663e-05, |
|
"loss": 1.8219, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.9497367161321206, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.1656642179157173e-05, |
|
"loss": 1.757, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.951651507898516, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.1625370710282067e-05, |
|
"loss": 1.7482, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.9535662996649115, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.1594082906957478e-05, |
|
"loss": 1.8083, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.9554810914313069, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.1562779083615702e-05, |
|
"loss": 1.8082, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.9573958831977023, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.153145955485003e-05, |
|
"loss": 1.7942, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.9593106749640976, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.1500124635411592e-05, |
|
"loss": 1.7538, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.961225466730493, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 1.146877464020618e-05, |
|
"loss": 1.7099, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.9631402584968884, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.1437409884291097e-05, |
|
"loss": 1.821, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.9650550502632839, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 1.140603068287199e-05, |
|
"loss": 1.8018, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.9669698420296793, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.1374637351299672e-05, |
|
"loss": 1.8382, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.9688846337960747, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.1343230205066963e-05, |
|
"loss": 1.8094, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.9707994255624701, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.131180955980552e-05, |
|
"loss": 1.7568, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.9727142173288655, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 1.128037573128265e-05, |
|
"loss": 1.743, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.9746290090952608, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.124892903539816e-05, |
|
"loss": 1.8106, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.9765438008616563, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.1217469788181158e-05, |
|
"loss": 1.7705, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.9784585926280517, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 1.1185998305786902e-05, |
|
"loss": 1.8163, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.9803733843944471, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.1154514904493599e-05, |
|
"loss": 1.8028, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.9822881761608425, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 1.1123019900699239e-05, |
|
"loss": 1.8154, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.9842029679272379, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 1.1091513610918415e-05, |
|
"loss": 1.7655, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.9861177596936334, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.1059996351779139e-05, |
|
"loss": 1.7482, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.9880325514600288, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.1028468440019666e-05, |
|
"loss": 1.7829, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.9899473432264241, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.0996930192485302e-05, |
|
"loss": 1.7006, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.9918621349928195, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.0965381926125224e-05, |
|
"loss": 1.7737, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.9937769267592149, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.0933823957989298e-05, |
|
"loss": 1.7464, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.9956917185256103, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.0902256605224885e-05, |
|
"loss": 1.8521, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.9976065102920058, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.0870680185073666e-05, |
|
"loss": 1.7895, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.9995213020584012, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.083909501486844e-05, |
|
"loss": 1.7476, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 1.0014360938247966, |
|
"grad_norm": 6.3125, |
|
"learning_rate": 1.080750141202994e-05, |
|
"loss": 3.1048, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 1.003350885591192, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 1.0775899694063649e-05, |
|
"loss": 1.6357, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 1.0052656773575874, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.0744290178556604e-05, |
|
"loss": 1.7283, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.0071804691239827, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.0712673183174205e-05, |
|
"loss": 1.7224, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 1.0090952608903783, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.0681049025657015e-05, |
|
"loss": 1.6712, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 1.0110100526567736, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 1.0649418023817583e-05, |
|
"loss": 1.7204, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 1.0129248444231689, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.061778049553724e-05, |
|
"loss": 1.6806, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 1.0148396361895644, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.0586136758762902e-05, |
|
"loss": 1.7295, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.0167544279559597, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 1.0554487131503874e-05, |
|
"loss": 1.7294, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 1.0186692197223552, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.0522831931828677e-05, |
|
"loss": 1.674, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 1.0205840114887506, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.049117147786181e-05, |
|
"loss": 1.6801, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 1.022498803255146, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 1.0459506087780593e-05, |
|
"loss": 1.6725, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 1.0244135950215414, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.042783607981194e-05, |
|
"loss": 1.6826, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.026328386787937, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 1.0396161772229185e-05, |
|
"loss": 1.6825, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 1.0282431785543322, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 1.0364483483348859e-05, |
|
"loss": 1.7038, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 1.0301579703207275, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 1.0332801531527516e-05, |
|
"loss": 1.7607, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 1.032072762087123, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.0301116235158516e-05, |
|
"loss": 1.7785, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 1.0339875538535184, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.0269427912668826e-05, |
|
"loss": 1.6801, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.035902345619914, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.0237736882515832e-05, |
|
"loss": 1.6535, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 1.0378171373863092, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 1.0206043463184127e-05, |
|
"loss": 1.7229, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 1.0397319291527047, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.0174347973182318e-05, |
|
"loss": 1.6847, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 1.0416467209191, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 1.0142650731039815e-05, |
|
"loss": 1.6263, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 1.0435615126854954, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.0110952055303647e-05, |
|
"loss": 1.6602, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.0454763044518909, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.0079252264535237e-05, |
|
"loss": 1.6483, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 1.0473910962182862, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 1.0047551677307226e-05, |
|
"loss": 1.7304, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 1.0493058879846817, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.0015850612200249e-05, |
|
"loss": 1.7147, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 1.051220679751077, |
|
"grad_norm": 1.75, |
|
"learning_rate": 9.984149387799754e-06, |
|
"loss": 1.655, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 1.0531354715174726, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 9.952448322692776e-06, |
|
"loss": 1.7411, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.0550502632838679, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 9.920747735464766e-06, |
|
"loss": 1.658, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 1.0569650550502634, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 9.889047944696354e-06, |
|
"loss": 1.6857, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 1.0588798468166587, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 9.857349268960186e-06, |
|
"loss": 1.7655, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 1.060794638583054, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 9.825652026817683e-06, |
|
"loss": 1.6613, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 1.0627094303494495, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 9.793956536815874e-06, |
|
"loss": 1.7267, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.0646242221158448, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 9.76226311748417e-06, |
|
"loss": 1.788, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 1.0665390138822404, |
|
"grad_norm": 1.75, |
|
"learning_rate": 9.730572087331177e-06, |
|
"loss": 1.6566, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 1.0684538056486357, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 9.698883764841484e-06, |
|
"loss": 1.707, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 1.0703685974150312, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 9.667198468472485e-06, |
|
"loss": 1.7831, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 1.0722833891814265, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 9.63551651665114e-06, |
|
"loss": 1.7095, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.0741981809478218, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 9.603838227770819e-06, |
|
"loss": 1.7534, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 1.0761129727142174, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 9.57216392018806e-06, |
|
"loss": 1.5954, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 1.0780277644806127, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 9.54049391221941e-06, |
|
"loss": 1.6686, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 1.0799425562470082, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 9.508828522138191e-06, |
|
"loss": 1.7209, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 1.0818573480134035, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 9.477168068171326e-06, |
|
"loss": 1.6337, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.083772139779799, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 9.445512868496128e-06, |
|
"loss": 1.6905, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 1.0856869315461943, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 9.413863241237101e-06, |
|
"loss": 1.6972, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 1.0876017233125896, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 9.382219504462766e-06, |
|
"loss": 1.7069, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 1.0895165150789852, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 9.350581976182418e-06, |
|
"loss": 1.6554, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 1.0914313068453805, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 9.31895097434299e-06, |
|
"loss": 1.6556, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.093346098611776, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 9.287326816825799e-06, |
|
"loss": 1.7767, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 1.0952608903781713, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 9.255709821443399e-06, |
|
"loss": 1.6971, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 1.0971756821445668, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 9.224100305936353e-06, |
|
"loss": 1.6232, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 1.0990904739109622, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 9.192498587970065e-06, |
|
"loss": 1.6941, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 1.1010052656773577, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 9.160904985131564e-06, |
|
"loss": 1.6419, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.102920057443753, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 9.129319814926339e-06, |
|
"loss": 1.7512, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 1.1048348492101483, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 9.097743394775116e-06, |
|
"loss": 1.749, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 1.1067496409765438, |
|
"grad_norm": 1.75, |
|
"learning_rate": 9.066176042010705e-06, |
|
"loss": 1.723, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 1.1086644327429391, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 9.034618073874777e-06, |
|
"loss": 1.7473, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 1.1105792245093347, |
|
"grad_norm": 1.875, |
|
"learning_rate": 9.003069807514702e-06, |
|
"loss": 1.6308, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.11249401627573, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 8.971531559980334e-06, |
|
"loss": 1.7228, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 1.1144088080421255, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 8.940003648220863e-06, |
|
"loss": 1.6956, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 1.1163235998085208, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 8.908486389081587e-06, |
|
"loss": 1.7137, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 1.1182383915749163, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 8.876980099300764e-06, |
|
"loss": 1.7155, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 1.1201531833413116, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 8.845485095506401e-06, |
|
"loss": 1.6971, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.122067975107707, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 8.8140016942131e-06, |
|
"loss": 1.7349, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 1.1239827668741025, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 8.78253021181884e-06, |
|
"loss": 1.7291, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 1.1258975586404978, |
|
"grad_norm": 1.75, |
|
"learning_rate": 8.751070964601845e-06, |
|
"loss": 1.6818, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 1.1278123504068933, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 8.719624268717351e-06, |
|
"loss": 1.6302, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 1.1297271421732886, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 8.688190440194483e-06, |
|
"loss": 1.7592, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.1316419339396842, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 8.656769794933042e-06, |
|
"loss": 1.7508, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 1.1335567257060795, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 8.625362648700332e-06, |
|
"loss": 1.6968, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 1.135471517472475, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 8.593969317128015e-06, |
|
"loss": 1.7048, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 1.1373863092388703, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 8.562590115708906e-06, |
|
"loss": 1.6671, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 1.1393011010052656, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 8.531225359793825e-06, |
|
"loss": 1.7416, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 1.1412158927716611, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 8.49987536458841e-06, |
|
"loss": 1.7597, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 1.1431306845380564, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 8.468540445149973e-06, |
|
"loss": 1.6507, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 1.145045476304452, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 8.437220916384301e-06, |
|
"loss": 1.6958, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 1.1469602680708473, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 8.405917093042526e-06, |
|
"loss": 1.72, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 1.1488750598372426, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 8.374629289717937e-06, |
|
"loss": 1.7362, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.1507898516036381, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 8.343357820842829e-06, |
|
"loss": 1.6909, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 1.1527046433700334, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 8.31210300068534e-06, |
|
"loss": 1.641, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 1.154619435136429, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 8.280865143346301e-06, |
|
"loss": 1.7413, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 1.1565342269028243, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 8.249644562756056e-06, |
|
"loss": 1.6597, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 1.1584490186692198, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 8.218441572671343e-06, |
|
"loss": 1.6769, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 1.160363810435615, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 8.187256486672106e-06, |
|
"loss": 1.6976, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 1.1622786022020106, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 8.156089618158366e-06, |
|
"loss": 1.6065, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 1.164193393968406, |
|
"grad_norm": 1.75, |
|
"learning_rate": 8.12494128034706e-06, |
|
"loss": 1.7632, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 1.1661081857348012, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 8.093811786268905e-06, |
|
"loss": 1.6849, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 1.1680229775011968, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 8.06270144876523e-06, |
|
"loss": 1.7192, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.169937769267592, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 8.03161058048487e-06, |
|
"loss": 1.6864, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 1.1718525610339876, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 8.000539493880972e-06, |
|
"loss": 1.704, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 1.173767352800383, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 7.969488501207909e-06, |
|
"loss": 1.6947, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 1.1756821445667784, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 7.938457914518098e-06, |
|
"loss": 1.7126, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 1.1775969363331737, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 7.907448045658899e-06, |
|
"loss": 1.7502, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 1.1795117280995693, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 7.876459206269446e-06, |
|
"loss": 1.7348, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 1.1814265198659646, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 7.845491707777551e-06, |
|
"loss": 1.6578, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 1.18334131163236, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 7.814545861396543e-06, |
|
"loss": 1.7479, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 1.1852561033987554, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 7.783621978122167e-06, |
|
"loss": 1.7027, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 1.1871708951651507, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 7.752720368729436e-06, |
|
"loss": 1.6828, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.1890856869315463, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 7.721841343769518e-06, |
|
"loss": 1.689, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 1.1910004786979416, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 7.69098521356662e-06, |
|
"loss": 1.6498, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 1.192915270464337, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 7.660152288214865e-06, |
|
"loss": 1.6855, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 1.1948300622307324, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 7.629342877575169e-06, |
|
"loss": 1.7234, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 1.196744853997128, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 7.598557291272133e-06, |
|
"loss": 1.7746, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.1986596457635232, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 7.567795838690941e-06, |
|
"loss": 1.7283, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 1.2005744375299185, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 7.537058828974226e-06, |
|
"loss": 1.7204, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 1.202489229296314, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 7.506346571018992e-06, |
|
"loss": 1.7417, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 1.2044040210627094, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 7.475659373473481e-06, |
|
"loss": 1.6638, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 1.206318812829105, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 7.444997544734105e-06, |
|
"loss": 1.7001, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.2082336045955002, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 7.414361392942307e-06, |
|
"loss": 1.7401, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 1.2101483963618955, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 7.383751225981503e-06, |
|
"loss": 1.7466, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 1.212063188128291, |
|
"grad_norm": 1.875, |
|
"learning_rate": 7.353167351473955e-06, |
|
"loss": 1.6681, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 1.2139779798946864, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 7.322610076777707e-06, |
|
"loss": 1.7014, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 1.215892771661082, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 7.292079708983475e-06, |
|
"loss": 1.7218, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 1.2178075634274772, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 7.261576554911575e-06, |
|
"loss": 1.7206, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 1.2197223551938727, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 7.2311009211088255e-06, |
|
"loss": 1.6895, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 1.221637146960268, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 7.20065311384549e-06, |
|
"loss": 1.744, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 1.2235519387266636, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 7.170233439112164e-06, |
|
"loss": 1.7217, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 1.2254667304930589, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 7.139842202616741e-06, |
|
"loss": 1.6799, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.2273815222594542, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 7.109479709781302e-06, |
|
"loss": 1.7117, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 1.2292963140258497, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 7.079146265739079e-06, |
|
"loss": 1.6948, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 1.231211105792245, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 7.048842175331356e-06, |
|
"loss": 1.7343, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 1.2331258975586405, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 7.0185677431044404e-06, |
|
"loss": 1.7078, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 1.2350406893250359, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 6.988323273306569e-06, |
|
"loss": 1.7168, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 1.2369554810914314, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 6.958109069884879e-06, |
|
"loss": 1.6997, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 1.2388702728578267, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 6.9279254364823265e-06, |
|
"loss": 1.6204, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 1.2407850646242222, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 6.897772676434663e-06, |
|
"loss": 1.727, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 1.2426998563906175, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 6.867651092767366e-06, |
|
"loss": 1.7278, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 1.2446146481570128, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 6.837560988192593e-06, |
|
"loss": 1.7087, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.2465294399234084, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 6.807502665106164e-06, |
|
"loss": 1.6614, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 1.2484442316898037, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 6.777476425584486e-06, |
|
"loss": 1.7264, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 1.2503590234561992, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 6.747482571381556e-06, |
|
"loss": 1.701, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 1.2522738152225945, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 6.717521403925892e-06, |
|
"loss": 1.7061, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 1.2541886069889898, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 6.687593224317533e-06, |
|
"loss": 1.6551, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 1.2561033987553853, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 6.657698333324991e-06, |
|
"loss": 1.731, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 1.2580181905217809, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 6.627837031382246e-06, |
|
"loss": 1.7075, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 1.2599329822881762, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 6.598009618585717e-06, |
|
"loss": 1.6765, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 1.2618477740545715, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 6.568216394691245e-06, |
|
"loss": 1.7378, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 1.263762565820967, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 6.538457659111084e-06, |
|
"loss": 1.7609, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.2656773575873623, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 6.5087337109109e-06, |
|
"loss": 1.7036, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 1.2675921493537579, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 6.479044848806739e-06, |
|
"loss": 1.6546, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 1.2695069411201532, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 6.4493913711620685e-06, |
|
"loss": 1.7018, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 1.2714217328865485, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 6.419773575984727e-06, |
|
"loss": 1.7357, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 1.273336524652944, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 6.390191760923978e-06, |
|
"loss": 1.6928, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 1.2752513164193395, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 6.360646223267477e-06, |
|
"loss": 1.6623, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 1.2771661081857348, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 6.3311372599383245e-06, |
|
"loss": 1.6921, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 1.2790808999521301, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 6.301665167492037e-06, |
|
"loss": 1.7036, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 1.2809956917185257, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 6.272230242113613e-06, |
|
"loss": 1.7099, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 1.282910483484921, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 6.242832779614521e-06, |
|
"loss": 1.6826, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.2848252752513165, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 6.213473075429741e-06, |
|
"loss": 1.7058, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 1.2867400670177118, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 6.184151424614795e-06, |
|
"loss": 1.6677, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 1.2886548587841071, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 6.154868121842788e-06, |
|
"loss": 1.7125, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 1.2905696505505027, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 6.1256234614014256e-06, |
|
"loss": 1.6581, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 1.292484442316898, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 6.096417737190085e-06, |
|
"loss": 1.7104, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.2943992340832935, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 6.067251242716838e-06, |
|
"loss": 1.6612, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 1.2963140258496888, |
|
"grad_norm": 1.75, |
|
"learning_rate": 6.038124271095507e-06, |
|
"loss": 1.6501, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 1.2982288176160843, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 6.0090371150427375e-06, |
|
"loss": 1.7283, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 1.3001436093824796, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 5.979990066875022e-06, |
|
"loss": 1.699, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 1.3020584011488752, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 5.950983418505799e-06, |
|
"loss": 1.7458, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.3039731929152705, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 5.922017461442492e-06, |
|
"loss": 1.6889, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 1.3058879846816658, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 5.893092486783594e-06, |
|
"loss": 1.6935, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 1.3078027764480613, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 5.864208785215732e-06, |
|
"loss": 1.6641, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 1.3097175682144566, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 5.835366647010767e-06, |
|
"loss": 1.7062, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 1.3116323599808521, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 5.8065663620228404e-06, |
|
"loss": 1.7008, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 1.3135471517472475, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 5.777808219685496e-06, |
|
"loss": 1.7002, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 1.3154619435136428, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 5.749092509008761e-06, |
|
"loss": 1.6896, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 1.3173767352800383, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 5.720419518576223e-06, |
|
"loss": 1.7014, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 1.3192915270464338, |
|
"grad_norm": 1.75, |
|
"learning_rate": 5.691789536542161e-06, |
|
"loss": 1.6799, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 1.3212063188128291, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 5.6632028506286266e-06, |
|
"loss": 1.6558, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.3231211105792244, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 5.634659748122552e-06, |
|
"loss": 1.6286, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 1.32503590234562, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 5.606160515872886e-06, |
|
"loss": 1.6983, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 1.3269506941120153, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 5.57770544028768e-06, |
|
"loss": 1.7115, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 1.3288654858784108, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 5.5492948073312406e-06, |
|
"loss": 1.719, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 1.330780277644806, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 5.520928902521221e-06, |
|
"loss": 1.7074, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 1.3326950694112014, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 5.492608010925793e-06, |
|
"loss": 1.7135, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 1.334609861177597, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 5.46433241716074e-06, |
|
"loss": 1.6395, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 1.3365246529439925, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 5.436102405386636e-06, |
|
"loss": 1.7543, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 1.3384394447103878, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 5.407918259305951e-06, |
|
"loss": 1.6431, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 1.340354236476783, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 5.379780262160237e-06, |
|
"loss": 1.7222, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.3422690282431786, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 5.3516886967272485e-06, |
|
"loss": 1.7227, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 1.344183820009574, |
|
"grad_norm": 1.75, |
|
"learning_rate": 5.323643845318135e-06, |
|
"loss": 1.7426, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 1.3460986117759695, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 5.295645989774565e-06, |
|
"loss": 1.7319, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 1.3480134035423648, |
|
"grad_norm": 1.75, |
|
"learning_rate": 5.26769541146593e-06, |
|
"loss": 1.6958, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 1.34992819530876, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 5.239792391286492e-06, |
|
"loss": 1.682, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 1.3518429870751556, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 5.211937209652567e-06, |
|
"loss": 1.6474, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 1.353757778841551, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 5.1841301464997206e-06, |
|
"loss": 1.6762, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 1.3556725706079464, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 5.156371481279928e-06, |
|
"loss": 1.6729, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 1.3575873623743417, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 5.128661492958793e-06, |
|
"loss": 1.7092, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 1.3595021541407373, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 5.101000460012731e-06, |
|
"loss": 1.6943, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.3614169459071326, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 5.073388660426164e-06, |
|
"loss": 1.6758, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 1.363331737673528, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 5.04582637168874e-06, |
|
"loss": 1.671, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 1.3652465294399234, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 5.018313870792544e-06, |
|
"loss": 1.6675, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 1.3671613212063187, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 4.990851434229295e-06, |
|
"loss": 1.7074, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 1.3690761129727143, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 4.9634393379875986e-06, |
|
"loss": 1.6558, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 1.3709909047391096, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 4.936077857550141e-06, |
|
"loss": 1.7064, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 1.372905696505505, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 4.908767267890952e-06, |
|
"loss": 1.646, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 1.3748204882719004, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 4.8815078434726075e-06, |
|
"loss": 1.7207, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 1.3767352800382957, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 4.854299858243505e-06, |
|
"loss": 1.7459, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 1.3786500718046912, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 4.827143585635085e-06, |
|
"loss": 1.677, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.3805648635710868, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 4.800039298559101e-06, |
|
"loss": 1.682, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 1.382479655337482, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 4.772987269404855e-06, |
|
"loss": 1.6784, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 1.3843944471038774, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 4.745987770036494e-06, |
|
"loss": 1.7062, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 1.386309238870273, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 4.719041071790238e-06, |
|
"loss": 1.6879, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 1.3882240306366682, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 4.692147445471687e-06, |
|
"loss": 1.7091, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 1.3901388224030637, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 4.665307161353073e-06, |
|
"loss": 1.721, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 1.392053614169459, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 4.638520489170572e-06, |
|
"loss": 1.6974, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 1.3939684059358544, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 4.611787698121558e-06, |
|
"loss": 1.7036, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 1.39588319770225, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 4.585109056861936e-06, |
|
"loss": 1.6918, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 1.3977979894686454, |
|
"grad_norm": 1.75, |
|
"learning_rate": 4.558484833503407e-06, |
|
"loss": 1.6522, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.3997127812350407, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 4.531915295610805e-06, |
|
"loss": 1.682, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 1.401627573001436, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 4.505400710199376e-06, |
|
"loss": 1.7143, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 1.4035423647678316, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 4.478941343732125e-06, |
|
"loss": 1.7184, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 1.4054571565342269, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 4.452537462117123e-06, |
|
"loss": 1.7365, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 1.4073719483006224, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 4.426189330704826e-06, |
|
"loss": 1.6457, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 1.4092867400670177, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 4.3998972142854334e-06, |
|
"loss": 1.6866, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 1.411201531833413, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 4.373661377086195e-06, |
|
"loss": 1.6959, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 1.4131163235998085, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 4.3474820827687894e-06, |
|
"loss": 1.7323, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 1.4150311153662039, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 4.321359594426644e-06, |
|
"loss": 1.6982, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 1.4169459071325994, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 4.295294174582315e-06, |
|
"loss": 1.6859, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.4188606988989947, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 4.2692860851848295e-06, |
|
"loss": 1.6616, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 1.4207754906653902, |
|
"grad_norm": 1.75, |
|
"learning_rate": 4.243335587607074e-06, |
|
"loss": 1.6983, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 1.4226902824317855, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 4.217442942643138e-06, |
|
"loss": 1.6405, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 1.424605074198181, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 4.191608410505732e-06, |
|
"loss": 1.6706, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 1.4265198659645764, |
|
"grad_norm": 1.875, |
|
"learning_rate": 4.165832250823534e-06, |
|
"loss": 1.7247, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 1.4284346577309717, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 4.140114722638609e-06, |
|
"loss": 1.713, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 1.4303494494973672, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 4.114456084403785e-06, |
|
"loss": 1.7519, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 1.4322642412637625, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 4.088856593980078e-06, |
|
"loss": 1.7403, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 1.434179033030158, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 4.06331650863407e-06, |
|
"loss": 1.6786, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 1.4360938247965533, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 4.03783608503536e-06, |
|
"loss": 1.6476, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.4380086165629486, |
|
"grad_norm": 1.796875, |
|
"learning_rate": 4.0124155792539496e-06, |
|
"loss": 1.8036, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 1.4399234083293442, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 3.987055246757701e-06, |
|
"loss": 1.7387, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 1.4418382000957397, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 3.961755342409737e-06, |
|
"loss": 1.7148, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 1.443752991862135, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 3.936516120465914e-06, |
|
"loss": 1.621, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 1.4456677836285303, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 3.911337834572235e-06, |
|
"loss": 1.6647, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 1.4475825753949259, |
|
"grad_norm": 1.75, |
|
"learning_rate": 3.886220737762328e-06, |
|
"loss": 1.6833, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 1.4494973671613212, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 3.861165082454888e-06, |
|
"loss": 1.7302, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 1.4514121589277167, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 3.836171120451131e-06, |
|
"loss": 1.7396, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 1.453326950694112, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 3.811239102932289e-06, |
|
"loss": 1.7763, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 1.4552417424605073, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 3.7863692804570707e-06, |
|
"loss": 1.734, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.4571565342269028, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 3.761561902959139e-06, |
|
"loss": 1.6783, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 1.4590713259932984, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 3.7368172197446007e-06, |
|
"loss": 1.6689, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 1.4609861177596937, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 3.7121354794895216e-06, |
|
"loss": 1.6886, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 1.462900909526089, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 3.6875169302373938e-06, |
|
"loss": 1.6309, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 1.4648157012924845, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 3.6629618193966744e-06, |
|
"loss": 1.7063, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 1.4667304930588798, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 3.6384703937382714e-06, |
|
"loss": 1.7162, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 1.4686452848252753, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 3.6140428993930922e-06, |
|
"loss": 1.7338, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 1.4705600765916707, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 3.589679581849539e-06, |
|
"loss": 1.6977, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 1.472474868358066, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 3.5653806859510743e-06, |
|
"loss": 1.6789, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 1.4743896601244615, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 3.5411464558937302e-06, |
|
"loss": 1.6898, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.4763044518908568, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 3.5169771352236782e-06, |
|
"loss": 1.7292, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 1.4782192436572523, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 3.4928729668347616e-06, |
|
"loss": 1.7046, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 1.4801340354236476, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 3.4688341929660776e-06, |
|
"loss": 1.7028, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 1.4820488271900432, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 3.444861055199512e-06, |
|
"loss": 1.6834, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 1.4839636189564385, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 3.420953794457349e-06, |
|
"loss": 1.7299, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 1.485878410722834, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 3.397112650999811e-06, |
|
"loss": 1.6711, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 1.4877932024892293, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 3.37333786442268e-06, |
|
"loss": 1.6332, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 1.4897079942556246, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 3.349629673654858e-06, |
|
"loss": 1.673, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 1.4916227860220201, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 3.32598831695599e-06, |
|
"loss": 1.6594, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 1.4935375777884154, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 3.3024140319140617e-06, |
|
"loss": 1.6547, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.495452369554811, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 3.2789070554430003e-06, |
|
"loss": 1.6371, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 1.4973671613212063, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 3.2554676237803117e-06, |
|
"loss": 1.7326, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 1.4992819530876016, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 3.2320959724847e-06, |
|
"loss": 1.7118, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 1.5011967448539971, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 3.2087923364336904e-06, |
|
"loss": 1.8037, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 1.5031115366203927, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 3.1855569498212857e-06, |
|
"loss": 1.7526, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 1.505026328386788, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 3.1623900461555933e-06, |
|
"loss": 1.665, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 1.5069411201531833, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 3.1392918582565037e-06, |
|
"loss": 1.6528, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 1.5088559119195788, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 3.1162626182533207e-06, |
|
"loss": 1.7152, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 1.510770703685974, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 3.093302557582457e-06, |
|
"loss": 1.661, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 1.5126854954523696, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 3.070411906985088e-06, |
|
"loss": 1.7286, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.514600287218765, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 3.0475908965048374e-06, |
|
"loss": 1.7394, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 1.5165150789851602, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 3.0248397554854813e-06, |
|
"loss": 1.6925, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 1.5184298707515558, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 3.002158712568615e-06, |
|
"loss": 1.6607, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 1.5203446625179513, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 2.979547995691383e-06, |
|
"loss": 1.6779, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 1.5222594542843466, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 2.9570078320841644e-06, |
|
"loss": 1.7353, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 1.524174246050742, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 2.9345384482683148e-06, |
|
"loss": 1.7012, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 1.5260890378171372, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 2.9121400700538593e-06, |
|
"loss": 1.6287, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 1.5280038295835328, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 2.8898129225372564e-06, |
|
"loss": 1.6926, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 1.5299186213499283, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 2.867557230099104e-06, |
|
"loss": 1.6822, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 1.5318334131163236, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 2.845373216401913e-06, |
|
"loss": 1.7203, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.533748204882719, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 2.823261104387833e-06, |
|
"loss": 1.7103, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 1.5356629966491144, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 2.801221116276436e-06, |
|
"loss": 1.6772, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 1.53757778841551, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 2.7792534735624687e-06, |
|
"loss": 1.7132, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 1.5394925801819053, |
|
"grad_norm": 1.75, |
|
"learning_rate": 2.757358397013625e-06, |
|
"loss": 1.8025, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 1.5414073719483006, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 2.7355361066683393e-06, |
|
"loss": 1.6785, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 1.5433221637146959, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 2.7137868218335674e-06, |
|
"loss": 1.6791, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 1.5452369554810914, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 2.692110761082577e-06, |
|
"loss": 1.7253, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 1.547151747247487, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 2.670508142252766e-06, |
|
"loss": 1.7035, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 1.5490665390138822, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 2.648979182443454e-06, |
|
"loss": 1.7488, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 1.5509813307802776, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 2.6275240980137272e-06, |
|
"loss": 1.704, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.552896122546673, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 2.6061431045802286e-06, |
|
"loss": 1.7235, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 1.5548109143130686, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 2.5848364170150307e-06, |
|
"loss": 1.6652, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 1.556725706079464, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 2.563604249443438e-06, |
|
"loss": 1.6524, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 1.5586404978458592, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 2.542446815241867e-06, |
|
"loss": 1.697, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 1.5605552896122545, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 2.521364327035678e-06, |
|
"loss": 1.6973, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 1.56247008137865, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 2.5003569966970574e-06, |
|
"loss": 1.7513, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 1.5643848731450456, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 2.4794250353428707e-06, |
|
"loss": 1.6145, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 1.566299664911441, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 2.458568653332557e-06, |
|
"loss": 1.7153, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 1.5682144566778362, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 2.437788060266002e-06, |
|
"loss": 1.6331, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 1.5701292484442317, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 2.4170834649814366e-06, |
|
"loss": 1.6747, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.572044040210627, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 2.3964550755533468e-06, |
|
"loss": 1.6055, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 1.5739588319770226, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 2.375903099290362e-06, |
|
"loss": 1.6992, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 1.5758736237434179, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 2.355427742733197e-06, |
|
"loss": 1.6433, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 1.5777884155098132, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 2.335029211652552e-06, |
|
"loss": 1.7133, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 1.5797032072762087, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 2.314707711047063e-06, |
|
"loss": 1.7327, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 1.5816179990426043, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 2.294463445141233e-06, |
|
"loss": 1.7631, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 1.5835327908089996, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 2.2742966173833835e-06, |
|
"loss": 1.7577, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 1.5854475825753949, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 2.254207430443599e-06, |
|
"loss": 1.7896, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 1.5873623743417902, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 2.2341960862117118e-06, |
|
"loss": 1.6763, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 1.5892771661081857, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 2.214262785795248e-06, |
|
"loss": 1.6878, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.5911919578745812, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 2.1944077295174284e-06, |
|
"loss": 1.6532, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 1.5931067496409765, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 2.174631116915137e-06, |
|
"loss": 1.7702, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 1.5950215414073718, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 2.1549331467369327e-06, |
|
"loss": 1.7115, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 1.5969363331737674, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 2.1353140169410347e-06, |
|
"loss": 1.6486, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 1.598851124940163, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 2.1157739246933507e-06, |
|
"loss": 1.7097, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 1.6007659167065582, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 2.0963130663654785e-06, |
|
"loss": 1.7174, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 1.6026807084729535, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 2.0769316375327497e-06, |
|
"loss": 1.6954, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 1.6045955002393488, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 2.0576298329722445e-06, |
|
"loss": 1.6773, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 1.6065102920057444, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 2.038407846660855e-06, |
|
"loss": 1.6202, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 1.6084250837721399, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 2.019265871773316e-06, |
|
"loss": 1.7177, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.6103398755385352, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 2.0002041006802843e-06, |
|
"loss": 1.7824, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 1.6122546673049305, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.981222724946383e-06, |
|
"loss": 1.7061, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 1.614169459071326, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.9623219353283005e-06, |
|
"loss": 1.7551, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 1.6160842508377216, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.943501921772848e-06, |
|
"loss": 1.7165, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 1.6179990426041169, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.9247628734150725e-06, |
|
"loss": 1.6687, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 1.6199138343705122, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.9061049785763419e-06, |
|
"loss": 1.6888, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 1.6218286261369075, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.8875284247624625e-06, |
|
"loss": 1.6674, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 1.623743417903303, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.8690333986617827e-06, |
|
"loss": 1.7384, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 1.6256582096696985, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.8506200861433287e-06, |
|
"loss": 1.6367, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 1.6275730014360938, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.832288672254936e-06, |
|
"loss": 1.6592, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.6294877932024892, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.8140393412213719e-06, |
|
"loss": 1.7263, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 1.6314025849688847, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.7958722764425119e-06, |
|
"loss": 1.6543, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 1.63331737673528, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.7777876604914712e-06, |
|
"loss": 1.7082, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 1.6352321685016755, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.7597856751127919e-06, |
|
"loss": 1.7153, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 1.6371469602680708, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.7418665012205927e-06, |
|
"loss": 1.65, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 1.6390617520344661, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.7240303188967767e-06, |
|
"loss": 1.6985, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 1.6409765438008617, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.7062773073891958e-06, |
|
"loss": 1.6766, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 1.6428913355672572, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.6886076451098766e-06, |
|
"loss": 1.6786, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 1.6448061273336525, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.6710215096331971e-06, |
|
"loss": 1.7329, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 1.6467209191000478, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.6535190776941323e-06, |
|
"loss": 1.7428, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.6486357108664431, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.6361005251864525e-06, |
|
"loss": 1.6936, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 1.6505505026328386, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.6187660271609773e-06, |
|
"loss": 1.7386, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 1.6524652943992342, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 1.6015157578237939e-06, |
|
"loss": 1.7213, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 1.6543800861656295, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.584349890534531e-06, |
|
"loss": 1.6877, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 1.6562948779320248, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.5672685978045931e-06, |
|
"loss": 1.7153, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 1.6582096696984203, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.5502720512954472e-06, |
|
"loss": 1.7155, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 1.6601244614648158, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.5333604218168785e-06, |
|
"loss": 1.7235, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 1.6620392532312112, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.5165338793252937e-06, |
|
"loss": 1.6423, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 1.6639540449976065, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.4997925929219937e-06, |
|
"loss": 1.7088, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 1.6658688367640018, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.483136730851492e-06, |
|
"loss": 1.6486, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.6677836285303973, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.4665664604998053e-06, |
|
"loss": 1.6938, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 1.6696984202967928, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.4500819483927898e-06, |
|
"loss": 1.6819, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 1.6716132120631881, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.4336833601944577e-06, |
|
"loss": 1.6385, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 1.6735280038295834, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.4173708607053071e-06, |
|
"loss": 1.6798, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 1.675442795595979, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.4011446138606822e-06, |
|
"loss": 1.7299, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 1.6773575873623745, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.3850047827291057e-06, |
|
"loss": 1.715, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 1.6792723791287698, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.3689515295106626e-06, |
|
"loss": 1.6405, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 1.6811871708951651, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.352985015535344e-06, |
|
"loss": 1.7398, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 1.6831019626615604, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.3371054012614527e-06, |
|
"loss": 1.6731, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 1.685016754427956, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.3213128462739656e-06, |
|
"loss": 1.6651, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.6869315461943515, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.3056075092829546e-06, |
|
"loss": 1.6424, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 1.6888463379607468, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.2899895481219672e-06, |
|
"loss": 1.6476, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 1.690761129727142, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 1.2744591197464618e-06, |
|
"loss": 1.7422, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 1.6926759214935376, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.2590163802322108e-06, |
|
"loss": 1.6761, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 1.694590713259933, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.2436614847737526e-06, |
|
"loss": 1.7296, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 1.6965055050263285, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.2283945876828107e-06, |
|
"loss": 1.671, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 1.6984202967927238, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.2132158423867645e-06, |
|
"loss": 1.7288, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 1.700335088559119, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.198125401427085e-06, |
|
"loss": 1.7769, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 1.7022498803255146, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.1831234164578242e-06, |
|
"loss": 1.7237, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 1.7041646720919101, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.1682100382440686e-06, |
|
"loss": 1.7282, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.7060794638583054, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 1.1533854166604486e-06, |
|
"loss": 1.7385, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 1.7079942556247008, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.1386497006896058e-06, |
|
"loss": 1.6813, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 1.709909047391096, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.1240030384207202e-06, |
|
"loss": 1.7269, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 1.7118238391574916, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.1094455770480017e-06, |
|
"loss": 1.6812, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 1.7137386309238871, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.0949774628692278e-06, |
|
"loss": 1.6795, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 1.7156534226902824, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.0805988412842638e-06, |
|
"loss": 1.7112, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 1.7175682144566777, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.0663098567935981e-06, |
|
"loss": 1.7174, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 1.7194830062230733, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.0521106529969016e-06, |
|
"loss": 1.6405, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 1.7213977979894688, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.0380013725915783e-06, |
|
"loss": 1.7008, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 1.723312589755864, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.0239821573713228e-06, |
|
"loss": 1.6936, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.7252273815222594, |
|
"grad_norm": 1.75, |
|
"learning_rate": 1.0100531482247155e-06, |
|
"loss": 1.7914, |
|
"step": 901 |
|
}, |
|
{ |
|
"epoch": 1.7271421732886547, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 9.962144851337863e-07, |
|
"loss": 1.654, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 1.7290569650550502, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 9.824663071726204e-07, |
|
"loss": 1.7272, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 1.7309717568214458, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 9.688087525059575e-07, |
|
"loss": 1.7288, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 1.732886548587841, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 9.55241958387796e-07, |
|
"loss": 1.6932, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 1.7348013403542364, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 9.417660611600299e-07, |
|
"loss": 1.6952, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 1.736716132120632, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 9.283811962510603e-07, |
|
"loss": 1.7722, |
|
"step": 907 |
|
}, |
|
{ |
|
"epoch": 1.7386309238870274, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 9.150874981744507e-07, |
|
"loss": 1.6527, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 1.7405457156534228, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 9.018851005275586e-07, |
|
"loss": 1.7071, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 1.742460507419818, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 8.887741359902113e-07, |
|
"loss": 1.7559, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.7443752991862134, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 8.757547363233543e-07, |
|
"loss": 1.6998, |
|
"step": 911 |
|
}, |
|
{ |
|
"epoch": 1.746290090952609, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 8.628270323677424e-07, |
|
"loss": 1.593, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 1.7482048827190044, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 8.499911540426131e-07, |
|
"loss": 1.7532, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 1.7501196744853997, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 8.372472303443924e-07, |
|
"loss": 1.696, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 1.752034466251795, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 8.245953893453829e-07, |
|
"loss": 1.7278, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 1.7539492580181906, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 8.120357581924931e-07, |
|
"loss": 1.7215, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 1.7558640497845859, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 7.99568463105953e-07, |
|
"loss": 1.6702, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 1.7577788415509814, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 7.87193629378038e-07, |
|
"loss": 1.6721, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 1.7596936333173767, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 7.749113813718234e-07, |
|
"loss": 1.7008, |
|
"step": 919 |
|
}, |
|
{ |
|
"epoch": 1.761608425083772, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 7.627218425199278e-07, |
|
"loss": 1.6697, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.7635232168501676, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 7.506251353232663e-07, |
|
"loss": 1.7305, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 1.765438008616563, |
|
"grad_norm": 1.75, |
|
"learning_rate": 7.386213813498344e-07, |
|
"loss": 1.7425, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 1.7673528003829584, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 7.267107012334707e-07, |
|
"loss": 1.7188, |
|
"step": 923 |
|
}, |
|
{ |
|
"epoch": 1.7692675921493537, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 7.148932146726572e-07, |
|
"loss": 1.668, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 1.771182383915749, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 7.031690404293046e-07, |
|
"loss": 1.6772, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 1.7730971756821445, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 6.915382963275741e-07, |
|
"loss": 1.704, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 1.77501196744854, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 6.800010992526729e-07, |
|
"loss": 1.6764, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 1.7769267592149354, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 6.685575651497022e-07, |
|
"loss": 1.7017, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 1.7788415509813307, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 6.572078090224721e-07, |
|
"loss": 1.7396, |
|
"step": 929 |
|
}, |
|
{ |
|
"epoch": 1.7807563427477262, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 6.459519449323592e-07, |
|
"loss": 1.7057, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.7826711345141217, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 6.347900859971534e-07, |
|
"loss": 1.6723, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 1.784585926280517, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 6.237223443899221e-07, |
|
"loss": 1.7167, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 1.7865007180469124, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 6.127488313378894e-07, |
|
"loss": 1.7802, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 1.7884155098133077, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 6.018696571213045e-07, |
|
"loss": 1.6742, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 1.7903303015797032, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 5.910849310723499e-07, |
|
"loss": 1.7288, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 1.7922450933460987, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 5.803947615740291e-07, |
|
"loss": 1.6607, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 1.794159885112494, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 5.697992560590882e-07, |
|
"loss": 1.7431, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 1.7960746768788893, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 5.59298521008923e-07, |
|
"loss": 1.6916, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 1.7979894686452849, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 5.488926619525248e-07, |
|
"loss": 1.6678, |
|
"step": 939 |
|
}, |
|
{ |
|
"epoch": 1.7999042604116804, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 5.385817834654095e-07, |
|
"loss": 1.673, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.8018190521780757, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 5.283659891685656e-07, |
|
"loss": 1.7494, |
|
"step": 941 |
|
}, |
|
{ |
|
"epoch": 1.803733843944471, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 5.18245381727418e-07, |
|
"loss": 1.667, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 1.8056486357108663, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 5.082200628507994e-07, |
|
"loss": 1.7009, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 1.8075634274772618, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 4.982901332899159e-07, |
|
"loss": 1.681, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 1.8094782192436574, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 4.884556928373462e-07, |
|
"loss": 1.6666, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 1.8113930110100527, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 4.787168403260323e-07, |
|
"loss": 1.6944, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 1.813307802776448, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 4.690736736282908e-07, |
|
"loss": 1.6373, |
|
"step": 947 |
|
}, |
|
{ |
|
"epoch": 1.8152225945428435, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 4.595262896548236e-07, |
|
"loss": 1.709, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 1.8171373863092388, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 4.500747843537523e-07, |
|
"loss": 1.7181, |
|
"step": 949 |
|
}, |
|
{ |
|
"epoch": 1.8190521780756344, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 4.407192527096404e-07, |
|
"loss": 1.6296, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.8209669698420297, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 4.3145978874255757e-07, |
|
"loss": 1.6935, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 1.822881761608425, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 4.222964855071154e-07, |
|
"loss": 1.6726, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 1.8247965533748205, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 4.1322943509154887e-07, |
|
"loss": 1.6841, |
|
"step": 953 |
|
}, |
|
{ |
|
"epoch": 1.826711345141216, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 4.042587286167754e-07, |
|
"loss": 1.6808, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 1.8286261369076113, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 3.953844562354936e-07, |
|
"loss": 1.7918, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 1.8305409286740066, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 3.8660670713126735e-07, |
|
"loss": 1.7129, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 1.832455720440402, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 3.7792556951763424e-07, |
|
"loss": 1.6843, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 1.8343705122067975, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 3.6934113063721634e-07, |
|
"loss": 1.7275, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 1.836285303973193, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 3.6085347676084736e-07, |
|
"loss": 1.7158, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 1.8382000957395883, |
|
"grad_norm": 1.625, |
|
"learning_rate": 3.5246269318669924e-07, |
|
"loss": 1.6048, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.8401148875059836, |
|
"grad_norm": 1.875, |
|
"learning_rate": 3.441688642394292e-07, |
|
"loss": 1.6986, |
|
"step": 961 |
|
}, |
|
{ |
|
"epoch": 1.8420296792723792, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 3.359720732693361e-07, |
|
"loss": 1.6519, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 1.8439444710387747, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 3.2787240265151674e-07, |
|
"loss": 1.7094, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 1.84585926280517, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 3.1986993378503526e-07, |
|
"loss": 1.678, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 1.8477740545715653, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 3.11964747092115e-07, |
|
"loss": 1.6928, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 1.8496888463379606, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 3.041569220173235e-07, |
|
"loss": 1.7484, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 1.8516036381043561, |
|
"grad_norm": 1.75, |
|
"learning_rate": 2.9644653702677553e-07, |
|
"loss": 1.7354, |
|
"step": 967 |
|
}, |
|
{ |
|
"epoch": 1.8535184298707517, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 2.888336696073435e-07, |
|
"loss": 1.6876, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 1.855433221637147, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 2.8131839626588056e-07, |
|
"loss": 1.6774, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 1.8573480134035423, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 2.7390079252845205e-07, |
|
"loss": 1.6936, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.8592628051699378, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 2.6658093293957187e-07, |
|
"loss": 1.7162, |
|
"step": 971 |
|
}, |
|
{ |
|
"epoch": 1.8611775969363333, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 2.5935889106146305e-07, |
|
"loss": 1.7387, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 1.8630923887027286, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 2.522347394733049e-07, |
|
"loss": 1.6896, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 1.865007180469124, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 2.45208549770517e-07, |
|
"loss": 1.7292, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 1.8669219722355193, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 2.382803925640309e-07, |
|
"loss": 1.7433, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 1.8688367640019148, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 2.314503374795829e-07, |
|
"loss": 1.6975, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 1.8707515557683103, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 2.247184531570168e-07, |
|
"loss": 1.7252, |
|
"step": 977 |
|
}, |
|
{ |
|
"epoch": 1.8726663475347056, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 2.1808480724959004e-07, |
|
"loss": 1.7407, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 1.874581139301101, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 2.1154946642329644e-07, |
|
"loss": 1.7345, |
|
"step": 979 |
|
}, |
|
{ |
|
"epoch": 1.8764959310674965, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 2.051124963561979e-07, |
|
"loss": 1.7497, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.8784107228338918, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.9877396173775598e-07, |
|
"loss": 1.7163, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 1.8803255146002873, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.9253392626819468e-07, |
|
"loss": 1.6973, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 1.8822403063666826, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.8639245265784866e-07, |
|
"loss": 1.6974, |
|
"step": 983 |
|
}, |
|
{ |
|
"epoch": 1.884155098133078, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 1.8034960262654276e-07, |
|
"loss": 1.7159, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 1.8860698898994734, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.744054369029591e-07, |
|
"loss": 1.6499, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 1.887984681665869, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.6856001522404296e-07, |
|
"loss": 1.7192, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 1.8898994734322643, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.6281339633438698e-07, |
|
"loss": 1.7233, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 1.8918142651986596, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.5716563798565232e-07, |
|
"loss": 1.7212, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 1.893729056965055, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.5161679693598274e-07, |
|
"loss": 1.6976, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 1.8956438487314504, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 1.4616692894943274e-07, |
|
"loss": 1.7309, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.897558640497846, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.4081608879541241e-07, |
|
"loss": 1.7336, |
|
"step": 991 |
|
}, |
|
{ |
|
"epoch": 1.8994734322642413, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 1.3556433024813353e-07, |
|
"loss": 1.7123, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 1.9013882240306366, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.304117060860688e-07, |
|
"loss": 1.6961, |
|
"step": 993 |
|
}, |
|
{ |
|
"epoch": 1.903303015797032, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.2535826809142339e-07, |
|
"loss": 1.6591, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 1.9052178075634276, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.2040406704961316e-07, |
|
"loss": 1.6986, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 1.907132599329823, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 1.15549152748754e-07, |
|
"loss": 1.6602, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 1.9090473910962182, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 1.1079357397916435e-07, |
|
"loss": 1.6829, |
|
"step": 997 |
|
}, |
|
{ |
|
"epoch": 1.9109621828626135, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 1.061373785328701e-07, |
|
"loss": 1.688, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 1.912876974629009, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.015806132031305e-07, |
|
"loss": 1.7246, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 1.9147917663954046, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 9.712332378395861e-08, |
|
"loss": 1.7222, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.9167065581618, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 9.276555506967378e-08, |
|
"loss": 1.7414, |
|
"step": 1001 |
|
}, |
|
{ |
|
"epoch": 1.9186213499281952, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 8.850735085443763e-08, |
|
"loss": 1.6952, |
|
"step": 1002 |
|
}, |
|
{ |
|
"epoch": 1.9205361416945907, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 8.434875393182662e-08, |
|
"loss": 1.671, |
|
"step": 1003 |
|
}, |
|
{ |
|
"epoch": 1.9224509334609863, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 8.028980609439241e-08, |
|
"loss": 1.6717, |
|
"step": 1004 |
|
}, |
|
{ |
|
"epoch": 1.9243657252273816, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 7.633054813324769e-08, |
|
"loss": 1.6772, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 1.926280516993777, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 7.247101983765104e-08, |
|
"loss": 1.7314, |
|
"step": 1006 |
|
}, |
|
{ |
|
"epoch": 1.9281953087601722, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 6.871125999461604e-08, |
|
"loss": 1.5927, |
|
"step": 1007 |
|
}, |
|
{ |
|
"epoch": 1.9301101005265677, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 6.505130638850831e-08, |
|
"loss": 1.6729, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 1.9320248922929633, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 6.14911958006792e-08, |
|
"loss": 1.7042, |
|
"step": 1009 |
|
}, |
|
{ |
|
"epoch": 1.9339396840593586, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 5.803096400908703e-08, |
|
"loss": 1.8112, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.9358544758257539, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 5.46706457879409e-08, |
|
"loss": 1.7101, |
|
"step": 1011 |
|
}, |
|
{ |
|
"epoch": 1.9377692675921494, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 5.141027490735195e-08, |
|
"loss": 1.782, |
|
"step": 1012 |
|
}, |
|
{ |
|
"epoch": 1.9396840593585447, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 4.824988413299037e-08, |
|
"loss": 1.6928, |
|
"step": 1013 |
|
}, |
|
{ |
|
"epoch": 1.9415988511249402, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 4.5189505225762266e-08, |
|
"loss": 1.7131, |
|
"step": 1014 |
|
}, |
|
{ |
|
"epoch": 1.9435136428913355, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 4.2229168941484434e-08, |
|
"loss": 1.7265, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 1.9454284346577309, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 3.9368905030578994e-08, |
|
"loss": 1.6424, |
|
"step": 1016 |
|
}, |
|
{ |
|
"epoch": 1.9473432264241264, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 3.6608742237769227e-08, |
|
"loss": 1.7135, |
|
"step": 1017 |
|
}, |
|
{ |
|
"epoch": 1.949258018190522, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 3.394870830180197e-08, |
|
"loss": 1.7292, |
|
"step": 1018 |
|
}, |
|
{ |
|
"epoch": 1.9511728099569172, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 3.1388829955153466e-08, |
|
"loss": 1.6791, |
|
"step": 1019 |
|
}, |
|
{ |
|
"epoch": 1.9530876017233125, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 2.892913292377508e-08, |
|
"loss": 1.6906, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.9550023934897078, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 2.656964192682354e-08, |
|
"loss": 1.698, |
|
"step": 1021 |
|
}, |
|
{ |
|
"epoch": 1.9569171852561034, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 2.431038067642111e-08, |
|
"loss": 1.6888, |
|
"step": 1022 |
|
}, |
|
{ |
|
"epoch": 1.958831977022499, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 2.2151371877412452e-08, |
|
"loss": 1.6964, |
|
"step": 1023 |
|
}, |
|
{ |
|
"epoch": 1.9607467687888942, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 2.0092637227134836e-08, |
|
"loss": 1.6933, |
|
"step": 1024 |
|
}, |
|
{ |
|
"epoch": 1.9626615605552895, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.8134197415207165e-08, |
|
"loss": 1.689, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 1.964576352321685, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.627607212331572e-08, |
|
"loss": 1.6823, |
|
"step": 1026 |
|
}, |
|
{ |
|
"epoch": 1.9664911440880806, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 1.451828002501654e-08, |
|
"loss": 1.7396, |
|
"step": 1027 |
|
}, |
|
{ |
|
"epoch": 1.9684059358544759, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.286083878555111e-08, |
|
"loss": 1.6459, |
|
"step": 1028 |
|
}, |
|
{ |
|
"epoch": 1.9703207276208712, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 1.1303765061668748e-08, |
|
"loss": 1.6741, |
|
"step": 1029 |
|
}, |
|
{ |
|
"epoch": 1.9722355193872665, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 9.847074501456722e-09, |
|
"loss": 1.6616, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.974150311153662, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 8.490781744181498e-09, |
|
"loss": 1.7448, |
|
"step": 1031 |
|
}, |
|
{ |
|
"epoch": 1.9760651029200575, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 7.234900420147739e-09, |
|
"loss": 1.6673, |
|
"step": 1032 |
|
}, |
|
{ |
|
"epoch": 1.9779798946864529, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 6.079443150556197e-09, |
|
"loss": 1.7032, |
|
"step": 1033 |
|
}, |
|
{ |
|
"epoch": 1.9798946864528482, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 5.0244215473782556e-09, |
|
"loss": 1.6774, |
|
"step": 1034 |
|
}, |
|
{ |
|
"epoch": 1.9818094782192437, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 4.069846213238249e-09, |
|
"loss": 1.7234, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 1.9837242699856392, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 3.2157267413113203e-09, |
|
"loss": 1.6599, |
|
"step": 1036 |
|
}, |
|
{ |
|
"epoch": 1.9856390617520345, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 2.4620717152201713e-09, |
|
"loss": 1.7449, |
|
"step": 1037 |
|
}, |
|
{ |
|
"epoch": 1.9875538535184298, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 1.8088887089551255e-09, |
|
"loss": 1.6684, |
|
"step": 1038 |
|
}, |
|
{ |
|
"epoch": 1.9894686452848251, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 1.256184286793083e-09, |
|
"loss": 1.641, |
|
"step": 1039 |
|
}, |
|
{ |
|
"epoch": 1.9913834370512207, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 8.039640032342366e-10, |
|
"loss": 1.7045, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.9932982288176162, |
|
"grad_norm": 1.65625, |
|
"learning_rate": 4.522324029465619e-10, |
|
"loss": 1.7222, |
|
"step": 1041 |
|
}, |
|
{ |
|
"epoch": 1.9952130205840115, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 2.0099302071807658e-10, |
|
"loss": 1.6946, |
|
"step": 1042 |
|
}, |
|
{ |
|
"epoch": 1.9971278123504068, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 5.024838142464461e-11, |
|
"loss": 1.7026, |
|
"step": 1043 |
|
}, |
|
{ |
|
"epoch": 1.9990426041168023, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 0.0, |
|
"loss": 1.7507, |
|
"step": 1044 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1044, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.28007906539733e+17, |
|
"train_batch_size": 10, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |