|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9999242787551428, |
|
"eval_steps": 2000, |
|
"global_step": 5571, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0017948739521722182, |
|
"grad_norm": 70.875, |
|
"learning_rate": 9.999964943868121e-07, |
|
"loss": 113.0671, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0035897479043444365, |
|
"grad_norm": 62.15625, |
|
"learning_rate": 9.999929887736243e-07, |
|
"loss": 109.1635, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0053846218565166545, |
|
"grad_norm": 53.625, |
|
"learning_rate": 9.999894831604363e-07, |
|
"loss": 109.8683, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.007179495808688873, |
|
"grad_norm": 65.6875, |
|
"learning_rate": 9.999859775472485e-07, |
|
"loss": 108.1756, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.00897436976086109, |
|
"grad_norm": 53.0625, |
|
"learning_rate": 9.99982471934061e-07, |
|
"loss": 106.7566, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.010769243713033309, |
|
"grad_norm": 59.84375, |
|
"learning_rate": 9.99978966320873e-07, |
|
"loss": 107.5178, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.012564117665205527, |
|
"grad_norm": 59.46875, |
|
"learning_rate": 9.99975460707685e-07, |
|
"loss": 106.8143, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.014358991617377746, |
|
"grad_norm": 58.96875, |
|
"learning_rate": 9.999719550944973e-07, |
|
"loss": 106.2242, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.016153865569549963, |
|
"grad_norm": 57.46875, |
|
"learning_rate": 9.999684494813095e-07, |
|
"loss": 105.5488, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.01794873952172218, |
|
"grad_norm": 54.65625, |
|
"learning_rate": 9.999649438681217e-07, |
|
"loss": 105.1425, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0197436134738944, |
|
"grad_norm": 55.78125, |
|
"learning_rate": 9.999614382549337e-07, |
|
"loss": 105.8918, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.021538487426066618, |
|
"grad_norm": 54.53125, |
|
"learning_rate": 9.999579326417459e-07, |
|
"loss": 105.4892, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.023333361378238836, |
|
"grad_norm": 57.6875, |
|
"learning_rate": 9.99954427028558e-07, |
|
"loss": 105.5813, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.025128235330411055, |
|
"grad_norm": 55.3125, |
|
"learning_rate": 9.999509214153702e-07, |
|
"loss": 105.1202, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.026923109282583273, |
|
"grad_norm": 54.21875, |
|
"learning_rate": 9.999474158021824e-07, |
|
"loss": 105.9377, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.028717983234755492, |
|
"grad_norm": 52.5625, |
|
"learning_rate": 9.999439101889946e-07, |
|
"loss": 104.7342, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.03051285718692771, |
|
"grad_norm": 59.9375, |
|
"learning_rate": 9.999404045758068e-07, |
|
"loss": 104.9382, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.032307731139099925, |
|
"grad_norm": 55.65625, |
|
"learning_rate": 9.999368989626188e-07, |
|
"loss": 106.4299, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.034102605091272144, |
|
"grad_norm": 53.6875, |
|
"learning_rate": 9.99933393349431e-07, |
|
"loss": 104.4395, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.03589747904344436, |
|
"grad_norm": 54.8125, |
|
"learning_rate": 9.999298877362432e-07, |
|
"loss": 104.3946, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.03769235299561658, |
|
"grad_norm": 55.9375, |
|
"learning_rate": 9.999263821230554e-07, |
|
"loss": 105.1693, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.0394872269477888, |
|
"grad_norm": 55.03125, |
|
"learning_rate": 9.999228765098676e-07, |
|
"loss": 104.7132, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.04128210089996102, |
|
"grad_norm": 71.0625, |
|
"learning_rate": 9.999193708966798e-07, |
|
"loss": 104.7171, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.043076974852133236, |
|
"grad_norm": 59.40625, |
|
"learning_rate": 9.99915865283492e-07, |
|
"loss": 104.5917, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.044871848804305454, |
|
"grad_norm": 53.84375, |
|
"learning_rate": 9.999123596703042e-07, |
|
"loss": 104.3239, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.04666672275647767, |
|
"grad_norm": 59.375, |
|
"learning_rate": 9.999088540571162e-07, |
|
"loss": 104.0901, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.04846159670864989, |
|
"grad_norm": 56.0, |
|
"learning_rate": 9.999053484439284e-07, |
|
"loss": 104.5291, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.05025647066082211, |
|
"grad_norm": 64.4375, |
|
"learning_rate": 9.999018428307405e-07, |
|
"loss": 104.5856, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.05205134461299433, |
|
"grad_norm": 79.25, |
|
"learning_rate": 9.998983372175527e-07, |
|
"loss": 104.3207, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.05384621856516655, |
|
"grad_norm": 64.375, |
|
"learning_rate": 9.99894831604365e-07, |
|
"loss": 104.9624, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.055641092517338765, |
|
"grad_norm": 58.84375, |
|
"learning_rate": 9.99891325991177e-07, |
|
"loss": 104.5753, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.057435966469510984, |
|
"grad_norm": 61.21875, |
|
"learning_rate": 9.998878203779893e-07, |
|
"loss": 104.478, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.0592308404216832, |
|
"grad_norm": 62.5, |
|
"learning_rate": 9.998843147648013e-07, |
|
"loss": 103.2508, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.06102571437385542, |
|
"grad_norm": 56.8125, |
|
"learning_rate": 9.998808091516135e-07, |
|
"loss": 104.3786, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.06282058832602763, |
|
"grad_norm": 55.21875, |
|
"learning_rate": 9.998773035384257e-07, |
|
"loss": 103.974, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.06461546227819985, |
|
"grad_norm": 61.71875, |
|
"learning_rate": 9.998737979252379e-07, |
|
"loss": 104.481, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.06641033623037207, |
|
"grad_norm": 53.78125, |
|
"learning_rate": 9.9987029231205e-07, |
|
"loss": 104.1711, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.06820521018254429, |
|
"grad_norm": 54.375, |
|
"learning_rate": 9.99866786698862e-07, |
|
"loss": 103.1215, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.0700000841347165, |
|
"grad_norm": 65.6875, |
|
"learning_rate": 9.998632810856743e-07, |
|
"loss": 105.1524, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.07179495808688872, |
|
"grad_norm": 58.46875, |
|
"learning_rate": 9.998597754724865e-07, |
|
"loss": 104.1496, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.07358983203906094, |
|
"grad_norm": 56.28125, |
|
"learning_rate": 9.998562698592986e-07, |
|
"loss": 103.0084, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.07538470599123316, |
|
"grad_norm": 58.9375, |
|
"learning_rate": 9.998527642461108e-07, |
|
"loss": 102.782, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.07717957994340538, |
|
"grad_norm": 53.28125, |
|
"learning_rate": 9.99849258632923e-07, |
|
"loss": 104.664, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.0789744538955776, |
|
"grad_norm": 55.53125, |
|
"learning_rate": 9.998457530197352e-07, |
|
"loss": 103.6033, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.08076932784774982, |
|
"grad_norm": 54.6875, |
|
"learning_rate": 9.998422474065474e-07, |
|
"loss": 103.2984, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.08256420179992204, |
|
"grad_norm": 58.75, |
|
"learning_rate": 9.998387417933594e-07, |
|
"loss": 103.1222, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.08435907575209425, |
|
"grad_norm": 54.65625, |
|
"learning_rate": 9.998352361801716e-07, |
|
"loss": 104.3975, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.08615394970426647, |
|
"grad_norm": 62.0625, |
|
"learning_rate": 9.998317305669838e-07, |
|
"loss": 103.6788, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.08794882365643869, |
|
"grad_norm": 54.75, |
|
"learning_rate": 9.99828224953796e-07, |
|
"loss": 103.4202, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.08974369760861091, |
|
"grad_norm": 58.3125, |
|
"learning_rate": 9.998247193406082e-07, |
|
"loss": 102.8853, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.09153857156078313, |
|
"grad_norm": 56.21875, |
|
"learning_rate": 9.998212137274204e-07, |
|
"loss": 103.1342, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.09333344551295535, |
|
"grad_norm": 57.71875, |
|
"learning_rate": 9.998177081142326e-07, |
|
"loss": 103.0709, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.09512831946512756, |
|
"grad_norm": 60.4375, |
|
"learning_rate": 9.998142025010446e-07, |
|
"loss": 104.1919, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.09692319341729978, |
|
"grad_norm": 58.34375, |
|
"learning_rate": 9.998106968878567e-07, |
|
"loss": 104.5302, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.098718067369472, |
|
"grad_norm": 57.875, |
|
"learning_rate": 9.99807191274669e-07, |
|
"loss": 103.7714, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.10051294132164422, |
|
"grad_norm": 62.46875, |
|
"learning_rate": 9.998036856614811e-07, |
|
"loss": 104.633, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.10230781527381644, |
|
"grad_norm": 53.8125, |
|
"learning_rate": 9.998001800482933e-07, |
|
"loss": 102.6445, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.10410268922598866, |
|
"grad_norm": 63.84375, |
|
"learning_rate": 9.997966744351053e-07, |
|
"loss": 104.0129, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.10589756317816088, |
|
"grad_norm": 57.34375, |
|
"learning_rate": 9.997931688219175e-07, |
|
"loss": 103.0677, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.1076924371303331, |
|
"grad_norm": 57.25, |
|
"learning_rate": 9.9978966320873e-07, |
|
"loss": 103.8832, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.10948731108250531, |
|
"grad_norm": 57.90625, |
|
"learning_rate": 9.997861575955419e-07, |
|
"loss": 103.4752, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.11128218503467753, |
|
"grad_norm": 62.46875, |
|
"learning_rate": 9.99782651982354e-07, |
|
"loss": 103.0433, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.11307705898684975, |
|
"grad_norm": 58.21875, |
|
"learning_rate": 9.997791463691663e-07, |
|
"loss": 102.7304, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.11487193293902197, |
|
"grad_norm": 54.53125, |
|
"learning_rate": 9.997756407559785e-07, |
|
"loss": 103.3318, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.11666680689119419, |
|
"grad_norm": 60.125, |
|
"learning_rate": 9.997721351427907e-07, |
|
"loss": 103.5509, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.1184616808433664, |
|
"grad_norm": 58.59375, |
|
"learning_rate": 9.997686295296027e-07, |
|
"loss": 102.8469, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.12025655479553862, |
|
"grad_norm": 58.125, |
|
"learning_rate": 9.997651239164148e-07, |
|
"loss": 104.6024, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.12205142874771084, |
|
"grad_norm": 57.78125, |
|
"learning_rate": 9.99761618303227e-07, |
|
"loss": 102.2974, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.12384630269988306, |
|
"grad_norm": 53.65625, |
|
"learning_rate": 9.997581126900392e-07, |
|
"loss": 103.0835, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.12564117665205526, |
|
"grad_norm": 53.75, |
|
"learning_rate": 9.997546070768514e-07, |
|
"loss": 104.0657, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.1274360506042275, |
|
"grad_norm": 57.84375, |
|
"learning_rate": 9.997511014636636e-07, |
|
"loss": 103.865, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.1292309245563997, |
|
"grad_norm": 55.1875, |
|
"learning_rate": 9.997475958504758e-07, |
|
"loss": 102.8469, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.13102579850857193, |
|
"grad_norm": 54.375, |
|
"learning_rate": 9.997440902372878e-07, |
|
"loss": 103.573, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.13282067246074414, |
|
"grad_norm": 55.34375, |
|
"learning_rate": 9.997405846241e-07, |
|
"loss": 102.4788, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.13461554641291637, |
|
"grad_norm": 74.1875, |
|
"learning_rate": 9.997370790109122e-07, |
|
"loss": 104.8757, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.13641042036508857, |
|
"grad_norm": 67.25, |
|
"learning_rate": 9.997335733977244e-07, |
|
"loss": 103.3008, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.1382052943172608, |
|
"grad_norm": 50.625, |
|
"learning_rate": 9.997300677845366e-07, |
|
"loss": 102.5183, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.140000168269433, |
|
"grad_norm": 57.34375, |
|
"learning_rate": 9.997265621713488e-07, |
|
"loss": 103.6885, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.14179504222160524, |
|
"grad_norm": 57.15625, |
|
"learning_rate": 9.99723056558161e-07, |
|
"loss": 103.321, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.14358991617377745, |
|
"grad_norm": 64.0625, |
|
"learning_rate": 9.997195509449732e-07, |
|
"loss": 102.9387, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.14538479012594968, |
|
"grad_norm": 61.46875, |
|
"learning_rate": 9.997160453317851e-07, |
|
"loss": 102.8938, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.14717966407812189, |
|
"grad_norm": 54.53125, |
|
"learning_rate": 9.997125397185973e-07, |
|
"loss": 102.8753, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.14897453803029412, |
|
"grad_norm": 59.34375, |
|
"learning_rate": 9.997090341054095e-07, |
|
"loss": 102.9597, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.15076941198246632, |
|
"grad_norm": 55.65625, |
|
"learning_rate": 9.997055284922217e-07, |
|
"loss": 102.3567, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.15256428593463855, |
|
"grad_norm": 67.25, |
|
"learning_rate": 9.99702022879034e-07, |
|
"loss": 102.8148, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.15435915988681076, |
|
"grad_norm": 53.15625, |
|
"learning_rate": 9.99698517265846e-07, |
|
"loss": 104.3594, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.156154033838983, |
|
"grad_norm": 54.8125, |
|
"learning_rate": 9.996950116526583e-07, |
|
"loss": 102.0147, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.1579489077911552, |
|
"grad_norm": 58.53125, |
|
"learning_rate": 9.996915060394703e-07, |
|
"loss": 103.0337, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.15974378174332743, |
|
"grad_norm": 57.9375, |
|
"learning_rate": 9.996880004262825e-07, |
|
"loss": 103.2189, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.16153865569549963, |
|
"grad_norm": 56.3125, |
|
"learning_rate": 9.996844948130947e-07, |
|
"loss": 102.6131, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.16333352964767187, |
|
"grad_norm": 56.0, |
|
"learning_rate": 9.996809891999069e-07, |
|
"loss": 102.7586, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.16512840359984407, |
|
"grad_norm": 53.0, |
|
"learning_rate": 9.99677483586719e-07, |
|
"loss": 101.981, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.16692327755201627, |
|
"grad_norm": 56.4375, |
|
"learning_rate": 9.99673977973531e-07, |
|
"loss": 103.099, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.1687181515041885, |
|
"grad_norm": 57.09375, |
|
"learning_rate": 9.996704723603432e-07, |
|
"loss": 103.5702, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.1705130254563607, |
|
"grad_norm": 54.46875, |
|
"learning_rate": 9.996669667471554e-07, |
|
"loss": 102.8591, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.17230789940853294, |
|
"grad_norm": 56.625, |
|
"learning_rate": 9.996634611339676e-07, |
|
"loss": 102.1516, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.17410277336070515, |
|
"grad_norm": 55.0, |
|
"learning_rate": 9.996599555207798e-07, |
|
"loss": 103.4924, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.17589764731287738, |
|
"grad_norm": 54.71875, |
|
"learning_rate": 9.99656449907592e-07, |
|
"loss": 102.3699, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.17769252126504959, |
|
"grad_norm": 58.625, |
|
"learning_rate": 9.996529442944042e-07, |
|
"loss": 101.9164, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.17948739521722182, |
|
"grad_norm": 53.1875, |
|
"learning_rate": 9.996494386812164e-07, |
|
"loss": 102.4251, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.18128226916939402, |
|
"grad_norm": 61.34375, |
|
"learning_rate": 9.996459330680284e-07, |
|
"loss": 101.8229, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.18307714312156625, |
|
"grad_norm": 56.8125, |
|
"learning_rate": 9.996424274548406e-07, |
|
"loss": 102.8062, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.18487201707373846, |
|
"grad_norm": 57.5625, |
|
"learning_rate": 9.996389218416528e-07, |
|
"loss": 102.8688, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.1866668910259107, |
|
"grad_norm": 53.75, |
|
"learning_rate": 9.99635416228465e-07, |
|
"loss": 102.0441, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.1884617649780829, |
|
"grad_norm": 55.3125, |
|
"learning_rate": 9.996319106152772e-07, |
|
"loss": 101.1412, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.19025663893025513, |
|
"grad_norm": 57.1875, |
|
"learning_rate": 9.996284050020894e-07, |
|
"loss": 102.8017, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.19205151288242733, |
|
"grad_norm": 55.09375, |
|
"learning_rate": 9.996248993889016e-07, |
|
"loss": 102.8312, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.19384638683459957, |
|
"grad_norm": 58.8125, |
|
"learning_rate": 9.996213937757135e-07, |
|
"loss": 100.9258, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.19564126078677177, |
|
"grad_norm": 53.09375, |
|
"learning_rate": 9.996178881625257e-07, |
|
"loss": 101.9567, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.197436134738944, |
|
"grad_norm": 55.3125, |
|
"learning_rate": 9.99614382549338e-07, |
|
"loss": 102.2161, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.1992310086911162, |
|
"grad_norm": 60.71875, |
|
"learning_rate": 9.996108769361501e-07, |
|
"loss": 102.0313, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.20102588264328844, |
|
"grad_norm": 54.28125, |
|
"learning_rate": 9.996073713229623e-07, |
|
"loss": 101.9621, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.20282075659546064, |
|
"grad_norm": 56.75, |
|
"learning_rate": 9.996038657097743e-07, |
|
"loss": 102.8201, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.20461563054763288, |
|
"grad_norm": 60.0, |
|
"learning_rate": 9.996003600965867e-07, |
|
"loss": 101.1011, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.20641050449980508, |
|
"grad_norm": 56.59375, |
|
"learning_rate": 9.99596854483399e-07, |
|
"loss": 102.2005, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.2082053784519773, |
|
"grad_norm": 60.875, |
|
"learning_rate": 9.995933488702109e-07, |
|
"loss": 102.4762, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.21000025240414952, |
|
"grad_norm": 61.65625, |
|
"learning_rate": 9.99589843257023e-07, |
|
"loss": 101.5841, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.21179512635632175, |
|
"grad_norm": 51.375, |
|
"learning_rate": 9.995863376438353e-07, |
|
"loss": 101.5104, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.21359000030849395, |
|
"grad_norm": 62.5625, |
|
"learning_rate": 9.995828320306475e-07, |
|
"loss": 102.6215, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.2153848742606662, |
|
"grad_norm": 57.25, |
|
"learning_rate": 9.995793264174597e-07, |
|
"loss": 100.9618, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.2171797482128384, |
|
"grad_norm": 53.40625, |
|
"learning_rate": 9.995758208042716e-07, |
|
"loss": 102.1445, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.21897462216501062, |
|
"grad_norm": 61.5, |
|
"learning_rate": 9.995723151910838e-07, |
|
"loss": 102.3304, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.22076949611718283, |
|
"grad_norm": 58.09375, |
|
"learning_rate": 9.99568809577896e-07, |
|
"loss": 103.4687, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.22256437006935506, |
|
"grad_norm": 56.5625, |
|
"learning_rate": 9.995653039647082e-07, |
|
"loss": 102.9005, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.22435924402152727, |
|
"grad_norm": 53.53125, |
|
"learning_rate": 9.995617983515204e-07, |
|
"loss": 101.6564, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.2261541179736995, |
|
"grad_norm": 55.03125, |
|
"learning_rate": 9.995582927383326e-07, |
|
"loss": 102.7027, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.2279489919258717, |
|
"grad_norm": 56.09375, |
|
"learning_rate": 9.995547871251448e-07, |
|
"loss": 102.3057, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.22974386587804393, |
|
"grad_norm": 111.0, |
|
"learning_rate": 9.995512815119568e-07, |
|
"loss": 102.2795, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.23153873983021614, |
|
"grad_norm": 53.25, |
|
"learning_rate": 9.99547775898769e-07, |
|
"loss": 102.1083, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.23333361378238837, |
|
"grad_norm": 56.4375, |
|
"learning_rate": 9.995442702855812e-07, |
|
"loss": 101.5716, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.23512848773456058, |
|
"grad_norm": 56.21875, |
|
"learning_rate": 9.995407646723934e-07, |
|
"loss": 102.0678, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.2369233616867328, |
|
"grad_norm": 56.4375, |
|
"learning_rate": 9.995372590592056e-07, |
|
"loss": 101.759, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.238718235638905, |
|
"grad_norm": 57.5625, |
|
"learning_rate": 9.995337534460178e-07, |
|
"loss": 101.938, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.24051310959107725, |
|
"grad_norm": 52.96875, |
|
"learning_rate": 9.9953024783283e-07, |
|
"loss": 100.721, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.24230798354324945, |
|
"grad_norm": 55.09375, |
|
"learning_rate": 9.995267422196421e-07, |
|
"loss": 101.3094, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.24410285749542168, |
|
"grad_norm": 59.125, |
|
"learning_rate": 9.995232366064541e-07, |
|
"loss": 101.9257, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.2458977314475939, |
|
"grad_norm": 54.5625, |
|
"learning_rate": 9.995197309932663e-07, |
|
"loss": 102.262, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.24769260539976612, |
|
"grad_norm": 59.0625, |
|
"learning_rate": 9.995162253800785e-07, |
|
"loss": 101.121, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.24948747935193832, |
|
"grad_norm": 56.90625, |
|
"learning_rate": 9.995127197668907e-07, |
|
"loss": 103.3174, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.25128235330411053, |
|
"grad_norm": 61.15625, |
|
"learning_rate": 9.99509214153703e-07, |
|
"loss": 101.7487, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.25307722725628273, |
|
"grad_norm": 56.84375, |
|
"learning_rate": 9.99505708540515e-07, |
|
"loss": 102.6137, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.254872101208455, |
|
"grad_norm": 56.1875, |
|
"learning_rate": 9.995022029273273e-07, |
|
"loss": 101.4798, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.2566669751606272, |
|
"grad_norm": 53.3125, |
|
"learning_rate": 9.994986973141395e-07, |
|
"loss": 102.564, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.2584618491127994, |
|
"grad_norm": 60.71875, |
|
"learning_rate": 9.994951917009515e-07, |
|
"loss": 101.3267, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.2602567230649716, |
|
"grad_norm": 63.96875, |
|
"learning_rate": 9.994916860877637e-07, |
|
"loss": 101.9713, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.26205159701714387, |
|
"grad_norm": 56.0625, |
|
"learning_rate": 9.994881804745759e-07, |
|
"loss": 102.2705, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.26384647096931607, |
|
"grad_norm": 59.4375, |
|
"learning_rate": 9.99484674861388e-07, |
|
"loss": 101.6555, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.2656413449214883, |
|
"grad_norm": 57.3125, |
|
"learning_rate": 9.994811692482e-07, |
|
"loss": 100.9821, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.2674362188736605, |
|
"grad_norm": 57.25, |
|
"learning_rate": 9.994776636350122e-07, |
|
"loss": 100.6823, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.26923109282583274, |
|
"grad_norm": 52.0, |
|
"learning_rate": 9.994741580218246e-07, |
|
"loss": 101.6088, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.27102596677800495, |
|
"grad_norm": 57.9375, |
|
"learning_rate": 9.994706524086366e-07, |
|
"loss": 102.9636, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.27282084073017715, |
|
"grad_norm": 52.0625, |
|
"learning_rate": 9.994671467954488e-07, |
|
"loss": 100.9283, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.27461571468234935, |
|
"grad_norm": 60.125, |
|
"learning_rate": 9.99463641182261e-07, |
|
"loss": 102.0222, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.2764105886345216, |
|
"grad_norm": 56.1875, |
|
"learning_rate": 9.994601355690732e-07, |
|
"loss": 100.9025, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.2782054625866938, |
|
"grad_norm": 53.375, |
|
"learning_rate": 9.994566299558854e-07, |
|
"loss": 100.8715, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.280000336538866, |
|
"grad_norm": 55.59375, |
|
"learning_rate": 9.994531243426974e-07, |
|
"loss": 102.0768, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.28179521049103823, |
|
"grad_norm": 67.75, |
|
"learning_rate": 9.994496187295096e-07, |
|
"loss": 102.5791, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.2835900844432105, |
|
"grad_norm": 57.21875, |
|
"learning_rate": 9.994461131163218e-07, |
|
"loss": 102.531, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.2853849583953827, |
|
"grad_norm": 60.40625, |
|
"learning_rate": 9.99442607503134e-07, |
|
"loss": 102.0928, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.2871798323475549, |
|
"grad_norm": 53.8125, |
|
"learning_rate": 9.994391018899462e-07, |
|
"loss": 101.2671, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.2889747062997271, |
|
"grad_norm": 55.4375, |
|
"learning_rate": 9.994355962767583e-07, |
|
"loss": 102.0377, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.29076958025189936, |
|
"grad_norm": 57.15625, |
|
"learning_rate": 9.994320906635705e-07, |
|
"loss": 102.7935, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.29256445420407157, |
|
"grad_norm": 59.1875, |
|
"learning_rate": 9.994285850503827e-07, |
|
"loss": 101.0506, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.29435932815624377, |
|
"grad_norm": 59.75, |
|
"learning_rate": 9.994250794371947e-07, |
|
"loss": 101.6031, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.296154202108416, |
|
"grad_norm": 58.28125, |
|
"learning_rate": 9.99421573824007e-07, |
|
"loss": 102.0068, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.29794907606058824, |
|
"grad_norm": 60.875, |
|
"learning_rate": 9.994180682108191e-07, |
|
"loss": 101.5614, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.29974395001276044, |
|
"grad_norm": 52.03125, |
|
"learning_rate": 9.994145625976313e-07, |
|
"loss": 101.9509, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.30153882396493265, |
|
"grad_norm": 51.125, |
|
"learning_rate": 9.994110569844435e-07, |
|
"loss": 100.5649, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.30333369791710485, |
|
"grad_norm": 55.8125, |
|
"learning_rate": 9.994075513712557e-07, |
|
"loss": 102.3571, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.3051285718692771, |
|
"grad_norm": 55.75, |
|
"learning_rate": 9.994040457580679e-07, |
|
"loss": 102.2168, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.3069234458214493, |
|
"grad_norm": 58.46875, |
|
"learning_rate": 9.994005401448799e-07, |
|
"loss": 101.87, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.3087183197736215, |
|
"grad_norm": 52.96875, |
|
"learning_rate": 9.99397034531692e-07, |
|
"loss": 102.0437, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.3105131937257937, |
|
"grad_norm": 57.09375, |
|
"learning_rate": 9.993935289185043e-07, |
|
"loss": 100.8123, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.312308067677966, |
|
"grad_norm": 57.78125, |
|
"learning_rate": 9.993900233053164e-07, |
|
"loss": 101.3681, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.3141029416301382, |
|
"grad_norm": 54.78125, |
|
"learning_rate": 9.993865176921286e-07, |
|
"loss": 102.2902, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.3158978155823104, |
|
"grad_norm": 62.1875, |
|
"learning_rate": 9.993830120789406e-07, |
|
"loss": 100.0665, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.3176926895344826, |
|
"grad_norm": 54.84375, |
|
"learning_rate": 9.993795064657528e-07, |
|
"loss": 101.723, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.31948756348665486, |
|
"grad_norm": 56.65625, |
|
"learning_rate": 9.993760008525652e-07, |
|
"loss": 100.8694, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.32128243743882706, |
|
"grad_norm": 55.96875, |
|
"learning_rate": 9.993724952393772e-07, |
|
"loss": 100.4624, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.32307731139099927, |
|
"grad_norm": 53.96875, |
|
"learning_rate": 9.993689896261894e-07, |
|
"loss": 101.9399, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.32487218534317147, |
|
"grad_norm": 54.375, |
|
"learning_rate": 9.993654840130016e-07, |
|
"loss": 100.8476, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.32666705929534373, |
|
"grad_norm": 56.34375, |
|
"learning_rate": 9.993619783998138e-07, |
|
"loss": 100.3165, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.32846193324751594, |
|
"grad_norm": 56.9375, |
|
"learning_rate": 9.99358472786626e-07, |
|
"loss": 101.3776, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.33025680719968814, |
|
"grad_norm": 55.375, |
|
"learning_rate": 9.99354967173438e-07, |
|
"loss": 101.8946, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.33205168115186035, |
|
"grad_norm": 57.5, |
|
"learning_rate": 9.993514615602502e-07, |
|
"loss": 100.4206, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.33384655510403255, |
|
"grad_norm": 54.25, |
|
"learning_rate": 9.993479559470624e-07, |
|
"loss": 101.984, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.3356414290562048, |
|
"grad_norm": 57.40625, |
|
"learning_rate": 9.993444503338745e-07, |
|
"loss": 101.568, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.337436303008377, |
|
"grad_norm": 52.40625, |
|
"learning_rate": 9.993409447206867e-07, |
|
"loss": 101.8589, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.3392311769605492, |
|
"grad_norm": 53.65625, |
|
"learning_rate": 9.99337439107499e-07, |
|
"loss": 101.0918, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.3410260509127214, |
|
"grad_norm": 57.46875, |
|
"learning_rate": 9.993339334943111e-07, |
|
"loss": 100.8577, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.3428209248648937, |
|
"grad_norm": 55.03125, |
|
"learning_rate": 9.993304278811231e-07, |
|
"loss": 101.4831, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.3446157988170659, |
|
"grad_norm": 58.5625, |
|
"learning_rate": 9.993269222679353e-07, |
|
"loss": 100.9822, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.3464106727692381, |
|
"grad_norm": 53.3125, |
|
"learning_rate": 9.993234166547475e-07, |
|
"loss": 101.5429, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.3482055467214103, |
|
"grad_norm": 55.4375, |
|
"learning_rate": 9.993199110415597e-07, |
|
"loss": 100.7439, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.35000042067358256, |
|
"grad_norm": 54.59375, |
|
"learning_rate": 9.993164054283719e-07, |
|
"loss": 101.6689, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.35179529462575476, |
|
"grad_norm": 52.5625, |
|
"learning_rate": 9.99312899815184e-07, |
|
"loss": 101.7466, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.35359016857792697, |
|
"grad_norm": 56.625, |
|
"learning_rate": 9.993093942019963e-07, |
|
"loss": 101.7688, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.35538504253009917, |
|
"grad_norm": 59.78125, |
|
"learning_rate": 9.993058885888085e-07, |
|
"loss": 103.273, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.35717991648227143, |
|
"grad_norm": 54.53125, |
|
"learning_rate": 9.993023829756205e-07, |
|
"loss": 100.4256, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.35897479043444364, |
|
"grad_norm": 57.40625, |
|
"learning_rate": 9.992988773624327e-07, |
|
"loss": 101.7477, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.35897479043444364, |
|
"eval_loss": 1.5870920419692993, |
|
"eval_runtime": 199.616, |
|
"eval_samples_per_second": 1465.674, |
|
"eval_steps_per_second": 45.803, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.36076966438661584, |
|
"grad_norm": 57.59375, |
|
"learning_rate": 9.992953717492448e-07, |
|
"loss": 102.2176, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.36256453833878804, |
|
"grad_norm": 62.09375, |
|
"learning_rate": 9.99291866136057e-07, |
|
"loss": 100.658, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.3643594122909603, |
|
"grad_norm": 57.59375, |
|
"learning_rate": 9.992883605228692e-07, |
|
"loss": 101.9662, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.3661542862431325, |
|
"grad_norm": 55.3125, |
|
"learning_rate": 9.992848549096812e-07, |
|
"loss": 101.5475, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.3679491601953047, |
|
"grad_norm": 55.15625, |
|
"learning_rate": 9.992813492964936e-07, |
|
"loss": 101.7496, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.3697440341474769, |
|
"grad_norm": 53.78125, |
|
"learning_rate": 9.992778436833056e-07, |
|
"loss": 101.7626, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.3715389080996492, |
|
"grad_norm": 55.53125, |
|
"learning_rate": 9.992743380701178e-07, |
|
"loss": 102.3083, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.3733337820518214, |
|
"grad_norm": 56.65625, |
|
"learning_rate": 9.9927083245693e-07, |
|
"loss": 101.3427, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.3751286560039936, |
|
"grad_norm": 61.375, |
|
"learning_rate": 9.992673268437422e-07, |
|
"loss": 100.4862, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.3769235299561658, |
|
"grad_norm": 84.5, |
|
"learning_rate": 9.992638212305544e-07, |
|
"loss": 101.2625, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.37871840390833805, |
|
"grad_norm": 60.75, |
|
"learning_rate": 9.992603156173664e-07, |
|
"loss": 101.8961, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.38051327786051026, |
|
"grad_norm": 56.8125, |
|
"learning_rate": 9.992568100041786e-07, |
|
"loss": 100.7833, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.38230815181268246, |
|
"grad_norm": 56.0, |
|
"learning_rate": 9.992533043909908e-07, |
|
"loss": 101.3713, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.38410302576485467, |
|
"grad_norm": 54.625, |
|
"learning_rate": 9.99249798777803e-07, |
|
"loss": 101.6192, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.3858978997170269, |
|
"grad_norm": 61.1875, |
|
"learning_rate": 9.992462931646151e-07, |
|
"loss": 101.346, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.38769277366919913, |
|
"grad_norm": 58.40625, |
|
"learning_rate": 9.992427875514273e-07, |
|
"loss": 101.0517, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.38948764762137134, |
|
"grad_norm": 58.78125, |
|
"learning_rate": 9.992392819382395e-07, |
|
"loss": 101.618, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.39128252157354354, |
|
"grad_norm": 55.03125, |
|
"learning_rate": 9.992357763250517e-07, |
|
"loss": 101.7176, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.3930773955257158, |
|
"grad_norm": 58.40625, |
|
"learning_rate": 9.992322707118637e-07, |
|
"loss": 99.9457, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.394872269477888, |
|
"grad_norm": 53.0, |
|
"learning_rate": 9.99228765098676e-07, |
|
"loss": 101.6802, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.3966671434300602, |
|
"grad_norm": 53.40625, |
|
"learning_rate": 9.99225259485488e-07, |
|
"loss": 100.4554, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.3984620173822324, |
|
"grad_norm": 57.6875, |
|
"learning_rate": 9.992217538723003e-07, |
|
"loss": 99.7695, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.4002568913344047, |
|
"grad_norm": 57.1875, |
|
"learning_rate": 9.992182482591125e-07, |
|
"loss": 100.7215, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.4020517652865769, |
|
"grad_norm": 55.21875, |
|
"learning_rate": 9.992147426459247e-07, |
|
"loss": 102.5693, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.4038466392387491, |
|
"grad_norm": 52.4375, |
|
"learning_rate": 9.992112370327369e-07, |
|
"loss": 101.3943, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.4056415131909213, |
|
"grad_norm": 56.9375, |
|
"learning_rate": 9.992077314195489e-07, |
|
"loss": 102.1974, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.40743638714309355, |
|
"grad_norm": 61.71875, |
|
"learning_rate": 9.99204225806361e-07, |
|
"loss": 100.8492, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.40923126109526575, |
|
"grad_norm": 56.9375, |
|
"learning_rate": 9.992007201931732e-07, |
|
"loss": 100.8497, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.41102613504743796, |
|
"grad_norm": 57.71875, |
|
"learning_rate": 9.991972145799854e-07, |
|
"loss": 101.1414, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.41282100899961016, |
|
"grad_norm": 52.625, |
|
"learning_rate": 9.991937089667976e-07, |
|
"loss": 101.3806, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.41461588295178237, |
|
"grad_norm": 54.25, |
|
"learning_rate": 9.991902033536096e-07, |
|
"loss": 101.2679, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.4164107569039546, |
|
"grad_norm": 55.59375, |
|
"learning_rate": 9.99186697740422e-07, |
|
"loss": 100.5695, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.41820563085612683, |
|
"grad_norm": 64.75, |
|
"learning_rate": 9.991831921272342e-07, |
|
"loss": 100.9625, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.42000050480829904, |
|
"grad_norm": 59.78125, |
|
"learning_rate": 9.991796865140462e-07, |
|
"loss": 100.6983, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.42179537876047124, |
|
"grad_norm": 54.46875, |
|
"learning_rate": 9.991761809008584e-07, |
|
"loss": 100.4711, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.4235902527126435, |
|
"grad_norm": 50.9375, |
|
"learning_rate": 9.991726752876706e-07, |
|
"loss": 101.1268, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.4253851266648157, |
|
"grad_norm": 54.40625, |
|
"learning_rate": 9.991691696744828e-07, |
|
"loss": 100.953, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.4271800006169879, |
|
"grad_norm": 60.71875, |
|
"learning_rate": 9.99165664061295e-07, |
|
"loss": 101.4937, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.4289748745691601, |
|
"grad_norm": 59.15625, |
|
"learning_rate": 9.99162158448107e-07, |
|
"loss": 98.9683, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.4307697485213324, |
|
"grad_norm": 57.4375, |
|
"learning_rate": 9.991586528349191e-07, |
|
"loss": 101.705, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.4325646224735046, |
|
"grad_norm": 59.15625, |
|
"learning_rate": 9.991551472217313e-07, |
|
"loss": 100.5508, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.4343594964256768, |
|
"grad_norm": 54.5, |
|
"learning_rate": 9.991516416085435e-07, |
|
"loss": 101.0069, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.436154370377849, |
|
"grad_norm": 56.125, |
|
"learning_rate": 9.991481359953557e-07, |
|
"loss": 101.6237, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.43794924433002125, |
|
"grad_norm": 52.78125, |
|
"learning_rate": 9.99144630382168e-07, |
|
"loss": 100.7629, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.43974411828219345, |
|
"grad_norm": 57.28125, |
|
"learning_rate": 9.991411247689801e-07, |
|
"loss": 99.8311, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.44153899223436566, |
|
"grad_norm": 61.125, |
|
"learning_rate": 9.99137619155792e-07, |
|
"loss": 100.6114, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.44333386618653786, |
|
"grad_norm": 57.1875, |
|
"learning_rate": 9.991341135426043e-07, |
|
"loss": 101.98, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.4451287401387101, |
|
"grad_norm": 59.90625, |
|
"learning_rate": 9.991306079294165e-07, |
|
"loss": 101.4968, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.4469236140908823, |
|
"grad_norm": 63.375, |
|
"learning_rate": 9.991271023162287e-07, |
|
"loss": 101.6943, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.44871848804305453, |
|
"grad_norm": 59.21875, |
|
"learning_rate": 9.991235967030409e-07, |
|
"loss": 100.9488, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.45051336199522674, |
|
"grad_norm": 58.5625, |
|
"learning_rate": 9.99120091089853e-07, |
|
"loss": 100.7646, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.452308235947399, |
|
"grad_norm": 62.40625, |
|
"learning_rate": 9.991165854766653e-07, |
|
"loss": 101.5989, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.4541031098995712, |
|
"grad_norm": 57.3125, |
|
"learning_rate": 9.991130798634775e-07, |
|
"loss": 101.8999, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.4558979838517434, |
|
"grad_norm": 52.6875, |
|
"learning_rate": 9.991095742502894e-07, |
|
"loss": 99.8278, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.4576928578039156, |
|
"grad_norm": 53.46875, |
|
"learning_rate": 9.991060686371016e-07, |
|
"loss": 100.9303, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.45948773175608787, |
|
"grad_norm": 55.21875, |
|
"learning_rate": 9.991025630239138e-07, |
|
"loss": 99.6612, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.4612826057082601, |
|
"grad_norm": 57.4375, |
|
"learning_rate": 9.99099057410726e-07, |
|
"loss": 102.1943, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.4630774796604323, |
|
"grad_norm": 55.15625, |
|
"learning_rate": 9.990955517975382e-07, |
|
"loss": 101.1615, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.4648723536126045, |
|
"grad_norm": 56.90625, |
|
"learning_rate": 9.990920461843504e-07, |
|
"loss": 100.7708, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.46666722756477674, |
|
"grad_norm": 59.8125, |
|
"learning_rate": 9.990885405711626e-07, |
|
"loss": 99.9972, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.46846210151694895, |
|
"grad_norm": 58.53125, |
|
"learning_rate": 9.990850349579746e-07, |
|
"loss": 100.8372, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.47025697546912115, |
|
"grad_norm": 62.8125, |
|
"learning_rate": 9.990815293447868e-07, |
|
"loss": 99.2942, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.47205184942129336, |
|
"grad_norm": 58.0, |
|
"learning_rate": 9.99078023731599e-07, |
|
"loss": 100.7167, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.4738467233734656, |
|
"grad_norm": 56.28125, |
|
"learning_rate": 9.990745181184112e-07, |
|
"loss": 101.028, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.4756415973256378, |
|
"grad_norm": 56.0, |
|
"learning_rate": 9.990710125052234e-07, |
|
"loss": 100.8192, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.47743647127781, |
|
"grad_norm": 55.625, |
|
"learning_rate": 9.990675068920353e-07, |
|
"loss": 100.6052, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.47923134522998223, |
|
"grad_norm": 55.03125, |
|
"learning_rate": 9.990640012788475e-07, |
|
"loss": 100.9563, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.4810262191821545, |
|
"grad_norm": 58.625, |
|
"learning_rate": 9.9906049566566e-07, |
|
"loss": 100.1024, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.4828210931343267, |
|
"grad_norm": 60.96875, |
|
"learning_rate": 9.99056990052472e-07, |
|
"loss": 101.6276, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.4846159670864989, |
|
"grad_norm": 53.53125, |
|
"learning_rate": 9.990534844392841e-07, |
|
"loss": 100.4488, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.4864108410386711, |
|
"grad_norm": 57.78125, |
|
"learning_rate": 9.990499788260963e-07, |
|
"loss": 100.9927, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.48820571499084336, |
|
"grad_norm": 56.125, |
|
"learning_rate": 9.990464732129085e-07, |
|
"loss": 101.7882, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.49000058894301557, |
|
"grad_norm": 55.71875, |
|
"learning_rate": 9.990429675997207e-07, |
|
"loss": 99.6567, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.4917954628951878, |
|
"grad_norm": 57.53125, |
|
"learning_rate": 9.990394619865327e-07, |
|
"loss": 101.4762, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.49359033684736, |
|
"grad_norm": 53.96875, |
|
"learning_rate": 9.990359563733449e-07, |
|
"loss": 100.3731, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.49538521079953224, |
|
"grad_norm": 61.0, |
|
"learning_rate": 9.99032450760157e-07, |
|
"loss": 99.6628, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.49718008475170444, |
|
"grad_norm": 55.875, |
|
"learning_rate": 9.990289451469693e-07, |
|
"loss": 100.9941, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.49897495870387665, |
|
"grad_norm": 53.40625, |
|
"learning_rate": 9.990254395337815e-07, |
|
"loss": 101.2516, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.5007698326560489, |
|
"grad_norm": 62.5625, |
|
"learning_rate": 9.990219339205937e-07, |
|
"loss": 100.9763, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.5025647066082211, |
|
"grad_norm": 54.78125, |
|
"learning_rate": 9.990184283074059e-07, |
|
"loss": 99.8499, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.5043595805603933, |
|
"grad_norm": 59.125, |
|
"learning_rate": 9.990149226942178e-07, |
|
"loss": 100.2553, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.5061544545125655, |
|
"grad_norm": 54.625, |
|
"learning_rate": 9.9901141708103e-07, |
|
"loss": 100.0908, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.5079493284647377, |
|
"grad_norm": 53.875, |
|
"learning_rate": 9.990079114678422e-07, |
|
"loss": 99.8332, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.50974420241691, |
|
"grad_norm": 67.3125, |
|
"learning_rate": 9.990044058546544e-07, |
|
"loss": 101.4204, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.5115390763690821, |
|
"grad_norm": 57.8125, |
|
"learning_rate": 9.990009002414666e-07, |
|
"loss": 100.5515, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.5133339503212544, |
|
"grad_norm": 55.96875, |
|
"learning_rate": 9.989973946282786e-07, |
|
"loss": 100.3896, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.5151288242734267, |
|
"grad_norm": 57.8125, |
|
"learning_rate": 9.98993889015091e-07, |
|
"loss": 101.921, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.5169236982255988, |
|
"grad_norm": 51.40625, |
|
"learning_rate": 9.989903834019032e-07, |
|
"loss": 100.1915, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.5187185721777711, |
|
"grad_norm": 57.25, |
|
"learning_rate": 9.989868777887152e-07, |
|
"loss": 100.3448, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.5205134461299432, |
|
"grad_norm": 52.40625, |
|
"learning_rate": 9.989833721755274e-07, |
|
"loss": 100.6079, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.5223083200821155, |
|
"grad_norm": 53.28125, |
|
"learning_rate": 9.989798665623396e-07, |
|
"loss": 101.4967, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.5241031940342877, |
|
"grad_norm": 54.40625, |
|
"learning_rate": 9.989763609491518e-07, |
|
"loss": 100.2131, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.5258980679864599, |
|
"grad_norm": 60.59375, |
|
"learning_rate": 9.98972855335964e-07, |
|
"loss": 100.2743, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.5276929419386321, |
|
"grad_norm": 55.5625, |
|
"learning_rate": 9.98969349722776e-07, |
|
"loss": 100.4026, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.5294878158908044, |
|
"grad_norm": 60.125, |
|
"learning_rate": 9.989658441095881e-07, |
|
"loss": 101.1321, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.5312826898429766, |
|
"grad_norm": 56.125, |
|
"learning_rate": 9.989623384964005e-07, |
|
"loss": 100.6127, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.5330775637951488, |
|
"grad_norm": 56.0, |
|
"learning_rate": 9.989588328832125e-07, |
|
"loss": 100.822, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.534872437747321, |
|
"grad_norm": 58.65625, |
|
"learning_rate": 9.989553272700247e-07, |
|
"loss": 100.8803, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.5366673116994932, |
|
"grad_norm": 52.875, |
|
"learning_rate": 9.98951821656837e-07, |
|
"loss": 100.7196, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.5384621856516655, |
|
"grad_norm": 57.3125, |
|
"learning_rate": 9.98948316043649e-07, |
|
"loss": 100.049, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.5402570596038376, |
|
"grad_norm": 54.96875, |
|
"learning_rate": 9.989448104304613e-07, |
|
"loss": 99.7241, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.5420519335560099, |
|
"grad_norm": 57.15625, |
|
"learning_rate": 9.989413048172733e-07, |
|
"loss": 100.1751, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.5438468075081822, |
|
"grad_norm": 56.375, |
|
"learning_rate": 9.989377992040855e-07, |
|
"loss": 100.9655, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.5456416814603543, |
|
"grad_norm": 52.09375, |
|
"learning_rate": 9.989342935908977e-07, |
|
"loss": 101.2898, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.5474365554125266, |
|
"grad_norm": 54.625, |
|
"learning_rate": 9.989307879777099e-07, |
|
"loss": 100.3283, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.5492314293646987, |
|
"grad_norm": 55.21875, |
|
"learning_rate": 9.98927282364522e-07, |
|
"loss": 99.0049, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.551026303316871, |
|
"grad_norm": 55.4375, |
|
"learning_rate": 9.989237767513343e-07, |
|
"loss": 100.6654, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.5528211772690432, |
|
"grad_norm": 55.96875, |
|
"learning_rate": 9.989202711381464e-07, |
|
"loss": 100.3892, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.5546160512212154, |
|
"grad_norm": 61.875, |
|
"learning_rate": 9.989167655249584e-07, |
|
"loss": 99.8484, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.5564109251733876, |
|
"grad_norm": 49.375, |
|
"learning_rate": 9.989132599117706e-07, |
|
"loss": 100.2157, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.5582057991255599, |
|
"grad_norm": 53.5625, |
|
"learning_rate": 9.989097542985828e-07, |
|
"loss": 99.1748, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.560000673077732, |
|
"grad_norm": 55.25, |
|
"learning_rate": 9.98906248685395e-07, |
|
"loss": 100.8326, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.5617955470299043, |
|
"grad_norm": 57.71875, |
|
"learning_rate": 9.989027430722072e-07, |
|
"loss": 100.5236, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.5635904209820765, |
|
"grad_norm": 56.6875, |
|
"learning_rate": 9.988992374590194e-07, |
|
"loss": 99.4909, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.5653852949342487, |
|
"grad_norm": 56.90625, |
|
"learning_rate": 9.988957318458316e-07, |
|
"loss": 99.4755, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.567180168886421, |
|
"grad_norm": 56.375, |
|
"learning_rate": 9.988922262326438e-07, |
|
"loss": 99.6661, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.5689750428385931, |
|
"grad_norm": 56.875, |
|
"learning_rate": 9.988887206194558e-07, |
|
"loss": 100.5558, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.5707699167907654, |
|
"grad_norm": 64.125, |
|
"learning_rate": 9.98885215006268e-07, |
|
"loss": 100.403, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.5725647907429376, |
|
"grad_norm": 55.53125, |
|
"learning_rate": 9.988817093930802e-07, |
|
"loss": 101.0716, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.5743596646951098, |
|
"grad_norm": 54.625, |
|
"learning_rate": 9.988782037798924e-07, |
|
"loss": 99.6999, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.576154538647282, |
|
"grad_norm": 63.03125, |
|
"learning_rate": 9.988746981667045e-07, |
|
"loss": 100.4395, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.5779494125994542, |
|
"grad_norm": 59.0, |
|
"learning_rate": 9.988711925535165e-07, |
|
"loss": 101.8542, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.5797442865516265, |
|
"grad_norm": 58.46875, |
|
"learning_rate": 9.98867686940329e-07, |
|
"loss": 99.4703, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.5815391605037987, |
|
"grad_norm": 54.3125, |
|
"learning_rate": 9.98864181327141e-07, |
|
"loss": 101.0656, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.5833340344559709, |
|
"grad_norm": 54.03125, |
|
"learning_rate": 9.988606757139531e-07, |
|
"loss": 100.0247, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.5851289084081431, |
|
"grad_norm": 57.0625, |
|
"learning_rate": 9.988571701007653e-07, |
|
"loss": 100.5174, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.5869237823603153, |
|
"grad_norm": 56.1875, |
|
"learning_rate": 9.988536644875775e-07, |
|
"loss": 100.5157, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.5887186563124875, |
|
"grad_norm": 51.6875, |
|
"learning_rate": 9.988501588743897e-07, |
|
"loss": 99.3698, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.5905135302646598, |
|
"grad_norm": 60.5625, |
|
"learning_rate": 9.988466532612017e-07, |
|
"loss": 100.1518, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.592308404216832, |
|
"grad_norm": 59.40625, |
|
"learning_rate": 9.988431476480139e-07, |
|
"loss": 100.5745, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.5941032781690042, |
|
"grad_norm": 54.5, |
|
"learning_rate": 9.98839642034826e-07, |
|
"loss": 101.8106, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.5958981521211765, |
|
"grad_norm": 61.03125, |
|
"learning_rate": 9.988361364216383e-07, |
|
"loss": 100.4201, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.5976930260733486, |
|
"grad_norm": 57.15625, |
|
"learning_rate": 9.988326308084505e-07, |
|
"loss": 100.6724, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.5994879000255209, |
|
"grad_norm": 51.03125, |
|
"learning_rate": 9.988291251952626e-07, |
|
"loss": 100.1729, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.601282773977693, |
|
"grad_norm": 57.71875, |
|
"learning_rate": 9.988256195820748e-07, |
|
"loss": 100.661, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.6030776479298653, |
|
"grad_norm": 57.25, |
|
"learning_rate": 9.98822113968887e-07, |
|
"loss": 101.6807, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.6048725218820376, |
|
"grad_norm": 56.625, |
|
"learning_rate": 9.98818608355699e-07, |
|
"loss": 101.2, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.6066673958342097, |
|
"grad_norm": 60.5, |
|
"learning_rate": 9.988151027425112e-07, |
|
"loss": 99.7801, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.608462269786382, |
|
"grad_norm": 51.375, |
|
"learning_rate": 9.988115971293234e-07, |
|
"loss": 100.2877, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.6102571437385542, |
|
"grad_norm": 53.40625, |
|
"learning_rate": 9.988080915161356e-07, |
|
"loss": 99.2708, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.6120520176907264, |
|
"grad_norm": 51.84375, |
|
"learning_rate": 9.988045859029478e-07, |
|
"loss": 100.1983, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.6138468916428986, |
|
"grad_norm": 57.28125, |
|
"learning_rate": 9.9880108028976e-07, |
|
"loss": 99.6957, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.6156417655950708, |
|
"grad_norm": 60.71875, |
|
"learning_rate": 9.987975746765722e-07, |
|
"loss": 100.4678, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.617436639547243, |
|
"grad_norm": 56.84375, |
|
"learning_rate": 9.987940690633842e-07, |
|
"loss": 101.1054, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.6192315134994153, |
|
"grad_norm": 55.1875, |
|
"learning_rate": 9.987905634501964e-07, |
|
"loss": 100.2326, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.6210263874515874, |
|
"grad_norm": 59.375, |
|
"learning_rate": 9.987870578370086e-07, |
|
"loss": 101.0535, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.6228212614037597, |
|
"grad_norm": 58.40625, |
|
"learning_rate": 9.987835522238207e-07, |
|
"loss": 100.6926, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.624616135355932, |
|
"grad_norm": 60.25, |
|
"learning_rate": 9.98780046610633e-07, |
|
"loss": 101.5863, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.6264110093081041, |
|
"grad_norm": 61.15625, |
|
"learning_rate": 9.98776540997445e-07, |
|
"loss": 100.7781, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.6282058832602764, |
|
"grad_norm": 55.96875, |
|
"learning_rate": 9.987730353842573e-07, |
|
"loss": 101.0874, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.6300007572124485, |
|
"grad_norm": 59.59375, |
|
"learning_rate": 9.987695297710695e-07, |
|
"loss": 100.5733, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.6317956311646208, |
|
"grad_norm": 55.71875, |
|
"learning_rate": 9.987660241578815e-07, |
|
"loss": 100.3865, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.633590505116793, |
|
"grad_norm": 60.1875, |
|
"learning_rate": 9.987625185446937e-07, |
|
"loss": 100.3602, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.6353853790689652, |
|
"grad_norm": 56.625, |
|
"learning_rate": 9.98759012931506e-07, |
|
"loss": 99.8301, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.6371802530211375, |
|
"grad_norm": 59.84375, |
|
"learning_rate": 9.98755507318318e-07, |
|
"loss": 101.2679, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.6389751269733097, |
|
"grad_norm": 53.71875, |
|
"learning_rate": 9.987520017051303e-07, |
|
"loss": 100.4626, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.6407700009254819, |
|
"grad_norm": 65.5625, |
|
"learning_rate": 9.987484960919423e-07, |
|
"loss": 98.8636, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.6425648748776541, |
|
"grad_norm": 59.15625, |
|
"learning_rate": 9.987449904787545e-07, |
|
"loss": 99.7406, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.6443597488298263, |
|
"grad_norm": 54.8125, |
|
"learning_rate": 9.987414848655667e-07, |
|
"loss": 99.1887, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.6461546227819985, |
|
"grad_norm": 56.875, |
|
"learning_rate": 9.987379792523788e-07, |
|
"loss": 101.342, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.6479494967341708, |
|
"grad_norm": 55.6875, |
|
"learning_rate": 9.98734473639191e-07, |
|
"loss": 99.9434, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.6497443706863429, |
|
"grad_norm": 60.59375, |
|
"learning_rate": 9.987309680260032e-07, |
|
"loss": 99.4815, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.6515392446385152, |
|
"grad_norm": 51.53125, |
|
"learning_rate": 9.987274624128154e-07, |
|
"loss": 99.7587, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.6533341185906875, |
|
"grad_norm": 60.0, |
|
"learning_rate": 9.987239567996274e-07, |
|
"loss": 101.2723, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.6551289925428596, |
|
"grad_norm": 63.8125, |
|
"learning_rate": 9.987204511864396e-07, |
|
"loss": 100.9252, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.6569238664950319, |
|
"grad_norm": 56.75, |
|
"learning_rate": 9.987169455732518e-07, |
|
"loss": 100.5552, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.658718740447204, |
|
"grad_norm": 59.71875, |
|
"learning_rate": 9.98713439960064e-07, |
|
"loss": 100.8731, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.6605136143993763, |
|
"grad_norm": 51.9375, |
|
"learning_rate": 9.987099343468762e-07, |
|
"loss": 100.687, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.6623084883515485, |
|
"grad_norm": 56.96875, |
|
"learning_rate": 9.987064287336884e-07, |
|
"loss": 99.6145, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.6641033623037207, |
|
"grad_norm": 57.1875, |
|
"learning_rate": 9.987029231205006e-07, |
|
"loss": 99.7202, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.665898236255893, |
|
"grad_norm": 56.03125, |
|
"learning_rate": 9.986994175073128e-07, |
|
"loss": 100.4039, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.6676931102080651, |
|
"grad_norm": 57.875, |
|
"learning_rate": 9.986959118941248e-07, |
|
"loss": 100.2563, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.6694879841602374, |
|
"grad_norm": 56.15625, |
|
"learning_rate": 9.98692406280937e-07, |
|
"loss": 100.1503, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.6712828581124096, |
|
"grad_norm": 58.0625, |
|
"learning_rate": 9.986889006677491e-07, |
|
"loss": 100.3172, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.6730777320645818, |
|
"grad_norm": 54.53125, |
|
"learning_rate": 9.986853950545613e-07, |
|
"loss": 99.0521, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.674872606016754, |
|
"grad_norm": 67.8125, |
|
"learning_rate": 9.986818894413735e-07, |
|
"loss": 101.304, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.6766674799689263, |
|
"grad_norm": 53.90625, |
|
"learning_rate": 9.986783838281857e-07, |
|
"loss": 99.0437, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.6784623539210984, |
|
"grad_norm": 53.40625, |
|
"learning_rate": 9.98674878214998e-07, |
|
"loss": 100.0527, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.6802572278732707, |
|
"grad_norm": 55.90625, |
|
"learning_rate": 9.9867137260181e-07, |
|
"loss": 100.6432, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.6820521018254428, |
|
"grad_norm": 53.5625, |
|
"learning_rate": 9.98667866988622e-07, |
|
"loss": 100.5706, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.6838469757776151, |
|
"grad_norm": 61.46875, |
|
"learning_rate": 9.986643613754343e-07, |
|
"loss": 99.6143, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.6856418497297874, |
|
"grad_norm": 57.9375, |
|
"learning_rate": 9.986608557622465e-07, |
|
"loss": 100.0116, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.6874367236819595, |
|
"grad_norm": 54.40625, |
|
"learning_rate": 9.986573501490587e-07, |
|
"loss": 100.6161, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.6892315976341318, |
|
"grad_norm": 54.6875, |
|
"learning_rate": 9.986538445358707e-07, |
|
"loss": 100.3408, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.691026471586304, |
|
"grad_norm": 59.40625, |
|
"learning_rate": 9.986503389226829e-07, |
|
"loss": 99.8375, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.6928213455384762, |
|
"grad_norm": 55.28125, |
|
"learning_rate": 9.986468333094953e-07, |
|
"loss": 100.2615, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.6946162194906484, |
|
"grad_norm": 62.71875, |
|
"learning_rate": 9.986433276963072e-07, |
|
"loss": 101.2431, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.6964110934428206, |
|
"grad_norm": 56.25, |
|
"learning_rate": 9.986398220831194e-07, |
|
"loss": 100.8722, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.6982059673949929, |
|
"grad_norm": 56.28125, |
|
"learning_rate": 9.986363164699316e-07, |
|
"loss": 100.0757, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.7000008413471651, |
|
"grad_norm": 57.1875, |
|
"learning_rate": 9.986328108567438e-07, |
|
"loss": 99.6734, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.7017957152993373, |
|
"grad_norm": 55.15625, |
|
"learning_rate": 9.98629305243556e-07, |
|
"loss": 100.0005, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.7035905892515095, |
|
"grad_norm": 58.0, |
|
"learning_rate": 9.98625799630368e-07, |
|
"loss": 99.7119, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.7053854632036818, |
|
"grad_norm": 57.15625, |
|
"learning_rate": 9.986222940171802e-07, |
|
"loss": 99.9743, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.7071803371558539, |
|
"grad_norm": 53.21875, |
|
"learning_rate": 9.986187884039924e-07, |
|
"loss": 99.9378, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.7089752111080262, |
|
"grad_norm": 55.46875, |
|
"learning_rate": 9.986152827908046e-07, |
|
"loss": 101.0054, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.7107700850601983, |
|
"grad_norm": 53.09375, |
|
"learning_rate": 9.986117771776168e-07, |
|
"loss": 100.5901, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.7125649590123706, |
|
"grad_norm": 58.6875, |
|
"learning_rate": 9.98608271564429e-07, |
|
"loss": 100.1371, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.7143598329645429, |
|
"grad_norm": 55.59375, |
|
"learning_rate": 9.986047659512412e-07, |
|
"loss": 100.4305, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.716154706916715, |
|
"grad_norm": 55.875, |
|
"learning_rate": 9.986012603380532e-07, |
|
"loss": 99.389, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.7179495808688873, |
|
"grad_norm": 60.21875, |
|
"learning_rate": 9.985977547248653e-07, |
|
"loss": 100.2918, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.7179495808688873, |
|
"eval_loss": 1.561510443687439, |
|
"eval_runtime": 193.4272, |
|
"eval_samples_per_second": 1512.569, |
|
"eval_steps_per_second": 47.268, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.7197444548210595, |
|
"grad_norm": 60.78125, |
|
"learning_rate": 9.985942491116775e-07, |
|
"loss": 99.0898, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.7215393287732317, |
|
"grad_norm": 61.78125, |
|
"learning_rate": 9.985907434984897e-07, |
|
"loss": 99.2981, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.7233342027254039, |
|
"grad_norm": 57.75, |
|
"learning_rate": 9.98587237885302e-07, |
|
"loss": 99.7238, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.7251290766775761, |
|
"grad_norm": 53.78125, |
|
"learning_rate": 9.98583732272114e-07, |
|
"loss": 99.6891, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.7269239506297483, |
|
"grad_norm": 59.375, |
|
"learning_rate": 9.985802266589263e-07, |
|
"loss": 100.155, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.7287188245819206, |
|
"grad_norm": 57.375, |
|
"learning_rate": 9.985767210457385e-07, |
|
"loss": 98.5343, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.7305136985340928, |
|
"grad_norm": 57.9375, |
|
"learning_rate": 9.985732154325505e-07, |
|
"loss": 101.0483, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.732308572486265, |
|
"grad_norm": 61.5, |
|
"learning_rate": 9.985697098193627e-07, |
|
"loss": 99.708, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.7341034464384373, |
|
"grad_norm": 52.6875, |
|
"learning_rate": 9.985662042061749e-07, |
|
"loss": 100.2094, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.7358983203906094, |
|
"grad_norm": 54.34375, |
|
"learning_rate": 9.98562698592987e-07, |
|
"loss": 99.3119, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.7376931943427817, |
|
"grad_norm": 52.625, |
|
"learning_rate": 9.985591929797993e-07, |
|
"loss": 98.8817, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.7394880682949538, |
|
"grad_norm": 58.71875, |
|
"learning_rate": 9.985556873666113e-07, |
|
"loss": 100.0632, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.7412829422471261, |
|
"grad_norm": 54.28125, |
|
"learning_rate": 9.985521817534234e-07, |
|
"loss": 100.4009, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.7430778161992984, |
|
"grad_norm": 62.46875, |
|
"learning_rate": 9.985486761402356e-07, |
|
"loss": 99.6302, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.7448726901514705, |
|
"grad_norm": 54.90625, |
|
"learning_rate": 9.985451705270478e-07, |
|
"loss": 98.9977, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.7466675641036428, |
|
"grad_norm": 57.3125, |
|
"learning_rate": 9.9854166491386e-07, |
|
"loss": 99.6041, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.7484624380558149, |
|
"grad_norm": 55.90625, |
|
"learning_rate": 9.985381593006722e-07, |
|
"loss": 99.9616, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.7502573120079872, |
|
"grad_norm": 55.5625, |
|
"learning_rate": 9.985346536874844e-07, |
|
"loss": 99.9101, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.7520521859601594, |
|
"grad_norm": 58.375, |
|
"learning_rate": 9.985311480742964e-07, |
|
"loss": 99.3676, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.7538470599123316, |
|
"grad_norm": 62.0, |
|
"learning_rate": 9.985276424611086e-07, |
|
"loss": 100.3315, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.7556419338645038, |
|
"grad_norm": 55.375, |
|
"learning_rate": 9.985241368479208e-07, |
|
"loss": 100.3022, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.7574368078166761, |
|
"grad_norm": 56.1875, |
|
"learning_rate": 9.98520631234733e-07, |
|
"loss": 99.8273, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.7592316817688483, |
|
"grad_norm": 55.15625, |
|
"learning_rate": 9.985171256215452e-07, |
|
"loss": 99.8944, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.7610265557210205, |
|
"grad_norm": 57.25, |
|
"learning_rate": 9.985136200083574e-07, |
|
"loss": 99.9793, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.7628214296731927, |
|
"grad_norm": 57.65625, |
|
"learning_rate": 9.985101143951696e-07, |
|
"loss": 99.6139, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.7646163036253649, |
|
"grad_norm": 58.09375, |
|
"learning_rate": 9.985066087819818e-07, |
|
"loss": 99.2432, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.7664111775775372, |
|
"grad_norm": 57.84375, |
|
"learning_rate": 9.985031031687937e-07, |
|
"loss": 100.9006, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.7682060515297093, |
|
"grad_norm": 55.875, |
|
"learning_rate": 9.98499597555606e-07, |
|
"loss": 100.518, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.7700009254818816, |
|
"grad_norm": 60.15625, |
|
"learning_rate": 9.984960919424181e-07, |
|
"loss": 99.9677, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.7717957994340539, |
|
"grad_norm": 60.21875, |
|
"learning_rate": 9.984925863292303e-07, |
|
"loss": 100.5463, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.773590673386226, |
|
"grad_norm": 59.5, |
|
"learning_rate": 9.984890807160425e-07, |
|
"loss": 99.2726, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.7753855473383983, |
|
"grad_norm": 63.03125, |
|
"learning_rate": 9.984855751028547e-07, |
|
"loss": 99.198, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.7771804212905704, |
|
"grad_norm": 55.84375, |
|
"learning_rate": 9.98482069489667e-07, |
|
"loss": 99.174, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.7789752952427427, |
|
"grad_norm": 62.21875, |
|
"learning_rate": 9.98478563876479e-07, |
|
"loss": 99.7285, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.7807701691949149, |
|
"grad_norm": 54.125, |
|
"learning_rate": 9.98475058263291e-07, |
|
"loss": 97.6992, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.7825650431470871, |
|
"grad_norm": 64.3125, |
|
"learning_rate": 9.984715526501033e-07, |
|
"loss": 99.3388, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.7843599170992593, |
|
"grad_norm": 56.1875, |
|
"learning_rate": 9.984680470369155e-07, |
|
"loss": 98.7792, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.7861547910514316, |
|
"grad_norm": 57.90625, |
|
"learning_rate": 9.984645414237277e-07, |
|
"loss": 100.1827, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.7879496650036037, |
|
"grad_norm": 56.09375, |
|
"learning_rate": 9.984610358105396e-07, |
|
"loss": 99.409, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.789744538955776, |
|
"grad_norm": 54.78125, |
|
"learning_rate": 9.984575301973518e-07, |
|
"loss": 99.461, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.7915394129079482, |
|
"grad_norm": 56.3125, |
|
"learning_rate": 9.984540245841642e-07, |
|
"loss": 99.5, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.7933342868601204, |
|
"grad_norm": 58.5625, |
|
"learning_rate": 9.984505189709762e-07, |
|
"loss": 99.3468, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.7951291608122927, |
|
"grad_norm": 65.0625, |
|
"learning_rate": 9.984470133577884e-07, |
|
"loss": 99.6309, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.7969240347644648, |
|
"grad_norm": 59.8125, |
|
"learning_rate": 9.984435077446006e-07, |
|
"loss": 99.3963, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.7987189087166371, |
|
"grad_norm": 60.28125, |
|
"learning_rate": 9.984400021314128e-07, |
|
"loss": 98.8352, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.8005137826688093, |
|
"grad_norm": 57.1875, |
|
"learning_rate": 9.98436496518225e-07, |
|
"loss": 99.6084, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.8023086566209815, |
|
"grad_norm": 58.3125, |
|
"learning_rate": 9.98432990905037e-07, |
|
"loss": 99.5552, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.8041035305731538, |
|
"grad_norm": 54.28125, |
|
"learning_rate": 9.984294852918492e-07, |
|
"loss": 99.7207, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.8058984045253259, |
|
"grad_norm": 55.875, |
|
"learning_rate": 9.984259796786614e-07, |
|
"loss": 100.2163, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.8076932784774982, |
|
"grad_norm": 58.46875, |
|
"learning_rate": 9.984224740654736e-07, |
|
"loss": 100.1001, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.8094881524296704, |
|
"grad_norm": 52.09375, |
|
"learning_rate": 9.984189684522858e-07, |
|
"loss": 100.1762, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.8112830263818426, |
|
"grad_norm": 58.0, |
|
"learning_rate": 9.98415462839098e-07, |
|
"loss": 99.485, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.8130779003340148, |
|
"grad_norm": 53.34375, |
|
"learning_rate": 9.984119572259102e-07, |
|
"loss": 99.2043, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.8148727742861871, |
|
"grad_norm": 55.84375, |
|
"learning_rate": 9.984084516127223e-07, |
|
"loss": 100.9975, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.8166676482383592, |
|
"grad_norm": 53.1875, |
|
"learning_rate": 9.984049459995343e-07, |
|
"loss": 100.2588, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.8184625221905315, |
|
"grad_norm": 59.15625, |
|
"learning_rate": 9.984014403863465e-07, |
|
"loss": 101.2972, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.8202573961427037, |
|
"grad_norm": 56.09375, |
|
"learning_rate": 9.983979347731587e-07, |
|
"loss": 99.6443, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.8220522700948759, |
|
"grad_norm": 57.03125, |
|
"learning_rate": 9.98394429159971e-07, |
|
"loss": 99.9172, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.8238471440470482, |
|
"grad_norm": 57.8125, |
|
"learning_rate": 9.983909235467831e-07, |
|
"loss": 99.9183, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.8256420179992203, |
|
"grad_norm": 58.59375, |
|
"learning_rate": 9.983874179335953e-07, |
|
"loss": 101.1427, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.8274368919513926, |
|
"grad_norm": 56.375, |
|
"learning_rate": 9.983839123204075e-07, |
|
"loss": 99.5511, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.8292317659035647, |
|
"grad_norm": 54.84375, |
|
"learning_rate": 9.983804067072195e-07, |
|
"loss": 98.27, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.831026639855737, |
|
"grad_norm": 55.1875, |
|
"learning_rate": 9.983769010940317e-07, |
|
"loss": 99.897, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.8328215138079093, |
|
"grad_norm": 55.59375, |
|
"learning_rate": 9.983733954808439e-07, |
|
"loss": 100.4762, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.8346163877600814, |
|
"grad_norm": 53.53125, |
|
"learning_rate": 9.98369889867656e-07, |
|
"loss": 99.019, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.8364112617122537, |
|
"grad_norm": 55.125, |
|
"learning_rate": 9.983663842544683e-07, |
|
"loss": 99.7545, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.8382061356644259, |
|
"grad_norm": 54.75, |
|
"learning_rate": 9.983628786412802e-07, |
|
"loss": 98.6123, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.8400010096165981, |
|
"grad_norm": 52.1875, |
|
"learning_rate": 9.983593730280926e-07, |
|
"loss": 99.1614, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.8417958835687703, |
|
"grad_norm": 57.0625, |
|
"learning_rate": 9.983558674149048e-07, |
|
"loss": 101.187, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.8435907575209425, |
|
"grad_norm": 54.0, |
|
"learning_rate": 9.983523618017168e-07, |
|
"loss": 99.3741, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.8453856314731147, |
|
"grad_norm": 54.65625, |
|
"learning_rate": 9.98348856188529e-07, |
|
"loss": 98.8455, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.847180505425287, |
|
"grad_norm": 56.03125, |
|
"learning_rate": 9.983453505753412e-07, |
|
"loss": 100.3109, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.8489753793774591, |
|
"grad_norm": 56.90625, |
|
"learning_rate": 9.983418449621534e-07, |
|
"loss": 99.0733, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.8507702533296314, |
|
"grad_norm": 60.03125, |
|
"learning_rate": 9.983383393489656e-07, |
|
"loss": 99.2901, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.8525651272818037, |
|
"grad_norm": 60.375, |
|
"learning_rate": 9.983348337357776e-07, |
|
"loss": 99.2978, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.8543600012339758, |
|
"grad_norm": 52.1875, |
|
"learning_rate": 9.983313281225898e-07, |
|
"loss": 100.2887, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.8561548751861481, |
|
"grad_norm": 58.6875, |
|
"learning_rate": 9.98327822509402e-07, |
|
"loss": 99.9115, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.8579497491383202, |
|
"grad_norm": 56.03125, |
|
"learning_rate": 9.983243168962142e-07, |
|
"loss": 98.9636, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.8597446230904925, |
|
"grad_norm": 64.5625, |
|
"learning_rate": 9.983208112830264e-07, |
|
"loss": 98.904, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.8615394970426647, |
|
"grad_norm": 60.09375, |
|
"learning_rate": 9.983173056698386e-07, |
|
"loss": 99.8028, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.8633343709948369, |
|
"grad_norm": 59.9375, |
|
"learning_rate": 9.983138000566507e-07, |
|
"loss": 99.5853, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.8651292449470092, |
|
"grad_norm": 54.34375, |
|
"learning_rate": 9.983102944434627e-07, |
|
"loss": 98.7931, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.8669241188991814, |
|
"grad_norm": 53.03125, |
|
"learning_rate": 9.98306788830275e-07, |
|
"loss": 99.695, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.8687189928513536, |
|
"grad_norm": 54.34375, |
|
"learning_rate": 9.983032832170871e-07, |
|
"loss": 99.7535, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.8705138668035258, |
|
"grad_norm": 58.84375, |
|
"learning_rate": 9.982997776038993e-07, |
|
"loss": 99.5457, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.872308740755698, |
|
"grad_norm": 54.0, |
|
"learning_rate": 9.982962719907115e-07, |
|
"loss": 99.1976, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.8741036147078702, |
|
"grad_norm": 52.1875, |
|
"learning_rate": 9.982927663775237e-07, |
|
"loss": 98.987, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.8758984886600425, |
|
"grad_norm": 55.0625, |
|
"learning_rate": 9.982892607643359e-07, |
|
"loss": 99.2621, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.8776933626122146, |
|
"grad_norm": 53.4375, |
|
"learning_rate": 9.98285755151148e-07, |
|
"loss": 99.8875, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.8794882365643869, |
|
"grad_norm": 55.875, |
|
"learning_rate": 9.9828224953796e-07, |
|
"loss": 98.8774, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.8812831105165592, |
|
"grad_norm": 59.78125, |
|
"learning_rate": 9.982787439247723e-07, |
|
"loss": 100.7272, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.8830779844687313, |
|
"grad_norm": 55.21875, |
|
"learning_rate": 9.982752383115845e-07, |
|
"loss": 99.0058, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.8848728584209036, |
|
"grad_norm": 54.8125, |
|
"learning_rate": 9.982717326983967e-07, |
|
"loss": 99.3883, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.8866677323730757, |
|
"grad_norm": 55.8125, |
|
"learning_rate": 9.982682270852088e-07, |
|
"loss": 98.9662, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.888462606325248, |
|
"grad_norm": 52.90625, |
|
"learning_rate": 9.98264721472021e-07, |
|
"loss": 100.7881, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.8902574802774202, |
|
"grad_norm": 54.71875, |
|
"learning_rate": 9.982612158588332e-07, |
|
"loss": 98.7615, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.8920523542295924, |
|
"grad_norm": 53.65625, |
|
"learning_rate": 9.982577102456452e-07, |
|
"loss": 99.4831, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 0.8938472281817647, |
|
"grad_norm": 58.78125, |
|
"learning_rate": 9.982542046324574e-07, |
|
"loss": 98.5362, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.8956421021339369, |
|
"grad_norm": 54.0625, |
|
"learning_rate": 9.982506990192696e-07, |
|
"loss": 98.9379, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 0.8974369760861091, |
|
"grad_norm": 52.78125, |
|
"learning_rate": 9.982471934060818e-07, |
|
"loss": 98.3786, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.8992318500382813, |
|
"grad_norm": 57.09375, |
|
"learning_rate": 9.98243687792894e-07, |
|
"loss": 99.2205, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 0.9010267239904535, |
|
"grad_norm": 53.71875, |
|
"learning_rate": 9.98240182179706e-07, |
|
"loss": 99.4672, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 0.9028215979426257, |
|
"grad_norm": 57.40625, |
|
"learning_rate": 9.982366765665182e-07, |
|
"loss": 99.5186, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 0.904616471894798, |
|
"grad_norm": 58.71875, |
|
"learning_rate": 9.982331709533304e-07, |
|
"loss": 99.584, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.9064113458469701, |
|
"grad_norm": 55.0625, |
|
"learning_rate": 9.982296653401426e-07, |
|
"loss": 100.6763, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.9082062197991424, |
|
"grad_norm": 54.28125, |
|
"learning_rate": 9.982261597269548e-07, |
|
"loss": 98.7888, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 0.9100010937513147, |
|
"grad_norm": 54.8125, |
|
"learning_rate": 9.98222654113767e-07, |
|
"loss": 98.7003, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 0.9117959677034868, |
|
"grad_norm": 58.28125, |
|
"learning_rate": 9.982191485005791e-07, |
|
"loss": 99.602, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 0.9135908416556591, |
|
"grad_norm": 56.28125, |
|
"learning_rate": 9.982156428873913e-07, |
|
"loss": 99.339, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 0.9153857156078312, |
|
"grad_norm": 55.15625, |
|
"learning_rate": 9.982121372742033e-07, |
|
"loss": 99.4155, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.9171805895600035, |
|
"grad_norm": 57.03125, |
|
"learning_rate": 9.982086316610155e-07, |
|
"loss": 99.6862, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 0.9189754635121757, |
|
"grad_norm": 56.96875, |
|
"learning_rate": 9.982051260478277e-07, |
|
"loss": 99.5966, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 0.9207703374643479, |
|
"grad_norm": 55.6875, |
|
"learning_rate": 9.9820162043464e-07, |
|
"loss": 100.2231, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 0.9225652114165201, |
|
"grad_norm": 54.78125, |
|
"learning_rate": 9.98198114821452e-07, |
|
"loss": 99.5804, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 0.9243600853686923, |
|
"grad_norm": 58.1875, |
|
"learning_rate": 9.981946092082643e-07, |
|
"loss": 98.8408, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.9261549593208646, |
|
"grad_norm": 56.75, |
|
"learning_rate": 9.981911035950765e-07, |
|
"loss": 99.1592, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.9279498332730368, |
|
"grad_norm": 51.3125, |
|
"learning_rate": 9.981875979818885e-07, |
|
"loss": 99.347, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 0.929744707225209, |
|
"grad_norm": 56.6875, |
|
"learning_rate": 9.981840923687007e-07, |
|
"loss": 99.7062, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 0.9315395811773812, |
|
"grad_norm": 55.28125, |
|
"learning_rate": 9.981805867555129e-07, |
|
"loss": 100.1522, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 0.9333344551295535, |
|
"grad_norm": 55.25, |
|
"learning_rate": 9.98177081142325e-07, |
|
"loss": 99.1621, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.9351293290817256, |
|
"grad_norm": 58.40625, |
|
"learning_rate": 9.981735755291372e-07, |
|
"loss": 99.3177, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 0.9369242030338979, |
|
"grad_norm": 82.875, |
|
"learning_rate": 9.981700699159492e-07, |
|
"loss": 99.838, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 0.93871907698607, |
|
"grad_norm": 56.125, |
|
"learning_rate": 9.981665643027616e-07, |
|
"loss": 98.6675, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 0.9405139509382423, |
|
"grad_norm": 56.15625, |
|
"learning_rate": 9.981630586895738e-07, |
|
"loss": 100.7525, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 0.9423088248904146, |
|
"grad_norm": 54.5625, |
|
"learning_rate": 9.981595530763858e-07, |
|
"loss": 98.633, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.9441036988425867, |
|
"grad_norm": 55.875, |
|
"learning_rate": 9.98156047463198e-07, |
|
"loss": 99.6261, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 0.945898572794759, |
|
"grad_norm": 58.15625, |
|
"learning_rate": 9.981525418500102e-07, |
|
"loss": 99.3767, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 0.9476934467469312, |
|
"grad_norm": 55.96875, |
|
"learning_rate": 9.981490362368224e-07, |
|
"loss": 99.0867, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 0.9494883206991034, |
|
"grad_norm": 56.90625, |
|
"learning_rate": 9.981455306236346e-07, |
|
"loss": 98.7356, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 0.9512831946512756, |
|
"grad_norm": 56.78125, |
|
"learning_rate": 9.981420250104466e-07, |
|
"loss": 99.5022, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.9530780686034478, |
|
"grad_norm": 55.59375, |
|
"learning_rate": 9.981385193972588e-07, |
|
"loss": 99.9538, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 0.95487294255562, |
|
"grad_norm": 56.8125, |
|
"learning_rate": 9.98135013784071e-07, |
|
"loss": 98.6498, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 0.9566678165077923, |
|
"grad_norm": 57.9375, |
|
"learning_rate": 9.981315081708831e-07, |
|
"loss": 99.2055, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 0.9584626904599645, |
|
"grad_norm": 53.25, |
|
"learning_rate": 9.981280025576953e-07, |
|
"loss": 99.4488, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 0.9602575644121367, |
|
"grad_norm": 58.21875, |
|
"learning_rate": 9.981244969445075e-07, |
|
"loss": 100.3882, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.962052438364309, |
|
"grad_norm": 53.875, |
|
"learning_rate": 9.981209913313197e-07, |
|
"loss": 98.866, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 0.9638473123164811, |
|
"grad_norm": 56.28125, |
|
"learning_rate": 9.981174857181317e-07, |
|
"loss": 99.651, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 0.9656421862686534, |
|
"grad_norm": 57.4375, |
|
"learning_rate": 9.98113980104944e-07, |
|
"loss": 98.6736, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 0.9674370602208255, |
|
"grad_norm": 59.3125, |
|
"learning_rate": 9.98110474491756e-07, |
|
"loss": 99.5751, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 0.9692319341729978, |
|
"grad_norm": 57.71875, |
|
"learning_rate": 9.981069688785683e-07, |
|
"loss": 98.4359, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.9710268081251701, |
|
"grad_norm": 52.65625, |
|
"learning_rate": 9.981034632653805e-07, |
|
"loss": 99.2994, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 0.9728216820773422, |
|
"grad_norm": 55.90625, |
|
"learning_rate": 9.980999576521927e-07, |
|
"loss": 99.8557, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 0.9746165560295145, |
|
"grad_norm": 54.875, |
|
"learning_rate": 9.980964520390049e-07, |
|
"loss": 98.9348, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 0.9764114299816867, |
|
"grad_norm": 57.21875, |
|
"learning_rate": 9.98092946425817e-07, |
|
"loss": 99.2456, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 0.9782063039338589, |
|
"grad_norm": 55.84375, |
|
"learning_rate": 9.98089440812629e-07, |
|
"loss": 98.4899, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.9800011778860311, |
|
"grad_norm": 57.15625, |
|
"learning_rate": 9.980859351994412e-07, |
|
"loss": 99.0099, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 0.9817960518382033, |
|
"grad_norm": 52.875, |
|
"learning_rate": 9.980824295862534e-07, |
|
"loss": 98.0469, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 0.9835909257903755, |
|
"grad_norm": 52.84375, |
|
"learning_rate": 9.980789239730656e-07, |
|
"loss": 99.7561, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 0.9853857997425478, |
|
"grad_norm": 55.4375, |
|
"learning_rate": 9.980754183598778e-07, |
|
"loss": 99.6168, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 0.98718067369472, |
|
"grad_norm": 55.0, |
|
"learning_rate": 9.9807191274669e-07, |
|
"loss": 99.3561, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.9889755476468922, |
|
"grad_norm": 54.875, |
|
"learning_rate": 9.980684071335022e-07, |
|
"loss": 98.6927, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 0.9907704215990645, |
|
"grad_norm": 55.03125, |
|
"learning_rate": 9.980649015203142e-07, |
|
"loss": 98.8652, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 0.9925652955512366, |
|
"grad_norm": 55.375, |
|
"learning_rate": 9.980613959071264e-07, |
|
"loss": 99.0075, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 0.9943601695034089, |
|
"grad_norm": 56.03125, |
|
"learning_rate": 9.980578902939386e-07, |
|
"loss": 99.1172, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 0.996155043455581, |
|
"grad_norm": 52.59375, |
|
"learning_rate": 9.980543846807508e-07, |
|
"loss": 99.8071, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.9979499174077533, |
|
"grad_norm": 57.0625, |
|
"learning_rate": 9.98050879067563e-07, |
|
"loss": 99.4139, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 0.9997447913599256, |
|
"grad_norm": 60.0, |
|
"learning_rate": 9.98047373454375e-07, |
|
"loss": 98.7373, |
|
"step": 5570 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5571, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 2000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.5382180454408913e+19, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|