pretrain_rugec_msu / last-checkpoint /trainer_state.json
mika5883's picture
Training in progress, step 308000, checkpoint
025744f verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9712,
"eval_steps": 500,
"global_step": 308000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0032,
"grad_norm": 1.7230606079101562,
"learning_rate": 4.99208e-05,
"loss": 1.2281,
"step": 500
},
{
"epoch": 0.0064,
"grad_norm": 3.655383348464966,
"learning_rate": 4.9840800000000006e-05,
"loss": 0.7566,
"step": 1000
},
{
"epoch": 0.0096,
"grad_norm": 1.2925927639007568,
"learning_rate": 4.97608e-05,
"loss": 0.6764,
"step": 1500
},
{
"epoch": 0.0128,
"grad_norm": 1.286004900932312,
"learning_rate": 4.968080000000001e-05,
"loss": 0.6304,
"step": 2000
},
{
"epoch": 0.016,
"grad_norm": 1.2140214443206787,
"learning_rate": 4.96008e-05,
"loss": 0.5981,
"step": 2500
},
{
"epoch": 0.0192,
"grad_norm": 1.2525482177734375,
"learning_rate": 4.95208e-05,
"loss": 0.5767,
"step": 3000
},
{
"epoch": 0.0224,
"grad_norm": 1.2310410737991333,
"learning_rate": 4.94408e-05,
"loss": 0.5597,
"step": 3500
},
{
"epoch": 0.0256,
"grad_norm": 1.1735206842422485,
"learning_rate": 4.9360800000000004e-05,
"loss": 0.5418,
"step": 4000
},
{
"epoch": 0.0288,
"grad_norm": 1.114688754081726,
"learning_rate": 4.9280800000000004e-05,
"loss": 0.5335,
"step": 4500
},
{
"epoch": 0.032,
"grad_norm": 0.8874593377113342,
"learning_rate": 4.9200800000000005e-05,
"loss": 0.5237,
"step": 5000
},
{
"epoch": 0.0352,
"grad_norm": 1.1261299848556519,
"learning_rate": 4.91208e-05,
"loss": 0.5135,
"step": 5500
},
{
"epoch": 0.0384,
"grad_norm": 0.9994556307792664,
"learning_rate": 4.9040800000000007e-05,
"loss": 0.5059,
"step": 6000
},
{
"epoch": 0.0416,
"grad_norm": 1.2349673509597778,
"learning_rate": 4.89608e-05,
"loss": 0.4939,
"step": 6500
},
{
"epoch": 0.0448,
"grad_norm": 0.9770995378494263,
"learning_rate": 4.88808e-05,
"loss": 0.4824,
"step": 7000
},
{
"epoch": 0.048,
"grad_norm": 0.981966495513916,
"learning_rate": 4.88008e-05,
"loss": 0.4875,
"step": 7500
},
{
"epoch": 0.0512,
"grad_norm": 1.0177415609359741,
"learning_rate": 4.87208e-05,
"loss": 0.4785,
"step": 8000
},
{
"epoch": 0.0544,
"grad_norm": 1.0521667003631592,
"learning_rate": 4.8640800000000004e-05,
"loss": 0.4731,
"step": 8500
},
{
"epoch": 0.0576,
"grad_norm": 0.8560615181922913,
"learning_rate": 4.85608e-05,
"loss": 0.4633,
"step": 9000
},
{
"epoch": 0.0608,
"grad_norm": 1.0170217752456665,
"learning_rate": 4.8480800000000005e-05,
"loss": 0.4576,
"step": 9500
},
{
"epoch": 0.064,
"grad_norm": 0.9891325831413269,
"learning_rate": 4.84008e-05,
"loss": 0.4556,
"step": 10000
},
{
"epoch": 0.0672,
"grad_norm": 1.0609711408615112,
"learning_rate": 4.832080000000001e-05,
"loss": 0.4493,
"step": 10500
},
{
"epoch": 0.0704,
"grad_norm": 0.8623799681663513,
"learning_rate": 4.82408e-05,
"loss": 0.4459,
"step": 11000
},
{
"epoch": 0.0736,
"grad_norm": 0.9587870240211487,
"learning_rate": 4.81608e-05,
"loss": 0.4418,
"step": 11500
},
{
"epoch": 0.0768,
"grad_norm": 0.8939447999000549,
"learning_rate": 4.80808e-05,
"loss": 0.4327,
"step": 12000
},
{
"epoch": 0.08,
"grad_norm": 0.9886033535003662,
"learning_rate": 4.80008e-05,
"loss": 0.438,
"step": 12500
},
{
"epoch": 0.0832,
"grad_norm": 0.9157513976097107,
"learning_rate": 4.7920800000000004e-05,
"loss": 0.4323,
"step": 13000
},
{
"epoch": 0.0864,
"grad_norm": 0.9085854887962341,
"learning_rate": 4.7840800000000005e-05,
"loss": 0.4303,
"step": 13500
},
{
"epoch": 0.0896,
"grad_norm": 0.9123984575271606,
"learning_rate": 4.77608e-05,
"loss": 0.4247,
"step": 14000
},
{
"epoch": 0.0928,
"grad_norm": 0.839026689529419,
"learning_rate": 4.7680960000000004e-05,
"loss": 0.4233,
"step": 14500
},
{
"epoch": 0.096,
"grad_norm": 0.8110847473144531,
"learning_rate": 4.760096e-05,
"loss": 0.4207,
"step": 15000
},
{
"epoch": 0.0992,
"grad_norm": 0.8462579250335693,
"learning_rate": 4.7520960000000005e-05,
"loss": 0.421,
"step": 15500
},
{
"epoch": 0.1024,
"grad_norm": 0.8980106711387634,
"learning_rate": 4.744096e-05,
"loss": 0.417,
"step": 16000
},
{
"epoch": 0.1056,
"grad_norm": 0.8297702074050903,
"learning_rate": 4.736096000000001e-05,
"loss": 0.4139,
"step": 16500
},
{
"epoch": 0.1088,
"grad_norm": 0.9856173992156982,
"learning_rate": 4.728096e-05,
"loss": 0.419,
"step": 17000
},
{
"epoch": 0.112,
"grad_norm": 0.934256911277771,
"learning_rate": 4.720096e-05,
"loss": 0.4098,
"step": 17500
},
{
"epoch": 0.1152,
"grad_norm": 0.9190649390220642,
"learning_rate": 4.712096e-05,
"loss": 0.412,
"step": 18000
},
{
"epoch": 0.1184,
"grad_norm": 0.9078772664070129,
"learning_rate": 4.704096e-05,
"loss": 0.4043,
"step": 18500
},
{
"epoch": 0.1216,
"grad_norm": 1.082939624786377,
"learning_rate": 4.696112e-05,
"loss": 0.4045,
"step": 19000
},
{
"epoch": 0.1248,
"grad_norm": 0.9159390926361084,
"learning_rate": 4.688112e-05,
"loss": 0.4098,
"step": 19500
},
{
"epoch": 0.128,
"grad_norm": 0.8420547842979431,
"learning_rate": 4.680128e-05,
"loss": 0.4033,
"step": 20000
},
{
"epoch": 0.1312,
"grad_norm": 0.7658286094665527,
"learning_rate": 4.672128e-05,
"loss": 0.4002,
"step": 20500
},
{
"epoch": 0.1344,
"grad_norm": 0.9074057340621948,
"learning_rate": 4.664128e-05,
"loss": 0.3964,
"step": 21000
},
{
"epoch": 0.1376,
"grad_norm": 0.6065025329589844,
"learning_rate": 4.656128e-05,
"loss": 0.3984,
"step": 21500
},
{
"epoch": 0.1408,
"grad_norm": 0.7523757219314575,
"learning_rate": 4.6481280000000004e-05,
"loss": 0.3959,
"step": 22000
},
{
"epoch": 0.144,
"grad_norm": 0.807826042175293,
"learning_rate": 4.6401280000000004e-05,
"loss": 0.3921,
"step": 22500
},
{
"epoch": 0.1472,
"grad_norm": 0.8530682325363159,
"learning_rate": 4.632128e-05,
"loss": 0.4002,
"step": 23000
},
{
"epoch": 0.1504,
"grad_norm": 0.8661518692970276,
"learning_rate": 4.6241280000000006e-05,
"loss": 0.3856,
"step": 23500
},
{
"epoch": 0.1536,
"grad_norm": 0.7473235130310059,
"learning_rate": 4.616144e-05,
"loss": 0.3854,
"step": 24000
},
{
"epoch": 0.1568,
"grad_norm": 0.7954819202423096,
"learning_rate": 4.6081440000000005e-05,
"loss": 0.3871,
"step": 24500
},
{
"epoch": 0.16,
"grad_norm": 0.8758727312088013,
"learning_rate": 4.600144e-05,
"loss": 0.3842,
"step": 25000
},
{
"epoch": 0.1632,
"grad_norm": 0.8430293798446655,
"learning_rate": 4.592144000000001e-05,
"loss": 0.3886,
"step": 25500
},
{
"epoch": 0.1664,
"grad_norm": 0.6557173728942871,
"learning_rate": 4.584144e-05,
"loss": 0.3854,
"step": 26000
},
{
"epoch": 0.1696,
"grad_norm": 0.7791888117790222,
"learning_rate": 4.576144e-05,
"loss": 0.3796,
"step": 26500
},
{
"epoch": 0.1728,
"grad_norm": 0.736084520816803,
"learning_rate": 4.56816e-05,
"loss": 0.3806,
"step": 27000
},
{
"epoch": 0.176,
"grad_norm": 0.7714269161224365,
"learning_rate": 4.56016e-05,
"loss": 0.3781,
"step": 27500
},
{
"epoch": 0.1792,
"grad_norm": 0.766144335269928,
"learning_rate": 4.552176e-05,
"loss": 0.3766,
"step": 28000
},
{
"epoch": 0.1824,
"grad_norm": 0.7035301923751831,
"learning_rate": 4.544176e-05,
"loss": 0.3737,
"step": 28500
},
{
"epoch": 0.1856,
"grad_norm": 0.7573793530464172,
"learning_rate": 4.536176e-05,
"loss": 0.3753,
"step": 29000
},
{
"epoch": 0.1888,
"grad_norm": 0.8799508213996887,
"learning_rate": 4.528176e-05,
"loss": 0.373,
"step": 29500
},
{
"epoch": 0.192,
"grad_norm": 0.8543264269828796,
"learning_rate": 4.520176e-05,
"loss": 0.3735,
"step": 30000
},
{
"epoch": 0.1952,
"grad_norm": 0.6768947243690491,
"learning_rate": 4.512176e-05,
"loss": 0.3697,
"step": 30500
},
{
"epoch": 0.1984,
"grad_norm": 0.8239702582359314,
"learning_rate": 4.504176e-05,
"loss": 0.3675,
"step": 31000
},
{
"epoch": 0.2016,
"grad_norm": 0.8310449123382568,
"learning_rate": 4.4961760000000004e-05,
"loss": 0.3695,
"step": 31500
},
{
"epoch": 0.2048,
"grad_norm": 0.8459475040435791,
"learning_rate": 4.488176e-05,
"loss": 0.3694,
"step": 32000
},
{
"epoch": 0.208,
"grad_norm": 0.7346063852310181,
"learning_rate": 4.4801760000000006e-05,
"loss": 0.3646,
"step": 32500
},
{
"epoch": 0.2112,
"grad_norm": 0.6958354115486145,
"learning_rate": 4.472176e-05,
"loss": 0.3704,
"step": 33000
},
{
"epoch": 0.2144,
"grad_norm": 0.8244686722755432,
"learning_rate": 4.464176000000001e-05,
"loss": 0.3647,
"step": 33500
},
{
"epoch": 0.2176,
"grad_norm": 0.7559502124786377,
"learning_rate": 4.456192e-05,
"loss": 0.3665,
"step": 34000
},
{
"epoch": 0.2208,
"grad_norm": 0.9046504497528076,
"learning_rate": 4.4481920000000007e-05,
"loss": 0.3637,
"step": 34500
},
{
"epoch": 0.224,
"grad_norm": 0.7771899700164795,
"learning_rate": 4.440192e-05,
"loss": 0.3648,
"step": 35000
},
{
"epoch": 0.2272,
"grad_norm": 0.6887528300285339,
"learning_rate": 4.432192e-05,
"loss": 0.3562,
"step": 35500
},
{
"epoch": 0.2304,
"grad_norm": 0.7471407055854797,
"learning_rate": 4.424192e-05,
"loss": 0.3639,
"step": 36000
},
{
"epoch": 0.2336,
"grad_norm": 0.7198163270950317,
"learning_rate": 4.416192e-05,
"loss": 0.3604,
"step": 36500
},
{
"epoch": 0.2368,
"grad_norm": 0.7383478879928589,
"learning_rate": 4.4081920000000004e-05,
"loss": 0.3592,
"step": 37000
},
{
"epoch": 0.24,
"grad_norm": 0.8052579760551453,
"learning_rate": 4.4001920000000004e-05,
"loss": 0.3563,
"step": 37500
},
{
"epoch": 0.2432,
"grad_norm": 0.7765107154846191,
"learning_rate": 4.392224e-05,
"loss": 0.3548,
"step": 38000
},
{
"epoch": 0.2464,
"grad_norm": 0.7250288724899292,
"learning_rate": 4.384224e-05,
"loss": 0.3605,
"step": 38500
},
{
"epoch": 0.2496,
"grad_norm": 0.6914694309234619,
"learning_rate": 4.376224e-05,
"loss": 0.3551,
"step": 39000
},
{
"epoch": 0.2528,
"grad_norm": 0.6636275053024292,
"learning_rate": 4.368224e-05,
"loss": 0.3587,
"step": 39500
},
{
"epoch": 0.256,
"grad_norm": 0.710564911365509,
"learning_rate": 4.360224e-05,
"loss": 0.3537,
"step": 40000
},
{
"epoch": 0.2592,
"grad_norm": 0.6195800304412842,
"learning_rate": 4.3522240000000004e-05,
"loss": 0.3537,
"step": 40500
},
{
"epoch": 0.2624,
"grad_norm": 0.7131514549255371,
"learning_rate": 4.34424e-05,
"loss": 0.3531,
"step": 41000
},
{
"epoch": 0.2656,
"grad_norm": 0.6594410538673401,
"learning_rate": 4.336256e-05,
"loss": 0.3518,
"step": 41500
},
{
"epoch": 0.2688,
"grad_norm": 0.7651230096817017,
"learning_rate": 4.328256e-05,
"loss": 0.3516,
"step": 42000
},
{
"epoch": 0.272,
"grad_norm": 0.756515622138977,
"learning_rate": 4.320256e-05,
"loss": 0.3461,
"step": 42500
},
{
"epoch": 0.2752,
"grad_norm": 0.7201528549194336,
"learning_rate": 4.3122560000000003e-05,
"loss": 0.3497,
"step": 43000
},
{
"epoch": 0.2784,
"grad_norm": 0.7436856031417847,
"learning_rate": 4.3042560000000004e-05,
"loss": 0.3505,
"step": 43500
},
{
"epoch": 0.2816,
"grad_norm": 0.7914199829101562,
"learning_rate": 4.2962560000000005e-05,
"loss": 0.3439,
"step": 44000
},
{
"epoch": 0.2848,
"grad_norm": 0.7488194704055786,
"learning_rate": 4.288256e-05,
"loss": 0.349,
"step": 44500
},
{
"epoch": 0.288,
"grad_norm": 0.8654124736785889,
"learning_rate": 4.280256e-05,
"loss": 0.3491,
"step": 45000
},
{
"epoch": 0.2912,
"grad_norm": 0.6817401647567749,
"learning_rate": 4.272272e-05,
"loss": 0.3447,
"step": 45500
},
{
"epoch": 0.2944,
"grad_norm": 0.6439715623855591,
"learning_rate": 4.2642720000000006e-05,
"loss": 0.3453,
"step": 46000
},
{
"epoch": 0.2976,
"grad_norm": 1.3840138912200928,
"learning_rate": 4.256272e-05,
"loss": 0.3445,
"step": 46500
},
{
"epoch": 0.3008,
"grad_norm": 0.7245766520500183,
"learning_rate": 4.248272e-05,
"loss": 0.3462,
"step": 47000
},
{
"epoch": 0.304,
"grad_norm": 0.6877666711807251,
"learning_rate": 4.240288e-05,
"loss": 0.3465,
"step": 47500
},
{
"epoch": 0.3072,
"grad_norm": 0.8494886159896851,
"learning_rate": 4.2322880000000006e-05,
"loss": 0.348,
"step": 48000
},
{
"epoch": 0.3104,
"grad_norm": 0.6704971790313721,
"learning_rate": 4.224288e-05,
"loss": 0.3403,
"step": 48500
},
{
"epoch": 0.3136,
"grad_norm": 0.6239964962005615,
"learning_rate": 4.216288000000001e-05,
"loss": 0.3382,
"step": 49000
},
{
"epoch": 0.3168,
"grad_norm": 0.7317768335342407,
"learning_rate": 4.208288e-05,
"loss": 0.3385,
"step": 49500
},
{
"epoch": 0.32,
"grad_norm": 0.7397735118865967,
"learning_rate": 4.200288e-05,
"loss": 0.3405,
"step": 50000
},
{
"epoch": 0.3232,
"grad_norm": 1.1299536228179932,
"learning_rate": 4.1922880000000003e-05,
"loss": 0.3431,
"step": 50500
},
{
"epoch": 0.3264,
"grad_norm": 0.6406556963920593,
"learning_rate": 4.184304e-05,
"loss": 0.3384,
"step": 51000
},
{
"epoch": 0.3296,
"grad_norm": 0.8084424734115601,
"learning_rate": 4.17632e-05,
"loss": 0.3365,
"step": 51500
},
{
"epoch": 0.3328,
"grad_norm": 0.7525010704994202,
"learning_rate": 4.16832e-05,
"loss": 0.3399,
"step": 52000
},
{
"epoch": 0.336,
"grad_norm": 0.7382110953330994,
"learning_rate": 4.16032e-05,
"loss": 0.335,
"step": 52500
},
{
"epoch": 0.3392,
"grad_norm": 0.6454793810844421,
"learning_rate": 4.15232e-05,
"loss": 0.3354,
"step": 53000
},
{
"epoch": 0.3424,
"grad_norm": 0.639664351940155,
"learning_rate": 4.14432e-05,
"loss": 0.3371,
"step": 53500
},
{
"epoch": 0.3456,
"grad_norm": 0.5574499368667603,
"learning_rate": 4.1363200000000004e-05,
"loss": 0.3341,
"step": 54000
},
{
"epoch": 0.3488,
"grad_norm": 0.6772671341896057,
"learning_rate": 4.12832e-05,
"loss": 0.3331,
"step": 54500
},
{
"epoch": 0.352,
"grad_norm": 0.6943195462226868,
"learning_rate": 4.120336e-05,
"loss": 0.3365,
"step": 55000
},
{
"epoch": 0.3552,
"grad_norm": 0.7460485100746155,
"learning_rate": 4.112336e-05,
"loss": 0.3308,
"step": 55500
},
{
"epoch": 0.3584,
"grad_norm": 0.7071924805641174,
"learning_rate": 4.1043360000000005e-05,
"loss": 0.3312,
"step": 56000
},
{
"epoch": 0.3616,
"grad_norm": 0.6678891181945801,
"learning_rate": 4.0963519999999996e-05,
"loss": 0.3314,
"step": 56500
},
{
"epoch": 0.3648,
"grad_norm": 0.7100914120674133,
"learning_rate": 4.0883520000000004e-05,
"loss": 0.3307,
"step": 57000
},
{
"epoch": 0.368,
"grad_norm": 0.6085671782493591,
"learning_rate": 4.080352e-05,
"loss": 0.3282,
"step": 57500
},
{
"epoch": 0.3712,
"grad_norm": 0.6634243130683899,
"learning_rate": 4.0723520000000005e-05,
"loss": 0.3321,
"step": 58000
},
{
"epoch": 0.3744,
"grad_norm": 0.7203409075737,
"learning_rate": 4.064352e-05,
"loss": 0.3318,
"step": 58500
},
{
"epoch": 0.3776,
"grad_norm": 0.7934884428977966,
"learning_rate": 4.056352e-05,
"loss": 0.3239,
"step": 59000
},
{
"epoch": 0.3808,
"grad_norm": 0.8591666221618652,
"learning_rate": 4.048352e-05,
"loss": 0.3275,
"step": 59500
},
{
"epoch": 0.384,
"grad_norm": 0.6306772232055664,
"learning_rate": 4.040352e-05,
"loss": 0.3308,
"step": 60000
},
{
"epoch": 0.3872,
"grad_norm": 0.6059302687644958,
"learning_rate": 4.032352e-05,
"loss": 0.3266,
"step": 60500
},
{
"epoch": 0.3904,
"grad_norm": 0.6875105500221252,
"learning_rate": 4.024352e-05,
"loss": 0.3265,
"step": 61000
},
{
"epoch": 0.3936,
"grad_norm": 0.6397412419319153,
"learning_rate": 4.0163520000000004e-05,
"loss": 0.3268,
"step": 61500
},
{
"epoch": 0.3968,
"grad_norm": 0.7801005840301514,
"learning_rate": 4.0083520000000005e-05,
"loss": 0.3314,
"step": 62000
},
{
"epoch": 0.4,
"grad_norm": 0.6966884136199951,
"learning_rate": 4.000352e-05,
"loss": 0.3263,
"step": 62500
},
{
"epoch": 0.4032,
"grad_norm": 0.7413304448127747,
"learning_rate": 3.9923520000000006e-05,
"loss": 0.3284,
"step": 63000
},
{
"epoch": 0.4064,
"grad_norm": 0.7089780569076538,
"learning_rate": 3.984352e-05,
"loss": 0.3252,
"step": 63500
},
{
"epoch": 0.4096,
"grad_norm": 0.6669878959655762,
"learning_rate": 3.976352e-05,
"loss": 0.3239,
"step": 64000
},
{
"epoch": 0.4128,
"grad_norm": 0.7352403998374939,
"learning_rate": 3.968368e-05,
"loss": 0.3226,
"step": 64500
},
{
"epoch": 0.416,
"grad_norm": 0.6916635036468506,
"learning_rate": 3.9603840000000005e-05,
"loss": 0.3234,
"step": 65000
},
{
"epoch": 0.4192,
"grad_norm": 0.6800302863121033,
"learning_rate": 3.952384e-05,
"loss": 0.3224,
"step": 65500
},
{
"epoch": 0.4224,
"grad_norm": 0.6685224771499634,
"learning_rate": 3.9443840000000006e-05,
"loss": 0.3197,
"step": 66000
},
{
"epoch": 0.4256,
"grad_norm": 0.7219159603118896,
"learning_rate": 3.936384e-05,
"loss": 0.3185,
"step": 66500
},
{
"epoch": 0.4288,
"grad_norm": 0.5928858518600464,
"learning_rate": 3.928384e-05,
"loss": 0.3291,
"step": 67000
},
{
"epoch": 0.432,
"grad_norm": 0.6616542339324951,
"learning_rate": 3.920384e-05,
"loss": 0.3266,
"step": 67500
},
{
"epoch": 0.4352,
"grad_norm": 0.5957266092300415,
"learning_rate": 3.912384e-05,
"loss": 0.32,
"step": 68000
},
{
"epoch": 0.4384,
"grad_norm": 0.6576407551765442,
"learning_rate": 3.904384e-05,
"loss": 0.3246,
"step": 68500
},
{
"epoch": 0.4416,
"grad_norm": 0.6852056384086609,
"learning_rate": 3.896416e-05,
"loss": 0.3268,
"step": 69000
},
{
"epoch": 0.4448,
"grad_norm": 0.780893087387085,
"learning_rate": 3.888416e-05,
"loss": 0.3229,
"step": 69500
},
{
"epoch": 0.448,
"grad_norm": 0.6741476655006409,
"learning_rate": 3.880416e-05,
"loss": 0.3188,
"step": 70000
},
{
"epoch": 0.4512,
"grad_norm": 0.5919800400733948,
"learning_rate": 3.872416e-05,
"loss": 0.3208,
"step": 70500
},
{
"epoch": 0.4544,
"grad_norm": 0.6476633548736572,
"learning_rate": 3.864416e-05,
"loss": 0.322,
"step": 71000
},
{
"epoch": 0.4576,
"grad_norm": 0.5667979717254639,
"learning_rate": 3.8564159999999996e-05,
"loss": 0.3151,
"step": 71500
},
{
"epoch": 0.4608,
"grad_norm": 0.6126554608345032,
"learning_rate": 3.8484160000000004e-05,
"loss": 0.3185,
"step": 72000
},
{
"epoch": 0.464,
"grad_norm": 0.7995546460151672,
"learning_rate": 3.840416e-05,
"loss": 0.3174,
"step": 72500
},
{
"epoch": 0.4672,
"grad_norm": 0.5964981317520142,
"learning_rate": 3.8324160000000005e-05,
"loss": 0.3187,
"step": 73000
},
{
"epoch": 0.4704,
"grad_norm": 0.7718212008476257,
"learning_rate": 3.824416e-05,
"loss": 0.3156,
"step": 73500
},
{
"epoch": 0.4736,
"grad_norm": 0.7086686491966248,
"learning_rate": 3.8164320000000005e-05,
"loss": 0.3189,
"step": 74000
},
{
"epoch": 0.4768,
"grad_norm": 0.7988029718399048,
"learning_rate": 3.808432e-05,
"loss": 0.3151,
"step": 74500
},
{
"epoch": 0.48,
"grad_norm": 0.6092699766159058,
"learning_rate": 3.8004320000000006e-05,
"loss": 0.3153,
"step": 75000
},
{
"epoch": 0.4832,
"grad_norm": 0.6181166768074036,
"learning_rate": 3.792432e-05,
"loss": 0.3113,
"step": 75500
},
{
"epoch": 0.4864,
"grad_norm": 0.5952243208885193,
"learning_rate": 3.784432e-05,
"loss": 0.3091,
"step": 76000
},
{
"epoch": 0.4896,
"grad_norm": 0.5732501745223999,
"learning_rate": 3.776432e-05,
"loss": 0.3169,
"step": 76500
},
{
"epoch": 0.4928,
"grad_norm": 0.5866090059280396,
"learning_rate": 3.768432e-05,
"loss": 0.3135,
"step": 77000
},
{
"epoch": 0.496,
"grad_norm": 0.6748520135879517,
"learning_rate": 3.760432e-05,
"loss": 0.3134,
"step": 77500
},
{
"epoch": 0.4992,
"grad_norm": 0.5922159552574158,
"learning_rate": 3.752448e-05,
"loss": 0.3156,
"step": 78000
},
{
"epoch": 0.5024,
"grad_norm": 0.6446545124053955,
"learning_rate": 3.744448e-05,
"loss": 0.3171,
"step": 78500
},
{
"epoch": 0.5056,
"grad_norm": 0.6506426334381104,
"learning_rate": 3.736448e-05,
"loss": 0.3138,
"step": 79000
},
{
"epoch": 0.5088,
"grad_norm": 0.6826354265213013,
"learning_rate": 3.728448e-05,
"loss": 0.3164,
"step": 79500
},
{
"epoch": 0.512,
"grad_norm": 0.6866195797920227,
"learning_rate": 3.72048e-05,
"loss": 0.315,
"step": 80000
},
{
"epoch": 0.5152,
"grad_norm": 0.5590147376060486,
"learning_rate": 3.7124960000000005e-05,
"loss": 0.3094,
"step": 80500
},
{
"epoch": 0.5184,
"grad_norm": 0.6728788614273071,
"learning_rate": 3.704496e-05,
"loss": 0.3194,
"step": 81000
},
{
"epoch": 0.5216,
"grad_norm": 0.6108749508857727,
"learning_rate": 3.696496000000001e-05,
"loss": 0.3128,
"step": 81500
},
{
"epoch": 0.5248,
"grad_norm": 0.5888856649398804,
"learning_rate": 3.688496e-05,
"loss": 0.3121,
"step": 82000
},
{
"epoch": 0.528,
"grad_norm": 0.727268397808075,
"learning_rate": 3.680496e-05,
"loss": 0.3193,
"step": 82500
},
{
"epoch": 0.5312,
"grad_norm": 0.6358634233474731,
"learning_rate": 3.672496e-05,
"loss": 0.3092,
"step": 83000
},
{
"epoch": 0.5344,
"grad_norm": 0.6482620239257812,
"learning_rate": 3.664496e-05,
"loss": 0.3098,
"step": 83500
},
{
"epoch": 0.5376,
"grad_norm": 0.5968552827835083,
"learning_rate": 3.6564960000000004e-05,
"loss": 0.3108,
"step": 84000
},
{
"epoch": 0.5408,
"grad_norm": 0.6621351838111877,
"learning_rate": 3.6484960000000004e-05,
"loss": 0.3065,
"step": 84500
},
{
"epoch": 0.544,
"grad_norm": 0.5520649552345276,
"learning_rate": 3.640496e-05,
"loss": 0.3088,
"step": 85000
},
{
"epoch": 0.5472,
"grad_norm": 0.6885005831718445,
"learning_rate": 3.632496e-05,
"loss": 0.3075,
"step": 85500
},
{
"epoch": 0.5504,
"grad_norm": 0.666653573513031,
"learning_rate": 3.624512e-05,
"loss": 0.3113,
"step": 86000
},
{
"epoch": 0.5536,
"grad_norm": 0.6344409584999084,
"learning_rate": 3.6165120000000005e-05,
"loss": 0.3085,
"step": 86500
},
{
"epoch": 0.5568,
"grad_norm": 0.5792534947395325,
"learning_rate": 3.608512e-05,
"loss": 0.3132,
"step": 87000
},
{
"epoch": 0.56,
"grad_norm": 0.6864989995956421,
"learning_rate": 3.600512e-05,
"loss": 0.3079,
"step": 87500
},
{
"epoch": 0.5632,
"grad_norm": 0.6077435612678528,
"learning_rate": 3.592512e-05,
"loss": 0.3095,
"step": 88000
},
{
"epoch": 0.5664,
"grad_norm": 0.7073134779930115,
"learning_rate": 3.584512e-05,
"loss": 0.3116,
"step": 88500
},
{
"epoch": 0.5696,
"grad_norm": 0.6477733850479126,
"learning_rate": 3.576512e-05,
"loss": 0.3062,
"step": 89000
},
{
"epoch": 0.5728,
"grad_norm": 0.7786093354225159,
"learning_rate": 3.568512e-05,
"loss": 0.3017,
"step": 89500
},
{
"epoch": 0.576,
"grad_norm": 0.6447868943214417,
"learning_rate": 3.560528e-05,
"loss": 0.3077,
"step": 90000
},
{
"epoch": 0.5792,
"grad_norm": 0.6663397550582886,
"learning_rate": 3.552528e-05,
"loss": 0.3089,
"step": 90500
},
{
"epoch": 0.5824,
"grad_norm": 0.533214807510376,
"learning_rate": 3.544528e-05,
"loss": 0.3064,
"step": 91000
},
{
"epoch": 0.5856,
"grad_norm": 0.6517444849014282,
"learning_rate": 3.5365280000000004e-05,
"loss": 0.3108,
"step": 91500
},
{
"epoch": 0.5888,
"grad_norm": 0.7635303735733032,
"learning_rate": 3.528544e-05,
"loss": 0.3028,
"step": 92000
},
{
"epoch": 0.592,
"grad_norm": 0.6636632680892944,
"learning_rate": 3.520544e-05,
"loss": 0.3015,
"step": 92500
},
{
"epoch": 0.5952,
"grad_norm": 0.7296783924102783,
"learning_rate": 3.5125440000000004e-05,
"loss": 0.305,
"step": 93000
},
{
"epoch": 0.5984,
"grad_norm": 0.5089054703712463,
"learning_rate": 3.50456e-05,
"loss": 0.3092,
"step": 93500
},
{
"epoch": 0.6016,
"grad_norm": 0.6761330366134644,
"learning_rate": 3.49656e-05,
"loss": 0.3055,
"step": 94000
},
{
"epoch": 0.6048,
"grad_norm": 0.6327843070030212,
"learning_rate": 3.4885600000000004e-05,
"loss": 0.3055,
"step": 94500
},
{
"epoch": 0.608,
"grad_norm": 0.5940554141998291,
"learning_rate": 3.48056e-05,
"loss": 0.3017,
"step": 95000
},
{
"epoch": 0.6112,
"grad_norm": 0.516828179359436,
"learning_rate": 3.4725600000000005e-05,
"loss": 0.3035,
"step": 95500
},
{
"epoch": 0.6144,
"grad_norm": 0.5835782289505005,
"learning_rate": 3.46456e-05,
"loss": 0.2978,
"step": 96000
},
{
"epoch": 0.6176,
"grad_norm": 0.5978230237960815,
"learning_rate": 3.456560000000001e-05,
"loss": 0.301,
"step": 96500
},
{
"epoch": 0.6208,
"grad_norm": 0.5460017323493958,
"learning_rate": 3.44856e-05,
"loss": 0.3052,
"step": 97000
},
{
"epoch": 0.624,
"grad_norm": 0.6875701546669006,
"learning_rate": 3.44056e-05,
"loss": 0.3028,
"step": 97500
},
{
"epoch": 0.6272,
"grad_norm": 0.5780492424964905,
"learning_rate": 3.43256e-05,
"loss": 0.2988,
"step": 98000
},
{
"epoch": 0.6304,
"grad_norm": 0.5191554427146912,
"learning_rate": 3.42456e-05,
"loss": 0.3052,
"step": 98500
},
{
"epoch": 0.6336,
"grad_norm": 0.6811420917510986,
"learning_rate": 3.416576e-05,
"loss": 0.3032,
"step": 99000
},
{
"epoch": 0.6368,
"grad_norm": 0.6301366686820984,
"learning_rate": 3.408576e-05,
"loss": 0.2979,
"step": 99500
},
{
"epoch": 0.64,
"grad_norm": 0.5777577757835388,
"learning_rate": 3.400576e-05,
"loss": 0.2991,
"step": 100000
},
{
"epoch": 0.6432,
"grad_norm": 0.6444558501243591,
"learning_rate": 3.392592e-05,
"loss": 0.298,
"step": 100500
},
{
"epoch": 0.6464,
"grad_norm": 0.4793080985546112,
"learning_rate": 3.384592e-05,
"loss": 0.3014,
"step": 101000
},
{
"epoch": 0.6496,
"grad_norm": 0.6691552400588989,
"learning_rate": 3.376608e-05,
"loss": 0.3006,
"step": 101500
},
{
"epoch": 0.6528,
"grad_norm": 0.6318476796150208,
"learning_rate": 3.368608e-05,
"loss": 0.3032,
"step": 102000
},
{
"epoch": 0.656,
"grad_norm": 0.5805894136428833,
"learning_rate": 3.360608e-05,
"loss": 0.3014,
"step": 102500
},
{
"epoch": 0.6592,
"grad_norm": 0.5658220648765564,
"learning_rate": 3.352608e-05,
"loss": 0.3,
"step": 103000
},
{
"epoch": 0.6624,
"grad_norm": 0.6117516160011292,
"learning_rate": 3.3446080000000004e-05,
"loss": 0.3014,
"step": 103500
},
{
"epoch": 0.6656,
"grad_norm": 0.6763502359390259,
"learning_rate": 3.336608e-05,
"loss": 0.3043,
"step": 104000
},
{
"epoch": 0.6688,
"grad_norm": 0.6046746969223022,
"learning_rate": 3.3286080000000005e-05,
"loss": 0.2965,
"step": 104500
},
{
"epoch": 0.672,
"grad_norm": 0.7453213930130005,
"learning_rate": 3.320608e-05,
"loss": 0.2964,
"step": 105000
},
{
"epoch": 0.6752,
"grad_norm": 0.6010546088218689,
"learning_rate": 3.3126080000000007e-05,
"loss": 0.2975,
"step": 105500
},
{
"epoch": 0.6784,
"grad_norm": 0.7377296686172485,
"learning_rate": 3.304608e-05,
"loss": 0.2993,
"step": 106000
},
{
"epoch": 0.6816,
"grad_norm": 0.6612259745597839,
"learning_rate": 3.2966240000000006e-05,
"loss": 0.298,
"step": 106500
},
{
"epoch": 0.6848,
"grad_norm": 0.6570013165473938,
"learning_rate": 3.288624e-05,
"loss": 0.296,
"step": 107000
},
{
"epoch": 0.688,
"grad_norm": 0.633602499961853,
"learning_rate": 3.280624e-05,
"loss": 0.2989,
"step": 107500
},
{
"epoch": 0.6912,
"grad_norm": 0.5594373345375061,
"learning_rate": 3.272624e-05,
"loss": 0.2977,
"step": 108000
},
{
"epoch": 0.6944,
"grad_norm": 0.5643302202224731,
"learning_rate": 3.264624e-05,
"loss": 0.2941,
"step": 108500
},
{
"epoch": 0.6976,
"grad_norm": 0.5127794146537781,
"learning_rate": 3.256624e-05,
"loss": 0.2953,
"step": 109000
},
{
"epoch": 0.7008,
"grad_norm": 0.6273791790008545,
"learning_rate": 3.24864e-05,
"loss": 0.2944,
"step": 109500
},
{
"epoch": 0.704,
"grad_norm": 0.5089157223701477,
"learning_rate": 3.24064e-05,
"loss": 0.3,
"step": 110000
},
{
"epoch": 0.7072,
"grad_norm": 0.5816791653633118,
"learning_rate": 3.232656e-05,
"loss": 0.2957,
"step": 110500
},
{
"epoch": 0.7104,
"grad_norm": 0.6407476663589478,
"learning_rate": 3.224656e-05,
"loss": 0.2974,
"step": 111000
},
{
"epoch": 0.7136,
"grad_norm": 0.46444937586784363,
"learning_rate": 3.216656e-05,
"loss": 0.2969,
"step": 111500
},
{
"epoch": 0.7168,
"grad_norm": 0.4997446835041046,
"learning_rate": 3.2086559999999996e-05,
"loss": 0.2966,
"step": 112000
},
{
"epoch": 0.72,
"grad_norm": 0.6996490359306335,
"learning_rate": 3.2006560000000003e-05,
"loss": 0.2965,
"step": 112500
},
{
"epoch": 0.7232,
"grad_norm": 0.5806016325950623,
"learning_rate": 3.192672e-05,
"loss": 0.2952,
"step": 113000
},
{
"epoch": 0.7264,
"grad_norm": 0.6140916347503662,
"learning_rate": 3.184672e-05,
"loss": 0.2995,
"step": 113500
},
{
"epoch": 0.7296,
"grad_norm": 0.45879319310188293,
"learning_rate": 3.1766719999999997e-05,
"loss": 0.292,
"step": 114000
},
{
"epoch": 0.7328,
"grad_norm": 0.6141937971115112,
"learning_rate": 3.1686720000000004e-05,
"loss": 0.2945,
"step": 114500
},
{
"epoch": 0.736,
"grad_norm": 0.6565462946891785,
"learning_rate": 3.160672e-05,
"loss": 0.2982,
"step": 115000
},
{
"epoch": 0.7392,
"grad_norm": 0.5997145175933838,
"learning_rate": 3.1526720000000006e-05,
"loss": 0.2957,
"step": 115500
},
{
"epoch": 0.7424,
"grad_norm": 0.736965537071228,
"learning_rate": 3.144672e-05,
"loss": 0.2953,
"step": 116000
},
{
"epoch": 0.7456,
"grad_norm": 0.6587550640106201,
"learning_rate": 3.136672e-05,
"loss": 0.2917,
"step": 116500
},
{
"epoch": 0.7488,
"grad_norm": 0.7265971302986145,
"learning_rate": 3.128672e-05,
"loss": 0.2908,
"step": 117000
},
{
"epoch": 0.752,
"grad_norm": 0.6158114075660706,
"learning_rate": 3.120672e-05,
"loss": 0.2916,
"step": 117500
},
{
"epoch": 0.7552,
"grad_norm": 0.6521216034889221,
"learning_rate": 3.112672e-05,
"loss": 0.2947,
"step": 118000
},
{
"epoch": 0.7584,
"grad_norm": 0.5868868231773376,
"learning_rate": 3.1046720000000004e-05,
"loss": 0.2919,
"step": 118500
},
{
"epoch": 0.7616,
"grad_norm": 0.6495432257652283,
"learning_rate": 3.096672e-05,
"loss": 0.2974,
"step": 119000
},
{
"epoch": 0.7648,
"grad_norm": 0.6204816102981567,
"learning_rate": 3.0886720000000005e-05,
"loss": 0.2945,
"step": 119500
},
{
"epoch": 0.768,
"grad_norm": 0.6333968639373779,
"learning_rate": 3.080672e-05,
"loss": 0.292,
"step": 120000
},
{
"epoch": 0.7712,
"grad_norm": 0.5613961815834045,
"learning_rate": 3.0726880000000004e-05,
"loss": 0.2938,
"step": 120500
},
{
"epoch": 0.7744,
"grad_norm": 0.6623988151550293,
"learning_rate": 3.064688e-05,
"loss": 0.2954,
"step": 121000
},
{
"epoch": 0.7776,
"grad_norm": 0.6134264469146729,
"learning_rate": 3.0566880000000006e-05,
"loss": 0.2915,
"step": 121500
},
{
"epoch": 0.7808,
"grad_norm": 0.6159347891807556,
"learning_rate": 3.048688e-05,
"loss": 0.2887,
"step": 122000
},
{
"epoch": 0.784,
"grad_norm": 0.6079424023628235,
"learning_rate": 3.0407040000000005e-05,
"loss": 0.2915,
"step": 122500
},
{
"epoch": 0.7872,
"grad_norm": 0.7703385353088379,
"learning_rate": 3.0327040000000002e-05,
"loss": 0.2901,
"step": 123000
},
{
"epoch": 0.7904,
"grad_norm": 0.5626256465911865,
"learning_rate": 3.024704e-05,
"loss": 0.2938,
"step": 123500
},
{
"epoch": 0.7936,
"grad_norm": 0.554914653301239,
"learning_rate": 3.016704e-05,
"loss": 0.2913,
"step": 124000
},
{
"epoch": 0.7968,
"grad_norm": 0.6610060930252075,
"learning_rate": 3.008704e-05,
"loss": 0.2912,
"step": 124500
},
{
"epoch": 0.8,
"grad_norm": 0.6194009780883789,
"learning_rate": 3.0007040000000002e-05,
"loss": 0.2901,
"step": 125000
},
{
"epoch": 0.8032,
"grad_norm": 0.7150211930274963,
"learning_rate": 2.992704e-05,
"loss": 0.2895,
"step": 125500
},
{
"epoch": 0.8064,
"grad_norm": 0.6945148706436157,
"learning_rate": 2.9847040000000003e-05,
"loss": 0.2878,
"step": 126000
},
{
"epoch": 0.8096,
"grad_norm": 0.6546908617019653,
"learning_rate": 2.9767200000000002e-05,
"loss": 0.287,
"step": 126500
},
{
"epoch": 0.8128,
"grad_norm": 0.535040020942688,
"learning_rate": 2.9687360000000004e-05,
"loss": 0.2901,
"step": 127000
},
{
"epoch": 0.816,
"grad_norm": 0.6062806844711304,
"learning_rate": 2.960736e-05,
"loss": 0.2862,
"step": 127500
},
{
"epoch": 0.8192,
"grad_norm": 0.6202298998832703,
"learning_rate": 2.9527360000000005e-05,
"loss": 0.2884,
"step": 128000
},
{
"epoch": 0.8224,
"grad_norm": 0.5966545343399048,
"learning_rate": 2.9447360000000003e-05,
"loss": 0.2877,
"step": 128500
},
{
"epoch": 0.8256,
"grad_norm": 0.5024796724319458,
"learning_rate": 2.936736e-05,
"loss": 0.2882,
"step": 129000
},
{
"epoch": 0.8288,
"grad_norm": 0.5895559191703796,
"learning_rate": 2.9287520000000002e-05,
"loss": 0.288,
"step": 129500
},
{
"epoch": 0.832,
"grad_norm": 0.9302066564559937,
"learning_rate": 2.920752e-05,
"loss": 0.286,
"step": 130000
},
{
"epoch": 0.8352,
"grad_norm": 0.573466956615448,
"learning_rate": 2.9127520000000003e-05,
"loss": 0.2848,
"step": 130500
},
{
"epoch": 0.8384,
"grad_norm": 0.5901783108711243,
"learning_rate": 2.904768e-05,
"loss": 0.2883,
"step": 131000
},
{
"epoch": 0.8416,
"grad_norm": 0.7780030369758606,
"learning_rate": 2.8967680000000002e-05,
"loss": 0.2914,
"step": 131500
},
{
"epoch": 0.8448,
"grad_norm": 0.6630533933639526,
"learning_rate": 2.888768e-05,
"loss": 0.2878,
"step": 132000
},
{
"epoch": 0.848,
"grad_norm": 0.6001667976379395,
"learning_rate": 2.8807680000000004e-05,
"loss": 0.2818,
"step": 132500
},
{
"epoch": 0.8512,
"grad_norm": 0.6324682831764221,
"learning_rate": 2.872768e-05,
"loss": 0.2849,
"step": 133000
},
{
"epoch": 0.8544,
"grad_norm": 0.6814092993736267,
"learning_rate": 2.864768e-05,
"loss": 0.288,
"step": 133500
},
{
"epoch": 0.8576,
"grad_norm": 0.651709794998169,
"learning_rate": 2.8567680000000003e-05,
"loss": 0.2872,
"step": 134000
},
{
"epoch": 0.8608,
"grad_norm": 0.5912330746650696,
"learning_rate": 2.848768e-05,
"loss": 0.2824,
"step": 134500
},
{
"epoch": 0.864,
"grad_norm": 0.5821974277496338,
"learning_rate": 2.8407680000000004e-05,
"loss": 0.2853,
"step": 135000
},
{
"epoch": 0.8672,
"grad_norm": 0.6262611150741577,
"learning_rate": 2.832784e-05,
"loss": 0.2848,
"step": 135500
},
{
"epoch": 0.8704,
"grad_norm": 0.5360976457595825,
"learning_rate": 2.8247840000000004e-05,
"loss": 0.2869,
"step": 136000
},
{
"epoch": 0.8736,
"grad_norm": 0.6523284912109375,
"learning_rate": 2.816784e-05,
"loss": 0.2792,
"step": 136500
},
{
"epoch": 0.8768,
"grad_norm": 0.6329330205917358,
"learning_rate": 2.808784e-05,
"loss": 0.2865,
"step": 137000
},
{
"epoch": 0.88,
"grad_norm": 0.6053124666213989,
"learning_rate": 2.8007840000000003e-05,
"loss": 0.2844,
"step": 137500
},
{
"epoch": 0.8832,
"grad_norm": 0.6887571811676025,
"learning_rate": 2.7927999999999998e-05,
"loss": 0.288,
"step": 138000
},
{
"epoch": 0.8864,
"grad_norm": 0.7047476172447205,
"learning_rate": 2.7848000000000002e-05,
"loss": 0.2877,
"step": 138500
},
{
"epoch": 0.8896,
"grad_norm": 0.598227858543396,
"learning_rate": 2.7768e-05,
"loss": 0.2867,
"step": 139000
},
{
"epoch": 0.8928,
"grad_norm": 0.5094701051712036,
"learning_rate": 2.7688000000000003e-05,
"loss": 0.2832,
"step": 139500
},
{
"epoch": 0.896,
"grad_norm": 0.5749739408493042,
"learning_rate": 2.7608e-05,
"loss": 0.2821,
"step": 140000
},
{
"epoch": 0.8992,
"grad_norm": 0.4442578852176666,
"learning_rate": 2.7528320000000003e-05,
"loss": 0.282,
"step": 140500
},
{
"epoch": 0.9024,
"grad_norm": 0.5418574213981628,
"learning_rate": 2.744832e-05,
"loss": 0.2816,
"step": 141000
},
{
"epoch": 0.9056,
"grad_norm": 0.5984327793121338,
"learning_rate": 2.736832e-05,
"loss": 0.285,
"step": 141500
},
{
"epoch": 0.9088,
"grad_norm": 0.6572843194007874,
"learning_rate": 2.728832e-05,
"loss": 0.2817,
"step": 142000
},
{
"epoch": 0.912,
"grad_norm": 0.590993344783783,
"learning_rate": 2.7208320000000003e-05,
"loss": 0.288,
"step": 142500
},
{
"epoch": 0.9152,
"grad_norm": 0.6096624135971069,
"learning_rate": 2.712832e-05,
"loss": 0.2861,
"step": 143000
},
{
"epoch": 0.9184,
"grad_norm": 0.5189167261123657,
"learning_rate": 2.7048319999999998e-05,
"loss": 0.2857,
"step": 143500
},
{
"epoch": 0.9216,
"grad_norm": 0.5812899470329285,
"learning_rate": 2.6968320000000002e-05,
"loss": 0.2888,
"step": 144000
},
{
"epoch": 0.9248,
"grad_norm": 0.515201210975647,
"learning_rate": 2.688832e-05,
"loss": 0.2791,
"step": 144500
},
{
"epoch": 0.928,
"grad_norm": 0.6398504972457886,
"learning_rate": 2.6808320000000004e-05,
"loss": 0.282,
"step": 145000
},
{
"epoch": 0.9312,
"grad_norm": 0.5990891456604004,
"learning_rate": 2.672832e-05,
"loss": 0.28,
"step": 145500
},
{
"epoch": 0.9344,
"grad_norm": 0.5883029699325562,
"learning_rate": 2.664832e-05,
"loss": 0.2777,
"step": 146000
},
{
"epoch": 0.9376,
"grad_norm": 0.6432376503944397,
"learning_rate": 2.656848e-05,
"loss": 0.2804,
"step": 146500
},
{
"epoch": 0.9408,
"grad_norm": 0.5375948548316956,
"learning_rate": 2.6488479999999997e-05,
"loss": 0.2807,
"step": 147000
},
{
"epoch": 0.944,
"grad_norm": 0.6207411885261536,
"learning_rate": 2.6408640000000003e-05,
"loss": 0.283,
"step": 147500
},
{
"epoch": 0.9472,
"grad_norm": 0.5854378342628479,
"learning_rate": 2.632864e-05,
"loss": 0.2854,
"step": 148000
},
{
"epoch": 0.9504,
"grad_norm": 0.5260078310966492,
"learning_rate": 2.6248800000000002e-05,
"loss": 0.2836,
"step": 148500
},
{
"epoch": 0.9536,
"grad_norm": 0.6284717917442322,
"learning_rate": 2.61688e-05,
"loss": 0.2824,
"step": 149000
},
{
"epoch": 0.9568,
"grad_norm": 0.6092182397842407,
"learning_rate": 2.608896e-05,
"loss": 0.2804,
"step": 149500
},
{
"epoch": 0.96,
"grad_norm": 0.6028911471366882,
"learning_rate": 2.600896e-05,
"loss": 0.281,
"step": 150000
},
{
"epoch": 0.9632,
"grad_norm": 0.5008478164672852,
"learning_rate": 2.5928960000000003e-05,
"loss": 0.277,
"step": 150500
},
{
"epoch": 0.9664,
"grad_norm": 0.5233867168426514,
"learning_rate": 2.584896e-05,
"loss": 0.2807,
"step": 151000
},
{
"epoch": 0.9696,
"grad_norm": 0.5762408375740051,
"learning_rate": 2.5768960000000004e-05,
"loss": 0.2831,
"step": 151500
},
{
"epoch": 0.9728,
"grad_norm": 0.6097844243049622,
"learning_rate": 2.568896e-05,
"loss": 0.2803,
"step": 152000
},
{
"epoch": 0.976,
"grad_norm": 0.6696804761886597,
"learning_rate": 2.560896e-05,
"loss": 0.2742,
"step": 152500
},
{
"epoch": 0.9792,
"grad_norm": 0.6028556823730469,
"learning_rate": 2.5528960000000003e-05,
"loss": 0.282,
"step": 153000
},
{
"epoch": 0.9824,
"grad_norm": 0.6651898622512817,
"learning_rate": 2.544896e-05,
"loss": 0.2849,
"step": 153500
},
{
"epoch": 0.9856,
"grad_norm": 0.5219380855560303,
"learning_rate": 2.536896e-05,
"loss": 0.2785,
"step": 154000
},
{
"epoch": 0.9888,
"grad_norm": 0.6161176562309265,
"learning_rate": 2.5288960000000002e-05,
"loss": 0.2808,
"step": 154500
},
{
"epoch": 0.992,
"grad_norm": 0.7915316224098206,
"learning_rate": 2.5208960000000003e-05,
"loss": 0.2777,
"step": 155000
},
{
"epoch": 0.9952,
"grad_norm": 0.7261882424354553,
"learning_rate": 2.512896e-05,
"loss": 0.2767,
"step": 155500
},
{
"epoch": 0.9984,
"grad_norm": 0.5452406406402588,
"learning_rate": 2.5048959999999997e-05,
"loss": 0.2764,
"step": 156000
},
{
"epoch": 1.0016,
"grad_norm": 0.642181396484375,
"learning_rate": 2.4969120000000003e-05,
"loss": 0.2746,
"step": 156500
},
{
"epoch": 1.0048,
"grad_norm": 0.5900291204452515,
"learning_rate": 2.4889120000000003e-05,
"loss": 0.2721,
"step": 157000
},
{
"epoch": 1.008,
"grad_norm": 0.5960043668746948,
"learning_rate": 2.480912e-05,
"loss": 0.265,
"step": 157500
},
{
"epoch": 1.0112,
"grad_norm": 0.582115650177002,
"learning_rate": 2.472912e-05,
"loss": 0.2673,
"step": 158000
},
{
"epoch": 1.0144,
"grad_norm": 0.552392303943634,
"learning_rate": 2.464912e-05,
"loss": 0.2663,
"step": 158500
},
{
"epoch": 1.0176,
"grad_norm": 0.5585765242576599,
"learning_rate": 2.456912e-05,
"loss": 0.2688,
"step": 159000
},
{
"epoch": 1.0208,
"grad_norm": 0.6049332022666931,
"learning_rate": 2.448912e-05,
"loss": 0.266,
"step": 159500
},
{
"epoch": 1.024,
"grad_norm": 0.5749877095222473,
"learning_rate": 2.440912e-05,
"loss": 0.2689,
"step": 160000
},
{
"epoch": 1.0272,
"grad_norm": 0.5832675695419312,
"learning_rate": 2.4329120000000002e-05,
"loss": 0.2703,
"step": 160500
},
{
"epoch": 1.0304,
"grad_norm": 0.8549031019210815,
"learning_rate": 2.424928e-05,
"loss": 0.2623,
"step": 161000
},
{
"epoch": 1.0336,
"grad_norm": 0.5572855472564697,
"learning_rate": 2.416928e-05,
"loss": 0.2711,
"step": 161500
},
{
"epoch": 1.0368,
"grad_norm": 0.6818140745162964,
"learning_rate": 2.408928e-05,
"loss": 0.2652,
"step": 162000
},
{
"epoch": 1.04,
"grad_norm": 0.6900683045387268,
"learning_rate": 2.400928e-05,
"loss": 0.2669,
"step": 162500
},
{
"epoch": 1.0432,
"grad_norm": 0.6015618443489075,
"learning_rate": 2.392944e-05,
"loss": 0.2654,
"step": 163000
},
{
"epoch": 1.0464,
"grad_norm": 0.5343177318572998,
"learning_rate": 2.3849440000000002e-05,
"loss": 0.2656,
"step": 163500
},
{
"epoch": 1.0496,
"grad_norm": 0.6130079627037048,
"learning_rate": 2.3769440000000003e-05,
"loss": 0.2592,
"step": 164000
},
{
"epoch": 1.0528,
"grad_norm": 0.7150599956512451,
"learning_rate": 2.368944e-05,
"loss": 0.2634,
"step": 164500
},
{
"epoch": 1.056,
"grad_norm": 0.6321354508399963,
"learning_rate": 2.360944e-05,
"loss": 0.2683,
"step": 165000
},
{
"epoch": 1.0592,
"grad_norm": 0.6234462857246399,
"learning_rate": 2.352976e-05,
"loss": 0.2628,
"step": 165500
},
{
"epoch": 1.0624,
"grad_norm": 0.6542537808418274,
"learning_rate": 2.344976e-05,
"loss": 0.2618,
"step": 166000
},
{
"epoch": 1.0656,
"grad_norm": 0.6302633881568909,
"learning_rate": 2.3369760000000002e-05,
"loss": 0.2661,
"step": 166500
},
{
"epoch": 1.0688,
"grad_norm": 0.5890353322029114,
"learning_rate": 2.3289760000000002e-05,
"loss": 0.2646,
"step": 167000
},
{
"epoch": 1.072,
"grad_norm": 0.6490179300308228,
"learning_rate": 2.320976e-05,
"loss": 0.2635,
"step": 167500
},
{
"epoch": 1.0752,
"grad_norm": 0.648162305355072,
"learning_rate": 2.312976e-05,
"loss": 0.2646,
"step": 168000
},
{
"epoch": 1.0784,
"grad_norm": 0.675680935382843,
"learning_rate": 2.304976e-05,
"loss": 0.2626,
"step": 168500
},
{
"epoch": 1.0816,
"grad_norm": 0.6192341446876526,
"learning_rate": 2.2969760000000002e-05,
"loss": 0.2641,
"step": 169000
},
{
"epoch": 1.0848,
"grad_norm": 0.7046379446983337,
"learning_rate": 2.288992e-05,
"loss": 0.2643,
"step": 169500
},
{
"epoch": 1.088,
"grad_norm": 0.5477197170257568,
"learning_rate": 2.280992e-05,
"loss": 0.265,
"step": 170000
},
{
"epoch": 1.0912,
"grad_norm": 0.5775583982467651,
"learning_rate": 2.2729920000000002e-05,
"loss": 0.2645,
"step": 170500
},
{
"epoch": 1.0944,
"grad_norm": 0.6389047503471375,
"learning_rate": 2.2649920000000003e-05,
"loss": 0.2634,
"step": 171000
},
{
"epoch": 1.0976,
"grad_norm": 0.6169374585151672,
"learning_rate": 2.256992e-05,
"loss": 0.2642,
"step": 171500
},
{
"epoch": 1.1008,
"grad_norm": 0.5913782715797424,
"learning_rate": 2.2490080000000002e-05,
"loss": 0.2658,
"step": 172000
},
{
"epoch": 1.104,
"grad_norm": 0.7547928690910339,
"learning_rate": 2.241008e-05,
"loss": 0.2674,
"step": 172500
},
{
"epoch": 1.1072,
"grad_norm": 0.6277585625648499,
"learning_rate": 2.233024e-05,
"loss": 0.2686,
"step": 173000
},
{
"epoch": 1.1104,
"grad_norm": 0.6357282996177673,
"learning_rate": 2.225024e-05,
"loss": 0.2639,
"step": 173500
},
{
"epoch": 1.1136,
"grad_norm": 0.5262208580970764,
"learning_rate": 2.2170400000000004e-05,
"loss": 0.2641,
"step": 174000
},
{
"epoch": 1.1168,
"grad_norm": 0.6878075003623962,
"learning_rate": 2.20904e-05,
"loss": 0.2654,
"step": 174500
},
{
"epoch": 1.12,
"grad_norm": 0.5332186222076416,
"learning_rate": 2.2010400000000002e-05,
"loss": 0.2638,
"step": 175000
},
{
"epoch": 1.1232,
"grad_norm": 0.5562476515769958,
"learning_rate": 2.19304e-05,
"loss": 0.2648,
"step": 175500
},
{
"epoch": 1.1264,
"grad_norm": 0.5924221277236938,
"learning_rate": 2.18504e-05,
"loss": 0.2627,
"step": 176000
},
{
"epoch": 1.1296,
"grad_norm": 0.5250386595726013,
"learning_rate": 2.17704e-05,
"loss": 0.2619,
"step": 176500
},
{
"epoch": 1.1328,
"grad_norm": 0.7426069378852844,
"learning_rate": 2.16904e-05,
"loss": 0.2628,
"step": 177000
},
{
"epoch": 1.1360000000000001,
"grad_norm": 0.4925951063632965,
"learning_rate": 2.16104e-05,
"loss": 0.2661,
"step": 177500
},
{
"epoch": 1.1392,
"grad_norm": 0.5707270503044128,
"learning_rate": 2.15304e-05,
"loss": 0.2622,
"step": 178000
},
{
"epoch": 1.1424,
"grad_norm": 0.5793021321296692,
"learning_rate": 2.14504e-05,
"loss": 0.2671,
"step": 178500
},
{
"epoch": 1.1456,
"grad_norm": 0.5736916661262512,
"learning_rate": 2.13704e-05,
"loss": 0.2648,
"step": 179000
},
{
"epoch": 1.1488,
"grad_norm": 0.588550329208374,
"learning_rate": 2.129056e-05,
"loss": 0.2641,
"step": 179500
},
{
"epoch": 1.152,
"grad_norm": 0.5504462122917175,
"learning_rate": 2.121056e-05,
"loss": 0.2643,
"step": 180000
},
{
"epoch": 1.1552,
"grad_norm": 0.5439949035644531,
"learning_rate": 2.113056e-05,
"loss": 0.2639,
"step": 180500
},
{
"epoch": 1.1584,
"grad_norm": 0.6882042288780212,
"learning_rate": 2.105056e-05,
"loss": 0.2595,
"step": 181000
},
{
"epoch": 1.1616,
"grad_norm": 0.6735561490058899,
"learning_rate": 2.097056e-05,
"loss": 0.2624,
"step": 181500
},
{
"epoch": 1.1648,
"grad_norm": 0.5545785427093506,
"learning_rate": 2.089056e-05,
"loss": 0.2625,
"step": 182000
},
{
"epoch": 1.168,
"grad_norm": 0.6497994065284729,
"learning_rate": 2.081056e-05,
"loss": 0.2611,
"step": 182500
},
{
"epoch": 1.1712,
"grad_norm": 0.5887815356254578,
"learning_rate": 2.073056e-05,
"loss": 0.2632,
"step": 183000
},
{
"epoch": 1.1743999999999999,
"grad_norm": 0.6037270426750183,
"learning_rate": 2.0650560000000002e-05,
"loss": 0.2645,
"step": 183500
},
{
"epoch": 1.1776,
"grad_norm": 0.636946439743042,
"learning_rate": 2.057072e-05,
"loss": 0.2628,
"step": 184000
},
{
"epoch": 1.1808,
"grad_norm": 0.5285276770591736,
"learning_rate": 2.049072e-05,
"loss": 0.2629,
"step": 184500
},
{
"epoch": 1.184,
"grad_norm": 0.4634397625923157,
"learning_rate": 2.041072e-05,
"loss": 0.2615,
"step": 185000
},
{
"epoch": 1.1872,
"grad_norm": 0.5693604946136475,
"learning_rate": 2.033072e-05,
"loss": 0.2619,
"step": 185500
},
{
"epoch": 1.1904,
"grad_norm": 0.6433858275413513,
"learning_rate": 2.025072e-05,
"loss": 0.2591,
"step": 186000
},
{
"epoch": 1.1936,
"grad_norm": 0.5103280544281006,
"learning_rate": 2.017088e-05,
"loss": 0.2606,
"step": 186500
},
{
"epoch": 1.1968,
"grad_norm": 0.5591945648193359,
"learning_rate": 2.009104e-05,
"loss": 0.2628,
"step": 187000
},
{
"epoch": 1.2,
"grad_norm": 0.5560447573661804,
"learning_rate": 2.001104e-05,
"loss": 0.2603,
"step": 187500
},
{
"epoch": 1.2032,
"grad_norm": 0.5321928262710571,
"learning_rate": 1.9931040000000002e-05,
"loss": 0.2576,
"step": 188000
},
{
"epoch": 1.2064,
"grad_norm": 0.6455059051513672,
"learning_rate": 1.9851040000000003e-05,
"loss": 0.2615,
"step": 188500
},
{
"epoch": 1.2096,
"grad_norm": 0.6237916946411133,
"learning_rate": 1.977104e-05,
"loss": 0.2626,
"step": 189000
},
{
"epoch": 1.2128,
"grad_norm": 0.5269157886505127,
"learning_rate": 1.969104e-05,
"loss": 0.2597,
"step": 189500
},
{
"epoch": 1.216,
"grad_norm": 0.5521387457847595,
"learning_rate": 1.961104e-05,
"loss": 0.257,
"step": 190000
},
{
"epoch": 1.2192,
"grad_norm": 0.6061577796936035,
"learning_rate": 1.953104e-05,
"loss": 0.2626,
"step": 190500
},
{
"epoch": 1.2224,
"grad_norm": 0.6479594111442566,
"learning_rate": 1.945104e-05,
"loss": 0.2586,
"step": 191000
},
{
"epoch": 1.2256,
"grad_norm": 0.5330658555030823,
"learning_rate": 1.937104e-05,
"loss": 0.2573,
"step": 191500
},
{
"epoch": 1.2288000000000001,
"grad_norm": 0.5984029173851013,
"learning_rate": 1.9291200000000003e-05,
"loss": 0.2591,
"step": 192000
},
{
"epoch": 1.232,
"grad_norm": 0.8451948165893555,
"learning_rate": 1.92112e-05,
"loss": 0.2591,
"step": 192500
},
{
"epoch": 1.2352,
"grad_norm": 0.6519868969917297,
"learning_rate": 1.91312e-05,
"loss": 0.2608,
"step": 193000
},
{
"epoch": 1.2384,
"grad_norm": 0.6487559080123901,
"learning_rate": 1.9051199999999998e-05,
"loss": 0.2558,
"step": 193500
},
{
"epoch": 1.2416,
"grad_norm": 0.5544815063476562,
"learning_rate": 1.897152e-05,
"loss": 0.2609,
"step": 194000
},
{
"epoch": 1.2448,
"grad_norm": 0.594536542892456,
"learning_rate": 1.8891520000000002e-05,
"loss": 0.2592,
"step": 194500
},
{
"epoch": 1.248,
"grad_norm": 0.5301911234855652,
"learning_rate": 1.8811520000000002e-05,
"loss": 0.2579,
"step": 195000
},
{
"epoch": 1.2511999999999999,
"grad_norm": 0.6232271790504456,
"learning_rate": 1.873152e-05,
"loss": 0.2611,
"step": 195500
},
{
"epoch": 1.2544,
"grad_norm": 0.6571745276451111,
"learning_rate": 1.865152e-05,
"loss": 0.2617,
"step": 196000
},
{
"epoch": 1.2576,
"grad_norm": 0.6281866431236267,
"learning_rate": 1.857152e-05,
"loss": 0.2605,
"step": 196500
},
{
"epoch": 1.2608,
"grad_norm": 0.6584866642951965,
"learning_rate": 1.8491520000000002e-05,
"loss": 0.2575,
"step": 197000
},
{
"epoch": 1.264,
"grad_norm": 0.5791180729866028,
"learning_rate": 1.8411520000000003e-05,
"loss": 0.2572,
"step": 197500
},
{
"epoch": 1.2671999999999999,
"grad_norm": 0.5907946228981018,
"learning_rate": 1.8331520000000004e-05,
"loss": 0.2576,
"step": 198000
},
{
"epoch": 1.2704,
"grad_norm": 0.6532405614852905,
"learning_rate": 1.8251680000000002e-05,
"loss": 0.2588,
"step": 198500
},
{
"epoch": 1.2736,
"grad_norm": 0.5683246850967407,
"learning_rate": 1.8171680000000003e-05,
"loss": 0.2597,
"step": 199000
},
{
"epoch": 1.2768,
"grad_norm": 0.5847846865653992,
"learning_rate": 1.809168e-05,
"loss": 0.2628,
"step": 199500
},
{
"epoch": 1.28,
"grad_norm": 0.5554783344268799,
"learning_rate": 1.801168e-05,
"loss": 0.2542,
"step": 200000
},
{
"epoch": 1.2832,
"grad_norm": 0.6664928793907166,
"learning_rate": 1.793184e-05,
"loss": 0.2586,
"step": 200500
},
{
"epoch": 1.2864,
"grad_norm": 0.5993084907531738,
"learning_rate": 1.785184e-05,
"loss": 0.2571,
"step": 201000
},
{
"epoch": 1.2896,
"grad_norm": 0.4557185173034668,
"learning_rate": 1.777184e-05,
"loss": 0.2594,
"step": 201500
},
{
"epoch": 1.2928,
"grad_norm": 0.7798305749893188,
"learning_rate": 1.7691840000000002e-05,
"loss": 0.2561,
"step": 202000
},
{
"epoch": 1.296,
"grad_norm": 0.5406688451766968,
"learning_rate": 1.7611840000000002e-05,
"loss": 0.2562,
"step": 202500
},
{
"epoch": 1.2992,
"grad_norm": 0.5173208117485046,
"learning_rate": 1.7531840000000003e-05,
"loss": 0.2606,
"step": 203000
},
{
"epoch": 1.3024,
"grad_norm": 0.6803346872329712,
"learning_rate": 1.7452e-05,
"loss": 0.259,
"step": 203500
},
{
"epoch": 1.3056,
"grad_norm": 0.5223200917243958,
"learning_rate": 1.7372000000000002e-05,
"loss": 0.2571,
"step": 204000
},
{
"epoch": 1.3088,
"grad_norm": 0.6100528240203857,
"learning_rate": 1.7292e-05,
"loss": 0.2558,
"step": 204500
},
{
"epoch": 1.312,
"grad_norm": 0.623023271560669,
"learning_rate": 1.7212e-05,
"loss": 0.2563,
"step": 205000
},
{
"epoch": 1.3152,
"grad_norm": 0.5915964841842651,
"learning_rate": 1.713216e-05,
"loss": 0.2581,
"step": 205500
},
{
"epoch": 1.3184,
"grad_norm": 0.538467526435852,
"learning_rate": 1.705216e-05,
"loss": 0.2554,
"step": 206000
},
{
"epoch": 1.3216,
"grad_norm": 0.5382514595985413,
"learning_rate": 1.697216e-05,
"loss": 0.2581,
"step": 206500
},
{
"epoch": 1.3248,
"grad_norm": 0.6466744542121887,
"learning_rate": 1.689216e-05,
"loss": 0.2573,
"step": 207000
},
{
"epoch": 1.328,
"grad_norm": 0.742675244808197,
"learning_rate": 1.6812160000000002e-05,
"loss": 0.2572,
"step": 207500
},
{
"epoch": 1.3312,
"grad_norm": 0.6123968362808228,
"learning_rate": 1.673232e-05,
"loss": 0.2598,
"step": 208000
},
{
"epoch": 1.3344,
"grad_norm": 0.6710489392280579,
"learning_rate": 1.665232e-05,
"loss": 0.2604,
"step": 208500
},
{
"epoch": 1.3376000000000001,
"grad_norm": 0.685879111289978,
"learning_rate": 1.657232e-05,
"loss": 0.2576,
"step": 209000
},
{
"epoch": 1.3408,
"grad_norm": 0.5600978136062622,
"learning_rate": 1.649232e-05,
"loss": 0.2606,
"step": 209500
},
{
"epoch": 1.3439999999999999,
"grad_norm": 0.5358079075813293,
"learning_rate": 1.6412640000000002e-05,
"loss": 0.2578,
"step": 210000
},
{
"epoch": 1.3472,
"grad_norm": 0.7245667576789856,
"learning_rate": 1.63328e-05,
"loss": 0.2613,
"step": 210500
},
{
"epoch": 1.3504,
"grad_norm": 0.5963015556335449,
"learning_rate": 1.62528e-05,
"loss": 0.2568,
"step": 211000
},
{
"epoch": 1.3536000000000001,
"grad_norm": 0.6139352917671204,
"learning_rate": 1.6172800000000002e-05,
"loss": 0.2501,
"step": 211500
},
{
"epoch": 1.3568,
"grad_norm": 0.5434224605560303,
"learning_rate": 1.60928e-05,
"loss": 0.2583,
"step": 212000
},
{
"epoch": 1.3599999999999999,
"grad_norm": 0.5723361372947693,
"learning_rate": 1.60128e-05,
"loss": 0.2582,
"step": 212500
},
{
"epoch": 1.3632,
"grad_norm": 0.5621342658996582,
"learning_rate": 1.593296e-05,
"loss": 0.2583,
"step": 213000
},
{
"epoch": 1.3664,
"grad_norm": 0.5716707706451416,
"learning_rate": 1.585296e-05,
"loss": 0.2548,
"step": 213500
},
{
"epoch": 1.3696,
"grad_norm": 0.6344952583312988,
"learning_rate": 1.577296e-05,
"loss": 0.2526,
"step": 214000
},
{
"epoch": 1.3728,
"grad_norm": 0.6360082030296326,
"learning_rate": 1.569296e-05,
"loss": 0.259,
"step": 214500
},
{
"epoch": 1.376,
"grad_norm": 0.5400614142417908,
"learning_rate": 1.5612960000000002e-05,
"loss": 0.26,
"step": 215000
},
{
"epoch": 1.3792,
"grad_norm": 0.6992815732955933,
"learning_rate": 1.5532960000000002e-05,
"loss": 0.259,
"step": 215500
},
{
"epoch": 1.3824,
"grad_norm": 0.4903436601161957,
"learning_rate": 1.545296e-05,
"loss": 0.2582,
"step": 216000
},
{
"epoch": 1.3856,
"grad_norm": 0.5602136850357056,
"learning_rate": 1.537296e-05,
"loss": 0.2563,
"step": 216500
},
{
"epoch": 1.3888,
"grad_norm": 0.5858916640281677,
"learning_rate": 1.529296e-05,
"loss": 0.2553,
"step": 217000
},
{
"epoch": 1.392,
"grad_norm": 0.438550740480423,
"learning_rate": 1.521312e-05,
"loss": 0.2567,
"step": 217500
},
{
"epoch": 1.3952,
"grad_norm": 0.5660952925682068,
"learning_rate": 1.513312e-05,
"loss": 0.2552,
"step": 218000
},
{
"epoch": 1.3984,
"grad_norm": 0.6139314770698547,
"learning_rate": 1.5053120000000001e-05,
"loss": 0.2506,
"step": 218500
},
{
"epoch": 1.4016,
"grad_norm": 0.5470092296600342,
"learning_rate": 1.4973120000000002e-05,
"loss": 0.2549,
"step": 219000
},
{
"epoch": 1.4048,
"grad_norm": 0.5565882325172424,
"learning_rate": 1.489328e-05,
"loss": 0.256,
"step": 219500
},
{
"epoch": 1.408,
"grad_norm": 0.4755209684371948,
"learning_rate": 1.4813280000000001e-05,
"loss": 0.2571,
"step": 220000
},
{
"epoch": 1.4112,
"grad_norm": 0.5266921520233154,
"learning_rate": 1.4733280000000002e-05,
"loss": 0.2546,
"step": 220500
},
{
"epoch": 1.4144,
"grad_norm": 0.5858850479125977,
"learning_rate": 1.465328e-05,
"loss": 0.251,
"step": 221000
},
{
"epoch": 1.4176,
"grad_norm": 0.5382100343704224,
"learning_rate": 1.457328e-05,
"loss": 0.254,
"step": 221500
},
{
"epoch": 1.4208,
"grad_norm": 0.6082443594932556,
"learning_rate": 1.4493280000000001e-05,
"loss": 0.2549,
"step": 222000
},
{
"epoch": 1.424,
"grad_norm": 0.56458979845047,
"learning_rate": 1.4413440000000001e-05,
"loss": 0.2556,
"step": 222500
},
{
"epoch": 1.4272,
"grad_norm": 0.5702414512634277,
"learning_rate": 1.433344e-05,
"loss": 0.2523,
"step": 223000
},
{
"epoch": 1.4304000000000001,
"grad_norm": 0.5704798102378845,
"learning_rate": 1.4253440000000001e-05,
"loss": 0.2539,
"step": 223500
},
{
"epoch": 1.4336,
"grad_norm": 0.675832211971283,
"learning_rate": 1.4173440000000002e-05,
"loss": 0.256,
"step": 224000
},
{
"epoch": 1.4368,
"grad_norm": 0.8129279017448425,
"learning_rate": 1.4093439999999999e-05,
"loss": 0.2533,
"step": 224500
},
{
"epoch": 1.44,
"grad_norm": 0.6167120933532715,
"learning_rate": 1.401344e-05,
"loss": 0.254,
"step": 225000
},
{
"epoch": 1.4432,
"grad_norm": 0.6647797226905823,
"learning_rate": 1.39336e-05,
"loss": 0.2497,
"step": 225500
},
{
"epoch": 1.4464000000000001,
"grad_norm": 0.5046721696853638,
"learning_rate": 1.38536e-05,
"loss": 0.2513,
"step": 226000
},
{
"epoch": 1.4496,
"grad_norm": 0.5725312232971191,
"learning_rate": 1.37736e-05,
"loss": 0.2547,
"step": 226500
},
{
"epoch": 1.4527999999999999,
"grad_norm": 0.514900803565979,
"learning_rate": 1.36936e-05,
"loss": 0.2504,
"step": 227000
},
{
"epoch": 1.456,
"grad_norm": 0.3646963834762573,
"learning_rate": 1.3613600000000001e-05,
"loss": 0.2548,
"step": 227500
},
{
"epoch": 1.4592,
"grad_norm": 0.6462276577949524,
"learning_rate": 1.3533600000000002e-05,
"loss": 0.2537,
"step": 228000
},
{
"epoch": 1.4624,
"grad_norm": 0.5525985956192017,
"learning_rate": 1.34536e-05,
"loss": 0.253,
"step": 228500
},
{
"epoch": 1.4656,
"grad_norm": 0.5146868824958801,
"learning_rate": 1.33736e-05,
"loss": 0.2531,
"step": 229000
},
{
"epoch": 1.4687999999999999,
"grad_norm": 0.6087847948074341,
"learning_rate": 1.329376e-05,
"loss": 0.2517,
"step": 229500
},
{
"epoch": 1.472,
"grad_norm": 0.5387943387031555,
"learning_rate": 1.321376e-05,
"loss": 0.2514,
"step": 230000
},
{
"epoch": 1.4752,
"grad_norm": 0.5926588177680969,
"learning_rate": 1.313376e-05,
"loss": 0.2544,
"step": 230500
},
{
"epoch": 1.4784,
"grad_norm": 0.5444154143333435,
"learning_rate": 1.3053760000000001e-05,
"loss": 0.253,
"step": 231000
},
{
"epoch": 1.4816,
"grad_norm": 0.5707711577415466,
"learning_rate": 1.2973760000000002e-05,
"loss": 0.2509,
"step": 231500
},
{
"epoch": 1.4848,
"grad_norm": 0.5120610594749451,
"learning_rate": 1.2893760000000002e-05,
"loss": 0.2535,
"step": 232000
},
{
"epoch": 1.488,
"grad_norm": 0.6814270615577698,
"learning_rate": 1.281392e-05,
"loss": 0.2542,
"step": 232500
},
{
"epoch": 1.4912,
"grad_norm": 0.5387424826622009,
"learning_rate": 1.2733920000000002e-05,
"loss": 0.2559,
"step": 233000
},
{
"epoch": 1.4944,
"grad_norm": 0.6061420440673828,
"learning_rate": 1.2653919999999999e-05,
"loss": 0.2492,
"step": 233500
},
{
"epoch": 1.4976,
"grad_norm": 0.5238478183746338,
"learning_rate": 1.257392e-05,
"loss": 0.2495,
"step": 234000
},
{
"epoch": 1.5008,
"grad_norm": 0.6245620846748352,
"learning_rate": 1.2494080000000002e-05,
"loss": 0.2503,
"step": 234500
},
{
"epoch": 1.504,
"grad_norm": 0.594336211681366,
"learning_rate": 1.241408e-05,
"loss": 0.2526,
"step": 235000
},
{
"epoch": 1.5072,
"grad_norm": 0.6665235161781311,
"learning_rate": 1.233424e-05,
"loss": 0.2565,
"step": 235500
},
{
"epoch": 1.5104,
"grad_norm": 0.4540468454360962,
"learning_rate": 1.225424e-05,
"loss": 0.2509,
"step": 236000
},
{
"epoch": 1.5135999999999998,
"grad_norm": 0.48204490542411804,
"learning_rate": 1.217424e-05,
"loss": 0.2566,
"step": 236500
},
{
"epoch": 1.5168,
"grad_norm": 0.7209044098854065,
"learning_rate": 1.2094240000000001e-05,
"loss": 0.2485,
"step": 237000
},
{
"epoch": 1.52,
"grad_norm": 0.5661574006080627,
"learning_rate": 1.201424e-05,
"loss": 0.2496,
"step": 237500
},
{
"epoch": 1.5232,
"grad_norm": 0.4637988805770874,
"learning_rate": 1.1934240000000001e-05,
"loss": 0.2498,
"step": 238000
},
{
"epoch": 1.5264,
"grad_norm": 0.5440483093261719,
"learning_rate": 1.1854240000000002e-05,
"loss": 0.2539,
"step": 238500
},
{
"epoch": 1.5295999999999998,
"grad_norm": 0.6143088936805725,
"learning_rate": 1.1774240000000001e-05,
"loss": 0.2507,
"step": 239000
},
{
"epoch": 1.5328,
"grad_norm": 0.553655207157135,
"learning_rate": 1.1694400000000001e-05,
"loss": 0.2494,
"step": 239500
},
{
"epoch": 1.536,
"grad_norm": 0.5812162160873413,
"learning_rate": 1.16144e-05,
"loss": 0.2494,
"step": 240000
},
{
"epoch": 1.5392000000000001,
"grad_norm": 0.4919438660144806,
"learning_rate": 1.1534400000000001e-05,
"loss": 0.2471,
"step": 240500
},
{
"epoch": 1.5424,
"grad_norm": 0.6260576844215393,
"learning_rate": 1.1454400000000002e-05,
"loss": 0.2518,
"step": 241000
},
{
"epoch": 1.5455999999999999,
"grad_norm": 0.6452062726020813,
"learning_rate": 1.137456e-05,
"loss": 0.253,
"step": 241500
},
{
"epoch": 1.5488,
"grad_norm": 0.5557950139045715,
"learning_rate": 1.1294559999999999e-05,
"loss": 0.2553,
"step": 242000
},
{
"epoch": 1.552,
"grad_norm": 0.6445254683494568,
"learning_rate": 1.121456e-05,
"loss": 0.2513,
"step": 242500
},
{
"epoch": 1.5552000000000001,
"grad_norm": 0.5771984457969666,
"learning_rate": 1.113456e-05,
"loss": 0.2518,
"step": 243000
},
{
"epoch": 1.5584,
"grad_norm": 0.48172786831855774,
"learning_rate": 1.105456e-05,
"loss": 0.2529,
"step": 243500
},
{
"epoch": 1.5615999999999999,
"grad_norm": 0.5962732434272766,
"learning_rate": 1.0974720000000002e-05,
"loss": 0.2552,
"step": 244000
},
{
"epoch": 1.5648,
"grad_norm": 0.5713253617286682,
"learning_rate": 1.089472e-05,
"loss": 0.2518,
"step": 244500
},
{
"epoch": 1.568,
"grad_norm": 0.7049676775932312,
"learning_rate": 1.081472e-05,
"loss": 0.2503,
"step": 245000
},
{
"epoch": 1.5712000000000002,
"grad_norm": 0.5562995076179504,
"learning_rate": 1.073472e-05,
"loss": 0.2534,
"step": 245500
},
{
"epoch": 1.5744,
"grad_norm": 0.5492623448371887,
"learning_rate": 1.065472e-05,
"loss": 0.2508,
"step": 246000
},
{
"epoch": 1.5776,
"grad_norm": 0.6449033617973328,
"learning_rate": 1.057472e-05,
"loss": 0.2506,
"step": 246500
},
{
"epoch": 1.5808,
"grad_norm": 0.5232768058776855,
"learning_rate": 1.049488e-05,
"loss": 0.2497,
"step": 247000
},
{
"epoch": 1.584,
"grad_norm": 0.5512565970420837,
"learning_rate": 1.0414880000000001e-05,
"loss": 0.2534,
"step": 247500
},
{
"epoch": 1.5872000000000002,
"grad_norm": 0.48962149024009705,
"learning_rate": 1.033488e-05,
"loss": 0.2552,
"step": 248000
},
{
"epoch": 1.5904,
"grad_norm": 0.5635197162628174,
"learning_rate": 1.025488e-05,
"loss": 0.251,
"step": 248500
},
{
"epoch": 1.5936,
"grad_norm": 0.5858097076416016,
"learning_rate": 1.0175040000000001e-05,
"loss": 0.2521,
"step": 249000
},
{
"epoch": 1.5968,
"grad_norm": 0.5749566555023193,
"learning_rate": 1.0095200000000001e-05,
"loss": 0.2542,
"step": 249500
},
{
"epoch": 1.6,
"grad_norm": 0.6057468056678772,
"learning_rate": 1.00152e-05,
"loss": 0.249,
"step": 250000
},
{
"epoch": 1.6032,
"grad_norm": 0.68129962682724,
"learning_rate": 9.9352e-06,
"loss": 0.2496,
"step": 250500
},
{
"epoch": 1.6064,
"grad_norm": 0.5518680810928345,
"learning_rate": 9.8552e-06,
"loss": 0.2483,
"step": 251000
},
{
"epoch": 1.6096,
"grad_norm": 0.7354257702827454,
"learning_rate": 9.775200000000001e-06,
"loss": 0.251,
"step": 251500
},
{
"epoch": 1.6128,
"grad_norm": 0.5537115335464478,
"learning_rate": 9.6952e-06,
"loss": 0.2519,
"step": 252000
},
{
"epoch": 1.616,
"grad_norm": 0.5443572402000427,
"learning_rate": 9.615360000000002e-06,
"loss": 0.2505,
"step": 252500
},
{
"epoch": 1.6192,
"grad_norm": 0.8157851099967957,
"learning_rate": 9.53536e-06,
"loss": 0.2456,
"step": 253000
},
{
"epoch": 1.6223999999999998,
"grad_norm": 0.5709113478660583,
"learning_rate": 9.455520000000001e-06,
"loss": 0.2511,
"step": 253500
},
{
"epoch": 1.6256,
"grad_norm": 0.5266199707984924,
"learning_rate": 9.37552e-06,
"loss": 0.2523,
"step": 254000
},
{
"epoch": 1.6288,
"grad_norm": 0.6796950697898865,
"learning_rate": 9.29552e-06,
"loss": 0.2529,
"step": 254500
},
{
"epoch": 1.6320000000000001,
"grad_norm": 0.5162604451179504,
"learning_rate": 9.215520000000002e-06,
"loss": 0.2486,
"step": 255000
},
{
"epoch": 1.6352,
"grad_norm": 0.5577069520950317,
"learning_rate": 9.13552e-06,
"loss": 0.2501,
"step": 255500
},
{
"epoch": 1.6383999999999999,
"grad_norm": 0.5930905342102051,
"learning_rate": 9.055520000000001e-06,
"loss": 0.2505,
"step": 256000
},
{
"epoch": 1.6416,
"grad_norm": 0.5219632983207703,
"learning_rate": 8.97552e-06,
"loss": 0.2499,
"step": 256500
},
{
"epoch": 1.6448,
"grad_norm": 0.5385752320289612,
"learning_rate": 8.89552e-06,
"loss": 0.2473,
"step": 257000
},
{
"epoch": 1.6480000000000001,
"grad_norm": 0.5498505234718323,
"learning_rate": 8.81552e-06,
"loss": 0.2513,
"step": 257500
},
{
"epoch": 1.6512,
"grad_norm": 0.5780929923057556,
"learning_rate": 8.73552e-06,
"loss": 0.2488,
"step": 258000
},
{
"epoch": 1.6543999999999999,
"grad_norm": 0.6167399883270264,
"learning_rate": 8.65552e-06,
"loss": 0.2501,
"step": 258500
},
{
"epoch": 1.6576,
"grad_norm": 0.6829573512077332,
"learning_rate": 8.575520000000001e-06,
"loss": 0.2477,
"step": 259000
},
{
"epoch": 1.6608,
"grad_norm": 0.4874655604362488,
"learning_rate": 8.49552e-06,
"loss": 0.2507,
"step": 259500
},
{
"epoch": 1.6640000000000001,
"grad_norm": 0.5769158601760864,
"learning_rate": 8.41568e-06,
"loss": 0.2477,
"step": 260000
},
{
"epoch": 1.6672,
"grad_norm": 0.45717403292655945,
"learning_rate": 8.335679999999999e-06,
"loss": 0.2515,
"step": 260500
},
{
"epoch": 1.6703999999999999,
"grad_norm": 0.5851497650146484,
"learning_rate": 8.25568e-06,
"loss": 0.2453,
"step": 261000
},
{
"epoch": 1.6736,
"grad_norm": 0.7223703265190125,
"learning_rate": 8.17568e-06,
"loss": 0.2534,
"step": 261500
},
{
"epoch": 1.6768,
"grad_norm": 0.5290210843086243,
"learning_rate": 8.09584e-06,
"loss": 0.2468,
"step": 262000
},
{
"epoch": 1.6800000000000002,
"grad_norm": 0.5959377884864807,
"learning_rate": 8.015999999999999e-06,
"loss": 0.2498,
"step": 262500
},
{
"epoch": 1.6832,
"grad_norm": 0.5404760241508484,
"learning_rate": 7.936e-06,
"loss": 0.2485,
"step": 263000
},
{
"epoch": 1.6864,
"grad_norm": 0.496378093957901,
"learning_rate": 7.856e-06,
"loss": 0.2474,
"step": 263500
},
{
"epoch": 1.6896,
"grad_norm": 0.54584801197052,
"learning_rate": 7.776e-06,
"loss": 0.2498,
"step": 264000
},
{
"epoch": 1.6928,
"grad_norm": 0.5465365052223206,
"learning_rate": 7.696160000000002e-06,
"loss": 0.2468,
"step": 264500
},
{
"epoch": 1.696,
"grad_norm": 0.5857728719711304,
"learning_rate": 7.61616e-06,
"loss": 0.2485,
"step": 265000
},
{
"epoch": 1.6992,
"grad_norm": 0.5276440382003784,
"learning_rate": 7.5361600000000005e-06,
"loss": 0.2467,
"step": 265500
},
{
"epoch": 1.7024,
"grad_norm": 0.5197107195854187,
"learning_rate": 7.456160000000001e-06,
"loss": 0.2457,
"step": 266000
},
{
"epoch": 1.7056,
"grad_norm": 0.6030395030975342,
"learning_rate": 7.37616e-06,
"loss": 0.2449,
"step": 266500
},
{
"epoch": 1.7088,
"grad_norm": 0.5553884506225586,
"learning_rate": 7.29616e-06,
"loss": 0.2459,
"step": 267000
},
{
"epoch": 1.712,
"grad_norm": 0.6518832445144653,
"learning_rate": 7.216160000000001e-06,
"loss": 0.2469,
"step": 267500
},
{
"epoch": 1.7151999999999998,
"grad_norm": 0.6981451511383057,
"learning_rate": 7.13616e-06,
"loss": 0.2493,
"step": 268000
},
{
"epoch": 1.7184,
"grad_norm": 0.6021608114242554,
"learning_rate": 7.056160000000001e-06,
"loss": 0.2477,
"step": 268500
},
{
"epoch": 1.7216,
"grad_norm": 0.6317922472953796,
"learning_rate": 6.976160000000001e-06,
"loss": 0.2461,
"step": 269000
},
{
"epoch": 1.7248,
"grad_norm": 0.6130341291427612,
"learning_rate": 6.89616e-06,
"loss": 0.2508,
"step": 269500
},
{
"epoch": 1.728,
"grad_norm": 0.6314118504524231,
"learning_rate": 6.816160000000001e-06,
"loss": 0.243,
"step": 270000
},
{
"epoch": 1.7311999999999999,
"grad_norm": 0.6070537567138672,
"learning_rate": 6.73616e-06,
"loss": 0.2472,
"step": 270500
},
{
"epoch": 1.7344,
"grad_norm": 0.5763754844665527,
"learning_rate": 6.6561600000000005e-06,
"loss": 0.2511,
"step": 271000
},
{
"epoch": 1.7376,
"grad_norm": 0.6849692463874817,
"learning_rate": 6.57632e-06,
"loss": 0.2461,
"step": 271500
},
{
"epoch": 1.7408000000000001,
"grad_norm": 0.6505193710327148,
"learning_rate": 6.4963200000000005e-06,
"loss": 0.2472,
"step": 272000
},
{
"epoch": 1.744,
"grad_norm": 0.5150639414787292,
"learning_rate": 6.4163200000000004e-06,
"loss": 0.2456,
"step": 272500
},
{
"epoch": 1.7471999999999999,
"grad_norm": 0.6367226839065552,
"learning_rate": 6.3363199999999995e-06,
"loss": 0.2478,
"step": 273000
},
{
"epoch": 1.7504,
"grad_norm": 0.6016091704368591,
"learning_rate": 6.25632e-06,
"loss": 0.2457,
"step": 273500
},
{
"epoch": 1.7536,
"grad_norm": 0.6344937682151794,
"learning_rate": 6.17632e-06,
"loss": 0.2449,
"step": 274000
},
{
"epoch": 1.7568000000000001,
"grad_norm": 0.6031948924064636,
"learning_rate": 6.09632e-06,
"loss": 0.2474,
"step": 274500
},
{
"epoch": 1.76,
"grad_norm": 0.6515588760375977,
"learning_rate": 6.01648e-06,
"loss": 0.2463,
"step": 275000
},
{
"epoch": 1.7631999999999999,
"grad_norm": 0.562818706035614,
"learning_rate": 5.93648e-06,
"loss": 0.2507,
"step": 275500
},
{
"epoch": 1.7664,
"grad_norm": 0.591066300868988,
"learning_rate": 5.85648e-06,
"loss": 0.2488,
"step": 276000
},
{
"epoch": 1.7696,
"grad_norm": 0.544698178768158,
"learning_rate": 5.77664e-06,
"loss": 0.2445,
"step": 276500
},
{
"epoch": 1.7728000000000002,
"grad_norm": 0.4837886691093445,
"learning_rate": 5.6968e-06,
"loss": 0.2497,
"step": 277000
},
{
"epoch": 1.776,
"grad_norm": 0.5405265092849731,
"learning_rate": 5.6168e-06,
"loss": 0.2489,
"step": 277500
},
{
"epoch": 1.7792,
"grad_norm": 0.560249924659729,
"learning_rate": 5.5368e-06,
"loss": 0.2466,
"step": 278000
},
{
"epoch": 1.7824,
"grad_norm": 0.5680164098739624,
"learning_rate": 5.4568e-06,
"loss": 0.2475,
"step": 278500
},
{
"epoch": 1.7856,
"grad_norm": 0.7152078747749329,
"learning_rate": 5.376800000000001e-06,
"loss": 0.2439,
"step": 279000
},
{
"epoch": 1.7888,
"grad_norm": 0.6013668775558472,
"learning_rate": 5.296800000000001e-06,
"loss": 0.2482,
"step": 279500
},
{
"epoch": 1.792,
"grad_norm": 0.5784064531326294,
"learning_rate": 5.2168e-06,
"loss": 0.2495,
"step": 280000
},
{
"epoch": 1.7952,
"grad_norm": 0.5531567931175232,
"learning_rate": 5.1368e-06,
"loss": 0.2497,
"step": 280500
},
{
"epoch": 1.7984,
"grad_norm": 0.5494315028190613,
"learning_rate": 5.0568000000000004e-06,
"loss": 0.2476,
"step": 281000
},
{
"epoch": 1.8016,
"grad_norm": 0.6254246830940247,
"learning_rate": 4.9768e-06,
"loss": 0.243,
"step": 281500
},
{
"epoch": 1.8048,
"grad_norm": 0.7309369444847107,
"learning_rate": 4.8969600000000005e-06,
"loss": 0.2428,
"step": 282000
},
{
"epoch": 1.808,
"grad_norm": 0.587374210357666,
"learning_rate": 4.81696e-06,
"loss": 0.253,
"step": 282500
},
{
"epoch": 1.8112,
"grad_norm": 0.5197418928146362,
"learning_rate": 4.73696e-06,
"loss": 0.2454,
"step": 283000
},
{
"epoch": 1.8144,
"grad_norm": 0.5393714308738708,
"learning_rate": 4.65696e-06,
"loss": 0.2442,
"step": 283500
},
{
"epoch": 1.8176,
"grad_norm": 0.6797386407852173,
"learning_rate": 4.57696e-06,
"loss": 0.2455,
"step": 284000
},
{
"epoch": 1.8208,
"grad_norm": 0.5192613005638123,
"learning_rate": 4.49696e-06,
"loss": 0.2457,
"step": 284500
},
{
"epoch": 1.8239999999999998,
"grad_norm": 0.5224815607070923,
"learning_rate": 4.416960000000001e-06,
"loss": 0.2442,
"step": 285000
},
{
"epoch": 1.8272,
"grad_norm": 0.5999212861061096,
"learning_rate": 4.336960000000001e-06,
"loss": 0.2465,
"step": 285500
},
{
"epoch": 1.8304,
"grad_norm": 0.6273928880691528,
"learning_rate": 4.25712e-06,
"loss": 0.2451,
"step": 286000
},
{
"epoch": 1.8336000000000001,
"grad_norm": 0.4545860290527344,
"learning_rate": 4.177120000000001e-06,
"loss": 0.2455,
"step": 286500
},
{
"epoch": 1.8368,
"grad_norm": 0.5125412344932556,
"learning_rate": 4.09712e-06,
"loss": 0.2471,
"step": 287000
},
{
"epoch": 1.8399999999999999,
"grad_norm": 0.6210908889770508,
"learning_rate": 4.01712e-06,
"loss": 0.2504,
"step": 287500
},
{
"epoch": 1.8432,
"grad_norm": 0.5454786419868469,
"learning_rate": 3.9371200000000005e-06,
"loss": 0.2459,
"step": 288000
},
{
"epoch": 1.8464,
"grad_norm": 0.4733567535877228,
"learning_rate": 3.8571200000000004e-06,
"loss": 0.2417,
"step": 288500
},
{
"epoch": 1.8496000000000001,
"grad_norm": 0.5461744666099548,
"learning_rate": 3.7772800000000005e-06,
"loss": 0.2448,
"step": 289000
},
{
"epoch": 1.8528,
"grad_norm": 0.5227847695350647,
"learning_rate": 3.69728e-06,
"loss": 0.2491,
"step": 289500
},
{
"epoch": 1.8559999999999999,
"grad_norm": 0.4567984938621521,
"learning_rate": 3.61728e-06,
"loss": 0.2491,
"step": 290000
},
{
"epoch": 1.8592,
"grad_norm": 0.634410560131073,
"learning_rate": 3.5372800000000003e-06,
"loss": 0.249,
"step": 290500
},
{
"epoch": 1.8624,
"grad_norm": 0.5894684195518494,
"learning_rate": 3.45728e-06,
"loss": 0.2468,
"step": 291000
},
{
"epoch": 1.8656000000000001,
"grad_norm": 0.6827466487884521,
"learning_rate": 3.37728e-06,
"loss": 0.2446,
"step": 291500
},
{
"epoch": 1.8688,
"grad_norm": 0.6229824423789978,
"learning_rate": 3.2972799999999996e-06,
"loss": 0.2487,
"step": 292000
},
{
"epoch": 1.8719999999999999,
"grad_norm": 0.5481498837471008,
"learning_rate": 3.2172800000000004e-06,
"loss": 0.2461,
"step": 292500
},
{
"epoch": 1.8752,
"grad_norm": 0.6237102746963501,
"learning_rate": 3.13728e-06,
"loss": 0.2474,
"step": 293000
},
{
"epoch": 1.8784,
"grad_norm": 0.6718310117721558,
"learning_rate": 3.0572800000000002e-06,
"loss": 0.2512,
"step": 293500
},
{
"epoch": 1.8816000000000002,
"grad_norm": 0.5450266003608704,
"learning_rate": 2.97728e-06,
"loss": 0.249,
"step": 294000
},
{
"epoch": 1.8848,
"grad_norm": 0.6037828326225281,
"learning_rate": 2.8974400000000002e-06,
"loss": 0.2447,
"step": 294500
},
{
"epoch": 1.888,
"grad_norm": 0.60282963514328,
"learning_rate": 2.81744e-06,
"loss": 0.2449,
"step": 295000
},
{
"epoch": 1.8912,
"grad_norm": 0.5670416355133057,
"learning_rate": 2.73744e-06,
"loss": 0.2424,
"step": 295500
},
{
"epoch": 1.8944,
"grad_norm": 0.6023501753807068,
"learning_rate": 2.6576e-06,
"loss": 0.2449,
"step": 296000
},
{
"epoch": 1.8976,
"grad_norm": 0.5216399431228638,
"learning_rate": 2.5776e-06,
"loss": 0.243,
"step": 296500
},
{
"epoch": 1.9008,
"grad_norm": 0.49217623472213745,
"learning_rate": 2.4976000000000004e-06,
"loss": 0.2432,
"step": 297000
},
{
"epoch": 1.904,
"grad_norm": 0.46074241399765015,
"learning_rate": 2.4176e-06,
"loss": 0.2434,
"step": 297500
},
{
"epoch": 1.9072,
"grad_norm": 0.5020151734352112,
"learning_rate": 2.3376000000000003e-06,
"loss": 0.2424,
"step": 298000
},
{
"epoch": 1.9104,
"grad_norm": 0.5782959461212158,
"learning_rate": 2.2576e-06,
"loss": 0.246,
"step": 298500
},
{
"epoch": 1.9136,
"grad_norm": 0.5627701282501221,
"learning_rate": 2.1777600000000003e-06,
"loss": 0.2483,
"step": 299000
},
{
"epoch": 1.9167999999999998,
"grad_norm": 0.5413541793823242,
"learning_rate": 2.09776e-06,
"loss": 0.2472,
"step": 299500
},
{
"epoch": 1.92,
"grad_norm": 0.5274430513381958,
"learning_rate": 2.01776e-06,
"loss": 0.2439,
"step": 300000
},
{
"epoch": 1.9232,
"grad_norm": 0.5475273728370667,
"learning_rate": 1.93776e-06,
"loss": 0.2442,
"step": 300500
},
{
"epoch": 1.9264000000000001,
"grad_norm": 0.42543721199035645,
"learning_rate": 1.85776e-06,
"loss": 0.2472,
"step": 301000
},
{
"epoch": 1.9296,
"grad_norm": 0.6353417634963989,
"learning_rate": 1.7777600000000001e-06,
"loss": 0.2478,
"step": 301500
},
{
"epoch": 1.9327999999999999,
"grad_norm": 0.6469547748565674,
"learning_rate": 1.69776e-06,
"loss": 0.2399,
"step": 302000
},
{
"epoch": 1.936,
"grad_norm": 0.5442044734954834,
"learning_rate": 1.6177600000000002e-06,
"loss": 0.2461,
"step": 302500
},
{
"epoch": 1.9392,
"grad_norm": 0.6031491756439209,
"learning_rate": 1.53776e-06,
"loss": 0.2463,
"step": 303000
},
{
"epoch": 1.9424000000000001,
"grad_norm": 0.6695773005485535,
"learning_rate": 1.4579200000000002e-06,
"loss": 0.2475,
"step": 303500
},
{
"epoch": 1.9456,
"grad_norm": 0.5614984631538391,
"learning_rate": 1.37792e-06,
"loss": 0.2442,
"step": 304000
},
{
"epoch": 1.9487999999999999,
"grad_norm": 0.5449358224868774,
"learning_rate": 1.2980800000000002e-06,
"loss": 0.2482,
"step": 304500
},
{
"epoch": 1.952,
"grad_norm": 0.4947919249534607,
"learning_rate": 1.2180800000000001e-06,
"loss": 0.2434,
"step": 305000
},
{
"epoch": 1.9552,
"grad_norm": 0.5914204716682434,
"learning_rate": 1.13808e-06,
"loss": 0.2449,
"step": 305500
},
{
"epoch": 1.9584000000000001,
"grad_norm": 0.5385975241661072,
"learning_rate": 1.05808e-06,
"loss": 0.2484,
"step": 306000
},
{
"epoch": 1.9616,
"grad_norm": 0.5446351170539856,
"learning_rate": 9.780799999999999e-07,
"loss": 0.2434,
"step": 306500
},
{
"epoch": 1.9647999999999999,
"grad_norm": 0.5600497722625732,
"learning_rate": 8.980800000000001e-07,
"loss": 0.2456,
"step": 307000
},
{
"epoch": 1.968,
"grad_norm": 0.46547964215278625,
"learning_rate": 8.1808e-07,
"loss": 0.2467,
"step": 307500
},
{
"epoch": 1.9712,
"grad_norm": 0.5435817241668701,
"learning_rate": 7.380800000000001e-07,
"loss": 0.25,
"step": 308000
}
],
"logging_steps": 500,
"max_steps": 312500,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.50047224233984e+18,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}