{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1923828125, "eval_steps": 24576, "global_step": 18912, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0172526041666666e-05, "eval_loss": 4.542403221130371, "eval_runtime": 144.6811, "eval_samples_per_second": 13.872, "eval_steps_per_second": 13.872, "step": 1 }, { "epoch": 5.0862630208333336e-05, "grad_norm": 45.55976867675781, "learning_rate": 2.5000000000000004e-07, "loss": 4.478, "step": 5 }, { "epoch": 0.00010172526041666667, "grad_norm": 44.849979400634766, "learning_rate": 5.000000000000001e-07, "loss": 4.381, "step": 10 }, { "epoch": 0.000152587890625, "grad_norm": 47.78006362915039, "learning_rate": 7.5e-07, "loss": 4.2726, "step": 15 }, { "epoch": 0.00020345052083333334, "grad_norm": 49.011878967285156, "learning_rate": 1.0000000000000002e-06, "loss": 4.4009, "step": 20 }, { "epoch": 0.0002543131510416667, "grad_norm": 40.69770050048828, "learning_rate": 1.25e-06, "loss": 4.4735, "step": 25 }, { "epoch": 0.00030517578125, "grad_norm": 34.35673522949219, "learning_rate": 1.5e-06, "loss": 4.2665, "step": 30 }, { "epoch": 0.0003560384114583333, "grad_norm": 52.81883239746094, "learning_rate": 1.75e-06, "loss": 4.423, "step": 35 }, { "epoch": 0.0004069010416666667, "grad_norm": 32.41872024536133, "learning_rate": 2.0000000000000003e-06, "loss": 4.6442, "step": 40 }, { "epoch": 0.000457763671875, "grad_norm": 37.73821258544922, "learning_rate": 2.25e-06, "loss": 4.2681, "step": 45 }, { "epoch": 0.0005086263020833334, "grad_norm": 37.740386962890625, "learning_rate": 2.5e-06, "loss": 4.6894, "step": 50 }, { "epoch": 0.0005594889322916666, "grad_norm": 50.862735748291016, "learning_rate": 2.7500000000000004e-06, "loss": 4.3243, "step": 55 }, { "epoch": 0.0006103515625, "grad_norm": 45.02497863769531, "learning_rate": 3e-06, "loss": 4.3276, "step": 60 }, { "epoch": 0.0006612141927083334, "grad_norm": 28.076095581054688, "learning_rate": 3.2500000000000002e-06, "loss": 3.8881, "step": 65 }, { "epoch": 0.0007120768229166666, "grad_norm": 27.940998077392578, "learning_rate": 3.5e-06, "loss": 4.2496, "step": 70 }, { "epoch": 0.000762939453125, "grad_norm": 26.482099533081055, "learning_rate": 3.7500000000000005e-06, "loss": 4.349, "step": 75 }, { "epoch": 0.0008138020833333334, "grad_norm": 32.36627960205078, "learning_rate": 4.000000000000001e-06, "loss": 4.423, "step": 80 }, { "epoch": 0.0008646647135416666, "grad_norm": 48.71563720703125, "learning_rate": 4.25e-06, "loss": 4.65, "step": 85 }, { "epoch": 0.00091552734375, "grad_norm": 25.62880516052246, "learning_rate": 4.5e-06, "loss": 4.2165, "step": 90 }, { "epoch": 0.0009663899739583334, "grad_norm": 33.9738655090332, "learning_rate": 4.75e-06, "loss": 4.1882, "step": 95 }, { "epoch": 0.0010172526041666667, "grad_norm": 24.544464111328125, "learning_rate": 5e-06, "loss": 3.7919, "step": 100 }, { "epoch": 0.001068115234375, "grad_norm": 38.953922271728516, "learning_rate": 4.999999968019047e-06, "loss": 4.5549, "step": 105 }, { "epoch": 0.0011189778645833333, "grad_norm": 30.3071346282959, "learning_rate": 4.999999872076186e-06, "loss": 4.3079, "step": 110 }, { "epoch": 0.0011698404947916667, "grad_norm": 26.42899513244629, "learning_rate": 4.999999712171422e-06, "loss": 4.3139, "step": 115 }, { "epoch": 0.001220703125, "grad_norm": 18.36640739440918, "learning_rate": 4.999999488304758e-06, "loss": 4.1012, "step": 120 }, { "epoch": 0.0012715657552083333, "grad_norm": 22.997697830200195, "learning_rate": 4.999999200476199e-06, "loss": 4.0616, "step": 125 }, { "epoch": 0.0013224283854166667, "grad_norm": 18.358749389648438, "learning_rate": 4.999998848685752e-06, "loss": 4.0783, "step": 130 }, { "epoch": 0.001373291015625, "grad_norm": 16.22219467163086, "learning_rate": 4.999998432933428e-06, "loss": 4.14, "step": 135 }, { "epoch": 0.0014241536458333333, "grad_norm": 31.525222778320312, "learning_rate": 4.999997953219238e-06, "loss": 4.268, "step": 140 }, { "epoch": 0.0014750162760416667, "grad_norm": 16.66249656677246, "learning_rate": 4.999997409543191e-06, "loss": 3.7311, "step": 145 }, { "epoch": 0.00152587890625, "grad_norm": 19.815828323364258, "learning_rate": 4.999996801905304e-06, "loss": 4.0011, "step": 150 }, { "epoch": 0.0015767415364583333, "grad_norm": 27.2629337310791, "learning_rate": 4.9999961303055906e-06, "loss": 4.4585, "step": 155 }, { "epoch": 0.0016276041666666667, "grad_norm": 20.656837463378906, "learning_rate": 4.99999539474407e-06, "loss": 3.6024, "step": 160 }, { "epoch": 0.001678466796875, "grad_norm": 32.12629699707031, "learning_rate": 4.999994595220758e-06, "loss": 3.6387, "step": 165 }, { "epoch": 0.0017293294270833333, "grad_norm": 17.384239196777344, "learning_rate": 4.9999937317356776e-06, "loss": 3.7885, "step": 170 }, { "epoch": 0.0017801920572916667, "grad_norm": 16.839290618896484, "learning_rate": 4.99999280428885e-06, "loss": 3.9351, "step": 175 }, { "epoch": 0.0018310546875, "grad_norm": 17.16065216064453, "learning_rate": 4.9999918128803e-06, "loss": 3.829, "step": 180 }, { "epoch": 0.0018819173177083333, "grad_norm": 19.670543670654297, "learning_rate": 4.999990757510052e-06, "loss": 3.9205, "step": 185 }, { "epoch": 0.0019327799479166667, "grad_norm": 19.776790618896484, "learning_rate": 4.999989638178131e-06, "loss": 4.0733, "step": 190 }, { "epoch": 0.001983642578125, "grad_norm": 17.121217727661133, "learning_rate": 4.99998845488457e-06, "loss": 3.9968, "step": 195 }, { "epoch": 0.0020345052083333335, "grad_norm": 18.365943908691406, "learning_rate": 4.999987207629396e-06, "loss": 3.6687, "step": 200 }, { "epoch": 0.0020853678385416665, "grad_norm": 17.672025680541992, "learning_rate": 4.9999858964126415e-06, "loss": 3.9062, "step": 205 }, { "epoch": 0.00213623046875, "grad_norm": 14.973541259765625, "learning_rate": 4.9999845212343415e-06, "loss": 3.4274, "step": 210 }, { "epoch": 0.0021870930989583335, "grad_norm": 20.310712814331055, "learning_rate": 4.999983082094529e-06, "loss": 3.8899, "step": 215 }, { "epoch": 0.0022379557291666665, "grad_norm": 30.07281494140625, "learning_rate": 4.999981578993242e-06, "loss": 4.083, "step": 220 }, { "epoch": 0.002288818359375, "grad_norm": 14.967458724975586, "learning_rate": 4.999980011930519e-06, "loss": 3.8021, "step": 225 }, { "epoch": 0.0023396809895833335, "grad_norm": 18.596702575683594, "learning_rate": 4.999978380906401e-06, "loss": 4.2005, "step": 230 }, { "epoch": 0.0023905436197916665, "grad_norm": 14.92667293548584, "learning_rate": 4.999976685920927e-06, "loss": 3.7878, "step": 235 }, { "epoch": 0.00244140625, "grad_norm": 26.372528076171875, "learning_rate": 4.999974926974142e-06, "loss": 3.906, "step": 240 }, { "epoch": 0.0024922688802083335, "grad_norm": 11.505077362060547, "learning_rate": 4.9999731040660925e-06, "loss": 3.7275, "step": 245 }, { "epoch": 0.0025431315104166665, "grad_norm": 19.219104766845703, "learning_rate": 4.999971217196824e-06, "loss": 3.6951, "step": 250 }, { "epoch": 0.002593994140625, "grad_norm": 20.686763763427734, "learning_rate": 4.999969266366383e-06, "loss": 4.2166, "step": 255 }, { "epoch": 0.0026448567708333335, "grad_norm": 14.91057300567627, "learning_rate": 4.999967251574821e-06, "loss": 4.1096, "step": 260 }, { "epoch": 0.0026957194010416665, "grad_norm": 14.4781494140625, "learning_rate": 4.99996517282219e-06, "loss": 3.9429, "step": 265 }, { "epoch": 0.00274658203125, "grad_norm": 16.21483039855957, "learning_rate": 4.9999630301085425e-06, "loss": 4.5956, "step": 270 }, { "epoch": 0.0027974446614583335, "grad_norm": 20.831764221191406, "learning_rate": 4.9999608234339336e-06, "loss": 4.0729, "step": 275 }, { "epoch": 0.0028483072916666665, "grad_norm": 16.851608276367188, "learning_rate": 4.999958552798419e-06, "loss": 3.9821, "step": 280 }, { "epoch": 0.002899169921875, "grad_norm": 16.131776809692383, "learning_rate": 4.999956218202058e-06, "loss": 3.912, "step": 285 }, { "epoch": 0.0029500325520833335, "grad_norm": 22.348773956298828, "learning_rate": 4.9999538196449096e-06, "loss": 3.7261, "step": 290 }, { "epoch": 0.0030008951822916665, "grad_norm": 27.426599502563477, "learning_rate": 4.9999513571270355e-06, "loss": 3.9633, "step": 295 }, { "epoch": 0.0030517578125, "grad_norm": 19.74297523498535, "learning_rate": 4.999948830648497e-06, "loss": 3.6241, "step": 300 }, { "epoch": 0.0031026204427083335, "grad_norm": 26.0491943359375, "learning_rate": 4.999946240209362e-06, "loss": 3.8093, "step": 305 }, { "epoch": 0.0031534830729166665, "grad_norm": 17.24481964111328, "learning_rate": 4.999943585809694e-06, "loss": 3.9762, "step": 310 }, { "epoch": 0.003204345703125, "grad_norm": 16.89327621459961, "learning_rate": 4.999940867449562e-06, "loss": 3.5947, "step": 315 }, { "epoch": 0.0032552083333333335, "grad_norm": 19.533809661865234, "learning_rate": 4.999938085129036e-06, "loss": 3.5673, "step": 320 }, { "epoch": 0.0033060709635416665, "grad_norm": 16.76618766784668, "learning_rate": 4.999935238848187e-06, "loss": 3.8095, "step": 325 }, { "epoch": 0.00335693359375, "grad_norm": 15.267281532287598, "learning_rate": 4.999932328607087e-06, "loss": 3.8809, "step": 330 }, { "epoch": 0.0034077962239583335, "grad_norm": 14.550617218017578, "learning_rate": 4.999929354405811e-06, "loss": 3.9913, "step": 335 }, { "epoch": 0.0034586588541666665, "grad_norm": 22.565776824951172, "learning_rate": 4.999926316244434e-06, "loss": 4.0828, "step": 340 }, { "epoch": 0.003509521484375, "grad_norm": 14.583137512207031, "learning_rate": 4.999923214123036e-06, "loss": 4.0128, "step": 345 }, { "epoch": 0.0035603841145833335, "grad_norm": 14.065851211547852, "learning_rate": 4.999920048041694e-06, "loss": 3.5976, "step": 350 }, { "epoch": 0.0036112467447916665, "grad_norm": 22.18216323852539, "learning_rate": 4.999916818000491e-06, "loss": 3.8117, "step": 355 }, { "epoch": 0.003662109375, "grad_norm": 21.645153045654297, "learning_rate": 4.9999135239995076e-06, "loss": 3.5906, "step": 360 }, { "epoch": 0.0037129720052083335, "grad_norm": 13.195281028747559, "learning_rate": 4.9999101660388305e-06, "loss": 3.7585, "step": 365 }, { "epoch": 0.0037638346354166665, "grad_norm": 10.816890716552734, "learning_rate": 4.999906744118545e-06, "loss": 3.7736, "step": 370 }, { "epoch": 0.003814697265625, "grad_norm": 14.775542259216309, "learning_rate": 4.999903258238736e-06, "loss": 3.6809, "step": 375 }, { "epoch": 0.0038655598958333335, "grad_norm": 22.4163761138916, "learning_rate": 4.999899708399496e-06, "loss": 3.6198, "step": 380 }, { "epoch": 0.003916422526041667, "grad_norm": 20.934917449951172, "learning_rate": 4.999896094600914e-06, "loss": 3.7003, "step": 385 }, { "epoch": 0.00396728515625, "grad_norm": 13.397461891174316, "learning_rate": 4.999892416843083e-06, "loss": 3.9415, "step": 390 }, { "epoch": 0.004018147786458333, "grad_norm": 18.704856872558594, "learning_rate": 4.999888675126097e-06, "loss": 3.7429, "step": 395 }, { "epoch": 0.004069010416666667, "grad_norm": 14.650379180908203, "learning_rate": 4.9998848694500526e-06, "loss": 3.8455, "step": 400 }, { "epoch": 0.004119873046875, "grad_norm": 22.322834014892578, "learning_rate": 4.999880999815045e-06, "loss": 3.9125, "step": 405 }, { "epoch": 0.004170735677083333, "grad_norm": 16.105815887451172, "learning_rate": 4.999877066221175e-06, "loss": 3.5135, "step": 410 }, { "epoch": 0.004221598307291667, "grad_norm": 10.558919906616211, "learning_rate": 4.999873068668544e-06, "loss": 3.6677, "step": 415 }, { "epoch": 0.0042724609375, "grad_norm": 10.016486167907715, "learning_rate": 4.999869007157252e-06, "loss": 3.5761, "step": 420 }, { "epoch": 0.004323323567708333, "grad_norm": 16.01995849609375, "learning_rate": 4.999864881687404e-06, "loss": 3.9739, "step": 425 }, { "epoch": 0.004374186197916667, "grad_norm": 21.719324111938477, "learning_rate": 4.999860692259105e-06, "loss": 3.9211, "step": 430 }, { "epoch": 0.004425048828125, "grad_norm": 42.85234069824219, "learning_rate": 4.999856438872463e-06, "loss": 3.9438, "step": 435 }, { "epoch": 0.004475911458333333, "grad_norm": 13.891044616699219, "learning_rate": 4.999852121527588e-06, "loss": 3.6916, "step": 440 }, { "epoch": 0.004526774088541667, "grad_norm": 18.872058868408203, "learning_rate": 4.999847740224587e-06, "loss": 4.0558, "step": 445 }, { "epoch": 0.00457763671875, "grad_norm": 16.054645538330078, "learning_rate": 4.999843294963576e-06, "loss": 3.8301, "step": 450 }, { "epoch": 0.004628499348958333, "grad_norm": 14.235097885131836, "learning_rate": 4.999838785744665e-06, "loss": 3.745, "step": 455 }, { "epoch": 0.004679361979166667, "grad_norm": 21.37851905822754, "learning_rate": 4.999834212567972e-06, "loss": 3.8782, "step": 460 }, { "epoch": 0.004730224609375, "grad_norm": 17.109268188476562, "learning_rate": 4.999829575433613e-06, "loss": 3.4868, "step": 465 }, { "epoch": 0.004781087239583333, "grad_norm": 16.911109924316406, "learning_rate": 4.999824874341708e-06, "loss": 3.4223, "step": 470 }, { "epoch": 0.004831949869791667, "grad_norm": 9.432239532470703, "learning_rate": 4.9998201092923746e-06, "loss": 3.3816, "step": 475 }, { "epoch": 0.0048828125, "grad_norm": 18.146076202392578, "learning_rate": 4.999815280285737e-06, "loss": 4.0282, "step": 480 }, { "epoch": 0.004933675130208333, "grad_norm": 26.35177993774414, "learning_rate": 4.999810387321917e-06, "loss": 3.2918, "step": 485 }, { "epoch": 0.004984537760416667, "grad_norm": 17.22603988647461, "learning_rate": 4.9998054304010425e-06, "loss": 3.7617, "step": 490 }, { "epoch": 0.005035400390625, "grad_norm": 15.333989143371582, "learning_rate": 4.999800409523237e-06, "loss": 4.2317, "step": 495 }, { "epoch": 0.005086263020833333, "grad_norm": 19.130863189697266, "learning_rate": 4.999795324688631e-06, "loss": 3.8223, "step": 500 }, { "epoch": 0.005137125651041667, "grad_norm": 12.130922317504883, "learning_rate": 4.999790175897355e-06, "loss": 3.7448, "step": 505 }, { "epoch": 0.00518798828125, "grad_norm": 18.195701599121094, "learning_rate": 4.999784963149539e-06, "loss": 3.8377, "step": 510 }, { "epoch": 0.005238850911458333, "grad_norm": 13.157418251037598, "learning_rate": 4.999779686445318e-06, "loss": 4.2839, "step": 515 }, { "epoch": 0.005289713541666667, "grad_norm": 15.427362442016602, "learning_rate": 4.999774345784825e-06, "loss": 3.6147, "step": 520 }, { "epoch": 0.005340576171875, "grad_norm": 18.931623458862305, "learning_rate": 4.9997689411681986e-06, "loss": 4.246, "step": 525 }, { "epoch": 0.005391438802083333, "grad_norm": 16.495134353637695, "learning_rate": 4.9997634725955756e-06, "loss": 3.811, "step": 530 }, { "epoch": 0.005442301432291667, "grad_norm": 17.250377655029297, "learning_rate": 4.999757940067098e-06, "loss": 3.9962, "step": 535 }, { "epoch": 0.0054931640625, "grad_norm": 19.960880279541016, "learning_rate": 4.999752343582905e-06, "loss": 3.8214, "step": 540 }, { "epoch": 0.005544026692708333, "grad_norm": 21.063762664794922, "learning_rate": 4.999746683143141e-06, "loss": 3.5984, "step": 545 }, { "epoch": 0.005594889322916667, "grad_norm": 15.372790336608887, "learning_rate": 4.999740958747951e-06, "loss": 3.8445, "step": 550 }, { "epoch": 0.005645751953125, "grad_norm": 12.668439865112305, "learning_rate": 4.9997351703974804e-06, "loss": 4.1112, "step": 555 }, { "epoch": 0.005696614583333333, "grad_norm": 14.557056427001953, "learning_rate": 4.999729318091878e-06, "loss": 3.8517, "step": 560 }, { "epoch": 0.005747477213541667, "grad_norm": 18.399843215942383, "learning_rate": 4.9997234018312945e-06, "loss": 3.5826, "step": 565 }, { "epoch": 0.00579833984375, "grad_norm": 16.69244956970215, "learning_rate": 4.9997174216158795e-06, "loss": 3.6708, "step": 570 }, { "epoch": 0.005849202473958333, "grad_norm": 20.53217887878418, "learning_rate": 4.9997113774457865e-06, "loss": 4.0217, "step": 575 }, { "epoch": 0.005900065104166667, "grad_norm": 15.122044563293457, "learning_rate": 4.999705269321171e-06, "loss": 3.8238, "step": 580 }, { "epoch": 0.005950927734375, "grad_norm": 9.872886657714844, "learning_rate": 4.999699097242189e-06, "loss": 3.4889, "step": 585 }, { "epoch": 0.006001790364583333, "grad_norm": 14.158080101013184, "learning_rate": 4.999692861208997e-06, "loss": 3.5411, "step": 590 }, { "epoch": 0.006052652994791667, "grad_norm": 15.669339179992676, "learning_rate": 4.999686561221756e-06, "loss": 3.936, "step": 595 }, { "epoch": 0.006103515625, "grad_norm": 11.445816040039062, "learning_rate": 4.999680197280628e-06, "loss": 3.8597, "step": 600 }, { "epoch": 0.006154378255208333, "grad_norm": 16.03506088256836, "learning_rate": 4.999673769385773e-06, "loss": 3.4291, "step": 605 }, { "epoch": 0.006205240885416667, "grad_norm": 13.63383960723877, "learning_rate": 4.999667277537358e-06, "loss": 3.8566, "step": 610 }, { "epoch": 0.006256103515625, "grad_norm": 11.830592155456543, "learning_rate": 4.999660721735547e-06, "loss": 3.8303, "step": 615 }, { "epoch": 0.006306966145833333, "grad_norm": 19.99570655822754, "learning_rate": 4.999654101980511e-06, "loss": 3.8236, "step": 620 }, { "epoch": 0.006357828776041667, "grad_norm": 12.528511047363281, "learning_rate": 4.999647418272415e-06, "loss": 4.0486, "step": 625 }, { "epoch": 0.00640869140625, "grad_norm": 15.760631561279297, "learning_rate": 4.999640670611434e-06, "loss": 3.4468, "step": 630 }, { "epoch": 0.006459554036458333, "grad_norm": 19.84269905090332, "learning_rate": 4.999633858997738e-06, "loss": 3.8804, "step": 635 }, { "epoch": 0.006510416666666667, "grad_norm": 11.239544868469238, "learning_rate": 4.999626983431503e-06, "loss": 3.5703, "step": 640 }, { "epoch": 0.006561279296875, "grad_norm": 13.65317440032959, "learning_rate": 4.999620043912904e-06, "loss": 3.7155, "step": 645 }, { "epoch": 0.006612141927083333, "grad_norm": 12.85306167602539, "learning_rate": 4.999613040442118e-06, "loss": 3.9372, "step": 650 }, { "epoch": 0.006663004557291667, "grad_norm": 16.895217895507812, "learning_rate": 4.999605973019325e-06, "loss": 4.0348, "step": 655 }, { "epoch": 0.0067138671875, "grad_norm": 15.865885734558105, "learning_rate": 4.999598841644706e-06, "loss": 3.6799, "step": 660 }, { "epoch": 0.006764729817708333, "grad_norm": 17.68973731994629, "learning_rate": 4.999591646318443e-06, "loss": 3.446, "step": 665 }, { "epoch": 0.006815592447916667, "grad_norm": 17.351646423339844, "learning_rate": 4.99958438704072e-06, "loss": 3.9662, "step": 670 }, { "epoch": 0.006866455078125, "grad_norm": 12.52562427520752, "learning_rate": 4.999577063811723e-06, "loss": 3.564, "step": 675 }, { "epoch": 0.006917317708333333, "grad_norm": 15.416642189025879, "learning_rate": 4.999569676631639e-06, "loss": 3.9313, "step": 680 }, { "epoch": 0.006968180338541667, "grad_norm": 11.908843994140625, "learning_rate": 4.999562225500658e-06, "loss": 3.7715, "step": 685 }, { "epoch": 0.00701904296875, "grad_norm": 15.781961441040039, "learning_rate": 4.999554710418969e-06, "loss": 3.467, "step": 690 }, { "epoch": 0.007069905598958333, "grad_norm": 17.706050872802734, "learning_rate": 4.999547131386766e-06, "loss": 3.442, "step": 695 }, { "epoch": 0.007120768229166667, "grad_norm": 16.651485443115234, "learning_rate": 4.999539488404242e-06, "loss": 3.7181, "step": 700 }, { "epoch": 0.007171630859375, "grad_norm": 10.602474212646484, "learning_rate": 4.9995317814715925e-06, "loss": 3.6258, "step": 705 }, { "epoch": 0.007222493489583333, "grad_norm": 20.507596969604492, "learning_rate": 4.999524010589015e-06, "loss": 3.9877, "step": 710 }, { "epoch": 0.007273356119791667, "grad_norm": 17.32587432861328, "learning_rate": 4.999516175756708e-06, "loss": 3.7221, "step": 715 }, { "epoch": 0.00732421875, "grad_norm": 11.81552791595459, "learning_rate": 4.9995082769748715e-06, "loss": 3.4813, "step": 720 }, { "epoch": 0.007375081380208333, "grad_norm": 13.437356948852539, "learning_rate": 4.9995003142437086e-06, "loss": 3.6507, "step": 725 }, { "epoch": 0.007425944010416667, "grad_norm": 25.208234786987305, "learning_rate": 4.999492287563422e-06, "loss": 3.9219, "step": 730 }, { "epoch": 0.007476806640625, "grad_norm": 13.51179313659668, "learning_rate": 4.999484196934219e-06, "loss": 3.5389, "step": 735 }, { "epoch": 0.007527669270833333, "grad_norm": 13.544529914855957, "learning_rate": 4.999476042356305e-06, "loss": 3.597, "step": 740 }, { "epoch": 0.007578531901041667, "grad_norm": 10.314176559448242, "learning_rate": 4.999467823829888e-06, "loss": 3.8758, "step": 745 }, { "epoch": 0.00762939453125, "grad_norm": 21.49091148376465, "learning_rate": 4.99945954135518e-06, "loss": 3.3777, "step": 750 }, { "epoch": 0.007680257161458333, "grad_norm": 12.480860710144043, "learning_rate": 4.999451194932392e-06, "loss": 3.511, "step": 755 }, { "epoch": 0.007731119791666667, "grad_norm": 15.50587272644043, "learning_rate": 4.999442784561737e-06, "loss": 3.6119, "step": 760 }, { "epoch": 0.007781982421875, "grad_norm": 17.906938552856445, "learning_rate": 4.9994343102434314e-06, "loss": 3.8537, "step": 765 }, { "epoch": 0.007832845052083334, "grad_norm": 18.00349998474121, "learning_rate": 4.9994257719776915e-06, "loss": 3.5682, "step": 770 }, { "epoch": 0.007883707682291666, "grad_norm": 14.669235229492188, "learning_rate": 4.999417169764735e-06, "loss": 3.4905, "step": 775 }, { "epoch": 0.0079345703125, "grad_norm": 15.826396942138672, "learning_rate": 4.999408503604783e-06, "loss": 3.9937, "step": 780 }, { "epoch": 0.007985432942708334, "grad_norm": 17.248136520385742, "learning_rate": 4.999399773498057e-06, "loss": 4.14, "step": 785 }, { "epoch": 0.008036295572916666, "grad_norm": 16.377283096313477, "learning_rate": 4.99939097944478e-06, "loss": 4.2206, "step": 790 }, { "epoch": 0.008087158203125, "grad_norm": 11.323334693908691, "learning_rate": 4.9993821214451774e-06, "loss": 4.1021, "step": 795 }, { "epoch": 0.008138020833333334, "grad_norm": 14.058307647705078, "learning_rate": 4.999373199499476e-06, "loss": 3.9362, "step": 800 }, { "epoch": 0.008188883463541666, "grad_norm": 17.746889114379883, "learning_rate": 4.9993642136079025e-06, "loss": 4.0898, "step": 805 }, { "epoch": 0.00823974609375, "grad_norm": 17.276458740234375, "learning_rate": 4.999355163770688e-06, "loss": 3.9971, "step": 810 }, { "epoch": 0.008290608723958334, "grad_norm": 20.424314498901367, "learning_rate": 4.999346049988065e-06, "loss": 3.9069, "step": 815 }, { "epoch": 0.008341471354166666, "grad_norm": 21.62326431274414, "learning_rate": 4.999336872260266e-06, "loss": 3.812, "step": 820 }, { "epoch": 0.008392333984375, "grad_norm": 12.865884780883789, "learning_rate": 4.999327630587525e-06, "loss": 3.7855, "step": 825 }, { "epoch": 0.008443196614583334, "grad_norm": 14.172892570495605, "learning_rate": 4.999318324970079e-06, "loss": 3.8214, "step": 830 }, { "epoch": 0.008494059244791666, "grad_norm": 19.895843505859375, "learning_rate": 4.999308955408166e-06, "loss": 3.5814, "step": 835 }, { "epoch": 0.008544921875, "grad_norm": 14.874984741210938, "learning_rate": 4.999299521902026e-06, "loss": 3.64, "step": 840 }, { "epoch": 0.008595784505208334, "grad_norm": 15.312943458557129, "learning_rate": 4.9992900244519e-06, "loss": 3.7124, "step": 845 }, { "epoch": 0.008646647135416666, "grad_norm": 17.663049697875977, "learning_rate": 4.999280463058031e-06, "loss": 3.3959, "step": 850 }, { "epoch": 0.008697509765625, "grad_norm": 19.949892044067383, "learning_rate": 4.999270837720663e-06, "loss": 3.4157, "step": 855 }, { "epoch": 0.008748372395833334, "grad_norm": 14.926891326904297, "learning_rate": 4.9992611484400444e-06, "loss": 3.4777, "step": 860 }, { "epoch": 0.008799235026041666, "grad_norm": 13.239298820495605, "learning_rate": 4.999251395216421e-06, "loss": 3.6932, "step": 865 }, { "epoch": 0.00885009765625, "grad_norm": 16.126522064208984, "learning_rate": 4.999241578050044e-06, "loss": 3.7468, "step": 870 }, { "epoch": 0.008900960286458334, "grad_norm": 26.55894660949707, "learning_rate": 4.999231696941162e-06, "loss": 3.2945, "step": 875 }, { "epoch": 0.008951822916666666, "grad_norm": 16.377870559692383, "learning_rate": 4.99922175189003e-06, "loss": 3.8092, "step": 880 }, { "epoch": 0.009002685546875, "grad_norm": 25.377391815185547, "learning_rate": 4.999211742896902e-06, "loss": 4.0082, "step": 885 }, { "epoch": 0.009053548177083334, "grad_norm": 16.60061264038086, "learning_rate": 4.999201669962034e-06, "loss": 3.717, "step": 890 }, { "epoch": 0.009104410807291666, "grad_norm": 14.67759895324707, "learning_rate": 4.999191533085684e-06, "loss": 3.5591, "step": 895 }, { "epoch": 0.0091552734375, "grad_norm": 10.630406379699707, "learning_rate": 4.9991813322681105e-06, "loss": 3.6146, "step": 900 }, { "epoch": 0.009206136067708334, "grad_norm": 13.745230674743652, "learning_rate": 4.999171067509575e-06, "loss": 3.5511, "step": 905 }, { "epoch": 0.009256998697916666, "grad_norm": 15.126039505004883, "learning_rate": 4.99916073881034e-06, "loss": 3.6083, "step": 910 }, { "epoch": 0.009307861328125, "grad_norm": 18.30549430847168, "learning_rate": 4.99915034617067e-06, "loss": 3.4733, "step": 915 }, { "epoch": 0.009358723958333334, "grad_norm": 12.323296546936035, "learning_rate": 4.999139889590832e-06, "loss": 3.6615, "step": 920 }, { "epoch": 0.009409586588541666, "grad_norm": 19.943538665771484, "learning_rate": 4.999129369071091e-06, "loss": 4.1944, "step": 925 }, { "epoch": 0.00946044921875, "grad_norm": 10.738496780395508, "learning_rate": 4.9991187846117175e-06, "loss": 3.4904, "step": 930 }, { "epoch": 0.009511311848958334, "grad_norm": 11.223962783813477, "learning_rate": 4.999108136212982e-06, "loss": 3.6925, "step": 935 }, { "epoch": 0.009562174479166666, "grad_norm": 13.026283264160156, "learning_rate": 4.999097423875158e-06, "loss": 4.1644, "step": 940 }, { "epoch": 0.009613037109375, "grad_norm": 16.166419982910156, "learning_rate": 4.999086647598518e-06, "loss": 3.5475, "step": 945 }, { "epoch": 0.009663899739583334, "grad_norm": 18.75146484375, "learning_rate": 4.999075807383339e-06, "loss": 3.4752, "step": 950 }, { "epoch": 0.009714762369791666, "grad_norm": 14.348006248474121, "learning_rate": 4.999064903229897e-06, "loss": 3.5193, "step": 955 }, { "epoch": 0.009765625, "grad_norm": 13.05683708190918, "learning_rate": 4.9990539351384725e-06, "loss": 3.5619, "step": 960 }, { "epoch": 0.009816487630208334, "grad_norm": 14.792659759521484, "learning_rate": 4.999042903109345e-06, "loss": 3.2695, "step": 965 }, { "epoch": 0.009867350260416666, "grad_norm": 11.70416259765625, "learning_rate": 4.999031807142798e-06, "loss": 3.7578, "step": 970 }, { "epoch": 0.009918212890625, "grad_norm": 10.735535621643066, "learning_rate": 4.999020647239114e-06, "loss": 3.8695, "step": 975 }, { "epoch": 0.009969075520833334, "grad_norm": 17.912128448486328, "learning_rate": 4.999009423398579e-06, "loss": 3.5609, "step": 980 }, { "epoch": 0.010019938151041666, "grad_norm": 14.438199043273926, "learning_rate": 4.99899813562148e-06, "loss": 3.7282, "step": 985 }, { "epoch": 0.01007080078125, "grad_norm": 15.654008865356445, "learning_rate": 4.9989867839081065e-06, "loss": 3.8585, "step": 990 }, { "epoch": 0.010121663411458334, "grad_norm": 15.130616188049316, "learning_rate": 4.998975368258749e-06, "loss": 3.4857, "step": 995 }, { "epoch": 0.010172526041666666, "grad_norm": 15.098103523254395, "learning_rate": 4.998963888673698e-06, "loss": 3.7006, "step": 1000 }, { "epoch": 0.010223388671875, "grad_norm": 14.421039581298828, "learning_rate": 4.998952345153249e-06, "loss": 3.7491, "step": 1005 }, { "epoch": 0.010274251302083334, "grad_norm": 20.087514877319336, "learning_rate": 4.998940737697695e-06, "loss": 3.7776, "step": 1010 }, { "epoch": 0.010325113932291666, "grad_norm": 18.88400650024414, "learning_rate": 4.998929066307336e-06, "loss": 3.7481, "step": 1015 }, { "epoch": 0.0103759765625, "grad_norm": 21.07455062866211, "learning_rate": 4.998917330982469e-06, "loss": 3.5294, "step": 1020 }, { "epoch": 0.010426839192708334, "grad_norm": 11.424040794372559, "learning_rate": 4.998905531723394e-06, "loss": 3.9097, "step": 1025 }, { "epoch": 0.010477701822916666, "grad_norm": 10.922247886657715, "learning_rate": 4.998893668530414e-06, "loss": 3.5514, "step": 1030 }, { "epoch": 0.010528564453125, "grad_norm": 17.195194244384766, "learning_rate": 4.99888174140383e-06, "loss": 3.616, "step": 1035 }, { "epoch": 0.010579427083333334, "grad_norm": 10.144538879394531, "learning_rate": 4.998869750343951e-06, "loss": 3.7874, "step": 1040 }, { "epoch": 0.010630289713541666, "grad_norm": 11.742879867553711, "learning_rate": 4.998857695351081e-06, "loss": 3.9271, "step": 1045 }, { "epoch": 0.01068115234375, "grad_norm": 13.740591049194336, "learning_rate": 4.998845576425529e-06, "loss": 3.7697, "step": 1050 }, { "epoch": 0.010732014973958334, "grad_norm": 15.526152610778809, "learning_rate": 4.998833393567605e-06, "loss": 3.3944, "step": 1055 }, { "epoch": 0.010782877604166666, "grad_norm": 14.3406982421875, "learning_rate": 4.998821146777622e-06, "loss": 3.8095, "step": 1060 }, { "epoch": 0.010833740234375, "grad_norm": 12.40285873413086, "learning_rate": 4.99880883605589e-06, "loss": 3.6193, "step": 1065 }, { "epoch": 0.010884602864583334, "grad_norm": 20.011245727539062, "learning_rate": 4.998796461402729e-06, "loss": 3.8485, "step": 1070 }, { "epoch": 0.010935465494791666, "grad_norm": 15.710371017456055, "learning_rate": 4.998784022818452e-06, "loss": 3.8256, "step": 1075 }, { "epoch": 0.010986328125, "grad_norm": 18.561986923217773, "learning_rate": 4.998771520303376e-06, "loss": 3.9108, "step": 1080 }, { "epoch": 0.011037190755208334, "grad_norm": 18.281129837036133, "learning_rate": 4.998758953857825e-06, "loss": 3.4311, "step": 1085 }, { "epoch": 0.011088053385416666, "grad_norm": 14.150856018066406, "learning_rate": 4.998746323482117e-06, "loss": 3.5551, "step": 1090 }, { "epoch": 0.011138916015625, "grad_norm": 13.725726127624512, "learning_rate": 4.9987336291765784e-06, "loss": 3.6229, "step": 1095 }, { "epoch": 0.011189778645833334, "grad_norm": 11.532977104187012, "learning_rate": 4.998720870941531e-06, "loss": 3.5328, "step": 1100 }, { "epoch": 0.011240641276041666, "grad_norm": 15.219062805175781, "learning_rate": 4.998708048777303e-06, "loss": 3.7603, "step": 1105 }, { "epoch": 0.01129150390625, "grad_norm": 10.167997360229492, "learning_rate": 4.9986951626842215e-06, "loss": 4.157, "step": 1110 }, { "epoch": 0.011342366536458334, "grad_norm": 25.62779998779297, "learning_rate": 4.9986822126626165e-06, "loss": 3.539, "step": 1115 }, { "epoch": 0.011393229166666666, "grad_norm": 13.080843925476074, "learning_rate": 4.998669198712819e-06, "loss": 3.9684, "step": 1120 }, { "epoch": 0.011444091796875, "grad_norm": 13.87924861907959, "learning_rate": 4.998656120835163e-06, "loss": 3.6138, "step": 1125 }, { "epoch": 0.011494954427083334, "grad_norm": 16.160778045654297, "learning_rate": 4.998642979029982e-06, "loss": 3.7762, "step": 1130 }, { "epoch": 0.011545817057291666, "grad_norm": 12.456393241882324, "learning_rate": 4.998629773297613e-06, "loss": 3.5095, "step": 1135 }, { "epoch": 0.0115966796875, "grad_norm": 18.339937210083008, "learning_rate": 4.998616503638393e-06, "loss": 3.6277, "step": 1140 }, { "epoch": 0.011647542317708334, "grad_norm": 13.26982307434082, "learning_rate": 4.998603170052662e-06, "loss": 3.9732, "step": 1145 }, { "epoch": 0.011698404947916666, "grad_norm": 12.922799110412598, "learning_rate": 4.9985897725407616e-06, "loss": 3.4633, "step": 1150 }, { "epoch": 0.011749267578125, "grad_norm": 19.501089096069336, "learning_rate": 4.998576311103033e-06, "loss": 3.7619, "step": 1155 }, { "epoch": 0.011800130208333334, "grad_norm": 12.718639373779297, "learning_rate": 4.998562785739823e-06, "loss": 3.7302, "step": 1160 }, { "epoch": 0.011850992838541666, "grad_norm": 10.442389488220215, "learning_rate": 4.998549196451475e-06, "loss": 3.5264, "step": 1165 }, { "epoch": 0.01190185546875, "grad_norm": 15.280603408813477, "learning_rate": 4.99853554323834e-06, "loss": 3.7362, "step": 1170 }, { "epoch": 0.011952718098958334, "grad_norm": 14.212106704711914, "learning_rate": 4.998521826100764e-06, "loss": 3.7874, "step": 1175 }, { "epoch": 0.012003580729166666, "grad_norm": 13.269865989685059, "learning_rate": 4.998508045039099e-06, "loss": 3.6369, "step": 1180 }, { "epoch": 0.012054443359375, "grad_norm": 12.6091890335083, "learning_rate": 4.998494200053698e-06, "loss": 3.6542, "step": 1185 }, { "epoch": 0.012105305989583334, "grad_norm": 14.155792236328125, "learning_rate": 4.998480291144916e-06, "loss": 4.0782, "step": 1190 }, { "epoch": 0.012156168619791666, "grad_norm": 19.589990615844727, "learning_rate": 4.998466318313108e-06, "loss": 3.7343, "step": 1195 }, { "epoch": 0.01220703125, "grad_norm": 13.791729927062988, "learning_rate": 4.99845228155863e-06, "loss": 3.6456, "step": 1200 }, { "epoch": 0.012257893880208334, "grad_norm": 13.561478614807129, "learning_rate": 4.998438180881844e-06, "loss": 4.0166, "step": 1205 }, { "epoch": 0.012308756510416666, "grad_norm": 18.43284034729004, "learning_rate": 4.998424016283109e-06, "loss": 3.6783, "step": 1210 }, { "epoch": 0.012359619140625, "grad_norm": 19.545591354370117, "learning_rate": 4.9984097877627865e-06, "loss": 3.7069, "step": 1215 }, { "epoch": 0.012410481770833334, "grad_norm": 16.589338302612305, "learning_rate": 4.998395495321243e-06, "loss": 3.5455, "step": 1220 }, { "epoch": 0.012461344401041666, "grad_norm": 13.449020385742188, "learning_rate": 4.998381138958843e-06, "loss": 3.6977, "step": 1225 }, { "epoch": 0.01251220703125, "grad_norm": 17.311525344848633, "learning_rate": 4.9983667186759535e-06, "loss": 3.6899, "step": 1230 }, { "epoch": 0.012563069661458334, "grad_norm": 15.647948265075684, "learning_rate": 4.998352234472944e-06, "loss": 3.1426, "step": 1235 }, { "epoch": 0.012613932291666666, "grad_norm": 22.255765914916992, "learning_rate": 4.998337686350184e-06, "loss": 3.66, "step": 1240 }, { "epoch": 0.012664794921875, "grad_norm": 15.966592788696289, "learning_rate": 4.998323074308047e-06, "loss": 3.6801, "step": 1245 }, { "epoch": 0.012715657552083334, "grad_norm": 10.439255714416504, "learning_rate": 4.998308398346906e-06, "loss": 3.6352, "step": 1250 }, { "epoch": 0.012766520182291666, "grad_norm": 14.009291648864746, "learning_rate": 4.998293658467137e-06, "loss": 3.5148, "step": 1255 }, { "epoch": 0.0128173828125, "grad_norm": 9.830448150634766, "learning_rate": 4.998278854669117e-06, "loss": 3.405, "step": 1260 }, { "epoch": 0.012868245442708334, "grad_norm": 18.137758255004883, "learning_rate": 4.998263986953224e-06, "loss": 3.2475, "step": 1265 }, { "epoch": 0.012919108072916666, "grad_norm": 13.296918869018555, "learning_rate": 4.99824905531984e-06, "loss": 3.5983, "step": 1270 }, { "epoch": 0.012969970703125, "grad_norm": 17.331226348876953, "learning_rate": 4.9982340597693455e-06, "loss": 3.4618, "step": 1275 }, { "epoch": 0.013020833333333334, "grad_norm": 16.383100509643555, "learning_rate": 4.998219000302125e-06, "loss": 3.5292, "step": 1280 }, { "epoch": 0.013071695963541666, "grad_norm": 10.87190055847168, "learning_rate": 4.998203876918564e-06, "loss": 3.6293, "step": 1285 }, { "epoch": 0.01312255859375, "grad_norm": 12.597033500671387, "learning_rate": 4.998188689619048e-06, "loss": 3.7316, "step": 1290 }, { "epoch": 0.013173421223958334, "grad_norm": 12.681578636169434, "learning_rate": 4.998173438403966e-06, "loss": 3.478, "step": 1295 }, { "epoch": 0.013224283854166666, "grad_norm": 16.530866622924805, "learning_rate": 4.99815812327371e-06, "loss": 3.7343, "step": 1300 }, { "epoch": 0.013275146484375, "grad_norm": 17.380083084106445, "learning_rate": 4.99814274422867e-06, "loss": 3.5332, "step": 1305 }, { "epoch": 0.013326009114583334, "grad_norm": 15.711562156677246, "learning_rate": 4.998127301269241e-06, "loss": 3.4281, "step": 1310 }, { "epoch": 0.013376871744791666, "grad_norm": 13.467732429504395, "learning_rate": 4.998111794395816e-06, "loss": 3.588, "step": 1315 }, { "epoch": 0.013427734375, "grad_norm": 16.989633560180664, "learning_rate": 4.998096223608792e-06, "loss": 3.304, "step": 1320 }, { "epoch": 0.013478597005208334, "grad_norm": 12.060418128967285, "learning_rate": 4.998080588908571e-06, "loss": 3.121, "step": 1325 }, { "epoch": 0.013529459635416666, "grad_norm": 15.241015434265137, "learning_rate": 4.9980648902955475e-06, "loss": 3.6774, "step": 1330 }, { "epoch": 0.013580322265625, "grad_norm": 15.760343551635742, "learning_rate": 4.998049127770127e-06, "loss": 3.2477, "step": 1335 }, { "epoch": 0.013631184895833334, "grad_norm": 15.907750129699707, "learning_rate": 4.998033301332712e-06, "loss": 3.3698, "step": 1340 }, { "epoch": 0.013682047526041666, "grad_norm": 17.1384334564209, "learning_rate": 4.9980174109837065e-06, "loss": 3.7272, "step": 1345 }, { "epoch": 0.01373291015625, "grad_norm": 16.641490936279297, "learning_rate": 4.998001456723518e-06, "loss": 3.6683, "step": 1350 }, { "epoch": 0.013783772786458334, "grad_norm": 8.19157600402832, "learning_rate": 4.997985438552554e-06, "loss": 3.6485, "step": 1355 }, { "epoch": 0.013834635416666666, "grad_norm": 13.124740600585938, "learning_rate": 4.997969356471225e-06, "loss": 3.6489, "step": 1360 }, { "epoch": 0.013885498046875, "grad_norm": 11.489811897277832, "learning_rate": 4.997953210479941e-06, "loss": 3.9778, "step": 1365 }, { "epoch": 0.013936360677083334, "grad_norm": 18.886642456054688, "learning_rate": 4.997937000579118e-06, "loss": 3.7343, "step": 1370 }, { "epoch": 0.013987223307291666, "grad_norm": 9.009153366088867, "learning_rate": 4.997920726769168e-06, "loss": 3.5347, "step": 1375 }, { "epoch": 0.0140380859375, "grad_norm": 14.774568557739258, "learning_rate": 4.997904389050508e-06, "loss": 3.7011, "step": 1380 }, { "epoch": 0.014088948567708334, "grad_norm": 26.565807342529297, "learning_rate": 4.997887987423556e-06, "loss": 4.026, "step": 1385 }, { "epoch": 0.014139811197916666, "grad_norm": 13.156881332397461, "learning_rate": 4.997871521888733e-06, "loss": 3.645, "step": 1390 }, { "epoch": 0.014190673828125, "grad_norm": 13.961007118225098, "learning_rate": 4.9978549924464595e-06, "loss": 3.638, "step": 1395 }, { "epoch": 0.014241536458333334, "grad_norm": 12.008218765258789, "learning_rate": 4.997838399097157e-06, "loss": 3.7365, "step": 1400 }, { "epoch": 0.014292399088541666, "grad_norm": 16.033838272094727, "learning_rate": 4.997821741841251e-06, "loss": 3.8982, "step": 1405 }, { "epoch": 0.01434326171875, "grad_norm": 10.620261192321777, "learning_rate": 4.997805020679169e-06, "loss": 3.5443, "step": 1410 }, { "epoch": 0.014394124348958334, "grad_norm": 13.818375587463379, "learning_rate": 4.997788235611336e-06, "loss": 3.8064, "step": 1415 }, { "epoch": 0.014444986979166666, "grad_norm": 14.587963104248047, "learning_rate": 4.997771386638184e-06, "loss": 3.599, "step": 1420 }, { "epoch": 0.014495849609375, "grad_norm": 11.063785552978516, "learning_rate": 4.997754473760143e-06, "loss": 3.6159, "step": 1425 }, { "epoch": 0.014546712239583334, "grad_norm": 11.75071907043457, "learning_rate": 4.997737496977645e-06, "loss": 4.4565, "step": 1430 }, { "epoch": 0.014597574869791666, "grad_norm": 14.461017608642578, "learning_rate": 4.997720456291126e-06, "loss": 3.7098, "step": 1435 }, { "epoch": 0.0146484375, "grad_norm": 11.124221801757812, "learning_rate": 4.997703351701021e-06, "loss": 3.5637, "step": 1440 }, { "epoch": 0.014699300130208334, "grad_norm": 13.678889274597168, "learning_rate": 4.997686183207767e-06, "loss": 3.5194, "step": 1445 }, { "epoch": 0.014750162760416666, "grad_norm": 11.239283561706543, "learning_rate": 4.9976689508118055e-06, "loss": 3.6568, "step": 1450 }, { "epoch": 0.014801025390625, "grad_norm": 12.112907409667969, "learning_rate": 4.997651654513575e-06, "loss": 3.6608, "step": 1455 }, { "epoch": 0.014851888020833334, "grad_norm": 13.270151138305664, "learning_rate": 4.997634294313519e-06, "loss": 3.5079, "step": 1460 }, { "epoch": 0.014902750651041666, "grad_norm": 13.076834678649902, "learning_rate": 4.997616870212082e-06, "loss": 3.3748, "step": 1465 }, { "epoch": 0.01495361328125, "grad_norm": 10.513318061828613, "learning_rate": 4.997599382209709e-06, "loss": 3.6342, "step": 1470 }, { "epoch": 0.015004475911458334, "grad_norm": 18.05800437927246, "learning_rate": 4.997581830306848e-06, "loss": 3.6844, "step": 1475 }, { "epoch": 0.015055338541666666, "grad_norm": 10.047406196594238, "learning_rate": 4.997564214503947e-06, "loss": 4.0638, "step": 1480 }, { "epoch": 0.015106201171875, "grad_norm": 15.92277717590332, "learning_rate": 4.997546534801459e-06, "loss": 3.7102, "step": 1485 }, { "epoch": 0.015157063802083334, "grad_norm": 15.660829544067383, "learning_rate": 4.997528791199834e-06, "loss": 4.0566, "step": 1490 }, { "epoch": 0.015207926432291666, "grad_norm": 21.788442611694336, "learning_rate": 4.997510983699527e-06, "loss": 4.1864, "step": 1495 }, { "epoch": 0.0152587890625, "grad_norm": 10.556007385253906, "learning_rate": 4.997493112300994e-06, "loss": 3.621, "step": 1500 }, { "epoch": 0.015309651692708334, "grad_norm": 13.028473854064941, "learning_rate": 4.99747517700469e-06, "loss": 4.406, "step": 1505 }, { "epoch": 0.015360514322916666, "grad_norm": 14.941851615905762, "learning_rate": 4.997457177811077e-06, "loss": 3.1199, "step": 1510 }, { "epoch": 0.015411376953125, "grad_norm": 18.102357864379883, "learning_rate": 4.997439114720614e-06, "loss": 3.7058, "step": 1515 }, { "epoch": 0.015462239583333334, "grad_norm": 22.9876651763916, "learning_rate": 4.997420987733763e-06, "loss": 3.5268, "step": 1520 }, { "epoch": 0.015513102213541666, "grad_norm": 11.794015884399414, "learning_rate": 4.997402796850989e-06, "loss": 3.6392, "step": 1525 }, { "epoch": 0.01556396484375, "grad_norm": 11.62551212310791, "learning_rate": 4.997384542072755e-06, "loss": 3.4675, "step": 1530 }, { "epoch": 0.015614827473958334, "grad_norm": 14.436004638671875, "learning_rate": 4.99736622339953e-06, "loss": 3.657, "step": 1535 }, { "epoch": 0.015665690104166668, "grad_norm": 13.236321449279785, "learning_rate": 4.997347840831782e-06, "loss": 3.2599, "step": 1540 }, { "epoch": 0.015716552734375, "grad_norm": 11.505661010742188, "learning_rate": 4.997329394369981e-06, "loss": 3.6238, "step": 1545 }, { "epoch": 0.015767415364583332, "grad_norm": 14.669344902038574, "learning_rate": 4.997310884014599e-06, "loss": 3.4919, "step": 1550 }, { "epoch": 0.015818277994791668, "grad_norm": 9.981671333312988, "learning_rate": 4.997292309766111e-06, "loss": 3.4323, "step": 1555 }, { "epoch": 0.015869140625, "grad_norm": 13.35000228881836, "learning_rate": 4.997273671624991e-06, "loss": 3.5778, "step": 1560 }, { "epoch": 0.015920003255208332, "grad_norm": 13.411084175109863, "learning_rate": 4.997254969591716e-06, "loss": 3.8477, "step": 1565 }, { "epoch": 0.015970865885416668, "grad_norm": 13.634716033935547, "learning_rate": 4.997236203666764e-06, "loss": 3.4397, "step": 1570 }, { "epoch": 0.016021728515625, "grad_norm": 9.219624519348145, "learning_rate": 4.997217373850617e-06, "loss": 3.5114, "step": 1575 }, { "epoch": 0.016072591145833332, "grad_norm": 12.118866920471191, "learning_rate": 4.997198480143755e-06, "loss": 3.7325, "step": 1580 }, { "epoch": 0.016123453776041668, "grad_norm": 12.5546875, "learning_rate": 4.99717952254666e-06, "loss": 3.6924, "step": 1585 }, { "epoch": 0.01617431640625, "grad_norm": 15.333346366882324, "learning_rate": 4.99716050105982e-06, "loss": 3.4589, "step": 1590 }, { "epoch": 0.016225179036458332, "grad_norm": 11.861115455627441, "learning_rate": 4.997141415683721e-06, "loss": 3.7009, "step": 1595 }, { "epoch": 0.016276041666666668, "grad_norm": 15.62850570678711, "learning_rate": 4.99712226641885e-06, "loss": 4.2749, "step": 1600 }, { "epoch": 0.016326904296875, "grad_norm": 16.968338012695312, "learning_rate": 4.997103053265698e-06, "loss": 3.458, "step": 1605 }, { "epoch": 0.016377766927083332, "grad_norm": 15.927022933959961, "learning_rate": 4.997083776224757e-06, "loss": 3.5739, "step": 1610 }, { "epoch": 0.016428629557291668, "grad_norm": 12.973907470703125, "learning_rate": 4.997064435296518e-06, "loss": 3.5018, "step": 1615 }, { "epoch": 0.0164794921875, "grad_norm": 16.26294708251953, "learning_rate": 4.997045030481478e-06, "loss": 3.8331, "step": 1620 }, { "epoch": 0.016530354817708332, "grad_norm": 14.90145492553711, "learning_rate": 4.997025561780133e-06, "loss": 3.7663, "step": 1625 }, { "epoch": 0.016581217447916668, "grad_norm": 10.96834945678711, "learning_rate": 4.9970060291929816e-06, "loss": 3.2283, "step": 1630 }, { "epoch": 0.016632080078125, "grad_norm": 14.805373191833496, "learning_rate": 4.996986432720521e-06, "loss": 3.5404, "step": 1635 }, { "epoch": 0.016682942708333332, "grad_norm": 14.054951667785645, "learning_rate": 4.996966772363255e-06, "loss": 3.7508, "step": 1640 }, { "epoch": 0.016733805338541668, "grad_norm": 18.448801040649414, "learning_rate": 4.996947048121686e-06, "loss": 3.1579, "step": 1645 }, { "epoch": 0.01678466796875, "grad_norm": 11.387920379638672, "learning_rate": 4.996927259996319e-06, "loss": 3.1547, "step": 1650 }, { "epoch": 0.016835530598958332, "grad_norm": 17.80491828918457, "learning_rate": 4.99690740798766e-06, "loss": 3.5202, "step": 1655 }, { "epoch": 0.016886393229166668, "grad_norm": 15.007222175598145, "learning_rate": 4.9968874920962165e-06, "loss": 3.6235, "step": 1660 }, { "epoch": 0.016937255859375, "grad_norm": 10.149649620056152, "learning_rate": 4.996867512322499e-06, "loss": 3.6799, "step": 1665 }, { "epoch": 0.016988118489583332, "grad_norm": 17.945354461669922, "learning_rate": 4.996847468667016e-06, "loss": 3.9798, "step": 1670 }, { "epoch": 0.017038981119791668, "grad_norm": 15.979263305664062, "learning_rate": 4.9968273611302845e-06, "loss": 3.3488, "step": 1675 }, { "epoch": 0.01708984375, "grad_norm": 14.131532669067383, "learning_rate": 4.996807189712815e-06, "loss": 3.3508, "step": 1680 }, { "epoch": 0.017140706380208332, "grad_norm": 19.903356552124023, "learning_rate": 4.996786954415127e-06, "loss": 3.5976, "step": 1685 }, { "epoch": 0.017191569010416668, "grad_norm": 13.6408052444458, "learning_rate": 4.996766655237736e-06, "loss": 3.4822, "step": 1690 }, { "epoch": 0.017242431640625, "grad_norm": 9.797231674194336, "learning_rate": 4.9967462921811614e-06, "loss": 3.9847, "step": 1695 }, { "epoch": 0.017293294270833332, "grad_norm": 13.015440940856934, "learning_rate": 4.996725865245926e-06, "loss": 3.5052, "step": 1700 }, { "epoch": 0.017344156901041668, "grad_norm": 17.70994758605957, "learning_rate": 4.99670537443255e-06, "loss": 3.9466, "step": 1705 }, { "epoch": 0.01739501953125, "grad_norm": 9.877345085144043, "learning_rate": 4.996684819741559e-06, "loss": 3.6346, "step": 1710 }, { "epoch": 0.017445882161458332, "grad_norm": 12.302515983581543, "learning_rate": 4.996664201173478e-06, "loss": 3.3808, "step": 1715 }, { "epoch": 0.017496744791666668, "grad_norm": 12.325230598449707, "learning_rate": 4.9966435187288365e-06, "loss": 3.8128, "step": 1720 }, { "epoch": 0.017547607421875, "grad_norm": 14.4461669921875, "learning_rate": 4.996622772408162e-06, "loss": 4.3367, "step": 1725 }, { "epoch": 0.017598470052083332, "grad_norm": 15.207497596740723, "learning_rate": 4.996601962211985e-06, "loss": 3.5895, "step": 1730 }, { "epoch": 0.017649332682291668, "grad_norm": 10.924785614013672, "learning_rate": 4.9965810881408384e-06, "loss": 3.3813, "step": 1735 }, { "epoch": 0.0177001953125, "grad_norm": 19.209896087646484, "learning_rate": 4.996560150195257e-06, "loss": 3.4489, "step": 1740 }, { "epoch": 0.017751057942708332, "grad_norm": 14.56915283203125, "learning_rate": 4.9965391483757765e-06, "loss": 3.4132, "step": 1745 }, { "epoch": 0.017801920572916668, "grad_norm": 9.958040237426758, "learning_rate": 4.996518082682933e-06, "loss": 3.5184, "step": 1750 }, { "epoch": 0.017852783203125, "grad_norm": 14.741584777832031, "learning_rate": 4.9964969531172656e-06, "loss": 3.7564, "step": 1755 }, { "epoch": 0.017903645833333332, "grad_norm": 12.061718940734863, "learning_rate": 4.996475759679316e-06, "loss": 3.5494, "step": 1760 }, { "epoch": 0.017954508463541668, "grad_norm": 19.455101013183594, "learning_rate": 4.9964545023696255e-06, "loss": 3.5547, "step": 1765 }, { "epoch": 0.01800537109375, "grad_norm": 15.922080993652344, "learning_rate": 4.996433181188739e-06, "loss": 3.6292, "step": 1770 }, { "epoch": 0.018056233723958332, "grad_norm": 10.75054931640625, "learning_rate": 4.996411796137201e-06, "loss": 3.3277, "step": 1775 }, { "epoch": 0.018107096354166668, "grad_norm": 14.0524263381958, "learning_rate": 4.996390347215558e-06, "loss": 3.671, "step": 1780 }, { "epoch": 0.018157958984375, "grad_norm": 17.705394744873047, "learning_rate": 4.9963688344243605e-06, "loss": 3.4854, "step": 1785 }, { "epoch": 0.018208821614583332, "grad_norm": 14.042203903198242, "learning_rate": 4.996347257764158e-06, "loss": 3.3586, "step": 1790 }, { "epoch": 0.018259684244791668, "grad_norm": 15.437379837036133, "learning_rate": 4.996325617235502e-06, "loss": 3.1949, "step": 1795 }, { "epoch": 0.018310546875, "grad_norm": 14.990214347839355, "learning_rate": 4.996303912838948e-06, "loss": 3.2657, "step": 1800 }, { "epoch": 0.018361409505208332, "grad_norm": 20.377416610717773, "learning_rate": 4.9962821445750485e-06, "loss": 3.5048, "step": 1805 }, { "epoch": 0.018412272135416668, "grad_norm": 11.091985702514648, "learning_rate": 4.996260312444363e-06, "loss": 3.5013, "step": 1810 }, { "epoch": 0.018463134765625, "grad_norm": 17.564790725708008, "learning_rate": 4.9962384164474495e-06, "loss": 3.9346, "step": 1815 }, { "epoch": 0.018513997395833332, "grad_norm": 11.695920944213867, "learning_rate": 4.996216456584867e-06, "loss": 3.7216, "step": 1820 }, { "epoch": 0.018564860026041668, "grad_norm": 12.64533519744873, "learning_rate": 4.9961944328571785e-06, "loss": 3.4994, "step": 1825 }, { "epoch": 0.01861572265625, "grad_norm": 11.813600540161133, "learning_rate": 4.9961723452649465e-06, "loss": 3.3742, "step": 1830 }, { "epoch": 0.018666585286458332, "grad_norm": 13.247116088867188, "learning_rate": 4.9961501938087375e-06, "loss": 3.7691, "step": 1835 }, { "epoch": 0.018717447916666668, "grad_norm": 10.45460319519043, "learning_rate": 4.996127978489117e-06, "loss": 3.6143, "step": 1840 }, { "epoch": 0.018768310546875, "grad_norm": 19.198963165283203, "learning_rate": 4.996105699306654e-06, "loss": 3.4559, "step": 1845 }, { "epoch": 0.018819173177083332, "grad_norm": 15.408687591552734, "learning_rate": 4.996083356261918e-06, "loss": 3.6425, "step": 1850 }, { "epoch": 0.018870035807291668, "grad_norm": 16.426103591918945, "learning_rate": 4.996060949355481e-06, "loss": 3.6617, "step": 1855 }, { "epoch": 0.0189208984375, "grad_norm": 9.62375259399414, "learning_rate": 4.996038478587916e-06, "loss": 4.2817, "step": 1860 }, { "epoch": 0.018971761067708332, "grad_norm": 13.927205085754395, "learning_rate": 4.9960159439598e-06, "loss": 3.1512, "step": 1865 }, { "epoch": 0.019022623697916668, "grad_norm": 16.292879104614258, "learning_rate": 4.995993345471706e-06, "loss": 3.7999, "step": 1870 }, { "epoch": 0.019073486328125, "grad_norm": 16.1326904296875, "learning_rate": 4.995970683124214e-06, "loss": 3.0987, "step": 1875 }, { "epoch": 0.019124348958333332, "grad_norm": 17.09796142578125, "learning_rate": 4.995947956917904e-06, "loss": 3.6616, "step": 1880 }, { "epoch": 0.019175211588541668, "grad_norm": 8.162797927856445, "learning_rate": 4.995925166853357e-06, "loss": 3.5208, "step": 1885 }, { "epoch": 0.01922607421875, "grad_norm": 35.92325210571289, "learning_rate": 4.995902312931156e-06, "loss": 3.819, "step": 1890 }, { "epoch": 0.019276936848958332, "grad_norm": 15.114096641540527, "learning_rate": 4.995879395151886e-06, "loss": 3.5895, "step": 1895 }, { "epoch": 0.019327799479166668, "grad_norm": 18.872722625732422, "learning_rate": 4.995856413516134e-06, "loss": 3.731, "step": 1900 }, { "epoch": 0.019378662109375, "grad_norm": 15.703900337219238, "learning_rate": 4.9958333680244865e-06, "loss": 3.8427, "step": 1905 }, { "epoch": 0.019429524739583332, "grad_norm": 14.129450798034668, "learning_rate": 4.9958102586775334e-06, "loss": 3.6527, "step": 1910 }, { "epoch": 0.019480387369791668, "grad_norm": 11.964359283447266, "learning_rate": 4.9957870854758675e-06, "loss": 3.3432, "step": 1915 }, { "epoch": 0.01953125, "grad_norm": 19.071924209594727, "learning_rate": 4.99576384842008e-06, "loss": 3.7882, "step": 1920 }, { "epoch": 0.019582112630208332, "grad_norm": 14.704536437988281, "learning_rate": 4.995740547510766e-06, "loss": 3.2511, "step": 1925 }, { "epoch": 0.019632975260416668, "grad_norm": 20.858272552490234, "learning_rate": 4.9957171827485215e-06, "loss": 4.266, "step": 1930 }, { "epoch": 0.019683837890625, "grad_norm": 12.940155982971191, "learning_rate": 4.995693754133944e-06, "loss": 3.7349, "step": 1935 }, { "epoch": 0.019734700520833332, "grad_norm": 17.16631507873535, "learning_rate": 4.995670261667635e-06, "loss": 3.6617, "step": 1940 }, { "epoch": 0.019785563151041668, "grad_norm": 12.754393577575684, "learning_rate": 4.995646705350193e-06, "loss": 3.6483, "step": 1945 }, { "epoch": 0.01983642578125, "grad_norm": 12.014126777648926, "learning_rate": 4.995623085182221e-06, "loss": 3.4977, "step": 1950 }, { "epoch": 0.019887288411458332, "grad_norm": 16.564720153808594, "learning_rate": 4.995599401164325e-06, "loss": 3.7256, "step": 1955 }, { "epoch": 0.019938151041666668, "grad_norm": 11.473775863647461, "learning_rate": 4.995575653297109e-06, "loss": 3.416, "step": 1960 }, { "epoch": 0.019989013671875, "grad_norm": 14.353362083435059, "learning_rate": 4.995551841581181e-06, "loss": 3.4641, "step": 1965 }, { "epoch": 0.020039876302083332, "grad_norm": 16.31317901611328, "learning_rate": 4.9955279660171514e-06, "loss": 3.7767, "step": 1970 }, { "epoch": 0.020090738932291668, "grad_norm": 8.309894561767578, "learning_rate": 4.99550402660563e-06, "loss": 3.6317, "step": 1975 }, { "epoch": 0.0201416015625, "grad_norm": 18.78270149230957, "learning_rate": 4.99548002334723e-06, "loss": 3.2542, "step": 1980 }, { "epoch": 0.020192464192708332, "grad_norm": 10.488119125366211, "learning_rate": 4.9954559562425654e-06, "loss": 3.3925, "step": 1985 }, { "epoch": 0.020243326822916668, "grad_norm": 19.56174087524414, "learning_rate": 4.99543182529225e-06, "loss": 3.4966, "step": 1990 }, { "epoch": 0.020294189453125, "grad_norm": 11.951705932617188, "learning_rate": 4.995407630496905e-06, "loss": 3.6632, "step": 1995 }, { "epoch": 0.020345052083333332, "grad_norm": 15.856471061706543, "learning_rate": 4.995383371857145e-06, "loss": 3.3239, "step": 2000 }, { "epoch": 0.020395914713541668, "grad_norm": 12.084221839904785, "learning_rate": 4.9953590493735945e-06, "loss": 3.5615, "step": 2005 }, { "epoch": 0.02044677734375, "grad_norm": 27.98715591430664, "learning_rate": 4.995334663046874e-06, "loss": 3.6218, "step": 2010 }, { "epoch": 0.020497639973958332, "grad_norm": 9.576325416564941, "learning_rate": 4.995310212877608e-06, "loss": 3.4569, "step": 2015 }, { "epoch": 0.020548502604166668, "grad_norm": 13.834068298339844, "learning_rate": 4.9952856988664205e-06, "loss": 3.2584, "step": 2020 }, { "epoch": 0.020599365234375, "grad_norm": 17.03768539428711, "learning_rate": 4.99526112101394e-06, "loss": 3.7883, "step": 2025 }, { "epoch": 0.020650227864583332, "grad_norm": 11.945119857788086, "learning_rate": 4.995236479320796e-06, "loss": 3.3385, "step": 2030 }, { "epoch": 0.020701090494791668, "grad_norm": 13.012072563171387, "learning_rate": 4.995211773787617e-06, "loss": 3.5075, "step": 2035 }, { "epoch": 0.020751953125, "grad_norm": 11.229742050170898, "learning_rate": 4.995187004415038e-06, "loss": 3.1753, "step": 2040 }, { "epoch": 0.020802815755208332, "grad_norm": 15.254374504089355, "learning_rate": 4.995162171203689e-06, "loss": 3.4893, "step": 2045 }, { "epoch": 0.020853678385416668, "grad_norm": 12.569649696350098, "learning_rate": 4.9951372741542084e-06, "loss": 3.7053, "step": 2050 }, { "epoch": 0.020904541015625, "grad_norm": 13.927783966064453, "learning_rate": 4.995112313267231e-06, "loss": 3.43, "step": 2055 }, { "epoch": 0.020955403645833332, "grad_norm": 10.135937690734863, "learning_rate": 4.995087288543397e-06, "loss": 3.3201, "step": 2060 }, { "epoch": 0.021006266276041668, "grad_norm": 17.4373722076416, "learning_rate": 4.995062199983346e-06, "loss": 3.9556, "step": 2065 }, { "epoch": 0.02105712890625, "grad_norm": 8.249671936035156, "learning_rate": 4.9950370475877204e-06, "loss": 3.4842, "step": 2070 }, { "epoch": 0.021107991536458332, "grad_norm": 7.946985244750977, "learning_rate": 4.995011831357164e-06, "loss": 3.7847, "step": 2075 }, { "epoch": 0.021158854166666668, "grad_norm": 14.219249725341797, "learning_rate": 4.99498655129232e-06, "loss": 3.6362, "step": 2080 }, { "epoch": 0.021209716796875, "grad_norm": 16.812952041625977, "learning_rate": 4.994961207393837e-06, "loss": 3.7761, "step": 2085 }, { "epoch": 0.021260579427083332, "grad_norm": 12.385375022888184, "learning_rate": 4.994935799662363e-06, "loss": 3.5996, "step": 2090 }, { "epoch": 0.021311442057291668, "grad_norm": 8.916481971740723, "learning_rate": 4.994910328098548e-06, "loss": 3.6023, "step": 2095 }, { "epoch": 0.0213623046875, "grad_norm": 14.721908569335938, "learning_rate": 4.994884792703043e-06, "loss": 3.3011, "step": 2100 }, { "epoch": 0.021413167317708332, "grad_norm": 15.370878219604492, "learning_rate": 4.9948591934765025e-06, "loss": 3.5518, "step": 2105 }, { "epoch": 0.021464029947916668, "grad_norm": 14.788433074951172, "learning_rate": 4.994833530419581e-06, "loss": 3.5008, "step": 2110 }, { "epoch": 0.021514892578125, "grad_norm": 12.971417427062988, "learning_rate": 4.994807803532934e-06, "loss": 3.5467, "step": 2115 }, { "epoch": 0.021565755208333332, "grad_norm": 14.517382621765137, "learning_rate": 4.994782012817221e-06, "loss": 3.6258, "step": 2120 }, { "epoch": 0.021616617838541668, "grad_norm": 10.307328224182129, "learning_rate": 4.994756158273102e-06, "loss": 4.0953, "step": 2125 }, { "epoch": 0.02166748046875, "grad_norm": 15.38490104675293, "learning_rate": 4.994730239901238e-06, "loss": 4.2029, "step": 2130 }, { "epoch": 0.021718343098958332, "grad_norm": 12.546719551086426, "learning_rate": 4.994704257702292e-06, "loss": 3.4173, "step": 2135 }, { "epoch": 0.021769205729166668, "grad_norm": 11.977869033813477, "learning_rate": 4.994678211676929e-06, "loss": 3.4095, "step": 2140 }, { "epoch": 0.021820068359375, "grad_norm": 10.538824081420898, "learning_rate": 4.994652101825815e-06, "loss": 3.607, "step": 2145 }, { "epoch": 0.021870930989583332, "grad_norm": 18.2629337310791, "learning_rate": 4.994625928149619e-06, "loss": 3.9144, "step": 2150 }, { "epoch": 0.021921793619791668, "grad_norm": 12.87021255493164, "learning_rate": 4.994599690649009e-06, "loss": 3.9643, "step": 2155 }, { "epoch": 0.02197265625, "grad_norm": 16.46050453186035, "learning_rate": 4.994573389324657e-06, "loss": 3.7189, "step": 2160 }, { "epoch": 0.022023518880208332, "grad_norm": 14.574606895446777, "learning_rate": 4.994547024177236e-06, "loss": 3.906, "step": 2165 }, { "epoch": 0.022074381510416668, "grad_norm": 10.178772926330566, "learning_rate": 4.994520595207422e-06, "loss": 3.7685, "step": 2170 }, { "epoch": 0.022125244140625, "grad_norm": 16.5206298828125, "learning_rate": 4.994494102415889e-06, "loss": 3.6191, "step": 2175 }, { "epoch": 0.022176106770833332, "grad_norm": 13.858648300170898, "learning_rate": 4.9944675458033156e-06, "loss": 3.5425, "step": 2180 }, { "epoch": 0.022226969401041668, "grad_norm": 15.840536117553711, "learning_rate": 4.994440925370382e-06, "loss": 3.5171, "step": 2185 }, { "epoch": 0.02227783203125, "grad_norm": 13.754470825195312, "learning_rate": 4.9944142411177675e-06, "loss": 3.3044, "step": 2190 }, { "epoch": 0.022328694661458332, "grad_norm": 13.027044296264648, "learning_rate": 4.994387493046157e-06, "loss": 3.3715, "step": 2195 }, { "epoch": 0.022379557291666668, "grad_norm": 10.115904808044434, "learning_rate": 4.994360681156233e-06, "loss": 3.8424, "step": 2200 }, { "epoch": 0.022430419921875, "grad_norm": 16.806591033935547, "learning_rate": 4.994333805448682e-06, "loss": 4.0637, "step": 2205 }, { "epoch": 0.022481282552083332, "grad_norm": 9.8330078125, "learning_rate": 4.994306865924192e-06, "loss": 3.383, "step": 2210 }, { "epoch": 0.022532145182291668, "grad_norm": 13.583646774291992, "learning_rate": 4.994279862583453e-06, "loss": 3.3756, "step": 2215 }, { "epoch": 0.0225830078125, "grad_norm": 12.496264457702637, "learning_rate": 4.994252795427153e-06, "loss": 3.5715, "step": 2220 }, { "epoch": 0.022633870442708332, "grad_norm": 9.020492553710938, "learning_rate": 4.994225664455989e-06, "loss": 3.3013, "step": 2225 }, { "epoch": 0.022684733072916668, "grad_norm": 14.377449989318848, "learning_rate": 4.99419846967065e-06, "loss": 3.3859, "step": 2230 }, { "epoch": 0.022735595703125, "grad_norm": 13.713812828063965, "learning_rate": 4.994171211071836e-06, "loss": 3.3353, "step": 2235 }, { "epoch": 0.022786458333333332, "grad_norm": 18.0495548248291, "learning_rate": 4.994143888660242e-06, "loss": 3.5454, "step": 2240 }, { "epoch": 0.022837320963541668, "grad_norm": 13.879693031311035, "learning_rate": 4.994116502436568e-06, "loss": 3.9231, "step": 2245 }, { "epoch": 0.02288818359375, "grad_norm": 14.874258041381836, "learning_rate": 4.994089052401515e-06, "loss": 3.114, "step": 2250 }, { "epoch": 0.022939046223958332, "grad_norm": 17.684362411499023, "learning_rate": 4.994061538555784e-06, "loss": 3.5418, "step": 2255 }, { "epoch": 0.022989908854166668, "grad_norm": 20.040653228759766, "learning_rate": 4.9940339609000796e-06, "loss": 4.0738, "step": 2260 }, { "epoch": 0.023040771484375, "grad_norm": 8.416106224060059, "learning_rate": 4.994006319435108e-06, "loss": 3.7515, "step": 2265 }, { "epoch": 0.023091634114583332, "grad_norm": 14.56326675415039, "learning_rate": 4.9939786141615754e-06, "loss": 3.4834, "step": 2270 }, { "epoch": 0.023142496744791668, "grad_norm": 9.106837272644043, "learning_rate": 4.993950845080191e-06, "loss": 3.363, "step": 2275 }, { "epoch": 0.023193359375, "grad_norm": 16.284809112548828, "learning_rate": 4.993923012191666e-06, "loss": 3.4758, "step": 2280 }, { "epoch": 0.023244222005208332, "grad_norm": 12.630521774291992, "learning_rate": 4.993895115496712e-06, "loss": 3.5318, "step": 2285 }, { "epoch": 0.023295084635416668, "grad_norm": 11.593666076660156, "learning_rate": 4.993867154996042e-06, "loss": 3.7887, "step": 2290 }, { "epoch": 0.023345947265625, "grad_norm": 14.084927558898926, "learning_rate": 4.993839130690372e-06, "loss": 3.5525, "step": 2295 }, { "epoch": 0.023396809895833332, "grad_norm": 15.547626495361328, "learning_rate": 4.993811042580419e-06, "loss": 3.3359, "step": 2300 }, { "epoch": 0.023447672526041668, "grad_norm": 17.31133460998535, "learning_rate": 4.993782890666902e-06, "loss": 3.6608, "step": 2305 }, { "epoch": 0.02349853515625, "grad_norm": 10.99673080444336, "learning_rate": 4.99375467495054e-06, "loss": 3.4517, "step": 2310 }, { "epoch": 0.023549397786458332, "grad_norm": 10.245965957641602, "learning_rate": 4.993726395432056e-06, "loss": 3.5961, "step": 2315 }, { "epoch": 0.023600260416666668, "grad_norm": 11.301896095275879, "learning_rate": 4.993698052112174e-06, "loss": 3.2348, "step": 2320 }, { "epoch": 0.023651123046875, "grad_norm": 16.186355590820312, "learning_rate": 4.993669644991617e-06, "loss": 3.6148, "step": 2325 }, { "epoch": 0.023701985677083332, "grad_norm": 15.257999420166016, "learning_rate": 4.993641174071115e-06, "loss": 3.7732, "step": 2330 }, { "epoch": 0.023752848307291668, "grad_norm": 17.36763572692871, "learning_rate": 4.993612639351393e-06, "loss": 3.5761, "step": 2335 }, { "epoch": 0.0238037109375, "grad_norm": 12.962672233581543, "learning_rate": 4.993584040833183e-06, "loss": 3.6535, "step": 2340 }, { "epoch": 0.023854573567708332, "grad_norm": 13.28327751159668, "learning_rate": 4.993555378517217e-06, "loss": 3.5254, "step": 2345 }, { "epoch": 0.023905436197916668, "grad_norm": 14.348062515258789, "learning_rate": 4.993526652404227e-06, "loss": 3.6133, "step": 2350 }, { "epoch": 0.023956298828125, "grad_norm": 15.535712242126465, "learning_rate": 4.993497862494949e-06, "loss": 3.5402, "step": 2355 }, { "epoch": 0.024007161458333332, "grad_norm": 10.898641586303711, "learning_rate": 4.993469008790119e-06, "loss": 3.5233, "step": 2360 }, { "epoch": 0.024058024088541668, "grad_norm": 9.476696968078613, "learning_rate": 4.993440091290476e-06, "loss": 4.3757, "step": 2365 }, { "epoch": 0.02410888671875, "grad_norm": 16.282838821411133, "learning_rate": 4.993411109996759e-06, "loss": 3.8592, "step": 2370 }, { "epoch": 0.024159749348958332, "grad_norm": 15.315332412719727, "learning_rate": 4.99338206490971e-06, "loss": 3.626, "step": 2375 }, { "epoch": 0.024210611979166668, "grad_norm": 11.935681343078613, "learning_rate": 4.993352956030071e-06, "loss": 3.5642, "step": 2380 }, { "epoch": 0.024261474609375, "grad_norm": 14.131113052368164, "learning_rate": 4.993323783358588e-06, "loss": 3.6816, "step": 2385 }, { "epoch": 0.024312337239583332, "grad_norm": 17.588123321533203, "learning_rate": 4.993294546896007e-06, "loss": 3.574, "step": 2390 }, { "epoch": 0.024363199869791668, "grad_norm": 11.560687065124512, "learning_rate": 4.993265246643076e-06, "loss": 3.6406, "step": 2395 }, { "epoch": 0.0244140625, "grad_norm": 9.761478424072266, "learning_rate": 4.993235882600545e-06, "loss": 3.5061, "step": 2400 }, { "epoch": 0.024464925130208332, "grad_norm": 13.649985313415527, "learning_rate": 4.993206454769165e-06, "loss": 3.4024, "step": 2405 }, { "epoch": 0.024515787760416668, "grad_norm": 8.24244499206543, "learning_rate": 4.993176963149689e-06, "loss": 3.6048, "step": 2410 }, { "epoch": 0.024566650390625, "grad_norm": 16.1616268157959, "learning_rate": 4.99314740774287e-06, "loss": 3.5386, "step": 2415 }, { "epoch": 0.024617513020833332, "grad_norm": 9.706306457519531, "learning_rate": 4.993117788549466e-06, "loss": 3.5126, "step": 2420 }, { "epoch": 0.024668375651041668, "grad_norm": 12.5220365524292, "learning_rate": 4.993088105570235e-06, "loss": 4.1297, "step": 2425 }, { "epoch": 0.02471923828125, "grad_norm": 12.254471778869629, "learning_rate": 4.993058358805935e-06, "loss": 3.4553, "step": 2430 }, { "epoch": 0.024770100911458332, "grad_norm": 14.104937553405762, "learning_rate": 4.993028548257328e-06, "loss": 4.2029, "step": 2435 }, { "epoch": 0.024820963541666668, "grad_norm": 14.170137405395508, "learning_rate": 4.992998673925177e-06, "loss": 3.45, "step": 2440 }, { "epoch": 0.024871826171875, "grad_norm": 16.678157806396484, "learning_rate": 4.9929687358102455e-06, "loss": 3.464, "step": 2445 }, { "epoch": 0.024922688802083332, "grad_norm": 11.808859825134277, "learning_rate": 4.9929387339133e-06, "loss": 4.1151, "step": 2450 }, { "epoch": 0.024973551432291668, "grad_norm": 13.544400215148926, "learning_rate": 4.992908668235107e-06, "loss": 3.4802, "step": 2455 }, { "epoch": 0.0250244140625, "grad_norm": 10.196426391601562, "learning_rate": 4.992878538776438e-06, "loss": 3.6122, "step": 2460 }, { "epoch": 0.025075276692708332, "grad_norm": 24.040620803833008, "learning_rate": 4.992848345538062e-06, "loss": 3.7206, "step": 2465 }, { "epoch": 0.025126139322916668, "grad_norm": 13.72429370880127, "learning_rate": 4.992818088520751e-06, "loss": 3.7046, "step": 2470 }, { "epoch": 0.025177001953125, "grad_norm": 17.09827995300293, "learning_rate": 4.992787767725281e-06, "loss": 3.6007, "step": 2475 }, { "epoch": 0.025227864583333332, "grad_norm": 17.559282302856445, "learning_rate": 4.992757383152427e-06, "loss": 3.5262, "step": 2480 }, { "epoch": 0.025278727213541668, "grad_norm": 12.092161178588867, "learning_rate": 4.992726934802965e-06, "loss": 3.4942, "step": 2485 }, { "epoch": 0.02532958984375, "grad_norm": 12.694467544555664, "learning_rate": 4.992696422677677e-06, "loss": 3.6092, "step": 2490 }, { "epoch": 0.025380452473958332, "grad_norm": 19.91484832763672, "learning_rate": 4.99266584677734e-06, "loss": 3.6935, "step": 2495 }, { "epoch": 0.025431315104166668, "grad_norm": 12.521865844726562, "learning_rate": 4.99263520710274e-06, "loss": 3.509, "step": 2500 }, { "epoch": 0.025482177734375, "grad_norm": 14.746912002563477, "learning_rate": 4.9926045036546576e-06, "loss": 3.4715, "step": 2505 }, { "epoch": 0.025533040364583332, "grad_norm": 13.788453102111816, "learning_rate": 4.99257373643388e-06, "loss": 3.7318, "step": 2510 }, { "epoch": 0.025583902994791668, "grad_norm": 15.496885299682617, "learning_rate": 4.992542905441194e-06, "loss": 3.6001, "step": 2515 }, { "epoch": 0.025634765625, "grad_norm": 11.678740501403809, "learning_rate": 4.992512010677389e-06, "loss": 3.3945, "step": 2520 }, { "epoch": 0.025685628255208332, "grad_norm": 11.555937767028809, "learning_rate": 4.992481052143256e-06, "loss": 3.5856, "step": 2525 }, { "epoch": 0.025736490885416668, "grad_norm": 11.76779556274414, "learning_rate": 4.992450029839584e-06, "loss": 3.6207, "step": 2530 }, { "epoch": 0.025787353515625, "grad_norm": 11.902135848999023, "learning_rate": 4.99241894376717e-06, "loss": 3.6504, "step": 2535 }, { "epoch": 0.025838216145833332, "grad_norm": 13.56098461151123, "learning_rate": 4.992387793926808e-06, "loss": 3.561, "step": 2540 }, { "epoch": 0.025889078776041668, "grad_norm": 9.4614839553833, "learning_rate": 4.9923565803192945e-06, "loss": 3.7419, "step": 2545 }, { "epoch": 0.02593994140625, "grad_norm": 15.035961151123047, "learning_rate": 4.9923253029454295e-06, "loss": 3.4019, "step": 2550 }, { "epoch": 0.025990804036458332, "grad_norm": 14.106694221496582, "learning_rate": 4.992293961806012e-06, "loss": 3.6503, "step": 2555 }, { "epoch": 0.026041666666666668, "grad_norm": 13.903535842895508, "learning_rate": 4.992262556901844e-06, "loss": 3.5632, "step": 2560 }, { "epoch": 0.026092529296875, "grad_norm": 21.27461814880371, "learning_rate": 4.99223108823373e-06, "loss": 3.3556, "step": 2565 }, { "epoch": 0.026143391927083332, "grad_norm": 19.755544662475586, "learning_rate": 4.992199555802473e-06, "loss": 3.9516, "step": 2570 }, { "epoch": 0.026194254557291668, "grad_norm": 10.459436416625977, "learning_rate": 4.992167959608882e-06, "loss": 4.184, "step": 2575 }, { "epoch": 0.0262451171875, "grad_norm": 15.970342636108398, "learning_rate": 4.992136299653763e-06, "loss": 3.4005, "step": 2580 }, { "epoch": 0.026295979817708332, "grad_norm": 12.037581443786621, "learning_rate": 4.992104575937929e-06, "loss": 4.3835, "step": 2585 }, { "epoch": 0.026346842447916668, "grad_norm": 9.693264961242676, "learning_rate": 4.99207278846219e-06, "loss": 3.7904, "step": 2590 }, { "epoch": 0.026397705078125, "grad_norm": 18.028060913085938, "learning_rate": 4.99204093722736e-06, "loss": 3.4888, "step": 2595 }, { "epoch": 0.026448567708333332, "grad_norm": 14.927176475524902, "learning_rate": 4.992009022234252e-06, "loss": 3.7141, "step": 2600 }, { "epoch": 0.026499430338541668, "grad_norm": 10.663310050964355, "learning_rate": 4.991977043483684e-06, "loss": 3.3668, "step": 2605 }, { "epoch": 0.02655029296875, "grad_norm": 21.105947494506836, "learning_rate": 4.991945000976475e-06, "loss": 3.7438, "step": 2610 }, { "epoch": 0.026601155598958332, "grad_norm": 11.864397048950195, "learning_rate": 4.991912894713443e-06, "loss": 3.4406, "step": 2615 }, { "epoch": 0.026652018229166668, "grad_norm": 16.927186965942383, "learning_rate": 4.99188072469541e-06, "loss": 3.8746, "step": 2620 }, { "epoch": 0.026702880859375, "grad_norm": 10.140044212341309, "learning_rate": 4.9918484909232e-06, "loss": 3.7149, "step": 2625 }, { "epoch": 0.026753743489583332, "grad_norm": 10.26709270477295, "learning_rate": 4.991816193397637e-06, "loss": 3.4153, "step": 2630 }, { "epoch": 0.026804606119791668, "grad_norm": 18.33658790588379, "learning_rate": 4.991783832119547e-06, "loss": 3.6604, "step": 2635 }, { "epoch": 0.02685546875, "grad_norm": 15.353517532348633, "learning_rate": 4.991751407089759e-06, "loss": 3.4237, "step": 2640 }, { "epoch": 0.026906331380208332, "grad_norm": 13.645818710327148, "learning_rate": 4.991718918309101e-06, "loss": 3.3133, "step": 2645 }, { "epoch": 0.026957194010416668, "grad_norm": 8.63716983795166, "learning_rate": 4.991686365778405e-06, "loss": 3.3966, "step": 2650 }, { "epoch": 0.027008056640625, "grad_norm": 12.624420166015625, "learning_rate": 4.991653749498504e-06, "loss": 3.3418, "step": 2655 }, { "epoch": 0.027058919270833332, "grad_norm": 16.91732406616211, "learning_rate": 4.991621069470233e-06, "loss": 4.1244, "step": 2660 }, { "epoch": 0.027109781901041668, "grad_norm": 15.740944862365723, "learning_rate": 4.991588325694426e-06, "loss": 3.4994, "step": 2665 }, { "epoch": 0.02716064453125, "grad_norm": 12.525944709777832, "learning_rate": 4.9915555181719235e-06, "loss": 3.7722, "step": 2670 }, { "epoch": 0.027211507161458332, "grad_norm": 13.604511260986328, "learning_rate": 4.991522646903564e-06, "loss": 3.4386, "step": 2675 }, { "epoch": 0.027262369791666668, "grad_norm": 10.147235870361328, "learning_rate": 4.991489711890188e-06, "loss": 3.5642, "step": 2680 }, { "epoch": 0.027313232421875, "grad_norm": 14.985891342163086, "learning_rate": 4.991456713132637e-06, "loss": 3.5542, "step": 2685 }, { "epoch": 0.027364095052083332, "grad_norm": 9.699203491210938, "learning_rate": 4.991423650631758e-06, "loss": 3.4597, "step": 2690 }, { "epoch": 0.027414957682291668, "grad_norm": 19.256378173828125, "learning_rate": 4.991390524388394e-06, "loss": 3.9099, "step": 2695 }, { "epoch": 0.0274658203125, "grad_norm": 13.979656219482422, "learning_rate": 4.991357334403396e-06, "loss": 3.4309, "step": 2700 }, { "epoch": 0.027516682942708332, "grad_norm": 15.490063667297363, "learning_rate": 4.9913240806776095e-06, "loss": 3.6231, "step": 2705 }, { "epoch": 0.027567545572916668, "grad_norm": 11.414307594299316, "learning_rate": 4.991290763211887e-06, "loss": 3.5332, "step": 2710 }, { "epoch": 0.027618408203125, "grad_norm": 14.588093757629395, "learning_rate": 4.991257382007081e-06, "loss": 3.8702, "step": 2715 }, { "epoch": 0.027669270833333332, "grad_norm": 11.244452476501465, "learning_rate": 4.9912239370640455e-06, "loss": 4.1972, "step": 2720 }, { "epoch": 0.027720133463541668, "grad_norm": 18.345767974853516, "learning_rate": 4.991190428383637e-06, "loss": 3.4974, "step": 2725 }, { "epoch": 0.02777099609375, "grad_norm": 16.70025634765625, "learning_rate": 4.99115685596671e-06, "loss": 3.4827, "step": 2730 }, { "epoch": 0.027821858723958332, "grad_norm": 9.72400951385498, "learning_rate": 4.9911232198141266e-06, "loss": 3.7941, "step": 2735 }, { "epoch": 0.027872721354166668, "grad_norm": 18.471965789794922, "learning_rate": 4.991089519926746e-06, "loss": 3.3165, "step": 2740 }, { "epoch": 0.027923583984375, "grad_norm": 8.832108497619629, "learning_rate": 4.9910557563054295e-06, "loss": 4.046, "step": 2745 }, { "epoch": 0.027974446614583332, "grad_norm": 13.375359535217285, "learning_rate": 4.991021928951043e-06, "loss": 3.7558, "step": 2750 }, { "epoch": 0.028025309244791668, "grad_norm": 14.425735473632812, "learning_rate": 4.99098803786445e-06, "loss": 3.7207, "step": 2755 }, { "epoch": 0.028076171875, "grad_norm": 11.840872764587402, "learning_rate": 4.99095408304652e-06, "loss": 3.8429, "step": 2760 }, { "epoch": 0.028127034505208332, "grad_norm": 11.43602466583252, "learning_rate": 4.990920064498119e-06, "loss": 3.3477, "step": 2765 }, { "epoch": 0.028177897135416668, "grad_norm": 21.40333366394043, "learning_rate": 4.9908859822201186e-06, "loss": 3.3127, "step": 2770 }, { "epoch": 0.028228759765625, "grad_norm": 9.050609588623047, "learning_rate": 4.990851836213391e-06, "loss": 3.5749, "step": 2775 }, { "epoch": 0.028279622395833332, "grad_norm": 18.72174644470215, "learning_rate": 4.990817626478809e-06, "loss": 3.4886, "step": 2780 }, { "epoch": 0.028330485026041668, "grad_norm": 17.724315643310547, "learning_rate": 4.990783353017249e-06, "loss": 3.6107, "step": 2785 }, { "epoch": 0.02838134765625, "grad_norm": 17.255958557128906, "learning_rate": 4.990749015829587e-06, "loss": 3.4667, "step": 2790 }, { "epoch": 0.028432210286458332, "grad_norm": 15.389385223388672, "learning_rate": 4.9907146149167025e-06, "loss": 3.6194, "step": 2795 }, { "epoch": 0.028483072916666668, "grad_norm": 16.21523666381836, "learning_rate": 4.990680150279474e-06, "loss": 3.7462, "step": 2800 }, { "epoch": 0.028533935546875, "grad_norm": 10.564112663269043, "learning_rate": 4.990645621918785e-06, "loss": 3.5578, "step": 2805 }, { "epoch": 0.028584798177083332, "grad_norm": 11.502965927124023, "learning_rate": 4.990611029835518e-06, "loss": 3.3145, "step": 2810 }, { "epoch": 0.028635660807291668, "grad_norm": 11.729395866394043, "learning_rate": 4.990576374030558e-06, "loss": 3.5022, "step": 2815 }, { "epoch": 0.0286865234375, "grad_norm": 17.083139419555664, "learning_rate": 4.9905416545047914e-06, "loss": 3.1965, "step": 2820 }, { "epoch": 0.028737386067708332, "grad_norm": 12.707961082458496, "learning_rate": 4.990506871259107e-06, "loss": 3.3708, "step": 2825 }, { "epoch": 0.028788248697916668, "grad_norm": 12.2274808883667, "learning_rate": 4.990472024294395e-06, "loss": 3.6543, "step": 2830 }, { "epoch": 0.028839111328125, "grad_norm": 10.415914535522461, "learning_rate": 4.990437113611546e-06, "loss": 3.3149, "step": 2835 }, { "epoch": 0.028889973958333332, "grad_norm": 14.803680419921875, "learning_rate": 4.990402139211454e-06, "loss": 3.6954, "step": 2840 }, { "epoch": 0.028940836588541668, "grad_norm": 9.584573745727539, "learning_rate": 4.990367101095014e-06, "loss": 3.2306, "step": 2845 }, { "epoch": 0.02899169921875, "grad_norm": 9.19983196258545, "learning_rate": 4.9903319992631215e-06, "loss": 3.5329, "step": 2850 }, { "epoch": 0.029042561848958332, "grad_norm": 13.915033340454102, "learning_rate": 4.990296833716676e-06, "loss": 3.5633, "step": 2855 }, { "epoch": 0.029093424479166668, "grad_norm": 8.67843246459961, "learning_rate": 4.990261604456575e-06, "loss": 3.5621, "step": 2860 }, { "epoch": 0.029144287109375, "grad_norm": 14.383540153503418, "learning_rate": 4.990226311483721e-06, "loss": 3.6706, "step": 2865 }, { "epoch": 0.029195149739583332, "grad_norm": 15.184710502624512, "learning_rate": 4.990190954799018e-06, "loss": 3.5497, "step": 2870 }, { "epoch": 0.029246012369791668, "grad_norm": 17.983572006225586, "learning_rate": 4.990155534403369e-06, "loss": 3.2129, "step": 2875 }, { "epoch": 0.029296875, "grad_norm": 19.197294235229492, "learning_rate": 4.9901200502976825e-06, "loss": 3.7564, "step": 2880 }, { "epoch": 0.029347737630208332, "grad_norm": 14.85810661315918, "learning_rate": 4.990084502482863e-06, "loss": 3.4496, "step": 2885 }, { "epoch": 0.029398600260416668, "grad_norm": 12.596855163574219, "learning_rate": 4.990048890959822e-06, "loss": 3.7283, "step": 2890 }, { "epoch": 0.029449462890625, "grad_norm": 17.732402801513672, "learning_rate": 4.99001321572947e-06, "loss": 3.8592, "step": 2895 }, { "epoch": 0.029500325520833332, "grad_norm": 12.990941047668457, "learning_rate": 4.989977476792721e-06, "loss": 3.5972, "step": 2900 }, { "epoch": 0.029551188151041668, "grad_norm": 15.059063911437988, "learning_rate": 4.989941674150488e-06, "loss": 3.6941, "step": 2905 }, { "epoch": 0.02960205078125, "grad_norm": 13.130831718444824, "learning_rate": 4.989905807803688e-06, "loss": 3.5952, "step": 2910 }, { "epoch": 0.029652913411458332, "grad_norm": 15.897665977478027, "learning_rate": 4.989869877753237e-06, "loss": 3.5691, "step": 2915 }, { "epoch": 0.029703776041666668, "grad_norm": 11.824736595153809, "learning_rate": 4.989833884000056e-06, "loss": 3.7098, "step": 2920 }, { "epoch": 0.029754638671875, "grad_norm": 14.641777038574219, "learning_rate": 4.989797826545065e-06, "loss": 3.6694, "step": 2925 }, { "epoch": 0.029805501302083332, "grad_norm": 12.433199882507324, "learning_rate": 4.989761705389187e-06, "loss": 3.5255, "step": 2930 }, { "epoch": 0.029856363932291668, "grad_norm": 12.532072067260742, "learning_rate": 4.989725520533346e-06, "loss": 3.5487, "step": 2935 }, { "epoch": 0.0299072265625, "grad_norm": 16.237571716308594, "learning_rate": 4.9896892719784675e-06, "loss": 3.562, "step": 2940 }, { "epoch": 0.029958089192708332, "grad_norm": 15.376089096069336, "learning_rate": 4.989652959725479e-06, "loss": 3.4396, "step": 2945 }, { "epoch": 0.030008951822916668, "grad_norm": 13.294553756713867, "learning_rate": 4.98961658377531e-06, "loss": 3.8229, "step": 2950 }, { "epoch": 0.030059814453125, "grad_norm": 15.016079902648926, "learning_rate": 4.98958014412889e-06, "loss": 3.4993, "step": 2955 }, { "epoch": 0.030110677083333332, "grad_norm": 16.527847290039062, "learning_rate": 4.989543640787153e-06, "loss": 3.6308, "step": 2960 }, { "epoch": 0.030161539713541668, "grad_norm": 15.960955619812012, "learning_rate": 4.989507073751032e-06, "loss": 3.5736, "step": 2965 }, { "epoch": 0.03021240234375, "grad_norm": 19.20722198486328, "learning_rate": 4.989470443021462e-06, "loss": 3.3009, "step": 2970 }, { "epoch": 0.030263264973958332, "grad_norm": 11.201433181762695, "learning_rate": 4.989433748599381e-06, "loss": 3.6235, "step": 2975 }, { "epoch": 0.030314127604166668, "grad_norm": 13.62901782989502, "learning_rate": 4.989396990485727e-06, "loss": 4.1232, "step": 2980 }, { "epoch": 0.030364990234375, "grad_norm": 15.175191879272461, "learning_rate": 4.989360168681442e-06, "loss": 3.6938, "step": 2985 }, { "epoch": 0.030415852864583332, "grad_norm": 12.873221397399902, "learning_rate": 4.9893232831874676e-06, "loss": 3.7898, "step": 2990 }, { "epoch": 0.030466715494791668, "grad_norm": 10.560770034790039, "learning_rate": 4.989286334004746e-06, "loss": 3.1075, "step": 2995 }, { "epoch": 0.030517578125, "grad_norm": 16.364398956298828, "learning_rate": 4.9892493211342235e-06, "loss": 3.8729, "step": 3000 }, { "epoch": 0.030568440755208332, "grad_norm": 14.276408195495605, "learning_rate": 4.989212244576848e-06, "loss": 3.4278, "step": 3005 }, { "epoch": 0.030619303385416668, "grad_norm": 17.391857147216797, "learning_rate": 4.989175104333567e-06, "loss": 3.5136, "step": 3010 }, { "epoch": 0.030670166015625, "grad_norm": 16.542964935302734, "learning_rate": 4.98913790040533e-06, "loss": 3.7648, "step": 3015 }, { "epoch": 0.030721028645833332, "grad_norm": 12.984308242797852, "learning_rate": 4.9891006327930905e-06, "loss": 3.2495, "step": 3020 }, { "epoch": 0.030771891276041668, "grad_norm": 16.2092227935791, "learning_rate": 4.989063301497801e-06, "loss": 3.2186, "step": 3025 }, { "epoch": 0.03082275390625, "grad_norm": 9.466002464294434, "learning_rate": 4.989025906520417e-06, "loss": 3.5156, "step": 3030 }, { "epoch": 0.030873616536458332, "grad_norm": 11.886150360107422, "learning_rate": 4.988988447861895e-06, "loss": 3.5858, "step": 3035 }, { "epoch": 0.030924479166666668, "grad_norm": 10.326323509216309, "learning_rate": 4.988950925523194e-06, "loss": 3.3724, "step": 3040 }, { "epoch": 0.030975341796875, "grad_norm": 15.874045372009277, "learning_rate": 4.988913339505274e-06, "loss": 3.5452, "step": 3045 }, { "epoch": 0.031026204427083332, "grad_norm": 11.754904747009277, "learning_rate": 4.988875689809095e-06, "loss": 3.4911, "step": 3050 }, { "epoch": 0.031077067057291668, "grad_norm": 17.041898727416992, "learning_rate": 4.988837976435622e-06, "loss": 3.6784, "step": 3055 }, { "epoch": 0.0311279296875, "grad_norm": 11.09570026397705, "learning_rate": 4.988800199385819e-06, "loss": 3.366, "step": 3060 }, { "epoch": 0.031178792317708332, "grad_norm": 14.231475830078125, "learning_rate": 4.988762358660654e-06, "loss": 3.3146, "step": 3065 }, { "epoch": 0.031229654947916668, "grad_norm": 11.312032699584961, "learning_rate": 4.988724454261092e-06, "loss": 3.6764, "step": 3070 }, { "epoch": 0.031280517578125, "grad_norm": 15.22202205657959, "learning_rate": 4.988686486188105e-06, "loss": 3.4084, "step": 3075 }, { "epoch": 0.031331380208333336, "grad_norm": 14.88944149017334, "learning_rate": 4.988648454442666e-06, "loss": 3.1168, "step": 3080 }, { "epoch": 0.031382242838541664, "grad_norm": 10.730203628540039, "learning_rate": 4.988610359025745e-06, "loss": 3.3345, "step": 3085 }, { "epoch": 0.03143310546875, "grad_norm": 13.363783836364746, "learning_rate": 4.988572199938317e-06, "loss": 3.0878, "step": 3090 }, { "epoch": 0.031483968098958336, "grad_norm": 21.714414596557617, "learning_rate": 4.9885339771813604e-06, "loss": 3.6447, "step": 3095 }, { "epoch": 0.031534830729166664, "grad_norm": 18.616336822509766, "learning_rate": 4.9884956907558515e-06, "loss": 3.5968, "step": 3100 }, { "epoch": 0.031585693359375, "grad_norm": 12.650989532470703, "learning_rate": 4.98845734066277e-06, "loss": 3.4597, "step": 3105 }, { "epoch": 0.031636555989583336, "grad_norm": 14.19465446472168, "learning_rate": 4.988418926903098e-06, "loss": 3.399, "step": 3110 }, { "epoch": 0.031687418619791664, "grad_norm": 9.14580249786377, "learning_rate": 4.9883804494778165e-06, "loss": 3.6816, "step": 3115 }, { "epoch": 0.03173828125, "grad_norm": 10.480582237243652, "learning_rate": 4.988341908387912e-06, "loss": 3.128, "step": 3120 }, { "epoch": 0.031789143880208336, "grad_norm": 10.952704429626465, "learning_rate": 4.988303303634368e-06, "loss": 3.4517, "step": 3125 }, { "epoch": 0.031840006510416664, "grad_norm": 12.522828102111816, "learning_rate": 4.988264635218175e-06, "loss": 3.9186, "step": 3130 }, { "epoch": 0.031890869140625, "grad_norm": 12.782024383544922, "learning_rate": 4.988225903140321e-06, "loss": 3.725, "step": 3135 }, { "epoch": 0.031941731770833336, "grad_norm": 11.72850227355957, "learning_rate": 4.988187107401797e-06, "loss": 3.7264, "step": 3140 }, { "epoch": 0.031992594401041664, "grad_norm": 12.133509635925293, "learning_rate": 4.988148248003595e-06, "loss": 3.4472, "step": 3145 }, { "epoch": 0.03204345703125, "grad_norm": 12.93433666229248, "learning_rate": 4.98810932494671e-06, "loss": 3.9443, "step": 3150 }, { "epoch": 0.032094319661458336, "grad_norm": 16.135568618774414, "learning_rate": 4.988070338232138e-06, "loss": 3.4966, "step": 3155 }, { "epoch": 0.032145182291666664, "grad_norm": 18.191791534423828, "learning_rate": 4.988031287860877e-06, "loss": 3.5644, "step": 3160 }, { "epoch": 0.032196044921875, "grad_norm": 11.924603462219238, "learning_rate": 4.987992173833924e-06, "loss": 3.4058, "step": 3165 }, { "epoch": 0.032246907552083336, "grad_norm": 12.82744026184082, "learning_rate": 4.987952996152281e-06, "loss": 3.492, "step": 3170 }, { "epoch": 0.032297770182291664, "grad_norm": 8.64533519744873, "learning_rate": 4.987913754816951e-06, "loss": 3.9099, "step": 3175 }, { "epoch": 0.0323486328125, "grad_norm": 16.365373611450195, "learning_rate": 4.987874449828937e-06, "loss": 3.1102, "step": 3180 }, { "epoch": 0.032399495442708336, "grad_norm": 15.942273139953613, "learning_rate": 4.987835081189245e-06, "loss": 3.7335, "step": 3185 }, { "epoch": 0.032450358072916664, "grad_norm": 17.56052589416504, "learning_rate": 4.987795648898882e-06, "loss": 3.8561, "step": 3190 }, { "epoch": 0.032501220703125, "grad_norm": 9.97986125946045, "learning_rate": 4.987756152958857e-06, "loss": 3.3722, "step": 3195 }, { "epoch": 0.032552083333333336, "grad_norm": 13.236599922180176, "learning_rate": 4.98771659337018e-06, "loss": 3.5341, "step": 3200 }, { "epoch": 0.032602945963541664, "grad_norm": 15.567898750305176, "learning_rate": 4.987676970133864e-06, "loss": 3.6612, "step": 3205 }, { "epoch": 0.03265380859375, "grad_norm": 14.741514205932617, "learning_rate": 4.987637283250923e-06, "loss": 3.7577, "step": 3210 }, { "epoch": 0.032704671223958336, "grad_norm": 12.727431297302246, "learning_rate": 4.98759753272237e-06, "loss": 4.3001, "step": 3215 }, { "epoch": 0.032755533854166664, "grad_norm": 11.071520805358887, "learning_rate": 4.987557718549225e-06, "loss": 3.61, "step": 3220 }, { "epoch": 0.032806396484375, "grad_norm": 10.508591651916504, "learning_rate": 4.987517840732505e-06, "loss": 3.3211, "step": 3225 }, { "epoch": 0.032857259114583336, "grad_norm": 12.959603309631348, "learning_rate": 4.987477899273232e-06, "loss": 3.5145, "step": 3230 }, { "epoch": 0.032908121744791664, "grad_norm": 14.553711891174316, "learning_rate": 4.987437894172426e-06, "loss": 3.9318, "step": 3235 }, { "epoch": 0.032958984375, "grad_norm": 14.899251937866211, "learning_rate": 4.987397825431109e-06, "loss": 3.5583, "step": 3240 }, { "epoch": 0.033009847005208336, "grad_norm": 11.648773193359375, "learning_rate": 4.98735769305031e-06, "loss": 3.2714, "step": 3245 }, { "epoch": 0.033060709635416664, "grad_norm": 13.525915145874023, "learning_rate": 4.987317497031055e-06, "loss": 3.3922, "step": 3250 }, { "epoch": 0.033111572265625, "grad_norm": 16.7314453125, "learning_rate": 4.987277237374369e-06, "loss": 3.6725, "step": 3255 }, { "epoch": 0.033162434895833336, "grad_norm": 16.31707191467285, "learning_rate": 4.987236914081286e-06, "loss": 3.3669, "step": 3260 }, { "epoch": 0.033213297526041664, "grad_norm": 11.438323974609375, "learning_rate": 4.987196527152835e-06, "loss": 3.4222, "step": 3265 }, { "epoch": 0.03326416015625, "grad_norm": 14.705175399780273, "learning_rate": 4.987156076590051e-06, "loss": 3.6076, "step": 3270 }, { "epoch": 0.033315022786458336, "grad_norm": 14.206781387329102, "learning_rate": 4.987115562393969e-06, "loss": 3.4704, "step": 3275 }, { "epoch": 0.033365885416666664, "grad_norm": 10.712113380432129, "learning_rate": 4.987074984565624e-06, "loss": 3.575, "step": 3280 }, { "epoch": 0.033416748046875, "grad_norm": 12.62575626373291, "learning_rate": 4.987034343106055e-06, "loss": 3.2375, "step": 3285 }, { "epoch": 0.033467610677083336, "grad_norm": 13.155769348144531, "learning_rate": 4.986993638016302e-06, "loss": 3.0587, "step": 3290 }, { "epoch": 0.033518473307291664, "grad_norm": 15.919364929199219, "learning_rate": 4.986952869297407e-06, "loss": 3.569, "step": 3295 }, { "epoch": 0.0335693359375, "grad_norm": 16.994239807128906, "learning_rate": 4.986912036950411e-06, "loss": 3.7481, "step": 3300 }, { "epoch": 0.033620198567708336, "grad_norm": 13.510692596435547, "learning_rate": 4.986871140976361e-06, "loss": 3.5751, "step": 3305 }, { "epoch": 0.033671061197916664, "grad_norm": 14.510315895080566, "learning_rate": 4.986830181376302e-06, "loss": 3.2262, "step": 3310 }, { "epoch": 0.033721923828125, "grad_norm": 13.225347518920898, "learning_rate": 4.986789158151282e-06, "loss": 3.3544, "step": 3315 }, { "epoch": 0.033772786458333336, "grad_norm": 15.174870491027832, "learning_rate": 4.9867480713023506e-06, "loss": 3.39, "step": 3320 }, { "epoch": 0.033823649088541664, "grad_norm": 13.420331954956055, "learning_rate": 4.98670692083056e-06, "loss": 3.4115, "step": 3325 }, { "epoch": 0.03387451171875, "grad_norm": 14.904714584350586, "learning_rate": 4.986665706736962e-06, "loss": 3.5199, "step": 3330 }, { "epoch": 0.033925374348958336, "grad_norm": 9.973340034484863, "learning_rate": 4.986624429022611e-06, "loss": 3.5716, "step": 3335 }, { "epoch": 0.033976236979166664, "grad_norm": 12.208763122558594, "learning_rate": 4.986583087688563e-06, "loss": 3.6914, "step": 3340 }, { "epoch": 0.034027099609375, "grad_norm": 13.434621810913086, "learning_rate": 4.986541682735877e-06, "loss": 3.2363, "step": 3345 }, { "epoch": 0.034077962239583336, "grad_norm": 17.55096435546875, "learning_rate": 4.986500214165611e-06, "loss": 3.4762, "step": 3350 }, { "epoch": 0.034128824869791664, "grad_norm": 11.070765495300293, "learning_rate": 4.986458681978826e-06, "loss": 3.6086, "step": 3355 }, { "epoch": 0.0341796875, "grad_norm": 12.782002449035645, "learning_rate": 4.986417086176586e-06, "loss": 3.3807, "step": 3360 }, { "epoch": 0.034230550130208336, "grad_norm": 11.93525218963623, "learning_rate": 4.9863754267599535e-06, "loss": 3.6002, "step": 3365 }, { "epoch": 0.034281412760416664, "grad_norm": 12.5736083984375, "learning_rate": 4.986333703729995e-06, "loss": 3.6075, "step": 3370 }, { "epoch": 0.034332275390625, "grad_norm": 14.359274864196777, "learning_rate": 4.986291917087778e-06, "loss": 3.461, "step": 3375 }, { "epoch": 0.034383138020833336, "grad_norm": 12.868420600891113, "learning_rate": 4.9862500668343714e-06, "loss": 3.9434, "step": 3380 }, { "epoch": 0.034434000651041664, "grad_norm": 18.13227653503418, "learning_rate": 4.986208152970847e-06, "loss": 3.2659, "step": 3385 }, { "epoch": 0.03448486328125, "grad_norm": 8.368807792663574, "learning_rate": 4.986166175498276e-06, "loss": 3.6618, "step": 3390 }, { "epoch": 0.034535725911458336, "grad_norm": 11.16234302520752, "learning_rate": 4.986124134417732e-06, "loss": 3.3186, "step": 3395 }, { "epoch": 0.034586588541666664, "grad_norm": 14.434364318847656, "learning_rate": 4.986082029730292e-06, "loss": 3.8224, "step": 3400 }, { "epoch": 0.034637451171875, "grad_norm": 11.538227081298828, "learning_rate": 4.9860398614370324e-06, "loss": 3.3252, "step": 3405 }, { "epoch": 0.034688313802083336, "grad_norm": 15.10848617553711, "learning_rate": 4.985997629539032e-06, "loss": 3.3582, "step": 3410 }, { "epoch": 0.034739176432291664, "grad_norm": 12.740103721618652, "learning_rate": 4.985955334037372e-06, "loss": 3.672, "step": 3415 }, { "epoch": 0.0347900390625, "grad_norm": 15.706042289733887, "learning_rate": 4.985912974933134e-06, "loss": 3.379, "step": 3420 }, { "epoch": 0.034840901692708336, "grad_norm": 21.957704544067383, "learning_rate": 4.985870552227401e-06, "loss": 3.8408, "step": 3425 }, { "epoch": 0.034891764322916664, "grad_norm": 8.741256713867188, "learning_rate": 4.9858280659212595e-06, "loss": 3.4575, "step": 3430 }, { "epoch": 0.034942626953125, "grad_norm": 16.37959098815918, "learning_rate": 4.9857855160157965e-06, "loss": 3.7038, "step": 3435 }, { "epoch": 0.034993489583333336, "grad_norm": 15.630584716796875, "learning_rate": 4.9857429025120996e-06, "loss": 3.3838, "step": 3440 }, { "epoch": 0.035044352213541664, "grad_norm": 13.591779708862305, "learning_rate": 4.98570022541126e-06, "loss": 3.4661, "step": 3445 }, { "epoch": 0.03509521484375, "grad_norm": 14.534747123718262, "learning_rate": 4.985657484714369e-06, "loss": 3.487, "step": 3450 }, { "epoch": 0.035146077473958336, "grad_norm": 10.500582695007324, "learning_rate": 4.985614680422521e-06, "loss": 3.6731, "step": 3455 }, { "epoch": 0.035196940104166664, "grad_norm": 10.115405082702637, "learning_rate": 4.9855718125368105e-06, "loss": 3.4601, "step": 3460 }, { "epoch": 0.035247802734375, "grad_norm": 17.56122398376465, "learning_rate": 4.985528881058334e-06, "loss": 3.6548, "step": 3465 }, { "epoch": 0.035298665364583336, "grad_norm": 9.477646827697754, "learning_rate": 4.9854858859881905e-06, "loss": 3.3522, "step": 3470 }, { "epoch": 0.035349527994791664, "grad_norm": 12.787845611572266, "learning_rate": 4.985442827327479e-06, "loss": 3.3005, "step": 3475 }, { "epoch": 0.035400390625, "grad_norm": 9.624972343444824, "learning_rate": 4.985399705077303e-06, "loss": 3.6494, "step": 3480 }, { "epoch": 0.035451253255208336, "grad_norm": 16.32085609436035, "learning_rate": 4.985356519238764e-06, "loss": 3.6078, "step": 3485 }, { "epoch": 0.035502115885416664, "grad_norm": 14.953262329101562, "learning_rate": 4.985313269812968e-06, "loss": 3.256, "step": 3490 }, { "epoch": 0.035552978515625, "grad_norm": 19.811403274536133, "learning_rate": 4.985269956801021e-06, "loss": 3.6117, "step": 3495 }, { "epoch": 0.035603841145833336, "grad_norm": 15.88729190826416, "learning_rate": 4.985226580204031e-06, "loss": 3.5138, "step": 3500 }, { "epoch": 0.035654703776041664, "grad_norm": 16.489349365234375, "learning_rate": 4.9851831400231075e-06, "loss": 3.5093, "step": 3505 }, { "epoch": 0.03570556640625, "grad_norm": 11.760025024414062, "learning_rate": 4.985139636259363e-06, "loss": 4.247, "step": 3510 }, { "epoch": 0.035756429036458336, "grad_norm": 12.459917068481445, "learning_rate": 4.98509606891391e-06, "loss": 3.4997, "step": 3515 }, { "epoch": 0.035807291666666664, "grad_norm": 10.306233406066895, "learning_rate": 4.985052437987863e-06, "loss": 3.4573, "step": 3520 }, { "epoch": 0.035858154296875, "grad_norm": 16.409082412719727, "learning_rate": 4.9850087434823384e-06, "loss": 3.4351, "step": 3525 }, { "epoch": 0.035909016927083336, "grad_norm": 9.501917839050293, "learning_rate": 4.984964985398454e-06, "loss": 3.4707, "step": 3530 }, { "epoch": 0.035959879557291664, "grad_norm": 13.083230972290039, "learning_rate": 4.98492116373733e-06, "loss": 3.3439, "step": 3535 }, { "epoch": 0.0360107421875, "grad_norm": 11.03459358215332, "learning_rate": 4.984877278500087e-06, "loss": 3.3296, "step": 3540 }, { "epoch": 0.036061604817708336, "grad_norm": 15.221756935119629, "learning_rate": 4.984833329687847e-06, "loss": 3.5049, "step": 3545 }, { "epoch": 0.036112467447916664, "grad_norm": 12.892142295837402, "learning_rate": 4.9847893173017345e-06, "loss": 3.6549, "step": 3550 }, { "epoch": 0.036163330078125, "grad_norm": 13.091279029846191, "learning_rate": 4.984745241342877e-06, "loss": 3.6493, "step": 3555 }, { "epoch": 0.036214192708333336, "grad_norm": 14.84507942199707, "learning_rate": 4.984701101812402e-06, "loss": 3.5608, "step": 3560 }, { "epoch": 0.036265055338541664, "grad_norm": 12.539996147155762, "learning_rate": 4.984656898711438e-06, "loss": 3.5464, "step": 3565 }, { "epoch": 0.03631591796875, "grad_norm": 12.54378604888916, "learning_rate": 4.984612632041117e-06, "loss": 3.7036, "step": 3570 }, { "epoch": 0.036366780598958336, "grad_norm": 15.650618553161621, "learning_rate": 4.98456830180257e-06, "loss": 3.7464, "step": 3575 }, { "epoch": 0.036417643229166664, "grad_norm": 12.167283058166504, "learning_rate": 4.984523907996932e-06, "loss": 3.6591, "step": 3580 }, { "epoch": 0.036468505859375, "grad_norm": 13.527252197265625, "learning_rate": 4.984479450625338e-06, "loss": 3.5705, "step": 3585 }, { "epoch": 0.036519368489583336, "grad_norm": 16.410930633544922, "learning_rate": 4.9844349296889275e-06, "loss": 3.7471, "step": 3590 }, { "epoch": 0.036570231119791664, "grad_norm": 10.030250549316406, "learning_rate": 4.984390345188838e-06, "loss": 3.841, "step": 3595 }, { "epoch": 0.03662109375, "grad_norm": 11.255728721618652, "learning_rate": 4.9843456971262095e-06, "loss": 3.5435, "step": 3600 }, { "epoch": 0.036671956380208336, "grad_norm": 9.756819725036621, "learning_rate": 4.984300985502185e-06, "loss": 3.3497, "step": 3605 }, { "epoch": 0.036722819010416664, "grad_norm": 12.754555702209473, "learning_rate": 4.984256210317909e-06, "loss": 3.723, "step": 3610 }, { "epoch": 0.036773681640625, "grad_norm": 15.393765449523926, "learning_rate": 4.984211371574527e-06, "loss": 3.99, "step": 3615 }, { "epoch": 0.036824544270833336, "grad_norm": 16.75415802001953, "learning_rate": 4.984166469273186e-06, "loss": 3.1881, "step": 3620 }, { "epoch": 0.036875406901041664, "grad_norm": 15.176877975463867, "learning_rate": 4.984121503415034e-06, "loss": 3.6833, "step": 3625 }, { "epoch": 0.03692626953125, "grad_norm": 95.34676361083984, "learning_rate": 4.9840764740012225e-06, "loss": 3.5113, "step": 3630 }, { "epoch": 0.036977132161458336, "grad_norm": 12.578669548034668, "learning_rate": 4.984031381032903e-06, "loss": 4.0164, "step": 3635 }, { "epoch": 0.037027994791666664, "grad_norm": 12.883451461791992, "learning_rate": 4.98398622451123e-06, "loss": 3.2015, "step": 3640 }, { "epoch": 0.037078857421875, "grad_norm": 14.403608322143555, "learning_rate": 4.983941004437358e-06, "loss": 3.2212, "step": 3645 }, { "epoch": 0.037129720052083336, "grad_norm": 14.960567474365234, "learning_rate": 4.983895720812444e-06, "loss": 3.2845, "step": 3650 }, { "epoch": 0.037180582682291664, "grad_norm": 9.842466354370117, "learning_rate": 4.9838503736376465e-06, "loss": 3.4744, "step": 3655 }, { "epoch": 0.0372314453125, "grad_norm": 9.656144142150879, "learning_rate": 4.983804962914126e-06, "loss": 3.6732, "step": 3660 }, { "epoch": 0.037282307942708336, "grad_norm": 15.675665855407715, "learning_rate": 4.983759488643045e-06, "loss": 3.4493, "step": 3665 }, { "epoch": 0.037333170572916664, "grad_norm": 11.249836921691895, "learning_rate": 4.983713950825565e-06, "loss": 3.6472, "step": 3670 }, { "epoch": 0.037384033203125, "grad_norm": 16.177907943725586, "learning_rate": 4.983668349462853e-06, "loss": 3.3561, "step": 3675 }, { "epoch": 0.037434895833333336, "grad_norm": 14.318443298339844, "learning_rate": 4.983622684556075e-06, "loss": 3.4628, "step": 3680 }, { "epoch": 0.037485758463541664, "grad_norm": 14.132331848144531, "learning_rate": 4.9835769561064e-06, "loss": 3.4866, "step": 3685 }, { "epoch": 0.03753662109375, "grad_norm": 12.774168014526367, "learning_rate": 4.9835311641149955e-06, "loss": 3.2058, "step": 3690 }, { "epoch": 0.037587483723958336, "grad_norm": 10.149826049804688, "learning_rate": 4.983485308583036e-06, "loss": 3.2685, "step": 3695 }, { "epoch": 0.037638346354166664, "grad_norm": 12.236494064331055, "learning_rate": 4.983439389511693e-06, "loss": 3.3043, "step": 3700 }, { "epoch": 0.037689208984375, "grad_norm": 13.76052188873291, "learning_rate": 4.983393406902142e-06, "loss": 3.6717, "step": 3705 }, { "epoch": 0.037740071614583336, "grad_norm": 16.32668113708496, "learning_rate": 4.983347360755559e-06, "loss": 3.6747, "step": 3710 }, { "epoch": 0.037790934244791664, "grad_norm": 14.244294166564941, "learning_rate": 4.983301251073124e-06, "loss": 3.8085, "step": 3715 }, { "epoch": 0.037841796875, "grad_norm": 15.147490501403809, "learning_rate": 4.983255077856014e-06, "loss": 3.2321, "step": 3720 }, { "epoch": 0.037892659505208336, "grad_norm": 12.199444770812988, "learning_rate": 4.983208841105411e-06, "loss": 3.4871, "step": 3725 }, { "epoch": 0.037943522135416664, "grad_norm": 18.730581283569336, "learning_rate": 4.983162540822498e-06, "loss": 4.198, "step": 3730 }, { "epoch": 0.037994384765625, "grad_norm": 14.874238967895508, "learning_rate": 4.983116177008461e-06, "loss": 3.5318, "step": 3735 }, { "epoch": 0.038045247395833336, "grad_norm": 10.15190601348877, "learning_rate": 4.9830697496644855e-06, "loss": 3.3821, "step": 3740 }, { "epoch": 0.038096110026041664, "grad_norm": 17.800188064575195, "learning_rate": 4.983023258791758e-06, "loss": 3.5971, "step": 3745 }, { "epoch": 0.03814697265625, "grad_norm": 45.00802993774414, "learning_rate": 4.98297670439147e-06, "loss": 3.0957, "step": 3750 }, { "epoch": 0.038197835286458336, "grad_norm": 11.17233657836914, "learning_rate": 4.9829300864648104e-06, "loss": 3.6894, "step": 3755 }, { "epoch": 0.038248697916666664, "grad_norm": 16.200355529785156, "learning_rate": 4.982883405012974e-06, "loss": 3.2511, "step": 3760 }, { "epoch": 0.038299560546875, "grad_norm": 11.851147651672363, "learning_rate": 4.982836660037154e-06, "loss": 3.31, "step": 3765 }, { "epoch": 0.038350423177083336, "grad_norm": 8.351678848266602, "learning_rate": 4.982789851538545e-06, "loss": 3.1318, "step": 3770 }, { "epoch": 0.038401285807291664, "grad_norm": 12.307997703552246, "learning_rate": 4.982742979518348e-06, "loss": 3.3283, "step": 3775 }, { "epoch": 0.0384521484375, "grad_norm": 11.954642295837402, "learning_rate": 4.98269604397776e-06, "loss": 3.7551, "step": 3780 }, { "epoch": 0.038503011067708336, "grad_norm": 11.246746063232422, "learning_rate": 4.982649044917982e-06, "loss": 3.5243, "step": 3785 }, { "epoch": 0.038553873697916664, "grad_norm": 19.319927215576172, "learning_rate": 4.982601982340216e-06, "loss": 3.546, "step": 3790 }, { "epoch": 0.038604736328125, "grad_norm": 15.717957496643066, "learning_rate": 4.982554856245668e-06, "loss": 3.9242, "step": 3795 }, { "epoch": 0.038655598958333336, "grad_norm": 10.940032005310059, "learning_rate": 4.982507666635541e-06, "loss": 3.5011, "step": 3800 }, { "epoch": 0.038706461588541664, "grad_norm": 17.505874633789062, "learning_rate": 4.982460413511045e-06, "loss": 3.4926, "step": 3805 }, { "epoch": 0.03875732421875, "grad_norm": 13.361002922058105, "learning_rate": 4.9824130968733875e-06, "loss": 3.431, "step": 3810 }, { "epoch": 0.038808186848958336, "grad_norm": 11.022466659545898, "learning_rate": 4.982365716723779e-06, "loss": 3.4269, "step": 3815 }, { "epoch": 0.038859049479166664, "grad_norm": 13.40270709991455, "learning_rate": 4.982318273063432e-06, "loss": 3.4141, "step": 3820 }, { "epoch": 0.038909912109375, "grad_norm": 12.13808536529541, "learning_rate": 4.98227076589356e-06, "loss": 3.4167, "step": 3825 }, { "epoch": 0.038960774739583336, "grad_norm": 13.911450386047363, "learning_rate": 4.98222319521538e-06, "loss": 3.5935, "step": 3830 }, { "epoch": 0.039011637369791664, "grad_norm": 9.00546646118164, "learning_rate": 4.982175561030107e-06, "loss": 3.4011, "step": 3835 }, { "epoch": 0.0390625, "grad_norm": 15.89461612701416, "learning_rate": 4.982127863338961e-06, "loss": 3.3942, "step": 3840 }, { "epoch": 0.039113362630208336, "grad_norm": 15.606123924255371, "learning_rate": 4.982080102143161e-06, "loss": 3.4825, "step": 3845 }, { "epoch": 0.039164225260416664, "grad_norm": 16.339702606201172, "learning_rate": 4.982032277443931e-06, "loss": 3.5663, "step": 3850 }, { "epoch": 0.039215087890625, "grad_norm": 9.590949058532715, "learning_rate": 4.981984389242493e-06, "loss": 3.5181, "step": 3855 }, { "epoch": 0.039265950520833336, "grad_norm": 8.50133228302002, "learning_rate": 4.981936437540073e-06, "loss": 3.5247, "step": 3860 }, { "epoch": 0.039316813151041664, "grad_norm": 15.226415634155273, "learning_rate": 4.981888422337897e-06, "loss": 3.5207, "step": 3865 }, { "epoch": 0.03936767578125, "grad_norm": 15.351889610290527, "learning_rate": 4.981840343637194e-06, "loss": 3.6821, "step": 3870 }, { "epoch": 0.039418538411458336, "grad_norm": 11.84135627746582, "learning_rate": 4.981792201439195e-06, "loss": 3.4474, "step": 3875 }, { "epoch": 0.039469401041666664, "grad_norm": 19.106542587280273, "learning_rate": 4.9817439957451295e-06, "loss": 3.4201, "step": 3880 }, { "epoch": 0.039520263671875, "grad_norm": 14.762589454650879, "learning_rate": 4.981695726556233e-06, "loss": 3.6159, "step": 3885 }, { "epoch": 0.039571126302083336, "grad_norm": 10.755833625793457, "learning_rate": 4.98164739387374e-06, "loss": 3.6952, "step": 3890 }, { "epoch": 0.039621988932291664, "grad_norm": 7.459182262420654, "learning_rate": 4.9815989976988856e-06, "loss": 3.3008, "step": 3895 }, { "epoch": 0.0396728515625, "grad_norm": 16.29890251159668, "learning_rate": 4.98155053803291e-06, "loss": 3.7614, "step": 3900 }, { "epoch": 0.039723714192708336, "grad_norm": 15.238051414489746, "learning_rate": 4.981502014877051e-06, "loss": 3.5197, "step": 3905 }, { "epoch": 0.039774576822916664, "grad_norm": 16.749492645263672, "learning_rate": 4.981453428232551e-06, "loss": 4.1579, "step": 3910 }, { "epoch": 0.039825439453125, "grad_norm": 14.327499389648438, "learning_rate": 4.981404778100654e-06, "loss": 3.1206, "step": 3915 }, { "epoch": 0.039876302083333336, "grad_norm": 18.08525848388672, "learning_rate": 4.981356064482604e-06, "loss": 3.5667, "step": 3920 }, { "epoch": 0.039927164713541664, "grad_norm": 9.953446388244629, "learning_rate": 4.981307287379647e-06, "loss": 3.1772, "step": 3925 }, { "epoch": 0.03997802734375, "grad_norm": 12.199675559997559, "learning_rate": 4.9812584467930315e-06, "loss": 3.3682, "step": 3930 }, { "epoch": 0.040028889973958336, "grad_norm": 14.886872291564941, "learning_rate": 4.981209542724006e-06, "loss": 3.3351, "step": 3935 }, { "epoch": 0.040079752604166664, "grad_norm": 15.33847427368164, "learning_rate": 4.981160575173823e-06, "loss": 3.2065, "step": 3940 }, { "epoch": 0.040130615234375, "grad_norm": 10.76516056060791, "learning_rate": 4.981111544143735e-06, "loss": 3.6167, "step": 3945 }, { "epoch": 0.040181477864583336, "grad_norm": 10.835091590881348, "learning_rate": 4.981062449634996e-06, "loss": 3.3896, "step": 3950 }, { "epoch": 0.040232340494791664, "grad_norm": 8.555797576904297, "learning_rate": 4.981013291648861e-06, "loss": 3.3618, "step": 3955 }, { "epoch": 0.040283203125, "grad_norm": 20.06720733642578, "learning_rate": 4.980964070186591e-06, "loss": 3.1369, "step": 3960 }, { "epoch": 0.040334065755208336, "grad_norm": 10.057026863098145, "learning_rate": 4.9809147852494425e-06, "loss": 3.4979, "step": 3965 }, { "epoch": 0.040384928385416664, "grad_norm": 9.190557479858398, "learning_rate": 4.980865436838677e-06, "loss": 3.5271, "step": 3970 }, { "epoch": 0.040435791015625, "grad_norm": 13.96591854095459, "learning_rate": 4.9808160249555585e-06, "loss": 3.8185, "step": 3975 }, { "epoch": 0.040486653645833336, "grad_norm": 13.018309593200684, "learning_rate": 4.980766549601349e-06, "loss": 3.3418, "step": 3980 }, { "epoch": 0.040537516276041664, "grad_norm": 12.482734680175781, "learning_rate": 4.9807170107773155e-06, "loss": 3.7106, "step": 3985 }, { "epoch": 0.04058837890625, "grad_norm": 19.169965744018555, "learning_rate": 4.980667408484725e-06, "loss": 3.7609, "step": 3990 }, { "epoch": 0.040639241536458336, "grad_norm": 16.576343536376953, "learning_rate": 4.980617742724847e-06, "loss": 3.564, "step": 3995 }, { "epoch": 0.040690104166666664, "grad_norm": 13.268610954284668, "learning_rate": 4.980568013498952e-06, "loss": 3.5274, "step": 4000 }, { "epoch": 0.040740966796875, "grad_norm": 11.195540428161621, "learning_rate": 4.980518220808312e-06, "loss": 3.6116, "step": 4005 }, { "epoch": 0.040791829427083336, "grad_norm": 10.44593620300293, "learning_rate": 4.980468364654202e-06, "loss": 3.4021, "step": 4010 }, { "epoch": 0.040842692057291664, "grad_norm": 49.621131896972656, "learning_rate": 4.980418445037897e-06, "loss": 3.6859, "step": 4015 }, { "epoch": 0.0408935546875, "grad_norm": 14.462821006774902, "learning_rate": 4.980368461960673e-06, "loss": 3.4476, "step": 4020 }, { "epoch": 0.040944417317708336, "grad_norm": 10.35971736907959, "learning_rate": 4.98031841542381e-06, "loss": 3.6455, "step": 4025 }, { "epoch": 0.040995279947916664, "grad_norm": 16.127132415771484, "learning_rate": 4.980268305428589e-06, "loss": 3.4884, "step": 4030 }, { "epoch": 0.041046142578125, "grad_norm": 11.943824768066406, "learning_rate": 4.980218131976291e-06, "loss": 3.7037, "step": 4035 }, { "epoch": 0.041097005208333336, "grad_norm": 13.393162727355957, "learning_rate": 4.9801678950682e-06, "loss": 3.5249, "step": 4040 }, { "epoch": 0.041147867838541664, "grad_norm": 13.681143760681152, "learning_rate": 4.9801175947056005e-06, "loss": 3.4255, "step": 4045 }, { "epoch": 0.04119873046875, "grad_norm": 14.354728698730469, "learning_rate": 4.980067230889781e-06, "loss": 3.3633, "step": 4050 }, { "epoch": 0.041249593098958336, "grad_norm": 15.5615873336792, "learning_rate": 4.9800168036220295e-06, "loss": 3.5337, "step": 4055 }, { "epoch": 0.041300455729166664, "grad_norm": 8.801639556884766, "learning_rate": 4.9799663129036354e-06, "loss": 3.5424, "step": 4060 }, { "epoch": 0.041351318359375, "grad_norm": 11.887249946594238, "learning_rate": 4.9799157587358905e-06, "loss": 3.595, "step": 4065 }, { "epoch": 0.041402180989583336, "grad_norm": 10.93061637878418, "learning_rate": 4.979865141120089e-06, "loss": 3.2786, "step": 4070 }, { "epoch": 0.041453043619791664, "grad_norm": 13.5011625289917, "learning_rate": 4.979814460057527e-06, "loss": 3.3899, "step": 4075 }, { "epoch": 0.04150390625, "grad_norm": 13.714580535888672, "learning_rate": 4.979763715549498e-06, "loss": 3.3754, "step": 4080 }, { "epoch": 0.041554768880208336, "grad_norm": 8.235960960388184, "learning_rate": 4.9797129075973025e-06, "loss": 3.4213, "step": 4085 }, { "epoch": 0.041605631510416664, "grad_norm": 9.007630348205566, "learning_rate": 4.979662036202241e-06, "loss": 3.5357, "step": 4090 }, { "epoch": 0.041656494140625, "grad_norm": 11.23054313659668, "learning_rate": 4.979611101365613e-06, "loss": 3.7044, "step": 4095 }, { "epoch": 0.041707356770833336, "grad_norm": 12.494560241699219, "learning_rate": 4.979560103088723e-06, "loss": 3.5578, "step": 4100 }, { "epoch": 0.041758219401041664, "grad_norm": 15.440023422241211, "learning_rate": 4.979509041372876e-06, "loss": 3.3444, "step": 4105 }, { "epoch": 0.04180908203125, "grad_norm": 14.82787799835205, "learning_rate": 4.979457916219378e-06, "loss": 3.7682, "step": 4110 }, { "epoch": 0.041859944661458336, "grad_norm": 13.421795845031738, "learning_rate": 4.979406727629536e-06, "loss": 3.673, "step": 4115 }, { "epoch": 0.041910807291666664, "grad_norm": 19.039793014526367, "learning_rate": 4.979355475604661e-06, "loss": 3.5433, "step": 4120 }, { "epoch": 0.041961669921875, "grad_norm": 16.655475616455078, "learning_rate": 4.979304160146064e-06, "loss": 3.2191, "step": 4125 }, { "epoch": 0.042012532552083336, "grad_norm": 13.381031036376953, "learning_rate": 4.979252781255057e-06, "loss": 3.1412, "step": 4130 }, { "epoch": 0.042063395182291664, "grad_norm": 11.062989234924316, "learning_rate": 4.979201338932956e-06, "loss": 3.8812, "step": 4135 }, { "epoch": 0.0421142578125, "grad_norm": 11.04766845703125, "learning_rate": 4.979149833181076e-06, "loss": 3.1701, "step": 4140 }, { "epoch": 0.042165120442708336, "grad_norm": 12.870123863220215, "learning_rate": 4.979098264000736e-06, "loss": 3.4735, "step": 4145 }, { "epoch": 0.042215983072916664, "grad_norm": 11.865944862365723, "learning_rate": 4.979046631393253e-06, "loss": 3.666, "step": 4150 }, { "epoch": 0.042266845703125, "grad_norm": 10.82459831237793, "learning_rate": 4.97899493535995e-06, "loss": 3.6551, "step": 4155 }, { "epoch": 0.042317708333333336, "grad_norm": 12.600262641906738, "learning_rate": 4.97894317590215e-06, "loss": 3.8571, "step": 4160 }, { "epoch": 0.042368570963541664, "grad_norm": 9.6876220703125, "learning_rate": 4.978891353021176e-06, "loss": 3.5783, "step": 4165 }, { "epoch": 0.04241943359375, "grad_norm": 13.637721061706543, "learning_rate": 4.978839466718354e-06, "loss": 3.7054, "step": 4170 }, { "epoch": 0.042470296223958336, "grad_norm": 10.315239906311035, "learning_rate": 4.978787516995012e-06, "loss": 3.442, "step": 4175 }, { "epoch": 0.042521158854166664, "grad_norm": 12.303563117980957, "learning_rate": 4.9787355038524785e-06, "loss": 3.425, "step": 4180 }, { "epoch": 0.042572021484375, "grad_norm": 8.549363136291504, "learning_rate": 4.978683427292086e-06, "loss": 3.3095, "step": 4185 }, { "epoch": 0.042622884114583336, "grad_norm": 14.406485557556152, "learning_rate": 4.978631287315165e-06, "loss": 3.4881, "step": 4190 }, { "epoch": 0.042673746744791664, "grad_norm": 13.812122344970703, "learning_rate": 4.978579083923049e-06, "loss": 3.5369, "step": 4195 }, { "epoch": 0.042724609375, "grad_norm": 14.43535041809082, "learning_rate": 4.978526817117075e-06, "loss": 3.5955, "step": 4200 }, { "epoch": 0.042775472005208336, "grad_norm": 14.955559730529785, "learning_rate": 4.97847448689858e-06, "loss": 3.4783, "step": 4205 }, { "epoch": 0.042826334635416664, "grad_norm": 15.662449836730957, "learning_rate": 4.978422093268903e-06, "loss": 3.0456, "step": 4210 }, { "epoch": 0.042877197265625, "grad_norm": 21.45525360107422, "learning_rate": 4.978369636229383e-06, "loss": 3.3702, "step": 4215 }, { "epoch": 0.042928059895833336, "grad_norm": 11.133001327514648, "learning_rate": 4.978317115781365e-06, "loss": 3.7719, "step": 4220 }, { "epoch": 0.042978922526041664, "grad_norm": 14.71216869354248, "learning_rate": 4.97826453192619e-06, "loss": 3.2268, "step": 4225 }, { "epoch": 0.04302978515625, "grad_norm": 8.537089347839355, "learning_rate": 4.978211884665205e-06, "loss": 3.3107, "step": 4230 }, { "epoch": 0.043080647786458336, "grad_norm": 20.554950714111328, "learning_rate": 4.978159173999756e-06, "loss": 3.9013, "step": 4235 }, { "epoch": 0.043131510416666664, "grad_norm": 16.465560913085938, "learning_rate": 4.9781063999311914e-06, "loss": 3.5264, "step": 4240 }, { "epoch": 0.043182373046875, "grad_norm": 12.345917701721191, "learning_rate": 4.978053562460863e-06, "loss": 3.6222, "step": 4245 }, { "epoch": 0.043233235677083336, "grad_norm": 14.286460876464844, "learning_rate": 4.978000661590121e-06, "loss": 3.55, "step": 4250 }, { "epoch": 0.043284098307291664, "grad_norm": 16.970375061035156, "learning_rate": 4.97794769732032e-06, "loss": 3.7756, "step": 4255 }, { "epoch": 0.0433349609375, "grad_norm": 12.45829963684082, "learning_rate": 4.977894669652814e-06, "loss": 3.5234, "step": 4260 }, { "epoch": 0.043385823567708336, "grad_norm": 7.817399501800537, "learning_rate": 4.97784157858896e-06, "loss": 3.658, "step": 4265 }, { "epoch": 0.043436686197916664, "grad_norm": 16.19847869873047, "learning_rate": 4.9777884241301165e-06, "loss": 3.5021, "step": 4270 }, { "epoch": 0.043487548828125, "grad_norm": 8.416207313537598, "learning_rate": 4.977735206277644e-06, "loss": 3.3046, "step": 4275 }, { "epoch": 0.043538411458333336, "grad_norm": 13.35494613647461, "learning_rate": 4.977681925032902e-06, "loss": 3.3918, "step": 4280 }, { "epoch": 0.043589274088541664, "grad_norm": 10.832646369934082, "learning_rate": 4.977628580397257e-06, "loss": 3.4338, "step": 4285 }, { "epoch": 0.04364013671875, "grad_norm": 9.665356636047363, "learning_rate": 4.977575172372072e-06, "loss": 3.5508, "step": 4290 }, { "epoch": 0.043690999348958336, "grad_norm": 11.935736656188965, "learning_rate": 4.977521700958712e-06, "loss": 3.463, "step": 4295 }, { "epoch": 0.043741861979166664, "grad_norm": 16.61982536315918, "learning_rate": 4.977468166158548e-06, "loss": 3.7634, "step": 4300 }, { "epoch": 0.043792724609375, "grad_norm": 14.604238510131836, "learning_rate": 4.977414567972948e-06, "loss": 3.3686, "step": 4305 }, { "epoch": 0.043843587239583336, "grad_norm": 13.716455459594727, "learning_rate": 4.977360906403283e-06, "loss": 3.374, "step": 4310 }, { "epoch": 0.043894449869791664, "grad_norm": 11.446566581726074, "learning_rate": 4.977307181450926e-06, "loss": 3.3746, "step": 4315 }, { "epoch": 0.0439453125, "grad_norm": 9.788915634155273, "learning_rate": 4.977253393117253e-06, "loss": 3.5568, "step": 4320 }, { "epoch": 0.043996175130208336, "grad_norm": 203.79078674316406, "learning_rate": 4.977199541403638e-06, "loss": 3.7115, "step": 4325 }, { "epoch": 0.044047037760416664, "grad_norm": 14.938610076904297, "learning_rate": 4.97714562631146e-06, "loss": 3.6643, "step": 4330 }, { "epoch": 0.044097900390625, "grad_norm": 15.485106468200684, "learning_rate": 4.977091647842099e-06, "loss": 3.5269, "step": 4335 }, { "epoch": 0.044148763020833336, "grad_norm": 12.620315551757812, "learning_rate": 4.977037605996936e-06, "loss": 4.1156, "step": 4340 }, { "epoch": 0.044199625651041664, "grad_norm": 16.161775588989258, "learning_rate": 4.976983500777352e-06, "loss": 3.845, "step": 4345 }, { "epoch": 0.04425048828125, "grad_norm": 14.427170753479004, "learning_rate": 4.976929332184732e-06, "loss": 3.7344, "step": 4350 }, { "epoch": 0.044301350911458336, "grad_norm": 15.274232864379883, "learning_rate": 4.976875100220462e-06, "loss": 3.3062, "step": 4355 }, { "epoch": 0.044352213541666664, "grad_norm": 8.53735637664795, "learning_rate": 4.97682080488593e-06, "loss": 3.272, "step": 4360 }, { "epoch": 0.044403076171875, "grad_norm": 20.58081817626953, "learning_rate": 4.9767664461825246e-06, "loss": 3.3835, "step": 4365 }, { "epoch": 0.044453938802083336, "grad_norm": 8.867008209228516, "learning_rate": 4.976712024111637e-06, "loss": 3.36, "step": 4370 }, { "epoch": 0.044504801432291664, "grad_norm": 13.777779579162598, "learning_rate": 4.976657538674659e-06, "loss": 3.6504, "step": 4375 }, { "epoch": 0.0445556640625, "grad_norm": 13.533966064453125, "learning_rate": 4.9766029898729865e-06, "loss": 3.6923, "step": 4380 }, { "epoch": 0.044606526692708336, "grad_norm": 16.9124698638916, "learning_rate": 4.976548377708011e-06, "loss": 3.7102, "step": 4385 }, { "epoch": 0.044657389322916664, "grad_norm": 14.597479820251465, "learning_rate": 4.9764937021811345e-06, "loss": 3.6415, "step": 4390 }, { "epoch": 0.044708251953125, "grad_norm": 12.073945999145508, "learning_rate": 4.976438963293753e-06, "loss": 3.5683, "step": 4395 }, { "epoch": 0.044759114583333336, "grad_norm": 10.953951835632324, "learning_rate": 4.976384161047266e-06, "loss": 3.4114, "step": 4400 }, { "epoch": 0.044809977213541664, "grad_norm": 11.895225524902344, "learning_rate": 4.976329295443079e-06, "loss": 3.5981, "step": 4405 }, { "epoch": 0.04486083984375, "grad_norm": 13.530261039733887, "learning_rate": 4.976274366482593e-06, "loss": 3.416, "step": 4410 }, { "epoch": 0.044911702473958336, "grad_norm": 12.748217582702637, "learning_rate": 4.9762193741672145e-06, "loss": 3.6393, "step": 4415 }, { "epoch": 0.044962565104166664, "grad_norm": 34.55634307861328, "learning_rate": 4.976164318498351e-06, "loss": 3.6435, "step": 4420 }, { "epoch": 0.045013427734375, "grad_norm": 9.771318435668945, "learning_rate": 4.9761091994774095e-06, "loss": 3.8975, "step": 4425 }, { "epoch": 0.045064290364583336, "grad_norm": 10.606549263000488, "learning_rate": 4.976054017105801e-06, "loss": 3.8694, "step": 4430 }, { "epoch": 0.045115152994791664, "grad_norm": 18.156574249267578, "learning_rate": 4.975998771384938e-06, "loss": 3.8872, "step": 4435 }, { "epoch": 0.045166015625, "grad_norm": 12.91622257232666, "learning_rate": 4.9759434623162325e-06, "loss": 3.5453, "step": 4440 }, { "epoch": 0.045216878255208336, "grad_norm": 12.400276184082031, "learning_rate": 4.975888089901101e-06, "loss": 3.7277, "step": 4445 }, { "epoch": 0.045267740885416664, "grad_norm": 11.56432056427002, "learning_rate": 4.97583265414096e-06, "loss": 3.62, "step": 4450 }, { "epoch": 0.045318603515625, "grad_norm": 12.74065113067627, "learning_rate": 4.975777155037226e-06, "loss": 3.2927, "step": 4455 }, { "epoch": 0.045369466145833336, "grad_norm": 16.921592712402344, "learning_rate": 4.975721592591321e-06, "loss": 3.4098, "step": 4460 }, { "epoch": 0.045420328776041664, "grad_norm": 10.564414024353027, "learning_rate": 4.975665966804666e-06, "loss": 3.5746, "step": 4465 }, { "epoch": 0.04547119140625, "grad_norm": 11.815017700195312, "learning_rate": 4.9756102776786845e-06, "loss": 3.0929, "step": 4470 }, { "epoch": 0.045522054036458336, "grad_norm": 11.405243873596191, "learning_rate": 4.9755545252147995e-06, "loss": 3.8489, "step": 4475 }, { "epoch": 0.045572916666666664, "grad_norm": 14.713571548461914, "learning_rate": 4.97549870941444e-06, "loss": 3.7178, "step": 4480 }, { "epoch": 0.045623779296875, "grad_norm": 11.61874771118164, "learning_rate": 4.9754428302790325e-06, "loss": 3.3223, "step": 4485 }, { "epoch": 0.045674641927083336, "grad_norm": 8.922009468078613, "learning_rate": 4.975386887810007e-06, "loss": 3.7321, "step": 4490 }, { "epoch": 0.045725504557291664, "grad_norm": 8.890037536621094, "learning_rate": 4.975330882008794e-06, "loss": 3.4178, "step": 4495 }, { "epoch": 0.0457763671875, "grad_norm": 11.523167610168457, "learning_rate": 4.9752748128768275e-06, "loss": 3.8556, "step": 4500 }, { "epoch": 0.045827229817708336, "grad_norm": 15.356710433959961, "learning_rate": 4.975218680415541e-06, "loss": 3.4063, "step": 4505 }, { "epoch": 0.045878092447916664, "grad_norm": 12.003191947937012, "learning_rate": 4.9751624846263725e-06, "loss": 3.727, "step": 4510 }, { "epoch": 0.045928955078125, "grad_norm": 11.115232467651367, "learning_rate": 4.9751062255107575e-06, "loss": 3.3798, "step": 4515 }, { "epoch": 0.045979817708333336, "grad_norm": 12.786600112915039, "learning_rate": 4.975049903070137e-06, "loss": 3.8232, "step": 4520 }, { "epoch": 0.046030680338541664, "grad_norm": 9.778971672058105, "learning_rate": 4.974993517305952e-06, "loss": 3.5046, "step": 4525 }, { "epoch": 0.04608154296875, "grad_norm": 11.696513175964355, "learning_rate": 4.974937068219643e-06, "loss": 3.5063, "step": 4530 }, { "epoch": 0.046132405598958336, "grad_norm": 10.245550155639648, "learning_rate": 4.974880555812656e-06, "loss": 3.2886, "step": 4535 }, { "epoch": 0.046183268229166664, "grad_norm": 15.772186279296875, "learning_rate": 4.9748239800864375e-06, "loss": 3.482, "step": 4540 }, { "epoch": 0.046234130859375, "grad_norm": 11.759252548217773, "learning_rate": 4.974767341042433e-06, "loss": 3.663, "step": 4545 }, { "epoch": 0.046284993489583336, "grad_norm": 9.798601150512695, "learning_rate": 4.9747106386820934e-06, "loss": 3.3534, "step": 4550 }, { "epoch": 0.046335856119791664, "grad_norm": 13.721288681030273, "learning_rate": 4.9746538730068684e-06, "loss": 3.4893, "step": 4555 }, { "epoch": 0.04638671875, "grad_norm": 11.46217155456543, "learning_rate": 4.974597044018211e-06, "loss": 3.3706, "step": 4560 }, { "epoch": 0.046437581380208336, "grad_norm": 8.599750518798828, "learning_rate": 4.974540151717574e-06, "loss": 4.0034, "step": 4565 }, { "epoch": 0.046488444010416664, "grad_norm": 15.226737022399902, "learning_rate": 4.974483196106415e-06, "loss": 3.591, "step": 4570 }, { "epoch": 0.046539306640625, "grad_norm": 8.270947456359863, "learning_rate": 4.9744261771861894e-06, "loss": 3.3649, "step": 4575 }, { "epoch": 0.046590169270833336, "grad_norm": 13.826691627502441, "learning_rate": 4.974369094958356e-06, "loss": 3.7075, "step": 4580 }, { "epoch": 0.046641031901041664, "grad_norm": 15.241434097290039, "learning_rate": 4.974311949424376e-06, "loss": 3.9796, "step": 4585 }, { "epoch": 0.04669189453125, "grad_norm": 9.045734405517578, "learning_rate": 4.974254740585712e-06, "loss": 3.3453, "step": 4590 }, { "epoch": 0.046742757161458336, "grad_norm": 12.359314918518066, "learning_rate": 4.974197468443826e-06, "loss": 3.5554, "step": 4595 }, { "epoch": 0.046793619791666664, "grad_norm": 16.416982650756836, "learning_rate": 4.974140133000184e-06, "loss": 3.4799, "step": 4600 }, { "epoch": 0.046844482421875, "grad_norm": 12.722060203552246, "learning_rate": 4.974082734256254e-06, "loss": 3.8829, "step": 4605 }, { "epoch": 0.046895345052083336, "grad_norm": 14.915850639343262, "learning_rate": 4.9740252722135035e-06, "loss": 3.0652, "step": 4610 }, { "epoch": 0.046946207682291664, "grad_norm": 15.305362701416016, "learning_rate": 4.973967746873403e-06, "loss": 3.2942, "step": 4615 }, { "epoch": 0.0469970703125, "grad_norm": 18.63602638244629, "learning_rate": 4.973910158237423e-06, "loss": 3.1882, "step": 4620 }, { "epoch": 0.047047932942708336, "grad_norm": 15.647112846374512, "learning_rate": 4.973852506307039e-06, "loss": 3.2307, "step": 4625 }, { "epoch": 0.047098795572916664, "grad_norm": 13.300541877746582, "learning_rate": 4.973794791083725e-06, "loss": 3.6601, "step": 4630 }, { "epoch": 0.047149658203125, "grad_norm": 12.027398109436035, "learning_rate": 4.9737370125689575e-06, "loss": 3.5073, "step": 4635 }, { "epoch": 0.047200520833333336, "grad_norm": 9.269347190856934, "learning_rate": 4.973679170764214e-06, "loss": 4.1268, "step": 4640 }, { "epoch": 0.047251383463541664, "grad_norm": 9.16490650177002, "learning_rate": 4.973621265670976e-06, "loss": 3.2929, "step": 4645 }, { "epoch": 0.04730224609375, "grad_norm": 11.5504789352417, "learning_rate": 4.973563297290724e-06, "loss": 3.369, "step": 4650 }, { "epoch": 0.047353108723958336, "grad_norm": 15.80301284790039, "learning_rate": 4.973505265624942e-06, "loss": 3.4352, "step": 4655 }, { "epoch": 0.047403971354166664, "grad_norm": 9.65170669555664, "learning_rate": 4.9734471706751135e-06, "loss": 3.8803, "step": 4660 }, { "epoch": 0.047454833984375, "grad_norm": 11.459211349487305, "learning_rate": 4.9733890124427255e-06, "loss": 3.5879, "step": 4665 }, { "epoch": 0.047505696614583336, "grad_norm": 16.0178279876709, "learning_rate": 4.973330790929266e-06, "loss": 3.6492, "step": 4670 }, { "epoch": 0.047556559244791664, "grad_norm": 16.536197662353516, "learning_rate": 4.973272506136224e-06, "loss": 3.2745, "step": 4675 }, { "epoch": 0.047607421875, "grad_norm": 10.868372917175293, "learning_rate": 4.973214158065092e-06, "loss": 3.4472, "step": 4680 }, { "epoch": 0.047658284505208336, "grad_norm": 14.260408401489258, "learning_rate": 4.973155746717361e-06, "loss": 3.4165, "step": 4685 }, { "epoch": 0.047709147135416664, "grad_norm": 10.171429634094238, "learning_rate": 4.973097272094527e-06, "loss": 3.209, "step": 4690 }, { "epoch": 0.047760009765625, "grad_norm": 13.096430778503418, "learning_rate": 4.973038734198086e-06, "loss": 3.4693, "step": 4695 }, { "epoch": 0.047810872395833336, "grad_norm": 15.2460355758667, "learning_rate": 4.972980133029534e-06, "loss": 3.4677, "step": 4700 }, { "epoch": 0.047861735026041664, "grad_norm": 13.545559883117676, "learning_rate": 4.9729214685903725e-06, "loss": 3.4694, "step": 4705 }, { "epoch": 0.04791259765625, "grad_norm": 17.726600646972656, "learning_rate": 4.9728627408821e-06, "loss": 3.5932, "step": 4710 }, { "epoch": 0.047963460286458336, "grad_norm": 13.018586158752441, "learning_rate": 4.972803949906222e-06, "loss": 3.4267, "step": 4715 }, { "epoch": 0.048014322916666664, "grad_norm": 15.820887565612793, "learning_rate": 4.9727450956642395e-06, "loss": 3.3878, "step": 4720 }, { "epoch": 0.048065185546875, "grad_norm": 10.09266471862793, "learning_rate": 4.972686178157661e-06, "loss": 3.7088, "step": 4725 }, { "epoch": 0.048116048177083336, "grad_norm": 14.535094261169434, "learning_rate": 4.972627197387993e-06, "loss": 3.492, "step": 4730 }, { "epoch": 0.048166910807291664, "grad_norm": 16.890783309936523, "learning_rate": 4.972568153356744e-06, "loss": 3.3656, "step": 4735 }, { "epoch": 0.0482177734375, "grad_norm": 8.608535766601562, "learning_rate": 4.972509046065423e-06, "loss": 3.4594, "step": 4740 }, { "epoch": 0.048268636067708336, "grad_norm": 12.446272850036621, "learning_rate": 4.9724498755155455e-06, "loss": 3.3436, "step": 4745 }, { "epoch": 0.048319498697916664, "grad_norm": 11.247904777526855, "learning_rate": 4.972390641708625e-06, "loss": 3.7488, "step": 4750 }, { "epoch": 0.048370361328125, "grad_norm": 10.85303783416748, "learning_rate": 4.972331344646175e-06, "loss": 3.4678, "step": 4755 }, { "epoch": 0.048421223958333336, "grad_norm": 12.21154499053955, "learning_rate": 4.972271984329713e-06, "loss": 3.4773, "step": 4760 }, { "epoch": 0.048472086588541664, "grad_norm": 12.006597518920898, "learning_rate": 4.9722125607607595e-06, "loss": 3.5087, "step": 4765 }, { "epoch": 0.04852294921875, "grad_norm": 9.981345176696777, "learning_rate": 4.972153073940833e-06, "loss": 3.5338, "step": 4770 }, { "epoch": 0.048573811848958336, "grad_norm": 13.28870964050293, "learning_rate": 4.972093523871456e-06, "loss": 3.2124, "step": 4775 }, { "epoch": 0.048624674479166664, "grad_norm": 11.399847984313965, "learning_rate": 4.972033910554151e-06, "loss": 3.251, "step": 4780 }, { "epoch": 0.048675537109375, "grad_norm": 11.87983226776123, "learning_rate": 4.971974233990447e-06, "loss": 3.2554, "step": 4785 }, { "epoch": 0.048726399739583336, "grad_norm": 10.418863296508789, "learning_rate": 4.971914494181866e-06, "loss": 3.8199, "step": 4790 }, { "epoch": 0.048777262369791664, "grad_norm": 13.546000480651855, "learning_rate": 4.971854691129939e-06, "loss": 3.6464, "step": 4795 }, { "epoch": 0.048828125, "grad_norm": 16.547273635864258, "learning_rate": 4.9717948248361954e-06, "loss": 3.3086, "step": 4800 }, { "epoch": 0.048878987630208336, "grad_norm": 10.11819839477539, "learning_rate": 4.971734895302168e-06, "loss": 3.5591, "step": 4805 }, { "epoch": 0.048929850260416664, "grad_norm": 11.45274829864502, "learning_rate": 4.971674902529389e-06, "loss": 3.666, "step": 4810 }, { "epoch": 0.048980712890625, "grad_norm": 12.099120140075684, "learning_rate": 4.971614846519393e-06, "loss": 3.6187, "step": 4815 }, { "epoch": 0.049031575520833336, "grad_norm": 9.340841293334961, "learning_rate": 4.971554727273718e-06, "loss": 3.4518, "step": 4820 }, { "epoch": 0.049082438151041664, "grad_norm": 14.398073196411133, "learning_rate": 4.9714945447939e-06, "loss": 3.514, "step": 4825 }, { "epoch": 0.04913330078125, "grad_norm": 10.84057331085205, "learning_rate": 4.97143429908148e-06, "loss": 4.0011, "step": 4830 }, { "epoch": 0.049184163411458336, "grad_norm": 14.066889762878418, "learning_rate": 4.971373990137999e-06, "loss": 3.4912, "step": 4835 }, { "epoch": 0.049235026041666664, "grad_norm": 13.424138069152832, "learning_rate": 4.971313617965001e-06, "loss": 3.4337, "step": 4840 }, { "epoch": 0.049285888671875, "grad_norm": 15.70202922821045, "learning_rate": 4.971253182564029e-06, "loss": 3.5523, "step": 4845 }, { "epoch": 0.049336751302083336, "grad_norm": 15.583520889282227, "learning_rate": 4.971192683936631e-06, "loss": 3.548, "step": 4850 }, { "epoch": 0.049387613932291664, "grad_norm": 16.691743850708008, "learning_rate": 4.9711321220843535e-06, "loss": 3.3919, "step": 4855 }, { "epoch": 0.0494384765625, "grad_norm": 15.311267852783203, "learning_rate": 4.971071497008746e-06, "loss": 3.275, "step": 4860 }, { "epoch": 0.049489339192708336, "grad_norm": 11.667826652526855, "learning_rate": 4.971010808711361e-06, "loss": 3.3042, "step": 4865 }, { "epoch": 0.049540201822916664, "grad_norm": 12.931231498718262, "learning_rate": 4.9709500571937485e-06, "loss": 3.6733, "step": 4870 }, { "epoch": 0.049591064453125, "grad_norm": 14.819002151489258, "learning_rate": 4.970889242457466e-06, "loss": 3.3424, "step": 4875 }, { "epoch": 0.049641927083333336, "grad_norm": 11.588568687438965, "learning_rate": 4.9708283645040675e-06, "loss": 3.2318, "step": 4880 }, { "epoch": 0.049692789713541664, "grad_norm": 14.38076400756836, "learning_rate": 4.970767423335111e-06, "loss": 3.5042, "step": 4885 }, { "epoch": 0.04974365234375, "grad_norm": 11.927538871765137, "learning_rate": 4.970706418952155e-06, "loss": 3.4414, "step": 4890 }, { "epoch": 0.049794514973958336, "grad_norm": 17.747844696044922, "learning_rate": 4.970645351356761e-06, "loss": 3.9686, "step": 4895 }, { "epoch": 0.049845377604166664, "grad_norm": 13.796122550964355, "learning_rate": 4.970584220550492e-06, "loss": 4.094, "step": 4900 }, { "epoch": 0.049896240234375, "grad_norm": 13.095043182373047, "learning_rate": 4.97052302653491e-06, "loss": 3.7906, "step": 4905 }, { "epoch": 0.049947102864583336, "grad_norm": 12.416662216186523, "learning_rate": 4.970461769311583e-06, "loss": 3.4955, "step": 4910 }, { "epoch": 0.049997965494791664, "grad_norm": 12.235672950744629, "learning_rate": 4.970400448882078e-06, "loss": 3.0274, "step": 4915 }, { "epoch": 0.050048828125, "grad_norm": 12.012406349182129, "learning_rate": 4.9703390652479615e-06, "loss": 3.3271, "step": 4920 }, { "epoch": 0.050099690755208336, "grad_norm": 11.558436393737793, "learning_rate": 4.970277618410806e-06, "loss": 3.4439, "step": 4925 }, { "epoch": 0.050150553385416664, "grad_norm": 10.99045467376709, "learning_rate": 4.970216108372184e-06, "loss": 3.4762, "step": 4930 }, { "epoch": 0.050201416015625, "grad_norm": 14.257964134216309, "learning_rate": 4.970154535133667e-06, "loss": 4.1728, "step": 4935 }, { "epoch": 0.050252278645833336, "grad_norm": 14.24341106414795, "learning_rate": 4.970092898696832e-06, "loss": 3.3222, "step": 4940 }, { "epoch": 0.050303141276041664, "grad_norm": 17.90021324157715, "learning_rate": 4.9700311990632565e-06, "loss": 3.2803, "step": 4945 }, { "epoch": 0.05035400390625, "grad_norm": 13.78532600402832, "learning_rate": 4.9699694362345175e-06, "loss": 3.6847, "step": 4950 }, { "epoch": 0.050404866536458336, "grad_norm": 9.49715518951416, "learning_rate": 4.969907610212197e-06, "loss": 3.4094, "step": 4955 }, { "epoch": 0.050455729166666664, "grad_norm": 16.413652420043945, "learning_rate": 4.969845720997874e-06, "loss": 3.4125, "step": 4960 }, { "epoch": 0.050506591796875, "grad_norm": 11.681251525878906, "learning_rate": 4.969783768593135e-06, "loss": 3.7634, "step": 4965 }, { "epoch": 0.050557454427083336, "grad_norm": 12.140727043151855, "learning_rate": 4.969721752999563e-06, "loss": 3.3175, "step": 4970 }, { "epoch": 0.050608317057291664, "grad_norm": 17.70606231689453, "learning_rate": 4.9696596742187455e-06, "loss": 3.7512, "step": 4975 }, { "epoch": 0.0506591796875, "grad_norm": 14.161558151245117, "learning_rate": 4.969597532252271e-06, "loss": 3.2145, "step": 4980 }, { "epoch": 0.050710042317708336, "grad_norm": 9.752479553222656, "learning_rate": 4.969535327101729e-06, "loss": 3.3454, "step": 4985 }, { "epoch": 0.050760904947916664, "grad_norm": 10.96493911743164, "learning_rate": 4.96947305876871e-06, "loss": 3.493, "step": 4990 }, { "epoch": 0.050811767578125, "grad_norm": 10.895770072937012, "learning_rate": 4.969410727254809e-06, "loss": 3.1966, "step": 4995 }, { "epoch": 0.050862630208333336, "grad_norm": 14.102956771850586, "learning_rate": 4.96934833256162e-06, "loss": 3.4336, "step": 5000 }, { "epoch": 0.050913492838541664, "grad_norm": 10.122415542602539, "learning_rate": 4.9692858746907395e-06, "loss": 3.275, "step": 5005 }, { "epoch": 0.05096435546875, "grad_norm": 16.442476272583008, "learning_rate": 4.969223353643764e-06, "loss": 3.7156, "step": 5010 }, { "epoch": 0.051015218098958336, "grad_norm": 12.107405662536621, "learning_rate": 4.969160769422294e-06, "loss": 3.645, "step": 5015 }, { "epoch": 0.051066080729166664, "grad_norm": 18.26190948486328, "learning_rate": 4.969098122027932e-06, "loss": 3.5743, "step": 5020 }, { "epoch": 0.051116943359375, "grad_norm": 11.404193878173828, "learning_rate": 4.96903541146228e-06, "loss": 3.2735, "step": 5025 }, { "epoch": 0.051167805989583336, "grad_norm": 14.428089141845703, "learning_rate": 4.968972637726942e-06, "loss": 3.7864, "step": 5030 }, { "epoch": 0.051218668619791664, "grad_norm": 10.478737831115723, "learning_rate": 4.9689098008235235e-06, "loss": 3.8017, "step": 5035 }, { "epoch": 0.05126953125, "grad_norm": 15.701272010803223, "learning_rate": 4.968846900753634e-06, "loss": 3.4754, "step": 5040 }, { "epoch": 0.051320393880208336, "grad_norm": 11.944446563720703, "learning_rate": 4.968783937518882e-06, "loss": 3.5693, "step": 5045 }, { "epoch": 0.051371256510416664, "grad_norm": 9.105219841003418, "learning_rate": 4.968720911120876e-06, "loss": 3.3752, "step": 5050 }, { "epoch": 0.051422119140625, "grad_norm": 17.28725814819336, "learning_rate": 4.968657821561233e-06, "loss": 3.542, "step": 5055 }, { "epoch": 0.051472981770833336, "grad_norm": 9.695812225341797, "learning_rate": 4.9685946688415635e-06, "loss": 3.3903, "step": 5060 }, { "epoch": 0.051523844401041664, "grad_norm": 11.877903938293457, "learning_rate": 4.968531452963485e-06, "loss": 3.8497, "step": 5065 }, { "epoch": 0.05157470703125, "grad_norm": 14.348855018615723, "learning_rate": 4.968468173928614e-06, "loss": 3.2782, "step": 5070 }, { "epoch": 0.051625569661458336, "grad_norm": 14.756049156188965, "learning_rate": 4.96840483173857e-06, "loss": 4.0242, "step": 5075 }, { "epoch": 0.051676432291666664, "grad_norm": 14.059425354003906, "learning_rate": 4.968341426394974e-06, "loss": 3.2894, "step": 5080 }, { "epoch": 0.051727294921875, "grad_norm": 11.158661842346191, "learning_rate": 4.968277957899446e-06, "loss": 3.6611, "step": 5085 }, { "epoch": 0.051778157552083336, "grad_norm": 18.206151962280273, "learning_rate": 4.968214426253613e-06, "loss": 3.4665, "step": 5090 }, { "epoch": 0.051829020182291664, "grad_norm": 8.762948989868164, "learning_rate": 4.968150831459099e-06, "loss": 3.4422, "step": 5095 }, { "epoch": 0.0518798828125, "grad_norm": 14.994571685791016, "learning_rate": 4.968087173517531e-06, "loss": 3.602, "step": 5100 }, { "epoch": 0.051930745442708336, "grad_norm": 12.887874603271484, "learning_rate": 4.968023452430537e-06, "loss": 3.12, "step": 5105 }, { "epoch": 0.051981608072916664, "grad_norm": 15.499711990356445, "learning_rate": 4.967959668199748e-06, "loss": 3.7486, "step": 5110 }, { "epoch": 0.052032470703125, "grad_norm": 12.390300750732422, "learning_rate": 4.967895820826796e-06, "loss": 3.4817, "step": 5115 }, { "epoch": 0.052083333333333336, "grad_norm": 13.03585433959961, "learning_rate": 4.967831910313314e-06, "loss": 3.1236, "step": 5120 }, { "epoch": 0.052134195963541664, "grad_norm": 11.844696998596191, "learning_rate": 4.967767936660939e-06, "loss": 3.4756, "step": 5125 }, { "epoch": 0.05218505859375, "grad_norm": 12.35226821899414, "learning_rate": 4.967703899871304e-06, "loss": 3.481, "step": 5130 }, { "epoch": 0.052235921223958336, "grad_norm": 10.799506187438965, "learning_rate": 4.967639799946052e-06, "loss": 3.7089, "step": 5135 }, { "epoch": 0.052286783854166664, "grad_norm": 11.297710418701172, "learning_rate": 4.967575636886819e-06, "loss": 3.5326, "step": 5140 }, { "epoch": 0.052337646484375, "grad_norm": 9.424179077148438, "learning_rate": 4.967511410695249e-06, "loss": 3.7654, "step": 5145 }, { "epoch": 0.052388509114583336, "grad_norm": 11.021271705627441, "learning_rate": 4.9674471213729836e-06, "loss": 3.278, "step": 5150 }, { "epoch": 0.052439371744791664, "grad_norm": 12.35921573638916, "learning_rate": 4.9673827689216695e-06, "loss": 3.4304, "step": 5155 }, { "epoch": 0.052490234375, "grad_norm": 12.476812362670898, "learning_rate": 4.967318353342952e-06, "loss": 3.3936, "step": 5160 }, { "epoch": 0.052541097005208336, "grad_norm": 10.683613777160645, "learning_rate": 4.967253874638478e-06, "loss": 3.1559, "step": 5165 }, { "epoch": 0.052591959635416664, "grad_norm": 13.43266773223877, "learning_rate": 4.967189332809899e-06, "loss": 3.8937, "step": 5170 }, { "epoch": 0.052642822265625, "grad_norm": 17.731548309326172, "learning_rate": 4.967124727858867e-06, "loss": 3.4165, "step": 5175 }, { "epoch": 0.052693684895833336, "grad_norm": 15.984275817871094, "learning_rate": 4.967060059787032e-06, "loss": 3.6395, "step": 5180 }, { "epoch": 0.052744547526041664, "grad_norm": 18.11372184753418, "learning_rate": 4.96699532859605e-06, "loss": 3.495, "step": 5185 }, { "epoch": 0.05279541015625, "grad_norm": 13.443792343139648, "learning_rate": 4.9669305342875785e-06, "loss": 4.1017, "step": 5190 }, { "epoch": 0.052846272786458336, "grad_norm": 16.402788162231445, "learning_rate": 4.9668656768632725e-06, "loss": 3.4839, "step": 5195 }, { "epoch": 0.052897135416666664, "grad_norm": 14.646184921264648, "learning_rate": 4.966800756324794e-06, "loss": 3.7131, "step": 5200 }, { "epoch": 0.052947998046875, "grad_norm": 16.304649353027344, "learning_rate": 4.966735772673803e-06, "loss": 3.4049, "step": 5205 }, { "epoch": 0.052998860677083336, "grad_norm": 17.611019134521484, "learning_rate": 4.966670725911962e-06, "loss": 3.2992, "step": 5210 }, { "epoch": 0.053049723307291664, "grad_norm": 9.94887638092041, "learning_rate": 4.966605616040935e-06, "loss": 3.498, "step": 5215 }, { "epoch": 0.0531005859375, "grad_norm": 14.684574127197266, "learning_rate": 4.9665404430623874e-06, "loss": 3.6487, "step": 5220 }, { "epoch": 0.053151448567708336, "grad_norm": 14.74726676940918, "learning_rate": 4.9664752069779875e-06, "loss": 3.3533, "step": 5225 }, { "epoch": 0.053202311197916664, "grad_norm": 16.72749137878418, "learning_rate": 4.966409907789403e-06, "loss": 3.203, "step": 5230 }, { "epoch": 0.053253173828125, "grad_norm": 12.44540786743164, "learning_rate": 4.966344545498307e-06, "loss": 3.4401, "step": 5235 }, { "epoch": 0.053304036458333336, "grad_norm": 10.099539756774902, "learning_rate": 4.96627912010637e-06, "loss": 3.5915, "step": 5240 }, { "epoch": 0.053354899088541664, "grad_norm": 11.097341537475586, "learning_rate": 4.966213631615266e-06, "loss": 3.9558, "step": 5245 }, { "epoch": 0.05340576171875, "grad_norm": 13.878585815429688, "learning_rate": 4.966148080026671e-06, "loss": 3.4617, "step": 5250 }, { "epoch": 0.053456624348958336, "grad_norm": 10.348923683166504, "learning_rate": 4.966082465342263e-06, "loss": 3.6141, "step": 5255 }, { "epoch": 0.053507486979166664, "grad_norm": 9.475302696228027, "learning_rate": 4.966016787563719e-06, "loss": 3.2901, "step": 5260 }, { "epoch": 0.053558349609375, "grad_norm": 16.38394546508789, "learning_rate": 4.965951046692719e-06, "loss": 3.1904, "step": 5265 }, { "epoch": 0.053609212239583336, "grad_norm": 14.318309783935547, "learning_rate": 4.965885242730947e-06, "loss": 3.3258, "step": 5270 }, { "epoch": 0.053660074869791664, "grad_norm": 8.593515396118164, "learning_rate": 4.965819375680085e-06, "loss": 3.341, "step": 5275 }, { "epoch": 0.0537109375, "grad_norm": 14.667823791503906, "learning_rate": 4.9657534455418186e-06, "loss": 3.5297, "step": 5280 }, { "epoch": 0.053761800130208336, "grad_norm": 15.408576965332031, "learning_rate": 4.965687452317836e-06, "loss": 3.3244, "step": 5285 }, { "epoch": 0.053812662760416664, "grad_norm": 15.13183307647705, "learning_rate": 4.9656213960098235e-06, "loss": 3.3597, "step": 5290 }, { "epoch": 0.053863525390625, "grad_norm": 10.050660133361816, "learning_rate": 4.965555276619471e-06, "loss": 3.2173, "step": 5295 }, { "epoch": 0.053914388020833336, "grad_norm": 11.996289253234863, "learning_rate": 4.965489094148473e-06, "loss": 3.258, "step": 5300 }, { "epoch": 0.053965250651041664, "grad_norm": 13.994873046875, "learning_rate": 4.965422848598519e-06, "loss": 3.9475, "step": 5305 }, { "epoch": 0.05401611328125, "grad_norm": 13.173226356506348, "learning_rate": 4.965356539971307e-06, "loss": 3.5109, "step": 5310 }, { "epoch": 0.054066975911458336, "grad_norm": 11.68813419342041, "learning_rate": 4.965290168268532e-06, "loss": 3.7385, "step": 5315 }, { "epoch": 0.054117838541666664, "grad_norm": 13.431297302246094, "learning_rate": 4.965223733491893e-06, "loss": 3.4336, "step": 5320 }, { "epoch": 0.054168701171875, "grad_norm": 14.174638748168945, "learning_rate": 4.965157235643088e-06, "loss": 3.2346, "step": 5325 }, { "epoch": 0.054219563802083336, "grad_norm": 14.978144645690918, "learning_rate": 4.96509067472382e-06, "loss": 3.6034, "step": 5330 }, { "epoch": 0.054270426432291664, "grad_norm": 11.926872253417969, "learning_rate": 4.965024050735792e-06, "loss": 3.4996, "step": 5335 }, { "epoch": 0.0543212890625, "grad_norm": 9.099247932434082, "learning_rate": 4.9649573636807065e-06, "loss": 3.1218, "step": 5340 }, { "epoch": 0.054372151692708336, "grad_norm": 17.655933380126953, "learning_rate": 4.964890613560272e-06, "loss": 3.6185, "step": 5345 }, { "epoch": 0.054423014322916664, "grad_norm": 10.588809967041016, "learning_rate": 4.964823800376195e-06, "loss": 3.5396, "step": 5350 }, { "epoch": 0.054473876953125, "grad_norm": 12.350024223327637, "learning_rate": 4.964756924130186e-06, "loss": 3.7472, "step": 5355 }, { "epoch": 0.054524739583333336, "grad_norm": 13.325206756591797, "learning_rate": 4.964689984823955e-06, "loss": 3.1619, "step": 5360 }, { "epoch": 0.054575602213541664, "grad_norm": 12.419147491455078, "learning_rate": 4.964622982459214e-06, "loss": 3.3821, "step": 5365 }, { "epoch": 0.05462646484375, "grad_norm": 11.1918306350708, "learning_rate": 4.964555917037679e-06, "loss": 3.2978, "step": 5370 }, { "epoch": 0.054677327473958336, "grad_norm": 8.211668014526367, "learning_rate": 4.964488788561066e-06, "loss": 3.1058, "step": 5375 }, { "epoch": 0.054728190104166664, "grad_norm": 14.706374168395996, "learning_rate": 4.96442159703109e-06, "loss": 3.5759, "step": 5380 }, { "epoch": 0.054779052734375, "grad_norm": 12.363343238830566, "learning_rate": 4.964354342449472e-06, "loss": 3.3521, "step": 5385 }, { "epoch": 0.054829915364583336, "grad_norm": 11.750041007995605, "learning_rate": 4.964287024817933e-06, "loss": 3.7942, "step": 5390 }, { "epoch": 0.054880777994791664, "grad_norm": 15.14163589477539, "learning_rate": 4.964219644138194e-06, "loss": 3.7889, "step": 5395 }, { "epoch": 0.054931640625, "grad_norm": 14.809743881225586, "learning_rate": 4.964152200411979e-06, "loss": 3.1966, "step": 5400 }, { "epoch": 0.054982503255208336, "grad_norm": 10.114827156066895, "learning_rate": 4.964084693641014e-06, "loss": 3.5248, "step": 5405 }, { "epoch": 0.055033365885416664, "grad_norm": 12.720572471618652, "learning_rate": 4.964017123827027e-06, "loss": 3.1197, "step": 5410 }, { "epoch": 0.055084228515625, "grad_norm": 8.508932113647461, "learning_rate": 4.963949490971746e-06, "loss": 3.8478, "step": 5415 }, { "epoch": 0.055135091145833336, "grad_norm": 7.770327091217041, "learning_rate": 4.963881795076901e-06, "loss": 3.0364, "step": 5420 }, { "epoch": 0.055185953776041664, "grad_norm": 13.418745994567871, "learning_rate": 4.963814036144223e-06, "loss": 3.1178, "step": 5425 }, { "epoch": 0.05523681640625, "grad_norm": 17.769771575927734, "learning_rate": 4.963746214175448e-06, "loss": 3.5998, "step": 5430 }, { "epoch": 0.055287679036458336, "grad_norm": 12.966880798339844, "learning_rate": 4.96367832917231e-06, "loss": 3.2851, "step": 5435 }, { "epoch": 0.055338541666666664, "grad_norm": 13.409180641174316, "learning_rate": 4.9636103811365464e-06, "loss": 3.7049, "step": 5440 }, { "epoch": 0.055389404296875, "grad_norm": 13.363730430603027, "learning_rate": 4.963542370069895e-06, "loss": 3.6024, "step": 5445 }, { "epoch": 0.055440266927083336, "grad_norm": 13.596270561218262, "learning_rate": 4.963474295974095e-06, "loss": 3.5165, "step": 5450 }, { "epoch": 0.055491129557291664, "grad_norm": 10.22996711730957, "learning_rate": 4.96340615885089e-06, "loss": 3.6675, "step": 5455 }, { "epoch": 0.0555419921875, "grad_norm": 14.241738319396973, "learning_rate": 4.963337958702022e-06, "loss": 3.3507, "step": 5460 }, { "epoch": 0.055592854817708336, "grad_norm": 9.234009742736816, "learning_rate": 4.963269695529236e-06, "loss": 3.4202, "step": 5465 }, { "epoch": 0.055643717447916664, "grad_norm": 13.222739219665527, "learning_rate": 4.963201369334279e-06, "loss": 3.7927, "step": 5470 }, { "epoch": 0.055694580078125, "grad_norm": 11.704004287719727, "learning_rate": 4.963132980118899e-06, "loss": 3.3365, "step": 5475 }, { "epoch": 0.055745442708333336, "grad_norm": 11.018651008605957, "learning_rate": 4.963064527884845e-06, "loss": 3.6051, "step": 5480 }, { "epoch": 0.055796305338541664, "grad_norm": 14.942146301269531, "learning_rate": 4.96299601263387e-06, "loss": 3.2443, "step": 5485 }, { "epoch": 0.05584716796875, "grad_norm": 9.587372779846191, "learning_rate": 4.962927434367724e-06, "loss": 3.637, "step": 5490 }, { "epoch": 0.055898030598958336, "grad_norm": 9.663505554199219, "learning_rate": 4.9628587930881646e-06, "loss": 3.5114, "step": 5495 }, { "epoch": 0.055948893229166664, "grad_norm": 15.723515510559082, "learning_rate": 4.962790088796946e-06, "loss": 3.5283, "step": 5500 }, { "epoch": 0.055999755859375, "grad_norm": 11.564032554626465, "learning_rate": 4.962721321495827e-06, "loss": 3.523, "step": 5505 }, { "epoch": 0.056050618489583336, "grad_norm": 15.196375846862793, "learning_rate": 4.962652491186567e-06, "loss": 3.4245, "step": 5510 }, { "epoch": 0.056101481119791664, "grad_norm": 11.132039070129395, "learning_rate": 4.962583597870927e-06, "loss": 3.6306, "step": 5515 }, { "epoch": 0.05615234375, "grad_norm": 13.807767868041992, "learning_rate": 4.962514641550668e-06, "loss": 3.284, "step": 5520 }, { "epoch": 0.056203206380208336, "grad_norm": 8.87157154083252, "learning_rate": 4.962445622227558e-06, "loss": 3.5285, "step": 5525 }, { "epoch": 0.056254069010416664, "grad_norm": 16.512109756469727, "learning_rate": 4.962376539903359e-06, "loss": 3.5056, "step": 5530 }, { "epoch": 0.056304931640625, "grad_norm": 10.745555877685547, "learning_rate": 4.962307394579839e-06, "loss": 3.2763, "step": 5535 }, { "epoch": 0.056355794270833336, "grad_norm": 10.458619117736816, "learning_rate": 4.9622381862587685e-06, "loss": 3.3848, "step": 5540 }, { "epoch": 0.056406656901041664, "grad_norm": 10.014710426330566, "learning_rate": 4.962168914941919e-06, "loss": 3.4247, "step": 5545 }, { "epoch": 0.05645751953125, "grad_norm": 15.231337547302246, "learning_rate": 4.96209958063106e-06, "loss": 3.368, "step": 5550 }, { "epoch": 0.056508382161458336, "grad_norm": 14.728731155395508, "learning_rate": 4.962030183327967e-06, "loss": 3.4259, "step": 5555 }, { "epoch": 0.056559244791666664, "grad_norm": 14.27651596069336, "learning_rate": 4.961960723034415e-06, "loss": 3.3908, "step": 5560 }, { "epoch": 0.056610107421875, "grad_norm": 15.396354675292969, "learning_rate": 4.961891199752182e-06, "loss": 3.6155, "step": 5565 }, { "epoch": 0.056660970052083336, "grad_norm": 13.059240341186523, "learning_rate": 4.961821613483047e-06, "loss": 3.737, "step": 5570 }, { "epoch": 0.056711832682291664, "grad_norm": 21.01386833190918, "learning_rate": 4.961751964228788e-06, "loss": 3.4121, "step": 5575 }, { "epoch": 0.0567626953125, "grad_norm": 12.324559211730957, "learning_rate": 4.961682251991189e-06, "loss": 3.1513, "step": 5580 }, { "epoch": 0.056813557942708336, "grad_norm": 11.327527046203613, "learning_rate": 4.961612476772033e-06, "loss": 3.4368, "step": 5585 }, { "epoch": 0.056864420572916664, "grad_norm": 9.986822128295898, "learning_rate": 4.961542638573106e-06, "loss": 3.5226, "step": 5590 }, { "epoch": 0.056915283203125, "grad_norm": 16.936189651489258, "learning_rate": 4.961472737396193e-06, "loss": 3.698, "step": 5595 }, { "epoch": 0.056966145833333336, "grad_norm": 15.606807708740234, "learning_rate": 4.9614027732430835e-06, "loss": 3.4487, "step": 5600 }, { "epoch": 0.057017008463541664, "grad_norm": 11.489161491394043, "learning_rate": 4.961332746115568e-06, "loss": 3.5364, "step": 5605 }, { "epoch": 0.05706787109375, "grad_norm": 13.7288818359375, "learning_rate": 4.9612626560154375e-06, "loss": 3.6168, "step": 5610 }, { "epoch": 0.057118733723958336, "grad_norm": 8.421143531799316, "learning_rate": 4.961192502944485e-06, "loss": 3.4713, "step": 5615 }, { "epoch": 0.057169596354166664, "grad_norm": 17.100997924804688, "learning_rate": 4.961122286904506e-06, "loss": 3.392, "step": 5620 }, { "epoch": 0.057220458984375, "grad_norm": 17.024621963500977, "learning_rate": 4.961052007897297e-06, "loss": 3.6999, "step": 5625 }, { "epoch": 0.057271321614583336, "grad_norm": 8.612135887145996, "learning_rate": 4.960981665924655e-06, "loss": 3.2406, "step": 5630 }, { "epoch": 0.057322184244791664, "grad_norm": 11.135616302490234, "learning_rate": 4.9609112609883816e-06, "loss": 3.9177, "step": 5635 }, { "epoch": 0.057373046875, "grad_norm": 13.029520988464355, "learning_rate": 4.960840793090276e-06, "loss": 3.4563, "step": 5640 }, { "epoch": 0.057423909505208336, "grad_norm": 12.987259864807129, "learning_rate": 4.960770262232141e-06, "loss": 3.4741, "step": 5645 }, { "epoch": 0.057474772135416664, "grad_norm": 8.016860008239746, "learning_rate": 4.960699668415784e-06, "loss": 3.423, "step": 5650 }, { "epoch": 0.057525634765625, "grad_norm": 11.892767906188965, "learning_rate": 4.960629011643008e-06, "loss": 3.0988, "step": 5655 }, { "epoch": 0.057576497395833336, "grad_norm": 7.135873794555664, "learning_rate": 4.960558291915622e-06, "loss": 3.2616, "step": 5660 }, { "epoch": 0.057627360026041664, "grad_norm": 10.335238456726074, "learning_rate": 4.960487509235435e-06, "loss": 3.5319, "step": 5665 }, { "epoch": 0.05767822265625, "grad_norm": 10.540165901184082, "learning_rate": 4.96041666360426e-06, "loss": 3.3135, "step": 5670 }, { "epoch": 0.057729085286458336, "grad_norm": 17.09685516357422, "learning_rate": 4.9603457550239065e-06, "loss": 3.5976, "step": 5675 }, { "epoch": 0.057779947916666664, "grad_norm": 13.739051818847656, "learning_rate": 4.96027478349619e-06, "loss": 3.5974, "step": 5680 }, { "epoch": 0.057830810546875, "grad_norm": 13.99421501159668, "learning_rate": 4.960203749022927e-06, "loss": 3.2972, "step": 5685 }, { "epoch": 0.057881673177083336, "grad_norm": 12.529129028320312, "learning_rate": 4.960132651605934e-06, "loss": 3.4988, "step": 5690 }, { "epoch": 0.057932535807291664, "grad_norm": 13.570207595825195, "learning_rate": 4.96006149124703e-06, "loss": 3.2342, "step": 5695 }, { "epoch": 0.0579833984375, "grad_norm": 8.2381591796875, "learning_rate": 4.959990267948035e-06, "loss": 3.6295, "step": 5700 }, { "epoch": 0.058034261067708336, "grad_norm": 10.554726600646973, "learning_rate": 4.959918981710773e-06, "loss": 3.6647, "step": 5705 }, { "epoch": 0.058085123697916664, "grad_norm": 10.161996841430664, "learning_rate": 4.959847632537067e-06, "loss": 3.3671, "step": 5710 }, { "epoch": 0.058135986328125, "grad_norm": 10.479255676269531, "learning_rate": 4.959776220428743e-06, "loss": 3.6848, "step": 5715 }, { "epoch": 0.058186848958333336, "grad_norm": 13.689743995666504, "learning_rate": 4.959704745387626e-06, "loss": 3.6923, "step": 5720 }, { "epoch": 0.058237711588541664, "grad_norm": 15.075189590454102, "learning_rate": 4.9596332074155465e-06, "loss": 3.1929, "step": 5725 }, { "epoch": 0.05828857421875, "grad_norm": 10.657966613769531, "learning_rate": 4.959561606514335e-06, "loss": 3.3856, "step": 5730 }, { "epoch": 0.058339436848958336, "grad_norm": 17.425458908081055, "learning_rate": 4.959489942685822e-06, "loss": 3.5663, "step": 5735 }, { "epoch": 0.058390299479166664, "grad_norm": 10.390463829040527, "learning_rate": 4.959418215931843e-06, "loss": 4.1035, "step": 5740 }, { "epoch": 0.058441162109375, "grad_norm": 9.901952743530273, "learning_rate": 4.959346426254231e-06, "loss": 3.8645, "step": 5745 }, { "epoch": 0.058492024739583336, "grad_norm": 12.191425323486328, "learning_rate": 4.9592745736548235e-06, "loss": 3.5104, "step": 5750 }, { "epoch": 0.058542887369791664, "grad_norm": 8.743103981018066, "learning_rate": 4.959202658135459e-06, "loss": 3.4598, "step": 5755 }, { "epoch": 0.05859375, "grad_norm": 12.222476959228516, "learning_rate": 4.959130679697978e-06, "loss": 3.4976, "step": 5760 }, { "epoch": 0.058644612630208336, "grad_norm": 12.773666381835938, "learning_rate": 4.95905863834422e-06, "loss": 3.3259, "step": 5765 }, { "epoch": 0.058695475260416664, "grad_norm": 12.655311584472656, "learning_rate": 4.958986534076031e-06, "loss": 3.6635, "step": 5770 }, { "epoch": 0.058746337890625, "grad_norm": 10.425118446350098, "learning_rate": 4.9589143668952536e-06, "loss": 3.3051, "step": 5775 }, { "epoch": 0.058797200520833336, "grad_norm": 12.868205070495605, "learning_rate": 4.958842136803735e-06, "loss": 3.6249, "step": 5780 }, { "epoch": 0.058848063151041664, "grad_norm": 15.800631523132324, "learning_rate": 4.958769843803324e-06, "loss": 3.5991, "step": 5785 }, { "epoch": 0.05889892578125, "grad_norm": 10.733988761901855, "learning_rate": 4.958697487895869e-06, "loss": 3.6806, "step": 5790 }, { "epoch": 0.058949788411458336, "grad_norm": 14.783018112182617, "learning_rate": 4.9586250690832214e-06, "loss": 3.6191, "step": 5795 }, { "epoch": 0.059000651041666664, "grad_norm": 9.55103874206543, "learning_rate": 4.958552587367233e-06, "loss": 3.2202, "step": 5800 }, { "epoch": 0.059051513671875, "grad_norm": 12.465963363647461, "learning_rate": 4.958480042749762e-06, "loss": 3.0866, "step": 5805 }, { "epoch": 0.059102376302083336, "grad_norm": 10.959587097167969, "learning_rate": 4.958407435232659e-06, "loss": 3.6214, "step": 5810 }, { "epoch": 0.059153238932291664, "grad_norm": 20.63323974609375, "learning_rate": 4.958334764817786e-06, "loss": 4.0361, "step": 5815 }, { "epoch": 0.0592041015625, "grad_norm": 7.549029350280762, "learning_rate": 4.9582620315070005e-06, "loss": 3.3141, "step": 5820 }, { "epoch": 0.059254964192708336, "grad_norm": 12.9930419921875, "learning_rate": 4.958189235302164e-06, "loss": 3.397, "step": 5825 }, { "epoch": 0.059305826822916664, "grad_norm": 15.772438049316406, "learning_rate": 4.958116376205138e-06, "loss": 3.375, "step": 5830 }, { "epoch": 0.059356689453125, "grad_norm": 14.877005577087402, "learning_rate": 4.9580434542177875e-06, "loss": 3.2812, "step": 5835 }, { "epoch": 0.059407552083333336, "grad_norm": 14.17616081237793, "learning_rate": 4.957970469341977e-06, "loss": 3.4907, "step": 5840 }, { "epoch": 0.059458414713541664, "grad_norm": 16.345909118652344, "learning_rate": 4.957897421579576e-06, "loss": 3.3758, "step": 5845 }, { "epoch": 0.05950927734375, "grad_norm": 15.003165245056152, "learning_rate": 4.957824310932451e-06, "loss": 3.0737, "step": 5850 }, { "epoch": 0.059560139973958336, "grad_norm": 14.897906303405762, "learning_rate": 4.957751137402475e-06, "loss": 3.2275, "step": 5855 }, { "epoch": 0.059611002604166664, "grad_norm": 13.421442031860352, "learning_rate": 4.957677900991516e-06, "loss": 3.2209, "step": 5860 }, { "epoch": 0.059661865234375, "grad_norm": 9.91036319732666, "learning_rate": 4.957604601701453e-06, "loss": 3.9609, "step": 5865 }, { "epoch": 0.059712727864583336, "grad_norm": 18.146198272705078, "learning_rate": 4.957531239534158e-06, "loss": 3.4241, "step": 5870 }, { "epoch": 0.059763590494791664, "grad_norm": 13.376401901245117, "learning_rate": 4.957457814491509e-06, "loss": 3.4611, "step": 5875 }, { "epoch": 0.059814453125, "grad_norm": 9.991517066955566, "learning_rate": 4.957384326575383e-06, "loss": 3.1355, "step": 5880 }, { "epoch": 0.059865315755208336, "grad_norm": 8.1071138381958, "learning_rate": 4.9573107757876625e-06, "loss": 3.4358, "step": 5885 }, { "epoch": 0.059916178385416664, "grad_norm": 16.746749877929688, "learning_rate": 4.9572371621302284e-06, "loss": 3.6471, "step": 5890 }, { "epoch": 0.059967041015625, "grad_norm": 16.02521514892578, "learning_rate": 4.957163485604963e-06, "loss": 3.5407, "step": 5895 }, { "epoch": 0.060017903645833336, "grad_norm": 8.261488914489746, "learning_rate": 4.957089746213753e-06, "loss": 3.2416, "step": 5900 }, { "epoch": 0.060068766276041664, "grad_norm": 12.552051544189453, "learning_rate": 4.957015943958484e-06, "loss": 3.3061, "step": 5905 }, { "epoch": 0.06011962890625, "grad_norm": 15.044920921325684, "learning_rate": 4.956942078841045e-06, "loss": 3.8459, "step": 5910 }, { "epoch": 0.060170491536458336, "grad_norm": 13.593306541442871, "learning_rate": 4.9568681508633246e-06, "loss": 3.2017, "step": 5915 }, { "epoch": 0.060221354166666664, "grad_norm": 12.529535293579102, "learning_rate": 4.956794160027215e-06, "loss": 3.7657, "step": 5920 }, { "epoch": 0.060272216796875, "grad_norm": 13.670187950134277, "learning_rate": 4.9567201063346096e-06, "loss": 3.5618, "step": 5925 }, { "epoch": 0.060323079427083336, "grad_norm": 14.781356811523438, "learning_rate": 4.956645989787402e-06, "loss": 3.3705, "step": 5930 }, { "epoch": 0.060373942057291664, "grad_norm": 12.808194160461426, "learning_rate": 4.95657181038749e-06, "loss": 3.5778, "step": 5935 }, { "epoch": 0.0604248046875, "grad_norm": 9.7361421585083, "learning_rate": 4.956497568136769e-06, "loss": 3.9065, "step": 5940 }, { "epoch": 0.060475667317708336, "grad_norm": 8.544392585754395, "learning_rate": 4.9564232630371414e-06, "loss": 3.0755, "step": 5945 }, { "epoch": 0.060526529947916664, "grad_norm": 16.483707427978516, "learning_rate": 4.956348895090506e-06, "loss": 3.2558, "step": 5950 }, { "epoch": 0.060577392578125, "grad_norm": 17.052963256835938, "learning_rate": 4.956274464298766e-06, "loss": 3.6055, "step": 5955 }, { "epoch": 0.060628255208333336, "grad_norm": 13.169681549072266, "learning_rate": 4.956199970663827e-06, "loss": 3.6049, "step": 5960 }, { "epoch": 0.060679117838541664, "grad_norm": 11.642385482788086, "learning_rate": 4.956125414187594e-06, "loss": 3.3229, "step": 5965 }, { "epoch": 0.06072998046875, "grad_norm": 9.508110046386719, "learning_rate": 4.956050794871974e-06, "loss": 3.2438, "step": 5970 }, { "epoch": 0.060780843098958336, "grad_norm": 9.767243385314941, "learning_rate": 4.955976112718876e-06, "loss": 3.623, "step": 5975 }, { "epoch": 0.060831705729166664, "grad_norm": 13.47176456451416, "learning_rate": 4.955901367730212e-06, "loss": 3.5425, "step": 5980 }, { "epoch": 0.060882568359375, "grad_norm": 13.689672470092773, "learning_rate": 4.9558265599078935e-06, "loss": 3.4059, "step": 5985 }, { "epoch": 0.060933430989583336, "grad_norm": 14.70511531829834, "learning_rate": 4.955751689253834e-06, "loss": 3.2993, "step": 5990 }, { "epoch": 0.060984293619791664, "grad_norm": 10.678279876708984, "learning_rate": 4.955676755769951e-06, "loss": 3.8429, "step": 5995 }, { "epoch": 0.06103515625, "grad_norm": 9.907837867736816, "learning_rate": 4.955601759458158e-06, "loss": 3.6905, "step": 6000 }, { "epoch": 0.061086018880208336, "grad_norm": 14.26388931274414, "learning_rate": 4.955526700320378e-06, "loss": 3.8103, "step": 6005 }, { "epoch": 0.061136881510416664, "grad_norm": 13.094128608703613, "learning_rate": 4.955451578358529e-06, "loss": 3.5114, "step": 6010 }, { "epoch": 0.061187744140625, "grad_norm": 9.898237228393555, "learning_rate": 4.955376393574533e-06, "loss": 3.0525, "step": 6015 }, { "epoch": 0.061238606770833336, "grad_norm": 20.232635498046875, "learning_rate": 4.955301145970314e-06, "loss": 3.3842, "step": 6020 }, { "epoch": 0.061289469401041664, "grad_norm": 10.714902877807617, "learning_rate": 4.955225835547798e-06, "loss": 3.2338, "step": 6025 }, { "epoch": 0.06134033203125, "grad_norm": 16.868967056274414, "learning_rate": 4.95515046230891e-06, "loss": 3.3336, "step": 6030 }, { "epoch": 0.061391194661458336, "grad_norm": 10.545510292053223, "learning_rate": 4.9550750262555795e-06, "loss": 3.4626, "step": 6035 }, { "epoch": 0.061442057291666664, "grad_norm": 16.030223846435547, "learning_rate": 4.9549995273897365e-06, "loss": 3.746, "step": 6040 }, { "epoch": 0.061492919921875, "grad_norm": 16.585622787475586, "learning_rate": 4.954923965713312e-06, "loss": 3.5239, "step": 6045 }, { "epoch": 0.061543782552083336, "grad_norm": 10.373279571533203, "learning_rate": 4.95484834122824e-06, "loss": 3.4698, "step": 6050 }, { "epoch": 0.061594645182291664, "grad_norm": 11.033610343933105, "learning_rate": 4.954772653936455e-06, "loss": 3.521, "step": 6055 }, { "epoch": 0.0616455078125, "grad_norm": 12.441916465759277, "learning_rate": 4.954696903839894e-06, "loss": 3.5568, "step": 6060 }, { "epoch": 0.061696370442708336, "grad_norm": 12.675573348999023, "learning_rate": 4.954621090940495e-06, "loss": 3.199, "step": 6065 }, { "epoch": 0.061747233072916664, "grad_norm": 11.934941291809082, "learning_rate": 4.9545452152401965e-06, "loss": 3.5181, "step": 6070 }, { "epoch": 0.061798095703125, "grad_norm": 12.990299224853516, "learning_rate": 4.95446927674094e-06, "loss": 3.1343, "step": 6075 }, { "epoch": 0.061848958333333336, "grad_norm": 15.312989234924316, "learning_rate": 4.954393275444669e-06, "loss": 2.912, "step": 6080 }, { "epoch": 0.061899820963541664, "grad_norm": 16.04402732849121, "learning_rate": 4.954317211353328e-06, "loss": 3.5654, "step": 6085 }, { "epoch": 0.06195068359375, "grad_norm": 16.57796287536621, "learning_rate": 4.954241084468863e-06, "loss": 3.4681, "step": 6090 }, { "epoch": 0.062001546223958336, "grad_norm": 12.278860092163086, "learning_rate": 4.954164894793222e-06, "loss": 3.4626, "step": 6095 }, { "epoch": 0.062052408854166664, "grad_norm": 85.06364440917969, "learning_rate": 4.954088642328353e-06, "loss": 3.5676, "step": 6100 }, { "epoch": 0.062103271484375, "grad_norm": 9.45626163482666, "learning_rate": 4.954012327076207e-06, "loss": 3.7756, "step": 6105 }, { "epoch": 0.062154134114583336, "grad_norm": 13.716742515563965, "learning_rate": 4.95393594903874e-06, "loss": 3.5243, "step": 6110 }, { "epoch": 0.062204996744791664, "grad_norm": 19.727203369140625, "learning_rate": 4.953859508217901e-06, "loss": 3.479, "step": 6115 }, { "epoch": 0.062255859375, "grad_norm": 14.221769332885742, "learning_rate": 4.953783004615649e-06, "loss": 3.3731, "step": 6120 }, { "epoch": 0.062306722005208336, "grad_norm": 8.173396110534668, "learning_rate": 4.953706438233941e-06, "loss": 3.2895, "step": 6125 }, { "epoch": 0.062357584635416664, "grad_norm": 11.394742965698242, "learning_rate": 4.953629809074734e-06, "loss": 3.3431, "step": 6130 }, { "epoch": 0.062408447265625, "grad_norm": 7.780871391296387, "learning_rate": 4.953553117139991e-06, "loss": 3.6523, "step": 6135 }, { "epoch": 0.062459309895833336, "grad_norm": 12.438297271728516, "learning_rate": 4.953476362431672e-06, "loss": 3.4039, "step": 6140 }, { "epoch": 0.06251017252604167, "grad_norm": 12.829780578613281, "learning_rate": 4.953399544951742e-06, "loss": 3.5707, "step": 6145 }, { "epoch": 0.06256103515625, "grad_norm": 11.045709609985352, "learning_rate": 4.953322664702167e-06, "loss": 3.1758, "step": 6150 }, { "epoch": 0.06261189778645833, "grad_norm": 13.500697135925293, "learning_rate": 4.953245721684913e-06, "loss": 3.5634, "step": 6155 }, { "epoch": 0.06266276041666667, "grad_norm": 13.87531852722168, "learning_rate": 4.953168715901949e-06, "loss": 3.0295, "step": 6160 }, { "epoch": 0.062713623046875, "grad_norm": 10.557450294494629, "learning_rate": 4.953091647355244e-06, "loss": 3.6556, "step": 6165 }, { "epoch": 0.06276448567708333, "grad_norm": 14.243014335632324, "learning_rate": 4.953014516046771e-06, "loss": 3.493, "step": 6170 }, { "epoch": 0.06281534830729167, "grad_norm": 14.641172409057617, "learning_rate": 4.952937321978502e-06, "loss": 3.5743, "step": 6175 }, { "epoch": 0.0628662109375, "grad_norm": 14.950688362121582, "learning_rate": 4.952860065152415e-06, "loss": 3.3458, "step": 6180 }, { "epoch": 0.06291707356770833, "grad_norm": 10.996521949768066, "learning_rate": 4.952782745570483e-06, "loss": 3.3334, "step": 6185 }, { "epoch": 0.06296793619791667, "grad_norm": 10.796847343444824, "learning_rate": 4.952705363234687e-06, "loss": 3.6746, "step": 6190 }, { "epoch": 0.063018798828125, "grad_norm": 16.92947006225586, "learning_rate": 4.952627918147005e-06, "loss": 3.3581, "step": 6195 }, { "epoch": 0.06306966145833333, "grad_norm": 9.009227752685547, "learning_rate": 4.952550410309419e-06, "loss": 3.3688, "step": 6200 }, { "epoch": 0.06312052408854167, "grad_norm": 9.859138488769531, "learning_rate": 4.952472839723912e-06, "loss": 3.2627, "step": 6205 }, { "epoch": 0.06317138671875, "grad_norm": 16.701183319091797, "learning_rate": 4.952395206392469e-06, "loss": 3.6775, "step": 6210 }, { "epoch": 0.06322224934895833, "grad_norm": 17.42451286315918, "learning_rate": 4.952317510317076e-06, "loss": 3.5911, "step": 6215 }, { "epoch": 0.06327311197916667, "grad_norm": 12.738759994506836, "learning_rate": 4.952239751499721e-06, "loss": 3.2518, "step": 6220 }, { "epoch": 0.063323974609375, "grad_norm": 12.116572380065918, "learning_rate": 4.952161929942393e-06, "loss": 3.6424, "step": 6225 }, { "epoch": 0.06337483723958333, "grad_norm": 15.029314041137695, "learning_rate": 4.952084045647083e-06, "loss": 3.4677, "step": 6230 }, { "epoch": 0.06342569986979167, "grad_norm": 16.844175338745117, "learning_rate": 4.952006098615784e-06, "loss": 3.0719, "step": 6235 }, { "epoch": 0.0634765625, "grad_norm": 16.886533737182617, "learning_rate": 4.95192808885049e-06, "loss": 3.4529, "step": 6240 }, { "epoch": 0.06352742513020833, "grad_norm": 15.081376075744629, "learning_rate": 4.9518500163531966e-06, "loss": 3.1723, "step": 6245 }, { "epoch": 0.06357828776041667, "grad_norm": 9.575623512268066, "learning_rate": 4.951771881125903e-06, "loss": 3.5933, "step": 6250 }, { "epoch": 0.063629150390625, "grad_norm": 9.829601287841797, "learning_rate": 4.951693683170606e-06, "loss": 3.7078, "step": 6255 }, { "epoch": 0.06368001302083333, "grad_norm": 10.027034759521484, "learning_rate": 4.951615422489308e-06, "loss": 3.3214, "step": 6260 }, { "epoch": 0.06373087565104167, "grad_norm": 14.713983535766602, "learning_rate": 4.9515370990840095e-06, "loss": 3.9297, "step": 6265 }, { "epoch": 0.06378173828125, "grad_norm": 10.196232795715332, "learning_rate": 4.951458712956716e-06, "loss": 3.3136, "step": 6270 }, { "epoch": 0.06383260091145833, "grad_norm": 8.880327224731445, "learning_rate": 4.9513802641094325e-06, "loss": 3.4526, "step": 6275 }, { "epoch": 0.06388346354166667, "grad_norm": 10.190267562866211, "learning_rate": 4.951301752544165e-06, "loss": 3.8351, "step": 6280 }, { "epoch": 0.063934326171875, "grad_norm": 13.929616928100586, "learning_rate": 4.951223178262924e-06, "loss": 3.3708, "step": 6285 }, { "epoch": 0.06398518880208333, "grad_norm": 12.750432014465332, "learning_rate": 4.951144541267719e-06, "loss": 3.617, "step": 6290 }, { "epoch": 0.06403605143229167, "grad_norm": 17.89909553527832, "learning_rate": 4.951065841560561e-06, "loss": 3.6072, "step": 6295 }, { "epoch": 0.0640869140625, "grad_norm": 14.314258575439453, "learning_rate": 4.950987079143465e-06, "loss": 3.4483, "step": 6300 }, { "epoch": 0.06413777669270833, "grad_norm": 12.541434288024902, "learning_rate": 4.950908254018446e-06, "loss": 3.9403, "step": 6305 }, { "epoch": 0.06418863932291667, "grad_norm": 10.588662147521973, "learning_rate": 4.9508293661875205e-06, "loss": 3.3919, "step": 6310 }, { "epoch": 0.064239501953125, "grad_norm": 14.57321834564209, "learning_rate": 4.950750415652706e-06, "loss": 3.6334, "step": 6315 }, { "epoch": 0.06429036458333333, "grad_norm": 12.940896034240723, "learning_rate": 4.950671402416023e-06, "loss": 3.5378, "step": 6320 }, { "epoch": 0.06434122721354167, "grad_norm": 14.000555992126465, "learning_rate": 4.9505923264794935e-06, "loss": 3.7651, "step": 6325 }, { "epoch": 0.06439208984375, "grad_norm": 12.165482521057129, "learning_rate": 4.95051318784514e-06, "loss": 3.3013, "step": 6330 }, { "epoch": 0.06444295247395833, "grad_norm": 10.64943790435791, "learning_rate": 4.950433986514986e-06, "loss": 3.6277, "step": 6335 }, { "epoch": 0.06449381510416667, "grad_norm": 9.537559509277344, "learning_rate": 4.9503547224910605e-06, "loss": 3.3196, "step": 6340 }, { "epoch": 0.064544677734375, "grad_norm": 12.809176445007324, "learning_rate": 4.9502753957753905e-06, "loss": 3.5917, "step": 6345 }, { "epoch": 0.06459554036458333, "grad_norm": 9.395776748657227, "learning_rate": 4.950196006370005e-06, "loss": 3.4648, "step": 6350 }, { "epoch": 0.06464640299479167, "grad_norm": 14.865406036376953, "learning_rate": 4.950116554276936e-06, "loss": 3.5405, "step": 6355 }, { "epoch": 0.064697265625, "grad_norm": 13.822293281555176, "learning_rate": 4.950037039498215e-06, "loss": 3.2149, "step": 6360 }, { "epoch": 0.06474812825520833, "grad_norm": 10.368663787841797, "learning_rate": 4.949957462035877e-06, "loss": 3.2916, "step": 6365 }, { "epoch": 0.06479899088541667, "grad_norm": 15.785028457641602, "learning_rate": 4.949877821891958e-06, "loss": 3.2872, "step": 6370 }, { "epoch": 0.064849853515625, "grad_norm": 9.993640899658203, "learning_rate": 4.949798119068495e-06, "loss": 3.3167, "step": 6375 }, { "epoch": 0.06490071614583333, "grad_norm": 14.402359962463379, "learning_rate": 4.949718353567529e-06, "loss": 3.7203, "step": 6380 }, { "epoch": 0.06495157877604167, "grad_norm": 13.888138771057129, "learning_rate": 4.9496385253910996e-06, "loss": 3.4504, "step": 6385 }, { "epoch": 0.06500244140625, "grad_norm": 19.04587745666504, "learning_rate": 4.949558634541249e-06, "loss": 3.4553, "step": 6390 }, { "epoch": 0.06505330403645833, "grad_norm": 17.151155471801758, "learning_rate": 4.94947868102002e-06, "loss": 3.7058, "step": 6395 }, { "epoch": 0.06510416666666667, "grad_norm": 13.129286766052246, "learning_rate": 4.949398664829461e-06, "loss": 3.5657, "step": 6400 }, { "epoch": 0.065155029296875, "grad_norm": 12.586851119995117, "learning_rate": 4.949318585971617e-06, "loss": 3.2961, "step": 6405 }, { "epoch": 0.06520589192708333, "grad_norm": 15.264113426208496, "learning_rate": 4.949238444448539e-06, "loss": 3.7745, "step": 6410 }, { "epoch": 0.06525675455729167, "grad_norm": 12.554951667785645, "learning_rate": 4.949158240262274e-06, "loss": 3.455, "step": 6415 }, { "epoch": 0.0653076171875, "grad_norm": 11.962057113647461, "learning_rate": 4.949077973414877e-06, "loss": 3.2963, "step": 6420 }, { "epoch": 0.06535847981770833, "grad_norm": 12.596719741821289, "learning_rate": 4.9489976439084e-06, "loss": 3.7278, "step": 6425 }, { "epoch": 0.06540934244791667, "grad_norm": 10.47852897644043, "learning_rate": 4.948917251744899e-06, "loss": 3.6283, "step": 6430 }, { "epoch": 0.065460205078125, "grad_norm": 9.309669494628906, "learning_rate": 4.9488367969264304e-06, "loss": 3.3136, "step": 6435 }, { "epoch": 0.06551106770833333, "grad_norm": 13.344404220581055, "learning_rate": 4.9487562794550535e-06, "loss": 3.3655, "step": 6440 }, { "epoch": 0.06556193033854167, "grad_norm": 13.794371604919434, "learning_rate": 4.948675699332827e-06, "loss": 3.2034, "step": 6445 }, { "epoch": 0.06561279296875, "grad_norm": 14.009343147277832, "learning_rate": 4.9485950565618134e-06, "loss": 3.4136, "step": 6450 }, { "epoch": 0.06566365559895833, "grad_norm": 10.496328353881836, "learning_rate": 4.9485143511440754e-06, "loss": 3.5469, "step": 6455 }, { "epoch": 0.06571451822916667, "grad_norm": 18.860055923461914, "learning_rate": 4.948433583081678e-06, "loss": 4.3312, "step": 6460 }, { "epoch": 0.065765380859375, "grad_norm": 12.226606369018555, "learning_rate": 4.948352752376689e-06, "loss": 3.187, "step": 6465 }, { "epoch": 0.06581624348958333, "grad_norm": 8.17588996887207, "learning_rate": 4.948271859031173e-06, "loss": 3.4183, "step": 6470 }, { "epoch": 0.06586710611979167, "grad_norm": 18.25055694580078, "learning_rate": 4.948190903047203e-06, "loss": 3.9294, "step": 6475 }, { "epoch": 0.06591796875, "grad_norm": 15.113214492797852, "learning_rate": 4.948109884426849e-06, "loss": 3.2971, "step": 6480 }, { "epoch": 0.06596883138020833, "grad_norm": 10.636944770812988, "learning_rate": 4.9480288031721835e-06, "loss": 3.1685, "step": 6485 }, { "epoch": 0.06601969401041667, "grad_norm": 12.23661994934082, "learning_rate": 4.947947659285281e-06, "loss": 3.2994, "step": 6490 }, { "epoch": 0.066070556640625, "grad_norm": 14.394771575927734, "learning_rate": 4.9478664527682194e-06, "loss": 3.5549, "step": 6495 }, { "epoch": 0.06612141927083333, "grad_norm": 12.141425132751465, "learning_rate": 4.947785183623074e-06, "loss": 3.5396, "step": 6500 }, { "epoch": 0.06617228190104167, "grad_norm": 11.212850570678711, "learning_rate": 4.9477038518519235e-06, "loss": 3.3503, "step": 6505 }, { "epoch": 0.06622314453125, "grad_norm": 13.835770606994629, "learning_rate": 4.947622457456852e-06, "loss": 3.1723, "step": 6510 }, { "epoch": 0.06627400716145833, "grad_norm": 13.431824684143066, "learning_rate": 4.94754100043994e-06, "loss": 3.339, "step": 6515 }, { "epoch": 0.06632486979166667, "grad_norm": 12.01131820678711, "learning_rate": 4.94745948080327e-06, "loss": 3.2717, "step": 6520 }, { "epoch": 0.066375732421875, "grad_norm": 14.991222381591797, "learning_rate": 4.947377898548931e-06, "loss": 3.52, "step": 6525 }, { "epoch": 0.06642659505208333, "grad_norm": 14.108927726745605, "learning_rate": 4.947296253679008e-06, "loss": 3.1859, "step": 6530 }, { "epoch": 0.06647745768229167, "grad_norm": 9.38899040222168, "learning_rate": 4.94721454619559e-06, "loss": 3.6626, "step": 6535 }, { "epoch": 0.0665283203125, "grad_norm": 12.356621742248535, "learning_rate": 4.947132776100768e-06, "loss": 3.8692, "step": 6540 }, { "epoch": 0.06657918294270833, "grad_norm": 16.474075317382812, "learning_rate": 4.947050943396634e-06, "loss": 3.0192, "step": 6545 }, { "epoch": 0.06663004557291667, "grad_norm": 12.742081642150879, "learning_rate": 4.9469690480852824e-06, "loss": 3.3703, "step": 6550 }, { "epoch": 0.066680908203125, "grad_norm": 13.639100074768066, "learning_rate": 4.946887090168807e-06, "loss": 3.8247, "step": 6555 }, { "epoch": 0.06673177083333333, "grad_norm": 9.607970237731934, "learning_rate": 4.946805069649305e-06, "loss": 3.4566, "step": 6560 }, { "epoch": 0.06678263346354167, "grad_norm": 13.177118301391602, "learning_rate": 4.946722986528876e-06, "loss": 3.5717, "step": 6565 }, { "epoch": 0.06683349609375, "grad_norm": 12.43066120147705, "learning_rate": 4.946640840809619e-06, "loss": 3.5391, "step": 6570 }, { "epoch": 0.06688435872395833, "grad_norm": 13.892637252807617, "learning_rate": 4.946558632493636e-06, "loss": 3.802, "step": 6575 }, { "epoch": 0.06693522135416667, "grad_norm": 13.632702827453613, "learning_rate": 4.94647636158303e-06, "loss": 3.3285, "step": 6580 }, { "epoch": 0.066986083984375, "grad_norm": 12.024981498718262, "learning_rate": 4.946394028079907e-06, "loss": 3.2208, "step": 6585 }, { "epoch": 0.06703694661458333, "grad_norm": 15.032366752624512, "learning_rate": 4.946311631986372e-06, "loss": 3.374, "step": 6590 }, { "epoch": 0.06708780924479167, "grad_norm": 10.841226577758789, "learning_rate": 4.946229173304535e-06, "loss": 3.2525, "step": 6595 }, { "epoch": 0.067138671875, "grad_norm": 10.338508605957031, "learning_rate": 4.946146652036502e-06, "loss": 3.5958, "step": 6600 }, { "epoch": 0.06718953450520833, "grad_norm": 10.029006004333496, "learning_rate": 4.9460640681843885e-06, "loss": 3.6985, "step": 6605 }, { "epoch": 0.06724039713541667, "grad_norm": 9.308262825012207, "learning_rate": 4.945981421750305e-06, "loss": 3.3633, "step": 6610 }, { "epoch": 0.067291259765625, "grad_norm": 8.373737335205078, "learning_rate": 4.945898712736366e-06, "loss": 3.5412, "step": 6615 }, { "epoch": 0.06734212239583333, "grad_norm": 11.022754669189453, "learning_rate": 4.94581594114469e-06, "loss": 4.3451, "step": 6620 }, { "epoch": 0.06739298502604167, "grad_norm": 7.635653972625732, "learning_rate": 4.945733106977391e-06, "loss": 3.1132, "step": 6625 }, { "epoch": 0.06744384765625, "grad_norm": 11.958892822265625, "learning_rate": 4.945650210236591e-06, "loss": 3.736, "step": 6630 }, { "epoch": 0.06749471028645833, "grad_norm": 17.302410125732422, "learning_rate": 4.94556725092441e-06, "loss": 3.2335, "step": 6635 }, { "epoch": 0.06754557291666667, "grad_norm": 10.765392303466797, "learning_rate": 4.945484229042971e-06, "loss": 3.0814, "step": 6640 }, { "epoch": 0.067596435546875, "grad_norm": 10.385746002197266, "learning_rate": 4.945401144594397e-06, "loss": 3.4643, "step": 6645 }, { "epoch": 0.06764729817708333, "grad_norm": 14.88101577758789, "learning_rate": 4.945317997580814e-06, "loss": 3.7192, "step": 6650 }, { "epoch": 0.06769816080729167, "grad_norm": 10.534723281860352, "learning_rate": 4.9452347880043505e-06, "loss": 3.4338, "step": 6655 }, { "epoch": 0.0677490234375, "grad_norm": 10.487231254577637, "learning_rate": 4.945151515867134e-06, "loss": 3.6227, "step": 6660 }, { "epoch": 0.06779988606770833, "grad_norm": 10.452747344970703, "learning_rate": 4.9450681811712954e-06, "loss": 3.2158, "step": 6665 }, { "epoch": 0.06785074869791667, "grad_norm": 12.921323776245117, "learning_rate": 4.944984783918968e-06, "loss": 3.5547, "step": 6670 }, { "epoch": 0.067901611328125, "grad_norm": 12.540404319763184, "learning_rate": 4.944901324112283e-06, "loss": 3.5022, "step": 6675 }, { "epoch": 0.06795247395833333, "grad_norm": 11.059189796447754, "learning_rate": 4.9448178017533775e-06, "loss": 3.3226, "step": 6680 }, { "epoch": 0.06800333658854167, "grad_norm": 19.822628021240234, "learning_rate": 4.944734216844388e-06, "loss": 3.7347, "step": 6685 }, { "epoch": 0.06805419921875, "grad_norm": 12.81627082824707, "learning_rate": 4.944650569387453e-06, "loss": 3.9766, "step": 6690 }, { "epoch": 0.06810506184895833, "grad_norm": 14.65715217590332, "learning_rate": 4.944566859384714e-06, "loss": 3.2335, "step": 6695 }, { "epoch": 0.06815592447916667, "grad_norm": 9.100480079650879, "learning_rate": 4.94448308683831e-06, "loss": 3.3709, "step": 6700 }, { "epoch": 0.068206787109375, "grad_norm": 12.44413948059082, "learning_rate": 4.944399251750386e-06, "loss": 3.705, "step": 6705 }, { "epoch": 0.06825764973958333, "grad_norm": 612.3552856445312, "learning_rate": 4.944315354123086e-06, "loss": 3.851, "step": 6710 }, { "epoch": 0.06830851236979167, "grad_norm": 11.150618553161621, "learning_rate": 4.944231393958558e-06, "loss": 3.3883, "step": 6715 }, { "epoch": 0.068359375, "grad_norm": 15.825736999511719, "learning_rate": 4.944147371258948e-06, "loss": 3.1653, "step": 6720 }, { "epoch": 0.06841023763020833, "grad_norm": 13.351125717163086, "learning_rate": 4.944063286026408e-06, "loss": 3.6262, "step": 6725 }, { "epoch": 0.06846110026041667, "grad_norm": 10.806169509887695, "learning_rate": 4.9439791382630875e-06, "loss": 3.4346, "step": 6730 }, { "epoch": 0.068511962890625, "grad_norm": 14.80085277557373, "learning_rate": 4.94389492797114e-06, "loss": 4.8293, "step": 6735 }, { "epoch": 0.06856282552083333, "grad_norm": 15.626360893249512, "learning_rate": 4.94381065515272e-06, "loss": 3.5732, "step": 6740 }, { "epoch": 0.06861368815104167, "grad_norm": 12.220220565795898, "learning_rate": 4.943726319809984e-06, "loss": 3.1158, "step": 6745 }, { "epoch": 0.06866455078125, "grad_norm": 11.84122371673584, "learning_rate": 4.943641921945089e-06, "loss": 3.2894, "step": 6750 }, { "epoch": 0.06871541341145833, "grad_norm": 11.213668823242188, "learning_rate": 4.943557461560195e-06, "loss": 3.5569, "step": 6755 }, { "epoch": 0.06876627604166667, "grad_norm": 10.223577499389648, "learning_rate": 4.943472938657462e-06, "loss": 3.4687, "step": 6760 }, { "epoch": 0.068817138671875, "grad_norm": 11.476598739624023, "learning_rate": 4.943388353239053e-06, "loss": 3.3717, "step": 6765 }, { "epoch": 0.06886800130208333, "grad_norm": 12.814501762390137, "learning_rate": 4.943303705307133e-06, "loss": 3.4168, "step": 6770 }, { "epoch": 0.06891886393229167, "grad_norm": 15.519283294677734, "learning_rate": 4.943218994863866e-06, "loss": 3.4865, "step": 6775 }, { "epoch": 0.0689697265625, "grad_norm": 10.789101600646973, "learning_rate": 4.943134221911421e-06, "loss": 3.3488, "step": 6780 }, { "epoch": 0.06902058919270833, "grad_norm": 16.44157600402832, "learning_rate": 4.943049386451964e-06, "loss": 3.7548, "step": 6785 }, { "epoch": 0.06907145182291667, "grad_norm": 12.07598876953125, "learning_rate": 4.942964488487669e-06, "loss": 3.6223, "step": 6790 }, { "epoch": 0.069122314453125, "grad_norm": 13.612014770507812, "learning_rate": 4.942879528020707e-06, "loss": 3.9421, "step": 6795 }, { "epoch": 0.06917317708333333, "grad_norm": 10.37803840637207, "learning_rate": 4.9427945050532515e-06, "loss": 3.6429, "step": 6800 }, { "epoch": 0.06922403971354167, "grad_norm": 10.748969078063965, "learning_rate": 4.942709419587476e-06, "loss": 3.4308, "step": 6805 }, { "epoch": 0.06927490234375, "grad_norm": 8.533020973205566, "learning_rate": 4.9426242716255605e-06, "loss": 3.2213, "step": 6810 }, { "epoch": 0.06932576497395833, "grad_norm": 14.239413261413574, "learning_rate": 4.942539061169681e-06, "loss": 3.5663, "step": 6815 }, { "epoch": 0.06937662760416667, "grad_norm": 12.664621353149414, "learning_rate": 4.942453788222019e-06, "loss": 3.5002, "step": 6820 }, { "epoch": 0.069427490234375, "grad_norm": 12.779163360595703, "learning_rate": 4.942368452784756e-06, "loss": 3.2068, "step": 6825 }, { "epoch": 0.06947835286458333, "grad_norm": 16.653684616088867, "learning_rate": 4.9422830548600745e-06, "loss": 3.3543, "step": 6830 }, { "epoch": 0.06952921549479167, "grad_norm": 16.349224090576172, "learning_rate": 4.94219759445016e-06, "loss": 3.5099, "step": 6835 }, { "epoch": 0.069580078125, "grad_norm": 8.445566177368164, "learning_rate": 4.942112071557199e-06, "loss": 3.793, "step": 6840 }, { "epoch": 0.06963094075520833, "grad_norm": 14.703461647033691, "learning_rate": 4.94202648618338e-06, "loss": 3.5832, "step": 6845 }, { "epoch": 0.06968180338541667, "grad_norm": 14.559622764587402, "learning_rate": 4.941940838330891e-06, "loss": 3.6992, "step": 6850 }, { "epoch": 0.069732666015625, "grad_norm": 11.245277404785156, "learning_rate": 4.941855128001925e-06, "loss": 2.9931, "step": 6855 }, { "epoch": 0.06978352864583333, "grad_norm": 17.25091552734375, "learning_rate": 4.941769355198675e-06, "loss": 3.243, "step": 6860 }, { "epoch": 0.06983439127604167, "grad_norm": 11.740355491638184, "learning_rate": 4.941683519923335e-06, "loss": 3.3695, "step": 6865 }, { "epoch": 0.06988525390625, "grad_norm": 12.884564399719238, "learning_rate": 4.9415976221781e-06, "loss": 3.3783, "step": 6870 }, { "epoch": 0.06993611653645833, "grad_norm": 14.360733032226562, "learning_rate": 4.9415116619651685e-06, "loss": 3.332, "step": 6875 }, { "epoch": 0.06998697916666667, "grad_norm": 15.234169960021973, "learning_rate": 4.94142563928674e-06, "loss": 3.0307, "step": 6880 }, { "epoch": 0.070037841796875, "grad_norm": 12.016648292541504, "learning_rate": 4.941339554145015e-06, "loss": 3.607, "step": 6885 }, { "epoch": 0.07008870442708333, "grad_norm": 10.132488250732422, "learning_rate": 4.941253406542197e-06, "loss": 3.7322, "step": 6890 }, { "epoch": 0.07013956705729167, "grad_norm": 11.677488327026367, "learning_rate": 4.941167196480489e-06, "loss": 3.7035, "step": 6895 }, { "epoch": 0.0701904296875, "grad_norm": 8.434168815612793, "learning_rate": 4.941080923962096e-06, "loss": 3.3046, "step": 6900 }, { "epoch": 0.07024129231770833, "grad_norm": 18.53049659729004, "learning_rate": 4.940994588989227e-06, "loss": 3.9959, "step": 6905 }, { "epoch": 0.07029215494791667, "grad_norm": 14.731730461120605, "learning_rate": 4.94090819156409e-06, "loss": 3.5238, "step": 6910 }, { "epoch": 0.070343017578125, "grad_norm": 9.122929573059082, "learning_rate": 4.940821731688895e-06, "loss": 3.525, "step": 6915 }, { "epoch": 0.07039388020833333, "grad_norm": 11.580704689025879, "learning_rate": 4.940735209365855e-06, "loss": 3.9973, "step": 6920 }, { "epoch": 0.07044474283854167, "grad_norm": 13.921919822692871, "learning_rate": 4.940648624597183e-06, "loss": 3.3295, "step": 6925 }, { "epoch": 0.07049560546875, "grad_norm": 11.508909225463867, "learning_rate": 4.9405619773850944e-06, "loss": 3.3846, "step": 6930 }, { "epoch": 0.07054646809895833, "grad_norm": 15.52819538116455, "learning_rate": 4.940475267731806e-06, "loss": 3.7001, "step": 6935 }, { "epoch": 0.07059733072916667, "grad_norm": 12.931517601013184, "learning_rate": 4.940388495639537e-06, "loss": 3.4707, "step": 6940 }, { "epoch": 0.070648193359375, "grad_norm": 16.401697158813477, "learning_rate": 4.9403016611105055e-06, "loss": 3.5479, "step": 6945 }, { "epoch": 0.07069905598958333, "grad_norm": 14.115461349487305, "learning_rate": 4.940214764146935e-06, "loss": 3.4651, "step": 6950 }, { "epoch": 0.07074991861979167, "grad_norm": 13.501302719116211, "learning_rate": 4.940127804751048e-06, "loss": 3.2613, "step": 6955 }, { "epoch": 0.07080078125, "grad_norm": 9.428580284118652, "learning_rate": 4.94004078292507e-06, "loss": 3.528, "step": 6960 }, { "epoch": 0.07085164388020833, "grad_norm": 15.635141372680664, "learning_rate": 4.939953698671227e-06, "loss": 3.413, "step": 6965 }, { "epoch": 0.07090250651041667, "grad_norm": 18.93190574645996, "learning_rate": 4.939866551991746e-06, "loss": 3.6146, "step": 6970 }, { "epoch": 0.070953369140625, "grad_norm": 13.07038688659668, "learning_rate": 4.939779342888858e-06, "loss": 3.6089, "step": 6975 }, { "epoch": 0.07100423177083333, "grad_norm": 11.98829174041748, "learning_rate": 4.939692071364794e-06, "loss": 3.2366, "step": 6980 }, { "epoch": 0.07105509440104167, "grad_norm": 13.00400161743164, "learning_rate": 4.939604737421787e-06, "loss": 3.6997, "step": 6985 }, { "epoch": 0.07110595703125, "grad_norm": 10.635180473327637, "learning_rate": 4.9395173410620714e-06, "loss": 3.5263, "step": 6990 }, { "epoch": 0.07115681966145833, "grad_norm": 8.874784469604492, "learning_rate": 4.939429882287881e-06, "loss": 3.7435, "step": 6995 }, { "epoch": 0.07120768229166667, "grad_norm": 17.223342895507812, "learning_rate": 4.939342361101457e-06, "loss": 3.248, "step": 7000 }, { "epoch": 0.071258544921875, "grad_norm": 15.709144592285156, "learning_rate": 4.939254777505037e-06, "loss": 3.3038, "step": 7005 }, { "epoch": 0.07130940755208333, "grad_norm": 10.639139175415039, "learning_rate": 4.93916713150086e-06, "loss": 3.433, "step": 7010 }, { "epoch": 0.07136027018229167, "grad_norm": 13.915593147277832, "learning_rate": 4.9390794230911715e-06, "loss": 3.4058, "step": 7015 }, { "epoch": 0.0714111328125, "grad_norm": 8.267925262451172, "learning_rate": 4.938991652278213e-06, "loss": 3.5928, "step": 7020 }, { "epoch": 0.07146199544270833, "grad_norm": 16.01654624938965, "learning_rate": 4.938903819064232e-06, "loss": 3.1889, "step": 7025 }, { "epoch": 0.07151285807291667, "grad_norm": 13.366415977478027, "learning_rate": 4.938815923451476e-06, "loss": 3.3296, "step": 7030 }, { "epoch": 0.071563720703125, "grad_norm": 12.922379493713379, "learning_rate": 4.9387279654421905e-06, "loss": 3.4845, "step": 7035 }, { "epoch": 0.07161458333333333, "grad_norm": 10.66179084777832, "learning_rate": 4.938639945038629e-06, "loss": 3.3688, "step": 7040 }, { "epoch": 0.07166544596354167, "grad_norm": 9.114995002746582, "learning_rate": 4.938551862243042e-06, "loss": 3.2822, "step": 7045 }, { "epoch": 0.07171630859375, "grad_norm": 10.946172714233398, "learning_rate": 4.9384637170576844e-06, "loss": 3.5603, "step": 7050 }, { "epoch": 0.07176717122395833, "grad_norm": 11.345202445983887, "learning_rate": 4.93837550948481e-06, "loss": 3.4276, "step": 7055 }, { "epoch": 0.07181803385416667, "grad_norm": 9.533265113830566, "learning_rate": 4.938287239526676e-06, "loss": 3.1513, "step": 7060 }, { "epoch": 0.071868896484375, "grad_norm": 9.234315872192383, "learning_rate": 4.938198907185542e-06, "loss": 3.3517, "step": 7065 }, { "epoch": 0.07191975911458333, "grad_norm": 10.09774398803711, "learning_rate": 4.938110512463666e-06, "loss": 3.5838, "step": 7070 }, { "epoch": 0.07197062174479167, "grad_norm": 14.220784187316895, "learning_rate": 4.938022055363311e-06, "loss": 3.4819, "step": 7075 }, { "epoch": 0.072021484375, "grad_norm": 10.781584739685059, "learning_rate": 4.9379335358867384e-06, "loss": 3.7672, "step": 7080 }, { "epoch": 0.07207234700520833, "grad_norm": 9.52526569366455, "learning_rate": 4.937844954036215e-06, "loss": 3.3561, "step": 7085 }, { "epoch": 0.07212320963541667, "grad_norm": 12.460453033447266, "learning_rate": 4.9377563098140065e-06, "loss": 3.4052, "step": 7090 }, { "epoch": 0.072174072265625, "grad_norm": 14.343060493469238, "learning_rate": 4.9376676032223805e-06, "loss": 3.214, "step": 7095 }, { "epoch": 0.07222493489583333, "grad_norm": 12.104986190795898, "learning_rate": 4.937578834263607e-06, "loss": 3.472, "step": 7100 }, { "epoch": 0.07227579752604167, "grad_norm": 12.275232315063477, "learning_rate": 4.9374900029399555e-06, "loss": 3.7637, "step": 7105 }, { "epoch": 0.07232666015625, "grad_norm": 12.134095191955566, "learning_rate": 4.937401109253701e-06, "loss": 3.6845, "step": 7110 }, { "epoch": 0.07237752278645833, "grad_norm": 15.203348159790039, "learning_rate": 4.937312153207117e-06, "loss": 3.5803, "step": 7115 }, { "epoch": 0.07242838541666667, "grad_norm": 17.58793830871582, "learning_rate": 4.937223134802478e-06, "loss": 3.1135, "step": 7120 }, { "epoch": 0.072479248046875, "grad_norm": 9.813260078430176, "learning_rate": 4.937134054042064e-06, "loss": 3.487, "step": 7125 }, { "epoch": 0.07253011067708333, "grad_norm": 12.799726486206055, "learning_rate": 4.9370449109281524e-06, "loss": 3.5097, "step": 7130 }, { "epoch": 0.07258097330729167, "grad_norm": 10.807867050170898, "learning_rate": 4.936955705463025e-06, "loss": 3.5704, "step": 7135 }, { "epoch": 0.0726318359375, "grad_norm": 13.212722778320312, "learning_rate": 4.936866437648963e-06, "loss": 3.2637, "step": 7140 }, { "epoch": 0.07268269856770833, "grad_norm": 10.858726501464844, "learning_rate": 4.936777107488251e-06, "loss": 3.3638, "step": 7145 }, { "epoch": 0.07273356119791667, "grad_norm": 14.568108558654785, "learning_rate": 4.936687714983174e-06, "loss": 3.2916, "step": 7150 }, { "epoch": 0.072784423828125, "grad_norm": 12.920387268066406, "learning_rate": 4.9365982601360194e-06, "loss": 3.4835, "step": 7155 }, { "epoch": 0.07283528645833333, "grad_norm": 13.888976097106934, "learning_rate": 4.9365087429490765e-06, "loss": 3.4057, "step": 7160 }, { "epoch": 0.07288614908854167, "grad_norm": 8.182246208190918, "learning_rate": 4.936419163424634e-06, "loss": 3.4935, "step": 7165 }, { "epoch": 0.07293701171875, "grad_norm": 9.68957805633545, "learning_rate": 4.936329521564986e-06, "loss": 3.4725, "step": 7170 }, { "epoch": 0.07298787434895833, "grad_norm": 8.523253440856934, "learning_rate": 4.936239817372423e-06, "loss": 3.3035, "step": 7175 }, { "epoch": 0.07303873697916667, "grad_norm": 15.208540916442871, "learning_rate": 4.936150050849242e-06, "loss": 3.3124, "step": 7180 }, { "epoch": 0.073089599609375, "grad_norm": 9.686799049377441, "learning_rate": 4.93606022199774e-06, "loss": 3.1586, "step": 7185 }, { "epoch": 0.07314046223958333, "grad_norm": 17.809568405151367, "learning_rate": 4.935970330820215e-06, "loss": 3.5217, "step": 7190 }, { "epoch": 0.07319132486979167, "grad_norm": 9.103455543518066, "learning_rate": 4.935880377318965e-06, "loss": 3.3448, "step": 7195 }, { "epoch": 0.0732421875, "grad_norm": 17.965932846069336, "learning_rate": 4.935790361496295e-06, "loss": 3.4703, "step": 7200 }, { "epoch": 0.07329305013020833, "grad_norm": 17.913076400756836, "learning_rate": 4.935700283354504e-06, "loss": 3.7618, "step": 7205 }, { "epoch": 0.07334391276041667, "grad_norm": 10.30252456665039, "learning_rate": 4.9356101428959e-06, "loss": 3.0109, "step": 7210 }, { "epoch": 0.073394775390625, "grad_norm": 9.662969589233398, "learning_rate": 4.935519940122787e-06, "loss": 3.4018, "step": 7215 }, { "epoch": 0.07344563802083333, "grad_norm": 15.70295524597168, "learning_rate": 4.935429675037474e-06, "loss": 3.083, "step": 7220 }, { "epoch": 0.07349650065104167, "grad_norm": 8.53459644317627, "learning_rate": 4.935339347642269e-06, "loss": 3.0109, "step": 7225 }, { "epoch": 0.07354736328125, "grad_norm": 15.32571792602539, "learning_rate": 4.935248957939486e-06, "loss": 3.5678, "step": 7230 }, { "epoch": 0.07359822591145833, "grad_norm": 15.416007995605469, "learning_rate": 4.935158505931434e-06, "loss": 3.2472, "step": 7235 }, { "epoch": 0.07364908854166667, "grad_norm": 12.033348083496094, "learning_rate": 4.93506799162043e-06, "loss": 3.3393, "step": 7240 }, { "epoch": 0.073699951171875, "grad_norm": 12.217211723327637, "learning_rate": 4.934977415008787e-06, "loss": 3.7256, "step": 7245 }, { "epoch": 0.07375081380208333, "grad_norm": 13.5538969039917, "learning_rate": 4.934886776098825e-06, "loss": 3.6471, "step": 7250 }, { "epoch": 0.07380167643229167, "grad_norm": 11.107625961303711, "learning_rate": 4.934796074892862e-06, "loss": 3.4801, "step": 7255 }, { "epoch": 0.0738525390625, "grad_norm": 8.796854972839355, "learning_rate": 4.934705311393219e-06, "loss": 2.999, "step": 7260 }, { "epoch": 0.07390340169270833, "grad_norm": 15.173271179199219, "learning_rate": 4.934614485602217e-06, "loss": 3.2074, "step": 7265 }, { "epoch": 0.07395426432291667, "grad_norm": 14.630196571350098, "learning_rate": 4.9345235975221804e-06, "loss": 3.2888, "step": 7270 }, { "epoch": 0.074005126953125, "grad_norm": 10.86033821105957, "learning_rate": 4.934432647155435e-06, "loss": 3.2936, "step": 7275 }, { "epoch": 0.07405598958333333, "grad_norm": 13.982941627502441, "learning_rate": 4.934341634504307e-06, "loss": 3.7184, "step": 7280 }, { "epoch": 0.07410685221354167, "grad_norm": 9.288113594055176, "learning_rate": 4.934250559571126e-06, "loss": 3.1865, "step": 7285 }, { "epoch": 0.07415771484375, "grad_norm": 11.010740280151367, "learning_rate": 4.93415942235822e-06, "loss": 3.442, "step": 7290 }, { "epoch": 0.07420857747395833, "grad_norm": 15.357698440551758, "learning_rate": 4.934068222867923e-06, "loss": 3.6836, "step": 7295 }, { "epoch": 0.07425944010416667, "grad_norm": 14.378852844238281, "learning_rate": 4.9339769611025675e-06, "loss": 3.4716, "step": 7300 }, { "epoch": 0.074310302734375, "grad_norm": 10.144874572753906, "learning_rate": 4.933885637064489e-06, "loss": 3.7662, "step": 7305 }, { "epoch": 0.07436116536458333, "grad_norm": 15.959395408630371, "learning_rate": 4.933794250756022e-06, "loss": 3.2475, "step": 7310 }, { "epoch": 0.07441202799479167, "grad_norm": 15.771014213562012, "learning_rate": 4.933702802179506e-06, "loss": 3.5408, "step": 7315 }, { "epoch": 0.074462890625, "grad_norm": 16.440584182739258, "learning_rate": 4.933611291337282e-06, "loss": 3.5703, "step": 7320 }, { "epoch": 0.07451375325520833, "grad_norm": 13.77549934387207, "learning_rate": 4.933519718231689e-06, "loss": 3.7564, "step": 7325 }, { "epoch": 0.07456461588541667, "grad_norm": 13.803625106811523, "learning_rate": 4.9334280828650714e-06, "loss": 3.9605, "step": 7330 }, { "epoch": 0.074615478515625, "grad_norm": 16.266210556030273, "learning_rate": 4.933336385239772e-06, "loss": 3.6834, "step": 7335 }, { "epoch": 0.07466634114583333, "grad_norm": 12.476020812988281, "learning_rate": 4.933244625358139e-06, "loss": 3.2462, "step": 7340 }, { "epoch": 0.07471720377604167, "grad_norm": 14.898695945739746, "learning_rate": 4.9331528032225186e-06, "loss": 4.0511, "step": 7345 }, { "epoch": 0.07476806640625, "grad_norm": 13.865912437438965, "learning_rate": 4.933060918835261e-06, "loss": 3.5242, "step": 7350 }, { "epoch": 0.07481892903645833, "grad_norm": 15.214755058288574, "learning_rate": 4.932968972198715e-06, "loss": 3.6132, "step": 7355 }, { "epoch": 0.07486979166666667, "grad_norm": 14.871356010437012, "learning_rate": 4.932876963315236e-06, "loss": 3.3851, "step": 7360 }, { "epoch": 0.074920654296875, "grad_norm": 12.997641563415527, "learning_rate": 4.932784892187176e-06, "loss": 3.1629, "step": 7365 }, { "epoch": 0.07497151692708333, "grad_norm": 10.09203052520752, "learning_rate": 4.932692758816892e-06, "loss": 3.3311, "step": 7370 }, { "epoch": 0.07502237955729167, "grad_norm": 8.960355758666992, "learning_rate": 4.932600563206739e-06, "loss": 3.1715, "step": 7375 }, { "epoch": 0.0750732421875, "grad_norm": 16.078197479248047, "learning_rate": 4.932508305359078e-06, "loss": 3.5975, "step": 7380 }, { "epoch": 0.07512410481770833, "grad_norm": 15.18477725982666, "learning_rate": 4.9324159852762685e-06, "loss": 3.341, "step": 7385 }, { "epoch": 0.07517496744791667, "grad_norm": 15.12987995147705, "learning_rate": 4.932323602960673e-06, "loss": 3.5689, "step": 7390 }, { "epoch": 0.075225830078125, "grad_norm": 13.448582649230957, "learning_rate": 4.932231158414653e-06, "loss": 3.6534, "step": 7395 }, { "epoch": 0.07527669270833333, "grad_norm": 9.764447212219238, "learning_rate": 4.932138651640577e-06, "loss": 3.5477, "step": 7400 }, { "epoch": 0.07532755533854167, "grad_norm": 11.549277305603027, "learning_rate": 4.932046082640809e-06, "loss": 3.0506, "step": 7405 }, { "epoch": 0.07537841796875, "grad_norm": 9.322796821594238, "learning_rate": 4.9319534514177196e-06, "loss": 3.2946, "step": 7410 }, { "epoch": 0.07542928059895833, "grad_norm": 10.205277442932129, "learning_rate": 4.931860757973676e-06, "loss": 3.3928, "step": 7415 }, { "epoch": 0.07548014322916667, "grad_norm": 8.97710132598877, "learning_rate": 4.931768002311052e-06, "loss": 3.5807, "step": 7420 }, { "epoch": 0.075531005859375, "grad_norm": 15.988985061645508, "learning_rate": 4.931675184432221e-06, "loss": 3.5203, "step": 7425 }, { "epoch": 0.07558186848958333, "grad_norm": 12.848153114318848, "learning_rate": 4.931582304339556e-06, "loss": 3.1023, "step": 7430 }, { "epoch": 0.07563273111979167, "grad_norm": 14.775106430053711, "learning_rate": 4.931489362035434e-06, "loss": 3.6487, "step": 7435 }, { "epoch": 0.07568359375, "grad_norm": 15.638808250427246, "learning_rate": 4.931396357522233e-06, "loss": 3.4362, "step": 7440 }, { "epoch": 0.07573445638020833, "grad_norm": 17.059715270996094, "learning_rate": 4.931303290802333e-06, "loss": 4.0533, "step": 7445 }, { "epoch": 0.07578531901041667, "grad_norm": 13.491350173950195, "learning_rate": 4.931210161878114e-06, "loss": 3.1253, "step": 7450 }, { "epoch": 0.075836181640625, "grad_norm": 11.535757064819336, "learning_rate": 4.93111697075196e-06, "loss": 3.4529, "step": 7455 }, { "epoch": 0.07588704427083333, "grad_norm": 8.205851554870605, "learning_rate": 4.9310237174262535e-06, "loss": 3.3011, "step": 7460 }, { "epoch": 0.07593790690104167, "grad_norm": 10.262289047241211, "learning_rate": 4.930930401903382e-06, "loss": 3.2739, "step": 7465 }, { "epoch": 0.07598876953125, "grad_norm": 11.151719093322754, "learning_rate": 4.930837024185732e-06, "loss": 3.6304, "step": 7470 }, { "epoch": 0.07603963216145833, "grad_norm": 13.860788345336914, "learning_rate": 4.930743584275694e-06, "loss": 3.3064, "step": 7475 }, { "epoch": 0.07609049479166667, "grad_norm": 12.631816864013672, "learning_rate": 4.930650082175656e-06, "loss": 3.3838, "step": 7480 }, { "epoch": 0.076141357421875, "grad_norm": 15.496430397033691, "learning_rate": 4.930556517888013e-06, "loss": 3.3085, "step": 7485 }, { "epoch": 0.07619222005208333, "grad_norm": 15.001185417175293, "learning_rate": 4.930462891415156e-06, "loss": 3.1146, "step": 7490 }, { "epoch": 0.07624308268229167, "grad_norm": 9.253118515014648, "learning_rate": 4.930369202759484e-06, "loss": 3.5206, "step": 7495 }, { "epoch": 0.0762939453125, "grad_norm": 9.574434280395508, "learning_rate": 4.9302754519233905e-06, "loss": 3.0834, "step": 7500 }, { "epoch": 0.07634480794270833, "grad_norm": 8.028079986572266, "learning_rate": 4.9301816389092775e-06, "loss": 3.3453, "step": 7505 }, { "epoch": 0.07639567057291667, "grad_norm": 15.80912971496582, "learning_rate": 4.930087763719541e-06, "loss": 3.3046, "step": 7510 }, { "epoch": 0.076446533203125, "grad_norm": 12.27437686920166, "learning_rate": 4.929993826356586e-06, "loss": 3.3498, "step": 7515 }, { "epoch": 0.07649739583333333, "grad_norm": 8.595794677734375, "learning_rate": 4.9298998268228154e-06, "loss": 3.4443, "step": 7520 }, { "epoch": 0.07654825846354167, "grad_norm": 14.03144645690918, "learning_rate": 4.929805765120633e-06, "loss": 3.4373, "step": 7525 }, { "epoch": 0.07659912109375, "grad_norm": 16.364065170288086, "learning_rate": 4.929711641252446e-06, "loss": 3.3485, "step": 7530 }, { "epoch": 0.07664998372395833, "grad_norm": 13.637460708618164, "learning_rate": 4.929617455220664e-06, "loss": 3.1559, "step": 7535 }, { "epoch": 0.07670084635416667, "grad_norm": 9.677896499633789, "learning_rate": 4.929523207027693e-06, "loss": 3.1611, "step": 7540 }, { "epoch": 0.076751708984375, "grad_norm": 11.03842830657959, "learning_rate": 4.929428896675949e-06, "loss": 3.468, "step": 7545 }, { "epoch": 0.07680257161458333, "grad_norm": 8.96399974822998, "learning_rate": 4.92933452416784e-06, "loss": 3.2743, "step": 7550 }, { "epoch": 0.07685343424479167, "grad_norm": 14.87559986114502, "learning_rate": 4.929240089505785e-06, "loss": 3.4153, "step": 7555 }, { "epoch": 0.076904296875, "grad_norm": 16.70171356201172, "learning_rate": 4.929145592692197e-06, "loss": 3.3959, "step": 7560 }, { "epoch": 0.07695515950520833, "grad_norm": 10.184401512145996, "learning_rate": 4.929051033729495e-06, "loss": 3.3099, "step": 7565 }, { "epoch": 0.07700602213541667, "grad_norm": 16.423912048339844, "learning_rate": 4.928956412620098e-06, "loss": 3.5852, "step": 7570 }, { "epoch": 0.077056884765625, "grad_norm": 12.25012493133545, "learning_rate": 4.928861729366427e-06, "loss": 3.3323, "step": 7575 }, { "epoch": 0.07710774739583333, "grad_norm": 14.227832794189453, "learning_rate": 4.928766983970905e-06, "loss": 3.7213, "step": 7580 }, { "epoch": 0.07715861002604167, "grad_norm": 9.725828170776367, "learning_rate": 4.928672176435955e-06, "loss": 3.8787, "step": 7585 }, { "epoch": 0.07720947265625, "grad_norm": 13.200632095336914, "learning_rate": 4.928577306764003e-06, "loss": 3.3488, "step": 7590 }, { "epoch": 0.07726033528645833, "grad_norm": 16.116952896118164, "learning_rate": 4.928482374957476e-06, "loss": 3.5104, "step": 7595 }, { "epoch": 0.07731119791666667, "grad_norm": 14.31235408782959, "learning_rate": 4.928387381018803e-06, "loss": 3.4802, "step": 7600 }, { "epoch": 0.077362060546875, "grad_norm": 14.306567192077637, "learning_rate": 4.928292324950415e-06, "loss": 3.1986, "step": 7605 }, { "epoch": 0.07741292317708333, "grad_norm": 10.679328918457031, "learning_rate": 4.9281972067547435e-06, "loss": 3.4726, "step": 7610 }, { "epoch": 0.07746378580729167, "grad_norm": 13.975683212280273, "learning_rate": 4.928102026434221e-06, "loss": 3.5155, "step": 7615 }, { "epoch": 0.0775146484375, "grad_norm": 11.450372695922852, "learning_rate": 4.928006783991285e-06, "loss": 3.26, "step": 7620 }, { "epoch": 0.07756551106770833, "grad_norm": 9.844697952270508, "learning_rate": 4.92791147942837e-06, "loss": 3.552, "step": 7625 }, { "epoch": 0.07761637369791667, "grad_norm": 13.510926246643066, "learning_rate": 4.927816112747915e-06, "loss": 3.2648, "step": 7630 }, { "epoch": 0.077667236328125, "grad_norm": 16.180030822753906, "learning_rate": 4.927720683952361e-06, "loss": 3.8344, "step": 7635 }, { "epoch": 0.07771809895833333, "grad_norm": 10.086830139160156, "learning_rate": 4.9276251930441485e-06, "loss": 3.5012, "step": 7640 }, { "epoch": 0.07776896158854167, "grad_norm": 16.06332778930664, "learning_rate": 4.927529640025721e-06, "loss": 3.5488, "step": 7645 }, { "epoch": 0.07781982421875, "grad_norm": 9.544757843017578, "learning_rate": 4.927434024899522e-06, "loss": 3.3311, "step": 7650 }, { "epoch": 0.07787068684895833, "grad_norm": 10.502907752990723, "learning_rate": 4.927338347668e-06, "loss": 3.7151, "step": 7655 }, { "epoch": 0.07792154947916667, "grad_norm": 21.776485443115234, "learning_rate": 4.927242608333601e-06, "loss": 3.6689, "step": 7660 }, { "epoch": 0.077972412109375, "grad_norm": 9.892070770263672, "learning_rate": 4.927146806898776e-06, "loss": 3.4572, "step": 7665 }, { "epoch": 0.07802327473958333, "grad_norm": 10.099709510803223, "learning_rate": 4.927050943365974e-06, "loss": 3.1367, "step": 7670 }, { "epoch": 0.07807413736979167, "grad_norm": 11.609224319458008, "learning_rate": 4.92695501773765e-06, "loss": 3.6545, "step": 7675 }, { "epoch": 0.078125, "grad_norm": 12.737848281860352, "learning_rate": 4.926859030016257e-06, "loss": 3.4818, "step": 7680 }, { "epoch": 0.07817586263020833, "grad_norm": 11.716086387634277, "learning_rate": 4.926762980204251e-06, "loss": 3.3399, "step": 7685 }, { "epoch": 0.07822672526041667, "grad_norm": 15.350922584533691, "learning_rate": 4.926666868304089e-06, "loss": 3.2643, "step": 7690 }, { "epoch": 0.078277587890625, "grad_norm": 12.929101943969727, "learning_rate": 4.9265706943182305e-06, "loss": 3.1085, "step": 7695 }, { "epoch": 0.07832845052083333, "grad_norm": 11.21131706237793, "learning_rate": 4.926474458249137e-06, "loss": 3.3425, "step": 7700 }, { "epoch": 0.07837931315104167, "grad_norm": 16.155336380004883, "learning_rate": 4.9263781600992675e-06, "loss": 3.7774, "step": 7705 }, { "epoch": 0.07843017578125, "grad_norm": 16.52438735961914, "learning_rate": 4.926281799871089e-06, "loss": 3.6957, "step": 7710 }, { "epoch": 0.07848103841145833, "grad_norm": 15.890446662902832, "learning_rate": 4.926185377567065e-06, "loss": 3.5257, "step": 7715 }, { "epoch": 0.07853190104166667, "grad_norm": 12.618369102478027, "learning_rate": 4.926088893189665e-06, "loss": 3.5028, "step": 7720 }, { "epoch": 0.078582763671875, "grad_norm": 8.554431915283203, "learning_rate": 4.925992346741354e-06, "loss": 3.0891, "step": 7725 }, { "epoch": 0.07863362630208333, "grad_norm": 15.57974624633789, "learning_rate": 4.9258957382246045e-06, "loss": 3.4668, "step": 7730 }, { "epoch": 0.07868448893229167, "grad_norm": 13.180612564086914, "learning_rate": 4.925799067641888e-06, "loss": 3.6996, "step": 7735 }, { "epoch": 0.0787353515625, "grad_norm": 10.547351837158203, "learning_rate": 4.9257023349956765e-06, "loss": 3.8278, "step": 7740 }, { "epoch": 0.07878621419270833, "grad_norm": 18.47353744506836, "learning_rate": 4.925605540288445e-06, "loss": 3.5644, "step": 7745 }, { "epoch": 0.07883707682291667, "grad_norm": 14.555137634277344, "learning_rate": 4.925508683522673e-06, "loss": 3.3353, "step": 7750 }, { "epoch": 0.078887939453125, "grad_norm": 14.03541088104248, "learning_rate": 4.925411764700834e-06, "loss": 3.3763, "step": 7755 }, { "epoch": 0.07893880208333333, "grad_norm": 13.177834510803223, "learning_rate": 4.925314783825411e-06, "loss": 3.6113, "step": 7760 }, { "epoch": 0.07898966471354167, "grad_norm": 12.112380027770996, "learning_rate": 4.925217740898884e-06, "loss": 3.3866, "step": 7765 }, { "epoch": 0.07904052734375, "grad_norm": 12.616769790649414, "learning_rate": 4.925120635923736e-06, "loss": 3.7202, "step": 7770 }, { "epoch": 0.07909138997395833, "grad_norm": 12.70201587677002, "learning_rate": 4.925023468902451e-06, "loss": 3.316, "step": 7775 }, { "epoch": 0.07914225260416667, "grad_norm": 15.856217384338379, "learning_rate": 4.924926239837515e-06, "loss": 3.36, "step": 7780 }, { "epoch": 0.079193115234375, "grad_norm": 12.738685607910156, "learning_rate": 4.9248289487314174e-06, "loss": 3.3463, "step": 7785 }, { "epoch": 0.07924397786458333, "grad_norm": 15.38477897644043, "learning_rate": 4.924731595586645e-06, "loss": 3.3254, "step": 7790 }, { "epoch": 0.07929484049479167, "grad_norm": 12.949041366577148, "learning_rate": 4.924634180405689e-06, "loss": 3.2682, "step": 7795 }, { "epoch": 0.079345703125, "grad_norm": 17.46603775024414, "learning_rate": 4.924536703191043e-06, "loss": 3.5482, "step": 7800 }, { "epoch": 0.07939656575520833, "grad_norm": 15.399724960327148, "learning_rate": 4.9244391639451995e-06, "loss": 3.5545, "step": 7805 }, { "epoch": 0.07944742838541667, "grad_norm": 13.765381813049316, "learning_rate": 4.924341562670655e-06, "loss": 3.2777, "step": 7810 }, { "epoch": 0.079498291015625, "grad_norm": 13.219882011413574, "learning_rate": 4.924243899369906e-06, "loss": 3.4312, "step": 7815 }, { "epoch": 0.07954915364583333, "grad_norm": 15.765204429626465, "learning_rate": 4.924146174045451e-06, "loss": 3.4005, "step": 7820 }, { "epoch": 0.07960001627604167, "grad_norm": 14.722639083862305, "learning_rate": 4.924048386699792e-06, "loss": 3.4344, "step": 7825 }, { "epoch": 0.07965087890625, "grad_norm": 14.24759292602539, "learning_rate": 4.923950537335429e-06, "loss": 3.5187, "step": 7830 }, { "epoch": 0.07970174153645833, "grad_norm": 14.9491548538208, "learning_rate": 4.923852625954866e-06, "loss": 3.3459, "step": 7835 }, { "epoch": 0.07975260416666667, "grad_norm": 10.572219848632812, "learning_rate": 4.9237546525606075e-06, "loss": 3.6514, "step": 7840 }, { "epoch": 0.079803466796875, "grad_norm": 13.39721393585205, "learning_rate": 4.923656617155162e-06, "loss": 3.4159, "step": 7845 }, { "epoch": 0.07985432942708333, "grad_norm": 13.185128211975098, "learning_rate": 4.923558519741035e-06, "loss": 3.3885, "step": 7850 }, { "epoch": 0.07990519205729167, "grad_norm": 10.458941459655762, "learning_rate": 4.923460360320738e-06, "loss": 3.8603, "step": 7855 }, { "epoch": 0.0799560546875, "grad_norm": 16.00554084777832, "learning_rate": 4.923362138896782e-06, "loss": 3.8007, "step": 7860 }, { "epoch": 0.08000691731770833, "grad_norm": 19.089359283447266, "learning_rate": 4.923263855471681e-06, "loss": 3.4031, "step": 7865 }, { "epoch": 0.08005777994791667, "grad_norm": 16.948728561401367, "learning_rate": 4.923165510047948e-06, "loss": 3.4621, "step": 7870 }, { "epoch": 0.080108642578125, "grad_norm": 15.898715019226074, "learning_rate": 4.9230671026281e-06, "loss": 3.2493, "step": 7875 }, { "epoch": 0.08015950520833333, "grad_norm": 15.451108932495117, "learning_rate": 4.922968633214654e-06, "loss": 3.5485, "step": 7880 }, { "epoch": 0.08021036783854167, "grad_norm": 8.44896411895752, "learning_rate": 4.922870101810131e-06, "loss": 3.6708, "step": 7885 }, { "epoch": 0.08026123046875, "grad_norm": 15.81263542175293, "learning_rate": 4.92277150841705e-06, "loss": 3.0769, "step": 7890 }, { "epoch": 0.08031209309895833, "grad_norm": 11.589963912963867, "learning_rate": 4.922672853037934e-06, "loss": 3.1524, "step": 7895 }, { "epoch": 0.08036295572916667, "grad_norm": 11.018848419189453, "learning_rate": 4.922574135675308e-06, "loss": 3.4289, "step": 7900 }, { "epoch": 0.080413818359375, "grad_norm": 14.702198028564453, "learning_rate": 4.922475356331696e-06, "loss": 3.7443, "step": 7905 }, { "epoch": 0.08046468098958333, "grad_norm": 7.636882305145264, "learning_rate": 4.922376515009627e-06, "loss": 3.2372, "step": 7910 }, { "epoch": 0.08051554361979167, "grad_norm": 14.174872398376465, "learning_rate": 4.922277611711629e-06, "loss": 3.2774, "step": 7915 }, { "epoch": 0.08056640625, "grad_norm": 7.712305068969727, "learning_rate": 4.922178646440232e-06, "loss": 2.9999, "step": 7920 }, { "epoch": 0.08061726888020833, "grad_norm": 13.299323081970215, "learning_rate": 4.922079619197968e-06, "loss": 3.4508, "step": 7925 }, { "epoch": 0.08066813151041667, "grad_norm": 12.260127067565918, "learning_rate": 4.9219805299873715e-06, "loss": 3.5971, "step": 7930 }, { "epoch": 0.080718994140625, "grad_norm": 11.661309242248535, "learning_rate": 4.9218813788109776e-06, "loss": 3.2826, "step": 7935 }, { "epoch": 0.08076985677083333, "grad_norm": 14.099932670593262, "learning_rate": 4.921782165671322e-06, "loss": 3.2441, "step": 7940 }, { "epoch": 0.08082071940104167, "grad_norm": 11.960383415222168, "learning_rate": 4.9216828905709445e-06, "loss": 3.2706, "step": 7945 }, { "epoch": 0.08087158203125, "grad_norm": 14.964332580566406, "learning_rate": 4.921583553512384e-06, "loss": 2.997, "step": 7950 }, { "epoch": 0.08092244466145833, "grad_norm": 11.150708198547363, "learning_rate": 4.9214841544981826e-06, "loss": 3.3165, "step": 7955 }, { "epoch": 0.08097330729166667, "grad_norm": 9.034783363342285, "learning_rate": 4.9213846935308816e-06, "loss": 3.4451, "step": 7960 }, { "epoch": 0.081024169921875, "grad_norm": 12.717357635498047, "learning_rate": 4.921285170613029e-06, "loss": 3.4678, "step": 7965 }, { "epoch": 0.08107503255208333, "grad_norm": 15.22363567352295, "learning_rate": 4.921185585747168e-06, "loss": 3.3836, "step": 7970 }, { "epoch": 0.08112589518229167, "grad_norm": 12.619773864746094, "learning_rate": 4.9210859389358475e-06, "loss": 3.3497, "step": 7975 }, { "epoch": 0.0811767578125, "grad_norm": 10.314453125, "learning_rate": 4.920986230181618e-06, "loss": 3.3937, "step": 7980 }, { "epoch": 0.08122762044270833, "grad_norm": 16.396963119506836, "learning_rate": 4.920886459487029e-06, "loss": 3.2124, "step": 7985 }, { "epoch": 0.08127848307291667, "grad_norm": 10.935754776000977, "learning_rate": 4.920786626854634e-06, "loss": 3.3134, "step": 7990 }, { "epoch": 0.081329345703125, "grad_norm": 11.221820831298828, "learning_rate": 4.920686732286988e-06, "loss": 3.3703, "step": 7995 }, { "epoch": 0.08138020833333333, "grad_norm": 15.557806968688965, "learning_rate": 4.9205867757866445e-06, "loss": 3.2063, "step": 8000 }, { "epoch": 0.08143107096354167, "grad_norm": 14.617751121520996, "learning_rate": 4.920486757356162e-06, "loss": 3.4361, "step": 8005 }, { "epoch": 0.08148193359375, "grad_norm": 7.648261070251465, "learning_rate": 4.9203866769981e-06, "loss": 3.1502, "step": 8010 }, { "epoch": 0.08153279622395833, "grad_norm": 13.730716705322266, "learning_rate": 4.920286534715018e-06, "loss": 3.7119, "step": 8015 }, { "epoch": 0.08158365885416667, "grad_norm": 14.1985502243042, "learning_rate": 4.9201863305094786e-06, "loss": 3.4887, "step": 8020 }, { "epoch": 0.081634521484375, "grad_norm": 13.51031494140625, "learning_rate": 4.920086064384046e-06, "loss": 3.0307, "step": 8025 }, { "epoch": 0.08168538411458333, "grad_norm": 11.816555976867676, "learning_rate": 4.919985736341286e-06, "loss": 3.4726, "step": 8030 }, { "epoch": 0.08173624674479167, "grad_norm": 15.351333618164062, "learning_rate": 4.919885346383764e-06, "loss": 3.6725, "step": 8035 }, { "epoch": 0.081787109375, "grad_norm": 17.565916061401367, "learning_rate": 4.919784894514048e-06, "loss": 3.551, "step": 8040 }, { "epoch": 0.08183797200520833, "grad_norm": 11.794846534729004, "learning_rate": 4.91968438073471e-06, "loss": 3.7001, "step": 8045 }, { "epoch": 0.08188883463541667, "grad_norm": 13.469988822937012, "learning_rate": 4.919583805048321e-06, "loss": 3.2706, "step": 8050 }, { "epoch": 0.081939697265625, "grad_norm": 9.399026870727539, "learning_rate": 4.919483167457452e-06, "loss": 3.476, "step": 8055 }, { "epoch": 0.08199055989583333, "grad_norm": 10.455305099487305, "learning_rate": 4.919382467964681e-06, "loss": 3.78, "step": 8060 }, { "epoch": 0.08204142252604167, "grad_norm": 13.494087219238281, "learning_rate": 4.919281706572583e-06, "loss": 3.5696, "step": 8065 }, { "epoch": 0.08209228515625, "grad_norm": 9.601789474487305, "learning_rate": 4.919180883283735e-06, "loss": 3.4697, "step": 8070 }, { "epoch": 0.08214314778645833, "grad_norm": 8.843679428100586, "learning_rate": 4.919079998100719e-06, "loss": 3.3273, "step": 8075 }, { "epoch": 0.08219401041666667, "grad_norm": 17.03862953186035, "learning_rate": 4.918979051026113e-06, "loss": 3.3333, "step": 8080 }, { "epoch": 0.082244873046875, "grad_norm": 9.111615180969238, "learning_rate": 4.918878042062503e-06, "loss": 3.1043, "step": 8085 }, { "epoch": 0.08229573567708333, "grad_norm": 9.032851219177246, "learning_rate": 4.918776971212471e-06, "loss": 3.1896, "step": 8090 }, { "epoch": 0.08234659830729167, "grad_norm": 15.128028869628906, "learning_rate": 4.918675838478603e-06, "loss": 3.5589, "step": 8095 }, { "epoch": 0.0823974609375, "grad_norm": 9.789177894592285, "learning_rate": 4.918574643863488e-06, "loss": 3.4767, "step": 8100 }, { "epoch": 0.08244832356770833, "grad_norm": 8.648837089538574, "learning_rate": 4.918473387369713e-06, "loss": 3.6547, "step": 8105 }, { "epoch": 0.08249918619791667, "grad_norm": 12.925644874572754, "learning_rate": 4.91837206899987e-06, "loss": 3.4412, "step": 8110 }, { "epoch": 0.082550048828125, "grad_norm": 10.411079406738281, "learning_rate": 4.918270688756551e-06, "loss": 3.501, "step": 8115 }, { "epoch": 0.08260091145833333, "grad_norm": 16.97960662841797, "learning_rate": 4.918169246642349e-06, "loss": 3.9448, "step": 8120 }, { "epoch": 0.08265177408854167, "grad_norm": 13.027390480041504, "learning_rate": 4.91806774265986e-06, "loss": 3.2829, "step": 8125 }, { "epoch": 0.08270263671875, "grad_norm": 11.88217544555664, "learning_rate": 4.9179661768116815e-06, "loss": 3.6066, "step": 8130 }, { "epoch": 0.08275349934895833, "grad_norm": 9.799572944641113, "learning_rate": 4.9178645491004115e-06, "loss": 3.2738, "step": 8135 }, { "epoch": 0.08280436197916667, "grad_norm": 12.099909782409668, "learning_rate": 4.91776285952865e-06, "loss": 3.2961, "step": 8140 }, { "epoch": 0.082855224609375, "grad_norm": 11.036294937133789, "learning_rate": 4.917661108098999e-06, "loss": 3.5535, "step": 8145 }, { "epoch": 0.08290608723958333, "grad_norm": 16.11235237121582, "learning_rate": 4.9175592948140614e-06, "loss": 3.5957, "step": 8150 }, { "epoch": 0.08295694986979167, "grad_norm": 12.630952835083008, "learning_rate": 4.917457419676443e-06, "loss": 3.1787, "step": 8155 }, { "epoch": 0.0830078125, "grad_norm": 10.421707153320312, "learning_rate": 4.9173554826887485e-06, "loss": 3.5609, "step": 8160 }, { "epoch": 0.08305867513020833, "grad_norm": 14.865351676940918, "learning_rate": 4.917253483853587e-06, "loss": 3.4347, "step": 8165 }, { "epoch": 0.08310953776041667, "grad_norm": 10.349051475524902, "learning_rate": 4.917151423173568e-06, "loss": 3.3103, "step": 8170 }, { "epoch": 0.083160400390625, "grad_norm": 12.763856887817383, "learning_rate": 4.917049300651303e-06, "loss": 3.2145, "step": 8175 }, { "epoch": 0.08321126302083333, "grad_norm": 13.702238082885742, "learning_rate": 4.916947116289405e-06, "loss": 3.4389, "step": 8180 }, { "epoch": 0.08326212565104167, "grad_norm": 8.125384330749512, "learning_rate": 4.916844870090487e-06, "loss": 3.1778, "step": 8185 }, { "epoch": 0.08331298828125, "grad_norm": 13.532679557800293, "learning_rate": 4.916742562057166e-06, "loss": 3.5005, "step": 8190 }, { "epoch": 0.08336385091145833, "grad_norm": 11.276693344116211, "learning_rate": 4.91664019219206e-06, "loss": 3.1357, "step": 8195 }, { "epoch": 0.08341471354166667, "grad_norm": 10.653390884399414, "learning_rate": 4.916537760497787e-06, "loss": 2.9572, "step": 8200 }, { "epoch": 0.083465576171875, "grad_norm": 19.044883728027344, "learning_rate": 4.9164352669769685e-06, "loss": 3.4532, "step": 8205 }, { "epoch": 0.08351643880208333, "grad_norm": 9.898783683776855, "learning_rate": 4.916332711632227e-06, "loss": 3.3115, "step": 8210 }, { "epoch": 0.08356730143229167, "grad_norm": 7.111556053161621, "learning_rate": 4.916230094466185e-06, "loss": 3.5531, "step": 8215 }, { "epoch": 0.0836181640625, "grad_norm": 15.385550498962402, "learning_rate": 4.916127415481469e-06, "loss": 3.5578, "step": 8220 }, { "epoch": 0.08366902669270833, "grad_norm": 9.553897857666016, "learning_rate": 4.916024674680705e-06, "loss": 3.4616, "step": 8225 }, { "epoch": 0.08371988932291667, "grad_norm": 9.77919864654541, "learning_rate": 4.915921872066524e-06, "loss": 3.5612, "step": 8230 }, { "epoch": 0.083770751953125, "grad_norm": 13.859053611755371, "learning_rate": 4.915819007641553e-06, "loss": 3.3759, "step": 8235 }, { "epoch": 0.08382161458333333, "grad_norm": 13.282938003540039, "learning_rate": 4.915716081408426e-06, "loss": 3.1999, "step": 8240 }, { "epoch": 0.08387247721354167, "grad_norm": 8.742036819458008, "learning_rate": 4.9156130933697756e-06, "loss": 3.6131, "step": 8245 }, { "epoch": 0.08392333984375, "grad_norm": 10.683076858520508, "learning_rate": 4.915510043528237e-06, "loss": 3.6148, "step": 8250 }, { "epoch": 0.08397420247395833, "grad_norm": 13.600173950195312, "learning_rate": 4.915406931886446e-06, "loss": 3.0754, "step": 8255 }, { "epoch": 0.08402506510416667, "grad_norm": 9.838703155517578, "learning_rate": 4.915303758447041e-06, "loss": 3.422, "step": 8260 }, { "epoch": 0.084075927734375, "grad_norm": 17.08401107788086, "learning_rate": 4.915200523212662e-06, "loss": 3.5272, "step": 8265 }, { "epoch": 0.08412679036458333, "grad_norm": 11.889626502990723, "learning_rate": 4.91509722618595e-06, "loss": 3.244, "step": 8270 }, { "epoch": 0.08417765299479167, "grad_norm": 13.750388145446777, "learning_rate": 4.914993867369549e-06, "loss": 3.0886, "step": 8275 }, { "epoch": 0.084228515625, "grad_norm": 12.008491516113281, "learning_rate": 4.914890446766101e-06, "loss": 3.4445, "step": 8280 }, { "epoch": 0.08427937825520833, "grad_norm": 8.624137878417969, "learning_rate": 4.914786964378253e-06, "loss": 3.1456, "step": 8285 }, { "epoch": 0.08433024088541667, "grad_norm": 13.373948097229004, "learning_rate": 4.914683420208654e-06, "loss": 3.1974, "step": 8290 }, { "epoch": 0.084381103515625, "grad_norm": 11.786771774291992, "learning_rate": 4.914579814259952e-06, "loss": 3.7054, "step": 8295 }, { "epoch": 0.08443196614583333, "grad_norm": 12.221219062805176, "learning_rate": 4.914476146534797e-06, "loss": 3.3905, "step": 8300 }, { "epoch": 0.08448282877604167, "grad_norm": 11.495316505432129, "learning_rate": 4.914372417035843e-06, "loss": 3.9629, "step": 8305 }, { "epoch": 0.08453369140625, "grad_norm": 13.219930648803711, "learning_rate": 4.914268625765742e-06, "loss": 3.359, "step": 8310 }, { "epoch": 0.08458455403645833, "grad_norm": 11.233851432800293, "learning_rate": 4.9141647727271515e-06, "loss": 3.6808, "step": 8315 }, { "epoch": 0.08463541666666667, "grad_norm": 10.176496505737305, "learning_rate": 4.914060857922727e-06, "loss": 3.3135, "step": 8320 }, { "epoch": 0.084686279296875, "grad_norm": 10.016979217529297, "learning_rate": 4.9139568813551275e-06, "loss": 3.8656, "step": 8325 }, { "epoch": 0.08473714192708333, "grad_norm": 14.729958534240723, "learning_rate": 4.913852843027013e-06, "loss": 3.4418, "step": 8330 }, { "epoch": 0.08478800455729167, "grad_norm": 16.3724422454834, "learning_rate": 4.913748742941046e-06, "loss": 3.3672, "step": 8335 }, { "epoch": 0.0848388671875, "grad_norm": 9.82769775390625, "learning_rate": 4.91364458109989e-06, "loss": 3.0876, "step": 8340 }, { "epoch": 0.08488972981770833, "grad_norm": 12.202662467956543, "learning_rate": 4.913540357506209e-06, "loss": 3.3755, "step": 8345 }, { "epoch": 0.08494059244791667, "grad_norm": 13.47652530670166, "learning_rate": 4.913436072162671e-06, "loss": 3.3207, "step": 8350 }, { "epoch": 0.084991455078125, "grad_norm": 13.997584342956543, "learning_rate": 4.913331725071942e-06, "loss": 3.4016, "step": 8355 }, { "epoch": 0.08504231770833333, "grad_norm": 10.103306770324707, "learning_rate": 4.9132273162366926e-06, "loss": 3.9784, "step": 8360 }, { "epoch": 0.08509318033854167, "grad_norm": 13.56583023071289, "learning_rate": 4.913122845659595e-06, "loss": 3.5657, "step": 8365 }, { "epoch": 0.08514404296875, "grad_norm": 16.747276306152344, "learning_rate": 4.913018313343322e-06, "loss": 3.3329, "step": 8370 }, { "epoch": 0.08519490559895833, "grad_norm": 15.131979942321777, "learning_rate": 4.912913719290546e-06, "loss": 3.7669, "step": 8375 }, { "epoch": 0.08524576822916667, "grad_norm": 13.406637191772461, "learning_rate": 4.912809063503945e-06, "loss": 3.8842, "step": 8380 }, { "epoch": 0.085296630859375, "grad_norm": 10.718530654907227, "learning_rate": 4.912704345986196e-06, "loss": 3.3867, "step": 8385 }, { "epoch": 0.08534749348958333, "grad_norm": 10.112154006958008, "learning_rate": 4.912599566739979e-06, "loss": 3.1978, "step": 8390 }, { "epoch": 0.08539835611979167, "grad_norm": 9.753875732421875, "learning_rate": 4.912494725767972e-06, "loss": 3.2426, "step": 8395 }, { "epoch": 0.08544921875, "grad_norm": 13.011728286743164, "learning_rate": 4.9123898230728616e-06, "loss": 3.689, "step": 8400 }, { "epoch": 0.08550008138020833, "grad_norm": 10.342684745788574, "learning_rate": 4.912284858657328e-06, "loss": 3.2583, "step": 8405 }, { "epoch": 0.08555094401041667, "grad_norm": 15.096055030822754, "learning_rate": 4.9121798325240574e-06, "loss": 3.797, "step": 8410 }, { "epoch": 0.085601806640625, "grad_norm": 16.6988582611084, "learning_rate": 4.912074744675739e-06, "loss": 3.7339, "step": 8415 }, { "epoch": 0.08565266927083333, "grad_norm": 9.644712448120117, "learning_rate": 4.911969595115059e-06, "loss": 3.6449, "step": 8420 }, { "epoch": 0.08570353190104167, "grad_norm": 9.427045822143555, "learning_rate": 4.911864383844709e-06, "loss": 3.7292, "step": 8425 }, { "epoch": 0.08575439453125, "grad_norm": 12.242361068725586, "learning_rate": 4.9117591108673815e-06, "loss": 3.6643, "step": 8430 }, { "epoch": 0.08580525716145833, "grad_norm": 15.966888427734375, "learning_rate": 4.911653776185768e-06, "loss": 3.7233, "step": 8435 }, { "epoch": 0.08585611979166667, "grad_norm": 13.699934005737305, "learning_rate": 4.9115483798025635e-06, "loss": 3.2923, "step": 8440 }, { "epoch": 0.085906982421875, "grad_norm": 18.62406349182129, "learning_rate": 4.911442921720465e-06, "loss": 3.3712, "step": 8445 }, { "epoch": 0.08595784505208333, "grad_norm": 12.284649848937988, "learning_rate": 4.911337401942172e-06, "loss": 3.0775, "step": 8450 }, { "epoch": 0.08600870768229167, "grad_norm": 15.336729049682617, "learning_rate": 4.911231820470383e-06, "loss": 3.1454, "step": 8455 }, { "epoch": 0.0860595703125, "grad_norm": 12.336068153381348, "learning_rate": 4.911126177307799e-06, "loss": 3.5428, "step": 8460 }, { "epoch": 0.08611043294270833, "grad_norm": 9.04693603515625, "learning_rate": 4.911020472457124e-06, "loss": 4.0488, "step": 8465 }, { "epoch": 0.08616129557291667, "grad_norm": 8.606358528137207, "learning_rate": 4.91091470592106e-06, "loss": 3.2326, "step": 8470 }, { "epoch": 0.086212158203125, "grad_norm": 16.552690505981445, "learning_rate": 4.910808877702317e-06, "loss": 3.6028, "step": 8475 }, { "epoch": 0.08626302083333333, "grad_norm": 10.208168029785156, "learning_rate": 4.910702987803599e-06, "loss": 3.4335, "step": 8480 }, { "epoch": 0.08631388346354167, "grad_norm": 12.139317512512207, "learning_rate": 4.910597036227617e-06, "loss": 3.2395, "step": 8485 }, { "epoch": 0.08636474609375, "grad_norm": 10.915331840515137, "learning_rate": 4.91049102297708e-06, "loss": 3.4295, "step": 8490 }, { "epoch": 0.08641560872395833, "grad_norm": 9.047663688659668, "learning_rate": 4.910384948054703e-06, "loss": 3.7313, "step": 8495 }, { "epoch": 0.08646647135416667, "grad_norm": 11.79930305480957, "learning_rate": 4.910278811463197e-06, "loss": 3.8923, "step": 8500 }, { "epoch": 0.086517333984375, "grad_norm": 9.152587890625, "learning_rate": 4.91017261320528e-06, "loss": 3.4595, "step": 8505 }, { "epoch": 0.08656819661458333, "grad_norm": 10.824846267700195, "learning_rate": 4.910066353283668e-06, "loss": 3.2886, "step": 8510 }, { "epoch": 0.08661905924479167, "grad_norm": 6.95084285736084, "learning_rate": 4.909960031701079e-06, "loss": 3.3848, "step": 8515 }, { "epoch": 0.086669921875, "grad_norm": 16.21605682373047, "learning_rate": 4.9098536484602334e-06, "loss": 3.2412, "step": 8520 }, { "epoch": 0.08672078450520833, "grad_norm": 8.85698127746582, "learning_rate": 4.909747203563855e-06, "loss": 3.5511, "step": 8525 }, { "epoch": 0.08677164713541667, "grad_norm": 16.60267448425293, "learning_rate": 4.909640697014664e-06, "loss": 3.9667, "step": 8530 }, { "epoch": 0.086822509765625, "grad_norm": 14.518190383911133, "learning_rate": 4.909534128815387e-06, "loss": 3.3415, "step": 8535 }, { "epoch": 0.08687337239583333, "grad_norm": 10.557718276977539, "learning_rate": 4.909427498968752e-06, "loss": 3.4739, "step": 8540 }, { "epoch": 0.08692423502604167, "grad_norm": 11.117448806762695, "learning_rate": 4.909320807477485e-06, "loss": 3.345, "step": 8545 }, { "epoch": 0.08697509765625, "grad_norm": 13.413477897644043, "learning_rate": 4.9092140543443145e-06, "loss": 3.8153, "step": 8550 }, { "epoch": 0.08702596028645833, "grad_norm": 11.934691429138184, "learning_rate": 4.909107239571975e-06, "loss": 3.548, "step": 8555 }, { "epoch": 0.08707682291666667, "grad_norm": 10.76050853729248, "learning_rate": 4.9090003631631975e-06, "loss": 3.3043, "step": 8560 }, { "epoch": 0.087127685546875, "grad_norm": 10.168484687805176, "learning_rate": 4.9088934251207165e-06, "loss": 3.8714, "step": 8565 }, { "epoch": 0.08717854817708333, "grad_norm": 13.013615608215332, "learning_rate": 4.908786425447269e-06, "loss": 3.5501, "step": 8570 }, { "epoch": 0.08722941080729167, "grad_norm": 19.23887062072754, "learning_rate": 4.908679364145591e-06, "loss": 3.5044, "step": 8575 }, { "epoch": 0.0872802734375, "grad_norm": 14.924409866333008, "learning_rate": 4.908572241218422e-06, "loss": 3.2358, "step": 8580 }, { "epoch": 0.08733113606770833, "grad_norm": 12.832475662231445, "learning_rate": 4.908465056668504e-06, "loss": 3.6383, "step": 8585 }, { "epoch": 0.08738199869791667, "grad_norm": 15.433663368225098, "learning_rate": 4.908357810498578e-06, "loss": 3.4119, "step": 8590 }, { "epoch": 0.087432861328125, "grad_norm": 8.936971664428711, "learning_rate": 4.908250502711388e-06, "loss": 3.2385, "step": 8595 }, { "epoch": 0.08748372395833333, "grad_norm": 15.772163391113281, "learning_rate": 4.9081431333096805e-06, "loss": 3.5685, "step": 8600 }, { "epoch": 0.08753458658854167, "grad_norm": 11.028748512268066, "learning_rate": 4.908035702296201e-06, "loss": 3.4128, "step": 8605 }, { "epoch": 0.08758544921875, "grad_norm": 12.614755630493164, "learning_rate": 4.907928209673699e-06, "loss": 3.2888, "step": 8610 }, { "epoch": 0.08763631184895833, "grad_norm": 12.599555969238281, "learning_rate": 4.907820655444924e-06, "loss": 3.1925, "step": 8615 }, { "epoch": 0.08768717447916667, "grad_norm": 8.425646781921387, "learning_rate": 4.907713039612629e-06, "loss": 3.2201, "step": 8620 }, { "epoch": 0.087738037109375, "grad_norm": 11.59953784942627, "learning_rate": 4.907605362179566e-06, "loss": 3.2422, "step": 8625 }, { "epoch": 0.08778889973958333, "grad_norm": 7.5655131340026855, "learning_rate": 4.907497623148491e-06, "loss": 3.5319, "step": 8630 }, { "epoch": 0.08783976236979167, "grad_norm": 15.107267379760742, "learning_rate": 4.90738982252216e-06, "loss": 3.5978, "step": 8635 }, { "epoch": 0.087890625, "grad_norm": 15.67873764038086, "learning_rate": 4.90728196030333e-06, "loss": 3.2055, "step": 8640 }, { "epoch": 0.08794148763020833, "grad_norm": 13.261436462402344, "learning_rate": 4.907174036494763e-06, "loss": 3.0959, "step": 8645 }, { "epoch": 0.08799235026041667, "grad_norm": 8.268048286437988, "learning_rate": 4.907066051099219e-06, "loss": 4.1154, "step": 8650 }, { "epoch": 0.088043212890625, "grad_norm": 10.855912208557129, "learning_rate": 4.906958004119459e-06, "loss": 3.7448, "step": 8655 }, { "epoch": 0.08809407552083333, "grad_norm": 11.688394546508789, "learning_rate": 4.90684989555825e-06, "loss": 3.5907, "step": 8660 }, { "epoch": 0.08814493815104167, "grad_norm": 10.814862251281738, "learning_rate": 4.906741725418357e-06, "loss": 3.0817, "step": 8665 }, { "epoch": 0.08819580078125, "grad_norm": 15.317708969116211, "learning_rate": 4.906633493702547e-06, "loss": 3.3432, "step": 8670 }, { "epoch": 0.08824666341145833, "grad_norm": 9.729673385620117, "learning_rate": 4.9065252004135896e-06, "loss": 3.2816, "step": 8675 }, { "epoch": 0.08829752604166667, "grad_norm": 16.496000289916992, "learning_rate": 4.906416845554255e-06, "loss": 3.511, "step": 8680 }, { "epoch": 0.088348388671875, "grad_norm": 16.304975509643555, "learning_rate": 4.906308429127317e-06, "loss": 3.3911, "step": 8685 }, { "epoch": 0.08839925130208333, "grad_norm": 9.595266342163086, "learning_rate": 4.906199951135547e-06, "loss": 3.5494, "step": 8690 }, { "epoch": 0.08845011393229167, "grad_norm": 13.496864318847656, "learning_rate": 4.906091411581722e-06, "loss": 3.0696, "step": 8695 }, { "epoch": 0.0885009765625, "grad_norm": 9.127147674560547, "learning_rate": 4.905982810468619e-06, "loss": 3.5201, "step": 8700 }, { "epoch": 0.08855183919270833, "grad_norm": 17.793336868286133, "learning_rate": 4.905874147799015e-06, "loss": 3.9616, "step": 8705 }, { "epoch": 0.08860270182291667, "grad_norm": 12.619993209838867, "learning_rate": 4.905765423575692e-06, "loss": 3.6193, "step": 8710 }, { "epoch": 0.088653564453125, "grad_norm": 13.0260648727417, "learning_rate": 4.90565663780143e-06, "loss": 3.1035, "step": 8715 }, { "epoch": 0.08870442708333333, "grad_norm": 11.968487739562988, "learning_rate": 4.905547790479015e-06, "loss": 3.8811, "step": 8720 }, { "epoch": 0.08875528971354167, "grad_norm": 14.09224796295166, "learning_rate": 4.905438881611228e-06, "loss": 3.2689, "step": 8725 }, { "epoch": 0.08880615234375, "grad_norm": 15.479565620422363, "learning_rate": 4.905329911200858e-06, "loss": 3.3083, "step": 8730 }, { "epoch": 0.08885701497395833, "grad_norm": 12.190298080444336, "learning_rate": 4.905220879250693e-06, "loss": 3.0763, "step": 8735 }, { "epoch": 0.08890787760416667, "grad_norm": 13.899861335754395, "learning_rate": 4.905111785763521e-06, "loss": 3.5266, "step": 8740 }, { "epoch": 0.088958740234375, "grad_norm": 11.89075756072998, "learning_rate": 4.905002630742135e-06, "loss": 3.9321, "step": 8745 }, { "epoch": 0.08900960286458333, "grad_norm": 13.489803314208984, "learning_rate": 4.904893414189326e-06, "loss": 3.4766, "step": 8750 }, { "epoch": 0.08906046549479167, "grad_norm": 13.605609893798828, "learning_rate": 4.904784136107888e-06, "loss": 3.1691, "step": 8755 }, { "epoch": 0.089111328125, "grad_norm": 10.71879768371582, "learning_rate": 4.90467479650062e-06, "loss": 3.2934, "step": 8760 }, { "epoch": 0.08916219075520833, "grad_norm": 10.595026016235352, "learning_rate": 4.9045653953703156e-06, "loss": 3.1218, "step": 8765 }, { "epoch": 0.08921305338541667, "grad_norm": 11.80540943145752, "learning_rate": 4.9044559327197764e-06, "loss": 3.2589, "step": 8770 }, { "epoch": 0.089263916015625, "grad_norm": 14.13592529296875, "learning_rate": 4.9043464085518026e-06, "loss": 3.3531, "step": 8775 }, { "epoch": 0.08931477864583333, "grad_norm": 12.386794090270996, "learning_rate": 4.904236822869195e-06, "loss": 3.529, "step": 8780 }, { "epoch": 0.08936564127604167, "grad_norm": 15.09712028503418, "learning_rate": 4.904127175674758e-06, "loss": 3.5851, "step": 8785 }, { "epoch": 0.08941650390625, "grad_norm": 22.025758743286133, "learning_rate": 4.904017466971297e-06, "loss": 3.4247, "step": 8790 }, { "epoch": 0.08946736653645833, "grad_norm": 12.594602584838867, "learning_rate": 4.9039076967616196e-06, "loss": 3.5087, "step": 8795 }, { "epoch": 0.08951822916666667, "grad_norm": 12.804073333740234, "learning_rate": 4.903797865048533e-06, "loss": 3.3348, "step": 8800 }, { "epoch": 0.089569091796875, "grad_norm": 10.385623931884766, "learning_rate": 4.903687971834848e-06, "loss": 3.4176, "step": 8805 }, { "epoch": 0.08961995442708333, "grad_norm": 9.570121765136719, "learning_rate": 4.903578017123376e-06, "loss": 3.3923, "step": 8810 }, { "epoch": 0.08967081705729167, "grad_norm": 12.22430419921875, "learning_rate": 4.90346800091693e-06, "loss": 3.6975, "step": 8815 }, { "epoch": 0.0897216796875, "grad_norm": 14.948432922363281, "learning_rate": 4.9033579232183256e-06, "loss": 3.483, "step": 8820 }, { "epoch": 0.08977254231770833, "grad_norm": 14.922759056091309, "learning_rate": 4.903247784030377e-06, "loss": 3.4358, "step": 8825 }, { "epoch": 0.08982340494791667, "grad_norm": 11.016508102416992, "learning_rate": 4.903137583355905e-06, "loss": 3.4084, "step": 8830 }, { "epoch": 0.089874267578125, "grad_norm": 17.078842163085938, "learning_rate": 4.903027321197726e-06, "loss": 3.5926, "step": 8835 }, { "epoch": 0.08992513020833333, "grad_norm": 13.634777069091797, "learning_rate": 4.902916997558665e-06, "loss": 3.5905, "step": 8840 }, { "epoch": 0.08997599283854167, "grad_norm": 8.89342212677002, "learning_rate": 4.902806612441539e-06, "loss": 3.2723, "step": 8845 }, { "epoch": 0.09002685546875, "grad_norm": 14.009095191955566, "learning_rate": 4.902696165849178e-06, "loss": 3.5151, "step": 8850 }, { "epoch": 0.09007771809895833, "grad_norm": 16.494112014770508, "learning_rate": 4.902585657784404e-06, "loss": 3.229, "step": 8855 }, { "epoch": 0.09012858072916667, "grad_norm": 9.716590881347656, "learning_rate": 4.902475088250045e-06, "loss": 3.4538, "step": 8860 }, { "epoch": 0.090179443359375, "grad_norm": 14.583306312561035, "learning_rate": 4.90236445724893e-06, "loss": 3.6892, "step": 8865 }, { "epoch": 0.09023030598958333, "grad_norm": 11.482134819030762, "learning_rate": 4.902253764783891e-06, "loss": 3.3962, "step": 8870 }, { "epoch": 0.09028116861979167, "grad_norm": 14.702147483825684, "learning_rate": 4.902143010857758e-06, "loss": 3.2805, "step": 8875 }, { "epoch": 0.09033203125, "grad_norm": 16.604808807373047, "learning_rate": 4.902032195473366e-06, "loss": 3.2619, "step": 8880 }, { "epoch": 0.09038289388020833, "grad_norm": 12.936920166015625, "learning_rate": 4.901921318633549e-06, "loss": 3.4328, "step": 8885 }, { "epoch": 0.09043375651041667, "grad_norm": 7.750491142272949, "learning_rate": 4.901810380341145e-06, "loss": 3.7113, "step": 8890 }, { "epoch": 0.090484619140625, "grad_norm": 17.840023040771484, "learning_rate": 4.901699380598992e-06, "loss": 3.1851, "step": 8895 }, { "epoch": 0.09053548177083333, "grad_norm": 9.541340827941895, "learning_rate": 4.901588319409929e-06, "loss": 3.5305, "step": 8900 }, { "epoch": 0.09058634440104167, "grad_norm": 15.900449752807617, "learning_rate": 4.901477196776798e-06, "loss": 3.084, "step": 8905 }, { "epoch": 0.09063720703125, "grad_norm": 11.218430519104004, "learning_rate": 4.901366012702443e-06, "loss": 3.5674, "step": 8910 }, { "epoch": 0.09068806966145833, "grad_norm": 12.750350952148438, "learning_rate": 4.901254767189707e-06, "loss": 3.8215, "step": 8915 }, { "epoch": 0.09073893229166667, "grad_norm": 13.617959976196289, "learning_rate": 4.901143460241437e-06, "loss": 3.3986, "step": 8920 }, { "epoch": 0.090789794921875, "grad_norm": 13.11312484741211, "learning_rate": 4.90103209186048e-06, "loss": 3.453, "step": 8925 }, { "epoch": 0.09084065755208333, "grad_norm": 9.750372886657715, "learning_rate": 4.9009206620496875e-06, "loss": 3.348, "step": 8930 }, { "epoch": 0.09089152018229167, "grad_norm": 13.289474487304688, "learning_rate": 4.900809170811908e-06, "loss": 3.284, "step": 8935 }, { "epoch": 0.0909423828125, "grad_norm": 10.578423500061035, "learning_rate": 4.900697618149995e-06, "loss": 3.8172, "step": 8940 }, { "epoch": 0.09099324544270833, "grad_norm": 14.269426345825195, "learning_rate": 4.900586004066803e-06, "loss": 3.8127, "step": 8945 }, { "epoch": 0.09104410807291667, "grad_norm": 11.517721176147461, "learning_rate": 4.900474328565186e-06, "loss": 3.2798, "step": 8950 }, { "epoch": 0.091094970703125, "grad_norm": 12.702051162719727, "learning_rate": 4.900362591648003e-06, "loss": 3.1521, "step": 8955 }, { "epoch": 0.09114583333333333, "grad_norm": 13.737582206726074, "learning_rate": 4.900250793318112e-06, "loss": 3.3586, "step": 8960 }, { "epoch": 0.09119669596354167, "grad_norm": 14.892784118652344, "learning_rate": 4.900138933578373e-06, "loss": 3.4807, "step": 8965 }, { "epoch": 0.09124755859375, "grad_norm": 11.482332229614258, "learning_rate": 4.9000270124316495e-06, "loss": 3.0206, "step": 8970 }, { "epoch": 0.09129842122395833, "grad_norm": 13.81802749633789, "learning_rate": 4.899915029880803e-06, "loss": 3.3766, "step": 8975 }, { "epoch": 0.09134928385416667, "grad_norm": 11.666979789733887, "learning_rate": 4.899802985928699e-06, "loss": 3.2013, "step": 8980 }, { "epoch": 0.091400146484375, "grad_norm": 14.300288200378418, "learning_rate": 4.899690880578205e-06, "loss": 3.1857, "step": 8985 }, { "epoch": 0.09145100911458333, "grad_norm": 9.281865119934082, "learning_rate": 4.899578713832188e-06, "loss": 3.4702, "step": 8990 }, { "epoch": 0.09150187174479167, "grad_norm": 13.91549015045166, "learning_rate": 4.899466485693518e-06, "loss": 3.9119, "step": 8995 }, { "epoch": 0.091552734375, "grad_norm": 8.730186462402344, "learning_rate": 4.899354196165068e-06, "loss": 3.136, "step": 9000 }, { "epoch": 0.09160359700520833, "grad_norm": 9.029413223266602, "learning_rate": 4.899241845249708e-06, "loss": 3.6015, "step": 9005 }, { "epoch": 0.09165445963541667, "grad_norm": 10.518607139587402, "learning_rate": 4.899129432950316e-06, "loss": 3.5339, "step": 9010 }, { "epoch": 0.091705322265625, "grad_norm": 13.978117942810059, "learning_rate": 4.899016959269764e-06, "loss": 3.6865, "step": 9015 }, { "epoch": 0.09175618489583333, "grad_norm": 13.56008243560791, "learning_rate": 4.898904424210934e-06, "loss": 3.1692, "step": 9020 }, { "epoch": 0.09180704752604167, "grad_norm": 11.420557975769043, "learning_rate": 4.898791827776701e-06, "loss": 3.3184, "step": 9025 }, { "epoch": 0.09185791015625, "grad_norm": 11.740954399108887, "learning_rate": 4.898679169969949e-06, "loss": 3.5797, "step": 9030 }, { "epoch": 0.09190877278645833, "grad_norm": 11.412757873535156, "learning_rate": 4.898566450793558e-06, "loss": 3.4047, "step": 9035 }, { "epoch": 0.09195963541666667, "grad_norm": 7.887825965881348, "learning_rate": 4.898453670250413e-06, "loss": 3.3213, "step": 9040 }, { "epoch": 0.092010498046875, "grad_norm": 10.620329856872559, "learning_rate": 4.8983408283433995e-06, "loss": 3.3956, "step": 9045 }, { "epoch": 0.09206136067708333, "grad_norm": 10.935351371765137, "learning_rate": 4.898227925075405e-06, "loss": 3.1719, "step": 9050 }, { "epoch": 0.09211222330729167, "grad_norm": 13.198664665222168, "learning_rate": 4.898114960449317e-06, "loss": 3.2274, "step": 9055 }, { "epoch": 0.0921630859375, "grad_norm": 7.661378383636475, "learning_rate": 4.8980019344680255e-06, "loss": 3.4533, "step": 9060 }, { "epoch": 0.09221394856770833, "grad_norm": 14.63359546661377, "learning_rate": 4.897888847134424e-06, "loss": 3.2218, "step": 9065 }, { "epoch": 0.09226481119791667, "grad_norm": 10.786942481994629, "learning_rate": 4.897775698451404e-06, "loss": 3.0604, "step": 9070 }, { "epoch": 0.092315673828125, "grad_norm": 13.290276527404785, "learning_rate": 4.897662488421861e-06, "loss": 3.4856, "step": 9075 }, { "epoch": 0.09236653645833333, "grad_norm": 13.736598014831543, "learning_rate": 4.897549217048692e-06, "loss": 3.4483, "step": 9080 }, { "epoch": 0.09241739908854167, "grad_norm": 13.418255805969238, "learning_rate": 4.897435884334795e-06, "loss": 3.3985, "step": 9085 }, { "epoch": 0.09246826171875, "grad_norm": 11.04183578491211, "learning_rate": 4.897322490283069e-06, "loss": 3.746, "step": 9090 }, { "epoch": 0.09251912434895833, "grad_norm": 14.375475883483887, "learning_rate": 4.897209034896414e-06, "loss": 3.3322, "step": 9095 }, { "epoch": 0.09256998697916667, "grad_norm": 12.415508270263672, "learning_rate": 4.897095518177735e-06, "loss": 3.221, "step": 9100 }, { "epoch": 0.092620849609375, "grad_norm": 13.276843070983887, "learning_rate": 4.896981940129935e-06, "loss": 3.1526, "step": 9105 }, { "epoch": 0.09267171223958333, "grad_norm": 16.24017333984375, "learning_rate": 4.8968683007559204e-06, "loss": 3.1622, "step": 9110 }, { "epoch": 0.09272257486979167, "grad_norm": 13.676992416381836, "learning_rate": 4.8967546000585985e-06, "loss": 3.4367, "step": 9115 }, { "epoch": 0.0927734375, "grad_norm": 12.518017768859863, "learning_rate": 4.896640838040878e-06, "loss": 3.149, "step": 9120 }, { "epoch": 0.09282430013020833, "grad_norm": 9.173798561096191, "learning_rate": 4.89652701470567e-06, "loss": 3.4939, "step": 9125 }, { "epoch": 0.09287516276041667, "grad_norm": 9.206432342529297, "learning_rate": 4.896413130055887e-06, "loss": 3.2798, "step": 9130 }, { "epoch": 0.092926025390625, "grad_norm": 10.400619506835938, "learning_rate": 4.896299184094441e-06, "loss": 3.5009, "step": 9135 }, { "epoch": 0.09297688802083333, "grad_norm": 14.5686616897583, "learning_rate": 4.896185176824249e-06, "loss": 3.0607, "step": 9140 }, { "epoch": 0.09302775065104167, "grad_norm": 15.614046096801758, "learning_rate": 4.8960711082482275e-06, "loss": 3.5764, "step": 9145 }, { "epoch": 0.09307861328125, "grad_norm": 11.736412048339844, "learning_rate": 4.895956978369294e-06, "loss": 3.2503, "step": 9150 }, { "epoch": 0.09312947591145833, "grad_norm": 8.38943862915039, "learning_rate": 4.895842787190369e-06, "loss": 3.5593, "step": 9155 }, { "epoch": 0.09318033854166667, "grad_norm": 10.80936050415039, "learning_rate": 4.895728534714375e-06, "loss": 3.7306, "step": 9160 }, { "epoch": 0.093231201171875, "grad_norm": 8.9891939163208, "learning_rate": 4.895614220944233e-06, "loss": 3.5101, "step": 9165 }, { "epoch": 0.09328206380208333, "grad_norm": 10.55257511138916, "learning_rate": 4.895499845882869e-06, "loss": 3.642, "step": 9170 }, { "epoch": 0.09333292643229167, "grad_norm": 11.787149429321289, "learning_rate": 4.895385409533211e-06, "loss": 3.3967, "step": 9175 }, { "epoch": 0.0933837890625, "grad_norm": 14.418353080749512, "learning_rate": 4.895270911898183e-06, "loss": 3.3822, "step": 9180 }, { "epoch": 0.09343465169270833, "grad_norm": 10.53858470916748, "learning_rate": 4.895156352980718e-06, "loss": 3.5411, "step": 9185 }, { "epoch": 0.09348551432291667, "grad_norm": 14.017937660217285, "learning_rate": 4.895041732783745e-06, "loss": 3.2294, "step": 9190 }, { "epoch": 0.093536376953125, "grad_norm": 9.317304611206055, "learning_rate": 4.8949270513101965e-06, "loss": 3.0859, "step": 9195 }, { "epoch": 0.09358723958333333, "grad_norm": 14.010085105895996, "learning_rate": 4.894812308563007e-06, "loss": 3.3879, "step": 9200 }, { "epoch": 0.09363810221354167, "grad_norm": 14.181611061096191, "learning_rate": 4.8946975045451125e-06, "loss": 3.4383, "step": 9205 }, { "epoch": 0.09368896484375, "grad_norm": 17.099210739135742, "learning_rate": 4.894582639259451e-06, "loss": 3.2286, "step": 9210 }, { "epoch": 0.09373982747395833, "grad_norm": 15.619363784790039, "learning_rate": 4.894467712708959e-06, "loss": 3.5737, "step": 9215 }, { "epoch": 0.09379069010416667, "grad_norm": 8.870353698730469, "learning_rate": 4.8943527248965786e-06, "loss": 3.2734, "step": 9220 }, { "epoch": 0.093841552734375, "grad_norm": 8.589174270629883, "learning_rate": 4.894237675825251e-06, "loss": 3.5581, "step": 9225 }, { "epoch": 0.09389241536458333, "grad_norm": 16.112144470214844, "learning_rate": 4.89412256549792e-06, "loss": 3.4103, "step": 9230 }, { "epoch": 0.09394327799479167, "grad_norm": 10.410552978515625, "learning_rate": 4.89400739391753e-06, "loss": 3.4895, "step": 9235 }, { "epoch": 0.093994140625, "grad_norm": 13.545750617980957, "learning_rate": 4.89389216108703e-06, "loss": 3.2598, "step": 9240 }, { "epoch": 0.09404500325520833, "grad_norm": 18.490036010742188, "learning_rate": 4.893776867009365e-06, "loss": 3.3705, "step": 9245 }, { "epoch": 0.09409586588541667, "grad_norm": 12.499258041381836, "learning_rate": 4.893661511687487e-06, "loss": 3.3089, "step": 9250 }, { "epoch": 0.094146728515625, "grad_norm": 46.41202163696289, "learning_rate": 4.893546095124346e-06, "loss": 3.46, "step": 9255 }, { "epoch": 0.09419759114583333, "grad_norm": 13.263053894042969, "learning_rate": 4.893430617322895e-06, "loss": 3.4889, "step": 9260 }, { "epoch": 0.09424845377604167, "grad_norm": 7.173794269561768, "learning_rate": 4.8933150782860905e-06, "loss": 3.4028, "step": 9265 }, { "epoch": 0.09429931640625, "grad_norm": 15.863984107971191, "learning_rate": 4.893199478016886e-06, "loss": 3.1168, "step": 9270 }, { "epoch": 0.09435017903645833, "grad_norm": 9.970428466796875, "learning_rate": 4.8930838165182405e-06, "loss": 3.3857, "step": 9275 }, { "epoch": 0.09440104166666667, "grad_norm": 6.910197734832764, "learning_rate": 4.892968093793112e-06, "loss": 3.6812, "step": 9280 }, { "epoch": 0.094451904296875, "grad_norm": 12.154581069946289, "learning_rate": 4.892852309844462e-06, "loss": 3.459, "step": 9285 }, { "epoch": 0.09450276692708333, "grad_norm": 13.089887619018555, "learning_rate": 4.892736464675254e-06, "loss": 3.75, "step": 9290 }, { "epoch": 0.09455362955729167, "grad_norm": 12.283695220947266, "learning_rate": 4.89262055828845e-06, "loss": 3.5142, "step": 9295 }, { "epoch": 0.0946044921875, "grad_norm": 16.34111976623535, "learning_rate": 4.892504590687016e-06, "loss": 3.4612, "step": 9300 }, { "epoch": 0.09465535481770833, "grad_norm": 14.332940101623535, "learning_rate": 4.89238856187392e-06, "loss": 2.9819, "step": 9305 }, { "epoch": 0.09470621744791667, "grad_norm": 9.912870407104492, "learning_rate": 4.892272471852128e-06, "loss": 3.3375, "step": 9310 }, { "epoch": 0.094757080078125, "grad_norm": 18.02761459350586, "learning_rate": 4.892156320624613e-06, "loss": 3.5184, "step": 9315 }, { "epoch": 0.09480794270833333, "grad_norm": 14.656006813049316, "learning_rate": 4.892040108194346e-06, "loss": 3.7168, "step": 9320 }, { "epoch": 0.09485880533854167, "grad_norm": 14.194197654724121, "learning_rate": 4.8919238345643e-06, "loss": 3.7077, "step": 9325 }, { "epoch": 0.09490966796875, "grad_norm": 14.731765747070312, "learning_rate": 4.891807499737449e-06, "loss": 3.526, "step": 9330 }, { "epoch": 0.09496053059895833, "grad_norm": 11.650579452514648, "learning_rate": 4.891691103716769e-06, "loss": 3.2138, "step": 9335 }, { "epoch": 0.09501139322916667, "grad_norm": 8.169177055358887, "learning_rate": 4.89157464650524e-06, "loss": 3.4209, "step": 9340 }, { "epoch": 0.095062255859375, "grad_norm": 13.967549324035645, "learning_rate": 4.89145812810584e-06, "loss": 3.523, "step": 9345 }, { "epoch": 0.09511311848958333, "grad_norm": 15.903494834899902, "learning_rate": 4.891341548521552e-06, "loss": 3.5998, "step": 9350 }, { "epoch": 0.09516398111979167, "grad_norm": 18.27981185913086, "learning_rate": 4.8912249077553566e-06, "loss": 4.1232, "step": 9355 }, { "epoch": 0.09521484375, "grad_norm": 12.90363597869873, "learning_rate": 4.8911082058102375e-06, "loss": 3.6707, "step": 9360 }, { "epoch": 0.09526570638020833, "grad_norm": 15.968925476074219, "learning_rate": 4.890991442689184e-06, "loss": 3.1965, "step": 9365 }, { "epoch": 0.09531656901041667, "grad_norm": 12.801103591918945, "learning_rate": 4.890874618395179e-06, "loss": 3.2281, "step": 9370 }, { "epoch": 0.095367431640625, "grad_norm": 9.642354011535645, "learning_rate": 4.890757732931215e-06, "loss": 3.6309, "step": 9375 }, { "epoch": 0.09541829427083333, "grad_norm": 18.670930862426758, "learning_rate": 4.8906407863002805e-06, "loss": 3.7841, "step": 9380 }, { "epoch": 0.09546915690104167, "grad_norm": 12.024273872375488, "learning_rate": 4.8905237785053675e-06, "loss": 3.3917, "step": 9385 }, { "epoch": 0.09552001953125, "grad_norm": 8.89647102355957, "learning_rate": 4.8904067095494714e-06, "loss": 3.703, "step": 9390 }, { "epoch": 0.09557088216145833, "grad_norm": 11.106717109680176, "learning_rate": 4.890289579435585e-06, "loss": 3.2822, "step": 9395 }, { "epoch": 0.09562174479166667, "grad_norm": 14.37429428100586, "learning_rate": 4.8901723881667075e-06, "loss": 3.3098, "step": 9400 }, { "epoch": 0.095672607421875, "grad_norm": 12.311078071594238, "learning_rate": 4.890055135745835e-06, "loss": 3.1674, "step": 9405 }, { "epoch": 0.09572347005208333, "grad_norm": 13.66702938079834, "learning_rate": 4.88993782217597e-06, "loss": 3.0287, "step": 9410 }, { "epoch": 0.09577433268229167, "grad_norm": 11.145536422729492, "learning_rate": 4.889820447460111e-06, "loss": 3.3768, "step": 9415 }, { "epoch": 0.0958251953125, "grad_norm": 12.304450988769531, "learning_rate": 4.889703011601262e-06, "loss": 3.5019, "step": 9420 }, { "epoch": 0.09587605794270833, "grad_norm": 9.890800476074219, "learning_rate": 4.889585514602429e-06, "loss": 3.1514, "step": 9425 }, { "epoch": 0.09592692057291667, "grad_norm": 9.260703086853027, "learning_rate": 4.889467956466616e-06, "loss": 3.4432, "step": 9430 }, { "epoch": 0.095977783203125, "grad_norm": 13.874801635742188, "learning_rate": 4.889350337196832e-06, "loss": 3.2378, "step": 9435 }, { "epoch": 0.09602864583333333, "grad_norm": 13.591817855834961, "learning_rate": 4.889232656796086e-06, "loss": 3.4383, "step": 9440 }, { "epoch": 0.09607950846354167, "grad_norm": 9.10405445098877, "learning_rate": 4.8891149152673875e-06, "loss": 3.4487, "step": 9445 }, { "epoch": 0.09613037109375, "grad_norm": 8.167961120605469, "learning_rate": 4.888997112613752e-06, "loss": 3.6359, "step": 9450 }, { "epoch": 0.09618123372395833, "grad_norm": 14.467639923095703, "learning_rate": 4.888879248838191e-06, "loss": 3.1424, "step": 9455 }, { "epoch": 0.09623209635416667, "grad_norm": 12.446633338928223, "learning_rate": 4.888761323943721e-06, "loss": 3.5426, "step": 9460 }, { "epoch": 0.096282958984375, "grad_norm": 11.570879936218262, "learning_rate": 4.888643337933358e-06, "loss": 3.5324, "step": 9465 }, { "epoch": 0.09633382161458333, "grad_norm": 13.078673362731934, "learning_rate": 4.8885252908101226e-06, "loss": 3.3334, "step": 9470 }, { "epoch": 0.09638468424479167, "grad_norm": 10.936042785644531, "learning_rate": 4.888407182577032e-06, "loss": 3.2394, "step": 9475 }, { "epoch": 0.096435546875, "grad_norm": 13.004395484924316, "learning_rate": 4.888289013237112e-06, "loss": 3.3966, "step": 9480 }, { "epoch": 0.09648640950520833, "grad_norm": 11.469902992248535, "learning_rate": 4.888170782793382e-06, "loss": 3.717, "step": 9485 }, { "epoch": 0.09653727213541667, "grad_norm": 14.799452781677246, "learning_rate": 4.888052491248869e-06, "loss": 3.4483, "step": 9490 }, { "epoch": 0.096588134765625, "grad_norm": 10.995068550109863, "learning_rate": 4.887934138606599e-06, "loss": 3.3911, "step": 9495 }, { "epoch": 0.09663899739583333, "grad_norm": 13.323307037353516, "learning_rate": 4.8878157248696e-06, "loss": 3.749, "step": 9500 }, { "epoch": 0.09668986002604167, "grad_norm": 20.117530822753906, "learning_rate": 4.887697250040901e-06, "loss": 3.5721, "step": 9505 }, { "epoch": 0.09674072265625, "grad_norm": 10.738547325134277, "learning_rate": 4.887578714123536e-06, "loss": 3.3881, "step": 9510 }, { "epoch": 0.09679158528645833, "grad_norm": 12.129579544067383, "learning_rate": 4.887460117120533e-06, "loss": 3.5233, "step": 9515 }, { "epoch": 0.09684244791666667, "grad_norm": 14.902997970581055, "learning_rate": 4.88734145903493e-06, "loss": 3.5717, "step": 9520 }, { "epoch": 0.096893310546875, "grad_norm": 11.737396240234375, "learning_rate": 4.887222739869761e-06, "loss": 3.175, "step": 9525 }, { "epoch": 0.09694417317708333, "grad_norm": 6.6605730056762695, "learning_rate": 4.8871039596280654e-06, "loss": 3.4191, "step": 9530 }, { "epoch": 0.09699503580729167, "grad_norm": 11.67459487915039, "learning_rate": 4.88698511831288e-06, "loss": 3.4298, "step": 9535 }, { "epoch": 0.0970458984375, "grad_norm": 7.061823844909668, "learning_rate": 4.886866215927246e-06, "loss": 3.4257, "step": 9540 }, { "epoch": 0.09709676106770833, "grad_norm": 14.894327163696289, "learning_rate": 4.8867472524742055e-06, "loss": 3.4476, "step": 9545 }, { "epoch": 0.09714762369791667, "grad_norm": 14.296971321105957, "learning_rate": 4.8866282279568024e-06, "loss": 3.7858, "step": 9550 }, { "epoch": 0.097198486328125, "grad_norm": 12.031661033630371, "learning_rate": 4.886509142378082e-06, "loss": 3.5772, "step": 9555 }, { "epoch": 0.09724934895833333, "grad_norm": 13.860501289367676, "learning_rate": 4.88638999574109e-06, "loss": 3.259, "step": 9560 }, { "epoch": 0.09730021158854167, "grad_norm": 12.739351272583008, "learning_rate": 4.886270788048877e-06, "loss": 3.358, "step": 9565 }, { "epoch": 0.09735107421875, "grad_norm": 8.996005058288574, "learning_rate": 4.8861515193044905e-06, "loss": 3.0429, "step": 9570 }, { "epoch": 0.09740193684895833, "grad_norm": 10.95101261138916, "learning_rate": 4.886032189510983e-06, "loss": 3.8154, "step": 9575 }, { "epoch": 0.09745279947916667, "grad_norm": 13.15895938873291, "learning_rate": 4.885912798671408e-06, "loss": 3.3369, "step": 9580 }, { "epoch": 0.097503662109375, "grad_norm": 13.110836029052734, "learning_rate": 4.885793346788819e-06, "loss": 3.5893, "step": 9585 }, { "epoch": 0.09755452473958333, "grad_norm": 13.61912727355957, "learning_rate": 4.885673833866273e-06, "loss": 3.3841, "step": 9590 }, { "epoch": 0.09760538736979167, "grad_norm": 8.887752532958984, "learning_rate": 4.885554259906827e-06, "loss": 3.7583, "step": 9595 }, { "epoch": 0.09765625, "grad_norm": 14.288346290588379, "learning_rate": 4.885434624913541e-06, "loss": 3.0627, "step": 9600 }, { "epoch": 0.09770711263020833, "grad_norm": 13.13482666015625, "learning_rate": 4.8853149288894765e-06, "loss": 3.6814, "step": 9605 }, { "epoch": 0.09775797526041667, "grad_norm": 10.950799942016602, "learning_rate": 4.885195171837694e-06, "loss": 3.0417, "step": 9610 }, { "epoch": 0.097808837890625, "grad_norm": 15.985264778137207, "learning_rate": 4.885075353761258e-06, "loss": 3.4947, "step": 9615 }, { "epoch": 0.09785970052083333, "grad_norm": 11.092909812927246, "learning_rate": 4.884955474663235e-06, "loss": 3.6053, "step": 9620 }, { "epoch": 0.09791056315104167, "grad_norm": 10.567216873168945, "learning_rate": 4.884835534546692e-06, "loss": 3.5717, "step": 9625 }, { "epoch": 0.09796142578125, "grad_norm": 16.9993896484375, "learning_rate": 4.884715533414696e-06, "loss": 3.8297, "step": 9630 }, { "epoch": 0.09801228841145833, "grad_norm": 9.653956413269043, "learning_rate": 4.884595471270319e-06, "loss": 3.2011, "step": 9635 }, { "epoch": 0.09806315104166667, "grad_norm": 7.467728137969971, "learning_rate": 4.884475348116631e-06, "loss": 3.3446, "step": 9640 }, { "epoch": 0.098114013671875, "grad_norm": 16.218908309936523, "learning_rate": 4.884355163956708e-06, "loss": 3.4298, "step": 9645 }, { "epoch": 0.09816487630208333, "grad_norm": 8.244502067565918, "learning_rate": 4.884234918793622e-06, "loss": 3.3024, "step": 9650 }, { "epoch": 0.09821573893229167, "grad_norm": 13.385653495788574, "learning_rate": 4.884114612630451e-06, "loss": 3.2548, "step": 9655 }, { "epoch": 0.0982666015625, "grad_norm": 12.53419303894043, "learning_rate": 4.883994245470274e-06, "loss": 3.0654, "step": 9660 }, { "epoch": 0.09831746419270833, "grad_norm": 7.7406005859375, "learning_rate": 4.883873817316168e-06, "loss": 3.7867, "step": 9665 }, { "epoch": 0.09836832682291667, "grad_norm": 16.32261085510254, "learning_rate": 4.883753328171216e-06, "loss": 3.5244, "step": 9670 }, { "epoch": 0.098419189453125, "grad_norm": 14.094369888305664, "learning_rate": 4.8836327780385e-06, "loss": 3.6854, "step": 9675 }, { "epoch": 0.09847005208333333, "grad_norm": 12.182512283325195, "learning_rate": 4.883512166921104e-06, "loss": 3.5938, "step": 9680 }, { "epoch": 0.09852091471354167, "grad_norm": 11.643973350524902, "learning_rate": 4.883391494822114e-06, "loss": 3.3795, "step": 9685 }, { "epoch": 0.09857177734375, "grad_norm": 13.109829902648926, "learning_rate": 4.883270761744617e-06, "loss": 3.4673, "step": 9690 }, { "epoch": 0.09862263997395833, "grad_norm": 9.588884353637695, "learning_rate": 4.883149967691704e-06, "loss": 3.4358, "step": 9695 }, { "epoch": 0.09867350260416667, "grad_norm": 16.999698638916016, "learning_rate": 4.883029112666463e-06, "loss": 3.4918, "step": 9700 }, { "epoch": 0.098724365234375, "grad_norm": 14.010063171386719, "learning_rate": 4.882908196671987e-06, "loss": 3.5411, "step": 9705 }, { "epoch": 0.09877522786458333, "grad_norm": 14.857710838317871, "learning_rate": 4.88278721971137e-06, "loss": 3.4874, "step": 9710 }, { "epoch": 0.09882609049479167, "grad_norm": 12.407607078552246, "learning_rate": 4.882666181787707e-06, "loss": 3.2071, "step": 9715 }, { "epoch": 0.098876953125, "grad_norm": 12.295507431030273, "learning_rate": 4.882545082904094e-06, "loss": 3.3899, "step": 9720 }, { "epoch": 0.09892781575520833, "grad_norm": 12.28351879119873, "learning_rate": 4.88242392306363e-06, "loss": 3.8177, "step": 9725 }, { "epoch": 0.09897867838541667, "grad_norm": 8.689821243286133, "learning_rate": 4.882302702269415e-06, "loss": 3.5002, "step": 9730 }, { "epoch": 0.099029541015625, "grad_norm": 10.74190616607666, "learning_rate": 4.882181420524548e-06, "loss": 3.1903, "step": 9735 }, { "epoch": 0.09908040364583333, "grad_norm": 11.707921981811523, "learning_rate": 4.882060077832137e-06, "loss": 3.2921, "step": 9740 }, { "epoch": 0.09913126627604167, "grad_norm": 13.249473571777344, "learning_rate": 4.881938674195282e-06, "loss": 3.3386, "step": 9745 }, { "epoch": 0.09918212890625, "grad_norm": 16.1980037689209, "learning_rate": 4.88181720961709e-06, "loss": 3.5874, "step": 9750 }, { "epoch": 0.09923299153645833, "grad_norm": 14.430301666259766, "learning_rate": 4.88169568410067e-06, "loss": 3.2323, "step": 9755 }, { "epoch": 0.09928385416666667, "grad_norm": 8.47429084777832, "learning_rate": 4.881574097649131e-06, "loss": 3.2952, "step": 9760 }, { "epoch": 0.099334716796875, "grad_norm": 8.88611888885498, "learning_rate": 4.881452450265583e-06, "loss": 3.3712, "step": 9765 }, { "epoch": 0.09938557942708333, "grad_norm": 10.999044418334961, "learning_rate": 4.881330741953137e-06, "loss": 3.504, "step": 9770 }, { "epoch": 0.09943644205729167, "grad_norm": 13.003782272338867, "learning_rate": 4.88120897271491e-06, "loss": 3.8129, "step": 9775 }, { "epoch": 0.0994873046875, "grad_norm": 11.573512077331543, "learning_rate": 4.881087142554015e-06, "loss": 3.3195, "step": 9780 }, { "epoch": 0.09953816731770833, "grad_norm": 16.704687118530273, "learning_rate": 4.880965251473571e-06, "loss": 3.0435, "step": 9785 }, { "epoch": 0.09958902994791667, "grad_norm": 13.198221206665039, "learning_rate": 4.8808432994766944e-06, "loss": 3.2013, "step": 9790 }, { "epoch": 0.099639892578125, "grad_norm": 9.851302146911621, "learning_rate": 4.880721286566506e-06, "loss": 3.3261, "step": 9795 }, { "epoch": 0.09969075520833333, "grad_norm": 14.158812522888184, "learning_rate": 4.880599212746128e-06, "loss": 3.2733, "step": 9800 }, { "epoch": 0.09974161783854167, "grad_norm": 15.338472366333008, "learning_rate": 4.880477078018684e-06, "loss": 3.3534, "step": 9805 }, { "epoch": 0.09979248046875, "grad_norm": 14.599812507629395, "learning_rate": 4.8803548823872985e-06, "loss": 3.6244, "step": 9810 }, { "epoch": 0.09984334309895833, "grad_norm": 11.007782936096191, "learning_rate": 4.880232625855096e-06, "loss": 3.0961, "step": 9815 }, { "epoch": 0.09989420572916667, "grad_norm": 13.516772270202637, "learning_rate": 4.880110308425207e-06, "loss": 3.3936, "step": 9820 }, { "epoch": 0.099945068359375, "grad_norm": 7.3785905838012695, "learning_rate": 4.8799879301007596e-06, "loss": 3.7464, "step": 9825 }, { "epoch": 0.09999593098958333, "grad_norm": 38.93828582763672, "learning_rate": 4.879865490884886e-06, "loss": 3.5595, "step": 9830 }, { "epoch": 0.10004679361979167, "grad_norm": 14.068145751953125, "learning_rate": 4.879742990780717e-06, "loss": 3.5197, "step": 9835 }, { "epoch": 0.10009765625, "grad_norm": 11.864350318908691, "learning_rate": 4.879620429791387e-06, "loss": 3.601, "step": 9840 }, { "epoch": 0.10014851888020833, "grad_norm": 15.89621639251709, "learning_rate": 4.879497807920034e-06, "loss": 3.2539, "step": 9845 }, { "epoch": 0.10019938151041667, "grad_norm": 11.07425308227539, "learning_rate": 4.8793751251697925e-06, "loss": 3.5963, "step": 9850 }, { "epoch": 0.100250244140625, "grad_norm": 10.702958106994629, "learning_rate": 4.879252381543803e-06, "loss": 3.6547, "step": 9855 }, { "epoch": 0.10030110677083333, "grad_norm": 11.077383995056152, "learning_rate": 4.879129577045204e-06, "loss": 3.3366, "step": 9860 }, { "epoch": 0.10035196940104167, "grad_norm": 11.844645500183105, "learning_rate": 4.87900671167714e-06, "loss": 3.377, "step": 9865 }, { "epoch": 0.10040283203125, "grad_norm": 12.209272384643555, "learning_rate": 4.8788837854427525e-06, "loss": 4.0542, "step": 9870 }, { "epoch": 0.10045369466145833, "grad_norm": 12.173541069030762, "learning_rate": 4.878760798345188e-06, "loss": 3.5212, "step": 9875 }, { "epoch": 0.10050455729166667, "grad_norm": 12.681471824645996, "learning_rate": 4.878637750387591e-06, "loss": 3.8139, "step": 9880 }, { "epoch": 0.100555419921875, "grad_norm": 8.942806243896484, "learning_rate": 4.878514641573112e-06, "loss": 3.5788, "step": 9885 }, { "epoch": 0.10060628255208333, "grad_norm": 11.298273086547852, "learning_rate": 4.8783914719048995e-06, "loss": 3.3029, "step": 9890 }, { "epoch": 0.10065714518229167, "grad_norm": 9.567761421203613, "learning_rate": 4.8782682413861046e-06, "loss": 3.7846, "step": 9895 }, { "epoch": 0.1007080078125, "grad_norm": 10.052159309387207, "learning_rate": 4.8781449500198804e-06, "loss": 3.3519, "step": 9900 }, { "epoch": 0.10075887044270833, "grad_norm": 16.80272674560547, "learning_rate": 4.878021597809382e-06, "loss": 3.7498, "step": 9905 }, { "epoch": 0.10080973307291667, "grad_norm": 11.9678316116333, "learning_rate": 4.877898184757765e-06, "loss": 3.7687, "step": 9910 }, { "epoch": 0.100860595703125, "grad_norm": 14.82444953918457, "learning_rate": 4.877774710868185e-06, "loss": 3.4121, "step": 9915 }, { "epoch": 0.10091145833333333, "grad_norm": 12.413556098937988, "learning_rate": 4.877651176143804e-06, "loss": 3.5701, "step": 9920 }, { "epoch": 0.10096232096354167, "grad_norm": 16.419466018676758, "learning_rate": 4.877527580587781e-06, "loss": 3.5236, "step": 9925 }, { "epoch": 0.10101318359375, "grad_norm": 15.198001861572266, "learning_rate": 4.877403924203278e-06, "loss": 3.4151, "step": 9930 }, { "epoch": 0.10106404622395833, "grad_norm": 11.96876335144043, "learning_rate": 4.877280206993459e-06, "loss": 3.1713, "step": 9935 }, { "epoch": 0.10111490885416667, "grad_norm": 12.78167724609375, "learning_rate": 4.8771564289614895e-06, "loss": 3.3529, "step": 9940 }, { "epoch": 0.101165771484375, "grad_norm": 14.186490058898926, "learning_rate": 4.877032590110536e-06, "loss": 3.8994, "step": 9945 }, { "epoch": 0.10121663411458333, "grad_norm": 9.95617389678955, "learning_rate": 4.876908690443767e-06, "loss": 3.402, "step": 9950 }, { "epoch": 0.10126749674479167, "grad_norm": 14.145368576049805, "learning_rate": 4.876784729964353e-06, "loss": 3.5655, "step": 9955 }, { "epoch": 0.101318359375, "grad_norm": 10.90357780456543, "learning_rate": 4.876660708675465e-06, "loss": 3.4442, "step": 9960 }, { "epoch": 0.10136922200520833, "grad_norm": 13.902193069458008, "learning_rate": 4.876536626580276e-06, "loss": 3.3617, "step": 9965 }, { "epoch": 0.10142008463541667, "grad_norm": 8.821534156799316, "learning_rate": 4.876412483681961e-06, "loss": 3.445, "step": 9970 }, { "epoch": 0.101470947265625, "grad_norm": 11.418331146240234, "learning_rate": 4.8762882799836955e-06, "loss": 3.4656, "step": 9975 }, { "epoch": 0.10152180989583333, "grad_norm": 11.632497787475586, "learning_rate": 4.876164015488658e-06, "loss": 3.2188, "step": 9980 }, { "epoch": 0.10157267252604167, "grad_norm": 10.307924270629883, "learning_rate": 4.876039690200027e-06, "loss": 3.5271, "step": 9985 }, { "epoch": 0.10162353515625, "grad_norm": 11.517049789428711, "learning_rate": 4.875915304120984e-06, "loss": 3.5669, "step": 9990 }, { "epoch": 0.10167439778645833, "grad_norm": 9.514555931091309, "learning_rate": 4.875790857254711e-06, "loss": 3.3767, "step": 9995 }, { "epoch": 0.10172526041666667, "grad_norm": 13.482763290405273, "learning_rate": 4.875666349604392e-06, "loss": 3.5474, "step": 10000 }, { "epoch": 0.101776123046875, "grad_norm": 8.935997009277344, "learning_rate": 4.875541781173212e-06, "loss": 3.4632, "step": 10005 }, { "epoch": 0.10182698567708333, "grad_norm": 8.468302726745605, "learning_rate": 4.875417151964359e-06, "loss": 3.4322, "step": 10010 }, { "epoch": 0.10187784830729167, "grad_norm": 12.212846755981445, "learning_rate": 4.875292461981022e-06, "loss": 4.2413, "step": 10015 }, { "epoch": 0.1019287109375, "grad_norm": 11.472529411315918, "learning_rate": 4.87516771122639e-06, "loss": 3.2805, "step": 10020 }, { "epoch": 0.10197957356770833, "grad_norm": 13.230598449707031, "learning_rate": 4.875042899703654e-06, "loss": 3.4256, "step": 10025 }, { "epoch": 0.10203043619791667, "grad_norm": 13.139010429382324, "learning_rate": 4.874918027416009e-06, "loss": 3.5081, "step": 10030 }, { "epoch": 0.102081298828125, "grad_norm": 6.4391703605651855, "learning_rate": 4.874793094366649e-06, "loss": 3.3638, "step": 10035 }, { "epoch": 0.10213216145833333, "grad_norm": 15.451169967651367, "learning_rate": 4.8746681005587715e-06, "loss": 3.6243, "step": 10040 }, { "epoch": 0.10218302408854167, "grad_norm": 11.433987617492676, "learning_rate": 4.874543045995572e-06, "loss": 3.6848, "step": 10045 }, { "epoch": 0.10223388671875, "grad_norm": 8.72944450378418, "learning_rate": 4.874417930680253e-06, "loss": 3.3524, "step": 10050 }, { "epoch": 0.10228474934895833, "grad_norm": 15.81502628326416, "learning_rate": 4.874292754616014e-06, "loss": 3.3923, "step": 10055 }, { "epoch": 0.10233561197916667, "grad_norm": 12.636479377746582, "learning_rate": 4.8741675178060565e-06, "loss": 3.2582, "step": 10060 }, { "epoch": 0.102386474609375, "grad_norm": 10.026837348937988, "learning_rate": 4.874042220253586e-06, "loss": 3.5174, "step": 10065 }, { "epoch": 0.10243733723958333, "grad_norm": 14.83649730682373, "learning_rate": 4.8739168619618086e-06, "loss": 3.5073, "step": 10070 }, { "epoch": 0.10248819986979167, "grad_norm": 15.73272705078125, "learning_rate": 4.873791442933931e-06, "loss": 3.3724, "step": 10075 }, { "epoch": 0.1025390625, "grad_norm": 15.510259628295898, "learning_rate": 4.873665963173161e-06, "loss": 3.1763, "step": 10080 }, { "epoch": 0.10258992513020833, "grad_norm": 8.296263694763184, "learning_rate": 4.873540422682711e-06, "loss": 3.4604, "step": 10085 }, { "epoch": 0.10264078776041667, "grad_norm": 11.954290390014648, "learning_rate": 4.873414821465792e-06, "loss": 3.4896, "step": 10090 }, { "epoch": 0.102691650390625, "grad_norm": 10.191630363464355, "learning_rate": 4.873289159525617e-06, "loss": 3.5075, "step": 10095 }, { "epoch": 0.10274251302083333, "grad_norm": 14.362115859985352, "learning_rate": 4.873163436865401e-06, "loss": 3.0294, "step": 10100 }, { "epoch": 0.10279337565104167, "grad_norm": 10.870323181152344, "learning_rate": 4.873037653488361e-06, "loss": 3.5388, "step": 10105 }, { "epoch": 0.10284423828125, "grad_norm": 14.673710823059082, "learning_rate": 4.872911809397715e-06, "loss": 3.2948, "step": 10110 }, { "epoch": 0.10289510091145833, "grad_norm": 10.174817085266113, "learning_rate": 4.872785904596684e-06, "loss": 3.831, "step": 10115 }, { "epoch": 0.10294596354166667, "grad_norm": 13.638348579406738, "learning_rate": 4.8726599390884866e-06, "loss": 3.2765, "step": 10120 }, { "epoch": 0.102996826171875, "grad_norm": 14.297739028930664, "learning_rate": 4.872533912876348e-06, "loss": 3.4857, "step": 10125 }, { "epoch": 0.10304768880208333, "grad_norm": 8.859086036682129, "learning_rate": 4.872407825963491e-06, "loss": 3.5332, "step": 10130 }, { "epoch": 0.10309855143229167, "grad_norm": 17.016996383666992, "learning_rate": 4.872281678353142e-06, "loss": 3.858, "step": 10135 }, { "epoch": 0.1031494140625, "grad_norm": 7.245146751403809, "learning_rate": 4.872155470048529e-06, "loss": 3.3026, "step": 10140 }, { "epoch": 0.10320027669270833, "grad_norm": 13.683245658874512, "learning_rate": 4.87202920105288e-06, "loss": 3.5045, "step": 10145 }, { "epoch": 0.10325113932291667, "grad_norm": 10.41574478149414, "learning_rate": 4.871902871369427e-06, "loss": 3.289, "step": 10150 }, { "epoch": 0.103302001953125, "grad_norm": 13.628034591674805, "learning_rate": 4.871776481001401e-06, "loss": 3.6961, "step": 10155 }, { "epoch": 0.10335286458333333, "grad_norm": 9.731292724609375, "learning_rate": 4.8716500299520356e-06, "loss": 3.4743, "step": 10160 }, { "epoch": 0.10340372721354167, "grad_norm": 15.618063926696777, "learning_rate": 4.871523518224567e-06, "loss": 3.0353, "step": 10165 }, { "epoch": 0.10345458984375, "grad_norm": 12.420486450195312, "learning_rate": 4.87139694582223e-06, "loss": 3.5414, "step": 10170 }, { "epoch": 0.10350545247395833, "grad_norm": 9.918728828430176, "learning_rate": 4.871270312748265e-06, "loss": 3.6799, "step": 10175 }, { "epoch": 0.10355631510416667, "grad_norm": 10.1067533493042, "learning_rate": 4.871143619005911e-06, "loss": 3.4345, "step": 10180 }, { "epoch": 0.103607177734375, "grad_norm": 6.703737735748291, "learning_rate": 4.87101686459841e-06, "loss": 3.2298, "step": 10185 }, { "epoch": 0.10365804036458333, "grad_norm": 15.222867012023926, "learning_rate": 4.8708900495290035e-06, "loss": 3.6949, "step": 10190 }, { "epoch": 0.10370890299479167, "grad_norm": 12.256555557250977, "learning_rate": 4.8707631738009376e-06, "loss": 3.4872, "step": 10195 }, { "epoch": 0.103759765625, "grad_norm": 8.05500602722168, "learning_rate": 4.870636237417458e-06, "loss": 3.5136, "step": 10200 }, { "epoch": 0.10381062825520833, "grad_norm": 9.914165496826172, "learning_rate": 4.870509240381812e-06, "loss": 3.5153, "step": 10205 }, { "epoch": 0.10386149088541667, "grad_norm": 14.44861125946045, "learning_rate": 4.8703821826972495e-06, "loss": 3.4795, "step": 10210 }, { "epoch": 0.103912353515625, "grad_norm": 12.164118766784668, "learning_rate": 4.87025506436702e-06, "loss": 3.4607, "step": 10215 }, { "epoch": 0.10396321614583333, "grad_norm": 15.556924819946289, "learning_rate": 4.8701278853943764e-06, "loss": 3.3538, "step": 10220 }, { "epoch": 0.10401407877604167, "grad_norm": 14.256190299987793, "learning_rate": 4.870000645782573e-06, "loss": 3.3421, "step": 10225 }, { "epoch": 0.10406494140625, "grad_norm": 13.732766151428223, "learning_rate": 4.869873345534865e-06, "loss": 3.0725, "step": 10230 }, { "epoch": 0.10411580403645833, "grad_norm": 16.038616180419922, "learning_rate": 4.869745984654508e-06, "loss": 3.212, "step": 10235 }, { "epoch": 0.10416666666666667, "grad_norm": 14.277772903442383, "learning_rate": 4.8696185631447635e-06, "loss": 3.6345, "step": 10240 }, { "epoch": 0.104217529296875, "grad_norm": 10.199970245361328, "learning_rate": 4.869491081008889e-06, "loss": 3.5137, "step": 10245 }, { "epoch": 0.10426839192708333, "grad_norm": 12.390953063964844, "learning_rate": 4.869363538250146e-06, "loss": 3.3681, "step": 10250 }, { "epoch": 0.10431925455729167, "grad_norm": 8.647974967956543, "learning_rate": 4.869235934871799e-06, "loss": 3.2399, "step": 10255 }, { "epoch": 0.1043701171875, "grad_norm": 11.923579216003418, "learning_rate": 4.869108270877112e-06, "loss": 3.6247, "step": 10260 }, { "epoch": 0.10442097981770833, "grad_norm": 13.356016159057617, "learning_rate": 4.868980546269352e-06, "loss": 3.7096, "step": 10265 }, { "epoch": 0.10447184244791667, "grad_norm": 8.816129684448242, "learning_rate": 4.868852761051787e-06, "loss": 3.3696, "step": 10270 }, { "epoch": 0.104522705078125, "grad_norm": 11.433305740356445, "learning_rate": 4.868724915227684e-06, "loss": 3.3735, "step": 10275 }, { "epoch": 0.10457356770833333, "grad_norm": 10.5938138961792, "learning_rate": 4.868597008800315e-06, "loss": 3.5073, "step": 10280 }, { "epoch": 0.10462443033854167, "grad_norm": 13.988579750061035, "learning_rate": 4.868469041772955e-06, "loss": 3.6398, "step": 10285 }, { "epoch": 0.10467529296875, "grad_norm": 8.426319122314453, "learning_rate": 4.868341014148875e-06, "loss": 3.5602, "step": 10290 }, { "epoch": 0.10472615559895833, "grad_norm": 12.661907196044922, "learning_rate": 4.868212925931351e-06, "loss": 3.3872, "step": 10295 }, { "epoch": 0.10477701822916667, "grad_norm": 10.294717788696289, "learning_rate": 4.868084777123661e-06, "loss": 3.5918, "step": 10300 }, { "epoch": 0.104827880859375, "grad_norm": 16.74044418334961, "learning_rate": 4.867956567729084e-06, "loss": 3.4537, "step": 10305 }, { "epoch": 0.10487874348958333, "grad_norm": 10.488282203674316, "learning_rate": 4.867828297750899e-06, "loss": 3.101, "step": 10310 }, { "epoch": 0.10492960611979167, "grad_norm": 10.39643669128418, "learning_rate": 4.867699967192388e-06, "loss": 3.1673, "step": 10315 }, { "epoch": 0.10498046875, "grad_norm": 13.151554107666016, "learning_rate": 4.867571576056834e-06, "loss": 3.6526, "step": 10320 }, { "epoch": 0.10503133138020833, "grad_norm": 11.318380355834961, "learning_rate": 4.867443124347523e-06, "loss": 3.6569, "step": 10325 }, { "epoch": 0.10508219401041667, "grad_norm": 11.30242919921875, "learning_rate": 4.867314612067741e-06, "loss": 3.2106, "step": 10330 }, { "epoch": 0.105133056640625, "grad_norm": 8.88015079498291, "learning_rate": 4.867186039220775e-06, "loss": 3.3928, "step": 10335 }, { "epoch": 0.10518391927083333, "grad_norm": 9.858390808105469, "learning_rate": 4.867057405809916e-06, "loss": 3.3096, "step": 10340 }, { "epoch": 0.10523478190104167, "grad_norm": 14.723819732666016, "learning_rate": 4.866928711838455e-06, "loss": 3.3486, "step": 10345 }, { "epoch": 0.10528564453125, "grad_norm": 8.097146034240723, "learning_rate": 4.866799957309682e-06, "loss": 4.0018, "step": 10350 }, { "epoch": 0.10533650716145833, "grad_norm": 14.983758926391602, "learning_rate": 4.866671142226895e-06, "loss": 3.5232, "step": 10355 }, { "epoch": 0.10538736979166667, "grad_norm": 10.44787311553955, "learning_rate": 4.866542266593387e-06, "loss": 3.6857, "step": 10360 }, { "epoch": 0.105438232421875, "grad_norm": 13.872020721435547, "learning_rate": 4.8664133304124555e-06, "loss": 3.2874, "step": 10365 }, { "epoch": 0.10548909505208333, "grad_norm": 11.341287612915039, "learning_rate": 4.8662843336874e-06, "loss": 3.202, "step": 10370 }, { "epoch": 0.10553995768229167, "grad_norm": 9.611438751220703, "learning_rate": 4.866155276421522e-06, "loss": 3.4759, "step": 10375 }, { "epoch": 0.1055908203125, "grad_norm": 12.784802436828613, "learning_rate": 4.8660261586181205e-06, "loss": 3.3239, "step": 10380 }, { "epoch": 0.10564168294270833, "grad_norm": 15.74527645111084, "learning_rate": 4.865896980280501e-06, "loss": 3.6173, "step": 10385 }, { "epoch": 0.10569254557291667, "grad_norm": 9.02971076965332, "learning_rate": 4.865767741411969e-06, "loss": 3.7378, "step": 10390 }, { "epoch": 0.105743408203125, "grad_norm": 8.090533256530762, "learning_rate": 4.8656384420158285e-06, "loss": 3.2215, "step": 10395 }, { "epoch": 0.10579427083333333, "grad_norm": 9.529847145080566, "learning_rate": 4.86550908209539e-06, "loss": 3.3138, "step": 10400 }, { "epoch": 0.10584513346354167, "grad_norm": 11.145707130432129, "learning_rate": 4.865379661653963e-06, "loss": 2.8438, "step": 10405 }, { "epoch": 0.10589599609375, "grad_norm": 9.50683879852295, "learning_rate": 4.8652501806948575e-06, "loss": 3.4143, "step": 10410 }, { "epoch": 0.10594685872395833, "grad_norm": 14.874054908752441, "learning_rate": 4.865120639221386e-06, "loss": 3.6896, "step": 10415 }, { "epoch": 0.10599772135416667, "grad_norm": 15.502473831176758, "learning_rate": 4.864991037236864e-06, "loss": 3.7117, "step": 10420 }, { "epoch": 0.106048583984375, "grad_norm": 17.096446990966797, "learning_rate": 4.864861374744607e-06, "loss": 3.4389, "step": 10425 }, { "epoch": 0.10609944661458333, "grad_norm": 12.414908409118652, "learning_rate": 4.8647316517479326e-06, "loss": 3.5158, "step": 10430 }, { "epoch": 0.10615030924479167, "grad_norm": 13.846083641052246, "learning_rate": 4.864601868250159e-06, "loss": 3.4447, "step": 10435 }, { "epoch": 0.106201171875, "grad_norm": 12.67673110961914, "learning_rate": 4.864472024254607e-06, "loss": 3.0979, "step": 10440 }, { "epoch": 0.10625203450520833, "grad_norm": 13.106204986572266, "learning_rate": 4.864342119764599e-06, "loss": 3.4044, "step": 10445 }, { "epoch": 0.10630289713541667, "grad_norm": 10.791247367858887, "learning_rate": 4.864212154783458e-06, "loss": 3.4244, "step": 10450 }, { "epoch": 0.106353759765625, "grad_norm": 15.417497634887695, "learning_rate": 4.86408212931451e-06, "loss": 3.8919, "step": 10455 }, { "epoch": 0.10640462239583333, "grad_norm": 12.463935852050781, "learning_rate": 4.86395204336108e-06, "loss": 3.3458, "step": 10460 }, { "epoch": 0.10645548502604167, "grad_norm": 11.733417510986328, "learning_rate": 4.863821896926498e-06, "loss": 3.5288, "step": 10465 }, { "epoch": 0.10650634765625, "grad_norm": 14.689102172851562, "learning_rate": 4.863691690014093e-06, "loss": 3.2967, "step": 10470 }, { "epoch": 0.10655721028645833, "grad_norm": 10.164743423461914, "learning_rate": 4.863561422627197e-06, "loss": 3.2024, "step": 10475 }, { "epoch": 0.10660807291666667, "grad_norm": 10.591276168823242, "learning_rate": 4.863431094769141e-06, "loss": 3.1854, "step": 10480 }, { "epoch": 0.106658935546875, "grad_norm": 7.265041351318359, "learning_rate": 4.863300706443261e-06, "loss": 3.3769, "step": 10485 }, { "epoch": 0.10670979817708333, "grad_norm": 9.011372566223145, "learning_rate": 4.8631702576528924e-06, "loss": 3.3535, "step": 10490 }, { "epoch": 0.10676066080729167, "grad_norm": 12.629626274108887, "learning_rate": 4.863039748401374e-06, "loss": 3.1713, "step": 10495 }, { "epoch": 0.1068115234375, "grad_norm": 10.04238224029541, "learning_rate": 4.8629091786920425e-06, "loss": 3.2648, "step": 10500 }, { "epoch": 0.10686238606770833, "grad_norm": 8.861063003540039, "learning_rate": 4.862778548528239e-06, "loss": 3.4088, "step": 10505 }, { "epoch": 0.10691324869791667, "grad_norm": 10.639623641967773, "learning_rate": 4.862647857913308e-06, "loss": 3.1807, "step": 10510 }, { "epoch": 0.106964111328125, "grad_norm": 11.753366470336914, "learning_rate": 4.862517106850592e-06, "loss": 4.0414, "step": 10515 }, { "epoch": 0.10701497395833333, "grad_norm": 6.897514343261719, "learning_rate": 4.862386295343435e-06, "loss": 3.1811, "step": 10520 }, { "epoch": 0.10706583658854167, "grad_norm": 17.118993759155273, "learning_rate": 4.862255423395184e-06, "loss": 3.549, "step": 10525 }, { "epoch": 0.10711669921875, "grad_norm": 12.017741203308105, "learning_rate": 4.862124491009188e-06, "loss": 3.4445, "step": 10530 }, { "epoch": 0.10716756184895833, "grad_norm": 18.230016708374023, "learning_rate": 4.861993498188798e-06, "loss": 3.381, "step": 10535 }, { "epoch": 0.10721842447916667, "grad_norm": 16.504117965698242, "learning_rate": 4.861862444937363e-06, "loss": 4.0049, "step": 10540 }, { "epoch": 0.107269287109375, "grad_norm": 13.888938903808594, "learning_rate": 4.861731331258238e-06, "loss": 3.5727, "step": 10545 }, { "epoch": 0.10732014973958333, "grad_norm": 14.863890647888184, "learning_rate": 4.8616001571547764e-06, "loss": 3.4779, "step": 10550 }, { "epoch": 0.10737101236979167, "grad_norm": 13.943183898925781, "learning_rate": 4.8614689226303345e-06, "loss": 3.602, "step": 10555 }, { "epoch": 0.107421875, "grad_norm": 10.1746826171875, "learning_rate": 4.86133762768827e-06, "loss": 3.4561, "step": 10560 }, { "epoch": 0.10747273763020833, "grad_norm": 12.866034507751465, "learning_rate": 4.861206272331941e-06, "loss": 3.2867, "step": 10565 }, { "epoch": 0.10752360026041667, "grad_norm": 13.17371654510498, "learning_rate": 4.86107485656471e-06, "loss": 3.4148, "step": 10570 }, { "epoch": 0.107574462890625, "grad_norm": 8.4532470703125, "learning_rate": 4.860943380389939e-06, "loss": 3.1673, "step": 10575 }, { "epoch": 0.10762532552083333, "grad_norm": 10.179365158081055, "learning_rate": 4.86081184381099e-06, "loss": 3.3304, "step": 10580 }, { "epoch": 0.10767618815104167, "grad_norm": 10.048255920410156, "learning_rate": 4.860680246831231e-06, "loss": 3.3539, "step": 10585 }, { "epoch": 0.10772705078125, "grad_norm": 14.505502700805664, "learning_rate": 4.860548589454026e-06, "loss": 3.5862, "step": 10590 }, { "epoch": 0.10777791341145833, "grad_norm": 11.778282165527344, "learning_rate": 4.860416871682746e-06, "loss": 3.3957, "step": 10595 }, { "epoch": 0.10782877604166667, "grad_norm": 10.966447830200195, "learning_rate": 4.860285093520759e-06, "loss": 3.2105, "step": 10600 }, { "epoch": 0.107879638671875, "grad_norm": 12.947134017944336, "learning_rate": 4.860153254971437e-06, "loss": 3.7268, "step": 10605 }, { "epoch": 0.10793050130208333, "grad_norm": 12.653708457946777, "learning_rate": 4.860021356038155e-06, "loss": 3.4977, "step": 10610 }, { "epoch": 0.10798136393229167, "grad_norm": 15.799676895141602, "learning_rate": 4.859889396724284e-06, "loss": 3.7318, "step": 10615 }, { "epoch": 0.1080322265625, "grad_norm": 10.117684364318848, "learning_rate": 4.859757377033204e-06, "loss": 3.8512, "step": 10620 }, { "epoch": 0.10808308919270833, "grad_norm": 16.469112396240234, "learning_rate": 4.85962529696829e-06, "loss": 3.6864, "step": 10625 }, { "epoch": 0.10813395182291667, "grad_norm": 8.12623119354248, "learning_rate": 4.859493156532922e-06, "loss": 3.162, "step": 10630 }, { "epoch": 0.108184814453125, "grad_norm": 10.442404747009277, "learning_rate": 4.859360955730481e-06, "loss": 3.3013, "step": 10635 }, { "epoch": 0.10823567708333333, "grad_norm": 13.673596382141113, "learning_rate": 4.8592286945643485e-06, "loss": 3.3599, "step": 10640 }, { "epoch": 0.10828653971354167, "grad_norm": 8.366315841674805, "learning_rate": 4.859096373037911e-06, "loss": 3.2563, "step": 10645 }, { "epoch": 0.10833740234375, "grad_norm": 16.115209579467773, "learning_rate": 4.8589639911545495e-06, "loss": 3.6421, "step": 10650 }, { "epoch": 0.10838826497395833, "grad_norm": 12.849785804748535, "learning_rate": 4.858831548917655e-06, "loss": 3.4333, "step": 10655 }, { "epoch": 0.10843912760416667, "grad_norm": 12.12121295928955, "learning_rate": 4.858699046330614e-06, "loss": 3.8339, "step": 10660 }, { "epoch": 0.108489990234375, "grad_norm": 12.373353004455566, "learning_rate": 4.858566483396816e-06, "loss": 3.3636, "step": 10665 }, { "epoch": 0.10854085286458333, "grad_norm": 13.366467475891113, "learning_rate": 4.858433860119655e-06, "loss": 3.1534, "step": 10670 }, { "epoch": 0.10859171549479167, "grad_norm": 11.035595893859863, "learning_rate": 4.858301176502522e-06, "loss": 3.0366, "step": 10675 }, { "epoch": 0.108642578125, "grad_norm": 9.009740829467773, "learning_rate": 4.858168432548813e-06, "loss": 3.1213, "step": 10680 }, { "epoch": 0.10869344075520833, "grad_norm": 13.181190490722656, "learning_rate": 4.858035628261924e-06, "loss": 3.2362, "step": 10685 }, { "epoch": 0.10874430338541667, "grad_norm": 12.755681991577148, "learning_rate": 4.85790276364525e-06, "loss": 3.493, "step": 10690 }, { "epoch": 0.108795166015625, "grad_norm": 16.021320343017578, "learning_rate": 4.857769838702195e-06, "loss": 3.3145, "step": 10695 }, { "epoch": 0.10884602864583333, "grad_norm": 11.281843185424805, "learning_rate": 4.857636853436156e-06, "loss": 3.4385, "step": 10700 }, { "epoch": 0.10889689127604167, "grad_norm": 13.89880657196045, "learning_rate": 4.857503807850538e-06, "loss": 3.5553, "step": 10705 }, { "epoch": 0.10894775390625, "grad_norm": 17.039382934570312, "learning_rate": 4.857370701948744e-06, "loss": 3.7388, "step": 10710 }, { "epoch": 0.10899861653645833, "grad_norm": 8.904913902282715, "learning_rate": 4.857237535734179e-06, "loss": 3.5905, "step": 10715 }, { "epoch": 0.10904947916666667, "grad_norm": 10.64655876159668, "learning_rate": 4.8571043092102496e-06, "loss": 3.174, "step": 10720 }, { "epoch": 0.109100341796875, "grad_norm": 11.749823570251465, "learning_rate": 4.856971022380366e-06, "loss": 3.295, "step": 10725 }, { "epoch": 0.10915120442708333, "grad_norm": 9.72337532043457, "learning_rate": 4.856837675247938e-06, "loss": 3.3855, "step": 10730 }, { "epoch": 0.10920206705729167, "grad_norm": 16.54507827758789, "learning_rate": 4.856704267816375e-06, "loss": 3.5955, "step": 10735 }, { "epoch": 0.1092529296875, "grad_norm": 13.975875854492188, "learning_rate": 4.856570800089093e-06, "loss": 3.3142, "step": 10740 }, { "epoch": 0.10930379231770833, "grad_norm": 15.316740989685059, "learning_rate": 4.856437272069506e-06, "loss": 3.4958, "step": 10745 }, { "epoch": 0.10935465494791667, "grad_norm": 10.500741004943848, "learning_rate": 4.856303683761029e-06, "loss": 3.2991, "step": 10750 }, { "epoch": 0.109405517578125, "grad_norm": 10.17525577545166, "learning_rate": 4.8561700351670815e-06, "loss": 3.7226, "step": 10755 }, { "epoch": 0.10945638020833333, "grad_norm": 12.4932861328125, "learning_rate": 4.856036326291082e-06, "loss": 3.2116, "step": 10760 }, { "epoch": 0.10950724283854167, "grad_norm": 9.614534378051758, "learning_rate": 4.855902557136451e-06, "loss": 3.351, "step": 10765 }, { "epoch": 0.10955810546875, "grad_norm": 14.387618064880371, "learning_rate": 4.855768727706613e-06, "loss": 3.5085, "step": 10770 }, { "epoch": 0.10960896809895833, "grad_norm": 11.664185523986816, "learning_rate": 4.855634838004988e-06, "loss": 3.2219, "step": 10775 }, { "epoch": 0.10965983072916667, "grad_norm": 8.379794120788574, "learning_rate": 4.8555008880350055e-06, "loss": 3.0816, "step": 10780 }, { "epoch": 0.109710693359375, "grad_norm": 14.983808517456055, "learning_rate": 4.8553668778000905e-06, "loss": 3.0428, "step": 10785 }, { "epoch": 0.10976155598958333, "grad_norm": 10.499786376953125, "learning_rate": 4.855232807303673e-06, "loss": 3.1102, "step": 10790 }, { "epoch": 0.10981241861979167, "grad_norm": 15.770118713378906, "learning_rate": 4.8550986765491825e-06, "loss": 4.0818, "step": 10795 }, { "epoch": 0.10986328125, "grad_norm": 13.66915225982666, "learning_rate": 4.85496448554005e-06, "loss": 3.7093, "step": 10800 }, { "epoch": 0.10991414388020833, "grad_norm": 8.571983337402344, "learning_rate": 4.85483023427971e-06, "loss": 3.0368, "step": 10805 }, { "epoch": 0.10996500651041667, "grad_norm": 15.759471893310547, "learning_rate": 4.854695922771595e-06, "loss": 3.5172, "step": 10810 }, { "epoch": 0.110015869140625, "grad_norm": 13.83719539642334, "learning_rate": 4.854561551019145e-06, "loss": 3.5774, "step": 10815 }, { "epoch": 0.11006673177083333, "grad_norm": 11.690563201904297, "learning_rate": 4.854427119025794e-06, "loss": 3.5709, "step": 10820 }, { "epoch": 0.11011759440104167, "grad_norm": 16.460613250732422, "learning_rate": 4.854292626794984e-06, "loss": 3.0922, "step": 10825 }, { "epoch": 0.11016845703125, "grad_norm": 9.609016418457031, "learning_rate": 4.854158074330155e-06, "loss": 3.6565, "step": 10830 }, { "epoch": 0.11021931966145833, "grad_norm": 8.492650985717773, "learning_rate": 4.85402346163475e-06, "loss": 3.0948, "step": 10835 }, { "epoch": 0.11027018229166667, "grad_norm": 8.52239990234375, "learning_rate": 4.853888788712213e-06, "loss": 3.6836, "step": 10840 }, { "epoch": 0.110321044921875, "grad_norm": 14.75137710571289, "learning_rate": 4.853754055565988e-06, "loss": 3.5731, "step": 10845 }, { "epoch": 0.11037190755208333, "grad_norm": 17.106433868408203, "learning_rate": 4.853619262199525e-06, "loss": 3.3211, "step": 10850 }, { "epoch": 0.11042277018229167, "grad_norm": 13.21174430847168, "learning_rate": 4.85348440861627e-06, "loss": 3.1972, "step": 10855 }, { "epoch": 0.1104736328125, "grad_norm": 15.949371337890625, "learning_rate": 4.8533494948196746e-06, "loss": 3.7361, "step": 10860 }, { "epoch": 0.11052449544270833, "grad_norm": 13.830065727233887, "learning_rate": 4.8532145208131894e-06, "loss": 3.4961, "step": 10865 }, { "epoch": 0.11057535807291667, "grad_norm": 14.657466888427734, "learning_rate": 4.85307948660027e-06, "loss": 3.3326, "step": 10870 }, { "epoch": 0.110626220703125, "grad_norm": 18.071182250976562, "learning_rate": 4.852944392184369e-06, "loss": 3.5664, "step": 10875 }, { "epoch": 0.11067708333333333, "grad_norm": 259.91387939453125, "learning_rate": 4.852809237568943e-06, "loss": 3.6806, "step": 10880 }, { "epoch": 0.11072794596354167, "grad_norm": 12.160555839538574, "learning_rate": 4.85267402275745e-06, "loss": 3.8418, "step": 10885 }, { "epoch": 0.11077880859375, "grad_norm": 15.426597595214844, "learning_rate": 4.852538747753351e-06, "loss": 3.5979, "step": 10890 }, { "epoch": 0.11082967122395833, "grad_norm": 13.01453685760498, "learning_rate": 4.852403412560105e-06, "loss": 3.2011, "step": 10895 }, { "epoch": 0.11088053385416667, "grad_norm": 10.577211380004883, "learning_rate": 4.852268017181176e-06, "loss": 3.727, "step": 10900 }, { "epoch": 0.110931396484375, "grad_norm": 14.287973403930664, "learning_rate": 4.852132561620026e-06, "loss": 3.758, "step": 10905 }, { "epoch": 0.11098225911458333, "grad_norm": 11.887785911560059, "learning_rate": 4.851997045880123e-06, "loss": 2.9819, "step": 10910 }, { "epoch": 0.11103312174479167, "grad_norm": 7.388796329498291, "learning_rate": 4.851861469964932e-06, "loss": 3.0688, "step": 10915 }, { "epoch": 0.111083984375, "grad_norm": 11.511301040649414, "learning_rate": 4.851725833877924e-06, "loss": 3.3384, "step": 10920 }, { "epoch": 0.11113484700520833, "grad_norm": 13.27392864227295, "learning_rate": 4.851590137622567e-06, "loss": 3.0957, "step": 10925 }, { "epoch": 0.11118570963541667, "grad_norm": 14.255321502685547, "learning_rate": 4.851454381202334e-06, "loss": 3.2306, "step": 10930 }, { "epoch": 0.111236572265625, "grad_norm": 11.45173168182373, "learning_rate": 4.851318564620699e-06, "loss": 3.4344, "step": 10935 }, { "epoch": 0.11128743489583333, "grad_norm": 12.557133674621582, "learning_rate": 4.851182687881134e-06, "loss": 3.4574, "step": 10940 }, { "epoch": 0.11133829752604167, "grad_norm": 12.226534843444824, "learning_rate": 4.851046750987118e-06, "loss": 3.8019, "step": 10945 }, { "epoch": 0.11138916015625, "grad_norm": 10.195042610168457, "learning_rate": 4.850910753942129e-06, "loss": 3.0214, "step": 10950 }, { "epoch": 0.11144002278645833, "grad_norm": 14.165132522583008, "learning_rate": 4.850774696749645e-06, "loss": 3.5724, "step": 10955 }, { "epoch": 0.11149088541666667, "grad_norm": 13.284117698669434, "learning_rate": 4.850638579413147e-06, "loss": 4.1911, "step": 10960 }, { "epoch": 0.111541748046875, "grad_norm": 16.30782127380371, "learning_rate": 4.850502401936119e-06, "loss": 3.1399, "step": 10965 }, { "epoch": 0.11159261067708333, "grad_norm": 13.27759838104248, "learning_rate": 4.850366164322044e-06, "loss": 3.4511, "step": 10970 }, { "epoch": 0.11164347330729167, "grad_norm": 12.456367492675781, "learning_rate": 4.850229866574407e-06, "loss": 3.5491, "step": 10975 }, { "epoch": 0.1116943359375, "grad_norm": 10.18445873260498, "learning_rate": 4.850093508696697e-06, "loss": 3.5397, "step": 10980 }, { "epoch": 0.11174519856770833, "grad_norm": 14.981690406799316, "learning_rate": 4.849957090692401e-06, "loss": 3.3078, "step": 10985 }, { "epoch": 0.11179606119791667, "grad_norm": 18.036338806152344, "learning_rate": 4.84982061256501e-06, "loss": 3.2716, "step": 10990 }, { "epoch": 0.111846923828125, "grad_norm": 18.29755973815918, "learning_rate": 4.849684074318015e-06, "loss": 3.6321, "step": 10995 }, { "epoch": 0.11189778645833333, "grad_norm": 13.462907791137695, "learning_rate": 4.849547475954911e-06, "loss": 3.3401, "step": 11000 }, { "epoch": 0.11194864908854167, "grad_norm": 9.00394058227539, "learning_rate": 4.849410817479191e-06, "loss": 3.6376, "step": 11005 }, { "epoch": 0.11199951171875, "grad_norm": 7.56473970413208, "learning_rate": 4.849274098894352e-06, "loss": 3.5239, "step": 11010 }, { "epoch": 0.11205037434895833, "grad_norm": 10.673871994018555, "learning_rate": 4.849137320203892e-06, "loss": 3.4489, "step": 11015 }, { "epoch": 0.11210123697916667, "grad_norm": 17.68508529663086, "learning_rate": 4.849000481411312e-06, "loss": 3.3573, "step": 11020 }, { "epoch": 0.112152099609375, "grad_norm": 13.00468921661377, "learning_rate": 4.84886358252011e-06, "loss": 3.3832, "step": 11025 }, { "epoch": 0.11220296223958333, "grad_norm": 8.050721168518066, "learning_rate": 4.8487266235337895e-06, "loss": 3.2107, "step": 11030 }, { "epoch": 0.11225382486979167, "grad_norm": 10.355036735534668, "learning_rate": 4.848589604455856e-06, "loss": 3.304, "step": 11035 }, { "epoch": 0.1123046875, "grad_norm": 19.039920806884766, "learning_rate": 4.848452525289814e-06, "loss": 3.7786, "step": 11040 }, { "epoch": 0.11235555013020833, "grad_norm": 12.501426696777344, "learning_rate": 4.8483153860391705e-06, "loss": 3.5708, "step": 11045 }, { "epoch": 0.11240641276041667, "grad_norm": 15.971506118774414, "learning_rate": 4.848178186707435e-06, "loss": 3.209, "step": 11050 }, { "epoch": 0.112457275390625, "grad_norm": 7.047082424163818, "learning_rate": 4.8480409272981165e-06, "loss": 3.6421, "step": 11055 }, { "epoch": 0.11250813802083333, "grad_norm": 14.781586647033691, "learning_rate": 4.847903607814728e-06, "loss": 3.4269, "step": 11060 }, { "epoch": 0.11255900065104167, "grad_norm": 10.393120765686035, "learning_rate": 4.847766228260781e-06, "loss": 3.5995, "step": 11065 }, { "epoch": 0.11260986328125, "grad_norm": 10.039695739746094, "learning_rate": 4.847628788639793e-06, "loss": 3.3204, "step": 11070 }, { "epoch": 0.11266072591145833, "grad_norm": 13.824012756347656, "learning_rate": 4.847491288955279e-06, "loss": 3.6698, "step": 11075 }, { "epoch": 0.11271158854166667, "grad_norm": 11.334912300109863, "learning_rate": 4.847353729210756e-06, "loss": 3.5687, "step": 11080 }, { "epoch": 0.112762451171875, "grad_norm": 15.515557289123535, "learning_rate": 4.847216109409744e-06, "loss": 3.761, "step": 11085 }, { "epoch": 0.11281331380208333, "grad_norm": 7.701551914215088, "learning_rate": 4.847078429555765e-06, "loss": 3.6052, "step": 11090 }, { "epoch": 0.11286417643229167, "grad_norm": 13.438464164733887, "learning_rate": 4.8469406896523405e-06, "loss": 3.2666, "step": 11095 }, { "epoch": 0.1129150390625, "grad_norm": 11.519120216369629, "learning_rate": 4.846802889702994e-06, "loss": 3.1568, "step": 11100 }, { "epoch": 0.11296590169270833, "grad_norm": 10.409358978271484, "learning_rate": 4.8466650297112525e-06, "loss": 3.5128, "step": 11105 }, { "epoch": 0.11301676432291667, "grad_norm": 10.928455352783203, "learning_rate": 4.846527109680642e-06, "loss": 3.7998, "step": 11110 }, { "epoch": 0.113067626953125, "grad_norm": 13.899455070495605, "learning_rate": 4.846389129614692e-06, "loss": 3.1142, "step": 11115 }, { "epoch": 0.11311848958333333, "grad_norm": 10.756808280944824, "learning_rate": 4.846251089516932e-06, "loss": 3.3781, "step": 11120 }, { "epoch": 0.11316935221354167, "grad_norm": 10.080056190490723, "learning_rate": 4.846112989390894e-06, "loss": 3.1493, "step": 11125 }, { "epoch": 0.11322021484375, "grad_norm": 16.393461227416992, "learning_rate": 4.845974829240112e-06, "loss": 3.6096, "step": 11130 }, { "epoch": 0.11327107747395833, "grad_norm": 13.437868118286133, "learning_rate": 4.845836609068119e-06, "loss": 3.3507, "step": 11135 }, { "epoch": 0.11332194010416667, "grad_norm": 12.06811237335205, "learning_rate": 4.8456983288784535e-06, "loss": 3.197, "step": 11140 }, { "epoch": 0.113372802734375, "grad_norm": 11.709611892700195, "learning_rate": 4.845559988674651e-06, "loss": 3.5839, "step": 11145 }, { "epoch": 0.11342366536458333, "grad_norm": 17.550220489501953, "learning_rate": 4.8454215884602525e-06, "loss": 3.6834, "step": 11150 }, { "epoch": 0.11347452799479167, "grad_norm": 10.27842903137207, "learning_rate": 4.845283128238799e-06, "loss": 3.5057, "step": 11155 }, { "epoch": 0.113525390625, "grad_norm": 16.689462661743164, "learning_rate": 4.845144608013832e-06, "loss": 3.3252, "step": 11160 }, { "epoch": 0.11357625325520833, "grad_norm": 10.314776420593262, "learning_rate": 4.845006027788897e-06, "loss": 4.4668, "step": 11165 }, { "epoch": 0.11362711588541667, "grad_norm": 12.564552307128906, "learning_rate": 4.844867387567538e-06, "loss": 3.3531, "step": 11170 }, { "epoch": 0.113677978515625, "grad_norm": 16.5703182220459, "learning_rate": 4.8447286873533025e-06, "loss": 3.3511, "step": 11175 }, { "epoch": 0.11372884114583333, "grad_norm": 10.776202201843262, "learning_rate": 4.84458992714974e-06, "loss": 3.5057, "step": 11180 }, { "epoch": 0.11377970377604167, "grad_norm": 9.926742553710938, "learning_rate": 4.844451106960399e-06, "loss": 3.5368, "step": 11185 }, { "epoch": 0.11383056640625, "grad_norm": 13.466375350952148, "learning_rate": 4.844312226788833e-06, "loss": 3.4744, "step": 11190 }, { "epoch": 0.11388142903645833, "grad_norm": 13.84046745300293, "learning_rate": 4.844173286638593e-06, "loss": 3.4421, "step": 11195 }, { "epoch": 0.11393229166666667, "grad_norm": 16.326255798339844, "learning_rate": 4.8440342865132365e-06, "loss": 3.5388, "step": 11200 }, { "epoch": 0.113983154296875, "grad_norm": 11.938819885253906, "learning_rate": 4.843895226416317e-06, "loss": 3.171, "step": 11205 }, { "epoch": 0.11403401692708333, "grad_norm": 8.013636589050293, "learning_rate": 4.843756106351396e-06, "loss": 4.0519, "step": 11210 }, { "epoch": 0.11408487955729167, "grad_norm": 12.114385604858398, "learning_rate": 4.843616926322029e-06, "loss": 3.5346, "step": 11215 }, { "epoch": 0.1141357421875, "grad_norm": 14.332465171813965, "learning_rate": 4.843477686331778e-06, "loss": 3.3585, "step": 11220 }, { "epoch": 0.11418660481770833, "grad_norm": 13.207276344299316, "learning_rate": 4.8433383863842065e-06, "loss": 3.2574, "step": 11225 }, { "epoch": 0.11423746744791667, "grad_norm": 13.37678050994873, "learning_rate": 4.8431990264828775e-06, "loss": 3.0765, "step": 11230 }, { "epoch": 0.114288330078125, "grad_norm": 10.288887977600098, "learning_rate": 4.843059606631358e-06, "loss": 3.4786, "step": 11235 }, { "epoch": 0.11433919270833333, "grad_norm": 9.490957260131836, "learning_rate": 4.842920126833212e-06, "loss": 3.0409, "step": 11240 }, { "epoch": 0.11439005533854167, "grad_norm": 12.170294761657715, "learning_rate": 4.842780587092011e-06, "loss": 3.4692, "step": 11245 }, { "epoch": 0.11444091796875, "grad_norm": 8.890890121459961, "learning_rate": 4.842640987411323e-06, "loss": 3.1144, "step": 11250 }, { "epoch": 0.11449178059895833, "grad_norm": 10.649124145507812, "learning_rate": 4.842501327794722e-06, "loss": 3.3058, "step": 11255 }, { "epoch": 0.11454264322916667, "grad_norm": 15.376280784606934, "learning_rate": 4.842361608245779e-06, "loss": 3.2482, "step": 11260 }, { "epoch": 0.114593505859375, "grad_norm": 11.391185760498047, "learning_rate": 4.84222182876807e-06, "loss": 3.2936, "step": 11265 }, { "epoch": 0.11464436848958333, "grad_norm": 8.210448265075684, "learning_rate": 4.84208198936517e-06, "loss": 3.5134, "step": 11270 }, { "epoch": 0.11469523111979167, "grad_norm": 15.108521461486816, "learning_rate": 4.841942090040658e-06, "loss": 3.4495, "step": 11275 }, { "epoch": 0.11474609375, "grad_norm": 10.561015129089355, "learning_rate": 4.841802130798112e-06, "loss": 3.3369, "step": 11280 }, { "epoch": 0.11479695638020833, "grad_norm": 13.979567527770996, "learning_rate": 4.841662111641114e-06, "loss": 3.5918, "step": 11285 }, { "epoch": 0.11484781901041667, "grad_norm": 12.601395606994629, "learning_rate": 4.841522032573246e-06, "loss": 3.3726, "step": 11290 }, { "epoch": 0.114898681640625, "grad_norm": 14.680171966552734, "learning_rate": 4.841381893598092e-06, "loss": 3.3161, "step": 11295 }, { "epoch": 0.11494954427083333, "grad_norm": 11.404961585998535, "learning_rate": 4.841241694719236e-06, "loss": 3.9943, "step": 11300 }, { "epoch": 0.11500040690104167, "grad_norm": 7.569732666015625, "learning_rate": 4.841101435940268e-06, "loss": 3.2494, "step": 11305 }, { "epoch": 0.11505126953125, "grad_norm": 13.507386207580566, "learning_rate": 4.840961117264773e-06, "loss": 3.2946, "step": 11310 }, { "epoch": 0.11510213216145833, "grad_norm": 8.6294584274292, "learning_rate": 4.840820738696343e-06, "loss": 3.6376, "step": 11315 }, { "epoch": 0.11515299479166667, "grad_norm": 12.276251792907715, "learning_rate": 4.8406803002385696e-06, "loss": 3.5475, "step": 11320 }, { "epoch": 0.115203857421875, "grad_norm": 13.95095157623291, "learning_rate": 4.8405398018950465e-06, "loss": 3.4318, "step": 11325 }, { "epoch": 0.11525472005208333, "grad_norm": 8.981563568115234, "learning_rate": 4.840399243669366e-06, "loss": 3.3984, "step": 11330 }, { "epoch": 0.11530558268229167, "grad_norm": 17.332687377929688, "learning_rate": 4.840258625565126e-06, "loss": 3.4236, "step": 11335 }, { "epoch": 0.1153564453125, "grad_norm": 13.317326545715332, "learning_rate": 4.840117947585924e-06, "loss": 3.7795, "step": 11340 }, { "epoch": 0.11540730794270833, "grad_norm": 7.8791656494140625, "learning_rate": 4.8399772097353585e-06, "loss": 3.316, "step": 11345 }, { "epoch": 0.11545817057291667, "grad_norm": 11.29205322265625, "learning_rate": 4.839836412017031e-06, "loss": 3.5903, "step": 11350 }, { "epoch": 0.115509033203125, "grad_norm": 9.628116607666016, "learning_rate": 4.839695554434543e-06, "loss": 3.9415, "step": 11355 }, { "epoch": 0.11555989583333333, "grad_norm": 13.431588172912598, "learning_rate": 4.839554636991499e-06, "loss": 3.3724, "step": 11360 }, { "epoch": 0.11561075846354167, "grad_norm": 16.05211639404297, "learning_rate": 4.8394136596915044e-06, "loss": 3.6998, "step": 11365 }, { "epoch": 0.11566162109375, "grad_norm": 15.686810493469238, "learning_rate": 4.839272622538166e-06, "loss": 3.3292, "step": 11370 }, { "epoch": 0.11571248372395833, "grad_norm": 10.349496841430664, "learning_rate": 4.839131525535093e-06, "loss": 3.7189, "step": 11375 }, { "epoch": 0.11576334635416667, "grad_norm": 12.135592460632324, "learning_rate": 4.838990368685892e-06, "loss": 4.0048, "step": 11380 }, { "epoch": 0.115814208984375, "grad_norm": 16.356788635253906, "learning_rate": 4.838849151994178e-06, "loss": 3.6877, "step": 11385 }, { "epoch": 0.11586507161458333, "grad_norm": 16.019092559814453, "learning_rate": 4.838707875463563e-06, "loss": 3.4608, "step": 11390 }, { "epoch": 0.11591593424479167, "grad_norm": 9.560792922973633, "learning_rate": 4.838566539097661e-06, "loss": 3.3188, "step": 11395 }, { "epoch": 0.115966796875, "grad_norm": 14.858901023864746, "learning_rate": 4.838425142900089e-06, "loss": 3.589, "step": 11400 }, { "epoch": 0.11601765950520833, "grad_norm": 8.294513702392578, "learning_rate": 4.8382836868744635e-06, "loss": 3.3469, "step": 11405 }, { "epoch": 0.11606852213541667, "grad_norm": 10.487101554870605, "learning_rate": 4.838142171024404e-06, "loss": 3.6045, "step": 11410 }, { "epoch": 0.116119384765625, "grad_norm": 12.35394287109375, "learning_rate": 4.838000595353531e-06, "loss": 3.4698, "step": 11415 }, { "epoch": 0.11617024739583333, "grad_norm": 16.841279983520508, "learning_rate": 4.8378589598654675e-06, "loss": 3.3743, "step": 11420 }, { "epoch": 0.11622111002604167, "grad_norm": 8.052149772644043, "learning_rate": 4.837717264563837e-06, "loss": 3.7649, "step": 11425 }, { "epoch": 0.11627197265625, "grad_norm": 8.459037780761719, "learning_rate": 4.837575509452264e-06, "loss": 3.1424, "step": 11430 }, { "epoch": 0.11632283528645833, "grad_norm": 13.29317855834961, "learning_rate": 4.837433694534376e-06, "loss": 3.2511, "step": 11435 }, { "epoch": 0.11637369791666667, "grad_norm": 10.47177791595459, "learning_rate": 4.8372918198138e-06, "loss": 3.1985, "step": 11440 }, { "epoch": 0.116424560546875, "grad_norm": 12.912489891052246, "learning_rate": 4.837149885294167e-06, "loss": 3.414, "step": 11445 }, { "epoch": 0.11647542317708333, "grad_norm": 11.507333755493164, "learning_rate": 4.837007890979108e-06, "loss": 3.362, "step": 11450 }, { "epoch": 0.11652628580729167, "grad_norm": 11.616119384765625, "learning_rate": 4.836865836872257e-06, "loss": 3.4036, "step": 11455 }, { "epoch": 0.1165771484375, "grad_norm": 16.553955078125, "learning_rate": 4.8367237229772466e-06, "loss": 3.5742, "step": 11460 }, { "epoch": 0.11662801106770833, "grad_norm": 10.673224449157715, "learning_rate": 4.836581549297715e-06, "loss": 3.6775, "step": 11465 }, { "epoch": 0.11667887369791667, "grad_norm": 12.565034866333008, "learning_rate": 4.836439315837297e-06, "loss": 3.8003, "step": 11470 }, { "epoch": 0.116729736328125, "grad_norm": 8.820014953613281, "learning_rate": 4.8362970225996334e-06, "loss": 3.1671, "step": 11475 }, { "epoch": 0.11678059895833333, "grad_norm": 13.71123218536377, "learning_rate": 4.836154669588363e-06, "loss": 3.2272, "step": 11480 }, { "epoch": 0.11683146158854167, "grad_norm": 12.12161636352539, "learning_rate": 4.8360122568071304e-06, "loss": 3.6246, "step": 11485 }, { "epoch": 0.11688232421875, "grad_norm": 9.981605529785156, "learning_rate": 4.835869784259578e-06, "loss": 3.2711, "step": 11490 }, { "epoch": 0.11693318684895833, "grad_norm": 8.769081115722656, "learning_rate": 4.83572725194935e-06, "loss": 3.4075, "step": 11495 }, { "epoch": 0.11698404947916667, "grad_norm": 9.689691543579102, "learning_rate": 4.835584659880095e-06, "loss": 3.2837, "step": 11500 }, { "epoch": 0.117034912109375, "grad_norm": 10.834155082702637, "learning_rate": 4.835442008055459e-06, "loss": 3.1802, "step": 11505 }, { "epoch": 0.11708577473958333, "grad_norm": 13.873346328735352, "learning_rate": 4.835299296479093e-06, "loss": 3.8511, "step": 11510 }, { "epoch": 0.11713663736979167, "grad_norm": 10.343167304992676, "learning_rate": 4.835156525154648e-06, "loss": 3.5516, "step": 11515 }, { "epoch": 0.1171875, "grad_norm": 13.721023559570312, "learning_rate": 4.8350136940857775e-06, "loss": 3.4011, "step": 11520 }, { "epoch": 0.11723836263020833, "grad_norm": 15.717689514160156, "learning_rate": 4.834870803276134e-06, "loss": 3.5618, "step": 11525 }, { "epoch": 0.11728922526041667, "grad_norm": 16.386823654174805, "learning_rate": 4.834727852729375e-06, "loss": 3.3112, "step": 11530 }, { "epoch": 0.117340087890625, "grad_norm": 15.418145179748535, "learning_rate": 4.834584842449158e-06, "loss": 4.0253, "step": 11535 }, { "epoch": 0.11739095052083333, "grad_norm": 8.149100303649902, "learning_rate": 4.83444177243914e-06, "loss": 3.5978, "step": 11540 }, { "epoch": 0.11744181315104167, "grad_norm": 8.802492141723633, "learning_rate": 4.834298642702983e-06, "loss": 3.931, "step": 11545 }, { "epoch": 0.11749267578125, "grad_norm": 10.84029483795166, "learning_rate": 4.834155453244348e-06, "loss": 3.4902, "step": 11550 }, { "epoch": 0.11754353841145833, "grad_norm": 13.952017784118652, "learning_rate": 4.8340122040669e-06, "loss": 3.3633, "step": 11555 }, { "epoch": 0.11759440104166667, "grad_norm": 9.742794036865234, "learning_rate": 4.833868895174303e-06, "loss": 3.46, "step": 11560 }, { "epoch": 0.117645263671875, "grad_norm": 9.079483032226562, "learning_rate": 4.833725526570223e-06, "loss": 3.2567, "step": 11565 }, { "epoch": 0.11769612630208333, "grad_norm": 11.4492826461792, "learning_rate": 4.833582098258328e-06, "loss": 3.332, "step": 11570 }, { "epoch": 0.11774698893229167, "grad_norm": 13.898246765136719, "learning_rate": 4.833438610242289e-06, "loss": 3.6246, "step": 11575 }, { "epoch": 0.1177978515625, "grad_norm": 18.6462459564209, "learning_rate": 4.833295062525775e-06, "loss": 4.2665, "step": 11580 }, { "epoch": 0.11784871419270833, "grad_norm": 15.118053436279297, "learning_rate": 4.833151455112462e-06, "loss": 3.4488, "step": 11585 }, { "epoch": 0.11789957682291667, "grad_norm": 12.25283432006836, "learning_rate": 4.833007788006021e-06, "loss": 3.3655, "step": 11590 }, { "epoch": 0.117950439453125, "grad_norm": 12.915300369262695, "learning_rate": 4.832864061210128e-06, "loss": 3.2749, "step": 11595 }, { "epoch": 0.11800130208333333, "grad_norm": 13.077988624572754, "learning_rate": 4.832720274728462e-06, "loss": 3.7166, "step": 11600 }, { "epoch": 0.11805216471354167, "grad_norm": 12.226242065429688, "learning_rate": 4.8325764285647e-06, "loss": 3.4749, "step": 11605 }, { "epoch": 0.11810302734375, "grad_norm": 12.33906078338623, "learning_rate": 4.832432522722523e-06, "loss": 3.8167, "step": 11610 }, { "epoch": 0.11815388997395833, "grad_norm": 18.989513397216797, "learning_rate": 4.832288557205612e-06, "loss": 3.2801, "step": 11615 }, { "epoch": 0.11820475260416667, "grad_norm": 13.292867660522461, "learning_rate": 4.832144532017653e-06, "loss": 3.4304, "step": 11620 }, { "epoch": 0.118255615234375, "grad_norm": 13.21804141998291, "learning_rate": 4.832000447162328e-06, "loss": 3.1607, "step": 11625 }, { "epoch": 0.11830647786458333, "grad_norm": 9.76976203918457, "learning_rate": 4.8318563026433244e-06, "loss": 3.6529, "step": 11630 }, { "epoch": 0.11835734049479167, "grad_norm": 14.536739349365234, "learning_rate": 4.831712098464329e-06, "loss": 3.216, "step": 11635 }, { "epoch": 0.118408203125, "grad_norm": 7.9985880851745605, "learning_rate": 4.831567834629033e-06, "loss": 3.0674, "step": 11640 }, { "epoch": 0.11845906575520833, "grad_norm": 11.221656799316406, "learning_rate": 4.831423511141127e-06, "loss": 3.3163, "step": 11645 }, { "epoch": 0.11850992838541667, "grad_norm": 16.915536880493164, "learning_rate": 4.831279128004303e-06, "loss": 3.6208, "step": 11650 }, { "epoch": 0.118560791015625, "grad_norm": 11.9306058883667, "learning_rate": 4.831134685222255e-06, "loss": 3.6851, "step": 11655 }, { "epoch": 0.11861165364583333, "grad_norm": 10.024066925048828, "learning_rate": 4.8309901827986785e-06, "loss": 3.4386, "step": 11660 }, { "epoch": 0.11866251627604167, "grad_norm": 14.097890853881836, "learning_rate": 4.83084562073727e-06, "loss": 3.3924, "step": 11665 }, { "epoch": 0.11871337890625, "grad_norm": 17.218984603881836, "learning_rate": 4.83070099904173e-06, "loss": 3.3288, "step": 11670 }, { "epoch": 0.11876424153645833, "grad_norm": 9.430810928344727, "learning_rate": 4.830556317715757e-06, "loss": 3.1723, "step": 11675 }, { "epoch": 0.11881510416666667, "grad_norm": 16.22243309020996, "learning_rate": 4.830411576763052e-06, "loss": 3.2928, "step": 11680 }, { "epoch": 0.118865966796875, "grad_norm": 14.999611854553223, "learning_rate": 4.83026677618732e-06, "loss": 3.5795, "step": 11685 }, { "epoch": 0.11891682942708333, "grad_norm": 13.795806884765625, "learning_rate": 4.830121915992265e-06, "loss": 3.6035, "step": 11690 }, { "epoch": 0.11896769205729167, "grad_norm": 11.390329360961914, "learning_rate": 4.829976996181593e-06, "loss": 3.6227, "step": 11695 }, { "epoch": 0.1190185546875, "grad_norm": 13.322566986083984, "learning_rate": 4.829832016759012e-06, "loss": 3.2774, "step": 11700 }, { "epoch": 0.11906941731770833, "grad_norm": 13.909505844116211, "learning_rate": 4.829686977728231e-06, "loss": 3.3357, "step": 11705 }, { "epoch": 0.11912027994791667, "grad_norm": 12.06095027923584, "learning_rate": 4.82954187909296e-06, "loss": 3.9097, "step": 11710 }, { "epoch": 0.119171142578125, "grad_norm": 11.268613815307617, "learning_rate": 4.829396720856913e-06, "loss": 3.2764, "step": 11715 }, { "epoch": 0.11922200520833333, "grad_norm": 8.79672622680664, "learning_rate": 4.829251503023803e-06, "loss": 3.587, "step": 11720 }, { "epoch": 0.11927286783854167, "grad_norm": 15.61196231842041, "learning_rate": 4.8291062255973455e-06, "loss": 3.2073, "step": 11725 }, { "epoch": 0.11932373046875, "grad_norm": 8.96154499053955, "learning_rate": 4.828960888581256e-06, "loss": 3.4395, "step": 11730 }, { "epoch": 0.11937459309895833, "grad_norm": 13.609565734863281, "learning_rate": 4.828815491979256e-06, "loss": 3.3523, "step": 11735 }, { "epoch": 0.11942545572916667, "grad_norm": 8.735452651977539, "learning_rate": 4.828670035795063e-06, "loss": 3.3582, "step": 11740 }, { "epoch": 0.119476318359375, "grad_norm": 13.924631118774414, "learning_rate": 4.828524520032399e-06, "loss": 3.3311, "step": 11745 }, { "epoch": 0.11952718098958333, "grad_norm": 13.989679336547852, "learning_rate": 4.828378944694987e-06, "loss": 3.1395, "step": 11750 }, { "epoch": 0.11957804361979167, "grad_norm": 17.6063289642334, "learning_rate": 4.828233309786552e-06, "loss": 3.2459, "step": 11755 }, { "epoch": 0.11962890625, "grad_norm": 14.077485084533691, "learning_rate": 4.828087615310819e-06, "loss": 3.5539, "step": 11760 }, { "epoch": 0.11967976888020833, "grad_norm": 11.900781631469727, "learning_rate": 4.8279418612715165e-06, "loss": 3.3846, "step": 11765 }, { "epoch": 0.11973063151041667, "grad_norm": 9.877041816711426, "learning_rate": 4.8277960476723726e-06, "loss": 3.7396, "step": 11770 }, { "epoch": 0.119781494140625, "grad_norm": 16.47829818725586, "learning_rate": 4.82765017451712e-06, "loss": 3.1649, "step": 11775 }, { "epoch": 0.11983235677083333, "grad_norm": 12.586395263671875, "learning_rate": 4.827504241809488e-06, "loss": 3.513, "step": 11780 }, { "epoch": 0.11988321940104167, "grad_norm": 12.90040111541748, "learning_rate": 4.827358249553213e-06, "loss": 3.5154, "step": 11785 }, { "epoch": 0.11993408203125, "grad_norm": 7.8959503173828125, "learning_rate": 4.8272121977520266e-06, "loss": 3.5876, "step": 11790 }, { "epoch": 0.11998494466145833, "grad_norm": 14.199788093566895, "learning_rate": 4.82706608640967e-06, "loss": 3.3843, "step": 11795 }, { "epoch": 0.12003580729166667, "grad_norm": 14.042448043823242, "learning_rate": 4.826919915529878e-06, "loss": 2.9448, "step": 11800 }, { "epoch": 0.120086669921875, "grad_norm": 14.99729061126709, "learning_rate": 4.826773685116392e-06, "loss": 3.3641, "step": 11805 }, { "epoch": 0.12013753255208333, "grad_norm": 8.840888023376465, "learning_rate": 4.826627395172952e-06, "loss": 3.1467, "step": 11810 }, { "epoch": 0.12018839518229167, "grad_norm": 8.198177337646484, "learning_rate": 4.8264810457033025e-06, "loss": 3.5442, "step": 11815 }, { "epoch": 0.1202392578125, "grad_norm": 10.527442932128906, "learning_rate": 4.826334636711186e-06, "loss": 3.431, "step": 11820 }, { "epoch": 0.12029012044270833, "grad_norm": 11.18076229095459, "learning_rate": 4.82618816820035e-06, "loss": 3.2887, "step": 11825 }, { "epoch": 0.12034098307291667, "grad_norm": 13.812418937683105, "learning_rate": 4.826041640174542e-06, "loss": 3.5518, "step": 11830 }, { "epoch": 0.120391845703125, "grad_norm": 17.44132423400879, "learning_rate": 4.825895052637508e-06, "loss": 3.629, "step": 11835 }, { "epoch": 0.12044270833333333, "grad_norm": 13.55534553527832, "learning_rate": 4.825748405593001e-06, "loss": 3.4921, "step": 11840 }, { "epoch": 0.12049357096354167, "grad_norm": 10.288969993591309, "learning_rate": 4.825601699044773e-06, "loss": 3.876, "step": 11845 }, { "epoch": 0.12054443359375, "grad_norm": 15.818678855895996, "learning_rate": 4.825454932996576e-06, "loss": 3.2189, "step": 11850 }, { "epoch": 0.12059529622395833, "grad_norm": 8.30019474029541, "learning_rate": 4.825308107452166e-06, "loss": 3.3629, "step": 11855 }, { "epoch": 0.12064615885416667, "grad_norm": 8.714160919189453, "learning_rate": 4.825161222415299e-06, "loss": 3.2909, "step": 11860 }, { "epoch": 0.120697021484375, "grad_norm": 8.198768615722656, "learning_rate": 4.825014277889733e-06, "loss": 3.5741, "step": 11865 }, { "epoch": 0.12074788411458333, "grad_norm": 14.308550834655762, "learning_rate": 4.824867273879229e-06, "loss": 3.5978, "step": 11870 }, { "epoch": 0.12079874674479167, "grad_norm": 11.020129203796387, "learning_rate": 4.8247202103875455e-06, "loss": 3.3011, "step": 11875 }, { "epoch": 0.120849609375, "grad_norm": 12.168553352355957, "learning_rate": 4.824573087418447e-06, "loss": 3.4922, "step": 11880 }, { "epoch": 0.12090047200520833, "grad_norm": 8.048748016357422, "learning_rate": 4.824425904975697e-06, "loss": 3.7659, "step": 11885 }, { "epoch": 0.12095133463541667, "grad_norm": 13.554608345031738, "learning_rate": 4.8242786630630615e-06, "loss": 3.2647, "step": 11890 }, { "epoch": 0.121002197265625, "grad_norm": 14.179122924804688, "learning_rate": 4.824131361684308e-06, "loss": 3.3172, "step": 11895 }, { "epoch": 0.12105305989583333, "grad_norm": 17.352275848388672, "learning_rate": 4.823984000843203e-06, "loss": 3.9405, "step": 11900 }, { "epoch": 0.12110392252604167, "grad_norm": 7.687533378601074, "learning_rate": 4.823836580543519e-06, "loss": 3.6776, "step": 11905 }, { "epoch": 0.12115478515625, "grad_norm": 18.918060302734375, "learning_rate": 4.823689100789026e-06, "loss": 3.6594, "step": 11910 }, { "epoch": 0.12120564778645833, "grad_norm": 16.174633026123047, "learning_rate": 4.823541561583499e-06, "loss": 3.4054, "step": 11915 }, { "epoch": 0.12125651041666667, "grad_norm": 12.678868293762207, "learning_rate": 4.8233939629307115e-06, "loss": 3.4099, "step": 11920 }, { "epoch": 0.121307373046875, "grad_norm": 12.389989852905273, "learning_rate": 4.82324630483444e-06, "loss": 3.6306, "step": 11925 }, { "epoch": 0.12135823567708333, "grad_norm": 8.188562393188477, "learning_rate": 4.823098587298463e-06, "loss": 3.2383, "step": 11930 }, { "epoch": 0.12140909830729167, "grad_norm": 8.033101081848145, "learning_rate": 4.8229508103265595e-06, "loss": 3.322, "step": 11935 }, { "epoch": 0.1214599609375, "grad_norm": 11.021990776062012, "learning_rate": 4.822802973922509e-06, "loss": 3.704, "step": 11940 }, { "epoch": 0.12151082356770833, "grad_norm": 6.139153003692627, "learning_rate": 4.822655078090096e-06, "loss": 3.2763, "step": 11945 }, { "epoch": 0.12156168619791667, "grad_norm": 13.162734985351562, "learning_rate": 4.822507122833104e-06, "loss": 3.8643, "step": 11950 }, { "epoch": 0.121612548828125, "grad_norm": 11.088370323181152, "learning_rate": 4.8223591081553154e-06, "loss": 3.5046, "step": 11955 }, { "epoch": 0.12166341145833333, "grad_norm": 13.00243091583252, "learning_rate": 4.822211034060521e-06, "loss": 4.2186, "step": 11960 }, { "epoch": 0.12171427408854167, "grad_norm": 11.786962509155273, "learning_rate": 4.822062900552507e-06, "loss": 3.4922, "step": 11965 }, { "epoch": 0.12176513671875, "grad_norm": 8.783388137817383, "learning_rate": 4.821914707635065e-06, "loss": 3.3294, "step": 11970 }, { "epoch": 0.12181599934895833, "grad_norm": 9.021117210388184, "learning_rate": 4.821766455311986e-06, "loss": 3.4608, "step": 11975 }, { "epoch": 0.12186686197916667, "grad_norm": 11.563819885253906, "learning_rate": 4.821618143587062e-06, "loss": 3.4183, "step": 11980 }, { "epoch": 0.121917724609375, "grad_norm": 7.746058464050293, "learning_rate": 4.821469772464087e-06, "loss": 3.0433, "step": 11985 }, { "epoch": 0.12196858723958333, "grad_norm": 16.20210838317871, "learning_rate": 4.821321341946859e-06, "loss": 3.6232, "step": 11990 }, { "epoch": 0.12201944986979167, "grad_norm": 14.226766586303711, "learning_rate": 4.821172852039175e-06, "loss": 3.5745, "step": 11995 }, { "epoch": 0.1220703125, "grad_norm": 13.494339942932129, "learning_rate": 4.821024302744834e-06, "loss": 3.5401, "step": 12000 }, { "epoch": 0.12212117513020833, "grad_norm": 9.108319282531738, "learning_rate": 4.820875694067635e-06, "loss": 3.2929, "step": 12005 }, { "epoch": 0.12217203776041667, "grad_norm": 9.119816780090332, "learning_rate": 4.820727026011382e-06, "loss": 2.9584, "step": 12010 }, { "epoch": 0.122222900390625, "grad_norm": 14.945995330810547, "learning_rate": 4.820578298579879e-06, "loss": 3.4583, "step": 12015 }, { "epoch": 0.12227376302083333, "grad_norm": 12.933895111083984, "learning_rate": 4.820429511776929e-06, "loss": 3.5372, "step": 12020 }, { "epoch": 0.12232462565104167, "grad_norm": 14.16519546508789, "learning_rate": 4.820280665606341e-06, "loss": 3.3085, "step": 12025 }, { "epoch": 0.12237548828125, "grad_norm": 14.623350143432617, "learning_rate": 4.820131760071921e-06, "loss": 2.9811, "step": 12030 }, { "epoch": 0.12242635091145833, "grad_norm": 13.574602127075195, "learning_rate": 4.8199827951774805e-06, "loss": 3.5069, "step": 12035 }, { "epoch": 0.12247721354166667, "grad_norm": 15.912371635437012, "learning_rate": 4.8198337709268305e-06, "loss": 3.6601, "step": 12040 }, { "epoch": 0.122528076171875, "grad_norm": 15.12542724609375, "learning_rate": 4.819684687323783e-06, "loss": 3.3761, "step": 12045 }, { "epoch": 0.12257893880208333, "grad_norm": 8.16356372833252, "learning_rate": 4.819535544372153e-06, "loss": 3.474, "step": 12050 }, { "epoch": 0.12262980143229167, "grad_norm": 9.369928359985352, "learning_rate": 4.819386342075755e-06, "loss": 3.548, "step": 12055 }, { "epoch": 0.1226806640625, "grad_norm": 16.48990249633789, "learning_rate": 4.8192370804384075e-06, "loss": 3.5765, "step": 12060 }, { "epoch": 0.12273152669270833, "grad_norm": 14.442497253417969, "learning_rate": 4.819087759463929e-06, "loss": 3.3908, "step": 12065 }, { "epoch": 0.12278238932291667, "grad_norm": 15.705107688903809, "learning_rate": 4.81893837915614e-06, "loss": 3.6492, "step": 12070 }, { "epoch": 0.122833251953125, "grad_norm": 15.82107162475586, "learning_rate": 4.818788939518863e-06, "loss": 3.2617, "step": 12075 }, { "epoch": 0.12288411458333333, "grad_norm": 16.034608840942383, "learning_rate": 4.8186394405559186e-06, "loss": 3.2683, "step": 12080 }, { "epoch": 0.12293497721354167, "grad_norm": 10.579744338989258, "learning_rate": 4.818489882271135e-06, "loss": 3.2758, "step": 12085 }, { "epoch": 0.12298583984375, "grad_norm": 12.461812019348145, "learning_rate": 4.818340264668337e-06, "loss": 3.2529, "step": 12090 }, { "epoch": 0.12303670247395833, "grad_norm": 17.07436752319336, "learning_rate": 4.8181905877513535e-06, "loss": 3.7906, "step": 12095 }, { "epoch": 0.12308756510416667, "grad_norm": 11.529945373535156, "learning_rate": 4.818040851524013e-06, "loss": 3.5339, "step": 12100 }, { "epoch": 0.123138427734375, "grad_norm": 15.79837417602539, "learning_rate": 4.817891055990146e-06, "loss": 3.4221, "step": 12105 }, { "epoch": 0.12318929036458333, "grad_norm": 15.319779396057129, "learning_rate": 4.817741201153587e-06, "loss": 3.4168, "step": 12110 }, { "epoch": 0.12324015299479167, "grad_norm": 13.032570838928223, "learning_rate": 4.817591287018168e-06, "loss": 3.5855, "step": 12115 }, { "epoch": 0.123291015625, "grad_norm": 10.453932762145996, "learning_rate": 4.817441313587725e-06, "loss": 3.3555, "step": 12120 }, { "epoch": 0.12334187825520833, "grad_norm": 15.642955780029297, "learning_rate": 4.817291280866096e-06, "loss": 3.5448, "step": 12125 }, { "epoch": 0.12339274088541667, "grad_norm": 10.044322967529297, "learning_rate": 4.8171411888571185e-06, "loss": 3.3746, "step": 12130 }, { "epoch": 0.123443603515625, "grad_norm": 11.619243621826172, "learning_rate": 4.816991037564632e-06, "loss": 3.6049, "step": 12135 }, { "epoch": 0.12349446614583333, "grad_norm": 9.354294776916504, "learning_rate": 4.81684082699248e-06, "loss": 3.5606, "step": 12140 }, { "epoch": 0.12354532877604167, "grad_norm": 15.51186752319336, "learning_rate": 4.816690557144505e-06, "loss": 3.6708, "step": 12145 }, { "epoch": 0.12359619140625, "grad_norm": 13.874622344970703, "learning_rate": 4.816540228024551e-06, "loss": 3.5003, "step": 12150 }, { "epoch": 0.12364705403645833, "grad_norm": 11.789558410644531, "learning_rate": 4.816389839636463e-06, "loss": 3.2361, "step": 12155 }, { "epoch": 0.12369791666666667, "grad_norm": 14.9029541015625, "learning_rate": 4.816239391984091e-06, "loss": 3.8518, "step": 12160 }, { "epoch": 0.123748779296875, "grad_norm": 16.188743591308594, "learning_rate": 4.8160888850712835e-06, "loss": 3.4669, "step": 12165 }, { "epoch": 0.12379964192708333, "grad_norm": 16.114046096801758, "learning_rate": 4.81593831890189e-06, "loss": 3.329, "step": 12170 }, { "epoch": 0.12385050455729167, "grad_norm": 9.05451774597168, "learning_rate": 4.815787693479764e-06, "loss": 3.1477, "step": 12175 }, { "epoch": 0.1239013671875, "grad_norm": 13.087759971618652, "learning_rate": 4.815637008808759e-06, "loss": 3.4289, "step": 12180 }, { "epoch": 0.12395222981770833, "grad_norm": 11.878509521484375, "learning_rate": 4.81548626489273e-06, "loss": 3.3253, "step": 12185 }, { "epoch": 0.12400309244791667, "grad_norm": 23.86294937133789, "learning_rate": 4.815335461735534e-06, "loss": 3.7135, "step": 12190 }, { "epoch": 0.124053955078125, "grad_norm": 12.333020210266113, "learning_rate": 4.815184599341029e-06, "loss": 3.5888, "step": 12195 }, { "epoch": 0.12410481770833333, "grad_norm": 11.767260551452637, "learning_rate": 4.8150336777130736e-06, "loss": 3.2373, "step": 12200 }, { "epoch": 0.12415568033854167, "grad_norm": 11.746440887451172, "learning_rate": 4.8148826968555306e-06, "loss": 3.3153, "step": 12205 }, { "epoch": 0.12420654296875, "grad_norm": 12.361739158630371, "learning_rate": 4.814731656772263e-06, "loss": 3.4058, "step": 12210 }, { "epoch": 0.12425740559895833, "grad_norm": 10.730027198791504, "learning_rate": 4.8145805574671346e-06, "loss": 3.4855, "step": 12215 }, { "epoch": 0.12430826822916667, "grad_norm": 16.28338623046875, "learning_rate": 4.814429398944011e-06, "loss": 3.5275, "step": 12220 }, { "epoch": 0.124359130859375, "grad_norm": 16.588144302368164, "learning_rate": 4.81427818120676e-06, "loss": 3.6741, "step": 12225 }, { "epoch": 0.12440999348958333, "grad_norm": 15.339471817016602, "learning_rate": 4.81412690425925e-06, "loss": 3.9718, "step": 12230 }, { "epoch": 0.12446085611979167, "grad_norm": 8.622764587402344, "learning_rate": 4.8139755681053526e-06, "loss": 3.5536, "step": 12235 }, { "epoch": 0.12451171875, "grad_norm": 13.108068466186523, "learning_rate": 4.813824172748938e-06, "loss": 3.283, "step": 12240 }, { "epoch": 0.12456258138020833, "grad_norm": 12.999624252319336, "learning_rate": 4.8136727181938804e-06, "loss": 3.7082, "step": 12245 }, { "epoch": 0.12461344401041667, "grad_norm": 9.05246639251709, "learning_rate": 4.813521204444055e-06, "loss": 3.655, "step": 12250 }, { "epoch": 0.124664306640625, "grad_norm": 11.840641021728516, "learning_rate": 4.8133696315033375e-06, "loss": 3.4432, "step": 12255 }, { "epoch": 0.12471516927083333, "grad_norm": 12.203230857849121, "learning_rate": 4.813217999375606e-06, "loss": 3.1917, "step": 12260 }, { "epoch": 0.12476603190104167, "grad_norm": 10.946849822998047, "learning_rate": 4.813066308064741e-06, "loss": 3.2396, "step": 12265 }, { "epoch": 0.12481689453125, "grad_norm": 9.767146110534668, "learning_rate": 4.812914557574622e-06, "loss": 3.337, "step": 12270 }, { "epoch": 0.12486775716145833, "grad_norm": 10.88211727142334, "learning_rate": 4.8127627479091336e-06, "loss": 3.2582, "step": 12275 }, { "epoch": 0.12491861979166667, "grad_norm": 9.982248306274414, "learning_rate": 4.812610879072157e-06, "loss": 3.2663, "step": 12280 }, { "epoch": 0.124969482421875, "grad_norm": 8.785655975341797, "learning_rate": 4.8124589510675805e-06, "loss": 3.1978, "step": 12285 }, { "epoch": 0.12502034505208334, "grad_norm": 13.370884895324707, "learning_rate": 4.812306963899289e-06, "loss": 3.5219, "step": 12290 }, { "epoch": 0.12507120768229166, "grad_norm": 14.19118595123291, "learning_rate": 4.812154917571172e-06, "loss": 3.1948, "step": 12295 }, { "epoch": 0.1251220703125, "grad_norm": 15.058856010437012, "learning_rate": 4.81200281208712e-06, "loss": 3.4093, "step": 12300 }, { "epoch": 0.12517293294270834, "grad_norm": 17.146102905273438, "learning_rate": 4.811850647451024e-06, "loss": 3.3073, "step": 12305 }, { "epoch": 0.12522379557291666, "grad_norm": 10.474042892456055, "learning_rate": 4.811698423666777e-06, "loss": 3.3589, "step": 12310 }, { "epoch": 0.125274658203125, "grad_norm": 12.146330833435059, "learning_rate": 4.811546140738273e-06, "loss": 3.4111, "step": 12315 }, { "epoch": 0.12532552083333334, "grad_norm": 8.893861770629883, "learning_rate": 4.811393798669409e-06, "loss": 3.4912, "step": 12320 }, { "epoch": 0.12537638346354166, "grad_norm": 14.769695281982422, "learning_rate": 4.811241397464083e-06, "loss": 3.459, "step": 12325 }, { "epoch": 0.12542724609375, "grad_norm": 16.10647964477539, "learning_rate": 4.811088937126194e-06, "loss": 3.3882, "step": 12330 }, { "epoch": 0.12547810872395834, "grad_norm": 14.999397277832031, "learning_rate": 4.8109364176596416e-06, "loss": 3.3708, "step": 12335 }, { "epoch": 0.12552897135416666, "grad_norm": 11.39329719543457, "learning_rate": 4.810783839068329e-06, "loss": 3.5732, "step": 12340 }, { "epoch": 0.125579833984375, "grad_norm": 10.63199234008789, "learning_rate": 4.81063120135616e-06, "loss": 3.5412, "step": 12345 }, { "epoch": 0.12563069661458334, "grad_norm": 9.905284881591797, "learning_rate": 4.81047850452704e-06, "loss": 3.1996, "step": 12350 }, { "epoch": 0.12568155924479166, "grad_norm": 15.250255584716797, "learning_rate": 4.810325748584873e-06, "loss": 3.3639, "step": 12355 }, { "epoch": 0.125732421875, "grad_norm": 12.014054298400879, "learning_rate": 4.8101729335335716e-06, "loss": 3.2804, "step": 12360 }, { "epoch": 0.12578328450520834, "grad_norm": 15.268871307373047, "learning_rate": 4.810020059377042e-06, "loss": 3.4151, "step": 12365 }, { "epoch": 0.12583414713541666, "grad_norm": 15.318325996398926, "learning_rate": 4.809867126119197e-06, "loss": 3.6018, "step": 12370 }, { "epoch": 0.125885009765625, "grad_norm": 13.372442245483398, "learning_rate": 4.8097141337639485e-06, "loss": 3.8433, "step": 12375 }, { "epoch": 0.12593587239583334, "grad_norm": 15.433815956115723, "learning_rate": 4.809561082315212e-06, "loss": 3.2409, "step": 12380 }, { "epoch": 0.12598673502604166, "grad_norm": 9.874568939208984, "learning_rate": 4.809407971776902e-06, "loss": 3.5737, "step": 12385 }, { "epoch": 0.12603759765625, "grad_norm": 13.617438316345215, "learning_rate": 4.809254802152937e-06, "loss": 3.598, "step": 12390 }, { "epoch": 0.12608846028645834, "grad_norm": 17.306371688842773, "learning_rate": 4.809101573447236e-06, "loss": 3.9022, "step": 12395 }, { "epoch": 0.12613932291666666, "grad_norm": 12.497845649719238, "learning_rate": 4.808948285663717e-06, "loss": 3.292, "step": 12400 }, { "epoch": 0.126190185546875, "grad_norm": 11.501578330993652, "learning_rate": 4.808794938806305e-06, "loss": 3.4564, "step": 12405 }, { "epoch": 0.12624104817708334, "grad_norm": 15.333441734313965, "learning_rate": 4.808641532878921e-06, "loss": 3.257, "step": 12410 }, { "epoch": 0.12629191080729166, "grad_norm": 9.874608993530273, "learning_rate": 4.80848806788549e-06, "loss": 3.2448, "step": 12415 }, { "epoch": 0.1263427734375, "grad_norm": 9.775317192077637, "learning_rate": 4.808334543829939e-06, "loss": 3.4979, "step": 12420 }, { "epoch": 0.12639363606770834, "grad_norm": 9.998175621032715, "learning_rate": 4.808180960716196e-06, "loss": 3.1983, "step": 12425 }, { "epoch": 0.12644449869791666, "grad_norm": 10.001486778259277, "learning_rate": 4.808027318548191e-06, "loss": 3.2788, "step": 12430 }, { "epoch": 0.126495361328125, "grad_norm": 15.605623245239258, "learning_rate": 4.807873617329854e-06, "loss": 3.7982, "step": 12435 }, { "epoch": 0.12654622395833334, "grad_norm": 16.90433692932129, "learning_rate": 4.807719857065117e-06, "loss": 3.7252, "step": 12440 }, { "epoch": 0.12659708658854166, "grad_norm": 9.86048412322998, "learning_rate": 4.807566037757914e-06, "loss": 3.3411, "step": 12445 }, { "epoch": 0.12664794921875, "grad_norm": 9.026899337768555, "learning_rate": 4.807412159412181e-06, "loss": 3.1362, "step": 12450 }, { "epoch": 0.12669881184895834, "grad_norm": 11.268509864807129, "learning_rate": 4.807258222031855e-06, "loss": 3.2253, "step": 12455 }, { "epoch": 0.12674967447916666, "grad_norm": 14.036666870117188, "learning_rate": 4.807104225620875e-06, "loss": 3.1731, "step": 12460 }, { "epoch": 0.126800537109375, "grad_norm": 9.418083190917969, "learning_rate": 4.8069501701831795e-06, "loss": 3.1714, "step": 12465 }, { "epoch": 0.12685139973958334, "grad_norm": 12.404669761657715, "learning_rate": 4.8067960557227114e-06, "loss": 3.6047, "step": 12470 }, { "epoch": 0.12690226236979166, "grad_norm": 12.391560554504395, "learning_rate": 4.806641882243412e-06, "loss": 3.4368, "step": 12475 }, { "epoch": 0.126953125, "grad_norm": 13.154583930969238, "learning_rate": 4.806487649749228e-06, "loss": 4.0829, "step": 12480 }, { "epoch": 0.12700398763020834, "grad_norm": 13.884321212768555, "learning_rate": 4.806333358244103e-06, "loss": 3.1453, "step": 12485 }, { "epoch": 0.12705485026041666, "grad_norm": 11.85291862487793, "learning_rate": 4.806179007731986e-06, "loss": 3.4844, "step": 12490 }, { "epoch": 0.127105712890625, "grad_norm": 12.22746753692627, "learning_rate": 4.806024598216826e-06, "loss": 3.7238, "step": 12495 }, { "epoch": 0.12715657552083334, "grad_norm": 14.834999084472656, "learning_rate": 4.805870129702573e-06, "loss": 3.7607, "step": 12500 }, { "epoch": 0.12720743815104166, "grad_norm": 18.713239669799805, "learning_rate": 4.8057156021931795e-06, "loss": 3.5314, "step": 12505 }, { "epoch": 0.12725830078125, "grad_norm": 15.553114891052246, "learning_rate": 4.8055610156925984e-06, "loss": 3.9413, "step": 12510 }, { "epoch": 0.12730916341145834, "grad_norm": 12.916542053222656, "learning_rate": 4.805406370204785e-06, "loss": 3.3368, "step": 12515 }, { "epoch": 0.12736002604166666, "grad_norm": 13.694195747375488, "learning_rate": 4.805251665733696e-06, "loss": 3.2709, "step": 12520 }, { "epoch": 0.127410888671875, "grad_norm": 10.602334976196289, "learning_rate": 4.805096902283291e-06, "loss": 3.3952, "step": 12525 }, { "epoch": 0.12746175130208334, "grad_norm": 14.200652122497559, "learning_rate": 4.804942079857527e-06, "loss": 3.5215, "step": 12530 }, { "epoch": 0.12751261393229166, "grad_norm": 7.785111427307129, "learning_rate": 4.804787198460366e-06, "loss": 3.3147, "step": 12535 }, { "epoch": 0.1275634765625, "grad_norm": 12.794201850891113, "learning_rate": 4.804632258095772e-06, "loss": 3.4677, "step": 12540 }, { "epoch": 0.12761433919270834, "grad_norm": 12.663290977478027, "learning_rate": 4.804477258767707e-06, "loss": 3.6271, "step": 12545 }, { "epoch": 0.12766520182291666, "grad_norm": 10.109106063842773, "learning_rate": 4.804322200480138e-06, "loss": 3.5064, "step": 12550 }, { "epoch": 0.127716064453125, "grad_norm": 15.36133861541748, "learning_rate": 4.804167083237031e-06, "loss": 3.4629, "step": 12555 }, { "epoch": 0.12776692708333334, "grad_norm": 12.881776809692383, "learning_rate": 4.804011907042356e-06, "loss": 3.4292, "step": 12560 }, { "epoch": 0.12781778971354166, "grad_norm": 8.816761016845703, "learning_rate": 4.8038566719000825e-06, "loss": 3.0355, "step": 12565 }, { "epoch": 0.12786865234375, "grad_norm": 15.458996772766113, "learning_rate": 4.803701377814181e-06, "loss": 4.0244, "step": 12570 }, { "epoch": 0.12791951497395834, "grad_norm": 9.55286979675293, "learning_rate": 4.803546024788628e-06, "loss": 3.4125, "step": 12575 }, { "epoch": 0.12797037760416666, "grad_norm": 15.313224792480469, "learning_rate": 4.803390612827394e-06, "loss": 3.4685, "step": 12580 }, { "epoch": 0.128021240234375, "grad_norm": 13.189021110534668, "learning_rate": 4.803235141934458e-06, "loss": 3.3217, "step": 12585 }, { "epoch": 0.12807210286458334, "grad_norm": 16.103500366210938, "learning_rate": 4.803079612113796e-06, "loss": 3.1009, "step": 12590 }, { "epoch": 0.12812296549479166, "grad_norm": 16.239477157592773, "learning_rate": 4.802924023369388e-06, "loss": 3.4193, "step": 12595 }, { "epoch": 0.128173828125, "grad_norm": 11.9988374710083, "learning_rate": 4.802768375705216e-06, "loss": 3.6531, "step": 12600 }, { "epoch": 0.12822469075520834, "grad_norm": 11.093536376953125, "learning_rate": 4.802612669125261e-06, "loss": 3.2401, "step": 12605 }, { "epoch": 0.12827555338541666, "grad_norm": 10.944393157958984, "learning_rate": 4.8024569036335055e-06, "loss": 3.2432, "step": 12610 }, { "epoch": 0.128326416015625, "grad_norm": 14.31828498840332, "learning_rate": 4.802301079233936e-06, "loss": 3.6411, "step": 12615 }, { "epoch": 0.12837727864583334, "grad_norm": 9.36100959777832, "learning_rate": 4.802145195930539e-06, "loss": 3.4192, "step": 12620 }, { "epoch": 0.12842814127604166, "grad_norm": 13.963410377502441, "learning_rate": 4.801989253727303e-06, "loss": 3.1725, "step": 12625 }, { "epoch": 0.12847900390625, "grad_norm": 16.763933181762695, "learning_rate": 4.801833252628218e-06, "loss": 3.3481, "step": 12630 }, { "epoch": 0.12852986653645834, "grad_norm": 12.801989555358887, "learning_rate": 4.801677192637275e-06, "loss": 3.5976, "step": 12635 }, { "epoch": 0.12858072916666666, "grad_norm": 81.20991516113281, "learning_rate": 4.801521073758466e-06, "loss": 3.3127, "step": 12640 }, { "epoch": 0.128631591796875, "grad_norm": 13.878232955932617, "learning_rate": 4.801364895995786e-06, "loss": 3.313, "step": 12645 }, { "epoch": 0.12868245442708334, "grad_norm": 10.5735445022583, "learning_rate": 4.8012086593532306e-06, "loss": 3.298, "step": 12650 }, { "epoch": 0.12873331705729166, "grad_norm": 10.607198715209961, "learning_rate": 4.8010523638347965e-06, "loss": 3.4635, "step": 12655 }, { "epoch": 0.1287841796875, "grad_norm": 10.28955078125, "learning_rate": 4.800896009444484e-06, "loss": 3.5608, "step": 12660 }, { "epoch": 0.12883504231770834, "grad_norm": 12.767330169677734, "learning_rate": 4.800739596186293e-06, "loss": 3.222, "step": 12665 }, { "epoch": 0.12888590494791666, "grad_norm": 15.003509521484375, "learning_rate": 4.800583124064223e-06, "loss": 3.2579, "step": 12670 }, { "epoch": 0.128936767578125, "grad_norm": 14.754562377929688, "learning_rate": 4.80042659308228e-06, "loss": 3.2074, "step": 12675 }, { "epoch": 0.12898763020833334, "grad_norm": 15.33542251586914, "learning_rate": 4.800270003244467e-06, "loss": 3.2303, "step": 12680 }, { "epoch": 0.12903849283854166, "grad_norm": 14.25710391998291, "learning_rate": 4.800113354554793e-06, "loss": 3.8232, "step": 12685 }, { "epoch": 0.12908935546875, "grad_norm": 16.291183471679688, "learning_rate": 4.799956647017262e-06, "loss": 3.3102, "step": 12690 }, { "epoch": 0.12914021809895834, "grad_norm": 13.827568054199219, "learning_rate": 4.799799880635887e-06, "loss": 3.8675, "step": 12695 }, { "epoch": 0.12919108072916666, "grad_norm": 13.914209365844727, "learning_rate": 4.799643055414677e-06, "loss": 3.0558, "step": 12700 }, { "epoch": 0.129241943359375, "grad_norm": 10.144859313964844, "learning_rate": 4.799486171357644e-06, "loss": 3.4235, "step": 12705 }, { "epoch": 0.12929280598958334, "grad_norm": 10.29310417175293, "learning_rate": 4.799329228468802e-06, "loss": 3.4347, "step": 12710 }, { "epoch": 0.12934366861979166, "grad_norm": 13.560129165649414, "learning_rate": 4.7991722267521665e-06, "loss": 3.3288, "step": 12715 }, { "epoch": 0.12939453125, "grad_norm": 13.470560073852539, "learning_rate": 4.799015166211756e-06, "loss": 3.2676, "step": 12720 }, { "epoch": 0.12944539388020834, "grad_norm": 12.207547187805176, "learning_rate": 4.798858046851587e-06, "loss": 3.0864, "step": 12725 }, { "epoch": 0.12949625651041666, "grad_norm": 15.827969551086426, "learning_rate": 4.798700868675679e-06, "loss": 3.3807, "step": 12730 }, { "epoch": 0.129547119140625, "grad_norm": 14.415461540222168, "learning_rate": 4.798543631688054e-06, "loss": 3.0016, "step": 12735 }, { "epoch": 0.12959798177083334, "grad_norm": 7.998185157775879, "learning_rate": 4.798386335892735e-06, "loss": 3.2422, "step": 12740 }, { "epoch": 0.12964884440104166, "grad_norm": 7.472536087036133, "learning_rate": 4.798228981293747e-06, "loss": 3.4627, "step": 12745 }, { "epoch": 0.12969970703125, "grad_norm": 7.803865909576416, "learning_rate": 4.798071567895115e-06, "loss": 3.4333, "step": 12750 }, { "epoch": 0.12975056966145834, "grad_norm": 11.206137657165527, "learning_rate": 4.797914095700867e-06, "loss": 3.2908, "step": 12755 }, { "epoch": 0.12980143229166666, "grad_norm": 13.730634689331055, "learning_rate": 4.797756564715031e-06, "loss": 3.4254, "step": 12760 }, { "epoch": 0.129852294921875, "grad_norm": 13.240988731384277, "learning_rate": 4.797598974941638e-06, "loss": 3.8927, "step": 12765 }, { "epoch": 0.12990315755208334, "grad_norm": 14.986083030700684, "learning_rate": 4.79744132638472e-06, "loss": 2.9281, "step": 12770 }, { "epoch": 0.12995402018229166, "grad_norm": 12.508495330810547, "learning_rate": 4.79728361904831e-06, "loss": 3.2039, "step": 12775 }, { "epoch": 0.1300048828125, "grad_norm": 13.931479454040527, "learning_rate": 4.797125852936444e-06, "loss": 3.3995, "step": 12780 }, { "epoch": 0.13005574544270834, "grad_norm": 15.499288558959961, "learning_rate": 4.796968028053156e-06, "loss": 3.5259, "step": 12785 }, { "epoch": 0.13010660807291666, "grad_norm": 15.569916725158691, "learning_rate": 4.796810144402486e-06, "loss": 3.5487, "step": 12790 }, { "epoch": 0.130157470703125, "grad_norm": 11.951336860656738, "learning_rate": 4.796652201988474e-06, "loss": 3.1815, "step": 12795 }, { "epoch": 0.13020833333333334, "grad_norm": 10.30989933013916, "learning_rate": 4.796494200815158e-06, "loss": 3.4612, "step": 12800 }, { "epoch": 0.13025919596354166, "grad_norm": 10.12043571472168, "learning_rate": 4.796336140886584e-06, "loss": 3.9442, "step": 12805 }, { "epoch": 0.13031005859375, "grad_norm": 16.965572357177734, "learning_rate": 4.796178022206793e-06, "loss": 3.603, "step": 12810 }, { "epoch": 0.13036092122395834, "grad_norm": 12.111398696899414, "learning_rate": 4.796019844779831e-06, "loss": 3.6743, "step": 12815 }, { "epoch": 0.13041178385416666, "grad_norm": 14.302387237548828, "learning_rate": 4.795861608609747e-06, "loss": 3.2853, "step": 12820 }, { "epoch": 0.130462646484375, "grad_norm": 12.947281837463379, "learning_rate": 4.795703313700587e-06, "loss": 3.3062, "step": 12825 }, { "epoch": 0.13051350911458334, "grad_norm": 12.033870697021484, "learning_rate": 4.795544960056402e-06, "loss": 3.1838, "step": 12830 }, { "epoch": 0.13056437174479166, "grad_norm": 9.891314506530762, "learning_rate": 4.7953865476812435e-06, "loss": 3.4533, "step": 12835 }, { "epoch": 0.130615234375, "grad_norm": 11.173048973083496, "learning_rate": 4.795228076579164e-06, "loss": 3.3019, "step": 12840 }, { "epoch": 0.13066609700520834, "grad_norm": 58.573089599609375, "learning_rate": 4.795069546754219e-06, "loss": 3.2771, "step": 12845 }, { "epoch": 0.13071695963541666, "grad_norm": 10.534260749816895, "learning_rate": 4.794910958210463e-06, "loss": 3.5155, "step": 12850 }, { "epoch": 0.130767822265625, "grad_norm": 10.294408798217773, "learning_rate": 4.7947523109519535e-06, "loss": 3.1176, "step": 12855 }, { "epoch": 0.13081868489583334, "grad_norm": 8.581644058227539, "learning_rate": 4.79459360498275e-06, "loss": 3.6988, "step": 12860 }, { "epoch": 0.13086954752604166, "grad_norm": 7.670634746551514, "learning_rate": 4.794434840306914e-06, "loss": 3.4395, "step": 12865 }, { "epoch": 0.13092041015625, "grad_norm": 8.824413299560547, "learning_rate": 4.794276016928506e-06, "loss": 3.7669, "step": 12870 }, { "epoch": 0.13097127278645834, "grad_norm": 16.448566436767578, "learning_rate": 4.794117134851589e-06, "loss": 3.6758, "step": 12875 }, { "epoch": 0.13102213541666666, "grad_norm": 17.62189483642578, "learning_rate": 4.79395819408023e-06, "loss": 3.4432, "step": 12880 }, { "epoch": 0.131072998046875, "grad_norm": 13.63729190826416, "learning_rate": 4.793799194618495e-06, "loss": 3.1668, "step": 12885 }, { "epoch": 0.13112386067708334, "grad_norm": 9.173359870910645, "learning_rate": 4.79364013647045e-06, "loss": 3.28, "step": 12890 }, { "epoch": 0.13117472330729166, "grad_norm": 10.53978157043457, "learning_rate": 4.793481019640166e-06, "loss": 3.2492, "step": 12895 }, { "epoch": 0.1312255859375, "grad_norm": 11.081829071044922, "learning_rate": 4.793321844131714e-06, "loss": 3.3987, "step": 12900 }, { "epoch": 0.13127644856770834, "grad_norm": 8.339278221130371, "learning_rate": 4.793162609949166e-06, "loss": 3.1725, "step": 12905 }, { "epoch": 0.13132731119791666, "grad_norm": 9.031716346740723, "learning_rate": 4.793003317096596e-06, "loss": 3.2547, "step": 12910 }, { "epoch": 0.131378173828125, "grad_norm": 12.638387680053711, "learning_rate": 4.79284396557808e-06, "loss": 3.367, "step": 12915 }, { "epoch": 0.13142903645833334, "grad_norm": 12.951153755187988, "learning_rate": 4.7926845553976945e-06, "loss": 3.2392, "step": 12920 }, { "epoch": 0.13147989908854166, "grad_norm": 15.467037200927734, "learning_rate": 4.792525086559518e-06, "loss": 3.5068, "step": 12925 }, { "epoch": 0.13153076171875, "grad_norm": 8.698431015014648, "learning_rate": 4.792365559067631e-06, "loss": 3.3362, "step": 12930 }, { "epoch": 0.13158162434895834, "grad_norm": 14.218523979187012, "learning_rate": 4.792205972926114e-06, "loss": 3.0097, "step": 12935 }, { "epoch": 0.13163248697916666, "grad_norm": 7.461666584014893, "learning_rate": 4.792046328139051e-06, "loss": 3.3224, "step": 12940 }, { "epoch": 0.131683349609375, "grad_norm": 15.739773750305176, "learning_rate": 4.791886624710525e-06, "loss": 3.8441, "step": 12945 }, { "epoch": 0.13173421223958334, "grad_norm": 10.866748809814453, "learning_rate": 4.791726862644623e-06, "loss": 3.861, "step": 12950 }, { "epoch": 0.13178507486979166, "grad_norm": 9.433417320251465, "learning_rate": 4.791567041945433e-06, "loss": 3.0838, "step": 12955 }, { "epoch": 0.1318359375, "grad_norm": 16.90796661376953, "learning_rate": 4.791407162617043e-06, "loss": 3.7086, "step": 12960 }, { "epoch": 0.13188680013020834, "grad_norm": 14.038589477539062, "learning_rate": 4.791247224663545e-06, "loss": 3.4241, "step": 12965 }, { "epoch": 0.13193766276041666, "grad_norm": 8.530431747436523, "learning_rate": 4.791087228089029e-06, "loss": 3.4557, "step": 12970 }, { "epoch": 0.131988525390625, "grad_norm": 16.40488624572754, "learning_rate": 4.790927172897589e-06, "loss": 3.7428, "step": 12975 }, { "epoch": 0.13203938802083334, "grad_norm": 10.48965072631836, "learning_rate": 4.790767059093321e-06, "loss": 3.8246, "step": 12980 }, { "epoch": 0.13209025065104166, "grad_norm": 11.309000968933105, "learning_rate": 4.79060688668032e-06, "loss": 3.3004, "step": 12985 }, { "epoch": 0.13214111328125, "grad_norm": 13.044082641601562, "learning_rate": 4.790446655662686e-06, "loss": 3.4936, "step": 12990 }, { "epoch": 0.13219197591145834, "grad_norm": 14.110028266906738, "learning_rate": 4.790286366044516e-06, "loss": 2.9904, "step": 12995 }, { "epoch": 0.13224283854166666, "grad_norm": 16.38824462890625, "learning_rate": 4.790126017829913e-06, "loss": 3.1172, "step": 13000 }, { "epoch": 0.132293701171875, "grad_norm": 12.276939392089844, "learning_rate": 4.789965611022977e-06, "loss": 3.588, "step": 13005 }, { "epoch": 0.13234456380208334, "grad_norm": 12.605525016784668, "learning_rate": 4.7898051456278155e-06, "loss": 3.1167, "step": 13010 }, { "epoch": 0.13239542643229166, "grad_norm": 10.54214096069336, "learning_rate": 4.7896446216485314e-06, "loss": 3.496, "step": 13015 }, { "epoch": 0.1324462890625, "grad_norm": 11.415157318115234, "learning_rate": 4.789484039089232e-06, "loss": 3.5018, "step": 13020 }, { "epoch": 0.13249715169270834, "grad_norm": 10.655110359191895, "learning_rate": 4.789323397954027e-06, "loss": 3.5284, "step": 13025 }, { "epoch": 0.13254801432291666, "grad_norm": 12.17111873626709, "learning_rate": 4.789162698247024e-06, "loss": 3.0777, "step": 13030 }, { "epoch": 0.132598876953125, "grad_norm": 14.725519180297852, "learning_rate": 4.789001939972338e-06, "loss": 3.358, "step": 13035 }, { "epoch": 0.13264973958333334, "grad_norm": 15.81676959991455, "learning_rate": 4.7888411231340785e-06, "loss": 3.2394, "step": 13040 }, { "epoch": 0.13270060221354166, "grad_norm": 9.43869686126709, "learning_rate": 4.788680247736362e-06, "loss": 3.3947, "step": 13045 }, { "epoch": 0.13275146484375, "grad_norm": 14.853139877319336, "learning_rate": 4.788519313783303e-06, "loss": 3.0722, "step": 13050 }, { "epoch": 0.13280232747395834, "grad_norm": 12.241186141967773, "learning_rate": 4.788358321279021e-06, "loss": 3.8822, "step": 13055 }, { "epoch": 0.13285319010416666, "grad_norm": 8.0377779006958, "learning_rate": 4.788197270227633e-06, "loss": 3.4792, "step": 13060 }, { "epoch": 0.132904052734375, "grad_norm": 13.722543716430664, "learning_rate": 4.78803616063326e-06, "loss": 3.3762, "step": 13065 }, { "epoch": 0.13295491536458334, "grad_norm": 9.478768348693848, "learning_rate": 4.787874992500024e-06, "loss": 3.1809, "step": 13070 }, { "epoch": 0.13300577799479166, "grad_norm": 9.255359649658203, "learning_rate": 4.7877137658320496e-06, "loss": 3.4478, "step": 13075 }, { "epoch": 0.133056640625, "grad_norm": 14.590103149414062, "learning_rate": 4.7875524806334605e-06, "loss": 3.4376, "step": 13080 }, { "epoch": 0.13310750325520834, "grad_norm": 9.905108451843262, "learning_rate": 4.787391136908383e-06, "loss": 3.4683, "step": 13085 }, { "epoch": 0.13315836588541666, "grad_norm": 14.800880432128906, "learning_rate": 4.787229734660945e-06, "loss": 3.3281, "step": 13090 }, { "epoch": 0.133209228515625, "grad_norm": 15.413259506225586, "learning_rate": 4.787068273895278e-06, "loss": 3.222, "step": 13095 }, { "epoch": 0.13326009114583334, "grad_norm": 14.764119148254395, "learning_rate": 4.7869067546155105e-06, "loss": 3.3957, "step": 13100 }, { "epoch": 0.13331095377604166, "grad_norm": 10.476200103759766, "learning_rate": 4.786745176825775e-06, "loss": 3.2024, "step": 13105 }, { "epoch": 0.13336181640625, "grad_norm": 6.172470569610596, "learning_rate": 4.786583540530206e-06, "loss": 3.0385, "step": 13110 }, { "epoch": 0.13341267903645834, "grad_norm": 16.662702560424805, "learning_rate": 4.78642184573294e-06, "loss": 3.3144, "step": 13115 }, { "epoch": 0.13346354166666666, "grad_norm": 14.16555404663086, "learning_rate": 4.786260092438113e-06, "loss": 3.4183, "step": 13120 }, { "epoch": 0.133514404296875, "grad_norm": 13.170214653015137, "learning_rate": 4.7860982806498635e-06, "loss": 3.2948, "step": 13125 }, { "epoch": 0.13356526692708334, "grad_norm": 14.692107200622559, "learning_rate": 4.78593641037233e-06, "loss": 3.583, "step": 13130 }, { "epoch": 0.13361612955729166, "grad_norm": 14.330788612365723, "learning_rate": 4.785774481609657e-06, "loss": 3.5935, "step": 13135 }, { "epoch": 0.1336669921875, "grad_norm": 10.425037384033203, "learning_rate": 4.785612494365985e-06, "loss": 3.4952, "step": 13140 }, { "epoch": 0.13371785481770834, "grad_norm": 9.754368782043457, "learning_rate": 4.785450448645459e-06, "loss": 3.1403, "step": 13145 }, { "epoch": 0.13376871744791666, "grad_norm": 12.570056915283203, "learning_rate": 4.785288344452226e-06, "loss": 4.0699, "step": 13150 }, { "epoch": 0.133819580078125, "grad_norm": 9.117902755737305, "learning_rate": 4.78512618179043e-06, "loss": 3.8218, "step": 13155 }, { "epoch": 0.13387044270833334, "grad_norm": 13.497501373291016, "learning_rate": 4.784963960664224e-06, "loss": 3.5526, "step": 13160 }, { "epoch": 0.13392130533854166, "grad_norm": 12.687576293945312, "learning_rate": 4.784801681077757e-06, "loss": 3.5295, "step": 13165 }, { "epoch": 0.13397216796875, "grad_norm": 9.852676391601562, "learning_rate": 4.78463934303518e-06, "loss": 3.4353, "step": 13170 }, { "epoch": 0.13402303059895834, "grad_norm": 11.296305656433105, "learning_rate": 4.7844769465406464e-06, "loss": 3.3819, "step": 13175 }, { "epoch": 0.13407389322916666, "grad_norm": 9.368551254272461, "learning_rate": 4.784314491598312e-06, "loss": 3.1461, "step": 13180 }, { "epoch": 0.134124755859375, "grad_norm": 9.357016563415527, "learning_rate": 4.784151978212333e-06, "loss": 3.582, "step": 13185 }, { "epoch": 0.13417561848958334, "grad_norm": 11.293736457824707, "learning_rate": 4.783989406386867e-06, "loss": 3.5203, "step": 13190 }, { "epoch": 0.13422648111979166, "grad_norm": 13.790380477905273, "learning_rate": 4.783826776126073e-06, "loss": 3.3007, "step": 13195 }, { "epoch": 0.13427734375, "grad_norm": 14.679352760314941, "learning_rate": 4.783664087434112e-06, "loss": 3.8487, "step": 13200 }, { "epoch": 0.13432820638020834, "grad_norm": 15.090688705444336, "learning_rate": 4.783501340315147e-06, "loss": 2.9251, "step": 13205 }, { "epoch": 0.13437906901041666, "grad_norm": 15.12507152557373, "learning_rate": 4.783338534773343e-06, "loss": 3.2318, "step": 13210 }, { "epoch": 0.134429931640625, "grad_norm": 10.328474044799805, "learning_rate": 4.783175670812862e-06, "loss": 3.3133, "step": 13215 }, { "epoch": 0.13448079427083334, "grad_norm": 9.897954940795898, "learning_rate": 4.783012748437873e-06, "loss": 3.3901, "step": 13220 }, { "epoch": 0.13453165690104166, "grad_norm": 12.634456634521484, "learning_rate": 4.782849767652544e-06, "loss": 3.4324, "step": 13225 }, { "epoch": 0.13458251953125, "grad_norm": 13.496397972106934, "learning_rate": 4.782686728461044e-06, "loss": 3.4852, "step": 13230 }, { "epoch": 0.13463338216145834, "grad_norm": 9.415319442749023, "learning_rate": 4.782523630867546e-06, "loss": 3.8193, "step": 13235 }, { "epoch": 0.13468424479166666, "grad_norm": 14.183247566223145, "learning_rate": 4.782360474876222e-06, "loss": 3.7154, "step": 13240 }, { "epoch": 0.134735107421875, "grad_norm": 9.120186805725098, "learning_rate": 4.7821972604912464e-06, "loss": 3.0765, "step": 13245 }, { "epoch": 0.13478597005208334, "grad_norm": 15.911162376403809, "learning_rate": 4.782033987716794e-06, "loss": 3.2823, "step": 13250 }, { "epoch": 0.13483683268229166, "grad_norm": 12.479145050048828, "learning_rate": 4.781870656557044e-06, "loss": 3.3999, "step": 13255 }, { "epoch": 0.1348876953125, "grad_norm": 10.130742073059082, "learning_rate": 4.781707267016174e-06, "loss": 3.7453, "step": 13260 }, { "epoch": 0.13493855794270834, "grad_norm": 13.885787963867188, "learning_rate": 4.781543819098363e-06, "loss": 3.348, "step": 13265 }, { "epoch": 0.13498942057291666, "grad_norm": 9.897067070007324, "learning_rate": 4.781380312807795e-06, "loss": 3.6482, "step": 13270 }, { "epoch": 0.135040283203125, "grad_norm": 10.115368843078613, "learning_rate": 4.781216748148653e-06, "loss": 3.5157, "step": 13275 }, { "epoch": 0.13509114583333334, "grad_norm": 11.32646369934082, "learning_rate": 4.78105312512512e-06, "loss": 3.3044, "step": 13280 }, { "epoch": 0.13514200846354166, "grad_norm": 12.162534713745117, "learning_rate": 4.780889443741384e-06, "loss": 3.3909, "step": 13285 }, { "epoch": 0.13519287109375, "grad_norm": 14.136061668395996, "learning_rate": 4.780725704001633e-06, "loss": 3.5301, "step": 13290 }, { "epoch": 0.13524373372395834, "grad_norm": 12.348136901855469, "learning_rate": 4.780561905910055e-06, "loss": 3.979, "step": 13295 }, { "epoch": 0.13529459635416666, "grad_norm": 12.501129150390625, "learning_rate": 4.780398049470841e-06, "loss": 3.6263, "step": 13300 }, { "epoch": 0.135345458984375, "grad_norm": 16.70313835144043, "learning_rate": 4.780234134688184e-06, "loss": 3.584, "step": 13305 }, { "epoch": 0.13539632161458334, "grad_norm": 12.98574447631836, "learning_rate": 4.780070161566276e-06, "loss": 2.9994, "step": 13310 }, { "epoch": 0.13544718424479166, "grad_norm": 9.226192474365234, "learning_rate": 4.7799061301093144e-06, "loss": 3.1362, "step": 13315 }, { "epoch": 0.135498046875, "grad_norm": 7.743569850921631, "learning_rate": 4.779742040321494e-06, "loss": 3.789, "step": 13320 }, { "epoch": 0.13554890950520834, "grad_norm": 10.638326644897461, "learning_rate": 4.779577892207015e-06, "loss": 3.5861, "step": 13325 }, { "epoch": 0.13559977213541666, "grad_norm": 10.221707344055176, "learning_rate": 4.779413685770075e-06, "loss": 3.1357, "step": 13330 }, { "epoch": 0.135650634765625, "grad_norm": 12.546625137329102, "learning_rate": 4.779249421014876e-06, "loss": 3.2787, "step": 13335 }, { "epoch": 0.13570149739583334, "grad_norm": 15.528656959533691, "learning_rate": 4.779085097945621e-06, "loss": 3.4499, "step": 13340 }, { "epoch": 0.13575236002604166, "grad_norm": 15.490991592407227, "learning_rate": 4.778920716566514e-06, "loss": 3.7281, "step": 13345 }, { "epoch": 0.13580322265625, "grad_norm": 12.92549991607666, "learning_rate": 4.7787562768817605e-06, "loss": 3.3561, "step": 13350 }, { "epoch": 0.13585408528645834, "grad_norm": 14.30997371673584, "learning_rate": 4.778591778895568e-06, "loss": 3.4102, "step": 13355 }, { "epoch": 0.13590494791666666, "grad_norm": 9.108956336975098, "learning_rate": 4.778427222612145e-06, "loss": 3.2839, "step": 13360 }, { "epoch": 0.135955810546875, "grad_norm": 15.586246490478516, "learning_rate": 4.778262608035702e-06, "loss": 3.6318, "step": 13365 }, { "epoch": 0.13600667317708334, "grad_norm": 12.77320671081543, "learning_rate": 4.778097935170449e-06, "loss": 3.4799, "step": 13370 }, { "epoch": 0.13605753580729166, "grad_norm": 8.657517433166504, "learning_rate": 4.777933204020602e-06, "loss": 3.2848, "step": 13375 }, { "epoch": 0.1361083984375, "grad_norm": 13.57066822052002, "learning_rate": 4.777768414590372e-06, "loss": 3.0883, "step": 13380 }, { "epoch": 0.13615926106770834, "grad_norm": 15.412737846374512, "learning_rate": 4.777603566883978e-06, "loss": 3.3734, "step": 13385 }, { "epoch": 0.13621012369791666, "grad_norm": 7.619217395782471, "learning_rate": 4.777438660905637e-06, "loss": 3.2357, "step": 13390 }, { "epoch": 0.136260986328125, "grad_norm": 8.894469261169434, "learning_rate": 4.777273696659567e-06, "loss": 3.3946, "step": 13395 }, { "epoch": 0.13631184895833334, "grad_norm": 11.534381866455078, "learning_rate": 4.7771086741499895e-06, "loss": 3.3128, "step": 13400 }, { "epoch": 0.13636271158854166, "grad_norm": 9.838509559631348, "learning_rate": 4.776943593381126e-06, "loss": 3.5254, "step": 13405 }, { "epoch": 0.13641357421875, "grad_norm": 12.967144012451172, "learning_rate": 4.7767784543572e-06, "loss": 3.1434, "step": 13410 }, { "epoch": 0.13646443684895834, "grad_norm": 9.366642951965332, "learning_rate": 4.776613257082439e-06, "loss": 2.9998, "step": 13415 }, { "epoch": 0.13651529947916666, "grad_norm": 10.693907737731934, "learning_rate": 4.776448001561065e-06, "loss": 3.9294, "step": 13420 }, { "epoch": 0.136566162109375, "grad_norm": 14.118185997009277, "learning_rate": 4.7762826877973095e-06, "loss": 3.4184, "step": 13425 }, { "epoch": 0.13661702473958334, "grad_norm": 16.79452133178711, "learning_rate": 4.776117315795401e-06, "loss": 3.4651, "step": 13430 }, { "epoch": 0.13666788736979166, "grad_norm": 7.560031890869141, "learning_rate": 4.77595188555957e-06, "loss": 3.3598, "step": 13435 }, { "epoch": 0.13671875, "grad_norm": 8.069879531860352, "learning_rate": 4.77578639709405e-06, "loss": 3.3903, "step": 13440 }, { "epoch": 0.13676961263020834, "grad_norm": 14.968077659606934, "learning_rate": 4.775620850403075e-06, "loss": 3.1777, "step": 13445 }, { "epoch": 0.13682047526041666, "grad_norm": 11.37788200378418, "learning_rate": 4.775455245490879e-06, "loss": 3.1365, "step": 13450 }, { "epoch": 0.136871337890625, "grad_norm": 11.77219295501709, "learning_rate": 4.7752895823616995e-06, "loss": 3.397, "step": 13455 }, { "epoch": 0.13692220052083334, "grad_norm": 11.196371078491211, "learning_rate": 4.775123861019776e-06, "loss": 3.146, "step": 13460 }, { "epoch": 0.13697306315104166, "grad_norm": 19.208072662353516, "learning_rate": 4.774958081469348e-06, "loss": 3.2435, "step": 13465 }, { "epoch": 0.13702392578125, "grad_norm": 13.371184349060059, "learning_rate": 4.774792243714656e-06, "loss": 3.4511, "step": 13470 }, { "epoch": 0.13707478841145834, "grad_norm": 7.495774745941162, "learning_rate": 4.774626347759944e-06, "loss": 3.174, "step": 13475 }, { "epoch": 0.13712565104166666, "grad_norm": 9.855815887451172, "learning_rate": 4.774460393609456e-06, "loss": 3.4519, "step": 13480 }, { "epoch": 0.137176513671875, "grad_norm": 15.752311706542969, "learning_rate": 4.774294381267438e-06, "loss": 3.3032, "step": 13485 }, { "epoch": 0.13722737630208334, "grad_norm": 10.933402061462402, "learning_rate": 4.774128310738137e-06, "loss": 3.5641, "step": 13490 }, { "epoch": 0.13727823893229166, "grad_norm": 10.031787872314453, "learning_rate": 4.773962182025803e-06, "loss": 3.4982, "step": 13495 }, { "epoch": 0.1373291015625, "grad_norm": 10.56356430053711, "learning_rate": 4.773795995134685e-06, "loss": 3.2439, "step": 13500 }, { "epoch": 0.13737996419270834, "grad_norm": 7.9167585372924805, "learning_rate": 4.773629750069036e-06, "loss": 2.9149, "step": 13505 }, { "epoch": 0.13743082682291666, "grad_norm": 11.75800609588623, "learning_rate": 4.773463446833108e-06, "loss": 3.3936, "step": 13510 }, { "epoch": 0.137481689453125, "grad_norm": 10.611553192138672, "learning_rate": 4.773297085431156e-06, "loss": 3.3874, "step": 13515 }, { "epoch": 0.13753255208333334, "grad_norm": 7.607089996337891, "learning_rate": 4.773130665867438e-06, "loss": 3.1678, "step": 13520 }, { "epoch": 0.13758341471354166, "grad_norm": 13.853635787963867, "learning_rate": 4.7729641881462106e-06, "loss": 3.5818, "step": 13525 }, { "epoch": 0.13763427734375, "grad_norm": 14.964798927307129, "learning_rate": 4.772797652271732e-06, "loss": 3.722, "step": 13530 }, { "epoch": 0.13768513997395834, "grad_norm": 13.768731117248535, "learning_rate": 4.772631058248266e-06, "loss": 4.1596, "step": 13535 }, { "epoch": 0.13773600260416666, "grad_norm": 14.737395286560059, "learning_rate": 4.772464406080072e-06, "loss": 3.3029, "step": 13540 }, { "epoch": 0.137786865234375, "grad_norm": 13.914802551269531, "learning_rate": 4.772297695771415e-06, "loss": 3.4859, "step": 13545 }, { "epoch": 0.13783772786458334, "grad_norm": 11.838445663452148, "learning_rate": 4.7721309273265605e-06, "loss": 3.3564, "step": 13550 }, { "epoch": 0.13788859049479166, "grad_norm": 12.323065757751465, "learning_rate": 4.771964100749774e-06, "loss": 3.3043, "step": 13555 }, { "epoch": 0.137939453125, "grad_norm": 13.693683624267578, "learning_rate": 4.771797216045325e-06, "loss": 3.4427, "step": 13560 }, { "epoch": 0.13799031575520834, "grad_norm": 14.236191749572754, "learning_rate": 4.771630273217483e-06, "loss": 3.3801, "step": 13565 }, { "epoch": 0.13804117838541666, "grad_norm": 12.197969436645508, "learning_rate": 4.7714632722705175e-06, "loss": 3.6923, "step": 13570 }, { "epoch": 0.138092041015625, "grad_norm": 12.962698936462402, "learning_rate": 4.771296213208704e-06, "loss": 3.3643, "step": 13575 }, { "epoch": 0.13814290364583334, "grad_norm": 15.476211547851562, "learning_rate": 4.7711290960363145e-06, "loss": 3.3381, "step": 13580 }, { "epoch": 0.13819376627604166, "grad_norm": 12.055511474609375, "learning_rate": 4.770961920757626e-06, "loss": 3.1077, "step": 13585 }, { "epoch": 0.13824462890625, "grad_norm": 11.437824249267578, "learning_rate": 4.7707946873769144e-06, "loss": 3.4872, "step": 13590 }, { "epoch": 0.13829549153645834, "grad_norm": 14.684785842895508, "learning_rate": 4.77062739589846e-06, "loss": 3.1012, "step": 13595 }, { "epoch": 0.13834635416666666, "grad_norm": 11.902538299560547, "learning_rate": 4.77046004632654e-06, "loss": 3.3424, "step": 13600 }, { "epoch": 0.138397216796875, "grad_norm": 13.112920761108398, "learning_rate": 4.770292638665439e-06, "loss": 3.6353, "step": 13605 }, { "epoch": 0.13844807942708334, "grad_norm": 12.018083572387695, "learning_rate": 4.7701251729194396e-06, "loss": 3.7497, "step": 13610 }, { "epoch": 0.13849894205729166, "grad_norm": 12.88236141204834, "learning_rate": 4.769957649092825e-06, "loss": 3.9322, "step": 13615 }, { "epoch": 0.1385498046875, "grad_norm": 16.120954513549805, "learning_rate": 4.769790067189882e-06, "loss": 3.4474, "step": 13620 }, { "epoch": 0.13860066731770834, "grad_norm": 9.990036010742188, "learning_rate": 4.769622427214898e-06, "loss": 3.1569, "step": 13625 }, { "epoch": 0.13865152994791666, "grad_norm": 9.095274925231934, "learning_rate": 4.769454729172163e-06, "loss": 3.3547, "step": 13630 }, { "epoch": 0.138702392578125, "grad_norm": 12.9636812210083, "learning_rate": 4.7692869730659655e-06, "loss": 3.3197, "step": 13635 }, { "epoch": 0.13875325520833334, "grad_norm": 14.65504264831543, "learning_rate": 4.769119158900599e-06, "loss": 3.4425, "step": 13640 }, { "epoch": 0.13880411783854166, "grad_norm": 9.957447052001953, "learning_rate": 4.768951286680357e-06, "loss": 3.4952, "step": 13645 }, { "epoch": 0.13885498046875, "grad_norm": 12.655380249023438, "learning_rate": 4.768783356409535e-06, "loss": 3.4519, "step": 13650 }, { "epoch": 0.13890584309895834, "grad_norm": 14.230171203613281, "learning_rate": 4.768615368092427e-06, "loss": 3.2948, "step": 13655 }, { "epoch": 0.13895670572916666, "grad_norm": 15.258803367614746, "learning_rate": 4.768447321733332e-06, "loss": 3.5278, "step": 13660 }, { "epoch": 0.139007568359375, "grad_norm": 8.109949111938477, "learning_rate": 4.7682792173365525e-06, "loss": 3.2576, "step": 13665 }, { "epoch": 0.13905843098958334, "grad_norm": 12.126956939697266, "learning_rate": 4.768111054906384e-06, "loss": 3.5308, "step": 13670 }, { "epoch": 0.13910929361979166, "grad_norm": 12.557209014892578, "learning_rate": 4.767942834447134e-06, "loss": 3.405, "step": 13675 }, { "epoch": 0.13916015625, "grad_norm": 8.383021354675293, "learning_rate": 4.767774555963103e-06, "loss": 3.3727, "step": 13680 }, { "epoch": 0.13921101888020834, "grad_norm": 9.561806678771973, "learning_rate": 4.767606219458598e-06, "loss": 3.7015, "step": 13685 }, { "epoch": 0.13926188151041666, "grad_norm": 11.462172508239746, "learning_rate": 4.767437824937926e-06, "loss": 3.6486, "step": 13690 }, { "epoch": 0.139312744140625, "grad_norm": 16.29766273498535, "learning_rate": 4.767269372405393e-06, "loss": 3.0515, "step": 13695 }, { "epoch": 0.13936360677083334, "grad_norm": 8.491412162780762, "learning_rate": 4.767100861865311e-06, "loss": 3.3933, "step": 13700 }, { "epoch": 0.13941446940104166, "grad_norm": 8.013467788696289, "learning_rate": 4.766932293321992e-06, "loss": 3.3615, "step": 13705 }, { "epoch": 0.13946533203125, "grad_norm": 8.9276704788208, "learning_rate": 4.766763666779747e-06, "loss": 3.4722, "step": 13710 }, { "epoch": 0.13951619466145834, "grad_norm": 8.328059196472168, "learning_rate": 4.76659498224289e-06, "loss": 3.6743, "step": 13715 }, { "epoch": 0.13956705729166666, "grad_norm": 10.622822761535645, "learning_rate": 4.766426239715739e-06, "loss": 3.3682, "step": 13720 }, { "epoch": 0.139617919921875, "grad_norm": 13.057267189025879, "learning_rate": 4.766257439202609e-06, "loss": 3.41, "step": 13725 }, { "epoch": 0.13966878255208334, "grad_norm": 13.841679573059082, "learning_rate": 4.766088580707819e-06, "loss": 3.2495, "step": 13730 }, { "epoch": 0.13971964518229166, "grad_norm": 16.03135108947754, "learning_rate": 4.765919664235691e-06, "loss": 3.3903, "step": 13735 }, { "epoch": 0.1397705078125, "grad_norm": 8.109480857849121, "learning_rate": 4.765750689790545e-06, "loss": 3.2099, "step": 13740 }, { "epoch": 0.13982137044270834, "grad_norm": 15.432792663574219, "learning_rate": 4.765581657376705e-06, "loss": 3.2156, "step": 13745 }, { "epoch": 0.13987223307291666, "grad_norm": 13.207000732421875, "learning_rate": 4.7654125669984945e-06, "loss": 3.3425, "step": 13750 }, { "epoch": 0.139923095703125, "grad_norm": 10.853780746459961, "learning_rate": 4.765243418660241e-06, "loss": 3.5909, "step": 13755 }, { "epoch": 0.13997395833333334, "grad_norm": 11.465790748596191, "learning_rate": 4.765074212366271e-06, "loss": 3.7235, "step": 13760 }, { "epoch": 0.14002482096354166, "grad_norm": 14.370993614196777, "learning_rate": 4.764904948120915e-06, "loss": 3.6259, "step": 13765 }, { "epoch": 0.14007568359375, "grad_norm": 11.535296440124512, "learning_rate": 4.7647356259285025e-06, "loss": 3.5197, "step": 13770 }, { "epoch": 0.14012654622395834, "grad_norm": 11.556644439697266, "learning_rate": 4.764566245793365e-06, "loss": 3.5663, "step": 13775 }, { "epoch": 0.14017740885416666, "grad_norm": 14.494634628295898, "learning_rate": 4.764396807719838e-06, "loss": 3.4005, "step": 13780 }, { "epoch": 0.140228271484375, "grad_norm": 14.56704330444336, "learning_rate": 4.764227311712255e-06, "loss": 3.335, "step": 13785 }, { "epoch": 0.14027913411458334, "grad_norm": 10.961151123046875, "learning_rate": 4.764057757774953e-06, "loss": 3.7573, "step": 13790 }, { "epoch": 0.14032999674479166, "grad_norm": 8.999013900756836, "learning_rate": 4.76388814591227e-06, "loss": 3.4393, "step": 13795 }, { "epoch": 0.140380859375, "grad_norm": 85.62931823730469, "learning_rate": 4.763718476128545e-06, "loss": 3.9256, "step": 13800 }, { "epoch": 0.14043172200520834, "grad_norm": 11.35169792175293, "learning_rate": 4.763548748428119e-06, "loss": 3.6844, "step": 13805 }, { "epoch": 0.14048258463541666, "grad_norm": 10.133095741271973, "learning_rate": 4.763378962815335e-06, "loss": 3.3458, "step": 13810 }, { "epoch": 0.140533447265625, "grad_norm": 14.392997741699219, "learning_rate": 4.763209119294537e-06, "loss": 3.4801, "step": 13815 }, { "epoch": 0.14058430989583334, "grad_norm": 13.355161666870117, "learning_rate": 4.76303921787007e-06, "loss": 3.98, "step": 13820 }, { "epoch": 0.14063517252604166, "grad_norm": 15.28199291229248, "learning_rate": 4.762869258546281e-06, "loss": 3.565, "step": 13825 }, { "epoch": 0.14068603515625, "grad_norm": 15.826837539672852, "learning_rate": 4.762699241327518e-06, "loss": 3.9634, "step": 13830 }, { "epoch": 0.14073689778645834, "grad_norm": 19.135751724243164, "learning_rate": 4.762529166218133e-06, "loss": 3.298, "step": 13835 }, { "epoch": 0.14078776041666666, "grad_norm": 14.346761703491211, "learning_rate": 4.7623590332224735e-06, "loss": 3.3069, "step": 13840 }, { "epoch": 0.140838623046875, "grad_norm": 18.030282974243164, "learning_rate": 4.762188842344896e-06, "loss": 3.2781, "step": 13845 }, { "epoch": 0.14088948567708334, "grad_norm": 11.988079071044922, "learning_rate": 4.762018593589752e-06, "loss": 3.7506, "step": 13850 }, { "epoch": 0.14094034830729166, "grad_norm": 12.060202598571777, "learning_rate": 4.761848286961398e-06, "loss": 3.3371, "step": 13855 }, { "epoch": 0.1409912109375, "grad_norm": 10.818347930908203, "learning_rate": 4.7616779224641925e-06, "loss": 3.528, "step": 13860 }, { "epoch": 0.14104207356770834, "grad_norm": 14.578571319580078, "learning_rate": 4.761507500102493e-06, "loss": 3.3091, "step": 13865 }, { "epoch": 0.14109293619791666, "grad_norm": 14.782280921936035, "learning_rate": 4.761337019880661e-06, "loss": 3.382, "step": 13870 }, { "epoch": 0.141143798828125, "grad_norm": 16.304611206054688, "learning_rate": 4.761166481803057e-06, "loss": 3.3789, "step": 13875 }, { "epoch": 0.14119466145833334, "grad_norm": 17.154775619506836, "learning_rate": 4.760995885874045e-06, "loss": 4.1372, "step": 13880 }, { "epoch": 0.14124552408854166, "grad_norm": 7.446081161499023, "learning_rate": 4.760825232097988e-06, "loss": 3.44, "step": 13885 }, { "epoch": 0.14129638671875, "grad_norm": 15.864761352539062, "learning_rate": 4.760654520479254e-06, "loss": 3.7214, "step": 13890 }, { "epoch": 0.14134724934895834, "grad_norm": 10.105338096618652, "learning_rate": 4.76048375102221e-06, "loss": 3.0877, "step": 13895 }, { "epoch": 0.14139811197916666, "grad_norm": 11.060812950134277, "learning_rate": 4.760312923731224e-06, "loss": 3.406, "step": 13900 }, { "epoch": 0.141448974609375, "grad_norm": 17.411237716674805, "learning_rate": 4.760142038610669e-06, "loss": 3.4056, "step": 13905 }, { "epoch": 0.14149983723958334, "grad_norm": 12.01961898803711, "learning_rate": 4.759971095664915e-06, "loss": 3.6575, "step": 13910 }, { "epoch": 0.14155069986979166, "grad_norm": 10.522672653198242, "learning_rate": 4.7598000948983355e-06, "loss": 3.462, "step": 13915 }, { "epoch": 0.1416015625, "grad_norm": 14.614590644836426, "learning_rate": 4.759629036315307e-06, "loss": 3.5353, "step": 13920 }, { "epoch": 0.14165242513020834, "grad_norm": 13.122455596923828, "learning_rate": 4.759457919920206e-06, "loss": 3.2238, "step": 13925 }, { "epoch": 0.14170328776041666, "grad_norm": 11.244791984558105, "learning_rate": 4.759286745717409e-06, "loss": 3.8902, "step": 13930 }, { "epoch": 0.141754150390625, "grad_norm": 13.832063674926758, "learning_rate": 4.759115513711296e-06, "loss": 3.2644, "step": 13935 }, { "epoch": 0.14180501302083334, "grad_norm": 18.43947982788086, "learning_rate": 4.758944223906248e-06, "loss": 3.3131, "step": 13940 }, { "epoch": 0.14185587565104166, "grad_norm": 15.581222534179688, "learning_rate": 4.758772876306647e-06, "loss": 3.2354, "step": 13945 }, { "epoch": 0.14190673828125, "grad_norm": 7.942548751831055, "learning_rate": 4.758601470916878e-06, "loss": 3.2157, "step": 13950 }, { "epoch": 0.14195760091145834, "grad_norm": 15.951170921325684, "learning_rate": 4.758430007741325e-06, "loss": 3.3562, "step": 13955 }, { "epoch": 0.14200846354166666, "grad_norm": 14.815132141113281, "learning_rate": 4.7582584867843764e-06, "loss": 2.939, "step": 13960 }, { "epoch": 0.142059326171875, "grad_norm": 12.236970901489258, "learning_rate": 4.7580869080504185e-06, "loss": 3.2991, "step": 13965 }, { "epoch": 0.14211018880208334, "grad_norm": 12.710593223571777, "learning_rate": 4.757915271543844e-06, "loss": 3.2584, "step": 13970 }, { "epoch": 0.14216105143229166, "grad_norm": 12.963603019714355, "learning_rate": 4.757743577269042e-06, "loss": 3.1463, "step": 13975 }, { "epoch": 0.1422119140625, "grad_norm": 10.430787086486816, "learning_rate": 4.7575718252304046e-06, "loss": 4.2165, "step": 13980 }, { "epoch": 0.14226277669270834, "grad_norm": 10.955326080322266, "learning_rate": 4.7574000154323274e-06, "loss": 3.195, "step": 13985 }, { "epoch": 0.14231363932291666, "grad_norm": 8.364349365234375, "learning_rate": 4.757228147879207e-06, "loss": 3.1683, "step": 13990 }, { "epoch": 0.142364501953125, "grad_norm": 14.662646293640137, "learning_rate": 4.757056222575438e-06, "loss": 3.2215, "step": 13995 }, { "epoch": 0.14241536458333334, "grad_norm": 14.448378562927246, "learning_rate": 4.756884239525422e-06, "loss": 3.3889, "step": 14000 }, { "epoch": 0.14246622721354166, "grad_norm": 12.963875770568848, "learning_rate": 4.756712198733557e-06, "loss": 3.2769, "step": 14005 }, { "epoch": 0.14251708984375, "grad_norm": 11.409318923950195, "learning_rate": 4.756540100204245e-06, "loss": 3.3303, "step": 14010 }, { "epoch": 0.14256795247395834, "grad_norm": 9.132806777954102, "learning_rate": 4.75636794394189e-06, "loss": 3.2503, "step": 14015 }, { "epoch": 0.14261881510416666, "grad_norm": 12.652349472045898, "learning_rate": 4.756195729950896e-06, "loss": 3.2854, "step": 14020 }, { "epoch": 0.142669677734375, "grad_norm": 11.568374633789062, "learning_rate": 4.756023458235668e-06, "loss": 3.2795, "step": 14025 }, { "epoch": 0.14272054036458334, "grad_norm": 14.18939208984375, "learning_rate": 4.755851128800616e-06, "loss": 3.2294, "step": 14030 }, { "epoch": 0.14277140299479166, "grad_norm": 12.417285919189453, "learning_rate": 4.755678741650146e-06, "loss": 3.3423, "step": 14035 }, { "epoch": 0.142822265625, "grad_norm": 12.653460502624512, "learning_rate": 4.755506296788671e-06, "loss": 3.4815, "step": 14040 }, { "epoch": 0.14287312825520834, "grad_norm": 10.841574668884277, "learning_rate": 4.7553337942206025e-06, "loss": 3.3619, "step": 14045 }, { "epoch": 0.14292399088541666, "grad_norm": 9.72189712524414, "learning_rate": 4.7551612339503524e-06, "loss": 3.0766, "step": 14050 }, { "epoch": 0.142974853515625, "grad_norm": 10.824090003967285, "learning_rate": 4.754988615982336e-06, "loss": 3.2932, "step": 14055 }, { "epoch": 0.14302571614583334, "grad_norm": 14.650367736816406, "learning_rate": 4.7548159403209725e-06, "loss": 3.3557, "step": 14060 }, { "epoch": 0.14307657877604166, "grad_norm": 13.419211387634277, "learning_rate": 4.7546432069706765e-06, "loss": 3.135, "step": 14065 }, { "epoch": 0.14312744140625, "grad_norm": 10.86098861694336, "learning_rate": 4.754470415935868e-06, "loss": 3.7383, "step": 14070 }, { "epoch": 0.14317830403645834, "grad_norm": 16.1043758392334, "learning_rate": 4.754297567220969e-06, "loss": 3.5326, "step": 14075 }, { "epoch": 0.14322916666666666, "grad_norm": 11.805451393127441, "learning_rate": 4.754124660830401e-06, "loss": 3.2176, "step": 14080 }, { "epoch": 0.143280029296875, "grad_norm": 10.292030334472656, "learning_rate": 4.753951696768587e-06, "loss": 3.5026, "step": 14085 }, { "epoch": 0.14333089192708334, "grad_norm": 9.801261901855469, "learning_rate": 4.753778675039954e-06, "loss": 3.5177, "step": 14090 }, { "epoch": 0.14338175455729166, "grad_norm": 14.867837905883789, "learning_rate": 4.753605595648928e-06, "loss": 3.1356, "step": 14095 }, { "epoch": 0.1434326171875, "grad_norm": 19.2530460357666, "learning_rate": 4.753432458599936e-06, "loss": 3.4818, "step": 14100 }, { "epoch": 0.14348347981770834, "grad_norm": 11.164162635803223, "learning_rate": 4.753259263897409e-06, "loss": 3.65, "step": 14105 }, { "epoch": 0.14353434244791666, "grad_norm": 7.864550590515137, "learning_rate": 4.7530860115457785e-06, "loss": 3.1865, "step": 14110 }, { "epoch": 0.143585205078125, "grad_norm": 7.927250862121582, "learning_rate": 4.7529127015494754e-06, "loss": 3.2852, "step": 14115 }, { "epoch": 0.14363606770833334, "grad_norm": 15.011672973632812, "learning_rate": 4.752739333912936e-06, "loss": 3.2359, "step": 14120 }, { "epoch": 0.14368693033854166, "grad_norm": 10.188263893127441, "learning_rate": 4.752565908640594e-06, "loss": 3.4599, "step": 14125 }, { "epoch": 0.14373779296875, "grad_norm": 13.60891056060791, "learning_rate": 4.752392425736888e-06, "loss": 3.5074, "step": 14130 }, { "epoch": 0.14378865559895834, "grad_norm": 7.990344524383545, "learning_rate": 4.752218885206255e-06, "loss": 3.1402, "step": 14135 }, { "epoch": 0.14383951822916666, "grad_norm": 13.563754081726074, "learning_rate": 4.752045287053135e-06, "loss": 3.6083, "step": 14140 }, { "epoch": 0.143890380859375, "grad_norm": 7.188718795776367, "learning_rate": 4.751871631281971e-06, "loss": 3.6155, "step": 14145 }, { "epoch": 0.14394124348958334, "grad_norm": 11.118795394897461, "learning_rate": 4.751697917897204e-06, "loss": 3.2539, "step": 14150 }, { "epoch": 0.14399210611979166, "grad_norm": 11.786707878112793, "learning_rate": 4.7515241469032805e-06, "loss": 3.4012, "step": 14155 }, { "epoch": 0.14404296875, "grad_norm": 15.172300338745117, "learning_rate": 4.751350318304645e-06, "loss": 3.4998, "step": 14160 }, { "epoch": 0.14409383138020834, "grad_norm": 14.242692947387695, "learning_rate": 4.751176432105746e-06, "loss": 3.374, "step": 14165 }, { "epoch": 0.14414469401041666, "grad_norm": 12.410831451416016, "learning_rate": 4.751002488311031e-06, "loss": 3.1663, "step": 14170 }, { "epoch": 0.144195556640625, "grad_norm": 16.139020919799805, "learning_rate": 4.75082848692495e-06, "loss": 3.6473, "step": 14175 }, { "epoch": 0.14424641927083334, "grad_norm": 9.563576698303223, "learning_rate": 4.750654427951957e-06, "loss": 3.133, "step": 14180 }, { "epoch": 0.14429728190104166, "grad_norm": 9.554606437683105, "learning_rate": 4.750480311396503e-06, "loss": 3.2635, "step": 14185 }, { "epoch": 0.14434814453125, "grad_norm": 16.014537811279297, "learning_rate": 4.750306137263044e-06, "loss": 3.4213, "step": 14190 }, { "epoch": 0.14439900716145834, "grad_norm": 15.562359809875488, "learning_rate": 4.750131905556036e-06, "loss": 3.2832, "step": 14195 }, { "epoch": 0.14444986979166666, "grad_norm": 15.114102363586426, "learning_rate": 4.749957616279937e-06, "loss": 3.3687, "step": 14200 }, { "epoch": 0.144500732421875, "grad_norm": 15.910360336303711, "learning_rate": 4.749783269439205e-06, "loss": 3.1607, "step": 14205 }, { "epoch": 0.14455159505208334, "grad_norm": 8.769811630249023, "learning_rate": 4.749608865038301e-06, "loss": 3.3552, "step": 14210 }, { "epoch": 0.14460245768229166, "grad_norm": 9.91917896270752, "learning_rate": 4.749434403081688e-06, "loss": 3.2902, "step": 14215 }, { "epoch": 0.1446533203125, "grad_norm": 9.88044261932373, "learning_rate": 4.749259883573829e-06, "loss": 3.4576, "step": 14220 }, { "epoch": 0.14470418294270834, "grad_norm": 15.199368476867676, "learning_rate": 4.749085306519189e-06, "loss": 3.2871, "step": 14225 }, { "epoch": 0.14475504557291666, "grad_norm": 15.931928634643555, "learning_rate": 4.748910671922234e-06, "loss": 3.5535, "step": 14230 }, { "epoch": 0.144805908203125, "grad_norm": 11.521714210510254, "learning_rate": 4.748735979787433e-06, "loss": 3.3337, "step": 14235 }, { "epoch": 0.14485677083333334, "grad_norm": 9.132640838623047, "learning_rate": 4.7485612301192545e-06, "loss": 3.599, "step": 14240 }, { "epoch": 0.14490763346354166, "grad_norm": 14.95056438446045, "learning_rate": 4.74838642292217e-06, "loss": 3.2524, "step": 14245 }, { "epoch": 0.14495849609375, "grad_norm": 13.88601303100586, "learning_rate": 4.748211558200653e-06, "loss": 3.3675, "step": 14250 }, { "epoch": 0.14500935872395834, "grad_norm": 11.538806915283203, "learning_rate": 4.748036635959174e-06, "loss": 3.4054, "step": 14255 }, { "epoch": 0.14506022135416666, "grad_norm": 12.001321792602539, "learning_rate": 4.747861656202212e-06, "loss": 3.481, "step": 14260 }, { "epoch": 0.145111083984375, "grad_norm": 13.24527359008789, "learning_rate": 4.747686618934242e-06, "loss": 2.8579, "step": 14265 }, { "epoch": 0.14516194661458334, "grad_norm": 13.960221290588379, "learning_rate": 4.747511524159743e-06, "loss": 3.6163, "step": 14270 }, { "epoch": 0.14521280924479166, "grad_norm": 14.137311935424805, "learning_rate": 4.747336371883194e-06, "loss": 3.2439, "step": 14275 }, { "epoch": 0.145263671875, "grad_norm": 13.414327621459961, "learning_rate": 4.747161162109076e-06, "loss": 3.6854, "step": 14280 }, { "epoch": 0.14531453450520834, "grad_norm": 7.808047294616699, "learning_rate": 4.746985894841873e-06, "loss": 3.4737, "step": 14285 }, { "epoch": 0.14536539713541666, "grad_norm": 7.56891393661499, "learning_rate": 4.746810570086069e-06, "loss": 3.2434, "step": 14290 }, { "epoch": 0.145416259765625, "grad_norm": 15.081562042236328, "learning_rate": 4.746635187846148e-06, "loss": 3.3065, "step": 14295 }, { "epoch": 0.14546712239583334, "grad_norm": 12.434776306152344, "learning_rate": 4.746459748126599e-06, "loss": 3.2885, "step": 14300 }, { "epoch": 0.14551798502604166, "grad_norm": 14.816580772399902, "learning_rate": 4.74628425093191e-06, "loss": 3.3603, "step": 14305 }, { "epoch": 0.14556884765625, "grad_norm": 10.694483757019043, "learning_rate": 4.74610869626657e-06, "loss": 3.4447, "step": 14310 }, { "epoch": 0.14561971028645834, "grad_norm": 11.717911720275879, "learning_rate": 4.745933084135071e-06, "loss": 3.7955, "step": 14315 }, { "epoch": 0.14567057291666666, "grad_norm": 12.842635154724121, "learning_rate": 4.745757414541908e-06, "loss": 3.63, "step": 14320 }, { "epoch": 0.145721435546875, "grad_norm": 13.658503532409668, "learning_rate": 4.745581687491573e-06, "loss": 3.5957, "step": 14325 }, { "epoch": 0.14577229817708334, "grad_norm": 8.604068756103516, "learning_rate": 4.745405902988563e-06, "loss": 3.3326, "step": 14330 }, { "epoch": 0.14582316080729166, "grad_norm": 10.351202964782715, "learning_rate": 4.745230061037375e-06, "loss": 3.2391, "step": 14335 }, { "epoch": 0.1458740234375, "grad_norm": 9.471860885620117, "learning_rate": 4.745054161642508e-06, "loss": 3.3626, "step": 14340 }, { "epoch": 0.14592488606770834, "grad_norm": 10.003438949584961, "learning_rate": 4.744878204808463e-06, "loss": 3.5786, "step": 14345 }, { "epoch": 0.14597574869791666, "grad_norm": 10.829023361206055, "learning_rate": 4.744702190539741e-06, "loss": 3.7278, "step": 14350 }, { "epoch": 0.146026611328125, "grad_norm": 9.213700294494629, "learning_rate": 4.744526118840844e-06, "loss": 2.9384, "step": 14355 }, { "epoch": 0.14607747395833334, "grad_norm": 12.245752334594727, "learning_rate": 4.7443499897162794e-06, "loss": 3.4541, "step": 14360 }, { "epoch": 0.14612833658854166, "grad_norm": 9.579618453979492, "learning_rate": 4.744173803170553e-06, "loss": 3.2422, "step": 14365 }, { "epoch": 0.14617919921875, "grad_norm": 7.907454967498779, "learning_rate": 4.743997559208171e-06, "loss": 3.2129, "step": 14370 }, { "epoch": 0.14623006184895834, "grad_norm": 15.023747444152832, "learning_rate": 4.743821257833644e-06, "loss": 3.3496, "step": 14375 }, { "epoch": 0.14628092447916666, "grad_norm": 8.13438892364502, "learning_rate": 4.743644899051481e-06, "loss": 3.4805, "step": 14380 }, { "epoch": 0.146331787109375, "grad_norm": 10.506806373596191, "learning_rate": 4.743468482866196e-06, "loss": 3.653, "step": 14385 }, { "epoch": 0.14638264973958334, "grad_norm": 13.162545204162598, "learning_rate": 4.743292009282301e-06, "loss": 3.4791, "step": 14390 }, { "epoch": 0.14643351236979166, "grad_norm": 14.953128814697266, "learning_rate": 4.743115478304312e-06, "loss": 3.3943, "step": 14395 }, { "epoch": 0.146484375, "grad_norm": 8.338605880737305, "learning_rate": 4.742938889936745e-06, "loss": 3.484, "step": 14400 }, { "epoch": 0.14653523763020834, "grad_norm": 11.624462127685547, "learning_rate": 4.742762244184117e-06, "loss": 3.1865, "step": 14405 }, { "epoch": 0.14658610026041666, "grad_norm": 9.61160945892334, "learning_rate": 4.74258554105095e-06, "loss": 3.518, "step": 14410 }, { "epoch": 0.146636962890625, "grad_norm": 10.283778190612793, "learning_rate": 4.742408780541763e-06, "loss": 3.4033, "step": 14415 }, { "epoch": 0.14668782552083334, "grad_norm": 14.608953475952148, "learning_rate": 4.742231962661079e-06, "loss": 3.0345, "step": 14420 }, { "epoch": 0.14673868815104166, "grad_norm": 13.46399974822998, "learning_rate": 4.742055087413422e-06, "loss": 3.3489, "step": 14425 }, { "epoch": 0.14678955078125, "grad_norm": 9.469012260437012, "learning_rate": 4.741878154803316e-06, "loss": 2.9819, "step": 14430 }, { "epoch": 0.14684041341145834, "grad_norm": 10.287275314331055, "learning_rate": 4.741701164835291e-06, "loss": 3.191, "step": 14435 }, { "epoch": 0.14689127604166666, "grad_norm": 12.825785636901855, "learning_rate": 4.741524117513871e-06, "loss": 2.9444, "step": 14440 }, { "epoch": 0.146942138671875, "grad_norm": 11.87830924987793, "learning_rate": 4.741347012843588e-06, "loss": 3.3462, "step": 14445 }, { "epoch": 0.14699300130208334, "grad_norm": 9.253486633300781, "learning_rate": 4.7411698508289735e-06, "loss": 3.7622, "step": 14450 }, { "epoch": 0.14704386393229166, "grad_norm": 11.679107666015625, "learning_rate": 4.740992631474559e-06, "loss": 3.2838, "step": 14455 }, { "epoch": 0.1470947265625, "grad_norm": 13.79211711883545, "learning_rate": 4.740815354784879e-06, "loss": 3.2918, "step": 14460 }, { "epoch": 0.14714558919270834, "grad_norm": 14.639359474182129, "learning_rate": 4.74063802076447e-06, "loss": 3.4211, "step": 14465 }, { "epoch": 0.14719645182291666, "grad_norm": 12.899956703186035, "learning_rate": 4.7404606294178684e-06, "loss": 3.6037, "step": 14470 }, { "epoch": 0.147247314453125, "grad_norm": 15.321117401123047, "learning_rate": 4.740283180749613e-06, "loss": 3.5764, "step": 14475 }, { "epoch": 0.14729817708333334, "grad_norm": 12.093023300170898, "learning_rate": 4.740105674764243e-06, "loss": 3.4525, "step": 14480 }, { "epoch": 0.14734903971354166, "grad_norm": 12.851874351501465, "learning_rate": 4.739928111466299e-06, "loss": 3.7187, "step": 14485 }, { "epoch": 0.14739990234375, "grad_norm": 11.185502052307129, "learning_rate": 4.739750490860327e-06, "loss": 3.6987, "step": 14490 }, { "epoch": 0.14745076497395834, "grad_norm": 10.547276496887207, "learning_rate": 4.7395728129508686e-06, "loss": 3.746, "step": 14495 }, { "epoch": 0.14750162760416666, "grad_norm": 14.449246406555176, "learning_rate": 4.739395077742471e-06, "loss": 3.6518, "step": 14500 }, { "epoch": 0.147552490234375, "grad_norm": 14.639830589294434, "learning_rate": 4.739217285239681e-06, "loss": 4.1111, "step": 14505 }, { "epoch": 0.14760335286458334, "grad_norm": 13.559803009033203, "learning_rate": 4.739039435447047e-06, "loss": 3.5169, "step": 14510 }, { "epoch": 0.14765421549479166, "grad_norm": 17.011873245239258, "learning_rate": 4.738861528369121e-06, "loss": 3.3476, "step": 14515 }, { "epoch": 0.147705078125, "grad_norm": 10.405207633972168, "learning_rate": 4.7386835640104525e-06, "loss": 3.1015, "step": 14520 }, { "epoch": 0.14775594075520834, "grad_norm": 15.131099700927734, "learning_rate": 4.738505542375595e-06, "loss": 3.7291, "step": 14525 }, { "epoch": 0.14780680338541666, "grad_norm": 17.359588623046875, "learning_rate": 4.738327463469105e-06, "loss": 3.3012, "step": 14530 }, { "epoch": 0.147857666015625, "grad_norm": 8.4977445602417, "learning_rate": 4.738149327295537e-06, "loss": 4.0538, "step": 14535 }, { "epoch": 0.14790852864583334, "grad_norm": 14.337766647338867, "learning_rate": 4.737971133859449e-06, "loss": 2.9475, "step": 14540 }, { "epoch": 0.14795939127604166, "grad_norm": 6.989287853240967, "learning_rate": 4.7377928831654e-06, "loss": 3.5471, "step": 14545 }, { "epoch": 0.14801025390625, "grad_norm": 8.3816499710083, "learning_rate": 4.73761457521795e-06, "loss": 3.4316, "step": 14550 }, { "epoch": 0.14806111653645834, "grad_norm": 10.17712116241455, "learning_rate": 4.7374362100216625e-06, "loss": 3.5723, "step": 14555 }, { "epoch": 0.14811197916666666, "grad_norm": 13.229191780090332, "learning_rate": 4.737257787581099e-06, "loss": 3.5133, "step": 14560 }, { "epoch": 0.148162841796875, "grad_norm": 13.506834983825684, "learning_rate": 4.737079307900826e-06, "loss": 3.5203, "step": 14565 }, { "epoch": 0.14821370442708334, "grad_norm": 13.098067283630371, "learning_rate": 4.736900770985409e-06, "loss": 3.1275, "step": 14570 }, { "epoch": 0.14826456705729166, "grad_norm": 15.300044059753418, "learning_rate": 4.7367221768394155e-06, "loss": 3.3876, "step": 14575 }, { "epoch": 0.1483154296875, "grad_norm": 11.651641845703125, "learning_rate": 4.736543525467415e-06, "loss": 3.1862, "step": 14580 }, { "epoch": 0.14836629231770834, "grad_norm": 17.858064651489258, "learning_rate": 4.736364816873979e-06, "loss": 3.2058, "step": 14585 }, { "epoch": 0.14841715494791666, "grad_norm": 11.784101486206055, "learning_rate": 4.73618605106368e-06, "loss": 3.3627, "step": 14590 }, { "epoch": 0.148468017578125, "grad_norm": 16.125946044921875, "learning_rate": 4.73600722804109e-06, "loss": 3.2323, "step": 14595 }, { "epoch": 0.14851888020833334, "grad_norm": 17.72896385192871, "learning_rate": 4.735828347810785e-06, "loss": 3.542, "step": 14600 }, { "epoch": 0.14856974283854166, "grad_norm": 13.747530937194824, "learning_rate": 4.735649410377342e-06, "loss": 3.0246, "step": 14605 }, { "epoch": 0.14862060546875, "grad_norm": 10.233755111694336, "learning_rate": 4.735470415745339e-06, "loss": 3.6899, "step": 14610 }, { "epoch": 0.14867146809895834, "grad_norm": 13.699902534484863, "learning_rate": 4.735291363919355e-06, "loss": 3.1592, "step": 14615 }, { "epoch": 0.14872233072916666, "grad_norm": 11.987502098083496, "learning_rate": 4.735112254903971e-06, "loss": 3.3525, "step": 14620 }, { "epoch": 0.148773193359375, "grad_norm": 13.890175819396973, "learning_rate": 4.73493308870377e-06, "loss": 3.3058, "step": 14625 }, { "epoch": 0.14882405598958334, "grad_norm": 14.703634262084961, "learning_rate": 4.734753865323336e-06, "loss": 3.3691, "step": 14630 }, { "epoch": 0.14887491861979166, "grad_norm": 11.793313980102539, "learning_rate": 4.734574584767253e-06, "loss": 3.1809, "step": 14635 }, { "epoch": 0.14892578125, "grad_norm": 13.151676177978516, "learning_rate": 4.73439524704011e-06, "loss": 3.4036, "step": 14640 }, { "epoch": 0.14897664388020834, "grad_norm": 10.763456344604492, "learning_rate": 4.734215852146493e-06, "loss": 3.2028, "step": 14645 }, { "epoch": 0.14902750651041666, "grad_norm": 15.11712646484375, "learning_rate": 4.734036400090994e-06, "loss": 3.5627, "step": 14650 }, { "epoch": 0.149078369140625, "grad_norm": 7.594333171844482, "learning_rate": 4.7338568908782036e-06, "loss": 3.526, "step": 14655 }, { "epoch": 0.14912923177083334, "grad_norm": 16.161596298217773, "learning_rate": 4.733677324512713e-06, "loss": 3.3709, "step": 14660 }, { "epoch": 0.14918009440104166, "grad_norm": 9.044793128967285, "learning_rate": 4.733497700999119e-06, "loss": 3.2466, "step": 14665 }, { "epoch": 0.14923095703125, "grad_norm": 9.249896049499512, "learning_rate": 4.733318020342014e-06, "loss": 3.7076, "step": 14670 }, { "epoch": 0.14928181966145834, "grad_norm": 9.653711318969727, "learning_rate": 4.7331382825459985e-06, "loss": 3.3792, "step": 14675 }, { "epoch": 0.14933268229166666, "grad_norm": 10.778152465820312, "learning_rate": 4.732958487615668e-06, "loss": 3.2468, "step": 14680 }, { "epoch": 0.149383544921875, "grad_norm": 12.605219841003418, "learning_rate": 4.7327786355556235e-06, "loss": 3.717, "step": 14685 }, { "epoch": 0.14943440755208334, "grad_norm": 11.11141300201416, "learning_rate": 4.7325987263704685e-06, "loss": 3.1906, "step": 14690 }, { "epoch": 0.14948527018229166, "grad_norm": 13.047835350036621, "learning_rate": 4.732418760064803e-06, "loss": 3.5693, "step": 14695 }, { "epoch": 0.1495361328125, "grad_norm": 12.675945281982422, "learning_rate": 4.7322387366432335e-06, "loss": 3.2765, "step": 14700 }, { "epoch": 0.14958699544270834, "grad_norm": 11.04984188079834, "learning_rate": 4.732058656110364e-06, "loss": 3.0329, "step": 14705 }, { "epoch": 0.14963785807291666, "grad_norm": 14.509688377380371, "learning_rate": 4.7318785184708035e-06, "loss": 3.5102, "step": 14710 }, { "epoch": 0.149688720703125, "grad_norm": 12.124781608581543, "learning_rate": 4.731698323729161e-06, "loss": 3.2282, "step": 14715 }, { "epoch": 0.14973958333333334, "grad_norm": 10.267120361328125, "learning_rate": 4.731518071890045e-06, "loss": 3.3788, "step": 14720 }, { "epoch": 0.14979044596354166, "grad_norm": 9.034321784973145, "learning_rate": 4.731337762958067e-06, "loss": 3.4097, "step": 14725 }, { "epoch": 0.14984130859375, "grad_norm": 13.264888763427734, "learning_rate": 4.731157396937842e-06, "loss": 3.5671, "step": 14730 }, { "epoch": 0.14989217122395834, "grad_norm": 12.145549774169922, "learning_rate": 4.730976973833984e-06, "loss": 3.1394, "step": 14735 }, { "epoch": 0.14994303385416666, "grad_norm": 10.380547523498535, "learning_rate": 4.7307964936511095e-06, "loss": 3.4537, "step": 14740 }, { "epoch": 0.149993896484375, "grad_norm": 10.687427520751953, "learning_rate": 4.730615956393835e-06, "loss": 3.2803, "step": 14745 }, { "epoch": 0.15004475911458334, "grad_norm": 9.110136985778809, "learning_rate": 4.730435362066779e-06, "loss": 3.1083, "step": 14750 }, { "epoch": 0.15009562174479166, "grad_norm": 27.618864059448242, "learning_rate": 4.730254710674564e-06, "loss": 3.4778, "step": 14755 }, { "epoch": 0.150146484375, "grad_norm": 9.159721374511719, "learning_rate": 4.73007400222181e-06, "loss": 3.5116, "step": 14760 }, { "epoch": 0.15019734700520834, "grad_norm": 13.149566650390625, "learning_rate": 4.729893236713142e-06, "loss": 3.3386, "step": 14765 }, { "epoch": 0.15024820963541666, "grad_norm": 10.829029083251953, "learning_rate": 4.729712414153184e-06, "loss": 3.862, "step": 14770 }, { "epoch": 0.150299072265625, "grad_norm": 11.96367073059082, "learning_rate": 4.729531534546563e-06, "loss": 3.66, "step": 14775 }, { "epoch": 0.15034993489583334, "grad_norm": 14.411954879760742, "learning_rate": 4.729350597897905e-06, "loss": 3.2965, "step": 14780 }, { "epoch": 0.15040079752604166, "grad_norm": 12.512883186340332, "learning_rate": 4.729169604211841e-06, "loss": 3.2615, "step": 14785 }, { "epoch": 0.15045166015625, "grad_norm": 11.264093399047852, "learning_rate": 4.728988553493001e-06, "loss": 3.1227, "step": 14790 }, { "epoch": 0.15050252278645834, "grad_norm": 7.70527458190918, "learning_rate": 4.728807445746018e-06, "loss": 3.3477, "step": 14795 }, { "epoch": 0.15055338541666666, "grad_norm": 14.769386291503906, "learning_rate": 4.728626280975523e-06, "loss": 3.6628, "step": 14800 }, { "epoch": 0.150604248046875, "grad_norm": 13.030516624450684, "learning_rate": 4.7284450591861545e-06, "loss": 3.377, "step": 14805 }, { "epoch": 0.15065511067708334, "grad_norm": 12.844939231872559, "learning_rate": 4.728263780382546e-06, "loss": 3.8196, "step": 14810 }, { "epoch": 0.15070597330729166, "grad_norm": 10.097814559936523, "learning_rate": 4.728082444569337e-06, "loss": 3.8156, "step": 14815 }, { "epoch": 0.1507568359375, "grad_norm": 10.388051986694336, "learning_rate": 4.727901051751167e-06, "loss": 3.4716, "step": 14820 }, { "epoch": 0.15080769856770834, "grad_norm": 13.543767929077148, "learning_rate": 4.727719601932678e-06, "loss": 3.4144, "step": 14825 }, { "epoch": 0.15085856119791666, "grad_norm": 11.783435821533203, "learning_rate": 4.727538095118509e-06, "loss": 3.2369, "step": 14830 }, { "epoch": 0.150909423828125, "grad_norm": 14.941376686096191, "learning_rate": 4.727356531313307e-06, "loss": 3.5681, "step": 14835 }, { "epoch": 0.15096028645833334, "grad_norm": 13.9028959274292, "learning_rate": 4.727174910521716e-06, "loss": 3.2229, "step": 14840 }, { "epoch": 0.15101114908854166, "grad_norm": 13.589739799499512, "learning_rate": 4.726993232748382e-06, "loss": 3.3788, "step": 14845 }, { "epoch": 0.15106201171875, "grad_norm": 11.502520561218262, "learning_rate": 4.7268114979979555e-06, "loss": 3.4142, "step": 14850 }, { "epoch": 0.15111287434895834, "grad_norm": 12.786849975585938, "learning_rate": 4.726629706275083e-06, "loss": 3.5127, "step": 14855 }, { "epoch": 0.15116373697916666, "grad_norm": 7.40986967086792, "learning_rate": 4.7264478575844185e-06, "loss": 3.127, "step": 14860 }, { "epoch": 0.151214599609375, "grad_norm": 10.638337135314941, "learning_rate": 4.726265951930612e-06, "loss": 3.0998, "step": 14865 }, { "epoch": 0.15126546223958334, "grad_norm": 10.695724487304688, "learning_rate": 4.72608398931832e-06, "loss": 3.5446, "step": 14870 }, { "epoch": 0.15131632486979166, "grad_norm": 11.744098663330078, "learning_rate": 4.7259019697521955e-06, "loss": 2.9158, "step": 14875 }, { "epoch": 0.1513671875, "grad_norm": 9.356399536132812, "learning_rate": 4.725719893236898e-06, "loss": 3.2486, "step": 14880 }, { "epoch": 0.15141805013020834, "grad_norm": 12.832469940185547, "learning_rate": 4.725537759777084e-06, "loss": 3.2114, "step": 14885 }, { "epoch": 0.15146891276041666, "grad_norm": 12.14478588104248, "learning_rate": 4.725355569377415e-06, "loss": 2.9888, "step": 14890 }, { "epoch": 0.151519775390625, "grad_norm": 8.157812118530273, "learning_rate": 4.72517332204255e-06, "loss": 3.3431, "step": 14895 }, { "epoch": 0.15157063802083334, "grad_norm": 12.803070068359375, "learning_rate": 4.724991017777153e-06, "loss": 3.5178, "step": 14900 }, { "epoch": 0.15162150065104166, "grad_norm": 8.299994468688965, "learning_rate": 4.7248086565858886e-06, "loss": 3.5268, "step": 14905 }, { "epoch": 0.15167236328125, "grad_norm": 12.789700508117676, "learning_rate": 4.724626238473421e-06, "loss": 3.2294, "step": 14910 }, { "epoch": 0.15172322591145834, "grad_norm": 9.37491226196289, "learning_rate": 4.724443763444419e-06, "loss": 3.22, "step": 14915 }, { "epoch": 0.15177408854166666, "grad_norm": 14.288036346435547, "learning_rate": 4.724261231503552e-06, "loss": 3.3765, "step": 14920 }, { "epoch": 0.151824951171875, "grad_norm": 17.946796417236328, "learning_rate": 4.724078642655487e-06, "loss": 3.1767, "step": 14925 }, { "epoch": 0.15187581380208334, "grad_norm": 9.992767333984375, "learning_rate": 4.723895996904897e-06, "loss": 4.5134, "step": 14930 }, { "epoch": 0.15192667643229166, "grad_norm": 12.649903297424316, "learning_rate": 4.7237132942564565e-06, "loss": 3.2216, "step": 14935 }, { "epoch": 0.1519775390625, "grad_norm": 11.577804565429688, "learning_rate": 4.723530534714837e-06, "loss": 3.3525, "step": 14940 }, { "epoch": 0.15202840169270834, "grad_norm": 11.609172821044922, "learning_rate": 4.723347718284716e-06, "loss": 3.2994, "step": 14945 }, { "epoch": 0.15207926432291666, "grad_norm": 11.450626373291016, "learning_rate": 4.723164844970771e-06, "loss": 3.1224, "step": 14950 }, { "epoch": 0.152130126953125, "grad_norm": 12.139467239379883, "learning_rate": 4.722981914777681e-06, "loss": 2.981, "step": 14955 }, { "epoch": 0.15218098958333334, "grad_norm": 14.975263595581055, "learning_rate": 4.722798927710124e-06, "loss": 3.2868, "step": 14960 }, { "epoch": 0.15223185221354166, "grad_norm": 14.647977828979492, "learning_rate": 4.722615883772785e-06, "loss": 3.3901, "step": 14965 }, { "epoch": 0.15228271484375, "grad_norm": 9.550837516784668, "learning_rate": 4.7224327829703444e-06, "loss": 3.8862, "step": 14970 }, { "epoch": 0.15233357747395834, "grad_norm": 10.410728454589844, "learning_rate": 4.7222496253074876e-06, "loss": 3.3335, "step": 14975 }, { "epoch": 0.15238444010416666, "grad_norm": 9.198328018188477, "learning_rate": 4.722066410788902e-06, "loss": 3.7736, "step": 14980 }, { "epoch": 0.152435302734375, "grad_norm": 15.138815879821777, "learning_rate": 4.721883139419273e-06, "loss": 3.2838, "step": 14985 }, { "epoch": 0.15248616536458334, "grad_norm": 10.338898658752441, "learning_rate": 4.721699811203291e-06, "loss": 3.3141, "step": 14990 }, { "epoch": 0.15253702799479166, "grad_norm": 11.960433959960938, "learning_rate": 4.721516426145646e-06, "loss": 3.3508, "step": 14995 }, { "epoch": 0.152587890625, "grad_norm": 12.279603958129883, "learning_rate": 4.7213329842510295e-06, "loss": 3.3861, "step": 15000 }, { "epoch": 0.15263875325520834, "grad_norm": 11.51431941986084, "learning_rate": 4.721149485524135e-06, "loss": 3.3131, "step": 15005 }, { "epoch": 0.15268961588541666, "grad_norm": 8.839967727661133, "learning_rate": 4.720965929969658e-06, "loss": 3.2533, "step": 15010 }, { "epoch": 0.152740478515625, "grad_norm": 8.14935302734375, "learning_rate": 4.720782317592293e-06, "loss": 3.6221, "step": 15015 }, { "epoch": 0.15279134114583334, "grad_norm": 12.631562232971191, "learning_rate": 4.7205986483967396e-06, "loss": 3.6434, "step": 15020 }, { "epoch": 0.15284220377604166, "grad_norm": 12.91220760345459, "learning_rate": 4.720414922387696e-06, "loss": 3.1931, "step": 15025 }, { "epoch": 0.15289306640625, "grad_norm": 15.801246643066406, "learning_rate": 4.720231139569863e-06, "loss": 3.4381, "step": 15030 }, { "epoch": 0.15294392903645834, "grad_norm": 11.53238296508789, "learning_rate": 4.720047299947943e-06, "loss": 3.7497, "step": 15035 }, { "epoch": 0.15299479166666666, "grad_norm": 12.755319595336914, "learning_rate": 4.7198634035266375e-06, "loss": 3.2225, "step": 15040 }, { "epoch": 0.153045654296875, "grad_norm": 9.745579719543457, "learning_rate": 4.719679450310654e-06, "loss": 3.5779, "step": 15045 }, { "epoch": 0.15309651692708334, "grad_norm": 12.108492851257324, "learning_rate": 4.719495440304698e-06, "loss": 3.9278, "step": 15050 }, { "epoch": 0.15314737955729166, "grad_norm": 13.5195951461792, "learning_rate": 4.719311373513477e-06, "loss": 3.3074, "step": 15055 }, { "epoch": 0.1531982421875, "grad_norm": 14.34796142578125, "learning_rate": 4.719127249941701e-06, "loss": 3.2327, "step": 15060 }, { "epoch": 0.15324910481770834, "grad_norm": 10.067496299743652, "learning_rate": 4.718943069594079e-06, "loss": 3.8594, "step": 15065 }, { "epoch": 0.15329996744791666, "grad_norm": 9.026041984558105, "learning_rate": 4.718758832475326e-06, "loss": 3.4053, "step": 15070 }, { "epoch": 0.153350830078125, "grad_norm": 14.132732391357422, "learning_rate": 4.718574538590154e-06, "loss": 3.3339, "step": 15075 }, { "epoch": 0.15340169270833334, "grad_norm": 13.146659851074219, "learning_rate": 4.718390187943278e-06, "loss": 4.1499, "step": 15080 }, { "epoch": 0.15345255533854166, "grad_norm": 12.891654014587402, "learning_rate": 4.7182057805394145e-06, "loss": 3.1477, "step": 15085 }, { "epoch": 0.15350341796875, "grad_norm": 8.704354286193848, "learning_rate": 4.718021316383282e-06, "loss": 3.4241, "step": 15090 }, { "epoch": 0.15355428059895834, "grad_norm": 8.283037185668945, "learning_rate": 4.7178367954796e-06, "loss": 3.4241, "step": 15095 }, { "epoch": 0.15360514322916666, "grad_norm": 14.645727157592773, "learning_rate": 4.7176522178330895e-06, "loss": 4.3103, "step": 15100 }, { "epoch": 0.153656005859375, "grad_norm": 8.679193496704102, "learning_rate": 4.717467583448472e-06, "loss": 4.3254, "step": 15105 }, { "epoch": 0.15370686848958334, "grad_norm": 22.80760383605957, "learning_rate": 4.717282892330472e-06, "loss": 3.3115, "step": 15110 }, { "epoch": 0.15375773111979166, "grad_norm": 10.638278007507324, "learning_rate": 4.717098144483815e-06, "loss": 3.4555, "step": 15115 }, { "epoch": 0.15380859375, "grad_norm": 16.177385330200195, "learning_rate": 4.7169133399132285e-06, "loss": 3.2353, "step": 15120 }, { "epoch": 0.15385945638020834, "grad_norm": 13.588946342468262, "learning_rate": 4.7167284786234385e-06, "loss": 3.512, "step": 15125 }, { "epoch": 0.15391031901041666, "grad_norm": 14.394001960754395, "learning_rate": 4.716543560619175e-06, "loss": 3.0573, "step": 15130 }, { "epoch": 0.153961181640625, "grad_norm": 9.461201667785645, "learning_rate": 4.716358585905172e-06, "loss": 3.2514, "step": 15135 }, { "epoch": 0.15401204427083334, "grad_norm": 13.264630317687988, "learning_rate": 4.716173554486159e-06, "loss": 3.2544, "step": 15140 }, { "epoch": 0.15406290690104166, "grad_norm": 11.358147621154785, "learning_rate": 4.71598846636687e-06, "loss": 3.5706, "step": 15145 }, { "epoch": 0.15411376953125, "grad_norm": 14.053950309753418, "learning_rate": 4.715803321552043e-06, "loss": 3.4664, "step": 15150 }, { "epoch": 0.15416463216145834, "grad_norm": 14.992704391479492, "learning_rate": 4.715618120046412e-06, "loss": 3.9273, "step": 15155 }, { "epoch": 0.15421549479166666, "grad_norm": 13.31781005859375, "learning_rate": 4.715432861854717e-06, "loss": 3.5456, "step": 15160 }, { "epoch": 0.154266357421875, "grad_norm": 8.325743675231934, "learning_rate": 4.715247546981697e-06, "loss": 3.3918, "step": 15165 }, { "epoch": 0.15431722005208334, "grad_norm": 11.262593269348145, "learning_rate": 4.715062175432093e-06, "loss": 3.5529, "step": 15170 }, { "epoch": 0.15436808268229166, "grad_norm": 10.394039154052734, "learning_rate": 4.71487674721065e-06, "loss": 3.3751, "step": 15175 }, { "epoch": 0.1544189453125, "grad_norm": 16.578327178955078, "learning_rate": 4.7146912623221095e-06, "loss": 3.1776, "step": 15180 }, { "epoch": 0.15446980794270834, "grad_norm": 11.776280403137207, "learning_rate": 4.7145057207712175e-06, "loss": 3.5786, "step": 15185 }, { "epoch": 0.15452067057291666, "grad_norm": 12.497096061706543, "learning_rate": 4.714320122562722e-06, "loss": 3.0066, "step": 15190 }, { "epoch": 0.154571533203125, "grad_norm": 9.880024909973145, "learning_rate": 4.714134467701371e-06, "loss": 3.3478, "step": 15195 }, { "epoch": 0.15462239583333334, "grad_norm": 9.223662376403809, "learning_rate": 4.713948756191915e-06, "loss": 3.3263, "step": 15200 }, { "epoch": 0.15467325846354166, "grad_norm": 7.955392837524414, "learning_rate": 4.713762988039105e-06, "loss": 3.4025, "step": 15205 }, { "epoch": 0.15472412109375, "grad_norm": 10.604644775390625, "learning_rate": 4.713577163247692e-06, "loss": 2.9697, "step": 15210 }, { "epoch": 0.15477498372395834, "grad_norm": 7.452188014984131, "learning_rate": 4.713391281822433e-06, "loss": 3.6369, "step": 15215 }, { "epoch": 0.15482584635416666, "grad_norm": 12.423209190368652, "learning_rate": 4.713205343768082e-06, "loss": 3.3861, "step": 15220 }, { "epoch": 0.154876708984375, "grad_norm": 17.55681610107422, "learning_rate": 4.713019349089399e-06, "loss": 3.6858, "step": 15225 }, { "epoch": 0.15492757161458334, "grad_norm": 14.02679443359375, "learning_rate": 4.712833297791138e-06, "loss": 3.167, "step": 15230 }, { "epoch": 0.15497843424479166, "grad_norm": 11.839001655578613, "learning_rate": 4.712647189878063e-06, "loss": 2.955, "step": 15235 }, { "epoch": 0.155029296875, "grad_norm": 11.20364761352539, "learning_rate": 4.712461025354933e-06, "loss": 3.0909, "step": 15240 }, { "epoch": 0.15508015950520834, "grad_norm": 18.19971466064453, "learning_rate": 4.712274804226513e-06, "loss": 3.2367, "step": 15245 }, { "epoch": 0.15513102213541666, "grad_norm": 7.526338577270508, "learning_rate": 4.712088526497566e-06, "loss": 3.3709, "step": 15250 }, { "epoch": 0.155181884765625, "grad_norm": 15.513593673706055, "learning_rate": 4.711902192172858e-06, "loss": 3.9162, "step": 15255 }, { "epoch": 0.15523274739583334, "grad_norm": 10.80632495880127, "learning_rate": 4.7117158012571585e-06, "loss": 3.1795, "step": 15260 }, { "epoch": 0.15528361002604166, "grad_norm": 13.773353576660156, "learning_rate": 4.711529353755233e-06, "loss": 3.7235, "step": 15265 }, { "epoch": 0.15533447265625, "grad_norm": 9.968823432922363, "learning_rate": 4.711342849671853e-06, "loss": 3.4381, "step": 15270 }, { "epoch": 0.15538533528645834, "grad_norm": 7.348340034484863, "learning_rate": 4.711156289011792e-06, "loss": 2.9837, "step": 15275 }, { "epoch": 0.15543619791666666, "grad_norm": 19.31353187561035, "learning_rate": 4.710969671779819e-06, "loss": 3.2548, "step": 15280 }, { "epoch": 0.155487060546875, "grad_norm": 13.9642915725708, "learning_rate": 4.7107829979807124e-06, "loss": 3.1202, "step": 15285 }, { "epoch": 0.15553792317708334, "grad_norm": 12.23745346069336, "learning_rate": 4.710596267619247e-06, "loss": 3.3532, "step": 15290 }, { "epoch": 0.15558878580729166, "grad_norm": 10.830820083618164, "learning_rate": 4.710409480700199e-06, "loss": 3.5973, "step": 15295 }, { "epoch": 0.1556396484375, "grad_norm": 10.229269981384277, "learning_rate": 4.710222637228349e-06, "loss": 3.5623, "step": 15300 }, { "epoch": 0.15569051106770834, "grad_norm": 11.168558120727539, "learning_rate": 4.710035737208477e-06, "loss": 3.0141, "step": 15305 }, { "epoch": 0.15574137369791666, "grad_norm": 7.080033302307129, "learning_rate": 4.709848780645364e-06, "loss": 3.4462, "step": 15310 }, { "epoch": 0.155792236328125, "grad_norm": 7.330209255218506, "learning_rate": 4.709661767543794e-06, "loss": 3.2099, "step": 15315 }, { "epoch": 0.15584309895833334, "grad_norm": 7.501708030700684, "learning_rate": 4.709474697908552e-06, "loss": 3.3722, "step": 15320 }, { "epoch": 0.15589396158854166, "grad_norm": 9.475503921508789, "learning_rate": 4.709287571744423e-06, "loss": 3.6584, "step": 15325 }, { "epoch": 0.15594482421875, "grad_norm": 14.18191146850586, "learning_rate": 4.709100389056195e-06, "loss": 3.3899, "step": 15330 }, { "epoch": 0.15599568684895834, "grad_norm": 9.33660888671875, "learning_rate": 4.708913149848658e-06, "loss": 3.455, "step": 15335 }, { "epoch": 0.15604654947916666, "grad_norm": 10.5362548828125, "learning_rate": 4.7087258541266e-06, "loss": 3.5219, "step": 15340 }, { "epoch": 0.156097412109375, "grad_norm": 13.641423225402832, "learning_rate": 4.7085385018948155e-06, "loss": 3.8866, "step": 15345 }, { "epoch": 0.15614827473958334, "grad_norm": 14.66014289855957, "learning_rate": 4.708351093158097e-06, "loss": 3.6575, "step": 15350 }, { "epoch": 0.15619913736979166, "grad_norm": 10.71635627746582, "learning_rate": 4.708163627921239e-06, "loss": 3.5451, "step": 15355 }, { "epoch": 0.15625, "grad_norm": 12.844642639160156, "learning_rate": 4.7079761061890374e-06, "loss": 3.3541, "step": 15360 }, { "epoch": 0.15630086263020834, "grad_norm": 9.12993335723877, "learning_rate": 4.707788527966291e-06, "loss": 3.3004, "step": 15365 }, { "epoch": 0.15635172526041666, "grad_norm": 8.735359191894531, "learning_rate": 4.707600893257799e-06, "loss": 3.6416, "step": 15370 }, { "epoch": 0.156402587890625, "grad_norm": 17.718563079833984, "learning_rate": 4.707413202068361e-06, "loss": 3.4838, "step": 15375 }, { "epoch": 0.15645345052083334, "grad_norm": 13.61154556274414, "learning_rate": 4.707225454402779e-06, "loss": 3.3291, "step": 15380 }, { "epoch": 0.15650431315104166, "grad_norm": 10.323596954345703, "learning_rate": 4.707037650265857e-06, "loss": 3.44, "step": 15385 }, { "epoch": 0.15655517578125, "grad_norm": 12.949178695678711, "learning_rate": 4.7068497896624014e-06, "loss": 3.6229, "step": 15390 }, { "epoch": 0.15660603841145834, "grad_norm": 10.059599876403809, "learning_rate": 4.706661872597215e-06, "loss": 3.1794, "step": 15395 }, { "epoch": 0.15665690104166666, "grad_norm": 13.678911209106445, "learning_rate": 4.706473899075108e-06, "loss": 3.9318, "step": 15400 }, { "epoch": 0.156707763671875, "grad_norm": 11.830682754516602, "learning_rate": 4.7062858691008906e-06, "loss": 3.383, "step": 15405 }, { "epoch": 0.15675862630208334, "grad_norm": 17.724088668823242, "learning_rate": 4.706097782679371e-06, "loss": 3.9705, "step": 15410 }, { "epoch": 0.15680948893229166, "grad_norm": 7.41434907913208, "learning_rate": 4.7059096398153624e-06, "loss": 3.3684, "step": 15415 }, { "epoch": 0.1568603515625, "grad_norm": 10.549930572509766, "learning_rate": 4.705721440513679e-06, "loss": 3.5617, "step": 15420 }, { "epoch": 0.15691121419270834, "grad_norm": 13.3803071975708, "learning_rate": 4.705533184779135e-06, "loss": 3.4956, "step": 15425 }, { "epoch": 0.15696207682291666, "grad_norm": 9.431879043579102, "learning_rate": 4.705344872616548e-06, "loss": 3.3123, "step": 15430 }, { "epoch": 0.157012939453125, "grad_norm": 13.48584270477295, "learning_rate": 4.705156504030735e-06, "loss": 3.5462, "step": 15435 }, { "epoch": 0.15706380208333334, "grad_norm": 12.505586624145508, "learning_rate": 4.7049680790265145e-06, "loss": 3.5568, "step": 15440 }, { "epoch": 0.15711466471354166, "grad_norm": 14.382122993469238, "learning_rate": 4.704779597608709e-06, "loss": 3.2458, "step": 15445 }, { "epoch": 0.15716552734375, "grad_norm": 17.052955627441406, "learning_rate": 4.70459105978214e-06, "loss": 3.3895, "step": 15450 }, { "epoch": 0.15721638997395834, "grad_norm": 9.036126136779785, "learning_rate": 4.704402465551632e-06, "loss": 3.3079, "step": 15455 }, { "epoch": 0.15726725260416666, "grad_norm": 10.451210021972656, "learning_rate": 4.704213814922008e-06, "loss": 3.5565, "step": 15460 }, { "epoch": 0.157318115234375, "grad_norm": 9.082598686218262, "learning_rate": 4.704025107898097e-06, "loss": 3.1954, "step": 15465 }, { "epoch": 0.15736897786458334, "grad_norm": 15.115447044372559, "learning_rate": 4.703836344484726e-06, "loss": 3.3543, "step": 15470 }, { "epoch": 0.15741984049479166, "grad_norm": 11.461282730102539, "learning_rate": 4.7036475246867245e-06, "loss": 3.7555, "step": 15475 }, { "epoch": 0.157470703125, "grad_norm": 10.950669288635254, "learning_rate": 4.703458648508923e-06, "loss": 3.5753, "step": 15480 }, { "epoch": 0.15752156575520834, "grad_norm": 10.564921379089355, "learning_rate": 4.703269715956154e-06, "loss": 3.0771, "step": 15485 }, { "epoch": 0.15757242838541666, "grad_norm": 11.57642650604248, "learning_rate": 4.703080727033252e-06, "loss": 3.1755, "step": 15490 }, { "epoch": 0.157623291015625, "grad_norm": 10.133732795715332, "learning_rate": 4.702891681745052e-06, "loss": 3.6182, "step": 15495 }, { "epoch": 0.15767415364583334, "grad_norm": 11.367161750793457, "learning_rate": 4.70270258009639e-06, "loss": 3.2131, "step": 15500 }, { "epoch": 0.15772501627604166, "grad_norm": 10.937525749206543, "learning_rate": 4.702513422092106e-06, "loss": 3.6436, "step": 15505 }, { "epoch": 0.15777587890625, "grad_norm": 7.845228672027588, "learning_rate": 4.7023242077370365e-06, "loss": 3.2949, "step": 15510 }, { "epoch": 0.15782674153645834, "grad_norm": 10.833147048950195, "learning_rate": 4.7021349370360246e-06, "loss": 3.5691, "step": 15515 }, { "epoch": 0.15787760416666666, "grad_norm": 10.04218578338623, "learning_rate": 4.701945609993912e-06, "loss": 3.6244, "step": 15520 }, { "epoch": 0.157928466796875, "grad_norm": 15.010988235473633, "learning_rate": 4.701756226615544e-06, "loss": 3.5053, "step": 15525 }, { "epoch": 0.15797932942708334, "grad_norm": 12.318516731262207, "learning_rate": 4.701566786905763e-06, "loss": 3.2215, "step": 15530 }, { "epoch": 0.15803019205729166, "grad_norm": 12.219952583312988, "learning_rate": 4.701377290869419e-06, "loss": 3.5105, "step": 15535 }, { "epoch": 0.1580810546875, "grad_norm": 15.182580947875977, "learning_rate": 4.701187738511358e-06, "loss": 3.6047, "step": 15540 }, { "epoch": 0.15813191731770834, "grad_norm": 8.09954833984375, "learning_rate": 4.700998129836431e-06, "loss": 3.2873, "step": 15545 }, { "epoch": 0.15818277994791666, "grad_norm": 12.153461456298828, "learning_rate": 4.700808464849489e-06, "loss": 3.2927, "step": 15550 }, { "epoch": 0.158233642578125, "grad_norm": 16.8814754486084, "learning_rate": 4.700618743555384e-06, "loss": 3.6992, "step": 15555 }, { "epoch": 0.15828450520833334, "grad_norm": 8.98434066772461, "learning_rate": 4.700428965958968e-06, "loss": 3.2662, "step": 15560 }, { "epoch": 0.15833536783854166, "grad_norm": 14.116239547729492, "learning_rate": 4.700239132065101e-06, "loss": 3.6816, "step": 15565 }, { "epoch": 0.15838623046875, "grad_norm": 14.018820762634277, "learning_rate": 4.700049241878637e-06, "loss": 3.4, "step": 15570 }, { "epoch": 0.15843709309895834, "grad_norm": 9.490670204162598, "learning_rate": 4.699859295404433e-06, "loss": 3.3853, "step": 15575 }, { "epoch": 0.15848795572916666, "grad_norm": 12.291707992553711, "learning_rate": 4.699669292647352e-06, "loss": 3.4993, "step": 15580 }, { "epoch": 0.158538818359375, "grad_norm": 11.097095489501953, "learning_rate": 4.699479233612252e-06, "loss": 3.4258, "step": 15585 }, { "epoch": 0.15858968098958334, "grad_norm": 16.145360946655273, "learning_rate": 4.699289118303998e-06, "loss": 3.6462, "step": 15590 }, { "epoch": 0.15864054361979166, "grad_norm": 13.295560836791992, "learning_rate": 4.699098946727454e-06, "loss": 3.7865, "step": 15595 }, { "epoch": 0.15869140625, "grad_norm": 7.7286152839660645, "learning_rate": 4.6989087188874835e-06, "loss": 3.2336, "step": 15600 }, { "epoch": 0.15874226888020834, "grad_norm": 8.864896774291992, "learning_rate": 4.698718434788955e-06, "loss": 3.5222, "step": 15605 }, { "epoch": 0.15879313151041666, "grad_norm": 16.60683822631836, "learning_rate": 4.698528094436737e-06, "loss": 3.5871, "step": 15610 }, { "epoch": 0.158843994140625, "grad_norm": 13.247583389282227, "learning_rate": 4.698337697835697e-06, "loss": 3.3722, "step": 15615 }, { "epoch": 0.15889485677083334, "grad_norm": 10.77502155303955, "learning_rate": 4.69814724499071e-06, "loss": 3.3955, "step": 15620 }, { "epoch": 0.15894571940104166, "grad_norm": 12.344950675964355, "learning_rate": 4.697956735906646e-06, "loss": 3.197, "step": 15625 }, { "epoch": 0.15899658203125, "grad_norm": 14.187060356140137, "learning_rate": 4.6977661705883805e-06, "loss": 3.3216, "step": 15630 }, { "epoch": 0.15904744466145834, "grad_norm": 14.362591743469238, "learning_rate": 4.697575549040788e-06, "loss": 3.4256, "step": 15635 }, { "epoch": 0.15909830729166666, "grad_norm": 10.191631317138672, "learning_rate": 4.697384871268745e-06, "loss": 3.4145, "step": 15640 }, { "epoch": 0.159149169921875, "grad_norm": 15.049766540527344, "learning_rate": 4.697194137277132e-06, "loss": 3.4378, "step": 15645 }, { "epoch": 0.15920003255208334, "grad_norm": 16.25983428955078, "learning_rate": 4.697003347070828e-06, "loss": 3.3361, "step": 15650 }, { "epoch": 0.15925089518229166, "grad_norm": 17.122467041015625, "learning_rate": 4.696812500654714e-06, "loss": 3.2949, "step": 15655 }, { "epoch": 0.1593017578125, "grad_norm": 11.814437866210938, "learning_rate": 4.696621598033673e-06, "loss": 3.1715, "step": 15660 }, { "epoch": 0.15935262044270834, "grad_norm": 9.952800750732422, "learning_rate": 4.696430639212588e-06, "loss": 3.3326, "step": 15665 }, { "epoch": 0.15940348307291666, "grad_norm": 13.430285453796387, "learning_rate": 4.696239624196346e-06, "loss": 3.542, "step": 15670 }, { "epoch": 0.159454345703125, "grad_norm": 8.788776397705078, "learning_rate": 4.696048552989835e-06, "loss": 3.3412, "step": 15675 }, { "epoch": 0.15950520833333334, "grad_norm": 11.158987045288086, "learning_rate": 4.69585742559794e-06, "loss": 3.1511, "step": 15680 }, { "epoch": 0.15955607096354166, "grad_norm": 8.167466163635254, "learning_rate": 4.695666242025556e-06, "loss": 3.7305, "step": 15685 }, { "epoch": 0.15960693359375, "grad_norm": 13.274105072021484, "learning_rate": 4.69547500227757e-06, "loss": 3.3312, "step": 15690 }, { "epoch": 0.15965779622395834, "grad_norm": 10.515645027160645, "learning_rate": 4.6952837063588766e-06, "loss": 3.07, "step": 15695 }, { "epoch": 0.15970865885416666, "grad_norm": 12.83730411529541, "learning_rate": 4.69509235427437e-06, "loss": 3.5047, "step": 15700 }, { "epoch": 0.159759521484375, "grad_norm": 12.311369895935059, "learning_rate": 4.694900946028946e-06, "loss": 3.6497, "step": 15705 }, { "epoch": 0.15981038411458334, "grad_norm": 12.914471626281738, "learning_rate": 4.694709481627502e-06, "loss": 3.2395, "step": 15710 }, { "epoch": 0.15986124674479166, "grad_norm": 14.76524543762207, "learning_rate": 4.694517961074934e-06, "loss": 3.2397, "step": 15715 }, { "epoch": 0.159912109375, "grad_norm": 18.405723571777344, "learning_rate": 4.694326384376146e-06, "loss": 3.3388, "step": 15720 }, { "epoch": 0.15996297200520834, "grad_norm": 9.726785659790039, "learning_rate": 4.694134751536038e-06, "loss": 3.287, "step": 15725 }, { "epoch": 0.16001383463541666, "grad_norm": 12.435685157775879, "learning_rate": 4.693943062559512e-06, "loss": 3.2259, "step": 15730 }, { "epoch": 0.160064697265625, "grad_norm": 11.35924243927002, "learning_rate": 4.693751317451472e-06, "loss": 3.1574, "step": 15735 }, { "epoch": 0.16011555989583334, "grad_norm": 10.276074409484863, "learning_rate": 4.693559516216825e-06, "loss": 3.3365, "step": 15740 }, { "epoch": 0.16016642252604166, "grad_norm": 8.467253684997559, "learning_rate": 4.693367658860478e-06, "loss": 3.1872, "step": 15745 }, { "epoch": 0.16021728515625, "grad_norm": 11.348249435424805, "learning_rate": 4.693175745387339e-06, "loss": 3.3335, "step": 15750 }, { "epoch": 0.16026814778645834, "grad_norm": 12.088251113891602, "learning_rate": 4.692983775802318e-06, "loss": 3.4162, "step": 15755 }, { "epoch": 0.16031901041666666, "grad_norm": 10.288962364196777, "learning_rate": 4.692791750110327e-06, "loss": 3.4439, "step": 15760 }, { "epoch": 0.160369873046875, "grad_norm": 15.410057067871094, "learning_rate": 4.692599668316279e-06, "loss": 3.4407, "step": 15765 }, { "epoch": 0.16042073567708334, "grad_norm": 13.116768836975098, "learning_rate": 4.692407530425089e-06, "loss": 3.4639, "step": 15770 }, { "epoch": 0.16047159830729166, "grad_norm": 9.727956771850586, "learning_rate": 4.692215336441671e-06, "loss": 3.3015, "step": 15775 }, { "epoch": 0.1605224609375, "grad_norm": 11.078639030456543, "learning_rate": 4.692023086370944e-06, "loss": 3.265, "step": 15780 }, { "epoch": 0.16057332356770834, "grad_norm": 7.883986473083496, "learning_rate": 4.6918307802178255e-06, "loss": 3.4122, "step": 15785 }, { "epoch": 0.16062418619791666, "grad_norm": 12.443114280700684, "learning_rate": 4.6916384179872356e-06, "loss": 3.0951, "step": 15790 }, { "epoch": 0.160675048828125, "grad_norm": 9.635088920593262, "learning_rate": 4.691445999684097e-06, "loss": 3.8282, "step": 15795 }, { "epoch": 0.16072591145833334, "grad_norm": 8.927238464355469, "learning_rate": 4.69125352531333e-06, "loss": 3.4864, "step": 15800 }, { "epoch": 0.16077677408854166, "grad_norm": 10.879573822021484, "learning_rate": 4.6910609948798636e-06, "loss": 3.7555, "step": 15805 }, { "epoch": 0.16082763671875, "grad_norm": 8.361101150512695, "learning_rate": 4.69086840838862e-06, "loss": 3.5791, "step": 15810 }, { "epoch": 0.16087849934895834, "grad_norm": 10.21220588684082, "learning_rate": 4.6906757658445265e-06, "loss": 3.2996, "step": 15815 }, { "epoch": 0.16092936197916666, "grad_norm": 12.77525520324707, "learning_rate": 4.690483067252514e-06, "loss": 3.4817, "step": 15820 }, { "epoch": 0.160980224609375, "grad_norm": 12.158894538879395, "learning_rate": 4.690290312617512e-06, "loss": 3.3044, "step": 15825 }, { "epoch": 0.16103108723958334, "grad_norm": 15.93702507019043, "learning_rate": 4.69009750194445e-06, "loss": 3.3049, "step": 15830 }, { "epoch": 0.16108194986979166, "grad_norm": 14.75804615020752, "learning_rate": 4.6899046352382625e-06, "loss": 3.2458, "step": 15835 }, { "epoch": 0.1611328125, "grad_norm": 7.818066596984863, "learning_rate": 4.689711712503885e-06, "loss": 3.3629, "step": 15840 }, { "epoch": 0.16118367513020834, "grad_norm": 11.566434860229492, "learning_rate": 4.689518733746251e-06, "loss": 3.3139, "step": 15845 }, { "epoch": 0.16123453776041666, "grad_norm": 11.620870590209961, "learning_rate": 4.689325698970301e-06, "loss": 3.3736, "step": 15850 }, { "epoch": 0.161285400390625, "grad_norm": 9.550261497497559, "learning_rate": 4.6891326081809705e-06, "loss": 3.1503, "step": 15855 }, { "epoch": 0.16133626302083334, "grad_norm": 14.424015045166016, "learning_rate": 4.688939461383202e-06, "loss": 3.3043, "step": 15860 }, { "epoch": 0.16138712565104166, "grad_norm": 14.749124526977539, "learning_rate": 4.688746258581936e-06, "loss": 3.2469, "step": 15865 }, { "epoch": 0.16143798828125, "grad_norm": 14.39970588684082, "learning_rate": 4.688552999782114e-06, "loss": 3.4534, "step": 15870 }, { "epoch": 0.16148885091145834, "grad_norm": 11.76413631439209, "learning_rate": 4.6883596849886845e-06, "loss": 3.1851, "step": 15875 }, { "epoch": 0.16153971354166666, "grad_norm": 14.800198554992676, "learning_rate": 4.68816631420659e-06, "loss": 3.7561, "step": 15880 }, { "epoch": 0.161590576171875, "grad_norm": 13.527456283569336, "learning_rate": 4.68797288744078e-06, "loss": 3.4993, "step": 15885 }, { "epoch": 0.16164143880208334, "grad_norm": 13.89983081817627, "learning_rate": 4.6877794046962014e-06, "loss": 3.5607, "step": 15890 }, { "epoch": 0.16169230143229166, "grad_norm": 12.58095932006836, "learning_rate": 4.687585865977806e-06, "loss": 3.2699, "step": 15895 }, { "epoch": 0.1617431640625, "grad_norm": 14.336568832397461, "learning_rate": 4.687392271290544e-06, "loss": 3.5323, "step": 15900 }, { "epoch": 0.16179402669270834, "grad_norm": 12.0953950881958, "learning_rate": 4.6871986206393695e-06, "loss": 3.549, "step": 15905 }, { "epoch": 0.16184488932291666, "grad_norm": 10.101968765258789, "learning_rate": 4.687004914029237e-06, "loss": 3.3465, "step": 15910 }, { "epoch": 0.161895751953125, "grad_norm": 14.160582542419434, "learning_rate": 4.6868111514651025e-06, "loss": 3.3592, "step": 15915 }, { "epoch": 0.16194661458333334, "grad_norm": 8.53564453125, "learning_rate": 4.686617332951922e-06, "loss": 3.3656, "step": 15920 }, { "epoch": 0.16199747721354166, "grad_norm": 13.714225769042969, "learning_rate": 4.686423458494655e-06, "loss": 2.912, "step": 15925 }, { "epoch": 0.16204833984375, "grad_norm": 13.133865356445312, "learning_rate": 4.686229528098263e-06, "loss": 3.5427, "step": 15930 }, { "epoch": 0.16209920247395834, "grad_norm": 10.839924812316895, "learning_rate": 4.686035541767707e-06, "loss": 3.1262, "step": 15935 }, { "epoch": 0.16215006510416666, "grad_norm": 16.54444694519043, "learning_rate": 4.6858414995079495e-06, "loss": 3.5354, "step": 15940 }, { "epoch": 0.162200927734375, "grad_norm": 14.630023002624512, "learning_rate": 4.685647401323955e-06, "loss": 3.9413, "step": 15945 }, { "epoch": 0.16225179036458334, "grad_norm": 9.191962242126465, "learning_rate": 4.68545324722069e-06, "loss": 3.011, "step": 15950 }, { "epoch": 0.16230265299479166, "grad_norm": 9.845999717712402, "learning_rate": 4.685259037203121e-06, "loss": 3.433, "step": 15955 }, { "epoch": 0.162353515625, "grad_norm": 10.282391548156738, "learning_rate": 4.685064771276219e-06, "loss": 4.0526, "step": 15960 }, { "epoch": 0.16240437825520834, "grad_norm": 11.311758995056152, "learning_rate": 4.684870449444951e-06, "loss": 3.3342, "step": 15965 }, { "epoch": 0.16245524088541666, "grad_norm": 10.48366641998291, "learning_rate": 4.684676071714292e-06, "loss": 3.3553, "step": 15970 }, { "epoch": 0.162506103515625, "grad_norm": 14.828400611877441, "learning_rate": 4.684481638089212e-06, "loss": 3.3099, "step": 15975 }, { "epoch": 0.16255696614583334, "grad_norm": 9.731256484985352, "learning_rate": 4.684287148574689e-06, "loss": 3.4463, "step": 15980 }, { "epoch": 0.16260782877604166, "grad_norm": 11.136251449584961, "learning_rate": 4.684092603175696e-06, "loss": 3.5473, "step": 15985 }, { "epoch": 0.16265869140625, "grad_norm": 13.161229133605957, "learning_rate": 4.683898001897211e-06, "loss": 3.3273, "step": 15990 }, { "epoch": 0.16270955403645834, "grad_norm": 9.826051712036133, "learning_rate": 4.683703344744213e-06, "loss": 3.3483, "step": 15995 }, { "epoch": 0.16276041666666666, "grad_norm": 7.0690460205078125, "learning_rate": 4.683508631721684e-06, "loss": 3.0126, "step": 16000 }, { "epoch": 0.162811279296875, "grad_norm": 13.985572814941406, "learning_rate": 4.683313862834603e-06, "loss": 3.6359, "step": 16005 }, { "epoch": 0.16286214192708334, "grad_norm": 10.578701972961426, "learning_rate": 4.683119038087955e-06, "loss": 3.2651, "step": 16010 }, { "epoch": 0.16291300455729166, "grad_norm": 11.11215591430664, "learning_rate": 4.682924157486724e-06, "loss": 3.1562, "step": 16015 }, { "epoch": 0.1629638671875, "grad_norm": 11.88314151763916, "learning_rate": 4.682729221035895e-06, "loss": 3.677, "step": 16020 }, { "epoch": 0.16301472981770834, "grad_norm": 8.311817169189453, "learning_rate": 4.6825342287404564e-06, "loss": 3.6961, "step": 16025 }, { "epoch": 0.16306559244791666, "grad_norm": 13.549260139465332, "learning_rate": 4.682339180605397e-06, "loss": 3.7595, "step": 16030 }, { "epoch": 0.163116455078125, "grad_norm": 9.020939826965332, "learning_rate": 4.682144076635707e-06, "loss": 3.2208, "step": 16035 }, { "epoch": 0.16316731770833334, "grad_norm": 16.133804321289062, "learning_rate": 4.681948916836378e-06, "loss": 3.4582, "step": 16040 }, { "epoch": 0.16321818033854166, "grad_norm": 9.808793067932129, "learning_rate": 4.681753701212404e-06, "loss": 3.3957, "step": 16045 }, { "epoch": 0.16326904296875, "grad_norm": 13.307278633117676, "learning_rate": 4.681558429768777e-06, "loss": 3.2849, "step": 16050 }, { "epoch": 0.16331990559895834, "grad_norm": 17.242029190063477, "learning_rate": 4.681363102510496e-06, "loss": 3.2799, "step": 16055 }, { "epoch": 0.16337076822916666, "grad_norm": 9.080634117126465, "learning_rate": 4.6811677194425566e-06, "loss": 2.9969, "step": 16060 }, { "epoch": 0.163421630859375, "grad_norm": 8.223971366882324, "learning_rate": 4.680972280569958e-06, "loss": 3.3398, "step": 16065 }, { "epoch": 0.16347249348958334, "grad_norm": 13.515527725219727, "learning_rate": 4.680776785897701e-06, "loss": 3.1698, "step": 16070 }, { "epoch": 0.16352335611979166, "grad_norm": 10.615397453308105, "learning_rate": 4.680581235430786e-06, "loss": 3.644, "step": 16075 }, { "epoch": 0.16357421875, "grad_norm": 13.023798942565918, "learning_rate": 4.680385629174218e-06, "loss": 3.8601, "step": 16080 }, { "epoch": 0.16362508138020834, "grad_norm": 9.510238647460938, "learning_rate": 4.680189967133e-06, "loss": 3.2448, "step": 16085 }, { "epoch": 0.16367594401041666, "grad_norm": 10.635534286499023, "learning_rate": 4.6799942493121374e-06, "loss": 3.7123, "step": 16090 }, { "epoch": 0.163726806640625, "grad_norm": 8.131168365478516, "learning_rate": 4.67979847571664e-06, "loss": 3.4378, "step": 16095 }, { "epoch": 0.16377766927083334, "grad_norm": 9.373008728027344, "learning_rate": 4.679602646351515e-06, "loss": 3.5492, "step": 16100 }, { "epoch": 0.16382853190104166, "grad_norm": 15.981472969055176, "learning_rate": 4.6794067612217734e-06, "loss": 3.7413, "step": 16105 }, { "epoch": 0.16387939453125, "grad_norm": 9.636930465698242, "learning_rate": 4.679210820332425e-06, "loss": 3.5977, "step": 16110 }, { "epoch": 0.16393025716145834, "grad_norm": 13.382928848266602, "learning_rate": 4.679014823688485e-06, "loss": 3.3873, "step": 16115 }, { "epoch": 0.16398111979166666, "grad_norm": 11.43319320678711, "learning_rate": 4.678818771294967e-06, "loss": 3.4517, "step": 16120 }, { "epoch": 0.164031982421875, "grad_norm": 10.521183013916016, "learning_rate": 4.678622663156888e-06, "loss": 3.4823, "step": 16125 }, { "epoch": 0.16408284505208334, "grad_norm": 10.852574348449707, "learning_rate": 4.678426499279264e-06, "loss": 3.4299, "step": 16130 }, { "epoch": 0.16413370768229166, "grad_norm": 13.135746955871582, "learning_rate": 4.6782302796671145e-06, "loss": 3.4938, "step": 16135 }, { "epoch": 0.1641845703125, "grad_norm": 16.104429244995117, "learning_rate": 4.678034004325459e-06, "loss": 3.4273, "step": 16140 }, { "epoch": 0.16423543294270834, "grad_norm": 14.600593566894531, "learning_rate": 4.677837673259321e-06, "loss": 3.4873, "step": 16145 }, { "epoch": 0.16428629557291666, "grad_norm": 10.380234718322754, "learning_rate": 4.677641286473722e-06, "loss": 3.4697, "step": 16150 }, { "epoch": 0.164337158203125, "grad_norm": 13.653000831604004, "learning_rate": 4.677444843973685e-06, "loss": 3.2568, "step": 16155 }, { "epoch": 0.16438802083333334, "grad_norm": 10.491547584533691, "learning_rate": 4.67724834576424e-06, "loss": 3.7773, "step": 16160 }, { "epoch": 0.16443888346354166, "grad_norm": 18.804861068725586, "learning_rate": 4.677051791850411e-06, "loss": 3.1534, "step": 16165 }, { "epoch": 0.16448974609375, "grad_norm": 10.031415939331055, "learning_rate": 4.676855182237229e-06, "loss": 3.1366, "step": 16170 }, { "epoch": 0.16454060872395834, "grad_norm": 11.652718544006348, "learning_rate": 4.6766585169297215e-06, "loss": 3.302, "step": 16175 }, { "epoch": 0.16459147135416666, "grad_norm": 12.849843978881836, "learning_rate": 4.6764617959329226e-06, "loss": 3.7761, "step": 16180 }, { "epoch": 0.164642333984375, "grad_norm": 12.61086368560791, "learning_rate": 4.676265019251865e-06, "loss": 3.3563, "step": 16185 }, { "epoch": 0.16469319661458334, "grad_norm": 10.435957908630371, "learning_rate": 4.676068186891582e-06, "loss": 3.4891, "step": 16190 }, { "epoch": 0.16474405924479166, "grad_norm": 13.85044002532959, "learning_rate": 4.675871298857111e-06, "loss": 3.0696, "step": 16195 }, { "epoch": 0.164794921875, "grad_norm": 9.022653579711914, "learning_rate": 4.675674355153488e-06, "loss": 3.4813, "step": 16200 }, { "epoch": 0.16484578450520834, "grad_norm": 11.870932579040527, "learning_rate": 4.675477355785752e-06, "loss": 3.7666, "step": 16205 }, { "epoch": 0.16489664713541666, "grad_norm": 12.24802303314209, "learning_rate": 4.675280300758944e-06, "loss": 3.3968, "step": 16210 }, { "epoch": 0.164947509765625, "grad_norm": 12.894149780273438, "learning_rate": 4.6750831900781055e-06, "loss": 3.4708, "step": 16215 }, { "epoch": 0.16499837239583334, "grad_norm": 9.178655624389648, "learning_rate": 4.674886023748279e-06, "loss": 3.3961, "step": 16220 }, { "epoch": 0.16504923502604166, "grad_norm": 15.221061706542969, "learning_rate": 4.674688801774508e-06, "loss": 3.5445, "step": 16225 }, { "epoch": 0.16510009765625, "grad_norm": 12.730554580688477, "learning_rate": 4.67449152416184e-06, "loss": 3.3576, "step": 16230 }, { "epoch": 0.16515096028645834, "grad_norm": 10.138298988342285, "learning_rate": 4.674294190915321e-06, "loss": 3.3352, "step": 16235 }, { "epoch": 0.16520182291666666, "grad_norm": 9.869894981384277, "learning_rate": 4.674096802040003e-06, "loss": 3.3375, "step": 16240 }, { "epoch": 0.165252685546875, "grad_norm": 9.15701961517334, "learning_rate": 4.673899357540932e-06, "loss": 3.2079, "step": 16245 }, { "epoch": 0.16530354817708334, "grad_norm": 13.204168319702148, "learning_rate": 4.673701857423161e-06, "loss": 3.2676, "step": 16250 }, { "epoch": 0.16535441080729166, "grad_norm": 15.898265838623047, "learning_rate": 4.6735043016917435e-06, "loss": 3.3132, "step": 16255 }, { "epoch": 0.1654052734375, "grad_norm": 10.086653709411621, "learning_rate": 4.673306690351733e-06, "loss": 3.2754, "step": 16260 }, { "epoch": 0.16545613606770834, "grad_norm": 15.631150245666504, "learning_rate": 4.6731090234081865e-06, "loss": 3.6231, "step": 16265 }, { "epoch": 0.16550699869791666, "grad_norm": 10.62215518951416, "learning_rate": 4.672911300866161e-06, "loss": 3.2062, "step": 16270 }, { "epoch": 0.165557861328125, "grad_norm": 8.654640197753906, "learning_rate": 4.672713522730715e-06, "loss": 3.2674, "step": 16275 }, { "epoch": 0.16560872395833334, "grad_norm": 11.249765396118164, "learning_rate": 4.672515689006908e-06, "loss": 3.0822, "step": 16280 }, { "epoch": 0.16565958658854166, "grad_norm": 14.134060859680176, "learning_rate": 4.6723177996998025e-06, "loss": 3.58, "step": 16285 }, { "epoch": 0.16571044921875, "grad_norm": 16.133581161499023, "learning_rate": 4.672119854814461e-06, "loss": 3.6814, "step": 16290 }, { "epoch": 0.16576131184895834, "grad_norm": 13.389992713928223, "learning_rate": 4.671921854355947e-06, "loss": 3.4922, "step": 16295 }, { "epoch": 0.16581217447916666, "grad_norm": 13.663773536682129, "learning_rate": 4.671723798329328e-06, "loss": 3.1055, "step": 16300 }, { "epoch": 0.165863037109375, "grad_norm": 13.669440269470215, "learning_rate": 4.671525686739669e-06, "loss": 3.1514, "step": 16305 }, { "epoch": 0.16591389973958334, "grad_norm": 12.217931747436523, "learning_rate": 4.671327519592042e-06, "loss": 3.6507, "step": 16310 }, { "epoch": 0.16596476236979166, "grad_norm": 15.413629531860352, "learning_rate": 4.6711292968915145e-06, "loss": 3.4689, "step": 16315 }, { "epoch": 0.166015625, "grad_norm": 11.545214653015137, "learning_rate": 4.670931018643158e-06, "loss": 3.2486, "step": 16320 }, { "epoch": 0.16606648763020834, "grad_norm": 12.391217231750488, "learning_rate": 4.670732684852046e-06, "loss": 3.3009, "step": 16325 }, { "epoch": 0.16611735026041666, "grad_norm": 10.62708854675293, "learning_rate": 4.670534295523253e-06, "loss": 3.3732, "step": 16330 }, { "epoch": 0.166168212890625, "grad_norm": 8.105783462524414, "learning_rate": 4.670335850661855e-06, "loss": 3.3016, "step": 16335 }, { "epoch": 0.16621907552083334, "grad_norm": 7.761030197143555, "learning_rate": 4.670137350272927e-06, "loss": 3.2436, "step": 16340 }, { "epoch": 0.16626993815104166, "grad_norm": 12.776176452636719, "learning_rate": 4.669938794361552e-06, "loss": 3.3074, "step": 16345 }, { "epoch": 0.16632080078125, "grad_norm": 7.695553302764893, "learning_rate": 4.669740182932805e-06, "loss": 3.0618, "step": 16350 }, { "epoch": 0.16637166341145834, "grad_norm": 14.618805885314941, "learning_rate": 4.66954151599177e-06, "loss": 3.4561, "step": 16355 }, { "epoch": 0.16642252604166666, "grad_norm": 14.894763946533203, "learning_rate": 4.66934279354353e-06, "loss": 3.492, "step": 16360 }, { "epoch": 0.166473388671875, "grad_norm": 9.516297340393066, "learning_rate": 4.669144015593169e-06, "loss": 3.3012, "step": 16365 }, { "epoch": 0.16652425130208334, "grad_norm": 15.990934371948242, "learning_rate": 4.668945182145773e-06, "loss": 3.4032, "step": 16370 }, { "epoch": 0.16657511393229166, "grad_norm": 11.829225540161133, "learning_rate": 4.668746293206428e-06, "loss": 3.57, "step": 16375 }, { "epoch": 0.1666259765625, "grad_norm": 9.242964744567871, "learning_rate": 4.668547348780222e-06, "loss": 3.43, "step": 16380 }, { "epoch": 0.16667683919270834, "grad_norm": 15.611289024353027, "learning_rate": 4.668348348872248e-06, "loss": 3.2432, "step": 16385 }, { "epoch": 0.16672770182291666, "grad_norm": 12.005697250366211, "learning_rate": 4.668149293487595e-06, "loss": 2.9898, "step": 16390 }, { "epoch": 0.166778564453125, "grad_norm": 14.01111888885498, "learning_rate": 4.6679501826313554e-06, "loss": 3.5077, "step": 16395 }, { "epoch": 0.16682942708333334, "grad_norm": 17.95404052734375, "learning_rate": 4.667751016308624e-06, "loss": 3.3168, "step": 16400 }, { "epoch": 0.16688028971354166, "grad_norm": 14.67130184173584, "learning_rate": 4.6675517945244975e-06, "loss": 3.0814, "step": 16405 }, { "epoch": 0.16693115234375, "grad_norm": 9.70960521697998, "learning_rate": 4.667352517284072e-06, "loss": 3.3854, "step": 16410 }, { "epoch": 0.16698201497395834, "grad_norm": 10.68616008758545, "learning_rate": 4.667153184592446e-06, "loss": 3.375, "step": 16415 }, { "epoch": 0.16703287760416666, "grad_norm": 12.836880683898926, "learning_rate": 4.6669537964547195e-06, "loss": 3.2576, "step": 16420 }, { "epoch": 0.167083740234375, "grad_norm": 10.322628021240234, "learning_rate": 4.666754352875994e-06, "loss": 3.5553, "step": 16425 }, { "epoch": 0.16713460286458334, "grad_norm": 13.36998176574707, "learning_rate": 4.6665548538613715e-06, "loss": 3.5801, "step": 16430 }, { "epoch": 0.16718546549479166, "grad_norm": 11.6786527633667, "learning_rate": 4.666355299415956e-06, "loss": 3.1604, "step": 16435 }, { "epoch": 0.167236328125, "grad_norm": 15.699369430541992, "learning_rate": 4.666155689544855e-06, "loss": 3.2281, "step": 16440 }, { "epoch": 0.16728719075520834, "grad_norm": 23.027801513671875, "learning_rate": 4.6659560242531735e-06, "loss": 3.0434, "step": 16445 }, { "epoch": 0.16733805338541666, "grad_norm": 10.911322593688965, "learning_rate": 4.665756303546021e-06, "loss": 3.5173, "step": 16450 }, { "epoch": 0.167388916015625, "grad_norm": 9.32667064666748, "learning_rate": 4.665556527428506e-06, "loss": 3.5175, "step": 16455 }, { "epoch": 0.16743977864583334, "grad_norm": 13.066845893859863, "learning_rate": 4.66535669590574e-06, "loss": 3.7572, "step": 16460 }, { "epoch": 0.16749064127604166, "grad_norm": 14.599390029907227, "learning_rate": 4.6651568089828384e-06, "loss": 3.1729, "step": 16465 }, { "epoch": 0.16754150390625, "grad_norm": 10.783397674560547, "learning_rate": 4.664956866664912e-06, "loss": 3.1904, "step": 16470 }, { "epoch": 0.16759236653645834, "grad_norm": 15.606278419494629, "learning_rate": 4.664756868957076e-06, "loss": 3.3966, "step": 16475 }, { "epoch": 0.16764322916666666, "grad_norm": 11.634224891662598, "learning_rate": 4.6645568158644496e-06, "loss": 3.4924, "step": 16480 }, { "epoch": 0.167694091796875, "grad_norm": 8.687528610229492, "learning_rate": 4.66435670739215e-06, "loss": 3.4183, "step": 16485 }, { "epoch": 0.16774495442708334, "grad_norm": 13.002685546875, "learning_rate": 4.6641565435452975e-06, "loss": 3.3721, "step": 16490 }, { "epoch": 0.16779581705729166, "grad_norm": 13.268308639526367, "learning_rate": 4.663956324329012e-06, "loss": 3.7302, "step": 16495 }, { "epoch": 0.1678466796875, "grad_norm": 9.55381965637207, "learning_rate": 4.663756049748418e-06, "loss": 3.2204, "step": 16500 }, { "epoch": 0.16789754231770834, "grad_norm": 13.7435302734375, "learning_rate": 4.6635557198086375e-06, "loss": 3.6933, "step": 16505 }, { "epoch": 0.16794840494791666, "grad_norm": 13.537063598632812, "learning_rate": 4.663355334514796e-06, "loss": 3.0507, "step": 16510 }, { "epoch": 0.167999267578125, "grad_norm": 13.002664566040039, "learning_rate": 4.663154893872023e-06, "loss": 3.1599, "step": 16515 }, { "epoch": 0.16805013020833334, "grad_norm": 8.084352493286133, "learning_rate": 4.662954397885443e-06, "loss": 3.5544, "step": 16520 }, { "epoch": 0.16810099283854166, "grad_norm": 13.704971313476562, "learning_rate": 4.662753846560189e-06, "loss": 3.5446, "step": 16525 }, { "epoch": 0.16815185546875, "grad_norm": 13.047155380249023, "learning_rate": 4.662553239901389e-06, "loss": 3.6407, "step": 16530 }, { "epoch": 0.16820271809895834, "grad_norm": 10.460413932800293, "learning_rate": 4.662352577914178e-06, "loss": 3.5399, "step": 16535 }, { "epoch": 0.16825358072916666, "grad_norm": 12.304898262023926, "learning_rate": 4.6621518606036875e-06, "loss": 3.3801, "step": 16540 }, { "epoch": 0.168304443359375, "grad_norm": 10.110926628112793, "learning_rate": 4.661951087975055e-06, "loss": 3.2155, "step": 16545 }, { "epoch": 0.16835530598958334, "grad_norm": 7.640111446380615, "learning_rate": 4.661750260033417e-06, "loss": 3.1164, "step": 16550 }, { "epoch": 0.16840616861979166, "grad_norm": 8.465399742126465, "learning_rate": 4.66154937678391e-06, "loss": 3.7351, "step": 16555 }, { "epoch": 0.16845703125, "grad_norm": 11.797218322753906, "learning_rate": 4.661348438231675e-06, "loss": 3.6523, "step": 16560 }, { "epoch": 0.16850789388020834, "grad_norm": 9.346522331237793, "learning_rate": 4.6611474443818525e-06, "loss": 3.34, "step": 16565 }, { "epoch": 0.16855875651041666, "grad_norm": 14.899089813232422, "learning_rate": 4.660946395239584e-06, "loss": 3.5477, "step": 16570 }, { "epoch": 0.168609619140625, "grad_norm": 11.063288688659668, "learning_rate": 4.660745290810015e-06, "loss": 3.333, "step": 16575 }, { "epoch": 0.16866048177083334, "grad_norm": 9.561201095581055, "learning_rate": 4.66054413109829e-06, "loss": 3.8984, "step": 16580 }, { "epoch": 0.16871134440104166, "grad_norm": 13.455229759216309, "learning_rate": 4.6603429161095556e-06, "loss": 3.2126, "step": 16585 }, { "epoch": 0.16876220703125, "grad_norm": 10.709653854370117, "learning_rate": 4.660141645848959e-06, "loss": 3.2361, "step": 16590 }, { "epoch": 0.16881306966145834, "grad_norm": 12.909401893615723, "learning_rate": 4.659940320321651e-06, "loss": 3.3619, "step": 16595 }, { "epoch": 0.16886393229166666, "grad_norm": 15.584907531738281, "learning_rate": 4.6597389395327816e-06, "loss": 3.1329, "step": 16600 }, { "epoch": 0.168914794921875, "grad_norm": 14.770988464355469, "learning_rate": 4.659537503487503e-06, "loss": 3.9151, "step": 16605 }, { "epoch": 0.16896565755208334, "grad_norm": 7.884305953979492, "learning_rate": 4.6593360121909706e-06, "loss": 3.2376, "step": 16610 }, { "epoch": 0.16901652018229166, "grad_norm": 14.378878593444824, "learning_rate": 4.659134465648338e-06, "loss": 3.4527, "step": 16615 }, { "epoch": 0.1690673828125, "grad_norm": 9.519623756408691, "learning_rate": 4.65893286386476e-06, "loss": 3.445, "step": 16620 }, { "epoch": 0.16911824544270834, "grad_norm": 9.221073150634766, "learning_rate": 4.658731206845398e-06, "loss": 3.3477, "step": 16625 }, { "epoch": 0.16916910807291666, "grad_norm": 10.170306205749512, "learning_rate": 4.658529494595408e-06, "loss": 3.3417, "step": 16630 }, { "epoch": 0.169219970703125, "grad_norm": 15.229034423828125, "learning_rate": 4.6583277271199545e-06, "loss": 3.2686, "step": 16635 }, { "epoch": 0.16927083333333334, "grad_norm": 12.785039901733398, "learning_rate": 4.658125904424197e-06, "loss": 3.4329, "step": 16640 }, { "epoch": 0.16932169596354166, "grad_norm": 9.296720504760742, "learning_rate": 4.6579240265133e-06, "loss": 3.3711, "step": 16645 }, { "epoch": 0.16937255859375, "grad_norm": 10.980588912963867, "learning_rate": 4.657722093392428e-06, "loss": 3.3667, "step": 16650 }, { "epoch": 0.16942342122395834, "grad_norm": 11.73270034790039, "learning_rate": 4.657520105066747e-06, "loss": 3.4216, "step": 16655 }, { "epoch": 0.16947428385416666, "grad_norm": 18.23244857788086, "learning_rate": 4.6573180615414265e-06, "loss": 3.6996, "step": 16660 }, { "epoch": 0.169525146484375, "grad_norm": 13.225893020629883, "learning_rate": 4.657115962821635e-06, "loss": 3.5311, "step": 16665 }, { "epoch": 0.16957600911458334, "grad_norm": 12.947046279907227, "learning_rate": 4.656913808912542e-06, "loss": 3.2338, "step": 16670 }, { "epoch": 0.16962687174479166, "grad_norm": 16.09568977355957, "learning_rate": 4.65671159981932e-06, "loss": 3.3158, "step": 16675 }, { "epoch": 0.169677734375, "grad_norm": 9.442344665527344, "learning_rate": 4.656509335547144e-06, "loss": 3.5821, "step": 16680 }, { "epoch": 0.16972859700520834, "grad_norm": 8.947610855102539, "learning_rate": 4.656307016101187e-06, "loss": 3.2935, "step": 16685 }, { "epoch": 0.16977945963541666, "grad_norm": 15.548479080200195, "learning_rate": 4.656104641486628e-06, "loss": 3.6593, "step": 16690 }, { "epoch": 0.169830322265625, "grad_norm": 9.9256591796875, "learning_rate": 4.655902211708641e-06, "loss": 3.4071, "step": 16695 }, { "epoch": 0.16988118489583334, "grad_norm": 11.299330711364746, "learning_rate": 4.655699726772407e-06, "loss": 3.4039, "step": 16700 }, { "epoch": 0.16993204752604166, "grad_norm": 10.457588195800781, "learning_rate": 4.655497186683107e-06, "loss": 3.4151, "step": 16705 }, { "epoch": 0.16998291015625, "grad_norm": 9.101759910583496, "learning_rate": 4.655294591445921e-06, "loss": 3.0531, "step": 16710 }, { "epoch": 0.17003377278645834, "grad_norm": 9.755556106567383, "learning_rate": 4.6550919410660355e-06, "loss": 3.7921, "step": 16715 }, { "epoch": 0.17008463541666666, "grad_norm": 6.866489410400391, "learning_rate": 4.654889235548633e-06, "loss": 3.3934, "step": 16720 }, { "epoch": 0.170135498046875, "grad_norm": 11.931754112243652, "learning_rate": 4.6546864748989e-06, "loss": 3.2464, "step": 16725 }, { "epoch": 0.17018636067708334, "grad_norm": 12.620525360107422, "learning_rate": 4.654483659122025e-06, "loss": 3.1906, "step": 16730 }, { "epoch": 0.17023722330729166, "grad_norm": 9.603962898254395, "learning_rate": 4.654280788223195e-06, "loss": 3.1315, "step": 16735 }, { "epoch": 0.1702880859375, "grad_norm": 10.157326698303223, "learning_rate": 4.654077862207601e-06, "loss": 3.4375, "step": 16740 }, { "epoch": 0.17033894856770834, "grad_norm": 6.509634971618652, "learning_rate": 4.653874881080437e-06, "loss": 3.2071, "step": 16745 }, { "epoch": 0.17038981119791666, "grad_norm": 11.346307754516602, "learning_rate": 4.653671844846895e-06, "loss": 3.4308, "step": 16750 }, { "epoch": 0.170440673828125, "grad_norm": 10.316970825195312, "learning_rate": 4.653468753512168e-06, "loss": 3.5133, "step": 16755 }, { "epoch": 0.17049153645833334, "grad_norm": 12.79603099822998, "learning_rate": 4.653265607081454e-06, "loss": 3.2712, "step": 16760 }, { "epoch": 0.17054239908854166, "grad_norm": 12.55677604675293, "learning_rate": 4.653062405559951e-06, "loss": 3.2636, "step": 16765 }, { "epoch": 0.17059326171875, "grad_norm": 15.563427925109863, "learning_rate": 4.652859148952855e-06, "loss": 3.2278, "step": 16770 }, { "epoch": 0.17064412434895834, "grad_norm": 13.07026481628418, "learning_rate": 4.652655837265369e-06, "loss": 3.3884, "step": 16775 }, { "epoch": 0.17069498697916666, "grad_norm": 13.581730842590332, "learning_rate": 4.6524524705026925e-06, "loss": 3.6175, "step": 16780 }, { "epoch": 0.170745849609375, "grad_norm": 8.78980827331543, "learning_rate": 4.65224904867003e-06, "loss": 3.4371, "step": 16785 }, { "epoch": 0.17079671223958334, "grad_norm": 11.52972412109375, "learning_rate": 4.652045571772586e-06, "loss": 3.2436, "step": 16790 }, { "epoch": 0.17084757486979166, "grad_norm": 17.321304321289062, "learning_rate": 4.651842039815566e-06, "loss": 3.393, "step": 16795 }, { "epoch": 0.1708984375, "grad_norm": 12.183976173400879, "learning_rate": 4.651638452804178e-06, "loss": 3.3443, "step": 16800 }, { "epoch": 0.17094930013020834, "grad_norm": 11.435518264770508, "learning_rate": 4.6514348107436305e-06, "loss": 3.4001, "step": 16805 }, { "epoch": 0.17100016276041666, "grad_norm": 15.479721069335938, "learning_rate": 4.651231113639132e-06, "loss": 3.0751, "step": 16810 }, { "epoch": 0.171051025390625, "grad_norm": 12.146183013916016, "learning_rate": 4.651027361495896e-06, "loss": 3.238, "step": 16815 }, { "epoch": 0.17110188802083334, "grad_norm": 12.965581893920898, "learning_rate": 4.650823554319135e-06, "loss": 3.8735, "step": 16820 }, { "epoch": 0.17115275065104166, "grad_norm": 9.760518074035645, "learning_rate": 4.650619692114063e-06, "loss": 3.2039, "step": 16825 }, { "epoch": 0.17120361328125, "grad_norm": 12.903510093688965, "learning_rate": 4.650415774885896e-06, "loss": 3.5834, "step": 16830 }, { "epoch": 0.17125447591145834, "grad_norm": 13.00151538848877, "learning_rate": 4.650211802639851e-06, "loss": 3.243, "step": 16835 }, { "epoch": 0.17130533854166666, "grad_norm": 15.11430549621582, "learning_rate": 4.6500077753811465e-06, "loss": 3.4178, "step": 16840 }, { "epoch": 0.171356201171875, "grad_norm": 10.01813793182373, "learning_rate": 4.649803693115003e-06, "loss": 3.2601, "step": 16845 }, { "epoch": 0.17140706380208334, "grad_norm": 11.622364044189453, "learning_rate": 4.649599555846641e-06, "loss": 3.6725, "step": 16850 }, { "epoch": 0.17145792643229166, "grad_norm": 15.42918872833252, "learning_rate": 4.649395363581285e-06, "loss": 3.3048, "step": 16855 }, { "epoch": 0.1715087890625, "grad_norm": 11.032341957092285, "learning_rate": 4.649191116324158e-06, "loss": 3.4853, "step": 16860 }, { "epoch": 0.17155965169270834, "grad_norm": 12.54845142364502, "learning_rate": 4.648986814080485e-06, "loss": 3.2918, "step": 16865 }, { "epoch": 0.17161051432291666, "grad_norm": 8.605419158935547, "learning_rate": 4.648782456855493e-06, "loss": 3.4214, "step": 16870 }, { "epoch": 0.171661376953125, "grad_norm": 9.82323169708252, "learning_rate": 4.648578044654412e-06, "loss": 3.3439, "step": 16875 }, { "epoch": 0.17171223958333334, "grad_norm": 15.435731887817383, "learning_rate": 4.648373577482471e-06, "loss": 3.2196, "step": 16880 }, { "epoch": 0.17176310221354166, "grad_norm": 9.784239768981934, "learning_rate": 4.6481690553449015e-06, "loss": 4.0164, "step": 16885 }, { "epoch": 0.17181396484375, "grad_norm": 11.91480541229248, "learning_rate": 4.647964478246936e-06, "loss": 3.2925, "step": 16890 }, { "epoch": 0.17186482747395834, "grad_norm": 7.702728271484375, "learning_rate": 4.647759846193808e-06, "loss": 3.2802, "step": 16895 }, { "epoch": 0.17191569010416666, "grad_norm": 7.844374656677246, "learning_rate": 4.647555159190753e-06, "loss": 3.3919, "step": 16900 }, { "epoch": 0.171966552734375, "grad_norm": 9.673126220703125, "learning_rate": 4.647350417243009e-06, "loss": 3.7028, "step": 16905 }, { "epoch": 0.17201741536458334, "grad_norm": 13.436027526855469, "learning_rate": 4.647145620355813e-06, "loss": 3.0947, "step": 16910 }, { "epoch": 0.17206827799479166, "grad_norm": 9.890853881835938, "learning_rate": 4.646940768534406e-06, "loss": 3.3502, "step": 16915 }, { "epoch": 0.172119140625, "grad_norm": 12.191718101501465, "learning_rate": 4.6467358617840275e-06, "loss": 3.1166, "step": 16920 }, { "epoch": 0.17217000325520834, "grad_norm": 10.807548522949219, "learning_rate": 4.646530900109921e-06, "loss": 3.3743, "step": 16925 }, { "epoch": 0.17222086588541666, "grad_norm": 15.573385238647461, "learning_rate": 4.646325883517331e-06, "loss": 3.4527, "step": 16930 }, { "epoch": 0.172271728515625, "grad_norm": 12.284563064575195, "learning_rate": 4.646120812011501e-06, "loss": 3.1113, "step": 16935 }, { "epoch": 0.17232259114583334, "grad_norm": 15.29260540008545, "learning_rate": 4.645915685597679e-06, "loss": 3.4474, "step": 16940 }, { "epoch": 0.17237345377604166, "grad_norm": 12.571720123291016, "learning_rate": 4.645710504281113e-06, "loss": 3.1863, "step": 16945 }, { "epoch": 0.17242431640625, "grad_norm": 14.381819725036621, "learning_rate": 4.645505268067052e-06, "loss": 3.3417, "step": 16950 }, { "epoch": 0.17247517903645834, "grad_norm": 9.561775207519531, "learning_rate": 4.645299976960747e-06, "loss": 3.1955, "step": 16955 }, { "epoch": 0.17252604166666666, "grad_norm": 12.148945808410645, "learning_rate": 4.645094630967451e-06, "loss": 3.7219, "step": 16960 }, { "epoch": 0.172576904296875, "grad_norm": 10.793240547180176, "learning_rate": 4.644889230092418e-06, "loss": 3.5109, "step": 16965 }, { "epoch": 0.17262776692708334, "grad_norm": 13.821535110473633, "learning_rate": 4.644683774340902e-06, "loss": 3.3895, "step": 16970 }, { "epoch": 0.17267862955729166, "grad_norm": 10.016497611999512, "learning_rate": 4.64447826371816e-06, "loss": 3.317, "step": 16975 }, { "epoch": 0.1727294921875, "grad_norm": 12.0796480178833, "learning_rate": 4.64427269822945e-06, "loss": 3.1043, "step": 16980 }, { "epoch": 0.17278035481770834, "grad_norm": 11.921280860900879, "learning_rate": 4.644067077880031e-06, "loss": 3.4567, "step": 16985 }, { "epoch": 0.17283121744791666, "grad_norm": 10.839923858642578, "learning_rate": 4.643861402675164e-06, "loss": 3.4736, "step": 16990 }, { "epoch": 0.172882080078125, "grad_norm": 13.740301132202148, "learning_rate": 4.643655672620111e-06, "loss": 3.6096, "step": 16995 }, { "epoch": 0.17293294270833334, "grad_norm": 14.093925476074219, "learning_rate": 4.643449887720136e-06, "loss": 3.2615, "step": 17000 }, { "epoch": 0.17298380533854166, "grad_norm": 10.022769927978516, "learning_rate": 4.643244047980503e-06, "loss": 3.0516, "step": 17005 }, { "epoch": 0.17303466796875, "grad_norm": 9.807087898254395, "learning_rate": 4.64303815340648e-06, "loss": 3.6837, "step": 17010 }, { "epoch": 0.17308553059895834, "grad_norm": 10.774398803710938, "learning_rate": 4.642832204003333e-06, "loss": 3.046, "step": 17015 }, { "epoch": 0.17313639322916666, "grad_norm": 8.116972923278809, "learning_rate": 4.642626199776333e-06, "loss": 3.071, "step": 17020 }, { "epoch": 0.173187255859375, "grad_norm": 13.305663108825684, "learning_rate": 4.642420140730749e-06, "loss": 3.8665, "step": 17025 }, { "epoch": 0.17323811848958334, "grad_norm": 16.738006591796875, "learning_rate": 4.642214026871853e-06, "loss": 3.6563, "step": 17030 }, { "epoch": 0.17328898111979166, "grad_norm": 10.710490226745605, "learning_rate": 4.642007858204919e-06, "loss": 3.3579, "step": 17035 }, { "epoch": 0.17333984375, "grad_norm": 17.46071434020996, "learning_rate": 4.641801634735222e-06, "loss": 3.2495, "step": 17040 }, { "epoch": 0.17339070638020834, "grad_norm": 8.741815567016602, "learning_rate": 4.6415953564680385e-06, "loss": 3.0387, "step": 17045 }, { "epoch": 0.17344156901041666, "grad_norm": 14.374468803405762, "learning_rate": 4.641389023408644e-06, "loss": 3.3034, "step": 17050 }, { "epoch": 0.173492431640625, "grad_norm": 10.53064250946045, "learning_rate": 4.64118263556232e-06, "loss": 3.3958, "step": 17055 }, { "epoch": 0.17354329427083334, "grad_norm": 16.45563316345215, "learning_rate": 4.640976192934345e-06, "loss": 3.6537, "step": 17060 }, { "epoch": 0.17359415690104166, "grad_norm": 13.669546127319336, "learning_rate": 4.6407696955300025e-06, "loss": 3.136, "step": 17065 }, { "epoch": 0.17364501953125, "grad_norm": 15.326542854309082, "learning_rate": 4.640563143354574e-06, "loss": 3.4866, "step": 17070 }, { "epoch": 0.17369588216145834, "grad_norm": 7.852635860443115, "learning_rate": 4.640356536413345e-06, "loss": 3.4082, "step": 17075 }, { "epoch": 0.17374674479166666, "grad_norm": 15.086888313293457, "learning_rate": 4.640149874711601e-06, "loss": 3.3045, "step": 17080 }, { "epoch": 0.173797607421875, "grad_norm": 13.305411338806152, "learning_rate": 4.63994315825463e-06, "loss": 2.9298, "step": 17085 }, { "epoch": 0.17384847005208334, "grad_norm": 6.963002681732178, "learning_rate": 4.639736387047722e-06, "loss": 3.5608, "step": 17090 }, { "epoch": 0.17389933268229166, "grad_norm": 13.062210083007812, "learning_rate": 4.639529561096164e-06, "loss": 3.1996, "step": 17095 }, { "epoch": 0.1739501953125, "grad_norm": 9.515623092651367, "learning_rate": 4.639322680405249e-06, "loss": 3.3422, "step": 17100 }, { "epoch": 0.17400105794270834, "grad_norm": 11.610623359680176, "learning_rate": 4.639115744980272e-06, "loss": 3.3988, "step": 17105 }, { "epoch": 0.17405192057291666, "grad_norm": 11.479599952697754, "learning_rate": 4.6389087548265245e-06, "loss": 3.2056, "step": 17110 }, { "epoch": 0.174102783203125, "grad_norm": 11.929533004760742, "learning_rate": 4.638701709949303e-06, "loss": 3.0456, "step": 17115 }, { "epoch": 0.17415364583333334, "grad_norm": 12.549202919006348, "learning_rate": 4.638494610353907e-06, "loss": 3.3503, "step": 17120 }, { "epoch": 0.17420450846354166, "grad_norm": 14.711870193481445, "learning_rate": 4.638287456045632e-06, "loss": 3.4214, "step": 17125 }, { "epoch": 0.17425537109375, "grad_norm": 14.64809513092041, "learning_rate": 4.638080247029779e-06, "loss": 3.4133, "step": 17130 }, { "epoch": 0.17430623372395834, "grad_norm": 16.768775939941406, "learning_rate": 4.63787298331165e-06, "loss": 3.3163, "step": 17135 }, { "epoch": 0.17435709635416666, "grad_norm": 13.071782112121582, "learning_rate": 4.637665664896547e-06, "loss": 3.8572, "step": 17140 }, { "epoch": 0.174407958984375, "grad_norm": 11.809013366699219, "learning_rate": 4.637458291789776e-06, "loss": 3.4033, "step": 17145 }, { "epoch": 0.17445882161458334, "grad_norm": 11.942599296569824, "learning_rate": 4.63725086399664e-06, "loss": 3.3937, "step": 17150 }, { "epoch": 0.17450968424479166, "grad_norm": 11.510283470153809, "learning_rate": 4.637043381522447e-06, "loss": 3.2646, "step": 17155 }, { "epoch": 0.174560546875, "grad_norm": 15.842909812927246, "learning_rate": 4.636835844372507e-06, "loss": 3.2227, "step": 17160 }, { "epoch": 0.17461140950520834, "grad_norm": 9.673988342285156, "learning_rate": 4.636628252552128e-06, "loss": 3.5092, "step": 17165 }, { "epoch": 0.17466227213541666, "grad_norm": 12.802469253540039, "learning_rate": 4.636420606066621e-06, "loss": 3.3495, "step": 17170 }, { "epoch": 0.174713134765625, "grad_norm": 9.603645324707031, "learning_rate": 4.636212904921299e-06, "loss": 3.404, "step": 17175 }, { "epoch": 0.17476399739583334, "grad_norm": 11.273427963256836, "learning_rate": 4.6360051491214765e-06, "loss": 3.2583, "step": 17180 }, { "epoch": 0.17481486002604166, "grad_norm": 9.479177474975586, "learning_rate": 4.635797338672469e-06, "loss": 3.086, "step": 17185 }, { "epoch": 0.17486572265625, "grad_norm": 15.164192199707031, "learning_rate": 4.635589473579592e-06, "loss": 3.7945, "step": 17190 }, { "epoch": 0.17491658528645834, "grad_norm": 13.359272003173828, "learning_rate": 4.635381553848165e-06, "loss": 3.3775, "step": 17195 }, { "epoch": 0.17496744791666666, "grad_norm": 7.546788692474365, "learning_rate": 4.635173579483507e-06, "loss": 3.2542, "step": 17200 }, { "epoch": 0.175018310546875, "grad_norm": 13.577391624450684, "learning_rate": 4.634965550490939e-06, "loss": 3.4331, "step": 17205 }, { "epoch": 0.17506917317708334, "grad_norm": 14.945005416870117, "learning_rate": 4.6347574668757835e-06, "loss": 3.2556, "step": 17210 }, { "epoch": 0.17512003580729166, "grad_norm": 13.663776397705078, "learning_rate": 4.634549328643364e-06, "loss": 3.4574, "step": 17215 }, { "epoch": 0.1751708984375, "grad_norm": 8.942784309387207, "learning_rate": 4.634341135799007e-06, "loss": 3.4193, "step": 17220 }, { "epoch": 0.17522176106770834, "grad_norm": 15.326900482177734, "learning_rate": 4.634132888348037e-06, "loss": 3.6722, "step": 17225 }, { "epoch": 0.17527262369791666, "grad_norm": 13.63649845123291, "learning_rate": 4.633924586295782e-06, "loss": 3.4126, "step": 17230 }, { "epoch": 0.175323486328125, "grad_norm": 12.05754280090332, "learning_rate": 4.633716229647573e-06, "loss": 3.5295, "step": 17235 }, { "epoch": 0.17537434895833334, "grad_norm": 14.691134452819824, "learning_rate": 4.633507818408741e-06, "loss": 3.2017, "step": 17240 }, { "epoch": 0.17542521158854166, "grad_norm": 12.102783203125, "learning_rate": 4.633299352584616e-06, "loss": 3.3013, "step": 17245 }, { "epoch": 0.17547607421875, "grad_norm": 8.052931785583496, "learning_rate": 4.6330908321805336e-06, "loss": 3.4158, "step": 17250 }, { "epoch": 0.17552693684895834, "grad_norm": 11.040821075439453, "learning_rate": 4.632882257201826e-06, "loss": 3.1246, "step": 17255 }, { "epoch": 0.17557779947916666, "grad_norm": 11.392463684082031, "learning_rate": 4.632673627653833e-06, "loss": 3.4227, "step": 17260 }, { "epoch": 0.175628662109375, "grad_norm": 14.34813404083252, "learning_rate": 4.6324649435418916e-06, "loss": 3.2329, "step": 17265 }, { "epoch": 0.17567952473958334, "grad_norm": 9.003851890563965, "learning_rate": 4.632256204871338e-06, "loss": 3.5315, "step": 17270 }, { "epoch": 0.17573038736979166, "grad_norm": 11.924999237060547, "learning_rate": 4.632047411647516e-06, "loss": 3.4919, "step": 17275 }, { "epoch": 0.17578125, "grad_norm": 11.398469924926758, "learning_rate": 4.6318385638757665e-06, "loss": 3.9053, "step": 17280 }, { "epoch": 0.17583211263020834, "grad_norm": 10.507608413696289, "learning_rate": 4.631629661561432e-06, "loss": 3.8949, "step": 17285 }, { "epoch": 0.17588297526041666, "grad_norm": 6.944844722747803, "learning_rate": 4.6314207047098585e-06, "loss": 3.6582, "step": 17290 }, { "epoch": 0.175933837890625, "grad_norm": 15.985193252563477, "learning_rate": 4.63121169332639e-06, "loss": 3.1175, "step": 17295 }, { "epoch": 0.17598470052083334, "grad_norm": 14.670384407043457, "learning_rate": 4.6310026274163765e-06, "loss": 3.547, "step": 17300 }, { "epoch": 0.17603556315104166, "grad_norm": 12.856901168823242, "learning_rate": 4.630793506985166e-06, "loss": 3.3646, "step": 17305 }, { "epoch": 0.17608642578125, "grad_norm": 14.06156063079834, "learning_rate": 4.6305843320381085e-06, "loss": 3.4701, "step": 17310 }, { "epoch": 0.17613728841145834, "grad_norm": 18.088773727416992, "learning_rate": 4.630375102580557e-06, "loss": 3.5199, "step": 17315 }, { "epoch": 0.17618815104166666, "grad_norm": 16.56032943725586, "learning_rate": 4.630165818617862e-06, "loss": 3.3376, "step": 17320 }, { "epoch": 0.176239013671875, "grad_norm": 13.883744239807129, "learning_rate": 4.62995648015538e-06, "loss": 3.3735, "step": 17325 }, { "epoch": 0.17628987630208334, "grad_norm": 9.11486530303955, "learning_rate": 4.629747087198466e-06, "loss": 3.2933, "step": 17330 }, { "epoch": 0.17634073893229166, "grad_norm": 12.755553245544434, "learning_rate": 4.629537639752477e-06, "loss": 3.5813, "step": 17335 }, { "epoch": 0.1763916015625, "grad_norm": 9.951680183410645, "learning_rate": 4.629328137822774e-06, "loss": 3.4514, "step": 17340 }, { "epoch": 0.17644246419270834, "grad_norm": 14.455367088317871, "learning_rate": 4.629118581414713e-06, "loss": 2.6122, "step": 17345 }, { "epoch": 0.17649332682291666, "grad_norm": 8.975110054016113, "learning_rate": 4.6289089705336595e-06, "loss": 3.1559, "step": 17350 }, { "epoch": 0.176544189453125, "grad_norm": 17.275592803955078, "learning_rate": 4.628699305184974e-06, "loss": 3.3324, "step": 17355 }, { "epoch": 0.17659505208333334, "grad_norm": 14.314291954040527, "learning_rate": 4.628489585374022e-06, "loss": 3.4651, "step": 17360 }, { "epoch": 0.17664591471354166, "grad_norm": 15.956419944763184, "learning_rate": 4.628279811106168e-06, "loss": 3.6316, "step": 17365 }, { "epoch": 0.17669677734375, "grad_norm": 13.444526672363281, "learning_rate": 4.628069982386779e-06, "loss": 3.4095, "step": 17370 }, { "epoch": 0.17674763997395834, "grad_norm": 14.760951042175293, "learning_rate": 4.627860099221224e-06, "loss": 3.2458, "step": 17375 }, { "epoch": 0.17679850260416666, "grad_norm": 9.031475067138672, "learning_rate": 4.627650161614873e-06, "loss": 3.1318, "step": 17380 }, { "epoch": 0.176849365234375, "grad_norm": 9.814698219299316, "learning_rate": 4.627440169573098e-06, "loss": 3.8271, "step": 17385 }, { "epoch": 0.17690022786458334, "grad_norm": 12.851936340332031, "learning_rate": 4.627230123101268e-06, "loss": 3.5598, "step": 17390 }, { "epoch": 0.17695109049479166, "grad_norm": 44.61574935913086, "learning_rate": 4.627020022204761e-06, "loss": 3.6756, "step": 17395 }, { "epoch": 0.177001953125, "grad_norm": 11.95059871673584, "learning_rate": 4.626809866888951e-06, "loss": 3.3316, "step": 17400 }, { "epoch": 0.17705281575520834, "grad_norm": 16.417972564697266, "learning_rate": 4.626599657159216e-06, "loss": 3.3807, "step": 17405 }, { "epoch": 0.17710367838541666, "grad_norm": 13.880332946777344, "learning_rate": 4.6263893930209304e-06, "loss": 3.3911, "step": 17410 }, { "epoch": 0.177154541015625, "grad_norm": 15.084425926208496, "learning_rate": 4.6261790744794765e-06, "loss": 3.4722, "step": 17415 }, { "epoch": 0.17720540364583334, "grad_norm": 15.803872108459473, "learning_rate": 4.625968701540236e-06, "loss": 3.1724, "step": 17420 }, { "epoch": 0.17725626627604166, "grad_norm": 7.609067440032959, "learning_rate": 4.62575827420859e-06, "loss": 3.2543, "step": 17425 }, { "epoch": 0.17730712890625, "grad_norm": 11.47111701965332, "learning_rate": 4.625547792489922e-06, "loss": 3.3642, "step": 17430 }, { "epoch": 0.17735799153645834, "grad_norm": 9.357483863830566, "learning_rate": 4.625337256389618e-06, "loss": 3.5617, "step": 17435 }, { "epoch": 0.17740885416666666, "grad_norm": 12.891478538513184, "learning_rate": 4.625126665913063e-06, "loss": 3.2682, "step": 17440 }, { "epoch": 0.177459716796875, "grad_norm": 14.151500701904297, "learning_rate": 4.6249160210656476e-06, "loss": 3.3943, "step": 17445 }, { "epoch": 0.17751057942708334, "grad_norm": 11.056961059570312, "learning_rate": 4.624705321852758e-06, "loss": 3.2693, "step": 17450 }, { "epoch": 0.17756144205729166, "grad_norm": 16.99146270751953, "learning_rate": 4.624494568279787e-06, "loss": 3.5405, "step": 17455 }, { "epoch": 0.1776123046875, "grad_norm": 9.272522926330566, "learning_rate": 4.624283760352126e-06, "loss": 3.0549, "step": 17460 }, { "epoch": 0.17766316731770834, "grad_norm": 10.904102325439453, "learning_rate": 4.624072898075168e-06, "loss": 3.0915, "step": 17465 }, { "epoch": 0.17771402994791666, "grad_norm": 15.731610298156738, "learning_rate": 4.6238619814543094e-06, "loss": 3.3617, "step": 17470 }, { "epoch": 0.177764892578125, "grad_norm": 15.29787540435791, "learning_rate": 4.623651010494945e-06, "loss": 3.4183, "step": 17475 }, { "epoch": 0.17781575520833334, "grad_norm": 13.272435188293457, "learning_rate": 4.623439985202472e-06, "loss": 3.3338, "step": 17480 }, { "epoch": 0.17786661783854166, "grad_norm": 10.852397918701172, "learning_rate": 4.623228905582292e-06, "loss": 3.6102, "step": 17485 }, { "epoch": 0.17791748046875, "grad_norm": 10.077887535095215, "learning_rate": 4.623017771639803e-06, "loss": 3.4454, "step": 17490 }, { "epoch": 0.17796834309895834, "grad_norm": 13.571096420288086, "learning_rate": 4.622806583380407e-06, "loss": 3.3273, "step": 17495 }, { "epoch": 0.17801920572916666, "grad_norm": 9.822858810424805, "learning_rate": 4.622595340809508e-06, "loss": 3.4383, "step": 17500 }, { "epoch": 0.178070068359375, "grad_norm": 16.22844123840332, "learning_rate": 4.622384043932509e-06, "loss": 3.0372, "step": 17505 }, { "epoch": 0.17812093098958334, "grad_norm": 7.383034706115723, "learning_rate": 4.622172692754819e-06, "loss": 3.1181, "step": 17510 }, { "epoch": 0.17817179361979166, "grad_norm": 9.609219551086426, "learning_rate": 4.621961287281843e-06, "loss": 3.3651, "step": 17515 }, { "epoch": 0.17822265625, "grad_norm": 9.730359077453613, "learning_rate": 4.621749827518991e-06, "loss": 3.4125, "step": 17520 }, { "epoch": 0.17827351888020834, "grad_norm": 7.237141132354736, "learning_rate": 4.621538313471673e-06, "loss": 3.2425, "step": 17525 }, { "epoch": 0.17832438151041666, "grad_norm": 10.888049125671387, "learning_rate": 4.621326745145299e-06, "loss": 3.5376, "step": 17530 }, { "epoch": 0.178375244140625, "grad_norm": 12.970224380493164, "learning_rate": 4.6211151225452835e-06, "loss": 3.5308, "step": 17535 }, { "epoch": 0.17842610677083334, "grad_norm": 12.810026168823242, "learning_rate": 4.62090344567704e-06, "loss": 3.1474, "step": 17540 }, { "epoch": 0.17847696940104166, "grad_norm": 8.826539993286133, "learning_rate": 4.6206917145459855e-06, "loss": 3.2885, "step": 17545 }, { "epoch": 0.17852783203125, "grad_norm": 10.239234924316406, "learning_rate": 4.620479929157535e-06, "loss": 3.1573, "step": 17550 }, { "epoch": 0.17857869466145834, "grad_norm": 14.187760353088379, "learning_rate": 4.620268089517108e-06, "loss": 3.2755, "step": 17555 }, { "epoch": 0.17862955729166666, "grad_norm": 11.916472434997559, "learning_rate": 4.620056195630125e-06, "loss": 3.2658, "step": 17560 }, { "epoch": 0.178680419921875, "grad_norm": 18.4185791015625, "learning_rate": 4.619844247502007e-06, "loss": 3.3846, "step": 17565 }, { "epoch": 0.17873128255208334, "grad_norm": 10.282975196838379, "learning_rate": 4.619632245138176e-06, "loss": 3.497, "step": 17570 }, { "epoch": 0.17878214518229166, "grad_norm": 15.2913179397583, "learning_rate": 4.619420188544057e-06, "loss": 3.5962, "step": 17575 }, { "epoch": 0.1788330078125, "grad_norm": 8.401836395263672, "learning_rate": 4.619208077725075e-06, "loss": 3.2448, "step": 17580 }, { "epoch": 0.17888387044270834, "grad_norm": 11.083022117614746, "learning_rate": 4.6189959126866555e-06, "loss": 3.0683, "step": 17585 }, { "epoch": 0.17893473307291666, "grad_norm": 11.012330055236816, "learning_rate": 4.618783693434229e-06, "loss": 3.3922, "step": 17590 }, { "epoch": 0.178985595703125, "grad_norm": 10.7522611618042, "learning_rate": 4.618571419973222e-06, "loss": 3.1457, "step": 17595 }, { "epoch": 0.17903645833333334, "grad_norm": 9.602813720703125, "learning_rate": 4.6183590923090696e-06, "loss": 3.2299, "step": 17600 }, { "epoch": 0.17908732096354166, "grad_norm": 8.291756629943848, "learning_rate": 4.6181467104472005e-06, "loss": 3.0146, "step": 17605 }, { "epoch": 0.17913818359375, "grad_norm": 13.860746383666992, "learning_rate": 4.61793427439305e-06, "loss": 3.1955, "step": 17610 }, { "epoch": 0.17918904622395834, "grad_norm": 13.45958423614502, "learning_rate": 4.6177217841520535e-06, "loss": 2.8876, "step": 17615 }, { "epoch": 0.17923990885416666, "grad_norm": 12.669150352478027, "learning_rate": 4.617509239729647e-06, "loss": 3.6825, "step": 17620 }, { "epoch": 0.179290771484375, "grad_norm": 11.674629211425781, "learning_rate": 4.61729664113127e-06, "loss": 3.4646, "step": 17625 }, { "epoch": 0.17934163411458334, "grad_norm": 14.575140953063965, "learning_rate": 4.617083988362358e-06, "loss": 2.9819, "step": 17630 }, { "epoch": 0.17939249674479166, "grad_norm": 13.144431114196777, "learning_rate": 4.616871281428355e-06, "loss": 3.5436, "step": 17635 }, { "epoch": 0.179443359375, "grad_norm": 14.929411888122559, "learning_rate": 4.616658520334701e-06, "loss": 3.3415, "step": 17640 }, { "epoch": 0.17949422200520834, "grad_norm": 13.124006271362305, "learning_rate": 4.616445705086842e-06, "loss": 3.3565, "step": 17645 }, { "epoch": 0.17954508463541666, "grad_norm": 8.569463729858398, "learning_rate": 4.616232835690221e-06, "loss": 4.0458, "step": 17650 }, { "epoch": 0.179595947265625, "grad_norm": 11.597271919250488, "learning_rate": 4.616019912150284e-06, "loss": 3.5618, "step": 17655 }, { "epoch": 0.17964680989583334, "grad_norm": 12.50623607635498, "learning_rate": 4.615806934472479e-06, "loss": 3.4437, "step": 17660 }, { "epoch": 0.17969767252604166, "grad_norm": 13.184673309326172, "learning_rate": 4.615593902662256e-06, "loss": 3.4312, "step": 17665 }, { "epoch": 0.17974853515625, "grad_norm": 10.000298500061035, "learning_rate": 4.615380816725063e-06, "loss": 3.693, "step": 17670 }, { "epoch": 0.17979939778645834, "grad_norm": 15.589776039123535, "learning_rate": 4.6151676766663536e-06, "loss": 3.4443, "step": 17675 }, { "epoch": 0.17985026041666666, "grad_norm": 12.44705867767334, "learning_rate": 4.614954482491581e-06, "loss": 3.2575, "step": 17680 }, { "epoch": 0.179901123046875, "grad_norm": 16.3400821685791, "learning_rate": 4.6147412342061995e-06, "loss": 3.1107, "step": 17685 }, { "epoch": 0.17995198567708334, "grad_norm": 9.2102689743042, "learning_rate": 4.614527931815664e-06, "loss": 3.2749, "step": 17690 }, { "epoch": 0.18000284830729166, "grad_norm": 9.545188903808594, "learning_rate": 4.6143145753254335e-06, "loss": 3.5115, "step": 17695 }, { "epoch": 0.1800537109375, "grad_norm": 13.04957389831543, "learning_rate": 4.614101164740965e-06, "loss": 3.5562, "step": 17700 }, { "epoch": 0.18010457356770834, "grad_norm": 10.874368667602539, "learning_rate": 4.613887700067719e-06, "loss": 3.3832, "step": 17705 }, { "epoch": 0.18015543619791666, "grad_norm": 11.728689193725586, "learning_rate": 4.613674181311158e-06, "loss": 3.2963, "step": 17710 }, { "epoch": 0.180206298828125, "grad_norm": 10.86733341217041, "learning_rate": 4.613460608476744e-06, "loss": 3.2448, "step": 17715 }, { "epoch": 0.18025716145833334, "grad_norm": 13.16111946105957, "learning_rate": 4.613246981569941e-06, "loss": 3.5583, "step": 17720 }, { "epoch": 0.18030802408854166, "grad_norm": 15.626137733459473, "learning_rate": 4.6130333005962144e-06, "loss": 3.4084, "step": 17725 }, { "epoch": 0.18035888671875, "grad_norm": 10.255508422851562, "learning_rate": 4.612819565561033e-06, "loss": 3.2747, "step": 17730 }, { "epoch": 0.18040974934895834, "grad_norm": 10.121077537536621, "learning_rate": 4.612605776469863e-06, "loss": 3.3058, "step": 17735 }, { "epoch": 0.18046061197916666, "grad_norm": 12.807663917541504, "learning_rate": 4.612391933328175e-06, "loss": 3.1888, "step": 17740 }, { "epoch": 0.180511474609375, "grad_norm": 7.487052917480469, "learning_rate": 4.61217803614144e-06, "loss": 3.2892, "step": 17745 }, { "epoch": 0.18056233723958334, "grad_norm": 13.956525802612305, "learning_rate": 4.61196408491513e-06, "loss": 3.7043, "step": 17750 }, { "epoch": 0.18061319986979166, "grad_norm": 9.125615119934082, "learning_rate": 4.611750079654721e-06, "loss": 3.4562, "step": 17755 }, { "epoch": 0.1806640625, "grad_norm": 14.65468692779541, "learning_rate": 4.611536020365686e-06, "loss": 3.3539, "step": 17760 }, { "epoch": 0.18071492513020834, "grad_norm": 10.214896202087402, "learning_rate": 4.611321907053502e-06, "loss": 3.2788, "step": 17765 }, { "epoch": 0.18076578776041666, "grad_norm": 13.62902545928955, "learning_rate": 4.611107739723647e-06, "loss": 3.3931, "step": 17770 }, { "epoch": 0.180816650390625, "grad_norm": 12.189156532287598, "learning_rate": 4.610893518381602e-06, "loss": 3.4467, "step": 17775 }, { "epoch": 0.18086751302083334, "grad_norm": 17.752410888671875, "learning_rate": 4.610679243032846e-06, "loss": 3.1061, "step": 17780 }, { "epoch": 0.18091837565104166, "grad_norm": 9.7039794921875, "learning_rate": 4.610464913682863e-06, "loss": 3.3899, "step": 17785 }, { "epoch": 0.18096923828125, "grad_norm": 18.238666534423828, "learning_rate": 4.610250530337134e-06, "loss": 3.4644, "step": 17790 }, { "epoch": 0.18102010091145834, "grad_norm": 8.860824584960938, "learning_rate": 4.6100360930011455e-06, "loss": 3.4091, "step": 17795 }, { "epoch": 0.18107096354166666, "grad_norm": 10.903741836547852, "learning_rate": 4.6098216016803845e-06, "loss": 3.276, "step": 17800 }, { "epoch": 0.181121826171875, "grad_norm": 13.30379581451416, "learning_rate": 4.609607056380337e-06, "loss": 3.2626, "step": 17805 }, { "epoch": 0.18117268880208334, "grad_norm": 12.02137565612793, "learning_rate": 4.609392457106494e-06, "loss": 3.6866, "step": 17810 }, { "epoch": 0.18122355143229166, "grad_norm": 10.083224296569824, "learning_rate": 4.6091778038643445e-06, "loss": 3.2943, "step": 17815 }, { "epoch": 0.1812744140625, "grad_norm": 13.689177513122559, "learning_rate": 4.608963096659381e-06, "loss": 3.4036, "step": 17820 }, { "epoch": 0.18132527669270834, "grad_norm": 10.178418159484863, "learning_rate": 4.608748335497096e-06, "loss": 3.3081, "step": 17825 }, { "epoch": 0.18137613932291666, "grad_norm": 13.94970417022705, "learning_rate": 4.608533520382985e-06, "loss": 3.3372, "step": 17830 }, { "epoch": 0.181427001953125, "grad_norm": 10.877111434936523, "learning_rate": 4.608318651322543e-06, "loss": 3.6664, "step": 17835 }, { "epoch": 0.18147786458333334, "grad_norm": 11.025904655456543, "learning_rate": 4.608103728321269e-06, "loss": 3.7866, "step": 17840 }, { "epoch": 0.18152872721354166, "grad_norm": 14.711570739746094, "learning_rate": 4.6078887513846605e-06, "loss": 3.4579, "step": 17845 }, { "epoch": 0.18157958984375, "grad_norm": 9.070608139038086, "learning_rate": 4.607673720518218e-06, "loss": 3.3378, "step": 17850 }, { "epoch": 0.18163045247395834, "grad_norm": 10.735955238342285, "learning_rate": 4.607458635727443e-06, "loss": 3.4461, "step": 17855 }, { "epoch": 0.18168131510416666, "grad_norm": 16.661575317382812, "learning_rate": 4.607243497017838e-06, "loss": 3.3425, "step": 17860 }, { "epoch": 0.181732177734375, "grad_norm": 12.659605979919434, "learning_rate": 4.607028304394907e-06, "loss": 3.9381, "step": 17865 }, { "epoch": 0.18178304036458334, "grad_norm": 16.774187088012695, "learning_rate": 4.606813057864158e-06, "loss": 3.5132, "step": 17870 }, { "epoch": 0.18183390299479166, "grad_norm": 6.721248149871826, "learning_rate": 4.606597757431095e-06, "loss": 3.3477, "step": 17875 }, { "epoch": 0.181884765625, "grad_norm": 11.525874137878418, "learning_rate": 4.606382403101228e-06, "loss": 3.5922, "step": 17880 }, { "epoch": 0.18193562825520834, "grad_norm": 13.829561233520508, "learning_rate": 4.606166994880067e-06, "loss": 3.2248, "step": 17885 }, { "epoch": 0.18198649088541666, "grad_norm": 8.705430030822754, "learning_rate": 4.605951532773122e-06, "loss": 3.2562, "step": 17890 }, { "epoch": 0.182037353515625, "grad_norm": 8.117154121398926, "learning_rate": 4.605736016785905e-06, "loss": 3.2657, "step": 17895 }, { "epoch": 0.18208821614583334, "grad_norm": 12.32487964630127, "learning_rate": 4.605520446923933e-06, "loss": 3.1516, "step": 17900 }, { "epoch": 0.18213907877604166, "grad_norm": 14.840004920959473, "learning_rate": 4.605304823192719e-06, "loss": 3.4167, "step": 17905 }, { "epoch": 0.18218994140625, "grad_norm": 14.861167907714844, "learning_rate": 4.60508914559778e-06, "loss": 3.729, "step": 17910 }, { "epoch": 0.18224080403645834, "grad_norm": 12.168790817260742, "learning_rate": 4.6048734141446335e-06, "loss": 3.2407, "step": 17915 }, { "epoch": 0.18229166666666666, "grad_norm": 7.106325149536133, "learning_rate": 4.604657628838801e-06, "loss": 3.2299, "step": 17920 }, { "epoch": 0.182342529296875, "grad_norm": 7.858881950378418, "learning_rate": 4.604441789685801e-06, "loss": 3.477, "step": 17925 }, { "epoch": 0.18239339192708334, "grad_norm": 11.167740821838379, "learning_rate": 4.604225896691157e-06, "loss": 3.673, "step": 17930 }, { "epoch": 0.18244425455729166, "grad_norm": 14.516616821289062, "learning_rate": 4.604009949860392e-06, "loss": 3.0951, "step": 17935 }, { "epoch": 0.1824951171875, "grad_norm": 11.77841567993164, "learning_rate": 4.603793949199031e-06, "loss": 3.4499, "step": 17940 }, { "epoch": 0.18254597981770834, "grad_norm": 12.027185440063477, "learning_rate": 4.603577894712601e-06, "loss": 3.8324, "step": 17945 }, { "epoch": 0.18259684244791666, "grad_norm": 13.794437408447266, "learning_rate": 4.603361786406628e-06, "loss": 3.6594, "step": 17950 }, { "epoch": 0.182647705078125, "grad_norm": 13.343639373779297, "learning_rate": 4.603145624286643e-06, "loss": 3.3471, "step": 17955 }, { "epoch": 0.18269856770833334, "grad_norm": 10.639410018920898, "learning_rate": 4.602929408358176e-06, "loss": 3.3093, "step": 17960 }, { "epoch": 0.18274943033854166, "grad_norm": 15.507131576538086, "learning_rate": 4.602713138626758e-06, "loss": 3.3536, "step": 17965 }, { "epoch": 0.18280029296875, "grad_norm": 13.404929161071777, "learning_rate": 4.602496815097923e-06, "loss": 3.244, "step": 17970 }, { "epoch": 0.18285115559895834, "grad_norm": 13.34079647064209, "learning_rate": 4.602280437777205e-06, "loss": 3.4758, "step": 17975 }, { "epoch": 0.18290201822916666, "grad_norm": 10.848614692687988, "learning_rate": 4.602064006670141e-06, "loss": 3.2688, "step": 17980 }, { "epoch": 0.182952880859375, "grad_norm": 11.669103622436523, "learning_rate": 4.601847521782268e-06, "loss": 3.51, "step": 17985 }, { "epoch": 0.18300374348958334, "grad_norm": 9.459100723266602, "learning_rate": 4.601630983119123e-06, "loss": 3.3047, "step": 17990 }, { "epoch": 0.18305460611979166, "grad_norm": 10.783151626586914, "learning_rate": 4.601414390686248e-06, "loss": 3.1554, "step": 17995 }, { "epoch": 0.18310546875, "grad_norm": 7.823272705078125, "learning_rate": 4.601197744489184e-06, "loss": 3.2805, "step": 18000 }, { "epoch": 0.18315633138020834, "grad_norm": 10.413420677185059, "learning_rate": 4.600981044533473e-06, "loss": 3.1569, "step": 18005 }, { "epoch": 0.18320719401041666, "grad_norm": 18.30168342590332, "learning_rate": 4.600764290824661e-06, "loss": 3.2039, "step": 18010 }, { "epoch": 0.183258056640625, "grad_norm": 13.457409858703613, "learning_rate": 4.600547483368292e-06, "loss": 3.5678, "step": 18015 }, { "epoch": 0.18330891927083334, "grad_norm": 12.043107032775879, "learning_rate": 4.600330622169914e-06, "loss": 3.2696, "step": 18020 }, { "epoch": 0.18335978190104166, "grad_norm": 10.646010398864746, "learning_rate": 4.600113707235075e-06, "loss": 3.2272, "step": 18025 }, { "epoch": 0.18341064453125, "grad_norm": 12.23196792602539, "learning_rate": 4.5998967385693235e-06, "loss": 3.4313, "step": 18030 }, { "epoch": 0.18346150716145834, "grad_norm": 9.132007598876953, "learning_rate": 4.599679716178212e-06, "loss": 3.5767, "step": 18035 }, { "epoch": 0.18351236979166666, "grad_norm": 15.052911758422852, "learning_rate": 4.599462640067294e-06, "loss": 3.5414, "step": 18040 }, { "epoch": 0.183563232421875, "grad_norm": 12.69869327545166, "learning_rate": 4.599245510242121e-06, "loss": 3.2965, "step": 18045 }, { "epoch": 0.18361409505208334, "grad_norm": 10.353547096252441, "learning_rate": 4.599028326708248e-06, "loss": 2.9671, "step": 18050 }, { "epoch": 0.18366495768229166, "grad_norm": 8.513712882995605, "learning_rate": 4.598811089471235e-06, "loss": 3.4682, "step": 18055 }, { "epoch": 0.1837158203125, "grad_norm": 10.49561882019043, "learning_rate": 4.598593798536636e-06, "loss": 3.1158, "step": 18060 }, { "epoch": 0.18376668294270834, "grad_norm": 7.227719306945801, "learning_rate": 4.598376453910013e-06, "loss": 3.4783, "step": 18065 }, { "epoch": 0.18381754557291666, "grad_norm": 8.991704940795898, "learning_rate": 4.598159055596926e-06, "loss": 3.258, "step": 18070 }, { "epoch": 0.183868408203125, "grad_norm": 11.315898895263672, "learning_rate": 4.5979416036029366e-06, "loss": 3.3756, "step": 18075 }, { "epoch": 0.18391927083333334, "grad_norm": 10.222885131835938, "learning_rate": 4.597724097933608e-06, "loss": 3.6465, "step": 18080 }, { "epoch": 0.18397013346354166, "grad_norm": 15.640301704406738, "learning_rate": 4.597506538594506e-06, "loss": 3.247, "step": 18085 }, { "epoch": 0.18402099609375, "grad_norm": 14.560544967651367, "learning_rate": 4.597288925591196e-06, "loss": 3.6574, "step": 18090 }, { "epoch": 0.18407185872395834, "grad_norm": 11.333292007446289, "learning_rate": 4.597071258929247e-06, "loss": 3.6407, "step": 18095 }, { "epoch": 0.18412272135416666, "grad_norm": 16.446786880493164, "learning_rate": 4.596853538614226e-06, "loss": 3.505, "step": 18100 }, { "epoch": 0.184173583984375, "grad_norm": 8.610262870788574, "learning_rate": 4.596635764651704e-06, "loss": 3.867, "step": 18105 }, { "epoch": 0.18422444661458334, "grad_norm": 11.611706733703613, "learning_rate": 4.596417937047253e-06, "loss": 3.4687, "step": 18110 }, { "epoch": 0.18427530924479166, "grad_norm": 9.320916175842285, "learning_rate": 4.5962000558064465e-06, "loss": 3.6369, "step": 18115 }, { "epoch": 0.184326171875, "grad_norm": 10.020218849182129, "learning_rate": 4.5959821209348585e-06, "loss": 3.2122, "step": 18120 }, { "epoch": 0.18437703450520834, "grad_norm": 15.03809642791748, "learning_rate": 4.595764132438064e-06, "loss": 3.3018, "step": 18125 }, { "epoch": 0.18442789713541666, "grad_norm": 13.933128356933594, "learning_rate": 4.595546090321642e-06, "loss": 3.4161, "step": 18130 }, { "epoch": 0.184478759765625, "grad_norm": 15.626864433288574, "learning_rate": 4.595327994591169e-06, "loss": 3.1919, "step": 18135 }, { "epoch": 0.18452962239583334, "grad_norm": 15.305878639221191, "learning_rate": 4.595109845252226e-06, "loss": 3.7762, "step": 18140 }, { "epoch": 0.18458048502604166, "grad_norm": 12.688610076904297, "learning_rate": 4.594891642310395e-06, "loss": 3.5695, "step": 18145 }, { "epoch": 0.18463134765625, "grad_norm": 12.089020729064941, "learning_rate": 4.594673385771257e-06, "loss": 3.651, "step": 18150 }, { "epoch": 0.18468221028645834, "grad_norm": 14.751028060913086, "learning_rate": 4.594455075640397e-06, "loss": 3.3362, "step": 18155 }, { "epoch": 0.18473307291666666, "grad_norm": 14.168632507324219, "learning_rate": 4.594236711923401e-06, "loss": 3.5103, "step": 18160 }, { "epoch": 0.184783935546875, "grad_norm": 11.68898868560791, "learning_rate": 4.594018294625855e-06, "loss": 3.0502, "step": 18165 }, { "epoch": 0.18483479817708334, "grad_norm": 11.675402641296387, "learning_rate": 4.593799823753347e-06, "loss": 3.2437, "step": 18170 }, { "epoch": 0.18488566080729166, "grad_norm": 14.426863670349121, "learning_rate": 4.593581299311467e-06, "loss": 3.5249, "step": 18175 }, { "epoch": 0.1849365234375, "grad_norm": 14.298029899597168, "learning_rate": 4.593362721305805e-06, "loss": 3.3278, "step": 18180 }, { "epoch": 0.18498738606770834, "grad_norm": 13.843632698059082, "learning_rate": 4.5931440897419546e-06, "loss": 3.3659, "step": 18185 }, { "epoch": 0.18503824869791666, "grad_norm": 15.963536262512207, "learning_rate": 4.592925404625509e-06, "loss": 3.2535, "step": 18190 }, { "epoch": 0.185089111328125, "grad_norm": 13.099442481994629, "learning_rate": 4.592706665962063e-06, "loss": 3.4257, "step": 18195 }, { "epoch": 0.18513997395833334, "grad_norm": 10.886616706848145, "learning_rate": 4.592487873757212e-06, "loss": 3.3174, "step": 18200 }, { "epoch": 0.18519083658854166, "grad_norm": 14.132744789123535, "learning_rate": 4.592269028016555e-06, "loss": 3.5966, "step": 18205 }, { "epoch": 0.18524169921875, "grad_norm": 12.172066688537598, "learning_rate": 4.5920501287456905e-06, "loss": 3.25, "step": 18210 }, { "epoch": 0.18529256184895834, "grad_norm": 9.599930763244629, "learning_rate": 4.591831175950221e-06, "loss": 3.3482, "step": 18215 }, { "epoch": 0.18534342447916666, "grad_norm": 14.642792701721191, "learning_rate": 4.5916121696357454e-06, "loss": 3.195, "step": 18220 }, { "epoch": 0.185394287109375, "grad_norm": 17.15027618408203, "learning_rate": 4.591393109807868e-06, "loss": 3.6213, "step": 18225 }, { "epoch": 0.18544514973958334, "grad_norm": 10.881146430969238, "learning_rate": 4.591173996472195e-06, "loss": 3.054, "step": 18230 }, { "epoch": 0.18549601236979166, "grad_norm": 16.243993759155273, "learning_rate": 4.5909548296343295e-06, "loss": 3.2491, "step": 18235 }, { "epoch": 0.185546875, "grad_norm": 12.273772239685059, "learning_rate": 4.590735609299881e-06, "loss": 3.208, "step": 18240 }, { "epoch": 0.18559773763020834, "grad_norm": 12.737349510192871, "learning_rate": 4.590516335474458e-06, "loss": 3.0281, "step": 18245 }, { "epoch": 0.18564860026041666, "grad_norm": 10.880949020385742, "learning_rate": 4.590297008163669e-06, "loss": 3.3972, "step": 18250 }, { "epoch": 0.185699462890625, "grad_norm": 10.571752548217773, "learning_rate": 4.590077627373126e-06, "loss": 3.5009, "step": 18255 }, { "epoch": 0.18575032552083334, "grad_norm": 17.717178344726562, "learning_rate": 4.589858193108444e-06, "loss": 3.5101, "step": 18260 }, { "epoch": 0.18580118815104166, "grad_norm": 12.134099006652832, "learning_rate": 4.589638705375234e-06, "loss": 3.3066, "step": 18265 }, { "epoch": 0.18585205078125, "grad_norm": 15.478253364562988, "learning_rate": 4.5894191641791145e-06, "loss": 3.329, "step": 18270 }, { "epoch": 0.18590291341145834, "grad_norm": 8.206337928771973, "learning_rate": 4.5891995695257e-06, "loss": 3.2733, "step": 18275 }, { "epoch": 0.18595377604166666, "grad_norm": 10.964665412902832, "learning_rate": 4.58897992142061e-06, "loss": 3.5635, "step": 18280 }, { "epoch": 0.186004638671875, "grad_norm": 12.019591331481934, "learning_rate": 4.588760219869463e-06, "loss": 3.3073, "step": 18285 }, { "epoch": 0.18605550130208334, "grad_norm": 12.096267700195312, "learning_rate": 4.588540464877882e-06, "loss": 3.4862, "step": 18290 }, { "epoch": 0.18610636393229166, "grad_norm": 10.249032020568848, "learning_rate": 4.588320656451487e-06, "loss": 3.6171, "step": 18295 }, { "epoch": 0.1861572265625, "grad_norm": 16.42626190185547, "learning_rate": 4.588100794595904e-06, "loss": 3.5845, "step": 18300 }, { "epoch": 0.18620808919270834, "grad_norm": 16.055395126342773, "learning_rate": 4.587880879316758e-06, "loss": 3.5212, "step": 18305 }, { "epoch": 0.18625895182291666, "grad_norm": 14.237022399902344, "learning_rate": 4.587660910619672e-06, "loss": 3.1991, "step": 18310 }, { "epoch": 0.186309814453125, "grad_norm": 7.7949652671813965, "learning_rate": 4.5874408885102785e-06, "loss": 3.4076, "step": 18315 }, { "epoch": 0.18636067708333334, "grad_norm": 8.902517318725586, "learning_rate": 4.5872208129942045e-06, "loss": 3.2999, "step": 18320 }, { "epoch": 0.18641153971354166, "grad_norm": 11.778844833374023, "learning_rate": 4.58700068407708e-06, "loss": 3.2865, "step": 18325 }, { "epoch": 0.18646240234375, "grad_norm": 8.537171363830566, "learning_rate": 4.586780501764538e-06, "loss": 3.2332, "step": 18330 }, { "epoch": 0.18651326497395834, "grad_norm": 15.2643461227417, "learning_rate": 4.586560266062211e-06, "loss": 3.3204, "step": 18335 }, { "epoch": 0.18656412760416666, "grad_norm": 8.688283920288086, "learning_rate": 4.586339976975735e-06, "loss": 3.3692, "step": 18340 }, { "epoch": 0.186614990234375, "grad_norm": 12.671655654907227, "learning_rate": 4.586119634510745e-06, "loss": 3.3349, "step": 18345 }, { "epoch": 0.18666585286458334, "grad_norm": 12.594867706298828, "learning_rate": 4.585899238672878e-06, "loss": 3.4412, "step": 18350 }, { "epoch": 0.18671671549479166, "grad_norm": 16.03032875061035, "learning_rate": 4.585678789467774e-06, "loss": 3.474, "step": 18355 }, { "epoch": 0.186767578125, "grad_norm": 13.385184288024902, "learning_rate": 4.585458286901072e-06, "loss": 3.2519, "step": 18360 }, { "epoch": 0.18681844075520834, "grad_norm": 9.913110733032227, "learning_rate": 4.5852377309784146e-06, "loss": 3.3587, "step": 18365 }, { "epoch": 0.18686930338541666, "grad_norm": 10.111922264099121, "learning_rate": 4.585017121705444e-06, "loss": 3.1416, "step": 18370 }, { "epoch": 0.186920166015625, "grad_norm": 9.840933799743652, "learning_rate": 4.584796459087805e-06, "loss": 3.5775, "step": 18375 }, { "epoch": 0.18697102864583334, "grad_norm": 12.728557586669922, "learning_rate": 4.584575743131142e-06, "loss": 3.6974, "step": 18380 }, { "epoch": 0.18702189127604166, "grad_norm": 12.671290397644043, "learning_rate": 4.584354973841103e-06, "loss": 3.343, "step": 18385 }, { "epoch": 0.18707275390625, "grad_norm": 14.869614601135254, "learning_rate": 4.584134151223335e-06, "loss": 3.4718, "step": 18390 }, { "epoch": 0.18712361653645834, "grad_norm": 11.287585258483887, "learning_rate": 4.58391327528349e-06, "loss": 3.4128, "step": 18395 }, { "epoch": 0.18717447916666666, "grad_norm": 16.449617385864258, "learning_rate": 4.5836923460272175e-06, "loss": 3.7161, "step": 18400 }, { "epoch": 0.187225341796875, "grad_norm": 16.260473251342773, "learning_rate": 4.58347136346017e-06, "loss": 3.3264, "step": 18405 }, { "epoch": 0.18727620442708334, "grad_norm": 13.514466285705566, "learning_rate": 4.5832503275880015e-06, "loss": 3.294, "step": 18410 }, { "epoch": 0.18732706705729166, "grad_norm": 13.021100044250488, "learning_rate": 4.583029238416368e-06, "loss": 3.3275, "step": 18415 }, { "epoch": 0.1873779296875, "grad_norm": 10.005925178527832, "learning_rate": 4.582808095950924e-06, "loss": 3.308, "step": 18420 }, { "epoch": 0.18742879231770834, "grad_norm": 11.48652172088623, "learning_rate": 4.5825869001973285e-06, "loss": 3.5553, "step": 18425 }, { "epoch": 0.18747965494791666, "grad_norm": 15.618109703063965, "learning_rate": 4.582365651161242e-06, "loss": 3.3904, "step": 18430 }, { "epoch": 0.187530517578125, "grad_norm": 14.050233840942383, "learning_rate": 4.582144348848323e-06, "loss": 3.7672, "step": 18435 }, { "epoch": 0.18758138020833334, "grad_norm": 10.15687370300293, "learning_rate": 4.581922993264235e-06, "loss": 3.2861, "step": 18440 }, { "epoch": 0.18763224283854166, "grad_norm": 10.453628540039062, "learning_rate": 4.58170158441464e-06, "loss": 3.1252, "step": 18445 }, { "epoch": 0.18768310546875, "grad_norm": 16.22777557373047, "learning_rate": 4.5814801223052035e-06, "loss": 3.43, "step": 18450 }, { "epoch": 0.18773396809895834, "grad_norm": 9.1096830368042, "learning_rate": 4.581258606941592e-06, "loss": 3.3274, "step": 18455 }, { "epoch": 0.18778483072916666, "grad_norm": 16.2208251953125, "learning_rate": 4.581037038329472e-06, "loss": 3.8525, "step": 18460 }, { "epoch": 0.187835693359375, "grad_norm": 16.626689910888672, "learning_rate": 4.580815416474512e-06, "loss": 3.2766, "step": 18465 }, { "epoch": 0.18788655598958334, "grad_norm": 16.821229934692383, "learning_rate": 4.580593741382384e-06, "loss": 3.6567, "step": 18470 }, { "epoch": 0.18793741861979166, "grad_norm": 11.31299877166748, "learning_rate": 4.580372013058757e-06, "loss": 3.4805, "step": 18475 }, { "epoch": 0.18798828125, "grad_norm": 16.890230178833008, "learning_rate": 4.580150231509306e-06, "loss": 3.2733, "step": 18480 }, { "epoch": 0.18803914388020834, "grad_norm": 8.345052719116211, "learning_rate": 4.579928396739704e-06, "loss": 3.7072, "step": 18485 }, { "epoch": 0.18809000651041666, "grad_norm": 7.271616458892822, "learning_rate": 4.579706508755627e-06, "loss": 3.4539, "step": 18490 }, { "epoch": 0.188140869140625, "grad_norm": 14.4603910446167, "learning_rate": 4.579484567562752e-06, "loss": 3.8369, "step": 18495 }, { "epoch": 0.18819173177083334, "grad_norm": 8.192587852478027, "learning_rate": 4.579262573166757e-06, "loss": 2.9722, "step": 18500 }, { "epoch": 0.18824259440104166, "grad_norm": 8.657429695129395, "learning_rate": 4.579040525573323e-06, "loss": 3.7435, "step": 18505 }, { "epoch": 0.18829345703125, "grad_norm": 11.86208724975586, "learning_rate": 4.578818424788129e-06, "loss": 3.3249, "step": 18510 }, { "epoch": 0.18834431966145834, "grad_norm": 11.226592063903809, "learning_rate": 4.578596270816858e-06, "loss": 3.4405, "step": 18515 }, { "epoch": 0.18839518229166666, "grad_norm": 7.9588727951049805, "learning_rate": 4.578374063665195e-06, "loss": 3.378, "step": 18520 }, { "epoch": 0.188446044921875, "grad_norm": 10.29086971282959, "learning_rate": 4.578151803338824e-06, "loss": 3.6522, "step": 18525 }, { "epoch": 0.18849690755208334, "grad_norm": 12.614090919494629, "learning_rate": 4.577929489843431e-06, "loss": 3.0822, "step": 18530 }, { "epoch": 0.18854777018229166, "grad_norm": 14.052781105041504, "learning_rate": 4.577707123184705e-06, "loss": 3.3621, "step": 18535 }, { "epoch": 0.1885986328125, "grad_norm": 13.460591316223145, "learning_rate": 4.577484703368335e-06, "loss": 3.5421, "step": 18540 }, { "epoch": 0.18864949544270834, "grad_norm": 10.110848426818848, "learning_rate": 4.57726223040001e-06, "loss": 3.5516, "step": 18545 }, { "epoch": 0.18870035807291666, "grad_norm": 11.632022857666016, "learning_rate": 4.577039704285424e-06, "loss": 3.5707, "step": 18550 }, { "epoch": 0.188751220703125, "grad_norm": 8.971404075622559, "learning_rate": 4.5768171250302706e-06, "loss": 3.4157, "step": 18555 }, { "epoch": 0.18880208333333334, "grad_norm": 14.906126976013184, "learning_rate": 4.576594492640242e-06, "loss": 3.7568, "step": 18560 }, { "epoch": 0.18885294596354166, "grad_norm": 12.4305419921875, "learning_rate": 4.576371807121036e-06, "loss": 3.5414, "step": 18565 }, { "epoch": 0.18890380859375, "grad_norm": 13.76634407043457, "learning_rate": 4.57614906847835e-06, "loss": 3.4253, "step": 18570 }, { "epoch": 0.18895467122395834, "grad_norm": 14.682574272155762, "learning_rate": 4.5759262767178805e-06, "loss": 3.4316, "step": 18575 }, { "epoch": 0.18900553385416666, "grad_norm": 12.575339317321777, "learning_rate": 4.575703431845331e-06, "loss": 3.6921, "step": 18580 }, { "epoch": 0.189056396484375, "grad_norm": 7.278709411621094, "learning_rate": 4.5754805338664e-06, "loss": 3.1521, "step": 18585 }, { "epoch": 0.18910725911458334, "grad_norm": 12.746679306030273, "learning_rate": 4.575257582786792e-06, "loss": 3.2062, "step": 18590 }, { "epoch": 0.18915812174479166, "grad_norm": 15.597989082336426, "learning_rate": 4.575034578612211e-06, "loss": 3.5262, "step": 18595 }, { "epoch": 0.189208984375, "grad_norm": 7.950192928314209, "learning_rate": 4.574811521348361e-06, "loss": 3.6785, "step": 18600 }, { "epoch": 0.18925984700520834, "grad_norm": 10.346031188964844, "learning_rate": 4.574588411000951e-06, "loss": 3.3503, "step": 18605 }, { "epoch": 0.18931070963541666, "grad_norm": 11.723124504089355, "learning_rate": 4.574365247575688e-06, "loss": 3.3964, "step": 18610 }, { "epoch": 0.189361572265625, "grad_norm": 8.930464744567871, "learning_rate": 4.574142031078282e-06, "loss": 3.2352, "step": 18615 }, { "epoch": 0.18941243489583334, "grad_norm": 13.8888521194458, "learning_rate": 4.573918761514443e-06, "loss": 3.315, "step": 18620 }, { "epoch": 0.18946329752604166, "grad_norm": 7.327556133270264, "learning_rate": 4.573695438889885e-06, "loss": 3.1141, "step": 18625 }, { "epoch": 0.18951416015625, "grad_norm": 8.570147514343262, "learning_rate": 4.57347206321032e-06, "loss": 3.1851, "step": 18630 }, { "epoch": 0.18956502278645834, "grad_norm": 9.34282112121582, "learning_rate": 4.573248634481464e-06, "loss": 3.5862, "step": 18635 }, { "epoch": 0.18961588541666666, "grad_norm": 12.512747764587402, "learning_rate": 4.573025152709033e-06, "loss": 3.5737, "step": 18640 }, { "epoch": 0.189666748046875, "grad_norm": 12.564814567565918, "learning_rate": 4.572801617898746e-06, "loss": 3.4582, "step": 18645 }, { "epoch": 0.18971761067708334, "grad_norm": 14.594040870666504, "learning_rate": 4.572578030056319e-06, "loss": 3.0915, "step": 18650 }, { "epoch": 0.18976847330729166, "grad_norm": 11.229549407958984, "learning_rate": 4.572354389187476e-06, "loss": 3.6208, "step": 18655 }, { "epoch": 0.1898193359375, "grad_norm": 13.392086029052734, "learning_rate": 4.572130695297936e-06, "loss": 3.5334, "step": 18660 }, { "epoch": 0.18987019856770834, "grad_norm": 9.601693153381348, "learning_rate": 4.571906948393424e-06, "loss": 3.3134, "step": 18665 }, { "epoch": 0.18992106119791666, "grad_norm": 13.067540168762207, "learning_rate": 4.571683148479663e-06, "loss": 3.0544, "step": 18670 }, { "epoch": 0.189971923828125, "grad_norm": 10.653081893920898, "learning_rate": 4.57145929556238e-06, "loss": 3.7998, "step": 18675 }, { "epoch": 0.19002278645833334, "grad_norm": 12.351947784423828, "learning_rate": 4.5712353896473025e-06, "loss": 3.3205, "step": 18680 }, { "epoch": 0.19007364908854166, "grad_norm": 9.99809455871582, "learning_rate": 4.571011430740158e-06, "loss": 3.577, "step": 18685 }, { "epoch": 0.19012451171875, "grad_norm": 11.64851188659668, "learning_rate": 4.5707874188466774e-06, "loss": 3.3207, "step": 18690 }, { "epoch": 0.19017537434895834, "grad_norm": 10.789388656616211, "learning_rate": 4.5705633539725915e-06, "loss": 3.5073, "step": 18695 }, { "epoch": 0.19022623697916666, "grad_norm": 11.41800308227539, "learning_rate": 4.570339236123632e-06, "loss": 3.4615, "step": 18700 }, { "epoch": 0.190277099609375, "grad_norm": 12.321640968322754, "learning_rate": 4.5701150653055345e-06, "loss": 3.3763, "step": 18705 }, { "epoch": 0.19032796223958334, "grad_norm": 14.531020164489746, "learning_rate": 4.569890841524034e-06, "loss": 3.5657, "step": 18710 }, { "epoch": 0.19037882486979166, "grad_norm": 10.604043960571289, "learning_rate": 4.569666564784867e-06, "loss": 3.4327, "step": 18715 }, { "epoch": 0.1904296875, "grad_norm": 9.537259101867676, "learning_rate": 4.569442235093771e-06, "loss": 3.3424, "step": 18720 }, { "epoch": 0.19048055013020834, "grad_norm": 10.841087341308594, "learning_rate": 4.569217852456486e-06, "loss": 3.4501, "step": 18725 }, { "epoch": 0.19053141276041666, "grad_norm": 10.725885391235352, "learning_rate": 4.568993416878753e-06, "loss": 3.6736, "step": 18730 }, { "epoch": 0.190582275390625, "grad_norm": 13.705768585205078, "learning_rate": 4.568768928366313e-06, "loss": 3.7702, "step": 18735 }, { "epoch": 0.19063313802083334, "grad_norm": 8.565932273864746, "learning_rate": 4.568544386924911e-06, "loss": 3.6213, "step": 18740 }, { "epoch": 0.19068400065104166, "grad_norm": 14.157207489013672, "learning_rate": 4.568319792560292e-06, "loss": 3.4415, "step": 18745 }, { "epoch": 0.19073486328125, "grad_norm": 12.56330680847168, "learning_rate": 4.5680951452782e-06, "loss": 3.1235, "step": 18750 }, { "epoch": 0.19078572591145834, "grad_norm": 7.816888809204102, "learning_rate": 4.567870445084385e-06, "loss": 3.1856, "step": 18755 }, { "epoch": 0.19083658854166666, "grad_norm": 13.05587387084961, "learning_rate": 4.567645691984594e-06, "loss": 3.1723, "step": 18760 }, { "epoch": 0.190887451171875, "grad_norm": 13.687058448791504, "learning_rate": 4.567420885984578e-06, "loss": 3.8625, "step": 18765 }, { "epoch": 0.19093831380208334, "grad_norm": 12.832854270935059, "learning_rate": 4.567196027090088e-06, "loss": 3.1376, "step": 18770 }, { "epoch": 0.19098917643229166, "grad_norm": 13.068547248840332, "learning_rate": 4.56697111530688e-06, "loss": 3.3375, "step": 18775 }, { "epoch": 0.1910400390625, "grad_norm": 10.62330150604248, "learning_rate": 4.566746150640704e-06, "loss": 3.4257, "step": 18780 }, { "epoch": 0.19109090169270834, "grad_norm": 13.381546020507812, "learning_rate": 4.566521133097318e-06, "loss": 3.2436, "step": 18785 }, { "epoch": 0.19114176432291666, "grad_norm": 9.422579765319824, "learning_rate": 4.566296062682478e-06, "loss": 3.4788, "step": 18790 }, { "epoch": 0.191192626953125, "grad_norm": 16.37440299987793, "learning_rate": 4.566070939401944e-06, "loss": 3.5952, "step": 18795 }, { "epoch": 0.19124348958333334, "grad_norm": 11.154594421386719, "learning_rate": 4.565845763261475e-06, "loss": 3.1788, "step": 18800 }, { "epoch": 0.19129435221354166, "grad_norm": 13.181290626525879, "learning_rate": 4.565620534266831e-06, "loss": 3.2884, "step": 18805 }, { "epoch": 0.19134521484375, "grad_norm": 10.758809089660645, "learning_rate": 4.565395252423775e-06, "loss": 3.2639, "step": 18810 }, { "epoch": 0.19139607747395834, "grad_norm": 11.981339454650879, "learning_rate": 4.5651699177380714e-06, "loss": 3.2688, "step": 18815 }, { "epoch": 0.19144694010416666, "grad_norm": 9.572751998901367, "learning_rate": 4.564944530215486e-06, "loss": 3.5481, "step": 18820 }, { "epoch": 0.191497802734375, "grad_norm": 9.87020492553711, "learning_rate": 4.564719089861783e-06, "loss": 3.4662, "step": 18825 }, { "epoch": 0.19154866536458334, "grad_norm": 9.3651123046875, "learning_rate": 4.564493596682732e-06, "loss": 3.2547, "step": 18830 }, { "epoch": 0.19159952799479166, "grad_norm": 9.596661567687988, "learning_rate": 4.564268050684101e-06, "loss": 3.3904, "step": 18835 }, { "epoch": 0.191650390625, "grad_norm": 15.922765731811523, "learning_rate": 4.564042451871662e-06, "loss": 3.2157, "step": 18840 }, { "epoch": 0.19170125325520834, "grad_norm": 11.004732131958008, "learning_rate": 4.563816800251185e-06, "loss": 3.3153, "step": 18845 }, { "epoch": 0.19175211588541666, "grad_norm": 8.292093276977539, "learning_rate": 4.563591095828446e-06, "loss": 3.2079, "step": 18850 }, { "epoch": 0.191802978515625, "grad_norm": 8.698326110839844, "learning_rate": 4.563365338609216e-06, "loss": 3.404, "step": 18855 }, { "epoch": 0.19185384114583334, "grad_norm": 11.080509185791016, "learning_rate": 4.563139528599274e-06, "loss": 3.2749, "step": 18860 }, { "epoch": 0.19190470377604166, "grad_norm": 14.920134544372559, "learning_rate": 4.562913665804397e-06, "loss": 3.455, "step": 18865 }, { "epoch": 0.19195556640625, "grad_norm": 10.069123268127441, "learning_rate": 4.562687750230361e-06, "loss": 3.6829, "step": 18870 }, { "epoch": 0.19200642903645834, "grad_norm": 8.710797309875488, "learning_rate": 4.562461781882949e-06, "loss": 3.3883, "step": 18875 }, { "epoch": 0.19205729166666666, "grad_norm": 13.156225204467773, "learning_rate": 4.5622357607679415e-06, "loss": 3.1545, "step": 18880 }, { "epoch": 0.192108154296875, "grad_norm": 12.595132827758789, "learning_rate": 4.5620096868911205e-06, "loss": 3.4499, "step": 18885 }, { "epoch": 0.19215901692708334, "grad_norm": 12.80722713470459, "learning_rate": 4.561783560258269e-06, "loss": 3.6053, "step": 18890 }, { "epoch": 0.19220987955729166, "grad_norm": 10.552962303161621, "learning_rate": 4.561557380875175e-06, "loss": 3.4, "step": 18895 }, { "epoch": 0.1922607421875, "grad_norm": 12.450384140014648, "learning_rate": 4.561331148747624e-06, "loss": 3.4466, "step": 18900 }, { "epoch": 0.19231160481770834, "grad_norm": 9.195076942443848, "learning_rate": 4.5611048638814045e-06, "loss": 3.0326, "step": 18905 }, { "epoch": 0.19236246744791666, "grad_norm": 10.586387634277344, "learning_rate": 4.560878526282305e-06, "loss": 3.1759, "step": 18910 } ], "logging_steps": 5, "max_steps": 98304, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 394, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.328444652552192e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }