{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3840639082343302, "eval_steps": 500, "global_step": 420000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004572189383742026, "grad_norm": 187.55120849609375, "learning_rate": 5e-06, "loss": 7.0189, "step": 500 }, { "epoch": 0.0009144378767484053, "grad_norm": 8.421695709228516, "learning_rate": 1e-05, "loss": 3.7862, "step": 1000 }, { "epoch": 0.0013716568151226078, "grad_norm": 5.949154853820801, "learning_rate": 9.995423625806358e-06, "loss": 2.3315, "step": 1500 }, { "epoch": 0.0018288757534968105, "grad_norm": 6.021435260772705, "learning_rate": 9.990847251612715e-06, "loss": 1.7553, "step": 2000 }, { "epoch": 0.002286094691871013, "grad_norm": 6.2066168785095215, "learning_rate": 9.986270877419073e-06, "loss": 1.46, "step": 2500 }, { "epoch": 0.0027433136302452157, "grad_norm": 5.304969310760498, "learning_rate": 9.98169450322543e-06, "loss": 1.2973, "step": 3000 }, { "epoch": 0.0032005325686194183, "grad_norm": 5.2138566970825195, "learning_rate": 9.977118129031787e-06, "loss": 1.1809, "step": 3500 }, { "epoch": 0.003657751506993621, "grad_norm": 6.513549327850342, "learning_rate": 9.972541754838143e-06, "loss": 1.0803, "step": 4000 }, { "epoch": 0.004114970445367823, "grad_norm": 4.748714923858643, "learning_rate": 9.967965380644502e-06, "loss": 1.0187, "step": 4500 }, { "epoch": 0.004572189383742026, "grad_norm": 3.788388252258301, "learning_rate": 9.963389006450857e-06, "loss": 0.95, "step": 5000 }, { "epoch": 0.005029408322116229, "grad_norm": 4.110034465789795, "learning_rate": 9.958812632257215e-06, "loss": 0.9065, "step": 5500 }, { "epoch": 0.005486627260490431, "grad_norm": 4.105165481567383, "learning_rate": 9.954236258063572e-06, "loss": 0.8702, "step": 6000 }, { "epoch": 0.005943846198864634, "grad_norm": 4.9001145362854, "learning_rate": 9.94965988386993e-06, "loss": 0.8165, "step": 6500 }, { "epoch": 0.006401065137238837, "grad_norm": 3.8835108280181885, "learning_rate": 9.945083509676286e-06, "loss": 0.7864, "step": 7000 }, { "epoch": 0.006858284075613039, "grad_norm": 4.042098522186279, "learning_rate": 9.940507135482644e-06, "loss": 0.7639, "step": 7500 }, { "epoch": 0.007315503013987242, "grad_norm": 3.520855188369751, "learning_rate": 9.935930761289001e-06, "loss": 0.7305, "step": 8000 }, { "epoch": 0.007772721952361445, "grad_norm": 3.5829412937164307, "learning_rate": 9.931354387095358e-06, "loss": 0.7138, "step": 8500 }, { "epoch": 0.008229940890735647, "grad_norm": 3.618142604827881, "learning_rate": 9.926778012901714e-06, "loss": 0.6868, "step": 9000 }, { "epoch": 0.00868715982910985, "grad_norm": 4.708620548248291, "learning_rate": 9.922201638708073e-06, "loss": 0.6671, "step": 9500 }, { "epoch": 0.009144378767484052, "grad_norm": 5.403913974761963, "learning_rate": 9.917625264514428e-06, "loss": 0.6627, "step": 10000 }, { "epoch": 0.009601597705858255, "grad_norm": 3.972954511642456, "learning_rate": 9.913048890320787e-06, "loss": 0.6406, "step": 10500 }, { "epoch": 0.010058816644232457, "grad_norm": 3.42512583732605, "learning_rate": 9.908472516127143e-06, "loss": 0.6206, "step": 11000 }, { "epoch": 0.01051603558260666, "grad_norm": 3.6385998725891113, "learning_rate": 9.903896141933502e-06, "loss": 0.6068, "step": 11500 }, { "epoch": 0.010973254520980863, "grad_norm": 3.1945295333862305, "learning_rate": 9.899319767739857e-06, "loss": 0.6005, "step": 12000 }, { "epoch": 0.011430473459355064, "grad_norm": 2.8585703372955322, "learning_rate": 9.894743393546215e-06, "loss": 0.5833, "step": 12500 }, { "epoch": 0.011887692397729268, "grad_norm": 3.1953206062316895, "learning_rate": 9.890167019352572e-06, "loss": 0.5754, "step": 13000 }, { "epoch": 0.01234491133610347, "grad_norm": 3.1728343963623047, "learning_rate": 9.88559064515893e-06, "loss": 0.5657, "step": 13500 }, { "epoch": 0.012802130274477673, "grad_norm": 3.0139970779418945, "learning_rate": 9.881014270965287e-06, "loss": 0.5565, "step": 14000 }, { "epoch": 0.013259349212851875, "grad_norm": 2.9499995708465576, "learning_rate": 9.876437896771644e-06, "loss": 0.5455, "step": 14500 }, { "epoch": 0.013716568151226079, "grad_norm": 3.5701255798339844, "learning_rate": 9.871861522578e-06, "loss": 0.5314, "step": 15000 }, { "epoch": 0.01417378708960028, "grad_norm": 3.3928213119506836, "learning_rate": 9.867285148384358e-06, "loss": 0.5349, "step": 15500 }, { "epoch": 0.014631006027974484, "grad_norm": 3.076819896697998, "learning_rate": 9.862708774190714e-06, "loss": 0.518, "step": 16000 }, { "epoch": 0.015088224966348686, "grad_norm": 2.7929725646972656, "learning_rate": 9.858132399997073e-06, "loss": 0.5181, "step": 16500 }, { "epoch": 0.01554544390472289, "grad_norm": 2.8731892108917236, "learning_rate": 9.853556025803429e-06, "loss": 0.5187, "step": 17000 }, { "epoch": 0.016002662843097093, "grad_norm": 3.5447003841400146, "learning_rate": 9.848979651609786e-06, "loss": 0.5065, "step": 17500 }, { "epoch": 0.016459881781471293, "grad_norm": 2.36434006690979, "learning_rate": 9.844403277416143e-06, "loss": 0.5, "step": 18000 }, { "epoch": 0.016917100719845497, "grad_norm": 2.4048261642456055, "learning_rate": 9.8398269032225e-06, "loss": 0.5015, "step": 18500 }, { "epoch": 0.0173743196582197, "grad_norm": 3.1334474086761475, "learning_rate": 9.835250529028858e-06, "loss": 0.4876, "step": 19000 }, { "epoch": 0.0178315385965939, "grad_norm": 2.134216070175171, "learning_rate": 9.830674154835215e-06, "loss": 0.487, "step": 19500 }, { "epoch": 0.018288757534968104, "grad_norm": 2.5839178562164307, "learning_rate": 9.826097780641572e-06, "loss": 0.4716, "step": 20000 }, { "epoch": 0.018745976473342307, "grad_norm": 2.95695424079895, "learning_rate": 9.82152140644793e-06, "loss": 0.4778, "step": 20500 }, { "epoch": 0.01920319541171651, "grad_norm": 3.2409121990203857, "learning_rate": 9.816945032254285e-06, "loss": 0.4702, "step": 21000 }, { "epoch": 0.01966041435009071, "grad_norm": 3.0505309104919434, "learning_rate": 9.812368658060644e-06, "loss": 0.4737, "step": 21500 }, { "epoch": 0.020117633288464914, "grad_norm": 2.630138397216797, "learning_rate": 9.807792283867e-06, "loss": 0.4732, "step": 22000 }, { "epoch": 0.020574852226839118, "grad_norm": 2.780930995941162, "learning_rate": 9.803215909673357e-06, "loss": 0.4616, "step": 22500 }, { "epoch": 0.02103207116521332, "grad_norm": 2.8004393577575684, "learning_rate": 9.798639535479714e-06, "loss": 0.4586, "step": 23000 }, { "epoch": 0.02148929010358752, "grad_norm": 2.394951581954956, "learning_rate": 9.794063161286071e-06, "loss": 0.4527, "step": 23500 }, { "epoch": 0.021946509041961725, "grad_norm": 2.5440549850463867, "learning_rate": 9.789486787092429e-06, "loss": 0.4465, "step": 24000 }, { "epoch": 0.02240372798033593, "grad_norm": 2.5639050006866455, "learning_rate": 9.784910412898786e-06, "loss": 0.4545, "step": 24500 }, { "epoch": 0.02286094691871013, "grad_norm": 3.256699562072754, "learning_rate": 9.780334038705143e-06, "loss": 0.4467, "step": 25000 }, { "epoch": 0.023318165857084332, "grad_norm": 2.7148571014404297, "learning_rate": 9.7757576645115e-06, "loss": 0.4377, "step": 25500 }, { "epoch": 0.023775384795458536, "grad_norm": 2.497065544128418, "learning_rate": 9.771181290317858e-06, "loss": 0.4348, "step": 26000 }, { "epoch": 0.02423260373383274, "grad_norm": 2.3831052780151367, "learning_rate": 9.766604916124215e-06, "loss": 0.4312, "step": 26500 }, { "epoch": 0.02468982267220694, "grad_norm": 2.513948917388916, "learning_rate": 9.762028541930572e-06, "loss": 0.4245, "step": 27000 }, { "epoch": 0.025147041610581143, "grad_norm": 2.2912256717681885, "learning_rate": 9.75745216773693e-06, "loss": 0.4239, "step": 27500 }, { "epoch": 0.025604260548955347, "grad_norm": 3.100677490234375, "learning_rate": 9.752875793543285e-06, "loss": 0.4341, "step": 28000 }, { "epoch": 0.02606147948732955, "grad_norm": 4.546727180480957, "learning_rate": 9.748299419349643e-06, "loss": 0.4177, "step": 28500 }, { "epoch": 0.02651869842570375, "grad_norm": 2.070556163787842, "learning_rate": 9.743723045156e-06, "loss": 0.4152, "step": 29000 }, { "epoch": 0.026975917364077954, "grad_norm": 2.3387291431427, "learning_rate": 9.739146670962357e-06, "loss": 0.4208, "step": 29500 }, { "epoch": 0.027433136302452157, "grad_norm": 2.6462035179138184, "learning_rate": 9.734570296768714e-06, "loss": 0.4096, "step": 30000 }, { "epoch": 0.02789035524082636, "grad_norm": 2.098785400390625, "learning_rate": 9.729993922575072e-06, "loss": 0.4124, "step": 30500 }, { "epoch": 0.02834757417920056, "grad_norm": 2.7251081466674805, "learning_rate": 9.725417548381429e-06, "loss": 0.4131, "step": 31000 }, { "epoch": 0.028804793117574765, "grad_norm": 2.221843957901001, "learning_rate": 9.720841174187786e-06, "loss": 0.4099, "step": 31500 }, { "epoch": 0.029262012055948968, "grad_norm": 1.7978463172912598, "learning_rate": 9.716264799994143e-06, "loss": 0.4065, "step": 32000 }, { "epoch": 0.029719230994323168, "grad_norm": 2.299729824066162, "learning_rate": 9.7116884258005e-06, "loss": 0.403, "step": 32500 }, { "epoch": 0.030176449932697372, "grad_norm": 2.307136058807373, "learning_rate": 9.707112051606858e-06, "loss": 0.3997, "step": 33000 }, { "epoch": 0.030633668871071575, "grad_norm": 2.1159164905548096, "learning_rate": 9.702535677413214e-06, "loss": 0.3986, "step": 33500 }, { "epoch": 0.03109088780944578, "grad_norm": 2.6387250423431396, "learning_rate": 9.697959303219573e-06, "loss": 0.3887, "step": 34000 }, { "epoch": 0.03154810674781998, "grad_norm": 2.5297632217407227, "learning_rate": 9.693382929025928e-06, "loss": 0.3902, "step": 34500 }, { "epoch": 0.032005325686194186, "grad_norm": 3.11338472366333, "learning_rate": 9.688806554832287e-06, "loss": 0.3879, "step": 35000 }, { "epoch": 0.032462544624568386, "grad_norm": 2.4520089626312256, "learning_rate": 9.684230180638643e-06, "loss": 0.3914, "step": 35500 }, { "epoch": 0.032919763562942586, "grad_norm": 2.3968985080718994, "learning_rate": 9.679653806445e-06, "loss": 0.3859, "step": 36000 }, { "epoch": 0.03337698250131679, "grad_norm": 1.8716310262680054, "learning_rate": 9.675077432251357e-06, "loss": 0.3838, "step": 36500 }, { "epoch": 0.03383420143969099, "grad_norm": 2.634087324142456, "learning_rate": 9.670501058057715e-06, "loss": 0.3798, "step": 37000 }, { "epoch": 0.03429142037806519, "grad_norm": 2.2868430614471436, "learning_rate": 9.665924683864072e-06, "loss": 0.3781, "step": 37500 }, { "epoch": 0.0347486393164394, "grad_norm": 2.1440744400024414, "learning_rate": 9.661348309670429e-06, "loss": 0.3893, "step": 38000 }, { "epoch": 0.0352058582548136, "grad_norm": 4.374706268310547, "learning_rate": 9.656771935476785e-06, "loss": 0.3765, "step": 38500 }, { "epoch": 0.0356630771931878, "grad_norm": 2.3860931396484375, "learning_rate": 9.652195561283144e-06, "loss": 0.376, "step": 39000 }, { "epoch": 0.03612029613156201, "grad_norm": 5.505861282348633, "learning_rate": 9.6476191870895e-06, "loss": 0.3767, "step": 39500 }, { "epoch": 0.03657751506993621, "grad_norm": 2.61763858795166, "learning_rate": 9.643042812895858e-06, "loss": 0.3808, "step": 40000 }, { "epoch": 0.037034734008310415, "grad_norm": 2.1524641513824463, "learning_rate": 9.638466438702214e-06, "loss": 0.3801, "step": 40500 }, { "epoch": 0.037491952946684615, "grad_norm": 1.7687675952911377, "learning_rate": 9.633890064508571e-06, "loss": 0.373, "step": 41000 }, { "epoch": 0.037949171885058815, "grad_norm": 1.924752116203308, "learning_rate": 9.629313690314928e-06, "loss": 0.3747, "step": 41500 }, { "epoch": 0.03840639082343302, "grad_norm": 2.0824227333068848, "learning_rate": 9.624737316121286e-06, "loss": 0.3665, "step": 42000 }, { "epoch": 0.03886360976180722, "grad_norm": 1.825997233390808, "learning_rate": 9.620160941927643e-06, "loss": 0.3678, "step": 42500 }, { "epoch": 0.03932082870018142, "grad_norm": 1.9416835308074951, "learning_rate": 9.615584567734e-06, "loss": 0.3727, "step": 43000 }, { "epoch": 0.03977804763855563, "grad_norm": 2.4522104263305664, "learning_rate": 9.611008193540357e-06, "loss": 0.3679, "step": 43500 }, { "epoch": 0.04023526657692983, "grad_norm": 2.46500825881958, "learning_rate": 9.606431819346715e-06, "loss": 0.3643, "step": 44000 }, { "epoch": 0.04069248551530403, "grad_norm": 2.2443718910217285, "learning_rate": 9.60185544515307e-06, "loss": 0.3619, "step": 44500 }, { "epoch": 0.041149704453678236, "grad_norm": 1.5836185216903687, "learning_rate": 9.59727907095943e-06, "loss": 0.3648, "step": 45000 }, { "epoch": 0.041606923392052436, "grad_norm": 1.7099242210388184, "learning_rate": 9.592702696765785e-06, "loss": 0.3627, "step": 45500 }, { "epoch": 0.04206414233042664, "grad_norm": 2.4821577072143555, "learning_rate": 9.588126322572144e-06, "loss": 0.3611, "step": 46000 }, { "epoch": 0.04252136126880084, "grad_norm": 2.2654361724853516, "learning_rate": 9.5835499483785e-06, "loss": 0.3608, "step": 46500 }, { "epoch": 0.04297858020717504, "grad_norm": 2.3293862342834473, "learning_rate": 9.578973574184857e-06, "loss": 0.368, "step": 47000 }, { "epoch": 0.04343579914554925, "grad_norm": 2.0562539100646973, "learning_rate": 9.574397199991214e-06, "loss": 0.3589, "step": 47500 }, { "epoch": 0.04389301808392345, "grad_norm": 2.5742366313934326, "learning_rate": 9.569820825797571e-06, "loss": 0.3524, "step": 48000 }, { "epoch": 0.04435023702229765, "grad_norm": 2.3970110416412354, "learning_rate": 9.565244451603928e-06, "loss": 0.3608, "step": 48500 }, { "epoch": 0.04480745596067186, "grad_norm": 1.7910202741622925, "learning_rate": 9.560668077410286e-06, "loss": 0.3447, "step": 49000 }, { "epoch": 0.04526467489904606, "grad_norm": 1.8098151683807373, "learning_rate": 9.556091703216643e-06, "loss": 0.3515, "step": 49500 }, { "epoch": 0.04572189383742026, "grad_norm": 1.9009671211242676, "learning_rate": 9.551515329023e-06, "loss": 0.3547, "step": 50000 }, { "epoch": 0.046179112775794465, "grad_norm": 1.8881592750549316, "learning_rate": 9.546938954829358e-06, "loss": 0.3542, "step": 50500 }, { "epoch": 0.046636331714168665, "grad_norm": 2.637274742126465, "learning_rate": 9.542362580635715e-06, "loss": 0.3489, "step": 51000 }, { "epoch": 0.04709355065254287, "grad_norm": 1.9252406358718872, "learning_rate": 9.53778620644207e-06, "loss": 0.3522, "step": 51500 }, { "epoch": 0.04755076959091707, "grad_norm": 1.7176941633224487, "learning_rate": 9.533209832248428e-06, "loss": 0.3425, "step": 52000 }, { "epoch": 0.04800798852929127, "grad_norm": 1.9770063161849976, "learning_rate": 9.528633458054785e-06, "loss": 0.3469, "step": 52500 }, { "epoch": 0.04846520746766548, "grad_norm": 2.2667782306671143, "learning_rate": 9.524057083861142e-06, "loss": 0.3498, "step": 53000 }, { "epoch": 0.04892242640603968, "grad_norm": 2.002631902694702, "learning_rate": 9.5194807096675e-06, "loss": 0.3486, "step": 53500 }, { "epoch": 0.04937964534441388, "grad_norm": 1.749894380569458, "learning_rate": 9.514904335473857e-06, "loss": 0.344, "step": 54000 }, { "epoch": 0.049836864282788086, "grad_norm": 2.701767921447754, "learning_rate": 9.510327961280214e-06, "loss": 0.3403, "step": 54500 }, { "epoch": 0.050294083221162286, "grad_norm": 1.895792007446289, "learning_rate": 9.505751587086571e-06, "loss": 0.3387, "step": 55000 }, { "epoch": 0.050751302159536486, "grad_norm": 2.478041410446167, "learning_rate": 9.501175212892929e-06, "loss": 0.3387, "step": 55500 }, { "epoch": 0.05120852109791069, "grad_norm": 2.3234288692474365, "learning_rate": 9.496598838699286e-06, "loss": 0.3426, "step": 56000 }, { "epoch": 0.05166574003628489, "grad_norm": 2.3493385314941406, "learning_rate": 9.492022464505643e-06, "loss": 0.3348, "step": 56500 }, { "epoch": 0.0521229589746591, "grad_norm": 2.0325398445129395, "learning_rate": 9.487446090311999e-06, "loss": 0.34, "step": 57000 }, { "epoch": 0.0525801779130333, "grad_norm": 2.0757031440734863, "learning_rate": 9.482869716118358e-06, "loss": 0.3396, "step": 57500 }, { "epoch": 0.0530373968514075, "grad_norm": 2.193401575088501, "learning_rate": 9.478293341924713e-06, "loss": 0.3352, "step": 58000 }, { "epoch": 0.05349461578978171, "grad_norm": 2.415004253387451, "learning_rate": 9.47371696773107e-06, "loss": 0.332, "step": 58500 }, { "epoch": 0.05395183472815591, "grad_norm": 1.8097025156021118, "learning_rate": 9.469140593537428e-06, "loss": 0.3395, "step": 59000 }, { "epoch": 0.05440905366653011, "grad_norm": 1.9246598482131958, "learning_rate": 9.464564219343785e-06, "loss": 0.3368, "step": 59500 }, { "epoch": 0.054866272604904315, "grad_norm": 1.8323681354522705, "learning_rate": 9.459987845150142e-06, "loss": 0.34, "step": 60000 }, { "epoch": 0.055323491543278515, "grad_norm": 2.6949360370635986, "learning_rate": 9.4554114709565e-06, "loss": 0.3398, "step": 60500 }, { "epoch": 0.05578071048165272, "grad_norm": 1.7276109457015991, "learning_rate": 9.450835096762857e-06, "loss": 0.3325, "step": 61000 }, { "epoch": 0.05623792942002692, "grad_norm": 1.6375492811203003, "learning_rate": 9.446258722569214e-06, "loss": 0.3323, "step": 61500 }, { "epoch": 0.05669514835840112, "grad_norm": 7.265068531036377, "learning_rate": 9.441682348375572e-06, "loss": 0.3333, "step": 62000 }, { "epoch": 0.05715236729677533, "grad_norm": 1.784020185470581, "learning_rate": 9.437105974181929e-06, "loss": 0.3357, "step": 62500 }, { "epoch": 0.05760958623514953, "grad_norm": 2.0214955806732178, "learning_rate": 9.432529599988284e-06, "loss": 0.3303, "step": 63000 }, { "epoch": 0.05806680517352373, "grad_norm": 2.3039133548736572, "learning_rate": 9.427953225794643e-06, "loss": 0.3292, "step": 63500 }, { "epoch": 0.058524024111897936, "grad_norm": 2.4076285362243652, "learning_rate": 9.423376851600999e-06, "loss": 0.3301, "step": 64000 }, { "epoch": 0.058981243050272136, "grad_norm": 1.872653603553772, "learning_rate": 9.418800477407358e-06, "loss": 0.3237, "step": 64500 }, { "epoch": 0.059438461988646336, "grad_norm": 1.858178973197937, "learning_rate": 9.414224103213713e-06, "loss": 0.3305, "step": 65000 }, { "epoch": 0.05989568092702054, "grad_norm": 2.2404658794403076, "learning_rate": 9.40964772902007e-06, "loss": 0.3282, "step": 65500 }, { "epoch": 0.060352899865394743, "grad_norm": 1.943448781967163, "learning_rate": 9.405071354826428e-06, "loss": 0.3245, "step": 66000 }, { "epoch": 0.06081011880376895, "grad_norm": 1.7627453804016113, "learning_rate": 9.400494980632785e-06, "loss": 0.3263, "step": 66500 }, { "epoch": 0.06126733774214315, "grad_norm": 2.1200695037841797, "learning_rate": 9.395918606439143e-06, "loss": 0.3207, "step": 67000 }, { "epoch": 0.06172455668051735, "grad_norm": 2.522911310195923, "learning_rate": 9.3913422322455e-06, "loss": 0.326, "step": 67500 }, { "epoch": 0.06218177561889156, "grad_norm": 2.193539619445801, "learning_rate": 9.386765858051855e-06, "loss": 0.3223, "step": 68000 }, { "epoch": 0.06263899455726575, "grad_norm": 2.4491043090820312, "learning_rate": 9.382189483858214e-06, "loss": 0.3213, "step": 68500 }, { "epoch": 0.06309621349563996, "grad_norm": 1.5971205234527588, "learning_rate": 9.37761310966457e-06, "loss": 0.3223, "step": 69000 }, { "epoch": 0.06355343243401416, "grad_norm": 2.126255750656128, "learning_rate": 9.373036735470929e-06, "loss": 0.3188, "step": 69500 }, { "epoch": 0.06401065137238837, "grad_norm": 2.074056625366211, "learning_rate": 9.368460361277285e-06, "loss": 0.3193, "step": 70000 }, { "epoch": 0.06446787031076257, "grad_norm": 1.6855189800262451, "learning_rate": 9.363883987083642e-06, "loss": 0.3133, "step": 70500 }, { "epoch": 0.06492508924913677, "grad_norm": 2.1474835872650146, "learning_rate": 9.359307612889999e-06, "loss": 0.3163, "step": 71000 }, { "epoch": 0.06538230818751098, "grad_norm": 1.9591755867004395, "learning_rate": 9.354731238696356e-06, "loss": 0.325, "step": 71500 }, { "epoch": 0.06583952712588517, "grad_norm": 1.7906707525253296, "learning_rate": 9.350154864502714e-06, "loss": 0.3207, "step": 72000 }, { "epoch": 0.06629674606425938, "grad_norm": 1.956648588180542, "learning_rate": 9.345578490309071e-06, "loss": 0.32, "step": 72500 }, { "epoch": 0.06675396500263359, "grad_norm": 2.537899971008301, "learning_rate": 9.341002116115428e-06, "loss": 0.3202, "step": 73000 }, { "epoch": 0.06721118394100778, "grad_norm": 2.68613600730896, "learning_rate": 9.336425741921785e-06, "loss": 0.3123, "step": 73500 }, { "epoch": 0.06766840287938199, "grad_norm": 1.742925763130188, "learning_rate": 9.331849367728141e-06, "loss": 0.3196, "step": 74000 }, { "epoch": 0.0681256218177562, "grad_norm": 1.493833065032959, "learning_rate": 9.3272729935345e-06, "loss": 0.3177, "step": 74500 }, { "epoch": 0.06858284075613039, "grad_norm": 2.0670220851898193, "learning_rate": 9.322696619340856e-06, "loss": 0.3194, "step": 75000 }, { "epoch": 0.0690400596945046, "grad_norm": 1.7943044900894165, "learning_rate": 9.318120245147213e-06, "loss": 0.3142, "step": 75500 }, { "epoch": 0.0694972786328788, "grad_norm": 2.0750091075897217, "learning_rate": 9.31354387095357e-06, "loss": 0.3121, "step": 76000 }, { "epoch": 0.069954497571253, "grad_norm": 2.5226950645446777, "learning_rate": 9.308967496759927e-06, "loss": 0.3167, "step": 76500 }, { "epoch": 0.0704117165096272, "grad_norm": 1.6280384063720703, "learning_rate": 9.304391122566285e-06, "loss": 0.3171, "step": 77000 }, { "epoch": 0.07086893544800141, "grad_norm": 1.8891403675079346, "learning_rate": 9.299814748372642e-06, "loss": 0.3161, "step": 77500 }, { "epoch": 0.0713261543863756, "grad_norm": 2.048211097717285, "learning_rate": 9.295238374179e-06, "loss": 0.3115, "step": 78000 }, { "epoch": 0.07178337332474981, "grad_norm": 1.7160500288009644, "learning_rate": 9.290661999985357e-06, "loss": 0.3189, "step": 78500 }, { "epoch": 0.07224059226312401, "grad_norm": 1.8395957946777344, "learning_rate": 9.286085625791714e-06, "loss": 0.3096, "step": 79000 }, { "epoch": 0.07269781120149822, "grad_norm": 1.92539381980896, "learning_rate": 9.281509251598071e-06, "loss": 0.3144, "step": 79500 }, { "epoch": 0.07315503013987242, "grad_norm": 2.474168300628662, "learning_rate": 9.276932877404428e-06, "loss": 0.3099, "step": 80000 }, { "epoch": 0.07361224907824662, "grad_norm": 2.2422871589660645, "learning_rate": 9.272356503210786e-06, "loss": 0.3129, "step": 80500 }, { "epoch": 0.07406946801662083, "grad_norm": 1.5611120462417603, "learning_rate": 9.267780129017141e-06, "loss": 0.3075, "step": 81000 }, { "epoch": 0.07452668695499502, "grad_norm": 1.408894658088684, "learning_rate": 9.263203754823499e-06, "loss": 0.3017, "step": 81500 }, { "epoch": 0.07498390589336923, "grad_norm": 1.664436936378479, "learning_rate": 9.258627380629856e-06, "loss": 0.3074, "step": 82000 }, { "epoch": 0.07544112483174344, "grad_norm": 1.3899191617965698, "learning_rate": 9.254051006436213e-06, "loss": 0.3061, "step": 82500 }, { "epoch": 0.07589834377011763, "grad_norm": 1.7736977338790894, "learning_rate": 9.24947463224257e-06, "loss": 0.3075, "step": 83000 }, { "epoch": 0.07635556270849184, "grad_norm": 1.743217945098877, "learning_rate": 9.244898258048928e-06, "loss": 0.3066, "step": 83500 }, { "epoch": 0.07681278164686604, "grad_norm": 2.4577653408050537, "learning_rate": 9.240321883855285e-06, "loss": 0.3083, "step": 84000 }, { "epoch": 0.07727000058524024, "grad_norm": 1.7819156646728516, "learning_rate": 9.235745509661642e-06, "loss": 0.3016, "step": 84500 }, { "epoch": 0.07772721952361444, "grad_norm": 1.5945593118667603, "learning_rate": 9.231169135468e-06, "loss": 0.3053, "step": 85000 }, { "epoch": 0.07818443846198865, "grad_norm": 3.3662831783294678, "learning_rate": 9.226592761274357e-06, "loss": 0.3008, "step": 85500 }, { "epoch": 0.07864165740036284, "grad_norm": 1.748854637145996, "learning_rate": 9.222016387080714e-06, "loss": 0.3026, "step": 86000 }, { "epoch": 0.07909887633873705, "grad_norm": 2.074263334274292, "learning_rate": 9.21744001288707e-06, "loss": 0.3045, "step": 86500 }, { "epoch": 0.07955609527711126, "grad_norm": 6.21075439453125, "learning_rate": 9.212863638693429e-06, "loss": 0.3078, "step": 87000 }, { "epoch": 0.08001331421548545, "grad_norm": 1.6198980808258057, "learning_rate": 9.208287264499784e-06, "loss": 0.3019, "step": 87500 }, { "epoch": 0.08047053315385966, "grad_norm": 1.760921835899353, "learning_rate": 9.203710890306143e-06, "loss": 0.2948, "step": 88000 }, { "epoch": 0.08092775209223387, "grad_norm": 2.7417385578155518, "learning_rate": 9.199134516112499e-06, "loss": 0.3054, "step": 88500 }, { "epoch": 0.08138497103060806, "grad_norm": 1.767946481704712, "learning_rate": 9.194558141918856e-06, "loss": 0.3405, "step": 89000 }, { "epoch": 0.08184218996898227, "grad_norm": 1.6789219379425049, "learning_rate": 9.189981767725213e-06, "loss": 0.3024, "step": 89500 }, { "epoch": 0.08229940890735647, "grad_norm": 1.6791198253631592, "learning_rate": 9.18540539353157e-06, "loss": 0.2987, "step": 90000 }, { "epoch": 0.08275662784573068, "grad_norm": 1.9289532899856567, "learning_rate": 9.180829019337928e-06, "loss": 0.3007, "step": 90500 }, { "epoch": 0.08321384678410487, "grad_norm": 1.9767258167266846, "learning_rate": 9.176252645144285e-06, "loss": 0.309, "step": 91000 }, { "epoch": 0.08367106572247908, "grad_norm": 2.790158271789551, "learning_rate": 9.17167627095064e-06, "loss": 0.3001, "step": 91500 }, { "epoch": 0.08412828466085329, "grad_norm": 2.0886495113372803, "learning_rate": 9.167099896757e-06, "loss": 0.2948, "step": 92000 }, { "epoch": 0.08458550359922748, "grad_norm": 1.426714539527893, "learning_rate": 9.162523522563355e-06, "loss": 0.2945, "step": 92500 }, { "epoch": 0.08504272253760169, "grad_norm": 1.887513279914856, "learning_rate": 9.157947148369714e-06, "loss": 0.2991, "step": 93000 }, { "epoch": 0.0854999414759759, "grad_norm": 2.1559338569641113, "learning_rate": 9.15337077417607e-06, "loss": 0.2984, "step": 93500 }, { "epoch": 0.08595716041435009, "grad_norm": 1.6978403329849243, "learning_rate": 9.148794399982429e-06, "loss": 0.3042, "step": 94000 }, { "epoch": 0.0864143793527243, "grad_norm": 1.7569996118545532, "learning_rate": 9.144218025788784e-06, "loss": 0.2929, "step": 94500 }, { "epoch": 0.0868715982910985, "grad_norm": 1.8148245811462402, "learning_rate": 9.139641651595142e-06, "loss": 0.2992, "step": 95000 }, { "epoch": 0.0873288172294727, "grad_norm": 2.9660353660583496, "learning_rate": 9.135065277401499e-06, "loss": 0.2967, "step": 95500 }, { "epoch": 0.0877860361678469, "grad_norm": 1.5390568971633911, "learning_rate": 9.130488903207856e-06, "loss": 0.2973, "step": 96000 }, { "epoch": 0.08824325510622111, "grad_norm": 2.3900351524353027, "learning_rate": 9.125912529014213e-06, "loss": 0.2954, "step": 96500 }, { "epoch": 0.0887004740445953, "grad_norm": 1.924519419670105, "learning_rate": 9.12133615482057e-06, "loss": 0.2989, "step": 97000 }, { "epoch": 0.08915769298296951, "grad_norm": 2.075025796890259, "learning_rate": 9.116759780626926e-06, "loss": 0.2974, "step": 97500 }, { "epoch": 0.08961491192134372, "grad_norm": 1.8780020475387573, "learning_rate": 9.112183406433285e-06, "loss": 0.2972, "step": 98000 }, { "epoch": 0.09007213085971791, "grad_norm": 1.8856852054595947, "learning_rate": 9.10760703223964e-06, "loss": 0.2951, "step": 98500 }, { "epoch": 0.09052934979809212, "grad_norm": 1.982252597808838, "learning_rate": 9.103030658046e-06, "loss": 0.2983, "step": 99000 }, { "epoch": 0.09098656873646632, "grad_norm": 1.7523550987243652, "learning_rate": 9.098454283852355e-06, "loss": 0.2936, "step": 99500 }, { "epoch": 0.09144378767484052, "grad_norm": 1.9436618089675903, "learning_rate": 9.093877909658713e-06, "loss": 0.2891, "step": 100000 }, { "epoch": 0.09190100661321472, "grad_norm": 1.929366946220398, "learning_rate": 9.08930153546507e-06, "loss": 0.2889, "step": 100500 }, { "epoch": 0.09235822555158893, "grad_norm": 2.2968223094940186, "learning_rate": 9.084725161271427e-06, "loss": 0.2951, "step": 101000 }, { "epoch": 0.09281544448996314, "grad_norm": 1.944568157196045, "learning_rate": 9.080148787077784e-06, "loss": 0.2966, "step": 101500 }, { "epoch": 0.09327266342833733, "grad_norm": 1.3778146505355835, "learning_rate": 9.075572412884142e-06, "loss": 0.2906, "step": 102000 }, { "epoch": 0.09372988236671154, "grad_norm": 1.755247712135315, "learning_rate": 9.070996038690499e-06, "loss": 0.2893, "step": 102500 }, { "epoch": 0.09418710130508574, "grad_norm": 1.6563775539398193, "learning_rate": 9.066419664496856e-06, "loss": 0.2952, "step": 103000 }, { "epoch": 0.09464432024345994, "grad_norm": 1.7801234722137451, "learning_rate": 9.061843290303214e-06, "loss": 0.2925, "step": 103500 }, { "epoch": 0.09510153918183414, "grad_norm": 2.3495497703552246, "learning_rate": 9.05726691610957e-06, "loss": 0.2928, "step": 104000 }, { "epoch": 0.09555875812020835, "grad_norm": 1.450566053390503, "learning_rate": 9.052690541915926e-06, "loss": 0.2845, "step": 104500 }, { "epoch": 0.09601597705858254, "grad_norm": 1.4703044891357422, "learning_rate": 9.048114167722284e-06, "loss": 0.2875, "step": 105000 }, { "epoch": 0.09647319599695675, "grad_norm": 1.6310155391693115, "learning_rate": 9.043537793528641e-06, "loss": 0.2975, "step": 105500 }, { "epoch": 0.09693041493533096, "grad_norm": 2.081167459487915, "learning_rate": 9.038961419334998e-06, "loss": 0.2935, "step": 106000 }, { "epoch": 0.09738763387370515, "grad_norm": 1.8510127067565918, "learning_rate": 9.034385045141356e-06, "loss": 0.2916, "step": 106500 }, { "epoch": 0.09784485281207936, "grad_norm": 2.0282094478607178, "learning_rate": 9.029808670947713e-06, "loss": 0.2894, "step": 107000 }, { "epoch": 0.09830207175045357, "grad_norm": 1.4554340839385986, "learning_rate": 9.02523229675407e-06, "loss": 0.2918, "step": 107500 }, { "epoch": 0.09875929068882776, "grad_norm": 1.4794038534164429, "learning_rate": 9.020655922560427e-06, "loss": 0.292, "step": 108000 }, { "epoch": 0.09921650962720197, "grad_norm": 1.5430374145507812, "learning_rate": 9.016079548366785e-06, "loss": 0.282, "step": 108500 }, { "epoch": 0.09967372856557617, "grad_norm": 2.4614310264587402, "learning_rate": 9.011503174173142e-06, "loss": 0.2941, "step": 109000 }, { "epoch": 0.10013094750395037, "grad_norm": 1.9759284257888794, "learning_rate": 9.0069267999795e-06, "loss": 0.2854, "step": 109500 }, { "epoch": 0.10058816644232457, "grad_norm": 1.8766002655029297, "learning_rate": 9.002350425785855e-06, "loss": 0.2894, "step": 110000 }, { "epoch": 0.10104538538069878, "grad_norm": 1.603816270828247, "learning_rate": 8.997774051592214e-06, "loss": 0.2871, "step": 110500 }, { "epoch": 0.10150260431907297, "grad_norm": 1.4415063858032227, "learning_rate": 8.99319767739857e-06, "loss": 0.2892, "step": 111000 }, { "epoch": 0.10195982325744718, "grad_norm": 2.01898193359375, "learning_rate": 8.988621303204927e-06, "loss": 0.286, "step": 111500 }, { "epoch": 0.10241704219582139, "grad_norm": 1.7956452369689941, "learning_rate": 8.984044929011284e-06, "loss": 0.2876, "step": 112000 }, { "epoch": 0.1028742611341956, "grad_norm": 1.8005551099777222, "learning_rate": 8.979468554817641e-06, "loss": 0.2859, "step": 112500 }, { "epoch": 0.10333148007256979, "grad_norm": 1.5132607221603394, "learning_rate": 8.974892180623998e-06, "loss": 0.2828, "step": 113000 }, { "epoch": 0.103788699010944, "grad_norm": 1.9613267183303833, "learning_rate": 8.970315806430356e-06, "loss": 0.2814, "step": 113500 }, { "epoch": 0.1042459179493182, "grad_norm": 2.240898370742798, "learning_rate": 8.965739432236713e-06, "loss": 0.286, "step": 114000 }, { "epoch": 0.1047031368876924, "grad_norm": 1.7905975580215454, "learning_rate": 8.96116305804307e-06, "loss": 0.2864, "step": 114500 }, { "epoch": 0.1051603558260666, "grad_norm": 2.4146153926849365, "learning_rate": 8.956586683849428e-06, "loss": 0.2823, "step": 115000 }, { "epoch": 0.10561757476444081, "grad_norm": 2.2988457679748535, "learning_rate": 8.952010309655785e-06, "loss": 0.2842, "step": 115500 }, { "epoch": 0.106074793702815, "grad_norm": 2.073253631591797, "learning_rate": 8.94743393546214e-06, "loss": 0.2811, "step": 116000 }, { "epoch": 0.10653201264118921, "grad_norm": 1.5774530172348022, "learning_rate": 8.9428575612685e-06, "loss": 0.2843, "step": 116500 }, { "epoch": 0.10698923157956342, "grad_norm": 2.8328728675842285, "learning_rate": 8.938281187074855e-06, "loss": 0.2847, "step": 117000 }, { "epoch": 0.10744645051793761, "grad_norm": 1.9653736352920532, "learning_rate": 8.933704812881214e-06, "loss": 0.2826, "step": 117500 }, { "epoch": 0.10790366945631182, "grad_norm": 1.9079234600067139, "learning_rate": 8.92912843868757e-06, "loss": 0.2799, "step": 118000 }, { "epoch": 0.10836088839468602, "grad_norm": 1.7807742357254028, "learning_rate": 8.924552064493927e-06, "loss": 0.2788, "step": 118500 }, { "epoch": 0.10881810733306022, "grad_norm": 1.857607364654541, "learning_rate": 8.919975690300284e-06, "loss": 0.2808, "step": 119000 }, { "epoch": 0.10927532627143442, "grad_norm": 1.8199599981307983, "learning_rate": 8.915399316106641e-06, "loss": 0.2875, "step": 119500 }, { "epoch": 0.10973254520980863, "grad_norm": 1.4623470306396484, "learning_rate": 8.910822941912999e-06, "loss": 0.3283, "step": 120000 }, { "epoch": 0.11018976414818282, "grad_norm": 1.5743190050125122, "learning_rate": 8.906246567719356e-06, "loss": 0.284, "step": 120500 }, { "epoch": 0.11064698308655703, "grad_norm": 1.7710552215576172, "learning_rate": 8.901670193525711e-06, "loss": 0.2847, "step": 121000 }, { "epoch": 0.11110420202493124, "grad_norm": 1.6554839611053467, "learning_rate": 8.89709381933207e-06, "loss": 0.2844, "step": 121500 }, { "epoch": 0.11156142096330544, "grad_norm": 1.8272452354431152, "learning_rate": 8.892517445138426e-06, "loss": 0.2842, "step": 122000 }, { "epoch": 0.11201863990167964, "grad_norm": 1.7126985788345337, "learning_rate": 8.887941070944785e-06, "loss": 0.2797, "step": 122500 }, { "epoch": 0.11247585884005384, "grad_norm": 2.158935546875, "learning_rate": 8.88336469675114e-06, "loss": 0.2771, "step": 123000 }, { "epoch": 0.11293307777842805, "grad_norm": 1.8630131483078003, "learning_rate": 8.878788322557498e-06, "loss": 0.2785, "step": 123500 }, { "epoch": 0.11339029671680224, "grad_norm": 1.4368232488632202, "learning_rate": 8.874211948363855e-06, "loss": 0.2835, "step": 124000 }, { "epoch": 0.11384751565517645, "grad_norm": 1.773201584815979, "learning_rate": 8.869635574170212e-06, "loss": 0.2846, "step": 124500 }, { "epoch": 0.11430473459355066, "grad_norm": 2.004790782928467, "learning_rate": 8.86505919997657e-06, "loss": 0.2813, "step": 125000 }, { "epoch": 0.11476195353192485, "grad_norm": 1.8280359506607056, "learning_rate": 8.860482825782927e-06, "loss": 0.2794, "step": 125500 }, { "epoch": 0.11521917247029906, "grad_norm": 1.10916268825531, "learning_rate": 8.855906451589284e-06, "loss": 0.2742, "step": 126000 }, { "epoch": 0.11567639140867327, "grad_norm": 1.524181604385376, "learning_rate": 8.851330077395641e-06, "loss": 0.2778, "step": 126500 }, { "epoch": 0.11613361034704746, "grad_norm": 1.8285144567489624, "learning_rate": 8.846753703201997e-06, "loss": 0.2781, "step": 127000 }, { "epoch": 0.11659082928542167, "grad_norm": 2.387599229812622, "learning_rate": 8.842177329008356e-06, "loss": 0.2729, "step": 127500 }, { "epoch": 0.11704804822379587, "grad_norm": 1.5542514324188232, "learning_rate": 8.837600954814712e-06, "loss": 0.2745, "step": 128000 }, { "epoch": 0.11750526716217007, "grad_norm": 1.4079362154006958, "learning_rate": 8.83302458062107e-06, "loss": 0.2815, "step": 128500 }, { "epoch": 0.11796248610054427, "grad_norm": 1.8694310188293457, "learning_rate": 8.828448206427426e-06, "loss": 0.277, "step": 129000 }, { "epoch": 0.11841970503891848, "grad_norm": 1.2781902551651, "learning_rate": 8.823871832233783e-06, "loss": 0.2804, "step": 129500 }, { "epoch": 0.11887692397729267, "grad_norm": 2.4223721027374268, "learning_rate": 8.81929545804014e-06, "loss": 0.282, "step": 130000 }, { "epoch": 0.11933414291566688, "grad_norm": 1.4259532690048218, "learning_rate": 8.814719083846498e-06, "loss": 0.2765, "step": 130500 }, { "epoch": 0.11979136185404109, "grad_norm": 4.0724568367004395, "learning_rate": 8.810142709652855e-06, "loss": 0.2781, "step": 131000 }, { "epoch": 0.12024858079241528, "grad_norm": 1.7051255702972412, "learning_rate": 8.805566335459213e-06, "loss": 0.2791, "step": 131500 }, { "epoch": 0.12070579973078949, "grad_norm": 1.7078741788864136, "learning_rate": 8.80098996126557e-06, "loss": 0.2796, "step": 132000 }, { "epoch": 0.1211630186691637, "grad_norm": 1.6474307775497437, "learning_rate": 8.796413587071927e-06, "loss": 0.271, "step": 132500 }, { "epoch": 0.1216202376075379, "grad_norm": 1.9740554094314575, "learning_rate": 8.791837212878284e-06, "loss": 0.2802, "step": 133000 }, { "epoch": 0.1220774565459121, "grad_norm": 1.4887925386428833, "learning_rate": 8.787260838684642e-06, "loss": 0.2707, "step": 133500 }, { "epoch": 0.1225346754842863, "grad_norm": 1.815319538116455, "learning_rate": 8.782684464490999e-06, "loss": 0.2751, "step": 134000 }, { "epoch": 0.12299189442266051, "grad_norm": 2.604151487350464, "learning_rate": 8.778108090297355e-06, "loss": 0.2779, "step": 134500 }, { "epoch": 0.1234491133610347, "grad_norm": 1.8312991857528687, "learning_rate": 8.773531716103712e-06, "loss": 0.2757, "step": 135000 }, { "epoch": 0.12390633229940891, "grad_norm": 2.094054698944092, "learning_rate": 8.768955341910069e-06, "loss": 0.2788, "step": 135500 }, { "epoch": 0.12436355123778312, "grad_norm": 1.7696080207824707, "learning_rate": 8.764378967716426e-06, "loss": 0.2717, "step": 136000 }, { "epoch": 0.12482077017615731, "grad_norm": 1.6877754926681519, "learning_rate": 8.759802593522784e-06, "loss": 0.2712, "step": 136500 }, { "epoch": 0.1252779891145315, "grad_norm": 2.1642048358917236, "learning_rate": 8.755226219329141e-06, "loss": 0.2727, "step": 137000 }, { "epoch": 0.12573520805290572, "grad_norm": 2.3550350666046143, "learning_rate": 8.750649845135498e-06, "loss": 0.2707, "step": 137500 }, { "epoch": 0.12619242699127992, "grad_norm": 1.6955220699310303, "learning_rate": 8.746073470941855e-06, "loss": 0.2699, "step": 138000 }, { "epoch": 0.12664964592965414, "grad_norm": 1.873693823814392, "learning_rate": 8.741497096748213e-06, "loss": 0.2679, "step": 138500 }, { "epoch": 0.12710686486802833, "grad_norm": 1.5458048582077026, "learning_rate": 8.73692072255457e-06, "loss": 0.2698, "step": 139000 }, { "epoch": 0.12756408380640252, "grad_norm": 2.3633434772491455, "learning_rate": 8.732344348360926e-06, "loss": 0.2708, "step": 139500 }, { "epoch": 0.12802130274477674, "grad_norm": 1.4097380638122559, "learning_rate": 8.727767974167285e-06, "loss": 0.274, "step": 140000 }, { "epoch": 0.12847852168315094, "grad_norm": 1.7990530729293823, "learning_rate": 8.72319159997364e-06, "loss": 0.2706, "step": 140500 }, { "epoch": 0.12893574062152513, "grad_norm": 1.9841113090515137, "learning_rate": 8.718615225779999e-06, "loss": 0.2739, "step": 141000 }, { "epoch": 0.12939295955989935, "grad_norm": 1.222854495048523, "learning_rate": 8.714038851586355e-06, "loss": 0.2686, "step": 141500 }, { "epoch": 0.12985017849827354, "grad_norm": 1.891701340675354, "learning_rate": 8.709462477392712e-06, "loss": 0.2688, "step": 142000 }, { "epoch": 0.13030739743664774, "grad_norm": 1.841719627380371, "learning_rate": 8.70488610319907e-06, "loss": 0.2695, "step": 142500 }, { "epoch": 0.13076461637502196, "grad_norm": 1.5631014108657837, "learning_rate": 8.700309729005426e-06, "loss": 0.2706, "step": 143000 }, { "epoch": 0.13122183531339615, "grad_norm": 1.9422105550765991, "learning_rate": 8.695733354811784e-06, "loss": 0.269, "step": 143500 }, { "epoch": 0.13167905425177034, "grad_norm": 1.475142002105713, "learning_rate": 8.691156980618141e-06, "loss": 0.2694, "step": 144000 }, { "epoch": 0.13213627319014457, "grad_norm": 2.2062432765960693, "learning_rate": 8.686580606424498e-06, "loss": 0.2695, "step": 144500 }, { "epoch": 0.13259349212851876, "grad_norm": 1.754489541053772, "learning_rate": 8.682004232230856e-06, "loss": 0.2743, "step": 145000 }, { "epoch": 0.13305071106689295, "grad_norm": 1.6598039865493774, "learning_rate": 8.677427858037211e-06, "loss": 0.269, "step": 145500 }, { "epoch": 0.13350793000526717, "grad_norm": 1.045148253440857, "learning_rate": 8.67285148384357e-06, "loss": 0.2662, "step": 146000 }, { "epoch": 0.13396514894364137, "grad_norm": 1.2887623310089111, "learning_rate": 8.668275109649926e-06, "loss": 0.2735, "step": 146500 }, { "epoch": 0.13442236788201556, "grad_norm": 1.5989199876785278, "learning_rate": 8.663698735456285e-06, "loss": 0.2688, "step": 147000 }, { "epoch": 0.13487958682038978, "grad_norm": 1.9200626611709595, "learning_rate": 8.65912236126264e-06, "loss": 0.2712, "step": 147500 }, { "epoch": 0.13533680575876397, "grad_norm": 1.7635419368743896, "learning_rate": 8.654545987068998e-06, "loss": 0.2672, "step": 148000 }, { "epoch": 0.13579402469713817, "grad_norm": 1.6450468301773071, "learning_rate": 8.649969612875355e-06, "loss": 0.2656, "step": 148500 }, { "epoch": 0.1362512436355124, "grad_norm": 2.2584726810455322, "learning_rate": 8.645393238681712e-06, "loss": 0.2677, "step": 149000 }, { "epoch": 0.13670846257388658, "grad_norm": 1.372758388519287, "learning_rate": 8.64081686448807e-06, "loss": 0.2726, "step": 149500 }, { "epoch": 0.13716568151226077, "grad_norm": 1.8561943769454956, "learning_rate": 8.636240490294427e-06, "loss": 0.2721, "step": 150000 }, { "epoch": 0.137622900450635, "grad_norm": 1.548618197441101, "learning_rate": 8.631664116100782e-06, "loss": 0.2676, "step": 150500 }, { "epoch": 0.1380801193890092, "grad_norm": 1.3110601902008057, "learning_rate": 8.627087741907141e-06, "loss": 0.2661, "step": 151000 }, { "epoch": 0.13853733832738338, "grad_norm": 1.4244693517684937, "learning_rate": 8.622511367713497e-06, "loss": 0.2712, "step": 151500 }, { "epoch": 0.1389945572657576, "grad_norm": 2.187041759490967, "learning_rate": 8.617934993519856e-06, "loss": 0.2679, "step": 152000 }, { "epoch": 0.1394517762041318, "grad_norm": 1.7944238185882568, "learning_rate": 8.613358619326211e-06, "loss": 0.2682, "step": 152500 }, { "epoch": 0.139908995142506, "grad_norm": 1.7159152030944824, "learning_rate": 8.608782245132569e-06, "loss": 0.2689, "step": 153000 }, { "epoch": 0.1403662140808802, "grad_norm": 1.8711001873016357, "learning_rate": 8.604205870938926e-06, "loss": 0.2685, "step": 153500 }, { "epoch": 0.1408234330192544, "grad_norm": 1.7059112787246704, "learning_rate": 8.599629496745283e-06, "loss": 0.2695, "step": 154000 }, { "epoch": 0.1412806519576286, "grad_norm": 1.720859408378601, "learning_rate": 8.59505312255164e-06, "loss": 0.2703, "step": 154500 }, { "epoch": 0.14173787089600282, "grad_norm": 1.665474772453308, "learning_rate": 8.590476748357998e-06, "loss": 0.269, "step": 155000 }, { "epoch": 0.142195089834377, "grad_norm": 1.6061115264892578, "learning_rate": 8.585900374164355e-06, "loss": 0.2659, "step": 155500 }, { "epoch": 0.1426523087727512, "grad_norm": 1.6262190341949463, "learning_rate": 8.581323999970712e-06, "loss": 0.2652, "step": 156000 }, { "epoch": 0.14310952771112542, "grad_norm": 1.9662021398544312, "learning_rate": 8.57674762577707e-06, "loss": 0.2659, "step": 156500 }, { "epoch": 0.14356674664949962, "grad_norm": 1.2154645919799805, "learning_rate": 8.572171251583427e-06, "loss": 0.2686, "step": 157000 }, { "epoch": 0.1440239655878738, "grad_norm": 1.8387107849121094, "learning_rate": 8.567594877389782e-06, "loss": 0.2668, "step": 157500 }, { "epoch": 0.14448118452624803, "grad_norm": 1.4331964254379272, "learning_rate": 8.56301850319614e-06, "loss": 0.2635, "step": 158000 }, { "epoch": 0.14493840346462222, "grad_norm": 1.503548502922058, "learning_rate": 8.558442129002497e-06, "loss": 0.2681, "step": 158500 }, { "epoch": 0.14539562240299644, "grad_norm": 2.2931318283081055, "learning_rate": 8.553865754808854e-06, "loss": 0.2657, "step": 159000 }, { "epoch": 0.14585284134137064, "grad_norm": 1.415092945098877, "learning_rate": 8.549289380615212e-06, "loss": 0.2563, "step": 159500 }, { "epoch": 0.14631006027974483, "grad_norm": 1.3481783866882324, "learning_rate": 8.544713006421569e-06, "loss": 0.2615, "step": 160000 }, { "epoch": 0.14676727921811905, "grad_norm": 2.6668007373809814, "learning_rate": 8.540136632227926e-06, "loss": 0.2689, "step": 160500 }, { "epoch": 0.14722449815649324, "grad_norm": 1.9730263948440552, "learning_rate": 8.535560258034283e-06, "loss": 0.2625, "step": 161000 }, { "epoch": 0.14768171709486744, "grad_norm": 1.5329406261444092, "learning_rate": 8.53098388384064e-06, "loss": 0.2583, "step": 161500 }, { "epoch": 0.14813893603324166, "grad_norm": 1.8120336532592773, "learning_rate": 8.526407509646998e-06, "loss": 0.2626, "step": 162000 }, { "epoch": 0.14859615497161585, "grad_norm": 1.5694791078567505, "learning_rate": 8.521831135453355e-06, "loss": 0.2638, "step": 162500 }, { "epoch": 0.14905337390999004, "grad_norm": 1.6131516695022583, "learning_rate": 8.517254761259712e-06, "loss": 0.2616, "step": 163000 }, { "epoch": 0.14951059284836427, "grad_norm": 1.7939931154251099, "learning_rate": 8.51267838706607e-06, "loss": 0.2632, "step": 163500 }, { "epoch": 0.14996781178673846, "grad_norm": 1.0342079401016235, "learning_rate": 8.508102012872425e-06, "loss": 0.2646, "step": 164000 }, { "epoch": 0.15042503072511265, "grad_norm": 1.1683495044708252, "learning_rate": 8.503525638678784e-06, "loss": 0.2607, "step": 164500 }, { "epoch": 0.15088224966348687, "grad_norm": 1.189745306968689, "learning_rate": 8.49894926448514e-06, "loss": 0.2643, "step": 165000 }, { "epoch": 0.15133946860186107, "grad_norm": 1.996500015258789, "learning_rate": 8.494372890291497e-06, "loss": 0.2603, "step": 165500 }, { "epoch": 0.15179668754023526, "grad_norm": 1.9063647985458374, "learning_rate": 8.489796516097854e-06, "loss": 0.2697, "step": 166000 }, { "epoch": 0.15225390647860948, "grad_norm": 1.3559688329696655, "learning_rate": 8.485220141904212e-06, "loss": 0.2626, "step": 166500 }, { "epoch": 0.15271112541698367, "grad_norm": 1.9531289339065552, "learning_rate": 8.480643767710569e-06, "loss": 0.2557, "step": 167000 }, { "epoch": 0.15316834435535787, "grad_norm": 1.3879919052124023, "learning_rate": 8.476067393516926e-06, "loss": 0.258, "step": 167500 }, { "epoch": 0.1536255632937321, "grad_norm": 1.7489395141601562, "learning_rate": 8.471491019323284e-06, "loss": 0.2579, "step": 168000 }, { "epoch": 0.15408278223210628, "grad_norm": 1.183287501335144, "learning_rate": 8.46691464512964e-06, "loss": 0.263, "step": 168500 }, { "epoch": 0.15454000117048047, "grad_norm": 1.538761019706726, "learning_rate": 8.462338270935996e-06, "loss": 0.2596, "step": 169000 }, { "epoch": 0.1549972201088547, "grad_norm": 1.6584478616714478, "learning_rate": 8.457761896742355e-06, "loss": 0.2594, "step": 169500 }, { "epoch": 0.1554544390472289, "grad_norm": 1.4705157279968262, "learning_rate": 8.453185522548711e-06, "loss": 0.2537, "step": 170000 }, { "epoch": 0.15591165798560308, "grad_norm": 2.3619368076324463, "learning_rate": 8.44860914835507e-06, "loss": 0.2595, "step": 170500 }, { "epoch": 0.1563688769239773, "grad_norm": 1.5578237771987915, "learning_rate": 8.444032774161425e-06, "loss": 0.2611, "step": 171000 }, { "epoch": 0.1568260958623515, "grad_norm": 1.4956451654434204, "learning_rate": 8.439456399967783e-06, "loss": 0.2661, "step": 171500 }, { "epoch": 0.1572833148007257, "grad_norm": 1.7658261060714722, "learning_rate": 8.43488002577414e-06, "loss": 0.2618, "step": 172000 }, { "epoch": 0.1577405337390999, "grad_norm": 1.9475387334823608, "learning_rate": 8.430303651580497e-06, "loss": 0.2584, "step": 172500 }, { "epoch": 0.1581977526774741, "grad_norm": 1.3033366203308105, "learning_rate": 8.425727277386855e-06, "loss": 0.2619, "step": 173000 }, { "epoch": 0.1586549716158483, "grad_norm": 1.1210391521453857, "learning_rate": 8.421150903193212e-06, "loss": 0.2598, "step": 173500 }, { "epoch": 0.15911219055422252, "grad_norm": 2.0735795497894287, "learning_rate": 8.416574528999567e-06, "loss": 0.259, "step": 174000 }, { "epoch": 0.1595694094925967, "grad_norm": 1.4574170112609863, "learning_rate": 8.411998154805926e-06, "loss": 0.2606, "step": 174500 }, { "epoch": 0.1600266284309709, "grad_norm": 1.5683772563934326, "learning_rate": 8.407421780612282e-06, "loss": 0.2609, "step": 175000 }, { "epoch": 0.16048384736934512, "grad_norm": 1.9865988492965698, "learning_rate": 8.402845406418641e-06, "loss": 0.2613, "step": 175500 }, { "epoch": 0.16094106630771932, "grad_norm": 1.9525185823440552, "learning_rate": 8.398269032224997e-06, "loss": 0.2546, "step": 176000 }, { "epoch": 0.1613982852460935, "grad_norm": 1.6674350500106812, "learning_rate": 8.393692658031354e-06, "loss": 0.256, "step": 176500 }, { "epoch": 0.16185550418446773, "grad_norm": 2.0394787788391113, "learning_rate": 8.389116283837711e-06, "loss": 0.2629, "step": 177000 }, { "epoch": 0.16231272312284192, "grad_norm": 2.1897048950195312, "learning_rate": 8.384539909644068e-06, "loss": 0.2559, "step": 177500 }, { "epoch": 0.16276994206121612, "grad_norm": 1.0547202825546265, "learning_rate": 8.379963535450426e-06, "loss": 0.2593, "step": 178000 }, { "epoch": 0.16322716099959034, "grad_norm": 1.8409370183944702, "learning_rate": 8.375387161256783e-06, "loss": 0.2621, "step": 178500 }, { "epoch": 0.16368437993796453, "grad_norm": 1.753064513206482, "learning_rate": 8.37081078706314e-06, "loss": 0.2597, "step": 179000 }, { "epoch": 0.16414159887633872, "grad_norm": 1.866620421409607, "learning_rate": 8.366234412869497e-06, "loss": 0.2602, "step": 179500 }, { "epoch": 0.16459881781471294, "grad_norm": 1.6045613288879395, "learning_rate": 8.361658038675855e-06, "loss": 0.2585, "step": 180000 }, { "epoch": 0.16505603675308714, "grad_norm": 1.262148380279541, "learning_rate": 8.357081664482212e-06, "loss": 0.2605, "step": 180500 }, { "epoch": 0.16551325569146136, "grad_norm": 1.3324670791625977, "learning_rate": 8.352505290288568e-06, "loss": 0.259, "step": 181000 }, { "epoch": 0.16597047462983555, "grad_norm": 1.5552209615707397, "learning_rate": 8.347928916094927e-06, "loss": 0.2536, "step": 181500 }, { "epoch": 0.16642769356820974, "grad_norm": 2.5258872509002686, "learning_rate": 8.343352541901282e-06, "loss": 0.256, "step": 182000 }, { "epoch": 0.16688491250658397, "grad_norm": 1.462498664855957, "learning_rate": 8.33877616770764e-06, "loss": 0.2574, "step": 182500 }, { "epoch": 0.16734213144495816, "grad_norm": 1.5125452280044556, "learning_rate": 8.334199793513997e-06, "loss": 0.2567, "step": 183000 }, { "epoch": 0.16779935038333235, "grad_norm": 1.6528276205062866, "learning_rate": 8.329623419320354e-06, "loss": 0.2674, "step": 183500 }, { "epoch": 0.16825656932170657, "grad_norm": 1.1524349451065063, "learning_rate": 8.325047045126711e-06, "loss": 0.257, "step": 184000 }, { "epoch": 0.16871378826008077, "grad_norm": 1.5361084938049316, "learning_rate": 8.320470670933069e-06, "loss": 0.2617, "step": 184500 }, { "epoch": 0.16917100719845496, "grad_norm": 1.7371759414672852, "learning_rate": 8.315894296739426e-06, "loss": 0.257, "step": 185000 }, { "epoch": 0.16962822613682918, "grad_norm": 2.3449254035949707, "learning_rate": 8.311317922545783e-06, "loss": 0.2527, "step": 185500 }, { "epoch": 0.17008544507520337, "grad_norm": 1.259590983390808, "learning_rate": 8.30674154835214e-06, "loss": 0.2518, "step": 186000 }, { "epoch": 0.17054266401357757, "grad_norm": 1.6850295066833496, "learning_rate": 8.302165174158498e-06, "loss": 0.2545, "step": 186500 }, { "epoch": 0.1709998829519518, "grad_norm": 1.8006367683410645, "learning_rate": 8.297588799964855e-06, "loss": 0.2569, "step": 187000 }, { "epoch": 0.17145710189032598, "grad_norm": 1.3569294214248657, "learning_rate": 8.29301242577121e-06, "loss": 0.2554, "step": 187500 }, { "epoch": 0.17191432082870017, "grad_norm": 1.310188889503479, "learning_rate": 8.288436051577568e-06, "loss": 0.2593, "step": 188000 }, { "epoch": 0.1723715397670744, "grad_norm": 1.741705298423767, "learning_rate": 8.283859677383925e-06, "loss": 0.252, "step": 188500 }, { "epoch": 0.1728287587054486, "grad_norm": 1.834928035736084, "learning_rate": 8.279283303190282e-06, "loss": 0.2516, "step": 189000 }, { "epoch": 0.17328597764382278, "grad_norm": 1.4775325059890747, "learning_rate": 8.27470692899664e-06, "loss": 0.2567, "step": 189500 }, { "epoch": 0.173743196582197, "grad_norm": 1.818657398223877, "learning_rate": 8.270130554802997e-06, "loss": 0.26, "step": 190000 }, { "epoch": 0.1742004155205712, "grad_norm": 1.9210857152938843, "learning_rate": 8.265554180609354e-06, "loss": 0.2496, "step": 190500 }, { "epoch": 0.1746576344589454, "grad_norm": 1.676413893699646, "learning_rate": 8.260977806415711e-06, "loss": 0.2539, "step": 191000 }, { "epoch": 0.1751148533973196, "grad_norm": 2.254531145095825, "learning_rate": 8.256401432222069e-06, "loss": 0.2552, "step": 191500 }, { "epoch": 0.1755720723356938, "grad_norm": 1.4928869009017944, "learning_rate": 8.251825058028426e-06, "loss": 0.2565, "step": 192000 }, { "epoch": 0.176029291274068, "grad_norm": 1.4001063108444214, "learning_rate": 8.247248683834782e-06, "loss": 0.2547, "step": 192500 }, { "epoch": 0.17648651021244222, "grad_norm": 1.8143495321273804, "learning_rate": 8.24267230964114e-06, "loss": 0.2563, "step": 193000 }, { "epoch": 0.1769437291508164, "grad_norm": 1.865336537361145, "learning_rate": 8.238095935447496e-06, "loss": 0.2568, "step": 193500 }, { "epoch": 0.1774009480891906, "grad_norm": 1.7321306467056274, "learning_rate": 8.233519561253855e-06, "loss": 0.2561, "step": 194000 }, { "epoch": 0.17785816702756482, "grad_norm": 1.6060725450515747, "learning_rate": 8.22894318706021e-06, "loss": 0.252, "step": 194500 }, { "epoch": 0.17831538596593902, "grad_norm": 1.4754799604415894, "learning_rate": 8.22436681286657e-06, "loss": 0.247, "step": 195000 }, { "epoch": 0.1787726049043132, "grad_norm": 1.8268160820007324, "learning_rate": 8.219790438672925e-06, "loss": 0.2558, "step": 195500 }, { "epoch": 0.17922982384268743, "grad_norm": 1.5629231929779053, "learning_rate": 8.215214064479282e-06, "loss": 0.2578, "step": 196000 }, { "epoch": 0.17968704278106162, "grad_norm": 1.7426457405090332, "learning_rate": 8.21063769028564e-06, "loss": 0.2569, "step": 196500 }, { "epoch": 0.18014426171943582, "grad_norm": 1.6766743659973145, "learning_rate": 8.206061316091997e-06, "loss": 0.2528, "step": 197000 }, { "epoch": 0.18060148065781004, "grad_norm": 1.3292638063430786, "learning_rate": 8.201484941898354e-06, "loss": 0.2485, "step": 197500 }, { "epoch": 0.18105869959618423, "grad_norm": 2.073800563812256, "learning_rate": 8.196908567704712e-06, "loss": 0.2538, "step": 198000 }, { "epoch": 0.18151591853455842, "grad_norm": 1.4113343954086304, "learning_rate": 8.192332193511067e-06, "loss": 0.2536, "step": 198500 }, { "epoch": 0.18197313747293264, "grad_norm": 2.1124043464660645, "learning_rate": 8.187755819317426e-06, "loss": 0.2564, "step": 199000 }, { "epoch": 0.18243035641130684, "grad_norm": 1.423259973526001, "learning_rate": 8.183179445123782e-06, "loss": 0.2553, "step": 199500 }, { "epoch": 0.18288757534968103, "grad_norm": 1.9814764261245728, "learning_rate": 8.17860307093014e-06, "loss": 0.2521, "step": 200000 }, { "epoch": 0.18334479428805525, "grad_norm": 1.2298426628112793, "learning_rate": 8.174026696736496e-06, "loss": 0.2539, "step": 200500 }, { "epoch": 0.18380201322642944, "grad_norm": 1.2353808879852295, "learning_rate": 8.169450322542854e-06, "loss": 0.2589, "step": 201000 }, { "epoch": 0.18425923216480367, "grad_norm": 1.585706114768982, "learning_rate": 8.16487394834921e-06, "loss": 0.2535, "step": 201500 }, { "epoch": 0.18471645110317786, "grad_norm": 1.6619884967803955, "learning_rate": 8.160297574155568e-06, "loss": 0.2523, "step": 202000 }, { "epoch": 0.18517367004155205, "grad_norm": 1.504461407661438, "learning_rate": 8.155721199961925e-06, "loss": 0.2508, "step": 202500 }, { "epoch": 0.18563088897992627, "grad_norm": 1.1175047159194946, "learning_rate": 8.151144825768283e-06, "loss": 0.2554, "step": 203000 }, { "epoch": 0.18608810791830047, "grad_norm": 1.6364964246749878, "learning_rate": 8.146568451574638e-06, "loss": 0.2573, "step": 203500 }, { "epoch": 0.18654532685667466, "grad_norm": 1.436776876449585, "learning_rate": 8.141992077380997e-06, "loss": 0.2527, "step": 204000 }, { "epoch": 0.18700254579504888, "grad_norm": 1.684793472290039, "learning_rate": 8.137415703187353e-06, "loss": 0.2556, "step": 204500 }, { "epoch": 0.18745976473342307, "grad_norm": 2.135289430618286, "learning_rate": 8.132839328993712e-06, "loss": 0.2536, "step": 205000 }, { "epoch": 0.18791698367179727, "grad_norm": 1.6975624561309814, "learning_rate": 8.128262954800067e-06, "loss": 0.2452, "step": 205500 }, { "epoch": 0.1883742026101715, "grad_norm": 1.3779131174087524, "learning_rate": 8.123686580606425e-06, "loss": 0.2519, "step": 206000 }, { "epoch": 0.18883142154854568, "grad_norm": 2.1386914253234863, "learning_rate": 8.119110206412782e-06, "loss": 0.2521, "step": 206500 }, { "epoch": 0.18928864048691987, "grad_norm": 2.1056151390075684, "learning_rate": 8.11453383221914e-06, "loss": 0.2519, "step": 207000 }, { "epoch": 0.1897458594252941, "grad_norm": 1.797166109085083, "learning_rate": 8.109957458025496e-06, "loss": 0.2498, "step": 207500 }, { "epoch": 0.1902030783636683, "grad_norm": 1.8904006481170654, "learning_rate": 8.105381083831854e-06, "loss": 0.2537, "step": 208000 }, { "epoch": 0.19066029730204248, "grad_norm": 2.1598122119903564, "learning_rate": 8.100804709638211e-06, "loss": 0.2539, "step": 208500 }, { "epoch": 0.1911175162404167, "grad_norm": 1.544722318649292, "learning_rate": 8.096228335444568e-06, "loss": 0.2486, "step": 209000 }, { "epoch": 0.1915747351787909, "grad_norm": 1.8575553894042969, "learning_rate": 8.091651961250926e-06, "loss": 0.2531, "step": 209500 }, { "epoch": 0.1920319541171651, "grad_norm": 0.9131256341934204, "learning_rate": 8.087075587057283e-06, "loss": 0.2485, "step": 210000 }, { "epoch": 0.1924891730555393, "grad_norm": 2.0034356117248535, "learning_rate": 8.08249921286364e-06, "loss": 0.2522, "step": 210500 }, { "epoch": 0.1929463919939135, "grad_norm": 1.5028212070465088, "learning_rate": 8.077922838669996e-06, "loss": 0.2462, "step": 211000 }, { "epoch": 0.1934036109322877, "grad_norm": 1.4713739156723022, "learning_rate": 8.073346464476353e-06, "loss": 0.2483, "step": 211500 }, { "epoch": 0.19386082987066192, "grad_norm": 1.6516448259353638, "learning_rate": 8.06877009028271e-06, "loss": 0.2446, "step": 212000 }, { "epoch": 0.1943180488090361, "grad_norm": 1.0185027122497559, "learning_rate": 8.064193716089068e-06, "loss": 0.2465, "step": 212500 }, { "epoch": 0.1947752677474103, "grad_norm": 1.6575361490249634, "learning_rate": 8.059617341895425e-06, "loss": 0.248, "step": 213000 }, { "epoch": 0.19523248668578452, "grad_norm": 1.0781890153884888, "learning_rate": 8.055040967701782e-06, "loss": 0.2543, "step": 213500 }, { "epoch": 0.19568970562415872, "grad_norm": 1.0661412477493286, "learning_rate": 8.05046459350814e-06, "loss": 0.2482, "step": 214000 }, { "epoch": 0.1961469245625329, "grad_norm": 2.0978198051452637, "learning_rate": 8.045888219314497e-06, "loss": 0.2479, "step": 214500 }, { "epoch": 0.19660414350090713, "grad_norm": 1.5128875970840454, "learning_rate": 8.041311845120854e-06, "loss": 0.2482, "step": 215000 }, { "epoch": 0.19706136243928132, "grad_norm": 1.4031188488006592, "learning_rate": 8.036735470927211e-06, "loss": 0.2505, "step": 215500 }, { "epoch": 0.19751858137765552, "grad_norm": 1.6590416431427002, "learning_rate": 8.032159096733568e-06, "loss": 0.2487, "step": 216000 }, { "epoch": 0.19797580031602974, "grad_norm": 1.5777417421340942, "learning_rate": 8.027582722539926e-06, "loss": 0.2464, "step": 216500 }, { "epoch": 0.19843301925440393, "grad_norm": 1.3186599016189575, "learning_rate": 8.023006348346281e-06, "loss": 0.2469, "step": 217000 }, { "epoch": 0.19889023819277812, "grad_norm": 1.8318928480148315, "learning_rate": 8.01842997415264e-06, "loss": 0.2418, "step": 217500 }, { "epoch": 0.19934745713115234, "grad_norm": 1.4368090629577637, "learning_rate": 8.013853599958996e-06, "loss": 0.2483, "step": 218000 }, { "epoch": 0.19980467606952654, "grad_norm": 1.7631844282150269, "learning_rate": 8.009277225765353e-06, "loss": 0.2517, "step": 218500 }, { "epoch": 0.20026189500790073, "grad_norm": 1.421195387840271, "learning_rate": 8.00470085157171e-06, "loss": 0.2506, "step": 219000 }, { "epoch": 0.20071911394627495, "grad_norm": 2.1690146923065186, "learning_rate": 8.000124477378068e-06, "loss": 0.2459, "step": 219500 }, { "epoch": 0.20117633288464914, "grad_norm": 1.6307331323623657, "learning_rate": 7.995548103184425e-06, "loss": 0.2499, "step": 220000 }, { "epoch": 0.20163355182302334, "grad_norm": 1.4969900846481323, "learning_rate": 7.990971728990782e-06, "loss": 0.2504, "step": 220500 }, { "epoch": 0.20209077076139756, "grad_norm": 1.8687270879745483, "learning_rate": 7.98639535479714e-06, "loss": 0.2429, "step": 221000 }, { "epoch": 0.20254798969977175, "grad_norm": 1.7077059745788574, "learning_rate": 7.981818980603497e-06, "loss": 0.2428, "step": 221500 }, { "epoch": 0.20300520863814595, "grad_norm": 2.0460216999053955, "learning_rate": 7.977242606409852e-06, "loss": 0.2521, "step": 222000 }, { "epoch": 0.20346242757652017, "grad_norm": 1.2996711730957031, "learning_rate": 7.972666232216211e-06, "loss": 0.2484, "step": 222500 }, { "epoch": 0.20391964651489436, "grad_norm": 1.2837764024734497, "learning_rate": 7.968089858022567e-06, "loss": 0.2473, "step": 223000 }, { "epoch": 0.20437686545326858, "grad_norm": 1.495692253112793, "learning_rate": 7.963513483828926e-06, "loss": 0.2557, "step": 223500 }, { "epoch": 0.20483408439164277, "grad_norm": 1.4509683847427368, "learning_rate": 7.958937109635281e-06, "loss": 0.2475, "step": 224000 }, { "epoch": 0.20529130333001697, "grad_norm": 1.1807700395584106, "learning_rate": 7.954360735441639e-06, "loss": 0.2467, "step": 224500 }, { "epoch": 0.2057485222683912, "grad_norm": 3.423560619354248, "learning_rate": 7.949784361247996e-06, "loss": 0.2477, "step": 225000 }, { "epoch": 0.20620574120676538, "grad_norm": 1.9667267799377441, "learning_rate": 7.945207987054353e-06, "loss": 0.2473, "step": 225500 }, { "epoch": 0.20666296014513957, "grad_norm": 1.695909023284912, "learning_rate": 7.94063161286071e-06, "loss": 0.2485, "step": 226000 }, { "epoch": 0.2071201790835138, "grad_norm": 1.5767865180969238, "learning_rate": 7.936055238667068e-06, "loss": 0.2462, "step": 226500 }, { "epoch": 0.207577398021888, "grad_norm": 1.427411675453186, "learning_rate": 7.931478864473423e-06, "loss": 0.2495, "step": 227000 }, { "epoch": 0.20803461696026218, "grad_norm": 1.1181446313858032, "learning_rate": 7.926902490279782e-06, "loss": 0.2444, "step": 227500 }, { "epoch": 0.2084918358986364, "grad_norm": 1.3804079294204712, "learning_rate": 7.922326116086138e-06, "loss": 0.2459, "step": 228000 }, { "epoch": 0.2089490548370106, "grad_norm": 1.2145448923110962, "learning_rate": 7.917749741892497e-06, "loss": 0.2458, "step": 228500 }, { "epoch": 0.2094062737753848, "grad_norm": 1.2149016857147217, "learning_rate": 7.913173367698853e-06, "loss": 0.2392, "step": 229000 }, { "epoch": 0.209863492713759, "grad_norm": 1.4271708726882935, "learning_rate": 7.908596993505211e-06, "loss": 0.2439, "step": 229500 }, { "epoch": 0.2103207116521332, "grad_norm": 1.336596965789795, "learning_rate": 7.904020619311567e-06, "loss": 0.2481, "step": 230000 }, { "epoch": 0.2107779305905074, "grad_norm": 1.6744037866592407, "learning_rate": 7.899444245117924e-06, "loss": 0.2442, "step": 230500 }, { "epoch": 0.21123514952888162, "grad_norm": 1.5563931465148926, "learning_rate": 7.894867870924282e-06, "loss": 0.2498, "step": 231000 }, { "epoch": 0.2116923684672558, "grad_norm": 1.8821616172790527, "learning_rate": 7.890291496730639e-06, "loss": 0.2443, "step": 231500 }, { "epoch": 0.21214958740563, "grad_norm": 2.037843704223633, "learning_rate": 7.885715122536996e-06, "loss": 0.2434, "step": 232000 }, { "epoch": 0.21260680634400422, "grad_norm": 1.0804463624954224, "learning_rate": 7.881138748343353e-06, "loss": 0.2509, "step": 232500 }, { "epoch": 0.21306402528237842, "grad_norm": 1.5283472537994385, "learning_rate": 7.87656237414971e-06, "loss": 0.2436, "step": 233000 }, { "epoch": 0.2135212442207526, "grad_norm": 1.7273632287979126, "learning_rate": 7.871985999956068e-06, "loss": 0.2477, "step": 233500 }, { "epoch": 0.21397846315912683, "grad_norm": 1.5856326818466187, "learning_rate": 7.867409625762424e-06, "loss": 0.2406, "step": 234000 }, { "epoch": 0.21443568209750102, "grad_norm": 1.1935285329818726, "learning_rate": 7.862833251568783e-06, "loss": 0.2474, "step": 234500 }, { "epoch": 0.21489290103587522, "grad_norm": 1.7221565246582031, "learning_rate": 7.858256877375138e-06, "loss": 0.2411, "step": 235000 }, { "epoch": 0.21535011997424944, "grad_norm": 1.7638108730316162, "learning_rate": 7.853680503181495e-06, "loss": 0.2487, "step": 235500 }, { "epoch": 0.21580733891262363, "grad_norm": 1.392970085144043, "learning_rate": 7.849104128987853e-06, "loss": 0.2475, "step": 236000 }, { "epoch": 0.21626455785099782, "grad_norm": 1.30288565158844, "learning_rate": 7.84452775479421e-06, "loss": 0.2485, "step": 236500 }, { "epoch": 0.21672177678937204, "grad_norm": 1.2558834552764893, "learning_rate": 7.839951380600567e-06, "loss": 0.246, "step": 237000 }, { "epoch": 0.21717899572774624, "grad_norm": 9.420547485351562, "learning_rate": 7.835375006406925e-06, "loss": 0.2465, "step": 237500 }, { "epoch": 0.21763621466612043, "grad_norm": 1.3113701343536377, "learning_rate": 7.830798632213282e-06, "loss": 0.2479, "step": 238000 }, { "epoch": 0.21809343360449465, "grad_norm": 1.3305801153182983, "learning_rate": 7.826222258019639e-06, "loss": 0.2454, "step": 238500 }, { "epoch": 0.21855065254286885, "grad_norm": 1.7414227724075317, "learning_rate": 7.821645883825996e-06, "loss": 0.2419, "step": 239000 }, { "epoch": 0.21900787148124304, "grad_norm": 2.2423360347747803, "learning_rate": 7.817069509632354e-06, "loss": 0.245, "step": 239500 }, { "epoch": 0.21946509041961726, "grad_norm": 1.4997841119766235, "learning_rate": 7.812493135438711e-06, "loss": 0.2454, "step": 240000 }, { "epoch": 0.21992230935799145, "grad_norm": 1.442734718322754, "learning_rate": 7.807916761245066e-06, "loss": 0.2411, "step": 240500 }, { "epoch": 0.22037952829636565, "grad_norm": 1.715790033340454, "learning_rate": 7.803340387051424e-06, "loss": 0.2453, "step": 241000 }, { "epoch": 0.22083674723473987, "grad_norm": 1.3321577310562134, "learning_rate": 7.798764012857781e-06, "loss": 0.2493, "step": 241500 }, { "epoch": 0.22129396617311406, "grad_norm": 1.7420936822891235, "learning_rate": 7.794187638664138e-06, "loss": 0.2388, "step": 242000 }, { "epoch": 0.22175118511148825, "grad_norm": 1.81510329246521, "learning_rate": 7.789611264470496e-06, "loss": 0.2473, "step": 242500 }, { "epoch": 0.22220840404986247, "grad_norm": 1.5320991277694702, "learning_rate": 7.785034890276853e-06, "loss": 0.245, "step": 243000 }, { "epoch": 0.22266562298823667, "grad_norm": 1.9116175174713135, "learning_rate": 7.78045851608321e-06, "loss": 0.2387, "step": 243500 }, { "epoch": 0.2231228419266109, "grad_norm": 1.2568988800048828, "learning_rate": 7.775882141889567e-06, "loss": 0.2426, "step": 244000 }, { "epoch": 0.22358006086498508, "grad_norm": 1.2286899089813232, "learning_rate": 7.771305767695925e-06, "loss": 0.237, "step": 244500 }, { "epoch": 0.22403727980335927, "grad_norm": 1.5561753511428833, "learning_rate": 7.766729393502282e-06, "loss": 0.241, "step": 245000 }, { "epoch": 0.2244944987417335, "grad_norm": 1.5937217473983765, "learning_rate": 7.76215301930864e-06, "loss": 0.2478, "step": 245500 }, { "epoch": 0.2249517176801077, "grad_norm": 1.533897042274475, "learning_rate": 7.757576645114997e-06, "loss": 0.2402, "step": 246000 }, { "epoch": 0.22540893661848188, "grad_norm": 1.7771514654159546, "learning_rate": 7.753000270921352e-06, "loss": 0.2472, "step": 246500 }, { "epoch": 0.2258661555568561, "grad_norm": 1.8437062501907349, "learning_rate": 7.748423896727711e-06, "loss": 0.2441, "step": 247000 }, { "epoch": 0.2263233744952303, "grad_norm": 0.9731696844100952, "learning_rate": 7.743847522534067e-06, "loss": 0.2422, "step": 247500 }, { "epoch": 0.2267805934336045, "grad_norm": 1.3486838340759277, "learning_rate": 7.739271148340426e-06, "loss": 0.2404, "step": 248000 }, { "epoch": 0.2272378123719787, "grad_norm": 1.1618529558181763, "learning_rate": 7.734694774146781e-06, "loss": 0.2431, "step": 248500 }, { "epoch": 0.2276950313103529, "grad_norm": 1.7412848472595215, "learning_rate": 7.730118399953138e-06, "loss": 0.2398, "step": 249000 }, { "epoch": 0.2281522502487271, "grad_norm": 1.6766868829727173, "learning_rate": 7.725542025759496e-06, "loss": 0.2409, "step": 249500 }, { "epoch": 0.22860946918710132, "grad_norm": 1.1682603359222412, "learning_rate": 7.720965651565853e-06, "loss": 0.2429, "step": 250000 }, { "epoch": 0.2290666881254755, "grad_norm": 1.1686651706695557, "learning_rate": 7.71638927737221e-06, "loss": 0.247, "step": 250500 }, { "epoch": 0.2295239070638497, "grad_norm": 1.0763442516326904, "learning_rate": 7.711812903178568e-06, "loss": 0.2405, "step": 251000 }, { "epoch": 0.22998112600222392, "grad_norm": 1.3527404069900513, "learning_rate": 7.707236528984923e-06, "loss": 0.2397, "step": 251500 }, { "epoch": 0.23043834494059812, "grad_norm": 1.5660016536712646, "learning_rate": 7.702660154791282e-06, "loss": 0.2465, "step": 252000 }, { "epoch": 0.2308955638789723, "grad_norm": 1.876938819885254, "learning_rate": 7.698083780597638e-06, "loss": 0.2413, "step": 252500 }, { "epoch": 0.23135278281734653, "grad_norm": 1.446905255317688, "learning_rate": 7.693507406403997e-06, "loss": 0.2422, "step": 253000 }, { "epoch": 0.23181000175572072, "grad_norm": 1.2305630445480347, "learning_rate": 7.688931032210352e-06, "loss": 0.2435, "step": 253500 }, { "epoch": 0.23226722069409492, "grad_norm": 1.6017937660217285, "learning_rate": 7.68435465801671e-06, "loss": 0.2375, "step": 254000 }, { "epoch": 0.23272443963246914, "grad_norm": 1.593798041343689, "learning_rate": 7.679778283823067e-06, "loss": 0.2413, "step": 254500 }, { "epoch": 0.23318165857084333, "grad_norm": 1.7218447923660278, "learning_rate": 7.675201909629424e-06, "loss": 0.2406, "step": 255000 }, { "epoch": 0.23363887750921752, "grad_norm": 1.631316900253296, "learning_rate": 7.670625535435781e-06, "loss": 0.2391, "step": 255500 }, { "epoch": 0.23409609644759174, "grad_norm": 1.3699698448181152, "learning_rate": 7.666049161242139e-06, "loss": 0.2406, "step": 256000 }, { "epoch": 0.23455331538596594, "grad_norm": 1.853630542755127, "learning_rate": 7.661472787048494e-06, "loss": 0.2396, "step": 256500 }, { "epoch": 0.23501053432434013, "grad_norm": 1.54131019115448, "learning_rate": 7.656896412854853e-06, "loss": 0.2435, "step": 257000 }, { "epoch": 0.23546775326271435, "grad_norm": 1.9329149723052979, "learning_rate": 7.652320038661209e-06, "loss": 0.2384, "step": 257500 }, { "epoch": 0.23592497220108855, "grad_norm": 1.2017878293991089, "learning_rate": 7.647743664467568e-06, "loss": 0.2427, "step": 258000 }, { "epoch": 0.23638219113946274, "grad_norm": 1.0747284889221191, "learning_rate": 7.643167290273923e-06, "loss": 0.2381, "step": 258500 }, { "epoch": 0.23683941007783696, "grad_norm": 1.9844415187835693, "learning_rate": 7.63859091608028e-06, "loss": 0.2357, "step": 259000 }, { "epoch": 0.23729662901621115, "grad_norm": 1.74272620677948, "learning_rate": 7.634014541886638e-06, "loss": 0.2429, "step": 259500 }, { "epoch": 0.23775384795458535, "grad_norm": 1.6719539165496826, "learning_rate": 7.629438167692995e-06, "loss": 0.237, "step": 260000 }, { "epoch": 0.23821106689295957, "grad_norm": 1.420264720916748, "learning_rate": 7.624861793499353e-06, "loss": 0.2392, "step": 260500 }, { "epoch": 0.23866828583133376, "grad_norm": 1.3896255493164062, "learning_rate": 7.62028541930571e-06, "loss": 0.239, "step": 261000 }, { "epoch": 0.23912550476970795, "grad_norm": 1.210802674293518, "learning_rate": 7.615709045112066e-06, "loss": 0.2407, "step": 261500 }, { "epoch": 0.23958272370808217, "grad_norm": 1.5185495615005493, "learning_rate": 7.611132670918424e-06, "loss": 0.2363, "step": 262000 }, { "epoch": 0.24003994264645637, "grad_norm": 1.4552907943725586, "learning_rate": 7.606556296724781e-06, "loss": 0.2423, "step": 262500 }, { "epoch": 0.24049716158483056, "grad_norm": 1.2917208671569824, "learning_rate": 7.601979922531139e-06, "loss": 0.2423, "step": 263000 }, { "epoch": 0.24095438052320478, "grad_norm": 1.4719345569610596, "learning_rate": 7.597403548337495e-06, "loss": 0.2375, "step": 263500 }, { "epoch": 0.24141159946157897, "grad_norm": 1.5045033693313599, "learning_rate": 7.592827174143853e-06, "loss": 0.2404, "step": 264000 }, { "epoch": 0.24186881839995317, "grad_norm": 1.222699761390686, "learning_rate": 7.58825079995021e-06, "loss": 0.2428, "step": 264500 }, { "epoch": 0.2423260373383274, "grad_norm": 1.681038498878479, "learning_rate": 7.583674425756566e-06, "loss": 0.2424, "step": 265000 }, { "epoch": 0.24278325627670158, "grad_norm": 1.6437132358551025, "learning_rate": 7.579098051562924e-06, "loss": 0.2363, "step": 265500 }, { "epoch": 0.2432404752150758, "grad_norm": 1.6310714483261108, "learning_rate": 7.574521677369281e-06, "loss": 0.2334, "step": 266000 }, { "epoch": 0.24369769415345, "grad_norm": 0.9880791902542114, "learning_rate": 7.569945303175639e-06, "loss": 0.2398, "step": 266500 }, { "epoch": 0.2441549130918242, "grad_norm": 1.508718729019165, "learning_rate": 7.565368928981995e-06, "loss": 0.2388, "step": 267000 }, { "epoch": 0.2446121320301984, "grad_norm": 2.1801445484161377, "learning_rate": 7.560792554788352e-06, "loss": 0.2414, "step": 267500 }, { "epoch": 0.2450693509685726, "grad_norm": 1.4477598667144775, "learning_rate": 7.55621618059471e-06, "loss": 0.2414, "step": 268000 }, { "epoch": 0.2455265699069468, "grad_norm": 1.3772883415222168, "learning_rate": 7.551639806401066e-06, "loss": 0.2358, "step": 268500 }, { "epoch": 0.24598378884532102, "grad_norm": 1.6583518981933594, "learning_rate": 7.5470634322074244e-06, "loss": 0.2402, "step": 269000 }, { "epoch": 0.2464410077836952, "grad_norm": 1.6297439336776733, "learning_rate": 7.542487058013781e-06, "loss": 0.2406, "step": 269500 }, { "epoch": 0.2468982267220694, "grad_norm": 1.544605016708374, "learning_rate": 7.537910683820137e-06, "loss": 0.2401, "step": 270000 }, { "epoch": 0.24735544566044362, "grad_norm": 2.170027256011963, "learning_rate": 7.533334309626495e-06, "loss": 0.2433, "step": 270500 }, { "epoch": 0.24781266459881782, "grad_norm": 1.6215145587921143, "learning_rate": 7.528757935432852e-06, "loss": 0.2361, "step": 271000 }, { "epoch": 0.248269883537192, "grad_norm": 1.4996685981750488, "learning_rate": 7.52418156123921e-06, "loss": 0.2366, "step": 271500 }, { "epoch": 0.24872710247556623, "grad_norm": 1.610382080078125, "learning_rate": 7.519605187045566e-06, "loss": 0.2429, "step": 272000 }, { "epoch": 0.24918432141394042, "grad_norm": 1.7235709428787231, "learning_rate": 7.515028812851924e-06, "loss": 0.2395, "step": 272500 }, { "epoch": 0.24964154035231462, "grad_norm": 1.6454797983169556, "learning_rate": 7.510452438658281e-06, "loss": 0.2352, "step": 273000 }, { "epoch": 0.2500987592906888, "grad_norm": 1.6150950193405151, "learning_rate": 7.505876064464637e-06, "loss": 0.2431, "step": 273500 }, { "epoch": 0.250555978229063, "grad_norm": 1.4403808116912842, "learning_rate": 7.5012996902709955e-06, "loss": 0.2377, "step": 274000 }, { "epoch": 0.25101319716743725, "grad_norm": 7.061529636383057, "learning_rate": 7.496723316077352e-06, "loss": 0.2444, "step": 274500 }, { "epoch": 0.25147041610581145, "grad_norm": 1.3562450408935547, "learning_rate": 7.492146941883709e-06, "loss": 0.2329, "step": 275000 }, { "epoch": 0.25192763504418564, "grad_norm": 1.495060920715332, "learning_rate": 7.4875705676900665e-06, "loss": 0.2419, "step": 275500 }, { "epoch": 0.25238485398255983, "grad_norm": 1.5048260688781738, "learning_rate": 7.482994193496424e-06, "loss": 0.2341, "step": 276000 }, { "epoch": 0.252842072920934, "grad_norm": 1.3745087385177612, "learning_rate": 7.478417819302781e-06, "loss": 0.239, "step": 276500 }, { "epoch": 0.2532992918593083, "grad_norm": 1.4182616472244263, "learning_rate": 7.473841445109138e-06, "loss": 0.2339, "step": 277000 }, { "epoch": 0.25375651079768247, "grad_norm": 1.4499032497406006, "learning_rate": 7.469265070915495e-06, "loss": 0.2418, "step": 277500 }, { "epoch": 0.25421372973605666, "grad_norm": 1.2812670469284058, "learning_rate": 7.464688696721852e-06, "loss": 0.2341, "step": 278000 }, { "epoch": 0.25467094867443085, "grad_norm": 1.7163888216018677, "learning_rate": 7.460112322528209e-06, "loss": 0.239, "step": 278500 }, { "epoch": 0.25512816761280505, "grad_norm": 1.596152424812317, "learning_rate": 7.4555359483345666e-06, "loss": 0.2364, "step": 279000 }, { "epoch": 0.25558538655117924, "grad_norm": 1.5450259447097778, "learning_rate": 7.450959574140924e-06, "loss": 0.2417, "step": 279500 }, { "epoch": 0.2560426054895535, "grad_norm": 1.5092369318008423, "learning_rate": 7.446383199947281e-06, "loss": 0.239, "step": 280000 }, { "epoch": 0.2564998244279277, "grad_norm": 1.8400509357452393, "learning_rate": 7.441806825753638e-06, "loss": 0.239, "step": 280500 }, { "epoch": 0.2569570433663019, "grad_norm": 1.4208296537399292, "learning_rate": 7.437230451559995e-06, "loss": 0.2384, "step": 281000 }, { "epoch": 0.25741426230467607, "grad_norm": 1.839404821395874, "learning_rate": 7.432654077366352e-06, "loss": 0.2413, "step": 281500 }, { "epoch": 0.25787148124305026, "grad_norm": 1.3527752161026, "learning_rate": 7.428077703172709e-06, "loss": 0.2366, "step": 282000 }, { "epoch": 0.25832870018142445, "grad_norm": 1.7706711292266846, "learning_rate": 7.423501328979067e-06, "loss": 0.2387, "step": 282500 }, { "epoch": 0.2587859191197987, "grad_norm": 1.1660232543945312, "learning_rate": 7.418924954785424e-06, "loss": 0.2381, "step": 283000 }, { "epoch": 0.2592431380581729, "grad_norm": 1.6995941400527954, "learning_rate": 7.41434858059178e-06, "loss": 0.2333, "step": 283500 }, { "epoch": 0.2597003569965471, "grad_norm": 1.5616917610168457, "learning_rate": 7.4097722063981385e-06, "loss": 0.2354, "step": 284000 }, { "epoch": 0.2601575759349213, "grad_norm": 1.7792470455169678, "learning_rate": 7.405195832204495e-06, "loss": 0.2338, "step": 284500 }, { "epoch": 0.2606147948732955, "grad_norm": 1.0877039432525635, "learning_rate": 7.400619458010852e-06, "loss": 0.231, "step": 285000 }, { "epoch": 0.26107201381166967, "grad_norm": 1.5051804780960083, "learning_rate": 7.3960430838172095e-06, "loss": 0.2325, "step": 285500 }, { "epoch": 0.2615292327500439, "grad_norm": 1.912229061126709, "learning_rate": 7.391466709623566e-06, "loss": 0.2329, "step": 286000 }, { "epoch": 0.2619864516884181, "grad_norm": 1.576975703239441, "learning_rate": 7.386890335429924e-06, "loss": 0.2341, "step": 286500 }, { "epoch": 0.2624436706267923, "grad_norm": 1.5463943481445312, "learning_rate": 7.3823139612362804e-06, "loss": 0.2363, "step": 287000 }, { "epoch": 0.2629008895651665, "grad_norm": 1.4972643852233887, "learning_rate": 7.3777375870426386e-06, "loss": 0.2358, "step": 287500 }, { "epoch": 0.2633581085035407, "grad_norm": 1.4807320833206177, "learning_rate": 7.373161212848995e-06, "loss": 0.2372, "step": 288000 }, { "epoch": 0.2638153274419149, "grad_norm": 1.362641453742981, "learning_rate": 7.368584838655351e-06, "loss": 0.2369, "step": 288500 }, { "epoch": 0.26427254638028913, "grad_norm": 2.3555381298065186, "learning_rate": 7.3640084644617095e-06, "loss": 0.2376, "step": 289000 }, { "epoch": 0.2647297653186633, "grad_norm": 1.4322718381881714, "learning_rate": 7.359432090268066e-06, "loss": 0.2344, "step": 289500 }, { "epoch": 0.2651869842570375, "grad_norm": 1.7021692991256714, "learning_rate": 7.354855716074424e-06, "loss": 0.2383, "step": 290000 }, { "epoch": 0.2656442031954117, "grad_norm": 1.3686518669128418, "learning_rate": 7.3502793418807805e-06, "loss": 0.233, "step": 290500 }, { "epoch": 0.2661014221337859, "grad_norm": 1.8416357040405273, "learning_rate": 7.345702967687137e-06, "loss": 0.2354, "step": 291000 }, { "epoch": 0.2665586410721601, "grad_norm": 1.4981660842895508, "learning_rate": 7.341126593493495e-06, "loss": 0.2382, "step": 291500 }, { "epoch": 0.26701586001053434, "grad_norm": 1.022232174873352, "learning_rate": 7.3365502192998515e-06, "loss": 0.2325, "step": 292000 }, { "epoch": 0.26747307894890854, "grad_norm": 1.6213542222976685, "learning_rate": 7.33197384510621e-06, "loss": 0.2357, "step": 292500 }, { "epoch": 0.26793029788728273, "grad_norm": 1.7134053707122803, "learning_rate": 7.327397470912566e-06, "loss": 0.2387, "step": 293000 }, { "epoch": 0.2683875168256569, "grad_norm": 1.051689863204956, "learning_rate": 7.3228210967189225e-06, "loss": 0.2318, "step": 293500 }, { "epoch": 0.2688447357640311, "grad_norm": 1.5515960454940796, "learning_rate": 7.318244722525281e-06, "loss": 0.2377, "step": 294000 }, { "epoch": 0.2693019547024053, "grad_norm": 1.414265513420105, "learning_rate": 7.313668348331637e-06, "loss": 0.2368, "step": 294500 }, { "epoch": 0.26975917364077956, "grad_norm": 3.989739418029785, "learning_rate": 7.309091974137995e-06, "loss": 0.2372, "step": 295000 }, { "epoch": 0.27021639257915375, "grad_norm": 1.0414005517959595, "learning_rate": 7.304515599944352e-06, "loss": 0.2398, "step": 295500 }, { "epoch": 0.27067361151752795, "grad_norm": 2.2172224521636963, "learning_rate": 7.29993922575071e-06, "loss": 0.2375, "step": 296000 }, { "epoch": 0.27113083045590214, "grad_norm": 1.6848254203796387, "learning_rate": 7.295362851557066e-06, "loss": 0.2349, "step": 296500 }, { "epoch": 0.27158804939427633, "grad_norm": 1.2511268854141235, "learning_rate": 7.2907864773634226e-06, "loss": 0.2331, "step": 297000 }, { "epoch": 0.2720452683326506, "grad_norm": 1.4679317474365234, "learning_rate": 7.286210103169781e-06, "loss": 0.2452, "step": 297500 }, { "epoch": 0.2725024872710248, "grad_norm": 1.6700774431228638, "learning_rate": 7.281633728976137e-06, "loss": 0.2329, "step": 298000 }, { "epoch": 0.27295970620939897, "grad_norm": 1.0634691715240479, "learning_rate": 7.277057354782495e-06, "loss": 0.2339, "step": 298500 }, { "epoch": 0.27341692514777316, "grad_norm": 1.706181287765503, "learning_rate": 7.272480980588852e-06, "loss": 0.2382, "step": 299000 }, { "epoch": 0.27387414408614735, "grad_norm": 1.1611428260803223, "learning_rate": 7.267904606395209e-06, "loss": 0.2377, "step": 299500 }, { "epoch": 0.27433136302452155, "grad_norm": 1.2169976234436035, "learning_rate": 7.263328232201566e-06, "loss": 0.2326, "step": 300000 }, { "epoch": 0.2747885819628958, "grad_norm": 1.947888970375061, "learning_rate": 7.258751858007923e-06, "loss": 0.2316, "step": 300500 }, { "epoch": 0.27524580090127, "grad_norm": 1.2794651985168457, "learning_rate": 7.254175483814281e-06, "loss": 0.2373, "step": 301000 }, { "epoch": 0.2757030198396442, "grad_norm": 1.6995222568511963, "learning_rate": 7.249599109620637e-06, "loss": 0.2338, "step": 301500 }, { "epoch": 0.2761602387780184, "grad_norm": 1.4030494689941406, "learning_rate": 7.2450227354269945e-06, "loss": 0.2353, "step": 302000 }, { "epoch": 0.27661745771639257, "grad_norm": 1.529697299003601, "learning_rate": 7.240446361233352e-06, "loss": 0.2318, "step": 302500 }, { "epoch": 0.27707467665476676, "grad_norm": 1.3057571649551392, "learning_rate": 7.235869987039709e-06, "loss": 0.2331, "step": 303000 }, { "epoch": 0.277531895593141, "grad_norm": 1.6574506759643555, "learning_rate": 7.231293612846066e-06, "loss": 0.2373, "step": 303500 }, { "epoch": 0.2779891145315152, "grad_norm": 1.468015432357788, "learning_rate": 7.226717238652423e-06, "loss": 0.2339, "step": 304000 }, { "epoch": 0.2784463334698894, "grad_norm": 1.5622738599777222, "learning_rate": 7.22214086445878e-06, "loss": 0.239, "step": 304500 }, { "epoch": 0.2789035524082636, "grad_norm": 1.5072553157806396, "learning_rate": 7.217564490265137e-06, "loss": 0.2345, "step": 305000 }, { "epoch": 0.2793607713466378, "grad_norm": 1.3062992095947266, "learning_rate": 7.2129881160714946e-06, "loss": 0.2327, "step": 305500 }, { "epoch": 0.279817990285012, "grad_norm": 1.2529741525650024, "learning_rate": 7.208411741877852e-06, "loss": 0.2317, "step": 306000 }, { "epoch": 0.2802752092233862, "grad_norm": 1.3761204481124878, "learning_rate": 7.203835367684209e-06, "loss": 0.2323, "step": 306500 }, { "epoch": 0.2807324281617604, "grad_norm": 1.5438259840011597, "learning_rate": 7.1992589934905655e-06, "loss": 0.2314, "step": 307000 }, { "epoch": 0.2811896471001346, "grad_norm": 1.3885058164596558, "learning_rate": 7.194682619296924e-06, "loss": 0.2381, "step": 307500 }, { "epoch": 0.2816468660385088, "grad_norm": 2.722839117050171, "learning_rate": 7.19010624510328e-06, "loss": 0.2352, "step": 308000 }, { "epoch": 0.282104084976883, "grad_norm": 1.6473603248596191, "learning_rate": 7.185529870909637e-06, "loss": 0.2267, "step": 308500 }, { "epoch": 0.2825613039152572, "grad_norm": 4.843954086303711, "learning_rate": 7.180953496715995e-06, "loss": 0.2323, "step": 309000 }, { "epoch": 0.28301852285363144, "grad_norm": 1.541217565536499, "learning_rate": 7.176377122522351e-06, "loss": 0.2249, "step": 309500 }, { "epoch": 0.28347574179200563, "grad_norm": 3.621250867843628, "learning_rate": 7.171800748328709e-06, "loss": 0.2354, "step": 310000 }, { "epoch": 0.2839329607303798, "grad_norm": 1.7025402784347534, "learning_rate": 7.167224374135066e-06, "loss": 0.2367, "step": 310500 }, { "epoch": 0.284390179668754, "grad_norm": 1.3267154693603516, "learning_rate": 7.162647999941424e-06, "loss": 0.2346, "step": 311000 }, { "epoch": 0.2848473986071282, "grad_norm": 1.163634181022644, "learning_rate": 7.15807162574778e-06, "loss": 0.2317, "step": 311500 }, { "epoch": 0.2853046175455024, "grad_norm": 1.2446917295455933, "learning_rate": 7.153495251554137e-06, "loss": 0.2336, "step": 312000 }, { "epoch": 0.28576183648387665, "grad_norm": 1.2674944400787354, "learning_rate": 7.148918877360495e-06, "loss": 0.235, "step": 312500 }, { "epoch": 0.28621905542225085, "grad_norm": 1.5385478734970093, "learning_rate": 7.144342503166851e-06, "loss": 0.2352, "step": 313000 }, { "epoch": 0.28667627436062504, "grad_norm": 1.1245741844177246, "learning_rate": 7.139766128973209e-06, "loss": 0.2357, "step": 313500 }, { "epoch": 0.28713349329899923, "grad_norm": 1.405200719833374, "learning_rate": 7.135189754779566e-06, "loss": 0.2317, "step": 314000 }, { "epoch": 0.2875907122373734, "grad_norm": 1.4755611419677734, "learning_rate": 7.130613380585924e-06, "loss": 0.2333, "step": 314500 }, { "epoch": 0.2880479311757476, "grad_norm": 1.551849603652954, "learning_rate": 7.12603700639228e-06, "loss": 0.2349, "step": 315000 }, { "epoch": 0.28850515011412187, "grad_norm": 1.5056345462799072, "learning_rate": 7.121460632198637e-06, "loss": 0.2324, "step": 315500 }, { "epoch": 0.28896236905249606, "grad_norm": 1.2897391319274902, "learning_rate": 7.116884258004995e-06, "loss": 0.2348, "step": 316000 }, { "epoch": 0.28941958799087025, "grad_norm": 4.2867560386657715, "learning_rate": 7.112307883811351e-06, "loss": 0.2302, "step": 316500 }, { "epoch": 0.28987680692924445, "grad_norm": 1.673755407333374, "learning_rate": 7.107731509617709e-06, "loss": 0.2349, "step": 317000 }, { "epoch": 0.29033402586761864, "grad_norm": 1.3654760122299194, "learning_rate": 7.103155135424066e-06, "loss": 0.2331, "step": 317500 }, { "epoch": 0.2907912448059929, "grad_norm": 1.285056471824646, "learning_rate": 7.098578761230422e-06, "loss": 0.2322, "step": 318000 }, { "epoch": 0.2912484637443671, "grad_norm": 1.6767268180847168, "learning_rate": 7.09400238703678e-06, "loss": 0.235, "step": 318500 }, { "epoch": 0.2917056826827413, "grad_norm": 1.429954171180725, "learning_rate": 7.089426012843137e-06, "loss": 0.2329, "step": 319000 }, { "epoch": 0.29216290162111547, "grad_norm": 1.201323390007019, "learning_rate": 7.084849638649495e-06, "loss": 0.2315, "step": 319500 }, { "epoch": 0.29262012055948966, "grad_norm": 0.9910763502120972, "learning_rate": 7.080273264455851e-06, "loss": 0.2324, "step": 320000 }, { "epoch": 0.29307733949786385, "grad_norm": 1.744512915611267, "learning_rate": 7.075696890262208e-06, "loss": 0.2322, "step": 320500 }, { "epoch": 0.2935345584362381, "grad_norm": 1.4103306531906128, "learning_rate": 7.071120516068566e-06, "loss": 0.2305, "step": 321000 }, { "epoch": 0.2939917773746123, "grad_norm": 1.1745530366897583, "learning_rate": 7.066544141874922e-06, "loss": 0.2322, "step": 321500 }, { "epoch": 0.2944489963129865, "grad_norm": 1.4597954750061035, "learning_rate": 7.0619677676812804e-06, "loss": 0.2317, "step": 322000 }, { "epoch": 0.2949062152513607, "grad_norm": 1.2206711769104004, "learning_rate": 7.057391393487637e-06, "loss": 0.233, "step": 322500 }, { "epoch": 0.2953634341897349, "grad_norm": 1.5247403383255005, "learning_rate": 7.052815019293993e-06, "loss": 0.2304, "step": 323000 }, { "epoch": 0.29582065312810907, "grad_norm": 1.1999021768569946, "learning_rate": 7.048238645100351e-06, "loss": 0.2338, "step": 323500 }, { "epoch": 0.2962778720664833, "grad_norm": 1.4971832036972046, "learning_rate": 7.043662270906708e-06, "loss": 0.228, "step": 324000 }, { "epoch": 0.2967350910048575, "grad_norm": 1.3966095447540283, "learning_rate": 7.039085896713066e-06, "loss": 0.2272, "step": 324500 }, { "epoch": 0.2971923099432317, "grad_norm": 2.6198599338531494, "learning_rate": 7.034509522519422e-06, "loss": 0.231, "step": 325000 }, { "epoch": 0.2976495288816059, "grad_norm": 1.6076655387878418, "learning_rate": 7.02993314832578e-06, "loss": 0.2294, "step": 325500 }, { "epoch": 0.2981067478199801, "grad_norm": 1.1746286153793335, "learning_rate": 7.025356774132137e-06, "loss": 0.2307, "step": 326000 }, { "epoch": 0.2985639667583543, "grad_norm": 1.1930854320526123, "learning_rate": 7.020780399938494e-06, "loss": 0.2275, "step": 326500 }, { "epoch": 0.29902118569672853, "grad_norm": 1.406855821609497, "learning_rate": 7.0162040257448515e-06, "loss": 0.2294, "step": 327000 }, { "epoch": 0.2994784046351027, "grad_norm": 1.669003963470459, "learning_rate": 7.011627651551208e-06, "loss": 0.2285, "step": 327500 }, { "epoch": 0.2999356235734769, "grad_norm": 1.32759690284729, "learning_rate": 7.007051277357565e-06, "loss": 0.2349, "step": 328000 }, { "epoch": 0.3003928425118511, "grad_norm": 1.2544605731964111, "learning_rate": 7.0024749031639225e-06, "loss": 0.2302, "step": 328500 }, { "epoch": 0.3008500614502253, "grad_norm": 1.6793965101242065, "learning_rate": 6.99789852897028e-06, "loss": 0.2323, "step": 329000 }, { "epoch": 0.3013072803885995, "grad_norm": 2.0287883281707764, "learning_rate": 6.993322154776637e-06, "loss": 0.2269, "step": 329500 }, { "epoch": 0.30176449932697375, "grad_norm": 1.4484792947769165, "learning_rate": 6.988745780582994e-06, "loss": 0.2333, "step": 330000 }, { "epoch": 0.30222171826534794, "grad_norm": 57.33815383911133, "learning_rate": 6.9841694063893516e-06, "loss": 0.2303, "step": 330500 }, { "epoch": 0.30267893720372213, "grad_norm": 1.2864947319030762, "learning_rate": 6.979593032195708e-06, "loss": 0.2288, "step": 331000 }, { "epoch": 0.3031361561420963, "grad_norm": 1.2493833303451538, "learning_rate": 6.975016658002065e-06, "loss": 0.2296, "step": 331500 }, { "epoch": 0.3035933750804705, "grad_norm": 1.4573181867599487, "learning_rate": 6.9704402838084226e-06, "loss": 0.2241, "step": 332000 }, { "epoch": 0.3040505940188447, "grad_norm": 1.5703788995742798, "learning_rate": 6.96586390961478e-06, "loss": 0.2349, "step": 332500 }, { "epoch": 0.30450781295721896, "grad_norm": 1.7643167972564697, "learning_rate": 6.961287535421137e-06, "loss": 0.2313, "step": 333000 }, { "epoch": 0.30496503189559315, "grad_norm": 1.4087350368499756, "learning_rate": 6.956711161227494e-06, "loss": 0.2277, "step": 333500 }, { "epoch": 0.30542225083396735, "grad_norm": 1.5593231916427612, "learning_rate": 6.952134787033851e-06, "loss": 0.2315, "step": 334000 }, { "epoch": 0.30587946977234154, "grad_norm": 1.3212522268295288, "learning_rate": 6.947558412840208e-06, "loss": 0.2299, "step": 334500 }, { "epoch": 0.30633668871071573, "grad_norm": 1.4686697721481323, "learning_rate": 6.942982038646565e-06, "loss": 0.2308, "step": 335000 }, { "epoch": 0.3067939076490899, "grad_norm": 1.436549186706543, "learning_rate": 6.938405664452923e-06, "loss": 0.2311, "step": 335500 }, { "epoch": 0.3072511265874642, "grad_norm": 1.471840262413025, "learning_rate": 6.93382929025928e-06, "loss": 0.228, "step": 336000 }, { "epoch": 0.30770834552583837, "grad_norm": 1.3207478523254395, "learning_rate": 6.929252916065636e-06, "loss": 0.2274, "step": 336500 }, { "epoch": 0.30816556446421256, "grad_norm": 1.2210125923156738, "learning_rate": 6.9246765418719945e-06, "loss": 0.2286, "step": 337000 }, { "epoch": 0.30862278340258675, "grad_norm": 1.3577375411987305, "learning_rate": 6.920100167678351e-06, "loss": 0.2292, "step": 337500 }, { "epoch": 0.30908000234096095, "grad_norm": 1.1750257015228271, "learning_rate": 6.915523793484709e-06, "loss": 0.2324, "step": 338000 }, { "epoch": 0.30953722127933514, "grad_norm": 2.1666083335876465, "learning_rate": 6.9109474192910654e-06, "loss": 0.2287, "step": 338500 }, { "epoch": 0.3099944402177094, "grad_norm": 1.4774967432022095, "learning_rate": 6.906371045097422e-06, "loss": 0.2285, "step": 339000 }, { "epoch": 0.3104516591560836, "grad_norm": 1.6809563636779785, "learning_rate": 6.90179467090378e-06, "loss": 0.2302, "step": 339500 }, { "epoch": 0.3109088780944578, "grad_norm": 1.0657724142074585, "learning_rate": 6.8972182967101364e-06, "loss": 0.2317, "step": 340000 }, { "epoch": 0.31136609703283197, "grad_norm": 1.375364899635315, "learning_rate": 6.8926419225164945e-06, "loss": 0.2286, "step": 340500 }, { "epoch": 0.31182331597120616, "grad_norm": 2.182593584060669, "learning_rate": 6.888065548322851e-06, "loss": 0.2347, "step": 341000 }, { "epoch": 0.3122805349095804, "grad_norm": 1.7022758722305298, "learning_rate": 6.883489174129207e-06, "loss": 0.2286, "step": 341500 }, { "epoch": 0.3127377538479546, "grad_norm": 1.5238574743270874, "learning_rate": 6.8789127999355655e-06, "loss": 0.226, "step": 342000 }, { "epoch": 0.3131949727863288, "grad_norm": 1.5397921800613403, "learning_rate": 6.874336425741922e-06, "loss": 0.2314, "step": 342500 }, { "epoch": 0.313652191724703, "grad_norm": 1.4972351789474487, "learning_rate": 6.86976005154828e-06, "loss": 0.2263, "step": 343000 }, { "epoch": 0.3141094106630772, "grad_norm": 1.541048288345337, "learning_rate": 6.8651836773546365e-06, "loss": 0.226, "step": 343500 }, { "epoch": 0.3145666296014514, "grad_norm": 1.155745029449463, "learning_rate": 6.860607303160993e-06, "loss": 0.2299, "step": 344000 }, { "epoch": 0.3150238485398256, "grad_norm": 1.356096863746643, "learning_rate": 6.856030928967351e-06, "loss": 0.2283, "step": 344500 }, { "epoch": 0.3154810674781998, "grad_norm": 1.1130493879318237, "learning_rate": 6.8514545547737075e-06, "loss": 0.2255, "step": 345000 }, { "epoch": 0.315938286416574, "grad_norm": 1.4475120306015015, "learning_rate": 6.846878180580066e-06, "loss": 0.2311, "step": 345500 }, { "epoch": 0.3163955053549482, "grad_norm": 1.6176073551177979, "learning_rate": 6.842301806386422e-06, "loss": 0.225, "step": 346000 }, { "epoch": 0.3168527242933224, "grad_norm": 1.8481721878051758, "learning_rate": 6.83772543219278e-06, "loss": 0.2284, "step": 346500 }, { "epoch": 0.3173099432316966, "grad_norm": 1.6207536458969116, "learning_rate": 6.833149057999137e-06, "loss": 0.228, "step": 347000 }, { "epoch": 0.31776716217007084, "grad_norm": 1.3753981590270996, "learning_rate": 6.828572683805493e-06, "loss": 0.2297, "step": 347500 }, { "epoch": 0.31822438110844503, "grad_norm": 1.168278455734253, "learning_rate": 6.823996309611851e-06, "loss": 0.2301, "step": 348000 }, { "epoch": 0.3186816000468192, "grad_norm": 1.7938873767852783, "learning_rate": 6.8194199354182076e-06, "loss": 0.2313, "step": 348500 }, { "epoch": 0.3191388189851934, "grad_norm": 1.2588731050491333, "learning_rate": 6.814843561224566e-06, "loss": 0.2287, "step": 349000 }, { "epoch": 0.3195960379235676, "grad_norm": 1.5052902698516846, "learning_rate": 6.810267187030922e-06, "loss": 0.2221, "step": 349500 }, { "epoch": 0.3200532568619418, "grad_norm": 1.4498345851898193, "learning_rate": 6.8056908128372786e-06, "loss": 0.2286, "step": 350000 }, { "epoch": 0.32051047580031605, "grad_norm": 1.5637640953063965, "learning_rate": 6.801114438643637e-06, "loss": 0.2325, "step": 350500 }, { "epoch": 0.32096769473869025, "grad_norm": 0.9277071952819824, "learning_rate": 6.796538064449993e-06, "loss": 0.2241, "step": 351000 }, { "epoch": 0.32142491367706444, "grad_norm": 1.4922164678573608, "learning_rate": 6.791961690256351e-06, "loss": 0.2271, "step": 351500 }, { "epoch": 0.32188213261543863, "grad_norm": 1.3462028503417969, "learning_rate": 6.787385316062708e-06, "loss": 0.2251, "step": 352000 }, { "epoch": 0.3223393515538128, "grad_norm": 1.138120412826538, "learning_rate": 6.782808941869065e-06, "loss": 0.2244, "step": 352500 }, { "epoch": 0.322796570492187, "grad_norm": 1.3926693201065063, "learning_rate": 6.778232567675422e-06, "loss": 0.2261, "step": 353000 }, { "epoch": 0.32325378943056127, "grad_norm": 1.3903855085372925, "learning_rate": 6.773656193481779e-06, "loss": 0.2337, "step": 353500 }, { "epoch": 0.32371100836893546, "grad_norm": 1.4542618989944458, "learning_rate": 6.769079819288137e-06, "loss": 0.2337, "step": 354000 }, { "epoch": 0.32416822730730965, "grad_norm": 1.0457683801651, "learning_rate": 6.764503445094493e-06, "loss": 0.2305, "step": 354500 }, { "epoch": 0.32462544624568385, "grad_norm": 1.3533685207366943, "learning_rate": 6.7599270709008505e-06, "loss": 0.2239, "step": 355000 }, { "epoch": 0.32508266518405804, "grad_norm": 1.5367493629455566, "learning_rate": 6.755350696707208e-06, "loss": 0.229, "step": 355500 }, { "epoch": 0.32553988412243223, "grad_norm": 0.9888611435890198, "learning_rate": 6.750774322513565e-06, "loss": 0.2274, "step": 356000 }, { "epoch": 0.3259971030608065, "grad_norm": 2.0140717029571533, "learning_rate": 6.746197948319922e-06, "loss": 0.225, "step": 356500 }, { "epoch": 0.3264543219991807, "grad_norm": 1.2105058431625366, "learning_rate": 6.7416215741262796e-06, "loss": 0.225, "step": 357000 }, { "epoch": 0.32691154093755487, "grad_norm": 1.452605962753296, "learning_rate": 6.737045199932636e-06, "loss": 0.2264, "step": 357500 }, { "epoch": 0.32736875987592906, "grad_norm": 1.4895436763763428, "learning_rate": 6.732468825738993e-06, "loss": 0.2342, "step": 358000 }, { "epoch": 0.32782597881430325, "grad_norm": 1.7278785705566406, "learning_rate": 6.7278924515453505e-06, "loss": 0.2266, "step": 358500 }, { "epoch": 0.32828319775267745, "grad_norm": 1.0101240873336792, "learning_rate": 6.723316077351708e-06, "loss": 0.2265, "step": 359000 }, { "epoch": 0.3287404166910517, "grad_norm": 1.5752644538879395, "learning_rate": 6.718739703158065e-06, "loss": 0.2246, "step": 359500 }, { "epoch": 0.3291976356294259, "grad_norm": 1.188202977180481, "learning_rate": 6.7141633289644215e-06, "loss": 0.2229, "step": 360000 }, { "epoch": 0.3296548545678001, "grad_norm": 1.657990574836731, "learning_rate": 6.70958695477078e-06, "loss": 0.2291, "step": 360500 }, { "epoch": 0.3301120735061743, "grad_norm": 1.1453895568847656, "learning_rate": 6.705010580577136e-06, "loss": 0.2227, "step": 361000 }, { "epoch": 0.33056929244454847, "grad_norm": 1.4932241439819336, "learning_rate": 6.700434206383493e-06, "loss": 0.2297, "step": 361500 }, { "epoch": 0.3310265113829227, "grad_norm": 1.0189024209976196, "learning_rate": 6.695857832189851e-06, "loss": 0.2226, "step": 362000 }, { "epoch": 0.3314837303212969, "grad_norm": 1.1504535675048828, "learning_rate": 6.691281457996207e-06, "loss": 0.2221, "step": 362500 }, { "epoch": 0.3319409492596711, "grad_norm": 1.1485751867294312, "learning_rate": 6.686705083802565e-06, "loss": 0.2321, "step": 363000 }, { "epoch": 0.3323981681980453, "grad_norm": 1.4840065240859985, "learning_rate": 6.682128709608922e-06, "loss": 0.2242, "step": 363500 }, { "epoch": 0.3328553871364195, "grad_norm": 1.8786394596099854, "learning_rate": 6.67755233541528e-06, "loss": 0.2266, "step": 364000 }, { "epoch": 0.3333126060747937, "grad_norm": 1.5424981117248535, "learning_rate": 6.672975961221636e-06, "loss": 0.2244, "step": 364500 }, { "epoch": 0.33376982501316793, "grad_norm": 1.5438640117645264, "learning_rate": 6.6683995870279934e-06, "loss": 0.2277, "step": 365000 }, { "epoch": 0.3342270439515421, "grad_norm": 1.3231064081192017, "learning_rate": 6.663823212834351e-06, "loss": 0.2252, "step": 365500 }, { "epoch": 0.3346842628899163, "grad_norm": 1.649964451789856, "learning_rate": 6.659246838640707e-06, "loss": 0.2253, "step": 366000 }, { "epoch": 0.3351414818282905, "grad_norm": 1.6227779388427734, "learning_rate": 6.654670464447065e-06, "loss": 0.2256, "step": 366500 }, { "epoch": 0.3355987007666647, "grad_norm": 1.197120189666748, "learning_rate": 6.650094090253422e-06, "loss": 0.2249, "step": 367000 }, { "epoch": 0.3360559197050389, "grad_norm": 1.308526873588562, "learning_rate": 6.64551771605978e-06, "loss": 0.2214, "step": 367500 }, { "epoch": 0.33651313864341315, "grad_norm": 1.3108173608779907, "learning_rate": 6.640941341866136e-06, "loss": 0.2271, "step": 368000 }, { "epoch": 0.33697035758178734, "grad_norm": 1.6136078834533691, "learning_rate": 6.636364967672493e-06, "loss": 0.2288, "step": 368500 }, { "epoch": 0.33742757652016153, "grad_norm": 1.7667053937911987, "learning_rate": 6.631788593478851e-06, "loss": 0.2225, "step": 369000 }, { "epoch": 0.3378847954585357, "grad_norm": 1.384122610092163, "learning_rate": 6.627212219285207e-06, "loss": 0.2237, "step": 369500 }, { "epoch": 0.3383420143969099, "grad_norm": 1.4266330003738403, "learning_rate": 6.622635845091565e-06, "loss": 0.2259, "step": 370000 }, { "epoch": 0.3387992333352841, "grad_norm": 1.2225444316864014, "learning_rate": 6.618059470897922e-06, "loss": 0.2277, "step": 370500 }, { "epoch": 0.33925645227365836, "grad_norm": 1.3453285694122314, "learning_rate": 6.613483096704278e-06, "loss": 0.2252, "step": 371000 }, { "epoch": 0.33971367121203255, "grad_norm": 1.1494442224502563, "learning_rate": 6.608906722510636e-06, "loss": 0.2235, "step": 371500 }, { "epoch": 0.34017089015040675, "grad_norm": 2.2398324012756348, "learning_rate": 6.604330348316993e-06, "loss": 0.2235, "step": 372000 }, { "epoch": 0.34062810908878094, "grad_norm": 1.5684269666671753, "learning_rate": 6.599753974123351e-06, "loss": 0.2243, "step": 372500 }, { "epoch": 0.34108532802715513, "grad_norm": 1.7672525644302368, "learning_rate": 6.595177599929707e-06, "loss": 0.2266, "step": 373000 }, { "epoch": 0.3415425469655293, "grad_norm": 1.8046706914901733, "learning_rate": 6.590601225736064e-06, "loss": 0.2243, "step": 373500 }, { "epoch": 0.3419997659039036, "grad_norm": 1.8992114067077637, "learning_rate": 6.586024851542422e-06, "loss": 0.226, "step": 374000 }, { "epoch": 0.34245698484227777, "grad_norm": 1.6175910234451294, "learning_rate": 6.581448477348778e-06, "loss": 0.2233, "step": 374500 }, { "epoch": 0.34291420378065196, "grad_norm": 1.470871090888977, "learning_rate": 6.576872103155136e-06, "loss": 0.2289, "step": 375000 }, { "epoch": 0.34337142271902615, "grad_norm": 1.562513828277588, "learning_rate": 6.572295728961493e-06, "loss": 0.2293, "step": 375500 }, { "epoch": 0.34382864165740035, "grad_norm": 1.1901838779449463, "learning_rate": 6.56771935476785e-06, "loss": 0.2266, "step": 376000 }, { "epoch": 0.34428586059577454, "grad_norm": 1.8072021007537842, "learning_rate": 6.563142980574207e-06, "loss": 0.2235, "step": 376500 }, { "epoch": 0.3447430795341488, "grad_norm": 1.430627703666687, "learning_rate": 6.558566606380564e-06, "loss": 0.23, "step": 377000 }, { "epoch": 0.345200298472523, "grad_norm": 1.7450295686721802, "learning_rate": 6.553990232186922e-06, "loss": 0.2238, "step": 377500 }, { "epoch": 0.3456575174108972, "grad_norm": 1.278794288635254, "learning_rate": 6.549413857993278e-06, "loss": 0.2247, "step": 378000 }, { "epoch": 0.34611473634927137, "grad_norm": 1.6127958297729492, "learning_rate": 6.544837483799636e-06, "loss": 0.224, "step": 378500 }, { "epoch": 0.34657195528764556, "grad_norm": 1.3669660091400146, "learning_rate": 6.540261109605993e-06, "loss": 0.2228, "step": 379000 }, { "epoch": 0.34702917422601975, "grad_norm": 1.3551899194717407, "learning_rate": 6.53568473541235e-06, "loss": 0.2205, "step": 379500 }, { "epoch": 0.347486393164394, "grad_norm": 1.0663011074066162, "learning_rate": 6.5311083612187075e-06, "loss": 0.2256, "step": 380000 }, { "epoch": 0.3479436121027682, "grad_norm": 1.0649718046188354, "learning_rate": 6.526531987025064e-06, "loss": 0.2248, "step": 380500 }, { "epoch": 0.3484008310411424, "grad_norm": 1.4018280506134033, "learning_rate": 6.521955612831422e-06, "loss": 0.2266, "step": 381000 }, { "epoch": 0.3488580499795166, "grad_norm": 1.750497579574585, "learning_rate": 6.5173792386377785e-06, "loss": 0.2208, "step": 381500 }, { "epoch": 0.3493152689178908, "grad_norm": 1.3639546632766724, "learning_rate": 6.512802864444136e-06, "loss": 0.2271, "step": 382000 }, { "epoch": 0.349772487856265, "grad_norm": 0.8961055278778076, "learning_rate": 6.508226490250493e-06, "loss": 0.2258, "step": 382500 }, { "epoch": 0.3502297067946392, "grad_norm": 1.4955040216445923, "learning_rate": 6.50365011605685e-06, "loss": 0.2168, "step": 383000 }, { "epoch": 0.3506869257330134, "grad_norm": 1.7707324028015137, "learning_rate": 6.4990737418632076e-06, "loss": 0.223, "step": 383500 }, { "epoch": 0.3511441446713876, "grad_norm": 1.7343741655349731, "learning_rate": 6.494497367669564e-06, "loss": 0.2243, "step": 384000 }, { "epoch": 0.3516013636097618, "grad_norm": 1.3605395555496216, "learning_rate": 6.489920993475921e-06, "loss": 0.2208, "step": 384500 }, { "epoch": 0.352058582548136, "grad_norm": 1.7057133913040161, "learning_rate": 6.4853446192822785e-06, "loss": 0.2242, "step": 385000 }, { "epoch": 0.35251580148651024, "grad_norm": 1.440219521522522, "learning_rate": 6.480768245088636e-06, "loss": 0.2198, "step": 385500 }, { "epoch": 0.35297302042488443, "grad_norm": 1.1752151250839233, "learning_rate": 6.476191870894993e-06, "loss": 0.2299, "step": 386000 }, { "epoch": 0.3534302393632586, "grad_norm": 1.0869592428207397, "learning_rate": 6.47161549670135e-06, "loss": 0.2196, "step": 386500 }, { "epoch": 0.3538874583016328, "grad_norm": 1.5555566549301147, "learning_rate": 6.467039122507707e-06, "loss": 0.2204, "step": 387000 }, { "epoch": 0.354344677240007, "grad_norm": 1.148882508277893, "learning_rate": 6.462462748314065e-06, "loss": 0.222, "step": 387500 }, { "epoch": 0.3548018961783812, "grad_norm": 1.7266398668289185, "learning_rate": 6.457886374120421e-06, "loss": 0.2239, "step": 388000 }, { "epoch": 0.35525911511675545, "grad_norm": 1.8156708478927612, "learning_rate": 6.453309999926779e-06, "loss": 0.2254, "step": 388500 }, { "epoch": 0.35571633405512965, "grad_norm": 1.3709605932235718, "learning_rate": 6.448733625733136e-06, "loss": 0.2236, "step": 389000 }, { "epoch": 0.35617355299350384, "grad_norm": 1.373267412185669, "learning_rate": 6.444157251539492e-06, "loss": 0.2182, "step": 389500 }, { "epoch": 0.35663077193187803, "grad_norm": 1.8290654420852661, "learning_rate": 6.4395808773458505e-06, "loss": 0.2199, "step": 390000 }, { "epoch": 0.3570879908702522, "grad_norm": 1.2703052759170532, "learning_rate": 6.435004503152207e-06, "loss": 0.2209, "step": 390500 }, { "epoch": 0.3575452098086264, "grad_norm": 1.3054262399673462, "learning_rate": 6.430428128958565e-06, "loss": 0.2275, "step": 391000 }, { "epoch": 0.35800242874700067, "grad_norm": 1.6827231645584106, "learning_rate": 6.4258517547649214e-06, "loss": 0.225, "step": 391500 }, { "epoch": 0.35845964768537486, "grad_norm": 1.0806723833084106, "learning_rate": 6.421275380571278e-06, "loss": 0.2203, "step": 392000 }, { "epoch": 0.35891686662374905, "grad_norm": 1.3846522569656372, "learning_rate": 6.416699006377636e-06, "loss": 0.2196, "step": 392500 }, { "epoch": 0.35937408556212325, "grad_norm": 1.3533942699432373, "learning_rate": 6.412122632183992e-06, "loss": 0.2245, "step": 393000 }, { "epoch": 0.35983130450049744, "grad_norm": 1.6266448497772217, "learning_rate": 6.4075462579903505e-06, "loss": 0.2237, "step": 393500 }, { "epoch": 0.36028852343887163, "grad_norm": 1.3846949338912964, "learning_rate": 6.402969883796707e-06, "loss": 0.2153, "step": 394000 }, { "epoch": 0.3607457423772459, "grad_norm": 1.707695484161377, "learning_rate": 6.398393509603063e-06, "loss": 0.221, "step": 394500 }, { "epoch": 0.3612029613156201, "grad_norm": 1.2363784313201904, "learning_rate": 6.3938171354094215e-06, "loss": 0.2239, "step": 395000 }, { "epoch": 0.36166018025399427, "grad_norm": 1.168939232826233, "learning_rate": 6.389240761215778e-06, "loss": 0.2219, "step": 395500 }, { "epoch": 0.36211739919236846, "grad_norm": 0.9597660899162292, "learning_rate": 6.384664387022136e-06, "loss": 0.2199, "step": 396000 }, { "epoch": 0.36257461813074265, "grad_norm": 1.228608250617981, "learning_rate": 6.3800880128284925e-06, "loss": 0.2209, "step": 396500 }, { "epoch": 0.36303183706911685, "grad_norm": 0.8621894121170044, "learning_rate": 6.375511638634851e-06, "loss": 0.2209, "step": 397000 }, { "epoch": 0.3634890560074911, "grad_norm": 1.686716079711914, "learning_rate": 6.370935264441207e-06, "loss": 0.2255, "step": 397500 }, { "epoch": 0.3639462749458653, "grad_norm": 1.3068556785583496, "learning_rate": 6.3663588902475635e-06, "loss": 0.2287, "step": 398000 }, { "epoch": 0.3644034938842395, "grad_norm": 1.07680344581604, "learning_rate": 6.361782516053922e-06, "loss": 0.2236, "step": 398500 }, { "epoch": 0.3648607128226137, "grad_norm": 2.0120232105255127, "learning_rate": 6.357206141860278e-06, "loss": 0.2236, "step": 399000 }, { "epoch": 0.36531793176098787, "grad_norm": 1.7183582782745361, "learning_rate": 6.352629767666636e-06, "loss": 0.2191, "step": 399500 }, { "epoch": 0.36577515069936206, "grad_norm": 1.2929768562316895, "learning_rate": 6.348053393472993e-06, "loss": 0.2247, "step": 400000 }, { "epoch": 0.3662323696377363, "grad_norm": 1.0887680053710938, "learning_rate": 6.343477019279349e-06, "loss": 0.2245, "step": 400500 }, { "epoch": 0.3666895885761105, "grad_norm": 1.428952932357788, "learning_rate": 6.338900645085707e-06, "loss": 0.2198, "step": 401000 }, { "epoch": 0.3671468075144847, "grad_norm": 1.679784893989563, "learning_rate": 6.3343242708920636e-06, "loss": 0.219, "step": 401500 }, { "epoch": 0.3676040264528589, "grad_norm": 1.9559983015060425, "learning_rate": 6.329747896698422e-06, "loss": 0.2208, "step": 402000 }, { "epoch": 0.3680612453912331, "grad_norm": 1.2725555896759033, "learning_rate": 6.325171522504778e-06, "loss": 0.2157, "step": 402500 }, { "epoch": 0.36851846432960733, "grad_norm": 1.522418737411499, "learning_rate": 6.3205951483111345e-06, "loss": 0.2224, "step": 403000 }, { "epoch": 0.3689756832679815, "grad_norm": 1.372866153717041, "learning_rate": 6.316018774117493e-06, "loss": 0.2226, "step": 403500 }, { "epoch": 0.3694329022063557, "grad_norm": 1.6181350946426392, "learning_rate": 6.311442399923849e-06, "loss": 0.2229, "step": 404000 }, { "epoch": 0.3698901211447299, "grad_norm": 1.3286571502685547, "learning_rate": 6.306866025730207e-06, "loss": 0.2242, "step": 404500 }, { "epoch": 0.3703473400831041, "grad_norm": 1.808738350868225, "learning_rate": 6.302289651536564e-06, "loss": 0.2187, "step": 405000 }, { "epoch": 0.3708045590214783, "grad_norm": 1.438143253326416, "learning_rate": 6.297713277342921e-06, "loss": 0.2217, "step": 405500 }, { "epoch": 0.37126177795985255, "grad_norm": 1.8223944902420044, "learning_rate": 6.293136903149278e-06, "loss": 0.2269, "step": 406000 }, { "epoch": 0.37171899689822674, "grad_norm": 1.175521969795227, "learning_rate": 6.2885605289556355e-06, "loss": 0.2235, "step": 406500 }, { "epoch": 0.37217621583660093, "grad_norm": 2.208753824234009, "learning_rate": 6.283984154761993e-06, "loss": 0.2267, "step": 407000 }, { "epoch": 0.3726334347749751, "grad_norm": 1.0367563962936401, "learning_rate": 6.279407780568349e-06, "loss": 0.2227, "step": 407500 }, { "epoch": 0.3730906537133493, "grad_norm": 1.719056487083435, "learning_rate": 6.2748314063747065e-06, "loss": 0.2217, "step": 408000 }, { "epoch": 0.3735478726517235, "grad_norm": 1.4684566259384155, "learning_rate": 6.270255032181064e-06, "loss": 0.2247, "step": 408500 }, { "epoch": 0.37400509159009776, "grad_norm": 1.1170936822891235, "learning_rate": 6.265678657987421e-06, "loss": 0.2194, "step": 409000 }, { "epoch": 0.37446231052847195, "grad_norm": 1.2900408506393433, "learning_rate": 6.261102283793778e-06, "loss": 0.2181, "step": 409500 }, { "epoch": 0.37491952946684615, "grad_norm": 1.4425394535064697, "learning_rate": 6.2565259096001356e-06, "loss": 0.2204, "step": 410000 }, { "epoch": 0.37537674840522034, "grad_norm": 1.2810927629470825, "learning_rate": 6.251949535406492e-06, "loss": 0.2189, "step": 410500 }, { "epoch": 0.37583396734359453, "grad_norm": 1.4359560012817383, "learning_rate": 6.247373161212849e-06, "loss": 0.2206, "step": 411000 }, { "epoch": 0.3762911862819687, "grad_norm": 1.2622240781784058, "learning_rate": 6.2427967870192065e-06, "loss": 0.219, "step": 411500 }, { "epoch": 0.376748405220343, "grad_norm": 1.365540862083435, "learning_rate": 6.238220412825564e-06, "loss": 0.22, "step": 412000 }, { "epoch": 0.37720562415871717, "grad_norm": 1.4828884601593018, "learning_rate": 6.233644038631921e-06, "loss": 0.2263, "step": 412500 }, { "epoch": 0.37766284309709136, "grad_norm": 1.3468049764633179, "learning_rate": 6.2290676644382775e-06, "loss": 0.2236, "step": 413000 }, { "epoch": 0.37812006203546555, "grad_norm": 1.2755866050720215, "learning_rate": 6.224491290244636e-06, "loss": 0.2258, "step": 413500 }, { "epoch": 0.37857728097383975, "grad_norm": 1.38784658908844, "learning_rate": 6.219914916050992e-06, "loss": 0.2236, "step": 414000 }, { "epoch": 0.37903449991221394, "grad_norm": 1.3065296411514282, "learning_rate": 6.215338541857349e-06, "loss": 0.2206, "step": 414500 }, { "epoch": 0.3794917188505882, "grad_norm": 1.0447067022323608, "learning_rate": 6.210762167663707e-06, "loss": 0.2177, "step": 415000 }, { "epoch": 0.3799489377889624, "grad_norm": 1.609236478805542, "learning_rate": 6.206185793470064e-06, "loss": 0.2243, "step": 415500 }, { "epoch": 0.3804061567273366, "grad_norm": 1.5531622171401978, "learning_rate": 6.201609419276421e-06, "loss": 0.221, "step": 416000 }, { "epoch": 0.38086337566571077, "grad_norm": 1.4250783920288086, "learning_rate": 6.197033045082778e-06, "loss": 0.2164, "step": 416500 }, { "epoch": 0.38132059460408496, "grad_norm": 1.1696795225143433, "learning_rate": 6.192456670889136e-06, "loss": 0.2225, "step": 417000 }, { "epoch": 0.38177781354245915, "grad_norm": 1.422655701637268, "learning_rate": 6.187880296695492e-06, "loss": 0.2223, "step": 417500 }, { "epoch": 0.3822350324808334, "grad_norm": 1.3113077878952026, "learning_rate": 6.1833039225018494e-06, "loss": 0.2195, "step": 418000 }, { "epoch": 0.3826922514192076, "grad_norm": 1.46403968334198, "learning_rate": 6.178727548308207e-06, "loss": 0.213, "step": 418500 }, { "epoch": 0.3831494703575818, "grad_norm": 1.528786540031433, "learning_rate": 6.174151174114563e-06, "loss": 0.2158, "step": 419000 }, { "epoch": 0.383606689295956, "grad_norm": 1.4497718811035156, "learning_rate": 6.169574799920921e-06, "loss": 0.2196, "step": 419500 }, { "epoch": 0.3840639082343302, "grad_norm": 1.6955440044403076, "learning_rate": 6.164998425727278e-06, "loss": 0.2227, "step": 420000 } ], "logging_steps": 500, "max_steps": 1093568, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.992271060696826e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }