{ "best_metric": null, "best_model_checkpoint": null, "epoch": 100.0, "eval_steps": 500, "global_step": 182300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.27427317608337903, "grad_norm": 2.1429903507232666, "learning_rate": 4.986286341195831e-05, "loss": 7.2532, "step": 500 }, { "epoch": 0.5485463521667581, "grad_norm": 2.0887398719787598, "learning_rate": 4.972572682391663e-05, "loss": 6.4724, "step": 1000 }, { "epoch": 0.8228195282501372, "grad_norm": 2.1614580154418945, "learning_rate": 4.958859023587493e-05, "loss": 6.0973, "step": 1500 }, { "epoch": 1.0970927043335161, "grad_norm": 2.202652931213379, "learning_rate": 4.9451453647833245e-05, "loss": 5.7872, "step": 2000 }, { "epoch": 1.3713658804168953, "grad_norm": 2.147414445877075, "learning_rate": 4.931431705979155e-05, "loss": 5.5399, "step": 2500 }, { "epoch": 1.6456390565002743, "grad_norm": 2.1624412536621094, "learning_rate": 4.917718047174987e-05, "loss": 5.3515, "step": 3000 }, { "epoch": 1.9199122325836533, "grad_norm": 2.155363082885742, "learning_rate": 4.9040043883708175e-05, "loss": 5.1762, "step": 3500 }, { "epoch": 2.1941854086670323, "grad_norm": 2.2420756816864014, "learning_rate": 4.890290729566648e-05, "loss": 5.0364, "step": 4000 }, { "epoch": 2.4684585847504112, "grad_norm": 2.4155259132385254, "learning_rate": 4.87657707076248e-05, "loss": 4.9178, "step": 4500 }, { "epoch": 2.7427317608337907, "grad_norm": 2.2149574756622314, "learning_rate": 4.8628634119583105e-05, "loss": 4.8449, "step": 5000 }, { "epoch": 3.0170049369171696, "grad_norm": 2.25925350189209, "learning_rate": 4.849149753154142e-05, "loss": 4.762, "step": 5500 }, { "epoch": 3.2912781130005486, "grad_norm": 2.315990686416626, "learning_rate": 4.835436094349973e-05, "loss": 4.6713, "step": 6000 }, { "epoch": 3.5655512890839276, "grad_norm": 2.425288677215576, "learning_rate": 4.821722435545804e-05, "loss": 4.6267, "step": 6500 }, { "epoch": 3.8398244651673066, "grad_norm": 2.3451356887817383, "learning_rate": 4.808008776741635e-05, "loss": 4.579, "step": 7000 }, { "epoch": 4.1140976412506856, "grad_norm": 2.306058645248413, "learning_rate": 4.794295117937466e-05, "loss": 4.5148, "step": 7500 }, { "epoch": 4.3883708173340645, "grad_norm": 2.3386404514312744, "learning_rate": 4.780581459133297e-05, "loss": 4.4659, "step": 8000 }, { "epoch": 4.6626439934174435, "grad_norm": 2.3117551803588867, "learning_rate": 4.766867800329128e-05, "loss": 4.4327, "step": 8500 }, { "epoch": 4.9369171695008225, "grad_norm": 2.36466908454895, "learning_rate": 4.753154141524959e-05, "loss": 4.3947, "step": 9000 }, { "epoch": 5.2111903455842015, "grad_norm": 2.348733901977539, "learning_rate": 4.73944048272079e-05, "loss": 4.3441, "step": 9500 }, { "epoch": 5.485463521667581, "grad_norm": 2.9133706092834473, "learning_rate": 4.7257268239166215e-05, "loss": 4.3025, "step": 10000 }, { "epoch": 5.75973669775096, "grad_norm": 2.6369545459747314, "learning_rate": 4.712013165112452e-05, "loss": 4.2785, "step": 10500 }, { "epoch": 6.034009873834339, "grad_norm": 2.7040719985961914, "learning_rate": 4.698299506308283e-05, "loss": 4.2431, "step": 11000 }, { "epoch": 6.308283049917718, "grad_norm": 2.7137389183044434, "learning_rate": 4.6845858475041146e-05, "loss": 4.1796, "step": 11500 }, { "epoch": 6.582556226001097, "grad_norm": 2.963534355163574, "learning_rate": 4.670872188699945e-05, "loss": 4.1432, "step": 12000 }, { "epoch": 6.856829402084476, "grad_norm": 2.8243420124053955, "learning_rate": 4.6571585298957763e-05, "loss": 4.1252, "step": 12500 }, { "epoch": 7.131102578167855, "grad_norm": 3.03757905960083, "learning_rate": 4.6434448710916076e-05, "loss": 4.0713, "step": 13000 }, { "epoch": 7.405375754251234, "grad_norm": 3.007908821105957, "learning_rate": 4.629731212287439e-05, "loss": 4.0248, "step": 13500 }, { "epoch": 7.679648930334613, "grad_norm": 2.846788167953491, "learning_rate": 4.6160175534832694e-05, "loss": 4.0099, "step": 14000 }, { "epoch": 7.953922106417992, "grad_norm": 2.961183786392212, "learning_rate": 4.6023038946791006e-05, "loss": 3.9728, "step": 14500 }, { "epoch": 8.228195282501371, "grad_norm": 3.066025733947754, "learning_rate": 4.588590235874932e-05, "loss": 3.9118, "step": 15000 }, { "epoch": 8.502468458584751, "grad_norm": 2.9394822120666504, "learning_rate": 4.5748765770707624e-05, "loss": 3.8828, "step": 15500 }, { "epoch": 8.776741634668129, "grad_norm": 3.012153387069702, "learning_rate": 4.5611629182665936e-05, "loss": 3.8832, "step": 16000 }, { "epoch": 9.051014810751509, "grad_norm": 2.899332046508789, "learning_rate": 4.547449259462425e-05, "loss": 3.8481, "step": 16500 }, { "epoch": 9.325287986834887, "grad_norm": 3.164444923400879, "learning_rate": 4.533735600658256e-05, "loss": 3.7773, "step": 17000 }, { "epoch": 9.599561162918267, "grad_norm": 3.017282009124756, "learning_rate": 4.5200219418540867e-05, "loss": 3.7689, "step": 17500 }, { "epoch": 9.873834339001645, "grad_norm": 3.367647647857666, "learning_rate": 4.506308283049918e-05, "loss": 3.752, "step": 18000 }, { "epoch": 10.148107515085025, "grad_norm": 2.9855947494506836, "learning_rate": 4.492594624245749e-05, "loss": 3.7182, "step": 18500 }, { "epoch": 10.422380691168403, "grad_norm": 3.505870819091797, "learning_rate": 4.47888096544158e-05, "loss": 3.674, "step": 19000 }, { "epoch": 10.696653867251783, "grad_norm": 3.438145160675049, "learning_rate": 4.465167306637411e-05, "loss": 3.651, "step": 19500 }, { "epoch": 10.970927043335163, "grad_norm": 3.0687413215637207, "learning_rate": 4.451453647833242e-05, "loss": 3.6479, "step": 20000 }, { "epoch": 11.24520021941854, "grad_norm": 3.2287588119506836, "learning_rate": 4.4377399890290734e-05, "loss": 3.5693, "step": 20500 }, { "epoch": 11.51947339550192, "grad_norm": 3.3848471641540527, "learning_rate": 4.424026330224904e-05, "loss": 3.5667, "step": 21000 }, { "epoch": 11.793746571585299, "grad_norm": 3.5464422702789307, "learning_rate": 4.410312671420735e-05, "loss": 3.5589, "step": 21500 }, { "epoch": 12.068019747668679, "grad_norm": 3.6160085201263428, "learning_rate": 4.3965990126165664e-05, "loss": 3.5313, "step": 22000 }, { "epoch": 12.342292923752057, "grad_norm": 3.6420817375183105, "learning_rate": 4.3828853538123976e-05, "loss": 3.4855, "step": 22500 }, { "epoch": 12.616566099835437, "grad_norm": 3.583449363708496, "learning_rate": 4.369171695008228e-05, "loss": 3.4654, "step": 23000 }, { "epoch": 12.890839275918815, "grad_norm": 3.5506091117858887, "learning_rate": 4.3554580362040594e-05, "loss": 3.4512, "step": 23500 }, { "epoch": 13.165112452002194, "grad_norm": 4.101990699768066, "learning_rate": 4.341744377399891e-05, "loss": 3.4055, "step": 24000 }, { "epoch": 13.439385628085573, "grad_norm": 4.144250392913818, "learning_rate": 4.328030718595721e-05, "loss": 3.3755, "step": 24500 }, { "epoch": 13.713658804168952, "grad_norm": 3.6288070678710938, "learning_rate": 4.3143170597915525e-05, "loss": 3.3725, "step": 25000 }, { "epoch": 13.98793198025233, "grad_norm": 3.5927882194519043, "learning_rate": 4.300603400987384e-05, "loss": 3.3693, "step": 25500 }, { "epoch": 14.26220515633571, "grad_norm": 3.405404567718506, "learning_rate": 4.286889742183215e-05, "loss": 3.2955, "step": 26000 }, { "epoch": 14.53647833241909, "grad_norm": 4.086198329925537, "learning_rate": 4.2731760833790455e-05, "loss": 3.3038, "step": 26500 }, { "epoch": 14.810751508502468, "grad_norm": 3.3961052894592285, "learning_rate": 4.259462424574877e-05, "loss": 3.2561, "step": 27000 }, { "epoch": 15.085024684585848, "grad_norm": 3.6080105304718018, "learning_rate": 4.245748765770708e-05, "loss": 3.2539, "step": 27500 }, { "epoch": 15.359297860669226, "grad_norm": 3.597956657409668, "learning_rate": 4.2320351069665385e-05, "loss": 3.2148, "step": 28000 }, { "epoch": 15.633571036752606, "grad_norm": 3.466057062149048, "learning_rate": 4.21832144816237e-05, "loss": 3.2031, "step": 28500 }, { "epoch": 15.907844212835984, "grad_norm": 4.239918231964111, "learning_rate": 4.204607789358201e-05, "loss": 3.1768, "step": 29000 }, { "epoch": 16.182117388919362, "grad_norm": 4.533541202545166, "learning_rate": 4.190894130554032e-05, "loss": 3.1258, "step": 29500 }, { "epoch": 16.456390565002742, "grad_norm": 3.8643674850463867, "learning_rate": 4.177180471749863e-05, "loss": 3.1162, "step": 30000 }, { "epoch": 16.730663741086122, "grad_norm": 3.710988998413086, "learning_rate": 4.163466812945694e-05, "loss": 3.1159, "step": 30500 }, { "epoch": 17.004936917169502, "grad_norm": 3.6405742168426514, "learning_rate": 4.149753154141525e-05, "loss": 3.1137, "step": 31000 }, { "epoch": 17.27921009325288, "grad_norm": 4.127532482147217, "learning_rate": 4.136039495337356e-05, "loss": 3.0327, "step": 31500 }, { "epoch": 17.553483269336258, "grad_norm": 3.6551403999328613, "learning_rate": 4.122325836533188e-05, "loss": 3.0611, "step": 32000 }, { "epoch": 17.827756445419638, "grad_norm": 3.6437666416168213, "learning_rate": 4.108612177729018e-05, "loss": 3.0233, "step": 32500 }, { "epoch": 18.102029621503018, "grad_norm": 4.008886814117432, "learning_rate": 4.0948985189248495e-05, "loss": 3.0099, "step": 33000 }, { "epoch": 18.376302797586398, "grad_norm": 3.779545783996582, "learning_rate": 4.08118486012068e-05, "loss": 2.9663, "step": 33500 }, { "epoch": 18.650575973669774, "grad_norm": 3.7845826148986816, "learning_rate": 4.067471201316512e-05, "loss": 2.9628, "step": 34000 }, { "epoch": 18.924849149753154, "grad_norm": 3.866852283477783, "learning_rate": 4.0537575425123425e-05, "loss": 2.9649, "step": 34500 }, { "epoch": 19.199122325836534, "grad_norm": 3.9092442989349365, "learning_rate": 4.040043883708173e-05, "loss": 2.9199, "step": 35000 }, { "epoch": 19.473395501919914, "grad_norm": 4.038732528686523, "learning_rate": 4.026330224904005e-05, "loss": 2.9078, "step": 35500 }, { "epoch": 19.74766867800329, "grad_norm": 3.717470645904541, "learning_rate": 4.0126165660998355e-05, "loss": 2.8907, "step": 36000 }, { "epoch": 20.02194185408667, "grad_norm": 3.9166011810302734, "learning_rate": 3.998902907295667e-05, "loss": 2.8892, "step": 36500 }, { "epoch": 20.29621503017005, "grad_norm": 3.9473681449890137, "learning_rate": 3.985189248491497e-05, "loss": 2.8274, "step": 37000 }, { "epoch": 20.57048820625343, "grad_norm": 4.139565467834473, "learning_rate": 3.971475589687329e-05, "loss": 2.8371, "step": 37500 }, { "epoch": 20.844761382336806, "grad_norm": 3.7124762535095215, "learning_rate": 3.95776193088316e-05, "loss": 2.8401, "step": 38000 }, { "epoch": 21.119034558420186, "grad_norm": 3.772149085998535, "learning_rate": 3.9440482720789904e-05, "loss": 2.8083, "step": 38500 }, { "epoch": 21.393307734503566, "grad_norm": 4.185425758361816, "learning_rate": 3.930334613274822e-05, "loss": 2.7767, "step": 39000 }, { "epoch": 21.667580910586945, "grad_norm": 3.800649881362915, "learning_rate": 3.916620954470653e-05, "loss": 2.7723, "step": 39500 }, { "epoch": 21.941854086670325, "grad_norm": 3.800741195678711, "learning_rate": 3.902907295666484e-05, "loss": 2.774, "step": 40000 }, { "epoch": 22.2161272627537, "grad_norm": 3.7834713459014893, "learning_rate": 3.889193636862315e-05, "loss": 2.7329, "step": 40500 }, { "epoch": 22.49040043883708, "grad_norm": 4.18643045425415, "learning_rate": 3.8754799780581465e-05, "loss": 2.7181, "step": 41000 }, { "epoch": 22.76467361492046, "grad_norm": 3.758415460586548, "learning_rate": 3.861766319253977e-05, "loss": 2.7148, "step": 41500 }, { "epoch": 23.03894679100384, "grad_norm": 4.028139114379883, "learning_rate": 3.8480526604498076e-05, "loss": 2.7025, "step": 42000 }, { "epoch": 23.313219967087218, "grad_norm": 3.779428243637085, "learning_rate": 3.8343390016456395e-05, "loss": 2.6542, "step": 42500 }, { "epoch": 23.587493143170597, "grad_norm": 3.6896047592163086, "learning_rate": 3.82062534284147e-05, "loss": 2.6662, "step": 43000 }, { "epoch": 23.861766319253977, "grad_norm": 3.527114152908325, "learning_rate": 3.806911684037301e-05, "loss": 2.6697, "step": 43500 }, { "epoch": 24.136039495337357, "grad_norm": 3.832408905029297, "learning_rate": 3.7931980252331326e-05, "loss": 2.6443, "step": 44000 }, { "epoch": 24.410312671420733, "grad_norm": 3.912022352218628, "learning_rate": 3.779484366428964e-05, "loss": 2.6098, "step": 44500 }, { "epoch": 24.684585847504113, "grad_norm": 3.832465648651123, "learning_rate": 3.7657707076247944e-05, "loss": 2.6187, "step": 45000 }, { "epoch": 24.958859023587493, "grad_norm": 4.292030334472656, "learning_rate": 3.752057048820625e-05, "loss": 2.6216, "step": 45500 }, { "epoch": 25.233132199670873, "grad_norm": 3.9441139698028564, "learning_rate": 3.738343390016457e-05, "loss": 2.5674, "step": 46000 }, { "epoch": 25.507405375754253, "grad_norm": 3.8073363304138184, "learning_rate": 3.7246297312122874e-05, "loss": 2.5613, "step": 46500 }, { "epoch": 25.78167855183763, "grad_norm": 3.95381498336792, "learning_rate": 3.7109160724081186e-05, "loss": 2.5744, "step": 47000 }, { "epoch": 26.05595172792101, "grad_norm": 4.265843391418457, "learning_rate": 3.69720241360395e-05, "loss": 2.5581, "step": 47500 }, { "epoch": 26.33022490400439, "grad_norm": 3.7907886505126953, "learning_rate": 3.683488754799781e-05, "loss": 2.5085, "step": 48000 }, { "epoch": 26.60449808008777, "grad_norm": 3.9580938816070557, "learning_rate": 3.6697750959956116e-05, "loss": 2.5244, "step": 48500 }, { "epoch": 26.878771256171145, "grad_norm": 3.725271701812744, "learning_rate": 3.656061437191443e-05, "loss": 2.5283, "step": 49000 }, { "epoch": 27.153044432254525, "grad_norm": 4.27789831161499, "learning_rate": 3.642347778387274e-05, "loss": 2.4978, "step": 49500 }, { "epoch": 27.427317608337905, "grad_norm": 4.248454570770264, "learning_rate": 3.628634119583105e-05, "loss": 2.478, "step": 50000 }, { "epoch": 27.701590784421285, "grad_norm": 3.7782256603240967, "learning_rate": 3.614920460778936e-05, "loss": 2.4797, "step": 50500 }, { "epoch": 27.97586396050466, "grad_norm": 3.996277332305908, "learning_rate": 3.601206801974767e-05, "loss": 2.4872, "step": 51000 }, { "epoch": 28.25013713658804, "grad_norm": 4.143040657043457, "learning_rate": 3.5874931431705984e-05, "loss": 2.4311, "step": 51500 }, { "epoch": 28.52441031267142, "grad_norm": 3.6849589347839355, "learning_rate": 3.573779484366429e-05, "loss": 2.4377, "step": 52000 }, { "epoch": 28.7986834887548, "grad_norm": 3.621760606765747, "learning_rate": 3.56006582556226e-05, "loss": 2.4425, "step": 52500 }, { "epoch": 29.07295666483818, "grad_norm": 3.7394306659698486, "learning_rate": 3.5463521667580914e-05, "loss": 2.4287, "step": 53000 }, { "epoch": 29.347229840921557, "grad_norm": 3.782111167907715, "learning_rate": 3.532638507953922e-05, "loss": 2.3911, "step": 53500 }, { "epoch": 29.621503017004937, "grad_norm": 4.35050106048584, "learning_rate": 3.518924849149753e-05, "loss": 2.4084, "step": 54000 }, { "epoch": 29.895776193088317, "grad_norm": 3.8727004528045654, "learning_rate": 3.5052111903455844e-05, "loss": 2.3928, "step": 54500 }, { "epoch": 30.170049369171696, "grad_norm": 3.974501371383667, "learning_rate": 3.4914975315414157e-05, "loss": 2.3668, "step": 55000 }, { "epoch": 30.444322545255073, "grad_norm": 3.7882275581359863, "learning_rate": 3.477783872737246e-05, "loss": 2.3686, "step": 55500 }, { "epoch": 30.718595721338453, "grad_norm": 3.8313581943511963, "learning_rate": 3.4640702139330774e-05, "loss": 2.3707, "step": 56000 }, { "epoch": 30.992868897421832, "grad_norm": 4.046344757080078, "learning_rate": 3.450356555128909e-05, "loss": 2.3632, "step": 56500 }, { "epoch": 31.267142073505212, "grad_norm": 3.7165708541870117, "learning_rate": 3.436642896324739e-05, "loss": 2.3229, "step": 57000 }, { "epoch": 31.54141524958859, "grad_norm": 3.8072948455810547, "learning_rate": 3.4229292375205705e-05, "loss": 2.3262, "step": 57500 }, { "epoch": 31.81568842567197, "grad_norm": 3.8423380851745605, "learning_rate": 3.409215578716402e-05, "loss": 2.3311, "step": 58000 }, { "epoch": 32.08996160175535, "grad_norm": 3.831343650817871, "learning_rate": 3.395501919912233e-05, "loss": 2.3213, "step": 58500 }, { "epoch": 32.364234777838725, "grad_norm": 3.8060476779937744, "learning_rate": 3.3817882611080635e-05, "loss": 2.2849, "step": 59000 }, { "epoch": 32.63850795392211, "grad_norm": 4.033987998962402, "learning_rate": 3.368074602303895e-05, "loss": 2.2979, "step": 59500 }, { "epoch": 32.912781130005484, "grad_norm": 3.870171546936035, "learning_rate": 3.354360943499726e-05, "loss": 2.2982, "step": 60000 }, { "epoch": 33.18705430608887, "grad_norm": 3.848620653152466, "learning_rate": 3.340647284695557e-05, "loss": 2.264, "step": 60500 }, { "epoch": 33.461327482172244, "grad_norm": 4.048386573791504, "learning_rate": 3.326933625891388e-05, "loss": 2.2568, "step": 61000 }, { "epoch": 33.73560065825562, "grad_norm": 4.029069900512695, "learning_rate": 3.313219967087219e-05, "loss": 2.2602, "step": 61500 }, { "epoch": 34.009873834339004, "grad_norm": 3.759799003601074, "learning_rate": 3.29950630828305e-05, "loss": 2.2763, "step": 62000 }, { "epoch": 34.28414701042238, "grad_norm": 4.440002918243408, "learning_rate": 3.285792649478881e-05, "loss": 2.2137, "step": 62500 }, { "epoch": 34.55842018650576, "grad_norm": 3.961390972137451, "learning_rate": 3.272078990674713e-05, "loss": 2.233, "step": 63000 }, { "epoch": 34.83269336258914, "grad_norm": 3.916156768798828, "learning_rate": 3.258365331870543e-05, "loss": 2.2414, "step": 63500 }, { "epoch": 35.106966538672516, "grad_norm": 3.746572732925415, "learning_rate": 3.2446516730663745e-05, "loss": 2.2137, "step": 64000 }, { "epoch": 35.3812397147559, "grad_norm": 3.8424971103668213, "learning_rate": 3.230938014262205e-05, "loss": 2.1907, "step": 64500 }, { "epoch": 35.655512890839276, "grad_norm": 4.250007152557373, "learning_rate": 3.217224355458036e-05, "loss": 2.2104, "step": 65000 }, { "epoch": 35.92978606692265, "grad_norm": 3.760779857635498, "learning_rate": 3.2035106966538675e-05, "loss": 2.2134, "step": 65500 }, { "epoch": 36.204059243006036, "grad_norm": 4.189092636108398, "learning_rate": 3.189797037849698e-05, "loss": 2.1714, "step": 66000 }, { "epoch": 36.47833241908941, "grad_norm": 3.6505000591278076, "learning_rate": 3.17608337904553e-05, "loss": 2.1635, "step": 66500 }, { "epoch": 36.752605595172795, "grad_norm": 3.7696096897125244, "learning_rate": 3.1623697202413605e-05, "loss": 2.1832, "step": 67000 }, { "epoch": 37.02687877125617, "grad_norm": 3.744976282119751, "learning_rate": 3.148656061437192e-05, "loss": 2.1753, "step": 67500 }, { "epoch": 37.30115194733955, "grad_norm": 3.9759116172790527, "learning_rate": 3.134942402633022e-05, "loss": 2.1326, "step": 68000 }, { "epoch": 37.57542512342293, "grad_norm": 3.719237804412842, "learning_rate": 3.1212287438288536e-05, "loss": 2.1423, "step": 68500 }, { "epoch": 37.84969829950631, "grad_norm": 4.290117263793945, "learning_rate": 3.107515085024685e-05, "loss": 2.1602, "step": 69000 }, { "epoch": 38.123971475589684, "grad_norm": 4.060342311859131, "learning_rate": 3.0938014262205153e-05, "loss": 2.1349, "step": 69500 }, { "epoch": 38.39824465167307, "grad_norm": 4.048706531524658, "learning_rate": 3.080087767416347e-05, "loss": 2.1018, "step": 70000 }, { "epoch": 38.672517827756444, "grad_norm": 4.130014896392822, "learning_rate": 3.066374108612178e-05, "loss": 2.1284, "step": 70500 }, { "epoch": 38.94679100383983, "grad_norm": 3.893848419189453, "learning_rate": 3.052660449808009e-05, "loss": 2.1255, "step": 71000 }, { "epoch": 39.221064179923204, "grad_norm": 3.6254563331604004, "learning_rate": 3.03894679100384e-05, "loss": 2.0848, "step": 71500 }, { "epoch": 39.49533735600658, "grad_norm": 4.06374979019165, "learning_rate": 3.025233132199671e-05, "loss": 2.0933, "step": 72000 }, { "epoch": 39.76961053208996, "grad_norm": 3.763274908065796, "learning_rate": 3.011519473395502e-05, "loss": 2.11, "step": 72500 }, { "epoch": 40.04388370817334, "grad_norm": 4.014530181884766, "learning_rate": 2.997805814591333e-05, "loss": 2.0918, "step": 73000 }, { "epoch": 40.31815688425672, "grad_norm": 3.6787962913513184, "learning_rate": 2.9840921557871642e-05, "loss": 2.0559, "step": 73500 }, { "epoch": 40.5924300603401, "grad_norm": 3.752711057662964, "learning_rate": 2.970378496982995e-05, "loss": 2.067, "step": 74000 }, { "epoch": 40.866703236423476, "grad_norm": 3.795217752456665, "learning_rate": 2.9566648381788263e-05, "loss": 2.0792, "step": 74500 }, { "epoch": 41.14097641250686, "grad_norm": 3.7484569549560547, "learning_rate": 2.9429511793746572e-05, "loss": 2.0608, "step": 75000 }, { "epoch": 41.415249588590235, "grad_norm": 3.601229190826416, "learning_rate": 2.9292375205704885e-05, "loss": 2.0337, "step": 75500 }, { "epoch": 41.68952276467361, "grad_norm": 3.9707863330841064, "learning_rate": 2.9155238617663194e-05, "loss": 2.0426, "step": 76000 }, { "epoch": 41.963795940756995, "grad_norm": 3.9523677825927734, "learning_rate": 2.9018102029621502e-05, "loss": 2.0571, "step": 76500 }, { "epoch": 42.23806911684037, "grad_norm": 4.046602725982666, "learning_rate": 2.8880965441579815e-05, "loss": 2.0225, "step": 77000 }, { "epoch": 42.512342292923755, "grad_norm": 4.059443950653076, "learning_rate": 2.8743828853538124e-05, "loss": 2.0185, "step": 77500 }, { "epoch": 42.78661546900713, "grad_norm": 4.066934108734131, "learning_rate": 2.8606692265496436e-05, "loss": 2.0342, "step": 78000 }, { "epoch": 43.06088864509051, "grad_norm": 3.811591386795044, "learning_rate": 2.8469555677454745e-05, "loss": 2.0216, "step": 78500 }, { "epoch": 43.33516182117389, "grad_norm": 3.979374408721924, "learning_rate": 2.833241908941306e-05, "loss": 1.989, "step": 79000 }, { "epoch": 43.60943499725727, "grad_norm": 3.67275333404541, "learning_rate": 2.8195282501371366e-05, "loss": 1.9958, "step": 79500 }, { "epoch": 43.88370817334065, "grad_norm": 3.790217399597168, "learning_rate": 2.8058145913329675e-05, "loss": 2.0102, "step": 80000 }, { "epoch": 44.15798134942403, "grad_norm": 3.9934496879577637, "learning_rate": 2.7921009325287988e-05, "loss": 1.9847, "step": 80500 }, { "epoch": 44.4322545255074, "grad_norm": 4.339521408081055, "learning_rate": 2.7783872737246297e-05, "loss": 1.9752, "step": 81000 }, { "epoch": 44.70652770159079, "grad_norm": 3.5851519107818604, "learning_rate": 2.764673614920461e-05, "loss": 1.9883, "step": 81500 }, { "epoch": 44.98080087767416, "grad_norm": 4.129305362701416, "learning_rate": 2.7509599561162918e-05, "loss": 1.9896, "step": 82000 }, { "epoch": 45.25507405375754, "grad_norm": 3.752852201461792, "learning_rate": 2.7372462973121234e-05, "loss": 1.9465, "step": 82500 }, { "epoch": 45.52934722984092, "grad_norm": 3.923309087753296, "learning_rate": 2.723532638507954e-05, "loss": 1.9589, "step": 83000 }, { "epoch": 45.8036204059243, "grad_norm": 4.141747951507568, "learning_rate": 2.7098189797037848e-05, "loss": 1.9662, "step": 83500 }, { "epoch": 46.07789358200768, "grad_norm": 4.118216514587402, "learning_rate": 2.696105320899616e-05, "loss": 1.9518, "step": 84000 }, { "epoch": 46.35216675809106, "grad_norm": 4.061371326446533, "learning_rate": 2.682391662095447e-05, "loss": 1.928, "step": 84500 }, { "epoch": 46.626439934174435, "grad_norm": 4.138849258422852, "learning_rate": 2.6686780032912785e-05, "loss": 1.9456, "step": 85000 }, { "epoch": 46.90071311025782, "grad_norm": 3.9675650596618652, "learning_rate": 2.654964344487109e-05, "loss": 1.9465, "step": 85500 }, { "epoch": 47.174986286341195, "grad_norm": 3.745779514312744, "learning_rate": 2.6412506856829406e-05, "loss": 1.9293, "step": 86000 }, { "epoch": 47.44925946242458, "grad_norm": 3.6988871097564697, "learning_rate": 2.6275370268787712e-05, "loss": 1.915, "step": 86500 }, { "epoch": 47.723532638507955, "grad_norm": 3.7044730186462402, "learning_rate": 2.613823368074602e-05, "loss": 1.9199, "step": 87000 }, { "epoch": 47.99780581459133, "grad_norm": 3.6700057983398438, "learning_rate": 2.6001097092704337e-05, "loss": 1.9243, "step": 87500 }, { "epoch": 48.272078990674714, "grad_norm": 3.89973521232605, "learning_rate": 2.5863960504662642e-05, "loss": 1.8846, "step": 88000 }, { "epoch": 48.54635216675809, "grad_norm": 4.041015625, "learning_rate": 2.5726823916620958e-05, "loss": 1.8999, "step": 88500 }, { "epoch": 48.82062534284147, "grad_norm": 3.7937917709350586, "learning_rate": 2.5589687328579264e-05, "loss": 1.9085, "step": 89000 }, { "epoch": 49.09489851892485, "grad_norm": 4.050382614135742, "learning_rate": 2.545255074053758e-05, "loss": 1.8934, "step": 89500 }, { "epoch": 49.36917169500823, "grad_norm": 3.809558391571045, "learning_rate": 2.5315414152495888e-05, "loss": 1.8705, "step": 90000 }, { "epoch": 49.64344487109161, "grad_norm": 3.6460201740264893, "learning_rate": 2.51782775644542e-05, "loss": 1.8904, "step": 90500 }, { "epoch": 49.917718047174986, "grad_norm": 3.959718704223633, "learning_rate": 2.504114097641251e-05, "loss": 1.8936, "step": 91000 }, { "epoch": 50.19199122325836, "grad_norm": 3.786888837814331, "learning_rate": 2.490400438837082e-05, "loss": 1.8683, "step": 91500 }, { "epoch": 50.466264399341746, "grad_norm": 3.477952241897583, "learning_rate": 2.476686780032913e-05, "loss": 1.8634, "step": 92000 }, { "epoch": 50.74053757542512, "grad_norm": 3.998764991760254, "learning_rate": 2.462973121228744e-05, "loss": 1.8637, "step": 92500 }, { "epoch": 51.014810751508506, "grad_norm": 4.029101848602295, "learning_rate": 2.449259462424575e-05, "loss": 1.8749, "step": 93000 }, { "epoch": 51.28908392759188, "grad_norm": 3.8711071014404297, "learning_rate": 2.435545803620406e-05, "loss": 1.8377, "step": 93500 }, { "epoch": 51.56335710367526, "grad_norm": 3.922783136367798, "learning_rate": 2.421832144816237e-05, "loss": 1.8397, "step": 94000 }, { "epoch": 51.83763027975864, "grad_norm": 4.025134086608887, "learning_rate": 2.4081184860120682e-05, "loss": 1.8609, "step": 94500 }, { "epoch": 52.11190345584202, "grad_norm": 3.800508975982666, "learning_rate": 2.394404827207899e-05, "loss": 1.8407, "step": 95000 }, { "epoch": 52.386176631925395, "grad_norm": 3.944465160369873, "learning_rate": 2.3806911684037304e-05, "loss": 1.8263, "step": 95500 }, { "epoch": 52.66044980800878, "grad_norm": 4.014648914337158, "learning_rate": 2.3669775095995613e-05, "loss": 1.8374, "step": 96000 }, { "epoch": 52.934722984092154, "grad_norm": 4.08259916305542, "learning_rate": 2.353263850795392e-05, "loss": 1.8448, "step": 96500 }, { "epoch": 53.20899616017554, "grad_norm": 3.941981792449951, "learning_rate": 2.3395501919912234e-05, "loss": 1.8131, "step": 97000 }, { "epoch": 53.483269336258914, "grad_norm": 3.8573715686798096, "learning_rate": 2.3258365331870543e-05, "loss": 1.8169, "step": 97500 }, { "epoch": 53.75754251234229, "grad_norm": 3.987938165664673, "learning_rate": 2.3121228743828855e-05, "loss": 1.8145, "step": 98000 }, { "epoch": 54.031815688425674, "grad_norm": 3.652238607406616, "learning_rate": 2.2984092155787164e-05, "loss": 1.8213, "step": 98500 }, { "epoch": 54.30608886450905, "grad_norm": 3.640587568283081, "learning_rate": 2.2846955567745476e-05, "loss": 1.7938, "step": 99000 }, { "epoch": 54.58036204059243, "grad_norm": 3.884443759918213, "learning_rate": 2.2709818979703785e-05, "loss": 1.7963, "step": 99500 }, { "epoch": 54.85463521667581, "grad_norm": 4.245452880859375, "learning_rate": 2.2572682391662098e-05, "loss": 1.811, "step": 100000 }, { "epoch": 55.128908392759186, "grad_norm": 3.97247576713562, "learning_rate": 2.2435545803620407e-05, "loss": 1.7964, "step": 100500 }, { "epoch": 55.40318156884257, "grad_norm": 3.8827009201049805, "learning_rate": 2.2298409215578716e-05, "loss": 1.7774, "step": 101000 }, { "epoch": 55.677454744925946, "grad_norm": 4.079446792602539, "learning_rate": 2.2161272627537028e-05, "loss": 1.7884, "step": 101500 }, { "epoch": 55.95172792100932, "grad_norm": 4.093244552612305, "learning_rate": 2.2024136039495337e-05, "loss": 1.7904, "step": 102000 }, { "epoch": 56.226001097092706, "grad_norm": 3.674686908721924, "learning_rate": 2.188699945145365e-05, "loss": 1.7655, "step": 102500 }, { "epoch": 56.50027427317608, "grad_norm": 4.042862892150879, "learning_rate": 2.174986286341196e-05, "loss": 1.7708, "step": 103000 }, { "epoch": 56.774547449259465, "grad_norm": 4.069617748260498, "learning_rate": 2.161272627537027e-05, "loss": 1.777, "step": 103500 }, { "epoch": 57.04882062534284, "grad_norm": 3.68752384185791, "learning_rate": 2.1475589687328583e-05, "loss": 1.7702, "step": 104000 }, { "epoch": 57.32309380142622, "grad_norm": 3.998215436935425, "learning_rate": 2.133845309928689e-05, "loss": 1.7508, "step": 104500 }, { "epoch": 57.5973669775096, "grad_norm": 4.300554275512695, "learning_rate": 2.12013165112452e-05, "loss": 1.7634, "step": 105000 }, { "epoch": 57.87164015359298, "grad_norm": 3.649411678314209, "learning_rate": 2.106417992320351e-05, "loss": 1.7603, "step": 105500 }, { "epoch": 58.14591332967636, "grad_norm": 4.17492151260376, "learning_rate": 2.0927043335161822e-05, "loss": 1.7441, "step": 106000 }, { "epoch": 58.42018650575974, "grad_norm": 3.8550057411193848, "learning_rate": 2.0789906747120134e-05, "loss": 1.7381, "step": 106500 }, { "epoch": 58.694459681843114, "grad_norm": 4.004961967468262, "learning_rate": 2.0652770159078443e-05, "loss": 1.7475, "step": 107000 }, { "epoch": 58.9687328579265, "grad_norm": 3.936483144760132, "learning_rate": 2.0515633571036756e-05, "loss": 1.7532, "step": 107500 }, { "epoch": 59.24300603400987, "grad_norm": 3.812488317489624, "learning_rate": 2.037849698299506e-05, "loss": 1.7232, "step": 108000 }, { "epoch": 59.51727921009325, "grad_norm": 4.185512542724609, "learning_rate": 2.0241360394953374e-05, "loss": 1.7196, "step": 108500 }, { "epoch": 59.79155238617663, "grad_norm": 4.278858184814453, "learning_rate": 2.0104223806911686e-05, "loss": 1.7356, "step": 109000 }, { "epoch": 60.06582556226001, "grad_norm": 4.104213714599609, "learning_rate": 1.9967087218869995e-05, "loss": 1.7328, "step": 109500 }, { "epoch": 60.34009873834339, "grad_norm": 4.215428352355957, "learning_rate": 1.9829950630828307e-05, "loss": 1.7109, "step": 110000 }, { "epoch": 60.61437191442677, "grad_norm": 4.020122528076172, "learning_rate": 1.9692814042786616e-05, "loss": 1.7144, "step": 110500 }, { "epoch": 60.888645090510146, "grad_norm": 3.9703729152679443, "learning_rate": 1.955567745474493e-05, "loss": 1.7335, "step": 111000 }, { "epoch": 61.16291826659353, "grad_norm": 3.900017023086548, "learning_rate": 1.9418540866703238e-05, "loss": 1.7048, "step": 111500 }, { "epoch": 61.437191442676905, "grad_norm": 4.7137627601623535, "learning_rate": 1.9281404278661547e-05, "loss": 1.6902, "step": 112000 }, { "epoch": 61.71146461876029, "grad_norm": 4.035908222198486, "learning_rate": 1.914426769061986e-05, "loss": 1.7104, "step": 112500 }, { "epoch": 61.985737794843665, "grad_norm": 4.041805744171143, "learning_rate": 1.9007131102578168e-05, "loss": 1.7155, "step": 113000 }, { "epoch": 62.26001097092704, "grad_norm": 3.817702054977417, "learning_rate": 1.886999451453648e-05, "loss": 1.6851, "step": 113500 }, { "epoch": 62.534284147010425, "grad_norm": 3.8696234226226807, "learning_rate": 1.873285792649479e-05, "loss": 1.6857, "step": 114000 }, { "epoch": 62.8085573230938, "grad_norm": 3.909179925918579, "learning_rate": 1.85957213384531e-05, "loss": 1.7037, "step": 114500 }, { "epoch": 63.08283049917718, "grad_norm": 3.8557326793670654, "learning_rate": 1.845858475041141e-05, "loss": 1.6936, "step": 115000 }, { "epoch": 63.35710367526056, "grad_norm": 4.332828044891357, "learning_rate": 1.832144816236972e-05, "loss": 1.6786, "step": 115500 }, { "epoch": 63.63137685134394, "grad_norm": 4.454130172729492, "learning_rate": 1.818431157432803e-05, "loss": 1.6768, "step": 116000 }, { "epoch": 63.90565002742732, "grad_norm": 3.943071126937866, "learning_rate": 1.804717498628634e-05, "loss": 1.6905, "step": 116500 }, { "epoch": 64.1799232035107, "grad_norm": 4.255739688873291, "learning_rate": 1.7910038398244653e-05, "loss": 1.663, "step": 117000 }, { "epoch": 64.45419637959408, "grad_norm": 4.027384281158447, "learning_rate": 1.7772901810202962e-05, "loss": 1.6603, "step": 117500 }, { "epoch": 64.72846955567745, "grad_norm": 3.8232147693634033, "learning_rate": 1.7635765222161274e-05, "loss": 1.6701, "step": 118000 }, { "epoch": 65.00274273176083, "grad_norm": 3.83734130859375, "learning_rate": 1.7498628634119583e-05, "loss": 1.6797, "step": 118500 }, { "epoch": 65.27701590784422, "grad_norm": 3.9775922298431396, "learning_rate": 1.7361492046077896e-05, "loss": 1.6441, "step": 119000 }, { "epoch": 65.55128908392759, "grad_norm": 3.810086250305176, "learning_rate": 1.7224355458036205e-05, "loss": 1.6534, "step": 119500 }, { "epoch": 65.82556226001097, "grad_norm": 4.3292036056518555, "learning_rate": 1.7087218869994513e-05, "loss": 1.6651, "step": 120000 }, { "epoch": 66.09983543609435, "grad_norm": 4.046462535858154, "learning_rate": 1.6950082281952826e-05, "loss": 1.654, "step": 120500 }, { "epoch": 66.37410861217774, "grad_norm": 4.200257778167725, "learning_rate": 1.6812945693911135e-05, "loss": 1.6383, "step": 121000 }, { "epoch": 66.6483817882611, "grad_norm": 3.98045015335083, "learning_rate": 1.6675809105869447e-05, "loss": 1.6571, "step": 121500 }, { "epoch": 66.92265496434449, "grad_norm": 3.9323537349700928, "learning_rate": 1.653867251782776e-05, "loss": 1.6486, "step": 122000 }, { "epoch": 67.19692814042787, "grad_norm": 3.898150682449341, "learning_rate": 1.640153592978607e-05, "loss": 1.6288, "step": 122500 }, { "epoch": 67.47120131651124, "grad_norm": 3.8490869998931885, "learning_rate": 1.6264399341744377e-05, "loss": 1.6305, "step": 123000 }, { "epoch": 67.74547449259462, "grad_norm": 4.125833034515381, "learning_rate": 1.6127262753702686e-05, "loss": 1.6393, "step": 123500 }, { "epoch": 68.01974766867801, "grad_norm": 4.1837286949157715, "learning_rate": 1.5990126165661e-05, "loss": 1.6441, "step": 124000 }, { "epoch": 68.29402084476138, "grad_norm": 4.150059700012207, "learning_rate": 1.585298957761931e-05, "loss": 1.6144, "step": 124500 }, { "epoch": 68.56829402084476, "grad_norm": 4.325094223022461, "learning_rate": 1.571585298957762e-05, "loss": 1.6254, "step": 125000 }, { "epoch": 68.84256719692814, "grad_norm": 3.9832139015197754, "learning_rate": 1.5578716401535932e-05, "loss": 1.636, "step": 125500 }, { "epoch": 69.11684037301151, "grad_norm": 3.9516079425811768, "learning_rate": 1.544157981349424e-05, "loss": 1.6183, "step": 126000 }, { "epoch": 69.3911135490949, "grad_norm": 3.982802391052246, "learning_rate": 1.5304443225452554e-05, "loss": 1.6116, "step": 126500 }, { "epoch": 69.66538672517828, "grad_norm": 4.178645610809326, "learning_rate": 1.516730663741086e-05, "loss": 1.6183, "step": 127000 }, { "epoch": 69.93965990126166, "grad_norm": 4.045616149902344, "learning_rate": 1.5030170049369171e-05, "loss": 1.6226, "step": 127500 }, { "epoch": 70.21393307734503, "grad_norm": 4.098151206970215, "learning_rate": 1.4893033461327482e-05, "loss": 1.5999, "step": 128000 }, { "epoch": 70.48820625342842, "grad_norm": 4.052021026611328, "learning_rate": 1.4755896873285793e-05, "loss": 1.5946, "step": 128500 }, { "epoch": 70.7624794295118, "grad_norm": 3.7580652236938477, "learning_rate": 1.4618760285244103e-05, "loss": 1.6172, "step": 129000 }, { "epoch": 71.03675260559517, "grad_norm": 4.1435866355896, "learning_rate": 1.4481623697202416e-05, "loss": 1.6121, "step": 129500 }, { "epoch": 71.31102578167855, "grad_norm": 3.739654302597046, "learning_rate": 1.4344487109160726e-05, "loss": 1.588, "step": 130000 }, { "epoch": 71.58529895776194, "grad_norm": 4.380291938781738, "learning_rate": 1.4207350521119034e-05, "loss": 1.5998, "step": 130500 }, { "epoch": 71.8595721338453, "grad_norm": 3.7885782718658447, "learning_rate": 1.4070213933077344e-05, "loss": 1.5958, "step": 131000 }, { "epoch": 72.13384530992869, "grad_norm": 4.236293792724609, "learning_rate": 1.3933077345035655e-05, "loss": 1.5956, "step": 131500 }, { "epoch": 72.40811848601207, "grad_norm": 4.205173015594482, "learning_rate": 1.3795940756993966e-05, "loss": 1.5766, "step": 132000 }, { "epoch": 72.68239166209544, "grad_norm": 4.034268379211426, "learning_rate": 1.3658804168952278e-05, "loss": 1.5912, "step": 132500 }, { "epoch": 72.95666483817882, "grad_norm": 3.9170260429382324, "learning_rate": 1.3521667580910589e-05, "loss": 1.5897, "step": 133000 }, { "epoch": 73.23093801426221, "grad_norm": 3.925799608230591, "learning_rate": 1.33845309928689e-05, "loss": 1.5765, "step": 133500 }, { "epoch": 73.50521119034559, "grad_norm": 4.052227973937988, "learning_rate": 1.324739440482721e-05, "loss": 1.5775, "step": 134000 }, { "epoch": 73.77948436642896, "grad_norm": 4.2378034591674805, "learning_rate": 1.3110257816785517e-05, "loss": 1.5834, "step": 134500 }, { "epoch": 74.05375754251234, "grad_norm": 4.073320388793945, "learning_rate": 1.297312122874383e-05, "loss": 1.5812, "step": 135000 }, { "epoch": 74.32803071859573, "grad_norm": 4.102873802185059, "learning_rate": 1.283598464070214e-05, "loss": 1.5588, "step": 135500 }, { "epoch": 74.6023038946791, "grad_norm": 4.223252773284912, "learning_rate": 1.269884805266045e-05, "loss": 1.5785, "step": 136000 }, { "epoch": 74.87657707076248, "grad_norm": 4.320130825042725, "learning_rate": 1.2561711464618761e-05, "loss": 1.5686, "step": 136500 }, { "epoch": 75.15085024684586, "grad_norm": 4.706448078155518, "learning_rate": 1.2424574876577072e-05, "loss": 1.5688, "step": 137000 }, { "epoch": 75.42512342292923, "grad_norm": 4.024387359619141, "learning_rate": 1.2287438288535381e-05, "loss": 1.5563, "step": 137500 }, { "epoch": 75.69939659901262, "grad_norm": 3.9221880435943604, "learning_rate": 1.2150301700493692e-05, "loss": 1.5712, "step": 138000 }, { "epoch": 75.973669775096, "grad_norm": 4.27291202545166, "learning_rate": 1.2013165112452002e-05, "loss": 1.5654, "step": 138500 }, { "epoch": 76.24794295117937, "grad_norm": 4.373564720153809, "learning_rate": 1.1876028524410313e-05, "loss": 1.5449, "step": 139000 }, { "epoch": 76.52221612726275, "grad_norm": 4.030310153961182, "learning_rate": 1.1738891936368624e-05, "loss": 1.5571, "step": 139500 }, { "epoch": 76.79648930334614, "grad_norm": 4.002580165863037, "learning_rate": 1.1601755348326934e-05, "loss": 1.5568, "step": 140000 }, { "epoch": 77.07076247942952, "grad_norm": 4.0623369216918945, "learning_rate": 1.1464618760285245e-05, "loss": 1.5545, "step": 140500 }, { "epoch": 77.34503565551289, "grad_norm": 4.049304485321045, "learning_rate": 1.1327482172243554e-05, "loss": 1.5442, "step": 141000 }, { "epoch": 77.61930883159627, "grad_norm": 3.891969680786133, "learning_rate": 1.1190345584201866e-05, "loss": 1.5464, "step": 141500 }, { "epoch": 77.89358200767965, "grad_norm": 4.165316104888916, "learning_rate": 1.1053208996160177e-05, "loss": 1.5509, "step": 142000 }, { "epoch": 78.16785518376302, "grad_norm": 4.1472249031066895, "learning_rate": 1.0916072408118487e-05, "loss": 1.5375, "step": 142500 }, { "epoch": 78.44212835984641, "grad_norm": 4.173414707183838, "learning_rate": 1.0778935820076796e-05, "loss": 1.5345, "step": 143000 }, { "epoch": 78.71640153592979, "grad_norm": 3.9279398918151855, "learning_rate": 1.0641799232035107e-05, "loss": 1.5381, "step": 143500 }, { "epoch": 78.99067471201316, "grad_norm": 4.222446441650391, "learning_rate": 1.0504662643993418e-05, "loss": 1.552, "step": 144000 }, { "epoch": 79.26494788809654, "grad_norm": 3.8020248413085938, "learning_rate": 1.0367526055951728e-05, "loss": 1.5217, "step": 144500 }, { "epoch": 79.53922106417993, "grad_norm": 3.8790934085845947, "learning_rate": 1.0230389467910039e-05, "loss": 1.5346, "step": 145000 }, { "epoch": 79.8134942402633, "grad_norm": 4.49297571182251, "learning_rate": 1.009325287986835e-05, "loss": 1.5354, "step": 145500 }, { "epoch": 80.08776741634668, "grad_norm": 4.024161338806152, "learning_rate": 9.95611629182666e-06, "loss": 1.5326, "step": 146000 }, { "epoch": 80.36204059243006, "grad_norm": 3.997326374053955, "learning_rate": 9.818979703784971e-06, "loss": 1.5133, "step": 146500 }, { "epoch": 80.63631376851345, "grad_norm": 4.163906574249268, "learning_rate": 9.68184311574328e-06, "loss": 1.5252, "step": 147000 }, { "epoch": 80.91058694459682, "grad_norm": 4.333358287811279, "learning_rate": 9.54470652770159e-06, "loss": 1.5356, "step": 147500 }, { "epoch": 81.1848601206802, "grad_norm": 4.201995372772217, "learning_rate": 9.407569939659903e-06, "loss": 1.5191, "step": 148000 }, { "epoch": 81.45913329676358, "grad_norm": 3.8196020126342773, "learning_rate": 9.270433351618212e-06, "loss": 1.5168, "step": 148500 }, { "epoch": 81.73340647284695, "grad_norm": 4.32403039932251, "learning_rate": 9.133296763576522e-06, "loss": 1.5214, "step": 149000 }, { "epoch": 82.00767964893033, "grad_norm": 4.165477752685547, "learning_rate": 8.996160175534833e-06, "loss": 1.5242, "step": 149500 }, { "epoch": 82.28195282501372, "grad_norm": 4.155007362365723, "learning_rate": 8.859023587493144e-06, "loss": 1.5022, "step": 150000 }, { "epoch": 82.55622600109709, "grad_norm": 4.049638748168945, "learning_rate": 8.721886999451453e-06, "loss": 1.5172, "step": 150500 }, { "epoch": 82.83049917718047, "grad_norm": 4.376342296600342, "learning_rate": 8.584750411409765e-06, "loss": 1.5097, "step": 151000 }, { "epoch": 83.10477235326385, "grad_norm": 4.413540363311768, "learning_rate": 8.447613823368076e-06, "loss": 1.5163, "step": 151500 }, { "epoch": 83.37904552934722, "grad_norm": 3.754805326461792, "learning_rate": 8.310477235326386e-06, "loss": 1.5011, "step": 152000 }, { "epoch": 83.6533187054306, "grad_norm": 4.167300224304199, "learning_rate": 8.173340647284695e-06, "loss": 1.5026, "step": 152500 }, { "epoch": 83.92759188151399, "grad_norm": 4.1614861488342285, "learning_rate": 8.036204059243006e-06, "loss": 1.5101, "step": 153000 }, { "epoch": 84.20186505759737, "grad_norm": 4.183162212371826, "learning_rate": 7.899067471201317e-06, "loss": 1.4981, "step": 153500 }, { "epoch": 84.47613823368074, "grad_norm": 4.0559539794921875, "learning_rate": 7.761930883159627e-06, "loss": 1.4965, "step": 154000 }, { "epoch": 84.75041140976413, "grad_norm": 4.252512454986572, "learning_rate": 7.624794295117937e-06, "loss": 1.5019, "step": 154500 }, { "epoch": 85.02468458584751, "grad_norm": 3.9115328788757324, "learning_rate": 7.4876577070762485e-06, "loss": 1.5138, "step": 155000 }, { "epoch": 85.29895776193088, "grad_norm": 4.217545032501221, "learning_rate": 7.350521119034559e-06, "loss": 1.492, "step": 155500 }, { "epoch": 85.57323093801426, "grad_norm": 3.974954128265381, "learning_rate": 7.21338453099287e-06, "loss": 1.4897, "step": 156000 }, { "epoch": 85.84750411409765, "grad_norm": 4.266519069671631, "learning_rate": 7.07624794295118e-06, "loss": 1.4996, "step": 156500 }, { "epoch": 86.12177729018102, "grad_norm": 3.8507697582244873, "learning_rate": 6.93911135490949e-06, "loss": 1.4891, "step": 157000 }, { "epoch": 86.3960504662644, "grad_norm": 4.050006866455078, "learning_rate": 6.801974766867801e-06, "loss": 1.4848, "step": 157500 }, { "epoch": 86.67032364234778, "grad_norm": 4.006500720977783, "learning_rate": 6.664838178826111e-06, "loss": 1.4946, "step": 158000 }, { "epoch": 86.94459681843115, "grad_norm": 4.2527289390563965, "learning_rate": 6.527701590784421e-06, "loss": 1.495, "step": 158500 }, { "epoch": 87.21886999451453, "grad_norm": 4.087696552276611, "learning_rate": 6.390565002742732e-06, "loss": 1.4834, "step": 159000 }, { "epoch": 87.49314317059792, "grad_norm": 3.9683475494384766, "learning_rate": 6.253428414701043e-06, "loss": 1.48, "step": 159500 }, { "epoch": 87.7674163466813, "grad_norm": 4.009182453155518, "learning_rate": 6.116291826659353e-06, "loss": 1.4799, "step": 160000 }, { "epoch": 88.04168952276467, "grad_norm": 3.9172310829162598, "learning_rate": 5.979155238617663e-06, "loss": 1.4827, "step": 160500 }, { "epoch": 88.31596269884805, "grad_norm": 3.920940399169922, "learning_rate": 5.842018650575974e-06, "loss": 1.4772, "step": 161000 }, { "epoch": 88.59023587493144, "grad_norm": 4.178516387939453, "learning_rate": 5.704882062534284e-06, "loss": 1.4831, "step": 161500 }, { "epoch": 88.8645090510148, "grad_norm": 4.068806171417236, "learning_rate": 5.567745474492595e-06, "loss": 1.4796, "step": 162000 }, { "epoch": 89.13878222709819, "grad_norm": 3.923023223876953, "learning_rate": 5.430608886450905e-06, "loss": 1.4734, "step": 162500 }, { "epoch": 89.41305540318157, "grad_norm": 4.0538411140441895, "learning_rate": 5.293472298409216e-06, "loss": 1.4675, "step": 163000 }, { "epoch": 89.68732857926494, "grad_norm": 4.289505481719971, "learning_rate": 5.156335710367526e-06, "loss": 1.4812, "step": 163500 }, { "epoch": 89.96160175534833, "grad_norm": 4.2184247970581055, "learning_rate": 5.019199122325837e-06, "loss": 1.4761, "step": 164000 }, { "epoch": 90.23587493143171, "grad_norm": 4.014777183532715, "learning_rate": 4.8820625342841474e-06, "loss": 1.4659, "step": 164500 }, { "epoch": 90.51014810751508, "grad_norm": 4.025433540344238, "learning_rate": 4.744925946242457e-06, "loss": 1.4701, "step": 165000 }, { "epoch": 90.78442128359846, "grad_norm": 4.117000102996826, "learning_rate": 4.607789358200768e-06, "loss": 1.4745, "step": 165500 }, { "epoch": 91.05869445968185, "grad_norm": 4.047626495361328, "learning_rate": 4.4706527701590785e-06, "loss": 1.4693, "step": 166000 }, { "epoch": 91.33296763576523, "grad_norm": 4.17887020111084, "learning_rate": 4.333516182117389e-06, "loss": 1.4624, "step": 166500 }, { "epoch": 91.6072408118486, "grad_norm": 4.2437639236450195, "learning_rate": 4.196379594075699e-06, "loss": 1.4665, "step": 167000 }, { "epoch": 91.88151398793198, "grad_norm": 3.7711315155029297, "learning_rate": 4.0592430060340105e-06, "loss": 1.4731, "step": 167500 }, { "epoch": 92.15578716401536, "grad_norm": 4.002791404724121, "learning_rate": 3.92210641799232e-06, "loss": 1.4642, "step": 168000 }, { "epoch": 92.43006034009873, "grad_norm": 4.0743231773376465, "learning_rate": 3.7849698299506313e-06, "loss": 1.4584, "step": 168500 }, { "epoch": 92.70433351618212, "grad_norm": 4.080685138702393, "learning_rate": 3.647833241908941e-06, "loss": 1.4623, "step": 169000 }, { "epoch": 92.9786066922655, "grad_norm": 4.304593563079834, "learning_rate": 3.510696653867252e-06, "loss": 1.4682, "step": 169500 }, { "epoch": 93.25287986834887, "grad_norm": 4.447428226470947, "learning_rate": 3.3735600658255624e-06, "loss": 1.4535, "step": 170000 }, { "epoch": 93.52715304443225, "grad_norm": 4.22756814956665, "learning_rate": 3.236423477783873e-06, "loss": 1.4592, "step": 170500 }, { "epoch": 93.80142622051564, "grad_norm": 4.293380260467529, "learning_rate": 3.0992868897421833e-06, "loss": 1.4632, "step": 171000 }, { "epoch": 94.075699396599, "grad_norm": 4.07041072845459, "learning_rate": 2.962150301700494e-06, "loss": 1.4605, "step": 171500 }, { "epoch": 94.34997257268239, "grad_norm": 4.039161205291748, "learning_rate": 2.825013713658804e-06, "loss": 1.4551, "step": 172000 }, { "epoch": 94.62424574876577, "grad_norm": 4.1246795654296875, "learning_rate": 2.687877125617115e-06, "loss": 1.456, "step": 172500 }, { "epoch": 94.89851892484916, "grad_norm": 4.026761054992676, "learning_rate": 2.550740537575425e-06, "loss": 1.4512, "step": 173000 }, { "epoch": 95.17279210093253, "grad_norm": 4.5864715576171875, "learning_rate": 2.4136039495337357e-06, "loss": 1.4575, "step": 173500 }, { "epoch": 95.44706527701591, "grad_norm": 4.117992401123047, "learning_rate": 2.2764673614920463e-06, "loss": 1.4475, "step": 174000 }, { "epoch": 95.72133845309929, "grad_norm": 4.155096530914307, "learning_rate": 2.1393307734503565e-06, "loss": 1.4581, "step": 174500 }, { "epoch": 95.99561162918266, "grad_norm": 4.28767204284668, "learning_rate": 2.002194185408667e-06, "loss": 1.4521, "step": 175000 }, { "epoch": 96.26988480526605, "grad_norm": 4.1511077880859375, "learning_rate": 1.8650575973669776e-06, "loss": 1.4488, "step": 175500 }, { "epoch": 96.54415798134943, "grad_norm": 4.336985111236572, "learning_rate": 1.727921009325288e-06, "loss": 1.4534, "step": 176000 }, { "epoch": 96.8184311574328, "grad_norm": 4.181045055389404, "learning_rate": 1.5907844212835987e-06, "loss": 1.45, "step": 176500 }, { "epoch": 97.09270433351618, "grad_norm": 4.217624187469482, "learning_rate": 1.453647833241909e-06, "loss": 1.4498, "step": 177000 }, { "epoch": 97.36697750959956, "grad_norm": 3.8873023986816406, "learning_rate": 1.3165112452002194e-06, "loss": 1.4507, "step": 177500 }, { "epoch": 97.64125068568293, "grad_norm": 4.3951191902160645, "learning_rate": 1.17937465715853e-06, "loss": 1.451, "step": 178000 }, { "epoch": 97.91552386176632, "grad_norm": 4.1204118728637695, "learning_rate": 1.0422380691168404e-06, "loss": 1.4484, "step": 178500 }, { "epoch": 98.1897970378497, "grad_norm": 4.278495788574219, "learning_rate": 9.05101481075151e-07, "loss": 1.4431, "step": 179000 }, { "epoch": 98.46407021393308, "grad_norm": 4.186399459838867, "learning_rate": 7.679648930334613e-07, "loss": 1.4493, "step": 179500 }, { "epoch": 98.73834339001645, "grad_norm": 4.110637187957764, "learning_rate": 6.308283049917719e-07, "loss": 1.4418, "step": 180000 }, { "epoch": 99.01261656609984, "grad_norm": 3.9559993743896484, "learning_rate": 4.936917169500823e-07, "loss": 1.4407, "step": 180500 }, { "epoch": 99.28688974218322, "grad_norm": 4.4722418785095215, "learning_rate": 3.565551289083928e-07, "loss": 1.4421, "step": 181000 }, { "epoch": 99.56116291826659, "grad_norm": 4.151792526245117, "learning_rate": 2.1941854086670326e-07, "loss": 1.4469, "step": 181500 }, { "epoch": 99.83543609434997, "grad_norm": 4.128389835357666, "learning_rate": 8.228195282501371e-08, "loss": 1.4463, "step": 182000 }, { "epoch": 100.0, "step": 182300, "total_flos": 3.157662139522744e+17, "train_loss": 2.261507166926833, "train_runtime": 62730.7046, "train_samples_per_second": 185.96, "train_steps_per_second": 2.906 } ], "logging_steps": 500, "max_steps": 182300, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.157662139522744e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }