{ "best_metric": 1.3924267292022705, "best_model_checkpoint": "miner_id_24/checkpoint-400", "epoch": 0.2705444707473791, "eval_steps": 50, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006763611768684477, "grad_norm": 0.11604132503271103, "learning_rate": 1e-05, "loss": 1.3722, "step": 1 }, { "epoch": 0.0006763611768684477, "eval_loss": 1.602122187614441, "eval_runtime": 58.3957, "eval_samples_per_second": 42.64, "eval_steps_per_second": 10.669, "step": 1 }, { "epoch": 0.0013527223537368955, "grad_norm": 0.1288396269083023, "learning_rate": 2e-05, "loss": 1.5532, "step": 2 }, { "epoch": 0.002029083530605343, "grad_norm": 0.1327093541622162, "learning_rate": 3e-05, "loss": 1.616, "step": 3 }, { "epoch": 0.002705444707473791, "grad_norm": 0.11062159389257431, "learning_rate": 4e-05, "loss": 1.467, "step": 4 }, { "epoch": 0.0033818058843422386, "grad_norm": 0.10954708606004715, "learning_rate": 5e-05, "loss": 1.4682, "step": 5 }, { "epoch": 0.004058167061210686, "grad_norm": 0.11840442568063736, "learning_rate": 6e-05, "loss": 1.5285, "step": 6 }, { "epoch": 0.0047345282380791345, "grad_norm": 0.11326661705970764, "learning_rate": 7e-05, "loss": 1.4841, "step": 7 }, { "epoch": 0.005410889414947582, "grad_norm": 0.10721534490585327, "learning_rate": 8e-05, "loss": 1.4474, "step": 8 }, { "epoch": 0.00608725059181603, "grad_norm": 0.10227765887975693, "learning_rate": 9e-05, "loss": 1.4565, "step": 9 }, { "epoch": 0.006763611768684477, "grad_norm": 0.09144428372383118, "learning_rate": 0.0001, "loss": 1.5008, "step": 10 }, { "epoch": 0.0074399729455529254, "grad_norm": 0.09755543619394302, "learning_rate": 9.99983777858264e-05, "loss": 1.6286, "step": 11 }, { "epoch": 0.008116334122421373, "grad_norm": 0.08014149963855743, "learning_rate": 9.999351124856874e-05, "loss": 1.5582, "step": 12 }, { "epoch": 0.00879269529928982, "grad_norm": 0.07479062676429749, "learning_rate": 9.998540070400966e-05, "loss": 1.5557, "step": 13 }, { "epoch": 0.009469056476158269, "grad_norm": 0.07556532323360443, "learning_rate": 9.997404667843075e-05, "loss": 1.5746, "step": 14 }, { "epoch": 0.010145417653026716, "grad_norm": 0.07913000881671906, "learning_rate": 9.995944990857849e-05, "loss": 1.4655, "step": 15 }, { "epoch": 0.010821778829895164, "grad_norm": 0.081476129591465, "learning_rate": 9.994161134161634e-05, "loss": 1.5327, "step": 16 }, { "epoch": 0.011498140006763611, "grad_norm": 0.07700520753860474, "learning_rate": 9.992053213506334e-05, "loss": 1.5462, "step": 17 }, { "epoch": 0.01217450118363206, "grad_norm": 0.08761648833751678, "learning_rate": 9.989621365671902e-05, "loss": 1.5837, "step": 18 }, { "epoch": 0.012850862360500507, "grad_norm": 0.10584254562854767, "learning_rate": 9.986865748457457e-05, "loss": 1.5232, "step": 19 }, { "epoch": 0.013527223537368955, "grad_norm": 0.08562006056308746, "learning_rate": 9.983786540671051e-05, "loss": 1.5188, "step": 20 }, { "epoch": 0.014203584714237404, "grad_norm": 0.08462590724229813, "learning_rate": 9.980383942118066e-05, "loss": 1.5743, "step": 21 }, { "epoch": 0.014879945891105851, "grad_norm": 0.07892792671918869, "learning_rate": 9.976658173588244e-05, "loss": 1.5606, "step": 22 }, { "epoch": 0.015556307067974298, "grad_norm": 0.06926483660936356, "learning_rate": 9.972609476841367e-05, "loss": 1.429, "step": 23 }, { "epoch": 0.016232668244842745, "grad_norm": 0.06770279258489609, "learning_rate": 9.968238114591566e-05, "loss": 1.564, "step": 24 }, { "epoch": 0.016909029421711193, "grad_norm": 0.06699114292860031, "learning_rate": 9.96354437049027e-05, "loss": 1.5362, "step": 25 }, { "epoch": 0.01758539059857964, "grad_norm": 0.0729045495390892, "learning_rate": 9.95852854910781e-05, "loss": 1.5552, "step": 26 }, { "epoch": 0.01826175177544809, "grad_norm": 0.07210293412208557, "learning_rate": 9.953190975913647e-05, "loss": 1.4896, "step": 27 }, { "epoch": 0.018938112952316538, "grad_norm": 0.06548850983381271, "learning_rate": 9.947531997255256e-05, "loss": 1.5583, "step": 28 }, { "epoch": 0.019614474129184985, "grad_norm": 0.06891623139381409, "learning_rate": 9.941551980335652e-05, "loss": 1.5435, "step": 29 }, { "epoch": 0.020290835306053433, "grad_norm": 0.06692694127559662, "learning_rate": 9.935251313189564e-05, "loss": 1.5053, "step": 30 }, { "epoch": 0.02096719648292188, "grad_norm": 0.07309979945421219, "learning_rate": 9.928630404658255e-05, "loss": 1.559, "step": 31 }, { "epoch": 0.021643557659790327, "grad_norm": 0.0623411126434803, "learning_rate": 9.921689684362989e-05, "loss": 1.4759, "step": 32 }, { "epoch": 0.022319918836658775, "grad_norm": 0.06196431815624237, "learning_rate": 9.914429602677162e-05, "loss": 1.436, "step": 33 }, { "epoch": 0.022996280013527222, "grad_norm": 0.06310637295246124, "learning_rate": 9.906850630697068e-05, "loss": 1.5004, "step": 34 }, { "epoch": 0.023672641190395673, "grad_norm": 0.06335233151912689, "learning_rate": 9.898953260211338e-05, "loss": 1.5498, "step": 35 }, { "epoch": 0.02434900236726412, "grad_norm": 0.06549704074859619, "learning_rate": 9.890738003669029e-05, "loss": 1.5757, "step": 36 }, { "epoch": 0.025025363544132567, "grad_norm": 0.06362990289926529, "learning_rate": 9.882205394146361e-05, "loss": 1.4404, "step": 37 }, { "epoch": 0.025701724721001015, "grad_norm": 0.065706267952919, "learning_rate": 9.87335598531214e-05, "loss": 1.5211, "step": 38 }, { "epoch": 0.026378085897869462, "grad_norm": 0.06316651403903961, "learning_rate": 9.864190351391822e-05, "loss": 1.4632, "step": 39 }, { "epoch": 0.02705444707473791, "grad_norm": 0.06697884202003479, "learning_rate": 9.85470908713026e-05, "loss": 1.5391, "step": 40 }, { "epoch": 0.027730808251606356, "grad_norm": 0.06642621010541916, "learning_rate": 9.844912807753104e-05, "loss": 1.4436, "step": 41 }, { "epoch": 0.028407169428474807, "grad_norm": 0.06622406840324402, "learning_rate": 9.834802148926882e-05, "loss": 1.5092, "step": 42 }, { "epoch": 0.029083530605343254, "grad_norm": 0.06932581961154938, "learning_rate": 9.824377766717759e-05, "loss": 1.4794, "step": 43 }, { "epoch": 0.029759891782211702, "grad_norm": 0.07035700976848602, "learning_rate": 9.813640337548954e-05, "loss": 1.3602, "step": 44 }, { "epoch": 0.03043625295908015, "grad_norm": 0.07323595881462097, "learning_rate": 9.802590558156862e-05, "loss": 1.5098, "step": 45 }, { "epoch": 0.031112614135948596, "grad_norm": 0.07331836223602295, "learning_rate": 9.791229145545831e-05, "loss": 1.4027, "step": 46 }, { "epoch": 0.03178897531281705, "grad_norm": 0.07488737255334854, "learning_rate": 9.779556836941645e-05, "loss": 1.4362, "step": 47 }, { "epoch": 0.03246533648968549, "grad_norm": 0.08538374304771423, "learning_rate": 9.767574389743682e-05, "loss": 1.4314, "step": 48 }, { "epoch": 0.03314169766655394, "grad_norm": 0.10251971334218979, "learning_rate": 9.755282581475769e-05, "loss": 1.4597, "step": 49 }, { "epoch": 0.033818058843422386, "grad_norm": 0.1345338076353073, "learning_rate": 9.742682209735727e-05, "loss": 1.4462, "step": 50 }, { "epoch": 0.033818058843422386, "eval_loss": 1.463585615158081, "eval_runtime": 58.5692, "eval_samples_per_second": 42.514, "eval_steps_per_second": 10.637, "step": 50 }, { "epoch": 0.034494420020290836, "grad_norm": 0.08610611408948898, "learning_rate": 9.729774092143627e-05, "loss": 1.243, "step": 51 }, { "epoch": 0.03517078119715928, "grad_norm": 0.09492066502571106, "learning_rate": 9.716559066288715e-05, "loss": 1.3992, "step": 52 }, { "epoch": 0.03584714237402773, "grad_norm": 0.08210044354200363, "learning_rate": 9.703037989675087e-05, "loss": 1.452, "step": 53 }, { "epoch": 0.03652350355089618, "grad_norm": 0.07704884558916092, "learning_rate": 9.689211739666023e-05, "loss": 1.321, "step": 54 }, { "epoch": 0.037199864727764625, "grad_norm": 0.07068493217229843, "learning_rate": 9.675081213427076e-05, "loss": 1.2485, "step": 55 }, { "epoch": 0.037876225904633076, "grad_norm": 0.07206089049577713, "learning_rate": 9.66064732786784e-05, "loss": 1.3866, "step": 56 }, { "epoch": 0.03855258708150152, "grad_norm": 0.06557848304510117, "learning_rate": 9.645911019582467e-05, "loss": 1.3046, "step": 57 }, { "epoch": 0.03922894825836997, "grad_norm": 0.0679512619972229, "learning_rate": 9.630873244788883e-05, "loss": 1.3663, "step": 58 }, { "epoch": 0.039905309435238415, "grad_norm": 0.06276343762874603, "learning_rate": 9.615534979266745e-05, "loss": 1.3611, "step": 59 }, { "epoch": 0.040581670612106865, "grad_norm": 0.06208965554833412, "learning_rate": 9.599897218294122e-05, "loss": 1.4392, "step": 60 }, { "epoch": 0.041258031788975316, "grad_norm": 0.05605030059814453, "learning_rate": 9.583960976582913e-05, "loss": 1.4283, "step": 61 }, { "epoch": 0.04193439296584376, "grad_norm": 0.0576203390955925, "learning_rate": 9.567727288213005e-05, "loss": 1.4846, "step": 62 }, { "epoch": 0.04261075414271221, "grad_norm": 0.056024856865406036, "learning_rate": 9.551197206565173e-05, "loss": 1.473, "step": 63 }, { "epoch": 0.043287115319580655, "grad_norm": 0.05627163499593735, "learning_rate": 9.534371804252728e-05, "loss": 1.4328, "step": 64 }, { "epoch": 0.043963476496449105, "grad_norm": 0.0586436428129673, "learning_rate": 9.517252173051911e-05, "loss": 1.493, "step": 65 }, { "epoch": 0.04463983767331755, "grad_norm": 0.058163754642009735, "learning_rate": 9.49983942383106e-05, "loss": 1.3742, "step": 66 }, { "epoch": 0.045316198850186, "grad_norm": 0.058487631380558014, "learning_rate": 9.482134686478519e-05, "loss": 1.4339, "step": 67 }, { "epoch": 0.045992560027054444, "grad_norm": 0.061149582266807556, "learning_rate": 9.464139109829321e-05, "loss": 1.3955, "step": 68 }, { "epoch": 0.046668921203922895, "grad_norm": 0.05813620612025261, "learning_rate": 9.445853861590647e-05, "loss": 1.3931, "step": 69 }, { "epoch": 0.047345282380791345, "grad_norm": 0.05958769470453262, "learning_rate": 9.42728012826605e-05, "loss": 1.4106, "step": 70 }, { "epoch": 0.04802164355765979, "grad_norm": 0.05971216782927513, "learning_rate": 9.408419115078471e-05, "loss": 1.4899, "step": 71 }, { "epoch": 0.04869800473452824, "grad_norm": 0.061666082590818405, "learning_rate": 9.389272045892024e-05, "loss": 1.4908, "step": 72 }, { "epoch": 0.049374365911396684, "grad_norm": 0.05941418558359146, "learning_rate": 9.36984016313259e-05, "loss": 1.4982, "step": 73 }, { "epoch": 0.050050727088265135, "grad_norm": 0.05943742021918297, "learning_rate": 9.350124727707197e-05, "loss": 1.4998, "step": 74 }, { "epoch": 0.05072708826513358, "grad_norm": 0.06307440251111984, "learning_rate": 9.330127018922194e-05, "loss": 1.387, "step": 75 }, { "epoch": 0.05140344944200203, "grad_norm": 0.05909983813762665, "learning_rate": 9.309848334400246e-05, "loss": 1.3985, "step": 76 }, { "epoch": 0.05207981061887048, "grad_norm": 0.061729796230793, "learning_rate": 9.289289989996133e-05, "loss": 1.4094, "step": 77 }, { "epoch": 0.052756171795738924, "grad_norm": 0.06409716606140137, "learning_rate": 9.268453319711363e-05, "loss": 1.4705, "step": 78 }, { "epoch": 0.053432532972607374, "grad_norm": 0.06077890843153, "learning_rate": 9.247339675607605e-05, "loss": 1.5073, "step": 79 }, { "epoch": 0.05410889414947582, "grad_norm": 0.06479129940271378, "learning_rate": 9.225950427718975e-05, "loss": 1.3732, "step": 80 }, { "epoch": 0.05478525532634427, "grad_norm": 0.06292920559644699, "learning_rate": 9.204286963963111e-05, "loss": 1.4566, "step": 81 }, { "epoch": 0.05546161650321271, "grad_norm": 0.06173981726169586, "learning_rate": 9.182350690051133e-05, "loss": 1.4232, "step": 82 }, { "epoch": 0.056137977680081164, "grad_norm": 0.06200195476412773, "learning_rate": 9.160143029396422e-05, "loss": 1.4592, "step": 83 }, { "epoch": 0.056814338856949614, "grad_norm": 0.0646771639585495, "learning_rate": 9.13766542302225e-05, "loss": 1.3876, "step": 84 }, { "epoch": 0.05749070003381806, "grad_norm": 0.06770598888397217, "learning_rate": 9.114919329468282e-05, "loss": 1.4498, "step": 85 }, { "epoch": 0.05816706121068651, "grad_norm": 0.06534601747989655, "learning_rate": 9.091906224695935e-05, "loss": 1.4767, "step": 86 }, { "epoch": 0.05884342238755495, "grad_norm": 0.06706451624631882, "learning_rate": 9.068627601992598e-05, "loss": 1.4541, "step": 87 }, { "epoch": 0.059519783564423404, "grad_norm": 0.06629147380590439, "learning_rate": 9.045084971874738e-05, "loss": 1.4449, "step": 88 }, { "epoch": 0.06019614474129185, "grad_norm": 0.07131273299455643, "learning_rate": 9.021279861989885e-05, "loss": 1.5306, "step": 89 }, { "epoch": 0.0608725059181603, "grad_norm": 0.07197068631649017, "learning_rate": 8.997213817017507e-05, "loss": 1.4657, "step": 90 }, { "epoch": 0.06154886709502874, "grad_norm": 0.07346373796463013, "learning_rate": 8.972888398568772e-05, "loss": 1.4437, "step": 91 }, { "epoch": 0.06222522827189719, "grad_norm": 0.0742584615945816, "learning_rate": 8.948305185085225e-05, "loss": 1.3633, "step": 92 }, { "epoch": 0.06290158944876564, "grad_norm": 0.07299409061670303, "learning_rate": 8.92346577173636e-05, "loss": 1.458, "step": 93 }, { "epoch": 0.0635779506256341, "grad_norm": 0.07521813362836838, "learning_rate": 8.898371770316111e-05, "loss": 1.5101, "step": 94 }, { "epoch": 0.06425431180250253, "grad_norm": 0.0773283913731575, "learning_rate": 8.873024809138272e-05, "loss": 1.4126, "step": 95 }, { "epoch": 0.06493067297937098, "grad_norm": 0.07289938628673553, "learning_rate": 8.847426532930831e-05, "loss": 1.4009, "step": 96 }, { "epoch": 0.06560703415623943, "grad_norm": 0.08150746673345566, "learning_rate": 8.821578602729242e-05, "loss": 1.4199, "step": 97 }, { "epoch": 0.06628339533310788, "grad_norm": 0.08864708989858627, "learning_rate": 8.795482695768658e-05, "loss": 1.3391, "step": 98 }, { "epoch": 0.06695975650997633, "grad_norm": 0.09325551986694336, "learning_rate": 8.769140505375085e-05, "loss": 1.4052, "step": 99 }, { "epoch": 0.06763611768684477, "grad_norm": 0.16029119491577148, "learning_rate": 8.742553740855506e-05, "loss": 1.5922, "step": 100 }, { "epoch": 0.06763611768684477, "eval_loss": 1.4291542768478394, "eval_runtime": 58.6048, "eval_samples_per_second": 42.488, "eval_steps_per_second": 10.631, "step": 100 }, { "epoch": 0.06831247886371322, "grad_norm": 0.0940331295132637, "learning_rate": 8.715724127386972e-05, "loss": 1.4529, "step": 101 }, { "epoch": 0.06898884004058167, "grad_norm": 0.08114993572235107, "learning_rate": 8.688653405904652e-05, "loss": 1.4573, "step": 102 }, { "epoch": 0.06966520121745012, "grad_norm": 0.07999607920646667, "learning_rate": 8.661343332988869e-05, "loss": 1.4589, "step": 103 }, { "epoch": 0.07034156239431856, "grad_norm": 0.0692276656627655, "learning_rate": 8.633795680751116e-05, "loss": 1.3457, "step": 104 }, { "epoch": 0.07101792357118701, "grad_norm": 0.07577517628669739, "learning_rate": 8.606012236719073e-05, "loss": 1.4778, "step": 105 }, { "epoch": 0.07169428474805546, "grad_norm": 0.06558407098054886, "learning_rate": 8.577994803720606e-05, "loss": 1.3133, "step": 106 }, { "epoch": 0.07237064592492391, "grad_norm": 0.060306984931230545, "learning_rate": 8.549745199766792e-05, "loss": 1.2479, "step": 107 }, { "epoch": 0.07304700710179236, "grad_norm": 0.059541743248701096, "learning_rate": 8.521265257933948e-05, "loss": 1.2768, "step": 108 }, { "epoch": 0.0737233682786608, "grad_norm": 0.06409274786710739, "learning_rate": 8.492556826244687e-05, "loss": 1.3834, "step": 109 }, { "epoch": 0.07439972945552925, "grad_norm": 0.0610245019197464, "learning_rate": 8.463621767547998e-05, "loss": 1.4205, "step": 110 }, { "epoch": 0.0750760906323977, "grad_norm": 0.05981508269906044, "learning_rate": 8.434461959398376e-05, "loss": 1.4558, "step": 111 }, { "epoch": 0.07575245180926615, "grad_norm": 0.05959734693169594, "learning_rate": 8.405079293933986e-05, "loss": 1.3477, "step": 112 }, { "epoch": 0.07642881298613459, "grad_norm": 0.06075485795736313, "learning_rate": 8.375475677753881e-05, "loss": 1.4209, "step": 113 }, { "epoch": 0.07710517416300304, "grad_norm": 0.06361914426088333, "learning_rate": 8.345653031794292e-05, "loss": 1.4434, "step": 114 }, { "epoch": 0.07778153533987149, "grad_norm": 0.06411296874284744, "learning_rate": 8.315613291203976e-05, "loss": 1.4421, "step": 115 }, { "epoch": 0.07845789651673994, "grad_norm": 0.06421005725860596, "learning_rate": 8.285358405218655e-05, "loss": 1.4707, "step": 116 }, { "epoch": 0.07913425769360839, "grad_norm": 0.06386066973209381, "learning_rate": 8.25489033703452e-05, "loss": 1.4649, "step": 117 }, { "epoch": 0.07981061887047683, "grad_norm": 0.06911002844572067, "learning_rate": 8.224211063680853e-05, "loss": 1.4346, "step": 118 }, { "epoch": 0.08048698004734528, "grad_norm": 0.06442833691835403, "learning_rate": 8.19332257589174e-05, "loss": 1.4159, "step": 119 }, { "epoch": 0.08116334122421373, "grad_norm": 0.06367367506027222, "learning_rate": 8.162226877976887e-05, "loss": 1.4468, "step": 120 }, { "epoch": 0.08183970240108218, "grad_norm": 0.06462843716144562, "learning_rate": 8.130925987691569e-05, "loss": 1.4516, "step": 121 }, { "epoch": 0.08251606357795063, "grad_norm": 0.06435953825712204, "learning_rate": 8.099421936105702e-05, "loss": 1.4225, "step": 122 }, { "epoch": 0.08319242475481907, "grad_norm": 0.06690596044063568, "learning_rate": 8.067716767472045e-05, "loss": 1.4001, "step": 123 }, { "epoch": 0.08386878593168752, "grad_norm": 0.06788211315870285, "learning_rate": 8.035812539093557e-05, "loss": 1.4972, "step": 124 }, { "epoch": 0.08454514710855597, "grad_norm": 0.07012511044740677, "learning_rate": 8.003711321189895e-05, "loss": 1.5512, "step": 125 }, { "epoch": 0.08522150828542442, "grad_norm": 0.0646083876490593, "learning_rate": 7.971415196763088e-05, "loss": 1.3902, "step": 126 }, { "epoch": 0.08589786946229286, "grad_norm": 0.0692325159907341, "learning_rate": 7.938926261462366e-05, "loss": 1.4749, "step": 127 }, { "epoch": 0.08657423063916131, "grad_norm": 0.06617575138807297, "learning_rate": 7.906246623448183e-05, "loss": 1.445, "step": 128 }, { "epoch": 0.08725059181602976, "grad_norm": 0.06672117859125137, "learning_rate": 7.873378403255419e-05, "loss": 1.3787, "step": 129 }, { "epoch": 0.08792695299289821, "grad_norm": 0.06823927909135818, "learning_rate": 7.840323733655778e-05, "loss": 1.4454, "step": 130 }, { "epoch": 0.08860331416976666, "grad_norm": 0.06965020298957825, "learning_rate": 7.807084759519405e-05, "loss": 1.4482, "step": 131 }, { "epoch": 0.0892796753466351, "grad_norm": 0.06927543133497238, "learning_rate": 7.773663637675694e-05, "loss": 1.4027, "step": 132 }, { "epoch": 0.08995603652350355, "grad_norm": 0.07070069015026093, "learning_rate": 7.740062536773352e-05, "loss": 1.5726, "step": 133 }, { "epoch": 0.090632397700372, "grad_norm": 0.07122541964054108, "learning_rate": 7.706283637139658e-05, "loss": 1.4538, "step": 134 }, { "epoch": 0.09130875887724045, "grad_norm": 0.0677112340927124, "learning_rate": 7.672329130639005e-05, "loss": 1.4222, "step": 135 }, { "epoch": 0.09198512005410889, "grad_norm": 0.07415631413459778, "learning_rate": 7.638201220530665e-05, "loss": 1.4891, "step": 136 }, { "epoch": 0.09266148123097734, "grad_norm": 0.07131144404411316, "learning_rate": 7.603902121325813e-05, "loss": 1.393, "step": 137 }, { "epoch": 0.09333784240784579, "grad_norm": 0.07452413439750671, "learning_rate": 7.569434058643844e-05, "loss": 1.4818, "step": 138 }, { "epoch": 0.09401420358471424, "grad_norm": 0.0726298838853836, "learning_rate": 7.534799269067953e-05, "loss": 1.3627, "step": 139 }, { "epoch": 0.09469056476158269, "grad_norm": 0.07628554105758667, "learning_rate": 7.500000000000001e-05, "loss": 1.3693, "step": 140 }, { "epoch": 0.09536692593845113, "grad_norm": 0.07212232053279877, "learning_rate": 7.465038509514688e-05, "loss": 1.338, "step": 141 }, { "epoch": 0.09604328711531958, "grad_norm": 0.07615429162979126, "learning_rate": 7.42991706621303e-05, "loss": 1.3357, "step": 142 }, { "epoch": 0.09671964829218803, "grad_norm": 0.08060069382190704, "learning_rate": 7.394637949075154e-05, "loss": 1.448, "step": 143 }, { "epoch": 0.09739600946905648, "grad_norm": 0.07998302578926086, "learning_rate": 7.35920344731241e-05, "loss": 1.46, "step": 144 }, { "epoch": 0.09807237064592493, "grad_norm": 0.0826953873038292, "learning_rate": 7.323615860218843e-05, "loss": 1.3485, "step": 145 }, { "epoch": 0.09874873182279337, "grad_norm": 0.08640425652265549, "learning_rate": 7.287877497021978e-05, "loss": 1.4754, "step": 146 }, { "epoch": 0.09942509299966182, "grad_norm": 0.0944252610206604, "learning_rate": 7.251990676732984e-05, "loss": 1.4108, "step": 147 }, { "epoch": 0.10010145417653027, "grad_norm": 0.09773879498243332, "learning_rate": 7.215957727996207e-05, "loss": 1.3524, "step": 148 }, { "epoch": 0.10077781535339872, "grad_norm": 0.10976506024599075, "learning_rate": 7.179780988938051e-05, "loss": 1.3647, "step": 149 }, { "epoch": 0.10145417653026716, "grad_norm": 0.17261682450771332, "learning_rate": 7.143462807015271e-05, "loss": 1.3822, "step": 150 }, { "epoch": 0.10145417653026716, "eval_loss": 1.4144548177719116, "eval_runtime": 58.5001, "eval_samples_per_second": 42.564, "eval_steps_per_second": 10.65, "step": 150 }, { "epoch": 0.10213053770713561, "grad_norm": 0.07265003770589828, "learning_rate": 7.107005538862646e-05, "loss": 1.2632, "step": 151 }, { "epoch": 0.10280689888400406, "grad_norm": 0.080800361931324, "learning_rate": 7.07041155014006e-05, "loss": 1.4302, "step": 152 }, { "epoch": 0.10348326006087251, "grad_norm": 0.07225494086742401, "learning_rate": 7.033683215379002e-05, "loss": 1.3344, "step": 153 }, { "epoch": 0.10415962123774096, "grad_norm": 0.0720028281211853, "learning_rate": 6.996822917828477e-05, "loss": 1.3406, "step": 154 }, { "epoch": 0.1048359824146094, "grad_norm": 0.06982496380805969, "learning_rate": 6.959833049300377e-05, "loss": 1.2951, "step": 155 }, { "epoch": 0.10551234359147785, "grad_norm": 0.0698671042919159, "learning_rate": 6.922716010014255e-05, "loss": 1.3759, "step": 156 }, { "epoch": 0.1061887047683463, "grad_norm": 0.06916831433773041, "learning_rate": 6.885474208441603e-05, "loss": 1.3699, "step": 157 }, { "epoch": 0.10686506594521475, "grad_norm": 0.06982008367776871, "learning_rate": 6.848110061149556e-05, "loss": 1.3838, "step": 158 }, { "epoch": 0.10754142712208319, "grad_norm": 0.06897356361150742, "learning_rate": 6.810625992644085e-05, "loss": 1.3406, "step": 159 }, { "epoch": 0.10821778829895164, "grad_norm": 0.06280040740966797, "learning_rate": 6.773024435212678e-05, "loss": 1.3497, "step": 160 }, { "epoch": 0.10889414947582009, "grad_norm": 0.06423673778772354, "learning_rate": 6.735307828766515e-05, "loss": 1.3508, "step": 161 }, { "epoch": 0.10957051065268854, "grad_norm": 0.06756166368722916, "learning_rate": 6.697478620682137e-05, "loss": 1.4382, "step": 162 }, { "epoch": 0.11024687182955699, "grad_norm": 0.06480084359645844, "learning_rate": 6.659539265642643e-05, "loss": 1.3933, "step": 163 }, { "epoch": 0.11092323300642543, "grad_norm": 0.06544915586709976, "learning_rate": 6.621492225478414e-05, "loss": 1.4351, "step": 164 }, { "epoch": 0.11159959418329388, "grad_norm": 0.06527773290872574, "learning_rate": 6.583339969007363e-05, "loss": 1.4121, "step": 165 }, { "epoch": 0.11227595536016233, "grad_norm": 0.06609731167554855, "learning_rate": 6.545084971874738e-05, "loss": 1.438, "step": 166 }, { "epoch": 0.11295231653703078, "grad_norm": 0.06664299219846725, "learning_rate": 6.506729716392481e-05, "loss": 1.5263, "step": 167 }, { "epoch": 0.11362867771389923, "grad_norm": 0.0701015368103981, "learning_rate": 6.468276691378155e-05, "loss": 1.4672, "step": 168 }, { "epoch": 0.11430503889076767, "grad_norm": 0.07150130718946457, "learning_rate": 6.429728391993446e-05, "loss": 1.4679, "step": 169 }, { "epoch": 0.11498140006763612, "grad_norm": 0.06744568049907684, "learning_rate": 6.391087319582264e-05, "loss": 1.4669, "step": 170 }, { "epoch": 0.11565776124450457, "grad_norm": 0.06979771703481674, "learning_rate": 6.35235598150842e-05, "loss": 1.4848, "step": 171 }, { "epoch": 0.11633412242137302, "grad_norm": 0.06854456663131714, "learning_rate": 6.313536890992935e-05, "loss": 1.4543, "step": 172 }, { "epoch": 0.11701048359824145, "grad_norm": 0.06629548966884613, "learning_rate": 6.274632566950967e-05, "loss": 1.4109, "step": 173 }, { "epoch": 0.1176868447751099, "grad_norm": 0.07091697305440903, "learning_rate": 6.235645533828349e-05, "loss": 1.3707, "step": 174 }, { "epoch": 0.11836320595197836, "grad_norm": 0.07082553952932358, "learning_rate": 6.19657832143779e-05, "loss": 1.3993, "step": 175 }, { "epoch": 0.11903956712884681, "grad_norm": 0.07125814259052277, "learning_rate": 6.157433464794716e-05, "loss": 1.4445, "step": 176 }, { "epoch": 0.11971592830571526, "grad_norm": 0.07349928468465805, "learning_rate": 6.118213503952779e-05, "loss": 1.4615, "step": 177 }, { "epoch": 0.1203922894825837, "grad_norm": 0.07393936812877655, "learning_rate": 6.078920983839031e-05, "loss": 1.4923, "step": 178 }, { "epoch": 0.12106865065945215, "grad_norm": 0.07127617299556732, "learning_rate": 6.0395584540887963e-05, "loss": 1.4849, "step": 179 }, { "epoch": 0.1217450118363206, "grad_norm": 0.0763372927904129, "learning_rate": 6.0001284688802226e-05, "loss": 1.5, "step": 180 }, { "epoch": 0.12242137301318905, "grad_norm": 0.07156749814748764, "learning_rate": 5.960633586768543e-05, "loss": 1.3908, "step": 181 }, { "epoch": 0.12309773419005748, "grad_norm": 0.07486128807067871, "learning_rate": 5.921076370520058e-05, "loss": 1.4367, "step": 182 }, { "epoch": 0.12377409536692593, "grad_norm": 0.0703478455543518, "learning_rate": 5.8814593869458455e-05, "loss": 1.3708, "step": 183 }, { "epoch": 0.12445045654379439, "grad_norm": 0.07301351428031921, "learning_rate": 5.841785206735192e-05, "loss": 1.4418, "step": 184 }, { "epoch": 0.12512681772066284, "grad_norm": 0.07492207735776901, "learning_rate": 5.8020564042888015e-05, "loss": 1.4297, "step": 185 }, { "epoch": 0.1258031788975313, "grad_norm": 0.07811646908521652, "learning_rate": 5.762275557551727e-05, "loss": 1.4119, "step": 186 }, { "epoch": 0.12647954007439974, "grad_norm": 0.07865872979164124, "learning_rate": 5.7224452478461064e-05, "loss": 1.4008, "step": 187 }, { "epoch": 0.1271559012512682, "grad_norm": 0.0770040899515152, "learning_rate": 5.682568059703659e-05, "loss": 1.4389, "step": 188 }, { "epoch": 0.1278322624281366, "grad_norm": 0.07645025849342346, "learning_rate": 5.642646580697973e-05, "loss": 1.3273, "step": 189 }, { "epoch": 0.12850862360500506, "grad_norm": 0.08479326963424683, "learning_rate": 5.602683401276615e-05, "loss": 1.4879, "step": 190 }, { "epoch": 0.1291849847818735, "grad_norm": 0.08738847821950912, "learning_rate": 5.562681114593028e-05, "loss": 1.4578, "step": 191 }, { "epoch": 0.12986134595874196, "grad_norm": 0.08168379962444305, "learning_rate": 5.522642316338268e-05, "loss": 1.375, "step": 192 }, { "epoch": 0.13053770713561041, "grad_norm": 0.08465580642223358, "learning_rate": 5.482569604572576e-05, "loss": 1.445, "step": 193 }, { "epoch": 0.13121406831247887, "grad_norm": 0.08332633227109909, "learning_rate": 5.442465579556793e-05, "loss": 1.3447, "step": 194 }, { "epoch": 0.13189042948934732, "grad_norm": 0.08931078761816025, "learning_rate": 5.402332843583631e-05, "loss": 1.4188, "step": 195 }, { "epoch": 0.13256679066621577, "grad_norm": 0.08860035240650177, "learning_rate": 5.3621740008088126e-05, "loss": 1.4052, "step": 196 }, { "epoch": 0.13324315184308422, "grad_norm": 0.09431397169828415, "learning_rate": 5.321991657082097e-05, "loss": 1.3552, "step": 197 }, { "epoch": 0.13391951301995267, "grad_norm": 0.09754069894552231, "learning_rate": 5.281788419778187e-05, "loss": 1.3225, "step": 198 }, { "epoch": 0.1345958741968211, "grad_norm": 0.10789535194635391, "learning_rate": 5.2415668976275355e-05, "loss": 1.3024, "step": 199 }, { "epoch": 0.13527223537368954, "grad_norm": 0.13092434406280518, "learning_rate": 5.201329700547076e-05, "loss": 1.3502, "step": 200 }, { "epoch": 0.13527223537368954, "eval_loss": 1.4045569896697998, "eval_runtime": 58.6172, "eval_samples_per_second": 42.479, "eval_steps_per_second": 10.628, "step": 200 }, { "epoch": 0.135948596550558, "grad_norm": 0.07242485135793686, "learning_rate": 5.161079439470866e-05, "loss": 1.2782, "step": 201 }, { "epoch": 0.13662495772742644, "grad_norm": 0.07775568962097168, "learning_rate": 5.1208187261806615e-05, "loss": 1.3799, "step": 202 }, { "epoch": 0.1373013189042949, "grad_norm": 0.07579399645328522, "learning_rate": 5.080550173136457e-05, "loss": 1.3196, "step": 203 }, { "epoch": 0.13797768008116335, "grad_norm": 0.07350926846265793, "learning_rate": 5.0402763933069496e-05, "loss": 1.3123, "step": 204 }, { "epoch": 0.1386540412580318, "grad_norm": 0.07234349101781845, "learning_rate": 5e-05, "loss": 1.3502, "step": 205 }, { "epoch": 0.13933040243490025, "grad_norm": 0.07092832773923874, "learning_rate": 4.9597236066930516e-05, "loss": 1.3384, "step": 206 }, { "epoch": 0.1400067636117687, "grad_norm": 0.06819634884595871, "learning_rate": 4.919449826863544e-05, "loss": 1.3531, "step": 207 }, { "epoch": 0.14068312478863712, "grad_norm": 0.06536248326301575, "learning_rate": 4.87918127381934e-05, "loss": 1.218, "step": 208 }, { "epoch": 0.14135948596550557, "grad_norm": 0.07125400751829147, "learning_rate": 4.8389205605291365e-05, "loss": 1.3146, "step": 209 }, { "epoch": 0.14203584714237402, "grad_norm": 0.06933542340993881, "learning_rate": 4.798670299452926e-05, "loss": 1.3522, "step": 210 }, { "epoch": 0.14271220831924247, "grad_norm": 0.06874727457761765, "learning_rate": 4.758433102372466e-05, "loss": 1.3805, "step": 211 }, { "epoch": 0.14338856949611092, "grad_norm": 0.07114125788211823, "learning_rate": 4.7182115802218126e-05, "loss": 1.3567, "step": 212 }, { "epoch": 0.14406493067297937, "grad_norm": 0.07075918465852737, "learning_rate": 4.678008342917903e-05, "loss": 1.4714, "step": 213 }, { "epoch": 0.14474129184984783, "grad_norm": 0.07062017172574997, "learning_rate": 4.6378259991911886e-05, "loss": 1.3805, "step": 214 }, { "epoch": 0.14541765302671628, "grad_norm": 0.0695713609457016, "learning_rate": 4.597667156416371e-05, "loss": 1.4541, "step": 215 }, { "epoch": 0.14609401420358473, "grad_norm": 0.06924286484718323, "learning_rate": 4.5575344204432084e-05, "loss": 1.4296, "step": 216 }, { "epoch": 0.14677037538045315, "grad_norm": 0.07131223380565643, "learning_rate": 4.5174303954274244e-05, "loss": 1.5286, "step": 217 }, { "epoch": 0.1474467365573216, "grad_norm": 0.06701776385307312, "learning_rate": 4.477357683661734e-05, "loss": 1.3561, "step": 218 }, { "epoch": 0.14812309773419005, "grad_norm": 0.07408411055803299, "learning_rate": 4.437318885406973e-05, "loss": 1.4165, "step": 219 }, { "epoch": 0.1487994589110585, "grad_norm": 0.07193459570407867, "learning_rate": 4.397316598723385e-05, "loss": 1.3685, "step": 220 }, { "epoch": 0.14947582008792695, "grad_norm": 0.07403208315372467, "learning_rate": 4.3573534193020274e-05, "loss": 1.4735, "step": 221 }, { "epoch": 0.1501521812647954, "grad_norm": 0.07124771922826767, "learning_rate": 4.317431940296343e-05, "loss": 1.3149, "step": 222 }, { "epoch": 0.15082854244166385, "grad_norm": 0.07504601776599884, "learning_rate": 4.277554752153895e-05, "loss": 1.4289, "step": 223 }, { "epoch": 0.1515049036185323, "grad_norm": 0.07184187322854996, "learning_rate": 4.237724442448273e-05, "loss": 1.39, "step": 224 }, { "epoch": 0.15218126479540076, "grad_norm": 0.07542978227138519, "learning_rate": 4.197943595711198e-05, "loss": 1.4706, "step": 225 }, { "epoch": 0.15285762597226918, "grad_norm": 0.0755477175116539, "learning_rate": 4.1582147932648074e-05, "loss": 1.4443, "step": 226 }, { "epoch": 0.15353398714913763, "grad_norm": 0.07688594609498978, "learning_rate": 4.118540613054156e-05, "loss": 1.418, "step": 227 }, { "epoch": 0.15421034832600608, "grad_norm": 0.07337923347949982, "learning_rate": 4.078923629479943e-05, "loss": 1.4865, "step": 228 }, { "epoch": 0.15488670950287453, "grad_norm": 0.08003071695566177, "learning_rate": 4.039366413231458e-05, "loss": 1.4069, "step": 229 }, { "epoch": 0.15556307067974298, "grad_norm": 0.07447373121976852, "learning_rate": 3.9998715311197785e-05, "loss": 1.3647, "step": 230 }, { "epoch": 0.15623943185661143, "grad_norm": 0.082440085709095, "learning_rate": 3.960441545911204e-05, "loss": 1.4943, "step": 231 }, { "epoch": 0.15691579303347988, "grad_norm": 0.07682335376739502, "learning_rate": 3.92107901616097e-05, "loss": 1.4059, "step": 232 }, { "epoch": 0.15759215421034833, "grad_norm": 0.07876202464103699, "learning_rate": 3.8817864960472236e-05, "loss": 1.4894, "step": 233 }, { "epoch": 0.15826851538721678, "grad_norm": 0.08159554749727249, "learning_rate": 3.842566535205286e-05, "loss": 1.4321, "step": 234 }, { "epoch": 0.1589448765640852, "grad_norm": 0.0825141966342926, "learning_rate": 3.803421678562213e-05, "loss": 1.4251, "step": 235 }, { "epoch": 0.15962123774095366, "grad_norm": 0.07901477813720703, "learning_rate": 3.764354466171652e-05, "loss": 1.3717, "step": 236 }, { "epoch": 0.1602975989178221, "grad_norm": 0.07721833884716034, "learning_rate": 3.725367433049033e-05, "loss": 1.4613, "step": 237 }, { "epoch": 0.16097396009469056, "grad_norm": 0.0815616324543953, "learning_rate": 3.6864631090070655e-05, "loss": 1.3866, "step": 238 }, { "epoch": 0.161650321271559, "grad_norm": 0.08109608292579651, "learning_rate": 3.6476440184915815e-05, "loss": 1.3556, "step": 239 }, { "epoch": 0.16232668244842746, "grad_norm": 0.08702927082777023, "learning_rate": 3.608912680417737e-05, "loss": 1.3502, "step": 240 }, { "epoch": 0.1630030436252959, "grad_norm": 0.08374108374118805, "learning_rate": 3.570271608006555e-05, "loss": 1.3121, "step": 241 }, { "epoch": 0.16367940480216436, "grad_norm": 0.08754076808691025, "learning_rate": 3.531723308621847e-05, "loss": 1.4123, "step": 242 }, { "epoch": 0.16435576597903281, "grad_norm": 0.08081318438053131, "learning_rate": 3.493270283607522e-05, "loss": 1.2896, "step": 243 }, { "epoch": 0.16503212715590126, "grad_norm": 0.0907411053776741, "learning_rate": 3.4549150281252636e-05, "loss": 1.3749, "step": 244 }, { "epoch": 0.1657084883327697, "grad_norm": 0.08883036673069, "learning_rate": 3.4166600309926387e-05, "loss": 1.3238, "step": 245 }, { "epoch": 0.16638484950963814, "grad_norm": 0.101780466735363, "learning_rate": 3.3785077745215873e-05, "loss": 1.4447, "step": 246 }, { "epoch": 0.1670612106865066, "grad_norm": 0.09471210092306137, "learning_rate": 3.340460734357359e-05, "loss": 1.3208, "step": 247 }, { "epoch": 0.16773757186337504, "grad_norm": 0.1030389666557312, "learning_rate": 3.3025213793178646e-05, "loss": 1.3621, "step": 248 }, { "epoch": 0.1684139330402435, "grad_norm": 0.11591000854969025, "learning_rate": 3.264692171233485e-05, "loss": 1.3342, "step": 249 }, { "epoch": 0.16909029421711194, "grad_norm": 0.17307457327842712, "learning_rate": 3.226975564787322e-05, "loss": 1.4818, "step": 250 }, { "epoch": 0.16909029421711194, "eval_loss": 1.3978846073150635, "eval_runtime": 58.5594, "eval_samples_per_second": 42.521, "eval_steps_per_second": 10.639, "step": 250 }, { "epoch": 0.1697666553939804, "grad_norm": 0.07893766462802887, "learning_rate": 3.189374007355917e-05, "loss": 1.3761, "step": 251 }, { "epoch": 0.17044301657084884, "grad_norm": 0.07456450909376144, "learning_rate": 3.151889938850445e-05, "loss": 1.3348, "step": 252 }, { "epoch": 0.1711193777477173, "grad_norm": 0.07382918149232864, "learning_rate": 3.114525791558398e-05, "loss": 1.3499, "step": 253 }, { "epoch": 0.17179573892458572, "grad_norm": 0.07121651619672775, "learning_rate": 3.0772839899857464e-05, "loss": 1.3402, "step": 254 }, { "epoch": 0.17247210010145417, "grad_norm": 0.07278232276439667, "learning_rate": 3.0401669506996256e-05, "loss": 1.3145, "step": 255 }, { "epoch": 0.17314846127832262, "grad_norm": 0.07363329082727432, "learning_rate": 3.003177082171523e-05, "loss": 1.3581, "step": 256 }, { "epoch": 0.17382482245519107, "grad_norm": 0.06870734691619873, "learning_rate": 2.9663167846209998e-05, "loss": 1.3378, "step": 257 }, { "epoch": 0.17450118363205952, "grad_norm": 0.06792139261960983, "learning_rate": 2.9295884498599414e-05, "loss": 1.3419, "step": 258 }, { "epoch": 0.17517754480892797, "grad_norm": 0.0735265463590622, "learning_rate": 2.8929944611373554e-05, "loss": 1.3432, "step": 259 }, { "epoch": 0.17585390598579642, "grad_norm": 0.07180746644735336, "learning_rate": 2.8565371929847284e-05, "loss": 1.3547, "step": 260 }, { "epoch": 0.17653026716266487, "grad_norm": 0.07569875568151474, "learning_rate": 2.8202190110619493e-05, "loss": 1.4529, "step": 261 }, { "epoch": 0.17720662833953332, "grad_norm": 0.07017538696527481, "learning_rate": 2.784042272003794e-05, "loss": 1.3556, "step": 262 }, { "epoch": 0.17788298951640175, "grad_norm": 0.07113169133663177, "learning_rate": 2.7480093232670158e-05, "loss": 1.4059, "step": 263 }, { "epoch": 0.1785593506932702, "grad_norm": 0.06814723461866379, "learning_rate": 2.712122502978024e-05, "loss": 1.3724, "step": 264 }, { "epoch": 0.17923571187013865, "grad_norm": 0.06991983950138092, "learning_rate": 2.6763841397811573e-05, "loss": 1.3626, "step": 265 }, { "epoch": 0.1799120730470071, "grad_norm": 0.07174625992774963, "learning_rate": 2.64079655268759e-05, "loss": 1.4458, "step": 266 }, { "epoch": 0.18058843422387555, "grad_norm": 0.07207131385803223, "learning_rate": 2.605362050924848e-05, "loss": 1.4582, "step": 267 }, { "epoch": 0.181264795400744, "grad_norm": 0.07133448123931885, "learning_rate": 2.57008293378697e-05, "loss": 1.4014, "step": 268 }, { "epoch": 0.18194115657761245, "grad_norm": 0.07459162175655365, "learning_rate": 2.534961490485313e-05, "loss": 1.5518, "step": 269 }, { "epoch": 0.1826175177544809, "grad_norm": 0.07072808593511581, "learning_rate": 2.500000000000001e-05, "loss": 1.4785, "step": 270 }, { "epoch": 0.18329387893134935, "grad_norm": 0.07045449316501617, "learning_rate": 2.4652007309320498e-05, "loss": 1.3619, "step": 271 }, { "epoch": 0.18397024010821778, "grad_norm": 0.07227825373411179, "learning_rate": 2.430565941356157e-05, "loss": 1.4089, "step": 272 }, { "epoch": 0.18464660128508623, "grad_norm": 0.0711086317896843, "learning_rate": 2.3960978786741877e-05, "loss": 1.4513, "step": 273 }, { "epoch": 0.18532296246195468, "grad_norm": 0.07261015474796295, "learning_rate": 2.361798779469336e-05, "loss": 1.4833, "step": 274 }, { "epoch": 0.18599932363882313, "grad_norm": 0.07312192767858505, "learning_rate": 2.3276708693609943e-05, "loss": 1.4728, "step": 275 }, { "epoch": 0.18667568481569158, "grad_norm": 0.07378038763999939, "learning_rate": 2.2937163628603435e-05, "loss": 1.4263, "step": 276 }, { "epoch": 0.18735204599256003, "grad_norm": 0.0779234915971756, "learning_rate": 2.259937463226651e-05, "loss": 1.3974, "step": 277 }, { "epoch": 0.18802840716942848, "grad_norm": 0.07699842005968094, "learning_rate": 2.2263363623243054e-05, "loss": 1.4611, "step": 278 }, { "epoch": 0.18870476834629693, "grad_norm": 0.07660985738039017, "learning_rate": 2.192915240480596e-05, "loss": 1.3248, "step": 279 }, { "epoch": 0.18938112952316538, "grad_norm": 0.0773986205458641, "learning_rate": 2.1596762663442218e-05, "loss": 1.4168, "step": 280 }, { "epoch": 0.1900574907000338, "grad_norm": 0.08041651546955109, "learning_rate": 2.1266215967445824e-05, "loss": 1.4745, "step": 281 }, { "epoch": 0.19073385187690226, "grad_norm": 0.07643985748291016, "learning_rate": 2.0937533765518187e-05, "loss": 1.428, "step": 282 }, { "epoch": 0.1914102130537707, "grad_norm": 0.07961355894804001, "learning_rate": 2.061073738537635e-05, "loss": 1.4152, "step": 283 }, { "epoch": 0.19208657423063916, "grad_norm": 0.0767185166478157, "learning_rate": 2.0285848032369137e-05, "loss": 1.3493, "step": 284 }, { "epoch": 0.1927629354075076, "grad_norm": 0.08084074407815933, "learning_rate": 1.996288678810105e-05, "loss": 1.4049, "step": 285 }, { "epoch": 0.19343929658437606, "grad_norm": 0.07845637947320938, "learning_rate": 1.9641874609064443e-05, "loss": 1.4237, "step": 286 }, { "epoch": 0.1941156577612445, "grad_norm": 0.08574547618627548, "learning_rate": 1.932283232527956e-05, "loss": 1.429, "step": 287 }, { "epoch": 0.19479201893811296, "grad_norm": 0.08614347875118256, "learning_rate": 1.9005780638942982e-05, "loss": 1.4629, "step": 288 }, { "epoch": 0.1954683801149814, "grad_norm": 0.08351199328899384, "learning_rate": 1.8690740123084316e-05, "loss": 1.3625, "step": 289 }, { "epoch": 0.19614474129184986, "grad_norm": 0.08555345237255096, "learning_rate": 1.837773122023114e-05, "loss": 1.3805, "step": 290 }, { "epoch": 0.19682110246871828, "grad_norm": 0.09179560840129852, "learning_rate": 1.8066774241082612e-05, "loss": 1.4212, "step": 291 }, { "epoch": 0.19749746364558673, "grad_norm": 0.08746284991502762, "learning_rate": 1.7757889363191483e-05, "loss": 1.3866, "step": 292 }, { "epoch": 0.19817382482245519, "grad_norm": 0.09664510190486908, "learning_rate": 1.745109662965481e-05, "loss": 1.4013, "step": 293 }, { "epoch": 0.19885018599932364, "grad_norm": 0.09385596960783005, "learning_rate": 1.714641594781347e-05, "loss": 1.4426, "step": 294 }, { "epoch": 0.1995265471761921, "grad_norm": 0.09728514403104782, "learning_rate": 1.684386708796025e-05, "loss": 1.5043, "step": 295 }, { "epoch": 0.20020290835306054, "grad_norm": 0.09565767645835876, "learning_rate": 1.6543469682057106e-05, "loss": 1.4168, "step": 296 }, { "epoch": 0.200879269529929, "grad_norm": 0.09778063744306564, "learning_rate": 1.62452432224612e-05, "loss": 1.4201, "step": 297 }, { "epoch": 0.20155563070679744, "grad_norm": 0.11353736370801926, "learning_rate": 1.5949207060660138e-05, "loss": 1.4805, "step": 298 }, { "epoch": 0.2022319918836659, "grad_norm": 0.1255834847688675, "learning_rate": 1.5655380406016235e-05, "loss": 1.3583, "step": 299 }, { "epoch": 0.2029083530605343, "grad_norm": 0.1988285779953003, "learning_rate": 1.536378232452003e-05, "loss": 1.3486, "step": 300 }, { "epoch": 0.2029083530605343, "eval_loss": 1.394815444946289, "eval_runtime": 58.5752, "eval_samples_per_second": 42.509, "eval_steps_per_second": 10.636, "step": 300 }, { "epoch": 0.20358471423740276, "grad_norm": 0.06902935355901718, "learning_rate": 1.5074431737553157e-05, "loss": 1.2703, "step": 301 }, { "epoch": 0.20426107541427121, "grad_norm": 0.07079865038394928, "learning_rate": 1.4787347420660541e-05, "loss": 1.2929, "step": 302 }, { "epoch": 0.20493743659113967, "grad_norm": 0.07172349840402603, "learning_rate": 1.4502548002332088e-05, "loss": 1.3659, "step": 303 }, { "epoch": 0.20561379776800812, "grad_norm": 0.06566619873046875, "learning_rate": 1.422005196279395e-05, "loss": 1.2871, "step": 304 }, { "epoch": 0.20629015894487657, "grad_norm": 0.06977768242359161, "learning_rate": 1.3939877632809278e-05, "loss": 1.3399, "step": 305 }, { "epoch": 0.20696652012174502, "grad_norm": 0.0717938095331192, "learning_rate": 1.3662043192488849e-05, "loss": 1.4212, "step": 306 }, { "epoch": 0.20764288129861347, "grad_norm": 0.06987138837575912, "learning_rate": 1.338656667011134e-05, "loss": 1.3077, "step": 307 }, { "epoch": 0.20831924247548192, "grad_norm": 0.07141832262277603, "learning_rate": 1.3113465940953495e-05, "loss": 1.3018, "step": 308 }, { "epoch": 0.20899560365235034, "grad_norm": 0.06794386357069016, "learning_rate": 1.2842758726130283e-05, "loss": 1.2765, "step": 309 }, { "epoch": 0.2096719648292188, "grad_norm": 0.07226680219173431, "learning_rate": 1.257446259144494e-05, "loss": 1.2678, "step": 310 }, { "epoch": 0.21034832600608724, "grad_norm": 0.06923844665288925, "learning_rate": 1.2308594946249163e-05, "loss": 1.3101, "step": 311 }, { "epoch": 0.2110246871829557, "grad_norm": 0.07099943608045578, "learning_rate": 1.204517304231343e-05, "loss": 1.3425, "step": 312 }, { "epoch": 0.21170104835982415, "grad_norm": 0.07071360945701599, "learning_rate": 1.178421397270758e-05, "loss": 1.3551, "step": 313 }, { "epoch": 0.2123774095366926, "grad_norm": 0.07278096675872803, "learning_rate": 1.1525734670691701e-05, "loss": 1.3774, "step": 314 }, { "epoch": 0.21305377071356105, "grad_norm": 0.07339213043451309, "learning_rate": 1.1269751908617277e-05, "loss": 1.4096, "step": 315 }, { "epoch": 0.2137301318904295, "grad_norm": 0.07134059816598892, "learning_rate": 1.1016282296838887e-05, "loss": 1.39, "step": 316 }, { "epoch": 0.21440649306729795, "grad_norm": 0.0759441927075386, "learning_rate": 1.0765342282636416e-05, "loss": 1.5035, "step": 317 }, { "epoch": 0.21508285424416637, "grad_norm": 0.0740458145737648, "learning_rate": 1.0516948149147754e-05, "loss": 1.4417, "step": 318 }, { "epoch": 0.21575921542103482, "grad_norm": 0.07227642089128494, "learning_rate": 1.0271116014312293e-05, "loss": 1.3583, "step": 319 }, { "epoch": 0.21643557659790327, "grad_norm": 0.07315854728221893, "learning_rate": 1.0027861829824952e-05, "loss": 1.4062, "step": 320 }, { "epoch": 0.21711193777477172, "grad_norm": 0.07590021938085556, "learning_rate": 9.787201380101157e-06, "loss": 1.3649, "step": 321 }, { "epoch": 0.21778829895164017, "grad_norm": 0.07322739064693451, "learning_rate": 9.549150281252633e-06, "loss": 1.4691, "step": 322 }, { "epoch": 0.21846466012850863, "grad_norm": 0.07656604051589966, "learning_rate": 9.313723980074018e-06, "loss": 1.5084, "step": 323 }, { "epoch": 0.21914102130537708, "grad_norm": 0.07473529130220413, "learning_rate": 9.080937753040646e-06, "loss": 1.4019, "step": 324 }, { "epoch": 0.21981738248224553, "grad_norm": 0.07312378287315369, "learning_rate": 8.850806705317183e-06, "loss": 1.3843, "step": 325 }, { "epoch": 0.22049374365911398, "grad_norm": 0.0723285973072052, "learning_rate": 8.623345769777514e-06, "loss": 1.4158, "step": 326 }, { "epoch": 0.22117010483598243, "grad_norm": 0.07800653576850891, "learning_rate": 8.398569706035792e-06, "loss": 1.4794, "step": 327 }, { "epoch": 0.22184646601285085, "grad_norm": 0.0782305896282196, "learning_rate": 8.176493099488663e-06, "loss": 1.368, "step": 328 }, { "epoch": 0.2225228271897193, "grad_norm": 0.07785584032535553, "learning_rate": 7.957130360368898e-06, "loss": 1.4589, "step": 329 }, { "epoch": 0.22319918836658775, "grad_norm": 0.07849667221307755, "learning_rate": 7.740495722810271e-06, "loss": 1.4204, "step": 330 }, { "epoch": 0.2238755495434562, "grad_norm": 0.07931456714868546, "learning_rate": 7.526603243923957e-06, "loss": 1.4728, "step": 331 }, { "epoch": 0.22455191072032465, "grad_norm": 0.07695943862199783, "learning_rate": 7.315466802886401e-06, "loss": 1.3767, "step": 332 }, { "epoch": 0.2252282718971931, "grad_norm": 0.07418181747198105, "learning_rate": 7.107100100038671e-06, "loss": 1.3109, "step": 333 }, { "epoch": 0.22590463307406156, "grad_norm": 0.08536362648010254, "learning_rate": 6.901516655997536e-06, "loss": 1.4548, "step": 334 }, { "epoch": 0.22658099425093, "grad_norm": 0.0794777050614357, "learning_rate": 6.698729810778065e-06, "loss": 1.4105, "step": 335 }, { "epoch": 0.22725735542779846, "grad_norm": 0.08513446897268295, "learning_rate": 6.498752722928042e-06, "loss": 1.4203, "step": 336 }, { "epoch": 0.22793371660466688, "grad_norm": 0.0815361961722374, "learning_rate": 6.301598368674105e-06, "loss": 1.4362, "step": 337 }, { "epoch": 0.22861007778153533, "grad_norm": 0.0861455649137497, "learning_rate": 6.107279541079769e-06, "loss": 1.4163, "step": 338 }, { "epoch": 0.22928643895840378, "grad_norm": 0.08055116981267929, "learning_rate": 5.915808849215304e-06, "loss": 1.3815, "step": 339 }, { "epoch": 0.22996280013527223, "grad_norm": 0.08424156159162521, "learning_rate": 5.727198717339511e-06, "loss": 1.3917, "step": 340 }, { "epoch": 0.23063916131214068, "grad_norm": 0.0870758518576622, "learning_rate": 5.54146138409355e-06, "loss": 1.4611, "step": 341 }, { "epoch": 0.23131552248900913, "grad_norm": 0.09209448844194412, "learning_rate": 5.358608901706802e-06, "loss": 1.4124, "step": 342 }, { "epoch": 0.23199188366587759, "grad_norm": 0.09123307466506958, "learning_rate": 5.178653135214812e-06, "loss": 1.3352, "step": 343 }, { "epoch": 0.23266824484274604, "grad_norm": 0.09511040896177292, "learning_rate": 5.001605761689398e-06, "loss": 1.4073, "step": 344 }, { "epoch": 0.2333446060196145, "grad_norm": 0.0982203483581543, "learning_rate": 4.827478269480895e-06, "loss": 1.4191, "step": 345 }, { "epoch": 0.2340209671964829, "grad_norm": 0.09550726413726807, "learning_rate": 4.65628195747273e-06, "loss": 1.4069, "step": 346 }, { "epoch": 0.23469732837335136, "grad_norm": 0.10381521284580231, "learning_rate": 4.488027934348271e-06, "loss": 1.3493, "step": 347 }, { "epoch": 0.2353736895502198, "grad_norm": 0.10688237845897675, "learning_rate": 4.322727117869951e-06, "loss": 1.4009, "step": 348 }, { "epoch": 0.23605005072708826, "grad_norm": 0.1287006139755249, "learning_rate": 4.16039023417088e-06, "loss": 1.3225, "step": 349 }, { "epoch": 0.2367264119039567, "grad_norm": 0.16477414965629578, "learning_rate": 4.001027817058789e-06, "loss": 1.1971, "step": 350 }, { "epoch": 0.2367264119039567, "eval_loss": 1.392931342124939, "eval_runtime": 58.606, "eval_samples_per_second": 42.487, "eval_steps_per_second": 10.63, "step": 350 }, { "epoch": 0.23740277308082516, "grad_norm": 0.07476108521223068, "learning_rate": 3.844650207332562e-06, "loss": 1.2627, "step": 351 }, { "epoch": 0.23807913425769361, "grad_norm": 0.07067526876926422, "learning_rate": 3.691267552111183e-06, "loss": 1.4219, "step": 352 }, { "epoch": 0.23875549543456207, "grad_norm": 0.06555365771055222, "learning_rate": 3.54088980417534e-06, "loss": 1.2736, "step": 353 }, { "epoch": 0.23943185661143052, "grad_norm": 0.07065010815858841, "learning_rate": 3.393526721321616e-06, "loss": 1.3441, "step": 354 }, { "epoch": 0.24010821778829894, "grad_norm": 0.066973976790905, "learning_rate": 3.249187865729264e-06, "loss": 1.3049, "step": 355 }, { "epoch": 0.2407845789651674, "grad_norm": 0.06522707641124725, "learning_rate": 3.1078826033397843e-06, "loss": 1.3121, "step": 356 }, { "epoch": 0.24146094014203584, "grad_norm": 0.06993697583675385, "learning_rate": 2.9696201032491434e-06, "loss": 1.3958, "step": 357 }, { "epoch": 0.2421373013189043, "grad_norm": 0.06835866719484329, "learning_rate": 2.8344093371128424e-06, "loss": 1.2915, "step": 358 }, { "epoch": 0.24281366249577274, "grad_norm": 0.06519079953432083, "learning_rate": 2.70225907856374e-06, "loss": 1.296, "step": 359 }, { "epoch": 0.2434900236726412, "grad_norm": 0.06798028200864792, "learning_rate": 2.573177902642726e-06, "loss": 1.3299, "step": 360 }, { "epoch": 0.24416638484950964, "grad_norm": 0.06898199021816254, "learning_rate": 2.4471741852423237e-06, "loss": 1.3754, "step": 361 }, { "epoch": 0.2448427460263781, "grad_norm": 0.07092057168483734, "learning_rate": 2.324256102563188e-06, "loss": 1.3248, "step": 362 }, { "epoch": 0.24551910720324654, "grad_norm": 0.07142449170351028, "learning_rate": 2.204431630583548e-06, "loss": 1.4185, "step": 363 }, { "epoch": 0.24619546838011497, "grad_norm": 0.06955256313085556, "learning_rate": 2.087708544541689e-06, "loss": 1.3, "step": 364 }, { "epoch": 0.24687182955698342, "grad_norm": 0.07163543999195099, "learning_rate": 1.974094418431388e-06, "loss": 1.4182, "step": 365 }, { "epoch": 0.24754819073385187, "grad_norm": 0.06957574933767319, "learning_rate": 1.8635966245104664e-06, "loss": 1.3436, "step": 366 }, { "epoch": 0.24822455191072032, "grad_norm": 0.07044114917516708, "learning_rate": 1.7562223328224325e-06, "loss": 1.3497, "step": 367 }, { "epoch": 0.24890091308758877, "grad_norm": 0.07231567054986954, "learning_rate": 1.6519785107311891e-06, "loss": 1.4305, "step": 368 }, { "epoch": 0.24957727426445722, "grad_norm": 0.071175716817379, "learning_rate": 1.5508719224689717e-06, "loss": 1.3585, "step": 369 }, { "epoch": 0.2502536354413257, "grad_norm": 0.07513930648565292, "learning_rate": 1.4529091286973995e-06, "loss": 1.4127, "step": 370 }, { "epoch": 0.2509299966181941, "grad_norm": 0.07834319770336151, "learning_rate": 1.358096486081778e-06, "loss": 1.4904, "step": 371 }, { "epoch": 0.2516063577950626, "grad_norm": 0.0735422819852829, "learning_rate": 1.2664401468786114e-06, "loss": 1.4379, "step": 372 }, { "epoch": 0.252282718971931, "grad_norm": 0.07638537883758545, "learning_rate": 1.1779460585363944e-06, "loss": 1.4957, "step": 373 }, { "epoch": 0.2529590801487995, "grad_norm": 0.07571271061897278, "learning_rate": 1.0926199633097157e-06, "loss": 1.4089, "step": 374 }, { "epoch": 0.2536354413256679, "grad_norm": 0.07592733204364777, "learning_rate": 1.0104673978866164e-06, "loss": 1.3655, "step": 375 }, { "epoch": 0.2543118025025364, "grad_norm": 0.07473273575305939, "learning_rate": 9.314936930293283e-07, "loss": 1.3653, "step": 376 }, { "epoch": 0.2549881636794048, "grad_norm": 0.07662557065486908, "learning_rate": 8.557039732283944e-07, "loss": 1.3438, "step": 377 }, { "epoch": 0.2556645248562732, "grad_norm": 0.07830655574798584, "learning_rate": 7.83103156370113e-07, "loss": 1.4263, "step": 378 }, { "epoch": 0.2563408860331417, "grad_norm": 0.07488763332366943, "learning_rate": 7.136959534174592e-07, "loss": 1.3887, "step": 379 }, { "epoch": 0.2570172472100101, "grad_norm": 0.07890254259109497, "learning_rate": 6.474868681043578e-07, "loss": 1.4591, "step": 380 }, { "epoch": 0.2576936083868786, "grad_norm": 0.08007548749446869, "learning_rate": 5.844801966434832e-07, "loss": 1.5055, "step": 381 }, { "epoch": 0.258369969563747, "grad_norm": 0.07806207239627838, "learning_rate": 5.246800274474439e-07, "loss": 1.421, "step": 382 }, { "epoch": 0.2590463307406155, "grad_norm": 0.0787520781159401, "learning_rate": 4.680902408635335e-07, "loss": 1.4013, "step": 383 }, { "epoch": 0.2597226919174839, "grad_norm": 0.07986165583133698, "learning_rate": 4.1471450892189846e-07, "loss": 1.433, "step": 384 }, { "epoch": 0.2603990530943524, "grad_norm": 0.0808686912059784, "learning_rate": 3.6455629509730136e-07, "loss": 1.4251, "step": 385 }, { "epoch": 0.26107541427122083, "grad_norm": 0.08336784690618515, "learning_rate": 3.1761885408435054e-07, "loss": 1.4741, "step": 386 }, { "epoch": 0.26175177544808925, "grad_norm": 0.08735602349042892, "learning_rate": 2.7390523158633554e-07, "loss": 1.4814, "step": 387 }, { "epoch": 0.26242813662495773, "grad_norm": 0.07778208702802658, "learning_rate": 2.334182641175686e-07, "loss": 1.3246, "step": 388 }, { "epoch": 0.26310449780182615, "grad_norm": 0.08358029276132584, "learning_rate": 1.9616057881935436e-07, "loss": 1.5294, "step": 389 }, { "epoch": 0.26378085897869463, "grad_norm": 0.08617830276489258, "learning_rate": 1.6213459328950352e-07, "loss": 1.4288, "step": 390 }, { "epoch": 0.26445722015556306, "grad_norm": 0.08489940315485, "learning_rate": 1.3134251542544774e-07, "loss": 1.387, "step": 391 }, { "epoch": 0.26513358133243153, "grad_norm": 0.0889124944806099, "learning_rate": 1.0378634328099269e-07, "loss": 1.4393, "step": 392 }, { "epoch": 0.26580994250929996, "grad_norm": 0.09041323512792587, "learning_rate": 7.946786493666647e-08, "loss": 1.4255, "step": 393 }, { "epoch": 0.26648630368616844, "grad_norm": 0.09535028040409088, "learning_rate": 5.838865838366792e-08, "loss": 1.3967, "step": 394 }, { "epoch": 0.26716266486303686, "grad_norm": 0.09417252242565155, "learning_rate": 4.055009142152067e-08, "loss": 1.3455, "step": 395 }, { "epoch": 0.26783902603990534, "grad_norm": 0.09836997091770172, "learning_rate": 2.595332156925534e-08, "loss": 1.4622, "step": 396 }, { "epoch": 0.26851538721677376, "grad_norm": 0.10254459083080292, "learning_rate": 1.4599295990352924e-08, "loss": 1.418, "step": 397 }, { "epoch": 0.2691917483936422, "grad_norm": 0.11125387996435165, "learning_rate": 6.488751431266149e-09, "loss": 1.4415, "step": 398 }, { "epoch": 0.26986810957051066, "grad_norm": 0.1215406209230423, "learning_rate": 1.622214173602199e-09, "loss": 1.3851, "step": 399 }, { "epoch": 0.2705444707473791, "grad_norm": 0.15271972119808197, "learning_rate": 0.0, "loss": 1.3691, "step": 400 }, { "epoch": 0.2705444707473791, "eval_loss": 1.3924267292022705, "eval_runtime": 58.679, "eval_samples_per_second": 42.434, "eval_steps_per_second": 10.617, "step": 400 } ], "logging_steps": 1, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.285041640112128e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }