{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6066481994459834, "eval_steps": 37, "global_step": 219, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002770083102493075, "grad_norm": 0.9076941366015471, "learning_rate": 2.5e-06, "loss": 1.1945, "step": 1 }, { "epoch": 0.002770083102493075, "eval_loss": 1.0296825170516968, "eval_runtime": 104.9654, "eval_samples_per_second": 0.953, "eval_steps_per_second": 0.238, "step": 1 }, { "epoch": 0.00554016620498615, "grad_norm": 1.0344272252302953, "learning_rate": 5e-06, "loss": 1.3385, "step": 2 }, { "epoch": 0.008310249307479225, "grad_norm": 1.0977608192145218, "learning_rate": 7.5e-06, "loss": 1.3767, "step": 3 }, { "epoch": 0.0110803324099723, "grad_norm": 0.8895382737806115, "learning_rate": 1e-05, "loss": 1.1881, "step": 4 }, { "epoch": 0.013850415512465374, "grad_norm": 0.958710062908987, "learning_rate": 1.25e-05, "loss": 1.2936, "step": 5 }, { "epoch": 0.01662049861495845, "grad_norm": 0.8447951280142822, "learning_rate": 1.5e-05, "loss": 1.1547, "step": 6 }, { "epoch": 0.019390581717451522, "grad_norm": 0.8794765482094637, "learning_rate": 1.75e-05, "loss": 1.3444, "step": 7 }, { "epoch": 0.0221606648199446, "grad_norm": 0.8437125997364887, "learning_rate": 2e-05, "loss": 1.2765, "step": 8 }, { "epoch": 0.024930747922437674, "grad_norm": 0.7276844901691898, "learning_rate": 2.25e-05, "loss": 1.2031, "step": 9 }, { "epoch": 0.027700831024930747, "grad_norm": 0.577477596526854, "learning_rate": 2.5e-05, "loss": 1.1725, "step": 10 }, { "epoch": 0.030470914127423823, "grad_norm": 0.4618405348091677, "learning_rate": 2.7500000000000004e-05, "loss": 1.1008, "step": 11 }, { "epoch": 0.0332409972299169, "grad_norm": 0.3671423964001518, "learning_rate": 3e-05, "loss": 1.0963, "step": 12 }, { "epoch": 0.036011080332409975, "grad_norm": 0.4139651359524478, "learning_rate": 3.2500000000000004e-05, "loss": 1.2831, "step": 13 }, { "epoch": 0.038781163434903045, "grad_norm": 0.4364185358505406, "learning_rate": 3.5e-05, "loss": 1.0524, "step": 14 }, { "epoch": 0.04155124653739612, "grad_norm": 0.5116601253143136, "learning_rate": 3.7500000000000003e-05, "loss": 1.1499, "step": 15 }, { "epoch": 0.0443213296398892, "grad_norm": 0.5236113301724862, "learning_rate": 4e-05, "loss": 1.1466, "step": 16 }, { "epoch": 0.04709141274238227, "grad_norm": 0.4606756443741647, "learning_rate": 4.25e-05, "loss": 1.1385, "step": 17 }, { "epoch": 0.04986149584487535, "grad_norm": 0.3991457405979455, "learning_rate": 4.5e-05, "loss": 1.108, "step": 18 }, { "epoch": 0.05263157894736842, "grad_norm": 1.8553548238184947, "learning_rate": 4.75e-05, "loss": 0.9794, "step": 19 }, { "epoch": 0.055401662049861494, "grad_norm": 0.31912881369455715, "learning_rate": 5e-05, "loss": 1.1583, "step": 20 }, { "epoch": 0.05817174515235457, "grad_norm": 0.3315962832679952, "learning_rate": 4.999904513837887e-05, "loss": 1.1927, "step": 21 }, { "epoch": 0.060941828254847646, "grad_norm": 0.3081653717373631, "learning_rate": 4.999618063456087e-05, "loss": 1.1666, "step": 22 }, { "epoch": 0.06371191135734072, "grad_norm": 0.2386753107269311, "learning_rate": 4.999140673167533e-05, "loss": 1.1517, "step": 23 }, { "epoch": 0.0664819944598338, "grad_norm": 0.20460753812847132, "learning_rate": 4.998472383491481e-05, "loss": 1.1475, "step": 24 }, { "epoch": 0.06925207756232687, "grad_norm": 0.19426749294122878, "learning_rate": 4.997613251150082e-05, "loss": 1.0427, "step": 25 }, { "epoch": 0.07202216066481995, "grad_norm": 0.16821147486362656, "learning_rate": 4.996563349063557e-05, "loss": 1.1392, "step": 26 }, { "epoch": 0.07479224376731301, "grad_norm": 0.16476251779742826, "learning_rate": 4.995322766344013e-05, "loss": 1.0624, "step": 27 }, { "epoch": 0.07756232686980609, "grad_norm": 0.14974022119841934, "learning_rate": 4.993891608287879e-05, "loss": 1.0357, "step": 28 }, { "epoch": 0.08033240997229917, "grad_norm": 0.18664906507712956, "learning_rate": 4.992269996366969e-05, "loss": 1.0724, "step": 29 }, { "epoch": 0.08310249307479224, "grad_norm": 0.1850945977027583, "learning_rate": 4.990458068218171e-05, "loss": 1.1334, "step": 30 }, { "epoch": 0.08587257617728532, "grad_norm": 0.18423955915331258, "learning_rate": 4.9884559776317644e-05, "loss": 1.1707, "step": 31 }, { "epoch": 0.0886426592797784, "grad_norm": 0.17778624557198042, "learning_rate": 4.9862638945383674e-05, "loss": 1.0923, "step": 32 }, { "epoch": 0.09141274238227147, "grad_norm": 0.18753159302589986, "learning_rate": 4.983882004994517e-05, "loss": 1.0743, "step": 33 }, { "epoch": 0.09418282548476455, "grad_norm": 0.16951133250798278, "learning_rate": 4.98131051116687e-05, "loss": 1.0776, "step": 34 }, { "epoch": 0.09695290858725762, "grad_norm": 0.15767321847790128, "learning_rate": 4.9785496313150515e-05, "loss": 1.0987, "step": 35 }, { "epoch": 0.0997229916897507, "grad_norm": 0.162961453637477, "learning_rate": 4.975599599773124e-05, "loss": 1.2561, "step": 36 }, { "epoch": 0.10249307479224377, "grad_norm": 0.15327784219657248, "learning_rate": 4.972460666929699e-05, "loss": 1.1456, "step": 37 }, { "epoch": 0.10249307479224377, "eval_loss": 0.7954654693603516, "eval_runtime": 105.8629, "eval_samples_per_second": 0.945, "eval_steps_per_second": 0.236, "step": 37 }, { "epoch": 0.10526315789473684, "grad_norm": 0.17503484326795066, "learning_rate": 4.969133099206691e-05, "loss": 1.1647, "step": 38 }, { "epoch": 0.10803324099722991, "grad_norm": 0.1616243718822839, "learning_rate": 4.965617179036695e-05, "loss": 1.0552, "step": 39 }, { "epoch": 0.11080332409972299, "grad_norm": 0.14651727257725808, "learning_rate": 4.96191320483902e-05, "loss": 0.9286, "step": 40 }, { "epoch": 0.11357340720221606, "grad_norm": 0.16761163260684342, "learning_rate": 4.958021490994361e-05, "loss": 1.0554, "step": 41 }, { "epoch": 0.11634349030470914, "grad_norm": 0.18422943695103963, "learning_rate": 4.953942367818113e-05, "loss": 1.1106, "step": 42 }, { "epoch": 0.11911357340720222, "grad_norm": 0.1473247570248768, "learning_rate": 4.9496761815323344e-05, "loss": 1.2059, "step": 43 }, { "epoch": 0.12188365650969529, "grad_norm": 0.14787258143002216, "learning_rate": 4.9452232942363634e-05, "loss": 1.1916, "step": 44 }, { "epoch": 0.12465373961218837, "grad_norm": 0.13340848225076712, "learning_rate": 4.940584083876084e-05, "loss": 1.0421, "step": 45 }, { "epoch": 0.12742382271468145, "grad_norm": 0.15834348128625553, "learning_rate": 4.935758944211844e-05, "loss": 1.0782, "step": 46 }, { "epoch": 0.13019390581717452, "grad_norm": 0.14100327469205548, "learning_rate": 4.9307482847850374e-05, "loss": 1.03, "step": 47 }, { "epoch": 0.1329639889196676, "grad_norm": 0.18919634141556568, "learning_rate": 4.925552530883343e-05, "loss": 1.0422, "step": 48 }, { "epoch": 0.13573407202216067, "grad_norm": 0.15121560590293795, "learning_rate": 4.9201721235046274e-05, "loss": 0.9852, "step": 49 }, { "epoch": 0.13850415512465375, "grad_norm": 0.14628904242090288, "learning_rate": 4.914607519319512e-05, "loss": 1.1146, "step": 50 }, { "epoch": 0.14127423822714683, "grad_norm": 0.13794163183547553, "learning_rate": 4.90885919063262e-05, "loss": 1.0787, "step": 51 }, { "epoch": 0.1440443213296399, "grad_norm": 0.13302301409932443, "learning_rate": 4.902927625342476e-05, "loss": 1.0993, "step": 52 }, { "epoch": 0.14681440443213298, "grad_norm": 0.14124480977262469, "learning_rate": 4.8968133269001096e-05, "loss": 1.0137, "step": 53 }, { "epoch": 0.14958448753462603, "grad_norm": 0.14791504265910085, "learning_rate": 4.890516814266314e-05, "loss": 1.1663, "step": 54 }, { "epoch": 0.1523545706371191, "grad_norm": 0.13506987447877156, "learning_rate": 4.884038621867599e-05, "loss": 0.9941, "step": 55 }, { "epoch": 0.15512465373961218, "grad_norm": 0.14611198065865885, "learning_rate": 4.877379299550838e-05, "loss": 0.9958, "step": 56 }, { "epoch": 0.15789473684210525, "grad_norm": 0.16481726483113354, "learning_rate": 4.8705394125365884e-05, "loss": 1.1329, "step": 57 }, { "epoch": 0.16066481994459833, "grad_norm": 0.1498291481940627, "learning_rate": 4.8635195413711286e-05, "loss": 1.058, "step": 58 }, { "epoch": 0.1634349030470914, "grad_norm": 0.15057001184388058, "learning_rate": 4.8563202818771735e-05, "loss": 1.1062, "step": 59 }, { "epoch": 0.16620498614958448, "grad_norm": 0.12916512409429184, "learning_rate": 4.84894224510331e-05, "loss": 1.0769, "step": 60 }, { "epoch": 0.16897506925207756, "grad_norm": 0.13775185913237034, "learning_rate": 4.841386057272129e-05, "loss": 0.9624, "step": 61 }, { "epoch": 0.17174515235457063, "grad_norm": 0.1447222703266104, "learning_rate": 4.8336523597270765e-05, "loss": 0.9653, "step": 62 }, { "epoch": 0.1745152354570637, "grad_norm": 0.20538147828946965, "learning_rate": 4.825741808878017e-05, "loss": 0.9674, "step": 63 }, { "epoch": 0.1772853185595568, "grad_norm": 0.15455976822474854, "learning_rate": 4.8176550761455195e-05, "loss": 1.1488, "step": 64 }, { "epoch": 0.18005540166204986, "grad_norm": 0.14123106112442296, "learning_rate": 4.8093928479038704e-05, "loss": 1.0781, "step": 65 }, { "epoch": 0.18282548476454294, "grad_norm": 0.153605912426132, "learning_rate": 4.8009558254228185e-05, "loss": 1.0193, "step": 66 }, { "epoch": 0.18559556786703602, "grad_norm": 0.170305518682767, "learning_rate": 4.792344724808048e-05, "loss": 1.1249, "step": 67 }, { "epoch": 0.1883656509695291, "grad_norm": 0.15808161680481964, "learning_rate": 4.783560276940402e-05, "loss": 0.9227, "step": 68 }, { "epoch": 0.19113573407202217, "grad_norm": 0.13584733997983103, "learning_rate": 4.7746032274138494e-05, "loss": 1.0205, "step": 69 }, { "epoch": 0.19390581717451524, "grad_norm": 0.152754335402458, "learning_rate": 4.765474336472197e-05, "loss": 1.1152, "step": 70 }, { "epoch": 0.19667590027700832, "grad_norm": 0.1559364062645901, "learning_rate": 4.7561743789445644e-05, "loss": 1.004, "step": 71 }, { "epoch": 0.1994459833795014, "grad_norm": 0.1601485375057651, "learning_rate": 4.7467041441796214e-05, "loss": 1.0463, "step": 72 }, { "epoch": 0.20221606648199447, "grad_norm": 0.14468865075681345, "learning_rate": 4.737064435978586e-05, "loss": 0.9536, "step": 73 }, { "epoch": 0.20498614958448755, "grad_norm": 0.32356482070883336, "learning_rate": 4.727256072527006e-05, "loss": 1.0037, "step": 74 }, { "epoch": 0.20498614958448755, "eval_loss": 0.7582282423973083, "eval_runtime": 105.298, "eval_samples_per_second": 0.95, "eval_steps_per_second": 0.237, "step": 74 }, { "epoch": 0.2077562326869806, "grad_norm": 0.15635524181898305, "learning_rate": 4.71727988632531e-05, "loss": 1.0651, "step": 75 }, { "epoch": 0.21052631578947367, "grad_norm": 0.15219977997190004, "learning_rate": 4.70713672411815e-05, "loss": 1.0337, "step": 76 }, { "epoch": 0.21329639889196675, "grad_norm": 0.15563741812103743, "learning_rate": 4.6968274468225296e-05, "loss": 1.0671, "step": 77 }, { "epoch": 0.21606648199445982, "grad_norm": 0.15558142374206219, "learning_rate": 4.686352929454739e-05, "loss": 1.0429, "step": 78 }, { "epoch": 0.2188365650969529, "grad_norm": 0.21330774901341568, "learning_rate": 4.6757140610560765e-05, "loss": 1.1399, "step": 79 }, { "epoch": 0.22160664819944598, "grad_norm": 0.15688008638487674, "learning_rate": 4.664911744617398e-05, "loss": 0.9371, "step": 80 }, { "epoch": 0.22437673130193905, "grad_norm": 0.1763095699279512, "learning_rate": 4.653946897002472e-05, "loss": 1.1098, "step": 81 }, { "epoch": 0.22714681440443213, "grad_norm": 0.1678442035216426, "learning_rate": 4.6428204488701576e-05, "loss": 1.068, "step": 82 }, { "epoch": 0.2299168975069252, "grad_norm": 0.17911858466627326, "learning_rate": 4.631533344595416e-05, "loss": 1.0096, "step": 83 }, { "epoch": 0.23268698060941828, "grad_norm": 0.23891003796307586, "learning_rate": 4.6200865421891496e-05, "loss": 1.1138, "step": 84 }, { "epoch": 0.23545706371191136, "grad_norm": 0.18872774327058275, "learning_rate": 4.6084810132169e-05, "loss": 1.0792, "step": 85 }, { "epoch": 0.23822714681440443, "grad_norm": 0.1692018308326617, "learning_rate": 4.596717742716372e-05, "loss": 0.9956, "step": 86 }, { "epoch": 0.2409972299168975, "grad_norm": 0.18114487653466307, "learning_rate": 4.5847977291138366e-05, "loss": 1.0282, "step": 87 }, { "epoch": 0.24376731301939059, "grad_norm": 0.1874960678295754, "learning_rate": 4.5727219841393846e-05, "loss": 0.9019, "step": 88 }, { "epoch": 0.24653739612188366, "grad_norm": 0.15508335823487335, "learning_rate": 4.560491532741053e-05, "loss": 0.7839, "step": 89 }, { "epoch": 0.24930747922437674, "grad_norm": 0.17507231249024333, "learning_rate": 4.548107412997833e-05, "loss": 0.9396, "step": 90 }, { "epoch": 0.2520775623268698, "grad_norm": 0.18664452466447212, "learning_rate": 4.53557067603156e-05, "loss": 0.9874, "step": 91 }, { "epoch": 0.2548476454293629, "grad_norm": 0.19573327857302514, "learning_rate": 4.522882385917699e-05, "loss": 1.0442, "step": 92 }, { "epoch": 0.25761772853185594, "grad_norm": 0.20160586348422904, "learning_rate": 4.510043619595031e-05, "loss": 1.0648, "step": 93 }, { "epoch": 0.26038781163434904, "grad_norm": 0.18066141120205173, "learning_rate": 4.49705546677424e-05, "loss": 0.9018, "step": 94 }, { "epoch": 0.2631578947368421, "grad_norm": 0.21418534715858012, "learning_rate": 4.483919029845431e-05, "loss": 0.9509, "step": 95 }, { "epoch": 0.2659279778393352, "grad_norm": 0.17573567089277978, "learning_rate": 4.470635423784556e-05, "loss": 0.9513, "step": 96 }, { "epoch": 0.26869806094182824, "grad_norm": 0.17774724005874368, "learning_rate": 4.4572057760587796e-05, "loss": 0.8852, "step": 97 }, { "epoch": 0.27146814404432135, "grad_norm": 0.18965686235959792, "learning_rate": 4.4436312265307876e-05, "loss": 0.9826, "step": 98 }, { "epoch": 0.2742382271468144, "grad_norm": 0.1982451843435613, "learning_rate": 4.429912927362035e-05, "loss": 0.9473, "step": 99 }, { "epoch": 0.2770083102493075, "grad_norm": 0.19613124811118365, "learning_rate": 4.416052042914954e-05, "loss": 0.9648, "step": 100 }, { "epoch": 0.27977839335180055, "grad_norm": 0.17148361064681392, "learning_rate": 4.402049749654133e-05, "loss": 0.9262, "step": 101 }, { "epoch": 0.28254847645429365, "grad_norm": 0.18346018130558348, "learning_rate": 4.387907236046455e-05, "loss": 0.9295, "step": 102 }, { "epoch": 0.2853185595567867, "grad_norm": 0.2092097295306342, "learning_rate": 4.3736257024602274e-05, "loss": 0.9108, "step": 103 }, { "epoch": 0.2880886426592798, "grad_norm": 0.21514609527308398, "learning_rate": 4.359206361063302e-05, "loss": 0.9358, "step": 104 }, { "epoch": 0.29085872576177285, "grad_norm": 0.2109170892352583, "learning_rate": 4.344650435720184e-05, "loss": 0.9258, "step": 105 }, { "epoch": 0.29362880886426596, "grad_norm": 0.20927017505540593, "learning_rate": 4.3299591618881604e-05, "loss": 1.0129, "step": 106 }, { "epoch": 0.296398891966759, "grad_norm": 0.2044744893718055, "learning_rate": 4.3151337865124354e-05, "loss": 0.9501, "step": 107 }, { "epoch": 0.29916897506925205, "grad_norm": 0.23366539469114328, "learning_rate": 4.3001755679202946e-05, "loss": 1.0061, "step": 108 }, { "epoch": 0.30193905817174516, "grad_norm": 0.28332040666616826, "learning_rate": 4.2850857757143035e-05, "loss": 1.0622, "step": 109 }, { "epoch": 0.3047091412742382, "grad_norm": 0.22750850899455075, "learning_rate": 4.269865690664546e-05, "loss": 0.9932, "step": 110 }, { "epoch": 0.3074792243767313, "grad_norm": 0.2919596477328322, "learning_rate": 4.2545166045999175e-05, "loss": 0.9947, "step": 111 }, { "epoch": 0.3074792243767313, "eval_loss": 0.7285755276679993, "eval_runtime": 105.4908, "eval_samples_per_second": 0.948, "eval_steps_per_second": 0.237, "step": 111 }, { "epoch": 0.31024930747922436, "grad_norm": 0.22650943562204162, "learning_rate": 4.239039820298482e-05, "loss": 0.9006, "step": 112 }, { "epoch": 0.31301939058171746, "grad_norm": 0.31497879982021704, "learning_rate": 4.223436651376892e-05, "loss": 0.9833, "step": 113 }, { "epoch": 0.3157894736842105, "grad_norm": 0.22217961036727576, "learning_rate": 4.207708422178896e-05, "loss": 0.9329, "step": 114 }, { "epoch": 0.3185595567867036, "grad_norm": 0.29069944659837527, "learning_rate": 4.1918564676629316e-05, "loss": 1.005, "step": 115 }, { "epoch": 0.32132963988919666, "grad_norm": 0.25740135929728936, "learning_rate": 4.1758821332888205e-05, "loss": 0.9289, "step": 116 }, { "epoch": 0.32409972299168976, "grad_norm": 0.20735884986652237, "learning_rate": 4.159786774903569e-05, "loss": 0.7929, "step": 117 }, { "epoch": 0.3268698060941828, "grad_norm": 0.2734473995424548, "learning_rate": 4.143571758626288e-05, "loss": 0.9545, "step": 118 }, { "epoch": 0.3296398891966759, "grad_norm": 0.2819033324098223, "learning_rate": 4.127238460732242e-05, "loss": 0.9095, "step": 119 }, { "epoch": 0.33240997229916897, "grad_norm": 0.21720763820121466, "learning_rate": 4.110788267536036e-05, "loss": 0.8727, "step": 120 }, { "epoch": 0.33518005540166207, "grad_norm": 0.3603455449135715, "learning_rate": 4.0942225752739496e-05, "loss": 0.9487, "step": 121 }, { "epoch": 0.3379501385041551, "grad_norm": 0.26959993038866503, "learning_rate": 4.077542789985429e-05, "loss": 0.9707, "step": 122 }, { "epoch": 0.3407202216066482, "grad_norm": 0.2918507118523375, "learning_rate": 4.0607503273937464e-05, "loss": 0.9828, "step": 123 }, { "epoch": 0.34349030470914127, "grad_norm": 0.34593468652257114, "learning_rate": 4.0438466127858407e-05, "loss": 0.9309, "step": 124 }, { "epoch": 0.3462603878116344, "grad_norm": 0.2328074759683285, "learning_rate": 4.026833080891342e-05, "loss": 0.95, "step": 125 }, { "epoch": 0.3490304709141274, "grad_norm": 0.28764735207874204, "learning_rate": 4.0097111757607955e-05, "loss": 0.9819, "step": 126 }, { "epoch": 0.3518005540166205, "grad_norm": 0.4004054092669033, "learning_rate": 3.9924823506430974e-05, "loss": 0.9405, "step": 127 }, { "epoch": 0.3545706371191136, "grad_norm": 0.2645217010449881, "learning_rate": 3.9751480678621505e-05, "loss": 0.9705, "step": 128 }, { "epoch": 0.3573407202216066, "grad_norm": 0.2940771458015333, "learning_rate": 3.957709798692741e-05, "loss": 0.9053, "step": 129 }, { "epoch": 0.3601108033240997, "grad_norm": 0.33343898482348633, "learning_rate": 3.9401690232356654e-05, "loss": 0.9337, "step": 130 }, { "epoch": 0.3628808864265928, "grad_norm": 0.3818512479727835, "learning_rate": 3.922527230292107e-05, "loss": 0.9639, "step": 131 }, { "epoch": 0.3656509695290859, "grad_norm": 0.3278779554370676, "learning_rate": 3.904785917237264e-05, "loss": 1.0459, "step": 132 }, { "epoch": 0.3684210526315789, "grad_norm": 0.34049121712960123, "learning_rate": 3.8869465898932664e-05, "loss": 0.898, "step": 133 }, { "epoch": 0.37119113573407203, "grad_norm": 0.39663817614396785, "learning_rate": 3.8690107624013614e-05, "loss": 0.8741, "step": 134 }, { "epoch": 0.3739612188365651, "grad_norm": 0.3157982406853401, "learning_rate": 3.8509799570933985e-05, "loss": 1.0399, "step": 135 }, { "epoch": 0.3767313019390582, "grad_norm": 0.5076584455774723, "learning_rate": 3.832855704362621e-05, "loss": 0.9597, "step": 136 }, { "epoch": 0.37950138504155123, "grad_norm": 0.31376263297051804, "learning_rate": 3.814639542533771e-05, "loss": 0.9315, "step": 137 }, { "epoch": 0.38227146814404434, "grad_norm": 0.3400455584385456, "learning_rate": 3.796333017732519e-05, "loss": 0.8829, "step": 138 }, { "epoch": 0.3850415512465374, "grad_norm": 0.3541184588730341, "learning_rate": 3.777937683754242e-05, "loss": 0.8097, "step": 139 }, { "epoch": 0.3878116343490305, "grad_norm": 0.3333594677616915, "learning_rate": 3.75945510193213e-05, "loss": 0.9865, "step": 140 }, { "epoch": 0.39058171745152354, "grad_norm": 0.3650263368887469, "learning_rate": 3.740886841004678e-05, "loss": 0.9889, "step": 141 }, { "epoch": 0.39335180055401664, "grad_norm": 0.3586705478546664, "learning_rate": 3.7222344769825255e-05, "loss": 0.9119, "step": 142 }, { "epoch": 0.3961218836565097, "grad_norm": 0.2968031141643322, "learning_rate": 3.703499593014702e-05, "loss": 0.837, "step": 143 }, { "epoch": 0.3988919667590028, "grad_norm": 0.28911701684318286, "learning_rate": 3.6846837792542446e-05, "loss": 0.849, "step": 144 }, { "epoch": 0.40166204986149584, "grad_norm": 0.3428834579431544, "learning_rate": 3.6657886327232357e-05, "loss": 0.813, "step": 145 }, { "epoch": 0.40443213296398894, "grad_norm": 0.3390660834271031, "learning_rate": 3.6468157571772535e-05, "loss": 1.014, "step": 146 }, { "epoch": 0.407202216066482, "grad_norm": 0.29842989483876114, "learning_rate": 3.6277667629692475e-05, "loss": 0.9415, "step": 147 }, { "epoch": 0.4099722991689751, "grad_norm": 0.40347234769657353, "learning_rate": 3.608643266912863e-05, "loss": 0.8513, "step": 148 }, { "epoch": 0.4099722991689751, "eval_loss": 0.6967936754226685, "eval_runtime": 105.0327, "eval_samples_per_second": 0.952, "eval_steps_per_second": 0.238, "step": 148 }, { "epoch": 0.41274238227146814, "grad_norm": 0.293058789027836, "learning_rate": 3.5894468921452055e-05, "loss": 0.8557, "step": 149 }, { "epoch": 0.4155124653739612, "grad_norm": 0.5025689623712035, "learning_rate": 3.570179267989079e-05, "loss": 0.9919, "step": 150 }, { "epoch": 0.4182825484764543, "grad_norm": 0.34207048547694535, "learning_rate": 3.550842029814689e-05, "loss": 0.9175, "step": 151 }, { "epoch": 0.42105263157894735, "grad_norm": 0.3442929194276912, "learning_rate": 3.531436818900846e-05, "loss": 0.9258, "step": 152 }, { "epoch": 0.42382271468144045, "grad_norm": 0.3397465751123524, "learning_rate": 3.511965282295651e-05, "loss": 0.8444, "step": 153 }, { "epoch": 0.4265927977839335, "grad_norm": 0.4164754708763866, "learning_rate": 3.492429072676704e-05, "loss": 0.7808, "step": 154 }, { "epoch": 0.4293628808864266, "grad_norm": 0.3970020903838321, "learning_rate": 3.472829848210833e-05, "loss": 0.8346, "step": 155 }, { "epoch": 0.43213296398891965, "grad_norm": 0.35193376832006673, "learning_rate": 3.453169272413347e-05, "loss": 0.8341, "step": 156 }, { "epoch": 0.43490304709141275, "grad_norm": 0.3786418049677514, "learning_rate": 3.433449014006849e-05, "loss": 0.8282, "step": 157 }, { "epoch": 0.4376731301939058, "grad_norm": 0.3841329093851364, "learning_rate": 3.4136707467795975e-05, "loss": 0.8553, "step": 158 }, { "epoch": 0.4404432132963989, "grad_norm": 0.3672502229739841, "learning_rate": 3.3938361494434416e-05, "loss": 0.8543, "step": 159 }, { "epoch": 0.44321329639889195, "grad_norm": 0.35006854344753147, "learning_rate": 3.3739469054913396e-05, "loss": 0.9272, "step": 160 }, { "epoch": 0.44598337950138506, "grad_norm": 0.37656118751084383, "learning_rate": 3.354004703054466e-05, "loss": 0.8156, "step": 161 }, { "epoch": 0.4487534626038781, "grad_norm": 0.41484594703372124, "learning_rate": 3.334011234758933e-05, "loss": 0.9283, "step": 162 }, { "epoch": 0.4515235457063712, "grad_norm": 0.36059722487577023, "learning_rate": 3.3139681975821215e-05, "loss": 0.9589, "step": 163 }, { "epoch": 0.45429362880886426, "grad_norm": 0.49096051315762496, "learning_rate": 3.2938772927086506e-05, "loss": 0.8572, "step": 164 }, { "epoch": 0.45706371191135736, "grad_norm": 0.3657461189986915, "learning_rate": 3.273740225385988e-05, "loss": 0.8634, "step": 165 }, { "epoch": 0.4598337950138504, "grad_norm": 0.4619952678802632, "learning_rate": 3.253558704779711e-05, "loss": 0.8531, "step": 166 }, { "epoch": 0.4626038781163435, "grad_norm": 0.38029424915243915, "learning_rate": 3.2333344438284396e-05, "loss": 0.8544, "step": 167 }, { "epoch": 0.46537396121883656, "grad_norm": 0.41328982224991867, "learning_rate": 3.21306915909845e-05, "loss": 0.8833, "step": 168 }, { "epoch": 0.46814404432132967, "grad_norm": 0.3923589316746721, "learning_rate": 3.1927645706379764e-05, "loss": 0.9791, "step": 169 }, { "epoch": 0.4709141274238227, "grad_norm": 0.3424277457752046, "learning_rate": 3.1724224018312184e-05, "loss": 0.9123, "step": 170 }, { "epoch": 0.47368421052631576, "grad_norm": 0.46781422059203653, "learning_rate": 3.152044379252067e-05, "loss": 0.8154, "step": 171 }, { "epoch": 0.47645429362880887, "grad_norm": 0.3469918706960392, "learning_rate": 3.131632232517562e-05, "loss": 0.7862, "step": 172 }, { "epoch": 0.4792243767313019, "grad_norm": 0.3826762775779829, "learning_rate": 3.111187694141082e-05, "loss": 0.9544, "step": 173 }, { "epoch": 0.481994459833795, "grad_norm": 0.35177402551860143, "learning_rate": 3.0907124993852976e-05, "loss": 0.7952, "step": 174 }, { "epoch": 0.48476454293628807, "grad_norm": 0.32900068971172497, "learning_rate": 3.070208386114892e-05, "loss": 0.7085, "step": 175 }, { "epoch": 0.48753462603878117, "grad_norm": 0.7811598278937018, "learning_rate": 3.0496770946490492e-05, "loss": 0.7872, "step": 176 }, { "epoch": 0.4903047091412742, "grad_norm": 0.39312070018511214, "learning_rate": 3.029120367613747e-05, "loss": 0.8672, "step": 177 }, { "epoch": 0.4930747922437673, "grad_norm": 0.4140927363443536, "learning_rate": 3.0085399497938488e-05, "loss": 0.8214, "step": 178 }, { "epoch": 0.49584487534626037, "grad_norm": 0.616509191319175, "learning_rate": 2.9879375879850103e-05, "loss": 0.8303, "step": 179 }, { "epoch": 0.4986149584487535, "grad_norm": 0.429366430219185, "learning_rate": 2.967315030845415e-05, "loss": 0.9848, "step": 180 }, { "epoch": 0.5013850415512465, "grad_norm": 0.5293488318623817, "learning_rate": 2.9466740287473638e-05, "loss": 0.8849, "step": 181 }, { "epoch": 0.5041551246537396, "grad_norm": 0.4983497133183736, "learning_rate": 2.9260163336286968e-05, "loss": 0.8088, "step": 182 }, { "epoch": 0.5069252077562327, "grad_norm": 0.476254468852299, "learning_rate": 2.905343698844103e-05, "loss": 0.8621, "step": 183 }, { "epoch": 0.5096952908587258, "grad_norm": 0.4786974943663476, "learning_rate": 2.884657879016299e-05, "loss": 0.8227, "step": 184 }, { "epoch": 0.5124653739612188, "grad_norm": 0.4653172688884986, "learning_rate": 2.863960629887104e-05, "loss": 0.8095, "step": 185 }, { "epoch": 0.5124653739612188, "eval_loss": 0.6681232452392578, "eval_runtime": 104.7731, "eval_samples_per_second": 0.954, "eval_steps_per_second": 0.239, "step": 185 }, { "epoch": 0.5152354570637119, "grad_norm": 0.4046344642446634, "learning_rate": 2.843253708168415e-05, "loss": 0.8669, "step": 186 }, { "epoch": 0.518005540166205, "grad_norm": 0.39837130351678596, "learning_rate": 2.8225388713931077e-05, "loss": 0.7931, "step": 187 }, { "epoch": 0.5207756232686981, "grad_norm": 0.5213280111204851, "learning_rate": 2.8018178777658606e-05, "loss": 0.8635, "step": 188 }, { "epoch": 0.5235457063711911, "grad_norm": 0.597198365618595, "learning_rate": 2.781092486013922e-05, "loss": 0.8894, "step": 189 }, { "epoch": 0.5263157894736842, "grad_norm": 0.5240535623325078, "learning_rate": 2.7603644552378416e-05, "loss": 0.8702, "step": 190 }, { "epoch": 0.5290858725761773, "grad_norm": 0.6138846789060535, "learning_rate": 2.7396355447621592e-05, "loss": 0.8119, "step": 191 }, { "epoch": 0.5318559556786704, "grad_norm": 0.366272122964379, "learning_rate": 2.718907513986078e-05, "loss": 0.7842, "step": 192 }, { "epoch": 0.5346260387811634, "grad_norm": 0.40094490758320805, "learning_rate": 2.6981821222341402e-05, "loss": 0.764, "step": 193 }, { "epoch": 0.5373961218836565, "grad_norm": 0.5372324207647045, "learning_rate": 2.6774611286068925e-05, "loss": 0.842, "step": 194 }, { "epoch": 0.5401662049861495, "grad_norm": 0.48537277506100157, "learning_rate": 2.6567462918315854e-05, "loss": 0.8458, "step": 195 }, { "epoch": 0.5429362880886427, "grad_norm": 0.34780522923613105, "learning_rate": 2.6360393701128968e-05, "loss": 0.8072, "step": 196 }, { "epoch": 0.5457063711911357, "grad_norm": 0.47774919793354165, "learning_rate": 2.6153421209837014e-05, "loss": 0.8267, "step": 197 }, { "epoch": 0.5484764542936288, "grad_norm": 0.4528933180645613, "learning_rate": 2.5946563011558976e-05, "loss": 0.7658, "step": 198 }, { "epoch": 0.5512465373961218, "grad_norm": 0.36863071919365176, "learning_rate": 2.573983666371304e-05, "loss": 0.7463, "step": 199 }, { "epoch": 0.554016620498615, "grad_norm": 0.5401156027855689, "learning_rate": 2.5533259712526364e-05, "loss": 0.8011, "step": 200 }, { "epoch": 0.556786703601108, "grad_norm": 0.38878696385621075, "learning_rate": 2.532684969154585e-05, "loss": 0.7882, "step": 201 }, { "epoch": 0.5595567867036011, "grad_norm": 0.526790961475592, "learning_rate": 2.512062412014991e-05, "loss": 0.7619, "step": 202 }, { "epoch": 0.5623268698060941, "grad_norm": 0.38768035501734327, "learning_rate": 2.4914600502061514e-05, "loss": 0.8093, "step": 203 }, { "epoch": 0.5650969529085873, "grad_norm": 0.5944282601248911, "learning_rate": 2.4708796323862538e-05, "loss": 0.8492, "step": 204 }, { "epoch": 0.5678670360110804, "grad_norm": 0.425756127694364, "learning_rate": 2.450322905350952e-05, "loss": 0.7968, "step": 205 }, { "epoch": 0.5706371191135734, "grad_norm": 0.5587462841529842, "learning_rate": 2.429791613885109e-05, "loss": 0.8096, "step": 206 }, { "epoch": 0.5734072022160664, "grad_norm": 0.46265286926711546, "learning_rate": 2.4092875006147033e-05, "loss": 0.806, "step": 207 }, { "epoch": 0.5761772853185596, "grad_norm": 0.7142931706449129, "learning_rate": 2.38881230585892e-05, "loss": 0.7939, "step": 208 }, { "epoch": 0.5789473684210527, "grad_norm": 0.3849923319993063, "learning_rate": 2.3683677674824395e-05, "loss": 0.8323, "step": 209 }, { "epoch": 0.5817174515235457, "grad_norm": 0.40510481254312813, "learning_rate": 2.3479556207479336e-05, "loss": 0.7158, "step": 210 }, { "epoch": 0.5844875346260388, "grad_norm": 0.6209266610489942, "learning_rate": 2.327577598168783e-05, "loss": 0.7779, "step": 211 }, { "epoch": 0.5872576177285319, "grad_norm": 0.6177522163632044, "learning_rate": 2.3072354293620234e-05, "loss": 0.7929, "step": 212 }, { "epoch": 0.590027700831025, "grad_norm": 0.6303542857640766, "learning_rate": 2.28693084090155e-05, "loss": 0.8084, "step": 213 }, { "epoch": 0.592797783933518, "grad_norm": 0.4632833158309286, "learning_rate": 2.2666655561715606e-05, "loss": 0.8703, "step": 214 }, { "epoch": 0.5955678670360111, "grad_norm": 0.4375774421144756, "learning_rate": 2.246441295220289e-05, "loss": 0.7233, "step": 215 }, { "epoch": 0.5983379501385041, "grad_norm": 0.43062269722365204, "learning_rate": 2.2262597746140123e-05, "loss": 0.8269, "step": 216 }, { "epoch": 0.6011080332409973, "grad_norm": 0.5563739531634719, "learning_rate": 2.2061227072913493e-05, "loss": 0.8228, "step": 217 }, { "epoch": 0.6038781163434903, "grad_norm": 0.41792823244918276, "learning_rate": 2.186031802417879e-05, "loss": 0.7883, "step": 218 }, { "epoch": 0.6066481994459834, "grad_norm": 1.0052929026225181, "learning_rate": 2.165988765241067e-05, "loss": 0.7929, "step": 219 } ], "logging_steps": 1, "max_steps": 361, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 73, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 653642013081600.0, "train_batch_size": 5, "trial_name": null, "trial_params": null }