{ "best_metric": 0.0009602023055776954, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.33913294242045566, "eval_steps": 100, "global_step": 588, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005767567047966932, "grad_norm": 3.5087673664093018, "learning_rate": 2e-05, "loss": 0.9661, "step": 1 }, { "epoch": 0.0005767567047966932, "eval_loss": 0.256265789270401, "eval_runtime": 215.9962, "eval_samples_per_second": 8.028, "eval_steps_per_second": 2.009, "step": 1 }, { "epoch": 0.0011535134095933864, "grad_norm": 5.137994289398193, "learning_rate": 4e-05, "loss": 1.1907, "step": 2 }, { "epoch": 0.0017302701143900798, "grad_norm": 6.508111000061035, "learning_rate": 6e-05, "loss": 1.3703, "step": 3 }, { "epoch": 0.002307026819186773, "grad_norm": 2.984405040740967, "learning_rate": 8e-05, "loss": 0.7919, "step": 4 }, { "epoch": 0.0028837835239834664, "grad_norm": 6.96499490737915, "learning_rate": 0.0001, "loss": 1.5441, "step": 5 }, { "epoch": 0.0034605402287801595, "grad_norm": 5.478267192840576, "learning_rate": 0.00012, "loss": 0.7325, "step": 6 }, { "epoch": 0.004037296933576853, "grad_norm": 2.7913830280303955, "learning_rate": 0.00014, "loss": 0.4881, "step": 7 }, { "epoch": 0.004614053638373546, "grad_norm": 3.1158297061920166, "learning_rate": 0.00016, "loss": 0.2994, "step": 8 }, { "epoch": 0.005190810343170239, "grad_norm": 3.412966012954712, "learning_rate": 0.00018, "loss": 0.2548, "step": 9 }, { "epoch": 0.005767567047966933, "grad_norm": 3.8502895832061768, "learning_rate": 0.0002, "loss": 0.1836, "step": 10 }, { "epoch": 0.0063443237527636255, "grad_norm": 2.5958235263824463, "learning_rate": 0.00019999852288943748, "loss": 0.0852, "step": 11 }, { "epoch": 0.006921080457560319, "grad_norm": 4.420118808746338, "learning_rate": 0.00019999409160138693, "loss": 0.1172, "step": 12 }, { "epoch": 0.007497837162357013, "grad_norm": 2.869602918624878, "learning_rate": 0.00019998670626675846, "loss": 0.051, "step": 13 }, { "epoch": 0.008074593867153706, "grad_norm": 1.1618932485580444, "learning_rate": 0.0001999763671037312, "loss": 0.0148, "step": 14 }, { "epoch": 0.008651350571950399, "grad_norm": 4.91132116317749, "learning_rate": 0.00019996307441774684, "loss": 0.0444, "step": 15 }, { "epoch": 0.009228107276747092, "grad_norm": 1.5534745454788208, "learning_rate": 0.00019994682860150073, "loss": 0.0288, "step": 16 }, { "epoch": 0.009804863981543786, "grad_norm": 2.5079476833343506, "learning_rate": 0.0001999276301349302, "loss": 0.0389, "step": 17 }, { "epoch": 0.010381620686340479, "grad_norm": 2.015561580657959, "learning_rate": 0.00019990547958520044, "loss": 0.0538, "step": 18 }, { "epoch": 0.010958377391137171, "grad_norm": 0.5642620325088501, "learning_rate": 0.00019988037760668766, "loss": 0.0044, "step": 19 }, { "epoch": 0.011535134095933866, "grad_norm": 1.0824285745620728, "learning_rate": 0.0001998523249409598, "loss": 0.0099, "step": 20 }, { "epoch": 0.012111890800730558, "grad_norm": 1.1586933135986328, "learning_rate": 0.00019982132241675463, "loss": 0.0149, "step": 21 }, { "epoch": 0.012688647505527251, "grad_norm": 1.043748378753662, "learning_rate": 0.00019978737094995526, "loss": 0.0111, "step": 22 }, { "epoch": 0.013265404210323945, "grad_norm": 1.1925742626190186, "learning_rate": 0.0001997504715435631, "loss": 0.0212, "step": 23 }, { "epoch": 0.013842160915120638, "grad_norm": 4.5300445556640625, "learning_rate": 0.00019971062528766824, "loss": 0.0469, "step": 24 }, { "epoch": 0.01441891761991733, "grad_norm": 1.0790067911148071, "learning_rate": 0.00019966783335941717, "loss": 0.026, "step": 25 }, { "epoch": 0.014995674324714025, "grad_norm": 0.853151798248291, "learning_rate": 0.00019962209702297808, "loss": 0.0242, "step": 26 }, { "epoch": 0.015572431029510718, "grad_norm": 1.722130537033081, "learning_rate": 0.00019957341762950344, "loss": 0.0478, "step": 27 }, { "epoch": 0.016149187734307412, "grad_norm": 2.9154410362243652, "learning_rate": 0.00019952179661709028, "loss": 0.0796, "step": 28 }, { "epoch": 0.016725944439104103, "grad_norm": 1.8094756603240967, "learning_rate": 0.00019946723551073732, "loss": 0.0222, "step": 29 }, { "epoch": 0.017302701143900798, "grad_norm": 2.117493152618408, "learning_rate": 0.0001994097359223004, "loss": 0.0215, "step": 30 }, { "epoch": 0.017879457848697492, "grad_norm": 0.26287028193473816, "learning_rate": 0.0001993492995504444, "loss": 0.0066, "step": 31 }, { "epoch": 0.018456214553494183, "grad_norm": 30.89515495300293, "learning_rate": 0.0001992859281805935, "loss": 0.0778, "step": 32 }, { "epoch": 0.019032971258290877, "grad_norm": 4.260768413543701, "learning_rate": 0.00019921962368487807, "loss": 0.0816, "step": 33 }, { "epoch": 0.019609727963087572, "grad_norm": 1.1326998472213745, "learning_rate": 0.00019915038802207944, "loss": 0.0209, "step": 34 }, { "epoch": 0.020186484667884263, "grad_norm": 4.575008869171143, "learning_rate": 0.00019907822323757225, "loss": 0.1154, "step": 35 }, { "epoch": 0.020763241372680957, "grad_norm": 5.928327560424805, "learning_rate": 0.00019900313146326384, "loss": 0.2387, "step": 36 }, { "epoch": 0.02133999807747765, "grad_norm": 21.27695655822754, "learning_rate": 0.00019892511491753124, "loss": 0.202, "step": 37 }, { "epoch": 0.021916754782274343, "grad_norm": 3.236907482147217, "learning_rate": 0.00019884417590515573, "loss": 0.1093, "step": 38 }, { "epoch": 0.022493511487071037, "grad_norm": 2.1399052143096924, "learning_rate": 0.00019876031681725467, "loss": 0.0944, "step": 39 }, { "epoch": 0.02307026819186773, "grad_norm": 3.7547476291656494, "learning_rate": 0.000198673540131211, "loss": 0.1158, "step": 40 }, { "epoch": 0.023647024896664422, "grad_norm": 2.028630256652832, "learning_rate": 0.0001985838484105999, "loss": 0.062, "step": 41 }, { "epoch": 0.024223781601461117, "grad_norm": 1.1861765384674072, "learning_rate": 0.0001984912443051131, "loss": 0.0637, "step": 42 }, { "epoch": 0.02480053830625781, "grad_norm": 4.997815132141113, "learning_rate": 0.0001983957305504807, "loss": 0.0905, "step": 43 }, { "epoch": 0.025377295011054502, "grad_norm": 3.761312484741211, "learning_rate": 0.0001982973099683902, "loss": 0.0537, "step": 44 }, { "epoch": 0.025954051715851197, "grad_norm": 8.485271453857422, "learning_rate": 0.0001981959854664032, "loss": 0.2039, "step": 45 }, { "epoch": 0.02653080842064789, "grad_norm": 4.167719841003418, "learning_rate": 0.00019809176003786954, "loss": 0.1507, "step": 46 }, { "epoch": 0.027107565125444582, "grad_norm": 13.948970794677734, "learning_rate": 0.00019798463676183888, "loss": 0.0811, "step": 47 }, { "epoch": 0.027684321830241276, "grad_norm": 16.908702850341797, "learning_rate": 0.00019787461880296965, "loss": 0.2963, "step": 48 }, { "epoch": 0.02826107853503797, "grad_norm": 2.1487314701080322, "learning_rate": 0.00019776170941143565, "loss": 0.0582, "step": 49 }, { "epoch": 0.02883783523983466, "grad_norm": 3.16274356842041, "learning_rate": 0.00019764591192282993, "loss": 0.0622, "step": 50 }, { "epoch": 0.029414591944631356, "grad_norm": 2.1081337928771973, "learning_rate": 0.0001975272297580664, "loss": 0.1675, "step": 51 }, { "epoch": 0.02999134864942805, "grad_norm": 1.3357646465301514, "learning_rate": 0.00019740566642327867, "loss": 0.0394, "step": 52 }, { "epoch": 0.03056810535422474, "grad_norm": 2.5485501289367676, "learning_rate": 0.00019728122550971642, "loss": 0.1227, "step": 53 }, { "epoch": 0.031144862059021436, "grad_norm": 7.7235870361328125, "learning_rate": 0.0001971539106936394, "loss": 0.1759, "step": 54 }, { "epoch": 0.03172161876381813, "grad_norm": 5.811506748199463, "learning_rate": 0.00019702372573620881, "loss": 0.1152, "step": 55 }, { "epoch": 0.032298375468614825, "grad_norm": 1.4860045909881592, "learning_rate": 0.00019689067448337618, "loss": 0.0469, "step": 56 }, { "epoch": 0.032875132173411516, "grad_norm": 1.4226983785629272, "learning_rate": 0.00019675476086576972, "loss": 0.0395, "step": 57 }, { "epoch": 0.03345188887820821, "grad_norm": 0.34651094675064087, "learning_rate": 0.0001966159888985782, "loss": 0.0185, "step": 58 }, { "epoch": 0.034028645583004904, "grad_norm": 0.8589548468589783, "learning_rate": 0.00019647436268143247, "loss": 0.0172, "step": 59 }, { "epoch": 0.034605402287801595, "grad_norm": 1.4422504901885986, "learning_rate": 0.00019632988639828406, "loss": 0.0369, "step": 60 }, { "epoch": 0.035182158992598286, "grad_norm": 2.7948107719421387, "learning_rate": 0.00019618256431728194, "loss": 0.0839, "step": 61 }, { "epoch": 0.035758915697394984, "grad_norm": 0.973739743232727, "learning_rate": 0.00019603240079064604, "loss": 0.0967, "step": 62 }, { "epoch": 0.036335672402191675, "grad_norm": 1.0291193723678589, "learning_rate": 0.00019587940025453908, "loss": 0.0121, "step": 63 }, { "epoch": 0.036912429106988366, "grad_norm": 3.7419233322143555, "learning_rate": 0.00019572356722893518, "loss": 0.0257, "step": 64 }, { "epoch": 0.037489185811785064, "grad_norm": 0.9348726868629456, "learning_rate": 0.00019556490631748651, "loss": 0.0038, "step": 65 }, { "epoch": 0.038065942516581755, "grad_norm": 2.067941904067993, "learning_rate": 0.00019540342220738724, "loss": 0.011, "step": 66 }, { "epoch": 0.038642699221378446, "grad_norm": 5.878370761871338, "learning_rate": 0.00019523911966923507, "loss": 0.0176, "step": 67 }, { "epoch": 0.039219455926175144, "grad_norm": 2.2632899284362793, "learning_rate": 0.00019507200355689026, "loss": 0.0224, "step": 68 }, { "epoch": 0.039796212630971835, "grad_norm": 3.178539752960205, "learning_rate": 0.00019490207880733232, "loss": 0.0314, "step": 69 }, { "epoch": 0.040372969335768526, "grad_norm": 1.259251594543457, "learning_rate": 0.0001947293504405141, "loss": 0.0035, "step": 70 }, { "epoch": 0.040949726040565224, "grad_norm": 0.4863012433052063, "learning_rate": 0.00019455382355921353, "loss": 0.0102, "step": 71 }, { "epoch": 0.041526482745361915, "grad_norm": 52.19420623779297, "learning_rate": 0.00019437550334888278, "loss": 0.1446, "step": 72 }, { "epoch": 0.042103239450158605, "grad_norm": 1.488041639328003, "learning_rate": 0.00019419439507749515, "loss": 0.0246, "step": 73 }, { "epoch": 0.0426799961549553, "grad_norm": 0.3983250856399536, "learning_rate": 0.0001940105040953895, "loss": 0.0038, "step": 74 }, { "epoch": 0.043256752859751994, "grad_norm": 1.9497828483581543, "learning_rate": 0.00019382383583511206, "loss": 0.0124, "step": 75 }, { "epoch": 0.043833509564548685, "grad_norm": 5.796697616577148, "learning_rate": 0.00019363439581125601, "loss": 0.0264, "step": 76 }, { "epoch": 0.04441026626934538, "grad_norm": 4.687071323394775, "learning_rate": 0.00019344218962029857, "loss": 0.017, "step": 77 }, { "epoch": 0.044987022974142074, "grad_norm": 3.4617161750793457, "learning_rate": 0.00019324722294043558, "loss": 0.0973, "step": 78 }, { "epoch": 0.045563779678938765, "grad_norm": 1.2890996932983398, "learning_rate": 0.00019304950153141393, "loss": 0.0152, "step": 79 }, { "epoch": 0.04614053638373546, "grad_norm": 2.0128302574157715, "learning_rate": 0.00019284903123436127, "loss": 0.035, "step": 80 }, { "epoch": 0.046717293088532154, "grad_norm": 0.14461462199687958, "learning_rate": 0.00019264581797161343, "loss": 0.004, "step": 81 }, { "epoch": 0.047294049793328845, "grad_norm": 0.43382060527801514, "learning_rate": 0.00019243986774653956, "loss": 0.0044, "step": 82 }, { "epoch": 0.04787080649812554, "grad_norm": 0.5092380046844482, "learning_rate": 0.00019223118664336467, "loss": 0.0032, "step": 83 }, { "epoch": 0.048447563202922234, "grad_norm": 0.5211856365203857, "learning_rate": 0.0001920197808269901, "loss": 0.0059, "step": 84 }, { "epoch": 0.049024319907718925, "grad_norm": 1.6433213949203491, "learning_rate": 0.00019180565654281103, "loss": 0.1625, "step": 85 }, { "epoch": 0.04960107661251562, "grad_norm": 2.427119016647339, "learning_rate": 0.00019158882011653235, "loss": 0.0246, "step": 86 }, { "epoch": 0.05017783331731231, "grad_norm": 0.8261755108833313, "learning_rate": 0.00019136927795398157, "loss": 0.0113, "step": 87 }, { "epoch": 0.050754590022109004, "grad_norm": 0.15122078359127045, "learning_rate": 0.00019114703654091961, "loss": 0.005, "step": 88 }, { "epoch": 0.0513313467269057, "grad_norm": 0.5468443632125854, "learning_rate": 0.00019092210244284926, "loss": 0.017, "step": 89 }, { "epoch": 0.05190810343170239, "grad_norm": 0.640923798084259, "learning_rate": 0.00019069448230482118, "loss": 0.0323, "step": 90 }, { "epoch": 0.052484860136499084, "grad_norm": 0.2306308001279831, "learning_rate": 0.00019046418285123754, "loss": 0.0039, "step": 91 }, { "epoch": 0.05306161684129578, "grad_norm": 0.3201711177825928, "learning_rate": 0.00019023121088565352, "loss": 0.0037, "step": 92 }, { "epoch": 0.05363837354609247, "grad_norm": 1.7555040121078491, "learning_rate": 0.00018999557329057605, "loss": 0.0232, "step": 93 }, { "epoch": 0.054215130250889164, "grad_norm": 0.04883858934044838, "learning_rate": 0.00018975727702726076, "loss": 0.0016, "step": 94 }, { "epoch": 0.05479188695568586, "grad_norm": 0.11811670660972595, "learning_rate": 0.00018951632913550626, "loss": 0.0031, "step": 95 }, { "epoch": 0.05536864366048255, "grad_norm": 0.3842681050300598, "learning_rate": 0.000189272736733446, "loss": 0.0032, "step": 96 }, { "epoch": 0.055945400365279244, "grad_norm": 0.34558194875717163, "learning_rate": 0.0001890265070173382, "loss": 0.0041, "step": 97 }, { "epoch": 0.05652215707007594, "grad_norm": 0.23080027103424072, "learning_rate": 0.0001887776472613532, "loss": 0.0052, "step": 98 }, { "epoch": 0.05709891377487263, "grad_norm": 0.8554419875144958, "learning_rate": 0.00018852616481735841, "loss": 0.0142, "step": 99 }, { "epoch": 0.05767567047966932, "grad_norm": 0.6974347233772278, "learning_rate": 0.00018827206711470137, "loss": 0.0136, "step": 100 }, { "epoch": 0.05767567047966932, "eval_loss": 0.011846823617815971, "eval_runtime": 218.0508, "eval_samples_per_second": 7.952, "eval_steps_per_second": 1.99, "step": 100 }, { "epoch": 0.05825242718446602, "grad_norm": 0.20044860243797302, "learning_rate": 0.00018801536165999008, "loss": 0.0015, "step": 101 }, { "epoch": 0.05882918388926271, "grad_norm": 0.08200179040431976, "learning_rate": 0.00018775605603687127, "loss": 0.0009, "step": 102 }, { "epoch": 0.0594059405940594, "grad_norm": 0.5734694600105286, "learning_rate": 0.00018749415790580648, "loss": 0.0098, "step": 103 }, { "epoch": 0.0599826972988561, "grad_norm": 0.7785372138023376, "learning_rate": 0.00018722967500384564, "loss": 0.0114, "step": 104 }, { "epoch": 0.06055945400365279, "grad_norm": 0.019551914185285568, "learning_rate": 0.0001869626151443985, "loss": 0.0006, "step": 105 }, { "epoch": 0.06113621070844948, "grad_norm": 0.01751401647925377, "learning_rate": 0.00018669298621700378, "loss": 0.0004, "step": 106 }, { "epoch": 0.06171296741324618, "grad_norm": 0.8233848214149475, "learning_rate": 0.00018642079618709628, "loss": 0.0021, "step": 107 }, { "epoch": 0.06228972411804287, "grad_norm": 1.2167710065841675, "learning_rate": 0.00018614605309577137, "loss": 0.0083, "step": 108 }, { "epoch": 0.06286648082283956, "grad_norm": 2.0247766971588135, "learning_rate": 0.00018586876505954743, "loss": 0.0136, "step": 109 }, { "epoch": 0.06344323752763625, "grad_norm": 0.21868391335010529, "learning_rate": 0.00018558894027012626, "loss": 0.0047, "step": 110 }, { "epoch": 0.06401999423243296, "grad_norm": 3.0512166023254395, "learning_rate": 0.00018530658699415087, "loss": 0.046, "step": 111 }, { "epoch": 0.06459675093722965, "grad_norm": 0.36378923058509827, "learning_rate": 0.00018502171357296144, "loss": 0.002, "step": 112 }, { "epoch": 0.06517350764202634, "grad_norm": 3.8020143508911133, "learning_rate": 0.00018473432842234868, "loss": 0.0282, "step": 113 }, { "epoch": 0.06575026434682303, "grad_norm": 0.5668691396713257, "learning_rate": 0.00018444444003230549, "loss": 0.0029, "step": 114 }, { "epoch": 0.06632702105161972, "grad_norm": 0.14165480434894562, "learning_rate": 0.0001841520569667759, "loss": 0.0017, "step": 115 }, { "epoch": 0.06690377775641641, "grad_norm": 0.356355756521225, "learning_rate": 0.00018385718786340216, "loss": 0.0032, "step": 116 }, { "epoch": 0.06748053446121312, "grad_norm": 1.456584095954895, "learning_rate": 0.00018355984143326968, "loss": 0.0164, "step": 117 }, { "epoch": 0.06805729116600981, "grad_norm": 0.06781327724456787, "learning_rate": 0.00018326002646064948, "loss": 0.0008, "step": 118 }, { "epoch": 0.0686340478708065, "grad_norm": 0.03936789557337761, "learning_rate": 0.00018295775180273883, "loss": 0.0006, "step": 119 }, { "epoch": 0.06921080457560319, "grad_norm": 0.08526153862476349, "learning_rate": 0.0001826530263893995, "loss": 0.0009, "step": 120 }, { "epoch": 0.06978756128039988, "grad_norm": 1.0191057920455933, "learning_rate": 0.00018234585922289408, "loss": 0.0168, "step": 121 }, { "epoch": 0.07036431798519657, "grad_norm": 13.370498657226562, "learning_rate": 0.0001820362593776198, "loss": 0.5285, "step": 122 }, { "epoch": 0.07094107468999328, "grad_norm": 0.11651613563299179, "learning_rate": 0.0001817242359998408, "loss": 0.0012, "step": 123 }, { "epoch": 0.07151783139478997, "grad_norm": 0.13910582661628723, "learning_rate": 0.00018140979830741754, "loss": 0.0016, "step": 124 }, { "epoch": 0.07209458809958666, "grad_norm": 0.03518282249569893, "learning_rate": 0.0001810929555895348, "loss": 0.0004, "step": 125 }, { "epoch": 0.07267134480438335, "grad_norm": 0.20222316682338715, "learning_rate": 0.00018077371720642707, "loss": 0.0017, "step": 126 }, { "epoch": 0.07324810150918004, "grad_norm": 1.6948459148406982, "learning_rate": 0.0001804520925891021, "loss": 0.0206, "step": 127 }, { "epoch": 0.07382485821397673, "grad_norm": 0.5087080597877502, "learning_rate": 0.00018012809123906228, "loss": 0.0035, "step": 128 }, { "epoch": 0.07440161491877344, "grad_norm": 1.0946658849716187, "learning_rate": 0.000179801722728024, "loss": 0.0212, "step": 129 }, { "epoch": 0.07497837162357013, "grad_norm": 7.545551300048828, "learning_rate": 0.0001794729966976346, "loss": 0.3633, "step": 130 }, { "epoch": 0.07555512832836682, "grad_norm": 0.43119239807128906, "learning_rate": 0.00017914192285918806, "loss": 0.0012, "step": 131 }, { "epoch": 0.07613188503316351, "grad_norm": 0.3399253487586975, "learning_rate": 0.00017880851099333762, "loss": 0.004, "step": 132 }, { "epoch": 0.0767086417379602, "grad_norm": 0.2951245903968811, "learning_rate": 0.00017847277094980703, "loss": 0.0029, "step": 133 }, { "epoch": 0.07728539844275689, "grad_norm": 1.1110440492630005, "learning_rate": 0.00017813471264709958, "loss": 0.0059, "step": 134 }, { "epoch": 0.0778621551475536, "grad_norm": 3.981778383255005, "learning_rate": 0.0001777943460722051, "loss": 0.0168, "step": 135 }, { "epoch": 0.07843891185235029, "grad_norm": 0.5274855494499207, "learning_rate": 0.00017745168128030481, "loss": 0.0345, "step": 136 }, { "epoch": 0.07901566855714698, "grad_norm": 0.611136794090271, "learning_rate": 0.0001771067283944744, "loss": 0.0082, "step": 137 }, { "epoch": 0.07959242526194367, "grad_norm": 6.21735954284668, "learning_rate": 0.00017675949760538492, "loss": 0.1663, "step": 138 }, { "epoch": 0.08016918196674036, "grad_norm": 0.16410556435585022, "learning_rate": 0.00017640999917100164, "loss": 0.0011, "step": 139 }, { "epoch": 0.08074593867153705, "grad_norm": 8.113507270812988, "learning_rate": 0.00017605824341628118, "loss": 0.0862, "step": 140 }, { "epoch": 0.08132269537633376, "grad_norm": 2.632920265197754, "learning_rate": 0.00017570424073286634, "loss": 0.0879, "step": 141 }, { "epoch": 0.08189945208113045, "grad_norm": 0.20083442330360413, "learning_rate": 0.00017534800157877918, "loss": 0.0027, "step": 142 }, { "epoch": 0.08247620878592714, "grad_norm": 0.07561109215021133, "learning_rate": 0.000174989536478112, "loss": 0.002, "step": 143 }, { "epoch": 0.08305296549072383, "grad_norm": 1.2922168970108032, "learning_rate": 0.00017462885602071663, "loss": 0.0208, "step": 144 }, { "epoch": 0.08362972219552052, "grad_norm": 1.5450738668441772, "learning_rate": 0.00017426597086189126, "loss": 0.0118, "step": 145 }, { "epoch": 0.08420647890031721, "grad_norm": 0.2077200561761856, "learning_rate": 0.00017390089172206592, "loss": 0.0051, "step": 146 }, { "epoch": 0.08478323560511392, "grad_norm": 0.3651249408721924, "learning_rate": 0.0001735336293864857, "loss": 0.0112, "step": 147 }, { "epoch": 0.0853599923099106, "grad_norm": 0.7486603260040283, "learning_rate": 0.00017316419470489209, "loss": 0.0075, "step": 148 }, { "epoch": 0.0859367490147073, "grad_norm": 0.06879378110170364, "learning_rate": 0.00017279259859120255, "loss": 0.002, "step": 149 }, { "epoch": 0.08651350571950399, "grad_norm": 0.10414671897888184, "learning_rate": 0.00017241885202318787, "loss": 0.0033, "step": 150 }, { "epoch": 0.08709026242430068, "grad_norm": 2.870335578918457, "learning_rate": 0.00017204296604214818, "loss": 0.0103, "step": 151 }, { "epoch": 0.08766701912909737, "grad_norm": 0.5510638356208801, "learning_rate": 0.00017166495175258652, "loss": 0.0089, "step": 152 }, { "epoch": 0.08824377583389408, "grad_norm": 0.03852595388889313, "learning_rate": 0.0001712848203218809, "loss": 0.0017, "step": 153 }, { "epoch": 0.08882053253869077, "grad_norm": 1.3544578552246094, "learning_rate": 0.0001709025829799544, "loss": 0.0029, "step": 154 }, { "epoch": 0.08939728924348746, "grad_norm": 10.156106948852539, "learning_rate": 0.00017051825101894322, "loss": 0.0678, "step": 155 }, { "epoch": 0.08997404594828415, "grad_norm": 0.6341680884361267, "learning_rate": 0.0001701318357928634, "loss": 0.0045, "step": 156 }, { "epoch": 0.09055080265308084, "grad_norm": 8.50924015045166, "learning_rate": 0.00016974334871727517, "loss": 0.4122, "step": 157 }, { "epoch": 0.09112755935787753, "grad_norm": 0.1829361468553543, "learning_rate": 0.00016935280126894578, "loss": 0.0017, "step": 158 }, { "epoch": 0.09170431606267423, "grad_norm": 0.5836789011955261, "learning_rate": 0.00016896020498551045, "loss": 0.0078, "step": 159 }, { "epoch": 0.09228107276747093, "grad_norm": 0.8306235074996948, "learning_rate": 0.00016856557146513156, "loss": 0.0017, "step": 160 }, { "epoch": 0.09285782947226762, "grad_norm": 0.1426941305398941, "learning_rate": 0.00016816891236615588, "loss": 0.0011, "step": 161 }, { "epoch": 0.09343458617706431, "grad_norm": 9.040135383605957, "learning_rate": 0.00016777023940677034, "loss": 0.0109, "step": 162 }, { "epoch": 0.094011342881861, "grad_norm": 1.1638931035995483, "learning_rate": 0.00016736956436465573, "loss": 0.0035, "step": 163 }, { "epoch": 0.09458809958665769, "grad_norm": 2.2850794792175293, "learning_rate": 0.0001669668990766388, "loss": 0.0986, "step": 164 }, { "epoch": 0.0951648562914544, "grad_norm": 0.22328975796699524, "learning_rate": 0.00016656225543834244, "loss": 0.0029, "step": 165 }, { "epoch": 0.09574161299625109, "grad_norm": 0.0838567242026329, "learning_rate": 0.00016615564540383465, "loss": 0.0007, "step": 166 }, { "epoch": 0.09631836970104778, "grad_norm": 0.6008310317993164, "learning_rate": 0.0001657470809852749, "loss": 0.0087, "step": 167 }, { "epoch": 0.09689512640584447, "grad_norm": 0.9929326772689819, "learning_rate": 0.00016533657425255952, "loss": 0.0074, "step": 168 }, { "epoch": 0.09747188311064116, "grad_norm": 4.873377799987793, "learning_rate": 0.00016492413733296517, "loss": 0.031, "step": 169 }, { "epoch": 0.09804863981543785, "grad_norm": 0.5778681039810181, "learning_rate": 0.00016450978241079045, "loss": 0.0023, "step": 170 }, { "epoch": 0.09862539652023455, "grad_norm": 1.108101487159729, "learning_rate": 0.00016409352172699603, "loss": 0.0054, "step": 171 }, { "epoch": 0.09920215322503124, "grad_norm": 5.106866836547852, "learning_rate": 0.00016367536757884286, "loss": 0.034, "step": 172 }, { "epoch": 0.09977890992982794, "grad_norm": 0.020122259855270386, "learning_rate": 0.00016325533231952924, "loss": 0.0004, "step": 173 }, { "epoch": 0.10035566663462463, "grad_norm": 0.05907014012336731, "learning_rate": 0.00016283342835782548, "loss": 0.0008, "step": 174 }, { "epoch": 0.10093242333942132, "grad_norm": 0.10327869653701782, "learning_rate": 0.00016240966815770754, "loss": 0.0012, "step": 175 }, { "epoch": 0.10150918004421801, "grad_norm": 0.09473896026611328, "learning_rate": 0.00016198406423798882, "loss": 0.0012, "step": 176 }, { "epoch": 0.10208593674901471, "grad_norm": 0.04762684926390648, "learning_rate": 0.00016155662917195017, "loss": 0.001, "step": 177 }, { "epoch": 0.1026626934538114, "grad_norm": 0.24230125546455383, "learning_rate": 0.00016112737558696862, "loss": 0.0063, "step": 178 }, { "epoch": 0.1032394501586081, "grad_norm": 1.7020281553268433, "learning_rate": 0.00016069631616414428, "loss": 0.0104, "step": 179 }, { "epoch": 0.10381620686340479, "grad_norm": 0.37646153569221497, "learning_rate": 0.00016026346363792567, "loss": 0.0074, "step": 180 }, { "epoch": 0.10439296356820148, "grad_norm": 1.5477365255355835, "learning_rate": 0.00015982883079573355, "loss": 0.0062, "step": 181 }, { "epoch": 0.10496972027299817, "grad_norm": 0.43806129693984985, "learning_rate": 0.0001593924304775831, "loss": 0.0056, "step": 182 }, { "epoch": 0.10554647697779487, "grad_norm": 1.5641403198242188, "learning_rate": 0.00015895427557570486, "loss": 0.0144, "step": 183 }, { "epoch": 0.10612323368259156, "grad_norm": 0.05264851078391075, "learning_rate": 0.00015851437903416338, "loss": 0.0014, "step": 184 }, { "epoch": 0.10669999038738825, "grad_norm": 0.3245653808116913, "learning_rate": 0.00015807275384847528, "loss": 0.0047, "step": 185 }, { "epoch": 0.10727674709218495, "grad_norm": 7.0433526039123535, "learning_rate": 0.00015762941306522504, "loss": 0.0498, "step": 186 }, { "epoch": 0.10785350379698164, "grad_norm": 0.10678967833518982, "learning_rate": 0.00015718436978167977, "loss": 0.001, "step": 187 }, { "epoch": 0.10843026050177833, "grad_norm": 0.24095317721366882, "learning_rate": 0.00015673763714540214, "loss": 0.0033, "step": 188 }, { "epoch": 0.10900701720657503, "grad_norm": 0.05762599781155586, "learning_rate": 0.00015628922835386207, "loss": 0.0069, "step": 189 }, { "epoch": 0.10958377391137172, "grad_norm": 0.7496973276138306, "learning_rate": 0.00015583915665404687, "loss": 0.0095, "step": 190 }, { "epoch": 0.11016053061616841, "grad_norm": 0.3829174339771271, "learning_rate": 0.00015538743534206968, "loss": 0.0111, "step": 191 }, { "epoch": 0.1107372873209651, "grad_norm": 4.417044639587402, "learning_rate": 0.00015493407776277698, "loss": 0.0219, "step": 192 }, { "epoch": 0.1113140440257618, "grad_norm": 0.21296919882297516, "learning_rate": 0.00015447909730935413, "loss": 0.0015, "step": 193 }, { "epoch": 0.11189080073055849, "grad_norm": 1.1824345588684082, "learning_rate": 0.00015402250742292984, "loss": 0.0162, "step": 194 }, { "epoch": 0.11246755743535519, "grad_norm": 0.028691312298178673, "learning_rate": 0.00015356432159217893, "loss": 0.0006, "step": 195 }, { "epoch": 0.11304431414015188, "grad_norm": 0.00718648498877883, "learning_rate": 0.00015310455335292402, "loss": 0.0004, "step": 196 }, { "epoch": 0.11362107084494857, "grad_norm": 3.508272171020508, "learning_rate": 0.0001526432162877356, "loss": 0.0565, "step": 197 }, { "epoch": 0.11419782754974526, "grad_norm": 1.5187373161315918, "learning_rate": 0.00015218032402553066, "loss": 0.0092, "step": 198 }, { "epoch": 0.11477458425454196, "grad_norm": 1.3543766736984253, "learning_rate": 0.00015171589024117022, "loss": 0.0135, "step": 199 }, { "epoch": 0.11535134095933865, "grad_norm": 0.20114730298519135, "learning_rate": 0.00015124992865505523, "loss": 0.0021, "step": 200 }, { "epoch": 0.11535134095933865, "eval_loss": 0.0015093678375706077, "eval_runtime": 218.0466, "eval_samples_per_second": 7.952, "eval_steps_per_second": 1.99, "step": 200 }, { "epoch": 0.11592809766413535, "grad_norm": 0.05164756998419762, "learning_rate": 0.0001507824530327213, "loss": 0.0008, "step": 201 }, { "epoch": 0.11650485436893204, "grad_norm": 0.038486115634441376, "learning_rate": 0.00015031347718443211, "loss": 0.0009, "step": 202 }, { "epoch": 0.11708161107372873, "grad_norm": 0.9018555879592896, "learning_rate": 0.0001498430149647711, "loss": 0.0142, "step": 203 }, { "epoch": 0.11765836777852542, "grad_norm": 1.4734240770339966, "learning_rate": 0.00014937108027223266, "loss": 0.0128, "step": 204 }, { "epoch": 0.11823512448332212, "grad_norm": 1.4061514139175415, "learning_rate": 0.00014889768704881112, "loss": 0.0168, "step": 205 }, { "epoch": 0.1188118811881188, "grad_norm": 0.21486225724220276, "learning_rate": 0.00014842284927958906, "loss": 0.004, "step": 206 }, { "epoch": 0.11938863789291551, "grad_norm": 0.7184436917304993, "learning_rate": 0.00014794658099232425, "loss": 0.006, "step": 207 }, { "epoch": 0.1199653945977122, "grad_norm": 0.3992283046245575, "learning_rate": 0.00014746889625703502, "loss": 0.0033, "step": 208 }, { "epoch": 0.1205421513025089, "grad_norm": 0.4593133330345154, "learning_rate": 0.00014698980918558468, "loss": 0.0066, "step": 209 }, { "epoch": 0.12111890800730558, "grad_norm": 0.5490407347679138, "learning_rate": 0.00014650933393126476, "loss": 0.0091, "step": 210 }, { "epoch": 0.12169566471210227, "grad_norm": 0.022278431802988052, "learning_rate": 0.0001460274846883767, "loss": 0.0006, "step": 211 }, { "epoch": 0.12227242141689897, "grad_norm": 0.021869247779250145, "learning_rate": 0.0001455442756918126, "loss": 0.0005, "step": 212 }, { "epoch": 0.12284917812169567, "grad_norm": 0.18274378776550293, "learning_rate": 0.0001450597212166348, "loss": 0.0017, "step": 213 }, { "epoch": 0.12342593482649236, "grad_norm": 0.061118919402360916, "learning_rate": 0.00014457383557765386, "loss": 0.0008, "step": 214 }, { "epoch": 0.12400269153128905, "grad_norm": 0.1736149936914444, "learning_rate": 0.000144086633129006, "loss": 0.0016, "step": 215 }, { "epoch": 0.12457944823608574, "grad_norm": 1.178398847579956, "learning_rate": 0.00014359812826372895, "loss": 0.0226, "step": 216 }, { "epoch": 0.12515620494088245, "grad_norm": 0.39778366684913635, "learning_rate": 0.00014310833541333656, "loss": 0.0033, "step": 217 }, { "epoch": 0.12573296164567913, "grad_norm": 4.687065601348877, "learning_rate": 0.00014261726904739273, "loss": 0.0761, "step": 218 }, { "epoch": 0.12630971835047583, "grad_norm": 0.00944424793124199, "learning_rate": 0.00014212494367308372, "loss": 0.0003, "step": 219 }, { "epoch": 0.1268864750552725, "grad_norm": 0.01624249666929245, "learning_rate": 0.00014163137383478984, "loss": 0.0005, "step": 220 }, { "epoch": 0.1274632317600692, "grad_norm": 2.4178357124328613, "learning_rate": 0.00014113657411365547, "loss": 0.003, "step": 221 }, { "epoch": 0.12803998846486592, "grad_norm": 0.5570788383483887, "learning_rate": 0.00014064055912715845, "loss": 0.0064, "step": 222 }, { "epoch": 0.1286167451696626, "grad_norm": 0.07918275892734528, "learning_rate": 0.0001401433435286784, "loss": 0.001, "step": 223 }, { "epoch": 0.1291935018744593, "grad_norm": 0.11692162603139877, "learning_rate": 0.00013964494200706344, "loss": 0.0016, "step": 224 }, { "epoch": 0.12977025857925598, "grad_norm": 0.19409684836864471, "learning_rate": 0.0001391453692861967, "loss": 0.0015, "step": 225 }, { "epoch": 0.13034701528405268, "grad_norm": 0.4033575654029846, "learning_rate": 0.00013864464012456102, "loss": 0.006, "step": 226 }, { "epoch": 0.13092377198884936, "grad_norm": 0.331083208322525, "learning_rate": 0.00013814276931480308, "loss": 0.0095, "step": 227 }, { "epoch": 0.13150052869364606, "grad_norm": 0.07267259806394577, "learning_rate": 0.0001376397716832963, "loss": 0.0006, "step": 228 }, { "epoch": 0.13207728539844277, "grad_norm": 0.23224855959415436, "learning_rate": 0.00013713566208970303, "loss": 0.0028, "step": 229 }, { "epoch": 0.13265404210323944, "grad_norm": 1.4272027015686035, "learning_rate": 0.00013663045542653538, "loss": 0.0321, "step": 230 }, { "epoch": 0.13323079880803615, "grad_norm": 0.47390782833099365, "learning_rate": 0.00013612416661871533, "loss": 0.0199, "step": 231 }, { "epoch": 0.13380755551283283, "grad_norm": 0.22918830811977386, "learning_rate": 0.0001356168106231337, "loss": 0.0048, "step": 232 }, { "epoch": 0.13438431221762953, "grad_norm": 0.052093904465436935, "learning_rate": 0.0001351084024282086, "loss": 0.0006, "step": 233 }, { "epoch": 0.13496106892242624, "grad_norm": 0.18260619044303894, "learning_rate": 0.0001345989570534423, "loss": 0.0056, "step": 234 }, { "epoch": 0.1355378256272229, "grad_norm": 4.663100719451904, "learning_rate": 0.00013408848954897756, "loss": 0.1669, "step": 235 }, { "epoch": 0.13611458233201962, "grad_norm": 1.1344698667526245, "learning_rate": 0.00013357701499515346, "loss": 0.0076, "step": 236 }, { "epoch": 0.1366913390368163, "grad_norm": 0.2477646768093109, "learning_rate": 0.00013306454850205913, "loss": 0.0344, "step": 237 }, { "epoch": 0.137268095741613, "grad_norm": 0.1280204802751541, "learning_rate": 0.00013255110520908806, "loss": 0.0122, "step": 238 }, { "epoch": 0.13784485244640968, "grad_norm": 0.02419539913535118, "learning_rate": 0.00013203670028449045, "loss": 0.0009, "step": 239 }, { "epoch": 0.13842160915120638, "grad_norm": 3.135305404663086, "learning_rate": 0.00013152134892492525, "loss": 0.0179, "step": 240 }, { "epoch": 0.1389983658560031, "grad_norm": 0.20896202325820923, "learning_rate": 0.0001310050663550112, "loss": 0.0104, "step": 241 }, { "epoch": 0.13957512256079976, "grad_norm": 0.11529813706874847, "learning_rate": 0.00013048786782687705, "loss": 0.001, "step": 242 }, { "epoch": 0.14015187926559647, "grad_norm": 0.007383347023278475, "learning_rate": 0.00012996976861971094, "loss": 0.0003, "step": 243 }, { "epoch": 0.14072863597039315, "grad_norm": 0.1621176153421402, "learning_rate": 0.00012945078403930915, "loss": 0.002, "step": 244 }, { "epoch": 0.14130539267518985, "grad_norm": 1.5596797466278076, "learning_rate": 0.00012893092941762376, "loss": 0.0835, "step": 245 }, { "epoch": 0.14188214937998656, "grad_norm": 0.03435186296701431, "learning_rate": 0.0001284102201123098, "loss": 0.0007, "step": 246 }, { "epoch": 0.14245890608478323, "grad_norm": 6.803732872009277, "learning_rate": 0.00012788867150627161, "loss": 0.0405, "step": 247 }, { "epoch": 0.14303566278957994, "grad_norm": 3.6043403148651123, "learning_rate": 0.0001273662990072083, "loss": 0.0482, "step": 248 }, { "epoch": 0.14361241949437661, "grad_norm": 0.04856592416763306, "learning_rate": 0.00012684311804715855, "loss": 0.001, "step": 249 }, { "epoch": 0.14418917619917332, "grad_norm": 0.9073469042778015, "learning_rate": 0.0001263191440820448, "loss": 0.0084, "step": 250 }, { "epoch": 0.14476593290397, "grad_norm": 0.042365387082099915, "learning_rate": 0.00012579439259121662, "loss": 0.0012, "step": 251 }, { "epoch": 0.1453426896087667, "grad_norm": 5.175539493560791, "learning_rate": 0.00012526887907699348, "loss": 0.141, "step": 252 }, { "epoch": 0.1459194463135634, "grad_norm": 1.012317419052124, "learning_rate": 0.00012474261906420653, "loss": 0.0747, "step": 253 }, { "epoch": 0.14649620301836008, "grad_norm": 1.7460826635360718, "learning_rate": 0.00012421562809974033, "loss": 0.0277, "step": 254 }, { "epoch": 0.1470729597231568, "grad_norm": 1.2721880674362183, "learning_rate": 0.00012368792175207317, "loss": 0.0119, "step": 255 }, { "epoch": 0.14764971642795346, "grad_norm": 0.020873263478279114, "learning_rate": 0.00012315951561081754, "loss": 0.0005, "step": 256 }, { "epoch": 0.14822647313275017, "grad_norm": 0.016365395858883858, "learning_rate": 0.00012263042528625926, "loss": 0.0008, "step": 257 }, { "epoch": 0.14880322983754687, "grad_norm": 0.012678143568336964, "learning_rate": 0.00012210066640889648, "loss": 0.0007, "step": 258 }, { "epoch": 0.14937998654234355, "grad_norm": 0.01748340018093586, "learning_rate": 0.00012157025462897789, "loss": 0.0008, "step": 259 }, { "epoch": 0.14995674324714026, "grad_norm": 0.18821367621421814, "learning_rate": 0.00012103920561604027, "loss": 0.0081, "step": 260 }, { "epoch": 0.15053349995193693, "grad_norm": 0.0870683342218399, "learning_rate": 0.00012050753505844581, "loss": 0.0016, "step": 261 }, { "epoch": 0.15111025665673364, "grad_norm": 0.04264909401535988, "learning_rate": 0.00011997525866291841, "loss": 0.0013, "step": 262 }, { "epoch": 0.15168701336153032, "grad_norm": 0.14858825504779816, "learning_rate": 0.00011944239215407982, "loss": 0.0017, "step": 263 }, { "epoch": 0.15226377006632702, "grad_norm": 0.26314616203308105, "learning_rate": 0.00011890895127398495, "loss": 0.0031, "step": 264 }, { "epoch": 0.15284052677112372, "grad_norm": 0.121333546936512, "learning_rate": 0.00011837495178165706, "loss": 0.0021, "step": 265 }, { "epoch": 0.1534172834759204, "grad_norm": 0.04253160580992699, "learning_rate": 0.00011784040945262184, "loss": 0.0013, "step": 266 }, { "epoch": 0.1539940401807171, "grad_norm": 0.13042855262756348, "learning_rate": 0.00011730534007844185, "loss": 0.0014, "step": 267 }, { "epoch": 0.15457079688551378, "grad_norm": 0.02027931809425354, "learning_rate": 0.00011676975946624944, "loss": 0.0006, "step": 268 }, { "epoch": 0.1551475535903105, "grad_norm": 1.0954957008361816, "learning_rate": 0.00011623368343828028, "loss": 0.0178, "step": 269 }, { "epoch": 0.1557243102951072, "grad_norm": 0.012319994159042835, "learning_rate": 0.00011569712783140558, "loss": 0.0004, "step": 270 }, { "epoch": 0.15630106699990387, "grad_norm": 0.043320175260305405, "learning_rate": 0.00011516010849666445, "loss": 0.0005, "step": 271 }, { "epoch": 0.15687782370470058, "grad_norm": 0.4636062979698181, "learning_rate": 0.00011462264129879554, "loss": 0.0027, "step": 272 }, { "epoch": 0.15745458040949725, "grad_norm": 0.4148387014865875, "learning_rate": 0.00011408474211576833, "loss": 0.0057, "step": 273 }, { "epoch": 0.15803133711429396, "grad_norm": 0.004452509339898825, "learning_rate": 0.00011354642683831414, "loss": 0.0001, "step": 274 }, { "epoch": 0.15860809381909063, "grad_norm": 0.07342913746833801, "learning_rate": 0.00011300771136945658, "loss": 0.0008, "step": 275 }, { "epoch": 0.15918485052388734, "grad_norm": 0.003252458292990923, "learning_rate": 0.00011246861162404183, "loss": 0.0001, "step": 276 }, { "epoch": 0.15976160722868404, "grad_norm": 0.007086520083248615, "learning_rate": 0.00011192914352826849, "loss": 0.0003, "step": 277 }, { "epoch": 0.16033836393348072, "grad_norm": 0.12407305836677551, "learning_rate": 0.00011138932301921703, "loss": 0.0008, "step": 278 }, { "epoch": 0.16091512063827743, "grad_norm": 0.11356565356254578, "learning_rate": 0.0001108491660443789, "loss": 0.0038, "step": 279 }, { "epoch": 0.1614918773430741, "grad_norm": 0.09582390636205673, "learning_rate": 0.0001103086885611856, "loss": 0.0012, "step": 280 }, { "epoch": 0.1620686340478708, "grad_norm": 10.322026252746582, "learning_rate": 0.0001097679065365371, "loss": 0.0147, "step": 281 }, { "epoch": 0.1626453907526675, "grad_norm": 0.7572196125984192, "learning_rate": 0.00010922683594633021, "loss": 0.0049, "step": 282 }, { "epoch": 0.1632221474574642, "grad_norm": 0.07926348596811295, "learning_rate": 0.00010868549277498661, "loss": 0.0009, "step": 283 }, { "epoch": 0.1637989041622609, "grad_norm": 0.009958090260624886, "learning_rate": 0.00010814389301498067, "loss": 0.0002, "step": 284 }, { "epoch": 0.16437566086705757, "grad_norm": 2.576895236968994, "learning_rate": 0.00010760205266636685, "loss": 0.0788, "step": 285 }, { "epoch": 0.16495241757185428, "grad_norm": 0.3327861428260803, "learning_rate": 0.00010705998773630721, "loss": 0.004, "step": 286 }, { "epoch": 0.16552917427665095, "grad_norm": 0.5059621334075928, "learning_rate": 0.00010651771423859844, "loss": 0.0066, "step": 287 }, { "epoch": 0.16610593098144766, "grad_norm": 0.6268529891967773, "learning_rate": 0.0001059752481931988, "loss": 0.0037, "step": 288 }, { "epoch": 0.16668268768624436, "grad_norm": 0.4354836940765381, "learning_rate": 0.0001054326056257547, "loss": 0.0192, "step": 289 }, { "epoch": 0.16725944439104104, "grad_norm": 0.027867276221513748, "learning_rate": 0.00010488980256712762, "loss": 0.0007, "step": 290 }, { "epoch": 0.16783620109583774, "grad_norm": 0.032751694321632385, "learning_rate": 0.00010434685505292008, "loss": 0.0005, "step": 291 }, { "epoch": 0.16841295780063442, "grad_norm": 0.8886283040046692, "learning_rate": 0.0001038037791230023, "loss": 0.0015, "step": 292 }, { "epoch": 0.16898971450543113, "grad_norm": 0.16040530800819397, "learning_rate": 0.00010326059082103811, "loss": 0.0043, "step": 293 }, { "epoch": 0.16956647121022783, "grad_norm": 2.298178195953369, "learning_rate": 0.00010271730619401112, "loss": 0.0685, "step": 294 }, { "epoch": 0.1701432279150245, "grad_norm": 3.5856754779815674, "learning_rate": 0.00010217394129175046, "loss": 0.0209, "step": 295 }, { "epoch": 0.1707199846198212, "grad_norm": 1.4398480653762817, "learning_rate": 0.00010163051216645694, "loss": 0.0044, "step": 296 }, { "epoch": 0.1712967413246179, "grad_norm": 0.9761179685592651, "learning_rate": 0.00010108703487222855, "loss": 0.0222, "step": 297 }, { "epoch": 0.1718734980294146, "grad_norm": 0.37598463892936707, "learning_rate": 0.00010054352546458634, "loss": 0.0028, "step": 298 }, { "epoch": 0.17245025473421127, "grad_norm": 0.014264197088778019, "learning_rate": 0.0001, "loss": 0.0003, "step": 299 }, { "epoch": 0.17302701143900798, "grad_norm": 1.332945704460144, "learning_rate": 9.94564745354137e-05, "loss": 0.0276, "step": 300 }, { "epoch": 0.17302701143900798, "eval_loss": 0.0013285890454426408, "eval_runtime": 218.0721, "eval_samples_per_second": 7.951, "eval_steps_per_second": 1.99, "step": 300 }, { "epoch": 0.17360376814380468, "grad_norm": 0.04305526986718178, "learning_rate": 9.891296512777145e-05, "loss": 0.0004, "step": 301 }, { "epoch": 0.17418052484860136, "grad_norm": 0.3893098533153534, "learning_rate": 9.836948783354309e-05, "loss": 0.0055, "step": 302 }, { "epoch": 0.17475728155339806, "grad_norm": 0.025545770302414894, "learning_rate": 9.782605870824956e-05, "loss": 0.0009, "step": 303 }, { "epoch": 0.17533403825819474, "grad_norm": 0.4669552743434906, "learning_rate": 9.72826938059889e-05, "loss": 0.0052, "step": 304 }, { "epoch": 0.17591079496299145, "grad_norm": 0.039799150079488754, "learning_rate": 9.67394091789619e-05, "loss": 0.0013, "step": 305 }, { "epoch": 0.17648755166778815, "grad_norm": 1.244637370109558, "learning_rate": 9.619622087699775e-05, "loss": 0.0105, "step": 306 }, { "epoch": 0.17706430837258483, "grad_norm": 0.12360428273677826, "learning_rate": 9.565314494707995e-05, "loss": 0.0014, "step": 307 }, { "epoch": 0.17764106507738153, "grad_norm": 0.5047913789749146, "learning_rate": 9.511019743287242e-05, "loss": 0.0049, "step": 308 }, { "epoch": 0.1782178217821782, "grad_norm": 0.011301705613732338, "learning_rate": 9.456739437424529e-05, "loss": 0.0006, "step": 309 }, { "epoch": 0.17879457848697491, "grad_norm": 0.01795639842748642, "learning_rate": 9.402475180680125e-05, "loss": 0.0004, "step": 310 }, { "epoch": 0.1793713351917716, "grad_norm": 0.31354615092277527, "learning_rate": 9.348228576140157e-05, "loss": 0.0026, "step": 311 }, { "epoch": 0.1799480918965683, "grad_norm": 2.3069868087768555, "learning_rate": 9.294001226369282e-05, "loss": 0.111, "step": 312 }, { "epoch": 0.180524848601365, "grad_norm": 0.009788228198885918, "learning_rate": 9.239794733363316e-05, "loss": 0.0004, "step": 313 }, { "epoch": 0.18110160530616168, "grad_norm": 0.1635705977678299, "learning_rate": 9.185610698501938e-05, "loss": 0.0049, "step": 314 }, { "epoch": 0.18167836201095838, "grad_norm": 4.21808385848999, "learning_rate": 9.13145072250134e-05, "loss": 0.0147, "step": 315 }, { "epoch": 0.18225511871575506, "grad_norm": 3.3364384174346924, "learning_rate": 9.077316405366981e-05, "loss": 0.0334, "step": 316 }, { "epoch": 0.18283187542055176, "grad_norm": 0.6934912204742432, "learning_rate": 9.023209346346293e-05, "loss": 0.0045, "step": 317 }, { "epoch": 0.18340863212534847, "grad_norm": 0.23482157289981842, "learning_rate": 8.969131143881444e-05, "loss": 0.0013, "step": 318 }, { "epoch": 0.18398538883014515, "grad_norm": 0.05289175361394882, "learning_rate": 8.915083395562112e-05, "loss": 0.0011, "step": 319 }, { "epoch": 0.18456214553494185, "grad_norm": 0.024139447137713432, "learning_rate": 8.8610676980783e-05, "loss": 0.0008, "step": 320 }, { "epoch": 0.18513890223973853, "grad_norm": 0.016590509563684464, "learning_rate": 8.807085647173151e-05, "loss": 0.0006, "step": 321 }, { "epoch": 0.18571565894453523, "grad_norm": 0.010186581872403622, "learning_rate": 8.753138837595817e-05, "loss": 0.0005, "step": 322 }, { "epoch": 0.1862924156493319, "grad_norm": 0.30127134919166565, "learning_rate": 8.699228863054345e-05, "loss": 0.0034, "step": 323 }, { "epoch": 0.18686917235412862, "grad_norm": 0.2263249158859253, "learning_rate": 8.645357316168589e-05, "loss": 0.0018, "step": 324 }, { "epoch": 0.18744592905892532, "grad_norm": 0.01500785443931818, "learning_rate": 8.591525788423168e-05, "loss": 0.0005, "step": 325 }, { "epoch": 0.188022685763722, "grad_norm": 0.06944471597671509, "learning_rate": 8.537735870120446e-05, "loss": 0.0009, "step": 326 }, { "epoch": 0.1885994424685187, "grad_norm": 0.33884745836257935, "learning_rate": 8.483989150333556e-05, "loss": 0.0093, "step": 327 }, { "epoch": 0.18917619917331538, "grad_norm": 0.030364906415343285, "learning_rate": 8.430287216859443e-05, "loss": 0.0006, "step": 328 }, { "epoch": 0.18975295587811208, "grad_norm": 0.11773938685655594, "learning_rate": 8.376631656171973e-05, "loss": 0.0007, "step": 329 }, { "epoch": 0.1903297125829088, "grad_norm": 0.006642343942075968, "learning_rate": 8.323024053375057e-05, "loss": 0.0002, "step": 330 }, { "epoch": 0.19090646928770547, "grad_norm": 0.012968290597200394, "learning_rate": 8.26946599215582e-05, "loss": 0.0004, "step": 331 }, { "epoch": 0.19148322599250217, "grad_norm": 0.20224778354167938, "learning_rate": 8.215959054737817e-05, "loss": 0.0157, "step": 332 }, { "epoch": 0.19205998269729885, "grad_norm": 0.009717811830341816, "learning_rate": 8.162504821834295e-05, "loss": 0.0004, "step": 333 }, { "epoch": 0.19263673940209555, "grad_norm": 0.036681290715932846, "learning_rate": 8.109104872601504e-05, "loss": 0.0005, "step": 334 }, { "epoch": 0.19321349610689223, "grad_norm": 0.006973525043576956, "learning_rate": 8.055760784592024e-05, "loss": 0.0002, "step": 335 }, { "epoch": 0.19379025281168893, "grad_norm": 3.150785446166992, "learning_rate": 8.002474133708163e-05, "loss": 0.0229, "step": 336 }, { "epoch": 0.19436700951648564, "grad_norm": 0.05632968246936798, "learning_rate": 7.949246494155421e-05, "loss": 0.001, "step": 337 }, { "epoch": 0.19494376622128232, "grad_norm": 0.09060730785131454, "learning_rate": 7.896079438395975e-05, "loss": 0.0015, "step": 338 }, { "epoch": 0.19552052292607902, "grad_norm": 0.9037878513336182, "learning_rate": 7.842974537102216e-05, "loss": 0.0142, "step": 339 }, { "epoch": 0.1960972796308757, "grad_norm": 1.405368685722351, "learning_rate": 7.789933359110355e-05, "loss": 0.0397, "step": 340 }, { "epoch": 0.1966740363356724, "grad_norm": 0.0061925239861011505, "learning_rate": 7.736957471374076e-05, "loss": 0.0003, "step": 341 }, { "epoch": 0.1972507930404691, "grad_norm": 0.019943522289395332, "learning_rate": 7.684048438918248e-05, "loss": 0.0004, "step": 342 }, { "epoch": 0.19782754974526578, "grad_norm": 0.042050715535879135, "learning_rate": 7.631207824792687e-05, "loss": 0.0007, "step": 343 }, { "epoch": 0.1984043064500625, "grad_norm": 2.478391170501709, "learning_rate": 7.578437190025972e-05, "loss": 0.0174, "step": 344 }, { "epoch": 0.19898106315485917, "grad_norm": 0.21869735419750214, "learning_rate": 7.525738093579349e-05, "loss": 0.0017, "step": 345 }, { "epoch": 0.19955781985965587, "grad_norm": 0.3794045150279999, "learning_rate": 7.473112092300654e-05, "loss": 0.0024, "step": 346 }, { "epoch": 0.20013457656445255, "grad_norm": 0.2923913300037384, "learning_rate": 7.420560740878334e-05, "loss": 0.011, "step": 347 }, { "epoch": 0.20071133326924925, "grad_norm": 0.052272677421569824, "learning_rate": 7.368085591795522e-05, "loss": 0.0025, "step": 348 }, { "epoch": 0.20128808997404596, "grad_norm": 1.43683660030365, "learning_rate": 7.315688195284148e-05, "loss": 0.0381, "step": 349 }, { "epoch": 0.20186484667884264, "grad_norm": 0.012283033691346645, "learning_rate": 7.263370099279172e-05, "loss": 0.0002, "step": 350 }, { "epoch": 0.20244160338363934, "grad_norm": 0.029703160747885704, "learning_rate": 7.211132849372838e-05, "loss": 0.0006, "step": 351 }, { "epoch": 0.20301836008843602, "grad_norm": 0.029861683025956154, "learning_rate": 7.158977988769023e-05, "loss": 0.0005, "step": 352 }, { "epoch": 0.20359511679323272, "grad_norm": 0.01865243725478649, "learning_rate": 7.106907058237627e-05, "loss": 0.0005, "step": 353 }, { "epoch": 0.20417187349802943, "grad_norm": 0.06887234002351761, "learning_rate": 7.054921596069086e-05, "loss": 0.0015, "step": 354 }, { "epoch": 0.2047486302028261, "grad_norm": 0.2178819179534912, "learning_rate": 7.003023138028905e-05, "loss": 0.0025, "step": 355 }, { "epoch": 0.2053253869076228, "grad_norm": 0.005656382068991661, "learning_rate": 6.9512132173123e-05, "loss": 0.0003, "step": 356 }, { "epoch": 0.20590214361241949, "grad_norm": 0.07464775443077087, "learning_rate": 6.899493364498883e-05, "loss": 0.0017, "step": 357 }, { "epoch": 0.2064789003172162, "grad_norm": 0.6827694773674011, "learning_rate": 6.847865107507477e-05, "loss": 0.0011, "step": 358 }, { "epoch": 0.20705565702201287, "grad_norm": 0.7878766059875488, "learning_rate": 6.796329971550957e-05, "loss": 0.0015, "step": 359 }, { "epoch": 0.20763241372680957, "grad_norm": 0.006946924142539501, "learning_rate": 6.744889479091196e-05, "loss": 0.0004, "step": 360 }, { "epoch": 0.20820917043160628, "grad_norm": 0.8039273023605347, "learning_rate": 6.693545149794089e-05, "loss": 0.0202, "step": 361 }, { "epoch": 0.20878592713640295, "grad_norm": 0.4323229193687439, "learning_rate": 6.642298500484658e-05, "loss": 0.0183, "step": 362 }, { "epoch": 0.20936268384119966, "grad_norm": 0.2523692548274994, "learning_rate": 6.591151045102242e-05, "loss": 0.0012, "step": 363 }, { "epoch": 0.20993944054599634, "grad_norm": 0.0059658498503267765, "learning_rate": 6.540104294655777e-05, "loss": 0.0003, "step": 364 }, { "epoch": 0.21051619725079304, "grad_norm": 0.007587919011712074, "learning_rate": 6.489159757179145e-05, "loss": 0.0004, "step": 365 }, { "epoch": 0.21109295395558975, "grad_norm": 0.19016094505786896, "learning_rate": 6.438318937686632e-05, "loss": 0.0018, "step": 366 }, { "epoch": 0.21166971066038642, "grad_norm": 0.04421047493815422, "learning_rate": 6.387583338128471e-05, "loss": 0.0005, "step": 367 }, { "epoch": 0.21224646736518313, "grad_norm": 0.017507120966911316, "learning_rate": 6.336954457346462e-05, "loss": 0.0004, "step": 368 }, { "epoch": 0.2128232240699798, "grad_norm": 0.2780638635158539, "learning_rate": 6.286433791029698e-05, "loss": 0.0126, "step": 369 }, { "epoch": 0.2133999807747765, "grad_norm": 0.9244225025177002, "learning_rate": 6.236022831670371e-05, "loss": 0.0091, "step": 370 }, { "epoch": 0.2139767374795732, "grad_norm": 0.06660525500774384, "learning_rate": 6.185723068519695e-05, "loss": 0.0006, "step": 371 }, { "epoch": 0.2145534941843699, "grad_norm": 0.21372446417808533, "learning_rate": 6.135535987543899e-05, "loss": 0.0026, "step": 372 }, { "epoch": 0.2151302508891666, "grad_norm": 0.695388913154602, "learning_rate": 6.085463071380333e-05, "loss": 0.0052, "step": 373 }, { "epoch": 0.21570700759396327, "grad_norm": 0.02092049829661846, "learning_rate": 6.035505799293657e-05, "loss": 0.0005, "step": 374 }, { "epoch": 0.21628376429875998, "grad_norm": 0.06331460177898407, "learning_rate": 5.9856656471321636e-05, "loss": 0.0014, "step": 375 }, { "epoch": 0.21686052100355666, "grad_norm": 0.009214053861796856, "learning_rate": 5.935944087284154e-05, "loss": 0.0003, "step": 376 }, { "epoch": 0.21743727770835336, "grad_norm": 0.04911256581544876, "learning_rate": 5.886342588634458e-05, "loss": 0.0006, "step": 377 }, { "epoch": 0.21801403441315007, "grad_norm": 0.022015627473592758, "learning_rate": 5.836862616521016e-05, "loss": 0.0005, "step": 378 }, { "epoch": 0.21859079111794674, "grad_norm": 0.022273177281022072, "learning_rate": 5.787505632691625e-05, "loss": 0.0005, "step": 379 }, { "epoch": 0.21916754782274345, "grad_norm": 3.0781235694885254, "learning_rate": 5.738273095260728e-05, "loss": 0.0255, "step": 380 }, { "epoch": 0.21974430452754012, "grad_norm": 0.07478675246238708, "learning_rate": 5.6891664586663474e-05, "loss": 0.0012, "step": 381 }, { "epoch": 0.22032106123233683, "grad_norm": 0.4527650475502014, "learning_rate": 5.64018717362711e-05, "loss": 0.007, "step": 382 }, { "epoch": 0.2208978179371335, "grad_norm": 0.05685536935925484, "learning_rate": 5.5913366870994025e-05, "loss": 0.0005, "step": 383 }, { "epoch": 0.2214745746419302, "grad_norm": 0.03591479733586311, "learning_rate": 5.542616442234618e-05, "loss": 0.0007, "step": 384 }, { "epoch": 0.22205133134672692, "grad_norm": 0.0037991597782820463, "learning_rate": 5.494027878336528e-05, "loss": 0.0002, "step": 385 }, { "epoch": 0.2226280880515236, "grad_norm": 0.006658824626356363, "learning_rate": 5.445572430818744e-05, "loss": 0.0002, "step": 386 }, { "epoch": 0.2232048447563203, "grad_norm": 0.03407277166843414, "learning_rate": 5.397251531162332e-05, "loss": 0.0005, "step": 387 }, { "epoch": 0.22378160146111697, "grad_norm": 0.007777561899274588, "learning_rate": 5.349066606873525e-05, "loss": 0.0004, "step": 388 }, { "epoch": 0.22435835816591368, "grad_norm": 0.015606683678925037, "learning_rate": 5.3010190814415317e-05, "loss": 0.0004, "step": 389 }, { "epoch": 0.22493511487071038, "grad_norm": 0.18698696792125702, "learning_rate": 5.253110374296501e-05, "loss": 0.0046, "step": 390 }, { "epoch": 0.22551187157550706, "grad_norm": 0.008487400598824024, "learning_rate": 5.205341900767575e-05, "loss": 0.0004, "step": 391 }, { "epoch": 0.22608862828030377, "grad_norm": 0.264141708612442, "learning_rate": 5.1577150720410935e-05, "loss": 0.0031, "step": 392 }, { "epoch": 0.22666538498510044, "grad_norm": 0.015004309825599194, "learning_rate": 5.11023129511889e-05, "loss": 0.0003, "step": 393 }, { "epoch": 0.22724214168989715, "grad_norm": 0.07077132165431976, "learning_rate": 5.0628919727767386e-05, "loss": 0.0008, "step": 394 }, { "epoch": 0.22781889839469383, "grad_norm": 0.036862704902887344, "learning_rate": 5.015698503522892e-05, "loss": 0.0006, "step": 395 }, { "epoch": 0.22839565509949053, "grad_norm": 0.033869676291942596, "learning_rate": 4.9686522815567936e-05, "loss": 0.0006, "step": 396 }, { "epoch": 0.22897241180428723, "grad_norm": 0.0034785671159625053, "learning_rate": 4.921754696727869e-05, "loss": 0.0002, "step": 397 }, { "epoch": 0.2295491685090839, "grad_norm": 0.052212730050086975, "learning_rate": 4.875007134494479e-05, "loss": 0.0005, "step": 398 }, { "epoch": 0.23012592521388062, "grad_norm": 3.2847325801849365, "learning_rate": 4.82841097588298e-05, "loss": 0.0181, "step": 399 }, { "epoch": 0.2307026819186773, "grad_norm": 0.022678054869174957, "learning_rate": 4.7819675974469356e-05, "loss": 0.0003, "step": 400 }, { "epoch": 0.2307026819186773, "eval_loss": 0.0011641463497653604, "eval_runtime": 218.0453, "eval_samples_per_second": 7.952, "eval_steps_per_second": 1.99, "step": 400 }, { "epoch": 0.231279438623474, "grad_norm": 0.04207810387015343, "learning_rate": 4.735678371226441e-05, "loss": 0.0007, "step": 401 }, { "epoch": 0.2318561953282707, "grad_norm": 0.050111718475818634, "learning_rate": 4.6895446647076005e-05, "loss": 0.0005, "step": 402 }, { "epoch": 0.23243295203306738, "grad_norm": 0.1287953108549118, "learning_rate": 4.64356784078211e-05, "loss": 0.0006, "step": 403 }, { "epoch": 0.23300970873786409, "grad_norm": 0.32896485924720764, "learning_rate": 4.5977492577070194e-05, "loss": 0.0011, "step": 404 }, { "epoch": 0.23358646544266076, "grad_norm": 0.010879968293011189, "learning_rate": 4.552090269064587e-05, "loss": 0.0004, "step": 405 }, { "epoch": 0.23416322214745747, "grad_norm": 0.00759140495210886, "learning_rate": 4.5065922237223054e-05, "loss": 0.0002, "step": 406 }, { "epoch": 0.23473997885225414, "grad_norm": 0.003645148128271103, "learning_rate": 4.461256465793032e-05, "loss": 0.0002, "step": 407 }, { "epoch": 0.23531673555705085, "grad_norm": 0.01244024746119976, "learning_rate": 4.416084334595314e-05, "loss": 0.0004, "step": 408 }, { "epoch": 0.23589349226184755, "grad_norm": 0.05447947233915329, "learning_rate": 4.3710771646137904e-05, "loss": 0.0006, "step": 409 }, { "epoch": 0.23647024896664423, "grad_norm": 2.7542481422424316, "learning_rate": 4.326236285459789e-05, "loss": 0.0143, "step": 410 }, { "epoch": 0.23704700567144094, "grad_norm": 0.5154246091842651, "learning_rate": 4.281563021832027e-05, "loss": 0.012, "step": 411 }, { "epoch": 0.2376237623762376, "grad_norm": 0.026547789573669434, "learning_rate": 4.237058693477499e-05, "loss": 0.0007, "step": 412 }, { "epoch": 0.23820051908103432, "grad_norm": 0.017527271062135696, "learning_rate": 4.192724615152475e-05, "loss": 0.0005, "step": 413 }, { "epoch": 0.23877727578583102, "grad_norm": 0.2022116631269455, "learning_rate": 4.1485620965836626e-05, "loss": 0.0007, "step": 414 }, { "epoch": 0.2393540324906277, "grad_norm": 0.49033060669898987, "learning_rate": 4.1045724424295186e-05, "loss": 0.0024, "step": 415 }, { "epoch": 0.2399307891954244, "grad_norm": 0.0036780948285013437, "learning_rate": 4.060756952241691e-05, "loss": 0.0002, "step": 416 }, { "epoch": 0.24050754590022108, "grad_norm": 0.15310168266296387, "learning_rate": 4.017116920426651e-05, "loss": 0.0012, "step": 417 }, { "epoch": 0.2410843026050178, "grad_norm": 0.11326033622026443, "learning_rate": 3.973653636207437e-05, "loss": 0.001, "step": 418 }, { "epoch": 0.24166105930981446, "grad_norm": 0.009829124435782433, "learning_rate": 3.9303683835855746e-05, "loss": 0.0002, "step": 419 }, { "epoch": 0.24223781601461117, "grad_norm": 0.5462132692337036, "learning_rate": 3.887262441303139e-05, "loss": 0.0038, "step": 420 }, { "epoch": 0.24281457271940787, "grad_norm": 0.002651052549481392, "learning_rate": 3.844337082804984e-05, "loss": 0.0001, "step": 421 }, { "epoch": 0.24339132942420455, "grad_norm": 0.006307276897132397, "learning_rate": 3.801593576201118e-05, "loss": 0.0003, "step": 422 }, { "epoch": 0.24396808612900125, "grad_norm": 0.0033793742768466473, "learning_rate": 3.759033184229247e-05, "loss": 0.0003, "step": 423 }, { "epoch": 0.24454484283379793, "grad_norm": 0.040325820446014404, "learning_rate": 3.7166571642174544e-05, "loss": 0.0005, "step": 424 }, { "epoch": 0.24512159953859464, "grad_norm": 0.06400511413812637, "learning_rate": 3.674466768047078e-05, "loss": 0.0006, "step": 425 }, { "epoch": 0.24569835624339134, "grad_norm": 0.5128046870231628, "learning_rate": 3.632463242115714e-05, "loss": 0.001, "step": 426 }, { "epoch": 0.24627511294818802, "grad_norm": 0.0029268236830830574, "learning_rate": 3.590647827300405e-05, "loss": 0.0002, "step": 427 }, { "epoch": 0.24685186965298472, "grad_norm": 0.012032849714159966, "learning_rate": 3.549021758920955e-05, "loss": 0.0002, "step": 428 }, { "epoch": 0.2474286263577814, "grad_norm": 1.6958905458450317, "learning_rate": 3.507586266703482e-05, "loss": 0.1096, "step": 429 }, { "epoch": 0.2480053830625781, "grad_norm": 0.1854211688041687, "learning_rate": 3.466342574744047e-05, "loss": 0.0012, "step": 430 }, { "epoch": 0.24858213976737478, "grad_norm": 0.005380729679018259, "learning_rate": 3.4252919014725136e-05, "loss": 0.0002, "step": 431 }, { "epoch": 0.2491588964721715, "grad_norm": 0.00903862714767456, "learning_rate": 3.384435459616536e-05, "loss": 0.0002, "step": 432 }, { "epoch": 0.2497356531769682, "grad_norm": 0.01038447581231594, "learning_rate": 3.343774456165756e-05, "loss": 0.0003, "step": 433 }, { "epoch": 0.2503124098817649, "grad_norm": 2.1181607246398926, "learning_rate": 3.303310092336125e-05, "loss": 0.0081, "step": 434 }, { "epoch": 0.2508891665865616, "grad_norm": 0.20273631811141968, "learning_rate": 3.263043563534428e-05, "loss": 0.0013, "step": 435 }, { "epoch": 0.25146592329135825, "grad_norm": 0.006932437885552645, "learning_rate": 3.222976059322969e-05, "loss": 0.0003, "step": 436 }, { "epoch": 0.25204267999615493, "grad_norm": 0.010318148881196976, "learning_rate": 3.1831087633844145e-05, "loss": 0.0003, "step": 437 }, { "epoch": 0.25261943670095166, "grad_norm": 0.01054286491125822, "learning_rate": 3.143442853486846e-05, "loss": 0.0004, "step": 438 }, { "epoch": 0.25319619340574834, "grad_norm": 0.017296332865953445, "learning_rate": 3.1039795014489556e-05, "loss": 0.0004, "step": 439 }, { "epoch": 0.253772950110545, "grad_norm": 0.0033669646363705397, "learning_rate": 3.0647198731054236e-05, "loss": 0.0002, "step": 440 }, { "epoch": 0.25434970681534175, "grad_norm": 0.016607975587248802, "learning_rate": 3.0256651282724857e-05, "loss": 0.0006, "step": 441 }, { "epoch": 0.2549264635201384, "grad_norm": 0.0015658403281122446, "learning_rate": 2.9868164207136616e-05, "loss": 0.0001, "step": 442 }, { "epoch": 0.2555032202249351, "grad_norm": 0.031042572110891342, "learning_rate": 2.948174898105679e-05, "loss": 0.0003, "step": 443 }, { "epoch": 0.25607997692973183, "grad_norm": 0.006565702613443136, "learning_rate": 2.9097417020045648e-05, "loss": 0.0002, "step": 444 }, { "epoch": 0.2566567336345285, "grad_norm": 0.031551264226436615, "learning_rate": 2.87151796781191e-05, "loss": 0.001, "step": 445 }, { "epoch": 0.2572334903393252, "grad_norm": 0.029418721795082092, "learning_rate": 2.8335048247413486e-05, "loss": 0.0005, "step": 446 }, { "epoch": 0.25781024704412187, "grad_norm": 0.10343458503484726, "learning_rate": 2.795703395785184e-05, "loss": 0.0015, "step": 447 }, { "epoch": 0.2583870037489186, "grad_norm": 0.012583478353917599, "learning_rate": 2.758114797681215e-05, "loss": 0.0004, "step": 448 }, { "epoch": 0.2589637604537153, "grad_norm": 0.0019141758093610406, "learning_rate": 2.7207401408797483e-05, "loss": 0.0001, "step": 449 }, { "epoch": 0.25954051715851195, "grad_norm": 0.02962387539446354, "learning_rate": 2.6835805295107897e-05, "loss": 0.0002, "step": 450 }, { "epoch": 0.2601172738633087, "grad_norm": 0.017783278599381447, "learning_rate": 2.646637061351429e-05, "loss": 0.0002, "step": 451 }, { "epoch": 0.26069403056810536, "grad_norm": 1.4393969774246216, "learning_rate": 2.6099108277934103e-05, "loss": 0.0557, "step": 452 }, { "epoch": 0.26127078727290204, "grad_norm": 0.014345666393637657, "learning_rate": 2.5734029138108773e-05, "loss": 0.0001, "step": 453 }, { "epoch": 0.2618475439776987, "grad_norm": 1.9257153272628784, "learning_rate": 2.5371143979283396e-05, "loss": 0.0058, "step": 454 }, { "epoch": 0.26242430068249545, "grad_norm": 0.002763155847787857, "learning_rate": 2.5010463521887996e-05, "loss": 0.0002, "step": 455 }, { "epoch": 0.2630010573872921, "grad_norm": 0.005358894355595112, "learning_rate": 2.4651998421220847e-05, "loss": 0.0002, "step": 456 }, { "epoch": 0.2635778140920888, "grad_norm": 0.0020093510393053293, "learning_rate": 2.42957592671337e-05, "loss": 0.0001, "step": 457 }, { "epoch": 0.26415457079688553, "grad_norm": 0.05425187200307846, "learning_rate": 2.3941756583718845e-05, "loss": 0.0003, "step": 458 }, { "epoch": 0.2647313275016822, "grad_norm": 0.031319353729486465, "learning_rate": 2.3590000828998372e-05, "loss": 0.0004, "step": 459 }, { "epoch": 0.2653080842064789, "grad_norm": 0.004056790843605995, "learning_rate": 2.324050239461507e-05, "loss": 0.0003, "step": 460 }, { "epoch": 0.26588484091127557, "grad_norm": 0.0011511758202686906, "learning_rate": 2.289327160552559e-05, "loss": 0.0001, "step": 461 }, { "epoch": 0.2664615976160723, "grad_norm": 0.024370364844799042, "learning_rate": 2.2548318719695182e-05, "loss": 0.0003, "step": 462 }, { "epoch": 0.267038354320869, "grad_norm": 0.002782194409519434, "learning_rate": 2.22056539277949e-05, "loss": 0.0002, "step": 463 }, { "epoch": 0.26761511102566565, "grad_norm": 0.15251454710960388, "learning_rate": 2.186528735290041e-05, "loss": 0.0022, "step": 464 }, { "epoch": 0.2681918677304624, "grad_norm": 0.15815432369709015, "learning_rate": 2.1527229050193e-05, "loss": 0.0016, "step": 465 }, { "epoch": 0.26876862443525906, "grad_norm": 0.1786395162343979, "learning_rate": 2.1191489006662414e-05, "loss": 0.0076, "step": 466 }, { "epoch": 0.26934538114005574, "grad_norm": 0.38532155752182007, "learning_rate": 2.085807714081195e-05, "loss": 0.0046, "step": 467 }, { "epoch": 0.26992213784485247, "grad_norm": 1.1408209800720215, "learning_rate": 2.0527003302365412e-05, "loss": 0.0057, "step": 468 }, { "epoch": 0.27049889454964915, "grad_norm": 0.14827367663383484, "learning_rate": 2.0198277271976052e-05, "loss": 0.0021, "step": 469 }, { "epoch": 0.2710756512544458, "grad_norm": 0.003028438426554203, "learning_rate": 1.9871908760937717e-05, "loss": 0.0001, "step": 470 }, { "epoch": 0.2716524079592425, "grad_norm": 0.010827925056219101, "learning_rate": 1.9547907410897904e-05, "loss": 0.0003, "step": 471 }, { "epoch": 0.27222916466403924, "grad_norm": 0.034201521426439285, "learning_rate": 1.9226282793572924e-05, "loss": 0.0005, "step": 472 }, { "epoch": 0.2728059213688359, "grad_norm": 0.0016246727900579572, "learning_rate": 1.8907044410465225e-05, "loss": 0.0001, "step": 473 }, { "epoch": 0.2733826780736326, "grad_norm": 0.011455918662250042, "learning_rate": 1.8590201692582476e-05, "loss": 0.0002, "step": 474 }, { "epoch": 0.2739594347784293, "grad_norm": 0.02296796254813671, "learning_rate": 1.8275764000159222e-05, "loss": 0.0003, "step": 475 }, { "epoch": 0.274536191483226, "grad_norm": 0.05785752832889557, "learning_rate": 1.7963740622380197e-05, "loss": 0.0019, "step": 476 }, { "epoch": 0.2751129481880227, "grad_norm": 0.01563318818807602, "learning_rate": 1.7654140777105953e-05, "loss": 0.0001, "step": 477 }, { "epoch": 0.27568970489281935, "grad_norm": 0.2800479531288147, "learning_rate": 1.7346973610600527e-05, "loss": 0.0013, "step": 478 }, { "epoch": 0.2762664615976161, "grad_norm": 0.007839820347726345, "learning_rate": 1.7042248197261202e-05, "loss": 0.0004, "step": 479 }, { "epoch": 0.27684321830241276, "grad_norm": 0.11920185387134552, "learning_rate": 1.6739973539350538e-05, "loss": 0.0008, "step": 480 }, { "epoch": 0.27741997500720944, "grad_norm": 0.044166430830955505, "learning_rate": 1.644015856673031e-05, "loss": 0.0006, "step": 481 }, { "epoch": 0.2779967317120062, "grad_norm": 0.007687595672905445, "learning_rate": 1.6142812136597853e-05, "loss": 0.0003, "step": 482 }, { "epoch": 0.27857348841680285, "grad_norm": 0.36322271823883057, "learning_rate": 1.5847943033224134e-05, "loss": 0.0007, "step": 483 }, { "epoch": 0.2791502451215995, "grad_norm": 0.03229326382279396, "learning_rate": 1.555555996769452e-05, "loss": 0.0005, "step": 484 }, { "epoch": 0.2797270018263962, "grad_norm": 0.060807004570961, "learning_rate": 1.526567157765132e-05, "loss": 0.0009, "step": 485 }, { "epoch": 0.28030375853119294, "grad_norm": 0.0012857717229053378, "learning_rate": 1.4978286427038601e-05, "loss": 0.0001, "step": 486 }, { "epoch": 0.2808805152359896, "grad_norm": 0.5338165163993835, "learning_rate": 1.4693413005849143e-05, "loss": 0.0009, "step": 487 }, { "epoch": 0.2814572719407863, "grad_norm": 0.03121727518737316, "learning_rate": 1.4411059729873765e-05, "loss": 0.0006, "step": 488 }, { "epoch": 0.282034028645583, "grad_norm": 0.055299464613199234, "learning_rate": 1.4131234940452598e-05, "loss": 0.0008, "step": 489 }, { "epoch": 0.2826107853503797, "grad_norm": 0.004643857013434172, "learning_rate": 1.3853946904228676e-05, "loss": 0.0002, "step": 490 }, { "epoch": 0.2831875420551764, "grad_norm": 0.6359913349151611, "learning_rate": 1.357920381290374e-05, "loss": 0.038, "step": 491 }, { "epoch": 0.2837642987599731, "grad_norm": 0.8690559267997742, "learning_rate": 1.3307013782996235e-05, "loss": 0.0355, "step": 492 }, { "epoch": 0.2843410554647698, "grad_norm": 0.5384951233863831, "learning_rate": 1.303738485560153e-05, "loss": 0.0196, "step": 493 }, { "epoch": 0.28491781216956646, "grad_norm": 0.005789866670966148, "learning_rate": 1.2770324996154381e-05, "loss": 0.0002, "step": 494 }, { "epoch": 0.28549456887436314, "grad_norm": 0.7110347151756287, "learning_rate": 1.2505842094193521e-05, "loss": 0.0086, "step": 495 }, { "epoch": 0.2860713255791599, "grad_norm": 0.008576606400310993, "learning_rate": 1.2243943963128734e-05, "loss": 0.0004, "step": 496 }, { "epoch": 0.28664808228395655, "grad_norm": 0.09735046327114105, "learning_rate": 1.1984638340009934e-05, "loss": 0.0054, "step": 497 }, { "epoch": 0.28722483898875323, "grad_norm": 0.06272801011800766, "learning_rate": 1.1727932885298654e-05, "loss": 0.0005, "step": 498 }, { "epoch": 0.28780159569354996, "grad_norm": 0.02203204482793808, "learning_rate": 1.14738351826416e-05, "loss": 0.0004, "step": 499 }, { "epoch": 0.28837835239834664, "grad_norm": 0.0017129405168816447, "learning_rate": 1.1222352738646825e-05, "loss": 0.0001, "step": 500 }, { "epoch": 0.28837835239834664, "eval_loss": 0.0009602023055776954, "eval_runtime": 218.136, "eval_samples_per_second": 7.949, "eval_steps_per_second": 1.99, "step": 500 }, { "epoch": 0.2889551091031433, "grad_norm": 0.21408432722091675, "learning_rate": 1.0973492982661793e-05, "loss": 0.0032, "step": 501 }, { "epoch": 0.28953186580794, "grad_norm": 0.07948876172304153, "learning_rate": 1.0727263266554011e-05, "loss": 0.0017, "step": 502 }, { "epoch": 0.2901086225127367, "grad_norm": 0.16862964630126953, "learning_rate": 1.0483670864493778e-05, "loss": 0.0019, "step": 503 }, { "epoch": 0.2906853792175334, "grad_norm": 0.01340674702078104, "learning_rate": 1.024272297273925e-05, "loss": 0.0002, "step": 504 }, { "epoch": 0.2912621359223301, "grad_norm": 0.0023153014481067657, "learning_rate": 1.0004426709423974e-05, "loss": 0.0001, "step": 505 }, { "epoch": 0.2918388926271268, "grad_norm": 0.005769818089902401, "learning_rate": 9.768789114346499e-06, "loss": 0.0002, "step": 506 }, { "epoch": 0.2924156493319235, "grad_norm": 0.019190780818462372, "learning_rate": 9.535817148762461e-06, "loss": 0.0002, "step": 507 }, { "epoch": 0.29299240603672017, "grad_norm": 1.366793155670166, "learning_rate": 9.305517695178833e-06, "loss": 0.005, "step": 508 }, { "epoch": 0.29356916274151684, "grad_norm": 0.017835253849625587, "learning_rate": 9.07789755715075e-06, "loss": 0.0003, "step": 509 }, { "epoch": 0.2941459194463136, "grad_norm": 0.05861745774745941, "learning_rate": 8.85296345908041e-06, "loss": 0.0005, "step": 510 }, { "epoch": 0.29472267615111025, "grad_norm": 0.4936632812023163, "learning_rate": 8.630722046018458e-06, "loss": 0.0008, "step": 511 }, { "epoch": 0.29529943285590693, "grad_norm": 0.008170605637133121, "learning_rate": 8.411179883467667e-06, "loss": 0.0003, "step": 512 }, { "epoch": 0.29587618956070366, "grad_norm": 0.002018228406086564, "learning_rate": 8.194343457188991e-06, "loss": 0.0001, "step": 513 }, { "epoch": 0.29645294626550034, "grad_norm": 0.1278875768184662, "learning_rate": 7.98021917300993e-06, "loss": 0.001, "step": 514 }, { "epoch": 0.297029702970297, "grad_norm": 0.18191634118556976, "learning_rate": 7.76881335663534e-06, "loss": 0.0046, "step": 515 }, { "epoch": 0.29760645967509375, "grad_norm": 0.00487172557041049, "learning_rate": 7.560132253460483e-06, "loss": 0.0002, "step": 516 }, { "epoch": 0.2981832163798904, "grad_norm": 0.003059436334297061, "learning_rate": 7.354182028386591e-06, "loss": 0.0002, "step": 517 }, { "epoch": 0.2987599730846871, "grad_norm": 0.4325723946094513, "learning_rate": 7.150968765638743e-06, "loss": 0.0108, "step": 518 }, { "epoch": 0.2993367297894838, "grad_norm": 0.6904163360595703, "learning_rate": 6.950498468586075e-06, "loss": 0.004, "step": 519 }, { "epoch": 0.2999134864942805, "grad_norm": 0.02728419378399849, "learning_rate": 6.75277705956443e-06, "loss": 0.0004, "step": 520 }, { "epoch": 0.3004902431990772, "grad_norm": 0.0645672082901001, "learning_rate": 6.5578103797014455e-06, "loss": 0.0012, "step": 521 }, { "epoch": 0.30106699990387387, "grad_norm": 0.13279329240322113, "learning_rate": 6.365604188743979e-06, "loss": 0.0008, "step": 522 }, { "epoch": 0.3016437566086706, "grad_norm": 0.020064158365130424, "learning_rate": 6.176164164887932e-06, "loss": 0.0005, "step": 523 }, { "epoch": 0.3022205133134673, "grad_norm": 0.02414029650390148, "learning_rate": 5.9894959046105095e-06, "loss": 0.0005, "step": 524 }, { "epoch": 0.30279727001826395, "grad_norm": 0.11476976424455643, "learning_rate": 5.805604922504859e-06, "loss": 0.0035, "step": 525 }, { "epoch": 0.30337402672306063, "grad_norm": 0.15488499402999878, "learning_rate": 5.6244966511172505e-06, "loss": 0.006, "step": 526 }, { "epoch": 0.30395078342785736, "grad_norm": 1.3745477199554443, "learning_rate": 5.446176440786488e-06, "loss": 0.0229, "step": 527 }, { "epoch": 0.30452754013265404, "grad_norm": 0.003079983638599515, "learning_rate": 5.270649559485907e-06, "loss": 0.0001, "step": 528 }, { "epoch": 0.3051042968374507, "grad_norm": 0.018386349081993103, "learning_rate": 5.097921192667687e-06, "loss": 0.0004, "step": 529 }, { "epoch": 0.30568105354224745, "grad_norm": 0.9645707011222839, "learning_rate": 4.92799644310975e-06, "loss": 0.0022, "step": 530 }, { "epoch": 0.3062578102470441, "grad_norm": 0.054794877767562866, "learning_rate": 4.7608803307649385e-06, "loss": 0.0052, "step": 531 }, { "epoch": 0.3068345669518408, "grad_norm": 0.18055804073810577, "learning_rate": 4.596577792612755e-06, "loss": 0.0008, "step": 532 }, { "epoch": 0.3074113236566375, "grad_norm": 0.009832300245761871, "learning_rate": 4.4350936825134805e-06, "loss": 0.0002, "step": 533 }, { "epoch": 0.3079880803614342, "grad_norm": 0.0028213104233145714, "learning_rate": 4.27643277106482e-06, "loss": 0.0002, "step": 534 }, { "epoch": 0.3085648370662309, "grad_norm": 0.004487840458750725, "learning_rate": 4.120599745460918e-06, "loss": 0.0002, "step": 535 }, { "epoch": 0.30914159377102757, "grad_norm": 0.0015976703725755215, "learning_rate": 3.967599209353967e-06, "loss": 0.0001, "step": 536 }, { "epoch": 0.3097183504758243, "grad_norm": 0.013129237107932568, "learning_rate": 3.817435682718096e-06, "loss": 0.0002, "step": 537 }, { "epoch": 0.310295107180621, "grad_norm": 0.3533538281917572, "learning_rate": 3.670113601715941e-06, "loss": 0.0187, "step": 538 }, { "epoch": 0.31087186388541765, "grad_norm": 0.1365794688463211, "learning_rate": 3.525637318567554e-06, "loss": 0.0024, "step": 539 }, { "epoch": 0.3114486205902144, "grad_norm": 0.0068528540432453156, "learning_rate": 3.384011101421802e-06, "loss": 0.0003, "step": 540 }, { "epoch": 0.31202537729501106, "grad_norm": 0.006609945558011532, "learning_rate": 3.2452391342303046e-06, "loss": 0.0002, "step": 541 }, { "epoch": 0.31260213399980774, "grad_norm": 4.070964813232422, "learning_rate": 3.1093255166238176e-06, "loss": 0.0221, "step": 542 }, { "epoch": 0.3131788907046044, "grad_norm": 0.1819545030593872, "learning_rate": 2.976274263791179e-06, "loss": 0.0043, "step": 543 }, { "epoch": 0.31375564740940115, "grad_norm": 0.004702473059296608, "learning_rate": 2.8460893063606e-06, "loss": 0.0001, "step": 544 }, { "epoch": 0.3143324041141978, "grad_norm": 0.1847194880247116, "learning_rate": 2.718774490283593e-06, "loss": 0.0014, "step": 545 }, { "epoch": 0.3149091608189945, "grad_norm": 0.06856244802474976, "learning_rate": 2.5943335767213304e-06, "loss": 0.0003, "step": 546 }, { "epoch": 0.31548591752379124, "grad_norm": 0.021140173077583313, "learning_rate": 2.4727702419335864e-06, "loss": 0.0004, "step": 547 }, { "epoch": 0.3160626742285879, "grad_norm": 0.30309152603149414, "learning_rate": 2.3540880771700803e-06, "loss": 0.0025, "step": 548 }, { "epoch": 0.3166394309333846, "grad_norm": 0.07376673817634583, "learning_rate": 2.2382905885643844e-06, "loss": 0.0007, "step": 549 }, { "epoch": 0.31721618763818127, "grad_norm": 0.012836321257054806, "learning_rate": 2.125381197030374e-06, "loss": 0.0001, "step": 550 }, { "epoch": 0.317792944342978, "grad_norm": 0.012486038729548454, "learning_rate": 2.0153632381611498e-06, "loss": 0.0004, "step": 551 }, { "epoch": 0.3183697010477747, "grad_norm": 2.240330696105957, "learning_rate": 1.908239962130476e-06, "loss": 0.0023, "step": 552 }, { "epoch": 0.31894645775257136, "grad_norm": 1.6802948713302612, "learning_rate": 1.8040145335968318e-06, "loss": 0.0176, "step": 553 }, { "epoch": 0.3195232144573681, "grad_norm": 0.055240605026483536, "learning_rate": 1.7026900316098215e-06, "loss": 0.0005, "step": 554 }, { "epoch": 0.32009997116216476, "grad_norm": 0.0030760413501411676, "learning_rate": 1.6042694495193022e-06, "loss": 0.0002, "step": 555 }, { "epoch": 0.32067672786696144, "grad_norm": 0.010371128097176552, "learning_rate": 1.5087556948868876e-06, "loss": 0.0001, "step": 556 }, { "epoch": 0.3212534845717582, "grad_norm": 0.21550880372524261, "learning_rate": 1.4161515894001165e-06, "loss": 0.0005, "step": 557 }, { "epoch": 0.32183024127655485, "grad_norm": 0.030834214761853218, "learning_rate": 1.3264598687890205e-06, "loss": 0.0004, "step": 558 }, { "epoch": 0.32240699798135153, "grad_norm": 0.04491027444601059, "learning_rate": 1.2396831827453436e-06, "loss": 0.0013, "step": 559 }, { "epoch": 0.3229837546861482, "grad_norm": 0.0054609524086117744, "learning_rate": 1.1558240948443044e-06, "loss": 0.0002, "step": 560 }, { "epoch": 0.32356051139094494, "grad_norm": 0.06006991118192673, "learning_rate": 1.0748850824687795e-06, "loss": 0.0011, "step": 561 }, { "epoch": 0.3241372680957416, "grad_norm": 0.09065622091293335, "learning_rate": 9.968685367361618e-07, "loss": 0.0012, "step": 562 }, { "epoch": 0.3247140248005383, "grad_norm": 0.11779052764177322, "learning_rate": 9.217767624277396e-07, "loss": 0.0009, "step": 563 }, { "epoch": 0.325290781505335, "grad_norm": 0.004247438628226519, "learning_rate": 8.496119779205725e-07, "loss": 0.0002, "step": 564 }, { "epoch": 0.3258675382101317, "grad_norm": 0.26266857981681824, "learning_rate": 7.803763151219779e-07, "loss": 0.0012, "step": 565 }, { "epoch": 0.3264442949149284, "grad_norm": 0.17352361977100372, "learning_rate": 7.140718194065032e-07, "loss": 0.0046, "step": 566 }, { "epoch": 0.32702105161972506, "grad_norm": 0.003937472123652697, "learning_rate": 6.507004495555969e-07, "loss": 0.0002, "step": 567 }, { "epoch": 0.3275978083245218, "grad_norm": 0.0046991510316729546, "learning_rate": 5.902640776996315e-07, "loss": 0.0002, "step": 568 }, { "epoch": 0.32817456502931847, "grad_norm": 0.007674611173570156, "learning_rate": 5.327644892626938e-07, "loss": 0.0001, "step": 569 }, { "epoch": 0.32875132173411514, "grad_norm": 0.011769871227443218, "learning_rate": 4.782033829097587e-07, "loss": 0.0002, "step": 570 }, { "epoch": 0.3293280784389119, "grad_norm": 0.3156275749206543, "learning_rate": 4.2658237049655323e-07, "loss": 0.0011, "step": 571 }, { "epoch": 0.32990483514370855, "grad_norm": 0.14030705392360687, "learning_rate": 3.779029770219378e-07, "loss": 0.0006, "step": 572 }, { "epoch": 0.33048159184850523, "grad_norm": 1.7659718990325928, "learning_rate": 3.3216664058283165e-07, "loss": 0.0134, "step": 573 }, { "epoch": 0.3310583485533019, "grad_norm": 0.0017080976394936442, "learning_rate": 2.893747123317581e-07, "loss": 0.0001, "step": 574 }, { "epoch": 0.33163510525809864, "grad_norm": 0.003221493447199464, "learning_rate": 2.4952845643689827e-07, "loss": 0.0001, "step": 575 }, { "epoch": 0.3322118619628953, "grad_norm": 0.006105966400355101, "learning_rate": 2.1262905004475475e-07, "loss": 0.0001, "step": 576 }, { "epoch": 0.332788618667692, "grad_norm": 0.9341827034950256, "learning_rate": 1.786775832454013e-07, "loss": 0.0124, "step": 577 }, { "epoch": 0.3333653753724887, "grad_norm": 0.0012107370421290398, "learning_rate": 1.4767505904021983e-07, "loss": 0.0001, "step": 578 }, { "epoch": 0.3339421320772854, "grad_norm": 0.004891383461654186, "learning_rate": 1.1962239331234637e-07, "loss": 0.0002, "step": 579 }, { "epoch": 0.3345188887820821, "grad_norm": 0.006321438588202, "learning_rate": 9.452041479954821e-08, "loss": 0.0003, "step": 580 }, { "epoch": 0.3350956454868788, "grad_norm": 0.16756032407283783, "learning_rate": 7.236986506978794e-08, "loss": 0.001, "step": 581 }, { "epoch": 0.3356724021916755, "grad_norm": 0.14708253741264343, "learning_rate": 5.317139849928543e-08, "loss": 0.0062, "step": 582 }, { "epoch": 0.33624915889647217, "grad_norm": 0.4846436679363251, "learning_rate": 3.692558225317777e-08, "loss": 0.0169, "step": 583 }, { "epoch": 0.33682591560126884, "grad_norm": 0.0019764131866395473, "learning_rate": 2.363289626882148e-08, "loss": 0.0001, "step": 584 }, { "epoch": 0.3374026723060656, "grad_norm": 0.016219504177570343, "learning_rate": 1.3293733241537266e-08, "loss": 0.0004, "step": 585 }, { "epoch": 0.33797942901086225, "grad_norm": 0.052470527589321136, "learning_rate": 5.908398613074795e-09, "loss": 0.0007, "step": 586 }, { "epoch": 0.33855618571565893, "grad_norm": 0.0014326622476801276, "learning_rate": 1.4771105625421834e-09, "loss": 0.0001, "step": 587 }, { "epoch": 0.33913294242045566, "grad_norm": 0.028719905763864517, "learning_rate": 0.0, "loss": 0.0004, "step": 588 } ], "logging_steps": 1, "max_steps": 588, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 4, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.243015677103571e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }